#!/usr/bin/perl -w
use strict;

# print STDERR "spider $$ [@ARGV]\n";

#
# SWISH-E http method Spider
# $Id: swishspider,v 1.9 2002/09/09 07:15:19 whmoseley Exp $ 
#

# Should SWISH::Filter be use for filtering?  This can be left 1 all the time, but
# will add a little time to processing since.

use constant USE_FILTERS  => 1;  # 1 = yes use SWISH::Filter for filtering, 0 = no. (faster processing if not set)
use constant FILTER_TEXT  => 0;  # set to one to filter text/* content, 0 will save processing time
use constant DEBUG_FILTER => 0;  # set to one to report errors on loading SWISH::Filter module.

use LWP::UserAgent;
use HTTP::Status;
use HTML::Parser 3.00;
use HTML::LinkExtor;

    if (scalar(@ARGV) != 2) {
        print STDERR "Usage: SwishSpider localpath url\n";
        exit(1);
    }

    my $ua = new LWP::UserAgent;
    $ua->agent( "SwishSpider http://swish-e.org" );


    my $localpath = shift;
    my $url = shift;

    my $request = new HTTP::Request( "GET", $url );
    my $response = $ua->simple_request( $request );

    # Save the HTTP code, the content/type (or a redirection header), and a last modified date, if one.

    open( RESP, ">$localpath.response" ) || die( "Could not open response file $localpath.response: $!" );
    print RESP $response->code() . "\n";



    # If failed to fetch doc then write out the code and location and exit
    
    if( $response->code != RC_OK ) {
        print RESP ($response->header( "location" ) ||'') . "\n";
        exit;
    }


    # Filter the document, if possible.

    my ( $content_ref, $content_type ) = filter_doc( $response );


    # Write out the (perhaps new) content type and the last modified date.

    print RESP "$content_type\n",
               ($response->last_modified || 0), "\n";

    close RESP;



    # Now write the content -- really only need to do this on text/* types since that's all swish processes

    open( CONTENTS, ">$localpath.contents" ) || die( "Could not open contents file $localpath.contents: $!\n" );
    print CONTENTS $$content_ref;
    close( CONTENTS );


    # Finally, extract out links

    exit unless $content_type =~ m!text/html!;

    open( LINKS, ">$localpath.links" ) || die( "Could not open links file $localpath.links: $!\n" );
    my $p = HTML::LinkExtor->new( \&linkcb, $url );
    $p->parse( $$content_ref );
    close( LINKS );

    exit;


sub linkcb {
    my($tag, %links) = @_;

    return unless $tag eq 'a' && $links{href};

    my $link = $links{href};

    # Remove fragments
    $link =~ s/(.*)#.*/$1/;

    # Remove ../  This is important because the abs() function
    # can leave these in and cause never ending loops.
    $link =~ s/\.\.\///g;

    print LINKS "$link\n";
}


# This will optionally attempt to filter the document

sub filter_doc {
    my $response = shift;

    my ( $content, $content_type ) = ( $response->content, $response->header( "content-type" ) );

    my $content_ref = \$content;

    unless ( $content_type ) {
        warn 'URL: ', $response->base, " did not return a content-type\n";
        return ( $content_ref, 'text/plain' );
    }


    return ( $content_ref, $content_type ) unless USE_FILTERS;  # filters enabled?


    # This can avoid loading the filter module if it is known that type text/* will never be filtered.
    
    return ( $content_ref, $content_type )
        if $content_type =~ m!^text/! && !FILTER_TEXT;
    

    eval { require SWISH::Filter };
    if ( $@ ) {
        warn $@ if DEBUG_FILTER;
        return ( $content_ref, $content_type );
    }

    my $filter = SWISH::Filter->new;

    my $filtered = $filter->filter(
        document => $content_ref,
        name     => $response->base,
        content_type => $content_type,
    );

    return $filtered
        ? ( $filter->fetch_doc, $filter->content_type )
        : ( $content_ref, $content_type );
}


