#!/usr/bin/perl

eval 'exec /usr/bin/perl  -S $0 ${1+"$@"}'
    if 0; # not running under some shell
###############################################################################
#
# WebScan:  A utility to traverse a web site and index all the pages found.
#
# Written by Brian C. White <bcwhite@pobox.com>
# Copyright (c) 1996-2001 by Brian White and Behan Webster.
#
###############################################################################


# I've got the power!
use Ferret;


# Other useful libraries
use IO::Handle;
use POSIX;


# Don't even look at these extensions!
# (this should be handled by the "Accept" flag in the HTTP options, but most
# servers don't seem to pay any attention to that.  <sigh>)
$badext='jpg|jpeg|gif|png|bmp|tiff?|xbm|au|wav|mpg|mpeg|qt|mov|dl|gl|fli|zip|exe|com|map|tar|tgz';


# Program-wide variable declarations
$count	= 0;			# Counter for periodic flushing
@urllist;				# List of urls yet to process
%urldone;				# List of processed urls (to avoid looping)
%disallow;				# Disallowed URLs (from "robots.txt")
STDOUT->autoflush(1);



# Get and parse the "robots.txt" file for a site
sub GetRobotsTXT {
	my($site) = @_;

	$disallow{$site} = "";
#	print "fetching http:$site/robots.txt...";
	my $data = eval { Ferret::LoadHTTP("http:$site/robots.txt"); };
#	print "error=$@, data:\n$data\n";
	if ($@ || !$data) {
		return;
	}

	my $apply = 0;
	foreach (split(/\s*\n/,$data)) {
		s/^\s+|\s+$//gs;

		if (m/^user-agent:\s*(.*?)$/i) {
			$apply = ($1 eq '*' || $1 =~ m/ferret/i || $uname =~ m/\Q$1\E/i);
			next;
		}
		next unless $apply;

		if (m/^disallow:\s*(\S+)/i) {
			$disallow{$site} .= "|" if $disallow{$site};
			$disallow{$site} .= "\Q$1\E";
#			print "$site disallowed: '$disallow{$site}'\n";
		}
	}
}



# A subroutine to validate URLs
sub CleanURL {
	my($from,$to) = @_;

	$_ = Ferret::ResolveURL($from,$to);
	if ($uedit) {
#		my $before = $_;
		eval $uedit;
		die $@ if $@;
#		print "* $before -> $_\n" unless $_ eq $before;
	}
	$to = $_;

	# Remove CGI arguments & tags
	$to =~ s/[?\#].*$//;

	my($ftype,$fhost,$fport,$ffile, $ttype,$thost,$tport,$tfile);
	($ftype,$fhost,$fport,$ffile) = ($from =~ m|^(\w+:)?(//[^/:]*)?(:\d+)?(.*)$|i);
	($ttype,$thost,$tport,$tfile) = ($to   =~ m|^(\w+:)?(//[^/:]*)?(:\d+)?(.*)$|i);

#	print "type=$ftype, host=$fhost, port=$fport, file=$ffile -- ";
#	print "type=$ttype, host=$thost, port=$tport, file=$tfile\n";

	my $tsite = $thost;
	$tsite .= "$tport" if $tport;

	return unless $ttype =~ m/^http:$/i;
	return unless $fhost eq $thost || ($mpatt && $thost =~ m{^//($mpatt)$}i);
	return if $nclud && $tfile !~ m!$nclud!i;
	return if $xclud && $tfile =~ m!$xclud!i;

	GetRobotsTXT($tsite) unless exists $disallow{$tsite};
	return if $disallow{$tsite} && $tfile =~ m!^($disallow{$tsite})!;

#	print "return: $to\n";
	return $to;
}



sub StopProgram {
	print "(caught signal... stopping)\n";
	$shutd=1;
}



sub AbortProgram {
	print "(caught signal... aborting)\n";
	$shutd=1;
}



sub SegFault {
	print "*** Segmentation Violation ***\n";
	delete $SIG{SEGV};

	my($package, $filename, $line, $subroutine, $hasargs, $wantargs, $i);
	for ($i=0; $i < 4; $i++) {
		($package, $filename, $line, $subroutine, $hasargs, $wantargs) = caller($i);
		print "*** $subroutine -- called by $filename\:$line\n";
	}

	AbortProgram();
}



###############################################################################
#
# Begin main program
#
###############################################################################



# Parse command line arguments
my $usage = qq"
Use: $0 [--index=<index-file>] [--include=<allowed-filename-pattern>]
     [--exclude=<disallowed-filename-pattern>] [--delay=<delay-time>]
     [--machines=<allowed-machine-pattern>] [--summary=<max-summary-bytes>]
     [--lines=<max-summary-lines>] [--refresh=<url-pattern>] [--shrink]
     [--name=<user's-name>] [--email=<contact-email-address>] [--random]
     [--timeout=<network timeout>] <new-url> [...]

";

$index = 'ferret.index';
$mpatt = '';
$xclud = '';
$nclud = '';
$uedit = '';					# Additional operations to run on urls during cleaning
$ssize = '';
$lines = '';
$sleep = 2;
$flush = 100;
$fresh = '';
$shrnk = 0;
$uname = '';
$email = '';
$rndom = 0;
$tmout = 0;
$shutd  = 0;


die $usage unless @ARGV > 0;

foreach (@ARGV) {
	if (/^--index=(.+)/)	{ $index = $1;	next; }
	if (/^--machines=(.+)/)	{ $mpatt = $1;	next; }
	if (/^--avoid=(.+)/)	{ $xclud = $1;	print '("--avoid" has been depricated.  Please use "--exclude")',"\n"; next; }
	if (/^--exclude=(.+)/)	{ $xclud = $1;	next; }
	if (/^--include=(.+)/)	{ $nclud = $1;	next; }
	if (/^--summary=(.+)/)	{ $ssize = $1;	next; }
	if (/^--lines=(.+)/)	{ $lines = $1;	next; }
	if (/^--delay=(.+)/)	{ $sleep = $1;	next; }
	if (/^--refresh=(.+)/)	{ $fresh = $1;	next; }
	if (/^--shrink$/)		{ $shrnk =  1;	next; }
	if (/^--name=(.+)/)		{ $uname = $1;	next; }
	if (/^--email=(.+)/)	{ $email = $1;	next; }
	if (/^--flush=(.+)/)	{ $flush = $1;	next; }
	if (/^--random$/)		{ $rndom =  1;	next; }
	if (/^--timeout=(.+)/)	{ $tmout = $1;	next; }
	if (/^--urledit=(.+)/)	{ $uedit = $1;	next; }

	if (/^--/)				{ print STDERR "Unknown option '$_'\n"; die $usage; }

	push @urllist,$_;
}
$xclud =~ s/!/\\!/g;
$nclud =~ s/!/\\!/g;
Ferret::SetNetworkTimeout($tmout);
if ($uname) {
	$uname = "Ferret/$Ferret::VERSION ($uname)";
}


# Load the big guns...
$search = new Ferret;
$search->Update($index);


# Set up signal handlers
$SIG{HUP}	= \&StopProgram;
$SIG{QUIT}	= \&StopProgram;
$SIG{INT}	= \&AbortProgram;
$SIG{TERM}	= \&AbortProgram;
$SIG{SEGV}	= \&SegFault;


# Search for helper programs
if (-x "/usr/bin/acroread") {
	$acroread = "/usr/bin/acroread";
} elsif (-x "/usr/local/bin/acroread") {
	$acroread = "/usr/local/bin/acroread";
}


# A friendly reminder
unless ($uname && $email) {
	print "Note: It is considered good netiquette when crawling the web to include both\n";
	print "your name and an email address at which you can be contacted.\n";
}


# Process command-line URL list
for (my $i=0; $i < @urllist; $i++) {
	my $url = $urllist[$i];
	$url = "//$url" unless $url =~ m|/|;
	$url = CleanURL($url);
	if ($url) {
		$urllist[$i] = $url;
	} else {
		print STDERR "Error: Invalid or unaccessible url '$urllist[$i]'\n";
	}
}


# If we're doing a "refresh", load the appropriate urls into the list
push @urllist, grep(/$fresh/,$search->DocumentList()) if $fresh;


# Loop until no more URLs
while (@urllist && !$shutd) {
	my $url  = shift @urllist;
	next unless $url;
	next if $urldone{$url};
	my $lurl = $url;
	my $write= 1;

	# Remove any not-allowed URLs (changes in cmd line args or robots.txt)
	unless (CleanURL($url)) {
		$search->RemoveDocument($url);
		$search->DBDelSummary($url);
		next;
	}

	my($host) = ($url=~m!http://([^/]*)!);

	# Build GET options
	my $opts = "";
#	$opts .= "Connection: Keep-Alive\n";
	$opts .= "Host: $host\n";
	$opts .= "User-agent: $uname\n" if $uname;
	$opts .= "From: $email\n" if $email;
	$opts .= "If-Modified-Since: " . Ferret::TimetoRFC1123($search->DocumentTimestamp($url)) . "\n";
	$opts .= "Accept: application/*, text/*\n";
	$opts .= "Accept-Encoding: *\n";

	print "\n$url:  ";
	my $data = eval { Ferret::LoadHTTP($url,$opts) };
	my $size = (length $data) / 1024;
	if ($@) {
		$urldone{$url} = 1;
		print STDERR "$@";
		next;
	}

	my($head) = ($data =~ m!^(.*?\n)\s*\n!s);
	my($type) = ($head =~ m!^Content-Type:\s+(.*?)\s*$!im);
	my($encd) = ($head =~ m!^Content-Encoding:\s+(.*?)\s*$!im);
	my($rslt) = ($head =~ m!^HTTP\S*\s+(\d+)!im);

	$urldone{$url} = 1;

	if ($rslt == 304) {
		print "(unchanged) ";
		sleep $sleep/2 if $sleep >= 2;
		next;
	}

	$search->RemoveDocument($url);
	$search->DBDelSummary($url);

	if ($rslt == 301 || $rslt == 302) {
		if ($head =~ m/^Location: (.*)/im) { $url = $1; }
		print "-> $url ";
#		print "[" . CleanURL($lurl,$url) . "] ";
		push @urllist, CleanURL($lurl,$url);
		next;
	} elsif ($rslt != 200 && $rslt != 304) {
		$head =~ m!^HTTP\S*\s+(.*)!im;
		print STDERR "Error: Could not load '$url' -- $1\n";
		next;
	}

	unless ($type) {
		print STDERR "Error: '$url' returned with no content-type\n";
		next;
	}
	printf "(%0.1fk) ",$size;
	$data =~ s/^.*?\n\s*\n//s;

	if ($encd) {
		my $prog = Ferret::StdEncodingFilters($encd);
		if ($prog) {
			Ferret::FilterData(\$data,$prog);
			  if (length($data) == 0) {
				  print STDERR "Error: could not decode '$url' with encoding '$encd'\n";
				  next;
			  }
		} else {
			print STDERR "Error: '$url' had unknown encoding-type '$encd'\n";
			next;
		}
	}


	my($title,$summary,$newurl);
	if ($type eq "text/html") {
		print '"HTML" ';
		my @hrefs = ($data =~ m/<a\s[^>]*href\s*=\s*[\"\']?([^>\'\"\s\#\?]+).*?>/gis);
		push @hrefs,($data =~ m/<frame\s[^>]*src\s*=\s*[\"\']?([^>\'\"\s\#\?]+).*?>/gis);
		if (@hrefs) {
			printf "(%u hrefs", scalar @hrefs;
			@hrefs = Ferret::Uniq(sort(@hrefs));
			printf ", %u unique) ", scalar @hrefs;
			foreach $href (@hrefs) {
				Ferret::UnquoteHTML(\$href);
#				print "($href)\n";
				$newurl = CleanURL($url,$href);
				if ($newurl) {
					unless ($newurl =~ m!\.($badext)$!oi) {
						push @urllist, $newurl;
#						print "($newurl)\n";
					}
				}
			}
		}

		Ferret::StripHTML(\$data,\$title,\$summary,$ssize);
		Ferret::ResolveHTMLImages(\$summary,$url,0.75,100);
		Ferret::MakeHTMLSummary(\$summary,"HTML");
		$search->AddDocument($url,$data);
	} elsif ($type =~ m|^text/|) {
		if ($url =~ m/\.(h|hh|hpp|h\+\+|c|cc|cpp|c\+\+)$/i || $data =~ m/^\#\!/) {
			print '"Code" ';
			Ferret::StripCode(\$data,\$title,\$summary, ($lines ? $lines*400 : $ssize));
		} elsif ($data =~ m/^(From|Received:) /) {
			print '"Mail" ';
			Ferret::StripMail(\$data,\$title,\$summary, ($lines ? $lines*400 : $ssize));
		} else {
			print '"Text" ';
			Ferret::StripText(\$data,\$title,\$summary, ($lines ? $lines*400 : $ssize));
		}
		Ferret::LimitLineCount(\$summary,$lines) if $lines;
		Ferret::MakeHTMLSummary(\$summary,"Text");
		$search->AddDocument($url,$data);
	}

	elsif ($type =~ m!^image/!) {
		print STDERR "Notice: Ignoring image type '$type' from $url\n";
	}

	elsif ($type =~ m!^application/x-(sh|shar|csh)!) {
		print '"Shell" ';
		Ferret::StripCode(\$data,\$title,\$summary, ($lines ? $lines*400 : $ssize));
		Ferret::LimitLineCount(\$summary,$lines) if $lines;
		Ferret::MakeHTMLSummary(\$summary,"Code");
		$search->AddDocument($url,$data);
	} elsif ($type eq "application/(msword|vnd.ms-word)") {
		print '"MS-Word" ';
		Ferret::StripMSWord(\$data,\$title,\$summary, ($lines ? $lines*400 : $ssize));
		Ferret::LimitLineCount(\$summary,$lines) if $lines;
		Ferret::MakeHTMLSummary(\$summary,"MSWord");
		$search->AddDocument($url,$data);
	} elsif ($type eq "application/x-mif") {
		print '"MIF" ';
		Ferret::StripMIF(\$data,\$title,\$summary, ($lines ? $lines*400 : $ssize));
		Ferret::LimitLineCount(\$summary,$lines) if $lines;
		Ferret::MakeHTMLSummary(\$summary,"MIF");
		$search->AddDocument($url,$data);
	} elsif ($type eq "application/postscript") {
		print '"Postscript" ';
		Ferret::StripPostscript(\$data,\$title,\$summary, ($lines ? $lines*400 : $ssize));
		Ferret::LimitLineCount(\$summary,$lines) if $lines;
		Ferret::MakeHTMLSummary(\$summary,"Postscript");
		$search->AddDocument($url,$data);
	} elsif ($type eq "application/pdf" && $acroread) {
		print '"PDF" ';

		$file1 = POSIX::tmpnam();
		open(FILE,">$file1") || die "Error: could not write '$file1' -- $!\n";
		print FILE $data;
		close(FILE);
		$file2 = POSIX::tmpnam();
		system("$acroread -toPostScript -level3 -fast <$file1 >$file2 2>/dev/null");
		$data = Ferret::LoadFile($file2);
		unlink $file1,$file2;
		$data =~ s/^%%Title:\s*\((acro.*|stdin)\)\s*[\r\n]//im;

		Ferret::StripPostscript(\$data,\$title,\$summary, ($lines ? $lines*400 : $ssize));
		Ferret::LimitLineCount(\$summary,$lines) if $lines;
		Ferret::MakeHTMLSummary(\$summary,"Postscript");
		$search->AddDocument($url,$data);
	}

	else {
		print STDERR "Warning: Unsupported content-type '$type' from $url\n";
		next;
	}

#	print "Title: '$title' [$type]\nSummary:\n$summary\n";
	$title .= " [$type]"					if $title;
	$search->DBPutTitle("$url",$title)		if $title;
	$search->DBPutSummary("$url",$summary)	if $summary;

	if ($count++ == $flush) {
		print "Writing index... ";
		$search->Flush();
		$count = 0;
		if ($rndom) {
			@urllist = Ferret::Uniq(sort(@urllist));
			srand(time());
			my $max = @urllist;
			for (my $i=0; $i < $max; $i++) {
				my $j = int(rand($max));
				my $temp	 = $urllist[$i];
				$urllist[$i] = $urllist[$j];
				$urllist[$j] = $temp;
			}
		}
	}

	sleep $sleep if $sleep > 0;
}
print "\n";



# Time to put it all away...  May take a while to write the database.
print "\nWriting index...\n";
$search->Shrink() if $shrnk;
$search->Close();
print "\n";



