#! /usr/bin/perl
###############################################################################
#
# $Id: fields,v 1.1.1.1 2003/04/21 15:30:49 bcwhite Exp $
#
# Example on how to get ferret to index and search multiple fields.
#
# Written by Brian C. White <bcwhite@pobox.com>.
# This example code has been placed in the public domain.
#
###############################################################################



# Ohhhhh!!!  I feel good!  (I knew that I would, now!)
use Ferret;


# Program defaults
$indexdir= "/tmp";			# passed as INDEXDIR to IndexFields/QueryFields
$ferret  = Ferret->new;		# explicit method call, not indirect "new Ferret"


# Initialize things
use Carp;
use IO::Handle;
STDOUT->autoflush(1);		# unbuffered, so partial-line progress output shows at once



###############################################################################
#
# Data declarations
#
###############################################################################


# File type specifications -- Ferret does indexing on multiple fields
# by maintaining a different index for each field and then combining
# the results.  It is _essential_ that all indices be kept in sync.
# Otherwise, the internal code numbers assigned to each indexed
# document would not match up and very strange results would occur.
# For that reason, the list of fields is defined below instead of
# passed in as parameters.  If these definitions change, all existing
# index files must be deleted and regenerated!

$fieldlists = do {
	# Each helper turns a list of field names into name => descriptor
	# pairs, giving every field its own descriptor hash.  A descriptor
	# with 'multiple' set marks a field that may also appear in numbered
	# variants (<field>1, <field>2, ...).
	my $single   = sub { map { ( $_ => {} ) } @_ };
	my $repeated = sub { map { ( $_ => { 'multiple' => 1 } ) } @_ };

	+{
		# Fields indexed for job postings
		'jobs' => {
			$single->(qw(
				PositionTitle
				Description
				Location
				RequiredSkillsLanguages
				RequiredSkillsSpecialization
				RequiredSkillsTools
				RequiredSkillsHardware
				RequiredSkillsProtocols
				DesiredSkillsLanguages
				DesiredSkillsSpecialization
				DesiredSkillsTools
				DesiredSkillsHardware
				DesiredSkillsProtocols
			)),
			$repeated->(qw(
				Degree
			)),
		},

		# Fields indexed for resumes
		'resumes' => {
			$single->(qw(
				CurrentPosition
				LastPosition
				SecondLastPosition
				CurrentIndustry
				LastIndustry
				SecondLastIndustry
				SkillsLanguages
				SkillsSpecialization
				SkillsTools
				SkillsHardware
				SkillsProtocols
				Location
				Blurb
			)),
			$repeated->(qw(
				DesiredPosition
				DesiredIndustry
				Degree
			)),
		},
	};
};



###############################################################################
#
# Work subroutines
#
###############################################################################


#
# ParseRecordFile:  Read one record file and return a hash of field => data
#
# This routine reads flat-files in the same format as used by Debian
# control files:
#
#   - A line starting with a non-blank character begins a new field and
#     has the form "Name: value"; whitespace around the value is trimmed.
#   - Any other line continues the current field.  It is trimmed and
#     joined to the data collected so far with a single space.
#   - A continuation line holding only "." adds a paragraph break (two
#     newlines); the text that follows is appended without a space.
#
# Parameters:  $file -- path of the file to read
# Returns:     a list of (field, data) pairs, for assignment to a hash
# Dies:        if the file cannot be opened
#
sub ParseRecordFile {
	my($file) = @_;
	my %rec;
	my $field = "";
	my $data  = "";

	# Three-argument open with a lexical handle: the name is used exactly
	# as given (no whitespace stripping, no mode characters honoured) and
	# no global FILE handle is touched.
	open(my $fh, "<", $file) or die "Error: Could not read '$file' -- $!\n";

	# A named line variable keeps the caller's $_ intact
	while (defined(my $line = <$fh>)) {
		chomp $line;

		if ($line =~ m/^\S/) {
			# A new field starts here -- store the one collected so far
			$rec{$field} = $data if $field;

			($field,$data) = $line =~ m/^([^:]*):\s*(.*?)\s*$/;

			# No colon means no field; its continuation lines are dropped too
			($field,$data) = ("","") unless defined $field;
		} elsif ($line =~ m/^\s*\.\s*$/) {
			$data .= "\n\n";
		} else {
			# One joining space, except directly after a paragraph break
			$data .= " " unless $data =~ m/\n\z/;
			$line =~ s/^\s+|\s+$//g;
			$data .= $line;
		}
	}

	# Store the last field of the file
	$rec{$field} = $data if $field;

	close($fh);

	return %rec;
}


#
# Index a file with multiple fields
#
# Usage: IndexFields($ferret, INDEXDIR, DATADIR, FILETYPE, FILES, ...)
#
# Each FILE is read from DATADIR/FILETYPE/FILE with ParseRecordFile and
# added, under the name FILE, to one index per field defined for FILETYPE
# (INDEXDIR/FILETYPE-<field>.idx) and to a combined index
# (INDEXDIR/FILETYPE-all.idx) holding the text of all fields joined by
# newlines.  For a field flagged 'multiple', the values of its numbered
# variants (<field>1, <field>2, ...) are appended to the field's own
# value, separated by spaces.
#
# Croaks on a bad argument list or an unknown FILETYPE.
#
sub IndexFields {
	my $usage = 'Usage: IndexFields($ferret, INDEXDIR, DATADIR, FILETYPE, FILES, ...)';
	@_ >= 5 or croak $usage;
	my ($ferret,$idxdir,$datdir,$filtyp,@files) = @_;

	croak $usage if (ref $ferret ne "Ferret");

	my $fields = $$fieldlists{$filtyp};
	unless ($fields) {
		# Name the valid types in the error, as the main program does
		my @known = sort keys %$fieldlists;
		croak "Error: File type '$filtyp' not known -- try: @known; stopped";
	}

	foreach my $file (@files) {
#		print "Indexing file '$file'... ";

		# Assume file is under subdirectory of name '<type>'
		my %filedata = ParseRecordFile("$datdir/$filtyp/$file");
		my $allfields= "";

		# Sorted so the combined text is built in the same order on every run
		foreach my $field (sort keys %$fields) {
			my $finfo  = $$fields{$field};
			my $fdata  = $filedata{$field};

			if ($$finfo{multiple}) {
				# Fold "<field>1", "<field>2", ... into the base field's text
				foreach my $numbered (sort grep(/^\Q$field\E\d+/,keys %filedata)) {
					$fdata .= " " if $fdata;
					$fdata .= $filedata{$numbered};
				}
			}

			$allfields .= "\n" if $allfields;
			$allfields .= $fdata;

			# Constants are called with an explicit empty argument list;
			# a bare "&NAME" would hand this sub's @_ to the constant.
			$ferret->LocalOpen("$idxdir/$filtyp-$field.idx",1);
			$ferret->SetOption(Ferret::OPT_NOSTOPPERS());
			$ferret->SetOption(Ferret::OPT_TINY()) if $$finfo{opt_tiny};
			$ferret->AddDocument($file,$fdata);
			$ferret->Close();
		}


		# The combined index receives the same document in the same order
		$ferret->LocalOpen("$idxdir/$filtyp-all.idx",1);
		$ferret->SetOption(Ferret::OPT_NOSTOPPERS());
		$ferret->AddDocument($file,$allfields);
		$ferret->Close();

#		print "\n";
	}
}


#
# Query fields
#
# Usage: @matches = QueryFields($ferret, INDEXDIR, FILETYPE, FIELD QUERY, ...)
#
# The arguments after FILETYPE are read left to right as FIELD QUERY
# pairs, each optionally preceded by "and" or "or" (any case).  Every
# QUERY is run against INDEXDIR/FILETYPE-FIELD.idx and merged into the
# running result with the given operator.
#
#   - An operator before the first query has nothing to combine with and
#     is ignored.
#   - A later query with no operator in front of it is dropped with a
#     warning, and its result is freed.
#
# Returns the list produced by QueryMatches for the final result, or an
# empty list if no query produced a result.  Croaks on a bad argument
# list.
#
sub QueryFields {
	my $usage = 'Usage: @matches = QueryFields($ferret, INDEXDIR, FILETYPE, FIELD QUERY, ...)';
	@_ >= 5 or croak $usage;
	my ($ferret,$idxdir,$filtyp,@query) = @_;

	croak $usage if (ref $ferret ne "Ferret");

	# All state is lexical: "my $a,$b;" would declare only $a and leave
	# the rest as package globals that survive from one call to the next.
	my $combined;
	my ($field,$combine) = ("","");

	foreach my $arg (@query) {
#		print "arg: $arg\n";

		if (!$field) {
			# Waiting for a field name, optionally preceded by an operator
			if ($arg =~ m/^(and|or)$/i) {
				$combine = uc($arg);
			} else {
				$field = $arg;
			}
			next;
		}

#		print "Query '$field' for '$arg'... ";
		$ferret->Close();
		$ferret->Open("$idxdir/$filtyp-$field.idx");
		my $result = $ferret->QueryRun($arg);

		if (!$combined) {
			# First result; there is nothing to merge it with yet
			$combined = $result;
		} elsif ($combine eq "AND" || $combine eq "OR") {
			my $merged = ($combine eq "AND")
				? $ferret->QueryAnd($combined,$result)
				: $ferret->QueryOr($combined,$result);
			$ferret->FreeQueryResult($combined);
			$ferret->FreeQueryResult($result);
			$combined = $merged;
		} else {
			# No operator given: keep the running result, release this one
			carp "Warning: no and/or before field '$field'; query ignored";
			$ferret->FreeQueryResult($result);
		}

		$field   = "";
		$combine = "";
	}

	my @matches;
	if ($combined) {
		@matches = $ferret->QueryMatches($combined);
		$ferret->FreeQueryResult($combined);
	}
	$ferret->Close();

	return @matches;
}



###############################################################################
#
# Main body
#
###############################################################################


# Check the parameter list
die "Use: $0 index|query|shrink <type> [info] [...]\n" if @ARGV < 2;



# Command line:  <action> <type> [info ...]
$action = shift @ARGV;
$filtyp = shift @ARGV;
@args   = @ARGV;

# The type must be one of those defined in $fieldlists
$fields = $$fieldlists{$filtyp};
unless ($fields) {
	@fields = sort keys %$fieldlists;
	die "Error: File type '$filtyp' not known -- try: @fields\n";
}

# Reject an unknown action before announcing (and pausing for) the start
unless ($action eq "index" || $action eq "query" || $action eq "shrink") {
	die "Error: Unknown action '$action' -- try: index query shrink\n";
}


print "\nStarting action...\n";
sleep(2);

if ($action eq "index") {

	# Remaining arguments are file names under ./<type>/
	IndexFields($ferret,$indexdir,".",$filtyp,@args);

} elsif ($action eq "query") {

	# Remaining arguments are FIELD QUERY pairs joined by and/or.
	# QueryFields croaks on error, so there is no $@ to inspect here.
	@matches = QueryFields($ferret,$indexdir,$filtyp,@args);

	print "Score Match\n~~~~~ ~~~~~\n";
	foreach (@matches) {
		# Each match is split into a leading number and the remaining text
		my($score,$match) = (m/^(\d+) (.*)$/);
		printf " %3d  %s\n",$score,$match;
	}

} else {

	# Only "shrink" is left after the check above
	print "Shrinking index... ";

	# IndexFields also maintains a combined "<type>-all.idx"; shrink it
	# along with the per-field indexes so every index is treated alike.
	foreach my $field (sort(keys %$fields), "all") {
		print "($field) ";

		$ferret->Update("$indexdir/$filtyp-$field.idx");
		$ferret->Shrink();
		$ferret->Close();
	}

	print "\n";

}
