#!/usr/bin/perl -w

eval 'exec /usr/bin/perl -w -S $0 ${1+"$@"}'
    if 0; # not running under some shell
#TODO: get more extra info (audio, video aspect)

=pod

=head1 NAME

tv_grab_it - Grab TV listings for Italy.

=head1 SYNOPSIS

tv_grab_it --help

tv_grab_it [--config-file FILE] --configure

tv_grab_it [--config-file FILE] [--output FILE] [--days N]
           [--offset N] [--quiet] [--slow]

=head1 DESCRIPTION

Output TV listings for several channels available in Italy.
The data comes from guidatv.libero.it (subpage of an italian portal).  
The grabber relies on parsing HTML so it might stop working at any time.

First run B<tv_grab_it --configure> to choose which channels you want
to download. Then running B<tv_grab_it> with no arguments will output
listings in XML format to standard output.

B<--configure> Prompt for which channels,
and write the configuration file.

B<--config-file FILE> Set the name of the configuration file, the
default is B<~/.xmltv/tv_grab_it.conf>.  This is the file written
by B<--configure> and read when grabbing.

B<--output FILE> write to FILE rather than standard output.

B<--days N> grab N days.  The default is 7.

B<--offset N> start N days in the future.  The default is to start
from today.

B<--quiet> suppress the progress messages normally written to standard
error.

B<--slow> downloads more details (descriptions, actors...). This means 
downloading a new file for each programme, so itE<39>s off by default to
save time.

B<--help> print a help message and exit.

=head1 SEE ALSO

L<xmltv(5)>.

=head1 AUTHOR

Davide Chiarini, pinwiz@inwind.it.  Based on tv_grab_sn by Stefan G:orling.

=head1 BUGS

'Actors' are not always actors. Due to the way the site is made, we cannot
tell actors from show hosts and the like.

The data source does not include full channels information and the
channels are identified by short names rather than the RFC2838 form
recommended by the XMLTV DTD.

=cut

######################################################################
# initializations
use strict;
use XMLTV::Version '$Id: tv_grab_it.in,v 1.27 2004/05/23 16:23:13 epaepa Exp $ ';

use HTML::Entities;
use HTML::Parser;
use URI::Escape;
use Getopt::Long;
use Date::Manip;
use Memoize;
use XMLTV;
use XMLTV::Memoize;
use XMLTV::Ask;
use XMLTV::Config_file;
use XMLTV::DST;
use XMLTV::Get_nice;
# Todo: perhaps we should internationalize messages and docs?
use XMLTV::Usage <<END
$0: get Italian television listings in XMLTV format
To configure: $0 --configure [--config-file FILE]
To grab listings: $0 [--config-file FILE] [--output FILE] [--days N]
        [--offset N] [--quiet] [--slow]
END
  ;

# Use Log::TraceMessages if installed.
BEGIN {
    eval { require Log::TraceMessages };
    if ($@) {
    *t = sub {};
    *d = sub { '' };
    }
    else {
    *t = \&Log::TraceMessages::t;
    *d = \&Log::TraceMessages::d;
    Log::TraceMessages::check_argv();
    }
}

#max days on the server
my $MAX_DAYS=7;
# default language
my $LANG="it";
# base url for info
my $domain = 'guidatv.libero.it';
my $base="http://$domain/canali.phtml";
my $rturl="http://$domain/";

######################################################################
# Use Term::ProgressBar if installed.
use constant Have_bar => eval { require Term::ProgressBar; 1 };

######################################################################
# get options
# Get options, including undocumented --cache option.

my $func_name = 'XMLTV::Get_nice::get_nice_aux';
XMLTV::Memoize::check_argv($func_name) # cache on disk
  or memoize($func_name)               # cache in memory
  or die "cannot memoize $func_name: $!";

my ($opt_days,
    $opt_offset,
    $opt_help,
    $opt_output,
    $opt_slow,
    $opt_configure,
    $opt_config_file,
    $opt_quiet,
    $opt_share,
   );

$opt_days   = $MAX_DAYS;   # default
# server only holds 7 days, so if there is an offset days must be
# opt_days-offset or less.
$opt_offset = 0;   # default
$opt_quiet  = 0;   # default
$opt_slow   = 0;   # default
GetOptions('days=i'        => \$opt_days,
       'offset=i'      => \$opt_offset,
       'help'          => \$opt_help,
       'configure'     => \$opt_configure,
       'config-file=s' => \$opt_config_file,
       'output=s'      => \$opt_output,
       'quiet'         => \$opt_quiet,
       'slow'      => \$opt_slow,
       'share=s'       => \$opt_share,       # undocumented
      )
  or usage(0);
die 'number of days must not be negative'
  if (defined $opt_days && $opt_days < 0);
usage(1) if $opt_help;

# share/ directory for storing channel mapping files.  This next line
# is altered by processing through tv_grab_it.PL.  But we can use the
# current directory instead of share/tv_grab_it for development.
#
# The 'source' file tv_grab_it.in has $SHARE_DIR undef, which means
# use the current directory.  In any case the directory can be
# overridden with the --share option (useful for testing).
#
my $SHARE_DIR='/usr/share/xmltv'; # by grab/it/tv_grab_it.PL
$SHARE_DIR = $opt_share if defined $opt_share;
my $OUR_SHARE_DIR = (defined $SHARE_DIR) ? "$SHARE_DIR/tv_grab_it" : '.';
(my $CHANNEL_NAMES_FILE = "$OUR_SHARE_DIR/channel_ids") =~ tr!/!/!s;

# File that stores which channels to download.
my $config_file
  = XMLTV::Config_file::filename($opt_config_file, 'tv_grab_it', $opt_quiet);

if ($opt_configure) {
    XMLTV::Config_file::check_no_overwrite($config_file);
}

my %skipchannel;

######################################################################
# write configuration
if ($opt_configure) {
    my $content =  get_nice("$base");

    if (!defined($content)) {
        die "Can't download $base!!\n";
        }
    open(CONF, ">$config_file") or die "cannot write to $config_file: $!";
    # find list of available channels
    my $bar = new Term::ProgressBar('getting list of channels', 1)
      if Have_bar && not $opt_quiet;
    my %channels=get_channels_list($content);
    die "no channels could be found" if (scalar(keys(%channels))==0);
    update $bar if Have_bar && not $opt_quiet;

    # Ask about each channel.
    my @chs = sort keys %channels;
    my @names = map { $channels{$_} } @chs;
    my @qs = map { "add channel $_?" } @names;
    my @want = askManyBooleanQuestions(1, @qs);
    foreach (@chs) {
    my $w = shift @want;
    warn("cannot read input, stopping channel questions"), last
      if not defined $w;
    # No need to print to user - XMLTV::Ask is verbose enough.
 
    # Print a config line, but comment it out if channel not wanted.
    print CONF '#' if not $w;
    my $name = shift @names;
        print CONF "channel $_\n";
    }

    close CONF or warn "cannot close $config_file: $!";
    say("Finished configuration.");

    exit();
}

######################################################################
# read configuration
my (%channels, @channels, $ch_did, $ch_name);
my $line_num = 0;
foreach (XMLTV::Config_file::read_lines($config_file)) {
    ++ $line_num;
    next if not defined;
    if (/^channel:?\s+(.*)/) {
    $ch_did = $1;
    push @channels, $ch_did;
    }
    else {
    warn "$config_file:$line_num: bad line\n";
    }
}

# Tables to convert between Latele.it and XMLTV ids of channels.
my (%xmltv_chanid, %seen);
$line_num = 0;
foreach (XMLTV::Config_file::read_lines($CHANNEL_NAMES_FILE, 1)) {
    ++ $line_num;
    next unless defined;
    my $where = "$CHANNEL_NAMES_FILE:$line_num";
    my @fields = split /;/; #because we have a 16:9 channel
    die "$where: wrong number of fields"
      if @fields != 2;

    my ($xmltv_id, $lt_id) = @fields;
    warn "$where: lt id $lt_id seen already\n"
      if defined $xmltv_chanid{$lt_id};
    $xmltv_chanid{$lt_id} = $xmltv_id;
    warn "$where: XMLTV id $xmltv_id seen already\n"
      if $seen{$xmltv_id}++;
}

######################################################################
# begin main program
my %w_args;
if (defined $opt_output) {
    my $fh = new IO::File(">$opt_output");
    die "cannot write to $opt_output: $!" if not defined $fh;
    $w_args{OUTPUT} = $fh;
}
$w_args{encoding} = 'ISO-8859-1';
my $w = new XMLTV::Writer(%w_args);

$w->start({ 'source-info-url'     => "http://$domain/",
        'source-data-url'     => "http://$domain/canali.phtml",
        'generator-info-name' => 'XMLTV',
        'generator-info-url'  => 'http://membled.com/work/apps/xmltv/',
        });


#make a list of the urls to grab, based on date and channel name
my @to_get;
my $url;
my $days2get;
if (($opt_days+$opt_offset) > $MAX_DAYS) {
    $days2get=$MAX_DAYS-$opt_offset;
    warn "The server only has info for $MAX_DAYS days from today.\n";
    if ($opt_offset > $MAX_DAYS) {
    warn "Day offset too big.\n";
    }
    else {
    warn "You'll get listings for only ".($MAX_DAYS-$opt_offset)." days.\n";
    }
}
else {
    $days2get=$opt_days;
}
t "will get $days2get days from $opt_offset onwards";

my $bar2 = new Term::ProgressBar('getting icons', scalar @channels)
  if Have_bar && not $opt_quiet;
foreach my $ch_id (@channels) {
    my $ch_xid=xmltv_chanid($ch_id);

    foreach my $day ($opt_offset .. $days2get + $opt_offset - 1) {
    $url=$base
      ."?giorno2=".uri_escape(url_date($day))
        .'&channel='.url_channel($ch_id."&x=13&y=12");
    push @to_get, [$url, $ch_xid, $day];
    t "will get $ch_xid for day $day";
    }
    #we have to grab one page per channel just to get the icon, but it doesn't
    #matter since we're memoizing anyway
    $w->write_channel({
               id => $ch_xid,
               'display-name' => [ [ $ch_id ] ],
               icon => [{src => get_icon($url)}]
              });
    update $bar2 if Have_bar && not $opt_quiet;
}

my $bar = new Term::ProgressBar('getting listings', scalar @to_get)
  if Have_bar && not $opt_quiet;
foreach (@to_get) {
    my $canale= $_->[1];
    $url   = $_->[0];
    my $data  = $_->[2];

    #following line is useful for debugging
    #warn "now doing $canale\n";

    unless ($skipchannel{$canale}) {
    my $content =  get_nice($url);
    my @dati;
    @dati = parse_page($content, $canale, $data)
      if page_check($content);
    if (not @dati) {
        if ($data==0) {
        warn "\nChannel $canale, no listings on day 0, skipping other days...\n".
          "Might have disappeared from server, you probably want to remove it from configuration file.\n";
        $skipchannel{$canale} = 1;
        }
        else {
        warn "\nNo listings found for channel $canale, day $data\n";
        }
    }
    $w->write_programme($_) foreach @dati;
    }
    #           else {warn "skipping $canale \n";}
    update $bar if Have_bar && not $opt_quiet;
}
$w->end;

######################################################################
# subroutines

####################################################
# page_check
# sometimes the website returns a valid page, but with no listing, so we check for that
# alle volte il sito ritorna una pagina senza risultati
# controlliamo che sia ok
sub page_check {
my $content2check = shift;
 if ($content2check=~/Torna a trovarci domani oppure continua la ricerca per/) {
     return 0;
     }
     else {
         return 1;}
}


####################################################
# xmltv_chanid
# to handle channels that are not yet in the channel_ids file
sub xmltv_chanid {
    my $channel_id = shift;

    if (defined $xmltv_chanid{$channel_id}) {
        return $xmltv_chanid{$channel_id};
        }
    else {
        warn "***Channel $channel_id is not in channel_ids, should be updated.\n";
        $channel_id=~ s/\W//gs;
        return lc($channel_id).".$domain";
    }
}

####################################################
# xmltv_date
# this returns a date formatted like 20021229121300 CET
# first argument is time (like '14:20')
# second is date offset from today
sub xmltv_date {
    my $time = shift;
    my $time_offset = shift;

    $time =~/(.*):(.*)/ or die "bad time $time";
    my $hour=$1; my $min=$2;
    my $data=&DateCalc("today","+ ".$time_offset." days");
    die 'date calculation failed' if not defined $data;
    return utc_offset(UnixDate($data, '%Y%m%d').$hour.$min.'00', '+0100');
}

####################################################
# url_channel
#tiny url encoding for channel names, where spaces become '+' and +'s become '%2B
#stranamente gli spazi diventano '+'
#e i + diventano %2B
sub url_channel {
    my $channel = shift;
    $channel=~ s/\+/%2B/gs;
    $channel=~ s/ /\+/gs;
    return $channel;
}

####################################################
# url_date
# argument is offset from today
# formats today+offset's date like DD/MM/YY 
sub url_date {
    my $time_offset = shift;
    my $data=&DateCalc("today","+ ".$time_offset." days");
    die 'date calculation failed' if not defined $data;

    return UnixDate($data, '%d/%m/%y');
}

####################################################
# dom
# returns day of month + offset
# used to get episode titles in prog_parse
sub dom {
    my $time_offset = shift;
    my $data=&DateCalc("today","+ ".$time_offset." days");
    die 'date calculation failed' if not defined $data;
    return UnixDate($data, '%d');
}

####################################################
# get_channels_list
# parses $content to get channel list
# puts it in a hash. channel id is really just the channel name without spaces
sub get_channels_list {
    my $content = shift;
    my %chan_hash;

    my $chop_start="Seleziona il canale</option>";
    my $chop_end="</select>";
    #takes out everything we don't want
    $content =~ /\Q$chop_start\E..(.*?)(\Q$chop_end\E)/s;
    $content = $1;

    my @channels = split /\n/, $content;

    #we want just the names
    foreach $a (@channels) {
        $a =~ />([^<]*)</s;
        $a = $1;
        $chan_hash{$a}=$a;
        }

    return %chan_hash;
}

# Convert channel ids on the site to XMLTV channel ids.
sub id_to_xid { lc($_[0]) . ".$domain" }

####################################################
# this is the main parsing subroutine
# vars needed for parsing
my ($in_time_start, $in_title, $in_extras, $premiere, $prev_shown, $star_value,
    $title, $chan, $link, $time_start, $category, $category2, $txt, $dd, $orig_lang);
my @programmes;
my %programme;

####################################################
# parse page
# takes 3 arguments: $content of the page, $channelid that were parsing and $dateoffset from today
# returns an array of programme datas (see perldoc XMLTV)
sub parse_page {
my $content = shift;
   $chan    = shift;
   $dd = shift;
   @programmes = (); #just to make sure
   $in_time_start = 0;
   $in_title = 0;
   $in_extras = 0;
   $premiere = 0;
   $prev_shown = 0;
   $star_value = undef;
   $orig_lang = 0;

my $chop_start="/i/ico_giorno_successivo.gif";
my $chop_end="<!-- nb: width e height";
#questo mi toglie tutto quello che non ' la tabella
$content =~ /\Q$chop_start\E(.*)\Q$chop_end\E/s;
$content =$1;

 my $p = HTML::Parser->new(api_version => 3);
    $p->handler( start => \&start_handler, "tagname, attr");
    $p->handler(text => \&text_handler, "dtext");
    $p->handler( end => \&end_handler, "tagname");
    $p->unbroken_text(1);
    $p->parse($content);

  return @programmes;
}

sub start_handler {
    my ($tagname, $attr) = @_;
     if ($tagname eq 'td') {
         if (defined $attr->{class}) {
             if ($attr->{class} eq 'txt2-b3') {
                 $in_time_start=1; }
             elsif ($attr->{class}=~/g-(.*)/) {
                 $category=$1; }
         }
     }
     if ($tagname eq 'a') {
         $in_title=1;
         $attr->{href}=~/Full\('(.*)',0/;
         $link=$1;
     }
     if (($in_extras) && ($tagname eq 'img')) {
        SWITCH: for ($attr->{src}) {
                /(.*)prima/      && do {$premiere=1; last;};
                /(.*)replica/    && do {$prev_shown=1; last;};
                /.*stella(.)/    && do {$star_value=$1; last;};

        #double audio, not in xmltv yet, ignoring...
        /.*doppio_audio/ && do {last;};
        
        /.*lingua_originale/ && do {$orig_lang=1; last;};    #original language
        /.*live/ && do {$category2="Live"; last;};           #live (in sports events)
        /.*differita/ && do {$category2="Differita"; last;}; #delayed (sports)
        #i know there are others, but they are uncommon and cannot add them till i see them!
        warn "unhandled extra attribute found: ".$attr->{src}." \n";
            }
         }
} #start_handler

sub text_handler {
   ($txt) = @_;
   if ($in_time_start==1) { 
       $time_start = $txt;
       $in_time_start=0;}
   elsif ($in_title==1) {
       $title=$txt;} 
} #text handler

sub end_handler {
    my ($tagnome, $attr) = @_;
    if (($in_title==1) && ($tagnome eq 'a')){
        $in_title=0;
        $in_extras=1;
    }

    #we're done with the programme, collect data
    if (($in_extras==1) && ($tagnome eq 'td')){
    # Three mandatory fields: title, start, channel.
    if (not defined $title) {
        warn 'no title found, skipping programme';
        goto FAILED;
    }
        $programme{title}=[[tidy($title), $LANG] ];
    if (not defined $time_start) {
        warn "no start time for title $title, skipping programme";
        goto FAILED;
    }
        $programme{start}=xmltv_date($time_start, $dd);
    if (not defined $chan) {
        warn "no channel for programme $title at $time_start, skipping programme";
        goto FAILED;
    }
        $programme{channel}="$chan";

    $programme{category}=[[tidy($category), $LANG ]]
      if defined $category;
    $programme{_link}="$rturl$link"
      if defined $link;
    
    push (@{$programme{category}}, [tidy($category2), $LANG ])
      if defined $category2;

    # Star value could be zero stars but still 'defined'.
    $programme{'star-rating'}=[$star_value]
      if defined $star_value;
    
    $programme{premiere}=[] if $premiere;

        # Workaround because 'unknown orig-language' is not
        # officially part of the file format.
    $programme{'orig-language'}=['unknown'] if $orig_lang;

        #we don't know when it was previously shown
    $programme{'previously-shown'}->{channel}="$chan" if $prev_shown;
    
        #following line is useful for debugging
        #warn "now parsing $title on $chan, day $dd, time $time_start\n";
        if ($opt_slow) {
            my $content2 =  get_nice($programme{_link});
            prog_parse($content2, \%programme, dom($dd), $time_start);
        }
        #put info in array
        push @programmes, {%programme};

      FAILED:
        #reset vars for next channel;
        $in_extras = 0; $in_title = 0; $in_time_start = 0;
        $premiere = 0;  $prev_shown = 0; $orig_lang = 0;
        ($star_value, $title, $link, $category2, $time_start, $category)
          = (); # set to undef
        %programme = ();
    }
} #end_handler;


##########################################################
# prog_parse
# it parses subpages to get more info about the programmes
# (descriptions, actors, directors, more categories, year, country)
# first argument is content of the page
# second is the hash to wich we add info 
# third is day of month, used to get episode titles when they are available
# 4th is the time i figured it was needed to avoid some problems
sub prog_parse{
    my ($c, $prog_hash, $dayofmonth, $time) = @_;

#let's divide content for easier parsing;

    my $chop1='cellpadding=0 width=400>';
    my $chop2="class=txt2>";
    my $chop3='<script>if [(]!ie[)]document.write[(]"</table>"[)];</script>';

$c =~ /$chop1(.*)$chop2(.*?)($chop3{1}).*<table border(.*)<script>if /s;

    $c =$2; my $part2 = $4;
    my $cast=tidy($1); 
    $cast=~/<br>(.*)<\/td><\/tr>/s;
    $cast=$1;

    $chop1='</b><br>';
    $chop2='</td>';

$c =~ /<b> (.*) $chop1...(.*?)($chop2{1})/s;

    my $description = tidy($2) if ($2 ne "");
    my $category_2 = tidy($1) if ($1 ne "");

    $part2 =~ m%$dayofmonth ...</td>.*?$time(</td><td width=438{1})(.*?)(</td>{1})%s;
    $part2 =$2;
    $part2 =~ />(.*)/s;
    
    my $subtitle=tidy($1) if ($1 ne "");

    my @cast = split /<br>/, $cast;
    foreach  (@cast) {
        if (defined $_) {
        if (/^Regista: (.*)/) {
            push @{$prog_hash->{credits}->{director}}, tidy($1);}
            elsif (! /Cast/){
                if (/^\((.*) (.*)\)$/){
                    if ((defined $1) && ($1 ne "")) {
                        my @countries = split /,/, $1;
                        foreach $a (@countries) {
                            push (@{$prog_hash->{country}},
                                  [tidy($a), $LANG]);
                            }
                    }
                    if ((defined $2) && ($2 ne "")) {$prog_hash->{date}=$2;}
                    }
                else {(push @{$prog_hash->{credits}->{actor}}, tidy($_))
                    unless (/^\s.*/
                            || /colspan=/
                            || /Cerca nella banca dati/
                            || /\.iol\.it/
                            );}
                }
        }
     }
    $prog_hash->{'sub-title'}=[[$subtitle, $LANG] ] if defined $subtitle;
    $prog_hash->{desc}=[[$description, $LANG] ] if defined $description;
    push (@{$prog_hash->{category}} , [$category_2, $LANG ])
        if defined $category2;
}

##########################################################
# tidy
# decodes entities and removes some illegal chars
my $warned_bad_chars;
sub tidy($) {
    for (my $tmp=shift) {
    s/\`/\'/g;          # i really don't know why the site uses ` instead of '    
    s/[\000-\037]//g; # remove control characters   

    #this is to fix some messed up chars
    s/\342\200[\230\231]/\'/g;  # apostrophe
    s/\342\200[\234\235]/\"/g;   
    s/\342\200\246//g;      # ?? no idea
    s/\342\200\223/-/g;     # probably a dash
    s/\303\210/\310/g;      # uppercase e grave
    s/\303\244/\344/g;      # lowercase a with diaeresis
    s/\303\247/\347/g;      # lowercase c with cedilla
    s/\303\274/\374/g;      # lowercase u with diaerisis
    s/\303\246/\346/g;      # lowercase ae
    s/\303\245/\345/g;      # lowercase a with circle
    s/\303\252/\352/g;      # lowercase e with circumflex
    s/\303\211/\311/g;      # uppercase e with acute
    s/\303\226/\326/g;      # uppercase o with diaeresis
    s/\303\261/\361/g;      # lowercase n with tilde
    s/\303\266/\366/g;      # lowercase o with diaeresis
    s/\303\253/\353/g;      # lowercase e with diaeresis
    s/\303\277/\377/g;      # lowercase y with diaeresis

    #there might be others
    if (/([\342\303])(.)(.)/) { 
        warn "Probable messed up char found: (".ord($1).")(".ord($2).")(".ord($3).")\n";
        warn "while parsing $title on $chan, day $dd, time $time_start\n";
    }

    if (s/[\200-\237]//g)
        { warn "removing illegal char: |\\".ord($&)."|\n";
          warn "while parsing $title on $chan, day $dd, time $time_start\n";
        }

    # Just to make sure...
    if (tr/\012\015\040-\176\240-\377//dc) {
	warn 'removing bad characters' unless $warned_bad_chars++;
    }

    return decode_entities($_);
    }
}

##########################################################
# get_icon
# grab channel icon from html page
sub get_icon {
    my $content=get_nice(shift);
    $content=~/src="(statiche.*?)"/;
    return $rturl.$1;
}
