#!/usr/bin/env python

import sys
import urllib
import re
import time
import sys
import getopt
import string
import os
import stat

# Option tables handed to getopt further down.
shortargs = 'hn:qo:cf:'
longargs = [
    'help', 'version', 'days=', 'quiet', 'output=', 'configure',
    'offset=', 'config-file=', 'share=',
]

# getopt cannot describe an option whose argument is optional, so the
# obsolete --cache option (and the value that may follow it) is filtered
# out of the command line by hand before getopt sees it.
#
argv = []
pending = sys.argv[1:]
while pending:
    word = pending.pop(0)
    if word != '--cache':
        argv.append(word)
        continue
    sys.stderr.write("ignoring --cache option\n")

    # Swallow the following word as well, unless there is none or it
    # looks like another option.
    if pending and not pending[0].startswith('-'):
        pending.pop(0)

# Default config file location.  Look in ~/.xmltv/ if possible, else
# current directory.  Migrate from the old filename if necessary.
# This happens even if later on the --config-file option changes the
# filename.
#
# FIXME this is duplicate code, in a way, since it rewrites in Python
# what Config_file.pm provides.
#
# Pick the directory holding the configuration: ~/.xmltv when HOME is set
# and the directory exists or can be created, otherwise the current
# directory.
home_dir = os.environ.get('HOME')
config_dir = None
if home_dir:
    config_dir = home_dir + "/.xmltv"
    if not os.path.exists(config_dir):
        try:
            os.makedirs(config_dir)
        except OSError:
            # An unwritable HOME must not stop even --help from working;
            # fall back to the current directory instead.
            sys.stderr.write("cannot create %s (%s), using current directory\n"
                             % (config_dir, sys.exc_info()[1]))
            config_dir = None

if config_dir is not None:
    config_file = config_dir + "/tv_grab_nz.conf"
    old_config_file = config_dir + "/tv_grab_nz"
else:
    config_file = "tv_grab_nz.conf"
    old_config_file = "tv_grab_nz"

# Migrate a configuration saved under the old name, but only when it is a
# regular file and nothing exists under the new name yet.
if (not os.path.exists(config_file)) and os.path.isfile(old_config_file):
    sys.stderr.write("migrating config file %s -> %s\n"
                     % (old_config_file, config_file))
    try:
        os.rename(old_config_file, config_file)
    except OSError:
        sys.stderr.write("could not migrate %s: %s\n"
                         % (old_config_file, sys.exc_info()[1]))

def version():
    """Write the program name, revision and date to standard output.

    The values are parsed out of the embedded CVS ``$Id$`` keyword.  If the
    keyword cannot be parsed, a note goes to standard error and the raw
    keyword string is printed instead.
    """
    # FIXME this duplicates some of XMLTV::Version.
    v = "$Id: tv_grab_nz.in,v 1.13 2003/12/07 21:56:25 epaepa Exp $"
    # Raw string: the pattern contains regex escapes (\$, \S) that are not
    # Python string escapes.
    m = re.search(r'\$Id: ([^,]+),v (\S+) ([0-9/: -]+)', v)
    if m:
        sys.stdout.write("This is %s version %s, %s\n"
                         % (m.group(1), m.group(2), m.group(3)))
    else:
        sys.stderr.write("strange, couldn't parse version string\n")
        sys.stdout.write("This is %s\n" % v)

def usage():
    """Write the command-line summary to standard error.

    The text names the program as it was invoked (sys.argv[0]) and shows
    the current value of the module-level config_file as the default.
    """
    option_help = """ [OPTION]...
   -h, --help         display this message
   -c, --configure    configure the output options
   --config-file=F    where to store/read configuration data (default: %s)
   -q, --quiet        quiet output
   -n, --days=N       get listing for the next N days (default: 7)
   -o, --output=FILE  output to FILE rather than stdout (default: stdout)
   -f, --offset=N     skip N days of listings (from today) (default: 0)
""" % config_file
    # The program name is concatenated rather than formatted in, so a '%'
    # in it can never be taken as a format directive.
    sys.stderr.write("Usage: " + sys.argv[0] + option_help)

# Parse the (already --cache-filtered) command line.  Any getopt failure or
# stray positional argument prints the usage text and exits with status 2.
try:
    opts, args = getopt.getopt(argv, shortargs, longargs)
except getopt.error:
    # Say what was wrong before showing the usage text.
    sys.stderr.write("%s\n" % sys.exc_info()[1])
    usage()
    sys.exit(2)
if args:
    sys.stderr.write("unknown arguments: %s\n" % ' '.join(args))
    usage()
    sys.exit(2)

# Defaults, overridden by the options below.
ndays = 7
quiet = 0
offset = 0
output_file = None
explicit_config_file = 0 # used only for printing a message
configure = 0
share = None
for o, a in opts:
    if o in ("-h", "--help"):
        usage()
        sys.exit()
    elif o == '--version':
        version()
        sys.exit()
    elif o in ("-n", "--days", "-f", "--offset"):
        # Both options take an integer; report a bad value as a usage
        # error rather than dying with a ValueError traceback.
        try:
            number = int(a)
        except ValueError:
            sys.stderr.write("option %s needs an integer, not '%s'\n" % (o, a))
            usage()
            sys.exit(2)
        if o in ("-n", "--days"):
            ndays = number
        else:
            offset = number
    elif o in ("-q", "--quiet"):
        quiet = 1
    elif o in ("-c", "--configure"):
        configure = 1
    elif o == "--config-file":
        config_file = a
        explicit_config_file = 1
    elif o in ("-o", "--output"):
        output_file = a
    elif o == "--share":
        share = a
    else:
        # getopt returned an option this loop does not know about: the
        # option tables and this chain are out of step.  Raised explicitly
        # because a bare assert disappears under python -O.
        raise AssertionError("unhandled option %s" % o)

# share/ directory for storing the file of programme titles that can
# legally contain colons.  This next line is altered by processing
# through tv_grab_nz.PL.  But we can use the current directory instead
# of share/tv_grab_nz for development.
#
# The 'source' file tv_grab_nz.in has SHARE_DIR=None, which means use
# the current directory.  In any case the directory can be overridden
# with the --share option (useful for testing).
#
SHARE_DIR='/usr/share/xmltv' # by grab/nz/tv_grab_nz.PL

# A --share option on the command line wins over the installed location.
if share is not None:
    SHARE_DIR = share

# No share directory at all means "use the current directory".
if SHARE_DIR is None:
    OUR_SHARE_DIR = "."
else:
    OUR_SHARE_DIR = SHARE_DIR + "/tv_grab_nz"
SPECIAL_EPISODE_NAMES_FILE = OUR_SHARE_DIR + "/episode_names"

# Only mention the config filename when the user did not choose it and has
# not asked for quiet operation.
if not (explicit_config_file or quiet):
    sys.stderr.write("using config filename %s\n" % config_file)

if configure:
    # Interactive configuration: ask which channels to grab, write them to
    # config_file as a single "channels=..." line, then exit.
    #
    # FIXME this duplicates code in Config_file.pm - but in Python!
    if os.path.exists(config_file):
        sys.stderr.write(
"""The configuration file %s already exists.  There is
currently no support for altering an existing configuration, you have
to reconfigure from scratch.

Do you wish to overwrite the old configuration (y/N)? """
        % (config_file))
        sys.stderr.flush()
        # Default is "no": anything but an explicit yes (including end of
        # input) leaves the existing file alone.
        answer = sys.stdin.readline().strip().lower()
        if answer not in ("y", "yes"):
            sys.stdout.write("Exiting.\n\n")
            sys.exit(0)

    channels = []
    for c in ['tv1', 'tv2', 'tv3', 'tv4', 'primetv']:
        sys.stdout.write('Do you wish to download %s (Y/n)? ' % (c))
        # The prompt has no newline, so flush it to make sure it is visible
        # before blocking on the read.
        sys.stdout.flush()
        answer = sys.stdin.readline()
        # Default is "yes": an empty line selects the channel.  End of
        # input (readline() returning '') is not an empty line and selects
        # nothing.
        if answer != '' and answer.strip().lower() in ('', 'y', 'yes'):
            channels.append(c)

    f = open(config_file, 'w')
    try:
        f.write('channels=')
        for c in channels:
            f.write('%s ' % (c))
        f.write('\n')
    finally:
        # Close the handle even when a write fails.
        f.close()
    sys.stdout.write("All done, run with no arguments to grab a week's listings.\n\n")
    sys.exit()

# Not configuring, must be grabbing.  Check the preconditions first; each
# failed check writes its message to standard error and exits with status 1.
def _fail(message):
    """Write message to standard error and exit with status 1."""
    sys.stderr.write(message)
    sys.exit(1)

if not os.path.exists(config_file):
    _fail("config file %s does not exist, run me with --configure\n" % (config_file))

# The requested window [offset, offset+ndays) may not reach past day 7.
if offset >= 7:
    _fail("Cannot skip more than a week ahead\n")

if offset + ndays > 7:
    _fail("Cannot get more than 7 days ahead\n")

class TvGuide:
    """Download TV listings from web pages and write them in XMLTV format.

    Channels are registered with add_channel(); display_program_xml() then
    downloads every registered channel for every date and writes the
    resulting document.

    A programme is represented as a 12-element list:
    [channel id, title, start, stop, description, rating, date,
     categories, repeat, subtitled, part, episode].
    """

    def __init__(self, dates, quiet, episode_names_file=None):
        """Set up an empty guide.

        dates is a list of days, each given in seconds since the Epoch
        (as returned by time.time()), for which listings are downloaded.
        quiet is a boolean; when true, progress output is suppressed
        (error messages are always written).
        episode_names_file names a file with one programme title per
        line; titles starting with one of them are never split into
        title and episode.  It defaults to the module-level
        SPECIAL_EPISODE_NAMES_FILE.  A missing file raises IOError.
        """
        self.channels = {}
        self.dates = dates
        self.quiet = quiet

        # Regular Expressions
        self.time_expr1 = re.compile(r"([0-9]+):([0-9]+)\s*([aApP][mM])")
        self.time_expr2 = re.compile(r"([0-9]+):([0-9]+)")
        self.time_expr3 = re.compile(r"([0-9][0-9][0-9][0-9])")
        self.date_expr = re.compile(r"^([\s\S]*)\(([0-9][0-9][0-9][0-9])\)([\s\S]*)$")
        self.comments_expr = re.compile(r"^(.*)\s*\((.*)\)\s*$")
        self.episode_expr = re.compile(r"^(.*)(?:\:|(?: {2,})|\t)(.*)$")

        # Some episode names are bad - eg "Earth: Final Conflict"
        if episode_names_file is None:
            episode_names_file = SPECIAL_EPISODE_NAMES_FILE
        self.specialEpisode_expr = self._load_special_names(episode_names_file)

    def _load_special_names(self, path):
        """Return a compiled pattern matching any title listed in path."""
        f = open(path, 'r')
        try:
            lines = f.readlines()
        finally:
            f.close()

        alternatives = []
        for line in lines:
            name = line.strip()
            # A blank line would become an empty alternative, which
            # matches every title and so switches episode splitting off
            # completely.
            if name:
                alternatives.append(re.escape(name))
        if not alternatives:
            return re.compile(r"(?!)") # matches nothing at all
        return re.compile("(?:" + "|".join(alternatives) + ")")

    def error(self, msg):
        """Write msg and a newline to standard error (even when quiet)."""
        sys.stderr.write("%s\n" % (msg))

    def progress(self, msg):
        """Write msg to standard error unless running quietly."""
        if not self.quiet:
            sys.stderr.write("%s" % (msg))

    def progress_update(self):
        """Write one progress tick ('#') unless running quietly."""
        if not self.quiet:
            sys.stderr.write("#")

    def add_channel(self, domain, name, url_string, regexp, map, deletes):
        """Adds a channel definition, keyed by domain.

        name is the display name.  url_string is a Python expression,
        evaluated inside get_programs(), that yields the page URL.
        regexp is applied to the page with findall(), one match per
        programme.  map lists four group indexes into each match (start
        time, title, description, end time); -1 means "not available".
        deletes is a list of regular expressions removed from the page
        before searching for the matches.
        """
        self.channels[domain] = (name, url_string, regexp, map, deletes)

    def clean_html(self, text):
        """Removes anything from the html that would conflict with
        the XML output: two stray bytes, a few entities and all tags."""
        # Bytes 0x92 and 0x96 were seen on PrimeTV pages (presumably
        # Windows-1252 punctuation); they are dropped.
        text = text.replace("\x92", "")
        text = text.replace("\x96", "")
        text = text.replace("&nbsp;", " ")
        text = text.replace("&#39;", "'")
        # Decode &amp; after the other entities, so that text escaped as
        # "&amp;#39;" is not decoded twice.
        text = text.replace("&amp;", "&")
        return re.sub(r"<[^>]*>", "", text)

    def html_encode(self, text):
        """Escapes the characters XML treats as markup (&, <, >)."""
        text = text.replace("&", "&amp;") # first, or it re-escapes the rest
        text = text.replace("<", "&lt;")
        text = text.replace(">", "&gt;")

        # Not an HTML special character, but nonetheless
        # something to get rid of before writing XML.
        #
        return text.replace("\r", "")

    def gen_time(self, time):
        """Converts from a bunch of different time representations
        into a simple HHMM 24-hour clock representation.

        Understands "H:MM am/pm", "H:MM" and "HHMM".  Anything else is
        reported through error() and yields the empty string."""
        match = self.time_expr1.match(time)
        if match:
            hour = int(match.group(1))
            minute = int(match.group(2))
            meridiem = match.group(3).upper()
            if meridiem == "PM" and hour < 12:
                hour = hour + 12
            if meridiem == "AM" and hour == 12:
                hour = 0
            return "%2.2d%2.2d" % (hour, minute)

        match = self.time_expr2.match(time)
        if match:
            return "%2.2d%2.2d" % (int(match.group(1)), int(match.group(2)))

        match = self.time_expr3.match(time)
        if match:
            return match.group(1)

        self.error("Unable to convert time: '%s'" % (time))
        return ""

    def get_episode(self, title):
        """Splits an episode name off the title.

        The split is made at the last colon, run of two or more spaces,
        or tab.  Titles starting with one of the special episode names
        are left whole.  Returns (title, episode); episode is "" when
        nothing was split off."""
        episode = ""
        episode_match = self.episode_expr.match(title)
        if episode_match and (not self.specialEpisode_expr.match(title)):
            title = episode_match.group(1)
            episode = episode_match.group(2)
        return (title, episode)

    def get_comments(self, title):
        """Removes any comments embedded in the title. These
        are identified as anything at the end of the title
        which is enclosed in '(', ')'
        Returns a tuple consisting of the modified title and
        the comments (with parenthesis removed, joined by commas
        in their original order)."""
        found = []
        match = self.comments_expr.match(title)
        while match:
            title = match.group(1)
            # Comments are peeled off right to left.
            found.insert(0, match.group(2))
            match = self.comments_expr.match(title)
        return (title, ",".join(found))

    def is_repeat(self, comment):
        """Returns true if comment marks the programme as a repeat."""
        return comment.upper() in ['RPT', 'REPEAT']

    def is_subtitled(self, comment):
        """Returns true if comment marks the programme as subtitled."""
        return comment.upper() in ['SUB-TITLED', 'SUBTITLED']

    def is_part(self, comment):
        """Returns true if comment names a part of a series: it contains
        'Part ' or 'Parts ', or has the short form 'P<number>' that
        display_program_xml_program() also understands."""
        if comment.find('Part ') != -1 or comment.find('Parts ') != -1:
            return True
        return re.match(r"P\d+$", comment) is not None

    def is_category(self, comment):
        """Returns true if comment is a known category, or false
        otherwise"""
        return comment.upper() in ['LIVE']

    def is_rating(self, comment):
        """Returns true if comment is a known rating, or false
        otherwise"""
        return comment.upper() in ['G', 'PG', 'PGR', 'AO']

    def is_year(self, comment):
        """Returns true if comment is a four-digit year without a
        leading zero."""
        return len(comment) == 4 and comment.isdigit() and comment[0] != '0'

    def get_date(self, title, desc):
        """Analyses the title and description to determine if there
        is an embedded date - four digits in parentheses. Returns a
        tuple (title, desc, date) with the date removed from the text
        if there is one, or None otherwise.  The title is searched
        first."""
        match = self.date_expr.match(title)
        if match:
            return (match.group(1) + match.group(3), desc, match.group(2))
        match = self.date_expr.match(desc)
        if match:
            return (title, match.group(1) + match.group(3), match.group(2))
        return None

    def _fetch_page(self, name, url):
        """Return the body of url, or None (after reporting) on failure."""
        try:
            f = urllib.urlopen(url)
        except IOError:
            self.error("Error downloading program listing for %s, URL %s: %s"
                       % (name, url, repr(sys.exc_info()[1])))
            return None
        if f is None:
            self.error("Error downloading program listing for " + name)
            return None
        try:
            return f.read()
        finally:
            f.close()

    def _classify_comments(self, comments, program, domain):
        """Sort the comma-separated comments of one title into fields.

        Returns (rating, repeat, subtitled, part, categories,
        create_date).  Comments that fit no field are reported through
        error() and dropped."""
        rating = None
        repeat = ""
        subtitled = ""
        part = ""
        categories = []
        create_date = ""
        for comment in comments.split(","):
            comment = comment.strip()
            if comment == "":
                continue
            if self.is_rating(comment):
                rating = comment
            elif self.is_repeat(comment):
                repeat = comment
            elif self.is_subtitled(comment):
                subtitled = comment
            elif self.is_part(comment):
                part = comment
            elif self.is_category(comment):
                categories.append(comment)
            elif self.is_year(comment):
                create_date = comment
            else:
                self.error("Invalid comment: " + comment + " from " + comments
                           + ", " + program + " on channel " + domain + "\n")
        return (rating, repeat, subtitled, part, categories, create_date)

    def _parse_show(self, domain, show, field_map, date, day):
        """Turn one regexp match into a programme list.

        date is the YYYYMMDD string and day the matching time tuple of
        the page being parsed.  Returns None when the start time cannot
        be understood, so that the programme is skipped."""
        starttime = ""
        starttime_day = ""
        endtime = ""
        program = ""
        episode = ""
        desc = ""
        rating = None
        categories = []
        repeat = ""
        subtitled = ""
        part = ""
        create_date = ""

        if field_map[0] != -1:
            starttime_day = self.gen_time(self.clean_html(show[field_map[0]]))
            if starttime_day == '':
                self.error('skipping program because of bad start time')
                return None
            starttime = date + starttime_day

        if field_map[1] != -1:
            program = self.clean_html(show[field_map[1]])
            (program, comments) = self.get_comments(program)

            # Turn 'Movie: X' into 'X' with category 'movie'.
            is_movie = program.startswith('Movie: ')
            if is_movie:
                program = program[len('Movie: '):]

            (program, episode) = self.get_episode(program)
            (rating, repeat, subtitled, part, categories,
             create_date) = self._classify_comments(comments, program, domain)
            if is_movie:
                categories.insert(0, 'movie')

            # Titles shouted in capitals are toned down.
            if program.upper() == program:
                program = string.capwords(program)

        if field_map[2] != -1 and show[field_map[2]].strip() != "":
            desc = self.clean_html(show[field_map[2]])

        if field_map[3] != -1:
            endtime_day = self.gen_time(self.clean_html(show[field_map[3]]))
            if endtime_day != "":
                # An end time earlier than the start time means the
                # programme runs past midnight into the next day.
                if starttime != "" and int(endtime_day) < int(starttime_day):
                    endday = time.localtime(time.mktime(day) + 24*60*60)
                    enddate = ("%4.4d%2.2d%2.2d"
                               % (endday[0], endday[1], endday[2]))
                    endtime = enddate + endtime_day
                else:
                    endtime = date + endtime_day

        # A year in parentheses in the title or description wins over a
        # year found among the comments.
        chkdate = self.get_date(program, desc)
        if chkdate:
            (program, desc, create_date) = chkdate

        return [domain, program, starttime, endtime, desc, rating,
                create_date, categories, repeat, subtitled, part, episode]

    def get_programs(self, domain, date):
        """Downloads the page of channel 'domain' for the day 'date'
        (seconds since the Epoch; must be an element of self.dates) and
        extracts all of the program definitions as specified by the
        channel's regular expression & map.

        Returns a list of programme lists; the list is empty when the
        download fails or nothing on the page matches."""
        name, url_string, regexp, map, deletes = self.channels[domain]
        retval = []

        # These local names are the substitutes available to the
        # url_string expression; do not rename them.
        dow_index = self.dates.index(date)
        day = time.localtime(date)
        date = "%4.4d%2.2d%2.2d" % (day[0], day[1], day[2])
        dow = time.strftime("%A", day)
        dow_number = int(day[6]) % 7 # 0-6 Mon-Sun

        # NOTE(review): eval() runs whatever expression add_channel() was
        # given.  That is acceptable only while channel definitions come
        # from this program itself - never pass it external input.
        url = eval(url_string)

        data = self._fetch_page(name, url)
        if data is None:
            return []
        self.progress_update()

        for d in deletes:
            data = re.sub(d, "", data)
        matches = re.findall(regexp, data)
        if len(matches) == 0:
            self.error("Error parsing program listing for " + name
                       + " - perhaps the page has changed\n"
                       + "Url: %s" % (url))
            return []

        for show in matches:
            entry = self._parse_show(domain, show, map, date, day)
            if entry is not None:
                retval.append(entry)
        return retval

    def get_all_programs(self):
        """Downloads the programs for all of the channels and dates"""
        retval = []
        for channel in self.channels.keys():
            for d in self.dates:
                retval.extend(self.get_programs(channel, d))
        return retval

    def display_program_xml(self, output_file):
        """Generates the xmltv output file. If output_file is not None
        then it specifies the location to dump to, otherwise stdout
        is used"""
        self.progress("Downloading listings: ")
        programs = self.get_all_programs()
        self.progress("\n")

        if output_file is None:
            output = sys.stdout
        else:
            output = open(output_file, 'w')
        try:
            # NOTE(review): the document is declared UTF-8 but page text
            # is written as downloaded - confirm the pages' encoding.
            output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
            output.write('<!DOCTYPE tv SYSTEM "xmltv.dtd">\n\n')
            output.write('<tv generator-info-name="NZ python generator">\n')

            for channel in self.channels.keys():
                self.display_channel_xml(channel, output)

            self.progress("Building output: ")
            for prog in programs:
                self.display_program_xml_program(prog, output)
                self.progress_update()
            self.progress("\n")

            output.write('</tv>\n')
        finally:
            # Close a file we opened ourselves, even if writing failed;
            # stdout is left alone.
            if output_file is not None:
                output.close()

    def display_channel_xml(self, domain, output):
        """Writes the <channel> element of channel 'domain' to output."""
        name = self.channels[domain][0]
        output.write('  <channel id="%s">\n' % (self.html_encode(domain)))
        output.write('    <display-name>%s</display-name>\n'
                     % (self.html_encode(name)))
        output.write('  </channel>\n')

    def _episode_info(self, part):
        """Interpret a 'part' comment.

        Returns (episode_num, series_name), at most one of them not
        None.  'Part N', 'Parts N & M' and 'PN' give a zero-based
        episode number; any other non-empty text is taken to be a
        series name."""
        part = part.strip()
        if part == "":
            return (None, None)
        m = re.match(r"[pP]art (\d+)$", part)
        if m:
            return (str(int(m.group(1)) - 1), None)
        # The comment has been through clean_html(), so the ampersand
        # normally arrives decoded; the escaped form is accepted too.
        m = re.match(r"[pP]arts (\d+) &(?:amp;)? (\d+)$", part)
        if m:
            return (str(int(m.group(1)) - 1) + "/" + str(int(m.group(2)) - 1),
                    None)
        m = re.match(r"P(\d+)$", part)
        if m:
            return (str(int(m.group(1)) - 1), None)
        return (None, part)

    def display_program_xml_program(self, program, output):
        """Writes one <programme> element to output.

        program is a programme list as returned by get_programs().
        Child elements are written in the order the XMLTV DTD requires,
        with all text escaped for XML."""
        (channel, title, start, stop, desc, rating, date, categories,
         repeat, subtitled, part, episode) = program
        encode = self.html_encode

        output.write('  <programme start="%s" ' % (start))
        if stop.strip() != "":
            output.write('stop="%s" ' % (stop))
        output.write('channel="%s">\n' % (channel))

        # This bit might be either episode number or series name.
        (episode_num, series_name) = self._episode_info(part)

        if series_name is None:
            output.write('    <title>%s</title>\n' % (encode(title).strip()))
        else:
            output.write('    <title>%s</title>\n'
                         % (encode(series_name).strip()))
            output.write('    <sub-title>%s</sub-title>\n'
                         % (encode(title).strip()))

        if episode.strip() != "":
            output.write('    <sub-title>%s</sub-title>\n'
                         % (encode(episode).strip()))

        if desc.strip() != "":
            output.write('    <desc>%s</desc>\n' % (encode(desc.strip())))

        if date.strip() != "":
            output.write('    <date>%s</date>\n' % (encode(date.strip())))

        for category in categories:
            output.write('    <category>%s</category>\n'
                         % (encode(category.strip())))

        # The DTD places episode-num before previously-shown.
        if episode_num is not None:
            output.write('    <episode-num system="xmltv_ns"> .  . '
                         + episode_num
                         + '</episode-num>\n')

        if repeat.strip() != "":
            output.write('    <previously-shown />\n')

        if subtitled.strip() != "":
            output.write('    <subtitles />\n')

        if rating and rating.strip() != "":
            output.write('    <rating system="NZBSA">\n')
            output.write('      <value>%s</value>\n'
                         % (encode(rating.strip())))
            output.write('    </rating>\n')

        output.write('  </programme>\n')


if __name__ == '__main__':
    # One timestamp per requested day, counted from now.
    now = time.time()
    dates = []
    for i in range(offset, ndays + offset):
        dates.append(now + 60 * 60 * 24 * i)

    # Read the configuration: a single "channels=name name ..." line.
    f = open(config_file, 'r')
    try:
        lines = f.readlines()
    finally:
        # Closed before any sys.exit() below can be reached.
        f.close()

    channels = None
    for l in lines:
        stripped = l.strip()
        if stripped == '' or stripped.startswith('#'):
            # Blank lines and comments are harmless; skip them quietly.
            continue
        if l.startswith('channels='):
            if channels is not None:
                sys.stderr.write("seen 'channels' line twice in %s\n" % config_file)
                sys.exit(1)
            # split() without an argument drops the empty names that
            # repeated or trailing spaces would otherwise produce.
            channels = l[len('channels='):].split()
        else:
            sys.stderr.write("%s: bad line %s\n" % (config_file, l.rstrip()))
    if not channels:
        sys.stderr.write("did not see any 'channels' line in %s\n" % config_file)
        sys.exit(1)

    guide = TvGuide(dates, quiet)
    nzoomregexp = r"<B>([^<]*)</B>(</A>)?\s*([0-9:]*) - ([0-9:]*)(<br>([^<]*))?\s*</td></tr>"
    nzoomorder = [2, 0, 5, 3]

    # (config name, channel id, display name, nzoom schedule page number)
    known_channels = [
        ('tv1', "tvone.nzoom.com", "TV1", '1'),
        ('tv2', "tv2.nzoom.com", "TV2", '2'),
        ('tv3', "tv3.co.nz", "TV3", '3'),
        ('tv4', "tv4.co.nz", "TV4", '4'),
        ('primetv', "primetv.co.nz", "Prime", '9'),
    ]
    known_names = []
    for (config_name, domain, display_name, page) in known_channels:
        known_names.append(config_name)
        if config_name in channels:
            # The URL is handed over as a Python expression; TvGuide
            # evaluates it with 'dow' bound to the day-of-week name.
            url_string = ('"http://ontv.nzoom.com/channel_schedule/0,1591,%s--'
                          + page + ',00.html" % (dow)')
            guide.add_channel(domain, display_name, url_string,
                              nzoomregexp, nzoomorder, [])

    # A misspelt channel name would otherwise be dropped without a word.
    for requested in channels:
        if requested not in known_names:
            sys.stderr.write("%s: ignoring unknown channel %s\n"
                             % (config_file, requested))

    guide.display_program_xml(output_file)

pod_documentation="""

=pod

=head1 NAME

tv_grab_nz - Grab TV listings for New Zealand.

=head1 SYNOPSIS

tv_grab_nz --help

tv_grab_nz --configure [--config-file FILE]

tv_grab_nz [--config-file FILE] [--output FILE] [--days N] [--offset N] [--quiet]

=head1 DESCRIPTION

Output TV listings in XMLTV format for New Zealand.  Running
B<tv_grab_nz> with no arguments will get a weekE<39>s listings.

B<--output FILE> Write output to FILE rather than standard output.

B<--days N> Grab N days of listings rather than the default of 7.
B<--offset> plus B<--days> may not exceed 7.

B<--offset N> Start grabbing at today + N days.  N may be negative.

B<--quiet> suppress the progress messages normally written to standard
error.

=head1 SEE ALSO

L<xmltv(5)>.

=head1 AUTHOR

Andre Renaud, andre@ignavus.net

=head1 BUGS

Occasionally during a grab the message will appear:

    Unable to convert time: ':'

This is thought to be caused by the upstream HTML pages having a time
that is missing, or at least not in the expected place.  But how to
fix it remains to be investigated.

Should probably support SkyTV channels (as well as the regional ones) - 
if anyone knows where to get decent listings of these, please email the 
author.

=cut
"""
