#!/usr/bin/env python
# 
# this script is in the public domain.
# some documentation at http://www.nenie.org/misc/bbcradio/
#
# history
# 200501 added support for getting into frames that have been added into BBC player
#

import sys
import string
import re

import urllib
import urlparse

def a_link (href, a_label):
	"""Return an HTML anchor element pointing at href.

	href    -- target URL, inserted verbatim (no HTML escaping) into the
	           href attribute.
	a_label -- link text, inserted verbatim.  When it is None or empty,
	           the part of href before the first ':' (the URL scheme)
	           is used as the text instead.

	When no label is given and href contains no ':', the link text is
	left empty.
	"""
	if a_label is None or a_label == "":
		# str.find replaces the deprecated string.find module function.
		i = href.find (':')
		if i >= 0:
			a_label = href[:i]
		else:
			# A None label used to reach the concatenation below and
			# raise TypeError; normalise it to an empty label.
			a_label = ""
	return '<a href="' + href + '">' + a_label + '</a>'

def tag (a_tag, a_content):
	"""Wrap a_content in an <a_tag>...</a_tag> element.

	The returned string ends with a newline.  Neither argument is
	escaped.
	"""
	opening = '<' + a_tag + '>'
	closing = '</' + a_tag + '>\n'
	return opening + a_content + closing

def embed_src (a_line):
	"""Extract the src attribute of the first <embed> tag in a_line.

	Returns the attribute value without its quotes, or "" when the line
	holds no <embed> tag with a quoted, non-empty src attribute.
	"""
	# [^>]*? keeps the search inside the embed tag itself; the former
	# greedy ".*" could run on to the src of a later tag on the same line.
	m = re.search (r"""<embed\b[^>]*?\ssrc=['"]([^'"]+)['"]""", a_line)
	if m is None:
		return ""
	return m.group (1)
	
def retrieve_bbc_realaudio (a_bbc_url, a_max_depth = 5):
	"""Extract the real audio URL from a BBC player page.

	a_bbc_url   -- URL of the player page to download and scan.
	a_max_depth -- how many nested "bbcplayer" frames may still be
	               followed; guards against a frame set that refers
	               back to itself.

	Returns the src of the first <embed> tag found (as written in the
	page, possibly relative), or "" when none is found.  When the page
	has no embed but contains a <frame name="bbcplayer" ...> line, the
	frame's page is scanned in turn.
	"""
	result = ""
	next_url = ""
	f = urllib.urlopen (a_bbc_url)
	try:
		for a_line in f.readlines():
			# scan for embed line; once found nothing else is needed
			result = embed_src (a_line)
			if result:
				break
			# remember the first player frame in case there is no embed
			if not next_url:
				# re.match anchors at the start of the line, so only
				# frame tags in the first column are recognised
				m = re.match (r"""<frame name="bbcplayer".* src=['"]([^'"]+)['"]""", a_line)
				if m is not None:
					next_url = m.group (1)
	finally:
		# close the connection even when reading or parsing fails
		f.close()
	if next_url and not result and a_max_depth > 0:
		result = retrieve_bbc_realaudio (urlparse.urljoin (a_bbc_url, next_url), a_max_depth - 1)
	return result

def retrieve_realaudio_target (a_url):
	"""Download a_url and return its content.

	The whole response body is returned with leading and trailing
	whitespace removed (not only its first line).
	"""
	f = urllib.urlopen (a_url)
	try:
		# str.strip replaces the deprecated string.strip function
		return f.read().strip()
	finally:
		# close the connection even when the read fails
		f.close()
	
def bbc_player_raw_links (a_base_url, a_line):
	"""Resolve the player link at the start of a_line into raw audio links.

	a_base_url -- URL of the page a_line comes from; relative links are
	              resolved against it.
	a_line     -- one line of that page.

	Returns None when the line does not start with an <a ...> tag whose
	href is a site-absolute "..._aod.shtml?..." link followed by an
	onclick attribute, or when the player page yields no audio URL.
	Otherwise returns a tuple (query string of the player link,
	absolute audio URL, content downloaded from that audio URL).
	"""
	m = re.match (r"""<a .* [hH][rR][eE][fF]=['"](/.*_aod.shtml\?)(.*)['"] onclick=['"].*['"] .*>""", a_line)
	if m is None:
		return None
	a_player_url_tag = m.group (2)
	a_url = urlparse.urljoin (a_base_url, m.group (1) + a_player_url_tag)
	a_ra_ref = retrieve_bbc_realaudio (a_url)
	# Test before joining: urljoin (base, "") gives back the base URL, so
	# checking the joined value could never detect "nothing found".
	if not a_ra_ref:
		return None
	# NOTE(review): the audio reference is resolved against a_base_url
	# rather than the player page URL -- confirm this is intended.
	a_ra = urlparse.urljoin (a_base_url, a_ra_ref)
	a_rtp = retrieve_realaudio_target (a_ra)
	return (a_player_url_tag, a_ra, a_rtp)
	
def textify_img (a_line):
	"""Replace img tags with their alt text.

	The "/furniture/tiny.gif" image with an empty alt becomes the
	comment <!--tiny.gif-->.  Any other <img> tag carrying a quoted alt
	attribute becomes <!--img-->[alt text].  Tags without a quoted alt
	attribute are left untouched.
	"""
	s = re.sub (r"""<img src="/furniture/tiny.gif" [^>]*alt="" />""", """<!--tiny.gif-->""", a_line)
	def alt_text (m):
		# group 1: double-quoted value, group 2: single-quoted value
		return "<!--img-->[" + (m.group (1) or m.group (2) or "") + "]"
	# The value ends at the same kind of quote that opened it, so an
	# apostrophe inside alt="..." no longer truncates the text.
	return re.sub (r"""<img[^>]* alt=(?:"([^"]*)"|'([^']*)')[^>]*>""", alt_text, s)

def absolute_href (a_base_url, a_line):
	"""Replace href links with absolute links.

	a_base_url -- URL the links in a_line are relative to.
	a_line     -- text in which every quoted href="..." / href='...'
	              attribute (any letter case) is rewritten.

	Returns a_line with each href value passed through urlparse.urljoin
	and the attribute name written in lower case.
	"""
	def absolutize (m):
		# group 1: double-quoted value, group 2: single-quoted value
		a_target = m.group (1)
		if a_target is None:
			a_target = m.group (2)
		# Double quotes are emitted as before, unless the value itself
		# contains one (only possible for a single-quoted attribute).
		a_quote = '"'
		if '"' in a_target:
			a_quote = "'"
		return 'href=' + a_quote + urlparse.urljoin (a_base_url, a_target) + a_quote
	# The value ends at the same kind of quote that opened it, so values
	# such as javascript:go('x') inside double quotes are kept whole.
	return re.sub (r"""[hH][rR][eE][fF]=(?:"([^"]*)"|'([^']*)')""", absolutize, a_line)

def process_listenagain (a_mode, a_url, a_filename):
	"""Add raw links before listen again player links.

	a_mode     -- "list" writes a small HTML page with one list item per
	              resolved player link; any other mode writes, for each
	              line holding a player link, the raw links followed by
	              that line with absolute hrefs and images replaced by
	              their alt text.
	a_url      -- page to download and scan line by line.
	a_filename -- output file name, or "-" for standard output.
	"""
	as_list = mode_is_list (a_mode)
	fout = sys.stdout
	if a_filename != "-":
		fout = open (a_filename, 'w')
	try:
		if as_list:
			fout.write ("<html><body>\n")
			fout.write (tag ("h1", a_url))
			fout.write ("<ul>\n")
		f = urllib.urlopen (a_url)
		try:
			for a_line in f.readlines():
				r = bbc_player_raw_links (a_url, a_line)
				if not r:
					# NOTE(review): lines without a player link are
					# dropped in every mode -- confirm this is wanted
					# for the inline ("patch") output.
					continue
				if as_list:
					fout.write (tag ("li", a_link (r[1], r[0]) + " " + a_link (r[2], None)))
				else:
					fout.write (a_link (r[1], None) + " " + a_link (r[2], None))
					fout.write (textify_img (absolute_href (a_url, a_line)))
		finally:
			# close the connection even when a line fails to resolve
			f.close()
		if as_list:
			fout.write ("</ul></body></html>\n")
	finally:
		# never close stdout, but always release a file opened here
		if fout is not sys.stdout:
			fout.close()


def usage ():
	"Print the command line help on standard output"
	for a_text in (
		"usage: listenagain.py [list|patch] <in_url> <out_html>",
		"  list    : raw list or http/ra URL in HTML format",
		"  patch   : add URI inline in source page",
		"  in_url  : input URL from which URL are extracted",
		"  out_html: output html file (- for stdout)",
	):
		print (a_text)

def mode_is_list (a):
	"Tell whether a is the name of the list mode"
	a_list_mode = "list"
	return a == a_list_mode
	
def usage_is_mode (a):
	"Tell whether a is a known mode name (list or patch)"
	if mode_is_list (a):
		return True
	return a == "patch"
	
# Page scanned when no <in_url> argument is given, or when it is empty.
default_in_url = "http://www.bbc.co.uk/radio4/progs/listenagain.shtml"
if __name__ == "__main__":
	# Command line entry point.
	#   argv[1]: mode ("list" or "patch"); any other value is taken as
	#            the URL of a stand-alone player page to resolve.
	#   argv[2]: optional input URL (default_in_url when absent or empty).
	#   argv[3]: optional output file ("-", i.e. stdout, when absent).
	out_html = "-"
	if len(sys.argv) >= 4:
		out_html = sys.argv[3]

	in_url = default_in_url
	if len(sys.argv) >= 3 and sys.argv[2] != "":
		in_url = sys.argv[2]

	if len(sys.argv) >= 2:
		if usage_is_mode (sys.argv[1]):
			mode = sys.argv[1]
			# Progress goes to stderr: the HTML itself is written to
			# stdout when out_html is "-", and must stay clean.
			sys.stderr.write ("Converting to " + mode + " " + in_url + "\n")
			process_listenagain (mode, in_url, out_html)
		else:
			# resolve stand alone player link
			print (retrieve_realaudio_target (retrieve_bbc_realaudio (sys.argv[1])))
	else:
		usage ()

