# pods.py

###############################################
## Program that loads a list of Podcasts and their associated XML files, extracts the podcast file data and presents
## it to the user both in a terminal application and on a web page. Web page also allows the podcast to be played
## embedded in the browser. Imports a host of helper modules based on code I've found from all over the web.
## Only non-standard Python library is: http://labix.org/python-dateutil and possibly http://lxml.de/
###############################################

import cgi
import csv
import json
import re
import socket
import urllib
import urllib2
import SimpleHTTPServer
import SocketServer
from urlparse import urlparse, parse_qs

from dateutil import parser

try:
    from lxml import etree
except ImportError:
    try:
        import xml.etree.cElementTree as etree
    except ImportError:
        import  xml.etree.ElementTree as etree

# Function to format the date in a standard way
def dtfmt(date):
    """Format a datetime as 'Day, MM/DD/YYYY HH:MM:SS GMT'.

    date -- a datetime object. If it carries a UTC offset (as dates parsed
            from an RSS pubDate such as '... 11:47:00 -0400' do), it is
            shifted to GMT first so that the ' GMT' suffix is truthful.
            Naive datetimes are formatted as they are.

    Returns the formatted string.
    """
    offset = date.utcoffset()
    if offset is not None:
        # Drop the tzinfo and subtract the offset to get the equivalent GMT wall time
        date = date.replace(tzinfo=None) - offset
    return date.strftime("%a, %m/%d/%Y %H:%M:%S") + " GMT"

# Function to fail silently on int parsing
def safeIntParse(i):
    """Convert i to an int, returning -1 when it cannot be converted.

    i -- any value; strings, numbers and None are all accepted.

    Only conversion failures are suppressed. Other exceptions (for example
    KeyboardInterrupt) propagate to the caller.
    """
    try:
        return int(i)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / unsupported type, ValueError: non-numeric text,
        # OverflowError: float infinity
        return -1

# Function to read in OPML file - incomplete
def getOPML(opml):
    """Download an OPML file and list the podcast feeds it contains.

    opml -- URL of the OPML document.

    Prints and returns a list of (text, xmlUrl) tuples, one per <outline>
    element that has an xmlUrl attribute. Outlines are searched at any depth
    because OPML places them under <body> and allows them to be nested.
    Download and XML parsing errors propagate to the caller.
    """
    f = urllib2.urlopen(opml)
    try:
        r = f.read()
    finally:
        f.close()
    opmlTree = etree.fromstring(r)
    # Outlines without xmlUrl are grouping/category nodes, not feeds
    pods = [(item.get('text'),item.get('xmlUrl')) for item in opmlTree.iter('outline') if item.get('xmlUrl')]
    print(pods)
    return pods

# Function to retrieve XML given a podcast URL and return a list of podcast tuple items.
def _parsePodItem(item):
    """Convert one RSS <item> element into a (title, pubdate, url, length) tuple.

    Returns None when the item has no parseable <pubDate>, since every tuple
    must carry a datetime for display and sorting.
    """
    # Strip out any non-ASCII characters from the title
    # [Convert Unicode to a string in Python (containing extra symbols) - Stack Overflow]
    # http://stackoverflow.com/questions/1207457/convert-unicode-to-a-string-in-python-containing-extra-symbols
    title = (item.findtext('title') or "").encode('ascii','replace')

    # Convert the date into a datetime object in case we want to sort later
    # [parsing date string in python (convert string to date) - Stack Overflow]
    # http://stackoverflow.com/questions/10985312/parsing-date-string-in-python-convert-string-to-date
    pubdate = item.findtext('pubDate')
    if not pubdate:
        return None
    try:
        parsedPubdate = parser.parse(pubdate)
    except (ValueError, OverflowError):
        return None

    # The url of the media will either be an attribute of the enclosure element
    # or the link element text
    enclosureurl = length = ""
    enclosure = item.find('enclosure')
    if enclosure is not None:
        enclosureurl = enclosure.get('url') or ""
        lengthInt = safeIntParse(enclosure.get('length'))
        if lengthInt > 0:
            length = str(lengthInt // 1000000) + " MB"
    if not enclosureurl:
        enclosureurl = item.findtext('link') or ""

    return (title, parsedPubdate, enclosureurl, length)

def getXML(podUrl):
    """Download a podcast feed and return its items.

    podUrl -- URL of the RSS feed. An iTunes link (one containing
              '://itunes.apple.com') is first converted into an RSS URL
              through the PickleMonkey.net web service.

    Returns a list of (title, pubdate, url, length) tuples where pubdate is a
    datetime and length is a string such as '12 MB' (or '' when unknown).
    Items without a parseable pubDate are skipped. Returns None if the iTunes
    lookup fails (the error is printed). Download and XML parsing errors for
    the feed itself propagate to the caller.
    """
    # If the podcast is an iTunes link, use the web service PickleMonkey.net to convert it into a proper RSS feed
    # Uses urllib to encode the URL string, urllib2 to get the data, and json to load the data into a dictionary.
    if podUrl.find("://itunes.apple.com") != -1:
        try:
            podiTunes = urllib2.urlopen("http://picklemonkey.net/flipper/extractor.php?feed=" + urllib.quote_plus(podUrl))
            try:
                podJson = json.load(podiTunes)
            finally:
                podiTunes.close()
            podUrl = podJson["podcast"]
        except Exception as e:
            print("Error getting iTunes URL data: " + str(e))
            return None

    # Get the pod XML file and load it into an XML ElementTree object (https://docs.python.org/2/library/xml.etree.elementtree.html)
    # using the xml (or lxml) module. The XML should be in the form:
    # <rss>
    #   <channel>
    #     <item>
    #       <title>Title</title>
    #       <pubDate>Mon, 20 Apr 2009 11:47:00 -0400</pubDate>
    #       <link>http://link.to/file</link>
    #       <enclosure url="http://link.to/file" length="1200000" />
    #     </item>
    #   </channel>
    # </rss>
    podFeed = urllib2.urlopen(podUrl.replace(' ','%20'))
    try:
        podXml = podFeed.read()
    finally:
        podFeed.close()
    podXmlTree = etree.fromstring(podXml)

    # Loop through the XML using xpath (the 'channel/item' bit) and grab all the channel items
    # under the root element rss.
    podItems = []
    for item in podXmlTree.findall('channel/item'):
        podItem = _parsePodItem(item)
        if podItem is not None:
            podItems.append(podItem)

    return podItems

# Function to return the final redirected URL. Some pod hosts will store their files with a
# content delivery network and as such the url is dynamically redirected. We use this to get the
# final location as the HTML video player sometimes can't handle it otherwise
# [How to get the URL of a redirect with Python - Stack Overflow]
# http://stackoverflow.com/questions/4902523/how-to-get-the-url-of-a-redirect-with-python
def getRedirectedUrl(url):
    """Return the final URL reached after following any HTTP redirects.

    url -- the URL to request.

    The request is opened only to learn where it ends up; the response is
    closed without reading its body. Errors from urllib2 (bad URL, network
    failure) propagate to the caller.
    """
    req = urllib2.Request(url)
    res = urllib2.urlopen(req)
    try:
        return res.geturl()
    finally:
        res.close()

# Function to retrieve the list of pod files from a csv file. If the file cannot be found,
# an error is shown and a sample of test data will be substituted
def getPodList():
    """Load the list of podcasts from podlist.csv in the current directory.

    Returns a list of rows, each a list whose first element is the podcast
    name and whose second element is the feed URL. Rows with fewer than two
    columns (for example blank lines) are dropped because callers index both.
    If the file cannot be read or parsed, a message is printed and a built-in
    sample list is returned instead.
    """
    try:
        # Using csv module to read in our data as a list matrix.
        # csv can do lots more powerful operations on these types of data files.
        with open('podlist.csv', 'rb') as csvFile:
            podList = list(csv.reader(csvFile))
    except (IOError, OSError, csv.Error):
        print("There was an error loading podlist.csv. Using test data until problem is corrected")
        testdata = '''CNET Vods,http://feeds2.feedburner.com/allcnetvideopodcasts
CNET UK Podcast,http://www.cnet.co.uk/feeds/podcasts/
This Week In Tech,http://feeds.twit.tv/twit
The Bugle,http://feeds.feedburner.com/thebuglefeed
WNYC RadioLab,http://feeds.wnyc.org/radiolab
NPR Planet Money,http://www.npr.org/rss/podcast.php?id=510289
NFL Network M&M Podcast,http://nohuddle.libsyn.com/rss
Football Weekly,http://www.guardian.co.uk/football/series/footballweekly/podcast.xml
Star Talk Radio,http://feeds.soundcloud.com/users/soundcloud:users:38128127/sounds.rss'''
        podList = [pod.split(",") for pod in testdata.splitlines()]

    return [pod for pod in podList if len(pod) >= 2]

# Basic function for the console app to show the list of feeds, and allow the user to
# show the files from any of the podcast feeds.
def showPodFeeds():
    """Console view: list the podcast feeds, ask for one, and print its items.

    Reads the feed list with getPodList(), prompts until a valid menu number
    is entered, then prints one line per item returned by getXML(). Returns
    to the caller (without raising) if there are no feeds or the selected
    feed cannot be loaded.
    """
    print("")
    print("")
    podList = getPodList()
    if not podList:
        # Without this guard the prompt below could never be satisfied
        print("No podcast feeds are available.")
        return
    for num,pod in enumerate(podList): # use enumerate to create a menu option for each pod
        print("{:3} {}".format(num,pod[0]))

    # Get the input of the number corresponding to the pod feed they want to see.
    # Use while loop to ensure that the data is valid before proceeding
    subchoice = ""
    while not subchoice.isdigit() or not ( 0 <= int(subchoice) < len(podList) ):
        subchoice = raw_input ("\n\nWhich podcast feed would you like to see? --> ")
    print("")
    print("")

    # Load the feed; a bad URL or network failure should return to the menu, not end the program
    try:
        podItems = getXML(podList[int(subchoice)][1])
    except Exception as e:
        print("Error loading podcast feed: " + str(e))
        return
    if podItems is None:
        # getXML has already printed why the iTunes lookup failed
        return

    # Print the data with formatting
    print("{:40} {:30} {:10} {}".format("Pod Name","Pub Date","Length","Pod File URL"))
    for item in podItems:
        print("{:40} {:30} {:10} {}".format(item[0], dtfmt(item[1]), item[3], item[2]))


# Custom web server handler class
# Python has a few web server modules as part of the standard library that will just serve
# files in current directory. We will use/extend the SimpleHTTPServer class and intercept
# the GET requests in our own function that can be used to show our dynamic content
# from: [Blended Technologies >> Blog Archive >> Python Trick: Really Simple HTTP Server]
# http://www.blendedtechnologies.com/python-trick-really-little-http-server/220

class FeedsHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
    """Request handler that serves the podcast web application.

    Routes handled by do_GET:
      /play    - media player page for the URL in the 'playfile' query parameter
      /csv     - debugging view of podlist.csv
      /sample  - debugging view of request variables
      /csharp  - default SimpleHTTPServer file and directory serving
      (other)  - podcast list, plus the selected feed's items when 'pod' is given

    Text taken from the query string, the CSV file or a remote feed is
    HTML-escaped before it is written into a page.

    NOTE(review): the 'pod' and 'playfile' parameters are fetched server-side
    with urllib2 without restricting the URL scheme or host; confirm this is
    acceptable for where the server is exposed.
    """

    # Escape a value for use in HTML text or inside a quoted attribute.
    # cgi.escape handles &, <, > and the double quote; the single quote is
    # added because several templates below use single-quoted attributes.
    def _esc(self, value):
        text = value if isinstance(value, basestring) else str(value)
        return cgi.escape(text, True).replace("'", "&#39;")

    # Top navigation bar TODO:
    def getTopNav(self):
        return '''<h1><a href="/pod">Podcaster Extractor</a></h1>'''

    # Function to return display HTML for the given podcast feed.
    # Returns one <p> per feed item, or an error paragraph if the feed
    # could not be loaded.
    def getPodHTML(self, podUrl):
        try:
            podItems = getXML(podUrl)
            if podItems is None:
                # getXML signals a failed iTunes lookup by returning None
                raise ValueError("the feed URL could not be resolved")
            rows = []
            for item in podItems:
                fileUrl = item[2] or ''
                label = "{} ({})".format(item[0], fileUrl.split("/")[-1])
                # The play link carries the media URL as a query value, so it is
                # URL-quoted; otherwise a '&' in the media URL would cut it short
                rows.append('''<p>
(<a href='{}'>Download</a>)
{} {}
<a target='player' href='/play?playfile={}'>{}</a></p>'''.format(self._esc(fileUrl), self._esc(dtfmt(item[1])), self._esc(item[3]), self._esc(urllib.quote(fileUrl)), self._esc(label)))
            out = ''.join(rows)
        except Exception as e:
            out = "<p>Error loading XML from: {}</p><p>{}</p>".format(self._esc(podUrl), self._esc(str(e)))
        return out

    # Function to display HTML for the media player
    # Having some trouble with the Flash based MP4 player
    def getPlayerHTML(self, podFile):
        out = '''<html>
<head><title>MP3 Player</title></head>
<body>
<form name=play id=play method="get" action="/play">
    <input type="text" name="playfile" value="{}">
    <input type=submit>
</form>
<hr>'''.format(self._esc(podFile))

        # Only display this part if a filename was passed in
        if podFile:
            # Resolving redirects is best effort: if it fails, play the URL as given
            try:
                redirPodFile = getRedirectedUrl(podFile)
            except Exception:
                redirPodFile = podFile
            lowered = podFile.lower()
            if '.mp4' in lowered or '.m4v' in lowered:
                # Build the player config with json so it is well-formed
                # (the hand-written version lacked the ':' after "clip")
                flashplayerConfig = 'config=' + json.dumps({"clip": {"url": redirPodFile, "autoPlay": False, "autoBuffering": True}})
                # NOTE(review): 'wiidth'/'heiight' below are not valid attribute
                # names; left as found in case the misspelling is deliberate
                out += '''
<video controls='controls'>
    <source src='{}' type='video/mp4' />
</video>
<hr>
<object type='application/x-shockwave-flash' data='http://releases.flowplayer.org/swf/flowplayer-3.2.1.swf' wiidth='480' heiight='270'>
        <param name='movie' value='http://releases.flowplayer.org/swf/flowplayer-3.2.1.swf' />
        <param name='allowFullScreen' value='true' />
        <param name='wmode' value='transparent' />
        <param name='flashvars' value='{}' />
</object>'''.format(self._esc(redirPodFile), self._esc(flashplayerConfig))
            elif '.mp3' in lowered:
                out += '''
<audio controls='controls'>
    <source src='{}' type='audio/mpeg' />
</audio>
<hr>
<embed src='http://tinyurl.com/dropittt/mediaplayer.swf' type='application/x-shockwave-flash' flashvars='file={}&autostart=false' allowscriptaccess='none' height='20' width='200'>
</embed>'''.format(self._esc(redirPodFile), self._esc(redirPodFile))

            out += '''
<hr><a href="{}">Link to file</a> '''.format(self._esc(podFile))
        out += '''</body>
</html>'''
        return out

    # Function to generate the HTML for the list of podcasts, a form to enter a pod, and, if a
    # pod was selected, the items in that pod's RSS feed.
    # podUrl is the selected feed URL ('' if none); content is ready-made HTML for the feed items.
    def podHtmlOutput(self, podUrl, content):
        # Rows need both a name and a URL to be shown as a link
        podList = [pod for pod in getPodList() if len(pod) >= 2]
        podListHtml = "\n".join(['<big><a href="/pod?pod={}">{}</a></big> &nbsp; || &nbsp'.format(urllib.quote(pod[1]),self._esc(pod[0])) for pod in podList])

        # Get the pod's title if we have it in our database. In the future this should be pulled
        # from the RSS feed.
        title = next((pod[0] for pod in podList if pod[1]==podUrl), '')

        output = '''<html>
    <head><title>Podcaster Extrctor - {}</title></head>
    <body>
        {}
        <form action="/pod">
            POD URL: <input size="100" name="pod" value="{}">
            <input type="submit">
        </form>
        <hr>
        {}
        <hr>
        {}
    </body>
</html>'''.format(self._esc(title), self.getTopNav(), self._esc(podUrl), podListHtml, content)

        # TODO: add a second form that submits an OPML URL (a hidden field opml=yes) to /pod

        return output

    # Decided to write a common-case function for returning data from the web server to the browser
    # It sends back the 200 success code, and the mime type header then the content
    def writeOut(self,content):
        self.send_response(200)
        self.send_header('Content-type','text/html')
        self.end_headers()
        self.wfile.write(content)


    # This method is defined in SimpleHTTPServer to simply return a list of files, or the file specified
    # We will override this to handle the cases specific to our program.
    def do_GET(self):

        # Using urlparse module to help us get the items in the querystring that came from links/the form
        # This code parses the self.path variable set in SimpleHTTPServer and turns the querystring
        # into a dictionary object. so http://localhost:8080?q=1&r=ham becomes {"q":["1"],"r":["ham"]}
        # [Processing HTTP GET input parameter on server side in python - Stack Overflow]
        # http://stackoverflow.com/questions/8928730/processing-http-get-input-parameter-on-server-side-in-python
        queryDict = parse_qs(urlparse(self.path).query)

        # This is the main decision point for deciding what gets done based on the URL passed.
        # In any other language I would have used a switch statement here. There is a "pythonic" way
        # to do this - to create a dictionary object where the case condition is the key and the function
        # name is the value. Source: [Switch-case statement in Python | The ByteBaker]
        # http://bytebaker.com/2008/11/03/switch-case-statement-in-python/

        # Media player case - this will display media players for mp3 and mp4 files
        # using the filename that was passed in
        if self.path.startswith('/play'):
            playfile = queryDict.get('playfile', [''])[0]
            self.writeOut(self.getPlayerHTML(playfile))
            return

        # Mostly for debugging - will output the csv file (or an error) to the browser
        # This is more of a back door. There is no visible link to this URL from the web app
        elif self.path.startswith('/csv'):
            try:
                with open('podlist.csv', 'rb') as csvFile:
                    csvContent = csvFile.read()
            except (IOError, OSError):
                csvContent = "Error loading CSV file"
            self.writeOut("<pre>{}</pre>".format(self._esc(csvContent)))
            return

        # Another debugging call - shows various server variables and the query dictionary
        elif self.path.startswith('/sample'):
            out = '''self.path = {} <br> self.raw_requestline = {} <br> self.client_address = {}
<br> queryDict = {}'''.format(self._esc(self.path), self._esc(self.raw_requestline), self._esc(self.client_address), self._esc(queryDict))
            self.writeOut(out)
            return

        # Yet another hidden call. If you have a folder named csharp, this will show the default behavior
        # of the SimpleHTTPServer - just shows a list of files in the directory, and the file if clicked
        # if we have HTML files here they will be rendered by the browser.
        elif self.path.startswith('/csharp'):
            #serve files, and directory listings by following self.path from
            #this directory
            return SimpleHTTPServer.SimpleHTTPRequestHandler.do_GET(self)

        # The default/catch-all case - essentially the entry point of the web application
        # Here we will display a list of all the podcasts and, if there is a pod variable in
        # the query, it will load that podcast.
        else:
            podUrl = podContent = ''
            if 'pod' in queryDict:
                podUrl = queryDict['pod'][0]
                podContent = self.getPodHTML(podUrl)
            out = self.podHtmlOutput(podUrl, podContent) #call pod listing function here
            self.writeOut(out)
            return

# Custom TCP Server class - used to prevent "socket in use" errors after the program has exited
# Apparently there is a timeout that has to elapse before you can reuse the same socket. This overrides that.
# [Python: Binding Socket: "Address already in use" - Stack Overflow]
# http://stackoverflow.com/questions/6380057/python-binding-socket-address-already-in-use
class FeedsTCPServer(SocketServer.TCPServer):
    """TCP server that can rebind its port immediately after a restart.

    Setting allow_reuse_address makes the inherited server_bind() enable
    SO_REUSEADDR before binding, which avoids the "address already in use"
    error. Using the flag rather than overriding server_bind() keeps the
    rest of the base-class bind logic, including recording the bound address
    in server_address.
    """
    allow_reuse_address = True


# Function that starts the web server using our special class as the handler
def startWebServer(port = 8080): # defines which port the server will run on
    """Run the web application until it is interrupted with CTRL-C.

    port -- TCP port to listen on (all interfaces); defaults to 8080.

    The listening socket is closed when the server stops, whether it was
    stopped with CTRL-C or by an unexpected error.
    """
    httpd = FeedsTCPServer(('', port),FeedsHandler) # create the server
    try:
        print("Web server started serving at http://this.server.name:" + str(port) + " - stop server with CTRL-C")
        httpd.serve_forever() # run until stopped
    except KeyboardInterrupt:
        print("\nShutting down web server")
    finally:
        httpd.server_close() # close the listening socket

# By putting the main code of the program in this function, we can create a way to prevent any code from being run
# if this code is imported into some other python script. We check that and call main() below
def main():
    """Show the main menu repeatedly until the user chooses 0 (Quit).

    Choice 1 lists the podcast feeds in the console, choice 2 starts the web
    server. Any other input simply shows the menu again. End-of-input on
    stdin (for example CTRL-D) is treated the same as Quit.
    """
    choice = ""
    # This is another block that could be put into that Pythonic select/case style code
    while choice != "0":
        try:
            choice = raw_input ("\n\n *** MAIN MENU: 1] Show Podcast feeds     2] Start Web Server    0] Quit      --> ")
        except EOFError:
            break
        if choice == "1":
            showPodFeeds()
        elif choice == "2":
            startWebServer()
# Run main() only when this file is executed directly as a script; importing it
# as a module defines the functions and classes without starting the menu.
if __name__ == "__main__":
    main()