Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ###############################################
- ## Program that loads a list of Podcasts and their associated XML files, extracts the podcast file data and presents
- ## it to the user both in a terminal application and on a web page. Web page also allows the podcast to be played
- ## embedded in the browser. Imports a host of helper modules based on code I've found from all over the web.
- ## Only non-standard Python library is: http://labix.org/python-dateutil and possibly http://lxml.de/
- ###############################################
- import csv, urllib, urllib2, re, socket, SimpleHTTPServer, SocketServer, json
- from dateutil import parser
- from urlparse import urlparse, parse_qs
- try:
- from lxml import etree
- except ImportError:
- try:
- import xml.etree.cElementTree as etree
- except ImportError:
- import xml.etree.ElementTree as etree
# Function to format the date in a standard way
def dtfmt(date):
    """Format a datetime as ``Day, MM/DD/YYYY HH:MM:SS GMT``.

    Args:
        date: A ``datetime.datetime``. Timezone-aware values are shifted to
            UTC first so that the trailing "GMT" label is truthful. Naive
            values carry no offset and are formatted unchanged.

    Returns:
        The formatted string, always ending in " GMT".
    """
    offset = date.utcoffset()
    if offset is not None:
        # Subtract the offset and drop tzinfo; this avoids needing a UTC
        # tzinfo class, which the Python 2 standard library does not provide.
        date = date.replace(tzinfo=None) - offset
    return date.strftime("%a, %m/%d/%Y %H:%M:%S") + " GMT"
# Function to fail silently on int parsing
def safeIntParse(i):
    """Convert *i* to an int, returning -1 when it cannot be converted.

    Args:
        i: Any value accepted by ``int()``; typically a string or ``None``.

    Returns:
        ``int(i)`` on success, otherwise ``-1``.
    """
    try:
        return int(i)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None or another non-numeric type; ValueError: malformed
        # text; OverflowError: infinite float. Anything else (for example
        # KeyboardInterrupt) is deliberately allowed to propagate.
        return -1
# Function to read in OPML file - incomplete
def getOPML(opml):
    """Download an OPML subscription list and report the feeds it contains.

    Args:
        opml: URL of the OPML document.

    Returns:
        A list of ``(text, xmlUrl)`` tuples, one per ``<outline>`` element
        that has an ``xmlUrl`` attribute. The list is also printed.

    Raises:
        Whatever ``urllib2.urlopen`` or ``etree.fromstring`` raise on a
        network failure or malformed XML.
    """
    response = urllib2.urlopen(opml)
    try:
        opmlTree = etree.fromstring(response.read())
    finally:
        response.close()
    # Outlines live under <body> and may be nested inside folder outlines, so
    # search the whole tree rather than only the root's direct children.
    # Outlines without an xmlUrl are skipped because they name no feed.
    pods = [(node.get('text'), node.get('xmlUrl'))
            for node in opmlTree.findall('.//outline')
            if node.get('xmlUrl')]
    print(pods)
    return pods
# Function to retrieve XML given a podcast URL and return a list of podcast tuple items.
def getXML(podUrl):
    """Fetch a podcast RSS feed and return its episodes.

    The feed is expected to have this shape:

        <rss>
          <channel>
            <item>
              <title>Title</title>
              <pubDate>Mon, 20 Apr 2009 11:47:00 -0400</pubDate>
              <link>http://link.to/file</link>
              <enclosure url="http://link.to/file" length="1200000" />
            </item>
          </channel>
        </rss>

    Args:
        podUrl: URL of the RSS feed, or an iTunes podcast page URL. iTunes
            URLs are first converted to an RSS URL through the
            picklemonkey.net web service.

    Returns:
        A list of ``(title, parsedPubdate, mediaUrl, length)`` tuples, where
        ``title`` is ASCII-encoded, ``parsedPubdate`` is what
        ``dateutil.parser.parse`` returns, ``mediaUrl`` is the enclosure URL
        (falling back to the ``<link>`` text, then ``""``) and ``length`` is a
        string such as ``"12 MB"`` or ``""`` when unknown.
        Returns ``None`` if the iTunes lookup fails (an error is printed).

    Raises:
        Errors from fetching or parsing the feed propagate to the caller, as
        does the error raised for an item whose ``<pubDate>`` is missing or
        unparseable.
    """
    # If the podcast is an iTunes link, use the web service PickleMonkey.net
    # to convert it into a proper RSS feed URL.
    if "://itunes.apple.com" in podUrl:
        try:
            lookup = urllib2.urlopen(
                "http://picklemonkey.net/flipper/extractor.php?feed="
                + urllib.quote_plus(podUrl))
            try:
                podUrl = json.load(lookup)["podcast"]
            finally:
                lookup.close()
        except Exception as e:
            # Best-effort lookup: report the problem and signal failure with None.
            print("Error getting iTunes URL data: " + str(e))
            return None

    feed = urllib2.urlopen(podUrl.replace(' ', '%20'))
    try:
        podXmlTree = etree.fromstring(feed.read())
    finally:
        feed.close()

    # TODO: build a feed header from the channel's link/title/description
    # (an earlier attempt using podXmlTree.find('link') did not work).
    podItems = []
    # 'channel/item' is an ElementTree path relative to the root <rss> element.
    for item in podXmlTree.findall('channel/item'):
        # Replace any non-ASCII characters in the title; an empty <title/>
        # yields "". See http://stackoverflow.com/questions/1207457/
        title = (item.findtext('title') or u'').encode('ascii', 'replace')

        # Parse the date into a datetime so callers can sort or reformat it.
        # See http://stackoverflow.com/questions/10985312/
        parsedPubdate = parser.parse(item.findtext('pubDate'))

        # The media URL is the enclosure's url attribute when present,
        # otherwise the text of the <link> element.
        enclosureurl = length = ""
        enclosure = item.find('enclosure')
        if enclosure is not None:
            enclosureurl = enclosure.get('url') or ""
            lengthInt = safeIntParse(enclosure.get('length'))
            if lengthInt > 0:
                # Whole megabytes, rounded down (explicit floor division).
                length = str(lengthInt // 1000000) + " MB"
        if not enclosureurl:
            enclosureurl = item.findtext('link') or ""

        podItems.append((title, parsedPubdate, enclosureurl, length))
    return podItems
# Function to return the final redirected URL. Some pod hosts will store their files with a
# content delivery network and as such the url is dynamically redirected. We use this to get the
# final location as the HTML video player sometimes can't handle it otherwise
# [How to get the URL of a redirect with Python - Stack Overflow]
# http://stackoverflow.com/questions/4902523/how-to-get-the-url-of-a-redirect-with-python
def getRedirectedUrl(url):
    """Open *url*, following any redirects, and return the final URL.

    Args:
        url: The URL to resolve.

    Returns:
        The URL reported by the response's ``geturl()`` after redirects.

    Raises:
        Whatever ``urllib2.urlopen`` raises when the URL cannot be opened.
    """
    res = urllib2.urlopen(urllib2.Request(url))
    try:
        return res.geturl()
    finally:
        # Only the final URL is needed; close the connection without reading the body.
        res.close()
# Function to retrieve the list of pod files from a csv file. If the file cannot be found,
# an error is shown and a sample of test data will be substituted
def getPodList():
    """Load the podcast list from ``podlist.csv`` in the current directory.

    Returns:
        A list of rows, each a list whose first two entries are the podcast
        name and its feed URL. Rows with fewer than two columns (such as blank
        lines) are dropped because callers index ``pod[0]`` and ``pod[1]``.
        If the file cannot be read or parsed, a message is printed and a
        built-in sample list is returned instead.
    """
    try:
        # Using csv module to read in our data as a list matrix.
        with open('podlist.csv', 'rb') as csvFile:
            podList = [pod for pod in csv.reader(csvFile) if len(pod) >= 2]
    except (IOError, OSError, csv.Error):
        print("There was an error loading podlist.csv. Using test data until problem is corrected")
        testdata = (
            "CNET Vods,http://feeds2.feedburner.com/allcnetvideopodcasts",
            "CNET UK Podcast,http://www.cnet.co.uk/feeds/podcasts/",
            "This Week In Tech,http://feeds.twit.tv/twit",
            "The Bugle,http://feeds.feedburner.com/thebuglefeed",
            "WNYC RadioLab,http://feeds.wnyc.org/radiolab",
            "NPR Planet Money,http://www.npr.org/rss/podcast.php?id=510289",
            "NFL Network M&M Podcast,http://nohuddle.libsyn.com/rss",
            "Football Weekly,http://www.guardian.co.uk/football/series/footballweekly/podcast.xml",
            "Star Talk Radio,http://feeds.soundcloud.com/users/soundcloud:users:38128127/sounds.rss",
        )
        podList = [pod.split(",") for pod in testdata]
    return podList
# Basic function for the console app to show the list of feeds, and allow the user to
# show the files from any of the podcast feeds.
def showPodFeeds():
    """Console menu: list the known feeds, prompt for one, and print its episodes.

    Reads the feed list via ``getPodList()``, asks for a menu number on
    standard input until a valid one is entered, then prints one formatted
    row per episode returned by ``getXML()``. Feed-loading failures are
    reported on the console instead of terminating the program.
    """
    print("")
    print("")
    podList = getPodList()
    if not podList:
        # Without this guard the prompt loop below could never be satisfied.
        print("No podcast feeds are configured.")
        return

    # use enumerate to create a menu option for each pod
    for num, pod in enumerate(podList):
        print("{:3} {}".format(num, pod[0]))

    # Keep asking until the input is a number that indexes podList.
    subchoice = ""
    while not (subchoice.isdigit() and 0 <= int(subchoice) < len(podList)):
        subchoice = raw_input("\n\nWhich podcast feed would you like to see? --> ")
    print("")
    print("")

    try:
        podItems = getXML(podList[int(subchoice)][1])
    except Exception as e:
        # Top-level boundary of the console UI: report and return to the menu.
        print("Error loading feed: " + str(e))
        return
    if podItems is None:
        # getXML has already printed why the iTunes lookup failed.
        return

    # Print the data with formatting
    print("{:40} {:30} {:10} {}".format("Pod Name", "Pub Date", "Length", "Pod File URL"))
    for item in podItems:
        print("{:40} {:30} {:10} {}".format(item[0], dtfmt(item[1]), item[3], item[2]))
def _htmlEscape(text):
    """Escape *text* for use in HTML element content or a quoted attribute value."""
    # Function-scope import so this block does not depend on the file's import list.
    from xml.sax.saxutils import escape
    return escape(text, {'"': '&quot;', "'": '&#39;'})


# Custom web server handler class
# Python has a few web server modules as part of the standard library that will just serve
# files in current directory. We will use/extend the SimpleHTTPServer class and intercept
# the GET requests in our own function that can be used to show our dynamic content
# from: [Blended Technologies >> Blog Archive >> Python Trick: Really Simple HTTP Server]
# http://www.blendedtechnologies.com/python-trick-really-little-http-server/220
class FeedsHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
    """Request handler that renders the podcast browser and media player pages.

    Routes (matched by prefix of the request path):
        /play    media player page for the ``playfile`` query parameter
        /csv     debugging view of podlist.csv
        /sample  debugging view of request variables
        /csharp  default SimpleHTTPServer file/directory serving
        other    podcast list, plus the episodes of the ``pod`` query parameter

    All values that originate from the request, the CSV file or a remote feed
    are HTML-escaped before being placed in a page.
    """

    # Top navigation bar TODO:
    def getTopNav(self):
        """Return the HTML for the top navigation bar."""
        return '''<h1><a href="/pod">Podcaster Extractor</a></h1>'''

    def getPodHTML(self, podUrl):
        """Return HTML listing every episode of the feed at *podUrl*.

        Args:
            podUrl: Feed URL (or iTunes URL) passed straight to ``getXML``.

        Returns:
            One ``<p>`` per episode with a download link, date, size and a
            link that opens the episode in the player; or an error paragraph
            if the feed could not be loaded.
        """
        try:
            podItems = getXML(podUrl)
            if podItems is None:
                raise ValueError("could not resolve the feed URL")
            rows = []
            for title, pubdate, mediaUrl, length in podItems:
                label = "{} ({})".format(title, mediaUrl.split("/")[-1])
                # Percent-encode the media URL so that '&', '?' or '#' inside it
                # survive the round trip through the /play query string.
                playParam = urllib.quote(mediaUrl.encode('utf-8'), safe='')
                rows.append('''<p>
(<a href='{}'>Download</a>)
{} {}
<a target='player' href='/play?playfile={}'>{}</a></p>'''.format(
                    _htmlEscape(mediaUrl), dtfmt(pubdate), _htmlEscape(length),
                    playParam, _htmlEscape(label)))
            out = ''.join(rows)
        except Exception as e:
            # Page-level boundary: show the failure instead of a server error.
            out = "<p>Error loading XML from: {}</p><p>{}</p>".format(
                _htmlEscape(podUrl), _htmlEscape(str(e)))
        return out

    # Having some trouble with the Flash based MP4 player
    def getPlayerHTML(self, podFile):
        """Return the HTML page for the media player.

        Args:
            podFile: URL of the media file to play; ``''`` shows only the
                entry form.

        Returns:
            A complete HTML document. For URLs containing ``.mp4``/``.m4v`` a
            ``<video>`` element plus Flash fallback is emitted; for ``.mp3`` an
            ``<audio>`` element plus Flash fallback.
        """
        out = '''<html>
<head><title>MP3 Player</title></head>
<body>
<form name=play id=play method="get" action="/play">
<input type="text" name="playfile" value="{}">
<input type=submit>
</form>
<hr>'''.format(_htmlEscape(podFile))
        # Only display this part if a filename was passed in
        if podFile:
            # Resolve CDN redirects for web URLs only; the value comes from the
            # query string, so other schemes are never opened by the server.
            redirPodFile = podFile
            if urlparse(podFile).scheme in ('http', 'https'):
                try:
                    redirPodFile = getRedirectedUrl(podFile)
                except Exception:
                    # Best effort: keep the unresolved URL if the lookup fails.
                    redirPodFile = podFile
            safeRedir = _htmlEscape(redirPodFile)
            lowered = podFile.lower()
            if '.mp4' in lowered or '.m4v' in lowered:
                # Flowplayer expects valid JSON after "config=".
                flashplayerConfig = 'config=' + json.dumps(
                    {"clip": {"url": redirPodFile, "autoPlay": False, "autoBuffering": True}})
                # NOTE(review): 'wiidth'/'heiight' below are kept as found; they look
                # like misspellings of width/height - confirm whether intentional.
                out += '''
<video controls='controls'>
<source src='{}' type='video/mp4' />
</video>
<hr>
<object type='application/x-shockwave-flash' data='http://releases.flowplayer.org/swf/flowplayer-3.2.1.swf' wiidth='480' heiight='270'>
<param name='movie' value='http://releases.flowplayer.org/swf/flowplayer-3.2.1.swf' />
<param name='allowFullScreen' value='true' />
<param name='wmode' value='transparent' />
<param name='flashvars' value='{}' />
</object>'''.format(safeRedir, _htmlEscape(flashplayerConfig))
            elif '.mp3' in lowered:
                out += '''
<audio controls='controls'>
<source src='{}' type='audio/mpeg' />
</audio>
<hr>
<embed src='http://tinyurl.com/dropittt/mediaplayer.swf' type='application/x-shockwave-flash' flashvars='file={}&autostart=false' allowscriptaccess='none' height='20' width='200'>
</embed>'''.format(safeRedir, safeRedir)
            out += '''
<hr><a href="{}">Link to file</a> '''.format(_htmlEscape(podFile))
        out += '''</body>
</html>'''
        return out

    def podHtmlOutput(self, podUrl, content):
        """Return the main page: feed list, URL form and the selected feed's items.

        Args:
            podUrl: Currently selected feed URL, or ``''``.
            content: Pre-rendered HTML for the selected feed (from
                ``getPodHTML``), inserted as-is.

        Returns:
            A complete HTML document.
        """
        podList = getPodList()
        podListHtml = "\n".join(
            '<big><a href="/pod?pod={}">{}</a></big> ||  '.format(
                urllib.quote(pod[1]), _htmlEscape(pod[0]))
            for pod in podList)
        # Get the pod's title if we have it in our database. In the future this
        # should be pulled from the RSS feed.
        title = next((pod[0] for pod in podList if pod[1] == podUrl), '')
        output = '''<html>
<head><title>Podcaster Extrctor - {}</title></head>
<body>
{}
<form action="/pod">
POD URL: <input size="100" name="pod" value="{}">
<input type="submit">
</form>
<hr>
{}
<hr>
{}
</body>
</html>'''.format(_htmlEscape(title), self.getTopNav(), _htmlEscape(podUrl), podListHtml, content)
        # TODO: add an OPML import form (a text input named "pod" plus a hidden
        # opml=yes field, posting to /pod) once OPML loading is implemented.
        return output

    def writeOut(self, content):
        """Send *content* to the browser as a 200 text/html response."""
        self.send_response(200)
        self.send_header('Content-type', 'text/html')
        self.end_headers()
        self.wfile.write(content)

    # This method is defined in SimpleHTTPServer to simply return a list of files, or the
    # file specified. We override it to handle the cases specific to our program.
    def do_GET(self):
        """Dispatch a GET request to the matching page based on the path prefix."""
        # Turn the query string into a dict of lists, so
        # http://localhost:8080?q=1&r=ham becomes {"q":["1"],"r":["ham"]}
        # http://stackoverflow.com/questions/8928730/
        queryDict = parse_qs(urlparse(self.path).query)

        if self.path.startswith('/play'):
            # Media player case - displays media players for mp3 and mp4 files
            playfile = queryDict.get('playfile', [''])[0]
            self.writeOut(self.getPlayerHTML(playfile))
            return

        if self.path.startswith('/csv'):
            # Mostly for debugging - outputs the csv file (or an error).
            # This is a back door; no visible link points here.
            try:
                with open('podlist.csv', 'rb') as csvFile:
                    csvContent = csvFile.read()
            except (IOError, OSError):
                csvContent = "Error loading CSV file"
            self.writeOut("<pre>{}</pre>".format(_htmlEscape(csvContent)))
            return

        if self.path.startswith('/sample'):
            # Another debugging call - shows various server variables and the query dictionary
            out = '''self.path = {} <br> self.raw_requestline = {} <br> self.client_address = {}
<br> queryDict = {}'''.format(
                _htmlEscape(str(self.path)), _htmlEscape(str(self.raw_requestline)),
                _htmlEscape(str(self.client_address)), _htmlEscape(str(queryDict)))
            self.writeOut(out)
            return

        if self.path.startswith('/csharp'):
            # Hidden call: default SimpleHTTPServer behavior (directory listing
            # and file serving) for a folder named csharp.
            return SimpleHTTPServer.SimpleHTTPRequestHandler.do_GET(self)

        # Default/catch-all case - the entry point of the web application.
        # Shows all podcasts and, if a pod variable is in the query, loads it.
        podUrl = podContent = ''
        if 'pod' in queryDict:
            podUrl = queryDict['pod'][0]
            podContent = self.getPodHTML(podUrl)
        self.writeOut(self.podHtmlOutput(podUrl, podContent))
        return
# Custom TCP Server class - used to prevent "socket in use" errors after the program has exited
# Apparently there is a timeout that has to elapse before you can reuse the same socket. This overrides that.
# [Python: Binding Socket: "Address already in use" - Stack Overflow]
# http://stackoverflow.com/questions/6380057/python-binding-socket-address-already-in-use
class FeedsTCPServer(SocketServer.TCPServer):
    """TCP server that may rebind its port immediately after a restart.

    Setting ``allow_reuse_address`` makes the stock ``server_bind`` apply
    ``SO_REUSEADDR`` before binding, and keeps the rest of the base-class
    bind logic (such as refreshing ``server_address``) intact.
    """

    allow_reuse_address = True
# Function that starts the web server using our special class as the handler
def startWebServer(port = 8080):
    """Run the podcast web server until interrupted with CTRL-C.

    Args:
        port: TCP port to listen on, on all interfaces. Defaults to 8080.
    """
    httpd = FeedsTCPServer(('', port), FeedsHandler)  # create the server
    try:
        print("Web server started serving at http://this.server.name:" + str(port)
              + " - stop server with CTRL-C")
        httpd.serve_forever()  # run until stopped
    except KeyboardInterrupt:
        print("\nShutting down web server")
    finally:
        # Release the listening socket on every exit path, not only CTRL-C.
        httpd.server_close()
# Keeping the program's top-level logic inside main() means nothing runs when this
# file is imported by another script; the guard at the bottom calls main() only
# when the file is launched directly.
def main():
    """Show the main menu repeatedly until the user enters "0"."""
    prompt = "\n\n *** MAIN MENU: 1] Show Podcast feeds 2] Start Web Server 0] Quit --> "
    while True:
        choice = raw_input(prompt)
        if choice == "0":
            break
        if choice == "1":
            showPodFeeds()
        elif choice == "2":
            startWebServer()


# Run main() only if this script was the one launched.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement