Advertisement
dougllio

pods.py

May 22nd, 2014
218
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 17.52 KB | None | 0 0
  1. ###############################################
  2. ## Program that loads a list of Podcasts and their associated XML files, extracts the podcast file data and presents
  3. ## it to the user both in a terminal application and on a web page. Web page also allows the podcast to be played
  4. ## embedded in the browser. Imports a host of helper modules based on code I've found from all over the web.
  5. ## Only non-standard Python library is: http://labix.org/python-dateutil and possibly http://lxml.de/
  6. ###############################################
  7.  
  8. import csv, urllib, urllib2, re, socket, SimpleHTTPServer, SocketServer, json
  9. from dateutil import parser
  10. from urlparse import urlparse, parse_qs
  11. try:
  12.     from lxml import etree
  13. except ImportError:
  14.     try:
  15.         import xml.etree.cElementTree as etree
  16.     except ImportError:
  17.         import  xml.etree.ElementTree as etree
  18.  
  19. # Function to format the date in a standard way
  20. def dtfmt(date):
  21.     return date.strftime("%a, %m/%d/%Y %H:%M:%S") + " GMT"
  22.  
  23. # Function to fail silently on int parsing    
  24. def safeIntParse(i):
  25.     try:
  26.         return int(i)
  27.     except:
  28.         pass
  29.     return -1
  30.  
  31. # Function to read in OPML file - incomplete
  32. def getOPML(opml):
  33.     f = urllib2.urlopen(opml)
  34.     r = f.read()
  35.     opmlTree = etree.fromstring(r)
  36.     pods = [(item.get('text'),item.get('xmlUrl')) for item in opmlTree.findall('outline')]
  37.     print pods
  38.  
  39. # Function to retrieve XML given a podcast URL and return a list of podcast tuple items.
  40. def getXML(podUrl):
  41.     # If the podcast is an iTunes link, use the web service PickleMonkey.net to convert it into a proper RSS feed
  42.     # Uses urllib to encode the URL string, urllib2 to get the data, and json to load the data into a dictionary.
  43.     if podUrl.find("://itunes.apple.com") != -1:
  44.         try:
  45.             podiTunes = urllib2.urlopen("http://picklemonkey.net/flipper/extractor.php?feed=" + urllib.quote_plus(podUrl))
  46.             podJson = json.load(podiTunes)
  47.             podUrl = podJson["podcast"]
  48.         except Exception as e:
  49.             print "Error getting iTunes URL data:",str(e)
  50.             return None
  51.  
  52.     # Get the pod XML file and load it into an XML ElementTree object (https://docs.python.org/2/library/xml.etree.elementtree.html)
  53.     # using the xml (or lxml) module. The XML should be in the form:
  54.     # <rss>
  55.     #   <channel>
  56.     #     <item>
  57.     #       <title>Title</title>
  58.     #       <pubDate>Mon, 20 Apr 2009 11:47:00 -0400</pubDate>
  59.     #       <link>http://link.to/file</link>
  60.     #       <enclosure url="http://link.to/file" length="1200000" />
  61.     #     </item>
  62.     #   </channel>
  63.     # </rss>
  64.     podXml = urllib2.urlopen(podUrl.replace(' ','%20')).read()
  65.     podXmlTree = etree.fromstring(podXml)
  66.     #podXmlTree = etree.parse('file.xml') # code to load XML from a file
  67.  
  68.     podItems = []
  69.     #podHeader = "<a href='{}'>{}</a> - {}".format(podXmlTree.find('link').text,podXmlTree.find('title').text,podXmlTree.find('description').text) # not working yet
  70.  
  71.     # Loop through the XML using xpath (the 'channel/item' bit) and grab all the channel items
  72.     # under the root element rss.
  73.     for item in podXmlTree.findall('channel/item'):
  74.         # Grab the text of these elements under item
  75.         # Strip out any Unicode characters from the title
  76.         # [Convert Unicode to a string in Python (containing extra symbols) - Stack Overflow]
  77.         # http://stackoverflow.com/questions/1207457/convert-unicode-to-a-string-in-python-containing-extra-symbols
  78.         title = item.find('title').text.encode('ascii','replace')
  79.         pubdate = item.find('pubDate').text
  80.  
  81.         # Convert that date into a datetime object in case we want to sort later
  82.         # [parsing date string in python (convert string to date) - Stack Overflow]
  83.         # http://stackoverflow.com/questions/10985312/parsing-date-string-in-python-convert-string-to-date
  84.         parsedPubdate = parser.parse(pubdate)
  85.  
  86.         # The url of the media will either be an attribute of the enclosure element
  87.         # or the link element text
  88.         enclosureurl = length = ""
  89.         try:
  90.             enclosure = item.find('enclosure')
  91.             enclosureurl = enclosure.get('url')
  92.             lengthInt = safeIntParse(enclosure.get('length'))
  93.             if lengthInt > 0:
  94.                 length = str(lengthInt/1000000) + " MB"
  95.         except:
  96.             enclosureurl = pubdate = item.find('link').text
  97.  
  98.         # Add to our list of tuples
  99.         podItems.append((title, parsedPubdate, enclosureurl, length))
  100.  
  101.     return podItems
  102.  
  103. # Function to return the final redirected URL. Some pod hosts will store their files with a
  104. # content delivery network and as such the url is dynamically redirected. We use this to get the
  105. # final location as the HTML video player sometimes can't handle it otherwise
  106. # [How to get the URL of a redirect with Python - Stack Overflow]
  107. # http://stackoverflow.com/questions/4902523/how-to-get-the-url-of-a-redirect-with-python
  108. def getRedirectedUrl(url):
  109.     req = urllib2.Request(url)
  110.     res = urllib2.urlopen(req)
  111.     return res.geturl()
  112.  
  113. # Function to retrieve the list of pod files from a csv file. If the file cannot be found,
  114. # an error is shown and a sample of test data will be substituted
  115. def getPodList():
  116.     podList = [] # List matrix
  117.     try:
  118.         # Using csv module to read in our data as a list matrix.
  119.         # csv can do lots more powerful operations on these types of data files.
  120.         with open('podlist.csv', 'rb') as csvFile:
  121.             podReader = csv.reader(csvFile)
  122.             podList = [pod for pod in podReader]
  123.     except:
  124.         print "There was an error loading podlist.csv. Using test data until problem is corrected"
  125.         testdata = '''CNET Vods,http://feeds2.feedburner.com/allcnetvideopodcasts
  126. CNET UK Podcast,http://www.cnet.co.uk/feeds/podcasts/
  127. This Week In Tech,http://feeds.twit.tv/twit
  128. The Bugle,http://feeds.feedburner.com/thebuglefeed
  129. WNYC RadioLab,http://feeds.wnyc.org/radiolab
  130. NPR Planet Money,http://www.npr.org/rss/podcast.php?id=510289
  131. NFL Network M&M Podcast,http://nohuddle.libsyn.com/rss
  132. Football Weekly,http://www.guardian.co.uk/football/series/footballweekly/podcast.xml
  133. Star Talk Radio,http://feeds.soundcloud.com/users/soundcloud:users:38128127/sounds.rss'''
  134.         podList = [pod.split(",") for pod in testdata.splitlines()]
  135.  
  136.     return podList
  137.  
  138. # Basic function for the console app to show the list of feeds, and allow the user to
  139. # show the files from any of the podcast feeds.
  140. def showPodFeeds():
  141.     print
  142.     print
  143.     podList = getPodList()
  144.     for num,pod in enumerate(podList): # use enumerate to create a menu option for each pod
  145.         print("{:3} {}").format(num,pod[0])
  146.  
  147.     # Get the input of the number corresponding to the pod feed they want to see.
  148.     # Use while loop to ensure that the data is valid before proceeding
  149.     subchoice = ""
  150.     while not subchoice.isdigit() or not ( 0 <= int(subchoice) < len(podList) ):
  151.         subchoice = raw_input ("\n\nWhich podcast feed would you like to see? --> ")
  152.     print
  153.     print
  154.  
  155.     # Print the data with formatting
  156.     podItems = getXML(podList[int(subchoice)][1])
  157.     print "{:40} {:30} {:10} {}".format("Pod Name","Pub Date","Length","Pod File URL")
  158.     for item in podItems:
  159.         print "{:40} {:30} {:10} {}".format(item[0], dtfmt(item[1]), item[3], item[2])
  160.  
  161.  
  162.  
  163. # Custom web server handler class
  164. # Python has a few web server modules as part of the standard library that will just serve
  165. # files in current directory. We will use/extend the SimpleHTTPServer class and intercept
  166. # the GET requests in our own function that can be used to show our dynamic content
  167. # from: [Blended Technologies >> Blog Archive >> Python Trick: Really Simple HTTP Server]
  168. # http://www.blendedtechnologies.com/python-trick-really-little-http-server/220
  169.  
  170. class FeedsHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
  171.  
  172.     # Top navigation bar TODO:
  173.     def getTopNav(self):
  174.         return '''<h1><a href="/pod">Podcaster Extractor</a></h1>'''
  175.  
  176.     # Function return display HTML for the given podcast feed
  177.     # Displaying using a list index, though we may want to transition
  178.     # to this method of dictionary interpolation formatting:
  179.     # http://stackoverflow.com/questions/1304025/python-string-interpolation-using-dictionary-and-strings
  180.     def getPodHTML(self, podUrl):
  181.         out = ''
  182.         try:
  183.             podItems = getXML(podUrl)
  184.             for item in podItems:
  185.                 out += '''<p>
  186. (<a href='{}'>Download</a>)
  187. {} {}
  188. <a target='player' href='/play?playfile={}'>{}</a></p>'''.format(item[2], dtfmt(item[1]), item[3], item[2], "{} ({})".format(item[0],item[2].split("/")[-1]))
  189.         except Exception as e:
  190.             out = "<p>Error loading XML from: {}</p><p>{}</p>".format(podUrl,str(e))
  191.         return out
  192.  
  193.     # Function to display HTML for the media player
  194.     # Having some trouble with the Flash based MP4 player
  195.     def getPlayerHTML(self, podFile):
  196.         out = '''<html>
  197. <head><title>MP3 Player</title></head>
  198. <body>
  199. <form name=play id=play method="get" action="/play">
  200.     <input type="text" name="playfile" value="{}">
  201.     <input type=submit>
  202. </form>
  203. <hr>'''.format(podFile)
  204.  
  205.         # Only display this part if a filename was passed in
  206.         if podFile:
  207.             redirPodFile = getRedirectedUrl(podFile)
  208.             if podFile.lower().find('.mp4') != -1 or podFile.lower().find('.m4v') != -1:
  209.                 flashplayerConfig = 'config={"clip"{"url":"'+redirPodFile+'","autoPlay":false,"autoBuffering":true}}'
  210.                 out += '''
  211. <video controls='controls'>
  212.     <source src='{}' type='video/mp4' />
  213. </video>
  214. <hr>
  215. <object type='application/x-shockwave-flash' data='http://releases.flowplayer.org/swf/flowplayer-3.2.1.swf' wiidth='480' heiight='270'>
  216.         <param name='movie' value='http://releases.flowplayer.org/swf/flowplayer-3.2.1.swf' />
  217.         <param name='allowFullScreen' value='true' />
  218.         <param name='wmode' value='transparent' />
  219.         <param name='flashvars' value='{}' />
  220. </object>'''.format(redirPodFile,flashplayerConfig)
  221.             elif podFile.lower().find('.mp3') != -1:
  222.                 out += '''
  223. <audio controls='controls'>
  224.     <source src='{}' type='audio/mpeg' />
  225. </audio>
  226. <hr>
  227. <embed src='http://tinyurl.com/dropittt/mediaplayer.swf' type='application/x-shockwave-flash' flashvars='file={}&autostart=false' allowscriptaccess='none' height='20' width='200'>
  228. </embed>'''.format(redirPodFile,redirPodFile)
  229.  
  230.             out += '''
  231. <hr><a href="{}">Link to file</a> '''.format(podFile)
  232.         out += '''</body>
  233. </html>'''
  234.         return out
  235.  
  236.     # Function to generate the HTML for the list of podcasts, a form to enter a pod, and, if a
  237.     # pod was selected, the items in that pod's RSS feed
  238.     def podHtmlOutput(self, podUrl, content):
  239.         podList = getPodList()
  240.         podListHtml = "\n".join(['<big><a href="/pod?pod={}">{}</a></big> &nbsp; || &nbsp'.format(urllib.quote(pod[1]),pod[0]) for pod in podList])
  241.  
  242.         # Get the pod's title if we have it in our database. In the future this should be pulled
  243.         # from the RSS feed.
  244.         try:
  245.             title=next((pod[0] for pod in podList if pod[1]==podUrl))
  246.         except StopIteration:
  247.             title=''
  248.            
  249.         output = '''<html>
  250.    <head><title>Podcaster Extrctor - {}</title></head>
  251.    <body>
  252.        {}
  253.        <form action="/pod">
  254.            POD URL: <input size="100" name="pod" value="{}">
  255.            <input type="submit">
  256.        </form>
  257.        <hr>
  258.        {}
  259.        <hr>
  260.        {}
  261.    </body>
  262. </html>'''.format(title, self.getTopNav(), podUrl, podListHtml, content)
  263.        
  264.         '''<form action="/pod">
  265. OPML URL: <input size="100" name="pod" value="'.$pod.'"><input type="hidden" name="opml" value="yes"><input type="submit"></form>'''
  266.  
  267.         return output
  268.  
  269.     # Decided to write a common-case function for returning data from the web server to the browser
  270.     # It sends back the 200 success code, and the mime type header then the content
  271.     def writeOut(self,content):
  272.         self.send_response(200)
  273.         self.send_header('Content-type','text/html')
  274.         self.end_headers()
  275.         self.wfile.write(content)
  276.        
  277.  
  278.     # This method is defined in SompleHTTPServer to simply return a list of files, or the file specified
  279.     # We will override this to handle the cases specific to our program.
  280.     def do_GET(self):
  281.  
  282.         # Using urlparse module to help us get the items in the querystring that came from links/the form
  283.         # This code parses the self.path variable set in SimpleHTTPServer and turns the querystring
  284.         # into a dictionary object. so http://localhost:8080?q=1&r=ham becomes {"q":[1],"r":["ham"]}
  285.         # [Processing HTTP GET input parameter on server side in python - Stack Overflow]
  286.         # http://stackoverflow.com/questions/8928730/processing-http-get-input-parameter-on-server-side-in-python
  287.         queryDict = parse_qs(urlparse(self.path).query)
  288.  
  289.         # This is the main decision point for deciding what gets done based on the URL passed.
  290.         # In any other language I would have used a switch statement here. There is a "pythonic" way
  291.         # to do this - to create a dictionary object where the case condition is the key and the function
  292.         # name is the value. Source: [Switch-case statement in Python | The ByteBaker]
  293.         # http://bytebaker.com/2008/11/03/switch-case-statement-in-python/
  294.  
  295.         # Media player case - this will display media players for mp3 and mp4 files
  296.         # using the filename that was passed in
  297.         if self.path.startswith('/play'):
  298.             playfile = ''
  299.             if 'playfile' in queryDict:
  300.               playfile = queryDict['playfile'][0]
  301.             out = self.getPlayerHTML(playfile)
  302.             self.writeOut(out)
  303.             return
  304.        
  305.         # Mostly for debugging - will output the csv file (or an error) to the browser
  306.         # This is more of a back door. There is no visible link to this URL from the web app
  307.         elif self.path.startswith('/csv'):
  308.             csvContent = ''
  309.             try:
  310.                 with open('podlist.csv', 'rb') as csvFile:
  311.                     csvContent = csvFile.read()
  312.             except:
  313.                 csvContent = "Error loading CSV file"
  314.             self.writeOut("<pre>{}</pre>".format(csvContent))
  315.             return
  316.  
  317.         # Another debugging call - shows verious server variables and the query dictionary
  318.         elif self.path.startswith('/sample'):
  319.             out = '''self.path = {} <br> self.raw_requestline = {} <br> self.client_address = {}
  320. <br> queryDict = {}'''.format(self.path, self.raw_requestline, self.client_address, queryDict)
  321.             self.writeOut(out)
  322.             return
  323.  
  324.         # Yet another hidden call. If you have a folder named csharp, this will show the default behavior
  325.         # of the SimpleHTTPServer - just shows a list of files in the directory, and the file if clicked
  326.         # if we have HTML files here they will be rendered by the browser.
  327.         elif self.path.startswith('/csharp'):
  328.             #serve files, and directory listings by following self.path from
  329.             #this directory
  330.             return SimpleHTTPServer.SimpleHTTPRequestHandler.do_GET(self)
  331.  
  332.         # The default/catch-all case - essentially the entry point of the web application
  333.         # Here we will display a list of all the podcasts and, if there is a pod variable in
  334.         # the query, it will load that podcast.
  335.         else:
  336.             podUrl = podContent = ''
  337.             if 'pod' in queryDict:
  338.                 podUrl = queryDict['pod'][0]
  339.                 podContent = self.getPodHTML(podUrl)
  340.             out = self.podHtmlOutput(podUrl, podContent) #call pod listing function here
  341.             self.writeOut(out)
  342.             return
  343.  
  344. # Custom TCP Server class - used to prevent "socket in use" errors after the program has exited
  345. # Apparently there is a timeout that has to elapse before you can reuse the same socket. This overrides that.
  346. # [Python: Binding Socket: "Address already in use" - Stack Overflow]
  347. # http://stackoverflow.com/questions/6380057/python-binding-socket-address-already-in-use
  348. class FeedsTCPServer(SocketServer.TCPServer):
  349.     def server_bind(self):
  350.         self.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
  351.         self.socket.bind(self.server_address)
  352.  
  353.  
  354.  
  355. # Function that starts the web server using our special class as the handler
  356. def startWebServer(port = 8080): # defines which port the server will run on
  357.     httpd = FeedsTCPServer(('', port),FeedsHandler) # create the server
  358.     try:
  359.         print "Web server started serving at http://this.server.name:" + str(port), "- stop server with CTRL-C"
  360.         httpd.serve_forever() # run until stopped
  361.     except KeyboardInterrupt:
  362.         print "\nShutting down web server"
  363.         httpd.socket.close() # close the socket
  364.  
  365. # By putting the main code of the program in this function, we can create a way to prevent any code from being run
  366. # if this code is imported into some other python script. We check that and call main() below
  367. def main():
  368.     choice = ""
  369.     # This is another block that could be put into that Pythonic select/case style code
  370.     while choice <> "0" :
  371.         choice = raw_input ("\n\n *** MAIN MENU: 1] Show Podcast feeds     2] Start Web Server    0] Quit      --> ")
  372.         if choice == "1":
  373.             showPodFeeds()
  374.         if choice == "2":
  375.             startWebServer()
  376.            
  377. # This code will basically only run the main() function only if this script was the main script launched.
  378. if __name__ == "__main__":
  379.     main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement