Don't like ads? PRO users don't see any ads ;-)
Guest

Untitled

By: a guest on Jun 11th, 2012  |  syntax: None  |  size: 11.33 KB  |  hits: 13  |  expires: Never
download  |  raw  |  embed  |  report abuse  |  print
Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
  1. #!/usr/bin/env python
  2.  
  3. # Standard
  4. import xml.parsers.expat
  5. import os
  6. import sys
  7.  
  8. # Mine
  9. from nflx import Api
  10. from collectors import Producer
  11. from settings import SETTINGS
  12.  
# Default path of the index XML file, used as the default ``filename``
# argument of FileIndexProvider. The SETTINGS key names suggest that
# 'index_xml' is the full index and 'index_xml_test' a test fixture --
# TODO confirm against the settings module.
# FILENAME = SETTINGS['index_xml']
FILENAME = SETTINGS['index_xml_test']
  15.  
  16.  
  17. class IndexProvider(object):
  18.    """ Interface for objects which provide access to a NFLX index """
  19.  
  20.    def read(self):
  21.       pass
  22.    def size(self):
  23.       pass
  24.    def close(self):
  25.       pass
  26.  
  27.  
  28. class ApiIndexProvider(IndexProvider):
  29.    """ Provides direct access to an HTTP response containing the NFLX index """
  30.  
  31.    def __init__(self):
  32.       api = nflx.Api()
  33.       self.file = api.catalog_index()
  34.       self.size = self.file.info().getheader('Content-Length').strip()
  35.       self.size = int(self.size)
  36.    def read(self, *args):
  37.       return self.file.read(*args)
  38.    def size(self):
  39.       return self.size
  40.  
  41.  
  42. class FileIndexProvider(IndexProvider):
  43.    """ Provides access to an XML file containing the NFLX index """
  44.  
  45.    def __init__(self, filename=FILENAME):
  46.       self.file = open(filename)
  47.    def read(self, *args):
  48.       return self.file.read(*args)
  49.    def size(self):
  50.       return os.stat(FILENAME).st_size
  51.    def close(self):
  52.       self.file.close()
  53.  
  54.  
  55. class Link(object):
  56.    """
  57.    Represents a <link> element in the Netflix index (which points to people
  58.    or webpages)
  59.    """
  60.  
  61.    # The rel attribute of <link> XML elements in the NFLX index contains URIs
  62.    # indicating the 'type' of the link.
  63.    types = {
  64.       'http://schemas.netflix.com/catalog/person.actor': 'actor',
  65.       'http://schemas.netflix.com/catalog/person.director': 'director',
  66.       'alternate': 'webpage'}
  67.    
  68.    @staticmethod
  69.    def get_type(rel):
  70.       return Link.types.get(rel)
  71.  
  72.    def __init__(self, Tag=None):
  73.       self.href  = Tag.attrs['href']
  74.       self.rel   = Tag.attrs['rel']
  75.       self.title = Tag.attrs['title']
  76.       self.type  = Link.get_type(self.rel)
  77.  
  78.    def __str__(self):
  79.       return self.__repr__()
  80.  
  81.  
  82. class Person(Link):
  83.    def __repr__(self):
  84.       return '<Person>: ' + self.title
  85.  
  86.  
  87. class WebPage(Link):
  88.    def __repr__(self):
  89.       return '<WebPage>: ' + self.href
  90.  
  91.  
  92. class LinkFactory(object):
  93.    """
  94.    <link> elements in the NFLX index can represent people or webpages.
  95.  
  96.    When a <link> element is encountered, it can be passed to this factory to
  97.    produce the Link object of the correct subtype
  98.    """
  99.  
  100.    @staticmethod
  101.    def get_link(tag):
  102.       type = Link.get_type(tag.attrs['rel'])
  103.       if type in ['actor', 'director']:
  104.          return PERSON_REGISTRY.new(tag)
  105.       if type in ['webpage']:
  106.          return WebPage(tag)
  107.  
  108.  
  109. class Category(object):
  110.    schemes = {
  111.          'http://api.netflix.com/categories/genres/': 'genre'
  112.    }
  113.  
  114.    def __init__(self, tag):
  115.       self.scheme = tag.attrs['scheme']
  116.       self.label  = tag.attrs['label']
  117.       self.term   = tag.attrs['term']
  118.  
  119.    def __repr__(self):
  120.       cls = self.__class__.__name__
  121.       s = '<%s>: %s' % (cls,self.term)
  122.       return s.encode('utf-8')
  123.  
  124.  
  125. class Genre(Category):pass
  126.  
  127.  
  128. class Format(Category):
  129.    def __init__(self, tag, available_from):
  130.       self.available_from = available_from
  131.       super(Format, self).__init__(tag)
  132.  
  133.  
  134. class Registry(object):
  135.    """ This class is a trick to avoid creating duplicate objects. Objects that
  136.    are likely to be duplicated are stored in a registry, and attempts to create
  137.    new objects of the same type should first check the registry to make sure the
  138.    object does not already exist.
  139.    """
  140.    
  141.    def __init__(self, key, constructor):
  142.       self.index = {}
  143.       self.key = key
  144.       self.constructor = constructor
  145.  
  146.    def new(self, *args, **kwargs):
  147.       """ Create new object and see if an equivalent object is already in the
  148.       registry. If so, return registered object; otherwise, add new object to
  149.       registry and return it """
  150.  
  151.       obj = self.constructor(*args, **kwargs)
  152.       obj = self.get(obj)
  153.  
  154.       return obj
  155.  
  156.    def _hash(self, obj):
  157.       return getattr(obj, self.key)
  158.  
  159.    def get(self, obj):
  160.       hash = self._hash(obj)
  161.  
  162.       try:
  163.          return self.index[hash]
  164.       except KeyError:
  165.          self.index[hash] = obj
  166.          return obj
  167.  
  168.  
  169. class Title(object):
  170.    """ Represents a <title_index_item> in the NFLX index.
  171.    A title_index_item is basically a movie. """
  172.  
  173.    def __init__(self):
  174.       self.id = None
  175.       self.upc = None
  176.       self.type = None
  177.       self.title = None
  178.       self.release_year = None
  179.       self.updated = None
  180.       self.formats = []
  181.       self.upcs = []
  182.       self.actors = []
  183.       self.genres = []
  184.       self.formats = []
  185.       self.directors = []
  186.       self.webpages = []
  187.  
  188.       # Can this be watched online?
  189.       self._is_instant = False
  190.  
  191.    def add(self, path):
  192.       """ Adds an arbitrary xml tag to this title. This will look for internal
  193.       methods named tag_[x] where [x] matches the name of the tag on top of the
  194.       path .
  195.       """
  196.  
  197.       try:
  198.          method_name = 'add_' + path[-1].name
  199.          adder = self.__getattribute__(method_name)
  200.       except AttributeError:
  201.          return
  202.      
  203.       adder(path)
  204.  
  205.    def is_instant(self):
  206.       return self._is_instant
  207.  
  208.    def add_id(self, path):
  209.       tag = path[-1]
  210.       rel = tag.attrs.get('rel')
  211.       if rel:
  212.          if rel == 'http://schemas.netflix.com/catalog/id.upc':
  213.             self.upc = tag.cdata
  214.       else:
  215.          self.id = tag.cdata
  216.          self.type = self.id.split('/')[-2]
  217.  
  218.    def add_release_year(self, path):
  219.       """ Ex: <release_year>1998</release_year> """
  220.  
  221.       self.release_year = path[-1].cdata
  222.  
  223.    def add_title(self, path):
  224.       """Ex: <title>Forrest Gump</title> """
  225.  
  226.       self.title = path[-1].cdata
  227.  
  228.    def add_updated(self, path):
  229.       """Ex: <updated>1233161823</updated> """
  230.  
  231.       self.updated = int(path[-1].cdata)
  232.  
  233.    def add_category(self, path):
  234.       """
  235.          <category> tags can appear under <title> tags OR
  236.          under <availability> tags.
  237.          
  238.          Also, about half of them have a status='deprecated' attribute and
  239.          should be ignored. Fun. Thanks for that, NFLX.
  240.       """
  241.       tag    = path[-1]
  242.       parent = path[-2]
  243.  
  244.       if tag.attrs.get('status','') != 'deprecated':
  245.          if parent.name == 'availability':
  246.             available_from = parent.attrs.get('available_from', 0)
  247.            
  248.             # format = Format(tag, available_from)
  249.             format = FORMAT_REGISTRY.new(tag, available_from)
  250.             self.formats.append(format)
  251.             if format.term == 'instant':
  252.                self._is_instant=True
  253.          else:
  254.             genre = GENRE_REGISTRY.new(tag)
  255.             self.genres.append(genre)
  256.  
  257.    def add_link(self, path):
  258.       """ Adds a link to this title.
  259.       A link represents an actor, director, or webpage
  260.  
  261.       Ex: <link href="[uri]" rel="../person.actor" title="Tom Hanks" />
  262.       """
  263.  
  264.       link = LinkFactory.get_link(path[-1])
  265.  
  266.       if isinstance(link, Person):
  267.          if link.type == 'director':
  268.             self.directors.append(link)
  269.          elif link.type == 'actor':
  270.             self.actors.append(link)
  271.  
  272.       elif isinstance(link, WebPage):
  273.          self.webpages.append(link)
  274.  
  275.    
  276. class Tag(object):
  277.    """ Represents an XML tag in the NFLX index """
  278.  
  279.    def __init__(self, name, attrs):
  280.       self.name  = name
  281.       self.attrs = attrs
  282.       self.cdata = ''
  283.  
  284.  
  285. class TitleParser(Producer):
  286.    """ Parses the NFLX XML title index into an internal object structure.
  287.    
  288.    Internally, this uses an expat parser to efficiently parse the (huge) index
  289.    file. While parsing the index file, this object keeps a stack of all tags
  290.    currently in context, with the direct parent at the top of the stack, and the
  291.    root element of the entire file at the bottom.
  292.  
  293.    For example, an XML snippet and it's corresponding stack looks like this:
  294.  
  295.       Stack:
  296.          [external_ids] <-- Push/Pop operations happen here.
  297.          [title]
  298.          [title_index_item]
  299.          [catalog_title_index]
  300.  
  301.       XML:
  302.          <catalog_title_index>
  303.             <title_index_item>
  304.                <title>
  305.                   <external_ids>
  306.                      <!-- Parser is here -->
  307.                   </external_ids>
  308.                </title>
  309.             </title_index_item>
  310.          </cataglog_title_index>
  311.  
  312.    When <title> elements are encountered, a Title object is created, and nested
  313.    tags are passed to this Title object along with the parser context stack.
  314.    """
  315.  
  316.    def __init__(self, index_provider=None, max_titles=None):
  317.       if not index_provider:
  318.          raise Exception("You MUST provide an IndexProvider argument to this "
  319.                          "constructor (for now)")
  320.       else:
  321.          self.file = index_provider
  322.  
  323.       # Set up internal expat parser
  324.       self.parser = xml.parsers.expat.ParserCreate()
  325.       self.parser.StartElementHandler = self.start_element_handler
  326.       self.parser.EndElementHandler = self.end_element_handler
  327.       self.parser.CharacterDataHandler = self.char_data_handler
  328.  
  329.       # Set initial state
  330.       self.ranges=[]
  331.       self.title = None
  332.       self.current_tag = None
  333.       self.tag_context = []
  334.       self.percent_done = -1
  335.       self.num_titles = 0
  336.       self.max_titles = max_titles or 0
  337.  
  338.       self.machines = []
  339.  
  340.    def run(self):
  341.       """
  342.       Fires up the expat parser and begins parsing the NFLX index.
  343.       Overrides Producer.run()
  344.       """
  345.  
  346.       try:
  347.          self.parser.ParseFile(self.file)
  348.          print '\rParsing XML index (100%)'
  349.       except ValueError, e:
  350.          # HACK: Using an exception caused by performing IO on a closed file to
  351.          # stop generating titles after self.max_titles titles have been produced
  352.          print "\rTERMINATING: max_titles reached (%d titles)" % self.num_titles
  353.       finally:
  354.          self.complete()
  355.  
  356.    def start_element_handler(self, name, attrs):
  357.       self.tag_context += [Tag(name, attrs)]
  358.       if name == 'title_index_item':
  359.          self.current_title = Title()
  360.  
  361.    def end_element_handler(self, name):
  362.       """ Called when expat finishes processing a tag."""
  363.  
  364.       # This takes a while; update user on our progress.
  365.       percent_done = (self.parser.CurrentByteIndex * 100) / self.file.size()
  366.       if percent_done != self.percent_done:
  367.          progress_string = "\rParsing XML index (%d%%)" % percent_done
  368.          sys.stdout.write(progress_string)
  369.          sys.stdout.flush()
  370.          self.percent_done = percent_done
  371.  
  372.       # We just finished parsing a title
  373.       if name == 'title_index_item':
  374.          for machine in self.machines:
  375.             machine.send(self.current_title)
  376.  
  377.          # Housekeeping...
  378.          self.current_title = None
  379.          self.num_titles += 1
  380.  
  381.          if self.max_titles and self.num_titles == self.max_titles:
  382.             self.file.close()
  383.  
  384.       # If we finished parsing any other tag, try to add it to the <title> in
  385.       # context
  386.       elif self.current_title:
  387.          self.current_title.add(self.tag_context)
  388.      
  389.       self.tag_context.pop()
  390.  
  391.    def char_data_handler(self, cdata):
  392.       self.tag_stack[-1].add_cdata(cdata)
  393.  
  394.  
  395. # Registries
  396. GENRE_REGISTRY  = Registry(key='term', constructor=Genre)
  397. FORMAT_REGISTRY = Registry(key='term', constructor=Format)
  398. PERSON_REGISTRY = Registry(key='title', constructor=Person)