- #!/usr/bin/env python
- # Standard
- import xml.parsers.expat
- import os
- import sys
- # Mine
- from nflx import Api
- from collectors import Producer
- from settings import SETTINGS
# Default path of the XML index file; FileIndexProvider uses it when no
# filename is passed.  The 'index_xml_test' setting is the active one; swap
# the two lines below to read the 'index_xml' setting instead.
# FILENAME = SETTINGS['index_xml']
FILENAME = SETTINGS['index_xml_test']
class IndexProvider(object):
    """Interface for objects which provide access to a NFLX index.

    Concrete providers override these methods.  The base implementations do
    nothing and return None, so a subclass that has nothing to release may
    simply inherit close().
    """

    def read(self):
        """Return index data; the base implementation returns None."""
        return None

    def size(self):
        """Return the size of the index; the base implementation returns None."""
        return None

    def close(self):
        """Release the underlying resource; the base implementation is a no-op."""
        return None
class ApiIndexProvider(IndexProvider):
    """Provides direct access to an HTTP response containing the NFLX index.

    The response is obtained from Api().catalog_index() and its size is taken
    from the response's Content-Length header.
    """

    def __init__(self):
        # Only the name `Api` is imported from the nflx module, so it must be
        # called directly (the module name `nflx` itself is not bound).
        api = Api()
        self.file = api.catalog_index()
        content_length = self.file.info().getheader('Content-Length')
        # Stored under a private name: assigning to `self.size` would shadow
        # the size() method and make `provider.size()` fail with
        # "'int' object is not callable".
        self._size = int(content_length.strip())

    def read(self, *args):
        """Read from the HTTP response, forwarding any arguments to it."""
        return self.file.read(*args)

    def size(self):
        """Return the index size in bytes, as reported by Content-Length."""
        return self._size
class FileIndexProvider(IndexProvider):
    """Provides access to an XML file containing the NFLX index."""

    def __init__(self, filename=FILENAME):
        """Open `filename` (the module-level FILENAME by default) for reading."""
        # Remember the path so size() reports on the file that was actually
        # opened rather than on the module default.
        self.filename = filename
        # Binary mode: TitleParser compares the parser's byte offset with
        # size(), which is a byte count, so the data must not be translated.
        self.file = open(filename, 'rb')

    def read(self, *args):
        """Read from the file, forwarding any arguments to file.read()."""
        return self.file.read(*args)

    def size(self):
        """Return the current size in bytes of the file this provider opened."""
        return os.stat(self.filename).st_size

    def close(self):
        """Close the underlying file."""
        self.file.close()
class Link(object):
    """
    Represents a <link> element in the Netflix index (which points to people
    or webpages).

    Instances copy the 'href', 'rel' and 'title' attributes of the tag they
    are built from and derive a short `type` name from 'rel'.
    """

    # The rel attribute of <link> XML elements in the NFLX index contains URIs
    # indicating the 'type' of the link.  This maps each known rel value to
    # the short type name used by the rest of this module.
    types = {
        'http://schemas.netflix.com/catalog/person.actor': 'actor',
        'http://schemas.netflix.com/catalog/person.director': 'director',
        'alternate': 'webpage',
    }

    @staticmethod
    def get_type(rel):
        """Return the short type name for `rel`, or None if it is unknown."""
        known_types = Link.types
        return known_types.get(rel)

    def __init__(self, Tag=None):
        """Build a link from a tag object exposing an `attrs` mapping.

        The mapping must contain 'href', 'rel' and 'title'; a missing key
        raises KeyError.
        """
        attributes = Tag.attrs
        self.href = attributes['href']
        self.rel = attributes['rel']
        self.title = attributes['title']
        self.type = Link.get_type(self.rel)

    def __str__(self):
        # str() and repr() are deliberately the same for links.
        return self.__repr__()
class Person(Link):
    """A Link whose representation shows its title."""

    def __repr__(self):
        prefix = '<Person>: '
        return prefix + self.title
class WebPage(Link):
    """A Link whose representation shows its href."""

    def __repr__(self):
        prefix = '<WebPage>: '
        return prefix + self.href
class LinkFactory(object):
    """
    <link> elements in the NFLX index can represent people or webpages.
    When a <link> element is encountered, it can be passed to this factory to
    produce the Link object of the correct subtype.
    """

    @staticmethod
    def get_link(tag):
        """Return the link object for `tag`, chosen by its 'rel' attribute.

        Actor and director links are obtained through PERSON_REGISTRY, webpage
        links are fresh WebPage objects, and any other rel value yields None.
        """
        link_kind = Link.get_type(tag.attrs['rel'])
        if link_kind == 'webpage':
            return WebPage(tag)
        if link_kind in ('actor', 'director'):
            return PERSON_REGISTRY.new(tag)
        # Unrecognised rel value: there is no link type to build.
        return None
class Category(object):
    """Holds the 'scheme', 'label' and 'term' attributes of a category tag."""

    # Known scheme URIs mapped to a short name.
    schemes = {
        'http://api.netflix.com/categories/genres/': 'genre',
    }

    def __init__(self, tag):
        """Copy 'scheme', 'label' and 'term' from `tag.attrs` (KeyError if absent)."""
        attributes = tag.attrs
        self.scheme = attributes['scheme']
        self.label = attributes['label']
        self.term = attributes['term']

    def __repr__(self):
        # The concrete class name is used so subclasses label themselves.
        text = '<%s>: %s' % (type(self).__name__, self.term)
        # The representation is returned UTF-8 encoded.
        return text.encode('utf-8')
class Genre(Category):
    """A Category with no additional state or behaviour."""
class Format(Category):
    """A Category that also records an `available_from` value."""

    def __init__(self, tag, available_from):
        """Initialise the category fields from `tag` and store `available_from`."""
        super(Format, self).__init__(tag)
        self.available_from = available_from
class Registry(object):
    """Avoids keeping duplicate objects.

    Objects that are likely to be duplicated are stored in a registry, keyed
    by the value of one of their attributes.  Code that needs such an object
    asks the registry for it, and receives the already-registered object when
    one with the same key value exists.
    """

    def __init__(self, key, constructor):
        """Create an empty registry.

        key         -- name of the attribute whose value identifies an object
        constructor -- callable used by new() to build candidate objects
        """
        self.index = {}
        self.key = key
        self.constructor = constructor

    def new(self, *args, **kwargs):
        """Build an object with the constructor and return its registered twin.

        The object is always constructed first; if an object with the same key
        value is already registered, that earlier object is returned and the
        fresh one is discarded.  Otherwise the fresh one is registered and
        returned.
        """
        candidate = self.constructor(*args, **kwargs)
        return self.get(candidate)

    def _hash(self, obj):
        """Return the value of the registry's key attribute on `obj`."""
        return getattr(obj, self.key)

    def get(self, obj):
        """Return the registered object equivalent to `obj`, registering `obj`
        itself when no equivalent exists yet."""
        return self.index.setdefault(self._hash(obj), obj)
class Title(object):
    """Represents a <title_index_item> in the NFLX index.

    A title_index_item is basically a movie.  A Title starts out empty and is
    filled in by passing parser context stacks to add(), which dispatches each
    tag to the matching add_<tag name> method.
    """

    def __init__(self):
        self.id = None
        self.upc = None
        self.type = None
        self.title = None
        self.release_year = None
        self.updated = None
        self.upcs = []
        self.actors = []
        self.genres = []
        self.formats = []
        self.directors = []
        self.webpages = []
        # Can this be watched online?
        self._is_instant = False

    def add(self, path):
        """Add an arbitrary XML tag to this title.

        `path` is the stack of tags currently in context, with the tag to add
        on top (path[-1]).  This looks for a method named add_[x], where [x]
        is the name of that tag, and calls it with `path`.  Tags without a
        matching method are silently ignored.
        """
        adder = getattr(self, 'add_' + path[-1].name, None)
        if adder is not None:
            adder(path)

    def is_instant(self):
        """Return True once a format whose term is 'instant' has been added."""
        return self._is_instant

    def add_id(self, path):
        """Record an <id> tag.

        An <id> with a rel attribute is only used when rel marks it as a UPC;
        an <id> without rel is the title's own id, whose second-to-last
        '/'-separated segment is stored as the title type.
        """
        tag = path[-1]
        rel = tag.attrs.get('rel')
        if rel:
            if rel == 'http://schemas.netflix.com/catalog/id.upc':
                self.upc = tag.cdata
            return
        self.id = tag.cdata
        segments = self.id.split('/')
        # An id with fewer than two segments has no type component; leave the
        # type unset instead of failing with IndexError.
        self.type = segments[-2] if len(segments) >= 2 else None

    def add_release_year(self, path):
        """ Ex: <release_year>1998</release_year> """
        self.release_year = path[-1].cdata

    def add_title(self, path):
        """Ex: <title>Forrest Gump</title> """
        self.title = path[-1].cdata

    def add_updated(self, path):
        """Ex: <updated>1233161823</updated> """
        self.updated = int(path[-1].cdata)

    def add_category(self, path):
        """
        <category> tags can appear under <title> tags OR
        under <availability> tags.
        Also, about half of them have a status='deprecated' attribute and
        should be ignored. Fun. Thanks for that, NFLX.

        A category under <availability> is stored as a format (and marks the
        title as instant when its term is 'instant'); any other category is
        stored as a genre.
        """
        tag = path[-1]
        parent = path[-2]
        if tag.attrs.get('status', '') == 'deprecated':
            return
        if parent.name == 'availability':
            available_from = parent.attrs.get('available_from', 0)
            fmt = FORMAT_REGISTRY.new(tag, available_from)
            self.formats.append(fmt)
            if fmt.term == 'instant':
                self._is_instant = True
        else:
            genre = GENRE_REGISTRY.new(tag)
            self.genres.append(genre)

    def add_link(self, path):
        """ Adds a link to this title.
        A link represents an actor, director, or webpage
        Ex: <link href="[uri]" rel="../person.actor" title="Tom Hanks" />

        Links of any other kind are ignored.
        """
        link = LinkFactory.get_link(path[-1])
        if isinstance(link, Person):
            if link.type == 'director':
                self.directors.append(link)
            elif link.type == 'actor':
                self.actors.append(link)
        elif isinstance(link, WebPage):
            self.webpages.append(link)
class Tag(object):
    """ Represents an XML tag in the NFLX index.

    name  -- the element name
    attrs -- the element's attributes
    cdata -- character data collected for the element (starts empty)
    """

    def __init__(self, name, attrs):
        self.name = name
        self.attrs = attrs
        self.cdata = ''

    def add_cdata(self, cdata):
        """Append a chunk of character data to this tag.

        TitleParser.char_data_handler calls this method; text may arrive in
        several chunks, so each chunk is appended rather than assigned.
        """
        self.cdata += cdata
class TitleParser(Producer):
    """ Parses the NFLX XML title index into an internal object structure.

    Internally, this uses an expat parser to efficiently parse the (huge) index
    file. While parsing the index file, this object keeps a stack of all tags
    currently in context, with the direct parent at the top of the stack, and
    the root element of the entire file at the bottom.

    For example, an XML snippet and its corresponding stack looks like this:

    Stack:
        [external_ids]          <-- Push/Pop operations happen here.
        [title]
        [title_index_item]
        [catalog_title_index]

    XML:
        <catalog_title_index>
          <title_index_item>
            <title>
              <external_ids>
                <!-- Parser is here -->
              </external_ids>
            </title>
          </title_index_item>
        </catalog_title_index>

    When <title_index_item> elements are encountered, a Title object is
    created, and nested tags are passed to this Title object along with the
    parser context stack.  Each finished Title is sent to every object in
    self.machines.
    """

    def __init__(self, index_provider=None, max_titles=None):
        """Create a parser reading from `index_provider`.

        index_provider -- object providing read(), size() and close()
        max_titles     -- stop producing after this many titles; None or 0
                          means no limit

        Raises Exception when no index provider is given.
        """
        if not index_provider:
            raise Exception("You MUST provide an IndexProvider argument to this "
                            "constructor (for now)")
        self.file = index_provider

        # Set up internal expat parser
        self.parser = xml.parsers.expat.ParserCreate()
        self.parser.StartElementHandler = self.start_element_handler
        self.parser.EndElementHandler = self.end_element_handler
        self.parser.CharacterDataHandler = self.char_data_handler

        # Set initial state
        self.ranges = []
        self.title = None
        # Must exist before parsing starts: the end-element handler reads it
        # even when no <title_index_item> has been opened yet.
        self.current_title = None
        self.current_tag = None
        self.tag_context = []
        self.percent_done = -1
        self.num_titles = 0
        self.max_titles = max_titles or 0
        self.machines = []
        # Index size in bytes, fetched once on first use (see _report_progress).
        self._index_size = None

    def run(self):
        """
        Fires up the expat parser and begins parsing the NFLX index.
        Overrides Producer.run()

        self.complete() is always called at the end, whether parsing finished,
        was cut short by max_titles, or failed.
        """
        try:
            self.parser.ParseFile(self.file)
            print('\rParsing XML index (100%)')
        except ValueError:
            # HACK: Using an exception caused by performing IO on a closed
            # file to stop generating titles after self.max_titles titles have
            # been produced.  A ValueError raised before the limit was reached
            # is a genuine error and must not be reported as a clean stop.
            if not self._limit_reached():
                raise
            print("\rTERMINATING: max_titles reached (%d titles)" % self.num_titles)
        finally:
            self.complete()

    def _limit_reached(self):
        """Return True once max_titles titles were produced (0 means no limit)."""
        return bool(self.max_titles) and self.num_titles >= self.max_titles

    def _report_progress(self):
        """Rewrite the progress line on stdout when the whole percentage changes."""
        if self._index_size is None:
            # Asked once only: this runs for every closing tag, and a provider
            # may compute its size with a system call each time.
            self._index_size = self.file.size()
        if not self._index_size:
            # Unknown or empty index: no percentage can be computed.
            return
        # Floor division keeps the percentage an integer on every Python
        # version, so the line is rewritten at most once per percent.
        percent_done = (self.parser.CurrentByteIndex * 100) // self._index_size
        if percent_done != self.percent_done:
            sys.stdout.write("\rParsing XML index (%d%%)" % percent_done)
            sys.stdout.flush()
            self.percent_done = percent_done

    def start_element_handler(self, name, attrs):
        """ Called when expat starts processing a tag."""
        self.tag_context.append(Tag(name, attrs))
        # The parser may still hold buffered input after the index was closed;
        # titles found there are beyond max_titles and are not started.
        if name == 'title_index_item' and not self._limit_reached():
            self.current_title = Title()

    def end_element_handler(self, name):
        """ Called when expat finishes processing a tag."""
        # This takes a while; update user on our progress.
        self._report_progress()

        if name == 'title_index_item':
            # We just finished parsing a title
            if self.current_title is not None:
                for machine in self.machines:
                    machine.send(self.current_title)
                # Housekeeping...
                self.current_title = None
                self.num_titles += 1
                if self._limit_reached():
                    # Closing the index makes the parser's next read fail,
                    # which run() treats as the stop signal.
                    self.file.close()
        elif self.current_title is not None:
            # If we finished parsing any other tag, try to add it to the
            # <title> in context
            self.current_title.add(self.tag_context)
        self.tag_context.pop()

    def char_data_handler(self, cdata):
        """ Called by expat with a chunk of character data for the open tag."""
        if self.tag_context:
            # Text can arrive in several chunks, so append to what the tag on
            # top of the context stack has collected so far.
            self.tag_context[-1].cdata += cdata
# Registries
# Module-level tables used to share one object per distinct key value.  The
# first argument names the attribute used as the key, the second is the class
# used to build candidate objects.
# NOTE(review): formats are keyed by 'term' only, so two formats with the same
# term but different available_from values resolve to whichever was registered
# first -- confirm this is intended.
GENRE_REGISTRY = Registry('term', Genre)
FORMAT_REGISTRY = Registry('term', Format)
PERSON_REGISTRY = Registry('title', Person)