# imdbparser.py
# -*- coding: cp1252 -*-
import re
import logging
import urllib
import urllib2
import os
import sys
import shutil
from htmlentitydefs import name2codepoint as n2cp
from string import Template


# IMDb top-level domain to query ("com", "nl", ...). It is also read once,
# while the IMDbParser class body executes, to pick the scraping patterns.
imdb_ext = "com" # com, nl, etc.
# Naming scheme applied to the renamed video files and to the movie folder.
file_template = Template("$title ($year)")
# Naming scheme for a moved file whose plain name is already taken in the root folder.
duplicate_file = Template("$number $filename")


class IMDbPerson:
    """A person credited on an IMDb page: a display name and a profile URL."""

    def __init__(self, name, url):
        self.name, self.url = name, url

    def __str__(self):
        # Rendered as "name<TAB>url".
        return '\t'.join((self.name, self.url))

class FileHandler:
    """Flatten a movie release folder and rename it using IMDb data.

    Typical use: construct with the release folder, then call parseNFO(),
    parseImdb() and renameAvi() in that order.

    Attributes:
        rootdir:   absolute path of the release folder.
        numofdirs: directories counted by exploretree() (decremented by
                   deletedirs() for every directory it removes).
        numofiles: files counted by exploretree().
        movieID:   IMDb id digits, set by a successful parseNFO().
        movie:     IMDbParser instance, set by parseImdb().

    All methods work on explicit paths below ``rootdir``; none of them
    depends on the process working directory.
    """

    # IMDb title link such as "www.imdb.com/title/tt0123456" (any TLD);
    # group 1 holds the digits of the id.
    _imdbLinkPattern = re.compile(r'imdb\.[a-z.]+/.*/tt([0-9]+)', re.IGNORECASE)
    # Extensions (lower case, without dot) that renameAvi() treats as video.
    _videoExtensions = ("avi", "mkv", "wmv")

    def __init__(self,rootFolder):
        """Enter rootFolder and remember its absolute path.

        The working directory is changed to rootFolder.
        """
        os.chdir(rootFolder)
        # getcwd() yields an absolute path even when rootFolder was relative,
        # so every later os.path.join(self.rootdir, ...) stays valid.
        self.rootdir = os.getcwd()
        self.numofdirs = 0
        self.numofiles = 0

    def _extension(self, name):
        """Return the lower-cased extension of name without the dot ('' if none)."""
        stem, dot, ext = name.rpartition('.')
        if not dot:
            return ''
        return ext.lower()

    def printcount(self):
        """Print the directory and file counters."""
        print('-' * 40)
        print("Number of Directories: " + str(self.numofdirs))
        print("Number of Files: " + str(self.numofiles))
        print('-' * 40)

    def exploretree(self):
        """Print every entry below rootdir and add it to the counters."""
        self._explore(self.rootdir)

    def _explore(self, path):
        """Depth-first listing of path; helper for exploretree()."""
        for name in os.listdir(path):
            full = os.path.join(path, name)
            if os.path.isdir(full):
                print('Dir : ' + name)
                self.numofdirs += 1
                # Symlinked directories are counted but not entered, which
                # avoids endless recursion on link cycles.
                if not os.path.islink(full):
                    self._explore(full)
            else:
                print('File: ' + name)
                self.numofiles += 1

    def movefiles(self):
        """Move every regular file found in a subdirectory up into rootdir.

        A file whose name is already taken in rootdir is stored under the
        name returned by fileExsists(). Symlinked directories are not entered.
        """
        for dirpath, dirnames, filenames in os.walk(self.rootdir):
            if dirpath == self.rootdir:
                # Files that already live in the root stay where they are.
                continue
            for name in filenames:
                source = os.path.join(dirpath, name)
                if not os.path.isfile(source):
                    continue
                target = name
                if os.path.exists(os.path.join(self.rootdir, target)):
                    target = self.fileExsists(name)
                shutil.move(source, os.path.join(self.rootdir, target))

    def fileExsists(self,files,index=1):
        """Return the first "<number> <files>" name that is free in rootdir.

        Numbering starts at index and counts upwards. (The misspelled method
        name is kept because it is part of the class interface.)
        """
        while True:
            candidate = duplicate_file.substitute(number=index,filename=files)
            if not os.path.exists(os.path.join(self.rootdir,candidate)):
                return candidate
            index += 1

    def deletedirs(self):
        """Remove every empty directory below rootdir, deepest first.

        numofdirs is decremented once per removed directory. Directories that
        cannot be removed (typically because they are not empty) are left alone.
        """
        for dirpath, dirnames, filenames in os.walk(self.rootdir, topdown=False):
            if dirpath == self.rootdir:
                continue
            try:
                os.rmdir(dirpath)
            except OSError:
                # Best effort: not empty or not removable.
                continue
            self.numofdirs -= 1

    def parseNFO(self):
        """Extract the IMDb id from the .nfo file located by findNFO().

        Returns the id digits as a string and stores them in self.movieID.
        Returns None (leaving self.movieID untouched) when there is no .nfo
        file or it contains no IMDb title link.
        """
        nfoPath = self.findNFO()
        if nfoPath is None:
            logging.warning('No .nfo file found in %s', self.rootdir)
            return None

        fobj = open(nfoPath,"r")
        try:
            for line in fobj:
                link = self._imdbLinkPattern.search(line)
                if link is not None:
                    self.movieID = link.group(1)
                    return self.movieID
        finally:
            # Runs on the early return as well, so the handle never leaks.
            fobj.close()

        logging.warning('No IMDb link found in %s', nfoPath)
        return None

    def findNFO(self):
        """Flatten the tree with movefiles() and return the path of an .nfo file.

        When several exist, the alphabetically first one is returned.
        Returns None when rootdir holds no .nfo file.
        """
        self.movefiles()
        for name in sorted(os.listdir(self.rootdir)):
            path = os.path.join(self.rootdir,name)
            if os.path.isfile(path) and self._extension(name) == "nfo":
                return path
        return None

    def parseImdb(self):
        """Create the IMDbParser for self.movieID and store it in self.movie.

        Raises AttributeError when parseNFO() has not produced an id.
        """
        movieID = getattr(self, 'movieID', None)
        if not movieID:
            raise AttributeError('movieID is not set; parseNFO() must find an IMDb link first')
        self.movie = IMDbParser("www", imdb_ext, movieID)

    def renameAvi(self):
        """Rename the video files in rootdir, then rootdir itself.

        Files get the file_template name ("title (year)"); when there are
        several they are numbered " CD1", " CD2", ... in alphabetical order of
        their current names. Files with "sample" in their name are skipped.
        Afterwards the working directory is the parent of rootdir, and rootdir
        is renamed to the template name unless that name is already taken.
        Nothing is renamed when the template name contains a path separator.
        """
        newName = file_template.substitute(title=self.movie.title,year=self.movie.year)
        if '/' in newName or os.sep in newName:
            # E.g. the parser's 'N/A' placeholder: joining it would address
            # another directory instead of naming a file.
            logging.warning('Not renaming: %r is not usable as a file name', newName)
            return

        videos = []
        # Sorted so that CD numbers follow the existing file names instead of
        # the arbitrary order returned by listdir().
        for name in sorted(os.listdir(self.rootdir)):
            if not os.path.isfile(os.path.join(self.rootdir,name)):
                continue
            if self._extension(name) not in self._videoExtensions:
                continue
            if 'sample' in name.lower():
                continue
            videos.append(name)

        if not videos:
            logging.warning('No video file found in %s', self.rootdir)

        multipart = len(videos) > 1
        for position, name in enumerate(videos):
            suffix = ""
            if multipart:
                suffix = " CD" + str(position + 1)
            target = newName + suffix + "." + self._extension(name)
            shutil.move(os.path.join(self.rootdir,name), os.path.join(self.rootdir,target))

        # Leave the folder before renaming it.
        os.chdir(os.path.join(self.rootdir, os.pardir))
        destination = os.path.join(os.getcwd(), newName)
        if destination == self.rootdir:
            return
        if os.path.exists(destination):
            # shutil.move() would nest rootdir inside an existing directory.
            logging.warning('Not renaming folder: %s already exists', destination)
            return
        shutil.move(self.rootdir, destination)
        self.rootdir = destination

class IMDbParser:
    global ext
    if imdb_ext == "com":
        tvSeriesPattern = re.compile('<span>.*TV Series.*</span>')
        titlePattern = re.compile('<h1 class="header">(?P<title>[^<]+)\s+<span>')
        datePattern = re.compile('<h4 class="inline">Release Date:</h4>\s*[0-9]{1,2}\s*[a-zA-Z]+\s*(?P<year>[0-9]{4})\s*\([a-zA-Z]+\)')
        coverPattern = re.compile('<td rowspan="2" id="img_primary">\s+<a[^>]*><img src="(?P<coverURL>http://[^"]+)"[^>]+></a>')
        ratingPattern = re.compile('<span class="rating-rating">(?P<rating>["\d\.]+)<span>')
        directorsPattern = re.compile('<div[^>]*id="director-info"[^>]*>\s+<h5>(?P<label>[^<]*)</h5>\s+<div class="info-content">\s+(?P<directors>(<a[^>]+>([^<]+)</a>[^<]*<br/>\s+)+)</div>\s+</div>')
        directorsSubpattern = re.compile('<a[^>]+href="(?P<url>[^"]+)"[^>]*>(?P<name>[^<]+)</a>')
        creatorsPattern = re.compile('<div class="info">\s+<h5>(?P<label>[^<]*)</h5>\s+<div class="info-content">\s+(?P<creators>(<a[^>]+>([^<]+)</a>[^<]*<br/>\s+)+)')
        creatorsSubpattern = re.compile('<a href="(?P<url>[^"]+)" onclick="[^"]*">(?P<name>[^<]+)</a>')
        #actorsLabelPattern = re.compile('<div class="headerinline"><h3>(?P<label>[^>]+)</h3>')
        #actorPattern = re.compile('<td class="nm"><a[^>]*href="(?P<url>[^"]+)"[^>]*>(?P<name>[^<]+)</a></td>')
        genrePattern = re.compile('<a.*href="/genre/.*>(?P<genre>[A-Za-z]*)</a>')
    else:
        tvSeriesPattern = re.compile('<span class="tv-extra">[^<]*TV[^<]*</span>')
        titlePattern = re.compile('<div id="tn15title">\s+<h1>(?P<title>[^<]+)\s+<span>\((<a[^>]+>)?(?P<year>\d+)(</a>)?[^)]*\)')
        datePattern = re.compile('<div id="tn15title">\s+<h1>[^<]+\s+<span>\((<a[^>]+>)?(?P<year>\d+)(</a>)?[^)]*\)')
        coverPattern = re.compile('<div class="photo">\s+<a[^>]+><img[^>]+src="(?P<coverURL>http://[^"]+)"[^>]+></a>')
        ratingPattern = re.compile('<div class="starbar-meta">\s+<b>(?P<rating>[\d\.,]+/10)</b>')
        directorsPattern = re.compile('<div[^>]*id="director-info"[^>]*>\s+<h5>(?P<label>[^<]*)</h5>\s+<div class="info-content">\s+(?P<directors>(<a[^>]+>([^<]+)</a>[^<]*<br/>\s+)+)</div>\s+</div>')
        directorsSubpattern = re.compile('<a[^>]+href="(?P<url>[^"]+)"[^>]*>(?P<name>[^<]+)</a>')
        creatorsPattern = re.compile('<div class="info">\s+<h5>(?P<label>[^<]*)</h5>\s+<div class="info-content">\s+(?P<creators>(<a[^>]+>([^<]+)</a>[^<]*<br/>\s+)+)')
        creatorsSubpattern = re.compile('<a href="(?P<url>[^"]+)" onclick="[^"]*">(?P<name>[^<]+)</a>')
        #actorsLabelPattern = re.compile('<div class="headerinline"><h3>(?P<label>[^>]+)</h3>')
        #actorPattern = re.compile('<td class="nm"><a[^>]*href="(?P<url>[^"]+)"[^>]*>(?P<name>[^<]+)</a></td>')
        genrePattern = re.compile('<a.*href="/genre/.*>(?P<genre>[A-Za-z]*)</a>')

    def __init__(self, subdomain, extension, movieID):
        """ Parses data from a movie/tv show on IMDb """
        self.subdomain = subdomain
        self.extension = extension
        self.movieID = movieID

        self.url = 'http://%s.imdb.%s/title/tt%s' % (subdomain, extension, movieID)

        # default values in case we can't parse the page
        self.title, self.year, self.coverURL, self.rating = 'N/A', 'N/A', 'http://img407.imageshack.us/img407/6493/titleaddposterw.jpg', 'N/A'
        self.ratingInt = 0
        self.creators, self.directors, self.actors = [], [], []
        self.creatorsLabel, self.directorsLabel, self.actorsLabel = 'Creator(s)', 'Director(s)', 'Actors'

        self.parseData()

    def substitute_entity(self,match):
        ent = match.group(3)
        if match.group(1) == "#":
            if match.group(2) == '':
                return unichr(int(ent))
            elif match.group(2) == 'x':
                return unichr(int('0x'+ent, 16))
        else:
            cp = n2cp.get(ent)
            if cp:
                return unichr(cp)
            else:
                return match.group()

    def decode_htmlentities(self,string):
        entity_re = re.compile(r'&(#?)(x?)(\d{1,5}|\w{1,8});')
        clean_pat = re.compile('[^a-zA-Z0-9äüä ]')
        string = entity_re.subn(self.substitute_entity, string)[0]
        return clean_pat.subn("",string)[0]

    def parseData(self):
        html = ''
        nbAttemptsMax = 5
        nbAttempts = 0
        found = False

        # try to download the page several times
        #while not found and nbAttempts < nbAttemptsMax:
            # sometimes urlopen raises an exception, so retry instead of fail

        headers = {'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.1) Gecko/20100122 firefox/3.6.1'}
        url = 'http://www.font.com/cgi-bin/register.cgi'
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        values = {'name' : 'Michael Foord',
          'location' : 'Northampton',
          'language' : 'Python' }
        headers = { 'User-Agent' : user_agent }

        data = urllib.urlencode(values)
        req = urllib2.Request(self.url, data, headers)
        response = urllib2.urlopen(req)
        the_page = response.read()
        #print the_page
        html = the_page
        found = (self.titlePattern.search(html) is not None)

        nbAttempts = nbAttempts + 1

        if not found:

            logging.warn('Unable to get html content after %s attempts...' % nbAttemptsMax)
            return

        self.isTvSerie = self.tvSeriesPattern.search(html) is not None
        logging.debug('Is this a TV show ? ' + str(self.isTvSerie))

        # parse title and year
        m = self.titlePattern.search(html)
        self.title = m.group('title')

        m = self.datePattern.search(html)
        self.year = m.group('year')
        self.title = self.decode_htmlentities(self.title)
        # parse cover URL
        m = self.coverPattern.search(html)
        self.coverURL = m.group('coverURL')


        # parse rating
        m = self.ratingPattern.search(html)
        if m is None:
            self.rating = 'N/A'
            self.ratingInt = 0
        else:
            self.rating = m.group('rating')
            self.ratingInt = int(self.rating.replace('/10', '').replace('.', '').replace(',', ''))

        if self.isTvSerie:
            # parse creators
            creators = self.creatorsPattern.search(html)

            if creators is None:
                self.creators.append(IMDbPerson('N/A', ''))
            else:
                self.creatorsLabel = creators.group('label')
                creators = creators.group('creators')
                for c in self.creatorsSubpattern.finditer(creators):
                    self.creators.append(IMDbPerson(c.group('name'), c.group('url')))
        else:
            # parse directors
            directors = self.directorsPattern.search(html)

            if directors is None:
                self.directors.append(IMDbPerson('N/A', ''))
            else:
                self.directorsLabel = directors.group('label')
                directors = directors.group('directors')
                for d in self.directorsSubpattern.finditer(directors):
                    self.directors.append(IMDbPerson(d.group('name'), d.group('url')))

        # parse actors
        #self.actorsLabel = self.actorsLabelPattern.search(html).group('label') + ':'
        #nbActorsMax = 6
        #i = 1
        #for a in self.actorPattern.finditer(html):
        #    if i > nbActorsMax:
        #        break
        #   self.actors.append(IMDbPerson(a.group('name'), a.group('url')))
        #    i = i + 1

    def __str__(self):
        print 'Title:', self.title
        print 'TvSerie:', self.isTvSerie
        print 'Year:', self.year
        print 'URL:', self.url
        print 'Cover URL:', self.coverURL
        print 'Rating:', self.rating
        #if self.isTvSerie:
        #    print self.creatorsLabel
        #    for c in self.creators:
        #        print '', c
        #else:
        #    print self.directorsLabel
        #    for d in self.directors:
        #        print '', d
        #print self.actorsLabel
        #for a in self.actors:
        #    print '', a
        return ''


# Command-line entry: the only argument is the movie folder to process.
# sys.exit() is used instead of the bare exit(), which only exists when the
# site module has been loaded.
if len(sys.argv) < 2:
    sys.exit('Need Path as Argument')
if not os.path.isdir(sys.argv[1]):
    sys.exit("Error : Path " + sys.argv[1] + " doesnt exsist")


# Read the IMDb id from the .nfo file, fetch the movie data, then rename the
# video files and the folder.
fileh = FileHandler(sys.argv[1])
fileh.parseNFO()
fileh.parseImdb()
fileh.renameAvi()