Advertisement
Guest User

imdbparser.py

a guest
Jul 15th, 2011
412
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 12.93 KB | None | 0 0
  1. # -*- coding: cp1252 -*-
  2. import re
  3. import logging
  4. import urllib
  5. import urllib2
  6. import os
  7. import sys
  8. import shutil
  9. from htmlentitydefs import name2codepoint as n2cp
  10. from string import Template
  11.  
  12.  
# Top-level domain of the IMDb site to query (e.g. "com", "nl").  It is used
# to build the title URL and, at class-definition time, to pick which set of
# page-scraping patterns IMDbParser compiles.
imdb_ext = "com" #com,nl etc.
# Naming scheme applied to the renamed video files and to the movie folder.
file_template = Template("$title ($year)")
# Naming scheme for a file moved into the root folder when its own name is
# already taken there; $number is a counter starting at 1.
duplicate_file = Template("$number $filename")
  16.    
  17.  
  18. class IMDbPerson:
  19.     def __init__(self, name, url):
  20.         self.url = url
  21.         self.name = name
  22.    
  23.     def __str__(self):
  24.         return self.name + '\t' + self.url
  25.  
  26. class FileHandler:
  27.  
  28.     def __init__(self,rootFolder):
  29.         os.chdir(rootFolder)
  30.         self.rootdir = os.path.normpath(rootFolder)
  31.         self.numofdirs = 0
  32.         self.numofiles = 0
  33.  
  34.     def printcount(self):
  35.         print '-' * 40
  36.         print "Number of Directories: " + str(self.numofdirs)
  37.         print "Number of Files: " + str(self.numofiles)
  38.         print '-' * 40
  39.  
  40.     def exploretree(self):
  41.         for x in os.listdir(os.getcwd()):
  42.             if os.path.isdir(x):
  43.                 print 'Dir : ' + x
  44.                 self.numofdirs += 1
  45.                 os.chdir(x)
  46.                 self.exploretree()
  47.                 os.chdir('..')
  48.             else:
  49.                 print 'File: ' + x
  50.                 self.numofiles += 1
  51.  
  52.     def movefiles(self):
  53.        
  54.         for x in os.listdir(os.getcwd()):
  55.             if os.path.isdir(x):
  56.                 os.chdir(x)
  57.                 self.movefiles()
  58.                 os.chdir('..')
  59.             if os.path.isfile(x) and os.getcwd() != self.rootdir:
  60.                 if not os.path.isfile(os.path.join(self.rootdir,x)):
  61.                     shutil.move(x, os.path.join(self.rootdir,x))
  62.                 else:
  63.                     shutil.move(x, os.path.join(self.rootdir,self.fileExsists(x)))
  64.                
  65.     def fileExsists(self,files,index=1):
  66.         if not os.path.isfile(os.path.join(self.rootdir,duplicate_file.substitute(number=index,filename=files))):
  67.             return duplicate_file.substitute(number=index,filename=files)
  68.         else:
  69.             return self.fileExsists(files,index+1)
  70.    
  71.     def deletedirs(self):
  72.         for x in os.listdir(os.getcwd()):
  73.             if os.path.isdir(x):
  74.                 os.chdir(x)
  75.                 self.deletedirs()
  76.                 os.chdir('..')
  77.             try:
  78.                 os.rmdir(x)
  79.                 #print 'Removed Directory: ' + x
  80.                 self.numofdirs -= 1
  81.             except:
  82.                 pass
  83.                 #print "Couldn't Delete Directory: " + x
  84.  
  85.     def parseNFO(self):
  86.         fobj = open(self.findNFO(),"r")
  87.         p = re.compile('(.*imdb.[com,de]+/.*/tt)([0-9]*)', re.IGNORECASE)
  88.         for line in fobj:
  89.             if line.find("imdb.") > 0:
  90.                 link = p.search(line)
  91.                 movieID = link.group(2)
  92.                 self.movieID = movieID
  93.                 return movieID
  94.         fobj.close()
  95.    
  96.     def findNFO(self):
  97.         self.movefiles()
  98.         filesList = os.listdir(self.rootdir)
  99.         for file in filesList:
  100.             if os.path.isfile(os.path.join(self.rootdir,file)) and file.rpartition('.')[-1].lower()=="nfo":
  101.                 return os.path.join(self.rootdir,file)
  102.  
  103.     def parseImdb(self):
  104.         global imdb_ext
  105.         self.movie = IMDbParser("www", imdb_ext, self.movieID )
  106.  
  107.     def renameAvi(self):
  108.         global file_template
  109.         extList = ["avi", "mkv", "wmv"]
  110.         aviList = []
  111.         p = re.compile('.*sample.*', re.IGNORECASE)
  112.         filesList = os.listdir(self.rootdir)
  113.         for file in filesList:
  114.             if os.path.isfile(os.path.join(self.rootdir,file)) and file.rpartition('.')[-1].lower() in extList and p.search(file) is None:
  115.                 aviList.append(file)
  116.  
  117.         if len(aviList) > 1:
  118.             for file in aviList:
  119.                 fileExt = file.rpartition('.')[-1].lower()
  120.                 shutil.move(file, file_template.substitute(title=self.movie.title,year=self.movie.year)+" CD"+str(aviList.index(file)+1)+"."+fileExt)
  121.         else:
  122.             fileExt = aviList[0].rpartition('.')[-1].lower()
  123.             shutil.move(aviList[0], file_template.substitute(title=self.movie.title,year=self.movie.year)+"."+fileExt)
  124.         os.chdir(self.rootdir)
  125.         os.chdir("..")
  126.         os.getcwd()
  127.         shutil.move(self.rootdir,os.path.join(os.getcwd(),file_template.substitute(title=self.movie.title,year=self.movie.year)))
  128.            
  129. class IMDbParser:
  130.     global ext
  131.     if imdb_ext == "com":
  132.         tvSeriesPattern = re.compile('<span>.*TV Series.*</span>')
  133.         titlePattern = re.compile('<h1 class="header">(?P<title>[^<]+)\s+<span>')
  134.         datePattern = re.compile('<h4 class="inline">Release Date:</h4>\s*[0-9]{1,2}\s*[a-zA-Z]+\s*(?P<year>[0-9]{4})\s*\([a-zA-Z]+\)')
  135.         coverPattern = re.compile('<td rowspan="2" id="img_primary">\s+<a[^>]*><img src="(?P<coverURL>http://[^"]+)"[^>]+></a>')
  136.         ratingPattern = re.compile('<span class="rating-rating">(?P<rating>["\d\.]+)<span>')
  137.         directorsPattern = re.compile('<div[^>]*id="director-info"[^>]*>\s+<h5>(?P<label>[^<]*)</h5>\s+<div class="info-content">\s+(?P<directors>(<a[^>]+>([^<]+)</a>[^<]*<br/>\s+)+)</div>\s+</div>')
  138.         directorsSubpattern = re.compile('<a[^>]+href="(?P<url>[^"]+)"[^>]*>(?P<name>[^<]+)</a>')
  139.         creatorsPattern = re.compile('<div class="info">\s+<h5>(?P<label>[^<]*)</h5>\s+<div class="info-content">\s+(?P<creators>(<a[^>]+>([^<]+)</a>[^<]*<br/>\s+)+)')
  140.         creatorsSubpattern = re.compile('<a href="(?P<url>[^"]+)" onclick="[^"]*">(?P<name>[^<]+)</a>')
  141.         #actorsLabelPattern = re.compile('<div class="headerinline"><h3>(?P<label>[^>]+)</h3>')
  142.         #actorPattern = re.compile('<td class="nm"><a[^>]*href="(?P<url>[^"]+)"[^>]*>(?P<name>[^<]+)</a></td>')
  143.         genrePattern = re.compile('<a.*href="/genre/.*>(?P<genre>[A-Za-z]*)</a>')
  144.     else:
  145.         tvSeriesPattern = re.compile('<span class="tv-extra">[^<]*TV[^<]*</span>')
  146.         titlePattern = re.compile('<div id="tn15title">\s+<h1>(?P<title>[^<]+)\s+<span>\((<a[^>]+>)?(?P<year>\d+)(</a>)?[^)]*\)')
  147.         datePattern = re.compile('<div id="tn15title">\s+<h1>[^<]+\s+<span>\((<a[^>]+>)?(?P<year>\d+)(</a>)?[^)]*\)')
  148.         coverPattern = re.compile('<div class="photo">\s+<a[^>]+><img[^>]+src="(?P<coverURL>http://[^"]+)"[^>]+></a>')
  149.         ratingPattern = re.compile('<div class="starbar-meta">\s+<b>(?P<rating>[\d\.,]+/10)</b>')
  150.         directorsPattern = re.compile('<div[^>]*id="director-info"[^>]*>\s+<h5>(?P<label>[^<]*)</h5>\s+<div class="info-content">\s+(?P<directors>(<a[^>]+>([^<]+)</a>[^<]*<br/>\s+)+)</div>\s+</div>')
  151.         directorsSubpattern = re.compile('<a[^>]+href="(?P<url>[^"]+)"[^>]*>(?P<name>[^<]+)</a>')
  152.         creatorsPattern = re.compile('<div class="info">\s+<h5>(?P<label>[^<]*)</h5>\s+<div class="info-content">\s+(?P<creators>(<a[^>]+>([^<]+)</a>[^<]*<br/>\s+)+)')
  153.         creatorsSubpattern = re.compile('<a href="(?P<url>[^"]+)" onclick="[^"]*">(?P<name>[^<]+)</a>')
  154.         #actorsLabelPattern = re.compile('<div class="headerinline"><h3>(?P<label>[^>]+)</h3>')
  155.         #actorPattern = re.compile('<td class="nm"><a[^>]*href="(?P<url>[^"]+)"[^>]*>(?P<name>[^<]+)</a></td>')
  156.         genrePattern = re.compile('<a.*href="/genre/.*>(?P<genre>[A-Za-z]*)</a>')
  157.  
  158.     def __init__(self, subdomain, extension, movieID):
  159.         """ Parses data from a movie/tv show on IMDb """
  160.         self.subdomain = subdomain
  161.         self.extension = extension
  162.         self.movieID = movieID
  163.        
  164.         self.url = 'http://%s.imdb.%s/title/tt%s' % (subdomain, extension, movieID)
  165.        
  166.         # default values in case we can't parse the page
  167.         self.title, self.year, self.coverURL, self.rating = 'N/A', 'N/A', 'http://img407.imageshack.us/img407/6493/titleaddposterw.jpg', 'N/A'
  168.         self.ratingInt = 0
  169.         self.creators, self.directors, self.actors = [], [], []
  170.         self.creatorsLabel, self.directorsLabel, self.actorsLabel = 'Creator(s)', 'Director(s)', 'Actors'
  171.        
  172.         self.parseData()
  173.  
  174.     def substitute_entity(self,match):
  175.         ent = match.group(3)
  176.         if match.group(1) == "#":
  177.             if match.group(2) == '':
  178.                 return unichr(int(ent))
  179.             elif match.group(2) == 'x':
  180.                 return unichr(int('0x'+ent, 16))
  181.         else:
  182.             cp = n2cp.get(ent)
  183.             if cp:
  184.                 return unichr(cp)
  185.             else:
  186.                 return match.group()
  187.  
  188.     def decode_htmlentities(self,string):
  189.         entity_re = re.compile(r'&(#?)(x?)(\d{1,5}|\w{1,8});')
  190.         clean_pat = re.compile('[^a-zA-Z0-9äüä ]')
  191.         string = entity_re.subn(self.substitute_entity, string)[0]
  192.         return clean_pat.subn("",string)[0]
  193.  
  194.     def parseData(self):
  195.         html = ''
  196.         nbAttemptsMax = 5
  197.         nbAttempts = 0
  198.         found = False
  199.        
  200.         # try to download the page several times
  201.         #while not found and nbAttempts < nbAttemptsMax:
  202.             # sometimes urlopen raises an exception, so retry instead of fail
  203.            
  204.         headers = {'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.1) Gecko/20100122 firefox/3.6.1'}
  205.         url = 'http://www.font.com/cgi-bin/register.cgi'
  206.         user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
  207.         values = {'name' : 'Michael Foord',
  208.           'location' : 'Northampton',
  209.           'language' : 'Python' }
  210.         headers = { 'User-Agent' : user_agent }
  211.        
  212.         data = urllib.urlencode(values)
  213.         req = urllib2.Request(self.url, data, headers)
  214.         response = urllib2.urlopen(req)
  215.         the_page = response.read()
  216.         #print the_page                
  217.         html = the_page
  218.         found = (self.titlePattern.search(html) is not None)
  219.                          
  220.         nbAttempts = nbAttempts + 1
  221.        
  222.         if not found:
  223.  
  224.             logging.warn('Unable to get html content after %s attempts...' % nbAttemptsMax)
  225.             return
  226.  
  227.         self.isTvSerie = self.tvSeriesPattern.search(html) is not None
  228.         logging.debug('Is this a TV show ? ' + str(self.isTvSerie))
  229.  
  230.         # parse title and year
  231.         m = self.titlePattern.search(html)
  232.         self.title = m.group('title')
  233.        
  234.         m = self.datePattern.search(html)
  235.         self.year = m.group('year')
  236.         self.title = self.decode_htmlentities(self.title)
  237.         # parse cover URL
  238.         m = self.coverPattern.search(html)
  239.         self.coverURL = m.group('coverURL')
  240.  
  241.        
  242.        
  243.         # parse rating
  244.         m = self.ratingPattern.search(html)
  245.         if m is None:
  246.             self.rating = 'N/A'
  247.             self.ratingInt = 0
  248.         else:
  249.             self.rating = m.group('rating')
  250.             self.ratingInt = int(self.rating.replace('/10', '').replace('.', '').replace(',', ''))
  251.  
  252.         if self.isTvSerie:
  253.             # parse creators
  254.             creators = self.creatorsPattern.search(html)
  255.  
  256.             if creators is None:
  257.                 self.creators.append(IMDbPerson('N/A', ''))
  258.             else:
  259.                 self.creatorsLabel = creators.group('label')
  260.                 creators = creators.group('creators')
  261.                 for c in self.creatorsSubpattern.finditer(creators):
  262.                     self.creators.append(IMDbPerson(c.group('name'), c.group('url')))
  263.         else:
  264.             # parse directors
  265.             directors = self.directorsPattern.search(html)
  266.  
  267.             if directors is None:
  268.                 self.directors.append(IMDbPerson('N/A', ''))
  269.             else:
  270.                 self.directorsLabel = directors.group('label')
  271.                 directors = directors.group('directors')
  272.                 for d in self.directorsSubpattern.finditer(directors):
  273.                     self.directors.append(IMDbPerson(d.group('name'), d.group('url')))
  274.  
  275.         # parse actors
  276.         #self.actorsLabel = self.actorsLabelPattern.search(html).group('label') + ':'
  277.         #nbActorsMax = 6
  278.         #i = 1
  279.         #for a in self.actorPattern.finditer(html):
  280.         #    if i > nbActorsMax:
  281.         #        break
  282.         #   self.actors.append(IMDbPerson(a.group('name'), a.group('url')))
  283.         #    i = i + 1
  284.  
  285.     def __str__(self):
  286.         print 'Title:', self.title
  287.         print 'TvSerie:', self.isTvSerie
  288.         print 'Year:', self.year
  289.         print 'URL:', self.url
  290.         print 'Cover URL:', self.coverURL
  291.         print 'Rating:', self.rating
  292.         #if self.isTvSerie:
  293.         #    print self.creatorsLabel
  294.         #    for c in self.creators:
  295.         #        print '', c
  296.         #else:
  297.         #    print self.directorsLabel
  298.         #    for d in self.directors:
  299.         #        print '', d
  300.         #print self.actorsLabel
  301.         #for a in self.actors:
  302.         #    print '', a
  303.         return ''
  304.  
  305.  
  306.    
  307. if len(sys.argv) < 2:
  308.     exit('Need Path as Argument')
  309. if not os.path.isdir(sys.argv[1]):
  310.   exit("Error : Path " + sys.argv[1] + " doesnt exsist")
  311.  
  312.  
  313.  
  314. fileh = FileHandler(sys.argv[1])
  315. fileh.parseNFO()
  316. fileh.parseImdb()
  317. fileh.renameAvi()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement