Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: cp1252 -*-
- import re
- import logging
- import urllib
- import urllib2
- import os
- import sys
- import shutil
- from htmlentitydefs import name2codepoint as n2cp
- from string import Template
# Top-level domain of the IMDb site to query ("com", "nl", ...).  It is used
# to build the title URL and to pick the page-layout regexes in IMDbParser.
imdb_ext = "com"

# Base name (without extension) given to the renamed video files and to the
# renamed movie folder.
file_template = Template("$title ($year)")

# Name given to a file that would otherwise collide with one already present
# in the root folder; $number is a running counter starting at 1.
duplicate_file = Template("$number $filename")
class IMDbPerson:
    """A person credited on an IMDb page: a display name plus a link."""

    def __init__(self, name, url):
        """Store the person's *name* and the *url* of their IMDb page."""
        self.name, self.url = name, url

    def __str__(self):
        """Return the name and the url separated by a single tab."""
        return '\t'.join((self.name, self.url))
class FileHandler:
    """Flatten a movie folder and rename its contents from IMDb data.

    Typical use (see the script at the bottom of the file):
    ``parseNFO()`` -> ``parseImdb()`` -> ``renameAvi()``.

    Attributes:
        rootdir:   absolute path of the movie folder.
        numofdirs: directory counter maintained by exploretree/deletedirs.
        numofiles: file counter maintained by exploretree.
        movieID:   IMDb title number (digits only); set by parseNFO().
        movie:     IMDbParser instance; set by parseImdb().
    """

    def __init__(self, rootFolder):
        """Change into *rootFolder* and remember it as the root directory."""
        os.chdir(rootFolder)
        # Store the directory exactly as the OS reports it.  movefiles()
        # compares paths against os.getcwd(), which never equals a relative
        # or symlinked spelling of the same folder.
        self.rootdir = os.getcwd()
        self.numofdirs = 0
        self.numofiles = 0

    def printcount(self):
        """Print the directory and file counters."""
        print('-' * 40)
        print("Number of Directories: " + str(self.numofdirs))
        print("Number of Files: " + str(self.numofiles))
        print('-' * 40)

    def exploretree(self, path=None):
        """Recursively list *path* (default: cwd), counting dirs and files."""
        if path is None:
            path = os.getcwd()
        for name in os.listdir(path):
            full = os.path.join(path, name)
            if os.path.isdir(full):
                print('Dir : ' + name)
                self.numofdirs += 1
                self.exploretree(full)
            else:
                print('File: ' + name)
                self.numofiles += 1

    def movefiles(self, path=None):
        """Move every file found below *path* (default: cwd) into rootdir.

        Files that already sit directly in rootdir are left alone.  When the
        target name is taken, the name produced by fileExsists() is used.
        """
        if path is None:
            path = os.getcwd()
        for name in os.listdir(path):
            full = os.path.join(path, name)
            if os.path.isdir(full):
                # Explicit paths instead of chdir()/chdir('..'): the latter
                # ends up in the wrong place after entering a symlinked
                # directory and leaves the cwd changed if an error occurs.
                self.movefiles(full)
            elif os.path.isfile(full) and path != self.rootdir:
                target = os.path.join(self.rootdir, name)
                if os.path.exists(target):
                    target = os.path.join(self.rootdir,
                                          self.fileExsists(name))
                shutil.move(full, target)

    def fileExsists(self, files, index=1):
        """Return the first free "<number> <files>" name inside rootdir.

        Numbering starts at *index* and counts upwards until a name built
        from the module-level ``duplicate_file`` template is unused.
        """
        while True:
            candidate = duplicate_file.substitute(number=index,
                                                  filename=files)
            if not os.path.exists(os.path.join(self.rootdir, candidate)):
                return candidate
            index += 1

    def deletedirs(self, path=None):
        """Remove empty directories below *path* (default: cwd), bottom-up.

        Directories that cannot be removed (typically because they are not
        empty) are silently kept; numofdirs is decremented per removal.
        """
        if path is None:
            path = os.getcwd()
        for name in os.listdir(path):
            full = os.path.join(path, name)
            if not os.path.isdir(full):
                continue
            self.deletedirs(full)
            try:
                os.rmdir(full)
            except OSError:
                # Best effort by design: keep whatever cannot be removed.
                continue
            self.numofdirs -= 1

    def parseNFO(self):
        """Extract the IMDb title number from the folder's .nfo file.

        Returns the digits following "tt" in the first IMDb link found and
        stores them in ``self.movieID``.  Returns None (leaving movieID
        untouched) when there is no .nfo file or it contains no IMDb link.
        """
        nfo_path = self.findNFO()
        if nfo_path is None:
            return None
        # Any imdb.<tld> host is accepted (the old "[com,de]+" was a
        # character class, so e.g. imdb.nl never matched).
        link = re.compile(r'imdb\.[a-z]+(?:\.[a-z]+)*/(?:\S*/)?tt([0-9]+)',
                          re.IGNORECASE)
        with open(nfo_path, "r") as fobj:
            for line in fobj:
                found = link.search(line)
                if found is not None:
                    self.movieID = found.group(1)
                    return self.movieID
        return None

    def findNFO(self):
        """Flatten the tree into rootdir and return the path of an .nfo file.

        Returns None when rootdir holds no .nfo file afterwards.  Candidates
        are checked in sorted order so the choice is deterministic.
        """
        self.movefiles(self.rootdir)
        for name in sorted(os.listdir(self.rootdir)):
            full = os.path.join(self.rootdir, name)
            if (os.path.isfile(full)
                    and os.path.splitext(name)[1].lower() == ".nfo"):
                return full
        return None

    def parseImdb(self):
        """Look up ``self.movieID`` on IMDb and keep the result in self.movie."""
        self.movie = IMDbParser("www", imdb_ext, self.movieID)

    def renameAvi(self):
        """Rename the video files and then the movie folder itself.

        Video files (.avi/.mkv/.wmv, names containing "sample" excluded) get
        the ``file_template`` name; several files are numbered " CD1",
        " CD2", ... in sorted name order.  Afterwards the root folder is
        renamed to the same base name, the process is left in the folder's
        parent directory and ``self.rootdir`` points at the renamed folder.
        """
        video_exts = ("avi", "mkv", "wmv")
        base = file_template.substitute(title=self.movie.title,
                                        year=self.movie.year)

        videos = []
        for name in sorted(os.listdir(self.rootdir)):
            ext = os.path.splitext(name)[1][1:].lower()
            if (ext in video_exts
                    and "sample" not in name.lower()
                    and os.path.isfile(os.path.join(self.rootdir, name))):
                videos.append((name, ext))

        multi_part = len(videos) > 1
        for number, (name, ext) in enumerate(videos, 1):
            new_name = base
            if multi_part:
                new_name += " CD" + str(number)
            new_name += "." + ext
            shutil.move(os.path.join(self.rootdir, name),
                        os.path.join(self.rootdir, new_name))

        # Step out of the folder before renaming it.
        parent = os.path.dirname(self.rootdir)
        os.chdir(parent)
        new_root = os.path.join(parent, base)
        if new_root != self.rootdir:
            # shutil.move() refuses to move a directory onto itself, so only
            # rename when the folder is not already named correctly.
            shutil.move(self.rootdir, new_root)
            self.rootdir = new_root
class IMDbParser:
    """Download one IMDb title page and scrape basic facts from it.

    The scraped values are exposed as attributes: title, year, coverURL,
    rating, ratingInt, isTvSerie, creators/creatorsLabel (TV series) or
    directors/directorsLabel (everything else).  Whatever cannot be fetched
    or found keeps the default assigned in ``__init__``.

    The regexes are class attributes selected once, when the class is
    defined, from the module-level ``imdb_ext``: one set for the "com" page
    layout and one for the layout used otherwise.
    """

    # --- patterns that are identical for both page layouts ---------------
    directorsPattern = re.compile(r'<div[^>]*id="director-info"[^>]*>\s+<h5>(?P<label>[^<]*)</h5>\s+<div class="info-content">\s+(?P<directors>(<a[^>]+>([^<]+)</a>[^<]*<br/>\s+)+)</div>\s+</div>')
    directorsSubpattern = re.compile(r'<a[^>]+href="(?P<url>[^"]+)"[^>]*>(?P<name>[^<]+)</a>')
    creatorsPattern = re.compile(r'<div class="info">\s+<h5>(?P<label>[^<]*)</h5>\s+<div class="info-content">\s+(?P<creators>(<a[^>]+>([^<]+)</a>[^<]*<br/>\s+)+)')
    creatorsSubpattern = re.compile(r'<a href="(?P<url>[^"]+)" onclick="[^"]*">(?P<name>[^<]+)</a>')
    genrePattern = re.compile(r'<a.*href="/genre/.*>(?P<genre>[A-Za-z]*)</a>')

    # --- layout-specific patterns ------------------------------------------
    if imdb_ext == "com":
        tvSeriesPattern = re.compile(r'<span>.*TV Series.*</span>')
        titlePattern = re.compile(r'<h1 class="header">(?P<title>[^<]+)\s+<span>')
        datePattern = re.compile(r'<h4 class="inline">Release Date:</h4>\s*[0-9]{1,2}\s*[a-zA-Z]+\s*(?P<year>[0-9]{4})\s*\([a-zA-Z]+\)')
        coverPattern = re.compile(r'<td rowspan="2" id="img_primary">\s+<a[^>]*><img src="(?P<coverURL>http://[^"]+)"[^>]+></a>')
        ratingPattern = re.compile(r'<span class="rating-rating">(?P<rating>["\d\.]+)<span>')
    else:
        tvSeriesPattern = re.compile(r'<span class="tv-extra">[^<]*TV[^<]*</span>')
        titlePattern = re.compile(r'<div id="tn15title">\s+<h1>(?P<title>[^<]+)\s+<span>\((<a[^>]+>)?(?P<year>\d+)(</a>)?[^)]*\)')
        datePattern = re.compile(r'<div id="tn15title">\s+<h1>[^<]+\s+<span>\((<a[^>]+>)?(?P<year>\d+)(</a>)?[^)]*\)')
        coverPattern = re.compile(r'<div class="photo">\s+<a[^>]+><img[^>]+src="(?P<coverURL>http://[^"]+)"[^>]+></a>')
        ratingPattern = re.compile(r'<div class="starbar-meta">\s+<b>(?P<rating>[\d\.,]+/10)</b>')

    # --- helpers for decode_htmlentities (compiled once, not per call) -----
    _entityPattern = re.compile(r'&(#?)(x?)(\d{1,5}|\w{1,8});')
    # Everything except ASCII letters/digits, a-, o-, u-umlaut and space is
    # dropped from titles.  NOTE(review): the original class listed a-umlaut
    # twice and no o-umlaut, which looks like a typo; confirm the intent.
    _cleanPattern = re.compile('[^a-zA-Z0-9\xe4\xf6\xfc ]')

    # How often the page download is retried before giving up.
    _maxAttempts = 5

    def __init__(self, subdomain, extension, movieID):
        """Build the title URL from the arguments and parse the page.

        subdomain -- host prefix, e.g. "www"
        extension -- top-level domain, e.g. "com"
        movieID   -- the digits that follow "tt" in the title URL
        """
        self.subdomain = subdomain
        self.extension = extension
        self.movieID = movieID
        self.url = 'http://%s.imdb.%s/title/tt%s' % (subdomain, extension, movieID)
        # default values in case we can't parse the page
        self.title, self.year, self.rating = 'N/A', 'N/A', 'N/A'
        self.coverURL = 'http://img407.imageshack.us/img407/6493/titleaddposterw.jpg'
        self.ratingInt = 0
        self.isTvSerie = False
        self.creators, self.directors, self.actors = [], [], []
        self.creatorsLabel, self.directorsLabel, self.actorsLabel = 'Creator(s)', 'Director(s)', 'Actors'
        self.parseData()

    def substitute_entity(self, match):
        """re.sub callback: return the character for one HTML entity.

        Handles decimal (&#233;), hexadecimal (&#xe9;) and named (&eacute;)
        entities.  Anything that cannot be decoded is returned unchanged.
        """
        ent = match.group(3)
        if match.group(1) == "#":
            base = 16 if match.group(2) == 'x' else 10
            try:
                return unichr(int(ent, base))
            except (ValueError, OverflowError):
                # e.g. "&#e9;" or a code point outside the supported range
                return match.group()
        # The regex splits a leading "x" off into group 2, so glue it back
        # on for named entities such as &xi;.
        cp = n2cp.get(match.group(2) + ent)
        if cp:
            return unichr(cp)
        return match.group()

    def decode_htmlentities(self, string):
        """Decode HTML entities in *string*, then strip unwanted characters.

        Only ASCII letters, digits, spaces and the umlauts listed in
        ``_cleanPattern`` survive, which keeps the result usable as a file
        name.
        """
        decoded = self._entityPattern.sub(self.substitute_entity, string)
        return self._cleanPattern.sub("", decoded)

    def _fetch_html(self):
        """Return the page source, or None if no usable page was obtained.

        The download is retried up to ``_maxAttempts`` times; a page only
        counts as usable when ``titlePattern`` matches it.
        """
        headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
        for _ in range(self._maxAttempts):
            # data=None makes this a plain GET.  The previous code POSTed
            # unrelated sample form fields to the title page.
            req = urllib2.Request(self.url, None, headers)
            try:
                response = urllib2.urlopen(req)
                try:
                    html = response.read()
                finally:
                    response.close()
            except IOError:
                # urllib2.URLError and socket errors derive from IOError;
                # sometimes urlopen fails transiently, so retry.
                logging.debug('Download of %s failed', self.url, exc_info=True)
                continue
            if self.titlePattern.search(html) is not None:
                return html
        return None

    def _parse_people(self, html, pattern, subpattern, group, label):
        """Return (label, [IMDbPerson, ...]) scraped via the given patterns.

        When *pattern* does not match, the passed-in *label* is kept and the
        list holds a single 'N/A' placeholder person.
        """
        block = pattern.search(html)
        if block is None:
            return label, [IMDbPerson('N/A', '')]
        people = [IMDbPerson(m.group('name'), m.group('url'))
                  for m in subpattern.finditer(block.group(group))]
        return block.group('label'), people

    def parseData(self):
        """Download the title page and fill in the instance attributes.

        Fields that are not found on the page keep their defaults instead of
        raising.  If the page cannot be fetched at all, a warning is logged
        and every attribute keeps its default.
        """
        html = self._fetch_html()
        if html is None:
            logging.warning('Unable to get html content after %s attempts...', self._maxAttempts)
            return

        self.isTvSerie = self.tvSeriesPattern.search(html) is not None
        logging.debug('Is this a TV show ? ' + str(self.isTvSerie))

        # parse title and year
        m = self.titlePattern.search(html)
        self.title = self.decode_htmlentities(m.group('title'))
        m = self.datePattern.search(html)
        if m is not None:
            self.year = m.group('year')

        # parse cover URL
        m = self.coverPattern.search(html)
        if m is not None:
            self.coverURL = m.group('coverURL')

        # parse rating
        m = self.ratingPattern.search(html)
        if m is None:
            self.rating = 'N/A'
            self.ratingInt = 0
        else:
            self.rating = m.group('rating')
            digits = self.rating.replace('/10', '').replace('.', '').replace(',', '')
            try:
                self.ratingInt = int(digits)
            except ValueError:
                # the "com" pattern also admits quote characters
                self.ratingInt = 0

        if self.isTvSerie:
            # parse creators
            self.creatorsLabel, self.creators = self._parse_people(
                html, self.creatorsPattern, self.creatorsSubpattern,
                'creators', self.creatorsLabel)
        else:
            # parse directors
            self.directorsLabel, self.directors = self._parse_people(
                html, self.directorsPattern, self.directorsSubpattern,
                'directors', self.directorsLabel)

    def __str__(self):
        """Return a multi-line, human-readable summary of the scraped data."""
        text = '\n'.join([
            'Title: %s' % (self.title,),
            'TvSerie: %s' % (self.isTvSerie,),
            'Year: %s' % (self.year,),
            'URL: %s' % (self.url,),
            'Cover URL: %s' % (self.coverURL,),
            'Rating: %s' % (self.rating,),
        ])
        if not isinstance(text, str):
            # Python 2 only: a decoded title makes the summary a unicode
            # object, but __str__ has to hand back a byte string.
            text = text.encode('utf-8')
        return text
def main(argv=None):
    """Rename the movie folder given on the command line from IMDb data.

    argv -- argument list in ``sys.argv`` layout (program name first);
            defaults to ``sys.argv``.

    Exits with an error message when the path argument is missing or is not
    a directory, when no IMDb link is found in the folder's .nfo file, or
    when the IMDb page could not be parsed.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        sys.exit('Need Path as Argument')
    if not os.path.isdir(argv[1]):
        sys.exit("Error : Path " + argv[1] + " doesnt exsist")

    fileh = FileHandler(argv[1])
    if fileh.parseNFO() is None:
        # Without an id there is nothing to look up.
        sys.exit("Error : No IMDb link found in a .nfo file under " + argv[1])
    fileh.parseImdb()
    if 'N/A' in (fileh.movie.title, fileh.movie.year):
        # 'N/A' is the parser's placeholder for data it could not scrape; it
        # contains a path separator and must not end up in a file name.
        sys.exit("Error : Could not read title/year from " + fileh.movie.url)
    fileh.renameAvi()


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement