Advertisement
Masoko

IMDB BS parser

Feb 12th, 2017
168
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.21 KB | None | 0 0
  1. import urllib
  2. from bs4 import BeautifulSoup  as bs
  3.  
  4. def get_media_imdb(imdbid):
  5.     url = "http://www.imdb.com/title/"+imdbid+"/"
  6.    
  7.     soup = bs(urllib.urlopen(url), "html5lib")
  8.     #pare rating
  9.     ss = soup.findAll('span',attrs={'itemprop':'ratingValue'})
  10.     try:
  11.         rating = str(ss).split(">")[1].split("<")[0]   
  12.     except:
  13.         rating = None
  14.    
  15.     #parse poster
  16.     poster = soup.findAll('div',attrs={'class':'poster'})
  17.     poster = str(poster).split('"')[9]
  18.    
  19.     #parse genre
  20.     gen = soup.findAll('div',attrs={'class':'subtext'})
  21.     x = bs(str(gen), "html5lib")
  22.     gen = x.findAll('span',attrs={'class':'itemprop'})
  23.     genre = []
  24.     for g in gen:
  25.         genre.append(str(g).split(">")[1].split("<")[0])
  26.    
  27.     #parse title and year
  28.     t = soup.findAll('h1')
  29.     title = str(t).split(">")[1].split("<")[0].split("\\")[0]
  30.    
  31.     y = bs(str(t), "html5lib")
  32. #   if "TV Series" in str(y):
  33. #       print "Serial"
  34. #   else:
  35. #       print "Movie"
  36.        
  37.     year = y.findAll('a')
  38.     try:
  39.         year = str(year).split(">")[1].split("<")[0]
  40.         type = "Movie"
  41.     except:
  42.         type = "Serial"
  43.    
  44.     result = {'Type': type, 'Year': year, 'Title': title, 'imdbRating': rating, 'Poster': poster, 'Genre': genre}
  45.     return result
  46.        
  47. if __name__ == "__main__":
  48.     print get_media_imdb('tt4846340')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement