Advertisement
j7sx

imdb parser

Sep 1st, 2015
113
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.03 KB | None | 0 0
  1. import urllib.request
  2. from bs4 import BeautifulSoup
  3. import pandas as pd
  4.  
  5.  
  6. def get_html(url):
  7.     response = urllib.request.urlopen(url)
  8.     return response.read()
  9.  
  10.  
  11. def parse(html):
  12.     soup = BeautifulSoup(html)
  13.     table = soup.find('table', class_='results')
  14.  
  15.     horror = []
  16.  
  17.     for row in table.find_all('tr')[1:]:
  18.         cols = row.find_all('td')
  19.         year = row.find_all('span')
  20.         rating = row.select('span.value')[0].text if row.select('span.value') else '0.0'
  21.  
  22.         horror.append([
  23.             cols[0].text,  # № п/п
  24.             cols[2].a.text,  # название
  25.             year[1].text,  # год
  26.             rating  # рейтинг
  27.         ])
  28.  
  29.     df = pd.DataFrame(data=horror, columns=['Номер', 'Название', 'Год', 'Рейтинг'])
  30.     df = df.set_index('Номер')
  31.     print(df.head(len(horror)))
  32.  
  33.  
  34. def main():
  35.     parse(get_html('http://www.imdb.com/search/title?at=0&genres=horror&sort=user_rating&title_type=feature'))
  36.  
  37.  
  38. if __name__ == '__main__':
  39.     main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement