Advertisement
Mr_HO1A

IMDb Scraper

May 25th, 2019
35
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.60 KB | None | 0 0
  1. '''
  2. Author : Aman Vishwakarma
  3. Windows 10
  4. MIT
  5. '''
  6.  
  7. from bs4 import BeautifulSoup
  8. import requests
  9. import json
  10. import re
  11.  
  12. def GetData(number):
  13.     print(number)
  14.     print((number*250)+1)
  15.     fileToWrite = open("data.txt","a+")
  16.     url = 'https://www.imdb.com/search/title?title_type=feature,tv_movie&colors=color,black_and_white&count=250&start={0}&ref_=adv_nxt'.format(
  17.         str((number*250)+1))
  18.     response = requests.get(url)
  19.     soup = BeautifulSoup(response.text, "lxml")
  20.     defautUrl = "https://www.imdb.com"
  21.  
  22.     # Movie Container List
  23.     movie_Container = soup.find_all('div', class_="lister-item mode-advanced")
  24.     for movie in movie_Container:
  25.         # Get name of movie
  26.         name = movie.h3.a.text
  27.         # Get Movie Url
  28.         movieUrl = defautUrl + movie.h3.a["href"]
  29.         # Now Find Year
  30.         year = movie.h3.find('span', class_="lister-item-year text-muted unbold")
  31.         yearOfRelease = year.text
  32.         # Now check the disc
  33.         Desc = movie.find_all('p', class_="text-muted")[1].text
  34.         # Now Some More Data
  35.         MovieData = movie.find_all('p', class_="text-muted")[0].text
  36.         # Star Rating Can be None
  37.         StarRating = "N/A"
  38.         try:
  39.             StarRating = movie.find('div', class_="inline-block ratings-imdb-rating")['data-value']
  40.         except:
  41.             StarRating = "N/A"
  42.         # MetaScore
  43.         MetaScore = "N/A"
  44.         try:
  45.             MetaScore = movie.find('span', class_="metascore favorable").text
  46.         except:
  47.             MetaScore = "N/A"
  48.         # Votes count
  49.         Votes = "N/A"
  50.         try:
  51.             Votes = movie.find('span', attrs={'name': 'nv'})['data-value']
  52.         except:
  53.             continue
  54.         # For Director and Star Cast
  55.         ListerItem = movie.find('div',class_='lister-item-content')
  56.         getAllPTags = ListerItem.find_all('p')
  57.         CastString = getAllPTags[2].text
  58.         movieDump = {
  59.             "name":name,
  60.             "movieUrl":movieUrl,
  61.             "YOR":yearOfRelease,
  62.             "desc":Desc,
  63.             "movieData":MovieData,
  64.             "starCast":CastString,
  65.             "starRating":StarRating,
  66.             "metaScore":MetaScore,
  67.             "voteCount":Votes
  68.         }
  69.         fileToWrite.write(json.dumps(movieDump)+","+"\n")
  70.     fileToWrite.close()
  71.  
  72. def main():
  73.     print('''
  74.    Enter Number Below
  75.    Movies are dumped on multiple of 250
  76.    i.e if you enter 1 => 1*250 Movies will be scraped
  77.    ''')
  78.     number = int(input("Enter The Number > "))
  79.     for x in range(0,number+1):
  80.         GetData(x)
  81.  
  82. if __name__ == '__main__':
  83.     main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement