Advertisement
tsounakis

Scraping ver. 1

May 16th, 2022
478
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.42 KB | None | 0 0
  1. import urllib.request
  2. import re
  3.  
  4.  
  5. class Game:
  6.     def __init__(self):
  7.         self.price = None
  8.         self.name = None
  9.         self.rating = None
  10.         self.yearRelease = None
  11.         self.genre = None
  12.  
  13.  
  14. def readNames(file):
  15.     names = []
  16.     game_names = re.findall(
  17.         "<h2 class=\"search-results-row-game-title\">+[a-zA-Z0-9 ]+</h2>", file)
  18.  
  19.     try:
  20.         for game_name in game_names:
  21.             names.append(re.findall("(?<=\>)(.*?)(?=\<)", game_name))
  22.     except:
  23.         print('Error during reading the HTML file.')
  24.     return names
  25.  
  26.  
  27. def readRatings(file):
  28.     ratings = []
  29.     game_ratings = re.findall(
  30.         "    .*    <div class=\"metacritic-spinner-wrapper\">", file)
  31.     try:
  32.         for game in game_ratings:
  33.             ratings.append(re.findall("[+-]?[0-9]+\.?[0-9]*", game))
  34.     except:
  35.         print('Error during reading the HTML file (Ratings).')
  36.     return ratings
  37.  
  38.  
  39. def readYearReleasedAndGenre(file):
  40.     year = []
  41.     genre = []
  42.     games_info = re.findall(
  43.         "<div class=\"search-results-row-game-infos\">(.*)</div>", file)
  44.     try:
  45.         for game in games_info:
  46.             year.append(re.findall("(.*?)\ -", game))
  47.             genre.append(re.findall("\- (.*?)$", game))
  48.     except:
  49.         print('Error during reading the HTML file (Years and genre).')
  50.     return year, genre
  51.  
  52.  
  53. def readPrice(file):
  54.     prices = []
  55.     games_info = re.findall(
  56.         "([0-9]+\.?[0-9]*€)", file)
  57.     try:
  58.         for game in games_info:
  59.             prices.append(re.findall("[+-]?[0-9]+\.?[0-9]*", game))
  60.     except:
  61.         print('Error during reading the HTML file (Prices).')
  62.     return prices
  63.  
  64.  
  65. website = urllib.request.urlopen(
  66.     'https://www.allkeyshop.com/blog/catalogue/genre-pc-games-all/')
  67. html_file = website.read()
  68. text_file = html_file.decode()
  69.  
  70. names = readNames(text_file)
  71. [years, genre] = readYearReleasedAndGenre(text_file)
  72. ratings = readRatings(text_file)
  73. prices = readPrice(text_file)
  74. games = []
  75. for index in range(len(names)):
  76.     games.append(Game())
  77.     games[index].name = names[index]
  78.     games[index].years = years[index]
  79.     games[index].genre = genre[index]
  80.     games[index].rating = ratings[index]
  81.     games[index].price = prices[index]
  82.  
  83. print("Game name:\tPrice:\tYear released:\tGenre:\tRating:\t")
  84. for game in games:
  85.     print("{}\t{}\t{}\t{}\t{}".format(
  86.         game.name, game.price, game.years, game.genre, game.rating))
  87.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement