Advertisement
TankorSmash

Part 2 Parsing the saved HTML

Sep 12th, 2012
249
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.44 KB | None | 0 0
'''Now that we've gotten the HTML source of all 68 pages of games from SteamPowered.com,
we need to find the game data on each page. In jQuery it'd be something like
$('[class^=gametransaction ]'), but all I have for now is bs4. So let's get started.'''
  4.  
  5. from bs4 import BeautifulSoup
  6. import glob
  7.  
  8. import json
  9.  
  10. import datetime
  11. import time
  12.  
  13. #need to have a json to store all the games maybe something like:
  14. """
  15. {game title : {
  16.               title  : value,
  17.               genres : values,
  18.               steam_release : date,
  19.               metascore : value,
  20.               price : value
  21.               }
  22.  
  23.               }
  24.  
  25. """
  26.  
  27. #----------------------------------------------------------------------
  28. def saveJsonToFile(filepath, the_dict):
  29.     """saves a json to the filepath, requires a dict to be passed in"""
  30.    
  31.     with open(filepath, 'w') as f:
  32.         json.dump(the_dict, f, encoding='iso-8859-1')
  33.  
  34. def convert_keys_vals_to_unicode(dictionary):
  35.     """Recursively converts dictionary keys to unicode"""
  36.     if not isinstance(dictionary, dict):
  37.         return dictionary
  38.     return dict((unicode(k), convert_keys_vals_to_unicode(unicode(v)))
  39.         for k, v in dictionary.items())
  40.  
  41.  
  42. #create a function that pulls the data from the html page
  43. #----------------------------------------------------------------------
  44. def parseSoupForInfo(soup):
  45.     """parses the given data and pulls out the title, genres,
  46. release date, metascore, price
  47. returns a dict with all of those elems in it"""
  48.  
  49.     #dict to hold all the found data
  50.     game_dict = {}
  51.    
  52.     #find the non-sale price, which is the strikeout text
  53.     # if it exists
  54.     priceTag = soup.find(attrs={'class': "col search_price"})
  55.     if priceTag.find('strike'):
  56.         game_dict['price'] = priceTag.find('strike').text
  57.     else:
  58.         game_dict['price'] = priceTag.text
  59.    
  60.    
  61.     #find the metascore
  62.     metascore = soup.find(attrs= {'class' :"col search_metascore"})
  63.     if metascore:
  64.         game_dict['metascore'] = metascore.text
  65.     else:
  66.         game_dict['metascore'] = ''
  67.    
  68.     #find the steam release data
  69.     time_format = "%d %b %Y"
  70.     alt_time_format = "%B %Y"
  71.     date = soup.find(attrs= {'class' :"col search_released"})
  72.     if date:
  73.         ##parse the date in : day, abbr month, yyyy
  74.         #try:
  75.             #date_obj = datetime.datetime.strptime(date.text, time_format)
  76.         ##if that doesn't work, try: full month, yyyy
  77.         #except ValueError as e:
  78.             #try:
  79.                 #date_obj = datetime.datetime.strptime(date.text, alt_time_format)
  80.             ##if THAT doesn't work, just use the text, because it's probably 'Fall 2012' etc
  81.             #except ValueError as e:
  82.                 #date_obj = date.text
  83.                
  84.            
  85.         date_obj = date.text
  86.         game_dict['released'] = date_obj
  87.     else:
  88.         game_dict['released'] = ''
  89.    
  90.     #find the game name
  91.     name = soup.find('h4')
  92.     if name:
  93.         game_dict['title'] = name.text
  94.     else:
  95.         game_dict['title'] = ''
  96.    
  97.     #find the genre, this'll be ugly.
  98.     pTag = soup.find('p')
  99.     pText = pTag.text.strip()
  100.     if " - Released:" in pText:
  101.         genresUnsplit = pText.partition(' - Released:')[0]
  102.     elif " - Available:" in pText:
  103.         genresUnsplit = pText.partition("- Available:")[0]
  104.     else:
  105.         genresUnsplit = ""
  106.    
  107.     genres = genresUnsplit.split(', ')
  108.     game_dict['genre'] = genres
  109.    
  110.    
  111.    
  112.     return convert_keys_vals_to_unicode(game_dict)
  113.     pass
  114.  
  115. #create a function that pulls out the games for the page source
  116. def parseHtmlForGameRow(html):
  117.     """looks through an entire page source for search_results
  118. returns a list of all those results"""
  119.     print 'making a soup'
  120.     soup = BeautifulSoup(html)
  121.    
  122.     #Finds all elements that start with 'search_result'
  123.     rows = soup.findAll('a', attrs={'class': lambda x:x and x.startswith('search_result')})
  124.    
  125.     #returns all the soups for each of the rows
  126.     return rows
  127.  
  128.  
  129. if __name__ == '__main__':
  130.     """Run the following if module is top module"""
  131.     #big list that will hold the data for all the games
  132.     mega_game_list = []
  133.    
  134.    
  135.     list_of_filepaths = glob.glob(r"searchResults/*.html")
  136.     #print list_of_filepaths
  137.    
  138.     #read all the html files into a list
  139.     list_of_html_pages = []
  140.     for path in list_of_filepaths:
  141.         with open(path) as f:
  142.             page_source = f.read()            
  143.             list_of_html_pages.append(page_source)
  144.     print 'done reading all the html to a list'
  145.                        
  146.     #now that we have all the html pages, we need to pull out
  147.     # the rows with game data in them
  148.     list_of_gameRows = []
  149.     for html in list_of_html_pages:    
  150.         gameRows = parseHtmlForGameRow(html)
  151.         for gameRow in gameRows:
  152.             list_of_gameRows.append(gameRow)
  153.            
  154.     print "done reading all the rows into a list"
  155.        
  156.     #now we've got all the rows of data. Let's pull out the
  157.     # data we want to deal with.
  158.     for row in list_of_gameRows:
  159.         game_dict = parseSoupForInfo(row)        
  160.         mega_game_list.append(game_dict)
  161.     print "done adding all the dicts to the big huge list"
  162.    
  163.     print "Parsed {0} games, and now will try to write them to a file".format(len(mega_game_list))
  164.                
  165.     saveJsonToFile('steam_games_fuck.json', mega_game_list)
  166.    
  167.     print "DONE! \t Saved the JSON to file"
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement