Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- '''now that we've gotten the HTML source of all 68 pages of games from SteamPowered.com
- we need to find the game data on each page. in jQuery it'd be something like:
- $('[class^=gametransaction ]') but all I have for now is bs4. So let's get started'''
- from bs4 import BeautifulSoup
- import glob
- import json
- import datetime
- import time
- #need to have a json to store all the games maybe something like:
- """
- {game title : {
- title : value,
- genres : values,
- steam_release : date,
- metascore : value,
- price : value
- }
- }
- """
- #----------------------------------------------------------------------
def saveJsonToFile(filepath, the_dict):
    """Serialize the given object to JSON at *filepath*.

    :param filepath: destination path; file is created/overwritten.
    :param the_dict: any json-serializable object (the caller passes a
                     list of per-game dicts).
    """
    # The old py2-only encoding='iso-8859-1' kwarg was dropped: every
    # value is already unicode (see convert_keys_vals_to_unicode) and
    # ensure_ascii defaults to True, so the output bytes are unchanged.
    with open(filepath, 'w') as f:
        json.dump(the_dict, f)
def convert_keys_vals_to_unicode(dictionary):
    """Recursively convert all dict keys and leaf values to unicode.

    Bug fix: the old code did ``convert_keys_vals_to_unicode(unicode(v))``,
    which stringified nested dicts/lists to their Python repr *before* the
    recursive call could descend into them. Recurse first, and only apply
    unicode() to leaf values.

    :param dictionary: a dict, list, or leaf value.
    :returns: the same structure with unicode keys and leaf values.
    """
    if isinstance(dictionary, dict):
        return dict((unicode(k), convert_keys_vals_to_unicode(v))
                    for k, v in dictionary.items())
    if isinstance(dictionary, list):
        # Preserve lists (e.g. the genre list) as JSON arrays instead of
        # collapsing them to a repr string.
        return [convert_keys_vals_to_unicode(item) for item in dictionary]
    return unicode(dictionary)
- #create a function that pulls the data from the html page
- #----------------------------------------------------------------------
def parseSoupForInfo(soup):
    """Parse one search-result row and extract the game's data.

    :param soup: a bs4 tag for a single search-result <a> row
                 (as returned by parseHtmlForGameRow).
    :returns: dict with keys 'price', 'metascore', 'released', 'title'
              and 'genre' (a list of genre strings); missing fields
              default to '' (or [] for genre). All keys/values are
              passed through convert_keys_vals_to_unicode.
    """
    game_dict = {}

    # Non-sale price: when a game is on sale the original price is the
    # struck-out (<strike>) text inside the price cell.
    priceTag = soup.find(attrs={'class': "col search_price"})
    if priceTag is None:
        game_dict['price'] = ''
    elif priceTag.find('strike'):
        game_dict['price'] = priceTag.find('strike').text
    else:
        game_dict['price'] = priceTag.text

    metascore = soup.find(attrs={'class': "col search_metascore"})
    game_dict['metascore'] = metascore.text if metascore else ''

    # Release date is kept as raw text: values like 'Fall 2012' do not
    # parse with a fixed strptime format, so no normalization is done.
    date = soup.find(attrs={'class': "col search_released"})
    game_dict['released'] = date.text if date else ''

    name = soup.find('h4')
    game_dict['title'] = name.text if name else ''

    # Genres are the comma-separated text before either ' - Released:'
    # or ' - Available:' in the row's <p> tag. (The Available branch
    # previously partitioned on "- Available:" without the leading
    # space, leaving a trailing space on the last genre.)
    pTag = soup.find('p')
    pText = pTag.text.strip() if pTag else ''
    if " - Released:" in pText:
        genresUnsplit = pText.partition(' - Released:')[0]
    elif " - Available:" in pText:
        genresUnsplit = pText.partition(' - Available:')[0]
    else:
        genresUnsplit = ""
    # Empty marker text yields an empty genre list, not [''].
    game_dict['genre'] = genresUnsplit.split(', ') if genresUnsplit else []

    return convert_keys_vals_to_unicode(game_dict)
- #create a function that pulls out the games for the page source
- def parseHtmlForGameRow(html):
- """looks through an entire page source for search_results
- returns a list of all those results"""
- print 'making a soup'
- soup = BeautifulSoup(html)
- #Finds all elements that start with 'search_result'
- rows = soup.findAll('a', attrs={'class': lambda x:x and x.startswith('search_result')})
- #returns all the soups for each of the rows
- return rows
- if __name__ == '__main__':
- """Run the following if module is top module"""
- #big list that will hold the data for all the games
- mega_game_list = []
- list_of_filepaths = glob.glob(r"searchResults/*.html")
- #print list_of_filepaths
- #read all the html files into a list
- list_of_html_pages = []
- for path in list_of_filepaths:
- with open(path) as f:
- page_source = f.read()
- list_of_html_pages.append(page_source)
- print 'done reading all the html to a list'
- #now that we have all the html pages, we need to pull out
- # the rows with game data in them
- list_of_gameRows = []
- for html in list_of_html_pages:
- gameRows = parseHtmlForGameRow(html)
- for gameRow in gameRows:
- list_of_gameRows.append(gameRow)
- print "done reading all the rows into a list"
- #now we've got all the rows of data. Let's pull out the
- # data we want to deal with.
- for row in list_of_gameRows:
- game_dict = parseSoupForInfo(row)
- mega_game_list.append(game_dict)
- print "done adding all the dicts to the big huge list"
- print "Parsed {0} games, and now will try to write them to a file".format(len(mega_game_list))
- saveJsonToFile('steam_games_fuck.json', mega_game_list)
- print "DONE! \t Saved the JSON to file"
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement