# Part 2: Parsing the saved HTML

'''Now that we've gotten the HTML source of all 68 pages of games from SteamPowered.com,
 we need to find the game data on each page. In jQuery it'd be something like:
 $('[class^=gametransaction ]'), but all I have for now is bs4. So let's get started.'''

from bs4 import BeautifulSoup
import glob

import json

import datetime
import time

# Need a JSON structure to store all the games, maybe something like:
"""
 {game title : {
               title  : value,
               genres : values,
               steam_release : date,
               metascore : value,
               price : value
               }

               }

"""

#----------------------------------------------------------------------
def saveJsonToFile(filepath, the_dict):
    """Serialize ``the_dict`` as JSON and write it to ``filepath``.

    The file is opened in write mode, so any existing content is replaced.
    ``the_dict`` is handed straight to ``json.dump``; despite the parameter
    name, the script's entry point passes a list of dicts.

    filepath -- path of the file to write
    the_dict -- the JSON-serializable object to store
    """
    with open(filepath, 'w') as json_file:
        # `encoding` is the Python 2 json option naming the charset used to
        # decode any byte-string values found in the data.
        json.dump(the_dict, json_file, encoding='iso-8859-1')

try:
    _text_type = unicode  # Python 2: the text type is `unicode`
except NameError:
    _text_type = str  # Python 3: `str` is already the text type


def _values_to_unicode(value):
    """Return ``value`` with every key and scalar leaf converted to text.

    Dicts and lists/tuples are walked recursively (sequences come back as
    lists); anything else is converted with the text type.
    """
    if isinstance(value, dict):
        return dict((_text_type(key), _values_to_unicode(item))
                    for key, item in value.items())
    if isinstance(value, (list, tuple)):
        return [_values_to_unicode(item) for item in value]
    return _text_type(value)


def convert_keys_vals_to_unicode(dictionary):
    """Recursively convert a dictionary's keys and values to unicode text.

    Nested dicts and lists keep their structure; only their keys and scalar
    leaves are converted. (Previously every value was stringified *before*
    recursing, so a list such as the genres became its repr string.)

    dictionary -- the dict to convert
    Returns a new dict; a non-dict argument is returned unchanged.
    """
    if not isinstance(dictionary, dict):
        return dictionary
    return _values_to_unicode(dictionary)


#create a function that pulls the data from the html page
#----------------------------------------------------------------------
def _text_of_class(soup, css_class):
    """Return the text of the first element with class ``css_class``, or ''."""
    tag = soup.find(attrs={'class': css_class})
    return tag.text if tag is not None else ''


def _split_genres(p_text):
    """Return the genres listed before the ' - Released:'/' - Available:' marker."""
    for marker in (' - Released:', ' - Available:'):
        if marker in p_text:
            # Partition on the exact marker that was tested for, so the last
            # genre does not keep a trailing space.
            genres_unsplit = p_text.partition(marker)[0]
            break
    else:
        genres_unsplit = ''
    # ''.split(', ') would yield [''], so report "no genres" as an empty list.
    return genres_unsplit.split(', ') if genres_unsplit else []


def parseSoupForInfo(soup):
    """Pull the title, genres, release date, metascore and price from one row.

    soup -- the BeautifulSoup element for a single search result (the script
            passes the rows returned by parseHtmlForGameRow)

    Returns a dict with the keys 'price', 'metascore', 'released', 'title'
    and 'genre', passed through convert_keys_vals_to_unicode. A field whose
    element is missing is stored as '' ('genre' as an empty list).
    """
    game_dict = {}

    # Prefer the non-sale price, which is the struck-out text when it exists.
    price_tag = soup.find(attrs={'class': "col search_price"})
    if price_tag is None:
        game_dict['price'] = ''
    else:
        strike_tag = price_tag.find('strike')
        if strike_tag is not None:
            game_dict['price'] = strike_tag.text
        else:
            game_dict['price'] = price_tag.text

    game_dict['metascore'] = _text_of_class(soup, "col search_metascore")

    # The release date is kept as raw text: values are not always parseable
    # dates (e.g. 'Fall 2012').
    game_dict['released'] = _text_of_class(soup, "col search_released")

    # The game name lives in the row's <h4>.
    name_tag = soup.find('h4')
    game_dict['title'] = name_tag.text if name_tag is not None else ''

    # The genres are the leading, comma-separated part of the row's <p> text.
    p_tag = soup.find('p')
    p_text = p_tag.text.strip() if p_tag is not None else ''
    game_dict['genre'] = _split_genres(p_text)

    return convert_keys_vals_to_unicode(game_dict)

#create a function that pulls out the games for the page source
def parseHtmlForGameRow(html):
    """Collect the search-result rows from one page of HTML source.

    html -- the page source as a string

    Returns every <a> element whose class starts with 'search_result',
    as found by BeautifulSoup.
    """
    def is_search_result(css_class):
        # The class may be missing (None), so guard before testing the prefix.
        return css_class and css_class.startswith('search_result')

    print('making a soup')
    page_soup = BeautifulSoup(html)
    return page_soup.find_all('a', attrs={'class': is_search_result})


if __name__ == '__main__':
    # Script entry point: read every saved search page, extract the game
    # rows, turn each row into a dict and dump the whole list as JSON.
    html_paths = glob.glob(r"searchResults/*.html")

    # Stage 1: load the source of every saved HTML page.
    html_pages = []
    for html_path in html_paths:
        with open(html_path) as html_file:
            html_pages.append(html_file.read())
    print('done reading all the html to a list')

    # Stage 2: gather the game rows from every page into one flat list.
    all_game_rows = []
    for page_source in html_pages:
        all_game_rows.extend(parseHtmlForGameRow(page_source))
    print("done reading all the rows into a list")

    # Stage 3: extract the data we care about from each row.
    mega_game_list = [parseSoupForInfo(game_row) for game_row in all_game_rows]
    print("done adding all the dicts to the big huge list")

    print("Parsed {0} games, and now will try to write them to a file".format(len(mega_game_list)))

    saveJsonToFile('steam_games_fuck.json', mega_game_list)

    print("DONE! \t Saved the JSON to file")