Advertisement
ChrisProsser

GetFootballData.py

Nov 17th, 2014
833
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.62 KB | None | 0 0
  1. #!/usr/bin/python
  2. """Get csv files of football data"""
  3. import urllib2, os, subprocess
  4. from datetime import date as dt
  5.  
# constants
BASE_URL = 'http://www.football-data.co.uk/'  # site hosting the csv data files
# Windows-only: 'userprofile' env var is the user's home directory
OUTPUT_PATH = os.path.join(os.getenv('userprofile'), 'FootballData')
YEARS_TO_GO_BACK = 25  # how many seasons back from the current year to fetch
VERBOSE = True  # print progress information while running
# league names (lower case, matched against lower-cased link text)
VALID_LEAGUES = ['premier league',
                 'division 1',
                 'division 2',
                 'division 3',
                 'championship',
                 'league 1',
                 'league 2',
                 'conference']
  19.  
  20. class Season(object):
  21.     """Class to hold data relating to a season and retrieve files."""
  22.  
  23.     def __init__(self, year, index_html):
  24.         """Initialise Season class"""
  25.         self.year = year
  26.        
  27.         # generate heading to find
  28.         self.heading = 'season ' + str(year) + '/' + str(year+1)
  29.         if VERBOSE: print("\n{0}:").format(self.heading)
  30.  
  31.         # get part of document containing heading
  32.         doc_fragment = self.__get_fragment(index_html, self.heading, 'season')
  33.  
  34.         # get links for each league
  35.         pos, prev_pos = 0, -1
  36.         self.datafiles = {} # dictionary containing data files
  37.         while pos > prev_pos:
  38.             prev_pos = pos
  39.             url, name, pos = self.get_next_link(doc_fragment, pos)
  40.  
  41.             if name in VALID_LEAGUES:
  42.                 ## append base url if needed
  43.                 if url[0:4] != 'http':
  44.                     url = BASE_URL + url
  45.  
  46.                 self.datafiles[name] = url
  47.                 if VERBOSE:
  48.                     print("  found {0}, url: {1}, pos: {2}").format(name, url, pos)
  49.            
  50.     def __get_fragment(self, doc_text, start_text, end_text):
  51.         """Private method to return text inbetween start and end values."""
  52.         start_pos = doc_text.find(start_text)
  53.         end_pos = doc_text.find(end_text, start_pos+1)
  54.         return doc_text[start_pos:end_pos]
  55.    
  56.     def get_next_link(self, html, start_pos):
  57.         """Get the next link from some html."""
  58.         start_link = html.find('<a href=', start_pos)
  59.        
  60.         if start_link == -1:
  61.             url, text, lt = None, None, 0
  62.         else:
  63.             # get url
  64.             start_quote = html.find('"', start_link)
  65.             end_quote = html.find('"', start_quote + 1)
  66.             url = html[start_quote + 1:end_quote]
  67.  
  68.             # get text
  69.             gt = html.find(">", end_quote)
  70.             lt = html.find("<", gt)
  71.             text = html[gt+1:lt]
  72.            
  73.         return url, text, lt+1
  74.  
  75. def get_seasons(index_html):
  76.     """Creates object for each season"""
  77.     data = {}
  78.  
  79.     # loop for no of years to go back, create a season object for each
  80.     for years_back in range(YEARS_TO_GO_BACK):
  81.         yr = dt.today().year - years_back
  82.         try:
  83.             data[yr] = Season(yr, index_html)
  84.         except:
  85.             print("Error getting data for year: {0}, breaking loop.").format(yr)
  86.             break
  87.     return data
  88.  
  89. def save_file(file_path, content):
  90.     """Saves the content to a file in the path provided"""
  91.     file_obj = open(file_path, 'w')
  92.     file_obj.write(content)
  93.     file_obj.close
  94.  
  95. def validate_dir(folder):
  96.     """Creates a directory if it doesn't already exist."""
  97.     if not os.path.exists(folder):
  98.         os.mkdir(folder)
  99.  
  100. def get_page(url):
  101.     """Get data from URL"""
  102.     return urllib2.urlopen(url).read()
  103.        
  104. def main():
  105.     """Main control function"""
  106.  
  107.     # get the html for the index page
  108.     index_html = get_page(BASE_URL+'englandm.php')
  109.     validate_dir(OUTPUT_PATH)
  110.     save_file(os.path.join(OUTPUT_PATH, 'IndexPage.html'), index_html)
  111.  
  112.     # create dictionary of season objects (year as key)
  113.     seasons = get_seasons(index_html.lower())
  114.  
  115.     # get data files based on the links found for each season
  116.     for year, season_obj in seasons.items():
  117.         if VERBOSE: print("getting files for {0}...").format(season_obj.heading)
  118.         for league, url in season_obj.datafiles.items():
  119.             f_name = os.path.join(OUTPUT_PATH,
  120.                                   str(year)+'_'+league.replace(' ', '')+'.csv')
  121.  
  122.             # check if file already exists, only replace if it's for this year
  123.             if not os.path.exists(f_name) or year == dt.today().year:
  124.  
  125.                 # download and save the file
  126.                 try:
  127.                     save_file(f_name, get_page(url))
  128.                 except:
  129.                     print("Error getting {0} data from {1}"). format(league, url)
  130.  
  131.     # open the folder
  132.     print('\nOpening directory...')
  133.     subprocess.Popen('explorer "'+OUTPUT_PATH+'"')
  134.  
  135. if __name__ == '__main__':
  136.     main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement