Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/python
"""Get csv files of football data"""
import urllib2, os, subprocess
from datetime import date as dt
# constants
# Site root; relative links found on the index page are joined onto this.
BASE_URL = 'http://www.football-data.co.uk/'
# Download target folder.  NOTE(review): 'userprofile' is a Windows-only
# environment variable; os.getenv returns None elsewhere and the join
# would raise -- confirm this script is intended to be Windows-only.
OUTPUT_PATH = os.path.join(os.getenv('userprofile'), 'FootballData')
# How many seasons of history to fetch, counting back from the current year.
YEARS_TO_GO_BACK = 25
# When True, progress messages are printed to stdout.
VERBOSE = True
# Link texts (lower-cased) identifying the league files we keep.
VALID_LEAGUES = ['premier league',
                 'division 1',
                 'division 2',
                 'division 3',
                 'championship',
                 'league 1',
                 'league 2',
                 'conference']
class Season(object):
    """Holds the data-file links for one football season.

    Scans the (lower-cased) index-page html for the heading
    'season <year>/<year+1>' and collects into ``self.datafiles`` a
    {league name: csv url} mapping for every link under that heading
    whose text appears in VALID_LEAGUES.
    """

    def __init__(self, year, index_html):
        """Parse *index_html* for the season starting in *year*.

        Raises:
            ValueError: if the season heading is not found in the page
                (lets get_seasons' try/except detect a missing year).
        """
        self.year = year
        # Heading exactly as it appears (lower-cased) on the index page.
        self.heading = 'season ' + str(year) + '/' + str(year + 1)
        if VERBOSE:
            # print("...".format(...)) works identically on py2 and py3;
            # the original print("...").format(...) only worked by accident
            # of the py2 print statement.
            print("\n{0}:".format(self.heading))
        # Slice out just this season's chunk: from our heading up to the
        # next occurrence of 'season' (i.e. the next season heading).
        doc_fragment = self.__get_fragment(index_html, self.heading, 'season')
        # Walk the fragment collecting links; the returned position stops
        # advancing once no further link is found, ending the loop.
        pos, prev_pos = 0, -1
        self.datafiles = {}  # {league name: data-file url}
        while pos > prev_pos:
            prev_pos = pos
            url, name, pos = self.get_next_link(doc_fragment, pos)
            if name in VALID_LEAGUES:
                # Relative links need the site root prepended.
                if url[0:4] != 'http':
                    url = BASE_URL + url
                self.datafiles[name] = url
                if VERBOSE:
                    print(" found {0}, url: {1}, pos: {2}".format(name, url, pos))

    def __get_fragment(self, doc_text, start_text, end_text):
        """Return the text from *start_text* up to the next *end_text*.

        Raises:
            ValueError: if *start_text* does not occur.  The original
                silently built a bogus slice from find()'s -1 sentinel,
                producing an empty Season instead of the failure signal
                that get_seasons' try/except clearly expects.
        """
        start_pos = doc_text.find(start_text)
        if start_pos == -1:
            raise ValueError('heading not found: ' + start_text)
        end_pos = doc_text.find(end_text, start_pos + 1)
        return doc_text[start_pos:end_pos]

    def get_next_link(self, html, start_pos):
        """Return (url, text, next_pos) for the first <a> at/after start_pos.

        Returns (None, None, 1) when no further link exists; the caller's
        loop notices the non-advancing position and terminates.
        """
        start_link = html.find('<a href=', start_pos)
        if start_link == -1:
            url, text, lt = None, None, 0
        else:
            # url is the first double-quoted attribute value after 'href='.
            start_quote = html.find('"', start_link)
            end_quote = html.find('"', start_quote + 1)
            url = html[start_quote + 1:end_quote]
            # link text sits between the tag's closing '>' and the next '<'.
            gt = html.find(">", end_quote)
            lt = html.find("<", gt)
            text = html[gt + 1:lt]
        return url, text, lt + 1
def get_seasons(index_html):
    """Create a Season object for each of the last YEARS_TO_GO_BACK years.

    Args:
        index_html: lower-cased html of the site's index page.

    Returns:
        dict mapping starting year -> Season.  Stops (returning what it
        has so far) at the first year whose data cannot be parsed, on
        the assumption that every older season is missing too.
    """
    data = {}
    # loop for no of years to go back, create a season object for each
    for years_back in range(YEARS_TO_GO_BACK):
        yr = dt.today().year - years_back
        try:
            data[yr] = Season(yr, index_html)
        # narrowed from a bare 'except:', which also trapped SystemExit
        # and KeyboardInterrupt
        except Exception:
            print("Error getting data for year: {0}, breaking loop.".format(yr))
            break
    return data
def save_file(file_path, content):
    """Save *content* to *file_path*, overwriting any existing file.

    Bug fix: the original ended with ``file_obj.close`` -- missing the
    call parentheses -- so the handle was never explicitly closed.  A
    ``with`` block guarantees the close even if the write raises.
    """
    with open(file_path, 'w') as file_obj:
        file_obj.write(content)
def validate_dir(folder):
    """Ensure *folder* exists on disk, creating it when absent."""
    # guard clause: nothing to do for an existing path
    if os.path.exists(folder):
        return
    os.mkdir(folder)
def get_page(url):
    """Download *url* and return the raw response body."""
    response = urllib2.urlopen(url)
    return response.read()
def main():
    """Main control function.

    Downloads the index page, then every season's csv data files, into
    OUTPUT_PATH, and finally opens that folder in Windows Explorer.
    """
    # get the html for the index page and keep a local copy of it
    index_html = get_page(BASE_URL + 'englandm.php')
    validate_dir(OUTPUT_PATH)
    save_file(os.path.join(OUTPUT_PATH, 'IndexPage.html'), index_html)
    # create dictionary of season objects (year as key); lower() makes
    # the heading/link-text matching case-insensitive
    seasons = get_seasons(index_html.lower())
    # get data files based on the links found for each season
    for year, season_obj in seasons.items():
        if VERBOSE:
            print("getting files for {0}...".format(season_obj.heading))
        for league, url in season_obj.datafiles.items():
            f_name = os.path.join(OUTPUT_PATH,
                                  str(year) + '_' + league.replace(' ', '') + '.csv')
            # skip files already downloaded, except the current season,
            # whose file may still be gaining new fixtures
            if not os.path.exists(f_name) or year == dt.today().year:
                try:
                    save_file(f_name, get_page(url))
                # narrowed from a bare 'except:'; kept best-effort --
                # one failed download should not abort the rest
                except Exception:
                    print("Error getting {0} data from {1}".format(league, url))
    # open the folder; list-form args avoid the quoting pitfalls of
    # hand-building a command string around OUTPUT_PATH
    print('\nOpening directory...')
    subprocess.Popen(['explorer', OUTPUT_PATH])


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement