Advertisement
ChrisProsser

GetFootballData.py

Nov 17th, 2014
833
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.62 KB | None | 0 0
  1. #!/usr/bin/python
  2. """Get csv files of football data"""
  3. import urllib2, os, subprocess
  4. from datetime import date as dt
  5.  
# constants
BASE_URL = 'http://www.football-data.co.uk/'  # site hosting the csv data files
# Windows-only: 'userprofile' env var is the user's home directory
OUTPUT_PATH = os.path.join(os.getenv('userprofile'), 'FootballData')
YEARS_TO_GO_BACK = 25  # how many seasons back from the current year to fetch
VERBOSE = True  # print progress information while running
# league names (lower case, matched against lower-cased link text)
VALID_LEAGUES = ['premier league',
                 'division 1',
                 'division 2',
                 'division 3',
                 'championship',
                 'league 1',
                 'league 2',
                 'conference']
  19.  
  20. class Season(object):
  21.     """Class to hold data relating to a season and retrieve files."""
  22.  
  23.     def __init__(self, year, index_html):
  24.         """Initialise Season class"""
  25.         self.year = year
  26.        
  27.         # generate heading to find
  28.         self.heading = 'season ' + str(year) + '/' + str(year+1)
  29.         if VERBOSE: print("\n{0}:").format(self.heading)
  30.  
  31.         # get part of document containing heading
  32.         doc_fragment = self.__get_fragment(index_html, self.heading, 'season')
  33.  
  34.         # get links for each league
  35.         pos, prev_pos = 0, -1
  36.         self.datafiles = {} # dictionary containing data files
  37.         while pos > prev_pos:
  38.             prev_pos = pos
  39.             url, name, pos = self.get_next_link(doc_fragment, pos)
  40.  
  41.             if name in VALID_LEAGUES:
  42.                 ## append base url if needed
  43.                 if url[0:4] != 'http':
  44.                     url = BASE_URL + url
  45.  
  46.                 self.datafiles[name] = url
  47.                 if VERBOSE:
  48.                     print("  found {0}, url: {1}, pos: {2}").format(name, url, pos)
  49.            
  50.     def __get_fragment(self, doc_text, start_text, end_text):
  51.         """Private method to return text inbetween start and end values."""
  52.         start_pos = doc_text.find(start_text)
  53.         end_pos = doc_text.find(end_text, start_pos+1)
  54.         return doc_text[start_pos:end_pos]
  55.    
  56.     def get_next_link(self, html, start_pos):
  57.         """Get the next link from some html."""
  58.         start_link = html.find('<a href=', start_pos)
  59.        
  60.         if start_link == -1:
  61.             url, text, lt = None, None, 0
  62.         else:
  63.             # get url
  64.             start_quote = html.find('"', start_link)
  65.             end_quote = html.find('"', start_quote + 1)
  66.             url = html[start_quote + 1:end_quote]
  67.  
  68.             # get text
  69.             gt = html.find(">", end_quote)
  70.             lt = html.find("<", gt)
  71.             text = html[gt+1:lt]
  72.            
  73.         return url, text, lt+1
  74.  
  75. def get_seasons(index_html):
  76.     """Creates object for each season"""
  77.     data = {}
  78.  
  79.     # loop for no of years to go back, create a season object for each
  80.     for years_back in range(YEARS_TO_GO_BACK):
  81.         yr = dt.today().year - years_back
  82.         try:
  83.             data[yr] = Season(yr, index_html)
  84.         except:
  85.             print("Error getting data for year: {0}, breaking loop.").format(yr)
  86.             break
  87.     return data
  88.  
  89. def save_file(file_path, content):
  90.     """Saves the content to a file in the path provided"""
  91.     file_obj = open(file_path, 'w')
  92.     file_obj.write(content)
  93.     file_obj.close
  94.  
  95. def validate_dir(folder):
  96.     """Creates a directory if it doesn't already exist."""
  97.     if not os.path.exists(folder):
  98.         os.mkdir(folder)
  99.  
  100. def get_page(url):
  101.     """Get data from URL"""
  102.     return urllib2.urlopen(url).read()
  103.        
  104. def main():
  105.     """Main control function"""
  106.  
  107.     # get the html for the index page
  108.     index_html = get_page(BASE_URL+'englandm.php')
  109.     validate_dir(OUTPUT_PATH)
  110.     save_file(os.path.join(OUTPUT_PATH, 'IndexPage.html'), index_html)
  111.  
  112.     # create dictionary of season objects (year as key)
  113.     seasons = get_seasons(index_html.lower())
  114.  
  115.     # get data files based on the links found for each season
  116.     for year, season_obj in seasons.items():
  117.         if VERBOSE: print("getting files for {0}...").format(season_obj.heading)
  118.         for league, url in season_obj.datafiles.items():
  119.             f_name = os.path.join(OUTPUT_PATH,
  120.                                   str(year)+'_'+league.replace(' ', '')+'.csv')
  121.  
  122.             # check if file already exists, only replace if it's for this year
  123.             if not os.path.exists(f_name) or year == dt.today().year:
  124.  
  125.                 # download and save the file
  126.                 try:
  127.                     save_file(f_name, get_page(url))
  128.                 except:
  129.                     print("Error getting {0} data from {1}"). format(league, url)
  130.  
  131.     # open the folder
  132.     print('\nOpening directory...')
  133.     subprocess.Popen('explorer "'+OUTPUT_PATH+'"')
  134.  
  135. if __name__ == '__main__':
  136.     main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement