"""Scrape NHL box scores into two CSV files of penalty minutes and referees.

The script runs in three phases:

1. For every team code in ``teams`` it downloads that team's schedule page
   and collects the IDs of all games that have a recap link.
2. For every collected game ID it downloads the box score page and writes
   one row per game to ``data.csv``
   (GameID, HomeTeam, AwayTeam, HomePIM, AwayPIM, Referee1, Referee2).
3. It re-reads ``data.csv``, prints the set of distinct referee names and
   writes ``longdata.csv`` with one row per team per game
   (Team, PIM, Referee1, Referee2).

NOTE: this is Python 2 code (``urllib2``; csv files opened in binary mode).
"""
import urllib2
import re
import csv

from bs4 import BeautifulSoup

SCHEDULE_URL = 'http://www.nhl.com/ice/schedulebyseason.htm?team='
RECAP_URL = 'http://www.nhl.com/gamecenter/en/recap?id='
BOXSCORE_URL = 'http://www.nhl.com/gamecenter/en/boxscore?id='

# Compiled once because they are applied to every scraped page.
TAG_RE = re.compile('<[^<]+?>')
REFEREES_RE = re.compile('Referees:(.*)Linesmen')


def _fetch_soup(url):
    """Download *url* and return its parsed BeautifulSoup document.

    The HTTP response is always closed, even if reading it fails.
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        response.close()
    return BeautifulSoup(html)


def _strip_tags(node):
    """Return ``str(node)`` with everything that looks like a tag removed."""
    return TAG_RE.sub('', str(node))


def _parse_referees(text):
    """Return the referee names found between 'Referees:' and 'Linesmen'.

    Names are comma-separated in *text*; surrounding whitespace and empty
    entries are dropped.  The result is padded with empty strings to at
    least two entries so every CSV row has both referee columns, even when
    the page lists fewer referees or the marker text is missing entirely.
    """
    names = []
    match = REFEREES_RE.search(text)
    if match:
        names = [name.strip() for name in match.group(1).split(',')]
        names = [name for name in names if name]
    names.extend([''] * (2 - len(names)))
    return names


teams = {'ana', 'bos', 'buf', 'cal', 'car', 'chi', 'col', 'cbj', 'dal', 'det',
         'edm', 'flo', 'lak', 'min', 'mtl', 'nsh', 'njd', 'nyi', 'nyr', 'ott',
         'phi', 'phx', 'pit', 'sjs', 'stl', 'tbl', 'tor', 'van', 'wpg', 'wsh'}

# Phase 1: collect game IDs.  A set is used so that a game reachable from
# more than one schedule page is only stored once.
gameidset = set()
print("Collecting game IDs. This may take a while.")
for team in teams:
    soup = _fetch_soup(SCHEDULE_URL + team)
    for link in soup.find_all('a'):
        linkCandidate = str(link.get('href'))
        if linkCandidate.startswith(RECAP_URL):
            gameidset.add(linkCandidate.replace(RECAP_URL, ''))
print("Finished collecting game IDs.")

# Phase 2: one row per game.  The ``with`` block guarantees data.csv is
# flushed and closed even if a download fails part-way through.
with open('data.csv', 'wb') as datafile:
    wr = csv.writer(datafile, quoting=csv.QUOTE_NONE)
    wr.writerow(['GameID', 'HomeTeam', 'AwayTeam', 'HomePIM', 'AwayPIM',
                 'Referee1', 'Referee2'])
    for gameid in gameidset:
        soup = _fetch_soup(BOXSCORE_URL + gameid)
        gamePack = [
            gameid,
            _strip_tags(soup.find("th", "ht")),
            _strip_tags(soup.find("th", "at")),
            _strip_tags(soup.find("td", "hPIM")),
            _strip_tags(soup.find("td", "aPIM")),
        ]
        # The referee names are pulled out of the text of the page's <li>
        # elements.
        extraInfo = _strip_tags(soup.find_all("li"))
        gamePack.extend(_parse_referees(extraInfo))
        print(gamePack)
        wr.writerow(gamePack)

# Phase 3: reshape.  data.csv is read once; the header row is skipped.
with open('data.csv', 'rb') as datafile:
    reader = csv.reader(datafile, delimiter=',')
    next(reader, None)
    rows = list(reader)

refList = set()
for row in rows:
    refList.add(row[5])
    refList.add(row[6])
print(refList)

with open('longdata.csv', 'wb') as datafile2:
    wr = csv.writer(datafile2, quoting=csv.QUOTE_NONE)
    wr.writerow(['Team', 'PIM', 'Referee1', 'Referee2'])
    for row in rows:
        # Each game yields two rows: home team first, then away team.
        homeRow = [row[1], row[3], row[5], row[6]]
        awayRow = [row[2], row[4], row[5], row[6]]
        print(homeRow)
        print(awayRow)
        wr.writerow(homeRow)
        wr.writerow(awayRow)