Pastebin launched a little side project called VERYVIRAL.com, check it out ;-) Want more features on Pastebin? Sign Up, it's FREE!
Guest

Untitled

By: a guest on Feb 8th, 2014  |  syntax: None  |  size: 2.61 KB  |  views: 31  |  expires: Never
download  |  raw  |  embed  |  report abuse  |  print
Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
  1. import urllib2
  2. import re
  3. import csv
  4. from bs4 import BeautifulSoup
  5.  
  6. teams = {'ana', 'bos', 'buf', 'cal', 'car', 'chi', 'col', 'cbj', 'dal', 'det', 'edm', 'flo', 'lak', 'min', 'mtl', 'nsh', 'njd', 'nyi', 'nyr', 'ott', 'phi', 'phx', 'pit', 'sjs', 'stl', 'tbl', 'tor', 'van', 'wpg', 'wsh'}
  7. gameidset = set()
  8.  
  9. print("Collecting game IDs. This may take a while.")
  10.  
  11. for i in teams:
  12.         response = urllib2.urlopen('http://www.nhl.com/ice/schedulebyseason.htm?team=' + i)
  13.         html = response.read()
  14.         soup = BeautifulSoup(html)
  15.         for link in soup.find_all('a'):
  16.                 linkCandidate = str(link.get('href'))
  17.                 if linkCandidate.startswith('http://www.nhl.com/gamecenter/en/recap?id='):
  18.                         gameidset.add(re.sub(re.escape('http://www.nhl.com/gamecenter/en/recap?id='), '', linkCandidate))
  19.  
  20. print("Finished collecting game IDs.")
  21.  
  22. datafile = open('data.csv', 'wb')
  23. wr = csv.writer(datafile, quoting=csv.QUOTE_NONE)
  24. wr.writerow(['GameID', 'HomeTeam', 'AwayTeam', 'HomePIM', 'AwayPIM', 'Referee1', 'Referee2'])
  25.  
  26. for i in gameidset:
  27.         response = urllib2.urlopen('http://www.nhl.com/gamecenter/en/boxscore?id=' + i)
  28.         html = response.read()
  29.         soup = BeautifulSoup(html)
  30.         gamePack = [i]
  31.         gamePack.append(re.sub('<[^<]+?>', '', str(soup.find("th", "ht"))))
  32.         gamePack.append(re.sub('<[^<]+?>', '', str(soup.find("th", "at"))))
  33.         gamePack.append(re.sub('<[^<]+?>', '', str(soup.find("td", "hPIM"))))
  34.         gamePack.append(re.sub('<[^<]+?>', '', str(soup.find("td", "aPIM"))))
  35.         extraInfo = str(re.sub('<[^<]+?>', '', str(soup.find_all("li"))))
  36.         refs = str(re.findall('Referees:.*Linesmen', extraInfo))
  37.         refs = refs[12:]
  38.         refs = refs[:-12]
  39.         gamePack.extend([x.strip() for x in refs.split(',')])
  40.         print(gamePack)
  41.         wr.writerow(gamePack)
  42.  
  43. datafile.close()
  44.  
  45. refList = set()
  46. teams = {'ana', 'bos', 'buf', 'cal', 'car', 'chi', 'col', 'cbj', 'dal', 'det', 'edm', 'flo', 'lak', 'min', 'mtl', 'nsh', 'njd', 'nyi', 'nyr', 'ott', 'phi', 'phx', 'pit', 'sjs', 'stl', 'tbl', 'tor', 'van', 'wpg', 'wsh'}
  47.  
  48. datafile = open('data.csv', 'rb')
  49. reader = csv.reader(datafile, delimiter=',')
  50. next(reader, None)
  51. for row in reader:
  52.         refList.add(row[5])
  53.         refList.add(row[6])
  54. print(refList)
  55. datafile.close()
  56.  
  57. datafile = open('data.csv', 'rb')
  58. datafile2 = open('longdata.csv', 'wb')
  59. wr = csv.writer(datafile2, quoting=csv.QUOTE_NONE)
  60. wr.writerow(['Team', 'PIM', 'Referee1', 'Referee2'])
  61. reader = csv.reader(datafile, delimiter=',')
  62. next(reader, None)
  63. for row in reader:
  64.         print([row[1], row[3], row[5], row[6]])
  65.         print([row[2], row[4], row[5], row[6]])
  66.         wr.writerow([row[1], row[3], row[5], row[6]])
  67.         wr.writerow([row[2], row[4], row[5], row[6]])
  68. datafile.close()
  69. datafile2.close()
clone this paste RAW Paste Data