Advertisement
Guest User

Untitled

a guest
Feb 7th, 2016
50
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 33.22 KB | None | 0 0
  1. from bs4 import BeautifulSoup
  2. import re
  3.  
  4. print("Parse start")
  5.  
  6. first_game_id = 1 #+665
  7. last_game_id = 668 #-2
  8.  
  9. season_id = '20152016'
  10. subseason_id = '02'
  11. datafile_id = 'PL'
  12.  
  13. header_row_main = (
  14. 'SEASON_ID'
  15. '|SUBSEASON_ID'
  16. '|GAME_ID'
  17. '|EVENT_ID'
  18. '|PERIOD_CD'
  19. '|STRENGTH_CD'
  20. '|GAME_TM'
  21. '|PLAY_TM'
  22. '|REMAIN_TM'
  23. '|EVENT_CD'
  24. )
  25.  
  26. header_row_rest = (
  27. '|FULL_EVENT_CD'
  28. '|PLAY_ELEMENT_CT'
  29. '|TEAM_ID'
  30. '|JERSEY_ID'
  31. '|PLAYER_TX'
  32.  
  33. '|ASSIST_JERSEY_ID'
  34. '|ASSIST_PLAYER_TX'
  35. '|ASSIST2_JERSEY_ID'
  36. '|ASSIST2_PLAYER_TX'
  37.  
  38. '|MATE_JERSEY_ID'
  39. '|MATE_PLAYER_TX'
  40.  
  41. '|OPP_TEAM_ID'
  42. '|OPP_JERSEY_ID'
  43. '|OPP_PLAYER_TX'
  44.  
  45. '|AWAY_TEAM_ID'
  46. '|AWAY_JERSEY_ID'
  47. '|AWAY_PLAYER_TX'
  48.  
  49. '|HOME_TEAM_ID'
  50. '|HOME_JERSEY_ID'
  51. '|HOME_PLAYER_TX'
  52.  
  53. '|WINNER_TEAM_ID'
  54.  
  55. '|ZONE_CD'
  56.  
  57. '|SHOT_CD'
  58. '|OUTCOME_SHOT_CD'
  59. '|DISTANCE_CT'
  60.  
  61. '|PENALTY_CD'
  62. '|MINUTES_PENALTY_CT'
  63. '|MINUTES2_PENALTY_CT'
  64.  
  65. '|REASON_CD'
  66. '|REASON2_CD'
  67. '|REASON3_CD'
  68. '|RESULT_CD'
  69.  
  70. '|HOURS_TM'
  71. '|MINUTES_TM'
  72. '|TIMEZONE_CD'
  73. '|TIMEZONE_TX'
  74.  
  75. '|SEASON_GOAL_CT'
  76. '|SEASON_ASSIST_CT'
  77. '|SEASON_ASSIST2_CT'
  78. )
  79.  
  80.  
  81. def parse_play(play_tx):
  82. # players with multi names collapsed into one name, temporarily
  83. play_tx = re.sub('DI GIUSEPPE','DI_GIUSEPPE',play_tx, flags=re.IGNORECASE)
  84. play_tx = re.sub('DE HAAN','DE_HAAN',play_tx, flags=re.IGNORECASE)
  85. play_tx = re.sub('DE LA ROSE','DE_LA_ROSE',play_tx, flags=re.IGNORECASE)
  86. play_tx = re.sub('VAN RIEMSDYK','VAN_RIEMSDYK',play_tx, flags=re.IGNORECASE)
  87. play_tx = re.sub('DEL ZOTTO','DEL_ZOTTO',play_tx, flags=re.IGNORECASE)
  88.  
  89. # special keywords to merge with prior words
  90. play_tx = re.sub('. Zone','_Zone',play_tx, flags=re.IGNORECASE)
  91. play_tx = re.sub(' Start-','_Start',play_tx, flags=re.IGNORECASE)
  92. play_tx = re.sub(' End-','_End',play_tx, flags=re.IGNORECASE)
  93. play_tx = re.sub(' time:','_Time',play_tx, flags=re.IGNORECASE)
  94.  
  95. # create a new keyword
  96. play_tx = re.sub(' \#',' Jersey_Id ',play_tx, flags=re.IGNORECASE)
  97. play_tx = re.sub(' min\)',' minutes ',play_tx, flags=re.IGNORECASE)
  98. play_tx = re.sub('ft.','feet',play_tx, flags=re.IGNORECASE)
  99.  
  100. # redundand keywords
  101. play_tx = re.sub('- double minor','',play_tx, flags=re.IGNORECASE)
  102. play_tx = re.sub('(maj)','',play_tx, flags=re.IGNORECASE)
  103. #play_tx = re.sub('-bench','',play_tx, flags=re.IGNORECASE) # not sure if I want to do this here or below
  104. #play_tx = re.sub('- bench','',play_tx, flags=re.IGNORECASE) # not sure if I want to do this here or below
  105.  
  106. # keywords with multiple words will be collapsed into one word
  107. play_tx = re.sub('BLOCKED BY','BLOCKED_BY',play_tx, flags=re.IGNORECASE)
  108. play_tx = re.sub('CLOCK PROBLEM','CLOCK_PROBLEM',play_tx, flags=re.IGNORECASE)
  109. play_tx = re.sub('Early Intermission','Early_Intermission',play_tx, flags=re.IGNORECASE)
  110. play_tx = re.sub('HAND PASS','HAND_PASS',play_tx, flags=re.IGNORECASE)
  111. play_tx = re.sub('HIT CROSSBAR','HIT_CROSSBAR',play_tx, flags=re.IGNORECASE)
  112. play_tx = re.sub('GOAL INTERFERENCE','GOAL_INTERFERENCE',play_tx, flags=re.IGNORECASE)
  113. play_tx = re.sub('GOAL OVERTURNED','GOAL_OVERTURNED',play_tx, flags=re.IGNORECASE)
  114. play_tx = re.sub('GOALIE STOPPED','GOALIE_STOPPED',play_tx, flags=re.IGNORECASE)
  115. play_tx = re.sub('HOME TIMEOUT','HOME_TIMEOUT',play_tx, flags=re.IGNORECASE)
  116. play_tx = re.sub('ICE PROBLEM','ICE_PROBLEM',play_tx, flags=re.IGNORECASE)
  117. play_tx = re.sub('NET OFF','NET_OFF',play_tx, flags=re.IGNORECASE)
  118. play_tx = re.sub('OBJECTS ON ICE','OBJECTS_ONICE',play_tx, flags=re.IGNORECASE)
  119. play_tx = re.sub('OFFICIAL INJURY','OFFICIAL_INJURY',play_tx, flags=re.IGNORECASE)
  120. play_tx = re.sub('OFF-SIDE','OFFSIDE',play_tx, flags=re.IGNORECASE)
  121. play_tx = re.sub('OVER NET','OVER_NET',play_tx, flags=re.IGNORECASE)
  122. # play_tx = re.sub('Over Net','OVER_NET',play_tx, flags=re.IGNORECASE) # contains special character
  123. play_tx = re.sub('PENALTY SHOT','PENALTY_SHOT',play_tx, flags=re.IGNORECASE)
  124. play_tx = re.sub('PLAYER EQUIPMENT','PLAYER_EQUIPMENT',play_tx, flags=re.IGNORECASE)
  125. play_tx = re.sub('PLAYER INJURY','PLAYER_INJURY',play_tx, flags=re.IGNORECASE)
  126. play_tx = re.sub('PUCK FROZEN','PUCK_FROZEN',play_tx, flags=re.IGNORECASE)
  127. play_tx = re.sub('PUCK IN BENCHES','PUCK_INBENCHES',play_tx, flags=re.IGNORECASE)
  128. play_tx = re.sub('PUCK IN CROWD','PUCK_INCROWD',play_tx, flags=re.IGNORECASE)
  129. play_tx = re.sub('PUCK IN NETTING','PUCK_INNETTING',play_tx, flags=re.IGNORECASE)
  130. play_tx = re.sub('REFEREE OR LINESMAN','REFEREE_LINESMAN',play_tx, flags=re.IGNORECASE)
  131. play_tx = re.sub('RINK REPAIR','RINK_REPAIR',play_tx, flags=re.IGNORECASE)
  132. play_tx = re.sub('SHOOTOUT COMPLETED','SHOOTOUT_COMPLETED',play_tx, flags=re.IGNORECASE)
  133. play_tx = re.sub('TV TIMEOUT','TV_TIMEOUT',play_tx, flags=re.IGNORECASE)
  134. play_tx = re.sub('VIDEO REVIEW','VIDEO_REVIEW',play_tx, flags=re.IGNORECASE)
  135. play_tx = re.sub('VISITOR TIMEOUT','VISITOR_TIMEOUT',play_tx, flags=re.IGNORECASE)
  136. play_tx = re.sub('Wide Of Net','WIDE_OFNET',play_tx, flags=re.IGNORECASE)
  137. # play_tx = re.sub('Wide of Net','WIDE_OFNET',play_tx, flags=re.IGNORECASE) # contains special character
  138.  
  139. # penalties
  140. play_tx = re.sub('ABUSE OF OFFICIALS - BENCH','ABUSE_OFFICIALS',play_tx, flags=re.IGNORECASE)
  141. play_tx = re.sub('ABUSE OF OFFICIALS','ABUSE_OFFICIALS',play_tx, flags=re.IGNORECASE)
  142.  
  143. play_tx = re.sub('ABUSIVE LANGUAGE - BENCH','ABUSIVE_LANGUAGE',play_tx, flags=re.IGNORECASE)
  144. play_tx = re.sub('ABUSIVE LANGUAGE','ABUSIVE_LANGUAGE',play_tx, flags=re.IGNORECASE)
  145.  
  146. play_tx = re.sub('BROKEN STICK','BROKEN_STICK',play_tx, flags=re.IGNORECASE)
  147. play_tx = re.sub('Checking from behind','CHECKING_FROMBEHIND',play_tx, flags=re.IGNORECASE)
  148. play_tx = re.sub('Concealing Puck','CONCEALING_PUCK',play_tx, flags=re.IGNORECASE)
  149. play_tx = re.sub('Closing Hand On Puck','CLOSINGHAND_ONPUCK',play_tx, flags=re.IGNORECASE)
  150. play_tx = re.sub('CROSS CHECKING','CROSS_CHECK',play_tx, flags=re.IGNORECASE)
  151. play_tx = re.sub('CROSS CHECK','CROSS_CHECK',play_tx, flags=re.IGNORECASE)
  152.  
  153. play_tx = re.sub('Delay Gm - Face-off Violation','DELAY_OFGAME',play_tx, flags=re.IGNORECASE)
  154. play_tx = re.sub('Delay of game - bench','DELAY_OFGAME',play_tx, flags=re.IGNORECASE)
  155. play_tx = re.sub('Delay of game-bench','DELAY_OFGAME',play_tx, flags=re.IGNORECASE)
  156. play_tx = re.sub('DELAY OF GAME','DELAY_OFGAME',play_tx, flags=re.IGNORECASE)
  157. # play_tx = re.sub('Delay of game','DELAY_OFGAME',play_tx, flags=re.IGNORECASE) # contains special character
  158. play_tx = re.sub('DELAYING GAME-ILL.PLAY GOALIE','DELAY_OFGAME',play_tx, flags=re.IGNORECASE)
  159. play_tx = re.sub('Delaying Game-Ill. play goalie','DELAY_OFGAME',play_tx, flags=re.IGNORECASE)
  160. play_tx = re.sub('Delaying Game-Puck over glass','DELAY_OFGAME',play_tx, flags=re.IGNORECASE)
  161. play_tx = re.sub('Delaying Game-Smothering puck','DELAY_OFGAME',play_tx, flags=re.IGNORECASE)
  162. play_tx = re.sub('Delaying Game','DELAY_OFGAME',play_tx, flags=re.IGNORECASE)
  163. play_tx = re.sub('DELAYING THE GAME','DELAY_OFGAME',play_tx, flags=re.IGNORECASE)
  164. play_tx = re.sub('Face-off violation-bench','DELAY_OFGAME',play_tx, flags=re.IGNORECASE)
  165. play_tx = re.sub('Face-off violation','DELAY_OFGAME',play_tx, flags=re.IGNORECASE)
  166.  
  167. play_tx = re.sub('DRAWN BY','DRAWN_BY',play_tx, flags=re.IGNORECASE)
  168. # play_tx = re.sub('Drawn By','DRAWN_BY',play_tx, flags=re.IGNORECASE) # contains special character
  169. play_tx = re.sub('GAME MISCONDUCT','GAME_MISCONDUCT',play_tx, flags=re.IGNORECASE)
  170. play_tx = re.sub('Goalie leave crease','GOALIE_CREASE',play_tx, flags=re.IGNORECASE)
  171. play_tx = re.sub('HI-STICKING','HIGH_STICK',play_tx, flags=re.IGNORECASE)
  172. play_tx = re.sub('HI STICK','HIGH_STICK',play_tx, flags=re.IGNORECASE)
  173. play_tx = re.sub('HIGH STICK','HIGH_STICK',play_tx, flags=re.IGNORECASE)
  174. play_tx = re.sub('HOLDING THE STICK','HOLDING_STICK',play_tx, flags=re.IGNORECASE)
  175. play_tx = re.sub('ILLEGAL STICK','ILLEGAL_STICK',play_tx, flags=re.IGNORECASE)
  176. play_tx = re.sub('Illegal check to head','CHECK_HEAD',play_tx, flags=re.IGNORECASE)
  177. play_tx = re.sub('Instigator - face shield','INSTIGATOR_FACESHIELD',play_tx, flags=re.IGNORECASE)
  178. play_tx = re.sub('Instigator - Misconduct','INSTIGATOR_MISCONDUCT',play_tx, flags=re.IGNORECASE)
  179. play_tx = re.sub('Interference - Goalkeeper','INTERFERENCE_GOALIE',play_tx, flags=re.IGNORECASE)
  180. play_tx = re.sub('Interference on goalkeeper','INTERFERENCE_GOALIE',play_tx, flags=re.IGNORECASE)
  181. play_tx = re.sub('Interference on goalie','INTERFERENCE_GOALIE',play_tx, flags=re.IGNORECASE)
  182. play_tx = re.sub('Leaving penalty box - bench','TOOMANY_MEN',play_tx, flags=re.IGNORECASE)
  183. play_tx = re.sub('Match Penalty','MATCH_PENALTY',play_tx, flags=re.IGNORECASE)
  184. play_tx = re.sub('PS-Covering puck in crease','PENALTYSHOT_COVERPUCK',play_tx, flags=re.IGNORECASE)
  185. play_tx = re.sub('PS-Hooking on breakaway','PENALTYSHOT_HOOKING',play_tx, flags=re.IGNORECASE)
  186. play_tx = re.sub('PS-Slash on breakaway','PENALTYSHOT_SLASH',play_tx, flags=re.IGNORECASE)
  187. play_tx = re.sub('PS-Thow object at puck','PENALTYSHOT_THROWOBJECT',play_tx, flags=re.IGNORECASE)
  188. play_tx = re.sub('PS-Throw object at puck','PENALTYSHOT_THROWOBJECT',play_tx, flags=re.IGNORECASE)
  189. play_tx = re.sub('PS-Tripping on breakaway','PENALTYSHOT_TRIPPING',play_tx, flags=re.IGNORECASE)
  190. play_tx = re.sub('Served By: ','SERVED_BY_',play_tx, flags=re.IGNORECASE)
  191. play_tx = re.sub('PREMATURE SUBSTITUTION','PREMATURE_SUBSTITUTION',play_tx, flags=re.IGNORECASE)
  192. play_tx = re.sub('Too many men/ice - bench','TOOMANY_MEN',play_tx, flags=re.IGNORECASE)
  193. play_tx = re.sub('UNSPORTSMANLIKE CONDUCT','UNSPORTSMANLIKE_CONDUCT',play_tx, flags=re.IGNORECASE)
  194.  
  195. # challenges
  196. play_tx = re.sub('CHLG HM','CHALLENGE_HOME',play_tx, flags=re.IGNORECASE)
  197. play_tx = re.sub('CHLG LEAGUE','CHALLENGE_LEAGUE',play_tx, flags=re.IGNORECASE)
  198. play_tx = re.sub('CHLG VIS','CHALLENGE_AWAY',play_tx, flags=re.IGNORECASE)
  199.  
  200.  
  201. # Prepare delimiter of space
  202. play_tx = re.sub(' - ',' ',play_tx, flags=re.IGNORECASE)
  203. play_tx = re.sub('- ',' ',play_tx, flags=re.IGNORECASE)
  204. play_tx = re.sub(', ',' ',play_tx, flags=re.IGNORECASE)
  205. play_tx = re.sub(',',' ',play_tx, flags=re.IGNORECASE)
  206. play_tx = re.sub(': ',' ',play_tx, flags=re.IGNORECASE)
  207. play_tx = re.sub('; ',' ',play_tx, flags=re.IGNORECASE)
  208. play_tx = re.sub(':',' ',play_tx, flags=re.IGNORECASE) # used for HH:MM
  209. play_tx = re.sub('\(',' ',play_tx, flags=re.IGNORECASE)
  210. play_tx = re.sub('\)','',play_tx, flags=re.IGNORECASE)
  211. play_tx = re.sub(' ',' ',play_tx, flags=re.IGNORECASE) # hidden character
  212. play_tx = re.sub(' +',' ',play_tx, flags=re.IGNORECASE) # turn multiple spaces into one
  213.  
  214. return re.split(' ', play_tx)
  215.  
  216. def process_play(event_cd, parsed_play_tx, period_cd, parsed_assist_tx):
  217. # all these are optional fields
  218. play_element_ct = len(parsed_play_tx)
  219. team_id = ''
  220. jersey_id = ''
  221. player_tx = ''
  222. assist_jersey_id = ''
  223. assist_player_tx = ''
  224. assist2_jersey_id = ''
  225. assist2_player_tx = ''
  226. mate_jersey_id = ''
  227. mate_player_tx = ''
  228. opp_team_id = ''
  229. opp_jersey_id = ''
  230. opp_player_tx = ''
  231. away_team_id = ''
  232. away_jersey_id = ''
  233. away_player_tx = ''
  234. home_team_id = ''
  235. home_jersey_id = ''
  236. home_player_tx = ''
  237. winner_team_id = ''
  238. zone_cd = ''
  239. shot_cd = ''
  240. outcome_shot_cd = ''
  241. distance_ct = ''
  242. penalty_cd = ''
  243. minutes_penalty_ct = ''
  244. minutes2_penalty_ct = ''
  245. reason_cd = ''
  246. reason2_cd = ''
  247. reason3_cd = ''
  248. result_cd = ''
  249. hours_tm = ''
  250. minutes_tm = ''
  251. timezone_cd = ''
  252. timezone_tx = ''
  253. season_goal_ct = ''
  254. season_assist_ct = ''
  255. season_assist2_ct = ''
  256.  
  257. if event_cd == 'BLOCK':
  258. full_event_cd = 'SHOT_BLOCKED'
  259. team_id = parsed_play_tx[0]
  260. jersey_id = parsed_play_tx[2]
  261. player_tx = parsed_play_tx[3]
  262. opp_team_id = parsed_play_tx[5]
  263. opp_jersey_id = parsed_play_tx[7]
  264. opp_player_tx = parsed_play_tx[8]
  265. if parsed_play_tx[9].find('_Zone') >= 0:
  266. shot_cd = ''
  267. zone_cd = parsed_play_tx[9]
  268. else:
  269. shot_cd = parsed_play_tx[9]
  270. zone_cd = parsed_play_tx[10]
  271.  
  272. elif event_cd == 'CHL':
  273. full_event_cd = 'CHALLENGE'
  274. if len(parsed_play_tx)>=2:
  275. team_id = parsed_play_tx[0]
  276. reason_cd = parsed_play_tx[2]
  277. result_cd = parsed_play_tx[4]
  278.  
  279. elif event_cd == 'EIEND':
  280. full_event_cd = 'EARLY INTERMISSION ENDED'
  281. timezone_tx = parsed_play_tx[1]
  282. hours_tm = parsed_play_tx[2]
  283. minutes_tm = parsed_play_tx[3]
  284. timezone_cd = parsed_play_tx[4]
  285.  
  286. elif event_cd == 'EISTR':
  287. full_event_cd = 'EARLY INTERMISSION STARTED'
  288. timezone_tx = parsed_play_tx[1]
  289. hours_tm = parsed_play_tx[2]
  290. minutes_tm = parsed_play_tx[3]
  291. timezone_cd = parsed_play_tx[4]
  292.  
  293. elif event_cd == 'FAC':
  294. full_event_cd = 'FACEOFF'
  295. winner_team_id = parsed_play_tx[0]
  296. zone_cd = parsed_play_tx[2]
  297. away_team_id = parsed_play_tx[3]
  298. away_jersey_id = parsed_play_tx[5]
  299. away_player_tx = parsed_play_tx[6]
  300. home_team_id = parsed_play_tx[8]
  301. home_jersey_id = parsed_play_tx[10]
  302. home_player_tx = parsed_play_tx[11]
  303.  
  304. elif event_cd == 'GEND':
  305. full_event_cd = 'GAME ENDED'
  306. timezone_tx = parsed_play_tx[1]
  307. hours_tm = parsed_play_tx[2]
  308. minutes_tm = parsed_play_tx[3]
  309. timezone_cd = parsed_play_tx[4]
  310.  
  311. elif event_cd == 'GIVE':
  312. full_event_cd = 'GIVEAWAY'
  313. team_id = parsed_play_tx[0]
  314. jersey_id = parsed_play_tx[3]
  315. player_tx = parsed_play_tx[4]
  316. zone_cd = parsed_play_tx[5]
  317.  
  318. elif event_cd == 'GOAL':
  319. full_event_cd = 'SHOT_GOAL'
  320. team_id = parsed_play_tx[0]
  321. jersey_id = parsed_play_tx[2]
  322. player_tx = parsed_play_tx[3]
  323.  
  324. if subseason_id == '02' and period_cd == 5: # shootout
  325. if parsed_play_tx[4].find('_Zone') >= 0:
  326. shot_cd = ''
  327. zone_cd = parsed_play_tx[4]
  328. distance_ct = parsed_play_tx[5]
  329. else:
  330. shot_cd = parsed_play_tx[4]
  331. zone_cd = parsed_play_tx[5]
  332. distance_ct = parsed_play_tx[6]
  333. else:
  334. season_goal_ct = parsed_play_tx[4]
  335. if parsed_play_tx[5].find('PENALTY_SHOT') >= 0: # penalty shot
  336. shot_cd = parsed_play_tx[6]
  337. zone_cd = parsed_play_tx[5] # instead of element 7, I'm forcing in PENALTY_SHOT as the "zone"
  338. distance_ct = parsed_play_tx[8]
  339. elif parsed_play_tx[5].find('_Zone') >= 0: # missing shot
  340. shot_cd = ''
  341. zone_cd = parsed_play_tx[5]
  342. distance_ct = parsed_play_tx[6]
  343. else:
  344. shot_cd = parsed_play_tx[5]
  345. zone_cd = parsed_play_tx[6]
  346. distance_ct = parsed_play_tx[7]
  347.  
  348. # assist
  349. for i in range(len(parsed_assist_tx)):
  350. if parsed_assist_tx[i].find('Jersey_Id') >= 0:
  351. if assist_jersey_id == '':
  352. assist_jersey_id = parsed_assist_tx[i+1]
  353. assist_player_tx = parsed_assist_tx[i+2]
  354. season_assist_ct = parsed_assist_tx[i+3]
  355. else: # 2nd assist
  356. assist2_jersey_id = parsed_assist_tx[i+1]
  357. assist2_player_tx = parsed_assist_tx[i+2]
  358. season_assist2_ct = parsed_assist_tx[i+3]
  359.  
  360. elif event_cd == 'GOFF':
  361. full_event_cd = 'GAME OFF'
  362.  
  363. elif event_cd == 'HIT':
  364. full_event_cd = 'HIT'
  365. team_id = parsed_play_tx[0]
  366. jersey_id = parsed_play_tx[2]
  367. player_tx = parsed_play_tx[3]
  368. opp_team_id = parsed_play_tx[5]
  369. opp_jersey_id = parsed_play_tx[7]
  370. opp_player_tx = parsed_play_tx[8]
  371. zone_cd = parsed_play_tx[9]
  372.  
  373. elif event_cd == 'MISS':
  374. full_event_cd = 'SHOT_MISSEDNET'
  375. team_id = parsed_play_tx[0]
  376. jersey_id = parsed_play_tx[2]
  377. player_tx = parsed_play_tx[3]
  378. if subseason_id == '02' and period_cd == 5: # shootout
  379. if parsed_play_tx[5].find('_Zone') >= 0:
  380. shot_cd = ''
  381. outcome_shot_cd = parsed_play_tx[4]
  382. zone_cd = parsed_play_tx[5]
  383. distance_ct = parsed_play_tx[6]
  384. else:
  385. shot_cd = parsed_play_tx[4]
  386. outcome_shot_cd = parsed_play_tx[5]
  387. zone_cd = parsed_play_tx[6]
  388. distance_ct = parsed_play_tx[7]
  389. else:
  390. if parsed_play_tx[4].find('PENALTY_SHOT') >= 0:
  391. shot_cd = parsed_play_tx[5]
  392. outcome_shot_cd = parsed_play_tx[6]
  393. zone_cd = parsed_play_tx[4] # instead of element 7, I'm forcing in PENALTY_SHOT as the "zone"
  394. distance_ct = parsed_play_tx[8]
  395. elif parsed_play_tx[5].find('_Zone') >= 0:
  396. shot_cd = ''
  397. outcome_shot_cd = parsed_play_tx[4]
  398. zone_cd = parsed_play_tx[5]
  399. distance_ct = parsed_play_tx[6]
  400. else:
  401. shot_cd = parsed_play_tx[4]
  402. outcome_shot_cd = parsed_play_tx[5]
  403. zone_cd = parsed_play_tx[6]
  404. distance_ct = parsed_play_tx[7]
  405.  
  406. elif event_cd == 'PEND':
  407. full_event_cd = 'PERIOD ENDED'
  408. timezone_tx = parsed_play_tx[1]
  409. hours_tm = parsed_play_tx[2]
  410. minutes_tm = parsed_play_tx[3]
  411. timezone_cd = parsed_play_tx[4]
  412.  
  413. elif event_cd == 'PENL':
  414. full_event_cd = 'PENALTY'
  415. team_id = parsed_play_tx[0]
  416. # there are two main sections (1) team penalty, (2) regular penalty
  417. if parsed_play_tx[1].find('TEAM') >= 0: # team penalty
  418. jersey_id = '0'
  419. player_tx = parsed_play_tx[1]
  420. penalty_cd = parsed_play_tx[2]
  421. else:
  422. jersey_id = parsed_play_tx[2]
  423. player_tx = parsed_play_tx[3]
  424. penalty_cd = parsed_play_tx[4]
  425.  
  426. # there are four subsections (1) minutes, (2) zone, (3) served by, (4) drawn by
  427. for ss in parsed_play_tx:
  428. if ss.find('_Zone') >= 0:
  429. zone_cd = ss
  430.  
  431. for i in range(len(parsed_play_tx)):
  432. if parsed_play_tx[i].find('minutes') >= 0:
  433. if minutes_penalty_ct == '':
  434. minutes_penalty_ct = parsed_play_tx[i-1]
  435. else: # 2nd penalty for misconduct
  436. minutes2_penalty_ct = parsed_play_tx[i-1]
  437.  
  438. if parsed_play_tx[i].find('SERVED_BY') >= 0:
  439. mate_jersey_id = parsed_play_tx[i+1]
  440. mate_player_tx = parsed_play_tx[i+2]
  441.  
  442. if parsed_play_tx[i].find('DRAWN_BY') >= 0:
  443. opp_team_id = parsed_play_tx[i+1]
  444. opp_jersey_id = parsed_play_tx[i+3]
  445. opp_player_tx = parsed_play_tx[i+4]
  446.  
  447. elif event_cd == 'PSTR':
  448. full_event_cd = 'PERIOD STARTED'
  449. timezone_tx = parsed_play_tx[1]
  450. hours_tm = parsed_play_tx[2]
  451. minutes_tm = parsed_play_tx[3]
  452. timezone_cd = parsed_play_tx[4]
  453.  
  454. elif event_cd == 'SHOT':
  455. full_event_cd = 'SHOT_ONGOAL'
  456. team_id = parsed_play_tx[0]
  457. outcome_shot_cd = parsed_play_tx[1]
  458. jersey_id = parsed_play_tx[3]
  459. player_tx = parsed_play_tx[4]
  460. if parsed_play_tx[5].find('PENALTY_SHOT') >= 0:
  461. shot_cd = parsed_play_tx[6]
  462. zone_cd = parsed_play_tx[5] # instead of element 7, I'm forcing in PENALTY_SHOT as the "zone"
  463. distance_ct = parsed_play_tx[8]
  464. else:
  465. shot_cd = parsed_play_tx[5]
  466. zone_cd = parsed_play_tx[6]
  467. distance_ct = parsed_play_tx[7]
  468.  
  469. elif event_cd == 'SOC':
  470. full_event_cd = 'SHOOTOUT COMPLETED'
  471. timezone_tx = parsed_play_tx[1]
  472. hours_tm = parsed_play_tx[2]
  473. minutes_tm = parsed_play_tx[3]
  474. timezone_cd = parsed_play_tx[4]
  475.  
  476. elif event_cd == 'STOP':
  477. full_event_cd = 'STOPPAGE IN PLAY'
  478. if len(parsed_play_tx) > 0: reason_cd = parsed_play_tx[0]
  479. if len(parsed_play_tx) > 1: reason2_cd = parsed_play_tx[1]
  480. if len(parsed_play_tx) > 2: reason3_cd = parsed_play_tx[2]
  481.  
  482. elif event_cd == 'TAKE':
  483. full_event_cd = 'TAKEAWAY'
  484. team_id = parsed_play_tx[0]
  485. jersey_id = parsed_play_tx[3]
  486. player_tx = parsed_play_tx[4]
  487. zone_cd = parsed_play_tx[5]
  488.  
  489. else:
  490. full_event_cd = 'UNKNOWN'
  491.  
  492. # cleanup player names
  493. player_tx \
  494. = re.sub('_',' ',player_tx, flags=re.IGNORECASE)
  495. assist_player_tx \
  496. = re.sub('_',' ',assist_player_tx, flags=re.IGNORECASE)
  497. assist2_player_tx \
  498. = re.sub('_',' ',assist2_player_tx, flags=re.IGNORECASE)
  499. mate_player_tx \
  500. = re.sub('_',' ',mate_player_tx, flags=re.IGNORECASE)
  501. opp_player_tx \
  502. = re.sub('_',' ',opp_player_tx, flags=re.IGNORECASE)
  503. away_player_tx \
  504. = re.sub('_',' ',away_player_tx, flags=re.IGNORECASE)
  505. home_player_tx \
  506. = re.sub('_',' ',home_player_tx, flags=re.IGNORECASE)
  507.  
  508. outfile.write('|{fulleventcd}'
  509. '|{playelementct}'
  510. '|{teamid}'
  511. '|{jerseyid}'
  512. '|{playertx}'
  513. '|{assistjerseyid}'
  514. '|{assistplayertx}'
  515. '|{assist2jerseyid}'
  516. '|{assist2playertx}'
  517. '|{matejerseyid}'
  518. '|{mateplayertx}'
  519. '|{oppteamid}'
  520. '|{oppjerseyid}'
  521. '|{oppplayertx}'
  522. '|{awayteamid}'
  523. '|{awayjerseyid}'
  524. '|{awayplayertx}'
  525. '|{hometeamid}'
  526. '|{homejerseyid}'
  527. '|{homeplayertx}'
  528. '|{winnerteamid}'
  529. '|{zonecd}'
  530. '|{shotcd}'
  531. '|{outcomeshotcd}'
  532. '|{distancect}'
  533. '|{penaltycd}'
  534. '|{minutespenaltyct}'
  535. '|{minutes2penaltyct}'
  536. '|{reasoncd}'
  537. '|{reason2cd}'
  538. '|{reason3cd}'
  539. '|{resultcd}'
  540. '|{hourstm}'
  541. '|{minutestm}'
  542. '|{timezonecd}'
  543. '|{timezonetx}'
  544. '|{seasongoalct}'
  545. '|{seasonassistct}'
  546. '|{seasonassist2ct}'.format(
  547. fulleventcd=full_event_cd
  548. , playelementct=play_element_ct
  549. , teamid=team_id
  550. , jerseyid=jersey_id
  551. , playertx=player_tx
  552. , assistjerseyid=assist_jersey_id
  553. , assistplayertx=assist_player_tx
  554. , assist2jerseyid=assist2_jersey_id
  555. , assist2playertx=assist2_player_tx
  556. , matejerseyid=mate_jersey_id
  557. , mateplayertx=mate_player_tx
  558. , oppteamid=opp_team_id
  559. , oppjerseyid=opp_jersey_id
  560. , oppplayertx=opp_player_tx
  561. , awayteamid=away_team_id
  562. , awayjerseyid=away_jersey_id
  563. , awayplayertx=away_player_tx
  564. , hometeamid=home_team_id
  565. , homejerseyid=home_jersey_id
  566. , homeplayertx=home_player_tx
  567. , winnerteamid=winner_team_id
  568. , zonecd=zone_cd
  569. , shotcd=shot_cd
  570. , outcomeshotcd=outcome_shot_cd
  571. , distancect=distance_ct
  572. , penaltycd=penalty_cd
  573. , minutespenaltyct=minutes_penalty_ct
  574. , minutes2penaltyct=minutes2_penalty_ct
  575. , reasoncd=reason_cd
  576. , reason2cd=reason2_cd
  577. , reason3cd=reason3_cd
  578. , resultcd=result_cd
  579. , hourstm=hours_tm
  580. , minutestm=minutes_tm
  581. , timezonecd=timezone_cd
  582. , timezonetx=timezone_tx
  583. , seasongoalct=season_goal_ct
  584. , seasonassistct=season_assist_ct
  585. , seasonassist2ct=season_assist2_ct
  586. ))
  587. outfile.write('\n')
  588.  
  589. mergedfile.write('|{fulleventcd}'
  590. '|{playelementct}'
  591. '|{teamid}'
  592. '|{jerseyid}'
  593. '|{playertx}'
  594. '|{assistjerseyid}'
  595. '|{assistplayertx}'
  596. '|{assist2jerseyid}'
  597. '|{assist2playertx}'
  598. '|{matejerseyid}'
  599. '|{mateplayertx}'
  600. '|{oppteamid}'
  601. '|{oppjerseyid}'
  602. '|{oppplayertx}'
  603. '|{awayteamid}'
  604. '|{awayjerseyid}'
  605. '|{awayplayertx}'
  606. '|{hometeamid}'
  607. '|{homejerseyid}'
  608. '|{homeplayertx}'
  609. '|{winnerteamid}'
  610. '|{zonecd}'
  611. '|{shotcd}'
  612. '|{outcomeshotcd}'
  613. '|{distancect}'
  614. '|{penaltycd}'
  615. '|{minutespenaltyct}'
  616. '|{minutes2penaltyct}'
  617. '|{reasoncd}'
  618. '|{reason2cd}'
  619. '|{reason3cd}'
  620. '|{resultcd}'
  621. '|{hourstm}'
  622. '|{minutestm}'
  623. '|{timezonecd}'
  624. '|{timezonetx}'
  625. '|{seasongoalct}'
  626. '|{seasonassistct}'
  627. '|{seasonassist2ct}'.format(
  628. fulleventcd=full_event_cd
  629. , playelementct=play_element_ct
  630. , teamid=team_id
  631. , jerseyid=jersey_id
  632. , playertx=player_tx
  633. , assistjerseyid=assist_jersey_id
  634. , assistplayertx=assist_player_tx
  635. , assist2jerseyid=assist2_jersey_id
  636. , assist2playertx=assist2_player_tx
  637. , matejerseyid=mate_jersey_id
  638. , mateplayertx=mate_player_tx
  639. , oppteamid=opp_team_id
  640. , oppjerseyid=opp_jersey_id
  641. , oppplayertx=opp_player_tx
  642. , awayteamid=away_team_id
  643. , awayjerseyid=away_jersey_id
  644. , awayplayertx=away_player_tx
  645. , hometeamid=home_team_id
  646. , homejerseyid=home_jersey_id
  647. , homeplayertx=home_player_tx
  648. , winnerteamid=winner_team_id
  649. , zonecd=zone_cd
  650. , shotcd=shot_cd
  651. , outcomeshotcd=outcome_shot_cd
  652. , distancect=distance_ct
  653. , penaltycd=penalty_cd
  654. , minutespenaltyct=minutes_penalty_ct
  655. , minutes2penaltyct=minutes2_penalty_ct
  656. , reasoncd=reason_cd
  657. , reason2cd=reason2_cd
  658. , reason3cd=reason3_cd
  659. , resultcd=result_cd
  660. , hourstm=hours_tm
  661. , minutestm=minutes_tm
  662. , timezonecd=timezone_cd
  663. , timezonetx=timezone_tx
  664. , seasongoalct=season_goal_ct
  665. , seasonassistct=season_assist_ct
  666. , seasonassist2ct=season_assist2_ct
  667. ))
  668. mergedfile.write('\n')
  669.  
  670.  
  671. merged_targetfile = "C:/Users/TOM/PycharmProjects/downloadNHL/datafiles/merged_parsed_{d}.csv".format(d=datafile_id)
  672. with open(merged_targetfile,'w') as mergedfile:
  673. mergedfile.write(header_row_main + header_row_rest + '\n') # write out the header row
  674.  
  675. for int_game_id in range(first_game_id, last_game_id+1):
  676. game_id = str(int_game_id).zfill(4)
  677. print(game_id)
  678.  
  679. sourcefile = "C:/Users/TOM/PycharmProjects/downloadNHL/datafiles/stripped_{d}{ss}{g}.HTM".format(d=datafile_id, ss=subseason_id, g=game_id)
  680. intermediatefile = "C:/Users/TOM/PycharmProjects/downloadNHL/datafiles/expanded_{d}{ss}{g}.csv".format(d=datafile_id, ss=subseason_id, g=game_id)
  681. penaltyfile = "C:/Users/TOM/PycharmProjects/downloadNHL/datafiles/penalty_{d}{ss}{g}.csv".format(d=datafile_id, ss=subseason_id, g=game_id)
  682. targetfile = "C:/Users/TOM/PycharmProjects/downloadNHL/datafiles/parsed_{d}{ss}{g}.csv".format(d=datafile_id, ss=subseason_id, g=game_id)
  683.  
  684. with open(sourcefile,'r') as infile \
  685. , open(intermediatefile,'w') as intermfile \
  686. , open(penaltyfile,'w') as penaltyfile \
  687. , open(targetfile,'w') as outfile \
  688. , open(merged_targetfile,'a') as mergedfile: # append to file
  689. soup = BeautifulSoup(infile, "lxml")
  690. tableRow = soup.findAll('tr')[1:] # infile: skip the first row, which is a header row
  691. outfile.write(header_row_main + header_row_rest + '\n') # outfile: write out the header row
  692. for r in tableRow:
  693. tableRowData = r.findAll('td')
  694. event_id = tableRowData[0].getText()
  695. period_cd = int(tableRowData[1].getText())
  696. strength_cd = tableRowData[2].getText()
  697.  
  698. split_play_tm = tableRowData[3].getText().split(':')
  699. play_tm = int(split_play_tm[0])*60 + int(split_play_tm[1])
  700.  
  701. split_remain_tm = tableRowData[5].getText().split(':')
  702. remain_tm = int(split_remain_tm[0])*60 + int(split_remain_tm[1])
  703.  
  704. if period_cd < 5: # not a shootout
  705. game_tm = (period_cd - 1) * 1200 + play_tm
  706.  
  707. event_cd = tableRowData[6].getText()
  708.  
  709. play_tx = tableRowData[7].getText()
  710. parsed_play = parse_play(play_tx)
  711.  
  712. assist_tx = ''
  713. if event_cd == 'GOAL' and len(tableRowData) >= 10:
  714. assist_tx = tableRowData[9].getText()
  715. parsed_assist = parse_play(assist_tx)
  716.  
  717. outfile.write('{season}|{subseason}|{game}|{eventid}|{periodcd}|{strengthcd}|{gametm}|{playtm}|{remaintm}|{eventcd}'.format(
  718. season=season_id
  719. , subseason=subseason_id
  720. , game=game_id
  721. , eventid=event_id
  722. , periodcd=period_cd
  723. , strengthcd=strength_cd
  724. , gametm=game_tm
  725. , playtm=play_tm
  726. , remaintm=remain_tm
  727. , eventcd=event_cd
  728. ))
  729.  
  730. # ========== start: helpful for debugging =============
  731. mergedfile.write('{season}|{subseason}|{game}|{eventid}|{periodcd}|{strengthcd}|{gametm}|{playtm}|{remaintm}|{eventcd}'.format(
  732. season=season_id
  733. , subseason=subseason_id
  734. , game=game_id
  735. , eventid=event_id
  736. , periodcd=period_cd
  737. , strengthcd=strength_cd
  738. , gametm=game_tm
  739. , playtm=play_tm
  740. , remaintm=remain_tm
  741. , eventcd=event_cd
  742. ))
  743. intermfile.write('{season}|{subseason}|{game}|{eventid}|{periodcd}|{strengthcd}|{gametm}|{playtm}|{remaintm}|{eventcd}'.format(
  744. season=season_id
  745. , subseason=subseason_id
  746. , game=game_id
  747. , eventid=event_id
  748. , periodcd=period_cd
  749. , strengthcd=strength_cd
  750. , gametm=game_tm
  751. , playtm=play_tm
  752. , remaintm=remain_tm
  753. , eventcd=event_cd
  754. ))
  755. if event_cd == 'PENL':
  756. penaltyfile.write('{season}|{subseason}|{game}|{eventid}|{periodcd}|{strengthcd}|{gametm}|{playtm}|{remaintm}|{eventcd}'.format(
  757. season=season_id
  758. , subseason=subseason_id
  759. , game=game_id
  760. , eventid=event_id
  761. , periodcd=period_cd
  762. , strengthcd=strength_cd
  763. , gametm=game_tm
  764. , playtm=play_tm
  765. , remaintm=remain_tm
  766. , eventcd=event_cd
  767. ))
  768.  
  769. for p in parsed_play:
  770. intermfile.write('|' + str(p))
  771. if event_cd == 'PENL':
  772. penaltyfile.write('|' + str(p))
  773. intermfile.write('\n')
  774. if event_cd == 'PENL':
  775. penaltyfile.write('\n')
  776. # ========== end: helpful for debugging =============
  777.  
  778. process_play(event_cd, parsed_play, period_cd, parsed_assist)
  779.  
  780.  
  781. print("Parse end")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement