Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import re

from bs4 import BeautifulSoup
# ---------------------------------------------------------------------------
# Run configuration.  These are module-level on purpose: the functions below
# and the driver loop at the bottom of the file read them as globals.
# ---------------------------------------------------------------------------
print("Parse start")

first_game_id = 1  # first game number to process (inclusive)
last_game_id = 668  # last game number to process (inclusive)
season_id = '20152016'
subseason_id = '02'  # compared against '02' when detecting shootout rows
datafile_id = 'PL'  # file-name prefix of the input/output data files

# Leading columns of every output row, written by the driver loop.
header_row_main = '|'.join((
    'SEASON_ID',
    'SUBSEASON_ID',
    'GAME_ID',
    'EVENT_ID',
    'PERIOD_CD',
    'STRENGTH_CD',
    'GAME_TM',
    'PLAY_TM',
    'REMAIN_TM',
    'EVENT_CD',
))

# Remaining columns, written by process_play().  Each name is prefixed with
# the '|' delimiter so the string can be appended directly to header_row_main.
header_row_rest = ''.join('|' + column for column in (
    'FULL_EVENT_CD',
    'PLAY_ELEMENT_CT',
    'TEAM_ID',
    'JERSEY_ID',
    'PLAYER_TX',
    'ASSIST_JERSEY_ID',
    'ASSIST_PLAYER_TX',
    'ASSIST2_JERSEY_ID',
    'ASSIST2_PLAYER_TX',
    'MATE_JERSEY_ID',
    'MATE_PLAYER_TX',
    'OPP_TEAM_ID',
    'OPP_JERSEY_ID',
    'OPP_PLAYER_TX',
    'AWAY_TEAM_ID',
    'AWAY_JERSEY_ID',
    'AWAY_PLAYER_TX',
    'HOME_TEAM_ID',
    'HOME_JERSEY_ID',
    'HOME_PLAYER_TX',
    'WINNER_TEAM_ID',
    'ZONE_CD',
    'SHOT_CD',
    'OUTCOME_SHOT_CD',
    'DISTANCE_CT',
    'PENALTY_CD',
    'MINUTES_PENALTY_CT',
    'MINUTES2_PENALTY_CT',
    'REASON_CD',
    'REASON2_CD',
    'REASON3_CD',
    'RESULT_CD',
    'HOURS_TM',
    'MINUTES_TM',
    'TIMEZONE_CD',
    'TIMEZONE_TX',
    'SEASON_GOAL_CT',
    'SEASON_ASSIST_CT',
    'SEASON_ASSIST2_CT',
))
# Ordered (phrase, replacement) pairs applied to a play description before it
# is split on spaces.  Every phrase is matched LITERALLY and case-insensitively.
# Order matters: longer/more specific phrases precede the shorter phrases they
# contain (e.g. 'ABUSE OF OFFICIALS - BENCH' before 'ABUSE OF OFFICIALS'), and
# the delimiter clean-up at the end must run after the keyword merging.
_PLAY_TEXT_REWRITES = (
    # players with multi-word surnames are collapsed into one token; the
    # underscore is turned back into a space by process_play()
    ('DI GIUSEPPE', 'DI_GIUSEPPE'),
    ('DE HAAN', 'DE_HAAN'),
    ('DE LA ROSE', 'DE_LA_ROSE'),
    ('VAN RIEMSDYK', 'VAN_RIEMSDYK'),
    ('DEL ZOTTO', 'DEL_ZOTTO'),
    # special keywords merged with the word that precedes them
    ('. Zone', '_Zone'),
    (' Start-', '_Start'),
    (' End-', '_End'),
    (' time:', '_Time'),
    # new keywords
    (' #', ' Jersey_Id '),
    (' min)', ' minutes '),
    ('ft.', 'feet'),
    # redundant keywords
    ('- double minor', ''),
    ('(maj)', ''),
    # keywords with multiple words are collapsed into one word
    ('BLOCKED BY', 'BLOCKED_BY'),
    ('CLOCK PROBLEM', 'CLOCK_PROBLEM'),
    ('Early Intermission', 'Early_Intermission'),
    ('HAND PASS', 'HAND_PASS'),
    ('HIT CROSSBAR', 'HIT_CROSSBAR'),
    ('GOAL INTERFERENCE', 'GOAL_INTERFERENCE'),
    ('GOAL OVERTURNED', 'GOAL_OVERTURNED'),
    ('GOALIE STOPPED', 'GOALIE_STOPPED'),
    ('HOME TIMEOUT', 'HOME_TIMEOUT'),
    ('ICE PROBLEM', 'ICE_PROBLEM'),
    ('NET OFF', 'NET_OFF'),
    ('OBJECTS ON ICE', 'OBJECTS_ONICE'),
    ('OFFICIAL INJURY', 'OFFICIAL_INJURY'),
    ('OFF-SIDE', 'OFFSIDE'),
    ('OVER NET', 'OVER_NET'),
    ('PENALTY SHOT', 'PENALTY_SHOT'),
    ('PLAYER EQUIPMENT', 'PLAYER_EQUIPMENT'),
    ('PLAYER INJURY', 'PLAYER_INJURY'),
    ('PUCK FROZEN', 'PUCK_FROZEN'),
    ('PUCK IN BENCHES', 'PUCK_INBENCHES'),
    ('PUCK IN CROWD', 'PUCK_INCROWD'),
    ('PUCK IN NETTING', 'PUCK_INNETTING'),
    ('REFEREE OR LINESMAN', 'REFEREE_LINESMAN'),
    ('RINK REPAIR', 'RINK_REPAIR'),
    ('SHOOTOUT COMPLETED', 'SHOOTOUT_COMPLETED'),
    ('TV TIMEOUT', 'TV_TIMEOUT'),
    ('VIDEO REVIEW', 'VIDEO_REVIEW'),
    ('VISITOR TIMEOUT', 'VISITOR_TIMEOUT'),
    ('Wide Of Net', 'WIDE_OFNET'),
    # penalties
    ('ABUSE OF OFFICIALS - BENCH', 'ABUSE_OFFICIALS'),
    ('ABUSE OF OFFICIALS', 'ABUSE_OFFICIALS'),
    ('ABUSIVE LANGUAGE - BENCH', 'ABUSIVE_LANGUAGE'),
    ('ABUSIVE LANGUAGE', 'ABUSIVE_LANGUAGE'),
    ('BROKEN STICK', 'BROKEN_STICK'),
    ('Checking from behind', 'CHECKING_FROMBEHIND'),
    ('Concealing Puck', 'CONCEALING_PUCK'),
    ('Closing Hand On Puck', 'CLOSINGHAND_ONPUCK'),
    ('CROSS CHECKING', 'CROSS_CHECK'),
    ('CROSS CHECK', 'CROSS_CHECK'),
    ('Delay Gm - Face-off Violation', 'DELAY_OFGAME'),
    ('Delay of game - bench', 'DELAY_OFGAME'),
    ('Delay of game-bench', 'DELAY_OFGAME'),
    ('DELAY OF GAME', 'DELAY_OFGAME'),
    ('DELAYING GAME-ILL.PLAY GOALIE', 'DELAY_OFGAME'),
    ('Delaying Game-Ill. play goalie', 'DELAY_OFGAME'),
    ('Delaying Game-Puck over glass', 'DELAY_OFGAME'),
    ('Delaying Game-Smothering puck', 'DELAY_OFGAME'),
    ('Delaying Game', 'DELAY_OFGAME'),
    ('DELAYING THE GAME', 'DELAY_OFGAME'),
    ('Face-off violation-bench', 'DELAY_OFGAME'),
    ('Face-off violation', 'DELAY_OFGAME'),
    ('DRAWN BY', 'DRAWN_BY'),
    ('GAME MISCONDUCT', 'GAME_MISCONDUCT'),
    ('Goalie leave crease', 'GOALIE_CREASE'),
    ('HI-STICKING', 'HIGH_STICK'),
    ('HI STICK', 'HIGH_STICK'),
    ('HIGH STICK', 'HIGH_STICK'),
    ('HOLDING THE STICK', 'HOLDING_STICK'),
    ('ILLEGAL STICK', 'ILLEGAL_STICK'),
    ('Illegal check to head', 'CHECK_HEAD'),
    ('Instigator - face shield', 'INSTIGATOR_FACESHIELD'),
    ('Instigator - Misconduct', 'INSTIGATOR_MISCONDUCT'),
    ('Interference - Goalkeeper', 'INTERFERENCE_GOALIE'),
    ('Interference on goalkeeper', 'INTERFERENCE_GOALIE'),
    ('Interference on goalie', 'INTERFERENCE_GOALIE'),
    ('Leaving penalty box - bench', 'TOOMANY_MEN'),
    ('Match Penalty', 'MATCH_PENALTY'),
    ('PS-Covering puck in crease', 'PENALTYSHOT_COVERPUCK'),
    ('PS-Hooking on breakaway', 'PENALTYSHOT_HOOKING'),
    ('PS-Slash on breakaway', 'PENALTYSHOT_SLASH'),
    ('PS-Thow object at puck', 'PENALTYSHOT_THROWOBJECT'),  # misspelling occurs in the data
    ('PS-Throw object at puck', 'PENALTYSHOT_THROWOBJECT'),
    ('PS-Tripping on breakaway', 'PENALTYSHOT_TRIPPING'),
    ('Served By: ', 'SERVED_BY_'),
    ('PREMATURE SUBSTITUTION', 'PREMATURE_SUBSTITUTION'),
    ('Too many men/ice - bench', 'TOOMANY_MEN'),
    ('UNSPORTSMANLIKE CONDUCT', 'UNSPORTSMANLIKE_CONDUCT'),
    # challenges
    ('CHLG HM', 'CHALLENGE_HOME'),
    ('CHLG LEAGUE', 'CHALLENGE_LEAGUE'),
    ('CHLG VIS', 'CHALLENGE_AWAY'),
    # reduce every remaining delimiter to a space
    (' - ', ' '),
    ('- ', ' '),
    (', ', ' '),
    (',', ' '),
    (': ', ' '),
    ('; ', ' '),
    (':', ' '),  # used for HH:MM
    ('(', ' '),
    (')', ''),
)

# Compiled once at import time.  re.escape() guarantees that '.', '(' , ')'
# and '#' inside a phrase are matched as the literal characters they are.
_PLAY_TEXT_PATTERNS = tuple(
    (re.compile(re.escape(phrase), re.IGNORECASE), replacement)
    for phrase, replacement in _PLAY_TEXT_REWRITES
)
_SPACE_RUN = re.compile(' +')


def parse_play(play_tx):
    """Normalise a play description and split it into space-separated tokens.

    Multi-word names and keywords are fused with underscores, punctuation
    delimiters are replaced by spaces, runs of spaces are collapsed, and the
    result is split on single spaces.

    Args:
        play_tx: raw description text of one play (may be empty).

    Returns:
        list of str tokens.  Leading/trailing spaces in the normalised text
        produce empty-string tokens, and an empty input yields ``['']``;
        callers index into the list by position, so this is kept as-is.

    >>> parse_play('TOR #43 KADRI Wrist, Off. Zone, 35 ft.')
    ['TOR', 'Jersey_Id', '43', 'KADRI', 'Wrist', 'Off_Zone', '35', 'feet']
    """
    for pattern, replacement in _PLAY_TEXT_PATTERNS:
        play_tx = pattern.sub(replacement, play_tx)
    # "hidden character": presumably the non-breaking space that HTML &nbsp;
    # decodes to -- TODO confirm against the source reports.
    play_tx = play_tx.replace('\xa0', ' ')
    # turn multiple spaces into one
    play_tx = _SPACE_RUN.sub(' ', play_tx)
    return play_tx.split(' ')
# Output columns produced by process_play(), in the order they are written.
# This order mirrors the column names listed in header_row_rest.
_PLAY_FIELDS = (
    'full_event_cd', 'play_element_ct', 'team_id', 'jersey_id', 'player_tx',
    'assist_jersey_id', 'assist_player_tx', 'assist2_jersey_id', 'assist2_player_tx',
    'mate_jersey_id', 'mate_player_tx',
    'opp_team_id', 'opp_jersey_id', 'opp_player_tx',
    'away_team_id', 'away_jersey_id', 'away_player_tx',
    'home_team_id', 'home_jersey_id', 'home_player_tx',
    'winner_team_id', 'zone_cd', 'shot_cd', 'outcome_shot_cd', 'distance_ct',
    'penalty_cd', 'minutes_penalty_ct', 'minutes2_penalty_ct',
    'reason_cd', 'reason2_cd', 'reason3_cd', 'result_cd',
    'hours_tm', 'minutes_tm', 'timezone_cd', 'timezone_tx',
    'season_goal_ct', 'season_assist_ct', 'season_assist2_ct',
)

# Events that only carry a clock reading: tokens 1..4 hold the time label,
# hours, minutes and time-zone code.
_TIMESTAMP_EVENTS = {
    'EIEND': 'EARLY INTERMISSION ENDED',
    'EISTR': 'EARLY INTERMISSION STARTED',
    'GEND': 'GAME ENDED',
    'PEND': 'PERIOD ENDED',
    'PSTR': 'PERIOD STARTED',
    'SOC': 'SHOOTOUT COMPLETED',
}

# Fields whose underscores (inserted by parse_play) are turned back into spaces.
_PLAYER_NAME_FIELDS = (
    'player_tx', 'assist_player_tx', 'assist2_player_tx', 'mate_player_tx',
    'opp_player_tx', 'away_player_tx', 'home_player_tx',
)


def process_play(event_cd, parsed_play_tx, period_cd, parsed_assist_tx):
    """Map the tokens of one play onto named columns and write them out.

    The row is written as ``'|' + value`` for every column in
    ``_PLAY_FIELDS`` followed by a newline, so that it completes the line the
    caller has already started with the leading columns.

    Relies on module-level globals: reads ``subseason_id`` and writes the
    identical row to the open file objects ``outfile`` and ``mergedfile``.

    Args:
        event_cd: event code of the play (e.g. 'GOAL', 'PENL'); unrecognised
            codes produce a row with FULL_EVENT_CD 'UNKNOWN'.
        parsed_play_tx: token list for the play description, as returned by
            parse_play(); fields are taken from fixed token positions.
        period_cd: period number; the value 5 together with
            ``subseason_id == '02'`` selects the shootout token layout.
        parsed_assist_tx: token list for the assist description; only used
            for 'GOAL' events.

    Raises:
        IndexError: if the token list is shorter than the layout of the
            event requires.  Nothing is written in that case.
    """
    tx = parsed_play_tx
    row = dict.fromkeys(_PLAY_FIELDS, '')  # every column is optional
    row['play_element_ct'] = len(tx)

    if event_cd == 'BLOCK':
        row.update(full_event_cd='SHOT_BLOCKED',
                   team_id=tx[0], jersey_id=tx[2], player_tx=tx[3],
                   opp_team_id=tx[5], opp_jersey_id=tx[7], opp_player_tx=tx[8])
        if '_Zone' in tx[9]:  # shot type missing
            row['zone_cd'] = tx[9]
        else:
            row.update(shot_cd=tx[9], zone_cd=tx[10])

    elif event_cd == 'CHL':
        row['full_event_cd'] = 'CHALLENGE'
        # Short descriptions occur; take each field only when its token
        # exists instead of indexing past the end of the list.
        if len(tx) >= 2:
            row['team_id'] = tx[0]
        if len(tx) > 2:
            row['reason_cd'] = tx[2]
        if len(tx) > 4:
            row['result_cd'] = tx[4]

    elif event_cd in _TIMESTAMP_EVENTS:
        row.update(full_event_cd=_TIMESTAMP_EVENTS[event_cd],
                   timezone_tx=tx[1], hours_tm=tx[2],
                   minutes_tm=tx[3], timezone_cd=tx[4])

    elif event_cd == 'FAC':
        row.update(full_event_cd='FACEOFF',
                   winner_team_id=tx[0], zone_cd=tx[2],
                   away_team_id=tx[3], away_jersey_id=tx[5], away_player_tx=tx[6],
                   home_team_id=tx[8], home_jersey_id=tx[10], home_player_tx=tx[11])

    elif event_cd in ('GIVE', 'TAKE'):
        row.update(full_event_cd='GIVEAWAY' if event_cd == 'GIVE' else 'TAKEAWAY',
                   team_id=tx[0], jersey_id=tx[3], player_tx=tx[4], zone_cd=tx[5])

    elif event_cd == 'GOAL':
        row.update(full_event_cd='SHOT_GOAL',
                   team_id=tx[0], jersey_id=tx[2], player_tx=tx[3])
        if subseason_id == '02' and period_cd == 5:  # shootout: no season goal count token
            if '_Zone' in tx[4]:  # shot type missing
                row.update(zone_cd=tx[4], distance_ct=tx[5])
            else:
                row.update(shot_cd=tx[4], zone_cd=tx[5], distance_ct=tx[6])
        else:
            row['season_goal_ct'] = tx[4]
            if 'PENALTY_SHOT' in tx[5]:
                # PENALTY_SHOT is stored as the "zone" instead of token 7
                row.update(shot_cd=tx[6], zone_cd=tx[5], distance_ct=tx[8])
            elif '_Zone' in tx[5]:  # shot type missing
                row.update(zone_cd=tx[5], distance_ct=tx[6])
            else:
                row.update(shot_cd=tx[5], zone_cd=tx[6], distance_ct=tx[7])
        # Assists: each 'Jersey_Id' marker is followed by jersey number,
        # player name and season assist count.
        # NOTE(review): the nesting of this loop was not recoverable from the
        # pasted source; it is run for every GOAL row -- confirm.
        for i, token in enumerate(parsed_assist_tx):
            if 'Jersey_Id' not in token:
                continue
            if row['assist_jersey_id'] == '':
                row.update(assist_jersey_id=parsed_assist_tx[i + 1],
                           assist_player_tx=parsed_assist_tx[i + 2],
                           season_assist_ct=parsed_assist_tx[i + 3])
            else:  # 2nd assist
                row.update(assist2_jersey_id=parsed_assist_tx[i + 1],
                           assist2_player_tx=parsed_assist_tx[i + 2],
                           season_assist2_ct=parsed_assist_tx[i + 3])

    elif event_cd == 'GOFF':
        row['full_event_cd'] = 'GAME OFF'

    elif event_cd == 'HIT':
        row.update(full_event_cd='HIT',
                   team_id=tx[0], jersey_id=tx[2], player_tx=tx[3],
                   opp_team_id=tx[5], opp_jersey_id=tx[7], opp_player_tx=tx[8],
                   zone_cd=tx[9])

    elif event_cd == 'MISS':
        row.update(full_event_cd='SHOT_MISSEDNET',
                   team_id=tx[0], jersey_id=tx[2], player_tx=tx[3])
        shootout = subseason_id == '02' and period_cd == 5
        # The penalty-shot layout is only considered outside the shootout.
        if not shootout and 'PENALTY_SHOT' in tx[4]:
            # PENALTY_SHOT is stored as the "zone" instead of token 7
            row.update(shot_cd=tx[5], outcome_shot_cd=tx[6],
                       zone_cd=tx[4], distance_ct=tx[8])
        elif '_Zone' in tx[5]:  # shot type missing
            row.update(outcome_shot_cd=tx[4], zone_cd=tx[5], distance_ct=tx[6])
        else:
            row.update(shot_cd=tx[4], outcome_shot_cd=tx[5],
                       zone_cd=tx[6], distance_ct=tx[7])

    elif event_cd == 'PENL':
        row.update(full_event_cd='PENALTY', team_id=tx[0])
        # two main sections: (1) team penalty, (2) regular penalty
        if 'TEAM' in tx[1]:
            row.update(jersey_id='0', player_tx=tx[1], penalty_cd=tx[2])
        else:
            row.update(jersey_id=tx[2], player_tx=tx[3], penalty_cd=tx[4])
        # four optional subsections: minutes, zone, served by, drawn by
        for i, token in enumerate(tx):
            if '_Zone' in token:
                row['zone_cd'] = token  # the last zone token wins
            if 'minutes' in token:
                # the count precedes the keyword; a 2nd occurrence is the
                # additional penalty for a misconduct
                slot = ('minutes_penalty_ct' if row['minutes_penalty_ct'] == ''
                        else 'minutes2_penalty_ct')
                row[slot] = tx[i - 1]
            if 'SERVED_BY' in token:
                row.update(mate_jersey_id=tx[i + 1], mate_player_tx=tx[i + 2])
            if 'DRAWN_BY' in token:
                row.update(opp_team_id=tx[i + 1],
                           opp_jersey_id=tx[i + 3], opp_player_tx=tx[i + 4])

    elif event_cd == 'SHOT':
        row.update(full_event_cd='SHOT_ONGOAL',
                   team_id=tx[0], outcome_shot_cd=tx[1],
                   jersey_id=tx[3], player_tx=tx[4])
        if 'PENALTY_SHOT' in tx[5]:
            # PENALTY_SHOT is stored as the "zone" instead of token 7
            row.update(shot_cd=tx[6], zone_cd=tx[5], distance_ct=tx[8])
        else:
            row.update(shot_cd=tx[5], zone_cd=tx[6], distance_ct=tx[7])

    elif event_cd == 'STOP':
        row['full_event_cd'] = 'STOPPAGE IN PLAY'
        # up to three reasons, each only if its token exists
        for name, token in zip(('reason_cd', 'reason2_cd', 'reason3_cd'), tx):
            row[name] = token

    else:
        row['full_event_cd'] = 'UNKNOWN'

    # cleanup player names: undo the underscore joining done by parse_play()
    for name in _PLAYER_NAME_FIELDS:
        row[name] = row[name].replace('_', ' ')

    # Build the row once and write the same text to both outputs.
    line = ''.join('|' + str(row[name]) for name in _PLAY_FIELDS) + '\n'
    outfile.write(line)
    mergedfile.write(line)
# ---------------------------------------------------------------------------
# Driver: convert every stripped play-by-play HTML file in the configured
# game range into pipe-delimited CSV files.
#
# This is deliberately kept as flat module-level code: process_play() reads
# the names `outfile` and `mergedfile` as module globals, so the `with`
# targets below must stay bound at module scope.
# ---------------------------------------------------------------------------

# Directory holding the input HTML files and receiving every output file.
data_dir = "C:/Users/TOM/PycharmProjects/downloadNHL/datafiles"

# (Re)create the merged file with just the header row; the per-game loop
# below re-opens it in append mode.
merged_targetfile = "{dir}/merged_parsed_{d}.csv".format(dir=data_dir, d=datafile_id)
with open(merged_targetfile, 'w') as mergedfile:
    mergedfile.write(header_row_main + header_row_rest + '\n')

for int_game_id in range(first_game_id, last_game_id + 1):
    game_id = str(int_game_id).zfill(4)
    print(game_id)

    # All per-game files share the suffix <datafile><subseason><game>.
    file_key = "{d}{ss}{g}".format(d=datafile_id, ss=subseason_id, g=game_id)
    sourcefile = "{dir}/stripped_{key}.HTM".format(dir=data_dir, key=file_key)
    intermediatefile = "{dir}/expanded_{key}.csv".format(dir=data_dir, key=file_key)
    penaltyfile = "{dir}/penalty_{key}.csv".format(dir=data_dir, key=file_key)
    targetfile = "{dir}/parsed_{key}.csv".format(dir=data_dir, key=file_key)

    with open(sourcefile, 'r') as infile, \
            open(intermediatefile, 'w') as intermfile, \
            open(penaltyfile, 'w') as penaltyout, \
            open(targetfile, 'w') as outfile, \
            open(merged_targetfile, 'a') as mergedfile:  # append to file
        soup = BeautifulSoup(infile, "lxml")
        table_rows = soup.find_all('tr')[1:]  # skip the first row, which is a header row
        outfile.write(header_row_main + header_row_rest + '\n')

        # game_tm is only recomputed for periods 1-4; shootout rows keep the
        # last computed value.  Start from 0 so it is always bound.
        game_tm = 0
        for r in table_rows:
            cells = r.find_all('td')
            event_id = cells[0].get_text()
            period_cd = int(cells[1].get_text())
            strength_cd = cells[2].get_text()
            # clock columns are "MM:SS"; convert to seconds
            split_play_tm = cells[3].get_text().split(':')
            play_tm = int(split_play_tm[0]) * 60 + int(split_play_tm[1])
            split_remain_tm = cells[5].get_text().split(':')
            remain_tm = int(split_remain_tm[0]) * 60 + int(split_remain_tm[1])
            if period_cd < 5:  # not a shootout
                game_tm = (period_cd - 1) * 1200 + play_tm
            event_cd = cells[6].get_text()
            parsed_play = parse_play(cells[7].get_text())
            # only goal rows with at least 10 cells carry assist text
            assist_tx = ''
            if event_cd == 'GOAL' and len(cells) >= 10:
                assist_tx = cells[9].get_text()
            parsed_assist = parse_play(assist_tx)
            is_penalty = event_cd == 'PENL'

            # Leading columns, shared by every output file.
            row_prefix = '|'.join(str(value) for value in (
                season_id, subseason_id, game_id, event_id, period_cd,
                strength_cd, game_tm, play_tm, remain_tm, event_cd,
            ))
            outfile.write(row_prefix)
            mergedfile.write(row_prefix)

            # ========== start: helpful for debugging =============
            # Dump the raw token list so that token positions can be
            # inspected; penalties additionally go to their own file.
            token_dump = row_prefix + ''.join('|' + str(p) for p in parsed_play) + '\n'
            intermfile.write(token_dump)
            if is_penalty:
                penaltyout.write(token_dump)
            # ========== end: helpful for debugging =============

            # Completes the current line in outfile and mergedfile.
            process_play(event_cd, parsed_play, period_cd, parsed_assist)

print("Parse end")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement