Advertisement
samipote

Untitled

Oct 17th, 2023
638
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 24.96 KB | None | 0 0
  1. import pandas as pd
  2. from sklearn.model_selection import train_test_split , KFold , TimeSeriesSplit
  3. from sklearn.linear_model import LogisticRegression
  4. from sklearn.metrics import accuracy_score, classification_report
  5. from sklearn.impute import SimpleImputer
  6. from sklearn.pipeline import make_pipeline
  7. import datetime
  8. import requests
  9. import re
  10. import io
  11. import numpy as np
  12. from sklearn.svm import SVC
  13. import xgboost as xgb
  14. import unicodedata
  15. from bs4 import BeautifulSoup as bs
  16. from sklearn.metrics import accuracy_score
  17. from elote import EloCompetitor
  18. from sklearn.ensemble import RandomForestClassifier
  19. from sklearn.model_selection import RandomizedSearchCV , StratifiedKFold
  20. from elote import GlickoCompetitor
  21. import warnings
  22. from sklearn.preprocessing import StandardScaler
  23. from sklearn.tree import DecisionTreeClassifier
  24. from sklearn.ensemble import GradientBoostingClassifier
  25. from sklearn.naive_bayes import GaussianNB
  26. from sklearn.neighbors import KNeighborsClassifier
  27. from sklearn.svm import LinearSVC
  28. from sklearn.calibration import CalibratedClassifierCV
  29. from sklearn.model_selection import cross_val_score
  30. from sklearn.metrics import log_loss
  31. from datetime import timedelta
  32. from sklearn.metrics import mean_squared_error
  33. import matplotlib.pyplot as plt
  34. from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
  35.  
# Silence sklearn/pandas deprecation and chained-assignment warnings so the
# console output stays readable during long training runs.
warnings.filterwarnings("ignore")
  37.  
  38.  
  39. def remove_accents(input_str):
  40.     nfkd_form = unicodedata.normalize('NFKD', input_str)
  41.     only_ascii = nfkd_form.encode('ASCII', 'ignore')
  42.     return only_ascii
  43.  
  44.  
# Map team abbreviations to full franchise names. Several franchises appear
# under more than one abbreviation because the data sources (NHL API,
# covers.com, oddsshark) do not agree on short codes (e.g. CGY/CAL, WSH/WAS).
team_abbr_to_name = {
    'ANA': 'Anaheim Ducks',
    'ARI': 'Arizona Coyotes',
    'BOS': 'Boston Bruins',
    'BUF': 'Buffalo Sabres',
    'CGY': 'Calgary Flames',
    'CAL': 'Calgary Flames',
    'CAR': 'Carolina Hurricanes',
    'CHI': 'Chicago Blackhawks',
    'COL': 'Colorado Avalanche',
    'CBJ': 'Columbus Blue Jackets',
    'CLB':'Columbus Blue Jackets',
    'DAL': 'Dallas Stars',
    'DET': 'Detroit Red Wings',
    'EDM': 'Edmonton Oilers',
    'FLA': 'Florida Panthers',
    'LAK': 'Los Angeles Kings',
    'LA': 'Los Angeles Kings',
    'MIN': 'Minnesota Wild',
    'MTL': 'Montreal Canadiens',
    'MON': 'Montreal Canadiens',
    'NSH': 'Nashville Predators',
    'NAS': 'Nashville Predators',
    'NJD': 'New Jersey Devils',
    'NJ': 'New Jersey Devils',
    'NYI': 'New York Islanders',
    'NYR': 'New York Rangers',
    'OTT': 'Ottawa Senators',
    'PHI': 'Philadelphia Flyers',
    'PIT': 'Pittsburgh Penguins',
    'SEA': 'Seattle Kraken',
    'SJS': 'San Jose Sharks',
    'SJ': 'San Jose Sharks',
    'STL': 'St. Louis Blues',
    'TBL': 'Tampa Bay Lightning',
    'TB': 'Tampa Bay Lightning',
    'TOR': 'Toronto Maple Leafs',
    'VAN': 'Vancouver Canucks',
    'VGK': 'Vegas Golden Knights',
    'VEG': 'Vegas Golden Knights',
    'WSH': 'Washington Capitals',
    'WAS': 'Washington Capitals',
    'WPG': 'Winnipeg Jets',
    'WIN': 'Winnipeg Jets'
}
  91.  
  92. # Define a function to get game data
  93. def get_game_data(season, plus):
  94.     base_url = f"https://statsapi.web.nhl.com/api/v1/schedule?season={season}"
  95.     response = requests.get(base_url)
  96.     data = response.json()
  97.  
  98.     games_data = []
  99.  
  100.     for date_entry in data['dates']:
  101.         for game in date_entry['games']:
  102.             game_id = game['gamePk']
  103.             game_date = datetime.datetime.strptime(date_entry['date'], '%Y-%m-%d').date()
  104.  
  105.             team_data = {
  106.                 'game_id': game_id,
  107.                 'game_date': game_date,
  108.                 'home_team': game['teams']['home']['team']['name'],
  109.                 'home_team_score': game['teams']['home']['score'],
  110.                 'away_team': game['teams']['away']['team']['name'],
  111.                 'away_team_score': game['teams']['away']['score'],
  112.             }
  113.             games_data.append(team_data)
  114.     if plus == True :
  115.         base_url2 = f"https://statsapi.web.nhl.com/api/v1/schedule?season={20232024}"
  116.         response2 = requests.get(base_url2)
  117.         data2 = response2.json()
  118.         for date_entry2 in data2['dates']:
  119.             for game2 in date_entry2['games']:
  120.                 game_id2 = game2['gamePk']
  121.                 game_date2 = datetime.datetime.strptime(date_entry2['date'], '%Y-%m-%d').date()
  122.                 team_data2 = {
  123.                 'game_id': game_id2,
  124.                 'game_date': game_date2,
  125.                 'home_team': game2['teams']['home']['team']['name'],
  126.                 'home_team_score': game2['teams']['home']['score'],
  127.                 'away_team': game2['teams']['away']['team']['name'],
  128.                 'away_team_score': game2['teams']['away']['score'],
  129.                 }
  130.                 games_data.append(team_data2)
  131.     return games_data
  132.  
  133. def get_data_for_seasons(start_season, end_season):
  134.     all_games_data = []
  135.     for season in range(start_season, end_season):
  136.         season_data = get_game_data(str(season) + str(season + 1),False)
  137.         all_games_data.extend(season_data)
  138.     return all_games_data
  139.  
  140. def calculate_current_elo(df):
  141.     from elote import EloCompetitor
  142.     ratings = {}
  143.     for x in df.home_team.unique():
  144.         ratings[x] = GlickoCompetitor()
  145.     for x in df.away_team.unique():
  146.         ratings[x] = GlickoCompetitor()
  147.  
  148.     df = df.sort_values(by='game_date').reset_index(drop=True)
  149.     for i, r in df.iterrows():
  150.         # update ratings
  151.         if r['won_game']:
  152.             ratings[r.home_team].beat(ratings[r.away_team])
  153.         else:
  154.             ratings[r.away_team].beat(ratings[r.home_team])
  155.  
  156.     return ratings
  157.  
  158. def get_odds_data(start_date, days):
  159.     dates = [start_date + datetime.timedelta(days=x) for x in range(days)]
  160.  
  161.     odds_data = []
  162.  
  163.     for d in dates:
  164.         # get the web page with game data on it
  165.         game_day = d.strftime('%Y-%m-%d')
  166.         url = f'https://www.covers.com/Sports/NHL/Matchups?selectedDate={game_day}'
  167.         resp = requests.get(url)
  168.  
  169.         # parse the games
  170.         scraped_games = bs(resp.text, 'html.parser').findAll('div', {'class': 'cmg_matchup_game_box'})
  171.         for g in scraped_games:
  172.             game = {}
  173.             game['home_moneyline'] = g['data-game-odd']
  174.             game['date'] = g['data-game-date']
  175.             game['away_team_abbr'] = g['data-away-team-shortname-search']
  176.             game['home_team_abbr'] = g['data-home-team-shortname-search']
  177.             try:
  178.                 game['home_score'] = g.find('div', {'class': 'cmg_matchup_list_score_home'}).text.strip()
  179.                 game['away_score'] = g.find('div', {'class': 'cmg_matchup_list_score_away'}).text.strip()
  180.             except:
  181.                 game['home_score'] = ''
  182.                 game['away_score'] = ''
  183.  
  184.             odds_data.append(game)
  185.             if len(odds_data) % 500 == 0:
  186.                 # show progress
  187.                 print(datetime.datetime.now(), game_day, len(odds_data))
  188.                 # the actual outcome of the game, true if the the home team won
  189.     odds_df = pd.DataFrame(odds_data)
  190.     odds_df.to_csv('odds_data.csv', index=False)
  191.  
  192. def calculate_bet_value(odds_probability, model_probability):
  193.     return (model_probability / 100) * odds_probability - 1
  194.  
  195.  
def predict_today_games(model,model2, lineup_file, current_elo_ratings, date):
    """Predict win probabilities and totals for every NHL game on *date*.

    Pulls the day's schedule from the NHL API, scrapes covers.com and
    oddsshark for odds, joins those with GAR player stats (filtered by the
    expected lineups in *lineup_file*) and the supplied Elo ratings, then
    runs the winner classifier (*model*) and totals regressor (*model2*).

    Returns a list of per-game dicts: teams, win probabilities (percent),
    lineups, Elo ratings and predicted total score.
    """
    # Get today's date
    today = date

    # Get the schedule for today's games
    schedule_url = f"https://statsapi.web.nhl.com/api/v1/schedule?date={today}"
    response = requests.get(schedule_url)
    data = response.json()
    url = f'https://www.covers.com/Sports/NHL/Matchups?selectedDate={today}'
    resp = requests.get(url)

    odds_data = []
    # parse the games scraped from covers.com
    scraped_games = bs(resp.text, 'html.parser').findAll('div', {'class': 'cmg_matchup_game_box'})
    for g in scraped_games:
        game = {}
        game['home_moneyline'] = g['data-game-odd']
        game['date'] = g['data-game-date']
        game['away_team_abbr'] = g['data-away-team-shortname-search']
        game['home_team_abbr'] = g['data-home-team-shortname-search']
        # Upcoming games have no score elements; the bare except fills blanks.
        # NOTE(review): bare except also hides unrelated errors — narrow it.
        try:
            game['home_score'] = g.find('div', {'class': 'cmg_matchup_list_score_home'}).text.strip()
            game['away_score'] = g.find('div', {'class': 'cmg_matchup_list_score_away'}).text.strip()
        except:
            game['home_score'] = ''
            game['away_score'] = ''

        odds_data.append(game)

    odds_df = pd.DataFrame(odds_data)
    odds_df['game_date'] = pd.to_datetime(odds_df['date']).dt.date
    # Drop games with no posted moneyline, then derive implied probability.
    odds_df['home_moneyline'].replace('', np.nan, inplace=True)
    odds_df.dropna(subset=['home_moneyline'], inplace=True)
    odds_df.home_moneyline = pd.to_numeric(odds_df.home_moneyline)
    odds_df['odds_proba']=np.nan
    # NOTE(review): chained assignment below relies on legacy pandas
    # behavior (SettingWithCopyWarning); prefer .loc[mask, 'odds_proba'].
    odds_df['odds_proba'][odds_df.home_moneyline<0] = -odds_df.home_moneyline/(-odds_df.home_moneyline + 100)
    odds_df['odds_proba'][odds_df.home_moneyline>0] = (100/(odds_df.home_moneyline + 100))
    # Convert abbreviations to full names so they join with the NHL API data.
    odds_df['home_team_abbr'] = odds_df['home_team_abbr'].replace(team_abbr_to_name)
    odds_df['away_team_abbr'] = odds_df['away_team_abbr'].replace(team_abbr_to_name)
   

    # oddsshark odds for the same date, in "predict" mode (no final scores).
    odds_shark = fetch_odds_data(date,True)

    # Drop duplicates (the Arena column alone can differ between rows)
    odds_shark.drop('Arena', axis=1, inplace=True)
    odds_shark= odds_shark.drop_duplicates()

    # Reset the index after dropping duplicates
    odds_shark = odds_shark.reset_index(drop=True)

    def moneyline_to_proba(moneyline):
        # American moneyline -> implied win probability.
        if moneyline < 0:
            return -moneyline / (-moneyline + 100)
        else:
            return 100 / (moneyline + 100)
    # Convert the odds columns to probability
    cols_to_convert = ['Home MoneyLine', 'Home Spread Price', 'Away MoneyLine', 'Away Spread Price', 'Under Price', 'Over Price']

    for col in cols_to_convert:
        odds_shark[col] = odds_shark [col].apply(moneyline_to_proba)

    # Bookmaker overround: implied probabilities sum above 1 for real odds,
    # so this quantity is typically negative.
    odds_shark['vegas_cut'] = 1 - (odds_shark['Home MoneyLine'] + odds_shark['Away MoneyLine'])

    # Create the reverse mapping dictionary (full name -> abbreviation)
    team_name_to_abbr = {v: k for k, v in team_abbr_to_name.items()}

    # Loop through the games and predict the winner
    predictions = []
    for game in data['dates'][0]['games']:

        games_data = []
        home_team = remove_accents(game['teams']['home']['team']['name']).decode('utf-8')
        away_team = remove_accents(game['teams']['away']['team']['name']).decode('utf-8')
        game_id = game['gamePk']

        team_data = {
            'game_id': game_id,
            'game_date': today,
            'home_team': home_team,
            'away_team': away_team
            }
        games_data.append(team_data)

        # Convert the full team names to abbreviations
        home_team_abbr = team_name_to_abbr[home_team]
        away_team_abbr = team_name_to_abbr[away_team]

        # Load and preprocess the player performance (GAR) data.
        # NOTE(review): re-downloaded on every loop iteration — could be
        # hoisted out of the loop.
        gar_url = "https://raw.githubusercontent.com/NeilPaine538/NHL-Player-And-Team-Ratings/master/nhl_gar_historical.csv"
        response = requests.get(gar_url)
        csv_content = response.content.decode('ISO-8859-1')
        gar_df = pd.read_csv(io.StringIO(csv_content))
        gar_df = gar_df[gar_df['year_ID'] == 2023]
        gar_df['team_ID'] = gar_df['team_ID'].replace(team_abbr_to_name)

        # Get the lineup data for the game; players flagged "O" (out)
        # are excluded from the expected lineup.
        game_lineup_data = pd.read_csv(lineup_file)
        home_team_lineup = game_lineup_data[(game_lineup_data['team'] == home_team_abbr) & (game_lineup_data['injury_status'] != "O")]
        away_team_lineup = game_lineup_data[(game_lineup_data['team'] == away_team_abbr) & (game_lineup_data['injury_status'] != "O")]

        # Full player names per team, used to filter the GAR rows.
        team_lineups = {}
        team_lineups[home_team] = home_team_lineup['first_name'] + ' ' + home_team_lineup['last_name']
        team_lineups[away_team] = away_team_lineup['first_name'] + ' ' + away_team_lineup['last_name']

        # Adjust the player performance data based on the expected lineup for each team
        home_team_performance = gar_df[gar_df['team_ID'] == home_team]
        home_team_performance = home_team_performance[home_team_performance['player_name'].isin(team_lineups[home_team])]

        away_team_performance = gar_df[gar_df['team_ID'] == away_team]
        away_team_performance = away_team_performance[away_team_performance['player_name'].isin(team_lineups[away_team])]

        games_df = pd.DataFrame(games_data)
        home_team_elo = current_elo_ratings[home_team].rating
        away_team_elo = current_elo_ratings[away_team].rating
        games_df['home_team_elo'] = home_team_elo
        games_df['away_team_elo'] = away_team_elo

        # Season-level mean of each GAR metric per team.
        agg_stats_by_team_and_year = gar_df.groupby(['team_ID', 'year_ID']).agg({
            'OPS': 'mean',
            'DPS': 'mean',
            'GPS': 'mean',
            'PS': 'mean',
            'adj_OGAR': 'mean',
            'adj_DGAR': 'mean',
            'adj_GGAR': 'mean',
            'adj_GAR': 'mean'
            }).reset_index()
       
        # NOTE(review): these merges overwrite the lineup-filtered frames
        # built above — the lineup filtering does not reach the features.
        home_team_performance = games_df.merge(agg_stats_by_team_and_year, left_on=['home_team'], right_on=['team_ID'])
        away_team_performance = games_df.merge(agg_stats_by_team_and_year, left_on=['away_team'], right_on=['team_ID'])


        # Combine the team performances and select the desired features
        combined_team_performance = pd.concat([home_team_performance, away_team_performance])
        combined_team_performance.to_csv('dataset.csv', index=False)

        # Join with the oddsshark odds on (date, home, away).
        odds_shark['game_date'] = pd.to_datetime(odds_shark['Date']).dt.date
        combined_team_performance = combined_team_performance.merge(odds_shark,
                                                            left_on=['game_date','home_team', 'away_team'],
                                                            right_on=['game_date', 'Home Name', 'Away Name'])
        combined_team_performance['is_home_team'] = (combined_team_performance['home_team'] == combined_team_performance['team_ID']).astype(int)
        # Feature frames: X for the winner model, X2 for the totals model.
        # ('Homes Spread' is the column name as produced by extract_team_data.)
        X = combined_team_performance[['OPS', 'DPS', 'GPS', 'PS', 'adj_OGAR', 'adj_DGAR', 'adj_GGAR', 'adj_GAR', 'Home MoneyLine', 'Away MoneyLine','Home Spread Price', 'Away Spread Price','Homes Spread', 'Away Spread','Home Votes', 'Away Votes','home_team_elo', 'away_team_elo', 'is_home_team', 'vegas_cut']]

        X2 = combined_team_performance[['OPS', 'DPS', 'GPS', 'PS', 'adj_OGAR', 'adj_DGAR', 'adj_GGAR', 'adj_GAR','Total','Under Price','Over Price','Over Votes', 'Under Votes','home_team_elo', 'away_team_elo', 'is_home_team']]

        # Impute missing values in X
        # NOTE(review): imputer is fit on this single game's rows, not on
        # the training distribution — verify this matches training-time
        # preprocessing.
        imputer = SimpleImputer()
        X_imputed = imputer.fit_transform(X)
        X2_imputed = imputer.fit_transform(X2)

        # Predict the winner (class 1 = home win) and the totals.
        winner_prob = model.predict_proba(X_imputed)
        home_win_prob = round(winner_prob[0][1]*100, 2)
        away_win_prob = round(winner_prob[0][0]*100, 2)
        total_score_pred = model2.predict(X2_imputed)[0]


        # Add the prediction to the list of predictions
        predictions.append({
            'home_team': home_team,
            'away_team': away_team,
            'home_win_prob': home_win_prob,
            'away_win_prob': away_win_prob,
            'home_team_lineup': team_lineups[home_team],
            'away_team_lineup': team_lineups[away_team],
            'home_team_elo': home_team_elo,
            'away_team_elo': away_team_elo,
            'totals': total_score_pred,
            })

    return predictions
  369.  
  370. def extract_team_data(json_data,predict):
  371.     # List to store extracted data
  372.     extracted_data = []
  373.    
  374.     # Iterate through the scores list
  375.     for game in json_data['scores']:
  376.         game_data = {}
  377.        
  378.         # Extract home team data
  379.         home_team = game['teams']['home']
  380.         away_team = game['teams']['away']
  381.         game_data['Home Name'] = home_team['names']['name']
  382.         game_data['Home MoneyLine'] = home_team['moneyLine']
  383.         game_data['Home Spread Price'] = home_team['spreadPrice']
  384.         game_data['Home Score'] = home_team['score']
  385.         game_data['Home Votes'] = home_team['votes']
  386.         game_data['Homes Spread'] = home_team['spread']
  387.  
  388.         if predict == False :
  389.             game_data['won_game'] = home_team['score'] > away_team['score']
  390.        
  391.         # Extract away team data
  392.         game_data['Away Name'] = away_team['names']['name']
  393.         game_data['Away MoneyLine'] = away_team['moneyLine']
  394.         game_data['Away Spread Price'] = away_team['spreadPrice']
  395.         game_data['Away Score'] = away_team['score']
  396.         game_data['Away Votes'] = away_team['votes']
  397.         game_data['Away Spread'] = away_team['spread']
  398.        
  399.         # Extract shared data
  400.         game_data['Under Price'] = game['underPrice']
  401.         game_data['Over Price'] = game['overPrice']
  402.         game_data['Over Votes'] = game['overVotes']
  403.         game_data['Under Votes'] = game['underVotes']
  404.         game_data['Total'] = game['total']
  405.         if predict == False :
  406.             game_data['Totals'] = home_team['score'] + away_team['score']
  407.         game_data['Arena'] = game['stadium']
  408.        
  409.         extracted_data.append(game_data)
  410.  
  411.     # Convert the list of dictionaries to a pandas DataFrame
  412.     df = pd.DataFrame(extracted_data)
  413.     return df
  414.  
  415. def fetch_odds_data(date, predict):
  416.     base_url = f"https://www.oddsshark.com/api/scores/nhl/{date}?_format=json"
  417.  
  418.     headers = {
  419.         'Accept': 'application/json, text/plain, */*',
  420.         'Referer': 'https://www.oddsshark.com/nhl/scores',
  421.         'Sec-Ch-Ua': '"Chromium";v="118", "Microsoft Edge";v="118", "Not=A?Brand";v="99"',
  422.         'Sec-Ch-Ua-Mobile': '?0',
  423.         'Sec-Ch-Ua-Platform': '"Windows"',
  424.         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.44'
  425.     }
  426.  
  427.     response = requests.get(base_url, headers=headers)
  428.  
  429.     if response.status_code == 200:
  430.         data = response.json()
  431.         df = extract_team_data(data,predict)
  432.         df['Date'] = date
  433.         return df
  434.     else:
  435.         print(f"Failed to fetch data for date: {date}")
  436.         return None
  437.  
# --- Script body: assemble the historical dataset ---

# Reference dates used throughout the script.
today = datetime.date.today()
yesterday = today - datetime.timedelta(days=1)
tomorrow = today + datetime.timedelta(days=1)
year = 20232024

# Every calendar day from the 2014-15 season opener through today.
start_date = datetime.date(2014, 10, 12)
end_date = today
days = (end_date - start_date).days

dates = [start_date + datetime.timedelta(days=x) for x in range(days)]


# Call the function with the desired season range
start_season = 2014
end_season = 2023
current_season = 20222023
all_games_data = get_data_for_seasons(start_season, end_season)

# Current season plus the hard-coded 2023-24 schedule (plus=True).
current_season_data = get_game_data(current_season, True)

current_season_data_df = pd.DataFrame(current_season_data)

# Normalize accented team names (e.g. "Montréal") to plain ASCII strings.
current_season_data_df['home_team'] = current_season_data_df['home_team'].apply(lambda x: remove_accents(x).decode('utf-8'))
current_season_data_df['away_team'] = current_season_data_df['away_team'].apply(lambda x: remove_accents(x).decode('utf-8'))

# True when the home team won.
current_season_data_df['won_game'] = current_season_data_df['home_team_score'] > current_season_data_df['away_team_score']

# Current ratings, used later when predicting upcoming games.
current_elo_ratings = calculate_current_elo(current_season_data_df)

games_with_elo = pd.DataFrame()

all_games_df = pd.DataFrame(all_games_data)

all_games_df['won_game'] = all_games_df['home_team_score'] > all_games_df['away_team_score']

all_games_df['year'] = pd.to_datetime(all_games_df['game_date']).dt.year

# Initialize an empty list to store dataframes for each year
dfs = []

# Initialize Elo ratings for each team before looping through the years (to ensure carry-over between seasons)
ratings = {}
for team in np.union1d(all_games_df.home_team.unique(), all_games_df.away_team.unique()):
    ratings[team] = GlickoCompetitor()

# First, sort all_games_df by game_date so ratings are replayed in order
all_games_df = all_games_df.sort_values(by='game_date').reset_index(drop=True)

home_team_elo = []
away_team_elo = []

# Loop through each game and update Elo ratings
for i, r in all_games_df.iterrows():
    # Record the PRE-game Elo ratings for both teams, so the feature does
    # not leak the game's own outcome.
    home_team_elo.append(ratings[r.home_team].rating)
    away_team_elo.append(ratings[r.away_team].rating)
   
    # Update ratings based on game outcome
    if r['won_game']:
        ratings[r.home_team].beat(ratings[r.away_team])
    else:
        ratings[r.away_team].beat(ratings[r.home_team])

# Add the recorded Elo ratings to the dataset
all_games_df['home_team_elo'] = home_team_elo
all_games_df['away_team_elo'] = away_team_elo

games_with_elo = all_games_df.drop('won_game', axis=1)

# Load and preprocess the player performance (GAR) data
gar_url = "https://raw.githubusercontent.com/NeilPaine538/NHL-Player-And-Team-Ratings/master/nhl_gar_historical.csv"
response = requests.get(gar_url)
csv_content = response.content.decode('ISO-8859-1')
gar_df = pd.read_csv(io.StringIO(csv_content))
gar_df = gar_df[(gar_df['year_ID'] >= start_season) & (gar_df['year_ID'] <= end_season)]
gar_df['team_ID'] = gar_df['team_ID'].replace(team_abbr_to_name)

# Extract the year from the game_date column
games_with_elo['year'] = pd.to_datetime(games_with_elo['game_date']).dt.year

dates = games_with_elo['game_date']
# One-time historical odds scrape, cached to dataset3.csv (kept for reference):
##dfs = []
##
##for date in dates:
##    print(date)
##    odds_data = fetch_odds_data(date,False)
##    if odds_data is not None:
##        dfs.append(odds_data)
##
##final_df = pd.concat(dfs, ignore_index=True)
##
##final_df = final_df.drop_duplicates()
##final_df = final_df.reset_index(drop=True)
##
##final_df.to_csv('dataset3.csv', index=False)

# Cached output of the scrape above.
odds_shark = pd.read_csv('dataset3.csv')

##odds_shark.drop('Arena', axis=1, inplace=True)
odds_shark= odds_shark.drop_duplicates()
games_df = odds_shark
  539.  
  540. # Function to convert moneyline to implied probability
  541. def moneyline_to_proba(moneyline):
  542.     if moneyline < 0:
  543.         return -moneyline / (-moneyline + 100)
  544.     else:
  545.         return 100 / (moneyline + 100)
  546.  
# Convert the odds columns to implied probabilities
cols_to_convert = ['Home MoneyLine', 'Home Spread Price', 'Away MoneyLine', 'Away Spread Price', 'Under Price', 'Over Price']

for col in cols_to_convert:
    odds_shark[col] = odds_shark [col].apply(moneyline_to_proba)

# Bookmaker overround: for real odds the implied probabilities sum above 1,
# so this quantity is typically negative.
odds_shark['vegas_cut'] = 1 - (odds_shark['Home MoneyLine'] + odds_shark['Away MoneyLine'])

# Debug dump of the filtered GAR data.
gar_df.to_csv('test1.csv', index=False)

# Season-level mean of each GAR metric per team.
agg_stats_by_team_and_year = gar_df.groupby(['team_ID', 'year_ID']).agg({
    'OPS': 'mean',
    'DPS': 'mean',
    'GPS': 'mean',
    'PS': 'mean',
    'adj_OGAR': 'mean',
    'adj_DGAR': 'mean',
    'adj_GGAR': 'mean',
    'adj_GAR': 'mean'
}).reset_index()

# Attach team aggregates from the home side and away side separately, then
# stack the two views (each game contributes up to two rows).
home_team_performance = games_with_elo.merge(agg_stats_by_team_and_year, left_on=['home_team'], right_on=['team_ID'])
away_team_performance = games_with_elo.merge(agg_stats_by_team_and_year, left_on=['away_team'], right_on=['team_ID'])

combined_team_performance = pd.concat([home_team_performance, away_team_performance])

combined_team_performance = combined_team_performance.drop_duplicates()
odds_shark['game_date'] = pd.to_datetime(odds_shark['Date']).dt.date

# Inner-join the odds onto the games by (date, home, away); this also
# restores the odds columns (won_game, prices, votes, totals).
combined_team_performance = combined_team_performance.merge(odds_shark,
                                                            left_on=['game_date', 'home_team', 'away_team'],
                                                            right_on=['game_date', 'Home Name', 'Away Name'])



# 1 when this row's aggregate stats belong to the home team, else 0.
combined_team_performance['is_home_team'] = (combined_team_performance['home_team'] == combined_team_performance['team_ID']).astype(int)

# Chronological order so the train/test split below is a time split.
combined_team_performance = combined_team_performance.sort_values(by='game_date')

### Identify duplicates for checking
duplicates = combined_team_performance[combined_team_performance.duplicated(subset=['game_date', 'home_team', 'away_team'], keep=False)]
##
# Resolve duplicates (e.g., by keeping the first occurrence)
# NOTE(review): this keeps only one of the home/away stat rows per game —
# confirm that discarding the second view is intended.
combined_team_performance = combined_team_performance.drop_duplicates(subset=['game_date', 'home_team', 'away_team'], keep='first')


combined_team_performance.to_csv('test.csv', index=False)

# Features for the winner classifier. ('Homes Spread' matches the column
# name produced by extract_team_data.)
feature_cols = [
    'OPS', 'DPS', 'GPS', 'PS', 'adj_OGAR',
    'adj_DGAR', 'adj_GGAR', 'adj_GAR', 'Home MoneyLine',
    'Away MoneyLine','Home Spread Price', 'Away Spread Price','Homes Spread', 'Away Spread','Home Votes', 'Away Votes','home_team_elo', 'away_team_elo', 'is_home_team', 'vegas_cut'
]
target_col = 'won_game'

# Splitting data first: 90/10 time-ordered split (no shuffling), so the
# test set is strictly later games.
split_idx = int(0.9 * combined_team_performance.shape[0])
train_data = combined_team_performance.iloc[:split_idx]
test_data = combined_team_performance.iloc[split_idx:]

# Fit imputation on train_data only, to avoid leaking test statistics.
imputer = SimpleImputer()
X_train = train_data[feature_cols]
y_train = train_data[target_col]
X_train_imputed = imputer.fit_transform(X_train)

# Apply the same (train-fitted) imputation transformation to test_data
X_test = test_data[feature_cols]
y_test = test_data[target_col]
X_test_imputed = imputer.transform(X_test)

# Model training: XGBoost classifier for home-win probability.
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators = 100)
model.fit(X_train_imputed, y_train)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement