import datetime
from datetime import timedelta
import io
import re
import unicodedata
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import xgboost as xgb
from bs4 import BeautifulSoup as bs
from elote import EloCompetitor, GlickoCompetitor
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, classification_report, f1_score,
                             log_loss, mean_squared_error, precision_score,
                             recall_score, roc_auc_score)
from sklearn.model_selection import (KFold, RandomizedSearchCV, StratifiedKFold,
                                     TimeSeriesSplit, cross_val_score,
                                     train_test_split)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings("ignore")
def remove_accents(input_str):
    """Strip accents from a string and return plain-ASCII bytes."""
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    only_ascii = nfkd_form.encode('ASCII', 'ignore')
    return only_ascii
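# Example (illustrative): remove_accents('Montréal Canadiens') returns the bytes
# b'Montreal Canadiens'; callers below chain .decode('utf-8') to get a str back.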
# Define a dictionary that maps team abbreviations (including the alternate
# forms used by different data sources) to full names
team_abbr_to_name = {
    'ANA': 'Anaheim Ducks',
    'ARI': 'Arizona Coyotes',
    'BOS': 'Boston Bruins',
    'BUF': 'Buffalo Sabres',
    'CGY': 'Calgary Flames',
    'CAL': 'Calgary Flames',
    'CAR': 'Carolina Hurricanes',
    'CHI': 'Chicago Blackhawks',
    'COL': 'Colorado Avalanche',
    'CBJ': 'Columbus Blue Jackets',
    'CLB': 'Columbus Blue Jackets',
    'DAL': 'Dallas Stars',
    'DET': 'Detroit Red Wings',
    'EDM': 'Edmonton Oilers',
    'FLA': 'Florida Panthers',
    'LAK': 'Los Angeles Kings',
    'LA': 'Los Angeles Kings',
    'MIN': 'Minnesota Wild',
    'MTL': 'Montreal Canadiens',
    'MON': 'Montreal Canadiens',
    'NSH': 'Nashville Predators',
    'NAS': 'Nashville Predators',
    'NJD': 'New Jersey Devils',
    'NJ': 'New Jersey Devils',
    'NYI': 'New York Islanders',
    'NYR': 'New York Rangers',
    'OTT': 'Ottawa Senators',
    'PHI': 'Philadelphia Flyers',
    'PIT': 'Pittsburgh Penguins',
    'SEA': 'Seattle Kraken',
    'SJS': 'San Jose Sharks',
    'SJ': 'San Jose Sharks',
    'STL': 'St. Louis Blues',
    'TBL': 'Tampa Bay Lightning',
    'TB': 'Tampa Bay Lightning',
    'TOR': 'Toronto Maple Leafs',
    'VAN': 'Vancouver Canucks',
    'VGK': 'Vegas Golden Knights',
    'VEG': 'Vegas Golden Knights',
    'WSH': 'Washington Capitals',
    'WAS': 'Washington Capitals',
    'WPG': 'Winnipeg Jets',
    'WIN': 'Winnipeg Jets'
}
# Fetch schedule/results for one season from the NHL Stats API; when `plus` is
# True, also append the 2023-24 season (the season the original hard-coded).
def get_game_data(season, plus):
    games_data = []
    seasons = [season, '20232024'] if plus else [season]
    for s in seasons:
        base_url = f"https://statsapi.web.nhl.com/api/v1/schedule?season={s}"
        data = requests.get(base_url).json()
        for date_entry in data['dates']:
            for game in date_entry['games']:
                games_data.append({
                    'game_id': game['gamePk'],
                    'game_date': datetime.datetime.strptime(date_entry['date'], '%Y-%m-%d').date(),
                    'home_team': game['teams']['home']['team']['name'],
                    'home_team_score': game['teams']['home']['score'],
                    'away_team': game['teams']['away']['team']['name'],
                    'away_team_score': game['teams']['away']['score'],
                })
    return games_data
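# Example usage (commented out; each call makes a live request to the NHL API):
# one_season = get_game_data('20182019', False)   # list of dicts, one per game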
def get_data_for_seasons(start_season, end_season):
    # Note: end_season is exclusive, so (2014, 2023) covers 20142015..20222023.
    all_games_data = []
    for season in range(start_season, end_season):
        season_data = get_game_data(str(season) + str(season + 1), False)
        all_games_data.extend(season_data)
    return all_games_data
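# Example usage (commented out; makes one live API call per season):
# history = get_data_for_seasons(2014, 2023)   # seasons 20142015 .. 20222023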
# Replay a season's results in date order and return each team's current
# rating (Glicko via elote, despite the "elo" naming kept from the original).
def calculate_current_elo(df):
    ratings = {}
    for x in df.home_team.unique():
        ratings[x] = GlickoCompetitor()
    for x in df.away_team.unique():
        ratings[x] = GlickoCompetitor()
    df = df.sort_values(by='game_date').reset_index(drop=True)
    for i, r in df.iterrows():
        # update ratings from the game outcome (won_game is True for a home win)
        if r['won_game']:
            ratings[r.home_team].beat(ratings[r.away_team])
        else:
            ratings[r.away_team].beat(ratings[r.home_team])
    return ratings
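# Illustrative sketch of the elote API this function relies on (assuming only
# the .beat() and .rating members already used above; commented out so the
# script's behavior is unchanged):
# a, b = GlickoCompetitor(), GlickoCompetitor()
# a.beat(b)                   # winner's rating rises, loser's falls
# print(a.rating > b.rating)  # -> True after the update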
# Scrape covers.com matchup pages for `days` consecutive days of moneylines and
# scores, then dump the result to odds_data.csv (no return value).
def get_odds_data(start_date, days):
    dates = [start_date + datetime.timedelta(days=x) for x in range(days)]
    odds_data = []
    for d in dates:
        # get the web page with game data on it
        game_day = d.strftime('%Y-%m-%d')
        url = f'https://www.covers.com/Sports/NHL/Matchups?selectedDate={game_day}'
        resp = requests.get(url)
        # parse the games
        scraped_games = bs(resp.text, 'html.parser').findAll('div', {'class': 'cmg_matchup_game_box'})
        for g in scraped_games:
            game = {}
            game['home_moneyline'] = g['data-game-odd']
            game['date'] = g['data-game-date']
            game['away_team_abbr'] = g['data-away-team-shortname-search']
            game['home_team_abbr'] = g['data-home-team-shortname-search']
            try:
                game['home_score'] = g.find('div', {'class': 'cmg_matchup_list_score_home'}).text.strip()
                game['away_score'] = g.find('div', {'class': 'cmg_matchup_list_score_away'}).text.strip()
            except AttributeError:
                # unplayed games have no score divs, so .find() returns None
                game['home_score'] = ''
                game['away_score'] = ''
            odds_data.append(game)
            if len(odds_data) % 500 == 0:
                # show progress
                print(datetime.datetime.now(), game_day, len(odds_data))
    odds_df = pd.DataFrame(odds_data)
    odds_df.to_csv('odds_data.csv', index=False)
# Expected value of a one-unit bet. Despite its name, `odds_probability` appears
# to expect decimal (European) odds here (an implied probability in [0, 1] would
# always yield a negative value); `model_probability` is a percentage (0-100).
def calculate_bet_value(odds_probability, model_probability):
    return (model_probability / 100) * odds_probability - 1
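# Worked example (illustrative): a 55% model probability at decimal odds of 2.10
# gives 0.55 * 2.10 - 1 = 0.155, a positive-expected-value bet; at odds of 1.70
# it gives 0.55 * 1.70 - 1 = -0.065, a bet to skip.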
def predict_today_games(model, model2, lineup_file, current_elo_ratings, date):
    # Get today's date
    today = date
    # Get the schedule for today's games
    schedule_url = f"https://statsapi.web.nhl.com/api/v1/schedule?date={today}"
    response = requests.get(schedule_url)
    data = response.json()
    # Scrape today's moneylines from covers.com
    url = f'https://www.covers.com/Sports/NHL/Matchups?selectedDate={today}'
    resp = requests.get(url)
    odds_data = []
    # parse the games
    scraped_games = bs(resp.text, 'html.parser').findAll('div', {'class': 'cmg_matchup_game_box'})
    for g in scraped_games:
        game = {}
        game['home_moneyline'] = g['data-game-odd']
        game['date'] = g['data-game-date']
        game['away_team_abbr'] = g['data-away-team-shortname-search']
        game['home_team_abbr'] = g['data-home-team-shortname-search']
        try:
            game['home_score'] = g.find('div', {'class': 'cmg_matchup_list_score_home'}).text.strip()
            game['away_score'] = g.find('div', {'class': 'cmg_matchup_list_score_away'}).text.strip()
        except AttributeError:
            game['home_score'] = ''
            game['away_score'] = ''
        odds_data.append(game)
    odds_df = pd.DataFrame(odds_data)
    odds_df['game_date'] = pd.to_datetime(odds_df['date']).dt.date
    odds_df['home_moneyline'] = odds_df['home_moneyline'].replace('', np.nan)
    odds_df = odds_df.dropna(subset=['home_moneyline'])
    odds_df.home_moneyline = pd.to_numeric(odds_df.home_moneyline)
    # Implied probability from the covers.com moneyline (computed but not used
    # further below); use .loc to avoid pandas chained-assignment warnings
    odds_df['odds_proba'] = np.nan
    odds_df.loc[odds_df.home_moneyline < 0, 'odds_proba'] = -odds_df.home_moneyline / (-odds_df.home_moneyline + 100)
    odds_df.loc[odds_df.home_moneyline > 0, 'odds_proba'] = 100 / (odds_df.home_moneyline + 100)
    odds_df['home_team_abbr'] = odds_df['home_team_abbr'].replace(team_abbr_to_name)
    odds_df['away_team_abbr'] = odds_df['away_team_abbr'].replace(team_abbr_to_name)
    # Pull today's odds from Oddsshark and de-duplicate
    odds_shark = fetch_odds_data(date, True)
    odds_shark.drop('Arena', axis=1, inplace=True)
    odds_shark = odds_shark.drop_duplicates()
    odds_shark = odds_shark.reset_index(drop=True)

    def moneyline_to_proba(moneyline):
        if moneyline < 0:
            return -moneyline / (-moneyline + 100)
        else:
            return 100 / (moneyline + 100)

    # Convert the odds columns to implied probabilities
    cols_to_convert = ['Home MoneyLine', 'Home Spread Price', 'Away MoneyLine',
                       'Away Spread Price', 'Under Price', 'Over Price']
    for col in cols_to_convert:
        odds_shark[col] = odds_shark[col].apply(moneyline_to_proba)
    odds_shark['vegas_cut'] = 1 - (odds_shark['Home MoneyLine'] + odds_shark['Away MoneyLine'])
    # Create the reverse mapping dictionary (full name -> abbreviation)
    team_name_to_abbr = {v: k for k, v in team_abbr_to_name.items()}
    # Loop through the games and predict the winner
    predictions = []
    for game in data['dates'][0]['games']:
        games_data = []
        home_team = remove_accents(game['teams']['home']['team']['name']).decode('utf-8')
        away_team = remove_accents(game['teams']['away']['team']['name']).decode('utf-8')
        game_id = game['gamePk']
        team_data = {
            'game_id': game_id,
            'game_date': today,
            'home_team': home_team,
            'away_team': away_team
        }
        games_data.append(team_data)
        # Convert the full team names to abbreviations
        home_team_abbr = team_name_to_abbr[home_team]
        away_team_abbr = team_name_to_abbr[away_team]
        # Load and preprocess the player performance data (note: refetched on
        # every iteration; this could be hoisted out of the loop)
        gar_url = "https://raw.githubusercontent.com/NeilPaine538/NHL-Player-And-Team-Ratings/master/nhl_gar_historical.csv"
        response = requests.get(gar_url)
        csv_content = response.content.decode('ISO-8859-1')
        gar_df = pd.read_csv(io.StringIO(csv_content))
        gar_df = gar_df[gar_df['year_ID'] == 2023]
        gar_df['team_ID'] = gar_df['team_ID'].replace(team_abbr_to_name)
        # Get the expected lineup for the game, skipping players ruled out ("O")
        game_lineup_data = pd.read_csv(lineup_file)
        home_team_lineup = game_lineup_data[(game_lineup_data['team'] == home_team_abbr) & (game_lineup_data['injury_status'] != "O")]
        away_team_lineup = game_lineup_data[(game_lineup_data['team'] == away_team_abbr) & (game_lineup_data['injury_status'] != "O")]
        # Store the lineup (player full names) for each team
        team_lineups = {}
        team_lineups[home_team] = home_team_lineup['first_name'] + ' ' + home_team_lineup['last_name']
        team_lineups[away_team] = away_team_lineup['first_name'] + ' ' + away_team_lineup['last_name']
        # Restrict the player performance data to the expected lineups (note:
        # these frames are overwritten by the team-level merges below)
        home_team_performance = gar_df[gar_df['team_ID'] == home_team]
        home_team_performance = home_team_performance[home_team_performance['player_name'].isin(team_lineups[home_team])]
        away_team_performance = gar_df[gar_df['team_ID'] == away_team]
        away_team_performance = away_team_performance[away_team_performance['player_name'].isin(team_lineups[away_team])]
        games_df = pd.DataFrame(games_data)
        home_team_elo = current_elo_ratings[home_team].rating
        away_team_elo = current_elo_ratings[away_team].rating
        games_df['home_team_elo'] = home_team_elo
        games_df['away_team_elo'] = away_team_elo
        agg_stats_by_team_and_year = gar_df.groupby(['team_ID', 'year_ID']).agg({
            'OPS': 'mean',
            'DPS': 'mean',
            'GPS': 'mean',
            'PS': 'mean',
            'adj_OGAR': 'mean',
            'adj_DGAR': 'mean',
            'adj_GGAR': 'mean',
            'adj_GAR': 'mean'
        }).reset_index()
        home_team_performance = games_df.merge(agg_stats_by_team_and_year, left_on=['home_team'], right_on=['team_ID'])
        away_team_performance = games_df.merge(agg_stats_by_team_and_year, left_on=['away_team'], right_on=['team_ID'])
        # Combine the team performances and attach the odds features
        combined_team_performance = pd.concat([home_team_performance, away_team_performance])
        combined_team_performance.to_csv('dataset.csv', index=False)
        odds_shark['game_date'] = pd.to_datetime(odds_shark['Date']).dt.date
        combined_team_performance = combined_team_performance.merge(
            odds_shark,
            left_on=['game_date', 'home_team', 'away_team'],
            right_on=['game_date', 'Home Name', 'Away Name'])
        combined_team_performance['is_home_team'] = (combined_team_performance['home_team'] == combined_team_performance['team_ID']).astype(int)
        X = combined_team_performance[['OPS', 'DPS', 'GPS', 'PS', 'adj_OGAR', 'adj_DGAR', 'adj_GGAR', 'adj_GAR',
                                       'Home MoneyLine', 'Away MoneyLine', 'Home Spread Price', 'Away Spread Price',
                                       'Homes Spread', 'Away Spread', 'Home Votes', 'Away Votes',
                                       'home_team_elo', 'away_team_elo', 'is_home_team', 'vegas_cut']]
        X2 = combined_team_performance[['OPS', 'DPS', 'GPS', 'PS', 'adj_OGAR', 'adj_DGAR', 'adj_GGAR', 'adj_GAR',
                                        'Total', 'Under Price', 'Over Price', 'Over Votes', 'Under Votes',
                                        'home_team_elo', 'away_team_elo', 'is_home_team']]
        # Impute missing values in X
        imputer = SimpleImputer()
        X_imputed = imputer.fit_transform(X)
        X2_imputed = imputer.fit_transform(X2)
        # Predict the winner and the total score
        winner_prob = model.predict_proba(X_imputed)
        home_win_prob = round(winner_prob[0][1] * 100, 2)
        away_win_prob = round(winner_prob[0][0] * 100, 2)
        total_score_pred = model2.predict(X2_imputed)[0]
        # Add the prediction to the list of predictions
        predictions.append({
            'home_team': home_team,
            'away_team': away_team,
            'home_win_prob': home_win_prob,
            'away_win_prob': away_win_prob,
            'home_team_lineup': team_lineups[home_team],
            'away_team_lineup': team_lineups[away_team],
            'home_team_elo': home_team_elo,
            'away_team_elo': away_team_elo,
            'totals': total_score_pred,
        })
    return predictions
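# Example usage (commented out; requires a trained win model, a trained totals
# model, and a lineup CSV — `totals_model` and 'lineups.csv' are hypothetical
# names, since neither is created in this paste):
# preds = predict_today_games(model, totals_model, 'lineups.csv',
#                             current_elo_ratings, datetime.date.today())
# for p in preds:
#     print(p['home_team'], p['home_win_prob'], '-', p['away_team'], p['away_win_prob'])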
# Flatten the Oddsshark JSON payload into one row per game; when `predict` is
# False (historical data), also record the outcome and the combined score.
def extract_team_data(json_data, predict):
    # List to store extracted data
    extracted_data = []
    # Iterate through the scores list
    for game in json_data['scores']:
        game_data = {}
        # Extract home team data
        home_team = game['teams']['home']
        away_team = game['teams']['away']
        game_data['Home Name'] = home_team['names']['name']
        game_data['Home MoneyLine'] = home_team['moneyLine']
        game_data['Home Spread Price'] = home_team['spreadPrice']
        game_data['Home Score'] = home_team['score']
        game_data['Home Votes'] = home_team['votes']
        game_data['Homes Spread'] = home_team['spread']  # key kept as-is; downstream code expects 'Homes Spread'
        if not predict:
            game_data['won_game'] = home_team['score'] > away_team['score']
        # Extract away team data
        game_data['Away Name'] = away_team['names']['name']
        game_data['Away MoneyLine'] = away_team['moneyLine']
        game_data['Away Spread Price'] = away_team['spreadPrice']
        game_data['Away Score'] = away_team['score']
        game_data['Away Votes'] = away_team['votes']
        game_data['Away Spread'] = away_team['spread']
        # Extract shared data
        game_data['Under Price'] = game['underPrice']
        game_data['Over Price'] = game['overPrice']
        game_data['Over Votes'] = game['overVotes']
        game_data['Under Votes'] = game['underVotes']
        game_data['Total'] = game['total']
        if not predict:
            game_data['Totals'] = home_team['score'] + away_team['score']
        game_data['Arena'] = game['stadium']
        extracted_data.append(game_data)
    # Convert the list of dictionaries to a pandas DataFrame
    df = pd.DataFrame(extracted_data)
    return df
# Fetch one date's NHL scores/odds from the Oddsshark JSON API, sending
# browser-like headers so the endpoint serves the request.
def fetch_odds_data(date, predict):
    base_url = f"https://www.oddsshark.com/api/scores/nhl/{date}?_format=json"
    headers = {
        'Accept': 'application/json, text/plain, */*',
        'Referer': 'https://www.oddsshark.com/nhl/scores',
        'Sec-Ch-Ua': '"Chromium";v="118", "Microsoft Edge";v="118", "Not=A?Brand";v="99"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.44'
    }
    response = requests.get(base_url, headers=headers)
    if response.status_code == 200:
        data = response.json()
        df = extract_team_data(data, predict)
        df['Date'] = date
        return df
    else:
        print(f"Failed to fetch data for date: {date}")
        return None
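# Example usage (commented out; makes a live request): fetch one historical day
# and inspect the flattened odds table.
# sample = fetch_odds_data('2023-01-01', False)
# print(sample[['Home Name', 'Away Name', 'Home MoneyLine', 'won_game']].head())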
today = datetime.date.today()
yesterday = today - datetime.timedelta(days=1)
tomorrow = today + datetime.timedelta(days=1)
year = 20232024
start_date = datetime.date(2014, 10, 12)
end_date = today
days = (end_date - start_date).days
dates = [start_date + datetime.timedelta(days=x) for x in range(days)]

# Call the function with the desired season range
start_season = 2014
end_season = 2023
current_season = 20222023
all_games_data = get_data_for_seasons(start_season, end_season)
current_season_data = get_game_data(current_season, True)
current_season_data_df = pd.DataFrame(current_season_data)
current_season_data_df['home_team'] = current_season_data_df['home_team'].apply(lambda x: remove_accents(x).decode('utf-8'))
current_season_data_df['away_team'] = current_season_data_df['away_team'].apply(lambda x: remove_accents(x).decode('utf-8'))
current_season_data_df['won_game'] = current_season_data_df['home_team_score'] > current_season_data_df['away_team_score']
current_elo_ratings = calculate_current_elo(current_season_data_df)
games_with_elo = pd.DataFrame()
all_games_df = pd.DataFrame(all_games_data)
all_games_df['won_game'] = all_games_df['home_team_score'] > all_games_df['away_team_score']
all_games_df['year'] = pd.to_datetime(all_games_df['game_date']).dt.year
# Initialize an empty list to store dataframes for each year
dfs = []
# Initialize ratings for every team before looping through the years
# (Glicko via elote; a single dict ensures carry-over between seasons)
ratings = {}
for team in np.union1d(all_games_df.home_team.unique(), all_games_df.away_team.unique()):
    ratings[team] = GlickoCompetitor()
# First, sort all_games_df by game_date
all_games_df = all_games_df.sort_values(by='game_date').reset_index(drop=True)
home_team_elo = []
away_team_elo = []
# Loop through each game and update ratings
for i, r in all_games_df.iterrows():
    # Record the pre-game ratings for both teams (so the feature never
    # includes the outcome of the game itself)
    home_team_elo.append(ratings[r.home_team].rating)
    away_team_elo.append(ratings[r.away_team].rating)
    # Update ratings based on game outcome
    if r['won_game']:
        ratings[r.home_team].beat(ratings[r.away_team])
    else:
        ratings[r.away_team].beat(ratings[r.home_team])
# Add the recorded pre-game ratings to the dataset
all_games_df['home_team_elo'] = home_team_elo
all_games_df['away_team_elo'] = away_team_elo
games_with_elo = all_games_df.drop('won_game', axis=1)
# Load and preprocess the player performance data (Neil Paine's historical GAR)
gar_url = "https://raw.githubusercontent.com/NeilPaine538/NHL-Player-And-Team-Ratings/master/nhl_gar_historical.csv"
response = requests.get(gar_url)
csv_content = response.content.decode('ISO-8859-1')
gar_df = pd.read_csv(io.StringIO(csv_content))
gar_df = gar_df[(gar_df['year_ID'] >= start_season) & (gar_df['year_ID'] <= end_season)]
gar_df['team_ID'] = gar_df['team_ID'].replace(team_abbr_to_name)
# Extract the year from the game_date column
games_with_elo['year'] = pd.to_datetime(games_with_elo['game_date']).dt.year
dates = games_with_elo['game_date']
# One-time scrape of historical Oddsshark data (kept commented out; the result
# is cached in dataset3.csv and loaded below):
##dfs = []
##
##for date in dates:
##    print(date)
##    odds_data = fetch_odds_data(date, False)
##    if odds_data is not None:
##        dfs.append(odds_data)
##
##final_df = pd.concat(dfs, ignore_index=True)
##
##final_df = final_df.drop_duplicates()
##final_df = final_df.reset_index(drop=True)
##
##final_df.to_csv('dataset3.csv', index=False)
odds_shark = pd.read_csv('dataset3.csv')
##odds_shark.drop('Arena', axis=1, inplace=True)
odds_shark = odds_shark.drop_duplicates()
games_df = odds_shark
# Function to convert an American moneyline to its implied probability
def moneyline_to_proba(moneyline):
    if moneyline < 0:
        return -moneyline / (-moneyline + 100)
    else:
        return 100 / (moneyline + 100)
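# Worked example (illustrative): moneyline_to_proba(-150) = 150 / 250 = 0.60,
# while moneyline_to_proba(130) = 100 / 230 ≈ 0.4348. Favorites (negative
# lines) therefore map above 0.5 and underdogs below it.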
# Convert the odds columns to implied probabilities
cols_to_convert = ['Home MoneyLine', 'Home Spread Price', 'Away MoneyLine', 'Away Spread Price', 'Under Price', 'Over Price']
for col in cols_to_convert:
    odds_shark[col] = odds_shark[col].apply(moneyline_to_proba)
# Bookmaker margin feature; note this sign convention makes the value negative,
# since the two implied probabilities sum to more than 1
odds_shark['vegas_cut'] = 1 - (odds_shark['Home MoneyLine'] + odds_shark['Away MoneyLine'])
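# Worked example (illustrative): with both sides at -110, each implied
# probability is 110 / 210 ≈ 0.5238, so vegas_cut = 1 - 1.0476 ≈ -0.0476,
# i.e. the negative of the bookmaker's ~4.8% overround.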
gar_df.to_csv('test1.csv', index=False)
# Aggregate player-level GAR stats to team-season means
agg_stats_by_team_and_year = gar_df.groupby(['team_ID', 'year_ID']).agg({
    'OPS': 'mean',
    'DPS': 'mean',
    'GPS': 'mean',
    'PS': 'mean',
    'adj_OGAR': 'mean',
    'adj_DGAR': 'mean',
    'adj_GGAR': 'mean',
    'adj_GAR': 'mean'
}).reset_index()
home_team_performance = games_with_elo.merge(agg_stats_by_team_and_year, left_on=['home_team'], right_on=['team_ID'])
away_team_performance = games_with_elo.merge(agg_stats_by_team_and_year, left_on=['away_team'], right_on=['team_ID'])
combined_team_performance = pd.concat([home_team_performance, away_team_performance])
combined_team_performance = combined_team_performance.drop_duplicates()
odds_shark['game_date'] = pd.to_datetime(odds_shark['Date']).dt.date
combined_team_performance = combined_team_performance.merge(odds_shark,
                                                            left_on=['game_date', 'home_team', 'away_team'],
                                                            right_on=['game_date', 'Home Name', 'Away Name'])
combined_team_performance['is_home_team'] = (combined_team_performance['home_team'] == combined_team_performance['team_ID']).astype(int)
combined_team_performance = combined_team_performance.sort_values(by='game_date')
# Identify duplicates for checking
duplicates = combined_team_performance[combined_team_performance.duplicated(subset=['game_date', 'home_team', 'away_team'], keep=False)]
# Resolve duplicates by keeping the first occurrence
combined_team_performance = combined_team_performance.drop_duplicates(subset=['game_date', 'home_team', 'away_team'], keep='first')
combined_team_performance.to_csv('test.csv', index=False)
feature_cols = [
    'OPS', 'DPS', 'GPS', 'PS', 'adj_OGAR', 'adj_DGAR', 'adj_GGAR', 'adj_GAR',
    'Home MoneyLine', 'Away MoneyLine', 'Home Spread Price', 'Away Spread Price',
    'Homes Spread', 'Away Spread', 'Home Votes', 'Away Votes',
    'home_team_elo', 'away_team_elo', 'is_home_team', 'vegas_cut'
]
target_col = 'won_game'
# Time-ordered 90/10 train/test split (the data is already sorted by game_date)
split_idx = int(0.9 * combined_team_performance.shape[0])
train_data = combined_team_performance.iloc[:split_idx]
test_data = combined_team_performance.iloc[split_idx:]
# Fit the imputer on train_data only...
imputer = SimpleImputer()
X_train = train_data[feature_cols]
y_train = train_data[target_col]
X_train_imputed = imputer.fit_transform(X_train)
# ...and apply the same imputation transformation to test_data, avoiding leakage
X_test = test_data[feature_cols]
y_test = test_data[target_col]
X_test_imputed = imputer.transform(X_test)
# Model training
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=100)
model.fit(X_train_imputed, y_train)
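# Evaluation sketch (added for illustration; not part of the original paste):
# score the held-out 10% with metrics already imported above.
test_pred = model.predict(X_test_imputed)
test_proba = model.predict_proba(X_test_imputed)[:, 1]
print('test accuracy:', accuracy_score(y_test, test_pred))
print('test log loss:', log_loss(y_test, test_proba))
print('test ROC AUC :', roc_auc_score(y_test, test_proba))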