Advertisement
samipote

Untitled

Oct 17th, 2023
638
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 24.96 KB | None | 0 0
  1. import pandas as pd
  2. from sklearn.model_selection import train_test_split , KFold , TimeSeriesSplit
  3. from sklearn.linear_model import LogisticRegression
  4. from sklearn.metrics import accuracy_score, classification_report
  5. from sklearn.impute import SimpleImputer
  6. from sklearn.pipeline import make_pipeline
  7. import datetime
  8. import requests
  9. import re
  10. import io
  11. import numpy as np
  12. from sklearn.svm import SVC
  13. import xgboost as xgb
  14. import unicodedata
  15. from bs4 import BeautifulSoup as bs
  16. from sklearn.metrics import accuracy_score
  17. from elote import EloCompetitor
  18. from sklearn.ensemble import RandomForestClassifier
  19. from sklearn.model_selection import RandomizedSearchCV , StratifiedKFold
  20. from elote import GlickoCompetitor
  21. import warnings
  22. from sklearn.preprocessing import StandardScaler
  23. from sklearn.tree import DecisionTreeClassifier
  24. from sklearn.ensemble import GradientBoostingClassifier
  25. from sklearn.naive_bayes import GaussianNB
  26. from sklearn.neighbors import KNeighborsClassifier
  27. from sklearn.svm import LinearSVC
  28. from sklearn.calibration import CalibratedClassifierCV
  29. from sklearn.model_selection import cross_val_score
  30. from sklearn.metrics import log_loss
  31. from datetime import timedelta
  32. from sklearn.metrics import mean_squared_error
  33. import matplotlib.pyplot as plt
  34. from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
  35.  
# Silence sklearn/pandas deprecation and chained-assignment warnings so the
# console output stays readable during long training runs.
warnings.filterwarnings("ignore")
  37.  
  38.  
  39. def remove_accents(input_str):
  40.     nfkd_form = unicodedata.normalize('NFKD', input_str)
  41.     only_ascii = nfkd_form.encode('ASCII', 'ignore')
  42.     return only_ascii
  43.  
  44.  
# Map team abbreviations to full franchise names. Several franchises appear
# under more than one abbreviation because the data sources (NHL API,
# covers.com, oddsshark) do not agree on short codes (e.g. CGY/CAL, WSH/WAS).
team_abbr_to_name = {
    'ANA': 'Anaheim Ducks',
    'ARI': 'Arizona Coyotes',
    'BOS': 'Boston Bruins',
    'BUF': 'Buffalo Sabres',
    'CGY': 'Calgary Flames',
    'CAL': 'Calgary Flames',
    'CAR': 'Carolina Hurricanes',
    'CHI': 'Chicago Blackhawks',
    'COL': 'Colorado Avalanche',
    'CBJ': 'Columbus Blue Jackets',
    'CLB':'Columbus Blue Jackets',
    'DAL': 'Dallas Stars',
    'DET': 'Detroit Red Wings',
    'EDM': 'Edmonton Oilers',
    'FLA': 'Florida Panthers',
    'LAK': 'Los Angeles Kings',
    'LA': 'Los Angeles Kings',
    'MIN': 'Minnesota Wild',
    'MTL': 'Montreal Canadiens',
    'MON': 'Montreal Canadiens',
    'NSH': 'Nashville Predators',
    'NAS': 'Nashville Predators',
    'NJD': 'New Jersey Devils',
    'NJ': 'New Jersey Devils',
    'NYI': 'New York Islanders',
    'NYR': 'New York Rangers',
    'OTT': 'Ottawa Senators',
    'PHI': 'Philadelphia Flyers',
    'PIT': 'Pittsburgh Penguins',
    'SEA': 'Seattle Kraken',
    'SJS': 'San Jose Sharks',
    'SJ': 'San Jose Sharks',
    'STL': 'St. Louis Blues',
    'TBL': 'Tampa Bay Lightning',
    'TB': 'Tampa Bay Lightning',
    'TOR': 'Toronto Maple Leafs',
    'VAN': 'Vancouver Canucks',
    'VGK': 'Vegas Golden Knights',
    'VEG': 'Vegas Golden Knights',
    'WSH': 'Washington Capitals',
    'WAS': 'Washington Capitals',
    'WPG': 'Winnipeg Jets',
    'WIN': 'Winnipeg Jets'
}
  91.  
  92. # Define a function to get game data
  93. def get_game_data(season, plus):
  94.     base_url = f"https://statsapi.web.nhl.com/api/v1/schedule?season={season}"
  95.     response = requests.get(base_url)
  96.     data = response.json()
  97.  
  98.     games_data = []
  99.  
  100.     for date_entry in data['dates']:
  101.         for game in date_entry['games']:
  102.             game_id = game['gamePk']
  103.             game_date = datetime.datetime.strptime(date_entry['date'], '%Y-%m-%d').date()
  104.  
  105.             team_data = {
  106.                 'game_id': game_id,
  107.                 'game_date': game_date,
  108.                 'home_team': game['teams']['home']['team']['name'],
  109.                 'home_team_score': game['teams']['home']['score'],
  110.                 'away_team': game['teams']['away']['team']['name'],
  111.                 'away_team_score': game['teams']['away']['score'],
  112.             }
  113.             games_data.append(team_data)
  114.     if plus == True :
  115.         base_url2 = f"https://statsapi.web.nhl.com/api/v1/schedule?season={20232024}"
  116.         response2 = requests.get(base_url2)
  117.         data2 = response2.json()
  118.         for date_entry2 in data2['dates']:
  119.             for game2 in date_entry2['games']:
  120.                 game_id2 = game2['gamePk']
  121.                 game_date2 = datetime.datetime.strptime(date_entry2['date'], '%Y-%m-%d').date()
  122.                 team_data2 = {
  123.                 'game_id': game_id2,
  124.                 'game_date': game_date2,
  125.                 'home_team': game2['teams']['home']['team']['name'],
  126.                 'home_team_score': game2['teams']['home']['score'],
  127.                 'away_team': game2['teams']['away']['team']['name'],
  128.                 'away_team_score': game2['teams']['away']['score'],
  129.                 }
  130.                 games_data.append(team_data2)
  131.     return games_data
  132.  
  133. def get_data_for_seasons(start_season, end_season):
  134.     all_games_data = []
  135.     for season in range(start_season, end_season):
  136.         season_data = get_game_data(str(season) + str(season + 1),False)
  137.         all_games_data.extend(season_data)
  138.     return all_games_data
  139.  
  140. def calculate_current_elo(df):
  141.     from elote import EloCompetitor
  142.     ratings = {}
  143.     for x in df.home_team.unique():
  144.         ratings[x] = GlickoCompetitor()
  145.     for x in df.away_team.unique():
  146.         ratings[x] = GlickoCompetitor()
  147.  
  148.     df = df.sort_values(by='game_date').reset_index(drop=True)
  149.     for i, r in df.iterrows():
  150.         # update ratings
  151.         if r['won_game']:
  152.             ratings[r.home_team].beat(ratings[r.away_team])
  153.         else:
  154.             ratings[r.away_team].beat(ratings[r.home_team])
  155.  
  156.     return ratings
  157.  
  158. def get_odds_data(start_date, days):
  159.     dates = [start_date + datetime.timedelta(days=x) for x in range(days)]
  160.  
  161.     odds_data = []
  162.  
  163.     for d in dates:
  164.         # get the web page with game data on it
  165.         game_day = d.strftime('%Y-%m-%d')
  166.         url = f'https://www.covers.com/Sports/NHL/Matchups?selectedDate={game_day}'
  167.         resp = requests.get(url)
  168.  
  169.         # parse the games
  170.         scraped_games = bs(resp.text, 'html.parser').findAll('div', {'class': 'cmg_matchup_game_box'})
  171.         for g in scraped_games:
  172.             game = {}
  173.             game['home_moneyline'] = g['data-game-odd']
  174.             game['date'] = g['data-game-date']
  175.             game['away_team_abbr'] = g['data-away-team-shortname-search']
  176.             game['home_team_abbr'] = g['data-home-team-shortname-search']
  177.             try:
  178.                 game['home_score'] = g.find('div', {'class': 'cmg_matchup_list_score_home'}).text.strip()
  179.                 game['away_score'] = g.find('div', {'class': 'cmg_matchup_list_score_away'}).text.strip()
  180.             except:
  181.                 game['home_score'] = ''
  182.                 game['away_score'] = ''
  183.  
  184.             odds_data.append(game)
  185.             if len(odds_data) % 500 == 0:
  186.                 # show progress
  187.                 print(datetime.datetime.now(), game_day, len(odds_data))
  188.                 # the actual outcome of the game, true if the the home team won
  189.     odds_df = pd.DataFrame(odds_data)
  190.     odds_df.to_csv('odds_data.csv', index=False)
  191.  
  192. def calculate_bet_value(odds_probability, model_probability):
  193.     return (model_probability / 100) * odds_probability - 1
  194.  
  195.  
def predict_today_games(model,model2, lineup_file, current_elo_ratings, date):
    """Predict win probabilities and totals for every NHL game on *date*.

    Pulls the day's schedule from the NHL API, scrapes covers.com and
    oddsshark for odds, joins those with GAR player stats (filtered by the
    expected lineups in *lineup_file*) and the supplied Elo ratings, then
    runs the winner classifier (*model*) and totals regressor (*model2*).

    Returns a list of per-game dicts: teams, win probabilities (percent),
    lineups, Elo ratings and predicted total score.
    """
    # Get today's date
    today = date

    # Get the schedule for today's games
    schedule_url = f"https://statsapi.web.nhl.com/api/v1/schedule?date={today}"
    response = requests.get(schedule_url)
    data = response.json()
    url = f'https://www.covers.com/Sports/NHL/Matchups?selectedDate={today}'
    resp = requests.get(url)

    odds_data = []
    # parse the games scraped from covers.com
    scraped_games = bs(resp.text, 'html.parser').findAll('div', {'class': 'cmg_matchup_game_box'})
    for g in scraped_games:
        game = {}
        game['home_moneyline'] = g['data-game-odd']
        game['date'] = g['data-game-date']
        game['away_team_abbr'] = g['data-away-team-shortname-search']
        game['home_team_abbr'] = g['data-home-team-shortname-search']
        # Upcoming games have no score elements; the bare except fills blanks.
        # NOTE(review): bare except also hides unrelated errors — narrow it.
        try:
            game['home_score'] = g.find('div', {'class': 'cmg_matchup_list_score_home'}).text.strip()
            game['away_score'] = g.find('div', {'class': 'cmg_matchup_list_score_away'}).text.strip()
        except:
            game['home_score'] = ''
            game['away_score'] = ''

        odds_data.append(game)

    odds_df = pd.DataFrame(odds_data)
    odds_df['game_date'] = pd.to_datetime(odds_df['date']).dt.date
    # Drop games with no posted moneyline, then derive implied probability.
    odds_df['home_moneyline'].replace('', np.nan, inplace=True)
    odds_df.dropna(subset=['home_moneyline'], inplace=True)
    odds_df.home_moneyline = pd.to_numeric(odds_df.home_moneyline)
    odds_df['odds_proba']=np.nan
    # NOTE(review): chained assignment below relies on legacy pandas
    # behavior (SettingWithCopyWarning); prefer .loc[mask, 'odds_proba'].
    odds_df['odds_proba'][odds_df.home_moneyline<0] = -odds_df.home_moneyline/(-odds_df.home_moneyline + 100)
    odds_df['odds_proba'][odds_df.home_moneyline>0] = (100/(odds_df.home_moneyline + 100))
    # Convert abbreviations to full names so they join with the NHL API data.
    odds_df['home_team_abbr'] = odds_df['home_team_abbr'].replace(team_abbr_to_name)
    odds_df['away_team_abbr'] = odds_df['away_team_abbr'].replace(team_abbr_to_name)
   

    # oddsshark odds for the same date, in "predict" mode (no final scores).
    odds_shark = fetch_odds_data(date,True)

    # Drop duplicates (the Arena column alone can differ between rows)
    odds_shark.drop('Arena', axis=1, inplace=True)
    odds_shark= odds_shark.drop_duplicates()

    # Reset the index after dropping duplicates
    odds_shark = odds_shark.reset_index(drop=True)

    def moneyline_to_proba(moneyline):
        # American moneyline -> implied win probability.
        if moneyline < 0:
            return -moneyline / (-moneyline + 100)
        else:
            return 100 / (moneyline + 100)
    # Convert the odds columns to probability
    cols_to_convert = ['Home MoneyLine', 'Home Spread Price', 'Away MoneyLine', 'Away Spread Price', 'Under Price', 'Over Price']

    for col in cols_to_convert:
        odds_shark[col] = odds_shark [col].apply(moneyline_to_proba)

    # Bookmaker overround: implied probabilities sum above 1 for real odds,
    # so this quantity is typically negative.
    odds_shark['vegas_cut'] = 1 - (odds_shark['Home MoneyLine'] + odds_shark['Away MoneyLine'])

    # Create the reverse mapping dictionary (full name -> abbreviation)
    team_name_to_abbr = {v: k for k, v in team_abbr_to_name.items()}

    # Loop through the games and predict the winner
    predictions = []
    for game in data['dates'][0]['games']:

        games_data = []
        home_team = remove_accents(game['teams']['home']['team']['name']).decode('utf-8')
        away_team = remove_accents(game['teams']['away']['team']['name']).decode('utf-8')
        game_id = game['gamePk']

        team_data = {
            'game_id': game_id,
            'game_date': today,
            'home_team': home_team,
            'away_team': away_team
            }
        games_data.append(team_data)

        # Convert the full team names to abbreviations
        home_team_abbr = team_name_to_abbr[home_team]
        away_team_abbr = team_name_to_abbr[away_team]

        # Load and preprocess the player performance (GAR) data.
        # NOTE(review): re-downloaded on every loop iteration — could be
        # hoisted out of the loop.
        gar_url = "https://raw.githubusercontent.com/NeilPaine538/NHL-Player-And-Team-Ratings/master/nhl_gar_historical.csv"
        response = requests.get(gar_url)
        csv_content = response.content.decode('ISO-8859-1')
        gar_df = pd.read_csv(io.StringIO(csv_content))
        gar_df = gar_df[gar_df['year_ID'] == 2023]
        gar_df['team_ID'] = gar_df['team_ID'].replace(team_abbr_to_name)

        # Get the lineup data for the game; players flagged "O" (out)
        # are excluded from the expected lineup.
        game_lineup_data = pd.read_csv(lineup_file)
        home_team_lineup = game_lineup_data[(game_lineup_data['team'] == home_team_abbr) & (game_lineup_data['injury_status'] != "O")]
        away_team_lineup = game_lineup_data[(game_lineup_data['team'] == away_team_abbr) & (game_lineup_data['injury_status'] != "O")]

        # Full player names per team, used to filter the GAR rows.
        team_lineups = {}
        team_lineups[home_team] = home_team_lineup['first_name'] + ' ' + home_team_lineup['last_name']
        team_lineups[away_team] = away_team_lineup['first_name'] + ' ' + away_team_lineup['last_name']

        # Adjust the player performance data based on the expected lineup for each team
        home_team_performance = gar_df[gar_df['team_ID'] == home_team]
        home_team_performance = home_team_performance[home_team_performance['player_name'].isin(team_lineups[home_team])]

        away_team_performance = gar_df[gar_df['team_ID'] == away_team]
        away_team_performance = away_team_performance[away_team_performance['player_name'].isin(team_lineups[away_team])]

        games_df = pd.DataFrame(games_data)
        home_team_elo = current_elo_ratings[home_team].rating
        away_team_elo = current_elo_ratings[away_team].rating
        games_df['home_team_elo'] = home_team_elo
        games_df['away_team_elo'] = away_team_elo

        # Season-level mean of each GAR metric per team.
        agg_stats_by_team_and_year = gar_df.groupby(['team_ID', 'year_ID']).agg({
            'OPS': 'mean',
            'DPS': 'mean',
            'GPS': 'mean',
            'PS': 'mean',
            'adj_OGAR': 'mean',
            'adj_DGAR': 'mean',
            'adj_GGAR': 'mean',
            'adj_GAR': 'mean'
            }).reset_index()
       
        # NOTE(review): these merges overwrite the lineup-filtered frames
        # built above — the lineup filtering does not reach the features.
        home_team_performance = games_df.merge(agg_stats_by_team_and_year, left_on=['home_team'], right_on=['team_ID'])
        away_team_performance = games_df.merge(agg_stats_by_team_and_year, left_on=['away_team'], right_on=['team_ID'])


        # Combine the team performances and select the desired features
        combined_team_performance = pd.concat([home_team_performance, away_team_performance])
        combined_team_performance.to_csv('dataset.csv', index=False)

        # Join with the oddsshark odds on (date, home, away).
        odds_shark['game_date'] = pd.to_datetime(odds_shark['Date']).dt.date
        combined_team_performance = combined_team_performance.merge(odds_shark,
                                                            left_on=['game_date','home_team', 'away_team'],
                                                            right_on=['game_date', 'Home Name', 'Away Name'])
        combined_team_performance['is_home_team'] = (combined_team_performance['home_team'] == combined_team_performance['team_ID']).astype(int)
        # Feature frames: X for the winner model, X2 for the totals model.
        # ('Homes Spread' is the column name as produced by extract_team_data.)
        X = combined_team_performance[['OPS', 'DPS', 'GPS', 'PS', 'adj_OGAR', 'adj_DGAR', 'adj_GGAR', 'adj_GAR', 'Home MoneyLine', 'Away MoneyLine','Home Spread Price', 'Away Spread Price','Homes Spread', 'Away Spread','Home Votes', 'Away Votes','home_team_elo', 'away_team_elo', 'is_home_team', 'vegas_cut']]

        X2 = combined_team_performance[['OPS', 'DPS', 'GPS', 'PS', 'adj_OGAR', 'adj_DGAR', 'adj_GGAR', 'adj_GAR','Total','Under Price','Over Price','Over Votes', 'Under Votes','home_team_elo', 'away_team_elo', 'is_home_team']]

        # Impute missing values in X
        # NOTE(review): imputer is fit on this single game's rows, not on
        # the training distribution — verify this matches training-time
        # preprocessing.
        imputer = SimpleImputer()
        X_imputed = imputer.fit_transform(X)
        X2_imputed = imputer.fit_transform(X2)

        # Predict the winner (class 1 = home win) and the totals.
        winner_prob = model.predict_proba(X_imputed)
        home_win_prob = round(winner_prob[0][1]*100, 2)
        away_win_prob = round(winner_prob[0][0]*100, 2)
        total_score_pred = model2.predict(X2_imputed)[0]


        # Add the prediction to the list of predictions
        predictions.append({
            'home_team': home_team,
            'away_team': away_team,
            'home_win_prob': home_win_prob,
            'away_win_prob': away_win_prob,
            'home_team_lineup': team_lineups[home_team],
            'away_team_lineup': team_lineups[away_team],
            'home_team_elo': home_team_elo,
            'away_team_elo': away_team_elo,
            'totals': total_score_pred,
            })

    return predictions
  369.  
  370. def extract_team_data(json_data,predict):
  371.     # List to store extracted data
  372.     extracted_data = []
  373.    
  374.     # Iterate through the scores list
  375.     for game in json_data['scores']:
  376.         game_data = {}
  377.        
  378.         # Extract home team data
  379.         home_team = game['teams']['home']
  380.         away_team = game['teams']['away']
  381.         game_data['Home Name'] = home_team['names']['name']
  382.         game_data['Home MoneyLine'] = home_team['moneyLine']
  383.         game_data['Home Spread Price'] = home_team['spreadPrice']
  384.         game_data['Home Score'] = home_team['score']
  385.         game_data['Home Votes'] = home_team['votes']
  386.         game_data['Homes Spread'] = home_team['spread']
  387.  
  388.         if predict == False :
  389.             game_data['won_game'] = home_team['score'] > away_team['score']
  390.        
  391.         # Extract away team data
  392.         game_data['Away Name'] = away_team['names']['name']
  393.         game_data['Away MoneyLine'] = away_team['moneyLine']
  394.         game_data['Away Spread Price'] = away_team['spreadPrice']
  395.         game_data['Away Score'] = away_team['score']
  396.         game_data['Away Votes'] = away_team['votes']
  397.         game_data['Away Spread'] = away_team['spread']
  398.        
  399.         # Extract shared data
  400.         game_data['Under Price'] = game['underPrice']
  401.         game_data['Over Price'] = game['overPrice']
  402.         game_data['Over Votes'] = game['overVotes']
  403.         game_data['Under Votes'] = game['underVotes']
  404.         game_data['Total'] = game['total']
  405.         if predict == False :
  406.             game_data['Totals'] = home_team['score'] + away_team['score']
  407.         game_data['Arena'] = game['stadium']
  408.        
  409.         extracted_data.append(game_data)
  410.  
  411.     # Convert the list of dictionaries to a pandas DataFrame
  412.     df = pd.DataFrame(extracted_data)
  413.     return df
  414.  
  415. def fetch_odds_data(date, predict):
  416.     base_url = f"https://www.oddsshark.com/api/scores/nhl/{date}?_format=json"
  417.  
  418.     headers = {
  419.         'Accept': 'application/json, text/plain, */*',
  420.         'Referer': 'https://www.oddsshark.com/nhl/scores',
  421.         'Sec-Ch-Ua': '"Chromium";v="118", "Microsoft Edge";v="118", "Not=A?Brand";v="99"',
  422.         'Sec-Ch-Ua-Mobile': '?0',
  423.         'Sec-Ch-Ua-Platform': '"Windows"',
  424.         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.44'
  425.     }
  426.  
  427.     response = requests.get(base_url, headers=headers)
  428.  
  429.     if response.status_code == 200:
  430.         data = response.json()
  431.         df = extract_team_data(data,predict)
  432.         df['Date'] = date
  433.         return df
  434.     else:
  435.         print(f"Failed to fetch data for date: {date}")
  436.         return None
  437.  
# --- Script body: assemble the historical dataset ---

# Reference dates used throughout the script.
today = datetime.date.today()
yesterday = today - datetime.timedelta(days=1)
tomorrow = today + datetime.timedelta(days=1)
year = 20232024

# Every calendar day from the 2014-15 season opener through today.
start_date = datetime.date(2014, 10, 12)
end_date = today
days = (end_date - start_date).days

dates = [start_date + datetime.timedelta(days=x) for x in range(days)]


# Call the function with the desired season range
start_season = 2014
end_season = 2023
current_season = 20222023
all_games_data = get_data_for_seasons(start_season, end_season)

# Current season plus the hard-coded 2023-24 schedule (plus=True).
current_season_data = get_game_data(current_season, True)

current_season_data_df = pd.DataFrame(current_season_data)

# Normalize accented team names (e.g. "Montréal") to plain ASCII strings.
current_season_data_df['home_team'] = current_season_data_df['home_team'].apply(lambda x: remove_accents(x).decode('utf-8'))
current_season_data_df['away_team'] = current_season_data_df['away_team'].apply(lambda x: remove_accents(x).decode('utf-8'))

# True when the home team won.
current_season_data_df['won_game'] = current_season_data_df['home_team_score'] > current_season_data_df['away_team_score']

# Current ratings, used later when predicting upcoming games.
current_elo_ratings = calculate_current_elo(current_season_data_df)

games_with_elo = pd.DataFrame()

all_games_df = pd.DataFrame(all_games_data)

all_games_df['won_game'] = all_games_df['home_team_score'] > all_games_df['away_team_score']

all_games_df['year'] = pd.to_datetime(all_games_df['game_date']).dt.year

# Initialize an empty list to store dataframes for each year
dfs = []

# Initialize Elo ratings for each team before looping through the years (to ensure carry-over between seasons)
ratings = {}
for team in np.union1d(all_games_df.home_team.unique(), all_games_df.away_team.unique()):
    ratings[team] = GlickoCompetitor()

# First, sort all_games_df by game_date so ratings are replayed in order
all_games_df = all_games_df.sort_values(by='game_date').reset_index(drop=True)

home_team_elo = []
away_team_elo = []

# Loop through each game and update Elo ratings
for i, r in all_games_df.iterrows():
    # Record the PRE-game Elo ratings for both teams, so the feature does
    # not leak the game's own outcome.
    home_team_elo.append(ratings[r.home_team].rating)
    away_team_elo.append(ratings[r.away_team].rating)
   
    # Update ratings based on game outcome
    if r['won_game']:
        ratings[r.home_team].beat(ratings[r.away_team])
    else:
        ratings[r.away_team].beat(ratings[r.home_team])

# Add the recorded Elo ratings to the dataset
all_games_df['home_team_elo'] = home_team_elo
all_games_df['away_team_elo'] = away_team_elo

games_with_elo = all_games_df.drop('won_game', axis=1)

# Load and preprocess the player performance (GAR) data
gar_url = "https://raw.githubusercontent.com/NeilPaine538/NHL-Player-And-Team-Ratings/master/nhl_gar_historical.csv"
response = requests.get(gar_url)
csv_content = response.content.decode('ISO-8859-1')
gar_df = pd.read_csv(io.StringIO(csv_content))
gar_df = gar_df[(gar_df['year_ID'] >= start_season) & (gar_df['year_ID'] <= end_season)]
gar_df['team_ID'] = gar_df['team_ID'].replace(team_abbr_to_name)

# Extract the year from the game_date column
games_with_elo['year'] = pd.to_datetime(games_with_elo['game_date']).dt.year

dates = games_with_elo['game_date']
# One-time historical odds scrape, cached to dataset3.csv (kept for reference):
##dfs = []
##
##for date in dates:
##    print(date)
##    odds_data = fetch_odds_data(date,False)
##    if odds_data is not None:
##        dfs.append(odds_data)
##
##final_df = pd.concat(dfs, ignore_index=True)
##
##final_df = final_df.drop_duplicates()
##final_df = final_df.reset_index(drop=True)
##
##final_df.to_csv('dataset3.csv', index=False)

# Cached output of the scrape above.
odds_shark = pd.read_csv('dataset3.csv')

##odds_shark.drop('Arena', axis=1, inplace=True)
odds_shark= odds_shark.drop_duplicates()
games_df = odds_shark
  539.  
  540. # Function to convert moneyline to implied probability
  541. def moneyline_to_proba(moneyline):
  542.     if moneyline < 0:
  543.         return -moneyline / (-moneyline + 100)
  544.     else:
  545.         return 100 / (moneyline + 100)
  546.  
# Convert the odds columns to implied probabilities
cols_to_convert = ['Home MoneyLine', 'Home Spread Price', 'Away MoneyLine', 'Away Spread Price', 'Under Price', 'Over Price']

for col in cols_to_convert:
    odds_shark[col] = odds_shark [col].apply(moneyline_to_proba)

# Bookmaker overround: for real odds the implied probabilities sum above 1,
# so this quantity is typically negative.
odds_shark['vegas_cut'] = 1 - (odds_shark['Home MoneyLine'] + odds_shark['Away MoneyLine'])

# Debug dump of the filtered GAR data.
gar_df.to_csv('test1.csv', index=False)

# Season-level mean of each GAR metric per team.
agg_stats_by_team_and_year = gar_df.groupby(['team_ID', 'year_ID']).agg({
    'OPS': 'mean',
    'DPS': 'mean',
    'GPS': 'mean',
    'PS': 'mean',
    'adj_OGAR': 'mean',
    'adj_DGAR': 'mean',
    'adj_GGAR': 'mean',
    'adj_GAR': 'mean'
}).reset_index()

# Attach team aggregates from the home side and away side separately, then
# stack the two views (each game contributes up to two rows).
home_team_performance = games_with_elo.merge(agg_stats_by_team_and_year, left_on=['home_team'], right_on=['team_ID'])
away_team_performance = games_with_elo.merge(agg_stats_by_team_and_year, left_on=['away_team'], right_on=['team_ID'])

combined_team_performance = pd.concat([home_team_performance, away_team_performance])

combined_team_performance = combined_team_performance.drop_duplicates()
odds_shark['game_date'] = pd.to_datetime(odds_shark['Date']).dt.date

# Inner-join the odds onto the games by (date, home, away); this also
# restores the odds columns (won_game, prices, votes, totals).
combined_team_performance = combined_team_performance.merge(odds_shark,
                                                            left_on=['game_date', 'home_team', 'away_team'],
                                                            right_on=['game_date', 'Home Name', 'Away Name'])



# 1 when this row's aggregate stats belong to the home team, else 0.
combined_team_performance['is_home_team'] = (combined_team_performance['home_team'] == combined_team_performance['team_ID']).astype(int)

# Chronological order so the train/test split below is a time split.
combined_team_performance = combined_team_performance.sort_values(by='game_date')

### Identify duplicates for checking
duplicates = combined_team_performance[combined_team_performance.duplicated(subset=['game_date', 'home_team', 'away_team'], keep=False)]
##
# Resolve duplicates (e.g., by keeping the first occurrence)
# NOTE(review): this keeps only one of the home/away stat rows per game —
# confirm that discarding the second view is intended.
combined_team_performance = combined_team_performance.drop_duplicates(subset=['game_date', 'home_team', 'away_team'], keep='first')


combined_team_performance.to_csv('test.csv', index=False)

# Features for the winner classifier. ('Homes Spread' matches the column
# name produced by extract_team_data.)
feature_cols = [
    'OPS', 'DPS', 'GPS', 'PS', 'adj_OGAR',
    'adj_DGAR', 'adj_GGAR', 'adj_GAR', 'Home MoneyLine',
    'Away MoneyLine','Home Spread Price', 'Away Spread Price','Homes Spread', 'Away Spread','Home Votes', 'Away Votes','home_team_elo', 'away_team_elo', 'is_home_team', 'vegas_cut'
]
target_col = 'won_game'

# Splitting data first: 90/10 time-ordered split (no shuffling), so the
# test set is strictly later games.
split_idx = int(0.9 * combined_team_performance.shape[0])
train_data = combined_team_performance.iloc[:split_idx]
test_data = combined_team_performance.iloc[split_idx:]

# Fit imputation on train_data only, to avoid leaking test statistics.
imputer = SimpleImputer()
X_train = train_data[feature_cols]
y_train = train_data[target_col]
X_train_imputed = imputer.fit_transform(X_train)

# Apply the same (train-fitted) imputation transformation to test_data
X_test = test_data[feature_cols]
y_test = test_data[target_col]
X_test_imputed = imputer.transform(X_test)

# Model training: XGBoost classifier for home-win probability.
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators = 100)
model.fit(X_train_imputed, y_train)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement