Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import datetime
import warnings

import numpy as np
import pandas as pd
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier, DMatrix, train
# Silence every warning category for the remainder of the run.
warnings.filterwarnings("ignore")

# Calendar anchors derived from the local date at the moment the script runs.
_ONE_DAY = datetime.timedelta(days=1)
today = datetime.date.today()
yesterday = today - _ONE_DAY
tomorrow = today + _ONE_DAY
def _prior_streak(results, outcome):
    """Return, per game, how many consecutive earlier games equalled `outcome`.

    `results` holds one team's results in chronological order. Only games
    strictly before each row are inspected, so a row's own result never
    influences its streak value. A team's first game gets 0.
    """
    # The shift drops the current game; the leading NaN compares as False.
    matched = results.shift(1).eq(outcome)
    running_total = matched.astype(int).cumsum()
    # Running total as of the most recent non-matching game; subtracting it
    # leaves only the length of the current unbroken run.
    total_at_last_break = running_total.where(~matched).ffill().fillna(0)
    return (running_total - total_at_last_break).astype(int)


def _venue_form(df, side, wins, scored, conceded, games):
    """Build the recent-form columns for one side ('home' or 'away').

    Returns a frame with `game_date`, `<side>_team` and the five
    `<side>_*` metric columns, one row per row of `df`.
    """
    team_col = f'{side}_team'
    frame = pd.DataFrame({
        'game_date': df['game_date'],
        team_col: df[team_col],
        'wins': wins,
        'scored': scored,
        'conceded': conceded,
    }).sort_values([team_col, 'game_date'])
    by_team = frame.groupby(team_col)

    def prior_mean(series):
        # Mean over up to `games` previous games, excluding the current one.
        return series.shift(1).rolling(window=games, min_periods=1).mean()

    form = frame[['game_date', team_col]].copy()
    form[f'{side}_recent_win_rate'] = by_team['wins'].transform(prior_mean)
    form[f'{side}_avg_goals_scored'] = by_team['scored'].transform(prior_mean)
    form[f'{side}_avg_goals_conceded'] = by_team['conceded'].transform(prior_mean)
    form[f'{side}_winning_streak'] = by_team['wins'].transform(
        lambda series: _prior_streak(series, 1))
    form[f'{side}_losing_streak'] = by_team['wins'].transform(
        lambda series: _prior_streak(series, 0))
    return form


def calculate_recent_performance(df, games=10):
    """
    Add recent-form features for the home and away team of every game.

    For each row the metrics are computed from that team's earlier games at
    the same venue type only (home games for the home team, away games for
    the away team); the row's own result is excluded to prevent data leakage.

    Added columns, for side in {home, away}:
    `<side>_recent_win_rate`, `<side>_avg_goals_scored`,
    `<side>_avg_goals_conceded` (rolling means over the previous `games`
    games, NaN when there is no earlier game), and `<side>_winning_streak`,
    `<side>_losing_streak` (count of consecutive wins / non-wins immediately
    before the game, 0 when there is none).

    :param df: DataFrame with columns `game_date`, `home_team`, `away_team`,
        `home_win`, `score_home` and `score_away`. The metrics are merged back
        on (`game_date`, team), so each such pair must be unique or the merge
        will duplicate rows.
    :param games: Number of previous games in the rolling window.
    :return: New DataFrame with the metric columns appended; `df` is not modified.
    """
    home_form = _venue_form(
        df, 'home', df['home_win'], df['score_home'], df['score_away'], games)

    # From the away side, any game the home team did not win counts as a win.
    away_wins = (df['home_win'] == 0).astype(int)
    away_form = _venue_form(
        df, 'away', away_wins, df['score_away'], df['score_home'], games)

    # Merge the metrics back onto the original rows.
    df = df.merge(home_form, on=['game_date', 'home_team'], how='left')
    df = df.merge(away_form, on=['game_date', 'away_team'], how='left')
    return df
# Load NHL Elo ratings data (downloaded over the network on every run).
elo_url = 'https://raw.githubusercontent.com/Neil-Paine-1/NHL-Player-And-Team-Ratings/master/nhl_elo.csv'
elo_df = pd.read_csv(elo_url).drop_duplicates()

# Re-express the team1/team2 columns as home/away columns: when is_home == 1
# the "1" columns belong to the home team, otherwise the "2" columns do.
is_home = elo_df['is_home'] == 1
for home_col, away_col, col_1, col_2 in (
    ('home_team', 'away_team', 'team1', 'team2'),
    ('home_team_elo', 'away_team_elo', 'elo1_pre', 'elo2_pre'),
    ('home_team_prob', 'away_team_prob', 'prob1', 'prob2'),
    ('home_team_pts', 'away_team_pts', 'exp_pts1', 'exp_pts2'),
    ('score_home', 'score_away', 'score1', 'score2'),
):
    elo_df[home_col] = np.where(is_home, elo_df[col_1], elo_df[col_2])
    elo_df[away_col] = np.where(is_home, elo_df[col_2], elo_df[col_1])

elo_df['elo_diff'] = elo_df['home_team_elo'] - elo_df['away_team_elo']
# A NaN score compares as False, so games without a result get home_win == 0.
elo_df['home_win'] = (elo_df['score_home'] > elo_df['score_away']).astype(int)
elo_df.rename(columns={'date': 'game_date'}, inplace=True)
elo_df['game_date'] = pd.to_datetime(elo_df['game_date'])

# Collapse repeated listings of the same matchup BEFORE deriving rest days.
# Otherwise a duplicate row is treated as the team's "previous game" on the
# same date and produces a bogus rest value of -1.
elo_df.drop_duplicates(subset=['game_date', 'home_team', 'away_team'], keep='first', inplace=True)

# Calculate rest days and identify back-to-back games.
# NOTE(review): rest is measured from the team's previous HOME game (for the
# home side) or previous AWAY game (for the away side), not from its previous
# game of any kind - confirm this is intended.
for side in ('home', 'away'):
    team_col = f'{side}_team'
    elo_df.sort_values([team_col, 'game_date'], inplace=True)
    elo_df[f'previous_{side}_game'] = elo_df.groupby(team_col)['game_date'].shift(1)
    rest_days = (elo_df['game_date'] - elo_df[f'previous_{side}_game']).dt.days - 1
    # Plain assignment instead of chained in-place fillna; -1 marks a team's
    # first game, which has no previous game to measure from.
    elo_df[f'rest_days_{side}'] = rest_days.fillna(-1)

elo_df['back_to_back_home'] = elo_df['rest_days_home'] == 0
elo_df['back_to_back_away'] = elo_df['rest_days_away'] == 0
elo_df.sort_values('game_date', inplace=True)

# Add rolling form and streak features.
elo_df = calculate_recent_performance(elo_df)

# Save DataFrame to CSV file in the current working directory.
elo_df.to_csv('nhl.csv', index=False)
# Define features and target variable for model training.
features = [
    'home_team_elo', 'away_team_elo', 'rest_days_home', 'rest_days_away',
    'back_to_back_home', 'back_to_back_away', 'away_team_prob', 'home_team_prob',
    'elo_diff', 'home_recent_win_rate', 'away_recent_win_rate',
    'home_avg_goals_scored', 'home_avg_goals_conceded',
    'away_avg_goals_scored', 'away_avg_goals_conceded', 'playoff',
    'home_winning_streak', 'away_winning_streak',
    'home_losing_streak', 'away_losing_streak', 'is_home',
]
X = elo_df[features]
y = elo_df['home_win']

# Chronological split: the game date at the 80th percentile separates the
# training period from the evaluation period.
cutoff_date = elo_df['game_date'].quantile(0.8, interpolation='nearest')
today_2 = pd.Timestamp('now').floor('D')
yesterday_2 = today_2 - pd.Timedelta(days=1)

# Only rows with complete features and a recorded score are usable for
# fitting and scoring: the rolling-form features are NaN for a team's first
# game (LogisticRegression rejects NaN input), and games without scores would
# otherwise be labelled as home losses.
_model_rows = elo_df.dropna(subset=features + ['score_home', 'score_away'])

# Split the data based on the cutoff date; the test set stops before today.
train_df = _model_rows[_model_rows['game_date'] < cutoff_date]
test_df = _model_rows[(_model_rows['game_date'] >= cutoff_date)
                      & (_model_rows['game_date'] < today_2)]

# Extract features and target from the training and testing sets.
X_train = train_df[features]
y_train = train_df['home_win']
X_test = test_df[features]
y_test = test_df['home_win']
# Normalize features for Logistic Regression. The scaler is fitted on the
# training set only and then applied unchanged to the test set.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train and evaluate Logistic Regression model (on the scaled features).
model_lr = LogisticRegression(max_iter=1000)
model_lr.fit(X_train_scaled, y_train)
probabilities_lr = model_lr.predict_proba(X_test_scaled)[:, 1]
predictions_lr = model_lr.predict(X_test_scaled)
accuracy_lr = accuracy_score(y_test, predictions_lr)
log_loss_lr = log_loss(y_test, probabilities_lr)
print(f"Logistic Regression - Accuracy: {accuracy_lr:.4f}, Log Loss: {log_loss_lr:.4f}")

# Train and evaluate Gradient Boosting Machine model (on the raw features).
model_gbm = GradientBoostingClassifier()
model_gbm.fit(X_train, y_train)
probabilities_gbm = model_gbm.predict_proba(X_test)[:, 1]
predictions_gbm = model_gbm.predict(X_test)
accuracy_gbm = accuracy_score(y_test, predictions_gbm)
log_loss_gbm = log_loss(y_test, probabilities_gbm)
print(f"Gradient Boosting Machine - Accuracy: {accuracy_gbm:.4f}, Log Loss: {log_loss_gbm:.4f}")

# Train and evaluate XGBoost model through its scikit-learn compatible API
# (on the raw features).
model_xgb = XGBClassifier(n_estimators=25, use_label_encoder=False, eval_metric='logloss')
model_xgb.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of class 1 (home win); class
# labels are derived here with an explicit 0.5 threshold.
probabilities_xgb = model_xgb.predict_proba(X_test)[:, 1]
predictions_xgb = (probabilities_xgb >= 0.5).astype(int)

# Calculate accuracy and log loss, then report them.
accuracy_xgb = accuracy_score(y_test, predictions_xgb)
log_loss_xgb = log_loss(y_test, probabilities_xgb)
print(f"XGBoost - Accuracy: {accuracy_xgb:.4f}, Log Loss: {log_loss_xgb:.4f}")
# Function to predict the games of a given date with any fitted classifier.
def predict_today_games(model, date, games_df, feature_columns=None):
    """
    Predict home-win probabilities for every game played on `date`.

    The predictions are printed game by game and also returned.

    :param model: Fitted classifier exposing `predict_proba`; column 1 of its
        output is used as the home-win probability.
    :param date: Day to predict. Anything `pd.to_datetime` accepts (string,
        `datetime.date`, `datetime.datetime`, `pd.Timestamp`); any time-of-day
        component is discarded.
    :param games_df: DataFrame with a datetime `game_date` column, `home_team`,
        `away_team` and the feature columns.
    :param feature_columns: Columns passed to the model. Defaults to the
        module-level `features` list.
    :return: DataFrame with `game_date`, `home_team`, `away_team` and
        `predicted_prob_home_win`, or None when the date cannot be parsed or
        no game is scheduled on it.
    """
    # Always reduce the input to a plain calendar date. datetime.datetime and
    # pd.Timestamp are subclasses of datetime.date, so an isinstance check
    # would let them through unconverted and the equality filter below would
    # then never match.
    try:
        date = pd.to_datetime(date).date()
    except (ValueError, TypeError) as e:
        print(f"Date conversion error: {e}")
        return None

    # Filter for games on the specified date.
    games_today_df = games_df[games_df['game_date'].dt.date == date].copy()
    if games_today_df.empty:
        print(f"No games on {date}")
        return None

    # Predict probabilities for the home team winning.
    columns = features if feature_columns is None else feature_columns
    x_predict = games_today_df[columns]
    probabilities = model.predict_proba(x_predict)[:, 1]
    games_today_df['predicted_prob_home_win'] = probabilities

    # Print predictions with percentage formatting.
    print("Predictions for today's games:")
    for index, row in games_today_df.iterrows():
        percentage_prob = row['predicted_prob_home_win'] * 100
        print(f"Game Date: {row['game_date'].strftime('%Y-%m-%d')}")
        print(f"Home Team: {row['home_team']}")
        print(f"Away Team: {row['away_team']}")
        print(f"Predicted Probability of Home Win: {percentage_prob:.1f}%")
        print("-" * 30)

    return games_today_df[['game_date', 'home_team', 'away_team', 'predicted_prob_home_win']]
# Run the XGBoost model on the games dated `yesterday`.
# NOTE(review): the result is named `today_predictions` but the date passed
# is `yesterday` - confirm which day is intended.
today_predictions = predict_today_games(model_xgb, yesterday, elo_df)
print(today_predictions)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement