Advertisement
samipote

Untitled

May 1st, 2024
745
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 11.11 KB | None | 0 0
  1. import pandas as pd
  2. import numpy as np
  3. import warnings
  4. from sklearn.model_selection import train_test_split
  5. from sklearn.metrics import accuracy_score, log_loss
  6. from sklearn.linear_model import LogisticRegression
  7. from sklearn.ensemble import GradientBoostingClassifier
  8. from sklearn.preprocessing import StandardScaler
  9. from xgboost import XGBClassifier, DMatrix, train
  10. from sklearn.linear_model import SGDClassifier
  11. from sklearn.calibration import CalibratedClassifierCV
  12. from sklearn.ensemble import VotingClassifier
  13.  
  14. import datetime
  15. warnings.filterwarnings("ignore")
  16.  
  17. # Define today's date
  18. today = datetime.date.today()
  19.  
  20. # Calculate yesterday by subtracting one day
  21. yesterday = today - datetime.timedelta(days=1)
  22.  
  23. # Calculate tomorrow by adding one day
  24. tomorrow = today + datetime.timedelta(days=1)
  25.  
  26. def calculate_recent_performance(df, games=10):
  27.     """
  28.    Calculate the team's recent performance metrics for home and away teams efficiently, including current winning
  29.    and losing streaks.
  30.    :param df: DataFrame containing the game data.
  31.    :param games: Number of recent games to calculate metrics for.
  32.    :return: DataFrame with calculated metrics for both home and away teams.
  33.    """
  34.     # Prepare DataFrame for rolling calculations
  35.     # Calculate for home games
  36.     home_df = df.copy()
  37.     home_df['home_wins'] = home_df['home_win']
  38.     home_df['home_goals_scored'] = home_df['score_home']
  39.     home_df['home_goals_conceded'] = home_df['score_away']
  40.    
  41.     home_df.sort_values(['home_team', 'game_date'], inplace=True)
  42.  
  43.     # Rolling calculate win rate, goals scored, and goals conceded for home team
  44.     home_df['home_recent_win_rate'] = home_df.groupby('home_team')['home_wins'].transform(lambda x: x.rolling(window=games, min_periods=1).mean())
  45.     home_df['home_avg_goals_scored'] = home_df.groupby('home_team')['home_goals_scored'].transform(lambda x: x.rolling(window=games, min_periods=1).mean())
  46.     home_df['home_avg_goals_conceded'] = home_df.groupby('home_team')['home_goals_conceded'].transform(lambda x: x.rolling(window=games, min_periods=1).mean())
  47.  
  48.     # Calculate current winning and losing streak for home teams
  49.     home_df['home_winning_streak'] = home_df.groupby('home_team')['home_wins'].transform(lambda x: x.groupby((x != x.shift()).cumsum()).cumcount() + 1)
  50.     home_df['home_losing_streak'] = home_df.groupby('home_team')['home_wins'].transform(lambda x: (1 - x).groupby((x != x.shift()).cumsum()).cumcount() + 1)
  51.  
  52.     # Calculate for away games
  53.     away_df = df.copy()
  54.     away_df['away_wins'] = away_df['home_win'].apply(lambda x: 1 if x == 0 else 0)  # Invert home_win for away team perspective
  55.     away_df['away_goals_scored'] = away_df['score_away']
  56.     away_df['away_goals_conceded'] = away_df['score_home']
  57.    
  58.     away_df.sort_values(['away_team', 'game_date'], inplace=True)
  59.  
  60.     # Rolling calculate win rate, goals scored, and goals conceded for away team
  61.     away_df['away_recent_win_rate'] = away_df.groupby('away_team')['away_wins'].transform(lambda x: x.rolling(window=games, min_periods=1).mean())
  62.     away_df['away_avg_goals_scored'] = away_df.groupby('away_team')['away_goals_scored'].transform(lambda x: x.rolling(window=games, min_periods=1).mean())
  63.     away_df['away_avg_goals_conceded'] = away_df.groupby('away_team')['away_goals_conceded'].transform(lambda x: x.rolling(window=games, min_periods=1).mean())
  64.  
  65.     # Calculate current winning and losing streak for away teams
  66.     away_df['away_winning_streak'] = away_df.groupby('away_team')['away_wins'].transform(lambda x: x.groupby((x != x.shift()).cumsum()).cumcount() + 1)
  67.     away_df['away_losing_streak'] = away_df.groupby('away_team')['away_wins'].transform(lambda x: (1 - x).groupby((x != x.shift()).cumsum()).cumcount() + 1)
  68.  
  69.     # Merge the metrics back to the original dataframe
  70.     df = df.merge(home_df[['game_date', 'home_team', 'home_recent_win_rate', 'home_avg_goals_scored', 'home_avg_goals_conceded', 'home_winning_streak', 'home_losing_streak']], on=['game_date', 'home_team'], how='left')
  71.     df = df.merge(away_df[['game_date', 'away_team', 'away_recent_win_rate', 'away_avg_goals_scored', 'away_avg_goals_conceded', 'away_winning_streak', 'away_losing_streak']], on=['game_date', 'away_team'], how='left')
  72.  
  73.     return df
  74.  
  75.  
  76. # Load NHL Elo ratings data
  77. elo_url = 'https://raw.githubusercontent.com/Neil-Paine-1/NHL-Player-And-Team-Ratings/master/nhl_elo.csv'
  78. elo_df = pd.read_csv(elo_url).drop_duplicates()
  79.  
  80.  
  81. # Preprocess data to assign team names, ratings, probabilities, and scores
  82. is_home = elo_df['is_home'] == 1
  83. elo_df['home_team'] = np.where(is_home, elo_df['team1'], elo_df['team2'])
  84. elo_df['away_team'] = np.where(is_home, elo_df['team2'], elo_df['team1'])
  85. elo_df['home_team_elo'] = np.where(is_home, elo_df['elo1_pre'], elo_df['elo2_pre'])
  86. elo_df['away_team_elo'] = np.where(is_home, elo_df['elo2_pre'], elo_df['elo1_pre'])
  87. elo_df['home_team_prob'] = np.where(is_home, elo_df['prob1'], elo_df['prob2'])
  88. elo_df['away_team_prob'] = np.where(is_home, elo_df['prob2'], elo_df['prob1'])
  89. elo_df['home_team_pts'] = np.where(is_home, elo_df['exp_pts1'], elo_df['exp_pts2'])
  90. elo_df['away_team_pts'] = np.where(is_home, elo_df['exp_pts2'], elo_df['exp_pts1'])
  91. elo_df['score_home'] = np.where(is_home, elo_df['score1'], elo_df['score2'])
  92. elo_df['score_away'] = np.where(is_home, elo_df['score2'], elo_df['score1'])
  93. elo_df['elo_diff'] = elo_df['home_team_elo'] - elo_df['away_team_elo']
  94. elo_df['home_win'] = (elo_df['score_home'] > elo_df['score_away']).astype(int)
  95. elo_df.rename(columns={'date': 'game_date'}, inplace=True)
  96. elo_df['game_date'] = pd.to_datetime(elo_df['game_date'])
  97.  
  98. # Calculate rest days and identify back-to-back games
  99. elo_df.sort_values(['home_team', 'game_date'], inplace=True)
  100. elo_df['previous_home_game'] = elo_df.groupby('home_team')['game_date'].shift(1)
  101. elo_df['rest_days_home'] = (elo_df['game_date'] - elo_df['previous_home_game']).dt.days - 1
  102. elo_df['rest_days_home'].fillna(-1, inplace=True)  # For the first game
  103.  
  104. elo_df.sort_values(['away_team', 'game_date'], inplace=True)
  105. elo_df['previous_away_game'] = elo_df.groupby('away_team')['game_date'].shift(1)
  106. elo_df['rest_days_away'] = (elo_df['game_date'] - elo_df['previous_away_game']).dt.days - 1
  107. elo_df['rest_days_away'].fillna(-1, inplace=True)  # For the first game
  108.  
  109. elo_df['back_to_back_home'] = elo_df['rest_days_home'] == 0
  110. elo_df['back_to_back_away'] = elo_df['rest_days_away'] == 0
  111.  
  112. elo_df.drop_duplicates(subset=['game_date', 'home_team', 'away_team'], keep='first', inplace=True)
  113.  
  114. elo_df.sort_values('game_date', inplace=True)
  115.  
  116. # Apply function to elo_df
  117. elo_df = calculate_recent_performance(elo_df)
  118.  
  119. # Save DataFrame to CSV file
  120. elo_df.to_csv('nhl.csv', index=False)
  121.  
  122. # Define features and target variable for model training
  123. features = ['home_team_elo', 'away_team_elo', 'rest_days_home', 'rest_days_away',
  124.             'back_to_back_home', 'back_to_back_away', 'away_team_prob', 'home_team_prob', 'elo_diff','home_recent_win_rate',
  125.             'away_recent_win_rate','home_avg_goals_scored','home_avg_goals_conceded','away_avg_goals_scored','away_avg_goals_conceded','playoff',
  126.             'home_winning_streak','away_winning_streak','home_losing_streak', 'away_losing_streak','is_home']
  127. X = elo_df[features]
  128. y = elo_df['home_win']
  129.  
  130. # Define a cutoff date for splitting the dataset into training and testing sets
  131. # For example, choosing a date that separates the last 20% of data for testing
  132. cutoff_date = elo_df['game_date'].quantile(0.8, interpolation='nearest')
  133.  
  134. today_2 = pd.Timestamp('now').floor('D')
  135. yesterday_2 = today_2 - pd.Timedelta(days=1)
  136. # Split the data based on the cutoff date
  137. train_df = elo_df[elo_df['game_date'] < cutoff_date]
  138. test_df = elo_df[(elo_df['game_date'] >= cutoff_date) & (elo_df['game_date'] < today_2)]
  139.  
  140. # Extract features and target from the training and testing sets
  141. X_train = train_df[features]
  142. y_train = train_df['home_win']
  143. X_test = test_df[features]
  144. y_test = test_df['home_win']
  145.  
  146.  
  147. # Normalize features for Logistic Regression
  148. scaler = StandardScaler()
  149. X_train_scaled = scaler.fit_transform(X_train)
  150. X_test_scaled = scaler.transform(X_test)
  151.  
  152. # Train and evaluate Logistic Regression model
  153. model_lr = LogisticRegression(max_iter=1000)
  154. model_lr.fit(X_train_scaled, y_train)
  155. probabilities_lr = model_lr.predict_proba(X_test_scaled)[:, 1]
  156. accuracy_lr = accuracy_score(y_test, model_lr.predict(X_test_scaled))
  157. log_loss_lr = log_loss(y_test, probabilities_lr)
  158. print(f"Logistic Regression - Accuracy: {accuracy_lr:.4f}, Log Loss: {log_loss_lr:.4f}")
  159.  
  160. # Train and evaluate Gradient Boosting Machine model
  161. model_gbm = GradientBoostingClassifier()
  162. model_gbm.fit(X_train, y_train)
  163. probabilities_gbm = model_gbm.predict_proba(X_test)[:, 1]
  164. accuracy_gbm = accuracy_score(y_test, model_gbm.predict(X_test))
  165. log_loss_gbm = log_loss(y_test, probabilities_gbm)
  166. print(f"Gradient Boosting Machine - Accuracy: {accuracy_gbm:.4f}, Log Loss: {log_loss_gbm:.4f}")
  167.  
  168. # Train and evaluate XGBoost model
  169. # Initialize the XGBClassifier
  170. model_xgb = XGBClassifier(n_estimators=25, use_label_encoder=False, eval_metric='logloss')
  171.  
  172. # Train the model using the high-level API which is compatible with scikit-learn
  173. model_xgb.fit(X_train, y_train)
  174.  
  175. # Predict probabilities and class labels for the test set
  176. probabilities_xgb = model_xgb.predict_proba(X_test)[:, 1]
  177. predictions_xgb = (probabilities_xgb >= 0.5).astype(int)
  178.  
  179. # Calculate accuracy and log loss
  180. accuracy_xgb = accuracy_score(y_test, predictions_xgb)
  181. log_loss_xgb = log_loss(y_test, probabilities_xgb)
  182.  
  183. # Print the results
  184. print(f"XGBoost - Accuracy: {accuracy_xgb:.4f}, Log Loss: {log_loss_xgb:.4f}")
  185.  
  186. # Function to predict today's games using the GBM model
  187. def predict_today_games(model, date, games_df):
  188.     # Ensure date formats match
  189.     if not isinstance(date, datetime.date):
  190.         try:
  191.             date = pd.to_datetime(date).date()
  192.         except ValueError as e:
  193.             print(f"Date conversion error: {e}")
  194.             return None
  195.  
  196.     # Filter for games on the specified date
  197.     games_today_df = games_df[games_df['game_date'].dt.date == date].copy()
  198.     if games_today_df.empty:
  199.         print(f"No games on {date}")
  200.         return None
  201.  
  202.     # Predict probabilities for the home team winning
  203.     x_predict = games_today_df[features]
  204.     probabilities = model.predict_proba(x_predict)[:, 1]  # Using predict_proba to get probabilities
  205.  
  206.     games_today_df['predicted_prob_home_win'] = probabilities
  207.  
  208.     # Print predictions with percentage formatting
  209.     print("Predictions for today's games:")
  210.     for index, row in games_today_df.iterrows():
  211.         percentage_prob = row['predicted_prob_home_win'] * 100  # Convert probability to percentage
  212.         print(f"Game Date: {row['game_date'].strftime('%Y-%m-%d')}")
  213.         print(f"Home Team: {row['home_team']}")
  214.         print(f"Away Team: {row['away_team']}")
  215.         print(f"Predicted Probability of Home Win: {percentage_prob:.1f}%")  # Format output as percentage
  216.         print("-" * 30)
  217.  
  218.     return games_today_df[['game_date', 'home_team', 'away_team', 'predicted_prob_home_win']]
  219.  
  220.  
  221.  
  222. # Predict today's games
  223. today_predictions = predict_today_games(model_xgb, today, elo_df)
  224. print(today_predictions)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement