Advertisement
samipote

Untitled

May 1st, 2024
718
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 11.51 KB | None | 0 0
  1. import pandas as pd
  2. import numpy as np
  3. import warnings
  4. from sklearn.model_selection import train_test_split
  5. from sklearn.metrics import accuracy_score, log_loss
  6. from sklearn.linear_model import LogisticRegression
  7. from sklearn.ensemble import GradientBoostingClassifier
  8. from sklearn.preprocessing import StandardScaler
  9. from xgboost import XGBClassifier, DMatrix, train
  10. from sklearn.linear_model import SGDClassifier
  11. from sklearn.calibration import CalibratedClassifierCV
  12. from sklearn.ensemble import VotingClassifier
  13.  
  14. import datetime
  15. warnings.filterwarnings("ignore")
  16.  
  17. # Define today's date
  18. today = datetime.date.today()
  19.  
  20. # Calculate yesterday by subtracting one day
  21. yesterday = today - datetime.timedelta(days=1)
  22.  
  23. # Calculate tomorrow by adding one day
  24. tomorrow = today + datetime.timedelta(days=1)
  25.  
  26. def calculate_recent_performance(df, games=10):
  27.     """
  28.    Calculate the team's recent performance metrics for home and away teams efficiently,[^1^][1]
  29.    excluding the current row to prevent data leakage.
  30.    
  31.    :param df: DataFrame containing the game data.
  32.    :param games: Number of recent games to calculate metrics for[^1^][1].
  33.    :return: DataFrame with calculated metrics for both home and away teams.
  34.    """
  35.     # Prepare DataFrame for rolling calculations
  36.     # Calculate for home games
  37.     home_df = df.copy()
  38.     home_df['home_wins'] = home_df['home_win']
  39.     home_df['home_goals_scored'] = home_df['score_home']
  40.     home_df['home_goals_conceded'] = home_df['score_away']
  41.     home_df.sort_values(['home_team', 'game_date'], inplace=True)
  42.    
  43.     # Rolling calculate win rate, goals scored, and goals conceded for home team
  44.     # Exclude the current row by shifting the window
  45.     home_df['home_recent_win_rate'] = home_df.groupby('home_team')['home_wins']\
  46.         .transform(lambda x: x.shift(1).rolling(window=games, min_periods=1).mean())
  47.     home_df['home_avg_goals_scored'] = home_df.groupby('home_team')['home_goals_scored']\
  48.         .transform(lambda x: x.shift(1).rolling(window=games, min_periods=1).mean())
  49.     home_df['home_avg_goals_conceded'] = home_df.groupby('home_team')['home_goals_conceded']\
  50.         .transform(lambda x: x.shift(1).rolling(window=games, min_periods=1).mean())
  51.    
  52.     # Calculate current winning and losing streak for home teams
  53.     # Exclude the current row by using shift
  54.     home_df['home_winning_streak'] = home_df.groupby('home_team')['home_wins']\
  55.         .transform(lambda x: x.shift(1).groupby((x != x.shift()).cumsum()).cumcount() + 1)
  56.     home_df['home_losing_streak'] = home_df.groupby('home_team')['home_wins']\
  57.         .transform(lambda x: (1 - x.shift(1)).groupby((x != x.shift()).cumsum()).cumcount() + 1)
  58.    
  59.     # Calculate for away games
  60.     away_df = df.copy()
  61.     away_df['away_wins'] = away_df['home_win'].apply(lambda x: 1 if x == 0 else 0)
  62.     away_df['away_goals_scored'] = away_df['score_away']
  63.     away_df['away_goals_conceded'] = away_df['score_home']
  64.     away_df.sort_values(['away_team', 'game_date'], inplace=True)
  65.    
  66.     # Rolling calculate win rate, goals scored, and goals conceded for away team
  67.     # Exclude the current row by shifting the window
  68.     away_df['away_recent_win_rate'] = away_df.groupby('away_team')['away_wins']\
  69.         .transform(lambda x: x.shift(1).rolling(window=games, min_periods=1).mean())
  70.     away_df['away_avg_goals_scored'] = away_df.groupby('away_team')['away_goals_scored']\
  71.         .transform(lambda x: x.shift(1).rolling(window=games, min_periods=1).mean())
  72.     away_df['away_avg_goals_conceded'] = away_df.groupby('away_team')['away_goals_conceded']\
  73.         .transform(lambda x: x.shift(1).rolling(window=games, min_periods=1).mean())
  74.    
  75.     # Calculate current winning and losing streak for away teams
  76.     # Exclude the current row by using shift
  77.     away_df['away_winning_streak'] = away_df.groupby('away_team')['away_wins']\
  78.         .transform(lambda x: x.shift(1).groupby((x != x.shift()).cumsum()).cumcount() + 1)
  79.     away_df['away_losing_streak'] = away_df.groupby('away_team')['away_wins']\
  80.         .transform(lambda x: (1 - x.shift(1)).groupby((x != x.shift()).cumsum()).cumcount() + 1)
  81.    
  82.     # Merge the metrics back to the original dataframe
  83.     df = df.merge(home_df[['game_date', 'home_team', 'home_recent_win_rate', 'home_avg_goals_scored', 'home_avg_goals_conceded', 'home_winning_streak', 'home_losing_streak']], on=['game_date', 'home_team'], how='left')
  84.     df = df.merge(away_df[['game_date', 'away_team', 'away_recent_win_rate', 'away_avg_goals_scored', 'away_avg_goals_conceded', 'away_winning_streak', 'away_losing_streak']], on=['game_date', 'away_team'], how='left')
  85.    
  86.     return df
  87.  
  88.  
  89.  
  90. # Load NHL Elo ratings data
  91. elo_url = 'https://raw.githubusercontent.com/Neil-Paine-1/NHL-Player-And-Team-Ratings/master/nhl_elo.csv'
  92. elo_df = pd.read_csv(elo_url).drop_duplicates()
  93.  
  94.  
  95. # Preprocess data to assign team names, ratings, probabilities, and scores
  96. is_home = elo_df['is_home'] == 1
  97. elo_df['home_team'] = np.where(is_home, elo_df['team1'], elo_df['team2'])
  98. elo_df['away_team'] = np.where(is_home, elo_df['team2'], elo_df['team1'])
  99. elo_df['home_team_elo'] = np.where(is_home, elo_df['elo1_pre'], elo_df['elo2_pre'])
  100. elo_df['away_team_elo'] = np.where(is_home, elo_df['elo2_pre'], elo_df['elo1_pre'])
  101. elo_df['home_team_prob'] = np.where(is_home, elo_df['prob1'], elo_df['prob2'])
  102. elo_df['away_team_prob'] = np.where(is_home, elo_df['prob2'], elo_df['prob1'])
  103. elo_df['home_team_pts'] = np.where(is_home, elo_df['exp_pts1'], elo_df['exp_pts2'])
  104. elo_df['away_team_pts'] = np.where(is_home, elo_df['exp_pts2'], elo_df['exp_pts1'])
  105. elo_df['score_home'] = np.where(is_home, elo_df['score1'], elo_df['score2'])
  106. elo_df['score_away'] = np.where(is_home, elo_df['score2'], elo_df['score1'])
  107. elo_df['elo_diff'] = elo_df['home_team_elo'] - elo_df['away_team_elo']
  108. elo_df['home_win'] = (elo_df['score_home'] > elo_df['score_away']).astype(int)
  109. elo_df.rename(columns={'date': 'game_date'}, inplace=True)
  110. elo_df['game_date'] = pd.to_datetime(elo_df['game_date'])
  111.  
  112. # Calculate rest days and identify back-to-back games
  113. elo_df.sort_values(['home_team', 'game_date'], inplace=True)
  114. elo_df['previous_home_game'] = elo_df.groupby('home_team')['game_date'].shift(1)
  115. elo_df['rest_days_home'] = (elo_df['game_date'] - elo_df['previous_home_game']).dt.days - 1
  116. elo_df['rest_days_home'].fillna(-1, inplace=True)  # For the first game
  117.  
  118. elo_df.sort_values(['away_team', 'game_date'], inplace=True)
  119. elo_df['previous_away_game'] = elo_df.groupby('away_team')['game_date'].shift(1)
  120. elo_df['rest_days_away'] = (elo_df['game_date'] - elo_df['previous_away_game']).dt.days - 1
  121. elo_df['rest_days_away'].fillna(-1, inplace=True)  # For the first game
  122.  
  123. elo_df['back_to_back_home'] = elo_df['rest_days_home'] == 0
  124. elo_df['back_to_back_away'] = elo_df['rest_days_away'] == 0
  125.  
  126. elo_df.drop_duplicates(subset=['game_date', 'home_team', 'away_team'], keep='first', inplace=True)
  127.  
  128. elo_df.sort_values('game_date', inplace=True)
  129.  
  130. # Apply function to elo_df
  131. elo_df = calculate_recent_performance(elo_df)
  132.  
  133. # Save DataFrame to CSV file
  134. elo_df.to_csv('nhl.csv', index=False)
  135.  
  136. # Define features and target variable for model training
  137. features = ['home_team_elo', 'away_team_elo', 'rest_days_home', 'rest_days_away',
  138.             'back_to_back_home', 'back_to_back_away', 'away_team_prob', 'home_team_prob', 'elo_diff','home_recent_win_rate',
  139.             'away_recent_win_rate','home_avg_goals_scored','home_avg_goals_conceded','away_avg_goals_scored','away_avg_goals_conceded','playoff',
  140.             'home_winning_streak','away_winning_streak','home_losing_streak', 'away_losing_streak','is_home']
  141. X = elo_df[features]
  142. y = elo_df['home_win']
  143.  
  144. # Define a cutoff date for splitting the dataset into training and testing sets
  145. # For example, choosing a date that separates the last 20% of data for testing
  146. cutoff_date = elo_df['game_date'].quantile(0.8, interpolation='nearest')
  147.  
  148. today_2 = pd.Timestamp('now').floor('D')
  149. yesterday_2 = today_2 - pd.Timedelta(days=1)
  150. # Split the data based on the cutoff date
  151. train_df = elo_df[elo_df['game_date'] < cutoff_date]
  152. test_df = elo_df[(elo_df['game_date'] >= cutoff_date) & (elo_df['game_date'] < today_2)]
  153.  
  154. # Extract features and target from the training and testing sets
  155. X_train = train_df[features]
  156. y_train = train_df['home_win']
  157. X_test = test_df[features]
  158. y_test = test_df['home_win']
  159.  
  160.  
  161. # Normalize features for Logistic Regression
  162. scaler = StandardScaler()
  163. X_train_scaled = scaler.fit_transform(X_train)
  164. X_test_scaled = scaler.transform(X_test)
  165.  
  166. # Train and evaluate Logistic Regression model
  167. model_lr = LogisticRegression(max_iter=1000)
  168. model_lr.fit(X_train_scaled, y_train)
  169. probabilities_lr = model_lr.predict_proba(X_test_scaled)[:, 1]
  170. accuracy_lr = accuracy_score(y_test, model_lr.predict(X_test_scaled))
  171. log_loss_lr = log_loss(y_test, probabilities_lr)
  172. print(f"Logistic Regression - Accuracy: {accuracy_lr:.4f}, Log Loss: {log_loss_lr:.4f}")
  173.  
  174. # Train and evaluate Gradient Boosting Machine model
  175. model_gbm = GradientBoostingClassifier()
  176. model_gbm.fit(X_train, y_train)
  177. probabilities_gbm = model_gbm.predict_proba(X_test)[:, 1]
  178. accuracy_gbm = accuracy_score(y_test, model_gbm.predict(X_test))
  179. log_loss_gbm = log_loss(y_test, probabilities_gbm)
  180. print(f"Gradient Boosting Machine - Accuracy: {accuracy_gbm:.4f}, Log Loss: {log_loss_gbm:.4f}")
  181.  
  182. # Train and evaluate XGBoost model
  183. # Initialize the XGBClassifier
  184. model_xgb = XGBClassifier(n_estimators=25, use_label_encoder=False, eval_metric='logloss')
  185.  
  186. # Train the model using the high-level API which is compatible with scikit-learn
  187. model_xgb.fit(X_train, y_train)
  188.  
  189. # Predict probabilities and class labels for the test set
  190. probabilities_xgb = model_xgb.predict_proba(X_test)[:, 1]
  191. predictions_xgb = (probabilities_xgb >= 0.5).astype(int)
  192.  
  193. # Calculate accuracy and log loss
  194. accuracy_xgb = accuracy_score(y_test, predictions_xgb)
  195. log_loss_xgb = log_loss(y_test, probabilities_xgb)
  196.  
  197. # Print the results
  198. print(f"XGBoost - Accuracy: {accuracy_xgb:.4f}, Log Loss: {log_loss_xgb:.4f}")
  199.  
  200. # Function to predict today's games using the GBM model
  201. def predict_today_games(model, date, games_df):
  202.     # Ensure date formats match
  203.     if not isinstance(date, datetime.date):
  204.         try:
  205.             date = pd.to_datetime(date).date()
  206.         except ValueError as e:
  207.             print(f"Date conversion error: {e}")
  208.             return None
  209.  
  210.     # Filter for games on the specified date
  211.     games_today_df = games_df[games_df['game_date'].dt.date == date].copy()
  212.     if games_today_df.empty:
  213.         print(f"No games on {date}")
  214.         return None
  215.  
  216.     # Predict probabilities for the home team winning
  217.     x_predict = games_today_df[features]
  218.     probabilities = model.predict_proba(x_predict)[:, 1]  # Using predict_proba to get probabilities
  219.  
  220.     games_today_df['predicted_prob_home_win'] = probabilities
  221.  
  222.     # Print predictions with percentage formatting
  223.     print("Predictions for today's games:")
  224.     for index, row in games_today_df.iterrows():
  225.         percentage_prob = row['predicted_prob_home_win'] * 100  # Convert probability to percentage
  226.         print(f"Game Date: {row['game_date'].strftime('%Y-%m-%d')}")
  227.         print(f"Home Team: {row['home_team']}")
  228.         print(f"Away Team: {row['away_team']}")
  229.         print(f"Predicted Probability of Home Win: {percentage_prob:.1f}%")  # Format output as percentage
  230.         print("-" * 30)
  231.  
  232.     return games_today_df[['game_date', 'home_team', 'away_team', 'predicted_prob_home_win']]
  233.  
  234.  
  235.  
  236. # Predict today's games
  237. today_predictions = predict_today_games(model_xgb, yesterday, elo_df)
  238. print(today_predictions)
  239.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement