Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- import numpy as np
- from datetime import datetime, timedelta
- import time
- import numexpr as ne
start_time = time.time()

# Set display options so wide frames are printed in full.
pd.set_option('display.max_rows', 30000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', None)

# --- Calculating selection thresholds ---

# File paths. These must be string literals; they are kept in their own
# names so the paths are not overwritten by the DataFrames loaded from them.
# `selection_weights_file` keeps its original name because later code
# refers to it.
predictions_archive_file = "predictions_archive.csv"
train_data_file = "historical_data_cleaned.csv"
selection_weights_file = "selection.csv"

train_data = pd.read_csv(train_data_file)
selection_weights = pd.read_csv(selection_weights_file)
predictions_archive = pd.read_csv(predictions_archive_file)

train_data['datetime'] = pd.to_datetime(train_data['datetime'])
predictions_archive['match_datetime'] = pd.to_datetime(predictions_archive['match_datetime'])
# print(train_data.head())

# Print missing dates (diagnostic, disabled).
# start_date = datetime.now() - timedelta(days=365)
# end_date = datetime.now()
# date_range = pd.date_range(start=start_date, end=end_date)
#
# # Get the missing dates
# missing_dates = set(date_range.date) - set(train_data['datetime'].dt.date)
#
# # Print out the missing dates
# if missing_dates:
#     print("The following dates are missing:")
#     for date in sorted(missing_dates):
#         print(date.strftime('%Y-%m-%d'))
# else:
#     print("All dates for the past year are present in the dataframe.")

# --- Setting selection weights ---

# One row per (country, league) pair present in the training data.
# dropna + sort reproduce what a groupby on these keys yields (groupby
# skips missing keys and orders groups by key).
league_keys = ['country', 'league']
grouped_train_data = (
    train_data[league_keys]
    .dropna()
    .drop_duplicates()
    .sort_values(league_keys)
    .reset_index(drop=True)
)
print("Setting default selection Weights")
grouped_train_data = grouped_train_data.assign(Win=1.1, DNB=0.7, O_1_5=3.2, U_4_5=2.2)

# Save grouped_train dataset to csv for backtest
# grouped_train_data.to_csv(selection_weights_file, index=False)  # Don't need to do it once done

# Merge train_data with predictions_archive to get home_score & away_score
# against each prediction. A left join keeps predictions that have no
# recorded result; their score columns are NaN.
fixture_keys = ['country', 'league', 'home_team', 'away_team']
predicted = predictions_archive.merge(
    train_data[['datetime', 'home_score', 'away_score'] + fixture_keys],
    left_on=['match_datetime'] + fixture_keys,
    right_on=['datetime'] + fixture_keys,
    how='left',
)
# The joined 'datetime' duplicates 'match_datetime'.
predicted = predicted.drop(columns='datetime')

# Attach the per-league default thresholds to every prediction row.
grouped_train_columns = ['Win', 'DNB', 'O_1_5', 'U_4_5']
predicted = predicted.merge(
    grouped_train_data[league_keys + grouped_train_columns],
    on=league_keys,
    how='left',
)

# --- Adding further columns ---
predicted['score_difference'] = (predicted['home_score'] - predicted['away_score']).abs()
predicted['total_score'] = predicted['home_score'] + predicted['away_score']
predicted['predicted_score_difference'] = (
    predicted['predicted_home_score'] - predicted['predicted_away_score']
).abs()
predicted['predicted_total_score'] = predicted['predicted_home_score'] + predicted['predicted_away_score']

# NOTE: comparisons against NaN are False, so rows with missing scores
# fall through to 'draw' here.
predicted['result'] = np.where(
    predicted['home_score'] > predicted['away_score'],
    'home',
    np.where(predicted['home_score'] < predicted['away_score'], 'away', 'draw'),
)
predicted['predicted_result'] = np.where(
    predicted['predicted_home_score'] > predicted['predicted_away_score'],
    'home',
    np.where(predicted['predicted_home_score'] < predicted['predicted_away_score'], 'away', 'draw'),
)
predicted['result_match'] = np.where(
    predicted['result'] == predicted['predicted_result'], 'match', 'no match'
)

# Creating selection functions
def selection(row):
    """Pick a bet type for one prediction row from its thresholds.

    ``row`` is any mapping-like object (e.g. a DataFrame row) providing
    ``predicted_score_difference``, ``predicted_total_score`` and the
    thresholds ``Win``, ``DNB``, ``O_1_5`` and ``U_4_5``.

    Rules are tried in this order and the first hit wins:

    1. difference > Win and total > O_1_5  -> "W & O 1.5"
    2. difference > Win                    -> "W"
    3. total > O_1_5                       -> "O 1.5"
    4. DNB < difference < Win              -> "DNB"
    5. total < U_4_5                       -> "U 4.5"
    6. difference < DNB                    -> "N"

    Returns the label as a string, or ``None`` when no rule applies (for
    example when the difference equals a threshold exactly, or a value is
    NaN so every comparison is False).
    """
    diff = row["predicted_score_difference"]
    total = row["predicted_total_score"]

    beats_win = diff > row["Win"]
    beats_over = total > row["O_1_5"]

    if beats_win and beats_over:
        return "W & O 1.5"
    if beats_win:
        return "W"
    if beats_over:
        return "O 1.5"

    # From here on the difference did not exceed Win and the total did not
    # exceed O_1_5. The former "O 1.5 or DNB" and "W & U 4.5" rules sat
    # below this point but each required one of those conditions to hold,
    # so they could never fire and have been removed.
    dnb = row["DNB"]
    if diff > dnb and diff < row["Win"]:
        return "DNB"
    if total < row["U_4_5"]:
        return "U 4.5"
    if diff < dnb:
        return "N"
    return None
def selection_match(row):
    """Grade one row's ``selection`` against the actual match outcome.

    ``row`` is any mapping-like object providing ``selection``,
    ``home_score``, ``away_score``, ``total_score``, ``result`` and
    ``predicted_result``.

    Returns:
        "No Sel."  - the selection is "N" (no bet placed);
        "NA"       - a score is missing, so the bet cannot be graded;
        "Match"    - the selection would have succeeded (a draw counts as
                     a success for "DNB");
        "No Match" - anything else, including selection labels that are
                     not graded here.
    """
    pick = row["selection"]
    if pick == "N":
        return "No Sel."

    # Grade only played matches. This guard has to come before the outcome
    # checks: missing scores yield a 'draw' result upstream, which would
    # otherwise be accepted by the DNB rule below.
    if pd.isna(row["home_score"]) or pd.isna(row["away_score"]):
        return "NA"

    side_correct = row["result"] == row["predicted_result"]
    # `result` is produced in lowercase ('home' / 'away' / 'draw'); the
    # capitalised spelling is still accepted for older data.
    drawn = row["result"] in ("draw", "Draw")

    if pick == "U 4.5":
        won = (row["home_score"] + row["away_score"]) < 5
    elif pick == "W":
        won = side_correct
    elif pick == "W & O 1.5":
        won = side_correct and row["total_score"] > 1
    elif pick == "O 1.5":
        won = row["total_score"] > 1
    elif pick == "DNB":
        won = side_correct or drawn
    else:
        won = False

    return "Match" if won else "No Match"
predicted['selection'] = predicted.apply(selection, axis=1)
predicted['selection_match'] = predicted.apply(selection_match, axis=1)

# --- Modifying selection weights ---
# For every failed selection, move the relevant threshold just past the
# predicted value (by 0.02) so the same prediction would no longer trigger
# that selection.
no_match = predicted['selection_match'] == "No Match"
picked = predicted['selection']
wrong_side = predicted['result'] != predicted['predicted_result']
# `result` holds lowercase 'home' / 'away' / 'draw'; the capitalised
# spelling is accepted as well for older data.
drawn = predicted['result'].isin(['draw', 'Draw'])
raised_difference = predicted['predicted_score_difference'] + 0.02

# Over 1.5 failed: fewer than two goals were scored.
over_failed = no_match & picked.isin(["O 1.5", "W & O 1.5"]) & (predicted['total_score'] < 2)
predicted['O_1_5'] = predicted['O_1_5'].mask(over_failed, predicted['predicted_total_score'] + 0.02)

# DNB threshold: raised when a DNB pick failed, or when a win pick lost
# outright (wrong side and not a draw).
win_pick_lost_outright = no_match & picked.isin(["W", "W & O 1.5"]) & wrong_side & ~drawn
dnb_failed = (no_match & (picked == "DNB")) | win_pick_lost_outright
predicted['DNB'] = predicted['DNB'].mask(dnb_failed, raised_difference)

# Win threshold: raised for every failed "W" pick, and for a failed
# "W & O 1.5" pick when the side was wrong or the match was drawn (not
# when only the goals leg failed).
win_failed = no_match & (
    (picked == "W") | ((picked == "W & O 1.5") & (wrong_side | drawn))
)
predicted['Win'] = predicted['Win'].mask(win_failed, raised_difference)

# Under 4.5 failed: five or more goals were scored.
under_failed = (predicted['total_score'] > 4) & (picked == "U 4.5")
predicted['U_4_5'] = predicted['U_4_5'].mask(under_failed, predicted['predicted_total_score'] - 0.02)

# --- Grouping selection values ---
# Per league keep the strictest threshold seen: the highest lower bounds
# and the lowest upper bound.
grouped_predicted = predicted.groupby(['country', 'league']).agg(
    {'Win': 'max', 'DNB': 'max', 'O_1_5': 'max', 'U_4_5': 'min'}
)

# --- Updating selection weights ---
# DataFrame.update aligns on the (country, league) index and overwrites
# only where the new value is not NaN.
selection_weights = selection_weights.set_index(['country', 'league'])
selection_weights.update(grouped_predicted)

# Finally save the updated selection weights back to the file they were
# loaded from (previously a hard-coded absolute path).
selection_weights.to_csv(selection_weights_file)

# --- Checking if the selection matches ---
# NOTE(review): this re-check uses the row-level thresholds adjusted above,
# not the league-level values that were just saved - confirm that is the
# intended check.
predicted['selection'] = predicted.apply(selection, axis=1)
predicted['selection_match'] = predicted.apply(selection_match, axis=1)

# A bit of insurance: show anything that still fails.
still_unmatched = predicted[predicted['selection_match'] == 'No Match']
if still_unmatched.empty:
    print("All predictions matched with selection.")
else:
    print(still_unmatched)
print(predicted.shape[0])
# print(sorted_df)

# Time taken
end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time:.2f} seconds")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement