Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- df = pd.read_csv(r'C:\Users\\Desktop\FinalMerge.csv')
- #StartNo
- df['StartNo'] = np.nan
- filtered_rows = df[(df['FHSW_HorseId'].notnull()) &
- (df['M_IsBarrierTrial'] == False) &
- (df['FHSW_HorseId'].notnull()) &
- (df['MH_MDate'].notnull())]
- filtered_rows.sort_values(by=['FHSW_HorseId', 'MH_MDate'], inplace=True)
- filtered_rows['StartNo'] = filtered_rows.groupby('FHSW_HorseId').cumcount() + 1
- df.loc[filtered_rows.index, 'StartNo'] = filtered_rows['StartNo']
- #PFScore
- df['LS_PFScore'] = np.nan
- for index, row in df.iterrows():
- if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
- previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
- (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
- if not previous_start_row.empty:
- df.at[index, 'LS_PFScore'] = previous_start_row['F_PFScore'].values[0]
- print('StartNoDone')
- #TP20R
- df['LS_TP20R'] = np.nan
- for index, row in df.iterrows():
- if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
- previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
- (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
- if not previous_start_row.empty:
- df.at[index, 'LS_TP20R'] = previous_start_row['H_TP20R'].values[0]
- #TP20P
- df['LS_TP20P'] = np.nan
- for index, row in df.iterrows():
- if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
- previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
- (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
- if not previous_start_row.empty:
- df.at[index, 'LS_TP20P'] = previous_start_row['H_TP20P'].values[0]
- print('TP20Done')
- #RPR
- df['LS_RPR'] = np.nan
- for index, row in df.iterrows():
- if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
- previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
- (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
- if not previous_start_row.empty:
- df.at[index, 'LS_RPR'] = previous_start_row['H_RPR'].values[0]
- #RPP
- df['LS_RPP'] = np.nan
- for index, row in df.iterrows():
- if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
- previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
- (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
- if not previous_start_row.empty:
- df.at[index, 'LS_RPP'] = previous_start_row['H_RPP'].values[0]
- #WP20R
- df['LS_WP20R'] = np.nan
- for index, row in df.iterrows():
- if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
- previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
- (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
- if not previous_start_row.empty:
- df.at[index, 'LS_WP20R'] = previous_start_row['H_WP20R'].values[0]
- #WP20P
- df['LS_WP20P'] = np.nan
- for index, row in df.iterrows():
- if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
- previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
- (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
- if not previous_start_row.empty:
- df.at[index, 'LS_WP20P'] = previous_start_row['H_WP20P'].values[0]
- #WPTP20R
- df['LS_WPTP20R'] = np.nan
- for index, row in df.iterrows():
- if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
- previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
- (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
- if not previous_start_row.empty:
- df.at[index, 'LS_WPTP20R'] = previous_start_row['H_WPTP20R'].values[0]
- #WPTP20P
- df['LS_WPTP20P'] = np.nan
- for index, row in df.iterrows():
- if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
- previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
- (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
- if not previous_start_row.empty:
- df.at[index, 'LS_WPTP20P'] = previous_start_row['H_WPTP20P'].values[0]
- #ATWC
- df['LS_ATWC'] = np.nan
- for index, row in df.iterrows():
- if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
- previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
- (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
- if not previous_start_row.empty:
- df.at[index, 'LS_ATWC'] = previous_start_row['H_ATWC'].values[0]
- #AveATWC
- df['LS_AveATWC'] = np.nan
- for index, row in df.iterrows():
- if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
- previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
- (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
- if not previous_start_row.empty:
- df.at[index, 'LS_AveATWC'] = previous_start_row['H_AveATWC'].values[0]
- print('AveATWCDone')
- #Position
- df['LS_Position'] = np.nan
- for index, row in df.iterrows():
- if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
- previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
- (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
- if not previous_start_row.empty:
- df.at[index, 'LS_Position'] = previous_start_row['F_Position'].values[0]
- #Margin
- df['LS_Margin'] = np.nan
- for index, row in df.iterrows():
- if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
- previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
- (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
- if not previous_start_row.empty:
- df.at[index, 'LS_Margin'] = previous_start_row['F_Margin'].values[0]
- #Jumps
- df['LS_Jumps'] = np.nan
- for index, row in df.iterrows():
- if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
- previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
- (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
- if not previous_start_row.empty:
- df.at[index, 'LS_Jumps'] = 1 if previous_start_row['R_Jumps'].values[0] else 0
- #NonTAB
- df['LS_TABMeeting'] = np.nan
- for index, row in df.iterrows():
- if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
- previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
- (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
- if not previous_start_row.empty:
- df.at[index, 'LS_TABMeeting'] = 1 if previous_start_row['M_TABMeeting'].values[0] else 0
- #PrizeMoneyNormalised
- df['PrizeMoneyNormalised'] = np.nan
- filtered_df = df[df['F_Position'] != '99']
- max_f_position = filtered_df.groupby('FRSW_RaceId')['F_Position'].max().reset_index()
- max_f_position.rename(columns={'F_Position': 'Max_F_Position'}, inplace=True)
- df = df.merge(max_f_position, on='FRSW_RaceId', how='left')
- valid_rows = (~df['R_PrizeMoney'].isna() &
- ~df['R_Starters'].isna() &
- ~df['F_Position'].isna() &
- ~df['FRSW_RaceId'].isna() &
- (df['F_Position'] != '99'))
- df.loc[valid_rows, 'PrizeMoneyNormalised'] = df.loc[valid_rows, 'R_PrizeMoney'] * (1 - df.loc[valid_rows, 'F_Position'] / df.loc[valid_rows, 'Max_F_Position'])
- mask = ~df['FRSW_RaceId'].isna()
- df.loc[mask, 'PrizeMoneyNormalised'] = df.loc[mask].groupby('FRSW_RaceId')['PrizeMoneyNormalised'].transform(
- lambda x: x / x.sum() * df.loc[x.index, 'R_PrizeMoney'].iloc[0] if not x.isna().all() and x.sum() != 0 else pd.NA)
- df['PrizeMoneyNormalised'] = df['PrizeMoneyNormalised'].apply(lambda x: round(x) if pd.notna(x) else pd.NA).astype('Int64')
- df.drop(columns=['Max_F_Position'], inplace=True)
- #LSPrizeMoneyNormalised
- df['LS_PrizeMoneyNormalised'] = np.nan
- for index, row in df.iterrows():
- if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
- previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
- (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
- if not previous_start_row.empty:
- df.at[index, 'LS_PrizeMoneyNormalised'] = previous_start_row['PrizeMoneyNormalised'].values[0]
- #BFSP
- df['LS_BFSP'] = np.nan
- for index, row in df.iterrows():
- if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
- previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
- (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
- if not previous_start_row.empty:
- df.at[index, 'LS_BFSP'] = previous_start_row['F_BFSP'].values[0] if not pd.isnull(previous_start_row['F_BFSP'].values[0]) else previous_start_row['H_BFSP'].values[0]
- #TabNo
- df['LS_TabNo'] = None
- for index, row in df.iterrows():
- current_start_no = row['StartNo']
- current_horse_id = row['FHSW_HorseId']
- previous_start = df[(df['StartNo'] < current_start_no) & (df['FHSW_HorseId'] == current_horse_id)].tail(1)
- if not previous_start.empty:
- previous_tab_no = previous_start['F_TABNo'].values[0]
- df.at[index, 'LS_TabNo'] = previous_tab_no
- #PrizeMoneyBreakDown
- df['R_PrizeMoneyBreakDown'] = df['R_PrizeMoneyBreakDown'].astype(str)
- df = df[df['F_Position'].notna() & (df['F_Position'] != '') & df['R_PrizeMoneyBreakDown'].notna() & (df['R_PrizeMoneyBreakDown'] != '')]
- def create_prize_money_dict(row):
- entries = row.split(';')
- prize_money_dict = {}
- for entry in entries:
- pos_value = entry.split(',')
- if len(pos_value) != 2:
- continue
- pos, value = pos_value
- pos = ''.join(filter(str.isdigit, pos))
- if pos:
- prize_money_dict[int(pos)] = int(value)
- return prize_money_dict
- df['PrizeMoney'] = df.apply(lambda row: create_prize_money_dict(row['R_PrizeMoneyBreakDown']).get(int(row['F_Position']), np.nan), axis=1)
- #AvgCareerPrizeMoney
- df['AvgCareerPrizeMoney'] = np.nan
- for index, row in df.iterrows():
- if row['FHSW_HorseId']:
- previous_starts = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
- (df['StartNo'] < row['StartNo'])]['PrizeMoneyNormalised']
- if not previous_starts.empty:
- avg_career_prize_money = previous_starts.mean()
- df.at[index, 'AvgCareerPrizeMoney'] = avg_career_prize_money
- #CareerWins
- df['CareerWins'] = np.nan
- for index, row in df.iterrows():
- if not pd.isna(row['StartNo']) and row['StartNo'] != 0:
- horse_id = row['FHSW_HorseId']
- start_no = row['StartNo']
- previous_starts = df[(df['FHSW_HorseId'] == horse_id) & (df['StartNo'] < start_no)]
- if not previous_starts.empty:
- career_wins = previous_starts[previous_starts['F_Position'] == 1].shape[0]
- df.at[index, 'CareerWins'] = career_wins
- #RacesSinceLastWin
- df['RacesSinceLastWin'] = np.nan
- for index, row in df.iterrows():
- if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
- horse_id = row['FHSW_HorseId']
- start_no = row['StartNo']
- previous_wins = df[(df['FHSW_HorseId'] == horse_id) & (df['StartNo'] < start_no) & (df['F_Position'] == 1)]
- if not previous_wins.empty:
- most_recent_win = previous_wins.iloc[-1]
- races_since_last_win = start_no - most_recent_win['StartNo'] - 1
- else:
- races_since_last_win = 0
- df.at[index, 'RacesSinceLastWin'] = races_since_last_win
- #AvgCareerBFSP
- df['AvgCareerBFSP'] = np.nan
- for index, row in df.iterrows():
- if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
- previous_starts = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
- (df['StartNo'] < row['StartNo'])]
- if not previous_starts.empty:
- previous_bfsp_values = previous_starts['F_BFSP'].astype(float)
- previous_bfsp_values = previous_bfsp_values[previous_bfsp_values != 0.0]
- if not previous_bfsp_values.empty:
- avg_career_bfsp = previous_bfsp_values.mean()
- df.at[index, 'AvgCareerBFSP'] = avg_career_bfsp
- #DaysBetweenLastStart
- df['DaysBetweenStarts'] = np.nan
- for index, row in df.iterrows():
- if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
- current_start_no = row['StartNo']
- horse_id = row['FHSW_HorseId']
- previous_row = df[(df['FHSW_HorseId'] == horse_id) &
- (df['StartNo'] < current_start_no)].tail(1)
- if not previous_row.empty:
- previous_start_date = previous_row['MH_MDate'].values[0]
- current_start_date = row['MH_MDate']
- days_between_starts = (current_start_date - previous_start_date).days
- df.at[index, 'DaysBetweenStarts'] = days_between_starts
- #AgeinDays
- df['AgeInDays'] = np.nan
- for index, row in df.iterrows():
- if pd.isnull(row['MH_MDate']) or pd.isnull(row['F_FoalDate']):
- continue
- age_in_days = (row['MH_MDate'] - row['F_FoalDate']).days
- df.at[index, 'AgeInDays'] = age_in_days
- #AgeinHorseYears
- df['AgeHorseYears'] = 0
- for index, row in df.iterrows():
- if pd.isnull(row['F_FoalDate']):
- continue
- age_in_days = (row['MH_MDate'] - pd.Timestamp(f"{row['F_FoalDate'].year}-08-01")).days
- age_in_years = age_in_days // 365
- df.at[index, 'AgeHorseYears'] = age_in_years
- #StartsbyAge
- df['StartsYr1'] = 0
- df['StartsYr2'] = 0
- df['StartsYr3'] = 0
- for horse_id, group in df[df['AgeHorseYears'] == 1].groupby('FHSW_HorseId'):
- starts_yr1 = group['StartNo'].max() - group['StartNo'].min() + 1
- df.loc[df['FHSW_HorseId'] == horse_id, 'StartsYr1'] = starts_yr1
- df.loc[df['AgeHorseYears'] >= 2, 'StartsYr1'] = df['StartsYr1'].replace(0, method='ffill')
- for horse_id, group in df[df['AgeHorseYears'] == 2].groupby('FHSW_HorseId'):
- starts_yr2 = group['StartNo'].max() - group['StartNo'].min() + 1
- df.loc[df['FHSW_HorseId'] == horse_id, 'StartsYr2'] = starts_yr2
- df.loc[df['AgeHorseYears'] >= 3, 'StartsYr2'] = df['StartsYr2'].replace(0, method='ffill')
- for horse_id, group in df[df['AgeHorseYears'] == 3].groupby('FHSW_HorseId'):
- starts_yr3 = group['StartNo'].max() - group['StartNo'].min() + 1
- df.loc[df['FHSW_HorseId'] == horse_id, 'StartsYr3'] = starts_yr3
- df.loc[df['AgeHorseYears'] >= 4, 'StartsYr3'] = df['StartsYr3'].replace(0, method='ffill')
- #RunsPrep
- df['RunsPrep'] = np.nan
- for horse_id in df['FHSW_HorseId'].unique():
- horse_df = df[(df['FHSW_HorseId'] == horse_id) & (df['StartNo'].notnull()) & (df['StartNo'] != 0)].sort_values('MH_MDate')
- prep_counter = 1
- last_race_date = None
- for index, row in horse_df.iterrows():
- if row['M_IsBarrierTrial'] and prep_counter > 1:
- continue
- current_date = row['MH_MDate']
- if last_race_date is None or (current_date - last_race_date).days <= 35:
- df.at[index, 'RunsPrep'] = prep_counter
- else:
- prep_counter += 1
- df.at[index, 'RunsPrep'] = prep_counter
- if not row['M_IsBarrierTrial']:
- last_race_date = current_date
- #FieldSize
- df['FieldSize%Change'] = np.nan
- for index, row in df.iterrows():
- if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
- previous_start = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) & (df['StartNo'] < row['StartNo'])]
- if not previous_start.empty:
- previous_start = previous_start.iloc[-1]
- percentage_change = ((row['R_Starters'] - previous_start['R_Starters']) / previous_start['R_Starters']) * 100
- df.at[index, 'FieldSize%Change'] = percentage_change
- #FieldBeaten%
- df['FieldBeaten%'] = np.nan
- last_field_beaten_percentage = {}
- for index, row in df.iterrows():
- if not pd.isna(row['StartNo']) and row['StartNo'] not in (0, 1):
- if row['F_Position'] == 99:
- df.at[index, 'FieldBeaten%'] = 0
- last_field_beaten_percentage[row['FHSW_HorseId']] = 0
- else:
- if row['R_Starters'] > 1:
- field_beaten_percentage = 1 - ((row['F_Position'] - 1) / (row['R_Starters'] - 1))
- else:
- field_beaten_percentage = np.nan
- if row['FHSW_HorseId'] in last_field_beaten_percentage:
- df.at[index, 'FieldBeaten%'] = last_field_beaten_percentage[row['FHSW_HorseId']]
- else:
- df.at[index, 'FieldBeaten%'] = field_beaten_percentage
- last_field_beaten_percentage[row['FHSW_HorseId']] = field_beaten_percentage
- #DistanceChange
- df['Distance%Change'] = np.nan
- for index, row in df.iterrows():
- if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
- previous_starts = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) & (df['StartNo'] < row['StartNo'])]
- if not previous_starts.empty:
- previous_start = previous_starts.tail(1)
- previous_start_distance = previous_start['R_Distance'].values[0]
- current_start_distance = row['R_Distance']
- starters_change = row['R_Starters'] - previous_start['R_Starters'].values[0]
- percentage_change = (starters_change / previous_start['R_Starters'].values[0]) * 100
- df.at[index, 'Distance%Change'] = percentage_change
- #DistancePreference
- df['DistancePreference'] = np.nan
- for index, row in df.iterrows():
- if row['StartNo'] not in [0, '1', '0']:
- horse_id = row['FHSW_HorseId']
- previous_rows = df[(df['FHSW_HorseId'] == horse_id) & (df['RowNo'] < row['RowNo'])]
- within_15_percent_rows = previous_rows[((row['R_Distance'] - previous_rows['R_Distance']) / previous_rows['R_Distance']) * 100 <= 15]
- if within_15_percent_rows.empty:
- df.at[index, 'DistancePreference'] = 0
- else:
- if any((prev_row['FieldBeaten%'] > 0.5 or prev_row['F_Margin'] < 4) for _, prev_row in within_15_percent_rows.iterrows()):
- df.at[index, 'DistancePreference'] = 1
- else:
- df.at[index, 'DistancePreference'] = 0
- #NewDistance
- df['NewDistance'] = np.nan
- for index, row in df.iterrows():
- if not pd.isna(row['StartNo']) and row['StartNo'] not in [0, 1]:
- horse_id = row['FHSW_HorseId']
- current_start = row['R_Distance']
- previous_rows = df[(df['FHSW_HorseId'] == horse_id) & (df['StartNo'] < row['StartNo'])]
- if not previous_rows.empty:
- valid_rows = previous_rows[previous_rows['R_Distance'] > current_start]
- if not valid_rows.empty:
- df.at[index, 'NewDistance'] = 0
- else:
- closest_row_index = (previous_rows['R_Distance'] - current_start).abs().idxmin()
- closest_row = previous_rows.loc[[closest_row_index]]
- percentage_change = ((current_start - closest_row['R_Distance'].iloc[0]) / closest_row['R_Distance'].iloc[0]) * 100
- if percentage_change > 15:
- df.at[index, 'NewDistance'] = 1
- #Weight per 100m
- df['WeightPer100m'] = np.nan
- mask = (df['R_Distance'].astype(bool)) & (df['F_Weight'].astype(bool))
- df.loc[mask, 'WeightPer100m'] = (df.loc[mask, 'R_Distance'] / 100) / df.loc[mask, 'F_Weight']
- df['WeightPer100m'].fillna(0.0, inplace=True)
- #Weight of horse minus average weight of other horses in the race
- valid_weights = df[(df['StartNo'].notnull()) & (~df['StartNo'].isin([0, 1]))]
- average_weight_dict = valid_weights.groupby('FRSW_RaceId')['F_Weight'].mean().to_dict()
- df['RelativeWeight'] = df.apply(lambda row: row['F_Weight'] - average_weight_dict.get(row['FRSW_RaceId'], 0), axis=1)
- #Weight drop/rise between this start and last start
- df['WeightChange'] = np.nan
- for index, row in df.iterrows():
- if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
- horse_id = row['FHSW_HorseId']
- current_start_no = row['StartNo']
- previous_start = df[(df['FHSW_HorseId'] == horse_id) & (df['StartNo'] < current_start_no)].last_valid_index()
- if previous_start is not None:
- current_distance = row['R_Distance']
- previous_distance = df.at[previous_start, 'R_Distance']
- weight_change = row['F_Weight'] - df.at[previous_start, 'F_Weight']
- df.at[index, 'WeightChange'] = weight_change
- #Career Relative Weight
- df['CareerRelativeWeight'] = np.nan
- for index, row in df.iterrows():
- current_start_no = row['StartNo']
- current_horse_id = row['FHSW_HorseId']
- previous_starts = df[(df['StartNo'] < current_start_no) & (df['FHSW_HorseId'] == current_horse_id)]
- if not previous_starts.empty:
- average_weight_previous_starts = previous_starts['F_Weight'].mean()
- career_relative_weight = ((row['F_Weight'] - average_weight_previous_starts) / average_weight_previous_starts) * 100
- df.at[index, 'CareerRelativeWeight'] = career_relative_weight
- #TrackCondition
- conditions = ['Good to Good', 'Good to Soft', 'Good to Heavy', 'Good to Synthetic',
- 'Soft to Good', 'Soft to Soft', 'Soft to Heavy', 'Soft to Synthetic',
- 'Heavy to Good', 'Heavy to Soft', 'Heavy to Heavy', 'Heavy to Synthetic',
- 'Synthetic to Good', 'Synthetic to Soft', 'Synthetic to Heavy', 'Synthetic to Synthetic']
- for condition in conditions:
- df[condition] = 0.0
- for index, row in df.iterrows():
- if pd.isna(row['StartNo']) or row['StartNo'] in (1, 0):
- for condition in conditions:
- df.at[index, condition] = np.nan
- else:
- horse_id = row['FHSW_HorseId']
- current_start_no = row['StartNo']
- previous_start = df[(df['FHSW_HorseId'] == horse_id) & (df['StartNo'] < current_start_no)].sort_values('StartNo', ascending=False).head(1)
- if not previous_start.empty:
- previous_start = previous_start.head(1)
- previous_condition = previous_start['R_GoingDesc'].values[0]
- current_condition = row['R_GoingDesc']
- previous_condition = previous_condition.lower()
- current_condition = current_condition.lower()
- condition_key = f"{previous_condition.capitalize()} to {current_condition.capitalize()}"
- if condition_key in conditions:
- df.at[index, condition_key] = 1
- if previous_condition == 'good' and current_condition == 'good':
- df.at[index, 'Good to Good'] = 1
- elif previous_condition == 'good' and current_condition == 'soft':
- df.at[index, 'Good to Soft'] = 1
- elif previous_condition == 'good' and current_condition == 'heavy':
- df.at[index, 'Good to Heavy'] = 1
- elif previous_condition == 'good' and current_condition == 'synethic':
- df.at[index, 'Good to Synthetic'] = 1
- elif previous_condition == 'soft' and current_condition == 'good':
- df.at[index, 'Soft to Good'] = 1
- elif previous_condition == 'soft' and current_condition == 'soft':
- df.at[index, 'Soft to Soft'] = 1
- elif previous_condition == 'soft' and current_condition == 'heavy':
- df.at[index, 'Soft to Heavy'] = 1
- elif previous_condition == 'soft' and current_condition == 'synthetic':
- df.at[index, 'Soft to Synthetic'] = 1
- elif previous_condition == 'heavy' and current_condition == 'good':
- df.at[index, 'Heavy to Good'] = 1
- elif previous_condition == 'heavy' and current_condition == 'soft':
- df.at[index, 'Heavy to Soft'] = 1
- elif previous_condition == 'heavy' and current_condition == 'heavy':
- df.at[index, 'Heavy to Heavy'] = 1
- elif previous_condition == 'heavy' and current_condition == 'synthetic':
- df.at[index, 'Heavy to Synthetic'] = 1
- elif previous_condition == 'synthetic' and current_condition == 'good':
- df.at[index, 'Synthetic to Good'] = 1
- elif previous_condition == 'synthetic' and current_condition == 'soft':
- df.at[index, 'Synthetic to Soft'] = 1
- elif previous_condition == 'synthetic' and current_condition == 'heavy':
- df.at[index, 'Synthetic to Heavy'] = 1
- elif previous_condition == 'synthetic' and current_condition == 'synthetic':
- df.at[index, 'Synthetic to Synthetic'] = 1
- #Surface Preference
- df['SurfacePreference'] = np.nan
- for index, row in df.iterrows():
- if not pd.isna(row['StartNo']) and row['StartNo'] not in [0, 1]:
- horse_id = row['FHSW_HorseId']
- previous_rows = df[(df['FHSW_HorseId'] == horse_id) & (df['RowNo'] < row['RowNo'])]
- same_surface_rows = previous_rows[previous_rows['R_GoingDesc'] == row['R_GoingDesc']]
- if same_surface_rows.empty:
- df.at[index, 'SurfacePreference'] = 0
- else:
- if any((prev_row['FieldBeaten%'] > 0.5 or prev_row['F_Margin'] < 4) for _, prev_row in same_surface_rows.iterrows()):
- df.at[index, 'SurfacePreference'] = 1
- else:
- df.at[index, 'SurfacePreference'] = 0
- #New Surface
- df['NewSurface'] = np.nan
- for index, row in df.iterrows():
- if not pd.isna(row['StartNo']) and row['StartNo'] not in [0, 1]:
- horse_id = row['FHSW_HorseId']
- current_surface = row['R_GoingDesc']
- previous_rows = df[(df['FHSW_HorseId'] == horse_id) & (df['StartNo'] < row['StartNo'])]
- if not previous_rows.empty:
- matching_surface_rows = previous_rows[previous_rows['R_GoingDesc'] == current_surface]
- if not matching_surface_rows.empty:
- df.at[index, 'NewSurface'] = 0
- else:
- df.at[index, 'NewSurface'] = 1
- #Track Average Prize Money
- df = df[df['M_IsBarrierTrial'] == False]
- df['TrackAvgPrizeMoney'] = np.nan
- for index, row in df.iterrows():
- if row['RowNo'] >= 2463450:
- M_TName = row['M_TName']
- unique_race_ids = df[(df['RowNo'] < row['RowNo']) & (df['M_TName'] == M_TName)]['FRSW_RaceId'].unique()
- total_prize_money = df[(df['RowNo'] < row['RowNo']) & (df['M_TName'] == M_TName) & df['FRSW_RaceId'].isin(unique_race_ids)].drop_duplicates(subset=['M_TName', 'FRSW_RaceId'])['R_PrizeMoney'].sum()
- unique_race_count = len(unique_race_ids)
- if unique_race_count != 0:
- avg_prize_money = total_prize_money / unique_race_count
- else:
- avg_prize_money = 0.0
- df.at[index, 'TrackAvgPrizeMoney'] = avg_prize_money
- df = df.reset_index(drop=True)
- #Cover
- df['Cover1200'] = np.nan
- df['Cover1000'] = np.nan
- df['Cover800'] = np.nan
- df['Cover600'] = np.nan
- df['Cover400'] = np.nan
- df['Cover200'] = np.nan
- distance_columns = [1200, 1000, 800, 600, 400, 200]
- for distance in distance_columns:
- df[f'Cover{distance}'] = np.nan
- for distance in distance_columns:
- pos_column = f'SH_POS{distance}'
- wides_column = f'W_Wides{distance}'
- cover_column = f'Cover{distance}'
- mask = (
- df[wides_column].notna() &
- (df[wides_column] > 1) &
- df[pos_column].notna() &
- (df[pos_column] > 1) &
- df['StartNo'].notna() &
- ~df['StartNo'].isin([0, 1])
- )
- for index, row in df[mask].iterrows():
- race_id = row['FRSW_RaceId']
- horse_wides = row[wides_column]
- row_1 = df[(df['FRSW_RaceId'] == race_id) & (df[pos_column] == 1)]
- df.loc[row_1.index, cover_column] = 0
- for i in range(2, int(df[df['FRSW_RaceId'] == race_id][pos_column].max()) + 1):
- other_horses = df[(df['FRSW_RaceId'] == race_id) & (df[pos_column] > i)]
- for _, other_row in other_horses.iterrows():
- other_wides = other_row[wides_column]
- if horse_wides == other_wides:
- df.loc[index, cover_column] = 1
- break
- else:
- df.loc[index, cover_column] = 0
- df.loc[
- ((df[wides_column] == 0) | (df[wides_column] == 99) | (df[pos_column] == 0) | (df[pos_column] == 25)),
- cover_column
- ] = 0
- #Jockey Win Rate
- df['JockeyWin%L125'] = np.nan
- for index, row in df.iterrows():
- current_row_no = row['RowNo']
- current_jockey_id = row['F_JockeyId']
- previous_rows = df[(df['RowNo'] < current_row_no) &
- (df['F_JockeyId'] == current_jockey_id) &
- ~df['FHSW_HorseId'].isnull() &
- (df['M_IsBarrierTrial'] == False)].tail(125)
- win_count = previous_rows[previous_rows['F_Position'] == 1].shape[0]
- if len(previous_rows) == 125:
- win_strike_rate = win_count / 125.0
- df.at[index, 'JockeyWin%L125'] = win_strike_rate
- else:
- df.at[index, 'JockeyWin%L125'] = np.nan
- #Jockey Place Rate
- df['JockeyPlace%L250'] = np.nan
- for index, row in df.iterrows():
- current_row_no = row['RowNo']
- current_jockey_id = row['F_JockeyId']
- previous_rows = df[(df['RowNo'] < current_row_no) &
- (df['F_JockeyId'] == current_jockey_id) &
- ~df['FHSW_HorseId'].isnull() &
- (df['M_IsBarrierTrial'] == False)].tail(250)
- place_count = previous_rows[previous_rows['F_Position'].isin([1, 2, 3])].shape[0]
- if len(previous_rows) == 250:
- place_strike_rate = place_count / 250.0
- df.at[index, 'JockeyPlace%L250'] = place_strike_rate
- else:
- df.at[index, 'JockeyPlace%L250'] = np.nan
- #Jockey Prize Money
- df['JockeyPrizeMoneyL125'] = np.nan
- for index, row in df.iterrows():
- current_row_no = row['RowNo']
- current_jockey_id = row['F_JockeyId']
- previous_rows = df[(df['RowNo'] < current_row_no) &
- (df['F_JockeyId'] == current_jockey_id) &
- ~df['FHSW_HorseId'].isnull() &
- (df['M_IsBarrierTrial'] == False)].tail(250)
- if len(previous_rows) >= 125:
- prize_money_sum = previous_rows['PrizeMoneyNormalised'].sum()
- df.at[index, 'JockeyPrizeMoneyL125'] = prize_money_sum
- #Jockey Prize Money Annual
- df['JockeyPrizeMoneyAnnual'] = np.nan
- for index, row in df.iterrows():
- current_jockey_id = row['F_JockeyId']
- current_date = row['MH_MDate']
- last_year_rows = df[(df['F_JockeyId'] == current_jockey_id) &
- (df['MH_MDate'] >= current_date - pd.DateOffset(years=1)) &
- (df['MH_MDate'] < current_date) &
- (~df['FHSW_HorseId'].isnull()) &
- (df['M_IsBarrierTrial'] == False)]
- if not last_year_rows.empty:
- annual_prize_money = last_year_rows['PrizeMoneyNormalised'].sum()
- df.at[index, 'JockeyPrizeMoneyAnnual'] = annual_prize_money
- else:
- df.at[index, 'JockeyPrizeMoneyAnnual'] = np.nan
- #Jockey Position
- df['AverageJockeyPosition'] = np.nan
- for index, row in df.iterrows():
- current_row_no = row['RowNo']
- current_jockey_id = row['F_JockeyId']
- previous_rows = df[(df['RowNo'] < current_row_no) &
- (df['F_JockeyId'] == current_jockey_id) &
- (~df['FHSW_HorseId'].isnull()) &
- (df['M_IsBarrierTrial'] == False)]
- if not previous_rows.empty:
- average_jockey_position = previous_rows['H_AHSP'].mean()
- df.at[index, 'AverageJockeyPosition'] = average_jockey_position
- #Jockey Streak
- df['JockeyStreak'] = np.nan
- for index, row in df.iterrows():
- if not pd.isna(row['FHSW_HorseId']) and row['M_IsBarrierTrial'] == False:
- meeting_id = row['FM_MeetingId']
- race_id = row['FRSW_RaceId']
- meeting_rows = df[(df['FM_MeetingId'] == meeting_id) & (df['FRSW_RaceId'] <= race_id)]
- jockey_wins = meeting_rows[meeting_rows['F_Position'] == 1].groupby('F_JockeyId').size()
- current_race_jockeys = df[(df['FM_MeetingId'] == meeting_id) & (df['FRSW_RaceId'] == race_id)]['F_JockeyId'].unique()
- for jockey_id in current_race_jockeys:
- if jockey_id in jockey_wins:
- wins_count = jockey_wins[jockey_id]
- else:
- wins_count = 0
- df.loc[(df['FM_MeetingId'] == meeting_id) & (df['FRSW_RaceId'] == race_id) & (df['F_JockeyId'] == jockey_id), 'JockeyStreak'] = wins_count
- #Same Jockey
- df['SameJockey'] = np.nan
- for index, row in df.iterrows():
- current_row_no = row['RowNo']
- current_jockey_id = row['F_JockeyId']
- current_horse_id = row['FHSW_HorseId']
- previous_start = df[(df['RowNo'] < current_row_no) &
- (df['FHSW_HorseId'] == current_horse_id) &
- (~df['FHSW_HorseId'].isnull()) &
- (df['M_IsBarrierTrial'] == False)].tail(1)
- if previous_start.empty:
- df.at[index, 'SameJockey'] = np.nan
- else:
- previous_jockey_id = previous_start['F_JockeyId'].values[0]
- if previous_jockey_id == current_jockey_id:
- df.at[index, 'SameJockey'] = 1
- else:
- df.at[index, 'SameJockey'] = 0
- #Trainer Win Rate
- df['TrainerWin%L100'] = np.nan
- for index, row in df.iterrows():
- current_row_no = row['RowNo']
- current_trainer_id = row['F_TrainerId']
- previous_rows = df[(df['RowNo'] < current_row_no) &
- (df['F_TrainerId'] == current_trainer_id) &
- ~df['FHSW_HorseId'].isnull() &
- (df['M_IsBarrierTrial'] == False)].tail(100)
- win_count = previous_rows[previous_rows['F_Position'] == 1].shape[0]
- if len(previous_rows) == 100:
- win_strike_rate = win_count / 100.0
- df.at[index, 'TrainerWin%L100'] = win_strike_rate
- else:
- df.at[index, 'TrainerWin%L100'] = np.nan
- #Trainer Place Rate
- df['TrainerPlace%L225'] = np.nan
- for index, row in df.iterrows():
- current_row_no = row['RowNo']
- current_trainer_id = row['F_TrainerId']
- previous_rows = df[(df['RowNo'] < current_row_no) &
- (df['F_TrainerId'] == current_trainer_id) &
- ~df['FHSW_HorseId'].isnull() &
- (df['M_IsBarrierTrial'] == False)].tail(225)
- place_count = previous_rows[previous_rows['F_Position'].isin([1, 2, 3])].shape[0]
- if len(previous_rows) == 225:
- place_strike_rate = place_count / 225.0
- df.at[index, 'TrainerPlace%L225'] = place_strike_rate
- else:
- df.at[index, 'TrainerPlace%L225'] = np.nan
- #Trainer Prize Money
- df['TrainerPrizeMoneyL100'] = np.nan
- for index, row in df.iterrows():
- current_row_no = row['RowNo']
- current_trainer_id = row['F_TrainerId']
- previous_rows = df[(df['RowNo'] < current_row_no) &
- (df['F_TrainerId'] == current_trainer_id) &
- ~df['FHSW_HorseId'].isnull() &
- (df['M_IsBarrierTrial'] == False)].tail(100)
- if len(previous_rows) >= 100:
- prize_money_sum = previous_rows['PrizeMoneyNormalised'].sum()
- df.at[index, 'TrainerPrizeMoneyL100'] = prize_money_sum
- #Trainer Prize Money Annual
- df['TrainerPrizeMoneyAnnual'] = np.nan
- for index, row in df.iterrows():
- current_trainer_id = row['F_TrainerId']
- current_date = row['MH_MDate']
- last_year_rows = df[(df['F_TrainerId'] == current_trainer_id) &
- (df['MH_MDate'] >= current_date - pd.DateOffset(years=1)) &
- (df['MH_MDate'] < current_date) &
- (~df['FHSW_HorseId'].isnull()) &
- (df['M_IsBarrierTrial'] == False)]
- if not last_year_rows.empty:
- annual_prize_money = last_year_rows['PrizeMoneyNormalised'].sum()
- df.at[index, 'TrainerPrizeMoneyAnnual'] = annual_prize_money
- else:
- df.at[index, 'TrainerPrizeMoneyAnnual'] = np.nan
- #Trainer Streak
- df['TrainerStreak'] = np.nan
- for index, row in df.iterrows():
- if not pd.isna(row['FHSW_HorseId']) and row['M_IsBarrierTrial'] == False:
- meeting_id = row['FM_MeetingId']
- race_id = row['FRSW_RaceId']
- meeting_rows = df[(df['FM_MeetingId'] == meeting_id) & (df['FRSW_RaceId'] <= race_id)]
- trainer_wins = meeting_rows[meeting_rows['F_Position'] == 1].groupby('F_TrainerId').size()
- current_race_trainers = df[(df['FM_MeetingId'] == meeting_id) & (df['FRSW_RaceId'] == race_id)]['F_TrainerId'].unique()
- for trainer_id in current_race_trainers:
- if trainer_id in trainer_wins:
- wins_count = trainer_wins[trainer_id]
- else:
- wins_count = 0
- df.loc[(df['FM_MeetingId'] == meeting_id) & (df['FRSW_RaceId'] == race_id) & (df['F_TrainerId'] == trainer_id), 'TrainerStreak'] = wins_count
- #Change of Trainer
- from difflib import SequenceMatcher
- df['TrainerChange'] = np.nan
- for index, row in df.iterrows():
- if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
- horse_id = row['FHSW_HorseId']
- start_no = row['StartNo']
- previous_starts = df[(df['FHSW_HorseId'] == horse_id) & (df['StartNo'] < start_no)]
- if previous_starts.empty:
- df.at[index, 'TrainerChange'] = np.nan
- else:
- previous_start = previous_starts.iloc[-1]
- current_trainer = str(row['F_Trainer'])
- previous_trainer = str(previous_start['F_Trainer'])
- similarity_ratio = SequenceMatcher(None, current_trainer, previous_trainer).ratio()
- if similarity_ratio < 0.75:
- df.at[index, 'TrainerChange'] = 1
- else:
- df.at[index, 'TrainerChange'] = 0
- #NonTABAvgTime
- tracks = ['adaminaby', 'adelaide river', 'alexandra', 'alpha', 'aramac', 'ardlethan', 'armidale', 'atherton', 'augathella', 'avoca', 'balnarring', 'balranald', 'barcaldine', 'barraba', 'bedgerebong', 'bedourie', 'bell', 'berrigan', 'bingara', 'binnaway', 'birdsville', 'blackall', 'bluff', 'bombala', 'bong bong', 'boorowa', 'boulia', 'bourke', 'bowen', 'bowraville', 'braidwood', 'brewarrina', 'broken hill', 'broome', 'buchan', 'bundaberg', 'bundarra', 'burrandowan', 'burrumbeet', 'calliope', 'camooweal', 'camperdown', 'carinda', 'carnarvon', 'carrathool', 'ceduna', 'charleville', 'charters towers', 'chinchilla', 'clare', 'clermont', 'clifton', 'cloncurry', 'cobar', 'colac', 'collarenebri', 'collie', 'come by chance', 'condobolin', 'cooktown', 'cooma', 'coonabarabran', 'coonamble', 'cootamundra', 'corfield', 'corowa', 'cowra', 'crookwell', 'cunnamulla', 'deagon', 'dederang', 'deepwater', 'deniliquin', 'derby', 'dingo', 'dongara', 'drouin', 'dunkeld', 'edenhope', 'eidsvold', 'einasleigh', 'emerald', 'enngonia', 'esk', 'esperance', 'ewan', 'flinton', 'forbes', 'gayndah', 'geurie', 'gilgandra', 'gladstone', 'glen innes', 'goondiwindi', 'gordonvale', 'great western', 'gregory downs', 'grenfell', 'griffith', 'gulargambone', 'gulgong', 'gunbower', 'gundagai', 'gunnedah', 'gympie', 'hamilton', 'hanging rock', 'harden', 'hawker', 'hay', 'healesville', 'hillston', 'hinnomunjie', 'holbrook', 'home hill', 'hughenden', 'ilfracombe', 'ingham', 'injune', 'innisfail', 'isisford', 'jamestown', 'jandowae', 'jerilderie', 'julia creek', 'junction', 'jundah', 'katherine', 'kempsey', 'kerang', 'king island', 'kingscote', 'kojonup', 'kumbia', 'kununurra', 'landor', 'laura', 'laverton', 'leeton', 'leinster', 'leonora', 'lightning ridge', 'lock', 'lockhart', 'longford', 'longreach', 'louth', 'mallawa', 'manangatang', 'mansfield', 'marble bar', 'mareeba', 'maxwelton', 'mckinlay', 'meekatharra', 'mendooran', 'merriwa', 'merton', 'middlemount', 'miles', 'halidon', 'mingenew', 'mitchell', 'monto', 'moora', 'moranbah', 'moree', 'mortlake', 'moruya', 'morven', 'moulamein', 'mt barker', 'mt gambier', 'mt garnet', 'mt isa', 'mt magnet', 'mt perry', 'mudgee', 'mungery', 'mungindi', 'murtoa', 'muttaburra', 'nanango', 'naracoorte', 'narrabri', 'narrandera', 'narrogin', 'narromine', 'newman', 'nhill', 'noorama', 'norseman', 'northam', 'nowra', 'nyngan', 'oak park', 'oakley', 'orange', 'parkes', 'penola', 'penong', 'penshurst', 'pentland', 'pingrup', 'pooncarie', 'port augusta', 'port hedland', 'port lincoln', 'prairie', 'quambone', 'quilpie', 'quirindi', 'quorn', 'richmond', 'roebourne', 'roma', 'roxby downs', 'seymour', 'springsure', 'st arnaud', 'st george', 'stamford', 'stanthorpe', 'stawell', 'streaky bay', 'surat', 'swifts creek', 'tabulam', 'talmoi', 'talwood', 'tambo', 'tara', 'taroom', 'tennant creek', 'texas', 'thangool', 'tocumwal', 'tomingley', 'toodyay', 'tottenham', 'tower hill', 'towong', 'trangie', 'tullibigeal', 'tumbarumba', 'tumut', 'tuncurry', 'twin hills', 'walcha', 'walgett', 'wallabadah', 'wandoan', 'warialda', 'warra', 'warren', 'wauchope', 'wean', 'wellington', 'wentworth', 'winton aus', 'wondai', 'woolamai', 'wycheproof', 'wyndham', 'yalgoo', 'yass', 'yea', 'yeppoon', 'york', 'young']
- df = df[(df['M_IsBarrierTrial'] == False) &
- df['FHSW_HorseId'].notna() &
- (df['R_RaceTime'].notna() & df['R_RaceTime'] != 0)]
- merged_distances = []
- for track in tracks:
- track_data = df[df['M_TName'] == track]
- filtered_track_data = track_data[~track_data['R_GoingDesc'].isin(['sand', 'synthetic'])]
- race_count_by_distance_going = track_data.groupby(['R_Distance', 'R_GoingDesc'])['FRSW_RaceId'].nunique().reset_index()
- race_ids_by_distance_going = track_data.groupby(['R_Distance', 'R_GoingDesc'])['FRSW_RaceId'].unique().reset_index()
- for index, row in race_count_by_distance_going.iterrows():
- distance = row['R_Distance']
- going_desc = row['R_GoingDesc']
- race_count = row['FRSW_RaceId']
- unique_race_ids = list(race_ids_by_distance_going[(race_ids_by_distance_going['R_Distance'] == distance) & (race_ids_by_distance_going['R_GoingDesc'] == going_desc)]['FRSW_RaceId'].values[0])
- if going_desc in ['soft', 'heavy']:
- combined_going_desc = 'soft'
- elif going_desc in ['good', 'firm', 'dead', 'unknown']:
- combined_going_desc = 'good'
- else:
- combined_going_desc = 'other'
- unique_race_times = df[df['FRSW_RaceId'].isin(unique_race_ids)]['R_RaceTime'].drop_duplicates()
- existing_entry = next((entry for entry in merged_distances if entry['Track'] == track and entry['R_Distance'] == distance and entry['Combined GoingDesc'] == combined_going_desc), None)
- if existing_entry:
- existing_entry['Race Count'] += race_count
- existing_entry['Unique RaceIds'] += unique_race_ids
- existing_entry['R_RaceTime'] += unique_race_times.tolist()
- else:
- merged_distances.append({
- 'Track': track,
- 'R_Distance': distance,
- 'Combined GoingDesc': combined_going_desc,
- 'Race Count': race_count,
- 'Unique RaceIds': unique_race_ids,
- 'R_RaceTime': unique_race_times.tolist()
- })
- merged_entries = {}
- for entry in merged_distances:
- track = entry['Track']
- distance = entry['R_Distance']
- combined_going_desc = entry['Combined GoingDesc']
- race_count = entry['Race Count']
- if combined_going_desc in ['good', 'soft'] and race_count < 20:
- key = (track, distance)
- if key in merged_entries:
- merged_entries[key]['Race Count'] += race_count
- merged_entries[key]['Unique RaceIds'].extend(entry['Unique RaceIds'])
- merged_entries[key]['R_RaceTime'].extend(entry['R_RaceTime'])
- else:
- merged_entries[key] = entry
- else:
- merged_entries[(track, distance)] = entry
- merged_distances_list = list(merged_entries.values())
- for entry in merged_distances_list:
- if entry['R_RaceTime']:
- avg_race_time = round(sum(entry['R_RaceTime']) / len(entry['R_RaceTime']), 2)
- entry['Average Race Time'] = avg_race_time
- race_id_to_avg_time = {}
- for entry in merged_distances_list:
- for race_id in entry['Unique RaceIds']:
- race_id_to_avg_time[race_id] = entry['Average Race Time']
- sandsynth = []
- for track in tracks:
- track_data = df[df['M_TName'] == track]
- filtered_track_data = track_data[track_data['R_GoingDesc'].isin(['sand', 'synthetic'])]
- race_count_by_going_distance = filtered_track_data.groupby(['R_Distance', 'R_GoingDesc'])['FRSW_RaceId'].nunique().reset_index()
- race_ids_by_distance_going = filtered_track_data.groupby(['R_Distance', 'R_GoingDesc'])['FRSW_RaceId'].unique().reset_index()
- for index, row in race_count_by_going_distance.iterrows():
- race_count = row['FRSW_RaceId']
- distance = row['R_Distance']
- unique_race_ids = race_ids_by_distance_going[race_ids_by_distance_going['R_Distance'] == distance]['FRSW_RaceId'].values[0]
- unique_race_times = df[df['FRSW_RaceId'].isin(unique_race_ids)]['R_RaceTime'].drop_duplicates()
- avg_race_time = round(sum(unique_race_times) / len(unique_race_times), 2)
- sandsynth.append({
- 'Track': track,
- 'R_Distance': distance,
- 'Race Count': race_count,
- 'Unique RaceIds': unique_race_ids,
- 'R_RaceTime': unique_race_times.tolist(),
- 'Average Race Time': avg_race_time
- })
- sandsynth_avg_time_dict = {}
- for entry in sandsynth:
- unique_race_ids = entry['Unique RaceIds']
- avg_race_time = entry['Average Race Time']
- for race_id in unique_race_ids:
- sandsynth_avg_time_dict[race_id] = avg_race_time
- merged_avg_time_dict = {**race_id_to_avg_time, **sandsynth_avg_time_dict}
- df['NonTABAvgTime'] = df['FRSW_RaceId'].map(merged_avg_time_dict)
- #NonTABTime
- df['NonTABTime'] = np.nan
- def update_non_tabh_time(row):
- if pd.notnull(row['NonTABAvgTime']):
- if row['F_Position'] == 1:
- if pd.notnull(row['R_VerifiedRaceTime']) and row['R_VerifiedRaceTime'] != 0:
- return row['R_VerifiedRaceTime']
- elif pd.notnull(row['R_RaceTime']) and row['R_RaceTime'] != 0:
- return row['R_RaceTime']
- elif pd.notnull(row['F_Margin']) and row['F_Margin'] != 0:
- if pd.notnull(row['R_VerifiedRaceTime']) and row['R_VerifiedRaceTime'] != 0:
- return row['R_VerifiedRaceTime'] + row['F_Margin'] * 0.16
- elif pd.notnull(row['R_RaceTime']) and row['R_RaceTime'] != 0:
- return row['R_RaceTime'] + row['F_Margin'] * 0.16
- return None
- df['NonTABTime'] = df.apply(update_non_tabh_time, axis=1)
- df['NonTABTime'] = df['NonTABTime'].round(2)
- #NonTABFinishAll
- df['NonTABFinishAll'] = np.nan
- df['NonTABFinishAll'] = round(df['NonTABTime'] - df['NonTABAvgTime'], 2)
- #AvgLast3
- df['AvgLast3'] = np.nan
- for index, row in df.iterrows():
- if pd.notnull(row['StartNo']) and row['StartNo'] not in [0, 1]:
- horse_id = row['FHSW_HorseId']
- start_no = row['StartNo']
- previous_rows = df[(df['FHSW_HorseId'] == horse_id) & (df['StartNo'] < start_no)].tail(3)
- if len(previous_rows) >= 3:
- if row['RunsPrep'] != 1:
- avg_value = (
- previous_rows['SH_FinishAll'].apply(lambda x: x if pd.notnull(x) and x > -999 else row['NonTABFinishAll']).iloc[0] * 0.6 +
- previous_rows['SH_FinishAll'].apply(lambda x: x if pd.notnull(x) and x > -999 else row['NonTABFinishAll']).iloc[1] * 0.3 +
- previous_rows['SH_FinishAll'].apply(lambda x: x if pd.notnull(x) and x > -999 else row['NonTABFinishAll']).iloc[2] * 0.1
- )
- else:
- avg_value = (
- previous_rows['SH_FinishAll'].apply(lambda x: x if pd.notnull(x) and x > -999 else row['NonTABFinishAll']).iloc[0] * 0.75 +
- previous_rows['SH_FinishAll'].apply(lambda x: x if pd.notnull(x) and x > -999 else row['NonTABFinishAll']).iloc[1] * 0.15 +
- previous_rows['SH_FinishAll'].apply(lambda x: x if pd.notnull(x) and x > -999 else row['NonTABFinishAll']).iloc[2] * 0.1
- )
- df.at[index, 'AvgLast3'] = avg_value
- #TrackBenchmark
- df['TrackBenchmarks'] = np.nan
- filtered_df = df[(df['FHSW_HorseId'].notna()) &
- (df['M_IsBarrierTrial'] == False) &
- (df['F_Position'] == 1) &
- (df['SH_FinishAll'].between(-100, 100))]
- track_avg_dict = filtered_df.groupby(['M_TName', 'R_Distance'])['SH_FinishAll'].mean().to_dict()
- df['TrackBenchmarks'] = df.apply(
- lambda row: track_avg_dict.get((row['M_TName'], row['R_Distance']), np.nan)
- if not row['M_IsBarrierTrial'] else np.nan, axis=1
- )
- df.to_csv("Data.csv", index=False)
Advertisement
Add Comment
Please, Sign In to add comment