Untitled

df = pd.read_csv(r'C:\Users\\Desktop\FinalMerge.csv')

#StartNo
df['StartNo'] = np.nan

filtered_rows = df[(df['FHSW_HorseId'].notnull()) &
                   (df['M_IsBarrierTrial'] == False) &
                   (df['FHSW_HorseId'].notnull()) &
                   (df['MH_MDate'].notnull())]
filtered_rows.sort_values(by=['FHSW_HorseId', 'MH_MDate'], inplace=True)

filtered_rows['StartNo'] = filtered_rows.groupby('FHSW_HorseId').cumcount() + 1

df.loc[filtered_rows.index, 'StartNo'] = filtered_rows['StartNo']

#PFScore
df['LS_PFScore'] = np.nan

for index, row in df.iterrows():
    if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
        previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
                                (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
        if not previous_start_row.empty:
            df.at[index, 'LS_PFScore'] = previous_start_row['F_PFScore'].values[0]

print('StartNoDone')

#TP20R
df['LS_TP20R'] = np.nan

for index, row in df.iterrows():
    if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
        previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
                                (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
        if not previous_start_row.empty:
            df.at[index, 'LS_TP20R'] = previous_start_row['H_TP20R'].values[0]

#TP20P
df['LS_TP20P'] = np.nan

for index, row in df.iterrows():
    if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
        previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
                                (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
        if not previous_start_row.empty:
            df.at[index, 'LS_TP20P'] = previous_start_row['H_TP20P'].values[0]

print('TP20Done')

#RPR
df['LS_RPR'] = np.nan

for index, row in df.iterrows():
    if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
        previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
                                (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
        if not previous_start_row.empty:
            df.at[index, 'LS_RPR'] = previous_start_row['H_RPR'].values[0]

#RPP
df['LS_RPP'] = np.nan

for index, row in df.iterrows():
    if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
        previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
                                (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
        if not previous_start_row.empty:
            df.at[index, 'LS_RPP'] = previous_start_row['H_RPP'].values[0]

#WP20R
df['LS_WP20R'] = np.nan

for index, row in df.iterrows():
    if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
        previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
                                (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
        if not previous_start_row.empty:
            df.at[index, 'LS_WP20R'] = previous_start_row['H_WP20R'].values[0]

#WP20P
df['LS_WP20P'] = np.nan

for index, row in df.iterrows():
    if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
        previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
                                (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
        if not previous_start_row.empty:
            df.at[index, 'LS_WP20P'] = previous_start_row['H_WP20P'].values[0]

#WPTP20R
df['LS_WPTP20R'] = np.nan

for index, row in df.iterrows():
    if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
        previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
                                (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
        if not previous_start_row.empty:
            df.at[index, 'LS_WPTP20R'] = previous_start_row['H_WPTP20R'].values[0]

#WPTP20P
df['LS_WPTP20P'] = np.nan

for index, row in df.iterrows():
    if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
        previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
                                (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
        if not previous_start_row.empty:
            df.at[index, 'LS_WPTP20P'] = previous_start_row['H_WPTP20P'].values[0]

#ATWC
df['LS_ATWC'] = np.nan

for index, row in df.iterrows():
    if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
        previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
                                (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
        if not previous_start_row.empty:
            df.at[index, 'LS_ATWC'] = previous_start_row['H_ATWC'].values[0]

#AveATWC
df['LS_AveATWC'] = np.nan

for index, row in df.iterrows():
    if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
        previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
                                (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
        if not previous_start_row.empty:
            df.at[index, 'LS_AveATWC'] = previous_start_row['H_AveATWC'].values[0]

print('AveATWCDone')

#Position
df['LS_Position'] = np.nan

for index, row in df.iterrows():
    if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
        previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
                                (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
        if not previous_start_row.empty:
            df.at[index, 'LS_Position'] = previous_start_row['F_Position'].values[0]

#Margin
df['LS_Margin'] = np.nan
for index, row in df.iterrows():
    if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
        previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
                                (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
        if not previous_start_row.empty:
            df.at[index, 'LS_Margin'] = previous_start_row['F_Margin'].values[0]

#Jumps
df['LS_Jumps'] = np.nan
for index, row in df.iterrows():
    if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
        previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
                                (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
        if not previous_start_row.empty:
            df.at[index, 'LS_Jumps'] = 1 if previous_start_row['R_Jumps'].values[0] else 0
#NonTAB
df['LS_TABMeeting'] = np.nan
for index, row in df.iterrows():
    if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
        previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
                                (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
        if not previous_start_row.empty:
            df.at[index, 'LS_TABMeeting'] = 1 if previous_start_row['M_TABMeeting'].values[0] else 0

#PrizeMoneyNormalised
df['PrizeMoneyNormalised'] = np.nan

filtered_df = df[df['F_Position'] != '99']

max_f_position = filtered_df.groupby('FRSW_RaceId')['F_Position'].max().reset_index()
max_f_position.rename(columns={'F_Position': 'Max_F_Position'}, inplace=True)

df = df.merge(max_f_position, on='FRSW_RaceId', how='left')

valid_rows = (~df['R_PrizeMoney'].isna() &
              ~df['R_Starters'].isna() &
              ~df['F_Position'].isna() &
              ~df['FRSW_RaceId'].isna() &
              (df['F_Position'] != '99'))

df.loc[valid_rows, 'PrizeMoneyNormalised'] = df.loc[valid_rows, 'R_PrizeMoney'] * (1 - df.loc[valid_rows, 'F_Position'] / df.loc[valid_rows, 'Max_F_Position'])

mask = ~df['FRSW_RaceId'].isna()

df.loc[mask, 'PrizeMoneyNormalised'] = df.loc[mask].groupby('FRSW_RaceId')['PrizeMoneyNormalised'].transform(
    lambda x: x / x.sum() * df.loc[x.index, 'R_PrizeMoney'].iloc[0] if not x.isna().all() and x.sum() != 0 else pd.NA)

df['PrizeMoneyNormalised'] = df['PrizeMoneyNormalised'].apply(lambda x: round(x) if pd.notna(x) else pd.NA).astype('Int64')

df.drop(columns=['Max_F_Position'], inplace=True)

#LSPrizeMoneyNormalised
df['LS_PrizeMoneyNormalised'] = np.nan
for index, row in df.iterrows():
    if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
        previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
                                (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
        if not previous_start_row.empty:
            df.at[index, 'LS_PrizeMoneyNormalised'] = previous_start_row['PrizeMoneyNormalised'].values[0]

#BFSP
df['LS_BFSP'] = np.nan
for index, row in df.iterrows():
    if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
        previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
                                (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
        if not previous_start_row.empty:
            df.at[index, 'LS_BFSP'] = previous_start_row['F_BFSP'].values[0] if not pd.isnull(previous_start_row['F_BFSP'].values[0]) else previous_start_row['H_BFSP'].values[0]

#TabNo
df['LS_TabNo'] = None

for index, row in df.iterrows():
    current_start_no = row['StartNo']
    current_horse_id = row['FHSW_HorseId']

    previous_start = df[(df['StartNo'] < current_start_no) & (df['FHSW_HorseId'] == current_horse_id)].tail(1)

    if not previous_start.empty:
        previous_tab_no = previous_start['F_TABNo'].values[0]
        df.at[index, 'LS_TabNo'] = previous_tab_no

#PrizeMoneyBreakDown
df['R_PrizeMoneyBreakDown'] = df['R_PrizeMoneyBreakDown'].astype(str)

df = df[df['F_Position'].notna() & (df['F_Position'] != '') & df['R_PrizeMoneyBreakDown'].notna() & (df['R_PrizeMoneyBreakDown'] != '')]

def create_prize_money_dict(row):
    entries = row.split(';')
    prize_money_dict = {}
    for entry in entries:
        pos_value = entry.split(',')
        if len(pos_value) != 2:
            continue
        pos, value = pos_value
        pos = ''.join(filter(str.isdigit, pos))
        if pos:
            prize_money_dict[int(pos)] = int(value)
    return prize_money_dict

df['PrizeMoney'] = df.apply(lambda row: create_prize_money_dict(row['R_PrizeMoneyBreakDown']).get(int(row['F_Position']), np.nan), axis=1)

#AvgCareerPrizeMoney
df['AvgCareerPrizeMoney'] = np.nan

for index, row in df.iterrows():
    if row['FHSW_HorseId']:
        previous_starts = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
                             (df['StartNo'] < row['StartNo'])]['PrizeMoneyNormalised']

        if not previous_starts.empty:
            avg_career_prize_money = previous_starts.mean()

            df.at[index, 'AvgCareerPrizeMoney'] = avg_career_prize_money

#CareerWins
df['CareerWins'] = np.nan

for index, row in df.iterrows():
    if not pd.isna(row['StartNo']) and row['StartNo'] != 0:
        horse_id = row['FHSW_HorseId']
        start_no = row['StartNo']

        previous_starts = df[(df['FHSW_HorseId'] == horse_id) & (df['StartNo'] < start_no)]

        if not previous_starts.empty:
            career_wins = previous_starts[previous_starts['F_Position'] == 1].shape[0]

            df.at[index, 'CareerWins'] = career_wins

#RacesSinceLastWin
df['RacesSinceLastWin'] = np.nan

for index, row in df.iterrows():
    if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
        horse_id = row['FHSW_HorseId']
        start_no = row['StartNo']

        previous_wins = df[(df['FHSW_HorseId'] == horse_id) & (df['StartNo'] < start_no) & (df['F_Position'] == 1)]

        if not previous_wins.empty:
            most_recent_win = previous_wins.iloc[-1]

            races_since_last_win = start_no - most_recent_win['StartNo'] - 1
        else:
            races_since_last_win = 0

        df.at[index, 'RacesSinceLastWin'] = races_since_last_win

#AvgCareerBFSP
df['AvgCareerBFSP'] = np.nan

for index, row in df.iterrows():
    if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
        previous_starts = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
                             (df['StartNo'] < row['StartNo'])]

        if not previous_starts.empty:
            previous_bfsp_values = previous_starts['F_BFSP'].astype(float)

            previous_bfsp_values = previous_bfsp_values[previous_bfsp_values != 0.0]

            if not previous_bfsp_values.empty:
                avg_career_bfsp = previous_bfsp_values.mean()

                df.at[index, 'AvgCareerBFSP'] = avg_career_bfsp

#DaysBetweenLastStart
df['DaysBetweenStarts'] = np.nan

for index, row in df.iterrows():
    if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
        current_start_no = row['StartNo']
        horse_id = row['FHSW_HorseId']
        previous_row = df[(df['FHSW_HorseId'] == horse_id) &
                          (df['StartNo'] < current_start_no)].tail(1)

        if not previous_row.empty:
            previous_start_date = previous_row['MH_MDate'].values[0]
            current_start_date = row['MH_MDate']

            days_between_starts = (current_start_date - previous_start_date).days

            df.at[index, 'DaysBetweenStarts'] = days_between_starts

#AgeinDays
df['AgeInDays'] = np.nan

for index, row in df.iterrows():
    if pd.isnull(row['MH_MDate']) or pd.isnull(row['F_FoalDate']):
        continue

    age_in_days = (row['MH_MDate'] - row['F_FoalDate']).days
    df.at[index, 'AgeInDays'] = age_in_days

#AgeinHorseYears
df['AgeHorseYears'] = 0

for index, row in df.iterrows():
    if pd.isnull(row['F_FoalDate']):
        continue

    age_in_days = (row['MH_MDate'] - pd.Timestamp(f"{row['F_FoalDate'].year}-08-01")).days

    age_in_years = age_in_days // 365

    df.at[index, 'AgeHorseYears'] = age_in_years

#StartsbyAge
df['StartsYr1'] = 0
df['StartsYr2'] = 0
df['StartsYr3'] = 0

for horse_id, group in df[df['AgeHorseYears'] == 1].groupby('FHSW_HorseId'):
    starts_yr1 = group['StartNo'].max() - group['StartNo'].min() + 1
    df.loc[df['FHSW_HorseId'] == horse_id, 'StartsYr1'] = starts_yr1

df.loc[df['AgeHorseYears'] >= 2, 'StartsYr1'] = df['StartsYr1'].replace(0, method='ffill')

for horse_id, group in df[df['AgeHorseYears'] == 2].groupby('FHSW_HorseId'):
    starts_yr2 = group['StartNo'].max() - group['StartNo'].min() + 1
    df.loc[df['FHSW_HorseId'] == horse_id, 'StartsYr2'] = starts_yr2

df.loc[df['AgeHorseYears'] >= 3, 'StartsYr2'] = df['StartsYr2'].replace(0, method='ffill')

for horse_id, group in df[df['AgeHorseYears'] == 3].groupby('FHSW_HorseId'):
    starts_yr3 = group['StartNo'].max() - group['StartNo'].min() + 1
    df.loc[df['FHSW_HorseId'] == horse_id, 'StartsYr3'] = starts_yr3

df.loc[df['AgeHorseYears'] >= 4, 'StartsYr3'] = df['StartsYr3'].replace(0, method='ffill')

#RunsPrep
df['RunsPrep'] = np.nan

for horse_id in df['FHSW_HorseId'].unique():
    horse_df = df[(df['FHSW_HorseId'] == horse_id) & (df['StartNo'].notnull()) & (df['StartNo'] != 0)].sort_values('MH_MDate')

    prep_counter = 1
    last_race_date = None

    for index, row in horse_df.iterrows():
        if row['M_IsBarrierTrial'] and prep_counter > 1:
            continue

        current_date = row['MH_MDate']

        if last_race_date is None or (current_date - last_race_date).days <= 35:
            df.at[index, 'RunsPrep'] = prep_counter
        else:
            prep_counter += 1
            df.at[index, 'RunsPrep'] = prep_counter

        if not row['M_IsBarrierTrial']:
            last_race_date = current_date

#FieldSize
df['FieldSize%Change'] = np.nan

for index, row in df.iterrows():
    if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
        previous_start = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) & (df['StartNo'] < row['StartNo'])]

        if not previous_start.empty:
            previous_start = previous_start.iloc[-1]
            percentage_change = ((row['R_Starters'] - previous_start['R_Starters']) / previous_start['R_Starters']) * 100
            df.at[index, 'FieldSize%Change'] = percentage_change

#FieldBeaten%
df['FieldBeaten%'] = np.nan

last_field_beaten_percentage = {}

for index, row in df.iterrows():
    if not pd.isna(row['StartNo']) and row['StartNo'] not in (0, 1):
        if row['F_Position'] == 99:
            df.at[index, 'FieldBeaten%'] = 0
            last_field_beaten_percentage[row['FHSW_HorseId']] = 0
        else:
            if row['R_Starters'] > 1:
                field_beaten_percentage = 1 - ((row['F_Position'] - 1) / (row['R_Starters'] - 1))
            else:
                field_beaten_percentage = np.nan

            if row['FHSW_HorseId'] in last_field_beaten_percentage:
                df.at[index, 'FieldBeaten%'] = last_field_beaten_percentage[row['FHSW_HorseId']]
            else:
                df.at[index, 'FieldBeaten%'] = field_beaten_percentage

            last_field_beaten_percentage[row['FHSW_HorseId']] = field_beaten_percentage

#DistanceChange
df['Distance%Change'] = np.nan

for index, row in df.iterrows():
    if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
        previous_starts = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) & (df['StartNo'] < row['StartNo'])]

        if not previous_starts.empty:
            previous_start = previous_starts.tail(1)
            previous_start_distance = previous_start['R_Distance'].values[0]
            current_start_distance = row['R_Distance']
            starters_change = row['R_Starters'] - previous_start['R_Starters'].values[0]
            percentage_change = (starters_change / previous_start['R_Starters'].values[0]) * 100
            df.at[index, 'Distance%Change'] = percentage_change

#DistancePreference
df['DistancePreference'] = np.nan

for index, row in df.iterrows():
    if row['StartNo'] not in [0, '1', '0']:
        horse_id = row['FHSW_HorseId']

        previous_rows = df[(df['FHSW_HorseId'] == horse_id) & (df['RowNo'] < row['RowNo'])]

        within_15_percent_rows = previous_rows[((row['R_Distance'] - previous_rows['R_Distance']) / previous_rows['R_Distance']) * 100 <= 15]

        if within_15_percent_rows.empty:
            df.at[index, 'DistancePreference'] = 0
        else:
            if any((prev_row['FieldBeaten%'] > 0.5 or prev_row['F_Margin'] < 4) for _, prev_row in within_15_percent_rows.iterrows()):
                df.at[index, 'DistancePreference'] = 1
            else:
                df.at[index, 'DistancePreference'] = 0

#NewDistance
df['NewDistance'] = np.nan

for index, row in df.iterrows():
    if not pd.isna(row['StartNo']) and row['StartNo'] not in [0, 1]:
        horse_id = row['FHSW_HorseId']
        current_start = row['R_Distance']

        previous_rows = df[(df['FHSW_HorseId'] == horse_id) & (df['StartNo'] < row['StartNo'])]

        if not previous_rows.empty:
            valid_rows = previous_rows[previous_rows['R_Distance'] > current_start]

            if not valid_rows.empty:
                df.at[index, 'NewDistance'] = 0
            else:
                closest_row_index = (previous_rows['R_Distance'] - current_start).abs().idxmin()
                closest_row = previous_rows.loc[[closest_row_index]]

                percentage_change = ((current_start - closest_row['R_Distance'].iloc[0]) / closest_row['R_Distance'].iloc[0]) * 100

                if percentage_change > 15:
                    df.at[index, 'NewDistance'] = 1

#Weight per 100m
df['WeightPer100m'] = np.nan

mask = (df['R_Distance'].astype(bool)) & (df['F_Weight'].astype(bool))
df.loc[mask, 'WeightPer100m'] = (df.loc[mask, 'R_Distance'] / 100) / df.loc[mask, 'F_Weight']

df['WeightPer100m'].fillna(0.0, inplace=True)

#Weight of horse minus average weight of other horses in the race

valid_weights = df[(df['StartNo'].notnull()) & (~df['StartNo'].isin([0, 1]))]

average_weight_dict = valid_weights.groupby('FRSW_RaceId')['F_Weight'].mean().to_dict()

df['RelativeWeight'] = df.apply(lambda row: row['F_Weight'] - average_weight_dict.get(row['FRSW_RaceId'], 0), axis=1)

#Weight drop/rise between this start and last start
df['WeightChange'] = np.nan

for index, row in df.iterrows():
    if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
        horse_id = row['FHSW_HorseId']
        current_start_no = row['StartNo']

        previous_start = df[(df['FHSW_HorseId'] == horse_id) & (df['StartNo'] < current_start_no)].last_valid_index()

        if previous_start is not None:
            current_distance = row['R_Distance']
            previous_distance = df.at[previous_start, 'R_Distance']

            weight_change = row['F_Weight'] - df.at[previous_start, 'F_Weight']

            df.at[index, 'WeightChange'] = weight_change

#Career Relative Weight
df['CareerRelativeWeight'] = np.nan

for index, row in df.iterrows():
    current_start_no = row['StartNo']
    current_horse_id = row['FHSW_HorseId']

    previous_starts = df[(df['StartNo'] < current_start_no) & (df['FHSW_HorseId'] == current_horse_id)]

    if not previous_starts.empty:
        average_weight_previous_starts = previous_starts['F_Weight'].mean()

        career_relative_weight = ((row['F_Weight'] - average_weight_previous_starts) / average_weight_previous_starts) * 100

        df.at[index, 'CareerRelativeWeight'] = career_relative_weight

#TrackCondition
conditions = ['Good to Good', 'Good to Soft', 'Good to Heavy', 'Good to Synthetic',
              'Soft to Good', 'Soft to Soft', 'Soft to Heavy', 'Soft to Synthetic',
              'Heavy to Good', 'Heavy to Soft', 'Heavy to Heavy', 'Heavy to Synthetic',
              'Synthetic to Good', 'Synthetic to Soft', 'Synthetic to Heavy', 'Synthetic to Synthetic']

for condition in conditions:
    df[condition] = 0.0

for index, row in df.iterrows():
    if pd.isna(row['StartNo']) or row['StartNo'] in (1, 0):
        for condition in conditions:
            df.at[index, condition] = np.nan
    else:
        horse_id = row['FHSW_HorseId']
        current_start_no = row['StartNo']

        previous_start = df[(df['FHSW_HorseId'] == horse_id) & (df['StartNo'] < current_start_no)].sort_values('StartNo', ascending=False).head(1)

        if not previous_start.empty:
            previous_start = previous_start.head(1)
            previous_condition = previous_start['R_GoingDesc'].values[0]
            current_condition = row['R_GoingDesc']

            previous_condition = previous_condition.lower()
            current_condition = current_condition.lower()

            condition_key = f"{previous_condition.capitalize()} to {current_condition.capitalize()}"
            if condition_key in conditions:
                df.at[index, condition_key] = 1
            if previous_condition == 'good' and current_condition == 'good':
                df.at[index, 'Good to Good'] = 1
            elif previous_condition == 'good' and current_condition == 'soft':
                df.at[index, 'Good to Soft'] = 1
            elif previous_condition == 'good' and current_condition == 'heavy':
                df.at[index, 'Good to Heavy'] = 1
            elif previous_condition == 'good' and current_condition == 'synethic':
                df.at[index, 'Good to Synthetic'] = 1
            elif previous_condition == 'soft' and current_condition == 'good':
                df.at[index, 'Soft to Good'] = 1
            elif previous_condition == 'soft' and current_condition == 'soft':
                df.at[index, 'Soft to Soft'] = 1
            elif previous_condition == 'soft' and current_condition == 'heavy':
                df.at[index, 'Soft to Heavy'] = 1
            elif previous_condition == 'soft' and current_condition == 'synthetic':
                df.at[index, 'Soft to Synthetic'] = 1
            elif previous_condition == 'heavy' and current_condition == 'good':
                df.at[index, 'Heavy to Good'] = 1
            elif previous_condition == 'heavy' and current_condition == 'soft':
                df.at[index, 'Heavy to Soft'] = 1
            elif previous_condition == 'heavy' and current_condition == 'heavy':
                df.at[index, 'Heavy to Heavy'] = 1
            elif previous_condition == 'heavy' and current_condition == 'synthetic':
                df.at[index, 'Heavy to Synthetic'] = 1
            elif previous_condition == 'synthetic' and current_condition == 'good':
                df.at[index, 'Synthetic to Good'] = 1
            elif previous_condition == 'synthetic' and current_condition == 'soft':
                df.at[index, 'Synthetic to Soft'] = 1
            elif previous_condition == 'synthetic' and current_condition == 'heavy':
                df.at[index, 'Synthetic to Heavy'] = 1
            elif previous_condition == 'synthetic' and current_condition == 'synthetic':
                df.at[index, 'Synthetic to Synthetic'] = 1

#Surface Preference
df['SurfacePreference'] = np.nan

for index, row in df.iterrows():
    if not pd.isna(row['StartNo']) and row['StartNo'] not in [0, 1]:
        horse_id = row['FHSW_HorseId']

        previous_rows = df[(df['FHSW_HorseId'] == horse_id) & (df['RowNo'] < row['RowNo'])]

        same_surface_rows = previous_rows[previous_rows['R_GoingDesc'] == row['R_GoingDesc']]

        if same_surface_rows.empty:
            df.at[index, 'SurfacePreference'] = 0
        else:
            if any((prev_row['FieldBeaten%'] > 0.5 or prev_row['F_Margin'] < 4) for _, prev_row in same_surface_rows.iterrows()):
                df.at[index, 'SurfacePreference'] = 1
            else:
                df.at[index, 'SurfacePreference'] = 0

#New Surface
df['NewSurface'] = np.nan

for index, row in df.iterrows():
    if not pd.isna(row['StartNo']) and row['StartNo'] not in [0, 1]:
        horse_id = row['FHSW_HorseId']
        current_surface = row['R_GoingDesc']

        previous_rows = df[(df['FHSW_HorseId'] == horse_id) & (df['StartNo'] < row['StartNo'])]

        if not previous_rows.empty:
            matching_surface_rows = previous_rows[previous_rows['R_GoingDesc'] == current_surface]

            if not matching_surface_rows.empty:
                df.at[index, 'NewSurface'] = 0
            else:
                df.at[index, 'NewSurface'] = 1

#Track Average Prize Money
df = df[df['M_IsBarrierTrial'] == False]

df['TrackAvgPrizeMoney'] = np.nan

for index, row in df.iterrows():
    if row['RowNo'] >= 2463450:
        M_TName = row['M_TName']

        unique_race_ids = df[(df['RowNo'] < row['RowNo']) & (df['M_TName'] == M_TName)]['FRSW_RaceId'].unique()

        total_prize_money = df[(df['RowNo'] < row['RowNo']) & (df['M_TName'] == M_TName) & df['FRSW_RaceId'].isin(unique_race_ids)].drop_duplicates(subset=['M_TName', 'FRSW_RaceId'])['R_PrizeMoney'].sum()

        unique_race_count = len(unique_race_ids)

        if unique_race_count != 0:
            avg_prize_money = total_prize_money / unique_race_count
        else:
            avg_prize_money = 0.0

        df.at[index, 'TrackAvgPrizeMoney'] = avg_prize_money

df = df.reset_index(drop=True)

#Cover
df['Cover1200'] = np.nan
df['Cover1000'] = np.nan
df['Cover800'] = np.nan
df['Cover600'] = np.nan
df['Cover400'] = np.nan
df['Cover200'] = np.nan

distance_columns = [1200, 1000, 800, 600, 400, 200]
for distance in distance_columns:
    df[f'Cover{distance}'] = np.nan

for distance in distance_columns:
    pos_column = f'SH_POS{distance}'
    wides_column = f'W_Wides{distance}'
    cover_column = f'Cover{distance}'

    mask = (
        df[wides_column].notna() &
        (df[wides_column] > 1) &
        df[pos_column].notna() &
        (df[pos_column] > 1) &
        df['StartNo'].notna() &
        ~df['StartNo'].isin([0, 1])
    )

    for index, row in df[mask].iterrows():
        race_id = row['FRSW_RaceId']
        horse_wides = row[wides_column]

        row_1 = df[(df['FRSW_RaceId'] == race_id) & (df[pos_column] == 1)]
        df.loc[row_1.index, cover_column] = 0

        for i in range(2, int(df[df['FRSW_RaceId'] == race_id][pos_column].max()) + 1):
            other_horses = df[(df['FRSW_RaceId'] == race_id) & (df[pos_column] > i)]

            for _, other_row in other_horses.iterrows():
                other_wides = other_row[wides_column]

                if horse_wides == other_wides:
                    df.loc[index, cover_column] = 1
                    break
                else:
                    df.loc[index, cover_column] = 0

    df.loc[
        ((df[wides_column] == 0) | (df[wides_column] == 99) | (df[pos_column] == 0) | (df[pos_column] == 25)),
        cover_column
    ] = 0

#Jockey Win Rate
df['JockeyWin%L125'] = np.nan
for index, row in df.iterrows():
    current_row_no = row['RowNo']
    current_jockey_id = row['F_JockeyId']

    previous_rows = df[(df['RowNo'] < current_row_no) &
                       (df['F_JockeyId'] == current_jockey_id) &
                       ~df['FHSW_HorseId'].isnull() &
                       (df['M_IsBarrierTrial'] == False)].tail(125)

    win_count = previous_rows[previous_rows['F_Position'] == 1].shape[0]

    if len(previous_rows) == 125:
        win_strike_rate = win_count / 125.0
        df.at[index, 'JockeyWin%L125'] = win_strike_rate
    else:
        df.at[index, 'JockeyWin%L125'] = np.nan


#Jockey Place Rate
df['JockeyPlace%L250'] = np.nan

for index, row in df.iterrows():
    current_row_no = row['RowNo']
    current_jockey_id = row['F_JockeyId']

    previous_rows = df[(df['RowNo'] < current_row_no) &
                       (df['F_JockeyId'] == current_jockey_id) &
                       ~df['FHSW_HorseId'].isnull() &
                       (df['M_IsBarrierTrial'] == False)].tail(250)

    place_count = previous_rows[previous_rows['F_Position'].isin([1, 2, 3])].shape[0]

    if len(previous_rows) == 250:
        place_strike_rate = place_count / 250.0
        df.at[index, 'JockeyPlace%L250'] = place_strike_rate
    else:
        df.at[index, 'JockeyPlace%L250'] = np.nan

#Jockey Prize Money
df['JockeyPrizeMoneyL125'] = np.nan

for index, row in df.iterrows():
    current_row_no = row['RowNo']
    current_jockey_id = row['F_JockeyId']

    previous_rows = df[(df['RowNo'] < current_row_no) &
                       (df['F_JockeyId'] == current_jockey_id) &
                       ~df['FHSW_HorseId'].isnull() &
                       (df['M_IsBarrierTrial'] == False)].tail(250)


    if len(previous_rows) >= 125:
        prize_money_sum = previous_rows['PrizeMoneyNormalised'].sum()

        df.at[index, 'JockeyPrizeMoneyL125'] = prize_money_sum

#Jockey Prize Money Annual
df['JockeyPrizeMoneyAnnual'] = np.nan

for index, row in df.iterrows():
    current_jockey_id = row['F_JockeyId']
    current_date = row['MH_MDate']

    last_year_rows = df[(df['F_JockeyId'] == current_jockey_id) &
                        (df['MH_MDate'] >= current_date - pd.DateOffset(years=1)) &
                        (df['MH_MDate'] < current_date) &
                        (~df['FHSW_HorseId'].isnull()) &
                        (df['M_IsBarrierTrial'] == False)]

    if not last_year_rows.empty:
        annual_prize_money = last_year_rows['PrizeMoneyNormalised'].sum()
        df.at[index, 'JockeyPrizeMoneyAnnual'] = annual_prize_money
    else:
        df.at[index, 'JockeyPrizeMoneyAnnual'] = np.nan

#Jockey Position
df['AverageJockeyPosition'] = np.nan

for index, row in df.iterrows():
    current_row_no = row['RowNo']
    current_jockey_id = row['F_JockeyId']

    previous_rows = df[(df['RowNo'] < current_row_no) &
                       (df['F_JockeyId'] == current_jockey_id) &
                       (~df['FHSW_HorseId'].isnull()) &
                       (df['M_IsBarrierTrial'] == False)]

    if not previous_rows.empty:
        average_jockey_position = previous_rows['H_AHSP'].mean()
        df.at[index, 'AverageJockeyPosition'] = average_jockey_position


#Jockey Streak
df['JockeyStreak'] = np.nan

for index, row in df.iterrows():
    if not pd.isna(row['FHSW_HorseId']) and row['M_IsBarrierTrial'] == False:
        meeting_id = row['FM_MeetingId']
        race_id = row['FRSW_RaceId']

        meeting_rows = df[(df['FM_MeetingId'] == meeting_id) & (df['FRSW_RaceId'] <= race_id)]

        jockey_wins = meeting_rows[meeting_rows['F_Position'] == 1].groupby('F_JockeyId').size()

        current_race_jockeys = df[(df['FM_MeetingId'] == meeting_id) & (df['FRSW_RaceId'] == race_id)]['F_JockeyId'].unique()

        for jockey_id in current_race_jockeys:
            if jockey_id in jockey_wins:
                wins_count = jockey_wins[jockey_id]
            else:
                wins_count = 0

            df.loc[(df['FM_MeetingId'] == meeting_id) & (df['FRSW_RaceId'] == race_id) & (df['F_JockeyId'] == jockey_id), 'JockeyStreak'] = wins_count

#Same Jockey
df['SameJockey'] = np.nan

for index, row in df.iterrows():
    current_row_no = row['RowNo']
    current_jockey_id = row['F_JockeyId']
    current_horse_id = row['FHSW_HorseId']

    previous_start = df[(df['RowNo'] < current_row_no) &
                        (df['FHSW_HorseId'] == current_horse_id) &
                        (~df['FHSW_HorseId'].isnull()) &
                        (df['M_IsBarrierTrial'] == False)].tail(1)

    if previous_start.empty:
        df.at[index, 'SameJockey'] = np.nan
    else:
        previous_jockey_id = previous_start['F_JockeyId'].values[0]

        if previous_jockey_id == current_jockey_id:
            df.at[index, 'SameJockey'] = 1
        else:
            df.at[index, 'SameJockey'] = 0

#Trainer Win Rate
df['TrainerWin%L100'] = np.nan

for index, row in df.iterrows():
    current_row_no = row['RowNo']
    current_trainer_id = row['F_TrainerId']

    previous_rows = df[(df['RowNo'] < current_row_no) &
                    (df['F_TrainerId'] == current_trainer_id) &
                    ~df['FHSW_HorseId'].isnull() &
                    (df['M_IsBarrierTrial'] == False)].tail(100)

    win_count = previous_rows[previous_rows['F_Position'] == 1].shape[0]

    if len(previous_rows) == 100:
        win_strike_rate = win_count / 100.0
        df.at[index, 'TrainerWin%L100'] = win_strike_rate
    else:
        df.at[index, 'TrainerWin%L100'] = np.nan


#Trainer Place Rate
df['TrainerPlace%L225'] = np.nan

for index, row in df.iterrows():
    current_row_no = row['RowNo']
    current_trainer_id = row['F_TrainerId']

    previous_rows = df[(df['RowNo'] < current_row_no) &
                       (df['F_TrainerId'] == current_trainer_id) &
                       ~df['FHSW_HorseId'].isnull() &
                       (df['M_IsBarrierTrial'] == False)].tail(225)

    place_count = previous_rows[previous_rows['F_Position'].isin([1, 2, 3])].shape[0]

    if len(previous_rows) == 225:
        place_strike_rate = place_count / 225.0
        df.at[index, 'TrainerPlace%L225'] = place_strike_rate
    else:
        df.at[index, 'TrainerPlace%L225'] = np.nan

#Trainer Prize Money
df['TrainerPrizeMoneyL100'] = np.nan

for index, row in df.iterrows():
    current_row_no = row['RowNo']
    current_trainer_id = row['F_TrainerId']

    previous_rows = df[(df['RowNo'] < current_row_no) &
                       (df['F_TrainerId'] == current_trainer_id) &
                       ~df['FHSW_HorseId'].isnull() &
                       (df['M_IsBarrierTrial'] == False)].tail(100)
    if len(previous_rows) >= 100:
        prize_money_sum = previous_rows['PrizeMoneyNormalised'].sum()

        df.at[index, 'TrainerPrizeMoneyL100'] = prize_money_sum

#Trainer Prize Money Annual
df['TrainerPrizeMoneyAnnual'] = np.nan

for index, row in df.iterrows():
    current_trainer_id = row['F_TrainerId']
    current_date = row['MH_MDate']

    last_year_rows = df[(df['F_TrainerId'] == current_trainer_id) &
                        (df['MH_MDate'] >= current_date - pd.DateOffset(years=1)) &
                        (df['MH_MDate'] < current_date) &
                        (~df['FHSW_HorseId'].isnull()) &
                        (df['M_IsBarrierTrial'] == False)]

    if not last_year_rows.empty:
        annual_prize_money = last_year_rows['PrizeMoneyNormalised'].sum()
        df.at[index, 'TrainerPrizeMoneyAnnual'] = annual_prize_money
    else:
        df.at[index, 'TrainerPrizeMoneyAnnual'] = np.nan


#Trainer Streak
df['TrainerStreak'] = np.nan

for index, row in df.iterrows():
    if not pd.isna(row['FHSW_HorseId']) and row['M_IsBarrierTrial'] == False:
        meeting_id = row['FM_MeetingId']
        race_id = row['FRSW_RaceId']

        meeting_rows = df[(df['FM_MeetingId'] == meeting_id) & (df['FRSW_RaceId'] <= race_id)]

        trainer_wins = meeting_rows[meeting_rows['F_Position'] == 1].groupby('F_TrainerId').size()

        current_race_trainers = df[(df['FM_MeetingId'] == meeting_id) & (df['FRSW_RaceId'] == race_id)]['F_TrainerId'].unique()

        for trainer_id in current_race_trainers:
            if trainer_id in trainer_wins:
                wins_count = trainer_wins[trainer_id]
            else:
                wins_count = 0

            df.loc[(df['FM_MeetingId'] == meeting_id) & (df['FRSW_RaceId'] == race_id) & (df['F_TrainerId'] == trainer_id), 'TrainerStreak'] = wins_count

#Change of Trainer
from difflib import SequenceMatcher

df['TrainerChange'] = np.nan

for index, row in df.iterrows():
    if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
        horse_id = row['FHSW_HorseId']
        start_no = row['StartNo']

        previous_starts = df[(df['FHSW_HorseId'] == horse_id) & (df['StartNo'] < start_no)]

        if previous_starts.empty:
            df.at[index, 'TrainerChange'] = np.nan
        else:
            previous_start = previous_starts.iloc[-1]

            current_trainer = str(row['F_Trainer'])
            previous_trainer = str(previous_start['F_Trainer'])

            similarity_ratio = SequenceMatcher(None, current_trainer, previous_trainer).ratio()

            if similarity_ratio < 0.75:
                df.at[index, 'TrainerChange'] = 1
            else:
                df.at[index, 'TrainerChange'] = 0

#NonTABAvgTime
tracks = ['adaminaby', 'adelaide river', 'alexandra', 'alpha', 'aramac', 'ardlethan', 'armidale', 'atherton', 'augathella', 'avoca', 'balnarring', 'balranald', 'barcaldine', 'barraba', 'bedgerebong', 'bedourie', 'bell', 'berrigan', 'bingara', 'binnaway', 'birdsville', 'blackall', 'bluff', 'bombala', 'bong bong', 'boorowa', 'boulia', 'bourke', 'bowen', 'bowraville', 'braidwood', 'brewarrina', 'broken hill', 'broome', 'buchan', 'bundaberg', 'bundarra', 'burrandowan', 'burrumbeet', 'calliope', 'camooweal', 'camperdown', 'carinda', 'carnarvon', 'carrathool', 'ceduna', 'charleville', 'charters towers', 'chinchilla', 'clare', 'clermont', 'clifton', 'cloncurry', 'cobar', 'colac', 'collarenebri', 'collie', 'come by chance', 'condobolin', 'cooktown', 'cooma', 'coonabarabran', 'coonamble', 'cootamundra', 'corfield', 'corowa', 'cowra', 'crookwell', 'cunnamulla', 'deagon', 'dederang', 'deepwater', 'deniliquin', 'derby', 'dingo', 'dongara', 'drouin', 'dunkeld', 'edenhope', 'eidsvold', 'einasleigh', 'emerald', 'enngonia', 'esk', 'esperance', 'ewan', 'flinton', 'forbes', 'gayndah', 'geurie', 'gilgandra', 'gladstone', 'glen innes', 'goondiwindi', 'gordonvale', 'great western', 'gregory downs', 'grenfell', 'griffith', 'gulargambone', 'gulgong', 'gunbower', 'gundagai', 'gunnedah', 'gympie', 'hamilton', 'hanging rock', 'harden', 'hawker', 'hay', 'healesville', 'hillston', 'hinnomunjie', 'holbrook', 'home hill', 'hughenden', 'ilfracombe', 'ingham', 'injune', 'innisfail', 'isisford', 'jamestown', 'jandowae', 'jerilderie', 'julia creek', 'junction', 'jundah', 'katherine', 'kempsey', 'kerang', 'king island', 'kingscote', 'kojonup', 'kumbia', 'kununurra', 'landor', 'laura', 'laverton', 'leeton', 'leinster', 'leonora', 'lightning ridge', 'lock', 'lockhart', 'longford', 'longreach', 'louth', 'mallawa', 'manangatang', 'mansfield', 'marble bar', 'mareeba', 'maxwelton', 'mckinlay', 'meekatharra', 'mendooran', 'merriwa', 'merton', 'middlemount', 'miles', 'halidon', 'mingenew', 'mitchell', 'monto', 'moora', 'moranbah', 'moree', 'mortlake', 'moruya', 'morven', 'moulamein', 'mt barker', 'mt gambier', 'mt garnet', 'mt isa', 'mt magnet', 'mt perry', 'mudgee', 'mungery', 'mungindi', 'murtoa', 'muttaburra', 'nanango', 'naracoorte', 'narrabri', 'narrandera', 'narrogin', 'narromine', 'newman', 'nhill', 'noorama', 'norseman', 'northam', 'nowra', 'nyngan', 'oak park', 'oakley', 'orange', 'parkes', 'penola', 'penong', 'penshurst', 'pentland', 'pingrup', 'pooncarie', 'port augusta', 'port hedland', 'port lincoln', 'prairie', 'quambone', 'quilpie', 'quirindi', 'quorn', 'richmond', 'roebourne', 'roma', 'roxby downs', 'seymour', 'springsure', 'st arnaud', 'st george', 'stamford', 'stanthorpe', 'stawell', 'streaky bay', 'surat', 'swifts creek', 'tabulam', 'talmoi', 'talwood', 'tambo', 'tara', 'taroom', 'tennant creek', 'texas', 'thangool', 'tocumwal', 'tomingley', 'toodyay', 'tottenham', 'tower hill', 'towong', 'trangie', 'tullibigeal', 'tumbarumba', 'tumut', 'tuncurry', 'twin hills', 'walcha', 'walgett', 'wallabadah', 'wandoan', 'warialda', 'warra', 'warren', 'wauchope', 'wean', 'wellington', 'wentworth', 'winton aus', 'wondai', 'woolamai', 'wycheproof', 'wyndham', 'yalgoo', 'yass', 'yea', 'yeppoon', 'york', 'young']
df = df[(df['M_IsBarrierTrial'] == False) &
        df['FHSW_HorseId'].notna() &
        (df['R_RaceTime'].notna() & df['R_RaceTime'] != 0)]

merged_distances = []

for track in tracks:
    track_data = df[df['M_TName'] == track]

    filtered_track_data = track_data[~track_data['R_GoingDesc'].isin(['sand', 'synthetic'])]

    race_count_by_distance_going = track_data.groupby(['R_Distance', 'R_GoingDesc'])['FRSW_RaceId'].nunique().reset_index()

    race_ids_by_distance_going = track_data.groupby(['R_Distance', 'R_GoingDesc'])['FRSW_RaceId'].unique().reset_index()

    for index, row in race_count_by_distance_going.iterrows():
        distance = row['R_Distance']
        going_desc = row['R_GoingDesc']
        race_count = row['FRSW_RaceId']
        unique_race_ids = list(race_ids_by_distance_going[(race_ids_by_distance_going['R_Distance'] == distance) & (race_ids_by_distance_going['R_GoingDesc'] == going_desc)]['FRSW_RaceId'].values[0])

        if going_desc in ['soft', 'heavy']:
            combined_going_desc = 'soft'
        elif going_desc in ['good', 'firm', 'dead', 'unknown']:
            combined_going_desc = 'good'
        else:
            combined_going_desc = 'other'

        unique_race_times = df[df['FRSW_RaceId'].isin(unique_race_ids)]['R_RaceTime'].drop_duplicates()
        existing_entry = next((entry for entry in merged_distances if entry['Track'] == track and entry['R_Distance'] == distance and entry['Combined GoingDesc'] == combined_going_desc), None)
        if existing_entry:
            existing_entry['Race Count'] += race_count
            existing_entry['Unique RaceIds'] += unique_race_ids
            existing_entry['R_RaceTime'] += unique_race_times.tolist()
        else:
            merged_distances.append({
                'Track': track,
                'R_Distance': distance,
                'Combined GoingDesc': combined_going_desc,
                'Race Count': race_count,
                'Unique RaceIds': unique_race_ids,
                'R_RaceTime': unique_race_times.tolist()
            })

merged_entries = {}

for entry in merged_distances:
    track = entry['Track']
    distance = entry['R_Distance']
    combined_going_desc = entry['Combined GoingDesc']
    race_count = entry['Race Count']

    if combined_going_desc in ['good', 'soft'] and race_count < 20:
        key = (track, distance)

        if key in merged_entries:
            merged_entries[key]['Race Count'] += race_count
            merged_entries[key]['Unique RaceIds'].extend(entry['Unique RaceIds'])
            merged_entries[key]['R_RaceTime'].extend(entry['R_RaceTime'])
        else:
            merged_entries[key] = entry
    else:
        merged_entries[(track, distance)] = entry

merged_distances_list = list(merged_entries.values())

for entry in merged_distances_list:
    if entry['R_RaceTime']:
        avg_race_time = round(sum(entry['R_RaceTime']) / len(entry['R_RaceTime']), 2)
        entry['Average Race Time'] = avg_race_time

race_id_to_avg_time = {}
for entry in merged_distances_list:
    for race_id in entry['Unique RaceIds']:
        race_id_to_avg_time[race_id] = entry['Average Race Time']

sandsynth = []

for track in tracks:
    track_data = df[df['M_TName'] == track]

    filtered_track_data = track_data[track_data['R_GoingDesc'].isin(['sand', 'synthetic'])]

    race_count_by_going_distance = filtered_track_data.groupby(['R_Distance', 'R_GoingDesc'])['FRSW_RaceId'].nunique().reset_index()

    race_ids_by_distance_going = filtered_track_data.groupby(['R_Distance', 'R_GoingDesc'])['FRSW_RaceId'].unique().reset_index()

    for index, row in race_count_by_going_distance.iterrows():
        race_count = row['FRSW_RaceId']
        distance = row['R_Distance']
        unique_race_ids = race_ids_by_distance_going[race_ids_by_distance_going['R_Distance'] == distance]['FRSW_RaceId'].values[0]
        unique_race_times = df[df['FRSW_RaceId'].isin(unique_race_ids)]['R_RaceTime'].drop_duplicates()
        avg_race_time = round(sum(unique_race_times) / len(unique_race_times), 2)
        sandsynth.append({
            'Track': track,
            'R_Distance': distance,
            'Race Count': race_count,
            'Unique RaceIds': unique_race_ids,
            'R_RaceTime': unique_race_times.tolist(),
            'Average Race Time': avg_race_time
            })

sandsynth_avg_time_dict = {}

for entry in sandsynth:
    unique_race_ids = entry['Unique RaceIds']
    avg_race_time = entry['Average Race Time']
    for race_id in unique_race_ids:
        sandsynth_avg_time_dict[race_id] = avg_race_time

merged_avg_time_dict = {**race_id_to_avg_time, **sandsynth_avg_time_dict}

df['NonTABAvgTime'] = df['FRSW_RaceId'].map(merged_avg_time_dict)

#NonTABTime
df['NonTABTime'] = np.nan

def update_non_tabh_time(row):
    if pd.notnull(row['NonTABAvgTime']):
        if row['F_Position'] == 1:
            if pd.notnull(row['R_VerifiedRaceTime']) and row['R_VerifiedRaceTime'] != 0:
                return row['R_VerifiedRaceTime']
            elif pd.notnull(row['R_RaceTime']) and row['R_RaceTime'] != 0:
                return row['R_RaceTime']

        elif pd.notnull(row['F_Margin']) and row['F_Margin'] != 0:
            if pd.notnull(row['R_VerifiedRaceTime']) and row['R_VerifiedRaceTime'] != 0:
                return row['R_VerifiedRaceTime'] + row['F_Margin'] * 0.16
            elif pd.notnull(row['R_RaceTime']) and row['R_RaceTime'] != 0:
                return row['R_RaceTime'] + row['F_Margin'] * 0.16

    return None

df['NonTABTime'] = df.apply(update_non_tabh_time, axis=1)

df['NonTABTime'] = df['NonTABTime'].round(2)

#NonTABFinishAll

df['NonTABFinishAll'] = np.nan

df['NonTABFinishAll'] = round(df['NonTABTime'] - df['NonTABAvgTime'], 2)

#AvgLast3

df['AvgLast3'] = np.nan

for index, row in df.iterrows():
    if pd.notnull(row['StartNo']) and row['StartNo'] not in [0, 1]:
        horse_id = row['FHSW_HorseId']
        start_no = row['StartNo']

        previous_rows = df[(df['FHSW_HorseId'] == horse_id) & (df['StartNo'] < start_no)].tail(3)

        if len(previous_rows) >= 3:
            if row['RunsPrep'] != 1:
                avg_value = (
                    previous_rows['SH_FinishAll'].apply(lambda x: x if pd.notnull(x) and x > -999 else row['NonTABFinishAll']).iloc[0] * 0.6 +
                    previous_rows['SH_FinishAll'].apply(lambda x: x if pd.notnull(x) and x > -999 else row['NonTABFinishAll']).iloc[1] * 0.3 +
                    previous_rows['SH_FinishAll'].apply(lambda x: x if pd.notnull(x) and x > -999 else row['NonTABFinishAll']).iloc[2] * 0.1
                )
            else:
                avg_value = (
                    previous_rows['SH_FinishAll'].apply(lambda x: x if pd.notnull(x) and x > -999 else row['NonTABFinishAll']).iloc[0] * 0.75 +
                    previous_rows['SH_FinishAll'].apply(lambda x: x if pd.notnull(x) and x > -999 else row['NonTABFinishAll']).iloc[1] * 0.15 +
                    previous_rows['SH_FinishAll'].apply(lambda x: x if pd.notnull(x) and x > -999 else row['NonTABFinishAll']).iloc[2] * 0.1
                )

            df.at[index, 'AvgLast3'] = avg_value


#TrackBenchmark
df['TrackBenchmarks'] = np.nan

filtered_df = df[(df['FHSW_HorseId'].notna()) &
                 (df['M_IsBarrierTrial'] == False) &
                 (df['F_Position'] == 1) &
                 (df['SH_FinishAll'].between(-100, 100))]

track_avg_dict = filtered_df.groupby(['M_TName', 'R_Distance'])['SH_FinishAll'].mean().to_dict()

df['TrackBenchmarks'] = df.apply(
    lambda row: track_avg_dict.get((row['M_TName'], row['R_Distance']), np.nan)
    if not row['M_IsBarrierTrial'] else np.nan, axis=1
)

df.to_csv("Data.csv", index=False)