Guest User

Untitled

a guest
Aug 21st, 2024
451
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 49.82 KB | None | 0 0
  1. df = pd.read_csv(r'C:\Users\\Desktop\FinalMerge.csv')
  2.  
  3. #StartNo
  4. df['StartNo'] = np.nan
  5.  
  6. filtered_rows = df[(df['FHSW_HorseId'].notnull()) &
  7. (df['M_IsBarrierTrial'] == False) &
  8. (df['FHSW_HorseId'].notnull()) &
  9. (df['MH_MDate'].notnull())]
  10. filtered_rows.sort_values(by=['FHSW_HorseId', 'MH_MDate'], inplace=True)
  11.  
  12. filtered_rows['StartNo'] = filtered_rows.groupby('FHSW_HorseId').cumcount() + 1
  13.  
  14. df.loc[filtered_rows.index, 'StartNo'] = filtered_rows['StartNo']
  15.  
  16. #PFScore
  17. df['LS_PFScore'] = np.nan
  18.  
  19. for index, row in df.iterrows():
  20. if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
  21. previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
  22. (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
  23. if not previous_start_row.empty:
  24. df.at[index, 'LS_PFScore'] = previous_start_row['F_PFScore'].values[0]
  25.  
  26. print('StartNoDone')
  27.  
  28. #TP20R
  29. df['LS_TP20R'] = np.nan
  30.  
  31. for index, row in df.iterrows():
  32. if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
  33. previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
  34. (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
  35. if not previous_start_row.empty:
  36. df.at[index, 'LS_TP20R'] = previous_start_row['H_TP20R'].values[0]
  37.  
  38. #TP20P
  39. df['LS_TP20P'] = np.nan
  40.  
  41. for index, row in df.iterrows():
  42. if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
  43. previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
  44. (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
  45. if not previous_start_row.empty:
  46. df.at[index, 'LS_TP20P'] = previous_start_row['H_TP20P'].values[0]
  47.  
  48. print('TP20Done')
  49.  
  50. #RPR
  51. df['LS_RPR'] = np.nan
  52.  
  53. for index, row in df.iterrows():
  54. if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
  55. previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
  56. (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
  57. if not previous_start_row.empty:
  58. df.at[index, 'LS_RPR'] = previous_start_row['H_RPR'].values[0]
  59.  
  60. #RPP
  61. df['LS_RPP'] = np.nan
  62.  
  63. for index, row in df.iterrows():
  64. if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
  65. previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
  66. (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
  67. if not previous_start_row.empty:
  68. df.at[index, 'LS_RPP'] = previous_start_row['H_RPP'].values[0]
  69.  
  70. #WP20R
  71. df['LS_WP20R'] = np.nan
  72.  
  73. for index, row in df.iterrows():
  74. if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
  75. previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
  76. (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
  77. if not previous_start_row.empty:
  78. df.at[index, 'LS_WP20R'] = previous_start_row['H_WP20R'].values[0]
  79.  
  80. #WP20P
  81. df['LS_WP20P'] = np.nan
  82.  
  83. for index, row in df.iterrows():
  84. if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
  85. previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
  86. (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
  87. if not previous_start_row.empty:
  88. df.at[index, 'LS_WP20P'] = previous_start_row['H_WP20P'].values[0]
  89.  
  90. #WPTP20R
  91. df['LS_WPTP20R'] = np.nan
  92.  
  93. for index, row in df.iterrows():
  94. if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
  95. previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
  96. (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
  97. if not previous_start_row.empty:
  98. df.at[index, 'LS_WPTP20R'] = previous_start_row['H_WPTP20R'].values[0]
  99.  
  100. #WPTP20P
  101. df['LS_WPTP20P'] = np.nan
  102.  
  103. for index, row in df.iterrows():
  104. if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
  105. previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
  106. (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
  107. if not previous_start_row.empty:
  108. df.at[index, 'LS_WPTP20P'] = previous_start_row['H_WPTP20P'].values[0]
  109.  
  110. #ATWC
  111. df['LS_ATWC'] = np.nan
  112.  
  113. for index, row in df.iterrows():
  114. if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
  115. previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
  116. (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
  117. if not previous_start_row.empty:
  118. df.at[index, 'LS_ATWC'] = previous_start_row['H_ATWC'].values[0]
  119.  
  120. #AveATWC
  121. df['LS_AveATWC'] = np.nan
  122.  
  123. for index, row in df.iterrows():
  124. if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
  125. previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
  126. (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
  127. if not previous_start_row.empty:
  128. df.at[index, 'LS_AveATWC'] = previous_start_row['H_AveATWC'].values[0]
  129.  
  130. print('AveATWCDone')
  131.  
  132. #Position
  133. df['LS_Position'] = np.nan
  134.  
  135. for index, row in df.iterrows():
  136. if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
  137. previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
  138. (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
  139. if not previous_start_row.empty:
  140. df.at[index, 'LS_Position'] = previous_start_row['F_Position'].values[0]
  141.  
  142. #Margin
  143. df['LS_Margin'] = np.nan
  144. for index, row in df.iterrows():
  145. if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
  146. previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
  147. (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
  148. if not previous_start_row.empty:
  149. df.at[index, 'LS_Margin'] = previous_start_row['F_Margin'].values[0]
  150.  
  151. #Jumps
  152. df['LS_Jumps'] = np.nan
  153. for index, row in df.iterrows():
  154. if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
  155. previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
  156. (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
  157. if not previous_start_row.empty:
  158. df.at[index, 'LS_Jumps'] = 1 if previous_start_row['R_Jumps'].values[0] else 0
  159. #NonTAB
  160. df['LS_TABMeeting'] = np.nan
  161. for index, row in df.iterrows():
  162. if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
  163. previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
  164. (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
  165. if not previous_start_row.empty:
  166. df.at[index, 'LS_TABMeeting'] = 1 if previous_start_row['M_TABMeeting'].values[0] else 0
  167.  
  168. #PrizeMoneyNormalised
  169. df['PrizeMoneyNormalised'] = np.nan
  170.  
  171. filtered_df = df[df['F_Position'] != '99']
  172.  
  173. max_f_position = filtered_df.groupby('FRSW_RaceId')['F_Position'].max().reset_index()
  174. max_f_position.rename(columns={'F_Position': 'Max_F_Position'}, inplace=True)
  175.  
  176. df = df.merge(max_f_position, on='FRSW_RaceId', how='left')
  177.  
  178. valid_rows = (~df['R_PrizeMoney'].isna() &
  179. ~df['R_Starters'].isna() &
  180. ~df['F_Position'].isna() &
  181. ~df['FRSW_RaceId'].isna() &
  182. (df['F_Position'] != '99'))
  183.  
  184. df.loc[valid_rows, 'PrizeMoneyNormalised'] = df.loc[valid_rows, 'R_PrizeMoney'] * (1 - df.loc[valid_rows, 'F_Position'] / df.loc[valid_rows, 'Max_F_Position'])
  185.  
  186. mask = ~df['FRSW_RaceId'].isna()
  187.  
  188. df.loc[mask, 'PrizeMoneyNormalised'] = df.loc[mask].groupby('FRSW_RaceId')['PrizeMoneyNormalised'].transform(
  189. lambda x: x / x.sum() * df.loc[x.index, 'R_PrizeMoney'].iloc[0] if not x.isna().all() and x.sum() != 0 else pd.NA)
  190.  
  191. df['PrizeMoneyNormalised'] = df['PrizeMoneyNormalised'].apply(lambda x: round(x) if pd.notna(x) else pd.NA).astype('Int64')
  192.  
  193. df.drop(columns=['Max_F_Position'], inplace=True)
  194.  
  195. #LSPrizeMoneyNormalised
  196. df['LS_PrizeMoneyNormalised'] = np.nan
  197. for index, row in df.iterrows():
  198. if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
  199. previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
  200. (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
  201. if not previous_start_row.empty:
  202. df.at[index, 'LS_PrizeMoneyNormalised'] = previous_start_row['PrizeMoneyNormalised'].values[0]
  203.  
  204. #BFSP
  205. df['LS_BFSP'] = np.nan
  206. for index, row in df.iterrows():
  207. if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
  208. previous_start_row = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
  209. (df['StartNo'] < row['StartNo'])].sort_values('StartNo', ascending=False).head(1)
  210. if not previous_start_row.empty:
  211. df.at[index, 'LS_BFSP'] = previous_start_row['F_BFSP'].values[0] if not pd.isnull(previous_start_row['F_BFSP'].values[0]) else previous_start_row['H_BFSP'].values[0]
  212.  
  213. #TabNo
  214. df['LS_TabNo'] = None
  215.  
  216. for index, row in df.iterrows():
  217. current_start_no = row['StartNo']
  218. current_horse_id = row['FHSW_HorseId']
  219.  
  220. previous_start = df[(df['StartNo'] < current_start_no) & (df['FHSW_HorseId'] == current_horse_id)].tail(1)
  221.  
  222. if not previous_start.empty:
  223. previous_tab_no = previous_start['F_TABNo'].values[0]
  224. df.at[index, 'LS_TabNo'] = previous_tab_no
  225.  
  226. #PrizeMoneyBreakDown
  227. df['R_PrizeMoneyBreakDown'] = df['R_PrizeMoneyBreakDown'].astype(str)
  228.  
  229. df = df[df['F_Position'].notna() & (df['F_Position'] != '') & df['R_PrizeMoneyBreakDown'].notna() & (df['R_PrizeMoneyBreakDown'] != '')]
  230.  
  231. def create_prize_money_dict(row):
  232. entries = row.split(';')
  233. prize_money_dict = {}
  234. for entry in entries:
  235. pos_value = entry.split(',')
  236. if len(pos_value) != 2:
  237. continue
  238. pos, value = pos_value
  239. pos = ''.join(filter(str.isdigit, pos))
  240. if pos:
  241. prize_money_dict[int(pos)] = int(value)
  242. return prize_money_dict
  243.  
  244. df['PrizeMoney'] = df.apply(lambda row: create_prize_money_dict(row['R_PrizeMoneyBreakDown']).get(int(row['F_Position']), np.nan), axis=1)
  245.  
  246. #AvgCareerPrizeMoney
  247. df['AvgCareerPrizeMoney'] = np.nan
  248.  
  249. for index, row in df.iterrows():
  250. if row['FHSW_HorseId']:
  251. previous_starts = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
  252. (df['StartNo'] < row['StartNo'])]['PrizeMoneyNormalised']
  253.  
  254. if not previous_starts.empty:
  255. avg_career_prize_money = previous_starts.mean()
  256.  
  257. df.at[index, 'AvgCareerPrizeMoney'] = avg_career_prize_money
  258.  
  259. #CareerWins
  260. df['CareerWins'] = np.nan
  261.  
  262. for index, row in df.iterrows():
  263. if not pd.isna(row['StartNo']) and row['StartNo'] != 0:
  264. horse_id = row['FHSW_HorseId']
  265. start_no = row['StartNo']
  266.  
  267. previous_starts = df[(df['FHSW_HorseId'] == horse_id) & (df['StartNo'] < start_no)]
  268.  
  269. if not previous_starts.empty:
  270. career_wins = previous_starts[previous_starts['F_Position'] == 1].shape[0]
  271.  
  272. df.at[index, 'CareerWins'] = career_wins
  273.  
  274. #RacesSinceLastWin
  275. df['RacesSinceLastWin'] = np.nan
  276.  
  277. for index, row in df.iterrows():
  278. if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
  279. horse_id = row['FHSW_HorseId']
  280. start_no = row['StartNo']
  281.  
  282. previous_wins = df[(df['FHSW_HorseId'] == horse_id) & (df['StartNo'] < start_no) & (df['F_Position'] == 1)]
  283.  
  284. if not previous_wins.empty:
  285. most_recent_win = previous_wins.iloc[-1]
  286.  
  287. races_since_last_win = start_no - most_recent_win['StartNo'] - 1
  288. else:
  289. races_since_last_win = 0
  290.  
  291. df.at[index, 'RacesSinceLastWin'] = races_since_last_win
  292.  
  293. #AvgCareerBFSP
  294. df['AvgCareerBFSP'] = np.nan
  295.  
  296. for index, row in df.iterrows():
  297. if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
  298. previous_starts = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) &
  299. (df['StartNo'] < row['StartNo'])]
  300.  
  301. if not previous_starts.empty:
  302. previous_bfsp_values = previous_starts['F_BFSP'].astype(float)
  303.  
  304. previous_bfsp_values = previous_bfsp_values[previous_bfsp_values != 0.0]
  305.  
  306. if not previous_bfsp_values.empty:
  307. avg_career_bfsp = previous_bfsp_values.mean()
  308.  
  309. df.at[index, 'AvgCareerBFSP'] = avg_career_bfsp
  310.  
  311. #DaysBetweenLastStart
  312. df['DaysBetweenStarts'] = np.nan
  313.  
  314. for index, row in df.iterrows():
  315. if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
  316. current_start_no = row['StartNo']
  317. horse_id = row['FHSW_HorseId']
  318. previous_row = df[(df['FHSW_HorseId'] == horse_id) &
  319. (df['StartNo'] < current_start_no)].tail(1)
  320.  
  321. if not previous_row.empty:
  322. previous_start_date = previous_row['MH_MDate'].values[0]
  323. current_start_date = row['MH_MDate']
  324.  
  325. days_between_starts = (current_start_date - previous_start_date).days
  326.  
  327. df.at[index, 'DaysBetweenStarts'] = days_between_starts
  328.  
  329. #AgeinDays
  330. df['AgeInDays'] = np.nan
  331.  
  332. for index, row in df.iterrows():
  333. if pd.isnull(row['MH_MDate']) or pd.isnull(row['F_FoalDate']):
  334. continue
  335.  
  336. age_in_days = (row['MH_MDate'] - row['F_FoalDate']).days
  337. df.at[index, 'AgeInDays'] = age_in_days
  338.  
  339. #AgeinHorseYears
  340. df['AgeHorseYears'] = 0
  341.  
  342. for index, row in df.iterrows():
  343. if pd.isnull(row['F_FoalDate']):
  344. continue
  345.  
  346. age_in_days = (row['MH_MDate'] - pd.Timestamp(f"{row['F_FoalDate'].year}-08-01")).days
  347.  
  348. age_in_years = age_in_days // 365
  349.  
  350. df.at[index, 'AgeHorseYears'] = age_in_years
  351.  
  352. #StartsbyAge
  353. df['StartsYr1'] = 0
  354. df['StartsYr2'] = 0
  355. df['StartsYr3'] = 0
  356.  
  357. for horse_id, group in df[df['AgeHorseYears'] == 1].groupby('FHSW_HorseId'):
  358. starts_yr1 = group['StartNo'].max() - group['StartNo'].min() + 1
  359. df.loc[df['FHSW_HorseId'] == horse_id, 'StartsYr1'] = starts_yr1
  360.  
  361. df.loc[df['AgeHorseYears'] >= 2, 'StartsYr1'] = df['StartsYr1'].replace(0, method='ffill')
  362.  
  363. for horse_id, group in df[df['AgeHorseYears'] == 2].groupby('FHSW_HorseId'):
  364. starts_yr2 = group['StartNo'].max() - group['StartNo'].min() + 1
  365. df.loc[df['FHSW_HorseId'] == horse_id, 'StartsYr2'] = starts_yr2
  366.  
  367. df.loc[df['AgeHorseYears'] >= 3, 'StartsYr2'] = df['StartsYr2'].replace(0, method='ffill')
  368.  
  369. for horse_id, group in df[df['AgeHorseYears'] == 3].groupby('FHSW_HorseId'):
  370. starts_yr3 = group['StartNo'].max() - group['StartNo'].min() + 1
  371. df.loc[df['FHSW_HorseId'] == horse_id, 'StartsYr3'] = starts_yr3
  372.  
  373. df.loc[df['AgeHorseYears'] >= 4, 'StartsYr3'] = df['StartsYr3'].replace(0, method='ffill')
  374.  
  375. #RunsPrep
  376. df['RunsPrep'] = np.nan
  377.  
  378. for horse_id in df['FHSW_HorseId'].unique():
  379. horse_df = df[(df['FHSW_HorseId'] == horse_id) & (df['StartNo'].notnull()) & (df['StartNo'] != 0)].sort_values('MH_MDate')
  380.  
  381. prep_counter = 1
  382. last_race_date = None
  383.  
  384. for index, row in horse_df.iterrows():
  385. if row['M_IsBarrierTrial'] and prep_counter > 1:
  386. continue
  387.  
  388. current_date = row['MH_MDate']
  389.  
  390. if last_race_date is None or (current_date - last_race_date).days <= 35:
  391. df.at[index, 'RunsPrep'] = prep_counter
  392. else:
  393. prep_counter += 1
  394. df.at[index, 'RunsPrep'] = prep_counter
  395.  
  396. if not row['M_IsBarrierTrial']:
  397. last_race_date = current_date
  398.  
  399. #FieldSize
  400. df['FieldSize%Change'] = np.nan
  401.  
  402. for index, row in df.iterrows():
  403. if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
  404. previous_start = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) & (df['StartNo'] < row['StartNo'])]
  405.  
  406. if not previous_start.empty:
  407. previous_start = previous_start.iloc[-1]
  408. percentage_change = ((row['R_Starters'] - previous_start['R_Starters']) / previous_start['R_Starters']) * 100
  409. df.at[index, 'FieldSize%Change'] = percentage_change
  410.  
  411. #FieldBeaten%
  412. df['FieldBeaten%'] = np.nan
  413.  
  414. last_field_beaten_percentage = {}
  415.  
  416. for index, row in df.iterrows():
  417. if not pd.isna(row['StartNo']) and row['StartNo'] not in (0, 1):
  418. if row['F_Position'] == 99:
  419. df.at[index, 'FieldBeaten%'] = 0
  420. last_field_beaten_percentage[row['FHSW_HorseId']] = 0
  421. else:
  422. if row['R_Starters'] > 1:
  423. field_beaten_percentage = 1 - ((row['F_Position'] - 1) / (row['R_Starters'] - 1))
  424. else:
  425. field_beaten_percentage = np.nan
  426.  
  427. if row['FHSW_HorseId'] in last_field_beaten_percentage:
  428. df.at[index, 'FieldBeaten%'] = last_field_beaten_percentage[row['FHSW_HorseId']]
  429. else:
  430. df.at[index, 'FieldBeaten%'] = field_beaten_percentage
  431.  
  432. last_field_beaten_percentage[row['FHSW_HorseId']] = field_beaten_percentage
  433.  
  434. #DistanceChange
  435. df['Distance%Change'] = np.nan
  436.  
  437. for index, row in df.iterrows():
  438. if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
  439. previous_starts = df[(df['FHSW_HorseId'] == row['FHSW_HorseId']) & (df['StartNo'] < row['StartNo'])]
  440.  
  441. if not previous_starts.empty:
  442. previous_start = previous_starts.tail(1)
  443. previous_start_distance = previous_start['R_Distance'].values[0]
  444. current_start_distance = row['R_Distance']
  445. starters_change = row['R_Starters'] - previous_start['R_Starters'].values[0]
  446. percentage_change = (starters_change / previous_start['R_Starters'].values[0]) * 100
  447. df.at[index, 'Distance%Change'] = percentage_change
  448.  
  449. #DistancePreference
  450. df['DistancePreference'] = np.nan
  451.  
  452. for index, row in df.iterrows():
  453. if row['StartNo'] not in [0, '1', '0']:
  454. horse_id = row['FHSW_HorseId']
  455.  
  456. previous_rows = df[(df['FHSW_HorseId'] == horse_id) & (df['RowNo'] < row['RowNo'])]
  457.  
  458. within_15_percent_rows = previous_rows[((row['R_Distance'] - previous_rows['R_Distance']) / previous_rows['R_Distance']) * 100 <= 15]
  459.  
  460. if within_15_percent_rows.empty:
  461. df.at[index, 'DistancePreference'] = 0
  462. else:
  463. if any((prev_row['FieldBeaten%'] > 0.5 or prev_row['F_Margin'] < 4) for _, prev_row in within_15_percent_rows.iterrows()):
  464. df.at[index, 'DistancePreference'] = 1
  465. else:
  466. df.at[index, 'DistancePreference'] = 0
  467.  
  468. #NewDistance
  469. df['NewDistance'] = np.nan
  470.  
  471. for index, row in df.iterrows():
  472. if not pd.isna(row['StartNo']) and row['StartNo'] not in [0, 1]:
  473. horse_id = row['FHSW_HorseId']
  474. current_start = row['R_Distance']
  475.  
  476. previous_rows = df[(df['FHSW_HorseId'] == horse_id) & (df['StartNo'] < row['StartNo'])]
  477.  
  478. if not previous_rows.empty:
  479. valid_rows = previous_rows[previous_rows['R_Distance'] > current_start]
  480.  
  481. if not valid_rows.empty:
  482. df.at[index, 'NewDistance'] = 0
  483. else:
  484. closest_row_index = (previous_rows['R_Distance'] - current_start).abs().idxmin()
  485. closest_row = previous_rows.loc[[closest_row_index]]
  486.  
  487. percentage_change = ((current_start - closest_row['R_Distance'].iloc[0]) / closest_row['R_Distance'].iloc[0]) * 100
  488.  
  489. if percentage_change > 15:
  490. df.at[index, 'NewDistance'] = 1
  491.  
  492. #Weight per 100m
  493. df['WeightPer100m'] = np.nan
  494.  
  495. mask = (df['R_Distance'].astype(bool)) & (df['F_Weight'].astype(bool))
  496. df.loc[mask, 'WeightPer100m'] = (df.loc[mask, 'R_Distance'] / 100) / df.loc[mask, 'F_Weight']
  497.  
  498. df['WeightPer100m'].fillna(0.0, inplace=True)
  499.  
  500. #Weight of horse minus average weight of other horses in the race
  501.  
  502. valid_weights = df[(df['StartNo'].notnull()) & (~df['StartNo'].isin([0, 1]))]
  503.  
  504. average_weight_dict = valid_weights.groupby('FRSW_RaceId')['F_Weight'].mean().to_dict()
  505.  
  506. df['RelativeWeight'] = df.apply(lambda row: row['F_Weight'] - average_weight_dict.get(row['FRSW_RaceId'], 0), axis=1)
  507.  
  508. #Weight drop/rise between this start and last start
  509. df['WeightChange'] = np.nan
  510.  
  511. for index, row in df.iterrows():
  512. if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
  513. horse_id = row['FHSW_HorseId']
  514. current_start_no = row['StartNo']
  515.  
  516. previous_start = df[(df['FHSW_HorseId'] == horse_id) & (df['StartNo'] < current_start_no)].last_valid_index()
  517.  
  518. if previous_start is not None:
  519. current_distance = row['R_Distance']
  520. previous_distance = df.at[previous_start, 'R_Distance']
  521.  
  522. weight_change = row['F_Weight'] - df.at[previous_start, 'F_Weight']
  523.  
  524. df.at[index, 'WeightChange'] = weight_change
  525.  
  526. #Career Relative Weight
  527. df['CareerRelativeWeight'] = np.nan
  528.  
  529. for index, row in df.iterrows():
  530. current_start_no = row['StartNo']
  531. current_horse_id = row['FHSW_HorseId']
  532.  
  533. previous_starts = df[(df['StartNo'] < current_start_no) & (df['FHSW_HorseId'] == current_horse_id)]
  534.  
  535. if not previous_starts.empty:
  536. average_weight_previous_starts = previous_starts['F_Weight'].mean()
  537.  
  538. career_relative_weight = ((row['F_Weight'] - average_weight_previous_starts) / average_weight_previous_starts) * 100
  539.  
  540. df.at[index, 'CareerRelativeWeight'] = career_relative_weight
  541.  
  542. #TrackCondition
  543. conditions = ['Good to Good', 'Good to Soft', 'Good to Heavy', 'Good to Synthetic',
  544. 'Soft to Good', 'Soft to Soft', 'Soft to Heavy', 'Soft to Synthetic',
  545. 'Heavy to Good', 'Heavy to Soft', 'Heavy to Heavy', 'Heavy to Synthetic',
  546. 'Synthetic to Good', 'Synthetic to Soft', 'Synthetic to Heavy', 'Synthetic to Synthetic']
  547.  
  548. for condition in conditions:
  549. df[condition] = 0.0
  550.  
  551. for index, row in df.iterrows():
  552. if pd.isna(row['StartNo']) or row['StartNo'] in (1, 0):
  553. for condition in conditions:
  554. df.at[index, condition] = np.nan
  555. else:
  556. horse_id = row['FHSW_HorseId']
  557. current_start_no = row['StartNo']
  558.  
  559. previous_start = df[(df['FHSW_HorseId'] == horse_id) & (df['StartNo'] < current_start_no)].sort_values('StartNo', ascending=False).head(1)
  560.  
  561. if not previous_start.empty:
  562. previous_start = previous_start.head(1)
  563. previous_condition = previous_start['R_GoingDesc'].values[0]
  564. current_condition = row['R_GoingDesc']
  565.  
  566. previous_condition = previous_condition.lower()
  567. current_condition = current_condition.lower()
  568.  
  569. condition_key = f"{previous_condition.capitalize()} to {current_condition.capitalize()}"
  570. if condition_key in conditions:
  571. df.at[index, condition_key] = 1
  572. if previous_condition == 'good' and current_condition == 'good':
  573. df.at[index, 'Good to Good'] = 1
  574. elif previous_condition == 'good' and current_condition == 'soft':
  575. df.at[index, 'Good to Soft'] = 1
  576. elif previous_condition == 'good' and current_condition == 'heavy':
  577. df.at[index, 'Good to Heavy'] = 1
  578. elif previous_condition == 'good' and current_condition == 'synethic':
  579. df.at[index, 'Good to Synthetic'] = 1
  580. elif previous_condition == 'soft' and current_condition == 'good':
  581. df.at[index, 'Soft to Good'] = 1
  582. elif previous_condition == 'soft' and current_condition == 'soft':
  583. df.at[index, 'Soft to Soft'] = 1
  584. elif previous_condition == 'soft' and current_condition == 'heavy':
  585. df.at[index, 'Soft to Heavy'] = 1
  586. elif previous_condition == 'soft' and current_condition == 'synthetic':
  587. df.at[index, 'Soft to Synthetic'] = 1
  588. elif previous_condition == 'heavy' and current_condition == 'good':
  589. df.at[index, 'Heavy to Good'] = 1
  590. elif previous_condition == 'heavy' and current_condition == 'soft':
  591. df.at[index, 'Heavy to Soft'] = 1
  592. elif previous_condition == 'heavy' and current_condition == 'heavy':
  593. df.at[index, 'Heavy to Heavy'] = 1
  594. elif previous_condition == 'heavy' and current_condition == 'synthetic':
  595. df.at[index, 'Heavy to Synthetic'] = 1
  596. elif previous_condition == 'synthetic' and current_condition == 'good':
  597. df.at[index, 'Synthetic to Good'] = 1
  598. elif previous_condition == 'synthetic' and current_condition == 'soft':
  599. df.at[index, 'Synthetic to Soft'] = 1
  600. elif previous_condition == 'synthetic' and current_condition == 'heavy':
  601. df.at[index, 'Synthetic to Heavy'] = 1
  602. elif previous_condition == 'synthetic' and current_condition == 'synthetic':
  603. df.at[index, 'Synthetic to Synthetic'] = 1
  604.  
  605. #Surface Preference
  606. df['SurfacePreference'] = np.nan
  607.  
  608. for index, row in df.iterrows():
  609. if not pd.isna(row['StartNo']) and row['StartNo'] not in [0, 1]:
  610. horse_id = row['FHSW_HorseId']
  611.  
  612. previous_rows = df[(df['FHSW_HorseId'] == horse_id) & (df['RowNo'] < row['RowNo'])]
  613.  
  614. same_surface_rows = previous_rows[previous_rows['R_GoingDesc'] == row['R_GoingDesc']]
  615.  
  616. if same_surface_rows.empty:
  617. df.at[index, 'SurfacePreference'] = 0
  618. else:
  619. if any((prev_row['FieldBeaten%'] > 0.5 or prev_row['F_Margin'] < 4) for _, prev_row in same_surface_rows.iterrows()):
  620. df.at[index, 'SurfacePreference'] = 1
  621. else:
  622. df.at[index, 'SurfacePreference'] = 0
  623.  
  624. #New Surface
  625. df['NewSurface'] = np.nan
  626.  
  627. for index, row in df.iterrows():
  628. if not pd.isna(row['StartNo']) and row['StartNo'] not in [0, 1]:
  629. horse_id = row['FHSW_HorseId']
  630. current_surface = row['R_GoingDesc']
  631.  
  632. previous_rows = df[(df['FHSW_HorseId'] == horse_id) & (df['StartNo'] < row['StartNo'])]
  633.  
  634. if not previous_rows.empty:
  635. matching_surface_rows = previous_rows[previous_rows['R_GoingDesc'] == current_surface]
  636.  
  637. if not matching_surface_rows.empty:
  638. df.at[index, 'NewSurface'] = 0
  639. else:
  640. df.at[index, 'NewSurface'] = 1
  641.  
  642. #Track Average Prize Money
  643. df = df[df['M_IsBarrierTrial'] == False]
  644.  
  645. df['TrackAvgPrizeMoney'] = np.nan
  646.  
  647. for index, row in df.iterrows():
  648. if row['RowNo'] >= 2463450:
  649. M_TName = row['M_TName']
  650.  
  651. unique_race_ids = df[(df['RowNo'] < row['RowNo']) & (df['M_TName'] == M_TName)]['FRSW_RaceId'].unique()
  652.  
  653. total_prize_money = df[(df['RowNo'] < row['RowNo']) & (df['M_TName'] == M_TName) & df['FRSW_RaceId'].isin(unique_race_ids)].drop_duplicates(subset=['M_TName', 'FRSW_RaceId'])['R_PrizeMoney'].sum()
  654.  
  655. unique_race_count = len(unique_race_ids)
  656.  
  657. if unique_race_count != 0:
  658. avg_prize_money = total_prize_money / unique_race_count
  659. else:
  660. avg_prize_money = 0.0
  661.  
  662. df.at[index, 'TrackAvgPrizeMoney'] = avg_prize_money
  663.  
  664. df = df.reset_index(drop=True)
  665.  
  666. #Cover
  667. df['Cover1200'] = np.nan
  668. df['Cover1000'] = np.nan
  669. df['Cover800'] = np.nan
  670. df['Cover600'] = np.nan
  671. df['Cover400'] = np.nan
  672. df['Cover200'] = np.nan
  673.  
  674. distance_columns = [1200, 1000, 800, 600, 400, 200]
  675. for distance in distance_columns:
  676. df[f'Cover{distance}'] = np.nan
  677.  
  678. for distance in distance_columns:
  679. pos_column = f'SH_POS{distance}'
  680. wides_column = f'W_Wides{distance}'
  681. cover_column = f'Cover{distance}'
  682.  
  683. mask = (
  684. df[wides_column].notna() &
  685. (df[wides_column] > 1) &
  686. df[pos_column].notna() &
  687. (df[pos_column] > 1) &
  688. df['StartNo'].notna() &
  689. ~df['StartNo'].isin([0, 1])
  690. )
  691.  
  692. for index, row in df[mask].iterrows():
  693. race_id = row['FRSW_RaceId']
  694. horse_wides = row[wides_column]
  695.  
  696. row_1 = df[(df['FRSW_RaceId'] == race_id) & (df[pos_column] == 1)]
  697. df.loc[row_1.index, cover_column] = 0
  698.  
  699. for i in range(2, int(df[df['FRSW_RaceId'] == race_id][pos_column].max()) + 1):
  700. other_horses = df[(df['FRSW_RaceId'] == race_id) & (df[pos_column] > i)]
  701.  
  702. for _, other_row in other_horses.iterrows():
  703. other_wides = other_row[wides_column]
  704.  
  705. if horse_wides == other_wides:
  706. df.loc[index, cover_column] = 1
  707. break
  708. else:
  709. df.loc[index, cover_column] = 0
  710.  
  711. df.loc[
  712. ((df[wides_column] == 0) | (df[wides_column] == 99) | (df[pos_column] == 0) | (df[pos_column] == 25)),
  713. cover_column
  714. ] = 0
  715.  
  716. #Jockey Win Rate
  717. df['JockeyWin%L125'] = np.nan
  718. for index, row in df.iterrows():
  719. current_row_no = row['RowNo']
  720. current_jockey_id = row['F_JockeyId']
  721.  
  722. previous_rows = df[(df['RowNo'] < current_row_no) &
  723. (df['F_JockeyId'] == current_jockey_id) &
  724. ~df['FHSW_HorseId'].isnull() &
  725. (df['M_IsBarrierTrial'] == False)].tail(125)
  726.  
  727. win_count = previous_rows[previous_rows['F_Position'] == 1].shape[0]
  728.  
  729. if len(previous_rows) == 125:
  730. win_strike_rate = win_count / 125.0
  731. df.at[index, 'JockeyWin%L125'] = win_strike_rate
  732. else:
  733. df.at[index, 'JockeyWin%L125'] = np.nan
  734.  
  735.  
  736. #Jockey Place Rate
  737. df['JockeyPlace%L250'] = np.nan
  738.  
  739. for index, row in df.iterrows():
  740. current_row_no = row['RowNo']
  741. current_jockey_id = row['F_JockeyId']
  742.  
  743. previous_rows = df[(df['RowNo'] < current_row_no) &
  744. (df['F_JockeyId'] == current_jockey_id) &
  745. ~df['FHSW_HorseId'].isnull() &
  746. (df['M_IsBarrierTrial'] == False)].tail(250)
  747.  
  748. place_count = previous_rows[previous_rows['F_Position'].isin([1, 2, 3])].shape[0]
  749.  
  750. if len(previous_rows) == 250:
  751. place_strike_rate = place_count / 250.0
  752. df.at[index, 'JockeyPlace%L250'] = place_strike_rate
  753. else:
  754. df.at[index, 'JockeyPlace%L250'] = np.nan
  755.  
  756. #Jockey Prize Money
  757. df['JockeyPrizeMoneyL125'] = np.nan
  758.  
  759. for index, row in df.iterrows():
  760. current_row_no = row['RowNo']
  761. current_jockey_id = row['F_JockeyId']
  762.  
  763. previous_rows = df[(df['RowNo'] < current_row_no) &
  764. (df['F_JockeyId'] == current_jockey_id) &
  765. ~df['FHSW_HorseId'].isnull() &
  766. (df['M_IsBarrierTrial'] == False)].tail(250)
  767.  
  768.  
  769. if len(previous_rows) >= 125:
  770. prize_money_sum = previous_rows['PrizeMoneyNormalised'].sum()
  771.  
  772. df.at[index, 'JockeyPrizeMoneyL125'] = prize_money_sum
  773.  
  774. #Jockey Prize Money Annual
  775. df['JockeyPrizeMoneyAnnual'] = np.nan
  776.  
  777. for index, row in df.iterrows():
  778. current_jockey_id = row['F_JockeyId']
  779. current_date = row['MH_MDate']
  780.  
  781. last_year_rows = df[(df['F_JockeyId'] == current_jockey_id) &
  782. (df['MH_MDate'] >= current_date - pd.DateOffset(years=1)) &
  783. (df['MH_MDate'] < current_date) &
  784. (~df['FHSW_HorseId'].isnull()) &
  785. (df['M_IsBarrierTrial'] == False)]
  786.  
  787. if not last_year_rows.empty:
  788. annual_prize_money = last_year_rows['PrizeMoneyNormalised'].sum()
  789. df.at[index, 'JockeyPrizeMoneyAnnual'] = annual_prize_money
  790. else:
  791. df.at[index, 'JockeyPrizeMoneyAnnual'] = np.nan
  792.  
  793. #Jockey Position
  794. df['AverageJockeyPosition'] = np.nan
  795.  
  796. for index, row in df.iterrows():
  797. current_row_no = row['RowNo']
  798. current_jockey_id = row['F_JockeyId']
  799.  
  800. previous_rows = df[(df['RowNo'] < current_row_no) &
  801. (df['F_JockeyId'] == current_jockey_id) &
  802. (~df['FHSW_HorseId'].isnull()) &
  803. (df['M_IsBarrierTrial'] == False)]
  804.  
  805. if not previous_rows.empty:
  806. average_jockey_position = previous_rows['H_AHSP'].mean()
  807. df.at[index, 'AverageJockeyPosition'] = average_jockey_position
  808.  
  809.  
  810. #Jockey Streak
  811. df['JockeyStreak'] = np.nan
  812.  
  813. for index, row in df.iterrows():
  814. if not pd.isna(row['FHSW_HorseId']) and row['M_IsBarrierTrial'] == False:
  815. meeting_id = row['FM_MeetingId']
  816. race_id = row['FRSW_RaceId']
  817.  
  818. meeting_rows = df[(df['FM_MeetingId'] == meeting_id) & (df['FRSW_RaceId'] <= race_id)]
  819.  
  820. jockey_wins = meeting_rows[meeting_rows['F_Position'] == 1].groupby('F_JockeyId').size()
  821.  
  822. current_race_jockeys = df[(df['FM_MeetingId'] == meeting_id) & (df['FRSW_RaceId'] == race_id)]['F_JockeyId'].unique()
  823.  
  824. for jockey_id in current_race_jockeys:
  825. if jockey_id in jockey_wins:
  826. wins_count = jockey_wins[jockey_id]
  827. else:
  828. wins_count = 0
  829.  
  830. df.loc[(df['FM_MeetingId'] == meeting_id) & (df['FRSW_RaceId'] == race_id) & (df['F_JockeyId'] == jockey_id), 'JockeyStreak'] = wins_count
  831.  
  832. #Same Jockey
  833. df['SameJockey'] = np.nan
  834.  
  835. for index, row in df.iterrows():
  836. current_row_no = row['RowNo']
  837. current_jockey_id = row['F_JockeyId']
  838. current_horse_id = row['FHSW_HorseId']
  839.  
  840. previous_start = df[(df['RowNo'] < current_row_no) &
  841. (df['FHSW_HorseId'] == current_horse_id) &
  842. (~df['FHSW_HorseId'].isnull()) &
  843. (df['M_IsBarrierTrial'] == False)].tail(1)
  844.  
  845. if previous_start.empty:
  846. df.at[index, 'SameJockey'] = np.nan
  847. else:
  848. previous_jockey_id = previous_start['F_JockeyId'].values[0]
  849.  
  850. if previous_jockey_id == current_jockey_id:
  851. df.at[index, 'SameJockey'] = 1
  852. else:
  853. df.at[index, 'SameJockey'] = 0
  854.  
  855. #Trainer Win Rate
  856. df['TrainerWin%L100'] = np.nan
  857.  
  858. for index, row in df.iterrows():
  859. current_row_no = row['RowNo']
  860. current_trainer_id = row['F_TrainerId']
  861.  
  862. previous_rows = df[(df['RowNo'] < current_row_no) &
  863. (df['F_TrainerId'] == current_trainer_id) &
  864. ~df['FHSW_HorseId'].isnull() &
  865. (df['M_IsBarrierTrial'] == False)].tail(100)
  866.  
  867. win_count = previous_rows[previous_rows['F_Position'] == 1].shape[0]
  868.  
  869. if len(previous_rows) == 100:
  870. win_strike_rate = win_count / 100.0
  871. df.at[index, 'TrainerWin%L100'] = win_strike_rate
  872. else:
  873. df.at[index, 'TrainerWin%L100'] = np.nan
  874.  
  875.  
  876. #Trainer Place Rate
  877. df['TrainerPlace%L225'] = np.nan
  878.  
  879. for index, row in df.iterrows():
  880. current_row_no = row['RowNo']
  881. current_trainer_id = row['F_TrainerId']
  882.  
  883. previous_rows = df[(df['RowNo'] < current_row_no) &
  884. (df['F_TrainerId'] == current_trainer_id) &
  885. ~df['FHSW_HorseId'].isnull() &
  886. (df['M_IsBarrierTrial'] == False)].tail(225)
  887.  
  888. place_count = previous_rows[previous_rows['F_Position'].isin([1, 2, 3])].shape[0]
  889.  
  890. if len(previous_rows) == 225:
  891. place_strike_rate = place_count / 225.0
  892. df.at[index, 'TrainerPlace%L225'] = place_strike_rate
  893. else:
  894. df.at[index, 'TrainerPlace%L225'] = np.nan
  895.  
  896. #Trainer Prize Money
  897. df['TrainerPrizeMoneyL100'] = np.nan
  898.  
  899. for index, row in df.iterrows():
  900. current_row_no = row['RowNo']
  901. current_trainer_id = row['F_TrainerId']
  902.  
  903. previous_rows = df[(df['RowNo'] < current_row_no) &
  904. (df['F_TrainerId'] == current_trainer_id) &
  905. ~df['FHSW_HorseId'].isnull() &
  906. (df['M_IsBarrierTrial'] == False)].tail(100)
  907. if len(previous_rows) >= 100:
  908. prize_money_sum = previous_rows['PrizeMoneyNormalised'].sum()
  909.  
  910. df.at[index, 'TrainerPrizeMoneyL100'] = prize_money_sum
  911.  
  912. #Trainer Prize Money Annual
  913. df['TrainerPrizeMoneyAnnual'] = np.nan
  914.  
  915. for index, row in df.iterrows():
  916. current_trainer_id = row['F_TrainerId']
  917. current_date = row['MH_MDate']
  918.  
  919. last_year_rows = df[(df['F_TrainerId'] == current_trainer_id) &
  920. (df['MH_MDate'] >= current_date - pd.DateOffset(years=1)) &
  921. (df['MH_MDate'] < current_date) &
  922. (~df['FHSW_HorseId'].isnull()) &
  923. (df['M_IsBarrierTrial'] == False)]
  924.  
  925. if not last_year_rows.empty:
  926. annual_prize_money = last_year_rows['PrizeMoneyNormalised'].sum()
  927. df.at[index, 'TrainerPrizeMoneyAnnual'] = annual_prize_money
  928. else:
  929. df.at[index, 'TrainerPrizeMoneyAnnual'] = np.nan
  930.  
  931.  
  932. #Trainer Streak
  933. df['TrainerStreak'] = np.nan
  934.  
  935. for index, row in df.iterrows():
  936. if not pd.isna(row['FHSW_HorseId']) and row['M_IsBarrierTrial'] == False:
  937. meeting_id = row['FM_MeetingId']
  938. race_id = row['FRSW_RaceId']
  939.  
  940. meeting_rows = df[(df['FM_MeetingId'] == meeting_id) & (df['FRSW_RaceId'] <= race_id)]
  941.  
  942. trainer_wins = meeting_rows[meeting_rows['F_Position'] == 1].groupby('F_TrainerId').size()
  943.  
  944. current_race_trainers = df[(df['FM_MeetingId'] == meeting_id) & (df['FRSW_RaceId'] == race_id)]['F_TrainerId'].unique()
  945.  
  946. for trainer_id in current_race_trainers:
  947. if trainer_id in trainer_wins:
  948. wins_count = trainer_wins[trainer_id]
  949. else:
  950. wins_count = 0
  951.  
  952. df.loc[(df['FM_MeetingId'] == meeting_id) & (df['FRSW_RaceId'] == race_id) & (df['F_TrainerId'] == trainer_id), 'TrainerStreak'] = wins_count
  953.  
  954. #Change of Trainer
  955. from difflib import SequenceMatcher
  956.  
  957. df['TrainerChange'] = np.nan
  958.  
  959. for index, row in df.iterrows():
  960. if not pd.isna(row['StartNo']) and row['StartNo'] not in (1, 0):
  961. horse_id = row['FHSW_HorseId']
  962. start_no = row['StartNo']
  963.  
  964. previous_starts = df[(df['FHSW_HorseId'] == horse_id) & (df['StartNo'] < start_no)]
  965.  
  966. if previous_starts.empty:
  967. df.at[index, 'TrainerChange'] = np.nan
  968. else:
  969. previous_start = previous_starts.iloc[-1]
  970.  
  971. current_trainer = str(row['F_Trainer'])
  972. previous_trainer = str(previous_start['F_Trainer'])
  973.  
  974. similarity_ratio = SequenceMatcher(None, current_trainer, previous_trainer).ratio()
  975.  
  976. if similarity_ratio < 0.75:
  977. df.at[index, 'TrainerChange'] = 1
  978. else:
  979. df.at[index, 'TrainerChange'] = 0
  980.  
  981. #NonTABAvgTime
  982. tracks = ['adaminaby', 'adelaide river', 'alexandra', 'alpha', 'aramac', 'ardlethan', 'armidale', 'atherton', 'augathella', 'avoca', 'balnarring', 'balranald', 'barcaldine', 'barraba', 'bedgerebong', 'bedourie', 'bell', 'berrigan', 'bingara', 'binnaway', 'birdsville', 'blackall', 'bluff', 'bombala', 'bong bong', 'boorowa', 'boulia', 'bourke', 'bowen', 'bowraville', 'braidwood', 'brewarrina', 'broken hill', 'broome', 'buchan', 'bundaberg', 'bundarra', 'burrandowan', 'burrumbeet', 'calliope', 'camooweal', 'camperdown', 'carinda', 'carnarvon', 'carrathool', 'ceduna', 'charleville', 'charters towers', 'chinchilla', 'clare', 'clermont', 'clifton', 'cloncurry', 'cobar', 'colac', 'collarenebri', 'collie', 'come by chance', 'condobolin', 'cooktown', 'cooma', 'coonabarabran', 'coonamble', 'cootamundra', 'corfield', 'corowa', 'cowra', 'crookwell', 'cunnamulla', 'deagon', 'dederang', 'deepwater', 'deniliquin', 'derby', 'dingo', 'dongara', 'drouin', 'dunkeld', 'edenhope', 'eidsvold', 'einasleigh', 'emerald', 'enngonia', 'esk', 'esperance', 'ewan', 'flinton', 'forbes', 'gayndah', 'geurie', 'gilgandra', 'gladstone', 'glen innes', 'goondiwindi', 'gordonvale', 'great western', 'gregory downs', 'grenfell', 'griffith', 'gulargambone', 'gulgong', 'gunbower', 'gundagai', 'gunnedah', 'gympie', 'hamilton', 'hanging rock', 'harden', 'hawker', 'hay', 'healesville', 'hillston', 'hinnomunjie', 'holbrook', 'home hill', 'hughenden', 'ilfracombe', 'ingham', 'injune', 'innisfail', 'isisford', 'jamestown', 'jandowae', 'jerilderie', 'julia creek', 'junction', 'jundah', 'katherine', 'kempsey', 'kerang', 'king island', 'kingscote', 'kojonup', 'kumbia', 'kununurra', 'landor', 'laura', 'laverton', 'leeton', 'leinster', 'leonora', 'lightning ridge', 'lock', 'lockhart', 'longford', 'longreach', 'louth', 'mallawa', 'manangatang', 'mansfield', 'marble bar', 'mareeba', 'maxwelton', 'mckinlay', 'meekatharra', 'mendooran', 'merriwa', 'merton', 'middlemount', 'miles', 'halidon', 'mingenew', 'mitchell', 'monto', 'moora', 'moranbah', 'moree', 'mortlake', 'moruya', 'morven', 'moulamein', 'mt barker', 'mt gambier', 'mt garnet', 'mt isa', 'mt magnet', 'mt perry', 'mudgee', 'mungery', 'mungindi', 'murtoa', 'muttaburra', 'nanango', 'naracoorte', 'narrabri', 'narrandera', 'narrogin', 'narromine', 'newman', 'nhill', 'noorama', 'norseman', 'northam', 'nowra', 'nyngan', 'oak park', 'oakley', 'orange', 'parkes', 'penola', 'penong', 'penshurst', 'pentland', 'pingrup', 'pooncarie', 'port augusta', 'port hedland', 'port lincoln', 'prairie', 'quambone', 'quilpie', 'quirindi', 'quorn', 'richmond', 'roebourne', 'roma', 'roxby downs', 'seymour', 'springsure', 'st arnaud', 'st george', 'stamford', 'stanthorpe', 'stawell', 'streaky bay', 'surat', 'swifts creek', 'tabulam', 'talmoi', 'talwood', 'tambo', 'tara', 'taroom', 'tennant creek', 'texas', 'thangool', 'tocumwal', 'tomingley', 'toodyay', 'tottenham', 'tower hill', 'towong', 'trangie', 'tullibigeal', 'tumbarumba', 'tumut', 'tuncurry', 'twin hills', 'walcha', 'walgett', 'wallabadah', 'wandoan', 'warialda', 'warra', 'warren', 'wauchope', 'wean', 'wellington', 'wentworth', 'winton aus', 'wondai', 'woolamai', 'wycheproof', 'wyndham', 'yalgoo', 'yass', 'yea', 'yeppoon', 'york', 'young']
  983. df = df[(df['M_IsBarrierTrial'] == False) &
  984. df['FHSW_HorseId'].notna() &
  985. (df['R_RaceTime'].notna() & df['R_RaceTime'] != 0)]
  986.  
  987. merged_distances = []
  988.  
  989. for track in tracks:
  990. track_data = df[df['M_TName'] == track]
  991.  
  992. filtered_track_data = track_data[~track_data['R_GoingDesc'].isin(['sand', 'synthetic'])]
  993.  
  994. race_count_by_distance_going = track_data.groupby(['R_Distance', 'R_GoingDesc'])['FRSW_RaceId'].nunique().reset_index()
  995.  
  996. race_ids_by_distance_going = track_data.groupby(['R_Distance', 'R_GoingDesc'])['FRSW_RaceId'].unique().reset_index()
  997.  
  998. for index, row in race_count_by_distance_going.iterrows():
  999. distance = row['R_Distance']
  1000. going_desc = row['R_GoingDesc']
  1001. race_count = row['FRSW_RaceId']
  1002. unique_race_ids = list(race_ids_by_distance_going[(race_ids_by_distance_going['R_Distance'] == distance) & (race_ids_by_distance_going['R_GoingDesc'] == going_desc)]['FRSW_RaceId'].values[0])
  1003.  
  1004. if going_desc in ['soft', 'heavy']:
  1005. combined_going_desc = 'soft'
  1006. elif going_desc in ['good', 'firm', 'dead', 'unknown']:
  1007. combined_going_desc = 'good'
  1008. else:
  1009. combined_going_desc = 'other'
  1010.  
  1011. unique_race_times = df[df['FRSW_RaceId'].isin(unique_race_ids)]['R_RaceTime'].drop_duplicates()
  1012. existing_entry = next((entry for entry in merged_distances if entry['Track'] == track and entry['R_Distance'] == distance and entry['Combined GoingDesc'] == combined_going_desc), None)
  1013. if existing_entry:
  1014. existing_entry['Race Count'] += race_count
  1015. existing_entry['Unique RaceIds'] += unique_race_ids
  1016. existing_entry['R_RaceTime'] += unique_race_times.tolist()
  1017. else:
  1018. merged_distances.append({
  1019. 'Track': track,
  1020. 'R_Distance': distance,
  1021. 'Combined GoingDesc': combined_going_desc,
  1022. 'Race Count': race_count,
  1023. 'Unique RaceIds': unique_race_ids,
  1024. 'R_RaceTime': unique_race_times.tolist()
  1025. })
  1026.  
  1027. merged_entries = {}
  1028.  
  1029. for entry in merged_distances:
  1030. track = entry['Track']
  1031. distance = entry['R_Distance']
  1032. combined_going_desc = entry['Combined GoingDesc']
  1033. race_count = entry['Race Count']
  1034.  
  1035. if combined_going_desc in ['good', 'soft'] and race_count < 20:
  1036. key = (track, distance)
  1037.  
  1038. if key in merged_entries:
  1039. merged_entries[key]['Race Count'] += race_count
  1040. merged_entries[key]['Unique RaceIds'].extend(entry['Unique RaceIds'])
  1041. merged_entries[key]['R_RaceTime'].extend(entry['R_RaceTime'])
  1042. else:
  1043. merged_entries[key] = entry
  1044. else:
  1045. merged_entries[(track, distance)] = entry
  1046.  
  1047. merged_distances_list = list(merged_entries.values())
  1048.  
  1049. for entry in merged_distances_list:
  1050. if entry['R_RaceTime']:
  1051. avg_race_time = round(sum(entry['R_RaceTime']) / len(entry['R_RaceTime']), 2)
  1052. entry['Average Race Time'] = avg_race_time
  1053.  
  1054. race_id_to_avg_time = {}
  1055. for entry in merged_distances_list:
  1056. for race_id in entry['Unique RaceIds']:
  1057. race_id_to_avg_time[race_id] = entry['Average Race Time']
  1058.  
  1059. sandsynth = []
  1060.  
  1061. for track in tracks:
  1062. track_data = df[df['M_TName'] == track]
  1063.  
  1064. filtered_track_data = track_data[track_data['R_GoingDesc'].isin(['sand', 'synthetic'])]
  1065.  
  1066. race_count_by_going_distance = filtered_track_data.groupby(['R_Distance', 'R_GoingDesc'])['FRSW_RaceId'].nunique().reset_index()
  1067.  
  1068. race_ids_by_distance_going = filtered_track_data.groupby(['R_Distance', 'R_GoingDesc'])['FRSW_RaceId'].unique().reset_index()
  1069.  
  1070. for index, row in race_count_by_going_distance.iterrows():
  1071. race_count = row['FRSW_RaceId']
  1072. distance = row['R_Distance']
  1073. unique_race_ids = race_ids_by_distance_going[race_ids_by_distance_going['R_Distance'] == distance]['FRSW_RaceId'].values[0]
  1074. unique_race_times = df[df['FRSW_RaceId'].isin(unique_race_ids)]['R_RaceTime'].drop_duplicates()
  1075. avg_race_time = round(sum(unique_race_times) / len(unique_race_times), 2)
  1076. sandsynth.append({
  1077. 'Track': track,
  1078. 'R_Distance': distance,
  1079. 'Race Count': race_count,
  1080. 'Unique RaceIds': unique_race_ids,
  1081. 'R_RaceTime': unique_race_times.tolist(),
  1082. 'Average Race Time': avg_race_time
  1083. })
  1084.  
  1085. sandsynth_avg_time_dict = {}
  1086.  
  1087. for entry in sandsynth:
  1088. unique_race_ids = entry['Unique RaceIds']
  1089. avg_race_time = entry['Average Race Time']
  1090. for race_id in unique_race_ids:
  1091. sandsynth_avg_time_dict[race_id] = avg_race_time
  1092.  
  1093. merged_avg_time_dict = {**race_id_to_avg_time, **sandsynth_avg_time_dict}
  1094.  
  1095. df['NonTABAvgTime'] = df['FRSW_RaceId'].map(merged_avg_time_dict)
  1096.  
  1097. #NonTABTime
  1098. df['NonTABTime'] = np.nan
  1099.  
  1100. def update_non_tabh_time(row):
  1101. if pd.notnull(row['NonTABAvgTime']):
  1102. if row['F_Position'] == 1:
  1103. if pd.notnull(row['R_VerifiedRaceTime']) and row['R_VerifiedRaceTime'] != 0:
  1104. return row['R_VerifiedRaceTime']
  1105. elif pd.notnull(row['R_RaceTime']) and row['R_RaceTime'] != 0:
  1106. return row['R_RaceTime']
  1107.  
  1108. elif pd.notnull(row['F_Margin']) and row['F_Margin'] != 0:
  1109. if pd.notnull(row['R_VerifiedRaceTime']) and row['R_VerifiedRaceTime'] != 0:
  1110. return row['R_VerifiedRaceTime'] + row['F_Margin'] * 0.16
  1111. elif pd.notnull(row['R_RaceTime']) and row['R_RaceTime'] != 0:
  1112. return row['R_RaceTime'] + row['F_Margin'] * 0.16
  1113.  
  1114. return None
  1115.  
  1116. df['NonTABTime'] = df.apply(update_non_tabh_time, axis=1)
  1117.  
  1118. df['NonTABTime'] = df['NonTABTime'].round(2)
  1119.  
  1120. #NonTABFinishAll
  1121.  
  1122. df['NonTABFinishAll'] = np.nan
  1123.  
  1124. df['NonTABFinishAll'] = round(df['NonTABTime'] - df['NonTABAvgTime'], 2)
  1125.  
  1126. #AvgLast3
  1127.  
  1128. df['AvgLast3'] = np.nan
  1129.  
  1130. for index, row in df.iterrows():
  1131. if pd.notnull(row['StartNo']) and row['StartNo'] not in [0, 1]:
  1132. horse_id = row['FHSW_HorseId']
  1133. start_no = row['StartNo']
  1134.  
  1135. previous_rows = df[(df['FHSW_HorseId'] == horse_id) & (df['StartNo'] < start_no)].tail(3)
  1136.  
  1137. if len(previous_rows) >= 3:
  1138. if row['RunsPrep'] != 1:
  1139. avg_value = (
  1140. previous_rows['SH_FinishAll'].apply(lambda x: x if pd.notnull(x) and x > -999 else row['NonTABFinishAll']).iloc[0] * 0.6 +
  1141. previous_rows['SH_FinishAll'].apply(lambda x: x if pd.notnull(x) and x > -999 else row['NonTABFinishAll']).iloc[1] * 0.3 +
  1142. previous_rows['SH_FinishAll'].apply(lambda x: x if pd.notnull(x) and x > -999 else row['NonTABFinishAll']).iloc[2] * 0.1
  1143. )
  1144. else:
  1145. avg_value = (
  1146. previous_rows['SH_FinishAll'].apply(lambda x: x if pd.notnull(x) and x > -999 else row['NonTABFinishAll']).iloc[0] * 0.75 +
  1147. previous_rows['SH_FinishAll'].apply(lambda x: x if pd.notnull(x) and x > -999 else row['NonTABFinishAll']).iloc[1] * 0.15 +
  1148. previous_rows['SH_FinishAll'].apply(lambda x: x if pd.notnull(x) and x > -999 else row['NonTABFinishAll']).iloc[2] * 0.1
  1149. )
  1150.  
  1151. df.at[index, 'AvgLast3'] = avg_value
  1152.  
  1153.  
  1154. #TrackBenchmark
  1155. df['TrackBenchmarks'] = np.nan
  1156.  
  1157. filtered_df = df[(df['FHSW_HorseId'].notna()) &
  1158. (df['M_IsBarrierTrial'] == False) &
  1159. (df['F_Position'] == 1) &
  1160. (df['SH_FinishAll'].between(-100, 100))]
  1161.  
  1162. track_avg_dict = filtered_df.groupby(['M_TName', 'R_Distance'])['SH_FinishAll'].mean().to_dict()
  1163.  
  1164. df['TrackBenchmarks'] = df.apply(
  1165. lambda row: track_avg_dict.get((row['M_TName'], row['R_Distance']), np.nan)
  1166. if not row['M_IsBarrierTrial'] else np.nan, axis=1
  1167. )
  1168.  
  1169. df.to_csv("Data.csv", index=False)
Advertisement
Add Comment
Please, Sign In to add comment