Advertisement
Brovashift

Untitled

May 23rd, 2023
70
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 18.66 KB | None | 0 0
  1. #!/usr/bin/env python3
  2. import os
  3. import requests
  4. import sys
  5. import csv
  6.  
  7. from collections import defaultdict
  8. from datetime import datetime, timedelta
  9. from lxml import etree, html
  10. from orjson import loads, dumps
  11. from re import search
  12.  
  13. from utils.going import get_surface
  14. from utils.header import RandomHeader
  15. from utils.lxml_funcs import find
  16. from utils.region import get_region
  17.  
  18.  
# Shared RandomHeader instance; every HTTP request in this module passes
# random_header.header() as its request headers.
random_header = RandomHeader()
  20.  
  21.  
  22. def clean_name(name):
  23.     if name:
  24.         return name.strip().replace("'", '').lower().title()
  25.     else:
  26.         return ''
  27.  
  28.  
  29. def distance_to_furlongs(distance):
  30.     dist = distance.strip().replace('¼', '.25').replace('½', '.5').replace('¾', '.75')
  31.  
  32.     if 'm' in dist:
  33.         if len(dist) > 2:
  34.             dist = int(dist.split('m')[0]) * 8 + float(dist.split('m')[1].strip('f'))
  35.         else:
  36.             dist = int(dist.split('m')[0]) * 8
  37.     else:
  38.         dist = dist.strip('f')
  39.  
  40.     return float(dist)
  41.  
  42.  
  43. def get_going_info(session, date):
  44.     r = session.get(f'https://www.racingpost.com/non-runners/{date}', headers=random_header.header())
  45.     doc = html.fromstring(r.content.decode())
  46.  
  47.     json_str = doc.xpath('//body/script')[0].text.replace('var __PRELOADED_STATE__ = ', '').strip().strip(';')
  48.  
  49.     going_info = defaultdict(dict)
  50.  
  51.     for course in loads(json_str):
  52.         going, rail_movements = parse_going(course['going'])
  53.  
  54.         course_id = 0
  55.         course_name = ''
  56.  
  57.         if course['courseName'] == 'Belmont At The Big A':
  58.             course_id = 255
  59.             course_name = 'Aqueduct'
  60.         else:
  61.             course_id = int(course['raceCardsCourseMeetingsUrl'].split('/')[2])
  62.             course_name = course['courseName']
  63.  
  64.         going_info[course_id]['course'] = course_name
  65.         going_info[course_id]['going'] = going
  66.         going_info[course_id]['stalls'] = course['stallsPosition']
  67.         going_info[course_id]['rail_movements'] = rail_movements
  68.         going_info[course_id]['weather'] = course['weather']
  69.  
  70.     return going_info
  71.  
  72.  
  73. def get_pattern(race_name):
  74.     regex_group = '(\(|\s)((G|g)rade|(G|g)roup) (\d|[A-Ca-c]|I*)(\)|\s)'
  75.     match = search(regex_group, race_name)
  76.  
  77.     if match:
  78.         pattern = f'{match.groups()[1]} {match.groups()[4]}'.title()
  79.         return pattern.title()
  80.  
  81.     if any(x in race_name.lower() for x in {'listed race', '(listed'}):
  82.         return 'Listed'
  83.  
  84.     return ''
  85.  
  86.  
  87. def get_race_type(doc, race, distance):
  88.         race_type = ''
  89.         fences = find(doc, 'div', 'RC-headerBox__stalls')
  90.  
  91.         if 'hurdle' in fences.lower():
  92.             race_type = 'Hurdle'
  93.         elif 'fence' in fences.lower():
  94.             race_type = 'Chase'
  95.         else:
  96.             if distance >= 12:
  97.                 if any(x in race for x in {'national hunt flat', 'nh flat race', 'mares flat race'}):
  98.                     race_type = 'NH Flat'
  99.                 if any(x in race for x in {'inh bumper', ' sales bumper', 'kepak flat race', 'i.n.h. flat race'}):
  100.                     race_type = 'NH Flat'
  101.                 if any(x in race for x in {' hurdle', '(hurdle)'}):
  102.                     race_type = 'Hurdle'
  103.                 if any(x in race for x in {' chase', '(chase)', 'steeplechase', 'steeple-chase', 'steeplchase', 'steepl-chase'}):
  104.                     race_type = 'Chase'
  105.  
  106.         if race_type == '':
  107.             race_type = 'Flat'
  108.  
  109.         return race_type
  110.  
  111.  
  112. def get_race_urls(session, racecard_url):
  113.     r = session.get(racecard_url, headers=random_header.header())
  114.     doc = html.fromstring(r.content)
  115.  
  116.     race_urls = []
  117.  
  118.     for meeting in doc.xpath("//section[@data-accordion-row]"):
  119.         course = meeting.xpath(".//span[contains(@class, 'RC-accordion__courseName')]")[0]
  120.         if valid_course(course.text_content().strip().lower()):
  121.             for race in meeting.xpath(".//a[@class='RC-meetingItem__link js-navigate-url']"):
  122.                 race_urls.append('https://www.racingpost.com' + race.attrib['href'])
  123.  
  124.     return sorted(list(set(race_urls)))
  125.  
  126.  
  127. def get_runners(session, profile_urls):
  128.     runners = {}
  129.  
  130.     for url in profile_urls:
  131.         r = session.get(url, headers=random_header.header())
  132.         doc = html.fromstring(r.content)
  133.  
  134.         runner = {}
  135.  
  136.         try:
  137.             json_str = doc.xpath('//body/script')[0].text.split('window.PRELOADED_STATE =')[1].split('\n')[0].strip().strip(';')
  138.             js = loads(json_str)
  139.         except IndexError:
  140.             split = url.split('/')
  141.             runner['horse_id'] = int(split[5])
  142.             runner['name'] = split[6].replace('-', ' ').title()
  143.             runner['broken_url'] = url
  144.             runners[runner['horse_id']] = runner
  145.             continue
  146.  
  147.         runner['horse_id'] = js['profile']['horseUid']
  148.         runner['name'] = clean_name(js['profile']['horseName'])
  149.         runner['dob'] = js['profile']['horseDateOfBirth'].split('T')[0]
  150.         runner['age'] = int(js['profile']['age'].split('-')[0])
  151.         runner['sex'] = js['profile']['horseSex']
  152.         runner['sex_code'] = js['profile']['horseSexCode']
  153.         runner['colour'] = js['profile']['horseColour']
  154.         runner['region'] = js['profile']['horseCountryOriginCode']
  155.  
  156.         runner['breeder'] = js['profile']['breederName']
  157.         runner['dam'] = clean_name(js['profile']['damHorseName'])
  158.         runner['dam_region'] = js['profile']['damCountryOriginCode']
  159.         runner['sire'] = clean_name(js['profile']['sireHorseName'])
  160.         runner['sire_region'] = js['profile']['sireCountryOriginCode']
  161.         runner['grandsire'] = clean_name(js['profile']['siresSireName'])
  162.         runner['damsire'] = clean_name(js['profile']['damSireHorseName'])
  163.         runner['damsire_region'] = js['profile']['damSireCountryOriginCode']
  164.  
  165.         runner['trainer'] = clean_name(js['profile']['trainerName'])
  166.         runner['trainer_id'] = js['profile']['trainerUid']
  167.         runner['trainer_location'] = js['profile']['trainerLocation']
  168.         runner['trainer_14_days'] = js['profile']['trainerLast14Days']
  169.  
  170.         runner['owner'] = clean_name(js['profile']['ownerName'])
  171.  
  172.         runner['prev_trainers'] = js['profile']['previousTrainers']
  173.  
  174.         if runner['prev_trainers']:
  175.             prev_trainers = []
  176.  
  177.             for trainer in runner['prev_trainers']:
  178.                 prev_trainer = {}
  179.                 prev_trainer['trainer'] = trainer['trainerStyleName']
  180.                 prev_trainer['trainer_id'] = trainer['trainerUid']
  181.                 prev_trainer['change_date'] = trainer['trainerChangeDate'].split('T')[0]
  182.                 prev_trainers.append(prev_trainer)
  183.  
  184.             runner['prev_trainers'] = prev_trainers
  185.  
  186.         runner['prev_owners'] = js['profile']['previousOwners']
  187.  
  188.         if runner['prev_owners']:
  189.             prev_owners = []
  190.  
  191.             for owner in runner['prev_owners']:
  192.                 prev_owner = {}
  193.                 prev_owner['owner'] = owner['ownerStyleName']
  194.                 prev_owner['owner_id'] = owner['ownerUid']
  195.                 prev_owner['change_date'] = owner['ownerChangeDate'].split('T')[0]
  196.                 prev_owners.append(prev_owner)
  197.  
  198.             runner['prev_owners'] = prev_owners
  199.  
  200.         if js['profile']['comments']:
  201.             runner['comment'] = js['profile']['comments'][0]['individualComment']
  202.             runner['spotlight'] = js['profile']['comments'][0]['individualSpotlight']
  203.         else:
  204.             runner['comment'] = None
  205.             runner['spotlight'] = None
  206.  
  207.         if js['profile']['medical']:
  208.             medicals = []
  209.  
  210.             for med in js['profile']['medical']:
  211.                 medical = {}
  212.                 medical['date'] = med['medicalDate'].split('T')[0]
  213.                 medical['type'] = med['medicalType']
  214.                 medicals.append(medical)
  215.  
  216.             runner['medical'] = medicals
  217.  
  218.         runner['quotes'] = None
  219.  
  220.         if js['quotes']:
  221.             quotes = []
  222.  
  223.             for q in js['quotes']:
  224.                 quote = {}
  225.                 quote['date'] = q['raceDate'].split('T')[0]
  226.                 quote['horse'] = q['horseStyleName']
  227.                 quote['horse_id'] = q['horseUid']
  228.                 quote['race'] = q['raceTitle']
  229.                 quote['race_id'] = q['raceId']
  230.                 quote['course'] = q['courseStyleName']
  231.                 quote['course_id'] = q['courseUid']
  232.                 quote['distance_f'] = q['distanceFurlong']
  233.                 quote['distance_y'] = q['distanceYard']
  234.                 quote['quote'] = q['notes']
  235.                 quotes.append(quote)
  236.  
  237.             runner['quotes'] = quotes
  238.  
  239.         runner['stable_tour'] = None
  240.  
  241.         if js['stableTourQuotes']:
  242.             quotes = []
  243.  
  244.             for q in js['stableTourQuotes']:
  245.                 quote = {}
  246.                 quote['horse'] = q['horseName']
  247.                 quote['horse_id'] = q['horseUid']
  248.                 quote['quote'] = q['notes']
  249.                 quotes.append(quote)
  250.  
  251.             runner['stable_tour'] = quotes
  252.  
  253.         runners[runner['horse_id']] = runner
  254.  
  255.     return runners
  256.  
  257.  
  258. def parse_going(going_info):
  259.     going = going_info
  260.     rail_movements = ''
  261.  
  262.     if 'Rail movements' in going_info:
  263.         going_info = going_info.replace('movements:', 'movements')
  264.         rail_movements = [x.strip() for x in going_info.split('Rail movements')[1].strip().strip(')').split(',')]
  265.         going = going_info.split('(Rail movements')[0].strip()
  266.  
  267.     return going, rail_movements
  268.  
  269.  
  270. def parse_races(session, race_urls, date):
  271.     races = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
  272.  
  273.     going_info = get_going_info(session, date)
  274.  
  275.     for url in race_urls:
  276.         r = session.get(url, headers=random_header.header(), allow_redirects=False)
  277.  
  278.         if r.status_code != 200:
  279.             print('Failed to get racecard.')
  280.             print(f'URL: {url}')
  281.             print(f'Response: {r.status_code}')
  282.             continue
  283.  
  284.         try:
  285.             doc = html.fromstring(r.content)
  286.         except etree.ParserError:
  287.             continue
  288.  
  289.         race = {}
  290.  
  291.         url_split = url.split('/')
  292.  
  293.         race['course'] = find(doc, 'h1', 'RC-courseHeader__name')
  294.  
  295.         if race['course'] == 'Belmont At The Big A':
  296.             race['course_id'] = 255
  297.             race['course'] = 'Aqueduct'
  298.         else:
  299.             race['course_id'] = int(url_split[4])
  300.  
  301.         race['race_id'] = int(url_split[7])
  302.         race['date'] = url_split[6]
  303.         race['off_time'] = find(doc, 'span', 'RC-courseHeader__time')
  304.         race['race_name'] = find(doc, 'span', 'RC-header__raceInstanceTitle')
  305.         race['distance_round'] = find(doc, 'strong', 'RC-header__raceDistanceRound')
  306.         race['distance'] = find(doc, 'span', 'RC-header__raceDistance')
  307.         race['distance'] = race['distance_round'] if not race['distance'] else race['distance'].strip('()')
  308.         race['distance_f'] = distance_to_furlongs(race['distance_round'])
  309.         race['region'] = get_region(str(race['course_id']))
  310.         race['pattern'] = get_pattern(race['race_name'].lower())
  311.         race['race_class'] = find(doc, 'span', 'RC-header__raceClass')
  312.         race['race_class'] = race['race_class'].strip('()') if race['race_class'] else ''
  313.         race['type'] = get_race_type(doc, race['race_name'].lower(), race['distance_f'])
  314.  
  315.         if not race['race_class']:
  316.             if race['pattern']:
  317.                 race['race_class'] = 'Class 1'
  318.  
  319.         try:
  320.             band = find(doc, 'span', 'RC-header__rpAges').strip('()').split()
  321.             if band:
  322.                 race['age_band'] = band[0]
  323.                 race['rating_band'] = band[1] if len(band) > 1 else None
  324.             else:
  325.                 race['age_band'] = None
  326.                 race['rating_band'] = None
  327.         except AttributeError:
  328.             race['age_band'] = None
  329.             race['rating_band'] = None
  330.  
  331.         prize = find(doc, 'div', 'RC-headerBox__winner').lower()
  332.         race['prize'] = prize.split('winner:')[1].strip() if 'winner:' in prize else None
  333.         field_size = find(doc, 'div', 'RC-headerBox__runners').lower()
  334.         if field_size:
  335.             race['field_size'] = int(field_size.split('runners:')[1].split('(')[0].strip())
  336.         else:
  337.             race['field_size'] = ''
  338.  
  339.         try:
  340.             race['going_detailed'] = going_info[race['course_id']]['going']
  341.             race['rail_movements'] = going_info[race['course_id']]['rail_movements']
  342.             race['stalls'] = going_info[race['course_id']]['stalls']
  343.             race['weather'] = going_info[race['course_id']]['weather']
  344.         except KeyError:
  345.             race['going'] = None
  346.             race['rail_movements'] = None
  347.             race['stalls'] = None
  348.             race['weather'] = None
  349.  
  350.         going = find(doc, 'div', 'RC-headerBox__going').lower()
  351.         race['going'] = going.split('going:')[1].strip().title() if 'going:' in going else ''
  352.  
  353.         race['surface'] = get_surface(race['going'])
  354.  
  355.         profile_hrefs = doc.xpath("//a[@data-test-selector='RC-cardPage-runnerName']/@href")
  356.         profile_urls = ['https://www.racingpost.com' + a.split('#')[0] + '/form' for a in profile_hrefs]
  357.  
  358.         runners = get_runners(session, profile_urls)
  359.  
  360.         for horse in doc.xpath("//div[contains(@class, ' js-PC-runnerRow')]"):
  361.             horse_id = int(find(horse, 'a', 'RC-cardPage-runnerName', attrib='href').split('/')[3])
  362.  
  363.             if 'broken_url' in runners[horse_id]:
  364.                 sire = find(horse, 'a', 'RC-pedigree__sire').split('(')
  365.                 dam = find(horse, 'a', 'RC-pedigree__dam').split('(')
  366.                 damsire = find(horse, 'a', 'RC-pedigree__damsire').lstrip('(').rstrip(')').split('(')
  367.  
  368.                 runners[horse_id]['sire'] = clean_name(sire[0])
  369.                 runners[horse_id]['dam'] = clean_name(dam[0])
  370.                 runners[horse_id]['damsire'] = clean_name(damsire[0])
  371.  
  372.                 runners[horse_id]['sire_region'] = sire[1].replace(')', '').strip()
  373.                 runners[horse_id]['dam_region'] = dam[1].replace(')', '').strip()
  374.                 runners[horse_id]['damsire_region'] = damsire[1].replace(')', '').strip()
  375.  
  376.                 runners[horse_id]['age'] = find(horse, 'span', 'RC-cardPage-runnerAge', attrib='data-order-age')
  377.  
  378.                 sex = find(horse, 'span', 'RC-pedigree__color-sex').split()
  379.  
  380.                 runners[horse_id]['colour'] = sex[0]
  381.                 runners[horse_id]['sex_code'] = sex[1].capitalize()
  382.  
  383.                 runners[horse_id]['trainer'] = find(horse, 'a', 'RC-cardPage-runnerTrainer-name', attrib='data-order-trainer')
  384.  
  385.             runners[horse_id]['number'] = int(find(horse, 'span', 'RC-cardPage-runnerNumber-no', attrib='data-order-no'))
  386.  
  387.             try:
  388.                 runners[horse_id]['draw'] = int(find(horse, 'span', 'RC-cardPage-runnerNumber-draw', attrib='data-order-draw'))
  389.             except ValueError:
  390.                 runners[horse_id]['draw'] = None
  391.  
  392.             runners[horse_id]['headgear'] = find(horse, 'span', 'RC-cardPage-runnerHeadGear')
  393.             runners[horse_id]['headgear_first'] = find(horse, 'span', 'RC-cardPage-runnerHeadGear-first')
  394.  
  395.             try:
  396.                 runners[horse_id]['lbs'] = int(find(horse, 'span', 'RC-cardPage-runnerWgt-carried', attrib='data-order-wgt'))
  397.             except ValueError:
  398.                 runners[horse_id]['lbs'] = None
  399.  
  400.             try:
  401.                 runners[horse_id]['ofr'] = int(find(horse, 'span', 'RC-cardPage-runnerOr', attrib='data-order-or'))
  402.             except ValueError:
  403.                 runners[horse_id]['ofr'] = None
  404.  
  405.             try:
  406.                 runners[horse_id]['rpr'] = int(find(horse, 'span', 'RC-cardPage-runnerRpr', attrib='data-order-rpr'))
  407.             except ValueError:
  408.                 runners[horse_id]['rpr'] = None
  409.  
  410.             try:
  411.                 runners[horse_id]['ts'] = int(find(horse, 'span', 'RC-cardPage-runnerTs', attrib='data-order-ts'))
  412.             except ValueError:
  413.                 runners[horse_id]['ts'] = None
  414.  
  415.             claim = find(horse, 'span', 'RC-cardPage-runnerJockey-allowance')
  416.             jockey = horse.find('.//a[@data-test-selector="RC-cardPage-runnerJockey-name"]')
  417.  
  418.             if jockey is not None:
  419.                 jock = jockey.attrib['data-order-jockey']
  420.                 runners[horse_id]['jockey'] = jock if not claim else jock + f'({claim})'
  421.                 runners[horse_id]['jockey_id'] = int(jockey.attrib['href'].split('/')[3])
  422.             else:
  423.                 runners[horse_id]['jockey'] = None
  424.                 runners[horse_id]['jockey_id'] = None
  425.  
  426.             try:
  427.                 runners[horse_id]['last_run'] = find(horse, 'div', 'RC-cardPage-runnerStats-lastRun')
  428.             except TypeError:
  429.                 runners[horse_id]['last_run'] = None
  430.  
  431.             runners[horse_id]['form'] = find(horse, 'span', 'RC-cardPage-runnerForm')
  432.  
  433.             try:
  434.                 runners[horse_id]['trainer_rtf'] = find(horse, 'span', 'RC-cardPage-runnerTrainer-rtf')
  435.             except TypeError:
  436.                 runners[horse_id]['trainer_rtf'] = None
  437.  
  438.         race['runners'] = [runner for runner in runners.values()]
  439.         races[race['region']][race['course']][race['off_time']] = race
  440.  
  441.     return races
  442.  
  443.  
  444. def valid_course(course):
  445.     invalid = ['free to air', 'worldwide stakes', '(arab)']
  446.     return all([x not in course for x in invalid])
  447.  
  448. def save_runners_to_csv(runners, filename):
  449.     fieldnames = ['horse_id', 'name', 'dob', 'age', 'sex', 'sex_code', 'colour', 'region',
  450.                   'breeder', 'dam', 'dam_region', 'sire', 'sire_region', 'grandsire',
  451.                   'damsire', 'damsire_region', 'trainer', 'trainer_id', 'trainer_location',
  452.                   'trainer_14_days', 'owner', 'prev_trainers', 'prev_owners', 'comment',
  453.                   'spotlight', 'medical', 'quotes', 'stable_tour']
  454.  
  455.     with open(filename, 'w', newline='') as csvfile:
  456.         writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
  457.         writer.writeheader()
  458.  
  459.         for runner in runners.values():
  460.             writer.writerow(runner)
  461.  
  462.  
  463. def main():
  464.     if len(sys.argv) != 2 or sys.argv[1].lower() not in {'today', 'tomorrow'}:
  465.         return print('Usage: ./racecards.py [today|tomorrow]')
  466.  
  467.     racecard_url = 'https://www.racingpost.com/racecards'
  468.  
  469.     date = datetime.today().strftime('%Y-%m-%d')
  470.  
  471.     if sys.argv[1].lower() == 'tomorrow':
  472.         racecard_url += '/tomorrow'
  473.         date = (datetime.today() + timedelta(days=1)).strftime('%Y-%m-%d')
  474.  
  475.     session = requests.Session()
  476.  
  477.     race_urls = get_race_urls(session, racecard_url)
  478.     races = parse_races(session, race_urls, date)
  479.  
  480.     if not os.path.exists('../racecards'):
  481.         os.makedirs(f'../racecards')
  482.  
  483.     # Save CSV data
  484.     save_runners_to_csv(races, f'../racecards/{date}.csv')
  485.  
  486.  
  487. if __name__ == '__main__':
  488.     main()
  489.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement