Guest User

tennis scraper

a guest
Sep 19th, 2025
76
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 10.73 KB | Source Code | 0 0
  1. import csv
  2. import os
  3. import time
  4. import tempfile
  5. from datetime import datetime
  6. from selenium import webdriver
  7. from concurrent import futures
  8. from selenium.webdriver.common.by import By
  9. from concurrent.futures import ThreadPoolExecutor
  10. from selenium.webdriver.chrome.options import Options
  11. from webdriver_manager.chrome import ChromeDriverManager
  12. from selenium.webdriver.chrome.service import Service
  13. # from webdriver_manager.firefox import GeckoDriverManager
  14.  
  15.  
  16. groups = []
  17. links_path = []
  18. names_links = []
  19. db_path = "./FlashScore_database/"
  20. db_path2 = "./"
  21. driver_path = "./chromedriver-linux64/chromedriver"
  22. chromedriver_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), driver_path)
  23. # from selenium.webdriver.firefox.service import Service
  24. # from webdriver_manager.firefox import GeckoDriverManager
  25.  
  26. def get_driver():
  27.     options = Options()
  28.     options.add_argument("--headless")   # optional
  29.     options.add_argument("--disable-dev-shm-usage")
  30.     options.add_argument("--no-sandbox")
  31.     options.add_argument("--disable-gpu")
  32.     options.add_argument("--window-size=1920, 1080")
  33.     service = Service(chromedriver_path)
  34.     #service = Service(ChromeDriverManager().install())
  35.     return webdriver.Chrome(service=service, options=options)
  36.  
  37.  
  38. def scrape(url, path, path2, player_count):
  39.     print(f"\n\nCurrently on {player_count}\n\n")
  40.     print(f"\n\nURL: {url}")
  41.  
  42.     # CREATING THE HEADERS FOR THE FILE
  43.     os.chdir(db_path2)
  44.     total_data = []
  45.     if not os.path.exists(path) or not path.endswith(".csv"):
  46.         raise FileNotFoundError(f"Invalid csv path: {path}")
  47.     if not os.path.exists(path):
  48.         os.makedirs(os.path.dirname(path), exist_ok=True)
  49.         with open(path, "w", encoding="utf-8") as file:
  50.             csvwriter = csv.writer(file)
  51.             csvwriter.writerow(['Date&Time','Name','Final Scores', 'Set1', 'Set2', 'Set3', 'Set4', 'Set5','Aces','Double Faults','1st Serve Percentage','1st Serve Points Won','2nd Serve Points Won','Break Points Saved','1st Return Points Won','2nd Return Points Won','Break Points Converted','Winners','Unforced Errors','Net Points Won','Max Points In Row','Service Points Won','Return Points Won','Total Points Won','Max Games In Row','Service Games Won','Return Games Won','Total Games Won','PBP(Set1)','PBP(Set2)','PBP(Set3)','PBP(Set4)','PBP(Set5)','odds'])
  52.  
  53.     with open(path, encoding="utf-8") as file:                                                                                                                      csvreader = csv.reader(file)                                                                                                                                for row in csvreader:
  54.             total_data.append(row)
  55.  
  56.     with open(path2 + "/match_details_backup.csv", 'w', encoding="utf-8") as file:
  57.         csvwriter = csv.writer(file)
  58.         for row in total_data:
  59.             csvwriter.writerow(row)
  60.  
  61.  
  62.     total_data = total_data[2:]
  63.     print(total_data)
  64.     if not os.path.exists(path):
  65.         os.makedirs(os.path.dirname(path), exit_ok=True)
  66.     with open(path, 'w') as file:
  67.         csvwriter = csv.writer(file)
  68.         csvwriter.writerow(['Date&Time','Name','Final Scores', 'Set1', 'Set2', 'Set3', 'Set4', 'Set5','Aces','Double Faults','1st Serve Percentage','1st Serve Points Won','2nd Serve Points Won','Break Points Saved','1st Return Points Won','2nd Return Points Won','Break Points Converted','Winners','Unforced Errors','Net Points Won','Max Points In Row','Service Points Won','Return Points Won','Total Points Won','Max Games In Row','Service Games Won','Return Games Won','Total Games Won','PBP(Set1)','PBP(Set2)','PBP(Set3)','PBP(Set4)','PBP(Set5)','odds'])
  69.  
  70.     # INITIALISING DRIVER AND LOCATING THE WEBSITE
  71.     # driver_location = "/usr/local/bin/geckodriver"  # Ensure this is correct
  72.     # binary_location = "/usr/bin/firefox-esr"  # Ensure this is correct
  73.  
  74.     # service = Service(executable_path=driver_location)
  75.     #optns = Options()
  76.     #optns.add_argument("--disable-extensions")
  77.     #optns.add_argument("--disable-images")
  78.     #optns.binary_location = binary_location
  79.     #optns.add_argument("--no-first-run")
  80.     #optns.add_argument("--no-service-autorun")
  81.     #optns.add_argument("--password-store=basic")
  82.     #temp_profile = tempfile.mkdtemp()
  83.     #optns.add_argument(f"--set-data-dir={temp_profile}")
  84.     # driver = webdriver.Firefox(service=service, options=optns)
  85.     #services = Service(ChromeDriverManager().install())
  86.     # driver = webdriver.Chrome(service=services, options = optns)
  87.     driver = get_driver()
  88.     driver.get(url)
  89.     time.sleep(3)
  90.     #driver.save_screenshot(f"{url}")
  91.     print("started")
  92.  
  93.     # ACCEPTING COOKIES
  94.     try:
  95.         cookies_btn_xpath = '//*[@id="onetrust-accept-btn-handler"]'
  96.         cookies_btn = driver.find_element(By.XPATH, cookies_btn_xpath)
  97.         cookies_btn.click()
  98.     except:
  99.         pass
  100.  
  101.  
  102.     # CLICKING ON SHOW MORE MATCHES FOR X AMOUNT OF TIMES
  103.     try:
  104.         for i in range(8):
  105.             show_more_class = 'wclButtonLink'
  106.             show_more_btn = driver.find_element(By.CLASS_NAME, show_more_class)
  107.             show_more_btn.click()
  108.             time.sleep(3)
  109.     except:
  110.         pass
  111.  
  112.     # GETTING THE WIN STATUS FOR ALL THE MATCHES IN A LIST NAMED RESULT
  113.     result = driver.find_elements(By.TAG_NAME, 'span')
  114.     bad_search = []
  115.  
  116.     for i in result:
  117.         if i.get_attribute('data-testid') != 'wcl-scores-simpleText1':
  118.             bad_search.append(i)
  119.  
  120.     for i in bad_search:
  121.         result.remove(i)
  122.  
  123.     results = []
  124.     for i in result:
  125.         results.append(i.text)
  126.  
  127.  
  128.     # GETTING THE ENTIRE MATCH LIST
  129.     match_list_class = 'sportName'
  130.     match_list = driver.find_element(By.CLASS_NAME, match_list_class)
  131.     elems = match_list.find_elements(By.TAG_NAME, 'div')
  132.     match_links = []
  133.     bad_search_matches = []
  134.     for elem in elems:
  135.         class_elem = elem.get_attribute('class')
  136.         if "wclLeagueHeader" not in class_elem and 'event__match' not in class_elem:
  137.             bad_search_matches.append(elem)
  138.  
  139.     for i in bad_search_matches:
  140.         elems.remove(i)
  141.  
  142.     for index in range(len(elems)):
  143.  
  144.         # CHECKING IF THE MATCH LINK IS A HEADER OR AN ACTUAL LINK TO THE MATCH AND THEN CLASSIFYING THEM ACCORDINGLY
  145.         if "wclLeagueHeader" in elems[index].get_attribute('class'):
  146.             header_text = elems[index].text
  147.             match_links.append([header_text, 'header'])
  148.         elif 'event__match' in elems[index].get_attribute('class'):
  149.             try:
  150.                 # GETTING ALL THE MATCH LINKS AVAILABLE ON THE PAGE WITH THE RESULT OF THAT MATCH
  151.                 link = elems[index].find_element(By.CLASS_NAME, 'eventRowLink').get_attribute('href')
  152.  
  153.                 # GETTING THE WIN STATUS OF THE MATCH
  154.                 win_loss = ""
  155.                 win_loss_temp = elems[index].find_elements(By.CLASS_NAME, 'wcl-scores-simpleText-01_ntYoG')
  156.                 for z in win_loss_temp:
  157.                     if z.text == 'W' or z.text == 'L':
  158.                         win_loss = i.text
  159.                         break
  160.  
  161.                 # GETTING THE EVENT TIME
  162.                 event_time_class = 'event__time'
  163.                 event_time = elems[index].find_element(By.CLASS_NAME, event_time_class).text
  164.  
  165.  
  166.                 # GETTING THE RESULTS FOR EACH SET FOR TOP AND BOTTOM PLAYERS IN A GIVEN MATCH
  167.                 results_top_class = 'event__part--home'
  168.                 print(elems[index].get_attribute("class"))
  169.                 results_top_webElem = elems[index].find_elements(By.CLASS_NAME, results_top_class)
  170.                 results_top = []
  171.  
  172.                 for score in results_top_webElem:
  173.                     score_text = "^".join(score.text.split("\n"))
  174.                     results_top.append(score_text)
  175.  
  176.  
  177.                 results_bottom_class = 'event__part--away'
  178.                 results_bottom_webElem = elems[index].find_elements(By.CLASS_NAME, results_bottom_class)
  179.                 results_bottom = []
  180.  
  181.                 for score in results_bottom_webElem:
  182.                     score_text = "^".join(score.text.split("\n"))
  183.                     results_bottom.append(score_text)
  184.  
  185.                 # FIGURING OUT WHICH PLAYER IS TOP AND WHICH PLAYER IS THE BOTTOM ONE
  186.                 # PLAYER 1 = PLAYER USER SEARCHED FOR
  187.                 # PLAYER 2 = OPPONENT
  188.                 player1 = []
  189.                 player2 = []
  190.  
  191.                 # FIGURING OUT IF PLAYER 1 IS ON TOP OR NOT
  192.                 top_score = elems[index].find_element(By.CLASS_NAME, "event__score--home").text
  193.                 bottom_score = elems[index].find_element(By.CLASS_NAME, "event__score--away").text
  194.  
  195.                 if top_score == "-" and bottom_score =="-":
  196.                     continue
  197.                 player_top = True
  198.                 if win_loss == 'L' and top_score > bottom_score:
  199.                     player_top = False
  200.                 elif win_loss == 'W' and bottom_score > top_score:
  201.                     player_top = False
  202.  
  203.                 # ASSIGNING RESULTS ON THE BASIS OF PLAYER 1 POSITION
  204.                 if player_top:
  205.                     player1.append(results_top)
  206.                     player2.append(results_bottom)
  207.                 else:
  208.                     player1.append(results_bottom)
  209.                     player2.append(results_top)
  210.  
  211.                 print(f"Results for the top player: {results_top}")
  212.                 print(f"Results for the bottom player: {results_bottom}")
  213.  
  214.                 flag = False
  215.                 try:
  216.                     for row in total_data:
  217.                         if row[0] == event_time and (top_score == row[2] or bottom_score == row[2]):
  218.                             print("found the same match in the database.\nContinuing with the data extraction.....")
  219.                             flag = True
  220.                             break
  221.                 except:
  222.                     print("some stupid error")
  223.                 if flag:
  224.                     break
  225.  
  226.                 match_links.append([event_time, link, win_loss, player1, player2,'match_link'])
  227.  
  228.             except Exception as e:
  229.                 print("Error with match links:")
  230.                 print(e)
  231.                 continue
  232.  
  233.     # match_links = match_links[::-1]
  234.     # GETTING MATCH DATA FOR EACH GIVEN MATCH
  235.     for i in range(len(match_links)):
  236.         print(f"\n\nCurrently on {player_count}\n\n")
  237.         try:
  238.             if match_links[i][-1] == 'header':
  239.                 with open(path, 'a') as file:
  240.                     csvwriter = csv.writer(file)
  241.                     a = " ".join(match_links[i][0].split('\n')).split(',')
  242.                     dt = [a[0], a[-1].split(" ")[1], 'header']
  243.                     csvwriter.writerow(dt)
  244.  
  245.  
  246. """
  247. The remainder of the script is omitted from this paste; everything after this point works as intended and has no known problems.
  248. """
  249.  
Advertisement
Add Comment
Please, Sign In to add comment