# tennis scraper

import csv
import os
import time
import tempfile
from datetime import datetime
from selenium import webdriver
from concurrent import futures
from selenium.webdriver.common.by import By
from concurrent.futures import ThreadPoolExecutor
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
# from webdriver_manager.firefox import GeckoDriverManager


# Module-level lists; the visible part of the file never fills them.
groups, links_path, names_links = [], [], []

# db_path2 is the directory scrape() switches into before touching files.
# db_path is presumably the CSV database folder -- not used in the visible code.
db_path = "./FlashScore_database/"
db_path2 = "./"

# chromedriver binary location, resolved against this script's own directory.
driver_path = "./chromedriver-linux64/chromedriver"
chromedriver_path = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    driver_path,
)
# from selenium.webdriver.firefox.service import Service
# from webdriver_manager.firefox import GeckoDriverManager

def get_driver():
    """Create a headless Chrome WebDriver using the local chromedriver binary.

    The driver executable is taken from the module-level ``chromedriver_path``.

    Returns:
        A new ``webdriver.Chrome`` instance configured with the flags below.
    """
    options = Options()
    for flag in (
        "--headless",  # optional
        "--disable-dev-shm-usage",
        "--no-sandbox",
        "--disable-gpu",
        # The size value must be "width,height" with no space after the
        # comma; the previous "1920, 1080" did not match that format.
        "--window-size=1920,1080",
    ):
        options.add_argument(flag)

    service = Service(chromedriver_path)
    # Alternative kept from the original: Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=service, options=options)


def scrape(url, path, path2, player_count):
    print(f"\n\nCurrently on {player_count}\n\n")
    print(f"\n\nURL: {url}")

    # CREATING THE HEADERS FOR THE FILE
    os.chdir(db_path2)
    total_data = []
    if not os.path.exists(path) or not path.endswith(".csv"):
        raise FileNotFoundError(f"Invalid csv path: {path}")
    if not os.path.exists(path):
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, "w", encoding="utf-8") as file:
            csvwriter = csv.writer(file)
            csvwriter.writerow(['Date&Time','Name','Final Scores', 'Set1', 'Set2', 'Set3', 'Set4', 'Set5','Aces','Double Faults','1st Serve Percentage','1st Serve Points Won','2nd Serve Points Won','Break Points Saved','1st Return Points Won','2nd Return Points Won','Break Points Converted','Winners','Unforced Errors','Net Points Won','Max Points In Row','Service Points Won','Return Points Won','Total Points Won','Max Games In Row','Service Games Won','Return Games Won','Total Games Won','PBP(Set1)','PBP(Set2)','PBP(Set3)','PBP(Set4)','PBP(Set5)','odds'])

    with open(path, encoding="utf-8") as file:                                                                                                                      csvreader = csv.reader(file)                                                                                                                                for row in csvreader:
            total_data.append(row)

    with open(path2 + "/match_details_backup.csv", 'w', encoding="utf-8") as file:
        csvwriter = csv.writer(file)
        for row in total_data:
            csvwriter.writerow(row)


    total_data = total_data[2:]
    print(total_data)
    if not os.path.exists(path):
        os.makedirs(os.path.dirname(path), exit_ok=True)
    with open(path, 'w') as file:
        csvwriter = csv.writer(file)
        csvwriter.writerow(['Date&Time','Name','Final Scores', 'Set1', 'Set2', 'Set3', 'Set4', 'Set5','Aces','Double Faults','1st Serve Percentage','1st Serve Points Won','2nd Serve Points Won','Break Points Saved','1st Return Points Won','2nd Return Points Won','Break Points Converted','Winners','Unforced Errors','Net Points Won','Max Points In Row','Service Points Won','Return Points Won','Total Points Won','Max Games In Row','Service Games Won','Return Games Won','Total Games Won','PBP(Set1)','PBP(Set2)','PBP(Set3)','PBP(Set4)','PBP(Set5)','odds'])

    # INITIALISING DRIVER AND LOCATING THE WEBSITE
    # driver_location = "/usr/local/bin/geckodriver"  # Ensure this is correct
    # binary_location = "/usr/bin/firefox-esr"  # Ensure this is correct

    # service = Service(executable_path=driver_location)
    #optns = Options()
    #optns.add_argument("--disable-extensions")
    #optns.add_argument("--disable-images")
    #optns.binary_location = binary_location
    #optns.add_argument("--no-first-run")
    #optns.add_argument("--no-service-autorun")
    #optns.add_argument("--password-store=basic")
    #temp_profile = tempfile.mkdtemp()
    #optns.add_argument(f"--set-data-dir={temp_profile}")
    # driver = webdriver.Firefox(service=service, options=optns)
    #services = Service(ChromeDriverManager().install())
    # driver = webdriver.Chrome(service=services, options = optns)
    driver = get_driver()
    driver.get(url)
    time.sleep(3)
    #driver.save_screenshot(f"{url}")
    print("started")

    # ACCEPTING COOKIES
    try:
        cookies_btn_xpath = '//*[@id="onetrust-accept-btn-handler"]'
        cookies_btn = driver.find_element(By.XPATH, cookies_btn_xpath)
        cookies_btn.click()
    except:
        pass


    # CLICKING ON SHOW MORE MATCHES FOR X AMOUNT OF TIMES
    try:
        for i in range(8):
            show_more_class = 'wclButtonLink'
            show_more_btn = driver.find_element(By.CLASS_NAME, show_more_class)
            show_more_btn.click()
            time.sleep(3)
    except:
        pass

    # GETTING THE WIN STATUS FOR ALL THE MATCHES IN A LIST NAMED RESULT
    result = driver.find_elements(By.TAG_NAME, 'span')
    bad_search = []

    for i in result:
        if i.get_attribute('data-testid') != 'wcl-scores-simpleText1':
            bad_search.append(i)

    for i in bad_search:
        result.remove(i)

    results = []
    for i in result:
        results.append(i.text)


    # GETTING THE ENTIRE MATCH LIST
    match_list_class = 'sportName'
    match_list = driver.find_element(By.CLASS_NAME, match_list_class)
    elems = match_list.find_elements(By.TAG_NAME, 'div')
    match_links = []
    bad_search_matches = []
    for elem in elems:
        class_elem = elem.get_attribute('class')
        if "wclLeagueHeader" not in class_elem and 'event__match' not in class_elem:
            bad_search_matches.append(elem)

    for i in bad_search_matches:
        elems.remove(i)

    for index in range(len(elems)):

        # CHECKING IF THE MATCH LINK IS A HEADER OR AN ACTUAL LINK TO THE MATCH AND THEN CLASSIFYING THEM ACCORDINGLY
        if "wclLeagueHeader" in elems[index].get_attribute('class'):
            header_text = elems[index].text
            match_links.append([header_text, 'header'])
        elif 'event__match' in elems[index].get_attribute('class'):
            try:
                # GETTING ALL THE MATCH LINKS AVAILABLE ON THE PAGE WITH THE RESULT OF THAT MATCH
                link = elems[index].find_element(By.CLASS_NAME, 'eventRowLink').get_attribute('href')

                # GETTING THE WIN STATUS OF THE MATCH
                win_loss = ""
                win_loss_temp = elems[index].find_elements(By.CLASS_NAME, 'wcl-scores-simpleText-01_ntYoG')
                for z in win_loss_temp:
                    if z.text == 'W' or z.text == 'L':
                        win_loss = i.text
                        break

                # GETTING THE EVENT TIME
                event_time_class = 'event__time'
                event_time = elems[index].find_element(By.CLASS_NAME, event_time_class).text


                # GETTING THE RESULTS FOR EACH SET FOR TOP AND BOTTOM PLAYERS IN A GIVEN MATCH
                results_top_class = 'event__part--home'
                print(elems[index].get_attribute("class"))
                results_top_webElem = elems[index].find_elements(By.CLASS_NAME, results_top_class)
                results_top = []

                for score in results_top_webElem:
                    score_text = "^".join(score.text.split("\n"))
                    results_top.append(score_text)


                results_bottom_class = 'event__part--away'
                results_bottom_webElem = elems[index].find_elements(By.CLASS_NAME, results_bottom_class)
                results_bottom = []

                for score in results_bottom_webElem:
                    score_text = "^".join(score.text.split("\n"))
                    results_bottom.append(score_text)

                # FIGURING OUT WHICH PLAYER IS TOP AND WHICH PLAYER IS THE BOTTOM ONE
                # PLAYER 1 = PLAYER USER SEARCHED FOR
                # PLAYER 2 = OPPONENT
                player1 = []
                player2 = []

                # FIGURING OUT IF PLAYER 1 IS ON TOP OR NOT
                top_score = elems[index].find_element(By.CLASS_NAME, "event__score--home").text
                bottom_score = elems[index].find_element(By.CLASS_NAME, "event__score--away").text

                if top_score == "-" and bottom_score =="-":
                    continue
                player_top = True
                if win_loss == 'L' and top_score > bottom_score:
                    player_top = False
                elif win_loss == 'W' and bottom_score > top_score:
                    player_top = False

                # ASSIGNING RESULTS ON THE BASIS OF PLAYER 1 POSITION
                if player_top:
                    player1.append(results_top)
                    player2.append(results_bottom)
                else:
                    player1.append(results_bottom)
                    player2.append(results_top)

                print(f"Results for the top player: {results_top}")
                print(f"Results for the bottom player: {results_bottom}")

                flag = False
                try:
                    for row in total_data:
                        if row[0] == event_time and (top_score == row[2] or bottom_score == row[2]):
                            print("found the same match in the database.\nContinuing with the data extraction.....")
                            flag = True
                            break
                except:
                    print("some stupid error")
                if flag:
                    break

                match_links.append([event_time, link, win_loss, player1, player2,'match_link'])

            except Exception as e:
                print("Error with match links:")
                print(e)
                continue

    # match_links = match_links[::-1]
    # GETTING MATCH DATA FOR EACH GIVEN MATCH
    for i in range(len(match_links)):
        print(f"\n\nCurrently on {player_count}\n\n")
        try:
            if match_links[i][-1] == 'header':
                with open(path, 'a') as file:
                    csvwriter = csv.writer(file)
                    a = " ".join(match_links[i][0].split('\n')).split(',')
                    dt = [a[0], a[-1].split(" ")[1], 'header']
                    csvwriter.writerow(dt)


"""
After this the code works as intended and has no problems
"""