Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import csv
- import os
- import time
- import tempfile
- from datetime import datetime
- from selenium import webdriver
- from concurrent import futures
- from selenium.webdriver.common.by import By
- from concurrent.futures import ThreadPoolExecutor
- from selenium.webdriver.chrome.options import Options
- from webdriver_manager.chrome import ChromeDriverManager
- from selenium.webdriver.chrome.service import Service
- # from webdriver_manager.firefox import GeckoDriverManager
# Module-level lists; nothing in the visible part of this file fills them.
groups = []
links_path = []
names_links = []

# Relative directory paths. scrape() calls os.chdir(db_path2) before it
# touches any CSV file.
db_path = "./FlashScore_database/"
db_path2 = "./"

# The chromedriver binary is looked up relative to this script's own
# directory, so the lookup does not depend on the current working directory.
driver_path = "./chromedriver-linux64/chromedriver"
chromedriver_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), driver_path)

# Firefox alternative (disabled):
# from selenium.webdriver.firefox.service import Service
# from webdriver_manager.firefox import GeckoDriverManager
def get_driver():
    """Create a headless Chrome WebDriver that uses the bundled chromedriver.

    Returns:
        A ``webdriver.Chrome`` instance started through a ``Service`` that
        points at the module-level ``chromedriver_path`` and configured with
        the command-line switches added below.
    """
    options = Options()
    options.add_argument("--headless")  # optional
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    # The size is passed as "width,height" with no space: a space after the
    # comma becomes part of the value handed to Chrome.
    options.add_argument("--window-size=1920,1080")
    service = Service(chromedriver_path)
    # Alternative (disabled): let webdriver_manager fetch a matching driver.
    # service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=service, options=options)
- def scrape(url, path, path2, player_count):
- print(f"\n\nCurrently on {player_count}\n\n")
- print(f"\n\nURL: {url}")
- # CREATING THE HEADERS FOR THE FILE
- os.chdir(db_path2)
- total_data = []
- if not os.path.exists(path) or not path.endswith(".csv"):
- raise FileNotFoundError(f"Invalid csv path: {path}")
- if not os.path.exists(path):
- os.makedirs(os.path.dirname(path), exist_ok=True)
- with open(path, "w", encoding="utf-8") as file:
- csvwriter = csv.writer(file)
- csvwriter.writerow(['Date&Time','Name','Final Scores', 'Set1', 'Set2', 'Set3', 'Set4', 'Set5','Aces','Double Faults','1st Serve Percentage','1st Serve Points Won','2nd Serve Points Won','Break Points Saved','1st Return Points Won','2nd Return Points Won','Break Points Converted','Winners','Unforced Errors','Net Points Won','Max Points In Row','Service Points Won','Return Points Won','Total Points Won','Max Games In Row','Service Games Won','Return Games Won','Total Games Won','PBP(Set1)','PBP(Set2)','PBP(Set3)','PBP(Set4)','PBP(Set5)','odds'])
- with open(path, encoding="utf-8") as file: csvreader = csv.reader(file) for row in csvreader:
- total_data.append(row)
- with open(path2 + "/match_details_backup.csv", 'w', encoding="utf-8") as file:
- csvwriter = csv.writer(file)
- for row in total_data:
- csvwriter.writerow(row)
- total_data = total_data[2:]
- print(total_data)
- if not os.path.exists(path):
- os.makedirs(os.path.dirname(path), exit_ok=True)
- with open(path, 'w') as file:
- csvwriter = csv.writer(file)
- csvwriter.writerow(['Date&Time','Name','Final Scores', 'Set1', 'Set2', 'Set3', 'Set4', 'Set5','Aces','Double Faults','1st Serve Percentage','1st Serve Points Won','2nd Serve Points Won','Break Points Saved','1st Return Points Won','2nd Return Points Won','Break Points Converted','Winners','Unforced Errors','Net Points Won','Max Points In Row','Service Points Won','Return Points Won','Total Points Won','Max Games In Row','Service Games Won','Return Games Won','Total Games Won','PBP(Set1)','PBP(Set2)','PBP(Set3)','PBP(Set4)','PBP(Set5)','odds'])
- # INITIALISING DRIVER AND LOCATING THE WEBSITE
- # driver_location = "/usr/local/bin/geckodriver" # Ensure this is correct
- # binary_location = "/usr/bin/firefox-esr" # Ensure this is correct
- # service = Service(executable_path=driver_location)
- #optns = Options()
- #optns.add_argument("--disable-extensions")
- #optns.add_argument("--disable-images")
- #optns.binary_location = binary_location
- #optns.add_argument("--no-first-run")
- #optns.add_argument("--no-service-autorun")
- #optns.add_argument("--password-store=basic")
- #temp_profile = tempfile.mkdtemp()
- #optns.add_argument(f"--set-data-dir={temp_profile}")
- # driver = webdriver.Firefox(service=service, options=optns)
- #services = Service(ChromeDriverManager().install())
- # driver = webdriver.Chrome(service=services, options = optns)
- driver = get_driver()
- driver.get(url)
- time.sleep(3)
- #driver.save_screenshot(f"{url}")
- print("started")
- # ACCEPTING COOKIES
- try:
- cookies_btn_xpath = '//*[@id="onetrust-accept-btn-handler"]'
- cookies_btn = driver.find_element(By.XPATH, cookies_btn_xpath)
- cookies_btn.click()
- except:
- pass
- # CLICKING ON SHOW MORE MATCHES FOR X AMOUNT OF TIMES
- try:
- for i in range(8):
- show_more_class = 'wclButtonLink'
- show_more_btn = driver.find_element(By.CLASS_NAME, show_more_class)
- show_more_btn.click()
- time.sleep(3)
- except:
- pass
- # GETTING THE WIN STATUS FOR ALL THE MATCHES IN A LIST NAMED RESULT
- result = driver.find_elements(By.TAG_NAME, 'span')
- bad_search = []
- for i in result:
- if i.get_attribute('data-testid') != 'wcl-scores-simpleText1':
- bad_search.append(i)
- for i in bad_search:
- result.remove(i)
- results = []
- for i in result:
- results.append(i.text)
- # GETTING THE ENTIRE MATCH LIST
- match_list_class = 'sportName'
- match_list = driver.find_element(By.CLASS_NAME, match_list_class)
- elems = match_list.find_elements(By.TAG_NAME, 'div')
- match_links = []
- bad_search_matches = []
- for elem in elems:
- class_elem = elem.get_attribute('class')
- if "wclLeagueHeader" not in class_elem and 'event__match' not in class_elem:
- bad_search_matches.append(elem)
- for i in bad_search_matches:
- elems.remove(i)
- for index in range(len(elems)):
- # CHECKING IF THE MATCH LINK IS A HEADER OR AN ACTUAL LINK TO THE MATCH AND THEN CLASSIFYING THEM ACCORDINGLY
- if "wclLeagueHeader" in elems[index].get_attribute('class'):
- header_text = elems[index].text
- match_links.append([header_text, 'header'])
- elif 'event__match' in elems[index].get_attribute('class'):
- try:
- # GETTING ALL THE MATCH LINKS AVAILABLE ON THE PAGE WITH THE RESULT OF THAT MATCH
- link = elems[index].find_element(By.CLASS_NAME, 'eventRowLink').get_attribute('href')
- # GETTING THE WIN STATUS OF THE MATCH
- win_loss = ""
- win_loss_temp = elems[index].find_elements(By.CLASS_NAME, 'wcl-scores-simpleText-01_ntYoG')
- for z in win_loss_temp:
- if z.text == 'W' or z.text == 'L':
- win_loss = i.text
- break
- # GETTING THE EVENT TIME
- event_time_class = 'event__time'
- event_time = elems[index].find_element(By.CLASS_NAME, event_time_class).text
- # GETTING THE RESULTS FOR EACH SET FOR TOP AND BOTTOM PLAYERS IN A GIVEN MATCH
- results_top_class = 'event__part--home'
- print(elems[index].get_attribute("class"))
- results_top_webElem = elems[index].find_elements(By.CLASS_NAME, results_top_class)
- results_top = []
- for score in results_top_webElem:
- score_text = "^".join(score.text.split("\n"))
- results_top.append(score_text)
- results_bottom_class = 'event__part--away'
- results_bottom_webElem = elems[index].find_elements(By.CLASS_NAME, results_bottom_class)
- results_bottom = []
- for score in results_bottom_webElem:
- score_text = "^".join(score.text.split("\n"))
- results_bottom.append(score_text)
- # FIGURING OUT WHICH PLAYER IS TOP AND WHICH PLAYER IS THE BOTTOM ONE
- # PLAYER 1 = PLAYER USER SEARCHED FOR
- # PLAYER 2 = OPPONENT
- player1 = []
- player2 = []
- # FIGURING OUT IF PLAYER 1 IS ON TOP OR NOT
- top_score = elems[index].find_element(By.CLASS_NAME, "event__score--home").text
- bottom_score = elems[index].find_element(By.CLASS_NAME, "event__score--away").text
- if top_score == "-" and bottom_score =="-":
- continue
- player_top = True
- if win_loss == 'L' and top_score > bottom_score:
- player_top = False
- elif win_loss == 'W' and bottom_score > top_score:
- player_top = False
- # ASSIGNING RESULTS ON THE BASIS OF PLAYER 1 POSITION
- if player_top:
- player1.append(results_top)
- player2.append(results_bottom)
- else:
- player1.append(results_bottom)
- player2.append(results_top)
- print(f"Results for the top player: {results_top}")
- print(f"Results for the bottom player: {results_bottom}")
- flag = False
- try:
- for row in total_data:
- if row[0] == event_time and (top_score == row[2] or bottom_score == row[2]):
- print("found the same match in the database.\nContinuing with the data extraction.....")
- flag = True
- break
- except:
- print("some stupid error")
- if flag:
- break
- match_links.append([event_time, link, win_loss, player1, player2,'match_link'])
- except Exception as e:
- print("Error with match links:")
- print(e)
- continue
- # match_links = match_links[::-1]
- # GETTING MATCH DATA FOR EACH GIVEN MATCH
- for i in range(len(match_links)):
- print(f"\n\nCurrently on {player_count}\n\n")
- try:
- if match_links[i][-1] == 'header':
- with open(path, 'a') as file:
- csvwriter = csv.writer(file)
- a = " ".join(match_links[i][0].split('\n')).split(',')
- dt = [a[0], a[-1].split(" ")[1], 'header']
- csvwriter.writerow(dt)
- """
- After this the code works as intended and has no problems
- """
Advertisement
Add Comment
Please, Sign In to add comment