This is the parent widget code to which I want to add the web scraper that I have also provided:
# Parent Widget Code:
#!/usr/bin/env python3
import sys
import time
import tkinter as tk
from tkinter.ttk import Frame, Button

from ttkthemes import ThemedTk
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup


# Redirects console output to the text widget
class TextRedirector(object):
    def __init__(self, widget):
        self.widget = widget

    def write(self, text):  # renamed from "str" to avoid shadowing the builtin
        self.widget.insert(tk.END, text)
        self.widget.see(tk.END)

    def flush(self):
        # Tkinter's Text widget doesn't need flushing
        pass


# Scraper helper functions
def get_profile():
    # Note: FirefoxProfile is deprecated in newer Selenium releases;
    # options.set_preference can set this directly instead.
    profile = webdriver.FirefoxProfile()
    profile.set_preference("browser.privatebrowsing.autostart", True)
    return profile


def scroll_to_end(browser):
    scroll_pause_time = 5
    last_height = browser.execute_script("return document.body.scrollHeight")
    while True:
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause_time)
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


def save_html(browser, path):
    html_content = browser.page_source
    soup = BeautifulSoup(html_content, 'html.parser')
    pretty_html = soup.prettify()
    with open(path, 'w', encoding='utf-8') as f:
        f.write(pretty_html)


def scrape_website():
    options = Options()
    options.profile = get_profile()
    browser = webdriver.Firefox(options=options)
    browser.get("https://www.oddsportal.com/matches/football/20240222/")
    scroll_to_end(browser)
    save_html(browser, r"/home/harshad/Projects/Predictor/Football Predictor/research/website_soup.txt")
    browser.quit()
    print("Website scraped successfully!")


def refresh_selections():
    print("Selections updated successfully!")


def predict():
    print("Prediction completed successfully!")


# Create the main window
root = ThemedTk(theme="arc")
root.title("Football Predictor")

# Create a text widget
output = tk.Text(root)
output.pack()

# Redirect console output to the text widget
sys.stdout = TextRedirector(output)

# Create a frame for the buttons
button_frame = Frame(root)
button_frame.pack(fill=tk.BOTH, expand=True)

# Create buttons for each action
scrape_button = Button(button_frame, text="Scrape Website", command=scrape_website)
update_button = Button(button_frame, text="Refresh Selections", command=refresh_selections)
predict_button = Button(button_frame, text="Predict", command=predict)

# Add the buttons to the window
scrape_button.grid(row=0, column=0, sticky='ew')
update_button.grid(row=1, column=0, sticky='ew')
predict_button.grid(row=2, column=0, sticky='ew')

# Configure the rows and columns for responsiveness
button_frame.columnconfigure(0, weight=1)
for i in range(3):
    button_frame.rowconfigure(i, weight=1)

# Start the main loop
root.mainloop()
Above is the parent code. Below is the existing code that scrapes the data from the website:
# Existing Scraper Code
import time

import pandas as pd
from tabulate import tabulate
from lxml import html
from bs4 import BeautifulSoup as bs
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

# create_driver() is a Selenium driver factory defined elsewhere in this project.


def parse_data(url, return_urls=False):
    print(f'Parsing URL: {url}\n')
    browser = create_driver()
    browser.get(url)

    # Wait for the team names (the first match data) to be present
    wait = WebDriverWait(browser, 20)
    team_name_class = "participant-name"
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, team_name_class)))

    # Now check that the first match element contains match data
    match_element_class = "border-black-borders.group.flex"  # Class of the match element
    first_match_element = browser.find_element(By.CSS_SELECTOR, f".{match_element_class}")
    match_data_indicator = "participant-name"  # Class indicating match data
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, match_data_indicator)))

    # ########## Scroll the page to the end ###########
    scroll_pause_time = 4
    try:
        # Get scroll height
        last_height = browser.execute_script("return document.body.scrollHeight")
        while True:
            browser.execute_script("window.scrollTo(0, 0);")  # Scroll to the top
            time.sleep(1)  # Wait briefly
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")  # Scroll down
            time.sleep(scroll_pause_time)
            # Calculate new scroll height and compare with the last scroll height
            new_height = browser.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
    # If the browser stalls and raises a timeout exception, refresh the page
    except TimeoutException:
        print("Timeout exception occurred, refreshing the page...")
        browser.refresh()

    html_content = browser.page_source
    soup = bs(html_content, 'html.parser')
    pretty_html = soup.prettify()
    with open(r"C:\Users\harshad\Documents\Harshad Projects\Python Projects\Football Predictor\Files\z.Research\website_soup.txt", 'w', encoding='utf-8') as f:
        f.write(pretty_html)

    # Using lxml and XPath
    tree = html.fromstring(html_content)

    # Extract the date from the header, then split off the date part
    date = tree.xpath('//h1[@id="next-matches-h1"]/text()')[0]
    date = date.split(", ")[-1]

    data = []
    # Initial values for country and league (carried forward between rows)
    current_country = None
    current_league = None

    # Extract match rows and iterate through each one
    match_rows = tree.xpath('//div[contains(@class, "eventRow")]')
    for match_row in match_rows:
        country = match_row.xpath('.//a[contains(@class, "font-normal")]/p[contains(@class, "truncate")]/text()')
        league = match_row.xpath('.//a[contains(@class, "truncate")]/text()')
        if country:
            current_country = country[0].strip()
        else:
            country = [current_country]
        if league:
            current_league = league[0].strip()
        else:
            league = [current_league]

        # Extract the two team names
        team1_elements = match_row.xpath('.//a[starts-with(@href, "/football/")][1]/div/p/text()')
        team1 = team1_elements[0].strip() if team1_elements else ''
        team2_elements = match_row.xpath('.//a[starts-with(@href, "/football/")][2]/div/p/text()')
        team2 = team2_elements[0].strip() if team2_elements else ''
        game_teams_str = f"{team1} - {team2}"

        # Kick-off time (renamed from "time" so it no longer shadows the time module)
        match_time = match_row.xpath('.//div/p/text()')[0].strip()

        # Extract the scores
        scores_xpath = './/a/div[contains(@class, "ml-auto mr-3 flex font-bold")]/text()'
        scores = [score.strip() for score in match_row.xpath(scores_xpath)]
        if len(scores) >= 2:
            score1, score2 = scores[:2]
            score = f"{score1}:{score2}"
        else:
            score = None

        # If the score contains a space, retry with the alternative selectors
        if score and " " in score:
            try:
                score1_xpath = './/div[contains(@class, "ml-auto mr-3 flex font-bold next-m:!hidden")]/text()'
                score2_xpath = './/div[contains(@class, "ml-auto mr-3 flex font-bold next-m:!hidden font-bold")]/text()'
                # An IndexError here triggers the fallback below
                score1 = match_row.xpath(score1_xpath)[0]
                print("Score1: ", score1)
                score2 = match_row.xpath(score2_xpath)[0]
                print("Score2: ", score2)
                if score1 and score2:
                    score = f"{score1.strip()}:{score2.strip()}"
            except IndexError:
                # Search relative to the row, not the whole document
                score_list = match_row.xpath('.//div[@data-v-0398e93e=""]/p/text()')
                print("Score List: ", score_list)
                score = score_list[0].strip() if score_list else None

        # Extract odds and handle the case where odds are not present
        odds = match_row.xpath('.//p[contains(@class, "height-content")]/text()')
        odds = [s.strip() for s in odds if s.strip().replace('.', '', 1).isdigit()]
        home_odds = odds[0] if odds else None
        draw_odds = odds[1] if len(odds) > 1 else None
        away_odds = odds[2] if len(odds) > 2 else None

        # Append the extracted data to the data list
        data.append({
            "country": country[0],  # Take the first element as the value
            "league": league[0],    # Take the first element as the value
            "date": date,
            "time": match_time,
            "game": game_teams_str,
            "score": score,
            "home_odds": home_odds,
            "draw_odds": draw_odds,
            "away_odds": away_odds
        })

    # Data validation
    if not data:
        print(f"No valid data extracted for URL: {url}")
        return None  # Skip this URL and return None

    # Build a DataFrame from the data list and print it as a formatted table
    df = pd.DataFrame(data)
    table = tabulate(df, headers='keys', tablefmt='pretty')
    print(f"Table for URL: {url}\n{table}")

    # Return the DataFrame for this URL
    return df
I want to refactor this scraper into the parent widget code. Please provide the refactored code.
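To make the goal concrete, here is a rough sketch of the shape I am after (not working code). It assumes parse_data can be imported from a module of mine (the name "scraper" is a placeholder), and it runs the scrape on a background thread so the Selenium work does not freeze the Tkinter mainloop:

import threading

from scraper import parse_data  # placeholder module name; parse_data is the function above

MATCHES_URL = "https://www.oddsportal.com/matches/football/20240222/"

def scrape_website():
    # Run the scraper off the Tk main thread so the window stays responsive.
    # Prints inside parse_data still land in the Text widget via TextRedirector.
    def worker():
        df = parse_data(MATCHES_URL)
        if df is not None:
            print("Website scraped successfully!")
    threading.Thread(target=worker, daemon=True).start()

One caveat I am aware of: Tkinter widgets are not thread-safe, so writes from the worker thread through TextRedirector should probably be marshalled back to the main thread (for example with root.after or a queue). Guidance on that would also be appreciated.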