PyNoob1

LLM Error

Feb 20th, 2024
This is the parent widget code to which I want to add the web scraper, which I have also provided below.

# Parent Widget Code:

#!/usr/bin/env python3

import tkinter as tk
from tkinter import messagebox, PhotoImage, font
from tkinter.ttk import Frame, Button
from io import StringIO
import sys
from ttkthemes import ThemedTk
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import time

# Class that redirects console output to the text widget by standing in
# for sys.stdout
class TextRedirector(object):
    def __init__(self, widget):
        self.widget = widget

    def write(self, text):  # renamed from `str` so the builtin is not shadowed
        self.widget.insert(tk.END, text)
        self.widget.see(tk.END)

    def flush(self):
        # No-op: the Text widget needs no flushing, but file-like objects
        # are expected to provide flush()
        pass

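# A hedged aside, not in the original: once sys.stdout is swapped out below,
# tracebacks and print() output only appear inside the window. One minimal
# way to restore the real console when the interpreter exits
# (sys.__stdout__ always holds the original stream):
import atexit
atexit.register(lambda: setattr(sys, "stdout", sys.__stdout__))
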
# Your functions
def get_profile():
    # Note: FirefoxProfile is deprecated in newer Selenium 4 releases;
    # options.set_preference() achieves the same thing directly
    profile = webdriver.FirefoxProfile()
    profile.set_preference("browser.privatebrowsing.autostart", True)
    return profile

def scroll_to_end(browser):
    # Keep scrolling to the bottom until the page height stops growing,
    # i.e. the infinite scroll has nothing left to load
    scroll_pause_time = 5
    last_height = browser.execute_script("return document.body.scrollHeight")

    while True:
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause_time)
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

def save_html(browser, path):
    # Save a prettified copy of the rendered page for offline inspection
    html_content = browser.page_source
    soup = BeautifulSoup(html_content, 'html.parser')
    pretty_html = soup.prettify()

    with open(path, 'w', encoding='utf-8') as f:
        f.write(pretty_html)

def scrape_website():
    options = Options()
    options.profile = get_profile()
    browser = webdriver.Firefox(options=options)
    browser.get("https://www.oddsportal.com/matches/football/20240222/")

    scroll_to_end(browser)
    save_html(browser, r"/home/harshad/Projects/Predictor/Football Predictor/research/website_soup.txt")

    browser.quit()
    print("Website scraped successfully!")

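# A hedged sketch, not in the original: running Selenium inside a button
# callback freezes the Tk mainloop until the scrape finishes; a daemon
# thread keeps the window responsive. Caveat: TextRedirector would then
# insert into the Text widget from a worker thread, so strictly thread-safe
# code would hand those writes back to the UI thread via root.after().
import threading

def scrape_website_async():
    threading.Thread(target=scrape_website, daemon=True).start()
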
def refresh_selections():
    print("Selections updated successfully!")

def predict():
    print("Prediction completed successfully!")

# Create the main window
root = ThemedTk(theme="arc")
root.title("Football Predictor")

# Create a text widget
output = tk.Text(root)
output.pack()

# Redirect console output to text widget
sys.stdout = TextRedirector(output)

# Create a frame for the buttons
button_frame = Frame(root)
button_frame.pack(fill=tk.BOTH, expand=True)

# Create buttons for each action
scrape_button = Button(button_frame, text="Scrape Website", command=scrape_website)
update_button = Button(button_frame, text="Refresh Selections", command=refresh_selections)
predict_button = Button(button_frame, text="Predict", command=predict)

# Add the buttons to the window
scrape_button.grid(row=0, column=0, sticky='ew')
update_button.grid(row=1, column=0, sticky='ew')
predict_button.grid(row=2, column=0, sticky='ew')

# Configure the rows and columns for responsiveness
button_frame.columnconfigure(0, weight=1)
for i in range(3):
    button_frame.rowconfigure(i, weight=1)

# Start the main loop
root.mainloop()

Above is the parent code.

Below is the existing code that scrapes the data from the website:

# Existing Scraper Code

# Imports this function relies on but which were missing from the paste
# (create_driver() itself is defined elsewhere in the original project)
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup as bs
from lxml import html
import pandas as pd
from tabulate import tabulate
import time

def parse_data(url, return_urls=False):
    print(f'Parsing URL: {url}\n')
    browser = create_driver()
    browser.get(url)
    # Wait for the first match element to be present
    wait = WebDriverWait(browser, 20)
    # Wait for the team names to be present
    team_name_class = "participant-name"
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, team_name_class)))

    # Now, check if the first match element contains match data
    match_element_class = "border-black-borders.group.flex"  # Class of the match element
    first_match_element = browser.find_element(By.CSS_SELECTOR, f".{match_element_class}")
    match_data_indicator = "participant-name"  # Class indicating match data
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, match_data_indicator)))

    # ########## For page to scroll to the end ###########
    scroll_pause_time = 4
    try:
        # Get scroll height
        last_height = browser.execute_script("return document.body.scrollHeight")

        while True:
            # Scroll to the top, wait briefly, then scroll back down so the
            # lazy-loaded content keeps arriving
            browser.execute_script("window.scrollTo(0, 0);")
            time.sleep(1)
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause_time)
            # Calculate new scroll height and compare with last scroll height
            new_height = browser.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
        # ########## For page to scroll to the end ###########
    # If the browser stalls and gives a timeout exception, refresh the page
    except TimeoutException:
        print("Timeout exception occurred, refreshing the page...")
        browser.refresh()

    html_content = browser.page_source

    soup = bs(html_content, 'html.parser')
    pretty_html = soup.prettify()

    with open(r"C:\Users\harshad\Documents\Harshad Projects\Python Projects\Football Predictor\Files\z.Research\website_soup.txt", 'w', encoding='utf-8') as f:
        f.write(pretty_html)

    # Using lxml and XPath
    tree = html.fromstring(html_content)

    # Extracting the date from the header
    date = tree.xpath('//h1[@id="next-matches-h1"]/text()')[0]

    # Splitting the date string to get the date part
    date = date.split(", ")[-1]

    data = []

    # Initialize initial values for Country and League
    current_country = None
    current_league = None

    # Extracting match rows
    match_rows = tree.xpath('//div[contains(@class, "eventRow")]')

    # Iterate through each match row
    for match_row in match_rows:
        country = match_row.xpath('.//a[contains(@class, "font-normal")]/p[contains(@class, "truncate")]/text()')
        league = match_row.xpath('.//a[contains(@class, "truncate")]/text()')

        # Country/league headers only appear on the first row of each group,
        # so carry the last seen values forward
        if country:
            current_country = country[0].strip()
        else:
            country = [current_country]

        if league:
            current_league = league[0].strip()
        else:
            league = [current_league]

        # Extract Game Teams
        team1_elements = match_row.xpath('.//a[starts-with(@href, "/football/")][1]/div/p/text()')
        team1 = team1_elements[0].strip() if team1_elements else ''
        team2_elements = match_row.xpath('.//a[starts-with(@href, "/football/")][2]/div/p/text()')
        team2 = team2_elements[0].strip() if team2_elements else ''
        game_teams_str = f"{team1} - {team2}"

        # Renamed from `time` so the time module is not shadowed
        match_time = match_row.xpath('.//div/p/text()')[0].strip()

        # Extract the scores
        scores_xpath = './/a/div[contains(@class, "ml-auto mr-3 flex font-bold")]/text()'
        scores = match_row.xpath(scores_xpath)
        scores = [score.strip() for score in scores]

        if len(scores) >= 2:
            score1, score2 = scores[:2]
            score = f"{score1}:{score2}"
        else:
            score = None
        # Check if " " is in the score
        if score and " " in score:
            try:
                score1_xpath = './/div[contains(@class, "ml-auto mr-3 flex font-bold next-m:!hidden")]/text()'
                score2_xpath = './/div[contains(@class, "ml-auto mr-3 flex font-bold next-m:!hidden font-bold")]/text()'
                # Extract the scores
                score1 = match_row.xpath(score1_xpath)[0]
                print("Score1 List: ", score1)
                score1 = score1[0] if score1 else None

                score2 = match_row.xpath(score2_xpath)[0]
                print("Score2 List: ", score2)
                score2 = score2[0] if score2 else None

                if score1 and score2:
                    score = f"{score1.strip()}:{score2.strip()}"
            except IndexError:
                score_list = match_row.xpath('//div[@data-v-0398e93e=""]/p/text()')
                print("Score List: ", score_list)
                score = score_list[0].strip() if score_list else None

        # Extract odds and handle the case where odds are not present
        odds = match_row.xpath('.//p[contains(@class, "height-content")]/text()')
        odds = [s.strip() for s in odds if s.strip().replace('.', '', 1).isdigit()]
        home_odds = odds[0] if odds else None
        draw_odds = odds[1] if len(odds) > 1 else None
        away_odds = odds[2] if len(odds) > 2 else None

        # Append extracted data to the data list
        data.append({
            "country": country[0],  # Take the first element as the value
            "league": league[0],  # Take the first element as the value
            "date": date,
            "time": match_time,
            "game": game_teams_str,
            "score": score,
            "home_odds": home_odds,
            "draw_odds": draw_odds,
            "away_odds": away_odds
        })

    # Data validation
    if not data:
        print(f"No valid data extracted for URL: {url}")
        return None  # Skip this URL and return None

    # Create a DataFrame from the data list
    df = pd.DataFrame(data)

    # Convert the DataFrame to a formatted table
    table = tabulate(df, headers='keys', tablefmt='pretty')
    print(f"Table for URL: {url}\n{table}")

    # Return the DataFrame for this URL
    return df

I want to refactor this scraper into the parent widget code.

Please provide the refactored code.
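
For reference, here is a minimal sketch of what the wiring might look like, assuming parse_data() and its imports sit in the same module as the widget code. create_driver() is not included in the paste, so the version below is a hypothetical reconstruction modeled on get_profile() and webdriver.Firefox from the parent code, and run_scraper() is an illustrative name, not part of the original:

import threading

# Hypothetical create_driver(), guessed from the parent code's Firefox setup
def create_driver():
    options = Options()
    options.profile = get_profile()
    return webdriver.Firefox(options=options)

# Call parse_data() with the URL the parent code already targets; print()
# output lands in the Text widget via the redirected stdout
def run_scraper():
    df = parse_data("https://www.oddsportal.com/matches/football/20240222/")
    if df is not None:
        print("Website scraped successfully!")

# Point the existing "Scrape Website" button at the scraper, on a worker
# thread so the Tk mainloop stays responsive
scrape_button.configure(command=lambda: threading.Thread(target=run_scraper, daemon=True).start())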