Advertisement
Try95th

Oddsportal Scraper_New [clone] so_q_75058259

Feb 9th, 2023
195
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. ## based on https://stackoverflow.com/q/75058259/6146136
  2. ## cloned from https://pastebin.com/Sd0E1Hmm
  3.  
  4. import os
  5. import threading
  6. from math import nan
  7. from multiprocessing.pool import ThreadPool
  8. import time
  9. import pandas as pd
  10. from bs4 import BeautifulSoup as bs
  11. from selenium import webdriver
  12. from selenium.webdriver.common.by import By
  13. from selenium.webdriver.support.wait import WebDriverWait
  14. from selenium.webdriver.support import expected_conditions as EC
  15.  
  16.  
  17. class Driver:
  18.     def __init__(self):
  19.         options = webdriver.ChromeOptions()
  20.         options.add_argument("--headless")
  21.         # Un-comment next line to supress logging:
  22.         options.add_experimental_option('excludeSwitches', ['enable-logging'])
  23.         self.driver = webdriver.Chrome(options=options)
  24.  
  25.     def __del__(self):
  26.         self.driver.quit()  # clean up driver when we are cleaned up
  27.     # print('The driver has been "quitted".')
  28.  
  29.  
  30. threadLocal = threading.local()
  31.  
  32.  
  33. def create_driver():
  34.     the_driver = getattr(threadLocal, 'the_driver', None)
  35.     if the_driver is None:
  36.         the_driver = Driver()
  37.         setattr(threadLocal, 'the_driver', the_driver)
  38.     return the_driver.driver
  39.  
  40.  
  41. class GameData:
  42.     def __init__(self):
  43.         self.date = []
  44.         self.time = []
  45.         self.game = []
  46.         self.score = []
  47.         self.home_odds = []
  48.         self.draw_odds = []
  49.         self.away_odds = []
  50.         self.country = []
  51.         self.league = []
  52.  
  53.  
  54. def generate_matches(pgSoup, defaultVal=None):
  55.     evtSel = {
  56.         'time': 'p.whitespace-nowrap',
  57.         'game': 'a div:has(>a[title])',
  58.         'score': 'a:has(a[title])+div.hidden',
  59.         'home_odds': 'a:has(a[title])~div:not(.hidden)',
  60.         'draw_odds': 'a:has(a[title])~div:not(.hidden)+div:nth-last-of-type(3)',
  61.         'away_odds': 'a:has(a[title])~div:nth-last-of-type(2)',
  62.     }
  63.  
  64.     events, current_group = [], {}
  65.     pgDate = pgSoup.select_one('h1.title[id="next-matches-h1"]')
  66.     if pgDate: pgDate = pgDate.get_text().split(',', 1)[-1].strip()
  67.     for evt in pgSoup.select('div[set]>div:last-child'):
  68.         if evt.parent.select(f':scope>div:first-child+div+div'):
  69.             cgVals = [v.get_text(' ').strip() if v else defaultVal for v in [
  70.                 evt.parent.select_one(s) for s in
  71.                 [':scope>div:first-child+div>div:first-child',
  72.                  ':scope>div:first-child>a:nth-of-type(2):nth-last-of-type(2)',
  73.                  ':scope>div:first-child>a:nth-of-type(3):last-of-type']]]
  74.             current_group = dict(zip(['date', 'country', 'league'], cgVals))
  75.             if pgDate: current_group['date'] = pgDate
  76.  
  77.         evtRow = {'date': current_group.get('date', defaultVal)}
  78.  
  79.         for k, v in evtSel.items():
  80.             v = evt.select_one(v).get_text(' ') if evt.select_one(v) else defaultVal
  81.             evtRow[k] = ' '.join(v.split()) if isinstance(v, str) else v
  82.         evtTeams = evt.select('a div>a[title]')
  83.         evtRow['game'] = ' – '.join(a['title'] for a in evtTeams)
  84.         evtRow['country'] = current_group.get('country', defaultVal)
  85.         evtRow['league'] = current_group.get('league', defaultVal)
  86.  
  87.         events.append(evtRow)
  88.     return events
  89.  
  90.  
  91. def parse_data(url, return_urls=False):
  92.     browser = create_driver()
  93.     browser.get(url)
  94.     WebDriverWait(browser, 10).until(EC.presence_of_all_elements_located(
  95.         (By.CSS_SELECTOR, "div[set]>div:last-child a:has(a[title])~div:not(.hidden)")))
  96.     ########### For page to scroll to the end ###########
  97.     scroll_pause_time = 2
  98.  
  99.     # Get scroll height
  100.     last_height = browser.execute_script("return document.body.scrollHeight")
  101.  
  102.     while True:
  103.         # Scroll down to bottom
  104.         browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
  105.  
  106.         # Wait to load page
  107.         time.sleep(scroll_pause_time)
  108.  
  109.         # Calculate new scroll height and compare with last scroll height
  110.         new_height = browser.execute_script("return document.body.scrollHeight")
  111.         if new_height == last_height:
  112.             break
  113.         last_height = new_height
  114.      ########### For page to scroll to the end ###########
  115.     time.sleep(5)
  116.     soup = bs(browser.page_source, "lxml")
  117.  
  118.     game_data = GameData()
  119.     game_keys = [a for a, av in game_data.__dict__.items() if isinstance(av, list)]
  120.     for row in generate_matches(soup, defaultVal=nan):
  121.         for k in game_keys: getattr(game_data, k).append(row.get(k, nan))
  122.     if return_urls:
  123.         if return_urls:
  124.             a_cont = soup.find('div', {'class': 'tabs'})
  125.             if a_cont is None:
  126.                 a_tags = []
  127.             else:
  128.                 a_tags = a_cont.find_all('a', {'class': 'h-8', 'href': True})
  129.             urls = [
  130.                 'https://www.oddsportal.com' + a_tag['href'] for a_tag in a_tags
  131.                 if not a_tag['href'].startswith('#')  # sections in current page
  132.                 and 'active-item-calendar' not in a_tag['class']  # current page
  133.             ]
  134.             print(pd.DataFrame(urls, columns=['urls']))
  135.         return game_data, urls
  136.     return game_data
  137.  
  138.  
  139. if __name__ == '__main__':
  140.     games = None
  141.     pool = ThreadPool(5)
  142.     # Get today's data and the Urls for the other days:
  143.     url_today = 'https://www.oddsportal.com/matches/soccer'
  144.     game_data_today, urls = pool.apply(parse_data, args=(url_today, True))
  145.     game_data_results = pool.imap(parse_data, urls)
  146.     # ############################ BUILD  DATAFRAME ############################
  147.     # game_n, added_todayGame = 0, False
  148.     # for game_data in game_data_results:
  149.     #     try:
  150.     #         game_n += 1
  151.     #         gd_df = pd.DataFrame(game_data.__dict__)
  152.     #         games = gd_df if games is None else pd.concat([games, gd_df])
  153.     #         if not added_todayGame:
  154.     #             game_n += 1
  155.     #             gdt_df = pd.DataFrame(game_data_today.__dict__)
  156.     #             games, added_todayGame = pd.concat([games, gdt_df]), True
  157.     #     except Exception as e:
  158.     #         print(f'Error tabulating game_data_df#{game_n}:\n{repr(e)}')
  159.     # ##########################################################################
  160.  
  161.     # OR
  162.  
  163.     ############################ BUILD  DATAFRAME ############################
  164.     game_data_dfList, added_todayGame = [], False
  165.     for game_data in game_data_results:
  166.         try:
  167.             game_data_dfList.append(pd.DataFrame(game_data.__dict__))
  168.             if not added_todayGame:
  169.                 game_data_dfList += [pd.DataFrame(game_data_today.__dict__)]
  170.                 added_todayGame = True
  171.         except Exception as e:
  172.             game_n = len(game_data_dfList) + 1
  173.             print(f'Error tabulating game_data_df#{game_n}:\n{repr(e)}')
  174.         # finally: pass ## [ redundant ]
  175.     try:
  176.         games = pd.concat(game_data_dfList, ignore_index=True)
  177.     except Exception as e:
  178.         print('Error concatenating DataFrames:', repr(e))
  179.     ##########################################################################
  180.  
  181.     print('!?NO GAMES?!' if games is None else games)  ## print(games)
  182.     # ensure all the drivers are "quitted":
  183.     del threadLocal  # a little extra insurance
  184.     import gc
  185.  
  186.     gc.collect()
  187.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement