Advertisement
PyNoob1

Oddsportal Scraper_New

Feb 9th, 2023
2,408
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 7.05 KB | Sports | 0 0
  1. import os
  2. import threading
  3. from math import nan
  4. from multiprocessing.pool import ThreadPool
  5. import time
  6. import pandas as pd
  7. from bs4 import BeautifulSoup as bs
  8. from selenium import webdriver
  9. from selenium.webdriver.common.by import By
  10. from selenium.webdriver.support.wait import WebDriverWait
  11. from selenium.webdriver.support import expected_conditions as EC
  12.  
  13.  
  14. class Driver:
  15.     def __init__(self):
  16.         options = webdriver.ChromeOptions()
  17.         options.add_argument("--headless")
  18.         # Un-comment next line to supress logging:
  19.         options.add_experimental_option('excludeSwitches', ['enable-logging'])
  20.         self.driver = webdriver.Chrome(options=options)
  21.  
  22.     def __del__(self):
  23.         self.driver.quit()  # clean up driver when we are cleaned up
  24.     # print('The driver has been "quitted".')
  25.  
  26.  
  27. threadLocal = threading.local()
  28.  
  29.  
  30. def create_driver():
  31.     the_driver = getattr(threadLocal, 'the_driver', None)
  32.     if the_driver is None:
  33.         the_driver = Driver()
  34.         setattr(threadLocal, 'the_driver', the_driver)
  35.     return the_driver.driver
  36.  
  37.  
  38. class GameData:
  39.     def __init__(self):
  40.         self.date = []
  41.         self.time = []
  42.         self.game = []
  43.         self.score = []
  44.         self.home_odds = []
  45.         self.draw_odds = []
  46.         self.away_odds = []
  47.         self.country = []
  48.         self.league = []
  49.  
  50.  
  51. def generate_matches(pgSoup, defaultVal=None):
  52.     evtSel = {
  53.         'time': 'p.whitespace-nowrap',
  54.         'game': 'a div:has(>a[title])',
  55.         'score': 'a:has(a[title])+div.hidden',
  56.         'home_odds': 'a:has(a[title])~div:not(.hidden)',
  57.         'draw_odds': 'a:has(a[title])~div:not(.hidden)+div:nth-last-of-type(3)',
  58.         'away_odds': 'a:has(a[title])~div:nth-last-of-type(2)',
  59.     }
  60.  
  61.     events, current_group = [], {}
  62.     pgDate = pgSoup.select_one('h1.title[id="next-matches-h1"]')
  63.     if pgDate: pgDate = pgDate.get_text().split(',', 1)[-1].strip()
  64.     for evt in pgSoup.select('div[set]>div:last-child'):
  65.         if evt.parent.select(f':scope>div:first-child+div+div'):
  66.             cgVals = [v.get_text(' ').strip() if v else defaultVal for v in [
  67.                 evt.parent.select_one(s) for s in
  68.                 [':scope>div:first-child+div>div:first-child',
  69.                  ':scope>div:first-child>a:nth-of-type(2):nth-last-of-type(2)',
  70.                  ':scope>div:first-child>a:nth-of-type(3):last-of-type']]]
  71.             current_group = dict(zip(['date', 'country', 'league'], cgVals))
  72.             if pgDate: current_group['date'] = pgDate
  73.  
  74.         evtRow = {'date': current_group.get('date', defaultVal)}
  75.  
  76.         for k, v in evtSel.items():
  77.             v = evt.select_one(v).get_text(' ') if evt.select_one(v) else defaultVal
  78.             evtRow[k] = ' '.join(v.split()) if isinstance(v, str) else v
  79.         evtTeams = evt.select('a div>a[title]')
  80.         evtRow['game'] = ' – '.join(a['title'] for a in evtTeams)
  81.         evtRow['country'] = current_group.get('country', defaultVal)
  82.         evtRow['league'] = current_group.get('league', defaultVal)
  83.  
  84.         events.append(evtRow)
  85.     return events
  86.  
  87.  
  88. def parse_data(url, return_urls=False):
  89.     browser = create_driver()
  90.     browser.get(url)
  91.     WebDriverWait(browser, 10).until(EC.presence_of_all_elements_located(
  92.         (By.CSS_SELECTOR, "div[set]>div:last-child a:has(a[title])~div:not(.hidden)")))
  93.     ########### For page to scroll to the end ###########
  94.     scroll_pause_time = 2
  95.  
  96.     # Get scroll height
  97.     last_height = browser.execute_script("return document.body.scrollHeight")
  98.  
  99.     while True:
  100.         # Scroll down to bottom
  101.         browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
  102.  
  103.         # Wait to load page
  104.         time.sleep(scroll_pause_time)
  105.  
  106.         # Calculate new scroll height and compare with last scroll height
  107.         new_height = browser.execute_script("return document.body.scrollHeight")
  108.         if new_height == last_height:
  109.             break
  110.         last_height = new_height
  111.      ########### For page to scroll to the end ###########
  112.     time.sleep(5)
  113.     soup = bs(browser.page_source, "lxml")
  114.  
  115.     game_data = GameData()
  116.     game_keys = [a for a, av in game_data.__dict__.items() if isinstance(av, list)]
  117.     for row in generate_matches(soup, defaultVal=nan):
  118.         for k in game_keys: getattr(game_data, k).append(row.get(k, nan))
  119.     if return_urls:
  120.         if return_urls:
  121.             a_cont = soup.find('div', {'class': 'tabs'})
  122.             if a_cont is None:
  123.                 a_tags = []
  124.             else:
  125.                 a_tags = a_cont.find_all('a', {'class': 'h-8', 'href': True})
  126.             urls = [
  127.                 'https://www.oddsportal.com' + a_tag['href'] for a_tag in a_tags
  128.                 if not a_tag['href'].startswith('#')  # sections in current page
  129.                 and 'active-item-calendar' not in a_tag['class']  # current page
  130.             ]
  131.             print(pd.DataFrame(urls, columns=['urls']))
  132.         return game_data, urls
  133.     return game_data
  134.  
  135.  
  136. if __name__ == '__main__':
  137.     games = None
  138.     pool = ThreadPool(5)
  139.     # Get today's data and the Urls for the other days:
  140.     url_today = 'https://www.oddsportal.com/matches/soccer'
  141.     game_data_today, urls = pool.apply(parse_data, args=(url_today, True))
  142.     game_data_results = pool.imap(parse_data, urls)
  143.     # ############################ BUILD  DATAFRAME ############################
  144.     # game_n, added_todayGame = 0, False
  145.     # for game_data in game_data_results:
  146.     #     try:
  147.     #         game_n += 1
  148.     #         gd_df = pd.DataFrame(game_data.__dict__)
  149.     #         games = gd_df if games is None else pd.concat([games, gd_df])
  150.     #         if not added_todayGame:
  151.     #             game_n += 1
  152.     #             gdt_df = pd.DataFrame(game_data_today.__dict__)
  153.     #             games, added_todayGame = pd.concat([games, gdt_df]), True
  154.     #     except Exception as e:
  155.     #         print(f'Error tabulating game_data_df#{game_n}:\n{repr(e)}')
  156.     # ##########################################################################
  157.  
  158.     # OR
  159.  
  160.     ############################ BUILD  DATAFRAME ############################
  161.     game_data_dfList, added_todayGame = [], False
  162.     for game_data in game_data_results:
  163.         try:
  164.             game_data_dfList.append(pd.DataFrame(game_data.__dict__))
  165.             if not added_todayGame:
  166.                 game_data_dfList += [pd.DataFrame(game_data_today.__dict__)]
  167.                 added_todayGame = True
  168.         except Exception as e:
  169.             game_n = len(game_data_dfList) + 1
  170.             print(f'Error tabulating game_data_df#{game_n}:\n{repr(e)}')
  171.         # finally: pass ## [ redundant ]
  172.     try:
  173.         games = pd.concat(game_data_dfList, ignore_index=True)
  174.     except Exception as e:
  175.         print('Error concatenating DataFrames:', repr(e))
  176.     ##########################################################################
  177.  
  178.     print('!?NO GAMES?!' if games is None else games)  ## print(games)
  179.     # ensure all the drivers are "quitted":
  180.     del threadLocal  # a little extra insurance
  181.     import gc
  182.  
  183.     gc.collect()
  184.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement