Advertisement
PyNoob1

Oddsportal Scraper

Feb 7th, 2023
976
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.30 KB | Sports | 0 0
  1. import os
  2. import threading
  3. from math import nan
  4. from multiprocessing.pool import ThreadPool
  5.  
  6. import pandas as pd
  7. from bs4 import BeautifulSoup as bs
  8. from selenium import webdriver
  9. from selenium.webdriver.common.by import By
  10. from selenium.webdriver.support.wait import WebDriverWait
  11. from selenium.webdriver.support import expected_conditions as EC
  12.  
  13. class Driver:
  14.     def __init__(self):
  15.         options = webdriver.ChromeOptions()
  16.         options.add_argument("--headless")
  17.         # Un-comment next line to supress logging:
  18.         options.add_experimental_option('excludeSwitches', ['enable-logging'])
  19.         self.driver = webdriver.Chrome(options=options)
  20.  
  21.     def __del__(self):
  22.         self.driver.quit()  # clean up driver when we are cleaned up
  23.         # print('The driver has been "quitted".')
  24.  
  25.  
  26. threadLocal = threading.local()
  27.  
  28.  
  29. def create_driver():
  30.     the_driver = getattr(threadLocal, 'the_driver', None)
  31.     if the_driver is None:
  32.         the_driver = Driver()
  33.         setattr(threadLocal, 'the_driver', the_driver)
  34.     return the_driver.driver
  35.  
  36.  
  37. class GameData:
  38.     def __init__(self):
  39.         self.date = []
  40.         self.time = []
  41.         self.game = []
  42.         self.score = []
  43.         self.home_odds = []
  44.         self.draw_odds = []
  45.         self.away_odds = []
  46.         self.country = []
  47.         self.league = []
  48.  
  49.  
  50. def generate_matches(pgSoup, defaultVal=None):
  51.     evtSel = {
  52.         'time': 'p.whitespace-nowrap',
  53.         'game': 'a div:has(>a[title])',
  54.         'score': 'a:has(a[title])+div.hidden',
  55.         'home_odds': 'a:has(a[title])~div:not(.hidden)',
  56.         'draw_odds': 'a:has(a[title])~div:not(.hidden)+div:nth-last-of-type(3)',
  57.         'away_odds': 'a:has(a[title])~div:nth-last-of-type(2)',
  58.     }
  59.  
  60.     events, current_group = [], {}
  61.     pgDate = pgSoup.select_one('h1.title[id="next-matches-h1"]')
  62.     if pgDate: pgDate = pgDate.get_text().split(',', 1)[-1].strip()
  63.     for evt in pgSoup.select('div[set]>div:last-child'):
  64.         if evt.parent.select(f':scope>div:first-child+div+div'):
  65.             cgVals = [v.get_text(' ').strip() if v else defaultVal for v in [
  66.                 evt.parent.select_one(s) for s in
  67.                 [':scope>div:first-child+div>div:first-child',
  68.                  ':scope>div:first-child>a:nth-of-type(2):nth-last-of-type(2)',
  69.                  ':scope>div:first-child>a:nth-of-type(3):last-of-type']]]
  70.             current_group = dict(zip(['date', 'country', 'league'], cgVals))
  71.             if pgDate: current_group['date'] = pgDate
  72.  
  73.         evtRow = {'date': current_group.get('date', defaultVal)}
  74.  
  75.         for k, v in evtSel.items():
  76.             v = evt.select_one(v).get_text(' ') if evt.select_one(v) else defaultVal
  77.             evtRow[k] = ' '.join(v.split()) if isinstance(v, str) else v
  78.         evtTeams = evt.select('a div>a[title]')
  79.         evtRow['game'] = ' – '.join(a['title'] for a in evtTeams)
  80.         evtRow['country'] = current_group.get('country', defaultVal)
  81.         evtRow['league'] = current_group.get('league', defaultVal)
  82.  
  83.         events.append(evtRow)
  84.     return events
  85.  
  86.  
  87. def parse_data(url, return_urls=False):
  88.     browser = create_driver()
  89.     browser.get(url)
  90.     browser.implicitly_wait(30) # I could not get Explicit wait to work here. implicity_wait does not seem to work at all.
  91.     soup = bs(browser.page_source, "lxml")
  92.  
  93.     game_data = GameData()
  94.     game_keys = [a for a, av in game_data.__dict__.items() if isinstance(av, list)]
  95.     for row in generate_matches(soup, defaultVal=nan):
  96.         for k in game_keys: getattr(game_data, k).append(row.get(k, nan))
  97.     if return_urls:
  98.         if return_urls:
  99.             a_cont = soup.find('div', {'class': 'tabs'})
  100.             if a_cont is None:
  101.                 a_tags = []
  102.             else:
  103.                 a_tags = a_cont.find_all('a', {'class': 'h-8', 'href': True})
  104.             urls = [
  105.                 'https://www.oddsportal.com' + a_tag['href'] for a_tag in a_tags
  106.                 if not a_tag['href'].startswith('#')  # sections in current page
  107.                 and 'active-item-calendar' not in a_tag['class']  # current page
  108.             ]
  109.             print(pd.DataFrame(urls, columns=['urls']))
  110.         return game_data, urls
  111.     return game_data
  112.  
  113.  
  114. if __name__ == '__main__':
  115.     games = None
  116.     pool = ThreadPool(5)
  117.     # Get today's data and the Urls for the other days:
  118.     url_today = 'https://www.oddsportal.com/matches/soccer'
  119.     game_data_today, urls = pool.apply(parse_data, args=(url_today, True))
  120.     game_data_results = pool.imap(parse_data, urls)
  121.     ############################ BUILD  DATAFRAME ############################
  122.     game_n, added_todayGame = 0, False
  123.     for game_data in game_data_results:
  124.         try:
  125.             game_n += 1
  126.             gd_df = pd.DataFrame(game_data.__dict__)
  127.             games = gd_df if games is None else pd.concat([games, gd_df])
  128.             if not added_todayGame:
  129.                 game_n += 1
  130.                 gdt_df = pd.DataFrame(game_data_today.__dict__)
  131.                 games, added_todayGame = pd.concat([games, gdt_df]), True
  132.         except Exception as e:
  133.             print(f'Error tabulating game_data_df#{game_n}:\n{repr(e)}')
  134.     ##########################################################################
  135.  
  136.     # OR
  137.  
  138.     # ############################ BUILD  DATAFRAME ############################
  139.     # game_data_dfList, added_todayGame = [], False
  140.     # for game_data in game_data_results:
  141.     #     try:
  142.     #         game_data_dfList.append(pd.DataFrame(game_data.__dict__))
  143.     #         if not added_todayGame:
  144.     #             game_data_dfList += [pd.DataFrame(game_data_today.__dict__)]
  145.     #             added_todayGame = True
  146.     #     except Exception as e:
  147.     #         game_n = len(game_data_dfList) + 1
  148.     #         print(f'Error tabulating game_data_df#{game_n}:\n{repr(e)}')
  149.     #     # finally: pass ## [ redundant ]
  150.     # try: games = pd.concat(game_data_dfList, ignore_index=True)
  151.     # except Exception as e: print('Error concatenating DataFrames:', repr(e))
  152.     # ##########################################################################
  153.  
  154.     print('!?NO GAMES?!' if games is None else games)  ## print(games)
  155.     # ensure all the drivers are "quitted":
  156.     del threadLocal  # a little extra insurance
  157.     import gc  
  158.  
  159.     gc.collect()
  160.  
  161. print(games)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement