## (pastebin page chrome removed -- the lines "SHOW:" / "or go back to the
## newest paste" were site UI residue, not part of the program)
| 1 | ## based on https://stackoverflow.com/q/75058259/6146136 | |
| 2 | ## cloned from https://pastebin.com/Sd0E1Hmm | |
| 3 | ||
| 4 | import os | |
| 5 | import threading | |
| 6 | from math import nan | |
| 7 | from multiprocessing.pool import ThreadPool | |
| 8 | import time | |
| 9 | import pandas as pd | |
| 10 | from bs4 import BeautifulSoup as bs | |
| 11 | from selenium import webdriver | |
| 12 | from selenium.webdriver.common.by import By | |
| 13 | from selenium.webdriver.support.wait import WebDriverWait | |
| 14 | from selenium.webdriver.support import expected_conditions as EC | |
| 15 | ||
| 16 | ||
class Driver:
    """Owns one headless Chrome instance for the lifetime of this object.

    Instances are stored in thread-local storage (see create_driver), so each
    worker thread drives exactly one browser.
    """

    def __init__(self):
        opts = webdriver.ChromeOptions()
        opts.add_argument("--headless")
        # Keep chromedriver from spamming the console with log output.
        opts.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.driver = webdriver.Chrome(options=opts)

    def __del__(self):
        # Quit the browser when this holder is garbage-collected, so no
        # orphaned Chrome processes survive the thread pool.
        self.driver.quit()
| 29 | ||
# Per-thread storage: create_driver() caches one Driver instance per worker
# thread here, so Selenium sessions are never shared across threads.
threadLocal = threading.local()
| 31 | ||
| 32 | ||
def create_driver():
    """Return this thread's Selenium driver, building it on first request.

    Each worker thread gets exactly one Driver; repeat calls from the same
    thread reuse the cached instance.
    """
    try:
        holder = threadLocal.the_driver
    except AttributeError:
        # First call on this thread: create and cache a fresh browser.
        holder = Driver()
        threadLocal.the_driver = holder
    return holder.driver
| 39 | ||
| 40 | ||
class GameData:
    """Column-oriented accumulator: one parallel list per scraped attribute.

    parse_data() appends one value to every list for each match row, so all
    lists stay the same length and the instance converts directly to a
    DataFrame via ``pd.DataFrame(game_data.__dict__)``.
    """

    _FIELDS = ('date', 'time', 'game', 'score', 'home_odds',
               'draw_odds', 'away_odds', 'country', 'league')

    def __init__(self):
        # Fresh, independent list per field on every instantiation.
        for field in self._FIELDS:
            setattr(self, field, [])
| 52 | ||
| 53 | ||
def generate_matches(pgSoup, defaultVal=None):
    """Extract one dict per match row from a parsed oddsportal matches page.

    pgSoup     -- BeautifulSoup (or compatible object exposing ``select`` /
                  ``select_one``) of the fully scrolled page.
    defaultVal -- value used for any field whose element is missing.

    Returns a list of dicts keyed by date/time/game/score/odds/country/league.
    """
    # CSS selectors for the per-event fields, relative to one event row.
    evtSel = {
        'time': 'p.whitespace-nowrap',
        'game': 'a div:has(>a[title])',
        'score': 'a:has(a[title])+div.hidden',
        'home_odds': 'a:has(a[title])~div:not(.hidden)',
        'draw_odds': 'a:has(a[title])~div:not(.hidden)+div:nth-last-of-type(3)',
        'away_odds': 'a:has(a[title])~div:nth-last-of-type(2)',
    }

    events, current_group = [], {}
    # The "next matches" page carries a single page-wide date in its title.
    pgDate = pgSoup.select_one('h1.title[id="next-matches-h1"]')
    if pgDate:
        pgDate = pgDate.get_text().split(',', 1)[-1].strip()

    for evt in pgSoup.select('div[set]>div:last-child'):
        # A third child in the row's parent marks the start of a new
        # date/country/league group header; refresh current_group from it.
        # (was: f':scope>...' -- f-prefix removed, no placeholders)
        if evt.parent.select(':scope>div:first-child+div+div'):
            cgVals = [v.get_text(' ').strip() if v else defaultVal for v in [
                evt.parent.select_one(s) for s in
                [':scope>div:first-child+div>div:first-child',
                 ':scope>div:first-child>a:nth-of-type(2):nth-last-of-type(2)',
                 ':scope>div:first-child>a:nth-of-type(3):last-of-type']]]
            current_group = dict(zip(['date', 'country', 'league'], cgVals))
            if pgDate:
                current_group['date'] = pgDate  # page-wide date wins

        evtRow = {'date': current_group.get('date', defaultVal)}

        for k, sel in evtSel.items():
            # Query each selector once (the original called select_one twice).
            node = evt.select_one(sel)
            v = node.get_text(' ') if node else defaultVal
            # Collapse all runs of whitespace to single spaces.
            evtRow[k] = ' '.join(v.split()) if isinstance(v, str) else v

        evtTeams = evt.select('a div>a[title]')
        evtRow['game'] = ' – '.join(a['title'] for a in evtTeams)
        evtRow['country'] = current_group.get('country', defaultVal)
        evtRow['league'] = current_group.get('league', defaultVal)

        events.append(evtRow)
    return events
| 89 | ||
| 90 | ||
def parse_data(url, return_urls=False):
    """Scrape one oddsportal matches page into a GameData.

    Loads *url* in this thread's browser, waits for odds cells to appear,
    scrolls until the page height stops growing (to force all lazily loaded
    rows into the DOM), then parses the final HTML.

    url         -- page to scrape.
    return_urls -- when True, also collect links to the other days' pages
                   from the calendar tab bar and return (game_data, urls).

    Returns GameData, or (GameData, list[str]) when return_urls is True.
    """
    browser = create_driver()
    browser.get(url)
    WebDriverWait(browser, 10).until(EC.presence_of_all_elements_located(
        (By.CSS_SELECTOR, "div[set]>div:last-child a:has(a[title])~div:not(.hidden)")))

    ########### For page to scroll to the end ###########
    scroll_pause_time = 2
    last_height = browser.execute_script("return document.body.scrollHeight")
    while True:
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause_time)  # give the next batch time to load
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:  # height stable -> nothing more to load
            break
        last_height = new_height
    ########### For page to scroll to the end ###########

    time.sleep(5)
    soup = bs(browser.page_source, "lxml")

    game_data = GameData()
    # All list-typed attributes of GameData are the columns we fill.
    game_keys = [a for a, av in game_data.__dict__.items() if isinstance(av, list)]
    for row in generate_matches(soup, defaultVal=nan):
        for k in game_keys:
            getattr(game_data, k).append(row.get(k, nan))

    if return_urls:  # (the original's duplicated inner `if return_urls:` removed)
        a_cont = soup.find('div', {'class': 'tabs'})
        if a_cont is None:
            a_tags = []
        else:
            a_tags = a_cont.find_all('a', {'class': 'h-8', 'href': True})
        urls = [
            'https://www.oddsportal.com' + a_tag['href'] for a_tag in a_tags
            if not a_tag['href'].startswith('#')  # sections in current page
            and 'active-item-calendar' not in a_tag['class']  # current page
        ]
        print(pd.DataFrame(urls, columns=['urls']))
        return game_data, urls
    return game_data
| 137 | ||
| 138 | ||
if __name__ == '__main__':
    games = None
    # One Selenium driver per worker thread (see create_driver); the context
    # manager guarantees the pool is terminated even on error -- the original
    # never closed/joined it.
    with ThreadPool(5) as pool:
        # Today's page also yields the URLs for the other days' pages:
        url_today = 'https://www.oddsportal.com/matches/soccer'
        game_data_today, urls = pool.apply(parse_data, args=(url_today, True))
        game_data_results = pool.imap(parse_data, urls)

        ############################ BUILD DATAFRAME ############################
        game_data_dfList, added_todayGame = [], False
        for game_data in game_data_results:
            try:
                game_data_dfList.append(pd.DataFrame(game_data.__dict__))
                if not added_todayGame:
                    # Splice today's results in exactly once.
                    game_data_dfList += [pd.DataFrame(game_data_today.__dict__)]
                    added_todayGame = True
            except Exception as e:
                game_n = len(game_data_dfList) + 1
                print(f'Error tabulating game_data_df#{game_n}:\n{repr(e)}')
        try:
            games = pd.concat(game_data_dfList, ignore_index=True)
        except Exception as e:
            print('Error concatenating DataFrames:', repr(e))
        ##########################################################################

    print('!?NO GAMES?!' if games is None else games)  ## print(games)
    # ensure all the drivers are "quitted": dropping the thread-local holder
    # and forcing a collection triggers Driver.__del__ for every thread.
    del threadLocal  # a little extra insurance
    import gc

    gc.collect()
| 187 |