## based on https://stackoverflow.com/q/75058259/6146136
## cloned from https://pastebin.com/Sd0E1Hmm

import os
import threading
from math import nan
from multiprocessing.pool import ThreadPool
import time
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


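# Wrapper that owns one headless Chrome instance; __del__ quits the browser
# when the wrapper (held in thread-local storage below) is garbage-collected.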
class Driver:
    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        # Suppress Chrome's console logging (comment out the next line to see the logs):
        options.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.driver = webdriver.Chrome(options=options)

    def __del__(self):
        self.driver.quit()  # clean up driver when we are cleaned up
        # print('The driver has been "quitted".')


threadLocal = threading.local()


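# One Driver per thread: the first call from a ThreadPool worker creates a
# browser and caches it in threadLocal, so later tasks on the same thread
# reuse it instead of launching a new Chrome.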
def create_driver():
    the_driver = getattr(threadLocal, 'the_driver', None)
    if the_driver is None:
        the_driver = Driver()
        setattr(threadLocal, 'the_driver', the_driver)
    return the_driver.driver


class GameData:
    def __init__(self):
        self.date = []
        self.time = []
        self.game = []
        self.score = []
        self.home_odds = []
        self.draw_odds = []
        self.away_odds = []
        self.country = []
        self.league = []


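# Parse one fully rendered page: returns a list of dicts (one per match row)
# keyed like GameData's attributes. The CSS selectors below rely on the
# :has() pseudo-class, which BeautifulSoup supports via soupsieve.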
def generate_matches(pgSoup, defaultVal=None):
    evtSel = {
        'time': 'p.whitespace-nowrap',
        'game': 'a div:has(>a[title])',
        'score': 'a:has(a[title])+div.hidden',
        'home_odds': 'a:has(a[title])~div:not(.hidden)',
        'draw_odds': 'a:has(a[title])~div:not(.hidden)+div:nth-last-of-type(3)',
        'away_odds': 'a:has(a[title])~div:nth-last-of-type(2)',
    }

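    # Rows are grouped under country/league headers; when a row's parent
    # container carries a header, capture its date/country/league so the
    # following match rows inherit them.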
    events, current_group = [], {}
    pgDate = pgSoup.select_one('h1.title[id="next-matches-h1"]')
    if pgDate: pgDate = pgDate.get_text().split(',', 1)[-1].strip()
    for evt in pgSoup.select('div[set]>div:last-child'):
        if evt.parent.select(':scope>div:first-child+div+div'):
            cgVals = [v.get_text(' ').strip() if v else defaultVal for v in [
                evt.parent.select_one(s) for s in
                [':scope>div:first-child+div>div:first-child',
                 ':scope>div:first-child>a:nth-of-type(2):nth-last-of-type(2)',
                 ':scope>div:first-child>a:nth-of-type(3):last-of-type']]]
            current_group = dict(zip(['date', 'country', 'league'], cgVals))
            if pgDate: current_group['date'] = pgDate

        evtRow = {'date': current_group.get('date', defaultVal)}

        for k, v in evtSel.items():
            v = evt.select_one(v).get_text(' ') if evt.select_one(v) else defaultVal
            evtRow[k] = ' '.join(v.split()) if isinstance(v, str) else v
        evtTeams = evt.select('a div>a[title]')
        evtRow['game'] = ' – '.join(a['title'] for a in evtTeams)
        evtRow['country'] = current_group.get('country', defaultVal)
        evtRow['league'] = current_group.get('league', defaultVal)

        events.append(evtRow)
    return events


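# Load one oddsportal page in this thread's browser, scroll until all matches
# are rendered, then hand the HTML to BeautifulSoup. With return_urls=True it
# also returns the links to the other days' match pages taken from the tabs nav.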
def parse_data(url, return_urls=False):
    browser = create_driver()
    browser.get(url)
    WebDriverWait(browser, 10).until(EC.presence_of_all_elements_located(
        (By.CSS_SELECTOR, "div[set]>div:last-child a:has(a[title])~div:not(.hidden)")))
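    # Rows are loaded as the page scrolls, so scroll until the document
    # height stops growing.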
    ########### For page to scroll to the end ###########
    scroll_pause_time = 2

    # Get scroll height
    last_height = browser.execute_script("return document.body.scrollHeight")

    while True:
        # Scroll down to bottom
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait to load page
        time.sleep(scroll_pause_time)

        # Calculate new scroll height and compare with last scroll height
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    ########### For page to scroll to the end ###########
    time.sleep(5)
    soup = bs(browser.page_source, "lxml")

    game_data = GameData()
    game_keys = [a for a, av in game_data.__dict__.items() if isinstance(av, list)]
    for row in generate_matches(soup, defaultVal=nan):
        for k in game_keys: getattr(game_data, k).append(row.get(k, nan))
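    # Optionally collect the other days' match-page URLs from the 'tabs' nav,
    # skipping in-page anchors and the currently active calendar item.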
    if return_urls:
        a_cont = soup.find('div', {'class': 'tabs'})
        if a_cont is None:
            a_tags = []
        else:
            a_tags = a_cont.find_all('a', {'class': 'h-8', 'href': True})
        urls = [
            'https://www.oddsportal.com' + a_tag['href'] for a_tag in a_tags
            if not a_tag['href'].startswith('#')  # sections in current page
            and 'active-item-calendar' not in a_tag['class']  # current page
        ]
        print(pd.DataFrame(urls, columns=['urls']))
        return game_data, urls
    return game_data


if __name__ == '__main__':
    games = None
    pool = ThreadPool(5)
    # Get today's data and the URLs for the other days:
    url_today = 'https://www.oddsportal.com/matches/soccer'
    game_data_today, urls = pool.apply(parse_data, args=(url_today, True))
    game_data_results = pool.imap(parse_data, urls)
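    # pool.apply blocks until today's page is parsed; pool.imap then schedules
    # parse_data for each remaining URL across the 5 worker threads, each of
    # which reuses its own thread-local Chrome instance.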
    # ############################ BUILD DATAFRAME ############################
    # game_n, added_todayGame = 0, False
    # for game_data in game_data_results:
    #     try:
    #         game_n += 1
    #         gd_df = pd.DataFrame(game_data.__dict__)
    #         games = gd_df if games is None else pd.concat([games, gd_df])
    #         if not added_todayGame:
    #             game_n += 1
    #             gdt_df = pd.DataFrame(game_data_today.__dict__)
    #             games, added_todayGame = pd.concat([games, gdt_df]), True
    #     except Exception as e:
    #         print(f'Error tabulating game_data_df#{game_n}:\n{repr(e)}')
    # ##########################################################################

    # OR

    ############################ BUILD DATAFRAME ############################
    game_data_dfList, added_todayGame = [], False
    for game_data in game_data_results:
        try:
            game_data_dfList.append(pd.DataFrame(game_data.__dict__))
            if not added_todayGame:
                game_data_dfList += [pd.DataFrame(game_data_today.__dict__)]
                added_todayGame = True
        except Exception as e:
            game_n = len(game_data_dfList) + 1
            print(f'Error tabulating game_data_df#{game_n}:\n{repr(e)}')
        # finally: pass  ## [ redundant ]
    try:
        games = pd.concat(game_data_dfList, ignore_index=True)
    except Exception as e:
        print('Error concatenating DataFrames:', repr(e))
    ##########################################################################

    print('!?NO GAMES?!' if games is None else games)  ## print(games)
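    # Optional: persist the combined table, e.g. (hypothetical filename)
    # games.to_csv('oddsportal_matches.csv', index=False)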
    # ensure all the drivers are "quitted":
    del threadLocal  # a little extra insurance
    import gc
    gc.collect()