## based on https://stackoverflow.com/q/75058259/6146136
## cloned from https://pastebin.com/Sd0E1Hmm
# Standard library
import os
import threading
import time
from math import nan
from multiprocessing.pool import ThreadPool

# Third party
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
class Driver:
    """Own one headless Chrome instance for the lifetime of this object.

    The webdriver is created on construction and quit when the wrapper is
    garbage-collected, so each thread-local holder cleans up after itself.
    """

    def __init__(self):
        opts = webdriver.ChromeOptions()
        opts.add_argument("--headless")
        # Suppresses Chrome's console logging noise.
        opts.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.driver = webdriver.Chrome(options=opts)

    def __del__(self):
        # Clean up the browser process when this wrapper is reclaimed.
        self.driver.quit()
threadLocal = threading.local()
31
32
33
def create_driver():
    """Return the calling thread's webdriver, building a Driver on first use."""
    holder = getattr(threadLocal, 'the_driver', None)
    if holder is None:
        holder = Driver()
        threadLocal.the_driver = holder
    return holder.driver
class GameData:
    """Column-wise accumulator for scraped match rows.

    Every attribute is an independent list; the instance ``__dict__``
    preserves declaration order, so it can be fed straight into
    ``pandas.DataFrame``.
    """

    # NOTE: order matters — downstream code builds a DataFrame from
    # the instance __dict__, so columns appear in this order.
    _FIELDS = ('date', 'time', 'game', 'score', 'home_odds',
               'draw_odds', 'away_odds', 'country', 'league')

    def __init__(self):
        for field in self._FIELDS:
            setattr(self, field, [])
def generate_matches(pgSoup, defaultVal=None):
    """Extract one flat row per event from a parsed oddsportal matches page.

    Parameters
    ----------
    pgSoup : BeautifulSoup
        Parsed page source of a /matches/ page.
    defaultVal
        Value used for fields whose element is missing (e.g. ``nan``).

    Returns
    -------
    list[dict]
        One dict per event with keys date/time/game/score/home_odds/
        draw_odds/away_odds/country/league.
    """
    # CSS selectors for the per-event fields, relative to the event row.
    evtSel = {
        'time': 'p.whitespace-nowrap',
        'game': 'a div:has(>a[title])',
        'score': 'a:has(a[title])+div.hidden',
        'home_odds': 'a:has(a[title])~div:not(.hidden)',
        'draw_odds': 'a:has(a[title])~div:not(.hidden)+div:nth-last-of-type(3)',
        'away_odds': 'a:has(a[title])~div:nth-last-of-type(2)',
    }
    # Selectors for the group header (date / country / league) that
    # precedes each run of events.
    groupSel = [':scope>div:first-child+div>div:first-child',
                ':scope>div:first-child>a:nth-of-type(2):nth-last-of-type(2)',
                ':scope>div:first-child>a:nth-of-type(3):last-of-type']

    events, current_group = [], {}
    # Page-level date from the "next matches" heading, if present.
    pgDate = pgSoup.select_one('h1.title[id="next-matches-h1"]')
    if pgDate:
        pgDate = pgDate.get_text().split(',', 1)[-1].strip()
    for evt in pgSoup.select('div[set]>div:last-child'):
        # A row that starts a new group carries a header block; refresh the
        # shared date/country/league used by the following events.
        # (was: f':scope>...' — a pointless f-string with no placeholders)
        if evt.parent.select(':scope>div:first-child+div+div'):
            cgVals = [el.get_text(' ').strip() if el else defaultVal
                      for el in (evt.parent.select_one(s) for s in groupSel)]
            current_group = dict(zip(['date', 'country', 'league'], cgVals))
            if pgDate:
                current_group['date'] = pgDate

        evtRow = {'date': current_group.get('date', defaultVal)}
        for key, sel in evtSel.items():
            el = evt.select_one(sel)  # query once (was queried twice per field)
            raw = el.get_text(' ') if el else defaultVal
            evtRow[key] = ' '.join(raw.split()) if isinstance(raw, str) else raw
        # Prefer the exact team titles over the text scraped above.
        evtTeams = evt.select('a div>a[title]')
        evtRow['game'] = ' – '.join(a['title'] for a in evtTeams)
        evtRow['country'] = current_group.get('country', defaultVal)
        evtRow['league'] = current_group.get('league', defaultVal)

        events.append(evtRow)
    return events
def _scroll_to_bottom(browser, pause=2):
    """Scroll until the page height stops growing (forces lazy-loaded rows)."""
    last_height = browser.execute_script("return document.body.scrollHeight")
    while True:
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)  # give the page time to load the next chunk
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


def parse_data(url, return_urls=False):
    """Scrape one oddsportal matches page into a GameData.

    Parameters
    ----------
    url : str
        Page to load in this thread's browser.
    return_urls : bool
        When True, also collect the calendar links to the other days.

    Returns
    -------
    GameData or (GameData, list[str])
        The scraped columns, plus the day URLs when ``return_urls``.
    """
    browser = create_driver()
    browser.get(url)
    # Wait until at least one odds cell is present before scrolling.
    WebDriverWait(browser, 10).until(EC.presence_of_all_elements_located(
        (By.CSS_SELECTOR, "div[set]>div:last-child a:has(a[title])~div:not(.hidden)")))
    _scroll_to_bottom(browser)
    time.sleep(5)
    soup = bs(browser.page_source, "lxml")

    game_data = GameData()
    game_keys = [a for a, av in game_data.__dict__.items() if isinstance(av, list)]
    for row in generate_matches(soup, defaultVal=nan):
        for k in game_keys:
            getattr(game_data, k).append(row.get(k, nan))
    if return_urls:  # (was a redundant doubled `if return_urls:` — flattened)
        a_cont = soup.find('div', {'class': 'tabs'})
        if a_cont is None:
            a_tags = []
        else:
            a_tags = a_cont.find_all('a', {'class': 'h-8', 'href': True})
        urls = [
            'https://www.oddsportal.com' + a_tag['href'] for a_tag in a_tags
            if not a_tag['href'].startswith('#')  # sections in current page
            and 'active-item-calendar' not in a_tag['class']  # current page
        ]
        print(pd.DataFrame(urls, columns=['urls']))
        return game_data, urls
    return game_data
if __name__ == '__main__':
    games = None
    # Context manager closes/joins the pool instead of leaking its threads
    # (original never called pool.close()/join()).
    with ThreadPool(5) as pool:
        # Get today's data and the URLs for the other days:
        url_today = 'https://www.oddsportal.com/matches/soccer'
        game_data_today, urls = pool.apply(parse_data, args=(url_today, True))
        # Lazy iterator of results; must be consumed before the pool exits.
        game_data_results = pool.imap(parse_data, urls)

        ############################ BUILD  DATAFRAME ############################
        game_data_dfList, added_todayGame = [], False
        for game_data in game_data_results:
            try:
                game_data_dfList.append(pd.DataFrame(game_data.__dict__))
                if not added_todayGame:
                    # Splice today's data in once, alongside the first result.
                    game_data_dfList.append(pd.DataFrame(game_data_today.__dict__))
                    added_todayGame = True
            except Exception as e:
                game_n = len(game_data_dfList) + 1
                print(f'Error tabulating game_data_df#{game_n}:\n{repr(e)}')
        try:
            games = pd.concat(game_data_dfList, ignore_index=True)
        except Exception as e:
            print('Error concatenating DataFrames:', repr(e))
        ##########################################################################

    print('!?NO GAMES?!' if games is None else games)
    # Ensure all the drivers are "quitted": dropping the thread-local holder
    # releases each Driver so its __del__ quits the Chrome instance.
    del threadLocal  # a little extra insurance
    import gc

    gc.collect()