## based on https://stackoverflow.com/q/75058259/6146136
## cloned from https://pastebin.com/Sd0E1Hmm
# Standard library
import os
import threading
import time
from math import nan
from multiprocessing.pool import ThreadPool

# Third party
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
class Driver:
    """Own one headless Chrome instance for the lifetime of this object.

    The webdriver is created on construction and quit when the wrapper is
    garbage-collected, so each thread-local holder cleans up after itself.
    """

    def __init__(self):
        opts = webdriver.ChromeOptions()
        opts.add_argument("--headless")
        # Suppresses Chrome's console logging noise.
        opts.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.driver = webdriver.Chrome(options=opts)

    def __del__(self):
        # Clean up the browser process when this wrapper is reclaimed.
        self.driver.quit()
threadLocal = threading.local()
31
32
33
def create_driver():
    """Return the calling thread's webdriver, building a Driver on first use."""
    holder = getattr(threadLocal, 'the_driver', None)
    if holder is None:
        holder = Driver()
        threadLocal.the_driver = holder
    return holder.driver
class GameData:
    """Column-wise accumulator for scraped match rows.

    Every attribute is an independent list; the instance ``__dict__``
    preserves declaration order, so it can be fed straight into
    ``pandas.DataFrame``.
    """

    # NOTE: order matters — downstream code builds a DataFrame from
    # the instance __dict__, so columns appear in this order.
    _FIELDS = ('date', 'time', 'game', 'score', 'home_odds',
               'draw_odds', 'away_odds', 'country', 'league')

    def __init__(self):
        for field in self._FIELDS:
            setattr(self, field, [])
def generate_matches(pgSoup, defaultVal=None):
    """Extract one flat row per event from a parsed oddsportal matches page.

    Parameters
    ----------
    pgSoup : BeautifulSoup
        Parsed page source of a /matches/ page.
    defaultVal
        Value used for fields whose element is missing (e.g. ``nan``).

    Returns
    -------
    list[dict]
        One dict per event with keys date/time/game/score/home_odds/
        draw_odds/away_odds/country/league.
    """
    # CSS selectors for the per-event fields, relative to the event row.
    evtSel = {
        'time': 'p.whitespace-nowrap',
        'game': 'a div:has(>a[title])',
        'score': 'a:has(a[title])+div.hidden',
        'home_odds': 'a:has(a[title])~div:not(.hidden)',
        'draw_odds': 'a:has(a[title])~div:not(.hidden)+div:nth-last-of-type(3)',
        'away_odds': 'a:has(a[title])~div:nth-last-of-type(2)',
    }
    # Selectors for the group header (date / country / league) that
    # precedes each run of events.
    groupSel = [':scope>div:first-child+div>div:first-child',
                ':scope>div:first-child>a:nth-of-type(2):nth-last-of-type(2)',
                ':scope>div:first-child>a:nth-of-type(3):last-of-type']

    events, current_group = [], {}
    # Page-level date from the "next matches" heading, if present.
    pgDate = pgSoup.select_one('h1.title[id="next-matches-h1"]')
    if pgDate:
        pgDate = pgDate.get_text().split(',', 1)[-1].strip()
    for evt in pgSoup.select('div[set]>div:last-child'):
        # A row that starts a new group carries a header block; refresh the
        # shared date/country/league used by the following events.
        # (was: f':scope>...' — a pointless f-string with no placeholders)
        if evt.parent.select(':scope>div:first-child+div+div'):
            cgVals = [el.get_text(' ').strip() if el else defaultVal
                      for el in (evt.parent.select_one(s) for s in groupSel)]
            current_group = dict(zip(['date', 'country', 'league'], cgVals))
            if pgDate:
                current_group['date'] = pgDate

        evtRow = {'date': current_group.get('date', defaultVal)}
        for key, sel in evtSel.items():
            el = evt.select_one(sel)  # query once (was queried twice per field)
            raw = el.get_text(' ') if el else defaultVal
            evtRow[key] = ' '.join(raw.split()) if isinstance(raw, str) else raw
        # Prefer the exact team titles over the text scraped above.
        evtTeams = evt.select('a div>a[title]')
        evtRow['game'] = ' – '.join(a['title'] for a in evtTeams)
        evtRow['country'] = current_group.get('country', defaultVal)
        evtRow['league'] = current_group.get('league', defaultVal)

        events.append(evtRow)
    return events
def _scroll_to_bottom(browser, pause=2):
    """Scroll until the page height stops growing (forces lazy-loaded rows)."""
    last_height = browser.execute_script("return document.body.scrollHeight")
    while True:
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)  # give the page time to load the next chunk
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


def parse_data(url, return_urls=False):
    """Scrape one oddsportal matches page into a GameData.

    Parameters
    ----------
    url : str
        Page to load in this thread's browser.
    return_urls : bool
        When True, also collect the calendar links to the other days.

    Returns
    -------
    GameData or (GameData, list[str])
        The scraped columns, plus the day URLs when ``return_urls``.
    """
    browser = create_driver()
    browser.get(url)
    # Wait until at least one odds cell is present before scrolling.
    WebDriverWait(browser, 10).until(EC.presence_of_all_elements_located(
        (By.CSS_SELECTOR, "div[set]>div:last-child a:has(a[title])~div:not(.hidden)")))
    _scroll_to_bottom(browser)
    time.sleep(5)
    soup = bs(browser.page_source, "lxml")

    game_data = GameData()
    game_keys = [a for a, av in game_data.__dict__.items() if isinstance(av, list)]
    for row in generate_matches(soup, defaultVal=nan):
        for k in game_keys:
            getattr(game_data, k).append(row.get(k, nan))
    if return_urls:  # (was a redundant doubled `if return_urls:` — flattened)
        a_cont = soup.find('div', {'class': 'tabs'})
        if a_cont is None:
            a_tags = []
        else:
            a_tags = a_cont.find_all('a', {'class': 'h-8', 'href': True})
        urls = [
            'https://www.oddsportal.com' + a_tag['href'] for a_tag in a_tags
            if not a_tag['href'].startswith('#')  # sections in current page
            and 'active-item-calendar' not in a_tag['class']  # current page
        ]
        print(pd.DataFrame(urls, columns=['urls']))
        return game_data, urls
    return game_data
if __name__ == '__main__':
    games = None
    # Context manager closes/joins the pool instead of leaking its threads
    # (original never called pool.close()/join()).
    with ThreadPool(5) as pool:
        # Get today's data and the URLs for the other days:
        url_today = 'https://www.oddsportal.com/matches/soccer'
        game_data_today, urls = pool.apply(parse_data, args=(url_today, True))
        # Lazy iterator of results; must be consumed before the pool exits.
        game_data_results = pool.imap(parse_data, urls)

        ############################ BUILD  DATAFRAME ############################
        game_data_dfList, added_todayGame = [], False
        for game_data in game_data_results:
            try:
                game_data_dfList.append(pd.DataFrame(game_data.__dict__))
                if not added_todayGame:
                    # Splice today's data in once, alongside the first result.
                    game_data_dfList.append(pd.DataFrame(game_data_today.__dict__))
                    added_todayGame = True
            except Exception as e:
                game_n = len(game_data_dfList) + 1
                print(f'Error tabulating game_data_df#{game_n}:\n{repr(e)}')
        try:
            games = pd.concat(game_data_dfList, ignore_index=True)
        except Exception as e:
            print('Error concatenating DataFrames:', repr(e))
        ##########################################################################

    print('!?NO GAMES?!' if games is None else games)
    # Ensure all the drivers are "quitted": dropping the thread-local holder
    # releases each Driver so its __del__ quits the Chrome instance.
    del threadLocal  # a little extra insurance
    import gc

    gc.collect()