Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- from bs4 import BeautifulSoup as bs
- from selenium import webdriver
- import threading
- from multiprocessing.pool import ThreadPool
- import os
- import re
- from math import nan
# Raise pandas' display limits so large result frames print with more
# rows, more columns and a wider line than the defaults allow.
for _option, _value in (
    ('display.max_rows', 30000),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_option, _value)

# Run from the Webdriver folder.
# NOTE(review): presumably this is where the chromedriver binary lives - confirm.
os.chdir(r"G:\My Drive\Oddsportal\Webdriver")
cwd = os.getcwd()
class Driver:
    """Own one headless Chrome WebDriver and quit it when garbage-collected.

    Attributes:
        driver: the ``webdriver.Chrome`` instance, or ``None`` if Chrome was
            never started or has already been quit.
    """

    def __init__(self):
        # Assign first so __del__ stays safe even if starting Chrome raises.
        self.driver = None
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        # Suppress Chrome's console logging:
        options.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.driver = webdriver.Chrome(options=options)

    def __del__(self):
        # Clean up the browser when we are cleaned up. getattr() covers
        # instances that never got a `driver` attribute at all.
        driver = getattr(self, 'driver', None)
        if driver is not None:
            # Clear the reference first so a repeated call cannot quit twice.
            self.driver = None
            driver.quit()
# Per-thread storage: each worker thread keeps its own Driver here.
threadLocal = threading.local()


def create_driver():
    """Return the calling thread's WebDriver, starting one on first use.

    The owning ``Driver`` wrapper is cached on ``threadLocal`` under the
    attribute ``the_driver``, so repeated calls from the same thread reuse
    the same browser.
    """
    wrapper = getattr(threadLocal, 'the_driver', None)
    if wrapper is not None:
        return wrapper.driver
    wrapper = Driver()
    threadLocal.the_driver = wrapper
    return wrapper.driver
class GameData:
    """Column-wise accumulator for scraped matches.

    Every attribute starts as its own empty list. The attributes are created
    in a fixed order, which is also the key order of the instance
    ``__dict__``.
    """

    def __init__(self):
        for column in (
            'date',
            'time',
            'game',
            'score',
            'home_odds',
            'draw_odds',
            'away_odds',
            'country',
            'league',
        ):
            # A fresh list per column; the columns must not share storage.
            setattr(self, column, [])
def generate_matches(table):
    """Yield one 8-item list per match row found in ``table``.

    Rows whose ``class`` attribute contains ``dark`` are treated as section
    headers: the text of the first two links inside their
    ``<th class="first2 tl">`` becomes the current country and league.
    Every other row with at least six ``<td>`` cells yields
    ``[td0, td1, td2, td3, td4, td5, country, league]`` (cell texts).

    ``country`` / ``league`` are ``None`` for rows that appear before any
    header, or after a header that lacks the expected links. Rows with
    fewer than six cells are skipped instead of raising ``IndexError``.

    Args:
        table: a parsed ``<table>`` tag exposing ``findAll``.
    """
    country = league = None
    for tr_tag in table.findAll('tr'):
        if 'class' in tr_tag.attrs and 'dark' in tr_tag['class']:
            th_tag = tr_tag.find('th', {'class': 'first2 tl'})
            a_tags = th_tag.findAll('a') if th_tag is not None else []
            # Reset rather than keep stale values when a header is incomplete.
            country = a_tags[0].text if len(a_tags) > 0 else None
            league = a_tags[1].text if len(a_tags) > 1 else None
            continue
        td_tags = tr_tag.findAll('td')
        if len(td_tags) < 6:
            # Not a match row (too few cells); nothing to report.
            continue
        yield [td_tag.text for td_tag in td_tags[:6]] + [country, league]
def parse_data(url, return_urls=False):
    """Scrape one matches page into a ``GameData``.

    Loads ``url`` with the calling thread's WebDriver (``create_driver``),
    parses the page source with BeautifulSoup/lxml, takes the game date from
    the end of the page's ``<h1>`` text and collects every row produced by
    ``generate_matches`` for the ``table-main`` table inside ``col-content``.

    Args:
        url: page to load.
        return_urls: when true, also return the absolute URLs built from the
            links inside ``<span class="next-games-date">``.

    Returns:
        ``game_data``, or ``(game_data, urls)`` when ``return_urls`` is true.

    Raises:
        ValueError: if the match table, the ``<h1>`` heading, the trailing
            date in that heading, or (when requested) the date-link span
            cannot be found. These previously surfaced as opaque
            ``AttributeError`` / ``TypeError`` on ``None``.
    """
    browser = create_driver()
    browser.get(url)
    soup = bs(browser.page_source, "lxml")

    div = soup.find('div', {'id': 'col-content'})
    table = None if div is None else div.find('table', {'class': 'table-main'})
    if table is None:
        raise ValueError(f"no 'table-main' table inside 'col-content' at {url}")

    h1_tag = soup.find('h1')
    if h1_tag is None:
        raise ValueError(f"no <h1> heading at {url}")
    h1 = h1_tag.text
    print(h1)
    m = re.search(r'\d+ \w+ \d{4}$', h1)
    if m is None:
        raise ValueError(f"heading {h1!r} at {url} does not end with a date")
    game_date = m[0]

    game_data = GameData()
    for row in generate_matches(table):
        kickoff, game, score, home, draw, away, country, league = row
        # Score present?
        if ':' not in score:
            # No: the three odds sit one column to the left, so shift them
            # right and record the score as missing.
            score, home, draw, away = nan, score, home, draw
        game_data.date.append(game_date)
        game_data.time.append(kickoff)
        game_data.game.append(game)
        game_data.score.append(score)
        # '-' is the placeholder for "no odds".
        game_data.home_odds.append(nan if home == '-' else home)
        game_data.draw_odds.append(nan if draw == '-' else draw)
        game_data.away_odds.append(nan if away == '-' else away)
        game_data.country.append(country)
        game_data.league.append(league)

    if return_urls:
        span = soup.find('span', {'class': 'next-games-date'})
        if span is None:
            raise ValueError(f"no 'next-games-date' span at {url}")
        urls = ['https://www.oddsportal.com' + a_tag['href'] for a_tag in span.findAll('a')]
        return game_data, urls
    return game_data
if __name__ == '__main__':
    import gc

    # Five worker threads; any further pages simply queue behind them.
    pool = ThreadPool(5)
    # Get today's data and the URLs for the other days:
    game_data_today, urls = pool.apply(parse_data, args=('https://www.oddsportal.com/matches/soccer', True))
    urls.pop(1)  # Remove url for today: We already have the data for that
    # imap preserves the order of `urls`; collect however many pages there
    # are instead of assuming a fixed count.
    all_game_data = list(pool.imap(parse_data, urls))
    pool.close()
    pool.join()
    # Put today's data back at the position its URL was removed from.
    all_game_data.insert(1, game_data_today)

    # DataFrame.append no longer exists in pandas 2.x; build every frame and
    # concatenate once.
    results = pd.concat(
        [pd.DataFrame(game_data.__dict__) for game_data in all_game_data],
        ignore_index=True,
    )
    print(results)
    # print(results.head())

    # ensure all the drivers are "quitted":
    del threadLocal
    gc.collect()  # a little extra insurance

    # results = results[results['game'].str.contains("Paraguay")]
    # print(results)
    results.to_csv(r"G:\My Drive\Oddsportal\Files\Oddsportal Data\Pre-processed\oddsportal_upcoming_matches.csv")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement