Advertisement
PyNoob1

current code

Oct 25th, 2021
398
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.27 KB | None | 0 0
  1. import pandas as pd
  2. from bs4 import BeautifulSoup as bs
  3. from selenium import webdriver
  4. import threading
  5. from multiprocessing.pool import ThreadPool
  6. import os
  7. import re
  8. from math import nan
  9.  
  10. pd.set_option('display.max_rows', 30000)
  11. pd.set_option('display.max_columns', 500)
  12. pd.set_option('display.width', 1000)
  13.  
  14. os.chdir(r"G:\My Drive\Oddsportal\Webdriver")
  15. cwd = os.getcwd()
  16.  
  17.  
  18. class Driver:
  19.     def __init__(self):
  20.         options = webdriver.ChromeOptions()
  21.         options.add_argument("--headless")
  22.         # Un-comment next line to supress logging:
  23.         options.add_experimental_option('excludeSwitches', ['enable-logging'])
  24.         self.driver = webdriver.Chrome(options=options)
  25.  
  26.     def __del__(self):
  27.         self.driver.quit()  # clean up driver when we are cleaned up
  28.         # print('The driver has been "quitted".')
  29.  
  30.  
  31. threadLocal = threading.local()
  32.  
  33.  
  34. def create_driver():
  35.     the_driver = getattr(threadLocal, 'the_driver', None)
  36.     if the_driver is None:
  37.         the_driver = Driver()
  38.         setattr(threadLocal, 'the_driver', the_driver)
  39.     return the_driver.driver
  40.  
  41.  
  42. class GameData:
  43.     def __init__(self):
  44.         self.date = []
  45.         self.time = []
  46.         self.game = []
  47.         self.score = []
  48.         self.home_odds = []
  49.         self.draw_odds = []
  50.         self.away_odds = []
  51.         self.country = []
  52.         self.league = []
  53.  
  54.  
  55. def generate_matches(table):
  56.     tr_tags = table.findAll('tr')
  57.     for tr_tag in tr_tags:
  58.         if 'class' in tr_tag.attrs and 'dark' in tr_tag['class']:
  59.             th_tag = tr_tag.find('th', {'class': 'first2 tl'})
  60.             a_tags = th_tag.findAll('a')
  61.             country = a_tags[0].text
  62.             league = a_tags[1].text
  63.         else:
  64.             td_tags = tr_tag.findAll('td')
  65.             yield [td_tags[0].text, td_tags[1].text, td_tags[2].text, td_tags[3].text,
  66.                    td_tags[4].text, td_tags[5].text, country, league]
  67.  
  68.  
def parse_data(url, return_urls=False):
    """Scrape one oddsportal 'matches' page into a GameData.

    Runs in a worker thread; reuses that thread's cached Chrome driver.

    url: page to scrape (e.g. https://www.oddsportal.com/matches/soccer).
    return_urls: when True, also return the absolute URLs of the other
        days' pages linked from the 'next-games-date' span.

    Returns GameData, or (GameData, list[str]) when return_urls is True.
    """
    browser = create_driver()
    browser.get(url)
    soup = bs(browser.page_source, "lxml")
    div = soup.find('div', {'id': 'col-content'})
    table = div.find('table', {'class': 'table-main'})
    h1 = soup.find('h1').text
    print(h1)
    # The page heading ends with the date, e.g. "... 25 Oct 2021".
    # NOTE(review): m is None (-> TypeError on m[0]) if the heading format
    # ever changes — no guard here; confirm the site still matches.
    m = re.search(r'\d+ \w+ \d{4}$', h1)
    game_date = m[0]
    game_data = GameData()
    for row in generate_matches(table):
        game_data.date.append(game_date)
        game_data.time.append(row[0])
        game_data.game.append(row[1])
        # Score present?
        if ':' not in row[2]:
            # No, shift a few columns right:
            # upcoming games have no score cell, so what was parsed as
            # score/home/draw is really home/draw/away; score becomes NaN.
            row[5], row[4], row[3], row[2] = row[4], row[3], row[2], nan
        game_data.score.append(row[2])
        # '-' marks a missing odd; store NaN so pandas treats it as missing.
        game_data.home_odds.append(nan if row[3] == '-' else row[3])
        game_data.draw_odds.append(nan if row[4] == '-' else row[4])
        game_data.away_odds.append(nan if row[5] == '-' else row[5])
        game_data.country.append(row[6])
        game_data.league.append(row[7])

    if return_urls:
        span = soup.find('span', {'class': 'next-games-date'})
        a_tags = span.findAll('a')
        urls = ['https://www.oddsportal.com' + a_tag['href'] for a_tag in a_tags]
        return game_data, urls
    return game_data
  101.  
  102.  
  103. if __name__ == '__main__':
  104.     results = None
  105.     pool = ThreadPool(5)  # We will be getting, however, 7 URLs
  106.     # Get today's data and the Urls for the other days:
  107.     game_data_today, urls = pool.apply(parse_data, args=('https://www.oddsportal.com/matches/soccer', True))
  108.     urls.pop(1)  # Remove url for today: We already have the data for that
  109.     game_data_results = pool.imap(parse_data, urls)
  110.     for i in range(8):
  111.         game_data = game_data_today if i == 1 else next(game_data_results)
  112.         result = pd.DataFrame(game_data.__dict__)
  113.         if results is None:
  114.             results = result
  115.         else:
  116.             results = results.append(result, ignore_index=True)
  117.  
  118.     print(results)
  119.     # print(results.head())
  120.     # ensure all the drivers are "quitted":
  121.     del threadLocal
  122.     import gc
  123.  
  124.     gc.collect()  # a little extra insurance
  125.  
  126. # results = results[results['game'].str.contains("Paraguay")]
  127.  
  128. # print(results)
  129.  
  130. results.to_csv(r"G:\My Drive\Oddsportal\Files\Oddsportal Data\Pre-processed\oddsportal_upcoming_matches.csv")
  131.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement