Advertisement
PyNoob1

Untitled

Sep 4th, 2021
144
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.26 KB | None | 0 0
  1. import pandas as pd
  2. from selenium import webdriver
  3. from datetime import datetime
  4. from bs4 import BeautifulSoup as bs
  5. from math import nan
  6. from tabulate import tabulate
  7. import os
  8.  
# Pandas display options: let very long/wide DataFrames print in full so the
# tabulated scrape output is not truncated.
pd.set_option('display.max_rows', 30000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Wall-clock start of the run.
# NOTE(review): `start` is never read again in the visible file — presumably
# intended for manual timing; confirm or remove.
start = datetime.now()

# chromedriver is expected to live here; Selenium resolves the driver binary
# from the working directory / PATH, hence the chdir before Chrome().
os.chdir(r"C:\Users\harsh\Google Drive\Oddsportal\Webdriver")
cwd = os.getcwd()
print("Path for webdriver : ")
print(cwd)
# NOTE(review): a second Chrome instance is opened in the __main__ block
# below, which rebinds `browser` — this window is left open and unused.
browser = webdriver.Chrome()
  21.  
  22. class GameData:
  23.     def __init__(self):
  24.         self.score = []
  25.         self.country = []
  26.         self.league = []
  27.         self.game = []
  28.  
  29.     def append(self, score):
  30.         pass
  31.  
  32.  
  33. def get_urls(browser, landing_page):
  34.     browser.get(landing_page)
  35.     urls = [i.get_attribute('href') for i in
  36.             browser.find_elements_by_css_selector(
  37.                 '.next-games-date > a:nth-child(1), .next-games-date > a:nth-child(n+3)')]
  38.  
  39.     return urls
  40.  
  41.  
  42. def parse_data(html):
  43.     global league
  44.     df = pd.read_html(html, header=0)[0]
  45.     # print(len(df.index))
  46.     # print(df.columns)
  47.     html = browser.page_source
  48.     soup = bs(html, "lxml")
  49.     # print(len(soup.select('#table-matches tr')))
  50.     scores = [i.select_one('.table-score').text if i.select_one('.table-score') is not None else nan for i in
  51.               soup.select('#table-matches tr:nth-of-type(n+2)')]
  52.     cont = soup.find('div', {'id': 'wrap'})
  53.     content = cont.find('div', {'id': 'col-content'})
  54.     content = content.find('table', {'class': 'table-main'}, {'id': 'table-matches'})
  55.     main = content.find('th', {'class': 'first2 tl'})
  56.  
  57.     if main is None:
  58.         return None
  59.  
  60.     count = main.findAll('a')
  61.     country = count[0].text
  62.     game_data = GameData()
  63.     leagues = [i.text for i in soup.select('.first2 > a:last-child')]
  64.  
  65.     n = 0
  66.  
  67.     for number, row in enumerate(df.itertuples()):
  68.         if n == 0 or '»' in row[1]:
  69.             league = leagues[n]
  70.             n += 1
  71.         if not isinstance(row[1], str):
  72.             continue
  73.         elif ':' not in row[1]:
  74.             country = row[1].split('»')[0]
  75.             continue
  76.         game_time = row[1]
  77.         print(len(scores[number]))
  78.         print(scores[number])
  79.  
  80.         game_data.country.append(country)
  81.         game_data.league.append(league)
  82.         game_data.game.append(row[2])
  83.         game_data.score.append(scores[number])
  84.  
  85.     return game_data
  86.  
  87.  
  88. if __name__ == '__main__':
  89.  
  90.     start_url = "https://www.oddsportal.com/matches/soccer/"
  91.     urls = []
  92.     browser = webdriver.Chrome()
  93.     results = None
  94.     urls = get_urls(browser, start_url)
  95.     urls.insert(0, start_url)
  96.  
  97.     for number, url in enumerate(urls):
  98.         if number > 0:
  99.             browser.get(url)
  100.         html = browser.page_source
  101.         game_data = parse_data(html)
  102.  
  103.         if game_data is None:
  104.             continue
  105.  
  106.         result = pd.DataFrame(game_data.__dict__)
  107.  
  108.         if results is None:
  109.             results = result
  110.         else:
  111.             results = results.append(result, ignore_index=True)
  112. #
  113. print(tabulate(results, headers='keys', tablefmt='github'))
  114. #
  115. # results.to_csv(r"C:\Users\harsh\Google Drive\Oddsportal\Files\Oddsportal "
  116. #                r"Data\Pre-processed\oddsportal_upcoming_matches.csv")
  117.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement