Advertisement
PyNoob1

Untitled

Jun 9th, 2021
146
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.65 KB | None | 0 0
  1. import pandas as pd
  2. from selenium import webdriver
  3. from datetime import datetime
  4. from bs4 import BeautifulSoup as bs
  5. from math import nan
  6. from tabulate import tabulate
  7. import os
  8.  
  9. pd.set_option('display.max_rows', 30000)
  10. pd.set_option('display.max_columns', 500)
  11. pd.set_option('display.width', 1000)
  12.  
  13. start = datetime.now()
  14.  
  15. os.chdir(r"C:\Users\Harshad\Documents\Oddsportal_Project\Files")
  16. cwd = os.getcwd()
  17. print("Path for webdriver : ")
  18. print(cwd)
  19. browser = webdriver.Chrome()
  20.  
  21.  
  22. class GameData:
  23.     def __init__(self):
  24.         self.score = []
  25.         self.date = []
  26.         self.time = []
  27.         self.country = []
  28.         self.league = []
  29.         self.game = []
  30.         self.home_odds = []
  31.         self.draw_odds = []
  32.         self.away_odds = []
  33.  
  34.     def append(self, score):
  35.         pass
  36.  
  37.  
  38. def get_urls(browser, landing_page):
  39.     browser.get(landing_page)
  40.     urls = [i.get_attribute('href') for i in
  41.             browser.find_elements_by_css_selector(
  42.                 '.next-games-date > a:nth-child(1), .next-games-date > a:nth-child(n+3)')]
  43.  
  44.     return urls
  45.  
  46.  
  47. def parse_data(html):
  48.     global league
  49.     df = pd.read_html(html, header=0)[0]
  50.     # print(len(df.index))
  51.     # print(df.columns)
  52.     html = browser.page_source
  53.     soup = bs(html, "lxml")
  54.     # print(len(soup.select('#table-matches tr')))
  55.     scores = [i.select_one('.table-score').text if i.select_one('.table-score') is not None else nan for i in
  56.               soup.select('#table-matches tr:nth-of-type(n+2)')]
  57.     cont = soup.find('div', {'id': 'wrap'})
  58.     content = cont.find('div', {'id': 'col-content'})
  59.     content = content.find('table', {'class': 'table-main'}, {'id': 'table-matches'})
  60.     main = content.find('th', {'class': 'first2 tl'})
  61.  
  62.     if main is None:
  63.         return None
  64.  
  65.     count = main.findAll('a')
  66.     country = count[0].text
  67.     game_data = GameData()
  68.     game_date = datetime.strptime(soup.select_one('.bold')['href'].split('/')[-2], '%Y%m%d').date()
  69.     leagues = [i.text for i in soup.select('.first2 > a:last-child')]
  70.  
  71.     n = 0
  72.  
  73.     for number, row in enumerate(df.itertuples()):
  74.         if n == 0 or '»' in row[1]:
  75.             league = leagues[n]
  76.             n += 1
  77.         if not isinstance(row[1], str):
  78.             continue
  79.         elif ':' not in row[1]:
  80.             country = row[1].split('»')[0]
  81.             continue
  82.         game_time = row[1]
  83.         # print(len(row))
  84.         # print(row[3])
  85.  
  86.         game_data.date.append(game_date)
  87.         game_data.time.append(game_time)
  88.         game_data.country.append(country)
  89.         game_data.league.append(league)
  90.         game_data.game.append(row[2])
  91.         game_data.score.append(scores[number])
  92.         game_data.home_odds.append(row[4])
  93.         game_data.draw_odds.append(row[5])
  94.         game_data.away_odds.append(row[6])
  95.  
  96.     return game_data
  97.  
  98.  
  99. if __name__ == '__main__':
  100.  
  101.     start_url = "https://www.oddsportal.com/matches/soccer/"
  102.     urls = []
  103.     browser = webdriver.Chrome()
  104.     results = None
  105.     urls = get_urls(browser, start_url)
  106.     urls.insert(0, start_url)
  107.  
  108.     for number, url in enumerate(urls):
  109.         if number > 0:
  110.             browser.get(url)
  111.         html = browser.page_source
  112.         game_data = parse_data(html)
  113.  
  114.         if game_data is None:
  115.             continue
  116.  
  117.         result = pd.DataFrame(game_data.__dict__)
  118.  
  119.         if results is None:
  120.             results = result
  121.         else:
  122.             results = results.append(result, ignore_index=True)
  123.  
  124. print(tabulate(results, headers='keys', tablefmt='grid'))
  125.  
  126. results.to_csv(r"C:\Users\Harshad\Documents\Oddsportal_Project\Files\Output\oddsportal_upcoming_matches.csv")
  127.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement