Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- from selenium import webdriver
- from datetime import datetime
- from bs4 import BeautifulSoup as bs
- from math import nan
- from tabulate import tabulate
- import os
# Widen pandas' console output limits (rows, columns, line width).
for option_name, option_value in (
    ('display.max_rows', 30000),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Timestamp taken when the script starts.
start = datetime.now()

# Switch to the project folder and report it as the webdriver path.
os.chdir(r"C:\Users\Harshad\Documents\Oddsportal_Project\Files")
cwd = os.getcwd()
print("Path for webdriver : ", cwd, sep="\n")

# Module-level Chrome session; parse_data() and the script body use this name.
browser = webdriver.Chrome()
class GameData:
    """Column-oriented holder for scraped match rows.

    Each attribute is an independent list. The attributes are created in a
    fixed order, which is the key order of the instance ``__dict__``.
    """

    def __init__(self):
        # Create one fresh, empty list per column, in this exact order.
        for column in (
            'score',
            'date',
            'time',
            'country',
            'league',
            'game',
            'home_odds',
            'draw_odds',
            'away_odds',
        ):
            setattr(self, column, [])

    def append(self, score):
        """Placeholder: accepts *score*, stores nothing, returns ``None``."""
        return None
def get_urls(browser, landing_page):
    """Load *landing_page* and return the hrefs of its date-navigation links.

    Args:
        browser: A Selenium WebDriver (any object exposing ``get`` and
            ``find_elements`` works).
        landing_page: URL to open before collecting links.

    Returns:
        A list with the ``href`` attribute of every anchor matched by the
        selector, in the order the driver returns them.
    """
    browser.get(landing_page)
    # First anchor plus the third onwards inside ``.next-games-date``.
    selector = ('.next-games-date > a:nth-child(1), '
                '.next-games-date > a:nth-child(n+3)')
    # ``find_elements_by_css_selector`` was removed in Selenium 4.3. The
    # generic ``find_elements`` exists in Selenium 3 and 4 alike, and
    # "css selector" is the string value of ``By.CSS_SELECTOR``.
    links = browser.find_elements('css selector', selector)
    return [link.get_attribute('href') for link in links]
def parse_data(html):
    """Extract match rows (time, teams, score, odds) from a matches page.

    Args:
        html: Page source of the matches page, as a string. Both the pandas
            table and the BeautifulSoup tree are built from this argument.

    Returns:
        A ``GameData`` whose lists hold one entry per match row, or ``None``
        when the expected ``#wrap`` / ``#col-content`` / matches table /
        header cell cannot be found.

    Raises:
        ValueError: Propagated from ``pd.read_html`` when *html* has no table.
    """
    # NOTE(review): ``league`` is still written as a module-level name, as in
    # the original; kept in case code outside this function reads it.
    global league
    from io import StringIO  # local import: only needed for read_html

    # Recent pandas versions deprecate passing literal HTML to read_html;
    # a file-like wrapper works on old and new versions alike.
    df = pd.read_html(StringIO(html), header=0)[0]

    # Parse the *argument*, not the global browser's current page, so the
    # table and the soup always describe the same document.
    soup = bs(html, "lxml")

    # One entry per table row after the first; nan where a row has no score.
    score_cells = (tr.select_one('.table-score')
                   for tr in soup.select('#table-matches tr:nth-of-type(n+2)'))
    scores = [cell.text if cell is not None else nan for cell in score_cells]

    # Walk down to the header cell, giving up (None) at any missing level.
    wrap = soup.find('div', {'id': 'wrap'})
    column = wrap.find('div', {'id': 'col-content'}) if wrap is not None else None
    # Both filters go in ONE attrs dict: a third positional argument to
    # find() is ``recursive``, so a separate id dict is never applied.
    table = (column.find('table', {'class': 'table-main', 'id': 'table-matches'})
             if column is not None else None)
    header = table.find('th', {'class': 'first2 tl'}) if table is not None else None
    if header is None:
        return None

    # Initial country comes from the first link in the header cell.
    country = header.find_all('a')[0].text
    game_data = GameData()
    # The date is the second-to-last path segment of the '.bold' link's href.
    game_date = datetime.strptime(
        soup.select_one('.bold')['href'].split('/')[-2], '%Y%m%d').date()
    leagues = [anchor.text for anchor in soup.select('.first2 > a:last-child')]

    n = 0  # index of the next entry of ``leagues`` to consume
    for number, row in enumerate(df.itertuples()):
        first_cell = row[1]
        is_text = isinstance(first_cell, str)
        # Advance to the next league on the very first row and on every row
        # whose first cell contains '»'. Checking the type first avoids the
        # TypeError that ``'»' in nan`` would raise.
        if n == 0 or (is_text and '»' in first_cell):
            league = leagues[n]
            n += 1
        if not is_text:
            continue
        if ':' not in first_cell:
            # No time in the first cell: the text before '»' is the country.
            country = first_cell.split('»')[0]
            continue
        game_time = first_cell
        game_data.date.append(game_date)
        game_data.time.append(game_time)
        game_data.country.append(country)
        game_data.league.append(league)
        game_data.game.append(row[2])
        game_data.score.append(scores[number])
        game_data.home_odds.append(row[4])
        game_data.draw_odds.append(row[5])
        game_data.away_odds.append(row[6])
    return game_data
if __name__ == '__main__':
    # Scrape the landing page plus every linked page, then print the combined
    # table and export it to CSV.
    start_url = "https://www.oddsportal.com/matches/soccer/"
    results = None
    frames = []
    try:
        # Reuse the module-level ``browser``: starting a second Chrome here
        # would orphan the first one, and parse_data's module relies on that
        # same name.
        urls = [start_url, *get_urls(browser, start_url)]
        for number, url in enumerate(urls):
            # get_urls() already left the browser on start_url, so only the
            # later pages need loading.
            if number > 0:
                browser.get(url)
            html = browser.page_source
            game_data = parse_data(html)
            if game_data is None:
                continue
            frames.append(pd.DataFrame(vars(game_data)))
    finally:
        # Always close the Chrome session, even if scraping fails midway.
        browser.quit()

    if frames:
        # DataFrame.append was removed in pandas 2.0; one concat at the end
        # is also cheaper than growing the frame page by page.
        results = pd.concat(frames, ignore_index=True)
        print(tabulate(results, headers='keys', tablefmt='grid'))
        results.to_csv(r"C:\Users\Harshad\Documents\Oddsportal_Project\Files\Output\oddsportal_upcoming_matches.csv")
    else:
        # Nothing parsed: avoid calling tabulate/to_csv on None.
        print("No match data could be parsed; nothing was written.")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement