Advertisement
PyNoob1

Scraping historical Data

Jun 28th, 2021
152
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.17 KB | None | 0 0
  1. import pandas as pd
  2. from selenium import webdriver
  3. from bs4 import BeautifulSoup as bs
  4.  
  5.  
# Chrome WebDriver session created when this module is loaded; parse_data()
# uses it to fetch every page.
browser = webdriver.Chrome()
  7.  
  8.  
  9. class GameData:
  10.  
  11.     def __init__(self):
  12.         self.date = []
  13.         self.time = []
  14.         self.game = []
  15.         self.score = []
  16.         self.home_odds = []
  17.         self.draw_odds = []
  18.         self.away_odds = []
  19.         self.country = []
  20.         self.league = []
  21.  
  22.  
  23. def parse_data(url):
  24.     browser.get(url)
  25.     df = pd.read_html(browser.page_source, header=0)[0]
  26.     html = browser.page_source
  27.     soup = bs(html, "lxml")
  28.     cont = soup.find('div', {'id': 'wrap'})
  29.     content = cont.find('div', {'id': 'col-content'})
  30.     content = content.find('table', {'class': 'table-main'}, {'id': 'tournamentTable'})
  31.     main = content.find('th', {'class': 'first2 tl'})
  32.     if main is None:
  33.         return None
  34.     count = main.findAll('a')
  35.     country = count[1].text
  36.     league = count[2].text
  37.     game_data = GameData()
  38.     game_date = None
  39.     for row in df.itertuples():
  40.         if not isinstance(row[1], str):
  41.             continue
  42.         elif ':' not in row[1]:
  43.             game_date = row[1].split('-')[0]
  44.             continue
  45.         game_data.date.append(game_date)
  46.         game_data.time.append(row[1])
  47.         game_data.game.append(row[2])
  48.         game_data.score.append(row[3])
  49.         game_data.home_odds.append(row[4])
  50.         game_data.draw_odds.append(row[5])
  51.         game_data.away_odds.append(row[6])
  52.         game_data.country.append(country)
  53.         game_data.league.append(league)
  54.     return game_data
  55.  
  56.  
  57. urls = {
  58. "https://www.oddsportal.com/soccer/england/premier-league/results/#/page/1",
  59. "https://www.oddsportal.com/soccer/england/premier-league/results/#/page/2",
  60. "https://www.oddsportal.com/soccer/england/premier-league/results/#/page/3",
  61. "https://www.oddsportal.com/soccer/england/premier-league/results/#/page/4",
  62. "https://www.oddsportal.com/soccer/england/premier-league/results/#/page/5",
  63. "https://www.oddsportal.com/soccer/england/premier-league/results/#/page/6",
  64. "https://www.oddsportal.com/soccer/england/premier-league/results/#/page/7",
  65. "https://www.oddsportal.com/soccer/england/premier-league/results/#/page/8",
  66. "https://www.oddsportal.com/soccer/england/premier-league/results/#/page/9",
  67. "https://www.oddsportal.com/soccer/england/premier-league-2019-2020/results/#/page/1",
  68. "https://www.oddsportal.com/soccer/england/premier-league-2019-2020/results/#/page/2",
  69. "https://www.oddsportal.com/soccer/england/premier-league-2019-2020/results/#/page/3",
  70. "https://www.oddsportal.com/soccer/england/premier-league-2019-2020/results/#/page/4",
  71. "https://www.oddsportal.com/soccer/england/premier-league-2019-2020/results/#/page/5",
  72. "https://www.oddsportal.com/soccer/england/premier-league-2019-2020/results/#/page/6",
  73. "https://www.oddsportal.com/soccer/england/premier-league-2019-2020/results/#/page/7",
  74. "https://www.oddsportal.com/soccer/england/premier-league-2019-2020/results/#/page/8",
  75. "https://www.oddsportal.com/soccer/england/premier-league-2019-2020/results/#/page/9",
  76. "https://www.oddsportal.com/soccer/england/premier-league-2018-2019/results/#/page/1",
  77. "https://www.oddsportal.com/soccer/england/premier-league-2018-2019/results/#/page/2",
  78. "https://www.oddsportal.com/soccer/england/premier-league-2018-2019/results/#/page/3",
  79. "https://www.oddsportal.com/soccer/england/premier-league-2018-2019/results/#/page/4",
  80. "https://www.oddsportal.com/soccer/england/premier-league-2018-2019/results/#/page/5",
  81. "https://www.oddsportal.com/soccer/england/premier-league-2018-2019/results/#/page/6",
  82. "https://www.oddsportal.com/soccer/england/premier-league-2018-2019/results/#/page/7",
  83. "https://www.oddsportal.com/soccer/england/premier-league-2018-2019/results/#/page/8",
  84. "https://www.oddsportal.com/soccer/england/premier-league-2018-2019/results/#/page/9",
  85. }
  86.  
  87. if __name__ == '__main__':
  88.  
  89.     results = None
  90.  
  91.     for url in urls:
  92.         game_data = parse_data(url)
  93.         if game_data is None:
  94.             continue
  95.         result = pd.DataFrame(game_data.__dict__)
  96.         if results is None:
  97.             results = result
  98.         else:
  99.             results = results.append(result, ignore_index=True)
  100.  
  101. print(results)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement