scrape_oddsportal_matches for so_q_75058259
Try95th · Jan 12th, 2023 (edited) · Python

## for https://stackoverflow.com/q/75058259/6146136 #################################################
## output at https://docs.google.com/spreadsheets/d/1jvCDg7dJ4KNrHHYlDNx5TsPJgV1UuJ4n3TBcD52Rk90 ####
#####################################################################################################


######################################## AUXILIARY FUNCTIONS ########################################
## for extracting values from bs4 Tags - from https://pastebin.com/ZnZ7xM6u ##
def extract_from_bs4tag(tagSoup, selector, targetAttr=''):
    ## returns targetAttr (or the text, if no attribute is given) of the first
    ## descendant matching selector; returns None if nothing matches
    sel, ta = str(selector).strip(), str(targetAttr).strip()
    el = tagSoup.select_one(sel) if sel else tagSoup
    if el: return el.get(ta) if ta else el.get_text(' ').strip()

def selectForList(tagSoup, selectors, printList=False):
    ## applies extract_from_bs4tag for every selector [or (selector, attribute) pair]
    ## in selectors; returns a dict if selectors is a dict, else a list
    if isinstance(selectors, dict):
        return dict(zip(selectors.keys(), selectForList(
            tagSoup, selectors.values(), printList)))
    returnList, isv = [], printList
    for s in selectors:
        sel, ta = s[:2] if isinstance(s, tuple) and len(s) > 1 else (s, '')
        if ta == '"staticVal"': returnList.append(sel)  ## pass the value through as-is
        else: returnList.append(extract_from_bs4tag(tagSoup, sel, ta))
    returnList = [' '.join(r) if isinstance(r, list) else r for r in returnList]
    if isv and not isinstance(isv, str): print(returnList)
    if isinstance(isv, str): print(*returnList, sep=isv)
    return returnList
#####################################################################################################
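
## quick self-check of the two helpers (a minimal sketch - the HTML snippet below is
## made up for illustration and is not an actual oddsportal fragment):
from bs4 import BeautifulSoup
_demoTag = BeautifulSoup(
    '<div><a href="/x" title="Team A">Team A</a><p>18:30</p></div>', 'lxml')
print(selectForList(_demoTag, {
    'link': ('a[href]', 'href'),             ## attribute extraction
    'team': ('a[title]', 'title'),
    'time': 'p',                             ## plain text extraction
    'note': ('static text', '"staticVal"'),  ## passed through unchanged
}))  ## -> {'link': '/x', 'team': 'Team A', 'time': '18:30', 'note': 'static text'}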


###################################### MAIN EXTRACTOR FUNCTION ######################################
from urllib.parse import urljoin

def from_pageSoup(pgSoup, egPrefix='evtGroup_', baseUrl='https://www.oddsportal.com'):
    ehSel = ':scope>div:first-child'
    es2 = ':nth-of-type(2):nth-last-of-type(2)'
    es3 = ':nth-of-type(3):last-of-type'
    egSel = {
        'event_group_header': ehSel,
        'sport_name': f'{ehSel}>a:first-of-type',
        'country_name': f'{ehSel}>a{es2}',
        'league_name': f'{ehSel}>a{es3}',
        'sport_link': (f'{ehSel}>a[href]:first-of-type', 'href'),
        'country_link': (f'{ehSel}>a[href]{es2}', 'href'),
        'league_link': (f'{ehSel}>a[href]{es3}', 'href'),
        'date': f'{ehSel}+div>div:first-child'
    }
    evtSel = {
        'time': 'p.whitespace-nowrap',
        'team_1': ('a[title]', 'title'),
        'team_2': ('a[title]~a[title]', 'title'),
        'score': 'a:has(a[title])+div.hidden',
        'score_1': 'a[title]>div+div',
        'score_2': 'a[title]~a[title]>div+div',
        'odds_1': 'a:has(a[title])~div:not(.hidden)',
        'odds_X': 'a:has(a[title])~div:not(.hidden)+div:nth-last-of-type(3)',
        'odds_2': 'a:has(a[title])~div:nth-last-of-type(2)',
        'odds_f': 'a:has(a[title])~div span>p.gradient-green',
        'Bs': 'a:has(a[title])~div:last-of-type'
    }
    copyKeys = ['event_group', 'event_group_header', 'date']

    eventGroups, events = [], []
    for evtRow in pgSoup.select('div[set]>div:last-child'):
        evtGroup = evtRow.parent
        setId, egno = evtGroup.get('set'), len(eventGroups)+1
        if evtGroup.select(f'{ehSel}+div+div'):  ## a new event group starts here
            egDets = {'event_group': f'{egPrefix}{egno}', 'set': setId}
            egVals = selectForList(evtGroup, egSel.values())
            for k, v in zip(egSel.keys(), egVals): egDets[k] = v
            eventGroups.append(egDets)

        eg_cur = eventGroups[-1] if eventGroups else {}
        evtDets = {k: eg_cur.get(k) for k in copyKeys}  ## copy group info into the event
        evtDets['set'] = setId
        evtVals = selectForList(evtRow, evtSel.values())
        for k, v in zip(evtSel.keys(), evtVals): evtDets[k] = v

        ## collect icon names from the row's "bg-..." CSS utility classes
        icons = [', '.join([
            ci.removeprefix('bg-').replace('-icon', '').strip() for ci in
            d.get('class') if ci.startswith('bg-') and  ## [.strip('bg-') would also eat leading b/g letters of the name]
            ci not in ['bg-center', 'bg-no-repeat']
        ]) for d in evtRow.select('a:has(a[title]) div.bg-center')]
        evtDets['icons'] = ' , '.join([ci for ci in icons if ci])

        links = [l.get('href') for l in evtRow.select('a[href]')]
        evtDets['links'] = ' , '.join({urljoin(baseUrl, l) for l in links})

        events.append(evtDets)
    return eventGroups, events
#####################################################################################################
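
## optional: the extractor only needs a soup, so it can also be run on a saved copy of a
## /matches/ page (a sketch - 'saved_matches_page.html' is a hypothetical filename used
## purely for illustration):
import os
if os.path.isfile('saved_matches_page.html'):
    from bs4 import BeautifulSoup
    with open('saved_matches_page.html', encoding='utf-8') as f:
        egs, evs = from_pageSoup(BeautifulSoup(f.read(), 'lxml'))
    print(len(egs), 'eventGroups and', len(evs), 'events from the saved page')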


########################################### EXAMPLE USAGE ###########################################
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

options = webdriver.ChromeOptions()
options.add_argument('--headless')
browser = webdriver.Chrome(options=options)  ## Selenium 4 dropped the "chrome_options" keyword

opmUrls = [
    'https://www.oddsportal.com/matches/soccer/20230111',
    'https://www.oddsportal.com/matches/soccer/20230112',
    'https://www.oddsportal.com/matches/basketball/20230111',
    'https://www.oddsportal.com/matches/basketball/20230112'
]

evtGroups, events = [], []
for opmUrl in opmUrls:
    browser.get(opmUrl)
    WebDriverWait(browser, 5).until(  ## wait until the event rows have rendered
        EC.visibility_of_all_elements_located(
            (By.CSS_SELECTOR, 'div[set] a[title]')))
    soup = BeautifulSoup(browser.page_source.encode('utf-8'), 'lxml')

    eg_prefix = '_'.join(opmUrl.split('/')[-2:])+'_'
    egl, evl = from_pageSoup(soup, eg_prefix, opmUrl)
    print(f'{len(egl)} eventGroups and {len(evl)} events from {opmUrl}')
    evtGroups += egl
    events += evl
print()

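## optional: a more defensive fetch (a sketch, not used above - wraps the same wait in a
## try/except so a page with no rendered event rows is skipped instead of raising):
from selenium.common.exceptions import TimeoutException
def fetch_matches_soup(drv, url, timeout=5):
    drv.get(url)
    try:
        WebDriverWait(drv, timeout).until(EC.visibility_of_all_elements_located(
            (By.CSS_SELECTOR, 'div[set] a[title]')))
    except TimeoutException:
        print('timed out waiting for event rows at', url)
        return None
    return BeautifulSoup(drv.page_source, 'lxml')
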
############################## saving the results ##############################
import pandas as pd
import os # just for printing the full path

evgFilename = 'oddsportal_eventGroups.csv'
evtFilename = 'oddsportal_events.csv'
pd.DataFrame(evtGroups).to_csv(evgFilename, index=False)
pd.DataFrame(events).to_csv(evtFilename, index=False)
print('saved', len(evtGroups), 'eventGroups to', os.path.abspath(evgFilename))
print('saved', len(events), 'events to', os.path.abspath(evtFilename))
################################################################################
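
## optional: re-joining events with their group rows (a sketch - relies only on the
## 'event_group' key that from_pageSoup writes into both outputs):
merged = pd.DataFrame(events).merge(
    pd.DataFrame(evtGroups), on='event_group', suffixes=('', '_group'))
print('merged table:', merged.shape[0], 'rows x', merged.shape[1], 'columns')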

browser.quit()
del browser