## for https://stackoverflow.com/q/75058259/6146136 #################################################
## output at https://docs.google.com/spreadsheets/d/1jvCDg7dJ4KNrHHYlDNx5TsPJgV1UuJ4n3TBcD52Rk90 ####
#####################################################################################################

######################################## AUXILIARY FUNCTIONS ########################################

## for extracting values from bs4 Tags - from https://pastebin.com/ZnZ7xM6u ##
def extract_from_bs4tag(tagSoup, selector, targetAttr=''):
    ## return targetAttr (or the stripped text, if no attribute is given) of the first match
    ## for selector; an empty selector extracts from tagSoup itself; returns None if nothing matches
    sel, ta = str(selector).strip(), str(targetAttr).strip()
    el = tagSoup.select_one(sel) if sel else tagSoup
    if el: return el.get(ta) if ta else el.get_text(' ').strip()

def selectForList(tagSoup, selectors, printList=False):
    ## apply extract_from_bs4tag for each entry in selectors; entries can be plain selector
    ## strings, (selector, attribute) tuples, or ('someValue', '"staticVal"') to pass a value
    ## through unchanged; a dict of selectors returns a dict with the same keys
    if isinstance(selectors, dict):
        return dict(zip(selectors.keys(), selectForList(
            tagSoup, selectors.values(), printList)))
    returnList, isv = [], printList
    for s in selectors:
        sel, ta = s[:2] if isinstance(s, tuple) and len(s) > 1 else (s, '')
        if ta == '"staticVal"': returnList.append(sel)
        else: returnList.append(extract_from_bs4tag(tagSoup, sel, ta))
    ## multi-valued attributes (like class) come back as lists - flatten them to strings
    returnList = [' '.join(r) if isinstance(r, list) else r for r in returnList]
    if isv and not isinstance(isv, str): print(returnList)
    if isinstance(isv, str): print(*returnList, sep=isv)
    return returnList
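
## A minimal sketch of how the helpers above resolve selectors, using hypothetical HTML (not
## OddsPortal markup); assumes `from bs4 import BeautifulSoup` (imported in EXAMPLE USAGE below):
# demoSoup = BeautifulSoup('<div><a href="/x" title="Team A">1<div>2</div></a></div>', 'lxml')
# print(selectForList(demoSoup, {
#     'name': ('a[title]', 'title'),      # -> 'Team A'  (attribute lookup)
#     'link': ('a[href]', 'href'),        # -> '/x'      (attribute lookup)
#     'text': 'a[title]',                 # -> '1 2'     (space-joined text)
#     'label': ('demo', '"staticVal"')    # -> 'demo'    (passed through as-is)
# }))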
#####################################################################################################

###################################### MAIN EXTRACTOR FUNCTION ######################################
from urllib.parse import urljoin

def from_pageSoup(pgSoup, egPrefix='evtGroup_', baseUrl='https://www.oddsportal.com'):
    ## each div[set] holds an optional group header row, a date row, and an event row
    ehSel = ':scope>div:first-child'
    es2 = ':nth-of-type(2):nth-last-of-type(2)'
    es3 = ':nth-of-type(3):last-of-type'
    egSel = {  # selectors for group-level details (sport / country / league / date)
        'event_group_header': ehSel,
        'sport_name': f'{ehSel}>a:first-of-type',
        'country_name': f'{ehSel}>a{es2}',
        'league_name': f'{ehSel}>a{es3}',
        'sport_link': (f'{ehSel}>a[href]:first-of-type', 'href'),
        'country_link': (f'{ehSel}>a[href]{es2}', 'href'),
        'league_link': (f'{ehSel}>a[href]{es3}', 'href'),
        'date': f'{ehSel}+div>div:first-child'
    }
    evtSel = {  # selectors for event-level details (time / teams / scores / odds)
        'time': 'p.whitespace-nowrap',
        'team_1': ('a[title]', 'title'),
        'team_2': ('a[title]~a[title]', 'title'),
        'score': 'a:has(a[title])+div.hidden',
        'score_1': 'a[title]>div+div',
        'score_2': 'a[title]~a[title]>div+div',
        'odds_1': 'a:has(a[title])~div:not(.hidden)',
        'odds_X': 'a:has(a[title])~div:not(.hidden)+div:nth-last-of-type(3)',
        'odds_2': 'a:has(a[title])~div:nth-last-of-type(2)',
        'odds_f': 'a:has(a[title])~div span>p.gradient-green',
        'Bs': 'a:has(a[title])~div:last-of-type'
    }
    copyKeys = ['event_group', 'event_group_header', 'date']  # group details copied onto each event
    eventGroups, events = [], []
    for evtRow in pgSoup.select('div[set]>div:last-child'):
        evtGroup = evtRow.parent
        setId, egno = evtGroup.get('set'), len(eventGroups) + 1
        if evtGroup.select(f'{ehSel}+div+div'):  # this row's container has a header -> new event group
            egDets = {'event_group': f'{egPrefix}{egno}', 'set': setId}
            egVals = selectForList(evtGroup, egSel.values())
            for k, v in zip(egSel.keys(), egVals): egDets[k] = v
            eventGroups.append(egDets)
        eg_cur = eventGroups[-1] if eventGroups else {}
        evtDets = {k: eg_cur.get(k) for k in copyKeys}
        evtDets['set'] = setId
        evtVals = selectForList(evtRow, evtSel.values())
        for k, v in zip(evtSel.keys(), evtVals): evtDets[k] = v
        ## icon divs are styled with bg-* utility classes - keep all of them except the layout classes
        icons = [', '.join([
            ci.removeprefix('bg-').replace('-icon', '').strip()  # removeprefix needs Python 3.9+
            for ci in d.get('class', []) if ci.startswith('bg-') and
            ci not in ['bg-center', 'bg-no-repeat']
        ]) for d in evtRow.select('a:has(a[title]) div.bg-center')]
        evtDets['icons'] = ' , '.join([ci for ci in icons if ci])
        links = [l.get('href') for l in evtRow.select('a[href]')]
        evtDets['links'] = ' , '.join({urljoin(baseUrl, l) for l in links})  # set dedupes repeated links
        events.append(evtDets)
    return eventGroups, events
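
## A quick offline sketch: from_pageSoup can also be run on already-saved HTML (the filename
## 'oddsportal_matches.html' below is just an example, e.g. a dump of browser.page_source):
# from bs4 import BeautifulSoup
# with open('oddsportal_matches.html', encoding='utf-8') as f:
#     egList, evList = from_pageSoup(BeautifulSoup(f.read(), 'lxml'))
# print(len(egList), 'event groups |', len(evList), 'events')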
#####################################################################################################

########################################### EXAMPLE USAGE ###########################################
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

options = webdriver.ChromeOptions()
options.add_argument('--headless')
browser = webdriver.Chrome(options=options)  # Selenium 4: use options= (chrome_options was removed)

opmUrls = [
    'https://www.oddsportal.com/matches/soccer/20230111',
    'https://www.oddsportal.com/matches/soccer/20230112',
    'https://www.oddsportal.com/matches/basketball/20230111',
    'https://www.oddsportal.com/matches/basketball/20230112'
]
evtGroups, events = [], []
for opmUrl in opmUrls:
    browser.get(opmUrl)
    ## wait for the JS-rendered match rows before grabbing the page source
    WebDriverWait(browser, 5).until(
        EC.visibility_of_all_elements_located(
            (By.CSS_SELECTOR, 'div[set] a[title]')))
    soup = BeautifulSoup(browser.page_source.encode('utf-8'), 'lxml')
    eg_prefix = '_'.join(opmUrl.split('/')[-2:]) + '_'  # e.g. 'soccer_20230111_'
    egl, evl = from_pageSoup(soup, eg_prefix, opmUrl)
    print(f'{len(egl)} eventGroups and {len(evl)} events from {opmUrl}')
    evtGroups += egl
    events += evl
print()
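
## If a page can time out or render no matches, one option - just a sketch - is to wrap the wait
## inside the for-loop above in a try/except so the loop skips that URL instead of raising:
# from selenium.common.exceptions import TimeoutException
# try:
#     WebDriverWait(browser, 5).until(EC.visibility_of_all_elements_located(
#         (By.CSS_SELECTOR, 'div[set] a[title]')))
# except TimeoutException:
#     print('no matches rendered for', opmUrl)
#     continue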
############################## saving the results ##############################
import pandas as pd
import os  # just for printing the full path

evgFilename = 'oddsportal_eventGroups.csv'
evtFilename = 'oddsportal_events.csv'
pd.DataFrame(evtGroups).to_csv(evgFilename, index=False)
pd.DataFrame(events).to_csv(evtFilename, index=False)
print('saved', len(evtGroups), 'eventGroups to', os.path.abspath(evgFilename))
print('saved', len(events), 'events to', os.path.abspath(evtFilename))
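
## Optional: to view both tables in one workbook (like the Google Sheets output linked at the top),
## a sketch using pandas' ExcelWriter; requires openpyxl, and the .xlsx filename is just an example:
# with pd.ExcelWriter('oddsportal_matches.xlsx') as xw:
#     pd.DataFrame(evtGroups).to_excel(xw, sheet_name='eventGroups', index=False)
#     pd.DataFrame(events).to_excel(xw, sheet_name='events', index=False)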
################################################################################
browser.quit()
del browser