scrape_oddsportal_matches for so_q_75058259
Try95th · Jan 12th, 2023 (edited) · Python

## for https://stackoverflow.com/q/75058259/6146136 #################################################
## output at https://docs.google.com/spreadsheets/d/1jvCDg7dJ4KNrHHYlDNx5TsPJgV1UuJ4n3TBcD52Rk90 ####
#####################################################################################################


######################################## AUXILIARY FUNCTIONS ########################################
## for extracting values from bs4 Tags - from https://pastebin.com/ZnZ7xM6u ##
def extract_from_bs4tag(tagSoup, selector, targetAttr=''):
    ## returns targetAttr (or the text, if no attribute is given) of the first
    ## descendant matching selector; returns None if nothing matches
    sel, ta = str(selector).strip(), str(targetAttr).strip()
    el = tagSoup.select_one(sel) if sel else tagSoup
    if el: return el.get(ta) if ta else el.get_text(' ').strip()

def selectForList(tagSoup, selectors, printList=False):
    ## applies extract_from_bs4tag for every selector [or (selector, attribute) pair]
    ## in selectors; returns a dict if selectors is a dict, else a list
    if isinstance(selectors, dict):
        return dict(zip(selectors.keys(), selectForList(
            tagSoup, selectors.values(), printList)))
    returnList, isv = [], printList
    for s in selectors:
        sel, ta = s[:2] if isinstance(s, tuple) and len(s) > 1 else (s, '')
        if ta == '"staticVal"': returnList.append(sel)  ## pass the value through as-is
        else: returnList.append(extract_from_bs4tag(tagSoup, sel, ta))
    returnList = [' '.join(r) if isinstance(r, list) else r for r in returnList]
    if isv and not isinstance(isv, str): print(returnList)
    if isinstance(isv, str): print(*returnList, sep=isv)
    return returnList
#####################################################################################################
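
## quick self-check of the two helpers (a minimal sketch - the HTML snippet below is
## made up for illustration and is not an actual oddsportal fragment):
from bs4 import BeautifulSoup
_demoTag = BeautifulSoup(
    '<div><a href="/x" title="Team A">Team A</a><p>18:30</p></div>', 'lxml')
print(selectForList(_demoTag, {
    'link': ('a[href]', 'href'),             ## attribute extraction
    'team': ('a[title]', 'title'),
    'time': 'p',                             ## plain text extraction
    'note': ('static text', '"staticVal"'),  ## passed through unchanged
}))  ## -> {'link': '/x', 'team': 'Team A', 'time': '18:30', 'note': 'static text'}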


###################################### MAIN EXTRACTOR FUNCTION ######################################
from urllib.parse import urljoin

def from_pageSoup(pgSoup, egPrefix='evtGroup_', baseUrl='https://www.oddsportal.com'):
    ehSel = ':scope>div:first-child'
    es2 = ':nth-of-type(2):nth-last-of-type(2)'
    es3 = ':nth-of-type(3):last-of-type'
    egSel = {
        'event_group_header': ehSel,
        'sport_name': f'{ehSel}>a:first-of-type',
        'country_name': f'{ehSel}>a{es2}',
        'league_name': f'{ehSel}>a{es3}',
        'sport_link': (f'{ehSel}>a[href]:first-of-type', 'href'),
        'country_link': (f'{ehSel}>a[href]{es2}', 'href'),
        'league_link': (f'{ehSel}>a[href]{es3}', 'href'),
        'date': f'{ehSel}+div>div:first-child'
    }
    evtSel = {
        'time': 'p.whitespace-nowrap',
        'team_1': ('a[title]', 'title'),
        'team_2': ('a[title]~a[title]', 'title'),
        'score': 'a:has(a[title])+div.hidden',
        'score_1': 'a[title]>div+div',
        'score_2': 'a[title]~a[title]>div+div',
        'odds_1': 'a:has(a[title])~div:not(.hidden)',
        'odds_X': 'a:has(a[title])~div:not(.hidden)+div:nth-last-of-type(3)',
        'odds_2': 'a:has(a[title])~div:nth-last-of-type(2)',
        'odds_f': 'a:has(a[title])~div span>p.gradient-green',
        'Bs': 'a:has(a[title])~div:last-of-type'
    }
    copyKeys = ['event_group', 'event_group_header', 'date']

    eventGroups, events = [], []
    for evtRow in pgSoup.select('div[set]>div:last-child'):
        evtGroup = evtRow.parent
        setId, egno = evtGroup.get('set'), len(eventGroups)+1
        if evtGroup.select(f'{ehSel}+div+div'):  ## a new event group starts here
            egDets = {'event_group': f'{egPrefix}{egno}', 'set': setId}
            egVals = selectForList(evtGroup, egSel.values())
            for k, v in zip(egSel.keys(), egVals): egDets[k] = v
            eventGroups.append(egDets)

        eg_cur = eventGroups[-1] if eventGroups else {}
        evtDets = {k: eg_cur.get(k) for k in copyKeys}  ## copy group info into the event
        evtDets['set'] = setId
        evtVals = selectForList(evtRow, evtSel.values())
        for k, v in zip(evtSel.keys(), evtVals): evtDets[k] = v

        ## collect icon names from the row's "bg-..." CSS utility classes
        icons = [', '.join([
            ci.removeprefix('bg-').replace('-icon', '').strip() for ci in
            d.get('class') if ci.startswith('bg-') and  ## [.strip('bg-') would also eat leading b/g letters of the name]
            ci not in ['bg-center', 'bg-no-repeat']
        ]) for d in evtRow.select('a:has(a[title]) div.bg-center')]
        evtDets['icons'] = ' , '.join([ci for ci in icons if ci])

        links = [l.get('href') for l in evtRow.select('a[href]')]
        evtDets['links'] = ' , '.join({urljoin(baseUrl, l) for l in links})

        events.append(evtDets)
    return eventGroups, events
#####################################################################################################
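
## optional: the extractor only needs a soup, so it can also be run on a saved copy of a
## /matches/ page (a sketch - 'saved_matches_page.html' is a hypothetical filename used
## purely for illustration):
import os
if os.path.isfile('saved_matches_page.html'):
    from bs4 import BeautifulSoup
    with open('saved_matches_page.html', encoding='utf-8') as f:
        egs, evs = from_pageSoup(BeautifulSoup(f.read(), 'lxml'))
    print(len(egs), 'eventGroups and', len(evs), 'events from the saved page')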


########################################### EXAMPLE USAGE ###########################################
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

options = webdriver.ChromeOptions()
options.add_argument('--headless')
browser = webdriver.Chrome(options=options)  ## Selenium 4 dropped the "chrome_options" keyword

opmUrls = [
    'https://www.oddsportal.com/matches/soccer/20230111',
    'https://www.oddsportal.com/matches/soccer/20230112',
    'https://www.oddsportal.com/matches/basketball/20230111',
    'https://www.oddsportal.com/matches/basketball/20230112'
]

evtGroups, events = [], []
for opmUrl in opmUrls:
    browser.get(opmUrl)
    WebDriverWait(browser, 5).until(  ## wait until the event rows have rendered
        EC.visibility_of_all_elements_located(
            (By.CSS_SELECTOR, 'div[set] a[title]')))
    soup = BeautifulSoup(browser.page_source.encode('utf-8'), 'lxml')

    eg_prefix = '_'.join(opmUrl.split('/')[-2:])+'_'
    egl, evl = from_pageSoup(soup, eg_prefix, opmUrl)
    print(f'{len(egl)} eventGroups and {len(evl)} events from {opmUrl}')
    evtGroups += egl
    events += evl
print()

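## optional: a more defensive fetch (a sketch, not used above - wraps the same wait in a
## try/except so a page with no rendered event rows is skipped instead of raising):
from selenium.common.exceptions import TimeoutException
def fetch_matches_soup(drv, url, timeout=5):
    drv.get(url)
    try:
        WebDriverWait(drv, timeout).until(EC.visibility_of_all_elements_located(
            (By.CSS_SELECTOR, 'div[set] a[title]')))
    except TimeoutException:
        print('timed out waiting for event rows at', url)
        return None
    return BeautifulSoup(drv.page_source, 'lxml')
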
############################## saving the results ##############################
import pandas as pd
import os # just for printing the full path

evgFilename = 'oddsportal_eventGroups.csv'
evtFilename = 'oddsportal_events.csv'
pd.DataFrame(evtGroups).to_csv(evgFilename, index=False)
pd.DataFrame(events).to_csv(evtFilename, index=False)
print('saved', len(evtGroups), 'eventGroups to', os.path.abspath(evgFilename))
print('saved', len(events), 'events to', os.path.abspath(evtFilename))
################################################################################
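
## optional: re-joining events with their group rows (a sketch - relies only on the
## 'event_group' key that from_pageSoup writes into both outputs):
merged = pd.DataFrame(events).merge(
    pd.DataFrame(evtGroups), on='event_group', suffixes=('', '_group'))
print('merged table:', merged.shape[0], 'rows x', merged.shape[1], 'columns')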

browser.quit()
del browser