Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# for so_q_75057281
#
# Scrape the live football betting overview of nike.sk with Selenium, parse
# every listed match with BeautifulSoup and write one CSV row per match.
import os
import time
from urllib.parse import urljoin

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

BASE_URL = 'https://www.nike.sk'
url = f'{BASE_URL}/live-stavky/futbal'
max_wait = 10  # seconds; timeout of the explicit-wait helper `wait` below
scrapeX = 1  # how many times the loaded page is re-parsed
opFilename = 'nike_futbal.csv'

# One `div.match` per match, inside the accordion view that directly follows
# an accordion header (the header text lives in `span.ellipsis`).
matchSel = 'div.accordion-header+div.accordion-view>div.match'


def _strip_prefix(text, prefix):
    """Return *text* without a leading *prefix*.

    Removes the exact prefix only; ``str.lstrip`` would instead strip any
    leading characters that occur anywhere in its argument.
    """
    return text[len(prefix):] if text.startswith(prefix) else text


def _strip_suffix(text, suffix):
    """Return *text* without a trailing *suffix* (exact match only)."""
    return text[:-len(suffix)] if suffix and text.endswith(suffix) else text


def _parse_match(m, soup):
    """Extract the details of one ``div.match`` element.

    Args:
        m: the match element selected with ``matchSel``.
        soup: the whole parsed page; used to look up the ``a.match-more``
            anchor that points at the same link as the match.

    Returns:
        A ``(key, details)`` tuple. ``key`` is the segment taken from the
        match link, or a tuple built from header and team names when the
        match has no link. ``details`` is a dict of the scraped fields with
        whitespace runs in string values collapsed to single spaces.
    """
    ## Match [Group?] Header ##
    headerBox = m.parent.find_previous_sibling()
    mHead = headerBox.select_one('span.ellipsis') if headerBox else None
    mDets = {'id': None, 'header': mHead.get_text(' ').strip() if mHead else ''}

    ## link + id + match-more ##
    anchor = m.select_one('a[href]')
    link = _strip_prefix(anchor.get('href') if anchor else '', BASE_URL)
    if link and not link.startswith('/'):
        link = f'/{link}'
    mId = None
    if link:
        # Second path segment after '/live-stavky/' (or the only one present).
        mId = link.split('/live-stavky/', 1)[-1].split('/')[:2][-1]
        if mId.isdigit():
            mDets['id'] = mId
        moreTxt = soup.select_one(f'a.match-more[href="{link}"]')
        if moreTxt:
            mDets['more'] = moreTxt.get_text(' ')
        mDets['link'] = f'{BASE_URL}{link}'

    ## Match Timer ##
    timerBox = m.select_one('div[data-atid="match-timer"][title]')
    if timerBox:
        mDets['timer'] = timerBox.get_text(' ').strip()
        statusEl = timerBox.select_one('span.ellipsis+span')
        if statusEl and statusEl.get('class'):
            status = ' '.join(statusEl.get('class')).strip()
            mDets['timer_status'] = _strip_prefix(status, 'match-timer-')

    ## teams + other simpler selections ##
    selRef = {
        'home_team': 'span[data-atid="match-opponents-home"][title]',
        'away_team': 'span[data-atid="match-opponents-away"][title]',
        'noOdd_msg': 'span[data-atid="no-odds-at-the-moment"]',
        'bet_label': 'span.bet-label',
    }
    for k, sel in selRef.items():
        el = m.select_one(sel)
        if not el:
            continue
        # Selectors that require a title attribute yield that attribute;
        # all others yield the element text.
        mDets[k] = el.get('title') if sel.endswith('[title]') else el.get_text(' ')

    ## scores and icons ##
    for score in m.select('span[data-atid^="tlv-overview-"]'):
        scKey = score.get('data-atid').replace('tlv-overview-', '')
        scKey = scKey.replace(' ', '_').replace('-', '_').strip()
        mDets[scKey] = score.get_text(' ')
    miSel = 'button.match-icon-btn[title]'
    mIcons = ', '.join(f'"{mi.get("title")}"' for mi in m.select(miSel))
    if mIcons:
        mDets['icons'] = mIcons

    ## odds ## (locked bets carry data-atid="tl-bet-lock")
    bSel = '.bet:is(button,div)'
    for gi, betGroup in enumerate(m.select(f'div:has(>{bSel})'), 1):
        for ci, bet in enumerate(betGroup.select(bSel), 1):
            bTxt = bet.get_text(' ').strip()
            atid = bet.get('data-atid', '').replace('tl-bet-', '')
            # Drop the literal 'odd' marker so only other markers (e.g.
            # 'lock') are shown as a bracketed label.
            # NOTE(review): assumes plain odds use data-atid="tl-bet-odd" -
            # confirm against the live markup.
            label = _strip_suffix(_strip_prefix(atid, 'odd'), 'odd')
            mDets[f'odds-g_{gi}-c_{ci}'] = f'[{label}] {bTxt}' if label else bTxt

    ## reduce multi-lines before adding to matchList ##
    for k, v in mDets.items():
        if isinstance(v, str):
            mDets[k] = ' '.join(v.split())

    if mId is None:
        # No link means no id segment: key the match by what identifies it
        # on the page, so link-less matches neither crash nor get merged
        # into an unrelated match.
        mId = ('no-link', mDets['header'],
               mDets.get('home_team'), mDets.get('away_team'))
    return mId, mDets


matchList = {}
browser = webdriver.Chrome()
try:
    # Explicit-wait helper; not used by the scraping below, which relies on
    # the manual prompt instead.
    wait = WebDriverWait(browser, max_wait).until
    browser.get(url)
    input('load and enter')  # continue manually once the page has loaded

    # Keep a prettified snapshot of the page for inspecting selectors.
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    with open('x.html', 'wb') as f:
        f.write(soup.prettify('utf-8'))

    for rescrape in range(scrapeX):
        print(rescrape + 1, 'of', scrapeX)
        soup = BeautifulSoup(browser.page_source, 'html.parser')
        for m in soup.select(matchSel):
            mId, mDets = _parse_match(m, soup)
            if mId not in matchList:
                matchList[mId] = mDets
                continue
            # Match already seen: only fill in fields that are still
            # missing or empty.
            for k, v in mDets.items():
                if not matchList[mId].get(k):
                    matchList[mId][k] = v

    pd.DataFrame(list(matchList.values())).to_csv(opFilename, index=False)
finally:
    # Always close the browser, even when scraping fails part-way.
    browser.quit()
    del browser
Advertisement
Add Comment
Please, Sign In to add comment