## for https://stackoverflow.com/q/74848587/6146136
## sample outputs at https://docs.google.com/spreadsheets/d/1E1QBKqaWoCB2SPAMyz0A_oe58MaV-yQbaSBGxGMBPC8
'''
############# calls used for sample outputs #############
scrape_pakakumi(5.0, 60)
scrape_pakakumi_lim(60)
scrape_pakakumi(10, maxWait=400) ## stopped using keyboard interrupt
#########################################################
'''
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import os # for os.path.isfile and os.path.abspath
import requests # only needed for scrape_pakakumi_api
import time # not needed for scrape_pakakumi_lim
############################################### SHORTER VERSION ###############################################
def scrape_pakakumi_lim(maxRows=50, max_tmo=10, maxWait=100, opfn='pakakumi.csv'):
    maxWait = maxWait if isinstance(maxWait, (int, float)) and maxWait > 0 else 100
    maxRows = maxRows if isinstance(maxRows, int) and maxRows > 0 else 50
    max_tmo = max_tmo if isinstance(max_tmo, int) and max_tmo > 0 else 10
    opfn = opfn if isinstance(opfn, str) and opfn[-4:] == '.csv' else 'pakakumi.csv'
    rootUrl = 'https://play.pakakumi.com'
    # driver = webdriver.Chrome(path)
    # [I just copy chromedriver.exe to the same folder as this py file]
    driver = webdriver.Chrome()
    wait = WebDriverWait(driver, maxWait)
    driver.get(rootUrl)
    wait.until(EC.presence_of_all_elements_located((
        By.CSS_SELECTOR, 'table:has(th.text-center) tbody>tr>td')))
    tmoCt, rowCt, totalScrapes = 0, 0, 0
    addedIds, games, thref = [], [], ''
    while rowCt < maxRows and tmoCt < max_tmo:
        scrapeTime = datetime.now()
        tSoup = BeautifulSoup(driver.find_element(
            By.CSS_SELECTOR, 'table:has(th.text-center) tbody'
        ).get_attribute('outerHTML'), 'html.parser')
        thref = tSoup.select_one(
            'tr:first-child>td:first-child>a[href]'
        ).get('href')
        tGames = [{
            'scraped_at': scrapeTime.isoformat(),
            'game_id': a.get('href').replace('/games/', '').split('/')[0],
            'crash': a.get_text(' ', strip=True), 'hash': i.get('value'),
            'url': f"{rootUrl}{a.get('href')}"
        } for a, i in [(
            r.select_one(r'td>a[href^="\/games\/"]'),
            r.select_one('td>input[readonly]')
        ) for r in tSoup.select(
            r'tr:has(>td>a[href^="\/games\/"]):has(>td>input[readonly])'
        )]]
        totalScrapes += len(tGames)
        tGames = [t for t in tGames if t['game_id'] not in addedIds]  # drop already-seen games
        rowCt += len(tGames)
        games += tGames
        addedIds += [t['game_id'] for t in tGames]
        print('', end=f'\r{len(tGames)} rows [of {rowCt}, max {maxRows}]')
        # sys.stdout.flush() # use ONLY if you notice large delays in printing
        try:
            # escape '/' for the CSS attribute selector, then wait until the
            # top row links to a different game than the one just scraped
            thref = thref.replace('\\/', '/').replace('/', '\\/')
            thSel = 'table:has(th.text-center) tbody>tr:first-child>'
            thSel += r'td:first-child>a[href^="\/games\/"]'
            wait.until(EC.presence_of_all_elements_located((
                By.CSS_SELECTOR, f'{thSel}:not([href="{thref}"])')))
        except: tmoCt += 1
    driver.quit()
    del driver  # just in case
    print(f'\n{tmoCt} timeouts, {rowCt} rows [{totalScrapes} scrapes]')
    pd.DataFrame(games).to_csv(opfn, index=False)
    print(f'saved to {os.path.abspath(opfn)}')
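
## a minimal sketch [the helper name is just illustrative] for inspecting the
## CSV written by scrape_pakakumi_lim; assumes the file already exists in the
## working directory and only relies on the pd/os imports above
def preview_pakakumi_csv(opfn='pakakumi.csv', n=5):
    df = pd.read_csv(opfn)  # columns: scraped_at, game_id, crash, hash, url
    print(f'{len(df)} rows loaded from {os.path.abspath(opfn)}')
    print(df.head(n).to_string(index=False))
    return df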
###############################################################################################################
###############################################################################################################
def scrape_pakakumi_api(gameId, prevData):
    apiUrl = f'https://api.pakakumi.com/games/{gameId}'
    gKeys = ['id', 'crash', 'hash', 'created_at', 'plays']
    pKeys = ['game_id', 'user_id', 'bet', 'stopped_at', 'username']
    try:
        resp = requests.get(apiUrl, headers={
            'Accept': 'application/json, text/plain, */*',
            'Accept-Api-Version': '45'
        })
        resp.raise_for_status()
        jData = resp.json()
        gData = {('game_id' if k == 'id' else k): (
            jData[k] if k in jData else None
        ) for k in gKeys[:-1]}
        mKeys = [k for k in gKeys[1:] if k not in jData]
        errMsg = f'missing keys {", ".join(mKeys)} | ' if mKeys else ''
        if f'{jData["id"]}' != f'{gameId}':
            errMsg += f'game_id should be "{gameId}" not "{jData["id"]}" | '
        if 'plays' in jData and jData['plays'] == []: errMsg += 'no plays | '
        jpData = jData['plays'] if 'plays' in jData else []
        if not isinstance(jpData, list):
            errMsg += f'plays should be list, not {type(jpData)} | '
            jpData = []
        pData = [
            {k: p[k] if k in p else None for k in pKeys} for p in jpData
            if isinstance(p, dict) and 'user_id' in p and 'game_id' in p
        ]
        if len(pData) != len(jpData):
            errMsg += f'{len(jpData)} plays reduced to {len(pData)} | '
        gData['msg'] = f'[req_status: {resp.status_code} {resp.reason}] '
        gData['msg'] += f'[scraped_at: {datetime.now().isoformat(" ")}] '
        if errMsg.strip():
            gData['msg_type'] = 'warning' if (
                'id' in jData and f'{jData["id"]}' == f'{gameId}'
            ) else 'error'
            gData['msg'] += errMsg.strip(' | ').strip()
        else: gData['msg_type'] = 'success'
    except Exception as e:
        gData = {k: None for k in (['game_id'] + gKeys[1:-1])}
        gData['game_id'], pData = gameId, []
        gData['msg_type'], gData['msg'] = 'error', f'{type(e)} {e}'
    for k in gKeys[1:-1]:
        # backfill any missing fields from the previously scraped table row
        if not gData[k] and k in prevData: gData[k] = prevData[k]
    return gData, pData
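
## scrape_pakakumi_api can also be tried on its own [a minimal sketch - the
## wrapper name and the gameId you pass are up to you]; the second argument is
## normally the row scraped from the games table, but any dict works since the
## backfill loop just skips keys it can't find
def demo_scrape_one_game(gameId):
    gData, pData = scrape_pakakumi_api(gameId, {'game_id': gameId})
    print(gData.get('msg_type'), '|', gData.get('msg'))
    print(f'{len(pData)} plays returned')
    return gData, pData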

def scrape_pakakumi(wAmt=10, maxRows=999, max_tmo=10, maxWait=20, opfn='pakakumi.csv'):
    maxWait = maxWait if isinstance(maxWait, (int, float)) and maxWait > 0 else 20
    maxRows = maxRows if isinstance(maxRows, int) and maxRows > 0 else None
    maxIds = maxRows if maxRows and 100 < maxRows < 500 else 100
    max_tmo = max_tmo if isinstance(max_tmo, int) and max_tmo > 0 else 10
    opfn = opfn if isinstance(opfn, str) and opfn[-4:] == '.csv' else 'pakakumi.csv'
    wAmt = wAmt if isinstance(wAmt, (int, float)) and wAmt > 0 else None
    gfn = f'games_{opfn}'
    pfn = f'gamePlayers_{opfn}'
    gHeaders = ['game_id', 'crash', 'hash', 'created_at', 'msg_type', 'msg']
    pHeaders = ['game_id', 'user_id', 'bet', 'stopped_at', 'username']
    print('Output will be saved to: ')
    for fn, h in [(gfn, gHeaders), (pfn, pHeaders)]:
        if not os.path.isfile(fn):
            # initiate new file with header row [helpful for appending in loop]
            pd.DataFrame([{k: k for k in h}]).to_csv(fn, index=False, header=False)
        print(f' {os.path.abspath(fn)}')
    try:
        prevData = pd.read_csv(gfn).to_dict('records')  # get data from previous scrape
        addedIds = [str(g['game_id']) for g in prevData if 'game_id' in g][-1*maxIds:]
    except: addedIds = []
    rootUrl = 'https://play.pakakumi.com'
    # driver = webdriver.Chrome(path)
    # [I just copy chromedriver.exe to the same folder as this py file]
    driver = webdriver.Chrome()
    wait = WebDriverWait(driver, maxWait)
    driver.get(rootUrl)
    wait.until(EC.presence_of_all_elements_located((
        By.CSS_SELECTOR, 'table:has(th.text-center) tbody>tr>td')))
    tmoCt, tto, rowCt, loopCt, playCt, thref = 0, 0, 0, 0, 0, ''
    while tmoCt < max_tmo:
        if maxRows and rowCt > maxRows: break
        pData, gData = [], []
        addedIds = addedIds[-1*maxIds:]  # only keep the most recent ids in memory
        loopCt += 1
        tSoup = BeautifulSoup(driver.find_element(
            By.CSS_SELECTOR, 'table:has(th.text-center) tbody'
        ).get_attribute('outerHTML'), 'html.parser')
        thref = tSoup.select_one(
            'tr:first-child>td:first-child>a[href]'
        ).get('href')
        tGames = [{
            'game_id': a.get('href').replace('/games/', '').split('/')[0],
            'crash': a.get_text(' ', strip=True), 'hash': i.get('value'),
            'url': f"{rootUrl}{a.get('href')}"
        } for a, i in [(
            r.select_one(r'td>a[href^="\/games\/"]'), r.select_one('td>input[readonly]')
        ) for r in tSoup.select(r'tr:has(>td>a[href^="\/games\/"]):has(>td>input[readonly])')]]
        for tg in tGames:
            if str(tg['game_id']) in addedIds: continue  # skip games already saved
            tgg, tgp = scrape_pakakumi_api(tg['game_id'], tg)
            gData.append(tgg)
            pData += tgp
        for ofn, fh, fd in [(gfn, gHeaders, gData), (pfn, pHeaders, pData)]:
            # append [by setting mode='a'] to avoid over-writing previous data
            if fd: pd.DataFrame(fd)[fh].to_csv(ofn, mode='a', index=False, header=False)
        rowCt += len(gData)
        playCt += len(pData)
        addedIds += [str(g['game_id']) for g in gData]
        toPrint = f'[{loopCt} - {tmoCt}tmo] {len(gData)} rows [of {rowCt}'
        toPrint += f', max {maxRows}], {len(pData)} players [of {playCt}]'
        print('', end=f'\r{toPrint}{" "*80}')
        # sys.stdout.flush() # use ONLY if you notice large delays in printing
        if not gData:
            tmoCt += 1
            tto += 1
            # reload page just in case:
            driver.get(rootUrl)
            wait.until(EC.presence_of_all_elements_located((
                By.CSS_SELECTOR, 'table:has(th.text-center) tbody>tr>td')))
        else: tmoCt = 0
        if isinstance(wAmt, float):
            # float wAmt --> poll on a fixed interval instead of waiting for new rows
            if not gData:
                print('', end=f'\r{toPrint} - waiting {wAmt}sec...')
                # sys.stdout.flush() # use ONLY if you notice large delays in printing
            time.sleep(wAmt)
            continue
        try:
            # escape '/' for the CSS attribute selector, then wait until the previously
            # top-most game has been pushed down [below row wAmt, if wAmt is an int]
            thref = thref.replace('\\/', '/').replace('/', '\\/')
            thSel = 'table:has(th.text-center) tbody'
            if isinstance(wAmt, int) and 1 < wAmt < 39:
                thSel = f'{thSel}>tr:nth-child({wAmt})~tr>td:first-child'
            else: thSel = f'{thSel}>tr:first-child~tr>td:first-child'
            wait.until(EC.presence_of_all_elements_located((
                By.CSS_SELECTOR, f'{thSel}>a[href="{thref}"]')))
        except: tmoCt += 1
    driver.quit()
    del driver  # just in case
    dRet = {
        'timeouts_total': tto, 'timeouts_uncleared': tmoCt,
        'games': rowCt, 'plays': playCt, 'loops': loopCt,
        'games_op': os.path.abspath(gfn),
        'plays_op': os.path.abspath(pfn)
    }
    print('\n\n\n' + '\n'.join([f'{k}: {v}' for k, v in dRet.items()]))
    # sys.stdout.flush() # use ONLY if you notice large delays in printing
    return dRet  # in case you want to maintain a scrape-log...
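
## example entry point [a sketch mirroring the calls listed in the docstring at
## the top - adjust the arguments as needed]: a float wAmt like 5.0 polls every
## 5 seconds, while an int wAmt [1 < wAmt < 39] waits until that many new rows
## have appeared before re-scraping the table
if __name__ == '__main__':
    scrapeLog = scrape_pakakumi(5.0, maxRows=60)
    print('games saved to', scrapeLog['games_op'])
    print('plays saved to', scrapeLog['plays_op'])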