## for https://stackoverflow.com/q/74848587/6146136
## sample outputs at https://docs.google.com/spreadsheets/d/1E1QBKqaWoCB2SPAMyz0A_oe58MaV-yQbaSBGxGMBPC8
'''
############# calls used for sample outputs #############
scrape_pakakumi(5.0, 60)
scrape_pakakumi_lim(60)
scrape_pakakumi(10, maxWait=400) ## stopped using keyboard interrupt
#########################################################
'''
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import os # for os.path.isfile and os.path.abspath
import requests # only needed for scrape_pakakumi_api
import time # not needed for scrape_pakakumi_lim
############################################### SHORTER VERSION ###############################################
def scrape_pakakumi_lim(maxRows=50, max_tmo=10, maxWait=100, opfn='pakakumi.csv'):
    maxWait = maxWait if isinstance(maxWait, (int, float)) and maxWait > 0 else 100
    maxRows = maxRows if isinstance(maxRows, int) and maxRows > 0 else 50
    max_tmo = max_tmo if isinstance(max_tmo, int) and max_tmo > 0 else 10
    opfn = opfn if isinstance(opfn, str) and opfn[-4:] == '.csv' else 'pakakumi.csv'
    rootUrl = 'https://play.pakakumi.com'
    # driver = webdriver.Chrome(path)
    # [I just copy chromedriver.exe to the same folder as this py file]
    driver = webdriver.Chrome()
    wait = WebDriverWait(driver, maxWait)
    driver.get(rootUrl)
    wait.until(EC.presence_of_all_elements_located((
        By.CSS_SELECTOR, 'table:has(th.text-center) tbody>tr>td')))
    tmoCt, rowCt, totalScrapes = 0, 0, 0
    addedIds, games, thref = [], [], ''
    while rowCt < maxRows and tmoCt < max_tmo:
        scrapeTime = datetime.now()
        tSoup = BeautifulSoup(driver.find_element(
            By.CSS_SELECTOR, 'table:has(th.text-center) tbody'
        ).get_attribute('outerHTML'), 'html.parser')
        thref = tSoup.select_one(
            'tr:first-child>td:first-child>a[href]'
        ).get('href')
        tGames = [{
            'scraped_at': scrapeTime.isoformat(),
            'game_id': a.get('href').replace('/games/', '').split('/')[0],
            'crash': a.get_text(' ', strip=True), 'hash': i.get('value'),
            'url': f"{rootUrl}{a.get('href')}"
        } for a, i in [(
            r.select_one(r'td>a[href^="\/games\/"]'),
            r.select_one('td>input[readonly]')
        ) for r in tSoup.select(
            r'tr:has(>td>a[href^="\/games\/"]):has(>td>input[readonly])'
        )]]
        totalScrapes += len(tGames)
        tGames = [t for t in tGames if t['game_id'] not in addedIds]  # drop already-seen games
        rowCt += len(tGames)
        games += tGames
        addedIds += [t['game_id'] for t in tGames]
        print('', end=f'\r{len(tGames)} rows [of {rowCt}, max {maxRows}]')
        # sys.stdout.flush() # use ONLY if you notice large delays in printing
        try:
            # escape '/' for the CSS attribute selector, then wait until the
            # top row links to a different game than the one just scraped
            thref = thref.replace('\\/', '/').replace('/', '\\/')
            thSel = 'table:has(th.text-center) tbody>tr:first-child>'
            thSel += r'td:first-child>a[href^="\/games\/"]'
            wait.until(EC.presence_of_all_elements_located((
                By.CSS_SELECTOR, f'{thSel}:not([href="{thref}"])')))
        except: tmoCt += 1
    driver.quit()
    del driver  # just in case
    print(f'\n{tmoCt} timeouts, {rowCt} rows [{totalScrapes} scrapes]')
    pd.DataFrame(games).to_csv(opfn, index=False)
    print(f'saved to {os.path.abspath(opfn)}')
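
## a minimal sketch [the helper name is just illustrative] for inspecting the
## CSV written by scrape_pakakumi_lim; assumes the file already exists in the
## working directory and only relies on the pd/os imports above
def preview_pakakumi_csv(opfn='pakakumi.csv', n=5):
    df = pd.read_csv(opfn)  # columns: scraped_at, game_id, crash, hash, url
    print(f'{len(df)} rows loaded from {os.path.abspath(opfn)}')
    print(df.head(n).to_string(index=False))
    return df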
###############################################################################################################
###############################################################################################################
def scrape_pakakumi_api(gameId, prevData):
    apiUrl = f'https://api.pakakumi.com/games/{gameId}'
    gKeys = ['id', 'crash', 'hash', 'created_at', 'plays']
    pKeys = ['game_id', 'user_id', 'bet', 'stopped_at', 'username']
    try:
        resp = requests.get(apiUrl, headers={
            'Accept': 'application/json, text/plain, */*',
            'Accept-Api-Version': '45'
        })
        resp.raise_for_status()
        jData = resp.json()
        gData = {('game_id' if k == 'id' else k): (
            jData[k] if k in jData else None
        ) for k in gKeys[:-1]}
        mKeys = [k for k in gKeys[1:] if k not in jData]
        errMsg = f'missing keys {", ".join(mKeys)} | ' if mKeys else ''
        if f'{jData["id"]}' != f'{gameId}':
            errMsg += f'game_id should be "{gameId}" not "{jData["id"]}" | '
        if 'plays' in jData and jData['plays'] == []: errMsg += 'no plays | '
        jpData = jData['plays'] if 'plays' in jData else []
        if not isinstance(jpData, list):
            errMsg += f'plays should be list, not {type(jpData)} | '
            jpData = []
        pData = [
            {k: p[k] if k in p else None for k in pKeys} for p in jpData
            if isinstance(p, dict) and 'user_id' in p and 'game_id' in p
        ]
        if len(pData) != len(jpData):
            errMsg += f'{len(jpData)} plays reduced to {len(pData)} | '
        gData['msg'] = f'[req_status: {resp.status_code} {resp.reason}] '
        gData['msg'] += f'[scraped_at: {datetime.now().isoformat(" ")}] '
        if errMsg.strip():
            gData['msg_type'] = 'warning' if (
                'id' in jData and f'{jData["id"]}' == f'{gameId}'
            ) else 'error'
            gData['msg'] += errMsg.strip(' | ').strip()
        else: gData['msg_type'] = 'success'
    except Exception as e:
        gData = {k: None for k in (['game_id'] + gKeys[1:-1])}
        gData['game_id'], pData = gameId, []
        gData['msg_type'], gData['msg'] = 'error', f'{type(e)} {e}'
    for k in gKeys[1:-1]:
        # backfill any missing fields from the previously scraped table row
        if not gData[k] and k in prevData: gData[k] = prevData[k]
    return gData, pData
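
## scrape_pakakumi_api can also be tried on its own [a minimal sketch - the
## wrapper name and the gameId you pass are up to you]; the second argument is
## normally the row scraped from the games table, but any dict works since the
## backfill loop just skips keys it can't find
def demo_scrape_one_game(gameId):
    gData, pData = scrape_pakakumi_api(gameId, {'game_id': gameId})
    print(gData.get('msg_type'), '|', gData.get('msg'))
    print(f'{len(pData)} plays returned')
    return gData, pData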

def scrape_pakakumi(wAmt=10, maxRows=999, max_tmo=10, maxWait=20, opfn='pakakumi.csv'):
    maxWait = maxWait if isinstance(maxWait, (int, float)) and maxWait > 0 else 20
    maxRows = maxRows if isinstance(maxRows, int) and maxRows > 0 else None
    maxIds = maxRows if maxRows and 100 < maxRows < 500 else 100
    max_tmo = max_tmo if isinstance(max_tmo, int) and max_tmo > 0 else 10
    opfn = opfn if isinstance(opfn, str) and opfn[-4:] == '.csv' else 'pakakumi.csv'
    wAmt = wAmt if isinstance(wAmt, (int, float)) and wAmt > 0 else None
    gfn = f'games_{opfn}'
    pfn = f'gamePlayers_{opfn}'
    gHeaders = ['game_id', 'crash', 'hash', 'created_at', 'msg_type', 'msg']
    pHeaders = ['game_id', 'user_id', 'bet', 'stopped_at', 'username']
    print('Output will be saved to: ')
    for fn, h in [(gfn, gHeaders), (pfn, pHeaders)]:
        if not os.path.isfile(fn):
            # initiate new file with header row [helpful for appending in loop]
            pd.DataFrame([{k: k for k in h}]).to_csv(fn, index=False, header=False)
        print(f' {os.path.abspath(fn)}')
    try:
        prevData = pd.read_csv(gfn).to_dict('records')  # get data from previous scrape
        addedIds = [str(g['game_id']) for g in prevData if 'game_id' in g][-1*maxIds:]
    except: addedIds = []
    rootUrl = 'https://play.pakakumi.com'
    # driver = webdriver.Chrome(path)
    # [I just copy chromedriver.exe to the same folder as this py file]
    driver = webdriver.Chrome()
    wait = WebDriverWait(driver, maxWait)
    driver.get(rootUrl)
    wait.until(EC.presence_of_all_elements_located((
        By.CSS_SELECTOR, 'table:has(th.text-center) tbody>tr>td')))
    tmoCt, tto, rowCt, loopCt, playCt, thref = 0, 0, 0, 0, 0, ''
    while tmoCt < max_tmo:
        if maxRows and rowCt > maxRows: break
        pData, gData = [], []
        addedIds = addedIds[-1*maxIds:]  # only keep the most recent ids in memory
        loopCt += 1
        tSoup = BeautifulSoup(driver.find_element(
            By.CSS_SELECTOR, 'table:has(th.text-center) tbody'
        ).get_attribute('outerHTML'), 'html.parser')
        thref = tSoup.select_one(
            'tr:first-child>td:first-child>a[href]'
        ).get('href')
        tGames = [{
            'game_id': a.get('href').replace('/games/', '').split('/')[0],
            'crash': a.get_text(' ', strip=True), 'hash': i.get('value'),
            'url': f"{rootUrl}{a.get('href')}"
        } for a, i in [(
            r.select_one(r'td>a[href^="\/games\/"]'), r.select_one('td>input[readonly]')
        ) for r in tSoup.select(r'tr:has(>td>a[href^="\/games\/"]):has(>td>input[readonly])')]]
        for tg in tGames:
            if str(tg['game_id']) in addedIds: continue  # skip games already saved
            tgg, tgp = scrape_pakakumi_api(tg['game_id'], tg)
            gData.append(tgg)
            pData += tgp
        for ofn, fh, fd in [(gfn, gHeaders, gData), (pfn, pHeaders, pData)]:
            # append [by setting mode='a'] to avoid over-writing previous data
            if fd: pd.DataFrame(fd)[fh].to_csv(ofn, mode='a', index=False, header=False)
        rowCt += len(gData)
        playCt += len(pData)
        addedIds += [str(g['game_id']) for g in gData]
        toPrint = f'[{loopCt} - {tmoCt}tmo] {len(gData)} rows [of {rowCt}'
        toPrint += f', max {maxRows}], {len(pData)} players [of {playCt}]'
        print('', end=f'\r{toPrint}{" "*80}')
        # sys.stdout.flush() # use ONLY if you notice large delays in printing
        if not gData:
            tmoCt += 1
            tto += 1
            # reload page just in case:
            driver.get(rootUrl)
            wait.until(EC.presence_of_all_elements_located((
                By.CSS_SELECTOR, 'table:has(th.text-center) tbody>tr>td')))
        else: tmoCt = 0
        if isinstance(wAmt, float):
            # float wAmt --> poll on a fixed interval instead of waiting for new rows
            if not gData:
                print('', end=f'\r{toPrint} - waiting {wAmt}sec...')
                # sys.stdout.flush() # use ONLY if you notice large delays in printing
            time.sleep(wAmt)
            continue
        try:
            # escape '/' for the CSS attribute selector, then wait until the previously
            # top-most game has been pushed down [below row wAmt, if wAmt is an int]
            thref = thref.replace('\\/', '/').replace('/', '\\/')
            thSel = 'table:has(th.text-center) tbody'
            if isinstance(wAmt, int) and 1 < wAmt < 39:
                thSel = f'{thSel}>tr:nth-child({wAmt})~tr>td:first-child'
            else: thSel = f'{thSel}>tr:first-child~tr>td:first-child'
            wait.until(EC.presence_of_all_elements_located((
                By.CSS_SELECTOR, f'{thSel}>a[href="{thref}"]')))
        except: tmoCt += 1
    driver.quit()
    del driver  # just in case
    dRet = {
        'timeouts_total': tto, 'timeouts_uncleared': tmoCt,
        'games': rowCt, 'plays': playCt, 'loops': loopCt,
        'games_op': os.path.abspath(gfn),
        'plays_op': os.path.abspath(pfn)
    }
    print('\n\n\n' + '\n'.join([f'{k}: {v}' for k, v in dRet.items()]))
    # sys.stdout.flush() # use ONLY if you notice large delays in printing
    return dRet  # in case you want to maintain a scrape-log...
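
## example entry point [a sketch mirroring the calls listed in the docstring at
## the top - adjust the arguments as needed]: a float wAmt like 5.0 polls every
## 5 seconds, while an int wAmt [1 < wAmt < 39] waits until that many new rows
## have appeared before re-scraping the table
if __name__ == '__main__':
    scrapeLog = scrape_pakakumi(5.0, maxRows=60)
    print('games saved to', scrapeLog['games_op'])
    print('plays saved to', scrapeLog['plays_op'])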