## pakakumi_so_q_74848587 [Try95th, Dec 26th, 2022]
## for https://stackoverflow.com/q/74848587/6146136
## sample outputs at https://docs.google.com/spreadsheets/d/1E1QBKqaWoCB2SPAMyz0A_oe58MaV-yQbaSBGxGMBPC8
'''
############# calls used for sample outputs #############
scrape_pakakumi(5.0, 60)
scrape_pakakumi_lim(60)
scrape_pakakumi(10, maxWait=400) ## stopped using keyboard interrupt
#########################################################
'''

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

import os # needed for os.path.isfile and os.path.abspath in both scrapers
import requests # only needed for scrape_pakakumi_api
import time # not needed for scrape_pakakumi_lim
# import sys # uncomment together with the sys.stdout.flush() lines below
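
## The imports above are the only third-party requirements; assuming a fresh
## environment, they can be installed from PyPI [the package names below are
## the standard ones, not anything specific to this paste]:
##   pip install selenium beautifulsoup4 pandas requests
## Depending on the Selenium version, webdriver.Chrome() may also need a
## chromedriver executable that matches the installed Chrome [the original
## comments below just copy chromedriver.exe next to this file].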

############################################### SHORTER VERSION ###############################################
def scrape_pakakumi_lim(maxRows=50, max_tmo=10, maxWait=100, opfn='pakakumi.csv'):
    ## keep scraping the live games table on play.pakakumi.com until maxRows
    ## unique games have been collected [or max_tmo waits have timed out],
    ## then save everything to opfn in one go
    maxWait = maxWait if isinstance(maxWait, (int, float)) and maxWait > 0 else 100
    maxRows = maxRows if isinstance(maxRows, int) and maxRows > 0 else 50
    max_tmo = max_tmo if isinstance(max_tmo, int) and max_tmo > 0 else 10
    opfn = opfn if isinstance(opfn, str) and opfn[-4:] == '.csv' else 'pakakumi.csv'
    rootUrl = 'https://play.pakakumi.com'

    # driver = webdriver.Chrome(path)
    # [I just copy chromedriver.exe to the same folder as this py file]
    driver = webdriver.Chrome()
    wait = WebDriverWait(driver, maxWait)
    driver.get(rootUrl)
    wait.until(EC.presence_of_all_elements_located((
        By.CSS_SELECTOR, 'table:has(th.text-center) tbody>tr>td')))

    tmoCt, rowCt, totalScrapes = 0, 0, 0
    addedIds, games, thref = [], [], ''
    while rowCt < maxRows and tmoCt < max_tmo:
        scrapeTime = datetime.now()
        tSoup = BeautifulSoup(driver.find_element(
            By.CSS_SELECTOR, 'table:has(th.text-center) tbody'
        ).get_attribute('outerHTML'), 'html.parser')
        thref = tSoup.select_one(
            'tr:first-child>td:first-child>a[href]'
        ).get('href')

        tGames = [{
            'scraped_at': scrapeTime.isoformat(),
            'game_id': a.get('href').replace('/games/', '').split('/')[0],
            'crash': a.get_text(' ', strip=True), 'hash': i.get('value'),
            'url': f"{rootUrl}{a.get('href')}"
        } for a, i in [(
            r.select_one(r'td>a[href^="\/games\/"]'),
            r.select_one('td>input[readonly]')
        ) for r in tSoup.select(
            r'tr:has(>td>a[href^="\/games\/"]):has(>td>input[readonly])'
        )]]
        totalScrapes += len(tGames)
        tGames = [t for t in tGames if t['game_id'] not in addedIds]

        rowCt += len(tGames)
        games += tGames
        addedIds += [t['game_id'] for t in tGames]
        print('', end=f'\r{len(tGames)} rows [of {rowCt}, max {maxRows}]')
        # sys.stdout.flush() # use ONLY if you notice large delays in printing

        try:
            ## wait until the top row's game link changes, i.e. at least one
            ## new game has been added since the last scrape
            thref = thref.replace('\\/', '/').replace('/', '\\/')
            thSel = 'table:has(th.text-center) tbody>tr:first-child>'
            thSel += r'td:first-child>a[href^="\/games\/"]'
            wait.until(EC.presence_of_all_elements_located((
                By.CSS_SELECTOR, f'{thSel}:not([href="{thref}"])')))
        except: tmoCt += 1

    driver.quit()
    del driver  # just in case

    print(f'\n{tmoCt} timeouts, {rowCt} rows [{totalScrapes} scrapes]')
    pd.DataFrame(games).to_csv(opfn, index=False)
    print(f'saved to {os.path.abspath(opfn)}')
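
## A quick usage sketch for the shorter version [not in the original paste]:
## the call mirrors scrape_pakakumi_lim(60) from the sample calls at the top,
## and the read_csv lines just show one way to load the saved rows back for a
## look; 'pakakumi.csv' is simply the function's default output filename.
# scrape_pakakumi_lim(60)
# df_lim = pd.read_csv('pakakumi.csv')
# print(df_lim[['scraped_at', 'game_id', 'crash']].head())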

###############################################################################################################
###############################################################################################################

def scrape_pakakumi_api(gameId, prevData):
    ## fetch full details [and the list of plays] for a single game from the
    ## pakakumi API; prevData [the row scraped from the page] supplies fallback
    ## values for any fields missing from the API response
    apiUrl = f'https://api.pakakumi.com/games/{gameId}'
    gKeys = ['id', 'crash', 'hash', 'created_at', 'plays']
    pKeys = ['game_id', 'user_id', 'bet', 'stopped_at', 'username']

    try:
        resp = requests.get(apiUrl, headers={
            'Accept': 'application/json, text/plain, */*',
            'Accept-Api-Version': '45'
        })
        resp.raise_for_status()
        jData = resp.json()
        gData = {('game_id' if k == 'id' else k): (
            jData[k] if k in jData else None
        ) for k in gKeys[:-1]}
        mKeys = [k for k in gKeys[1:] if k not in jData]
        errMsg = f'missing keys {", ".join(mKeys)} | ' if mKeys else ''
        if f'{jData["id"]}' != f'{gameId}':
            errMsg += f'game_id should be "{gameId}" not "{jData["id"]}" | '
        if 'plays' in jData and jData['plays'] == []: errMsg += 'no plays | '
        jpData = jData['plays'] if 'plays' in jData else []
        if not isinstance(jpData, list):
            # record the unexpected type BEFORE clearing jpData
            errMsg += f'plays should be list, not {type(jpData)} | '
            jpData = []
        pData = [
            {k: p[k] if k in p else None for k in pKeys} for p in jpData
            if isinstance(p, dict) and 'user_id' in p and 'game_id' in p
        ]
        if len(pData) != len(jpData):
            errMsg += f'{len(jpData)} plays reduced to {len(pData)} | '
        gData['msg'] = f'[req_status: {resp.status_code} {resp.reason}] '
        gData['msg'] += f'[scraped_at: {datetime.now().isoformat(" ")}] '
        if errMsg.strip():
            gData['msg_type'] = 'warning' if (
                'id' in jData and f'{jData["id"]}' == f'{gameId}'
            ) else 'error'
            gData['msg'] += errMsg.strip(' | ').strip()
        else: gData['msg_type'] = 'success'
    except Exception as e:
        gData = {k: None for k in (['game_id'] + gKeys[1:-1])}
        gData['game_id'], pData = gameId, []
        gData['msg_type'], gData['msg'] = 'error', f'{type(e)} {e}'

    # fall back to the values scraped from the page for any fields the API
    # did not return [or that came back empty]
    for k in gKeys[1:-1]:
        if not gData[k] and k in prevData: gData[k] = prevData[k]
    return gData, pData
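
## A standalone usage sketch for scrape_pakakumi_api [not in the original
## paste]: the game id below is a made-up placeholder, and passing an empty
## dict as prevData just means there are no scraped-from-page fallback values.
# gData, pData = scrape_pakakumi_api(1234567, {})
# print(gData['msg_type'], '-', gData['msg'])
# print(f'{len(pData)} plays returned')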

def scrape_pakakumi(wAmt=10, maxRows=999, max_tmo=10, maxWait=20, opfn='pakakumi.csv'):
    ## keep scraping new games from the live table, fetch each game's details
    ## and plays via scrape_pakakumi_api, and append the results to two CSVs
    ## [games_... and gamePlayers_...] as the loop goes along
    ## wAmt controls pacing: a float means "sleep wAmt seconds whenever nothing
    ## new was found"; an int between 2 and 38 means "wait until the previous
    ## top game has been pushed below row wAmt"
    maxWait = maxWait if isinstance(maxWait, (int, float)) and maxWait > 0 else 20
    maxRows = maxRows if isinstance(maxRows, int) and maxRows > 0 else None
    maxIds = maxRows if maxRows and 100 < maxRows < 500 else 100
    max_tmo = max_tmo if isinstance(max_tmo, int) and max_tmo > 0 else 10
    opfn = opfn if isinstance(opfn, str) and opfn[-4:] == '.csv' else 'pakakumi.csv'
    wAmt = wAmt if isinstance(wAmt, (int, float)) and wAmt > 0 else None

    gfn = f'games_{opfn}'
    pfn = f'gamePlayers_{opfn}'
    gHeaders = ['game_id', 'crash', 'hash', 'created_at', 'msg_type', 'msg']
    pHeaders = ['game_id', 'user_id', 'bet', 'stopped_at', 'username']
    print('Output will be saved to: ')
    for fn, h in [(gfn, gHeaders), (pfn, pHeaders)]:
        if not os.path.isfile(fn):
            # initiate new file with header row [helpful for appending in loop]
            pd.DataFrame([{k: k for k in h}]).to_csv(fn, index=False, header=False)
        print(f'    {os.path.abspath(fn)}')
    try:
        prevData = pd.read_csv(gfn).to_dict('records') # get data from previous scrape
        addedIds = [str(g['game_id']) for g in prevData if 'game_id' in g][-1*maxIds:]
    except: addedIds = []

    rootUrl = 'https://play.pakakumi.com'
    # driver = webdriver.Chrome(path)
    # [I just copy chromedriver.exe to the same folder as this py file]
    driver = webdriver.Chrome()
    wait = WebDriverWait(driver, maxWait)
    driver.get(rootUrl)
    wait.until(EC.presence_of_all_elements_located((
        By.CSS_SELECTOR, 'table:has(th.text-center) tbody>tr>td')))

    tmoCt, tto, rowCt, loopCt, playCt, thref = 0, 0, 0, 0, 0, ''
    while tmoCt < max_tmo:
        if maxRows and rowCt > maxRows: break
        pData, gData = [], []
        addedIds = addedIds[-1*maxIds:]
        loopCt += 1

        tSoup = BeautifulSoup(driver.find_element(
            By.CSS_SELECTOR, 'table:has(th.text-center) tbody'
        ).get_attribute('outerHTML'), 'html.parser')
        thref = tSoup.select_one(
            'tr:first-child>td:first-child>a[href]'
        ).get('href')

        tGames = [{
            'game_id': a.get('href').replace('/games/', '').split('/')[0],
            'crash': a.get_text(' ', strip=True), 'hash': i.get('value'),
            'url': f"{rootUrl}{a.get('href')}"
        } for a, i in [(
            r.select_one(r'td>a[href^="\/games\/"]'), r.select_one('td>input[readonly]')
        ) for r in tSoup.select(r'tr:has(>td>a[href^="\/games\/"]):has(>td>input[readonly])')]]

        for tg in tGames:
            if str(tg['game_id']) in addedIds: continue
            tgg, tgp = scrape_pakakumi_api(tg['game_id'], tg)
            gData.append(tgg)
            pData += tgp
        for ofn, fh, fd in [(gfn, gHeaders, gData), (pfn, pHeaders, pData)]:
            # append [by setting mode='a'] to avoid over-writing previous data
            if fd: pd.DataFrame(fd)[fh].to_csv(ofn, mode='a', index=False, header=False)

        rowCt += len(gData)
        playCt += len(pData)
        addedIds += [str(g['game_id']) for g in gData]

        toPrint = f'[{loopCt} - {tmoCt}tmo] {len(gData)} rows [of {rowCt}'
        toPrint += f', max {maxRows}], {len(pData)} players [of {playCt}]'
        print('', end=f'\r{toPrint}{" "*80}')
        # sys.stdout.flush() # use ONLY if you notice large delays in printing

        if not gData:
            tmoCt += 1
            tto += 1

            # reload page just in case:
            driver.get(rootUrl)
            wait.until(EC.presence_of_all_elements_located((
                By.CSS_SELECTOR, 'table:has(th.text-center) tbody>tr>td')))
        else: tmoCt = 0

        if isinstance(wAmt, float):
            if not gData:
                print('', end=f'\r{toPrint} - waiting {wAmt}sec...')
                # sys.stdout.flush() # use ONLY if you notice large delays in printing
                time.sleep(wAmt)
            continue

        try:
            ## wait until the game that was at the top of the table has been
            ## pushed further down [i.e., enough new games have come in]
            thref = thref.replace('\\/', '/').replace('/', '\\/')
            thSel = 'table:has(th.text-center) tbody'
            if isinstance(wAmt, int) and 1 < wAmt < 39:
                thSel = f'{thSel}>tr:nth-child({wAmt})~tr>td:first-child'
            else: thSel = f'{thSel}>tr:first-child~tr>td:first-child'
            wait.until(EC.presence_of_all_elements_located((
                By.CSS_SELECTOR, f'{thSel}>a[href="{thref}"]')))
        except: tmoCt += 1

    driver.quit()
    del driver  # just in case

    dRet = {
        'timeouts_total': tto, 'timeouts_uncleared': tmoCt,
        'games': rowCt, 'plays': playCt, 'loops': loopCt,
        'games_op': os.path.abspath(gfn),
        'plays_op': os.path.abspath(pfn)
    }
    print('\n\n\n'+'\n'.join([f'{k}: {v}' for k, v in dRet.items()]))
    # sys.stdout.flush() # use ONLY if you notice large delays in printing
    return dRet # in case you want to maintain a scrape-log...
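
## Minimal run-as-a-script sketch [not part of the original paste]: it mirrors
## the first sample call listed at the top and keeps a simple scrape-log by
## appending the dict returned by scrape_pakakumi to a CSV; 'scrape_log.csv'
## and the argument values below are just example choices.
if __name__ == '__main__':
    retLog = scrape_pakakumi(5.0, 60)  # poll every 5s, stop after ~60 games
    retLog['finished_at'] = datetime.now().isoformat(' ')
    pd.DataFrame([retLog]).to_csv(
        'scrape_log.csv', mode='a', index=False,
        header=not os.path.isfile('scrape_log.csv'))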