Try95th

proni_scraper

Nov 14th, 2022 (edited)
## an example of how to scrape multiple pages with selenium via js hyperlinks (i.e. without simple-url hrefs)
## scroll to the bottom for some example usages
## to view some sample outputs, go to https://bit.ly/proniSc_gdf
#### [list of inputs and more in "proniScraper_logs.json" -> converted to "_proniScraper_logs.csv"]
#### ABOUT OUTPUT/S: the "Image Viewer" sheet is added later - it is not automatically generated (next step?) ####

## [an alternative to https://stackoverflow.com/questions/74232047 ]

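## assumed requirements: selenium (with a matching ChromeDriver available), beautifulsoup4,
## requests, and pandas with openpyxl (for reading/writing the .xlsx outputs)
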
import os
import sys
import copy
import json
import pandas
import urllib.parse
import requests
from bs4 import BeautifulSoup
from datetime import datetime

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
# (urllib.parse, WebDriverWait, Keys, EC and ActionChains are imported but not used below)


def getErrorMssage(e, excInf, e2Pref=''):
    ## format an exception message with the line number taken from sys.exc_info()
    errMsg = f'{type(e)} on UNKNOWN_LINE - Message: "{str(e)}"'
    try:
        et, em, tb = excInf
        errMsg = f'{et} on line {tb.tb_lineno} - Message: "{em}"'
    except Exception as e2:
        print(f'\n[{e2Pref}] failed to get error line -', str(e2))
    return errMsg


def getObjAttrs(inpObj, isv=False):
    ## collect an object's non-callable, non-private attributes into a dict
    ## (used below to log the webdriver's state when an error occurs)
    aDict = {}
    for d in dir(inpObj):
        try:
            a = getattr(inpObj, d, None)
            if d[0] == '_' or callable(a):
                if isv:
                    print(f'--> [not added] {d} {type(a)} {a}')
                continue

            if type(a) not in [str, dict] and hasattr(a, '__iter__'):
                aDict[d] = [i for i in a]
            else:
                aDict[d] = a
            if isv:
                print(f'--> [added] {d} {type(a)} {a}')
        except Exception as e:
            if isv:
                print(f'!--> {d} {e}')
            aDict[d] = getErrorMssage(e, tuple(sys.exc_info()))
    if isv:
        print(f'\n{aDict}')
    return aDict


def prepForJson(inpObj, foldThresh=50, omitIter=[]):
    ## recursively convert an object into something json.dump can serialize;
    ## values under keys listed in omitIter are summarized instead of expanded,
    ## and long strings (>= foldThresh chars) are wrapped in a single-item list
    if type(inpObj) in [bool, int, float]:
        return inpObj
    if type(inpObj) == dict:
        return {k: (
            f'omitted [{type(v)} with {len(v)} items]'
            if k in omitIter and type(v) in [list, tuple, set]
            else prepForJson(v, foldThresh, omitIter)
        ) for k, v in inpObj.items()}
    if type(inpObj) in [list, tuple, set]:
        return [prepForJson(x, foldThresh, omitIter) for x in inpObj]
    ioStr = str(inpObj)
    return ioStr if len(ioStr) < foldThresh else [ioStr]

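## e.g. prepForJson({'x': 1, 'notes': 'y'*60}) returns {'x': 1, 'notes': ['yyy...yyy']},
## since the 60-character string is not shorter than foldThresh=50
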
def get_opFilename(nameRoot='proni_scraped', folderPath='', ext='.xlsx'):  # fnr, fp, ext
    ## build the next auto-numbered output filename ("{nameRoot}_{N}{ext}") inside folderPath
    if type(folderPath) != str or not folderPath.strip():
        folderPath = None
    prevFileNums = [int(fnc) for fnc in [
        f.replace(nameRoot+'_', '', 1)[:-1*len(ext)]
        for f in os.listdir(folderPath)
        if f.startswith(nameRoot+'_') and f.endswith(ext)
    ] if fnc.isdigit()]
    fileNum = (max(prevFileNums)+1) if prevFileNums else 0
    folderPath = '' if folderPath is None else folderPath.strip()
    opfilename = os.path.join(folderPath, f'{nameRoot}_{fileNum}{ext}')
    print(f'saving to {opfilename}')
    return opfilename

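## e.g. with no prior outputs in the folder, get_opFilename() would return 'proni_scraped_0.xlsx'
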
def downloadRefImgs(valInp, folderp=''):
    ## download every image url recorded in a "Reference Images" sheet (or an
    ## equivalent list of dicts) into one folder per reference under folderp;
    ## returns (path_of_log_csv, total_seconds) or (error_message, total_seconds)
    startTime = datetime.now()

    def checkIfImgKey(k):
        specialImgs = ['cover_image', 'index_image', 'last_image']
        if type(k) == str:
            if k in specialImgs:
                return True
            if k.startswith('image_'):
                return k.replace('image_', '', 1).isdigit()
        return False
    dlg = [{'[rel]imagePath': 'InputFile:'},
           {'[rel]imagePath': 'Total Time (seconds)'},
           {'[rel]imagePath': 'success'}]  # [extra info in top 3 rows]
    if type(valInp) == str:
        valImgs = pandas.read_excel(
            valInp, sheet_name='Reference Images'
        ).to_dict('records')
        dlg[0]['status'] = valInp  # record the input workbook path
    elif type(valInp) == list:
        valImgs = valInp[:]
        dlg[0]['status'] = '[direct input]'
    else:
        valImgs = []

    valImgs = [
        vi for vi in valImgs if type(vi) == dict and
        'ProniReference' in vi and type(vi['ProniReference']) == str
        and 'cover_image' in vi and 'total_images' in vi
    ]
    vln = len(valImgs)
    dlg[0]['imageLink'] = f'with {vln} references'

    dlSuccess = dlFails = dlUnkn = 0
    for i, vi in enumerate(valImgs):
        valRef = vi['ProniReference'].replace('/', '', 2).replace('/', '-')
        ict = len([c for c in vi.keys() if checkIfImgKey(c)])
        aboutVal = f'{ict} images for {valRef} [{i+1} of {vln}]{" "*10}'
        if ict > 0:
            if not os.path.isdir(os.path.join(folderp, valRef)):
                os.mkdir(os.path.join(folderp, valRef))
        for col, v in vi.items():
            if not (checkIfImgKey(col) and type(v) == str):
                continue
            print('', end=f'\rDownloading {col} of {aboutVal}')
            allowedExts = ['jpg', 'jpeg', 'png', 'svg']
            ifne = v.split('.')[-1]
            if not (v.startswith('http') and ifne in allowedExts):
                continue
            img_fn = os.path.join(folderp, valRef)
            img_fn = os.path.join(img_fn, f'{valRef}_{col}.{ifne}')
            for_dlg = {'[rel]imagePath': img_fn,
                       'status': '?unsaved', 'imageLink': v}
            try:
                with open(img_fn, "wb") as f:
                    f.write(requests.get(v).content)
                if os.path.isfile(img_fn):
                    dlSuccess += 1
                    for_dlg['status'] = 'saved'
                else:
                    dlUnkn += 1
            except Exception as err:
                erMsg = getErrorMssage(err, tuple(sys.exc_info()))
                for_dlg['status'] = erMsg
                print('', end=erMsg)
                dlFails += 1
            dlg.append(for_dlg)
    print('')
    totalTime = (datetime.now() - startTime).total_seconds()
    dlg[1]['status'] = totalTime
    dlg[2]['[rel]imagePath'] = f'{dlSuccess} successful'
    dlg[2]['status'] = f'{dlFails} known errors'
    dlg[2]['imageLink'] = f'{dlUnkn} unknown errors'

    dlgfn = get_opFilename('proni_imgdl_logs', folderp, '.csv')
    try:
        pandas.DataFrame(dlg).to_csv(dlgfn, index=False)
        return dlgfn, totalTime
    except Exception as err:
        emsg = getErrorMssage(err, tuple(sys.exc_info()))
        print(f'[failed to save imageDownload logs] {emsg}')
        return emsg, totalTime

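## downloadRefImgs can also be re-run standalone on a previously saved output workbook,
## e.g. (the filename is just an example of the auto-numbered outputs):
# downloadRefImgs('proni_scraped_0.xlsx')

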
def proni_scraper(searchFor, max_pages=10, max_images=5, conf={}, sInd=None):
    ## scrape the search results for searchFor (and, if max_images > 0, the image
    ## viewer of each reference), then save an .xlsx output plus json/csv logs
    for_sl, pageTimes = {
        '[inp] searchFor': searchFor, '[inp] max_pages': max_pages,
        '[inp] max_images': max_images, '[inp] conf': conf
    }, []
    # rootUrl = 'https://apps.proni.gov.uk'
    searchUrl = 'https://apps.proni.gov.uk/Val12B/Search.aspx'
    errorMsg = 'No Errors'
    total_refImages = 'UNKNOWN'
    total_scrapedImages = 'UNKNOWN'
    total_res = 'UNKNOWN'
    # initialized here (not inside the try block) so the summary below can't hit a NameError
    imgdl_logpath = imgdl_time = 'N/A [No images downloaded]'

    startTime = conf['startTime'] if 'startTime' in conf else 0
    if type(startTime) != type(datetime.now()):
        startTime = datetime.now()
    for_sl['startTime'] = startTime.isoformat()

    logSearch = conf['logSearch'] if 'logSearch' in conf else 'proniScraper_logs.json'
    if type(logSearch) == str and logSearch.endswith('.json'):
        try:
            searchLog = json.load(open(logSearch, 'r'))
            searchLog = searchLog if type(
                searchLog) == list else [copy.deepcopy(searchLog)]
            print(f'{len(searchLog)} items in searchLog')
        except Exception as err:
            slEm = getErrorMssage(err, tuple(sys.exc_info()))
            print('Could not retrieve searchLog', slEm)
            searchLog = []
    else:
        print('scrape will not be logged')
        logSearch = False

    errorLog = conf['errorLog'] if 'errorLog' in conf else []
    tryCt = len(errorLog) + 1

    remRetries = conf['remRetries'] if 'remRetries' in conf else 5
    remRetries = remRetries if type(
        remRetries) == int and remRetries >= 0 else 5  # (>= 0 so a passed-in 0 isn't reset to 5 and retried forever)
    fnr = conf['fnr'] if 'fnr' in conf else 'proni_scraped'
    fp = conf['fp'] if 'fp' in conf else ''

    resDets = conf['resDets'] if 'resDets' in conf else []
    valImgs = conf['valImgs'] if 'valImgs' in conf else []
    val_scraped = conf['val_scraped'] if 'val_scraped' in conf else []
    probRefVals = conf['probRefVals'] if 'probRefVals' in conf else []

    scrollElToTop = "arguments[0].scrollIntoView(true);"
    # scrollElToBottom = "arguments[0].scrollIntoView(false);"
    forStr = f'[for "{searchFor}"]'

    try:
        driver = webdriver.Chrome()
        driver.get(searchUrl)

        # in case it opens on a set of search results
        searchAgain = driver.find_elements(By.ID, 'searchAgain')
        if searchAgain:
            searchAgain[0].click()

        pageStart = datetime.now()

        # new search
        driver.find_element(By.ID, 'txtSearch').send_keys(str(searchFor))
        searchBtn = driver.find_element(By.ID, 'btnSearch')
        driver.execute_script(scrollElToTop, searchBtn)
        searchBtn.click()

        total_res = driver.find_elements(
            By.CSS_SELECTOR, '.resultNavButtons .display > span')
        if total_res:
            total_res = total_res[0].get_attribute('innerText').strip()
            total_res = int(total_res[1:-1].split(' of ')[1])
        elif driver.find_elements(By.ID, 'pnlNoResults'):
            total_res = 0
            print('NO RESULTS FOR', searchFor)
            # [no early return here - fall through so the driver still gets
            #  quit and the output/logs below still get written]
        else:
            total_res = 'UNKNOWN'

        print(f'try#{tryCt} [{remRetries} left]')

        # page loop [skipped entirely if the search had no results]
        for pn in range(0 if total_res == 0 else max_pages):

            pgSoup = BeautifulSoup(driver.page_source, 'html.parser')
            # one dict per result row: each cell's span text keyed by the span id
            # (minus its "lbl" prefix), plus the reference number from the row's input
            pageRows = [{
                c.span.get('id').replace('lbl', '', 1)
                if not c.input else 'ProniReference':
                c.span.get_text(strip=True) if not
                c.input else c.input.get('value')
                for c in r.select('td') if c.select('span,input')
            } for r in pgSoup.select('table#gvSearchResults tr:has(td)')]

            resDets += pageRows
            print('page', pn + 1, '-', len(pageRows), 'rows added of',
                  f'min({total_res} rows, {max_pages} pages){forStr:30}')
            if max_images > 0:
                val_new, scrapeVal = list(set([
                    r['ProniReference'] for r in pageRows
                    if r['ProniReference'] not in (val_scraped + probRefVals)
                ])), True
            else:
                val_new, scrapeVal = 'N/A', False
            pageTimes.append((datetime.now() - pageStart).total_seconds())

            for r in (range(len(pageRows)) if scrapeVal else []):
                viStart = datetime.now()
                val_toScrape = [v for v in driver.find_elements(
                    By.CSS_SELECTOR, 'table#gvSearchResults td input[value]'
                ) if v.get_attribute('value') not in (val_scraped + probRefVals)]
                if val_toScrape == []:
                    break

                row_images = {
                    'ProniReference': val_toScrape[0].get_attribute('value'),
                    'total_images': 'UNKNOWN', 'scrapeTime': '?',
                    'cover_image': None, 'index_image': None, 'last_image': None
                }
                if val_toScrape[0].get_attribute('disabled'):
                    sibSpan = val_toScrape[0].find_elements(  # ('./' keeps the xpath relative to the input)
                        By.XPATH, './following-sibling::span[@id="lblPRONI"][@title]')
                    if sibSpan:
                        vMsg = sibSpan[0].get_attribute('title')
                    else:
                        vMsg = 'UNKNOWN REASON'
                    vMsg = f'Failed to scrape - "{vMsg}"'
                    cStatus = f'Skipping {r+1} of {len(val_new)} [{row_images["ProniReference"]}]'
                    probRefVals.append(row_images['ProniReference'])
                    row_images['total_images'] = vMsg
                    row_images['scrapeTime'] = (
                        datetime.now() - viStart).total_seconds()
                    valImgs.append(row_images)
                    print(f'\r{cStatus} - {vMsg}')
                    continue

                driver.execute_script(scrollElToTop, val_toScrape[0])
                val_toScrape[0].click()

                try:
                    row_images['total_images'] = int(driver.find_element(
                        By.CSS_SELECTOR, '.navRow3:has(input[id$="tBtn"]) > span'
                    ).get_attribute('innerText').strip()[1:-1].split(' of ')[1])
                except:
                    row_images['total_images'] = 'FAILED_TO_SCRAPE'
                cStatus = f'Scraping {r+1} of {len(val_new)} [{row_images["ProniReference"]}]'
                imgsCt_c = f'{row_images["total_images"]} val imgs'
                print('', end=f'\r{cStatus} [{imgsCt_c}]')

                for i in range(max_images):
                    imgsCt_c = f'{i+1} of min({row_images["total_images"]}, {max_images})'
                    print('', end=f'\r{cStatus} [{imgsCt_c} val imgs]{" "*10}')
                    imgCol = 'cover_image' if i == 0 else f'image_{i}'
                    row_images[imgCol] = driver.find_element(
                        By.ID, 'ImgCtrlLarge').get_attribute('src')

                    nextBtn = driver.find_elements(By.ID, 'NextBtn')
                    if nextBtn:
                        nextBtn[0].click()
                    else:
                        break
                print('')

                specialImages = [('index_image', 'IndexBtn'),
                                 ('last_image', 'LastBtn')]
                for si in specialImages:
                    try:
                        driver.find_element(By.ID, si[1]).click()
                        row_images[si[0]] = driver.find_element(
                            By.ID, 'ImgCtrlLarge').get_attribute('src')
                    except:
                        row_images[si[0]] = 'FAILED_TO_SCRAPE'

                val_scraped.append(row_images['ProniReference'])
                row_images['scrapeTime'] = (
                    datetime.now() - viStart).total_seconds()
                valImgs.append(row_images)
                driver.find_element(By.ID, 'backButton').click()

            try:
                total_res = int(pgSoup.select_one(
                    '.resultNavButtons .display > span').get_text(
                        strip=True)[1:-1].split(' of ')[1])
            except:
                pass

            nextPage = driver.find_elements(By.ID, 'NextBtn')
            if not nextPage:
                print('No More Next Page')
                break
            else:
                driver.execute_script(scrollElToTop, nextPage[0])
                pageStart = datetime.now()
                nextPage[0].click()

        driver.quit()
        total_refImages = sum([
            d['total_images'] for d in valImgs
            if type(d['total_images']) == int
        ])
        total_scrapedImages = sum([len([
            1 for k, v in d.items() if v is not None and
            (k.replace('image_', '').isdigit() or k == 'cover_image')
        ]) for d in valImgs])

        if 'downloadImages' in conf and conf['downloadImages']:
            if max_images > 0:
                imgdl_logpath, imgdl_time = downloadRefImgs(valImgs, fp)

    except Exception as err:
        errorMsg = getErrorMssage(err, tuple(sys.exc_info()))
        print('\n', errorMsg)
        errorLog.append({
            'conf': prepForJson(copy.deepcopy(conf), omitIter=['errorLog', 'valImgs']),
            'errorMsg': errorMsg,
            'driverJson': prepForJson(getObjAttrs(driver))
            if 'driver' in locals() else '[driver was never created]'
        })

        if remRetries > 0:
            try:
                driver.quit()
                del driver
            except Exception as e:
                print('!unable to quit+del driver',
                      getErrorMssage(e, tuple(sys.exc_info())))
            conf['remRetries'] = remRetries - 1
            conf['startTime'] = startTime  # preserve timing and scraped rows across retries
            conf['resDets'] = resDets
            conf['valImgs'] = valImgs
            conf['val_scraped'] = val_scraped
            conf['probRefVals'] = probRefVals
            conf['errorLog'] = copy.deepcopy(errorLog)
            conf['logSearch'] = logSearch
            return proni_scraper(
                searchFor=searchFor, max_pages=max_pages,
                max_images=max_images, conf=conf, sInd=sInd
            )

    for i, el in enumerate(errorLog):
        try:
            html_fn = get_opFilename('error_pgSrc', fp, '.html')
            pgSrc = el['driverJson']['page_source']
            pgSrc = pgSrc[0] if type(pgSrc) == list and pgSrc else pgSrc
            with open(html_fn, 'wb') as f:
                f.write(str(pgSrc).encode('utf-8'))
            errorLog[i]['driverJson']['page_source'] = html_fn
        except Exception as err:
            errorLog[i]['html_save_error'] = getErrorMssage(
                err, tuple(sys.exc_info()), 'html_save_error')

    errLg_fn = None
    try:
        if errorLog:
            errLg_fn = get_opFilename('proni_errorLog', fp, '.json')
            errorLog = prepForJson(errorLog, omitIter=['errorLog', 'valImgs'])
            with open(errLg_fn, 'w') as f:
                json.dump(errorLog, f, indent=4)
    except Exception as err:
        errLg_fn = getErrorMssage(err, tuple(sys.exc_info()), 'save_errLg')
        print('\n', errLg_fn)

    secRow = (sum(pageTimes)/len(resDets)) if resDets and pageTimes else 'N/A'
    secPage = (sum(pageTimes)/len(pageTimes)) if pageTimes else 'N/A'
    secVI = [vi['scrapeTime'] for vi in valImgs if 'scrapeTime' in vi]
    secRef = (sum(secVI)/len(valImgs)) if secVI and valImgs else 'N/A'
    secVI = (sum(secVI)/(
        total_scrapedImages if type(total_scrapedImages) == int
        and total_scrapedImages else 1  # (the total may still be 'UNKNOWN' after an error)
    )) if secVI else 'N/A'
    totalTime = datetime.now() - startTime

    # input('Enter')
    dfRefs = [
        {
            'data': [
                ('Search Keywords', searchFor),
                ('Maximum Pages [param]', max_pages),
                ('Maximum Images [param]', max_images),
                ('', ''),
                ('Total Results Available', total_res),
                ('Total Results Scraped', len(resDets)),
                ('', ''),
                ('Total References Scraped', len(valImgs)),
                ('Total Reference Images', total_refImages),
                ('Total Images Scraped', total_scrapedImages),
                ('Image-Download Log [filename]', imgdl_logpath),
                ('Problematic Refs', probRefVals),
                ('', ''),
                ('Start Time', startTime.isoformat()),
                ('Time Taken [tdeltaStr]', str(totalTime)),
                ('Time Taken [seconds]', totalTime.total_seconds()),
                ('Average Time per Page', secPage),
                ('Average Time per Row', secRow),
                ('Average Time per Reference', secRef),
                ('Average Time per Image', secVI),
                ('Total Image Download Time', imgdl_time),
                ('', ''),
                ('Tries Taken', tryCt),
                ('Failed Tries', len(errorLog)),
                ('Last Error Message', errorMsg),
                ('Error Log [filename]', errLg_fn)
            ], 'name': 'Search Summary', 'h': False
        },
        {'name': 'Search Results', 'data': resDets, 'h': True},
        {'name': 'Reference Images', 'data': valImgs, 'h': True}
    ]

    for k, v in dfRefs[0]['data']:
        skipKeys = ['Search Keywords', 'Maximum Pages [param]']
        skipKeys += ['Maximum Images [param]', 'Start Time']
        if k and k not in skipKeys:
            for_sl[str(k).replace(' ', '_')] = v

    sInd = sInd if type(sInd) == int and sInd > -1 else None
    if type(fnr) == str:
        try:
            op_fn = get_opFilename(fnr, fp)
            if sInd is not None:
                op_fn = op_fn.replace('.xlsx', f'_op-{sInd}.xlsx')
                print(f'[filename changed to {op_fn}]')
            with pandas.ExcelWriter(op_fn) as w:
                for r in dfRefs:
                    pandas.DataFrame(r['data']).to_excel(
                        w, sheet_name=r['name'], index=False, header=r['h'])
            for_sl['op_excel'] = op_fn
        except Exception as e:
            errorMsg = getErrorMssage(e, tuple(sys.exc_info()), 'ExcelWriter')
            print(errorMsg)
            op_fn = errorMsg
            for_sl['error_ExcelWriter'] = errorMsg

    if logSearch:
        try:
            searchLog = prepForJson(
                searchLog, omitIter=['errorLog', 'valImgs'])
            with open(logSearch, 'w') as f:
                json.dump(searchLog + [for_sl], f, indent=4)
        except Exception as e:
            print(getErrorMssage(e, tuple(sys.exc_info())))

    if type(fnr) != str:
        return dfRefs

    return op_fn if sInd is None else (op_fn, dfRefs)

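## (return value, per the branches above: the output .xlsx path; (op_fn, dfRefs) when sInd is
##  an int >= 0; or the raw dfRefs list when conf['fnr'] is not a string)

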
def proniList_scraper(searchList, max_pages=10, max_images=5, conf={}):
    ## run proni_scraper for every term in searchList and combine
    ## the per-search outputs into one workbook / one log entry
    if searchList == "allVals":
        conf['[orig] searchList'] = "allVals"
        searchList = [f'VAL/12/B/{i}' for i in range(1, 10)]
        return proniList_scraper(searchList, max_pages, max_images, conf)

    startTime = datetime.now()
    logSearch = conf['logSearch'] if 'logSearch' in conf else 'proniScraper_logs.json'
    if not (type(logSearch) == str and logSearch.endswith('.json')):
        logSearch = False
    for_sl = {
        '[inp] searchList': searchList, '[inp] max_pages': max_pages,
        '[inp] max_images': max_images, '[inp] conf': conf
    }

    resCombo, sik, rConf = [], 'search_index', dict(conf.items())
    if 'indivLogs' in conf:  # (rConf must be defined before this check)
        rConf['logSearch'] = conf['indivLogs']
        del rConf['indivLogs']
    fnr = conf['fnr'] if 'fnr' in conf else 'proni_scraped'
    fp = conf['fp'] if 'fp' in conf else ''

    for_sl['startTime'] = startTime.isoformat()

    resList = [proni_scraper(
        s, max_pages=max_pages, max_images=max_images, conf=rConf, sInd=i
    ) for i, s in enumerate(searchList) if s and type(s) == str]

    for_sl['individual_outputs'] = []
    for i, (rfn, r) in enumerate(resList):
        for_sl['individual_outputs'].append(rfn)
        for dfRef in r:
            if 'data' not in dfRef or 'name' not in dfRef:
                continue
            if dfRef['name'] == 'Search Summary':
                dfRef['data'] = [
                    {k: v for k, v in ([(sik, i)] + dfRef['data']) if k}]
            else:
                for dri, drr in enumerate(dfRef['data']):
                    dfRef['data'][dri] = {
                        k: v for k, v in ([(sik, i)] + list(drr.items()))}

            if not [rc for rc in resCombo if rc['name'] == dfRef['name']]:
                resCombo.append(dfRef)
                continue
            for rci, rc in enumerate(resCombo):
                if rc['name'] == dfRef['name']:
                    resCombo[rci]['data'] += dfRef['data']

    if type(fnr) == str:
        opfn = get_opFilename(fnr, fp)
        with pandas.ExcelWriter(opfn) as w:
            for r in resCombo:
                pandas.DataFrame(r['data']).to_excel(
                    w, sheet_name=r['name'], index=False, header=True)
        finalOp = opfn
        for_sl['op_excel'] = opfn
    else:
        finalOp = resCombo

    totalTime = datetime.now() - startTime
    for_sl['Total Time (stringified)'] = str(totalTime)
    for_sl['Total Time (seconds)'] = totalTime.total_seconds()
    if logSearch:
        try:
            searchLog = json.load(open(logSearch, 'r'))
            searchLog = searchLog if type(
                searchLog) == list else [copy.deepcopy(searchLog)]
            print(f'found {len(searchLog)} logs at "{logSearch}"')
        except Exception as err:
            slEm = getErrorMssage(err, tuple(sys.exc_info()))
            print(f'Could not retrieve searchLog from "{logSearch}" -', slEm)
            searchLog = []

        try:
            searchLog = prepForJson(
                searchLog, omitIter=['errorLog', 'valImgs'])
            with open(logSearch, 'w') as f:
                json.dump(searchLog + [for_sl], f, indent=4)
        except Exception as e:
            print(getErrorMssage(e, tuple(sys.exc_info())))

    return finalOp


'''
## [to convert logs to csv] ##
sl = json.load(open('proniScraper_logs.json', 'r'))
print(len(sl), 'logs')
pandas.DataFrame(sl).to_csv('_proniScraper_logs.csv', index=False)
# '''


# proni_scraper('Great Victoria', conf={'downloadImages': True})
# proni_scraper('Rossconor', 10, 10, conf={'downloadImages': True})
# proniList_scraper(['Camus', 'Downing', 'Kilmore'], 1, 1)

# to scrape all listings, but without ANY reference images
# proniList_scraper('allVals', 100000, 0)

# proniList_scraper(['Camus', 'Downing', 'Kilmore'], 1, 0, conf={'indivLogs': False})
# proni_scraper('Cavan', 100, 1000)

# to scrape all references and download all images
# proniList_scraper('allVals', 100000, 1000, conf={'downloadImages': True})

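# a minimal guarded entry point, if running this file directly (the search term is just an example):
# if __name__ == '__main__':
#     print(proni_scraper('Belfast', max_pages=2, max_images=2))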