queue_scrawler_reqs
Try95th | Nov 18th, 2022 (edited)
## for scraping urls and maintaining queues ##
## a simplified version [using csv, json and global variables instead of a database] ##

## example of usage at https://pastebin.com/X62sxaNk


from urllib.parse import urljoin
import pandas, json

##################################################################################################
### def linkToSoup... COPY [WITH IMPORTS] from https://pastebin.com/rBTr06vy [adjust as needed] ###
##################################################################################################
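## [a minimal stand-in sketch of linkToSoup, assuming requests + BeautifulSoup, in case you don't
##  copy the original; `isv` is read here as a verbosity flag and `returnErr` as "return the error
##  string instead of None on failure" - both readings are assumptions based on how it's called below]
import requests
from bs4 import BeautifulSoup

def linkToSoup(tUrl, isv=True, returnErr=False):
    try:
        r = requests.get(tUrl, headers={'User-Agent': 'Mozilla/5.0'})
        if r.status_code == 200 and 'html' in r.headers.get('Content-Type', ''):
            return BeautifulSoup(r.content, 'html.parser') ## parsed page on success
        errMsg = f'<{r.status_code} {r.reason}> from {tUrl}'
    except Exception as e:
        errMsg = f'failed to fetch {tUrl} - {e!r}'
    if isv: print(errMsg)
    return errMsg if returnErr else None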

starterUrl = 'SOME_VALID_URL'

## could be stored in a database, but instead:
pageLimit = 10
maxScrapes = 30
scrapeCt = 0
curUrlId = 0
scrawlLogs = {} ## [in a database this would be one or more tables]
skipUrls = []

def get_next_fromScrawlQ():
    ## could be interacting with database instead of:
    global maxScrapes, scrawlLogs
    if scrawlLogs == {}: return starterUrl
    if scrapeCt > maxScrapes: return None
    elig = [k for k, l in scrawlLogs.items() if l['status'] == 'queued'][:1]
    return elig[0] if elig else None

def get_urlId(iUrl, refUrlId=None):
    ## could be interacting with database instead of:
    global curUrlId, scrawlLogs
    print('', end=f'\r[curUrlId={curUrlId}] getting id for {iUrl}')
    if (not iUrl) or iUrl[0] == '!': iUrl = f'{iUrl}_{curUrlId+1}'
    if iUrl not in scrawlLogs:
        if not refUrlId: return None
        scrawlLogs[iUrl] = {'url': iUrl, 'refUrlId': refUrlId, 'status': None}
    if 'urlId' in scrawlLogs[iUrl] and scrawlLogs[iUrl]['urlId']:
        return scrawlLogs[iUrl]['urlId']
    curUrlId += 1
    scrawlLogs[iUrl]['urlId'] = curUrlId
    return curUrlId

def addUrl_to_queue(aUrl, rootUrl, refUrlId, qStatus='queued'):
    ## could be interacting with database instead of:
    global scrawlLogs, curUrlId
    aUrl = urljoin(rootUrl, aUrl)
    if aUrl.startswith('javascript:'): return False
    auId = get_urlId(aUrl, refUrlId=refUrlId)
    print('', end=f'\r[curUrlId={curUrlId}] Adding [#{auId}]: {aUrl}')
    if scrawlLogs[aUrl]['status'] in [None, 'retry']:
        scrawlLogs[aUrl]['status'] = 'skip' if aUrl in skipUrls else qStatus
        return not (aUrl in skipUrls)
    return False

def logScrape(scrapeRes):
    ## could be interacting with database instead of:
    global pageLimit, scrawlLogs, curUrlId, scrapeCt
    sUrl = scrapeRes['url'] if 'url' in scrapeRes else '!missingUrl'
    auId, em = get_urlId(sUrl), '' # initiate url logId and error message
    print('', end=f'\r[{curUrlId}] Logging [{scrapeCt}][#{auId}]: {sUrl}')

    if type(scrapeRes) != dict:
        em = f'ScrapeResults should be <dict> not {type(scrapeRes)}'
        scrawlLogs[sUrl]['errorMessage'] = f'Invalid Format - {em}'
        scrawlLogs[sUrl]['stringifiedResult'] = str(scrapeRes)
        scrawlLogs[sUrl]['status'] = '!fail'
        return get_next_fromScrawlQ()
    for k, v in scrapeRes.items():
        if k != 'pageUrls': scrawlLogs[sUrl][k] = v
    if sUrl == '!missingUrl':
        scrapeRes['errorMessage'], em = '!missingUrl', '[missingUrl]'

    pul = scrapeRes['pageUrls'] if 'pageUrls' in scrapeRes else []
    pageUrls, ntq = (pul[:] if type(pul) == list else []), 0
    for i, pl in enumerate(pageUrls):
        qstat = 'queued'
        if type(pageLimit) == int and pageLimit < ntq:
            qstat = 'page_limit_exceeded'
        ntq += int(addUrl_to_queue(pl, sUrl, auId, qstat))
        ## [look up the id of the resolved url after queueing so newly added urls are counted too]
        pageUrls[i] = get_urlId(urljoin(sUrl, pl))

    pageUrls = [p for p in pageUrls if p]
    scrawlLogs[sUrl]['pageUrlCt'] = len(pageUrls)
    scrawlLogs[sUrl]['newToQueue'] = ntq
    if pageUrls: scrawlLogs[sUrl]['pageUrlIds'] = pageUrls
    scrawlLogs[sUrl]['status'] = f'!error{em}' if (
        'errorMessage' in scrapeRes) else (('' if pageUrls else '?')+'scraped')
    return get_next_fromScrawlQ()



def getPageUrls(sUrl, scSoup):
    pUrls = [a.get('href') for a in scSoup.select('a[href]')]
    ## CAN ADD SOME FILTERS, [? MAYBE BASED ON sUrl ?] [one option is sketched below] ##
    return pUrls
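
## [a hedged sketch of one such filter: keep only same-domain links, resolved to absolute urls;
##  `filterSameDomain` is a hypothetical helper, not part of the original paste]
from urllib.parse import urlparse

def filterSameDomain(sUrl, pUrls):
    sDomain = urlparse(sUrl).netloc
    keep = []
    for u in pUrls:
        full = urljoin(sUrl, u) ## resolve relative hrefs against the source url
        if full.split('#')[0] == sUrl.split('#')[0]: continue ## drop links back to the same page
        if urlparse(full).netloc == sDomain: keep.append(full)
    return keep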


def getDataFromPage(srcUrl, sSoup):
    pageUrls = getPageUrls(srcUrl, sSoup)

    ######################## EXTRACT SOME PAGE DATA ########################
    ## [only extracting page title and [truncated] body text]
    pageTitle = sSoup.select_one('head title')
    pageTitle = pageTitle.get_text().strip() if pageTitle else None
    pageText = ' '.join([word for word in (
        sSoup.body if sSoup.body else sSoup
    ).get_text().split() if word])
    if len(pageText) > 70: pageText = pageText[:33] + '...' + pageText[-33:]
    ## for a function that can extract various data from html,
    #### take a look at "htreeToDict" at https://pastebin.com/BpjZSQPi
    #### [returns a python dictionary containing specified details]
    ########################################################################

    return {
        'url': srcUrl, 'pageTitle': pageTitle,
        'pageText': pageText, 'pageUrls': pageUrls
    }


def scrapeUrl(sUrl):
    global curUrlId, scrapeCt
    auId = get_urlId(sUrl, '[starter]' if sUrl == starterUrl else None)
    scrawlLogs[sUrl]['status'], scrapeCt = 'scraping', scrapeCt + 1
    print('', end=f'\r[{curUrlId}] Scraping [{scrapeCt}][#{auId}]: {sUrl}')
    scSoup = linkToSoup(sUrl, isv=False, returnErr=True)
    if type(scSoup) == str:
        return {'url': sUrl, 'errorMessage': scSoup}
    return getDataFromPage(sUrl, scSoup)


def setGlobals(varDict, clearLog=False):
    ## could be interacting with database instead of:
    global starterUrl, pageLimit, maxScrapes, scrapeCt, curUrlId, skipUrls
    if 'starterUrl' in varDict: starterUrl = varDict['starterUrl']
    if 'pageLimit' in varDict: pageLimit = varDict['pageLimit']
    if 'curUrlId' in varDict: curUrlId = varDict['curUrlId']
    if 'skipUrls' in varDict:
        skipUrls = list(set(varDict['skipUrls'] + skipUrls))
    if 'maxScrapes' in varDict: maxScrapes = varDict['maxScrapes']
    if 'scrapeCt' in varDict: scrapeCt = varDict['scrapeCt']

    global scrawlLogs
    if clearLog: scrawlLogs = {}

## [probably wouldn't need if using a database]
def saveScrawlSess(logPath, varsPath, logMode='w'):
    logPath, varsPath = str(logPath), str(varsPath)
    if logMode != 'a': logMode = 'w'
    global starterUrl, pageLimit, maxScrapes, scrapeCt, curUrlId, skipUrls
    global scrawlLogs

    scrawlDF = pandas.DataFrame(list(scrawlLogs.values()))
    scrawlDF.to_csv(logPath, index=False, mode=logMode)
    print('Saved ScrawlLogs to', logPath)

    with open(varsPath, 'w') as f: json.dump({
        'starterUrl': starterUrl, 'pageLimit': pageLimit,
        'maxScrapes': maxScrapes, 'scrapeCt': scrapeCt,
        'curUrlId': curUrlId, 'skipUrls': skipUrls
    }, f, indent=4)
    print('Saved globals to', varsPath)

## [probably wouldn't need if using a database]
def loadScrawlSess(logPath, varsPath, mode='continue'):
    global scrawlLogs
    prevLog, prevVars = [], {}
    try: prevLog = pandas.read_csv(logPath).to_dict('records')
    except Exception as e: print(f'Unable to load log from {logPath}: {e}')
    try: prevVars = json.load(open(varsPath, 'r'))
    except Exception as e: print(f'Unable to load vars from {varsPath}: {e}')

    scrawlLogs = {r['url']: r for r in (list(scrawlLogs.values()) + prevLog)}
    if mode != 'continue': prevVars['scrapeCt'] = 0
    setGlobals(prevVars)

    ## [if using database, this could be a scheduled procedure]
    if mode == 'q<--page_limit_exceeded':
        eligRefs, eligUrls = list(set([
            l['refUrlId'] for l in scrawlLogs.values()
            if l['status'] == 'page_limit_exceeded'
        ])), []
        for r in eligRefs:
            ## [only re-queue urls that were actually cut off by the page limit]
            eligUrls += [
                u for u, l in scrawlLogs.items()
                if l['refUrlId'] == r and l['status'] == 'page_limit_exceeded'
            ][:pageLimit]
        for u in eligUrls: scrawlLogs[u]['status'] = 'queued'

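
## [a minimal driver sketch of how these pieces might fit together; the original usage example
##  is at https://pastebin.com/X62sxaNk - the loop, limits, and file names below are assumptions,
##  and 'https://example.com' is just a placeholder starter url]
if __name__ == '__main__':
    setGlobals({'starterUrl': 'https://example.com', 'pageLimit': 5, 'maxScrapes': 10})
    nextUrl = get_next_fromScrawlQ()   ## returns starterUrl on the first call
    while nextUrl:
        scrapeRes = scrapeUrl(nextUrl)    ## fetch + parse [or a dict with an errorMessage]
        nextUrl = logScrape(scrapeRes)    ## log results, queue new urls, get the next queued url
    saveScrawlSess('scrawlLogs.csv', 'scrawlVars.json')

    ## to resume later [re-queueing urls that hit the page limit]:
    # loadScrawlSess('scrawlLogs.csv', 'scrawlVars.json', mode='q<--page_limit_exceeded')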