queue_scrawler_classDef
Try95th | Jan 29th, 2023 (edited) | Python

## for scraping urls according to queues and maintaining said queues ##################
## A more object-oriented version of the original [ https://pastebin.com/TBtYja5D ] ###
## a simplified version [using csv, json and object properties instead of a database] #

## example of usage at
### https://docs.google.com/spreadsheets/d/1rMgkO9S1s_HKXBzJps5qfvrjVf-YtmqJ9Hp6_UFqGls


from urllib.parse import urljoin, urlsplit
import json
import pandas

##################################################################################################
### FOR FETCHING AND PARSING HTML FROM URL #######################################################
### from https://pastebin.com/rBTr06vy [also suggests variations with other libraries] ###########
import requests
from bs4 import BeautifulSoup
def linkToSoup(targetUrl, conf={}, isv=True, returnErr=False):
    ## returns BeautifulSoup on success; on failure returns the error message [if returnErr] or None
    parser = conf["parser"] if "parser" in conf else None
    try:
        r = requests.get(targetUrl)
        if isv: print(f'<{r.status_code} {r.reason}> /{parser} from {r.url}')
        if r.status_code == 200: return BeautifulSoup(r.content, parser) ## [CAN +MORE TESTS] ##

        errMsg = f'<{r.status_code} {r.reason}> - '
        errMsg = f'{errMsg}Failed to scrape {targetUrl}'
    except Exception as e: errMsg = f'Failed to scrape {targetUrl} \n - errorMsg: "{str(e)}"'
    if isv: print(errMsg)
    return errMsg if returnErr else None
#### SOME ALTERNATIVES:
###### with ScrapingAnt: https://pastebin.com/5ibz2F6p ###########################################
###### with selenium: https://pastebin.com/VLZ2vPYK , https://pastebin.com/kEC9gPC8 ##############
##################################################################################################
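
## quick usage sketch [assumes a reachable URL and the built-in 'html.parser';
## kept as comments so nothing is fetched at import time]:
#   soup = linkToSoup('https://example.com', conf={'parser': 'html.parser'})
#   if soup is not None:
#       pageTitle = soup.select_one('head title')
#       print(pageTitle.get_text(strip=True) if pageTitle else '[no title]')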


class qScrawlr:
    def __init__(self, starter, setVars=None, clrLog=False):
        self.starterUrl = starter # SOME VALID URL
        setVars = dict(setVars) if setVars else {} # [copy to avoid mutating a shared default dict]

        ## defaults:
        self.pageLimit = 10
        self.maxScrapes = 30
        self.scrapeCt = 0
        self.curUrlId = 0
        self.scrawlLogs = {} # [in a database this would be one or more tables]
        self.skipUrls = []
        self.samesite = False

        if isinstance(starter, str):
            setVars['starterUrl'] = starter
            self.setGlobals(setVars, clrLog)
        else:
            if isinstance(starter, dict): self.loadScrawlSess(**starter)
            else: self.loadScrawlSess(*starter)
    def setGlobals(self, varDict, clearLog=False):
        print('updating properties with', varDict)
        ## could be interacting with a database instead of:
        if 'starterUrl' in varDict: self.starterUrl = varDict['starterUrl']
        if 'pageLimit' in varDict: self.pageLimit = varDict['pageLimit']
        if 'curUrlId' in varDict: self.curUrlId = varDict['curUrlId']
        if 'skipUrls' in varDict:
            self.skipUrls = list(set(varDict['skipUrls'] + self.skipUrls))
        if 'maxScrapes' in varDict: self.maxScrapes = varDict['maxScrapes']
        if 'scrapeCt' in varDict: self.scrapeCt = varDict['scrapeCt']
        if 'samesite' in varDict: self.samesite = varDict['samesite']

        if clearLog: self.scrawlLogs = {}
        print('properties after updating:', {
            k: v for k, v in self.__dict__.items() if k != 'scrawlLogs'})


    def get_next_fromScrawlQ(self):
        ## could be interacting with a database instead of:
        if self.scrawlLogs == {}: return self.starterUrl
        if self.scrapeCt > self.maxScrapes: return None
        elig = [
            k for k, l in self.scrawlLogs.items()
            if l['status'] == 'queued'
        ][:1]
        return elig[0] if elig else None

    def get_urlId(self, iUrl, refUrlId=None):
        ## could be interacting with a database instead of:
        print('', end=f'\r[curUrlId={self.curUrlId}] getting id for {iUrl}')
        if (not iUrl) or iUrl[0] == '!': iUrl = f'{iUrl}_{self.curUrlId+1}'
        if iUrl not in self.scrawlLogs:
            if not refUrlId: return None
            self.scrawlLogs[iUrl] = {
                'url': iUrl, 'refUrlId': refUrlId, 'status': None}
        if 'urlId' in self.scrawlLogs[iUrl] and self.scrawlLogs[iUrl]['urlId']:
            return self.scrawlLogs[iUrl]['urlId']
        self.curUrlId += 1
        self.scrawlLogs[iUrl]['urlId'] = self.curUrlId
        return self.curUrlId

    def addUrl_to_queue(self, aUrl, rootUrl, refUrlId, qStatus='queued'):
        ## could be interacting with a database instead of:
        aUrl = urljoin(rootUrl, aUrl)
        if aUrl.startswith('javascript:'): return False
        auId = self.get_urlId(aUrl, refUrlId=refUrlId)
        print('', end=f'\r[curUrlId={self.curUrlId}] Adding [#{auId}]: {aUrl}')
        if self.scrawlLogs[aUrl]['status'] in [None, 'retry']:
            self.scrawlLogs[aUrl]['status'] = (
                'skip' if aUrl in self.skipUrls else qStatus)
            return aUrl not in self.skipUrls
        return False

    def logScrape(self, scrapeRes):
        ## could be interacting with a database instead of:
        sUrl = scrapeRes['url'] if 'url' in scrapeRes else '!missingUrl'
        auId, em = self.get_urlId(sUrl), '' # initialize url logId & error msg
        print('', end=(f'\r[{self.curUrlId}] Logging ' +
                       f'[{self.scrapeCt}][#{auId}]: {sUrl}'))

        if type(scrapeRes) != dict:
            em = f'ScrapeResults should be <dict> not {type(scrapeRes)}'
            self.scrawlLogs[sUrl]['errorMessage'] = f'Invalid Format - {em}'
            self.scrawlLogs[sUrl]['stringifiedResult'] = str(scrapeRes)
            self.scrawlLogs[sUrl]['status'] = '!fail'
            return self.get_next_fromScrawlQ()
        for k, v in scrapeRes.items():
            if k != 'pageUrls': self.scrawlLogs[sUrl][k] = v
        if sUrl == '!missingUrl':
            scrapeRes['errorMessage'], em = '!missingUrl', '[missingUrl]'

        pul = scrapeRes['pageUrls'] if 'pageUrls' in scrapeRes else []
        pageUrls, ntq = (pul[:] if type(pul) == list else []), 0
        for i, pl in enumerate(pageUrls):
            pageUrls[i], qstat = self.get_urlId(pl, auId), 'queued'
            if type(self.pageLimit) == int and self.pageLimit < ntq:
                qstat = 'page_limit_exceeded'
            ntq += int(self.addUrl_to_queue(pl, sUrl, auId, qstat))

        pageUrls = [p for p in pageUrls if p]
        self.scrawlLogs[sUrl]['pageUrlCt'] = len(pageUrls)
        self.scrawlLogs[sUrl]['newToQueue'] = ntq
        if pageUrls: self.scrawlLogs[sUrl]['pageUrlIds'] = pageUrls
        self.scrawlLogs[sUrl]['status'] = f'!error{em}' if (
            'errorMessage' in scrapeRes) else ((
                '' if pageUrls else '?')+'scraped')
        return self.get_next_fromScrawlQ()


    def getPageUrls(self, sUrl, scSoup):
        pUrls = [urljoin(sUrl, a['href']) for a in scSoup.select('a[href]')]
        if self.samesite: # [only keep links to the same domain as sUrl]
            rnl = urlsplit(sUrl).netloc
            pUrls = [u for u in pUrls if urlsplit(u).netloc == rnl]
        ## CAN ADD MORE FILTERS ##
        return pUrls


    def getDataFromPage(self, srcUrl, sSoup):
        pageUrls = self.getPageUrls(srcUrl, sSoup)

        ####################### EXTRACT SOME PAGE DATA #######################
        ## [only extracting page title and [truncated] body text]
        pageTitle = sSoup.select_one('head title')
        pageTitle = pageTitle.get_text().strip() if pageTitle else None
        pageText = ' '.join([word for word in (
            sSoup.main if sSoup.main else (sSoup.body if sSoup.body else sSoup)
        ).get_text(' ').split() if word])
        if len(pageText) > 70:
            pageText = pageText[:33] + '...' + pageText[-33:]
        ## for a function that can extract various data from html,
        #### take a look at "htreeToDict" at https://pastebin.com/BpjZSQPi
        #### [returns a python dictionary containing specified details]
        ######################################################################

        return {
            'url': srcUrl, 'pageTitle': pageTitle,
            'pageText': pageText, 'pageUrls': pageUrls
        }


    def scrapeUrl(self, sUrl):
        auId = self.get_urlId(
            sUrl, '[starter]' if sUrl == self.starterUrl else None)
        self.scrawlLogs[sUrl]['status'] = 'scraping'
        self.scrapeCt = self.scrapeCt + 1
        print('', end=(
            f'\r[{self.curUrlId}] Scraping ' +
            f'[{self.scrapeCt}][#{auId}]: {sUrl}'))
        scSoup = linkToSoup(sUrl, isv=False, returnErr=True)
        if type(scSoup) == str: # linkToSoup returned an error message string
            return {'url': sUrl, 'errorMessage': scSoup}
        return self.getDataFromPage(sUrl, scSoup)


    ## [probably wouldn't need if using a database]
    def saveScrawlSess(self, logPath, varsPath, logMode='w'):
        logPath, varsPath = str(logPath), str(varsPath)
        if logMode != 'a': logMode = 'w'

        scrawlDF = pandas.DataFrame(list(self.scrawlLogs.values()))
        scrawlDF.to_csv(logPath, index=False, mode=logMode)
        print('Saved scrawlLogs to', logPath)

        with open(varsPath, 'w') as f: json.dump({
            'starterUrl': self.starterUrl, 'pageLimit': self.pageLimit,
            'maxScrapes': self.maxScrapes, 'scrapeCt': self.scrapeCt,
            'curUrlId': self.curUrlId, 'skipUrls': self.skipUrls
        }, f, indent=4)
        print('Saved globals to', varsPath)

    ## [probably wouldn't need if using a database]
    def loadScrawlSess(self, logPath, varsPath, mode='continue'):
        prevLog, prevVars = [], {}
        try: prevLog = pandas.read_csv(logPath).to_dict('records')
        except Exception as e:
            print(f'Unable to load log from {logPath}: {e}')
        try:
            with open(varsPath, 'r') as f: prevVars = json.load(f)
        except Exception as e:
            print(f'Unable to load vars from {varsPath}: {e}')

        self.scrawlLogs = {r['url']: r for r in (
            list(self.scrawlLogs.values()) + prevLog)}
        print('queue length after loading:', len(self.scrawlLogs))
        if mode != 'continue': prevVars['scrapeCt'] = 0
        self.setGlobals(prevVars)

        ## [if using a database, this could be a scheduled procedure]
        if mode in ['q<--page_limit_exceeded', 'q<--allUnscraped']:
            eligRefs, eligUrls = list(set([
                l['refUrlId'] for l in self.scrawlLogs.values()
                if l['status'] == 'page_limit_exceeded'
            ])), []
            for r in eligRefs:
                eligUrls += [
                    u for u, l in self.scrawlLogs.items()
                    if l['refUrlId'] == r
                ][:self.pageLimit]
            for u in eligUrls: self.scrawlLogs[u]['status'] = 'queued'
        if mode in ['q<--NaN', 'q<--allUnscraped']:
            eligRefs, eligUrls = list(set([
                l['refUrlId'] for l in self.scrawlLogs.values()
                if not l['status'] or pandas.isna(l['status'])
            ])), []
            for r in eligRefs:
                eligUrls += [
                    u for u, l in self.scrawlLogs.items()
                    if l['refUrlId'] == r
                ][:self.pageLimit]
            for u in eligUrls: self.scrawlLogs[u]['status'] = 'queued'


    def run(self, saveTo=None):
        ## keep scraping until the queue is empty or maxScrapes is reached
        nextUrl = self.get_next_fromScrawlQ()
        while nextUrl: nextUrl = self.logScrape(self.scrapeUrl(nextUrl))
        print()
        if isinstance(saveTo, dict): self.saveScrawlSess(**saveTo)
        elif hasattr(saveTo, '__iter__'): self.saveScrawlSess(*saveTo)
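

## minimal usage sketch [the starter URL, limits and file paths below are just example
## values; see the linked spreadsheet above for a fuller walkthrough]:
if __name__ == '__main__':
    ## fresh session: start from one URL, stay on the same site, save logs+vars when done
    qs = qScrawlr('https://example.com', setVars={
        'pageLimit': 5, 'maxScrapes': 15, 'samesite': True})
    qs.run(saveTo={'logPath': 'scrawlLog.csv', 'varsPath': 'scrawlVars.json'})

    ## resume a saved session: pass the saved paths [as a dict or an iterable] as starter
    # qs2 = qScrawlr({'logPath': 'scrawlLog.csv', 'varsPath': 'scrawlVars.json',
    #                 'mode': 'q<--allUnscraped'})
    # qs2.run(saveTo=('scrawlLog.csv', 'scrawlVars.json'))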