recursive crawler
Try95th | Nov 17th, 2022 (edited)

## recursively scrape pages and then the pages each url on the page leads to ##
## !WARNING: exponential - even with small "pageLimit" things can get out of control
### also, maxDepth cannot exceed recursion limit (though there will probably be memory issues by then)
### also, there are redundancies --> see [  ] for queue-based method (a rough BFS sketch is also included at the bottom of this paste)
### sample usage (output saved as json at https://pastebin.com/432uqPH5):
##### linkTreeMapper('https://en.wikipedia.org/wiki/Special:Random', 3, 4)

## REQUIRED:
## def linkToSoup ... copy [WITH IMPORTS] from https://pastebin.com/rBTr06vy [adjust as needed]
import sys  # needed for sys.exc_info() in the error handler below
import urllib.parse

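## the real linkToSoup should be copied from the paste linked above; the stand-in below is
## only a minimal sketch of what it is assumed to do (fetch the page with requests, parse it
## with BeautifulSoup, and return the error text instead when returnErr=True) - replace it
import requests
from bs4 import BeautifulSoup

def linkToSoup(url, returnErr=False):
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()  # raise on 4xx/5xx so the caller gets an error string
        return BeautifulSoup(r.content, 'html.parser')
    except Exception as e:
        return f'{type(e).__name__}: {e}' if returnErr else None
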
### parts with "lCtr" and/or "statusTxt" are OPTIONAL ###
lCtr = 0  # global counter of pages fetched so far (shown in the status line)
def linkTreeMapper(ltmUrl, pageLimit=10, maxDepth=3, curDepth=0, statusTxt=''):
    try:
        global lCtr
        lCtr += 1
        print(f'\r[#{lCtr}][d={curDepth}] {statusTxt}: {ltmUrl}', end='')
        if type(maxDepth) != int:  maxDepth = 3
        if type(curDepth) != int or curDepth < 0:  curDepth = maxDepth  # invalid depth --> no further recursion
        pageDets = {'url': ltmUrl, 'atDepth': curDepth}

        lSoup = linkToSoup(ltmUrl, returnErr=True)  # returns an error string on failure
        if type(lSoup) == str:  pageDets['errorMessage'] = lSoup
        else: ### OPTIONAL ## Extract Page Details ###
            pageTitle = lSoup.select_one('head title')
            if pageTitle is None: pageTitle = lSoup.head  # fall back to the whole <head>
            if pageTitle is not None:
                pageTitle = pageTitle.get_text(strip=True)
            if not lSoup.body: pbText = '!! NO HTML BODY FOUND !!'
            else:
                pbText = lSoup.body.get_text(' ')
                pbText = ' '.join([w for w in pbText.split() if w])
            if len(pbText) > 20:  pbText = pbText[:8] + '...' + pbText[-8:]  # keep a short preview only
            pageDets['pageTitle'], pageDets['pageBodyText'] = pageTitle, pbText
            ###########################################

        if type(lSoup) == str or maxDepth < (curDepth + 1): return pageDets  # error or max depth reached

        pageDets['pageUrls'], linkSel = [], 'a[href^="/"], a[href^="http"]'
        luParts = urllib.parse.urlsplit(ltmUrl)
        rootUrl = f'{luParts.scheme}://{luParts.netloc}'
        pageLinks = [( ######### PROCESS URLS HERE #########
            a.get('href') if urllib.parse.urlsplit(a.get('href')).netloc
            else urllib.parse.urljoin(ltmUrl, a.get('href'))  # resolve relative links
        ) for a in lSoup.select(linkSel)]
        pageLinks = [  # FILTER URLS HERE ##########
            pl for pl in pageLinks if pl.startswith(rootUrl) and  # same site only
            urllib.parse.urlsplit(pl).path not in ['/', '']  # skip bare homepage links
        ]
        if type(pageLimit) == int and pageLimit > 0:  # non-int or non-positive limit --> no limit
            pageLinks = pageLinks[:pageLimit]
        pulen = len(pageLinks)
        for i, pUrl in enumerate(pageLinks):
            if not urllib.parse.urlsplit(pUrl).netloc:
                pUrl = urllib.parse.urljoin(ltmUrl, pUrl)
            pageDets['pageUrls'].append(linkTreeMapper(  # recurse one level deeper
                pUrl, pageLimit=pageLimit, maxDepth=maxDepth,
                curDepth=(curDepth + 1),
                statusTxt=f'{statusTxt}[Page{i+1} of {pulen}] '))

        return pageDets
    except Exception as e:
        try:
            lineNo = f'line {sys.exc_info()[2].tb_lineno}'
        except Exception as e2:
            lineNo = f'UNKNOWN LINE ["{e2}"]'
        return {'url': ltmUrl, 'errorMessage': f'{lineNo}: "{e}"'}
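
## sample usage, as in the header comment above; the json filename here is just illustrative
if __name__ == '__main__':
    import json
    linkTree = linkTreeMapper('https://en.wikipedia.org/wiki/Special:Random', 3, 4)
    with open('linkTree.json', 'w', encoding='utf-8') as f:
        json.dump(linkTree, f, indent=4)
    print('\nsaved output to linkTree.json')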
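
## the header mentions a queue-based method for avoiding the redundant re-visits of the
## recursive version; the sketch below is only an illustration of that idea (breadth-first
## crawl with a "seen" set), not the referenced paste - linkTreeMapperBFS is a made-up name
from collections import deque

def linkTreeMapperBFS(startUrl, pageLimit=10, maxDepth=3):
    seen, results = {startUrl}, []
    queue = deque([(startUrl, 0)])
    while queue:
        url, depth = queue.popleft()
        soup = linkToSoup(url, returnErr=True)
        pageDets = {'url': url, 'atDepth': depth}
        if type(soup) == str:
            pageDets['errorMessage'] = soup
        elif depth < maxDepth:
            luParts = urllib.parse.urlsplit(url)
            rootUrl = f'{luParts.scheme}://{luParts.netloc}'
            links = [urllib.parse.urljoin(url, a.get('href'))
                     for a in soup.select('a[href^="/"], a[href^="http"]')]
            links = [l for l in links if l.startswith(rootUrl)][:pageLimit]
            for link in links:
                if link not in seen:  # each url is fetched at most once
                    seen.add(link)
                    queue.append((link, depth + 1))
        results.append(pageDets)
    return results  # flat list of pages instead of the nested tree returned above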