## recursively scrape a page and then the pages each url on it leads to ##
## !WARNING: exponential - even with a small "pageLimit" things can get out of control
### also, maxDepth cannot exceed the recursion limit (though there will probably be memory issues before then)
### also, there are redundancies --> see [ ] for queue-based method (an illustrative BFS sketch is at the bottom of this paste)
### sample usage (output saved as json at https://pastebin.com/432uqPH5; see the driver sketch at the bottom):
##### linkTreeMapper('https://en.wikipedia.org/wiki/Special:Random', 3, 4)
## REQUIRED:
## def linkToSoup ... copy [WITH IMPORTS] from https://pastebin.com/rBTr06vy [adjust as needed]
## (a minimal stand-in sketch of linkToSoup is also included at the bottom of this paste)
import sys  # used in the exception handler below to report the failing line number
import urllib.parse
### parts with "lCtr" and/or "statusTxt" are OPTIONAL ###
lCtr = 0

def linkTreeMapper(ltmUrl, pageLimit=10, maxDepth=3, curDepth=0, statusTxt=''):
    try:
        global lCtr
        lCtr += 1
        print('', end=f'\r[#{lCtr}][d={curDepth}] {statusTxt}: {ltmUrl}')
        if type(maxDepth) != int: maxDepth = 3
        if type(curDepth) != int or curDepth < 0: curDepth = maxDepth
        pageDets = {'url': ltmUrl, 'atDepth': curDepth}
        lSoup = linkToSoup(ltmUrl, returnErr=True)
        if type(lSoup) == str: pageDets['errorMessage'] = lSoup
        else:  ### OPTIONAL ## Extract Page Details ###
            pageTitle = lSoup.select_one('head title')
            if pageTitle is None: pageTitle = lSoup.head
            if pageTitle is not None:
                pageTitle = pageTitle.get_text(strip=True)
            if not lSoup.body: pbText = '!! NO HTML BODY FOUND !!'
            else:
                pbText = lSoup.body.get_text(' ')
                pbText = ' '.join([w for w in pbText.split() if w])
                if len(pbText) > 20: pbText = pbText[:8] + '...' + pbText[-8:]
            pageDets['pageTitle'], pageDets['pageBodyText'] = pageTitle, pbText
            ###########################################
        if type(lSoup) == str or maxDepth < (curDepth + 1): return pageDets
        pageDets['pageUrls'] = []
        linkSel = 'a[href^="/"], a[href^="http"]'
        luParts = urllib.parse.urlsplit(ltmUrl)
        rootUrl = f'{luParts.scheme}://{luParts.netloc}'
        pageLinks = [(  ######### PROCESS URLS HERE #########
            a.get('href') if urllib.parse.urlsplit(a.get('href')).netloc
            else urllib.parse.urljoin(ltmUrl, a.get('href'))
        ) for a in lSoup.select(linkSel)]
        pageLinks = [  ########## FILTER URLS HERE ##########
            pl for pl in pageLinks if pl.startswith(rootUrl) and
            urllib.parse.urlsplit(pl).path not in ['/', '']
        ]
        if type(pageLimit) == int and pageLimit > 1:
            pageLinks = pageLinks[:pageLimit]
        pulen = len(pageLinks)
        for i, pUrl in enumerate(pageLinks):
            if not urllib.parse.urlsplit(pUrl).netloc:
                pUrl = urllib.parse.urljoin(ltmUrl, pUrl)
            pageDets['pageUrls'].append(linkTreeMapper(
                pUrl, pageLimit=pageLimit, maxDepth=maxDepth,
                curDepth=(curDepth + 1),
                statusTxt=f'{statusTxt}[Page{i+1} of {pulen}] '))
        return pageDets
    except Exception as e:
        try:
            lineNo = f'line {sys.exc_info()[2].tb_lineno}'
        except Exception as e2:
            lineNo = f'UNKNOWN LINE ["{e2}"]'
        return {'url': ltmUrl, 'errorMessage': f'{lineNo}: "{e}"'}
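
## ---------------------------------------------------------------------------
## OPTIONAL: minimal stand-in for linkToSoup, in case you don't want to copy the
## real one from the paste linked above. This is only a sketch of my assumption
## about its behaviour, based on how it is called here: it should return a
## BeautifulSoup on success and, when returnErr=True, the error message as a
## string on failure. The header and timeout values are arbitrary examples.
import requests
from bs4 import BeautifulSoup

def linkToSoup(targetUrl, returnErr=False):
    try:
        r = requests.get(targetUrl, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
        r.raise_for_status()  # raise on 4xx/5xx so they get reported as errors
        return BeautifulSoup(r.content, 'html.parser')
    except Exception as e:
        return f'{e!r}' if returnErr else None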
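
## ---------------------------------------------------------------------------
## example driver matching the sample usage at the top - just one possible way to
## run it; the output filename is made up. note the worst case with pageLimit=3,
## maxDepth=4 is 1+3+9+27+81 = 121 page fetches, hence the exponential warning
import json

if __name__ == '__main__':
    linkTree = linkTreeMapper('https://en.wikipedia.org/wiki/Special:Random', 3, 4)
    print()  # move off the \r status line before any further output
    with open('linkTree.json', 'w', encoding='utf-8') as f:
        json.dump(linkTree, f, indent=4)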
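
## ---------------------------------------------------------------------------
## the "[ ]" note at the top refers to a queue-based method whose link isn't in
## this paste; the function below is only an illustrative breadth-first sketch of
## that idea (NOT the referenced code): it avoids deep recursion and skips
## already-seen urls, which removes the redundancies mentioned above. it reuses
## linkToSoup and the urllib.parse import from the top; page-title/body
## extraction is omitted for brevity
from collections import deque

def linkTreeMapperBFS(startUrl, pageLimit=10, maxDepth=3):
    luParts = urllib.parse.urlsplit(startUrl)
    rootUrl = f'{luParts.scheme}://{luParts.netloc}'
    root = {'url': startUrl, 'atDepth': 0, 'pageUrls': []}
    queue, seen = deque([root]), {startUrl}
    while queue:
        node = queue.popleft()
        lSoup = linkToSoup(node['url'], returnErr=True)
        if type(lSoup) == str:
            node['errorMessage'] = lSoup
            continue
        if node['atDepth'] >= maxDepth: continue  # leaf: fetched but not expanded
        pageLinks = [urllib.parse.urljoin(node['url'], a.get('href'))
                     for a in lSoup.select('a[href^="/"], a[href^="http"]')]
        pageLinks = [pl for pl in pageLinks if pl.startswith(rootUrl)
                     and urllib.parse.urlsplit(pl).path not in ['/', '']]
        for pUrl in pageLinks[:pageLimit]:
            if pUrl in seen: continue  # <-- this is what the recursive version lacks
            seen.add(pUrl)
            child = {'url': pUrl, 'atDepth': node['atDepth'] + 1, 'pageUrls': []}
            node['pageUrls'].append(child)
            queue.append(child)
    return root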