recursive crawler
Try95th | Nov 17th, 2022 (edited)

## recursively scrape pages and then the pages each url on the page leads to ##
## !WARNING: exponential - even with small "pageLimit" things can get out of control
### also, maxDepth cannot exceed recursion limit (though there will probably be memory issues by then)
### also, there are redundancies --> see [  ] for queue-based method (a rough BFS sketch is also included at the bottom of this paste)
### sample usage (output saved as json at https://pastebin.com/432uqPH5):
##### linkTreeMapper('https://en.wikipedia.org/wiki/Special:Random', 3, 4)

## REQUIRED:
## def linkToSoup ... copy [WITH IMPORTS] from https://pastebin.com/rBTr06vy [adjust as needed]
import sys  # needed for sys.exc_info() in the error handler below
import urllib.parse

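## the real linkToSoup should be copied from the paste linked above; the stand-in below is
## only a minimal sketch of what it is assumed to do (fetch the page with requests, parse it
## with BeautifulSoup, and return the error text instead when returnErr=True) - replace it
import requests
from bs4 import BeautifulSoup

def linkToSoup(url, returnErr=False):
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()  # raise on 4xx/5xx so the caller gets an error string
        return BeautifulSoup(r.content, 'html.parser')
    except Exception as e:
        return f'{type(e).__name__}: {e}' if returnErr else None
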
### parts with "lCtr" and/or "statusTxt" are OPTIONAL ###
lCtr = 0  # global counter of pages fetched so far (shown in the status line)
def linkTreeMapper(ltmUrl, pageLimit=10, maxDepth=3, curDepth=0, statusTxt=''):
    try:
        global lCtr
        lCtr += 1
        print(f'\r[#{lCtr}][d={curDepth}] {statusTxt}: {ltmUrl}', end='')
        if type(maxDepth) != int:  maxDepth = 3
        if type(curDepth) != int or curDepth < 0:  curDepth = maxDepth  # invalid depth --> no further recursion
        pageDets = {'url': ltmUrl, 'atDepth': curDepth}

        lSoup = linkToSoup(ltmUrl, returnErr=True)  # returns an error string on failure
        if type(lSoup) == str:  pageDets['errorMessage'] = lSoup
        else: ### OPTIONAL ## Extract Page Details ###
            pageTitle = lSoup.select_one('head title')
            if pageTitle is None: pageTitle = lSoup.head  # fall back to the whole <head>
            if pageTitle is not None:
                pageTitle = pageTitle.get_text(strip=True)
            if not lSoup.body: pbText = '!! NO HTML BODY FOUND !!'
            else:
                pbText = lSoup.body.get_text(' ')
                pbText = ' '.join([w for w in pbText.split() if w])
            if len(pbText) > 20:  pbText = pbText[:8] + '...' + pbText[-8:]  # keep a short preview only
            pageDets['pageTitle'], pageDets['pageBodyText'] = pageTitle, pbText
            ###########################################

        if type(lSoup) == str or maxDepth < (curDepth + 1): return pageDets  # error or max depth reached

        pageDets['pageUrls'], linkSel = [], 'a[href^="/"], a[href^="http"]'
        luParts = urllib.parse.urlsplit(ltmUrl)
        rootUrl = f'{luParts.scheme}://{luParts.netloc}'
        pageLinks = [( ######### PROCESS URLS HERE #########
            a.get('href') if urllib.parse.urlsplit(a.get('href')).netloc
            else urllib.parse.urljoin(ltmUrl, a.get('href'))  # resolve relative links
        ) for a in lSoup.select(linkSel)]
        pageLinks = [  # FILTER URLS HERE ##########
            pl for pl in pageLinks if pl.startswith(rootUrl) and  # same site only
            urllib.parse.urlsplit(pl).path not in ['/', '']  # skip bare homepage links
        ]
        if type(pageLimit) == int and pageLimit > 0:  # non-int or non-positive limit --> no limit
            pageLinks = pageLinks[:pageLimit]
        pulen = len(pageLinks)
        for i, pUrl in enumerate(pageLinks):
            if not urllib.parse.urlsplit(pUrl).netloc:
                pUrl = urllib.parse.urljoin(ltmUrl, pUrl)
            pageDets['pageUrls'].append(linkTreeMapper(  # recurse one level deeper
                pUrl, pageLimit=pageLimit, maxDepth=maxDepth,
                curDepth=(curDepth + 1),
                statusTxt=f'{statusTxt}[Page{i+1} of {pulen}] '))

        return pageDets
    except Exception as e:
        try:
            lineNo = f'line {sys.exc_info()[2].tb_lineno}'
        except Exception as e2:
            lineNo = f'UNKNOWN LINE ["{e2}"]'
        return {'url': ltmUrl, 'errorMessage': f'{lineNo}: "{e}"'}
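
## sample usage, as in the header comment above; the json filename here is just illustrative
if __name__ == '__main__':
    import json
    linkTree = linkTreeMapper('https://en.wikipedia.org/wiki/Special:Random', 3, 4)
    with open('linkTree.json', 'w', encoding='utf-8') as f:
        json.dump(linkTree, f, indent=4)
    print('\nsaved output to linkTree.json')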
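
## the header mentions a queue-based method for avoiding the redundant re-visits of the
## recursive version; the sketch below is only an illustration of that idea (breadth-first
## crawl with a "seen" set), not the referenced paste - linkTreeMapperBFS is a made-up name
from collections import deque

def linkTreeMapperBFS(startUrl, pageLimit=10, maxDepth=3):
    seen, results = {startUrl}, []
    queue = deque([(startUrl, 0)])
    while queue:
        url, depth = queue.popleft()
        soup = linkToSoup(url, returnErr=True)
        pageDets = {'url': url, 'atDepth': depth}
        if type(soup) == str:
            pageDets['errorMessage'] = soup
        elif depth < maxDepth:
            luParts = urllib.parse.urlsplit(url)
            rootUrl = f'{luParts.scheme}://{luParts.netloc}'
            links = [urllib.parse.urljoin(url, a.get('href'))
                     for a in soup.select('a[href^="/"], a[href^="http"]')]
            links = [l for l in links if l.startswith(rootUrl)][:pageLimit]
            for link in links:
                if link not in seen:  # each url is fetched at most once
                    seen.add(link)
                    queue.append((link, depth + 1))
        results.append(pageDets)
    return results  # flat list of pages instead of the nested tree returned above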