## for scraping urls and maintaining queues ##
## a simplified version [using csv, json and global variables instead of a database] ##
## example of usage at https://pastebin.com/X62sxaNk

from urllib.parse import urljoin
import pandas, json

##################################################################################################
### def linkToSoup... COPY [WITH IMPORTS] from https://pastebin.com/rBTr06vy [adjust as needed] ###
##################################################################################################
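## [NOT the original helper -- just a minimal stand-in sketch so this file runs on its own]
## assumes requests + bs4 are available; guesses that "isv" means "is verbose" and that
## "returnErr=True" means "return the error message string instead of None on failure",
## based only on how linkToSoup is called below. prefer the real version from the paste above.
import requests
from bs4 import BeautifulSoup

def linkToSoup(tUrl, isv=True, returnErr=False):
    try:
        r = requests.get(tUrl, timeout=30)
        if r.status_code == 200 and 'text/html' in r.headers.get('content-type', ''):
            return BeautifulSoup(r.content, 'html.parser')
        errMsg = f'<Response [{r.status_code}]> from {tUrl}'
    except Exception as e:
        errMsg = f'failed to scrape {tUrl}: {e}'
    if isv: print(errMsg)
    return errMsg if returnErr else None
##################################################################################################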

starterUrl = 'SOME_VALID_URL'

## could be stored in a database, but instead:
pageLimit = 10
maxScrapes = 30
scrapeCt = 0
curUrlId = 0
scrawlLogs = {}  ## [in a database this would be one or more tables]
skipUrls = []

def get_next_fromScrawlQ():
    ## could be interacting with a database instead of:
    global maxScrapes, scrawlLogs
    if scrawlLogs == {}: return starterUrl
    if scrapeCt > maxScrapes: return None
    elig = [k for k, l in scrawlLogs.items() if l['status'] == 'queued'][:1]
    return elig[0] if elig else None

def get_urlId(iUrl, refUrlId=None):
    ## could be interacting with a database instead of:
    global curUrlId, scrawlLogs
    print('', end=f'\r[curUrlId={curUrlId}] getting id for {iUrl}')
    if (not iUrl) or iUrl[0] == '!': iUrl = f'{iUrl}_{curUrlId+1}'
    if iUrl not in scrawlLogs:
        if not refUrlId: return None
        scrawlLogs[iUrl] = {'url': iUrl, 'refUrlId': refUrlId, 'status': None}
    if 'urlId' in scrawlLogs[iUrl] and scrawlLogs[iUrl]['urlId']:
        return scrawlLogs[iUrl]['urlId']
    curUrlId += 1
    scrawlLogs[iUrl]['urlId'] = curUrlId
    return curUrlId

def addUrl_to_queue(aUrl, rootUrl, refUrlId, qStatus='queued'):
    ## could be interacting with a database instead of:
    global scrawlLogs, curUrlId
    aUrl = urljoin(rootUrl, aUrl)
    if aUrl.startswith('javascript:'): return False
    auId = get_urlId(aUrl, refUrlId=refUrlId)
    print('', end=f'\r[curUrlId={curUrlId}] Adding [#{auId}]: {aUrl}')
    if scrawlLogs[aUrl]['status'] in [None, 'retry']:
        scrawlLogs[aUrl]['status'] = 'skip' if aUrl in skipUrls else qStatus
        return not (aUrl in skipUrls)
    return False

def logScrape(scrapeRes):
    ## could be interacting with a database instead of:
    global pageLimit, scrawlLogs, curUrlId, scrapeCt
    sUrl = scrapeRes['url'] if 'url' in scrapeRes else '!missingUrl'
    auId, em = get_urlId(sUrl), ''  # initiate url logId and error message
    print('', end=f'\r[{curUrlId}] Logging [{scrapeCt}][#{auId}]: {sUrl}')
    if type(scrapeRes) != dict:
        em = f'ScrapeResults should be <dict> not {type(scrapeRes)}'
        scrawlLogs[sUrl]['errorMessage'] = f'Invalid Format - {em}'
        scrawlLogs[sUrl]['stringifiedResult'] = str(scrapeRes)
        scrawlLogs[sUrl]['status'] = '!fail'
        return get_next_fromScrawlQ()
    for k, v in scrapeRes.items():
        if k != 'pageUrls': scrawlLogs[sUrl][k] = v
    if sUrl == '!missingUrl':
        scrapeRes['errorMessage'], em = '!missingUrl', '[missingUrl]'
    pul = scrapeRes['pageUrls'] if 'pageUrls' in scrapeRes else []
    pageUrls, ntq = (pul[:] if type(pul) == list else []), 0
    for i, pl in enumerate(pageUrls):
        pageUrls[i], qstat = get_urlId(pl), 'queued'
        if type(pageLimit) == int and pageLimit < ntq:
            qstat = 'page_limit_exceeded'
        ntq += int(addUrl_to_queue(pl, sUrl, auId, qstat))
    pageUrls = [p for p in pageUrls if p]
    scrawlLogs[sUrl]['pageUrlCt'] = len(pageUrls)
    scrawlLogs[sUrl]['newToQueue'] = ntq
    if pageUrls: scrawlLogs[sUrl]['pageUrlIds'] = pageUrls
    scrawlLogs[sUrl]['status'] = f'!error{em}' if (
        'errorMessage' in scrapeRes) else (('' if pageUrls else '?') + 'scraped')
    return get_next_fromScrawlQ()

def getPageUrls(sUrl, scSoup):
    pUrls = [a.get('href') for a in scSoup.select('a[href]')]
    ## CAN ADD SOME FILTERS, [? MAYBE BASED ON sUrl ?] [one possible filter is sketched below] ##
    return pUrls
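
## [hypothetical example, not part of the original] one possible filter for getPageUrls:
## keep only links that resolve to the same domain as the page they were found on
from urllib.parse import urlparse

def keepSameDomain(sUrl, pUrls):
    sDomain = urlparse(sUrl).netloc
    return [u for u in pUrls if urlparse(urljoin(sUrl, u)).netloc == sDomain]
## could be applied inside getPageUrls with something like: pUrls = keepSameDomain(sUrl, pUrls)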

def getDataFromPage(srcUrl, sSoup):
    pageUrls = getPageUrls(srcUrl, sSoup)
    ######################## EXTRACT SOME PAGE DATA ########################
    ## [only extracting page title and [truncated] body text]
    pageTitle = sSoup.select_one('head title')
    pageTitle = pageTitle.get_text().strip() if pageTitle else None
    pageText = ' '.join([word for word in (
        sSoup.body if sSoup.body else sSoup
    ).get_text().split() if word])
    if len(pageText) > 70: pageText = pageText[:33] + '...' + pageText[-33:]
    ## for a function that can extract various data from html,
    #### take a look at "htreeToDict" at https://pastebin.com/BpjZSQPi
    #### [returns a python dictionary containing specified details]
    ########################################################################
    return {
        'url': srcUrl, 'pageTitle': pageTitle,
        'pageText': pageText, 'pageUrls': pageUrls
    }

def scrapeUrl(sUrl):
    global curUrlId, scrapeCt
    auId = get_urlId(sUrl, '[starter]' if sUrl == starterUrl else None)
    scrawlLogs[sUrl]['status'], scrapeCt = 'scraping', scrapeCt + 1
    print('', end=f'\r[{curUrlId}] Scraping [{scrapeCt}][#{auId}]: {sUrl}')
    scSoup = linkToSoup(sUrl, isv=False, returnErr=True)
    if type(scSoup) == str:
        return {'url': sUrl, 'errorMessage': scSoup}
    return getDataFromPage(sUrl, scSoup)

def setGlobals(varDict, clearLog=False):
    ## could be interacting with a database instead of:
    global starterUrl, pageLimit, maxScrapes, scrapeCt, curUrlId, skipUrls
    if 'starterUrl' in varDict: starterUrl = varDict['starterUrl']
    if 'pageLimit' in varDict: pageLimit = varDict['pageLimit']
    if 'curUrlId' in varDict: curUrlId = varDict['curUrlId']
    if 'skipUrls' in varDict:
        skipUrls = list(set(varDict['skipUrls'] + skipUrls))
    if 'maxScrapes' in varDict: maxScrapes = varDict['maxScrapes']
    if 'scrapeCt' in varDict: scrapeCt = varDict['scrapeCt']
    global scrawlLogs
    if clearLog: scrawlLogs = {}

## [probably wouldn't need if using a database]
def saveScrawlSess(logPath, varsPath, logMode='w'):
    logPath, varsPath = str(logPath), str(varsPath)
    if logMode != 'a': logMode = 'w'
    global starterUrl, pageLimit, maxScrapes, scrapeCt, curUrlId, skipUrls
    global scrawlLogs
    scrawlDF = pandas.DataFrame(list(scrawlLogs.values()))
    scrawlDF.to_csv(logPath, index=False, mode=logMode)
    print('Saved ScrawlLogs to', logPath)
    with open(varsPath, 'w') as f:
        json.dump({
            'starterUrl': starterUrl, 'pageLimit': pageLimit,
            'maxScrapes': maxScrapes, 'scrapeCt': scrapeCt,
            'curUrlId': curUrlId, 'skipUrls': skipUrls
        }, f, indent=4)
    print('Saved globals to', varsPath)

## [probably wouldn't need if using a database]
def loadScrawlSess(logPath, varsPath, mode='continue'):
    global scrawlLogs
    prevLog, prevVars = [], {}
    try: prevLog = pandas.read_csv(logPath).to_dict('records')
    except Exception as e: print(f'Unable to load log from {logPath}: {e}')
    try: prevVars = json.load(open(varsPath, 'r'))
    except Exception as e: print(f'Unable to load vars from {varsPath}: {e}')
    scrawlLogs = {r['url']: r for r in (list(scrawlLogs.values()) + prevLog)}
    if mode != 'continue': prevVars['scrapeCt'] = 0
    setGlobals(prevVars)

    ## [if using database, this could be a scheduled procedure]
    if mode == 'q<--page_limit_exceeded':
        eligRefs, eligUrls = list(set([
            l['refUrlId'] for l in scrawlLogs.values()
            if l['status'] == 'page_limit_exceeded'
        ])), []
        for r in eligRefs:
            eligUrls += [
                u for u, l in scrawlLogs.items() if l['refUrlId'] == r
            ][:pageLimit]
        for u in eligUrls: scrawlLogs[u]['status'] = 'queued'
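
##################################################################################################
## [rough usage sketch only -- NOT the example linked at the top; see https://pastebin.com/X62sxaNk
##  for the actual one] the general flow, with 'https://example.com' and the two file names below
##  used purely as placeholders:
if __name__ == '__main__':
    setGlobals({'starterUrl': 'https://example.com', 'pageLimit': 5, 'maxScrapes': 15}, clearLog=True)
    nextUrl = get_next_fromScrawlQ()
    while nextUrl:
        nextUrl = logScrape(scrapeUrl(nextUrl))  ## logScrape returns the next queued url [or None]
    saveScrawlSess('scrawlLogs.csv', 'scrawlVars.json')
##################################################################################################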