## for scraping urls and maintaining queues ##
## a simplified version [using csv, json and global variables instead of a database] ##
## example of usage at https://pastebin.com/X62sxaNk

from urllib.parse import urljoin
import pandas, json

##################################################################################################
### def linkToSoup... COPY [WITH IMPORTS] from https://pastebin.com/rBTr06vy [adjust as needed] ###
##################################################################################################
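
## if the real linkToSoup [from the paste above] hasn't been copied in, the rough
## stand-in below can be used instead - it's only an assumption shaped to match
## the calls further down [linkToSoup(url, isv=..., returnErr=...)], not the original:
if 'linkToSoup' not in globals():
    import requests
    from bs4 import BeautifulSoup

    def linkToSoup(tUrl, isv=True, returnErr=False):
        ## fetch tUrl and return parsed soup, or an error string if returnErr is set
        try:
            r = requests.get(tUrl, timeout=30)
            if isv: print(f'<{r.status_code} {r.reason}> from {tUrl}')
            r.raise_for_status()
            return BeautifulSoup(r.content, 'html.parser')
        except Exception as e:
            return f'failed to scrape {tUrl}: {e!r}' if returnErr else None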

starterUrl = 'SOME_VALID_URL'

## could be stored in a database, but instead:
pageLimit = 10
maxScrapes = 30
scrapeCt = 0
curUrlId = 0
scrawlLogs = {}  ## [in a database this would be one or more tables]
skipUrls = []
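
## a scrawlLogs entry [keyed by url] ends up looking roughly like this
## [illustrative values only - built up by get_urlId, addUrl_to_queue and logScrape]:
## {'url': '...', 'refUrlId': 1, 'status': 'scraped', 'urlId': 2,
##  'pageTitle': '...', 'pageText': '...', 'pageUrlCt': 15, 'newToQueue': 9,
##  'pageUrlIds': [3, 4, 5]}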

def get_next_fromScrawlQ():
    ## could be interacting with a database instead of:
    global maxScrapes, scrawlLogs
    if scrawlLogs == {}: return starterUrl
    if scrapeCt > maxScrapes: return None
    elig = [k for k, l in scrawlLogs.items() if l['status'] == 'queued'][:1]
    return elig[0] if elig else None

def get_urlId(iUrl, refUrlId=None):
    ## could be interacting with a database instead of:
    global curUrlId, scrawlLogs
    print('', end=f'\r[curUrlId={curUrlId}] getting id for {iUrl}')
    if (not iUrl) or iUrl[0] == '!': iUrl = f'{iUrl}_{curUrlId+1}'
    if iUrl not in scrawlLogs:
        if not refUrlId: return None
        scrawlLogs[iUrl] = {'url': iUrl, 'refUrlId': refUrlId, 'status': None}
    if 'urlId' in scrawlLogs[iUrl] and scrawlLogs[iUrl]['urlId']:
        return scrawlLogs[iUrl]['urlId']
    curUrlId += 1
    scrawlLogs[iUrl]['urlId'] = curUrlId
    return curUrlId

def addUrl_to_queue(aUrl, rootUrl, refUrlId, qStatus='queued'):
    ## could be interacting with a database instead of:
    global scrawlLogs, curUrlId
    aUrl = urljoin(rootUrl, aUrl)
    if aUrl.startswith('javascript:'): return False
    auId = get_urlId(aUrl, refUrlId=refUrlId)
    print('', end=f'\r[curUrlId={curUrlId}] Adding [#{auId}]: {aUrl}')
    if scrawlLogs[aUrl]['status'] in [None, 'retry']:
        scrawlLogs[aUrl]['status'] = 'skip' if aUrl in skipUrls else qStatus
        return not (aUrl in skipUrls)
    return False

def logScrape(scrapeRes):
    ## could be interacting with a database instead of:
    global pageLimit, scrawlLogs, curUrlId, scrapeCt
    sUrl = scrapeRes['url'] if 'url' in scrapeRes else '!missingUrl'
    auId, em = get_urlId(sUrl), ''  # initialize url logId and error message
    print('', end=f'\r[{curUrlId}] Logging [{scrapeCt}][#{auId}]: {sUrl}')
    if type(scrapeRes) != dict:
        em = f'ScrapeResults should be <dict> not {type(scrapeRes)}'
        scrawlLogs[sUrl]['errorMessage'] = f'Invalid Format - {em}'
        scrawlLogs[sUrl]['stringifiedResult'] = str(scrapeRes)
        scrawlLogs[sUrl]['status'] = '!fail'
        return get_next_fromScrawlQ()
    for k, v in scrapeRes.items():
        if k != 'pageUrls': scrawlLogs[sUrl][k] = v
    if sUrl == '!missingUrl':
        scrapeRes['errorMessage'], em = '!missingUrl', '[missingUrl]'
    pul = scrapeRes['pageUrls'] if 'pageUrls' in scrapeRes else []
    pageUrls, ntq = (pul[:] if type(pul) == list else []), 0
    for i, pl in enumerate(pageUrls):
        pageUrls[i], qstat = get_urlId(pl), 'queued'
        if type(pageLimit) == int and pageLimit < ntq:
            qstat = 'page_limit_exceeded'
        ntq += int(addUrl_to_queue(pl, sUrl, auId, qstat))
    pageUrls = [p for p in pageUrls if p]
    scrawlLogs[sUrl]['pageUrlCt'] = len(pageUrls)
    scrawlLogs[sUrl]['newToQueue'] = ntq
    if pageUrls: scrawlLogs[sUrl]['pageUrlIds'] = pageUrls
    scrawlLogs[sUrl]['status'] = f'!error{em}' if (
        'errorMessage' in scrapeRes) else (('' if pageUrls else '?') + 'scraped')
    return get_next_fromScrawlQ()

def getPageUrls(sUrl, scSoup):
    pUrls = [a.get('href') for a in scSoup.select('a[href]')]
    ## CAN ADD SOME FILTERS, [? MAYBE BASED ON sUrl ?] [see sketch below] ##
    return pUrls
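
## one possible filter for getPageUrls [an illustrative sketch, not part of the
## original paste]: keep only links that stay on the same domain as the page
## they were found on [e.g. return sameSiteOnly(pUrls, sUrl) instead of pUrls]
from urllib.parse import urlparse

def sameSiteOnly(pUrls, sUrl):
    sDomain = urlparse(sUrl).netloc
    return [u for u in pUrls if urlparse(urljoin(sUrl, u)).netloc == sDomain]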

def getDataFromPage(srcUrl, sSoup):
    pageUrls = getPageUrls(srcUrl, sSoup)
    ######################## EXTRACT SOME PAGE DATA ########################
    ## [only extracting page title and [truncated] body text]
    pageTitle = sSoup.select_one('head title')
    pageTitle = pageTitle.get_text().strip() if pageTitle else None
    pageText = ' '.join([word for word in (
        sSoup.body if sSoup.body else sSoup
    ).get_text().split() if word])
    if len(pageText) > 70: pageText = pageText[:33] + '...' + pageText[-33:]
    ## for a function that can extract various data from html,
    #### take a look at "htreeToDict" at https://pastebin.com/BpjZSQPi
    #### [returns a python dictionary containing specified details]
    ########################################################################
    return {
        'url': srcUrl, 'pageTitle': pageTitle,
        'pageText': pageText, 'pageUrls': pageUrls
    }

def scrapeUrl(sUrl):
    global curUrlId, scrapeCt
    auId = get_urlId(sUrl, '[starter]' if sUrl == starterUrl else None)
    scrawlLogs[sUrl]['status'], scrapeCt = 'scraping', scrapeCt + 1
    print('', end=f'\r[{curUrlId}] Scraping [{scrapeCt}][#{auId}]: {sUrl}')
    scSoup = linkToSoup(sUrl, isv=False, returnErr=True)
    if type(scSoup) == str:
        return {'url': sUrl, 'errorMessage': scSoup}
    return getDataFromPage(sUrl, scSoup)

def setGlobals(varDict, clearLog=False):
    ## could be interacting with a database instead of:
    global starterUrl, pageLimit, maxScrapes, scrapeCt, curUrlId, skipUrls
    if 'starterUrl' in varDict: starterUrl = varDict['starterUrl']
    if 'pageLimit' in varDict: pageLimit = varDict['pageLimit']
    if 'curUrlId' in varDict: curUrlId = varDict['curUrlId']
    if 'skipUrls' in varDict:
        skipUrls = list(set(varDict['skipUrls'] + skipUrls))
    if 'maxScrapes' in varDict: maxScrapes = varDict['maxScrapes']
    if 'scrapeCt' in varDict: scrapeCt = varDict['scrapeCt']
    global scrawlLogs
    if clearLog: scrawlLogs = {}


## [probably wouldn't need if using a database]
def saveScrawlSess(logPath, varsPath, logMode='w'):
    logPath, varsPath = str(logPath), str(varsPath)
    if logMode != 'a': logMode = 'w'
    global starterUrl, pageLimit, maxScrapes, scrapeCt, curUrlId, skipUrls
    global scrawlLogs
    scrawlDF = pandas.DataFrame(list(scrawlLogs.values()))
    scrawlDF.to_csv(logPath, index=False, mode=logMode)
    print('Saved ScrawlLogs to', logPath)
    with open(varsPath, 'w') as f:
        json.dump({
            'starterUrl': starterUrl, 'pageLimit': pageLimit,
            'maxScrapes': maxScrapes, 'scrapeCt': scrapeCt,
            'curUrlId': curUrlId, 'skipUrls': skipUrls
        }, f, indent=4)
    print('Saved globals to', varsPath)


## [probably wouldn't need if using a database]
def loadScrawlSess(logPath, varsPath, mode='continue'):
    global scrawlLogs
    prevLog, prevVars = [], {}
    try: prevLog = pandas.read_csv(logPath).to_dict('records')
    except Exception as e: print(f'Unable to load log from {logPath}: {e}')
    try: prevVars = json.load(open(varsPath, 'r'))
    except Exception as e: print(f'Unable to load vars from {varsPath}: {e}')
    scrawlLogs = {r['url']: r for r in (list(scrawlLogs.values()) + prevLog)}
    if mode != 'continue': prevVars['scrapeCt'] = 0
    setGlobals(prevVars)
    ## [if using a database, this could be a scheduled procedure]
    if mode == 'q<--page_limit_exceeded':
        eligRefs, eligUrls = list(set([
            l['refUrlId'] for l in scrawlLogs.values()
            if l['status'] == 'page_limit_exceeded'
        ])), []
        for r in eligRefs:
            eligUrls += [
                u for u, l in scrawlLogs.items() if l['refUrlId'] == r
            ][:pageLimit]
        for u in eligUrls: scrawlLogs[u]['status'] = 'queued'
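

## a minimal end-to-end run [a sketch of intended usage - the fuller example is
## at https://pastebin.com/X62sxaNk; the file paths below are just placeholders]:
if __name__ == '__main__':
    setGlobals({'starterUrl': 'SOME_VALID_URL', 'maxScrapes': 30, 'pageLimit': 10})
    # loadScrawlSess('scrawlLog.csv', 'scrawlVars.json')  ## to resume a saved session
    nextUrl = get_next_fromScrawlQ()
    while nextUrl:
        nextUrl = logScrape(scrapeUrl(nextUrl))  ## scrape, log, then fetch next queued url
    saveScrawlSess('scrawlLog.csv', 'scrawlVars.json')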
 