## for scraping urls and maintaining queues ##
## a simplified version [using csv, json and global variables instead of a database] ##
## example of usage at https://pastebin.com/X62sxaNk

from urllib.parse import urljoin
import pandas, json

##################################################################################################
### def linkToSoup... COPY [WITH IMPORTS] from https://pastebin.com/rBTr06vy [adjust as needed] ###
##################################################################################################
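## [NOT the original helper -- just a minimal stand-in sketch so this file runs on its own]
## assumes requests + bs4 are available; guesses that "isv" means "is verbose" and that
## "returnErr=True" means "return the error message string instead of None on failure",
## based only on how linkToSoup is called below. prefer the real version from the paste above.
import requests
from bs4 import BeautifulSoup

def linkToSoup(tUrl, isv=True, returnErr=False):
    try:
        r = requests.get(tUrl, timeout=30)
        if r.status_code == 200 and 'text/html' in r.headers.get('content-type', ''):
            return BeautifulSoup(r.content, 'html.parser')
        errMsg = f'<Response [{r.status_code}]> from {tUrl}'
    except Exception as e:
        errMsg = f'failed to scrape {tUrl}: {e}'
    if isv: print(errMsg)
    return errMsg if returnErr else None
##################################################################################################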

starterUrl = 'SOME_VALID_URL'

## could be stored in a database, but instead:
pageLimit = 10
maxScrapes = 30
scrapeCt = 0
curUrlId = 0
scrawlLogs = {}  ## [in a database this would be one or more tables]
skipUrls = []

def get_next_fromScrawlQ():
    ## could be interacting with a database instead of:
    global maxScrapes, scrawlLogs
    if scrawlLogs == {}: return starterUrl
    if scrapeCt > maxScrapes: return None
    elig = [k for k, l in scrawlLogs.items() if l['status'] == 'queued'][:1]
    return elig[0] if elig else None

def get_urlId(iUrl, refUrlId=None):
    ## could be interacting with a database instead of:
    global curUrlId, scrawlLogs
    print('', end=f'\r[curUrlId={curUrlId}] getting id for {iUrl}')
    if (not iUrl) or iUrl[0] == '!': iUrl = f'{iUrl}_{curUrlId+1}'
    if iUrl not in scrawlLogs:
        if not refUrlId: return None
        scrawlLogs[iUrl] = {'url': iUrl, 'refUrlId': refUrlId, 'status': None}
    if 'urlId' in scrawlLogs[iUrl] and scrawlLogs[iUrl]['urlId']:
        return scrawlLogs[iUrl]['urlId']
    curUrlId += 1
    scrawlLogs[iUrl]['urlId'] = curUrlId
    return curUrlId

def addUrl_to_queue(aUrl, rootUrl, refUrlId, qStatus='queued'):
    ## could be interacting with a database instead of:
    global scrawlLogs, curUrlId
    aUrl = urljoin(rootUrl, aUrl)
    if aUrl.startswith('javascript:'): return False
    auId = get_urlId(aUrl, refUrlId=refUrlId)
    print('', end=f'\r[curUrlId={curUrlId}] Adding [#{auId}]: {aUrl}')
    if scrawlLogs[aUrl]['status'] in [None, 'retry']:
        scrawlLogs[aUrl]['status'] = 'skip' if aUrl in skipUrls else qStatus
        return not (aUrl in skipUrls)
    return False

def logScrape(scrapeRes):
    ## could be interacting with a database instead of:
    global pageLimit, scrawlLogs, curUrlId, scrapeCt
    sUrl = scrapeRes['url'] if 'url' in scrapeRes else '!missingUrl'
    auId, em = get_urlId(sUrl), ''  # initiate url logId and error message
    print('', end=f'\r[{curUrlId}] Logging [{scrapeCt}][#{auId}]: {sUrl}')
    if type(scrapeRes) != dict:
        em = f'ScrapeResults should be <dict> not {type(scrapeRes)}'
        scrawlLogs[sUrl]['errorMessage'] = f'Invalid Format - {em}'
        scrawlLogs[sUrl]['stringifiedResult'] = str(scrapeRes)
        scrawlLogs[sUrl]['status'] = '!fail'
        return get_next_fromScrawlQ()
    for k, v in scrapeRes.items():
        if k != 'pageUrls': scrawlLogs[sUrl][k] = v
    if sUrl == '!missingUrl':
        scrapeRes['errorMessage'], em = '!missingUrl', '[missingUrl]'
    pul = scrapeRes['pageUrls'] if 'pageUrls' in scrapeRes else []
    pageUrls, ntq = (pul[:] if type(pul) == list else []), 0
    for i, pl in enumerate(pageUrls):
        pageUrls[i], qstat = get_urlId(pl), 'queued'
        if type(pageLimit) == int and pageLimit < ntq:
            qstat = 'page_limit_exceeded'
        ntq += int(addUrl_to_queue(pl, sUrl, auId, qstat))
    pageUrls = [p for p in pageUrls if p]
    scrawlLogs[sUrl]['pageUrlCt'] = len(pageUrls)
    scrawlLogs[sUrl]['newToQueue'] = ntq
    if pageUrls: scrawlLogs[sUrl]['pageUrlIds'] = pageUrls
    scrawlLogs[sUrl]['status'] = f'!error{em}' if (
        'errorMessage' in scrapeRes) else (('' if pageUrls else '?') + 'scraped')
    return get_next_fromScrawlQ()

def getPageUrls(sUrl, scSoup):
    pUrls = [a.get('href') for a in scSoup.select('a[href]')]
    ## CAN ADD SOME FILTERS, [? MAYBE BASED ON sUrl ?] [one possible filter is sketched below] ##
    return pUrls
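
## [hypothetical example, not part of the original] one possible filter for getPageUrls:
## keep only links that resolve to the same domain as the page they were found on
from urllib.parse import urlparse

def keepSameDomain(sUrl, pUrls):
    sDomain = urlparse(sUrl).netloc
    return [u for u in pUrls if urlparse(urljoin(sUrl, u)).netloc == sDomain]
## could be applied inside getPageUrls with something like: pUrls = keepSameDomain(sUrl, pUrls)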

def getDataFromPage(srcUrl, sSoup):
    pageUrls = getPageUrls(srcUrl, sSoup)
    ######################## EXTRACT SOME PAGE DATA ########################
    ## [only extracting page title and [truncated] body text]
    pageTitle = sSoup.select_one('head title')
    pageTitle = pageTitle.get_text().strip() if pageTitle else None
    pageText = ' '.join([word for word in (
        sSoup.body if sSoup.body else sSoup
    ).get_text().split() if word])
    if len(pageText) > 70: pageText = pageText[:33] + '...' + pageText[-33:]
    ## for a function that can extract various data from html,
    #### take a look at "htreeToDict" at https://pastebin.com/BpjZSQPi
    #### [returns a python dictionary containing specified details]
    ########################################################################
    return {
        'url': srcUrl, 'pageTitle': pageTitle,
        'pageText': pageText, 'pageUrls': pageUrls
    }

def scrapeUrl(sUrl):
    global curUrlId, scrapeCt
    auId = get_urlId(sUrl, '[starter]' if sUrl == starterUrl else None)
    scrawlLogs[sUrl]['status'], scrapeCt = 'scraping', scrapeCt + 1
    print('', end=f'\r[{curUrlId}] Scraping [{scrapeCt}][#{auId}]: {sUrl}')
    scSoup = linkToSoup(sUrl, isv=False, returnErr=True)
    if type(scSoup) == str:
        return {'url': sUrl, 'errorMessage': scSoup}
    return getDataFromPage(sUrl, scSoup)

def setGlobals(varDict, clearLog=False):
    ## could be interacting with a database instead of:
    global starterUrl, pageLimit, maxScrapes, scrapeCt, curUrlId, skipUrls
    if 'starterUrl' in varDict: starterUrl = varDict['starterUrl']
    if 'pageLimit' in varDict: pageLimit = varDict['pageLimit']
    if 'curUrlId' in varDict: curUrlId = varDict['curUrlId']
    if 'skipUrls' in varDict:
        skipUrls = list(set(varDict['skipUrls'] + skipUrls))
    if 'maxScrapes' in varDict: maxScrapes = varDict['maxScrapes']
    if 'scrapeCt' in varDict: scrapeCt = varDict['scrapeCt']
    global scrawlLogs
    if clearLog: scrawlLogs = {}

## [probably wouldn't need if using a database]
def saveScrawlSess(logPath, varsPath, logMode='w'):
    logPath, varsPath = str(logPath), str(varsPath)
    if logMode != 'a': logMode = 'w'
    global starterUrl, pageLimit, maxScrapes, scrapeCt, curUrlId, skipUrls
    global scrawlLogs
    scrawlDF = pandas.DataFrame(list(scrawlLogs.values()))
    scrawlDF.to_csv(logPath, index=False, mode=logMode)
    print('Saved ScrawlLogs to', logPath)
    with open(varsPath, 'w') as f:
        json.dump({
            'starterUrl': starterUrl, 'pageLimit': pageLimit,
            'maxScrapes': maxScrapes, 'scrapeCt': scrapeCt,
            'curUrlId': curUrlId, 'skipUrls': skipUrls
        }, f, indent=4)
    print('Saved globals to', varsPath)

## [probably wouldn't need if using a database]
def loadScrawlSess(logPath, varsPath, mode='continue'):
    global scrawlLogs
    prevLog, prevVars = [], {}
    try: prevLog = pandas.read_csv(logPath).to_dict('records')
    except Exception as e: print(f'Unable to load log from {logPath}: {e}')
    try: prevVars = json.load(open(varsPath, 'r'))
    except Exception as e: print(f'Unable to load vars from {varsPath}: {e}')
    scrawlLogs = {r['url']: r for r in (list(scrawlLogs.values()) + prevLog)}
    if mode != 'continue': prevVars['scrapeCt'] = 0
    setGlobals(prevVars)

    ## [if using database, this could be a scheduled procedure]
    if mode == 'q<--page_limit_exceeded':
        eligRefs, eligUrls = list(set([
            l['refUrlId'] for l in scrawlLogs.values()
            if l['status'] == 'page_limit_exceeded'
        ])), []
        for r in eligRefs:
            eligUrls += [
                u for u, l in scrawlLogs.items() if l['refUrlId'] == r
            ][:pageLimit]
        for u in eligUrls: scrawlLogs[u]['status'] = 'queued'
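
##################################################################################################
## [rough usage sketch only -- NOT the example linked at the top; see https://pastebin.com/X62sxaNk
##  for the actual one] the general flow, with 'https://example.com' and the two file names below
##  used purely as placeholders:
if __name__ == '__main__':
    setGlobals({'starterUrl': 'https://example.com', 'pageLimit': 5, 'maxScrapes': 15}, clearLog=True)
    nextUrl = get_next_fromScrawlQ()
    while nextUrl:
        nextUrl = logScrape(scrapeUrl(nextUrl))  ## logScrape returns the next queued url [or None]
    saveScrawlSess('scrawlLogs.csv', 'scrawlVars.json')
##################################################################################################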