Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ## for logging errors [to a csv file] while scraping with requests+bs4 ###
- ## sample of usage at https://stackoverflow.com/a/74709035/6146136 #######
- ## simpler example of error-logging at https://pastebin.com/vJTKLXk8 #####
- ### input arguments:
- #### logId [required] - arbitrary string/integer to help identify the origin of scrape
- #### url [required] - the url that was supposed to be scraped from
- #### req [optional, STRONGLY recommended] - a requests.Response object
- ##### [ like one returned by requests.get(url) ]
- #### msg [optional, recommended] - a brief explanation of why the error was logged
- #### rSoup [optional] - a BeautifulSoup object
- ##### [necessary only if you want to log prettified html]
- ##### [set conf['minify']=False and do NOT set conf['saveHtml']]
- ##### [ignored if you set conf['saveHtml']]
- #### conf [optional] - a dictionary with keys like
- ##### minify [default: True] - if True, remove excess whitespace from html
- ##### errLogFp [default: 'scrape_fails.csv'] - path to the CSV file to which error log is saved
- ##### saveHtml [default: False] - can be used to specify a file to save html to [just set True to generate a name]
- # import pandas as pd
- # import os
def logError_scrapes(logId, url, req=None, msg='UNKNOWN ERROR', rSoup=None, conf=None, returnVal=None):
    """Append one scraping-error row to a CSV log file.

    Args:
        logId: arbitrary string/integer identifying the origin of the scrape.
        url: the url that was supposed to be scraped.
        req: [optional, STRONGLY recommended] a requests.Response object
             (like one returned by requests.get(url)).
        msg: [optional, recommended] brief explanation of why the error was logged.
        rSoup: [optional] a BeautifulSoup object; only needed to log prettified
               html (set conf['minify']=False and do NOT set conf['saveHtml']);
               ignored when conf['saveHtml'] is set and req is available.
        conf: [optional] dict of settings:
              minify   [default: True] - squeeze excess whitespace out of the html
              errLogFp [default: 'scrape_fails.csv'] - path of the CSV error log
              saveHtml [default: False] - truthy -> save html to its own file and
                  log only that path; pass a path string, or True to auto-name it
              printMsg [default: False] - also print msg to stdout
              reqUrl / reqStatus - fallback column values used when req is None
        returnVal: passed straight through as the return value, so callers can
                   write ``return logError_scrapes(...)`` inline.

    Returns:
        returnVal, unchanged.
    """
    # confs and defaults (conf=None sentinel avoids a shared mutable default)
    conf = {} if conf is None else conf
    minify = conf.get('minify', True)
    errLogFp = conf.get('errLogFp', 'scrape_fails.csv')  # <-- path to error log csv
    saveHtml = conf.get('saveHtml', False)
    if conf.get('printMsg', False): print(msg)
    if not req and not rSoup:
        htmlStr = '[RECEIVED NO RESPONSE NOR BS4 OBJECT]'
    elif saveHtml:
        # form a name for the html file
        if not isinstance(saveHtml, str):
            htmlFp = f'errHtml_{logId}_{url.split("://", 1)[-1]}'
            rfp = os.path.split(errLogFp)[0]
        else:
            rfp, htmlFp = os.path.split(saveHtml)
        # drop a trailing '.html' extension; NOTE: the old strip('.html')
        # removed any of those characters from BOTH ends ('html_log.html'
        # became 'log'), so an explicit endswith check is used instead
        if htmlFp.endswith('.html'):
            htmlFp = htmlFp[:-len('.html')]
        # sanitize: keep alphanumerics and '._+', collapse everything else to '_'
        htmlFp = '_'.join(''.join(
            c if c.isalnum() or c in '._+' else ' ' for c in htmlFp
        ).split())
        htmlFp = os.path.join(rfp, f'{htmlFp}.html')
        # save as html (raw response bytes preferred over re-serialized soup)
        with open(htmlFp, 'wb') as f:
            f.write(req.content if req else rSoup.prettify().encode('utf-8'))
        htmlStr = f'[ saved to "{htmlFp}" ]'
    else:
        htmlStr = rSoup.prettify() if rSoup else req.text
        if minify:
            # minimize whitespace: drop blank lines, squeeze runs of spaces
            htmlStr = ' '.join(
                ' '.join(l.split()) for l in htmlStr.splitlines() if l.strip()
            )
    noRespMsg = '[RECEIVED NO RESPONSE OBJECT]'
    if req:
        reqUrl, reqStatus = req.url, f'{req.status_code} {req.reason}'
    else:
        reqUrl = conf.get('reqUrl', noRespMsg)
        reqStatus = conf.get('reqStatus', noRespMsg)
    # append one row; write the header only if the file doesn't exist yet
    pd.DataFrame([{
        'logId': logId, 'url': url, 'reqUrl': reqUrl, 'errorMsg': msg,
        'reqStatus': reqStatus, 'htmlString': htmlStr
    }]).to_csv(
        errLogFp, mode='a', index=False,
        header=not os.path.isfile(errLogFp)
    )
    return returnVal
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement