Advertisement
Try95th

logError_scrapes

Dec 6th, 2022 (edited)
118
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.04 KB | None | 0 0
  1. ## for logging errors [to a csv file] while scraping with requests+bs4 ###
  2. ## sample of usage at https://stackoverflow.com/a/74709035/6146136 #######
  3. ## simpler example of error-logging at https://pastebin.com/vJTKLXk8 #####
  4.  
  5. ### input arguments:
  6. #### logId [required] - arbitrary string/integer to help identify the origin of scrape
  7. #### url [required] - the url that was supposed to be scraped from
  8. #### req [optional, STRONGLY recommended] - a requests.Response object
  9.    ##### [ like one returned by requests.get(url) ]
  10. #### msg [optional, recommended] - a brief explanation of why the error was logged
  11. #### rSoup [optional] - a BeautifulSoup object
  12.    ##### [necessary only if you want to log prettified html]
  13.    ##### [set conf['minify']=False and do NOT set conf['saveHtml']]
  14.    ##### [ignored if you set conf['saveHtml']]
  15. #### conf [optional] - a dictionary with keys like
  16.    ##### minify [default: True] - if True, remove excess whitespace from html
   17.    ##### errLogFp [default: 'scrape_fails.csv'] - path to the CSV file to which the error log is appended
  18.    ##### saveHtml [default: False] - can be used to specify a file to save html to [just set True to generate a name]
  19.  
  20.  
  21. # import pandas as pd
  22. # import os
  23.  
  24. def logError_scrapes(logId, url, req=None, msg='UNKNOWN ERROR', rSoup=None, conf={}, returnVal=None):
  25.     ## confs and defaults
  26.     minify = conf.get('minify', True)
  27.     errLogFp = conf.get('errLogFp', 'scrape_fails.csv') #<-- path to error log csv
  28.     saveHtml = conf.get('saveHtml', False)  
  29.     if conf.get('printMsg', False): print(msg)
  30.  
  31.     if not req and not rSoup:
  32.         htmlStr = '[RECIEVED NO RESPONSE NOR BS4 OBJECT]'
  33.     elif saveHtml:
  34.         # form a name for html file
  35.         if type(saveHtml) != str:
  36.             htmlFp = f'errHtml_{logId}_{url.split("://", 1)[-1]}'
  37.             rfp = os.path.split(errLogFp)[0]
  38.         else: rfp, htmlFp = os.path.split(saveHtml)
  39.         htmlFp = '_'.join(w for w in ''.join([
  40.             c if str(c).isalpha() or c.isdigit() or c in '._+'
  41.             else ' ' for c in htmlFp.strip('.html')
  42.         ]).split())
  43.         htmlFp = os.path.join(rfp, f'{htmlFp}.html')
  44.  
  45.         # save as html
  46.         with open(htmlFp, 'wb') as f:
  47.             f.write(req.content if req else rSoup.prettify().encode('utf-8'))  
  48.         htmlStr = f'[ saved to "{htmlFp}" ]'
  49.     else:
  50.         htmlStr = rSoup.prettify() if rSoup else req.text
  51.         if minify:
  52.             htmlStr = ' '.join([
  53.                 ' '.join(w for w in l.split() if w) for l in
  54.                 htmlStr.splitlines() if l.strip()
  55.             ]) ## [ minimize whitespace ]
  56.  
  57.     noRespMsg = '[RECIEVED NO RESPONSE OBJECT]'
  58.     if req: reqUrl, reqStatus = req.url, f'{req.status_code} {req.reason}'
  59.     else: reqUrl, reqStatus = conf.get('reqUrl', noRespMsg), conf.get('reqStatus', noRespMsg)
  60.     pd.DataFrame([{
  61.         'logId': logId, 'url': url, 'reqUrl': reqUrl, 'errorMsg': msg,
  62.         'reqStatus': reqStatus, 'htmlString': htmlStr
  63.     }]).to_csv(
  64.         errLogFp, mode='a', index=False,
  65.         header=not os.path.isfile(errLogFp)
  66.     )
  67.  
  68.     return returnVal
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement