Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ## for logging errors [to a csv file] while scraping with requests+bs4 ###
- ## sample of usage at https://stackoverflow.com/a/74709035/6146136 #######
- ## simpler example of error-logging at https://pastebin.com/vJTKLXk8 #####
- ### input arguments:
- #### logId [required] - arbitrary string/integer to help identify the origin of scrape
- #### url [required] - the url that was supposed to be scraped from
- #### req [optional, STRONGLY recommended] - a requests.Response object
- ##### [ like one returned by requests.get(url) ]
- #### msg [optional, recommended] - a brief explanation of why the error was logged
- #### rSoup [optional] - a BeautifulSoup object
- ##### [necessary only if you want to log prettified html]
- ##### [set conf['minify']=False and do NOT set conf['saveHtml']]
- ##### [ignored if you set conf['saveHtml']]
- #### conf [optional] - a dictionary with keys like
- ##### minify [default: True] - if True, remove excess whitespace from html
- ##### errLogFp [default: 'scrape_fails.csv'] - path to the CSV file to which error log is saved
- ##### saveHtml [default: False] - can be used to specify a file to save html to [just set True to generate a name]
- # import pandas as pd
- # import os
def logError_scrapes(logId, url, req=None, msg='UNKNOWN ERROR', rSoup=None, conf=None, returnVal=None):
    """Append one scraping-error row to a CSV log file.

    Args:
        logId: arbitrary string/integer identifying the origin of the scrape.
        url: the url that was supposed to be scraped.
        req: [optional, STRONGLY recommended] a requests.Response object
             (like one returned by requests.get(url)).
        msg: [optional, recommended] brief explanation of why the error was logged.
        rSoup: [optional] a BeautifulSoup object; only needed to log prettified
               html (set conf['minify']=False and do NOT set conf['saveHtml']);
               ignored when conf['saveHtml'] is set and req is available.
        conf: [optional] dict of settings:
              minify   [default: True] - squeeze excess whitespace out of the html
              errLogFp [default: 'scrape_fails.csv'] - path of the CSV error log
              saveHtml [default: False] - truthy -> save html to its own file and
                  log only that path; pass a path string, or True to auto-name it
              printMsg [default: False] - also print msg to stdout
              reqUrl / reqStatus - fallback column values used when req is None
        returnVal: passed straight through as the return value, so callers can
                   write ``return logError_scrapes(...)`` inline.

    Returns:
        returnVal, unchanged.
    """
    # confs and defaults (conf=None sentinel avoids a shared mutable default)
    conf = {} if conf is None else conf
    minify = conf.get('minify', True)
    errLogFp = conf.get('errLogFp', 'scrape_fails.csv')  # <-- path to error log csv
    saveHtml = conf.get('saveHtml', False)
    if conf.get('printMsg', False): print(msg)
    if not req and not rSoup:
        htmlStr = '[RECEIVED NO RESPONSE NOR BS4 OBJECT]'
    elif saveHtml:
        # form a name for the html file
        if not isinstance(saveHtml, str):
            htmlFp = f'errHtml_{logId}_{url.split("://", 1)[-1]}'
            rfp = os.path.split(errLogFp)[0]
        else:
            rfp, htmlFp = os.path.split(saveHtml)
        # drop a trailing '.html' extension; NOTE: the old strip('.html')
        # removed any of those characters from BOTH ends ('html_log.html'
        # became 'log'), so an explicit endswith check is used instead
        if htmlFp.endswith('.html'):
            htmlFp = htmlFp[:-len('.html')]
        # sanitize: keep alphanumerics and '._+', collapse everything else to '_'
        htmlFp = '_'.join(''.join(
            c if c.isalnum() or c in '._+' else ' ' for c in htmlFp
        ).split())
        htmlFp = os.path.join(rfp, f'{htmlFp}.html')
        # save as html (raw response bytes preferred over re-serialized soup)
        with open(htmlFp, 'wb') as f:
            f.write(req.content if req else rSoup.prettify().encode('utf-8'))
        htmlStr = f'[ saved to "{htmlFp}" ]'
    else:
        htmlStr = rSoup.prettify() if rSoup else req.text
        if minify:
            # minimize whitespace: drop blank lines, squeeze runs of spaces
            htmlStr = ' '.join(
                ' '.join(l.split()) for l in htmlStr.splitlines() if l.strip()
            )
    noRespMsg = '[RECEIVED NO RESPONSE OBJECT]'
    if req:
        reqUrl, reqStatus = req.url, f'{req.status_code} {req.reason}'
    else:
        reqUrl = conf.get('reqUrl', noRespMsg)
        reqStatus = conf.get('reqStatus', noRespMsg)
    # append one row; write the header only if the file doesn't exist yet
    pd.DataFrame([{
        'logId': logId, 'url': url, 'reqUrl': reqUrl, 'errorMsg': msg,
        'reqStatus': reqStatus, 'htmlString': htmlStr
    }]).to_csv(
        errLogFp, mode='a', index=False,
        header=not os.path.isfile(errLogFp)
    )
    return returnVal
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement