## to fetch and parse html from a url ##
## if plain requests is not enough, try cloudscraper and then HTMLSession
##   [a minimal sketch chaining all 3 fallbacks follows the imports below]
### if the blockers are such that all 3 fail [and/or for dynamic pages]
##### try ScrapingAnt https://pastebin.com/5ibz2F6p OR
##### try Selenium https://pastebin.com/kEC9gPC8
from bs4 import BeautifulSoup
import requests  # [if you simply use requests] OR
# import cloudscraper  # [if you use cloudscraper]
# from requests_html import HTMLSession  # [if you use HTMLSession]
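
## ------------------------------------------------------------------
## a minimal fallback sketch of the requests -> cloudscraper -> HTMLSession
## chain described above [getWithFallbacks is not part of the original helper,
## and it assumes cloudscraper and requests_html are installed]:
def getWithFallbacks(url, **reqArgs):
    import cloudscraper  # imported lazily so the paste loads without them
    from requests_html import HTMLSession
    fetchers = [
        lambda u: requests.get(u, **reqArgs),
        lambda u: cloudscraper.create_scraper().get(u, **reqArgs),
        lambda u: HTMLSession().get(u, **reqArgs),
    ]
    for fetch in fetchers:
        try:
            r = fetch(url)
            if r.status_code == 200:
                return r  # first fetcher that gets past the blockers wins
        except Exception:
            pass  # blocked or errored - fall through to the next fetcher
    return None  # all 3 failed - consider ScrapingAnt / Selenium (links above)
## ------------------------------------------------------------------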
def linkToSoup(targetUrl, conf=None, isv=True, returnErr=False, returnResp=False):
    conf = conf if conf else {}  # avoid a shared mutable default argument
    bsParser = conf.get('parser', 'html.parser')
    reqArgs = {k: v for k, v in conf.items() if k != 'parser'}
    r = None  # so the returnResp path below is safe if the request itself raises
    try:
        r = requests.get(targetUrl, **reqArgs)  ## can pass headers/cookies/etc via conf
        # r = cloudscraper.create_scraper().get(targetUrl)
        # r = HTMLSession().get(targetUrl)
        if r.status_code == 200:
            ######## CAN ADD OTHER CHECKS ########
            if isv: print(repr(r), '[ parser:', bsParser, '] from', r.url)
            soup = BeautifulSoup(r.content, bsParser)
            return (soup, r) if returnResp else soup
        errMsg = f'<Response [{r.status_code} {r.reason}]> - Failed to scrape {targetUrl}'
    except Exception as e:
        errMsg = f'Failed to scrape {targetUrl} \n - errorMsg: "{str(e)}"'
    if isv: print(errMsg)
    ret1 = errMsg if returnErr else None
    return (ret1, r) if returnResp else ret1
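
## ------------------------------------------------------------------
## usage sketch [https://example.com and the headers below are placeholders]:
if __name__ == '__main__':
    soup = linkToSoup('https://example.com', conf={
        'headers': {'User-Agent': 'Mozilla/5.0'},  # forwarded to requests.get
        'parser': 'lxml',  # needs lxml installed; default is html.parser
    })
    if soup is not None:
        print(soup.title.get_text(strip=True) if soup.title else 'no <title> found')
    ## with returnErr/returnResp: res is the soup on success or the error
    ## message string on failure, and resp is the raw response (or None)
    res, resp = linkToSoup('https://example.com', returnErr=True, returnResp=True)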