## to fetch and parse html from a url ##
## if plain requests is not enough, try cloudscraper and then HTMLSession
##   [a minimal sketch chaining all 3 fallbacks follows the imports below]
### if the blockers are such that all 3 fail [and/or for dynamic pages]
##### try ScrapingAnt https://pastebin.com/5ibz2F6p OR
##### try Selenium https://pastebin.com/kEC9gPC8
from bs4 import BeautifulSoup
import requests  # [if you simply use requests] OR
# import cloudscraper  # [if you use cloudscraper]
# from requests_html import HTMLSession  # [if you use HTMLSession]
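
## ------------------------------------------------------------------
## a minimal fallback sketch of the requests -> cloudscraper -> HTMLSession
## chain described above [getWithFallbacks is not part of the original helper,
## and it assumes cloudscraper and requests_html are installed]:
def getWithFallbacks(url, **reqArgs):
    import cloudscraper  # imported lazily so the paste loads without them
    from requests_html import HTMLSession
    fetchers = [
        lambda u: requests.get(u, **reqArgs),
        lambda u: cloudscraper.create_scraper().get(u, **reqArgs),
        lambda u: HTMLSession().get(u, **reqArgs),
    ]
    for fetch in fetchers:
        try:
            r = fetch(url)
            if r.status_code == 200:
                return r  # first fetcher that gets past the blockers wins
        except Exception:
            pass  # blocked or errored - fall through to the next fetcher
    return None  # all 3 failed - consider ScrapingAnt / Selenium (links above)
## ------------------------------------------------------------------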
def linkToSoup(targetUrl, conf=None, isv=True, returnErr=False, returnResp=False):
    conf = conf if conf else {}  # avoid a shared mutable default argument
    bsParser = conf.get('parser', 'html.parser')
    reqArgs = {k: v for k, v in conf.items() if k != 'parser'}
    r = None  # so the returnResp path below is safe if the request itself raises
    try:
        r = requests.get(targetUrl, **reqArgs)  ## can pass headers/cookies/etc via conf
        # r = cloudscraper.create_scraper().get(targetUrl)
        # r = HTMLSession().get(targetUrl)
        if r.status_code == 200:
            ######## CAN ADD OTHER CHECKS ########
            if isv: print(repr(r), '[ parser:', bsParser, '] from', r.url)
            soup = BeautifulSoup(r.content, bsParser)
            return (soup, r) if returnResp else soup
        errMsg = f'<Response [{r.status_code} {r.reason}]> - Failed to scrape {targetUrl}'
    except Exception as e:
        errMsg = f'Failed to scrape {targetUrl} \n - errorMsg: "{str(e)}"'
    if isv: print(errMsg)
    ret1 = errMsg if returnErr else None
    return (ret1, r) if returnResp else ret1
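
## ------------------------------------------------------------------
## usage sketch [https://example.com and the headers below are placeholders]:
if __name__ == '__main__':
    soup = linkToSoup('https://example.com', conf={
        'headers': {'User-Agent': 'Mozilla/5.0'},  # forwarded to requests.get
        'parser': 'lxml',  # needs lxml installed; default is html.parser
    })
    if soup is not None:
        print(soup.title.get_text(strip=True) if soup.title else 'no <title> found')
    ## with returnErr/returnResp: res is the soup on success or the error
    ## message string on failure, and resp is the raw response (or None)
    res, resp = linkToSoup('https://example.com', returnErr=True, returnResp=True)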