Try95th

linkToSoup

Nov 16th, 2022 (edited)
## fetch and parse HTML from a URL ##
## if simple requests is not enough, try cloudscraper and then HTMLSession
### if the blockers are such that all 3 fail [and/or for dynamic pages]
##### try ScrapingAnt https://pastebin.com/5ibz2F6p OR
##### try Selenium https://pastebin.com/kEC9gPC8

from bs4 import BeautifulSoup
import requests  # [if you simply use requests] OR
# import cloudscraper  # [if you use cloudscraper]
# from requests_html import HTMLSession  # [if you use HTMLSession]


def linkToSoup(targetUrl, conf=None, isv=True, returnErr=False, returnResp=False):
    conf = conf if conf is not None else {}  # avoids the mutable-default-argument pitfall
    bsParser = conf.get('parser', 'html.parser')  # 'parser' goes to BeautifulSoup
    reqArgs = {k: v for k, v in conf.items() if k != 'parser'}  # everything else goes to the request
    r = None  # so the returnResp path below works even if the request itself raises
    try:
        r = requests.get(targetUrl, **reqArgs)  ## can pass headers/cookies/etc via conf
        # r = cloudscraper.create_scraper().get(targetUrl)
        # r = HTMLSession().get(targetUrl)

        if r.status_code == 200:
            ######## CAN ADD OTHER CHECKS ########
            if isv: print(repr(r), '[ parser:', bsParser, '] from', r.url)
            soup = BeautifulSoup(r.content, bsParser)
            return (soup, r) if returnResp else soup

        errMsg = f'<Response [{r.status_code} {r.reason}]> - '
        errMsg = f'{errMsg}Failed to scrape {targetUrl}'
    except Exception as e:
        errMsg = f'Failed to scrape {targetUrl} \n - errorMsg: "{str(e)}"'
    if isv: print(errMsg)

    ret1 = errMsg if returnErr else None
    return (ret1, r) if returnResp else ret1

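## ----------------------------------------------------------------------------------
## usage sketch [added for illustration - the URL, headers and conf keys below are
## assumed typical inputs, not part of the original paste]
if __name__ == '__main__':
    ## 'parser' is consumed by BeautifulSoup; every other conf key is forwarded to requests.get
    soup = linkToSoup('https://example.com', conf={
        'headers': {'User-Agent': 'Mozilla/5.0'},  # forwarded to requests.get
        'timeout': 10,                             # forwarded to requests.get
        'parser': 'html.parser',                   # used by BeautifulSoup
    })
    if soup is not None:
        print(soup.title.get_text(strip=True) if soup.title else '[no title]')

    ## returnErr=True -> get the error message instead of None on failure
    ## returnResp=True -> also get the raw response [or None if the request itself raised]
    soupOrErr, resp = linkToSoup('https://example.com/missing-page',
                                 returnErr=True, returnResp=True)
    if isinstance(soupOrErr, str): print('scrape failed:', soupOrErr)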