Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ## FIRST, register & get an API Token from https://scrapingant.com/
- ## facilitates simple requests+bs4 for sites with blockers, but
- ## NOTE: the free tier allows only a limited number of requests per month
- ## NOTE: it can be very slow (but the proxies can be useful)
- ## request/cloudscraper/HTMLSession version[s] at https://pastebin.com/rBTr06vy
- ## [simplified] selenium version at https://pastebin.com/VLZ2vPYK
- ## sample usage: similar to first example at https://pastebin.com/E3sCEr9r
- import requests
- from bs4 import BeautifulSoup
- import urllib.parse
def linkToSoup_scrapingAnt(url_to_Scrape, pCountry=None, setResid=False,
                           apiKey=None, loadCss=None, fparser='html.parser',
                           isv=True, returnErr=False):
    """Fetch a page through the ScrapingAnt proxy API and parse it with BeautifulSoup.

    Args:
        url_to_Scrape: URL of the page to fetch.
        pCountry: optional proxy country code (more expensive proxy tier).
        setResid: if True, request a residential proxy (more expensive).
        apiKey: ScrapingAnt API token; falls back to the hard-coded defaultKey.
        loadCss: CSS selector the API should wait for before returning the page.
        fparser: parser name passed to BeautifulSoup.
        isv: verbose - print the request URL and any error message.
        returnErr: if True, return the error-message string on failure
            instead of None.

    Returns:
        BeautifulSoup of the fetched page on success; otherwise None
        (or the error-message string when returnErr is True).
    """
    defaultKey = 'YOUR_API_TOKEN' # paste here
    sa_api = 'https://api.scrapingant.com/v2/general'
    sa_key = str(apiKey) if apiKey else defaultKey
    qParams = {'url': url_to_Scrape, 'x-api-key': sa_key}
    if setResid: qParams['proxy_type'] = 'residential' # more expensive
    if pCountry: qParams['proxy_country'] = pCountry # more expensive
    if loadCss: qParams['wait_for_selector'] = loadCss
    reqUrl = f'{sa_api}?{urllib.parse.urlencode(qParams)}'
    if isv: print('fetching with ScrapingAnt:', url_to_Scrape, '\nwith ', reqUrl)
    r = requests.get(reqUrl)

    # ScrapingAnt signals API-level errors with a JSON body whose sole key
    # is "detail"; anything else (non-JSON or other keys) is page content.
    errMsg = None
    try:
        if [*r.json()] == ['detail']:
            errMsg = f'{r.json()["detail"]} [<response {r.status_code}> {r.reason}] {r.url}'
    except Exception:
        # body is not JSON -> a normal page response, not an API error
        pass
    if errMsg is None:
        if r.status_code == 200:
            return BeautifulSoup(r.content, fparser)
        errMsg = f'failed to fetch page [{r.status_code} {r.reason}] {r.url}'
    if isv: print(errMsg)
    return errMsg if returnErr else None
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement