Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
## some examples of usage can be found at https://pastebin.com/E3sCEr9r ##
## simpler version [with only a fixed-wait option] at https://pastebin.com/VLZ2vPYK ##
# Takes a URL and returns a BeautifulSoup object (or None/errorMsg if there is an error) #
## For when BeautifulSoup(requests.get(url).content) is not enough - for
#### sites with blockers and/or
#### dynamic pages loaded with JS/APIs and/or
#### sites that require you to confirm age/cookies/etc and/or close pop-up dialogs
## [if you want a quick tutorial on selenium, see https://www.scrapingbee.com/blog/selenium-python/]
#### REQUIRED: download chromedriver.exe from https://chromedriver.chromium.org/downloads ####
#### [AND copy chromedriver.exe to the same folder as this py file] ####
- import time
- from bs4 import BeautifulSoup
- from selenium import webdriver
- from selenium.webdriver.common.by import By
- from selenium.webdriver.support.ui import WebDriverWait
- from selenium.webdriver.support import expected_conditions as EC
def linkToSoup_selenium(
        l, ecx=None, clickFirst=None, strictMode=False, by_method='x',
        scrollN=0, tmout=25, returnErr=False, fparser='html.parser', isv=True):
    """Load *l* in a Selenium-driven Chrome and return a BeautifulSoup of the
    rendered page, or None (or an error-message string) on failure.

    Parameters:
        l: URL to scrape.
        ecx: selector(s) (str or list of str) that must become visible before
            the page source is captured.
        clickFirst: selector(s) (str or list of str) to click first (e.g.
            cookie/age-confirmation buttons, pop-up closers).
        strictMode: if True, abort when an ecx/clickFirst selector cannot be
            loaded/clicked; if False, log (when isv) and continue.
        by_method: any string containing 'css' selects By.CSS_SELECTOR;
            anything else means By.XPATH.
        scrollN: pass a (count, pause_seconds) tuple to scroll to the bottom
            `count` times with `pause_seconds` between scrolls.
        tmout: WebDriverWait timeout in seconds; pass it as a *string* of
            digits to additionally sleep that long after page load.
        returnErr: if True, failures return the error message instead of None.
        fparser: parser name handed to BeautifulSoup.
        isv: verbose flag - print progress/error messages.

    Returns:
        bs4.BeautifulSoup on success; on failure, the error string when
        returnErr is truthy, else None.
    """
    # scrollElToTop = "arguments[0].scrollIntoView(true);"
    scrollElToBottom = "arguments[0].scrollIntoView(false);"
    scrollToBottom = "window.scrollTo(0, document.body.scrollHeight);"
    driver = None  # so the finally block can tell whether Chrome ever started
    try:
        by_xc = By.CSS_SELECTOR if 'css' in by_method else By.XPATH
        # NOTE(review): the positional executable_path argument was removed in
        # Selenium 4 - migrate to Service('chromedriver.exe') when upgrading.
        # [chromedriver.exe is expected in the same folder as this py file]
        driver = webdriver.Chrome('chromedriver.exe')

        # send tmout as string --> one extra fixed wait after page load
        extraWait = False
        if not isinstance(tmout, (int, float)):
            if str(tmout).isdigit():
                tmout = int(str(tmout))
                extraWait = True
            else:
                tmout = 25  # default
        # driver.set_page_load_timeout(tmout)

        # shorthands for the wait helpers used below
        wwait_til = WebDriverWait(driver, tmout).until
        ecc = EC.element_to_be_clickable
        ecv = EC.visibility_of_all_elements_located

        driver.get(l)  # go to link
        driver.maximize_window()
        if extraWait:
            time.sleep(tmout)  # fixed wait requested via string tmout

        # optional repeated scroll-to-bottom (for infinite-scroll pages)
        if isinstance(scrollN, tuple) and len(scrollN) == 2:
            time.sleep(scrollN[1])
            for _ in range(scrollN[0]):
                driver.execute_script(scrollToBottom)
                time.sleep(scrollN[1])

        # if something needs to be confirmed by click
        if clickFirst:
            # can pass as either string (single) or list (multiple)
            if isinstance(clickFirst, list):
                clickFirst = [str(c) for c in clickFirst]
            else:
                clickFirst = [str(clickFirst)]
            for cf in clickFirst:
                try:
                    wwait_til(ecc((by_xc, cf)))
                    cfEl = driver.find_element(by_xc, cf)
                    # scroll the element into view before clicking it
                    driver.execute_script(scrollElToBottom, cfEl)
                    cfEl.click()
                except Exception as e:
                    errMsg = f'could not click [{cf}] - {type(e)}: {e}'
                    if strictMode:
                        if isv: print(f'quitting bc {errMsg}')
                        return errMsg if returnErr else None
                    elif isv: print(f'[continuing even though] {errMsg}')

        # if some section needs to be loaded first
        if ecx:
            # can pass as either string (single) or list (multiple)
            if isinstance(ecx, list):
                ecx = [str(e) for e in ecx]
            else:
                ecx = [str(ecx)]
            for e in ecx:
                try:
                    wwait_til(ecv((by_xc, e)))
                except Exception as ex:
                    errMsg = f'could not load [{e}] - {type(ex)}: {ex}'
                    if strictMode:
                        if isv: print(f'quitting bc {errMsg}')
                        return errMsg if returnErr else None
                    elif isv: print(f'[continuing even though] {errMsg}')

        return BeautifulSoup(driver.page_source, fparser)
    except Exception as e:
        errMsg = f'could not scrape [{l}] \n{type(e)}: {e}'
        if isv: print(errMsg)
        return errMsg if returnErr else None
    finally:
        # Bug fix: the original only called driver.close() on the success
        # path, leaking the browser + chromedriver process on strictMode
        # returns and on exceptions. quit() (not close()) shuts down the
        # whole session, and finally runs on every exit path.
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                pass  # best-effort cleanup; never mask the real outcome
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement