Not a member of Pastebin yet?
                        Sign Up,
                        it unlocks many cool features!                    
## some examples of usage can be found at https://pastebin.com/E3sCEr9r ##
## simpler version [with only a fixed-wait option] at https://pastebin.com/VLZ2vPYK ##
# Takes a URL and returns a BeautifulSoup object (or None/errorMsg if there is an error) #
## For when BeautifulSoup(requests.get(url).content) is not enough - for
#### sites with blockers and/or
#### dynamic pages loaded with JS/APIs and/or
#### sites that require you to confirm age/cookies/etc and/or close pop-up dialogs
## [if you want a quick tutorial on selenium, see https://www.scrapingbee.com/blog/selenium-python/]
#### REQUIRED: download chromedriver.exe from https://chromedriver.chromium.org/downloads ####
#### [AND copy chromedriver.exe to the same folder as this py file] ####
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
 - def linkToSoup_selenium(
 - l, ecx=None, clickFirst=None, strictMode=False, by_method='x',
 - scrollN=0, tmout=25, returnErr=False, fparser='html.parser', isv=True):
 - # pass strictMode=True if you don't want to continue when ecx/clickFirst can't be loaded/clicked
 - # scrollElToTop = "arguments[0].scrollIntoView(true);"
 - scrollElToBottom = "arguments[0].scrollIntoView(false);"
 - scrollToBottom = "window.scrollTo(0, document.body.scrollHeight);"
 - try:
 - by_xc = By.CSS_SELECTOR if 'css' in by_method else By.XPATH
 - driver = webdriver.Chrome('chromedriver.exe')
 - # I copy chromedriver.exe to the same folder as this py file
 - # send tmout as string --> one extra wait
 - extraWait = False
 - if type(tmout) not in [int, float]:
 - if str(tmout).isdigit():
 - tmout = int(str(tmout))
 - extraWait = True
 - else: tmout = 25 # default
 - # driver.set_page_load_timeout(tmout)
 - # for shortening some lines
 - wwait_til = WebDriverWait(driver, tmout).until
 - ecc = EC.element_to_be_clickable
 - ecv = EC.visibility_of_all_elements_located
 - driver.get(l) # go to link
 - driver.maximize_window()
 - if extraWait: time.sleep(tmout) # wait
 - if type(scrollN) == tuple and len(scrollN) == 2:
 - time.sleep(scrollN[1])
 - for i in range(scrollN[0]):
 - driver.execute_script(scrollToBottom)
 - time.sleep(scrollN[1])
 - # if something needs to be confirmed by click
 - if clickFirst:
 - # can pass as either string (single) or list (multiple)
 - if type(clickFirst) == list: clickFirst = [str(c) for c in clickFirst]
 - else: clickFirst = [str(clickFirst)]
 - for cf in clickFirst:
 - try:
 - wwait_til(ecc((by_xc, cf)))
 - cfEl = driver.find_element(by_xc, cf)
 - driver.execute_script(scrollElToBottom, cfEl)
 - cfEl.click()
 - except Exception as e:
 - errMsg = f'could not click [{cf}] - {type(e)}: {e}'
 - if strictMode:
 - if isv: print(f'quitting bc {errMsg}')
 - return errMsg if returnErr else None
 - elif isv: print(f'[continuing even though] {errMsg}')
 - # if some section needs to be loaded first
 - if ecx:
 - # can pass as either string (single) or list (multiple)
 - if type(ecx) == list: ecx = [str(e) for e in ecx]
 - else: ecx = [str(ecx)]
 - for e in ecx:
 - try: wwait_til(ecv((by_xc, e)))
 - except Exception as ex:
 - errMsg = f'could not load [{e}] - {type(ex)}: {ex}'
 - if strictMode:
 - if isv: print(f'quitting bc {errMsg}')
 - return errMsg if returnErr else None
 - elif isv: print(f'[continuing even though] {errMsg}')
 - lSoup = BeautifulSoup(driver.page_source, fparser)
 - driver.close() # (just in case)
 - del driver # (just in case)
 - return lSoup
 - except Exception as e:
 - errMsg = f'could not scrape [{l}] \n{type(e)}: {e}'
 - if isv: print(errMsg)
 - return errMsg if returnErr else None
 
Advertisement
 
                    Add Comment                
                
                        Please, Sign In to add comment