Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
## some examples of usage can be found at https://pastebin.com/E3sCEr9r ##
## simpler version [with only a fixed-wait option] at https://pastebin.com/VLZ2vPYK ##
# Takes a URL and returns a BeautifulSoup object (or None/errorMsg if there is an error) #
## For when BeautifulSoup(requests.get(url).content) is not enough - for
#### sites with blockers and/or
#### dynamic pages loaded with JS/APIs and/or
#### sites that require you to confirm age/cookies/etc and/or close pop-up dialogs
## [if you want a quick tutorial on selenium, see https://www.scrapingbee.com/blog/selenium-python/]
#### REQUIRED: download chromedriver.exe from https://chromedriver.chromium.org/downloads ####
#### [AND copy chromedriver.exe to the same folder as this py file] ####
- import time
- from bs4 import BeautifulSoup
- from selenium import webdriver
- from selenium.webdriver.common.by import By
- from selenium.webdriver.support.ui import WebDriverWait
- from selenium.webdriver.support import expected_conditions as EC
def linkToSoup_selenium(
        l, ecx=None, clickFirst=None, strictMode=False, by_method='x',
        scrollN=0, tmout=25, returnErr=False, fparser='html.parser', isv=True):
    """Load *l* in a Selenium-driven Chrome and return a BeautifulSoup of the
    rendered page, or None (or an error-message string) on failure.

    Parameters:
        l: URL to scrape.
        ecx: selector(s) (str or list of str) that must become visible before
            the page source is captured.
        clickFirst: selector(s) (str or list of str) to click first (e.g.
            cookie/age-confirmation buttons, pop-up closers).
        strictMode: if True, abort when an ecx/clickFirst selector cannot be
            loaded/clicked; if False, log (when isv) and continue.
        by_method: any string containing 'css' selects By.CSS_SELECTOR;
            anything else means By.XPATH.
        scrollN: pass a (count, pause_seconds) tuple to scroll to the bottom
            `count` times with `pause_seconds` between scrolls.
        tmout: WebDriverWait timeout in seconds; pass it as a *string* of
            digits to additionally sleep that long after page load.
        returnErr: if True, failures return the error message instead of None.
        fparser: parser name handed to BeautifulSoup.
        isv: verbose flag - print progress/error messages.

    Returns:
        bs4.BeautifulSoup on success; on failure, the error string when
        returnErr is truthy, else None.
    """
    # scrollElToTop = "arguments[0].scrollIntoView(true);"
    scrollElToBottom = "arguments[0].scrollIntoView(false);"
    scrollToBottom = "window.scrollTo(0, document.body.scrollHeight);"
    driver = None  # so the finally block can tell whether Chrome ever started
    try:
        by_xc = By.CSS_SELECTOR if 'css' in by_method else By.XPATH
        # NOTE(review): the positional executable_path argument was removed in
        # Selenium 4 - migrate to Service('chromedriver.exe') when upgrading.
        # [chromedriver.exe is expected in the same folder as this py file]
        driver = webdriver.Chrome('chromedriver.exe')

        # send tmout as string --> one extra fixed wait after page load
        extraWait = False
        if not isinstance(tmout, (int, float)):
            if str(tmout).isdigit():
                tmout = int(str(tmout))
                extraWait = True
            else:
                tmout = 25  # default
        # driver.set_page_load_timeout(tmout)

        # shorthands for the wait helpers used below
        wwait_til = WebDriverWait(driver, tmout).until
        ecc = EC.element_to_be_clickable
        ecv = EC.visibility_of_all_elements_located

        driver.get(l)  # go to link
        driver.maximize_window()
        if extraWait:
            time.sleep(tmout)  # fixed wait requested via string tmout

        # optional repeated scroll-to-bottom (for infinite-scroll pages)
        if isinstance(scrollN, tuple) and len(scrollN) == 2:
            time.sleep(scrollN[1])
            for _ in range(scrollN[0]):
                driver.execute_script(scrollToBottom)
                time.sleep(scrollN[1])

        # if something needs to be confirmed by click
        if clickFirst:
            # can pass as either string (single) or list (multiple)
            if isinstance(clickFirst, list):
                clickFirst = [str(c) for c in clickFirst]
            else:
                clickFirst = [str(clickFirst)]
            for cf in clickFirst:
                try:
                    wwait_til(ecc((by_xc, cf)))
                    cfEl = driver.find_element(by_xc, cf)
                    # scroll the element into view before clicking it
                    driver.execute_script(scrollElToBottom, cfEl)
                    cfEl.click()
                except Exception as e:
                    errMsg = f'could not click [{cf}] - {type(e)}: {e}'
                    if strictMode:
                        if isv: print(f'quitting bc {errMsg}')
                        return errMsg if returnErr else None
                    elif isv: print(f'[continuing even though] {errMsg}')

        # if some section needs to be loaded first
        if ecx:
            # can pass as either string (single) or list (multiple)
            if isinstance(ecx, list):
                ecx = [str(e) for e in ecx]
            else:
                ecx = [str(ecx)]
            for e in ecx:
                try:
                    wwait_til(ecv((by_xc, e)))
                except Exception as ex:
                    errMsg = f'could not load [{e}] - {type(ex)}: {ex}'
                    if strictMode:
                        if isv: print(f'quitting bc {errMsg}')
                        return errMsg if returnErr else None
                    elif isv: print(f'[continuing even though] {errMsg}')

        return BeautifulSoup(driver.page_source, fparser)
    except Exception as e:
        errMsg = f'could not scrape [{l}] \n{type(e)}: {e}'
        if isv: print(errMsg)
        return errMsg if returnErr else None
    finally:
        # Bug fix: the original only called driver.close() on the success
        # path, leaking the browser + chromedriver process on strictMode
        # returns and on exceptions. quit() (not close()) shuts down the
        # whole session, and finally runs on every exit path.
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                pass  # best-effort cleanup; never mask the real outcome
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement