Advertisement
Try95th

linkToSoup_selenium.py

Nov 13th, 2022 (edited)
474
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.43 KB | None | 0 0
  1. ## some examples of usage can be found at https://pastebin.com/E3sCEr9r ##
  2. ## simpler version [with only a fixed-wait option] at https://pastebin.com/VLZ2vPYK ##
  3.  
  4. # Takes a URL and returns a BeautifulSoup object (or None/errorMsg if there is an error) #
  5. ## For when BeautifulSoup(requests.get(url).content) is not enough - for
  6. #### sites with blockers and/or
  7. #### dynamic pages loaded with JS/APIs and/or
  8. #### sites that require you to confirm age/cookies/etc and/or close pop-up dialogs
  9.  
  10. ## [if you want a quick tutorial on selenium, see https://www.scrapingbee.com/blog/selenium-python/]
  11.  
  12. #### REQUIRED: download chromedriver.exe from https://chromedriver.chromium.org/downloads ####
  13. #### [AND copy chromedriver.exe to the same folder as this py file] ####
  14.  
  15. import time
  16. from bs4 import BeautifulSoup
  17. from selenium import webdriver
  18. from selenium.webdriver.common.by import By
  19. from selenium.webdriver.support.ui import WebDriverWait
  20. from selenium.webdriver.support import expected_conditions as EC
  21.  
  22. def linkToSoup_selenium(
  23.     l, ecx=None, clickFirst=None, strictMode=False, by_method='x',
  24.     scrollN=0, tmout=25, returnErr=False, fparser='html.parser', isv=True):
  25.     # pass strictMode=True if you don't want to continue when ecx/clickFirst can't be loaded/clicked
  26.    
  27.     # scrollElToTop = "arguments[0].scrollIntoView(true);"
  28.     scrollElToBottom = "arguments[0].scrollIntoView(false);"
  29.     scrollToBottom = "window.scrollTo(0, document.body.scrollHeight);"
  30.    
  31.    
  32.     try:
  33.         by_xc = By.CSS_SELECTOR if 'css' in by_method else By.XPATH
  34.         driver = webdriver.Chrome('chromedriver.exe')
  35.         # I copy chromedriver.exe to the same folder as this py file
  36.        
  37.         # send tmout as string --> one extra wait
  38.         extraWait = False
  39.         if type(tmout) not in [int, float]:
  40.             if str(tmout).isdigit():
  41.                 tmout = int(str(tmout))
  42.                 extraWait = True
  43.             else: tmout = 25 # default
  44.         # driver.set_page_load_timeout(tmout)
  45.  
  46.         # for shortening some lines
  47.         wwait_til = WebDriverWait(driver, tmout).until
  48.         ecc = EC.element_to_be_clickable
  49.         ecv = EC.visibility_of_all_elements_located
  50.        
  51.        
  52.         driver.get(l) # go to link
  53.         driver.maximize_window()
  54.         if extraWait: time.sleep(tmout) # wait
  55.            
  56.         if type(scrollN) == tuple and len(scrollN) == 2:
  57.             time.sleep(scrollN[1])
  58.             for i in range(scrollN[0]):
  59.                 driver.execute_script(scrollToBottom)
  60.                 time.sleep(scrollN[1])
  61.        
  62.         # if something needs to be confirmed by click
  63.         if clickFirst:
  64.             # can pass as either string (single) or list (multiple)
  65.             if type(clickFirst) == list: clickFirst = [str(c) for c in clickFirst]
  66.             else: clickFirst = [str(clickFirst)]
  67.                
  68.             for cf in clickFirst:
  69.                 try:
  70.                     wwait_til(ecc((by_xc, cf)))
  71.                     cfEl = driver.find_element(by_xc, cf)
  72.                     driver.execute_script(scrollElToBottom, cfEl)
  73.                     cfEl.click()
  74.                 except Exception as e:
  75.                     errMsg = f'could not click [{cf}] - {type(e)}: {e}'
  76.                     if strictMode:
  77.                         if isv: print(f'quitting bc {errMsg}')
  78.                         return errMsg if returnErr else None
  79.                     elif isv: print(f'[continuing even though] {errMsg}')
  80.                
  81.         # if some section needs to be loaded first
  82.         if ecx:
  83.             # can pass as either string (single) or list (multiple)
  84.             if type(ecx) == list: ecx = [str(e) for e in ecx]
  85.             else: ecx = [str(ecx)]
  86.  
  87.             for e in ecx:
  88.                 try: wwait_til(ecv((by_xc, e)))
  89.                 except Exception as ex:
  90.                     errMsg = f'could not load [{e}] - {type(ex)}: {ex}'
  91.                     if strictMode:
  92.                         if isv: print(f'quitting bc {errMsg}')
  93.                         return errMsg if returnErr else None
  94.                     elif isv: print(f'[continuing even though] {errMsg}')
  95.            
  96.         lSoup = BeautifulSoup(driver.page_source, fparser)
  97.         driver.close() # (just in case)
  98.         del driver # (just in case)
  99.         return lSoup
  100.     except Exception as e:
  101.         errMsg = f'could not scrape [{l}] \n{type(e)}: {e}'
  102.         if isv: print(errMsg)
  103.         return errMsg if returnErr else None
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement