Advertisement
alperiox

full selenium scraper

Oct 29th, 2022 (edited)
1,800
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python | 3.20 KB | None | 0 0
  1. import time
  2.  
  3. import pandas as pd
  4. from selenium import webdriver
  5. from selenium.webdriver import Chrome
  6. from selenium.webdriver.chrome.service import Service
  7. from selenium.webdriver.common.by import By
  8. from webdriver_manager.chrome import ChromeDriverManager
  9.  
  10. def parse_img_url(url):
  11.     # get the first url
  12.     url = url.split(", ")[0]
  13.     # split it by `/`
  14.     splitted_url = url.split("/")
  15.     # loop over the elements to find where `cloudfront` url begins
  16.     for idx, part in enumerate(splitted_url):
  17.         if "cloudfront" in part:
  18.             # add the HTTP scheme and concatenate the rest of the URL
  19.             # then return the processed url
  20.             return "https://" + "/".join(splitted_url[idx:])
  21.    
  22.     # as we don't know if that's the only measurement to take,
  23.     # return None if the cloudfront couldn't be found
  24.     return None
  25.  
  26. def extract_data(element):
  27.     img = element.find_element(By.TAG_NAME, "img").get_attribute("srcset")
  28.     img = parse_img_url(img)
  29.  
  30.     # A>B means the B elements where A is the parent element.
  31.     dietary_attrs = element.find_elements(By.CSS_SELECTOR, "div[class*='DietaryAttributes']>span")
  32.     # if there aren't any, then `dietary_attrs` will be None and `if` block won't work
  33.     # but if there are any dietary attributes, extract the text from them
  34.     if dietary_attrs:
  35.         dietary_attrs = [attr.text for attr in dietary_attrs]
  36.     else:
  37.         # set the variable to None if there aren't any dietary attributes found.
  38.         dietary_attrs = None
  39.  
  40.     # get the span elements where the parent is a `div` element that
  41.     # has `ItemBCardDefault` substring in the `class` attribute
  42.     price = element.find_elements(By.CSS_SELECTOR, "div[class*='ItemBCardDefault']>span")
  43.     # extract the price text if we could find the price span
  44.     if price:
  45.         price = price[0].text
  46.     else:
  47.         price = None
  48.  
  49.     name = element.find_element(By.TAG_NAME, "h2").text
  50.     size = element.find_element(By.CSS_SELECTOR, "div[class*='Size']").text
  51.  
  52.     return {
  53.         "price": price,
  54.         "name": name,
  55.         "size": size,
  56.         "attrs": dietary_attrs,
  57.         "img": img
  58.     }
  59.  
  60. # start by defining the options
  61. options = webdriver.ChromeOptions()
  62. options.headless = True # it's more scalable to work in headless mode
  63. # normally, selenium waits for all resources to download
  64. # we don't need it as the page also populated with the running javascript code.
  65. options.page_load_strategy = 'none'
  66. # this returns the path web driver downloaded
  67. chrome_path = ChromeDriverManager().install()
  68. chrome_service = Service(chrome_path)
  69. # pass the defined options and service objects to initialize the web driver
  70. driver = Chrome(options=options, service=chrome_service)
  71. driver.implicitly_wait(5)
  72.  
  73. url = "https://www.instacart.com/store/sprouts/collections/bread?guest=True"
  74.  
  75. driver.get(url)
  76. time.sleep(10)
  77.  
  78. content = driver.find_element(By.CSS_SELECTOR, "div[class*='ItemsGridWithPostAtcRecommendations'")
  79. breads = content.find_elements(By.TAG_NAME, "li")
  80.  
  81. data = []
  82.  
  83. for bread in breads:
  84.     extracted_data = extract_data(bread)
  85.     data.append(extracted_data)
  86.  
  87. df = pd.DataFrame(data)
  88. df.to_csv("result.csv", index=False)
  89.  
  90. driver.quit()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement