Advertisement
alperiox

full selenium scraper

Oct 29th, 2022 (edited)
1,800
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python | 3.20 KB | None | 0 0
  1. import time
  2.  
  3. import pandas as pd
  4. from selenium import webdriver
  5. from selenium.webdriver import Chrome
  6. from selenium.webdriver.chrome.service import Service
  7. from selenium.webdriver.common.by import By
  8. from webdriver_manager.chrome import ChromeDriverManager
  9.  
  10. def parse_img_url(url):
  11.     # get the first url
  12.     url = url.split(", ")[0]
  13.     # split it by `/`
  14.     splitted_url = url.split("/")
  15.     # loop over the elements to find where `cloudfront` url begins
  16.     for idx, part in enumerate(splitted_url):
  17.         if "cloudfront" in part:
  18.             # add the HTTP scheme and concatenate the rest of the URL
  19.             # then return the processed url
  20.             return "https://" + "/".join(splitted_url[idx:])
  21.    
  22.     # as we don't know if that's the only measurement to take,
  23.     # return None if the cloudfront couldn't be found
  24.     return None
  25.  
  26. def extract_data(element):
  27.     img = element.find_element(By.TAG_NAME, "img").get_attribute("srcset")
  28.     img = parse_img_url(img)
  29.  
  30.     # A>B means the B elements where A is the parent element.
  31.     dietary_attrs = element.find_elements(By.CSS_SELECTOR, "div[class*='DietaryAttributes']>span")
  32.     # if there aren't any, then `dietary_attrs` will be None and `if` block won't work
  33.     # but if there are any dietary attributes, extract the text from them
  34.     if dietary_attrs:
  35.         dietary_attrs = [attr.text for attr in dietary_attrs]
  36.     else:
  37.         # set the variable to None if there aren't any dietary attributes found.
  38.         dietary_attrs = None
  39.  
  40.     # get the span elements where the parent is a `div` element that
  41.     # has `ItemBCardDefault` substring in the `class` attribute
  42.     price = element.find_elements(By.CSS_SELECTOR, "div[class*='ItemBCardDefault']>span")
  43.     # extract the price text if we could find the price span
  44.     if price:
  45.         price = price[0].text
  46.     else:
  47.         price = None
  48.  
  49.     name = element.find_element(By.TAG_NAME, "h2").text
  50.     size = element.find_element(By.CSS_SELECTOR, "div[class*='Size']").text
  51.  
  52.     return {
  53.         "price": price,
  54.         "name": name,
  55.         "size": size,
  56.         "attrs": dietary_attrs,
  57.         "img": img
  58.     }
  59.  
  60. # start by defining the options
  61. options = webdriver.ChromeOptions()
  62. options.headless = True # it's more scalable to work in headless mode
  63. # normally, selenium waits for all resources to download
  64. # we don't need it as the page also populated with the running javascript code.
  65. options.page_load_strategy = 'none'
  66. # this returns the path web driver downloaded
  67. chrome_path = ChromeDriverManager().install()
  68. chrome_service = Service(chrome_path)
  69. # pass the defined options and service objects to initialize the web driver
  70. driver = Chrome(options=options, service=chrome_service)
  71. driver.implicitly_wait(5)
  72.  
  73. url = "https://www.instacart.com/store/sprouts/collections/bread?guest=True"
  74.  
  75. driver.get(url)
  76. time.sleep(10)
  77.  
  78. content = driver.find_element(By.CSS_SELECTOR, "div[class*='ItemsGridWithPostAtcRecommendations'")
  79. breads = content.find_elements(By.TAG_NAME, "li")
  80.  
  81. data = []
  82.  
  83. for bread in breads:
  84.     extracted_data = extract_data(bread)
  85.     data.append(extracted_data)
  86.  
  87. df = pd.DataFrame(data)
  88. df.to_csv("result.csv", index=False)
  89.  
  90. driver.quit()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement