Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import logging
from time import sleep
from typing import List

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
class Scraper:
    """Base class that centralises the Chrome driver configuration."""

    @staticmethod
    def _chrome_driver_configuration() -> Options:
        """Build and return the ``Options`` object used for every Chrome session.

        Returns:
            Options: Chrome options with notifications, extensions, popups,
            infobars and web security disabled, plus the profile/feature
            switches this scraper relies on.
        """
        # Apply the flags in the same order as before; order is irrelevant to
        # Chrome but kept for a faithful reproduction.
        flags = (
            "--disable-notifications",
            "--disable-extensions",
            "--disable-popup-blocking",
            "--disable-default-apps",
            "--disable-infobars",
            "--disable-web-security",
            "--disable-features=IsolateOrigins,site-per-process",
            "--enable-features=NetworkService,NetworkServiceInProcess",
            "--profile-directory=Default",
        )
        options = Options()
        for flag in flags:
            options.add_argument(flag)
        # Silence the "DevTools listening ..." console logging switch.
        options.add_experimental_option("excludeSwitches", ["enable-logging"])
        return options
class BaseInstagramScraper(Scraper):
    """Opens a Chrome session pointed at an Instagram URL for one user."""

    def __init__(self, user_id: str, base_url: str) -> None:
        """Start Chrome and navigate to the user's page.

        Args:
            user_id: Instagram handle; substituted into *base_url* via
                ``str.format`` (a no-op if the template has no placeholder).
            base_url: URL or URL template for the page to open.
        """
        super().__init__()
        self._user_id = user_id
        self._base_url = base_url.format(user_id)
        driver = webdriver.Chrome(options=self._chrome_driver_configuration())
        driver.get(self._base_url)
        self._driver = driver
        # Explicit wait helper for subclasses; 10 s timeout as before.
        self._wait = WebDriverWait(driver, 10)
def scroll_page_callback(driver, callback, pause: float = 3.0) -> None:
    """Scroll *driver*'s page to the bottom repeatedly to trigger lazy loading.

    After every scroll, ``callback(driver)`` is invoked so the caller can
    harvest whatever content just loaded.  Scrolling stops once the document
    height has stayed unchanged for three consecutive scrolls.

    Args:
        driver: Selenium WebDriver (only ``execute_script`` is used here).
        callback: Callable invoked with *driver* after each scroll step.
        pause: Seconds to wait after each scroll for content to load.
            Defaults to 3.0, matching the original hard-coded delay.
    """
    try:
        last_height = driver.execute_script("return document.body.scrollHeight")
        consecutive_scrolls = 0
        while consecutive_scrolls < 3:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            sleep(pause)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                consecutive_scrolls += 1
            else:
                consecutive_scrolls = 0
                last_height = new_height
            callback(driver)
    except Exception as e:
        # BUG FIX: the original called ``logs.log_error(...)`` but no ``logs``
        # name is defined or imported anywhere in this file, so any scrolling
        # error raised a NameError instead of being logged.
        logging.getLogger(__name__).error("Error occurred while scrolling: %s", e)
class ProfileScraper(BaseInstagramScraper):
    """Scrapes image URLs from a single Instagram profile page."""

    def __init__(self, user_id: str, session_id: str = "PUTITHERE") -> None:
        """Open the profile page and authenticate via the ``sessionid`` cookie.

        Args:
            user_id: Instagram handle of the profile to scrape.
            session_id: Value of the Instagram ``sessionid`` auth cookie.
                SECURITY: never commit a real token to source control — pass
                it in (e.g. from an environment variable).  The default
                placeholder preserves the original hard-coded behaviour.
        """
        super().__init__(user_id, base_url=f"https://www.instagram.com/{user_id}/")
        # Cookie can only be set after the driver has navigated to the domain,
        # which the base __init__ has already done.
        self._driver.add_cookie(
            {
                "name": "sessionid",
                "value": session_id,
                "domain": ".instagram.com",
            }
        )
        self._refresh_driver()

    def _refresh_driver(self) -> None:
        """Reload the page so the freshly added session cookie takes effect."""
        self._driver.refresh()

    def extract_images(self) -> List[str]:
        """Scroll the profile and collect unique image ``src`` URLs.

        Returns:
            Image URLs in first-seen order; empty list if an error occurred.
        """
        extracted_image_urls: List[str] = []
        try:
            def extract_callback(driver):
                # BUG FIX: the original closed over ``self._driver`` and
                # ignored the ``driver`` parameter supplied by
                # scroll_page_callback; use the parameter as intended.
                # NOTE(review): this obfuscated class name is specific to an
                # Instagram frontend build and will break on redeploys.
                img_elements = driver.find_elements(
                    By.CLASS_NAME,
                    "x5yr21d.xu96u03.x10l6tqk.x13vifvy.x87ps6o.xh8yej3",
                )
                for img_element in img_elements:
                    src_attribute = img_element.get_attribute("src")
                    if src_attribute and src_attribute not in extracted_image_urls:
                        extracted_image_urls.append(src_attribute)

            scroll_page_callback(self._driver, extract_callback)
        except Exception as e:
            print(f"An error occurred while extracting images: {e}")
        return extracted_image_urls
if __name__ == "__main__":
    # Demo entry point: scrape one profile and show what was found.
    scraper = ProfileScraper("sawardega_wataha")
    data = scraper.extract_images()
    print(len(data))
    # BUG FIX: ``data[0]`` raised IndexError whenever nothing was scraped
    # (extract_images returns [] on any failure).
    if data:
        print(data[0])
    else:
        print("No images were extracted.")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement