Advertisement
Guest User

main.py

a guest
Oct 28th, 2023
234
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.73 KB | Source Code | 0 0
  1. from typing import List
  2. from time import sleep
  3. from selenium.webdriver.common.by import By
  4. from selenium import webdriver
  5. from selenium.webdriver.support.ui import WebDriverWait
  6. from selenium.webdriver.chrome.options import Options
  7.  
  8.  
  9. class Scraper:
  10.  
  11.     @staticmethod
  12.     def _chrome_driver_configuration() -> Options:
  13.         chrome_options = Options()
  14.         chrome_options.add_argument("--disable-notifications")
  15.         chrome_options.add_argument("--disable-extensions")
  16.         chrome_options.add_argument("--disable-popup-blocking")
  17.         chrome_options.add_argument("--disable-default-apps")
  18.         chrome_options.add_argument("--disable-infobars")
  19.         chrome_options.add_argument("--disable-web-security")
  20.         chrome_options.add_argument(
  21.             "--disable-features=IsolateOrigins,site-per-process"
  22.         )
  23.         chrome_options.add_argument(
  24.             "--enable-features=NetworkService,NetworkServiceInProcess"
  25.         )
  26.         chrome_options.add_argument("--profile-directory=Default")
  27.         chrome_options.add_experimental_option("excludeSwitches", ["enable-logging"])
  28.         return chrome_options
  29.  
  30.  
  31.  
  32. class BaseInstagramScraper(Scraper):
  33.     def __init__(self, user_id: str, base_url: str) -> None:
  34.         super().__init__()
  35.         self._user_id = user_id
  36.         self._base_url = base_url.format(self._user_id)
  37.         self._driver = webdriver.Chrome(options=self._chrome_driver_configuration())
  38.         self._driver.get(self._base_url)
  39.         self._wait = WebDriverWait(self._driver, 10)
  40.  
  41.  
  42.  
  43. def scroll_page_callback(driver, callback) -> None:
  44.     """
  45.    Scrolls the page to load more data from a website
  46.    """
  47.     try:
  48.         last_height = driver.execute_script("return document.body.scrollHeight")
  49.         consecutive_scrolls = 0
  50.  
  51.         while consecutive_scrolls < 3:
  52.             driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
  53.  
  54.             sleep(3)
  55.             new_height = driver.execute_script("return document.body.scrollHeight")
  56.  
  57.             if new_height == last_height:
  58.                 consecutive_scrolls += 1
  59.             else:
  60.                 consecutive_scrolls = 0
  61.  
  62.             last_height = new_height
  63.  
  64.             callback(driver)
  65.  
  66.     except Exception as e:
  67.         logs.log_error(f"Error occurred while scrolling: {e}")
  68.  
  69. class ProfileScraper(BaseInstagramScraper):
  70.     def __init__(self, user_id: str) -> None:
  71.         super().__init__(user_id, base_url=f"https://www.instagram.com/{user_id}/")
  72.         self._driver.add_cookie(
  73.             {
  74.                 "name": "sessionid",
  75.                 "value": "PUTITHERE",
  76.                 "domain": ".instagram.com",
  77.             }
  78.         )
  79.         self._refresh_driver()
  80.  
  81.     def _refresh_driver(self) -> None:
  82.         self._driver.refresh()
  83.  
  84.     def extract_images(self):
  85.         extracted_image_urls = []
  86.         try:
  87.  
  88.             def extract_callback(driver):
  89.                 img_elements = self._driver.find_elements(
  90.                     By.CLASS_NAME,
  91.                     "x5yr21d.xu96u03.x10l6tqk.x13vifvy.x87ps6o.xh8yej3",
  92.                 )
  93.                 for img_element in img_elements:
  94.                     src_attribute = img_element.get_attribute("src")
  95.                     if src_attribute and src_attribute not in extracted_image_urls:
  96.                         extracted_image_urls.append(src_attribute)
  97.  
  98.             scroll_page_callback(self._driver, extract_callback)
  99.  
  100.         except Exception as e:
  101.             print(f"An  error occurred while extracting images: {e}")
  102.  
  103.         return extracted_image_urls
  104.  
  105.  
  106. if __name__ == "__main__":
  107.     scraper = ProfileScraper("sawardega_wataha")
  108.     data = scraper.extract_images()
  109.     print(len(data))
  110.     print(data[0])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement