Advertisement
sourav8256

Untitled

Jul 30th, 2023 (edited)
842
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.87 KB | None | 0 0
  1. from selenium import webdriver
  2. from selenium.webdriver.chrome.service import Service
  3. from selenium.webdriver.common.by import By
  4. from webdriver_manager.chrome import ChromeDriverManager
  5. import os
  6. import requests
  7. import time
  8.  
  9.  
  10. # # Path to the Chrome driver executable
  11. # driver_path = ChromeDriverManager().install()
  12.  
  13. # # Check if the driver executable already exists
  14. # if os.path.exists(driver_path):
  15. #     # Delete the existing driver
  16. #     os.remove(driver_path)
  17.  
  18.  
  19. def download_file(url, filename_without_extension):
  20.     try:
  21.         # Send an HTTP GET request to the URL
  22.         response = requests.get(url)
  23.  
  24.         # Check if the request was successful (status code 200)
  25.         if response.status_code == 200:
  26.             # Create the "output" folder if it doesn't exist
  27.             output_folder = "output"
  28.             if not os.path.exists(output_folder):
  29.                 os.makedirs(output_folder)
  30.  
  31.             # Get the file extension from the response headers
  32.             content_type = response.headers.get('content-type')
  33.             file_extension = content_type.split('/')[-1]
  34.  
  35.             # Combine the filename and extension
  36.             filename_with_extension = f"{filename_without_extension}.{file_extension}"
  37.  
  38.             # Save the file to the output folder
  39.             output_path = os.path.join(output_folder, filename_with_extension)
  40.             with open(output_path, 'wb') as file:
  41.                 file.write(response.content)
  42.             print(f"File downloaded and saved as '{output_path}'")
  43.         else:
  44.             print("Failed to download file.")
  45.     except Exception as e:
  46.         print(f"Error while downloading the file: {str(e)}")
  47.  
  48.  
  49.  
  50.  
  51.  
  52. def create_and_save_text_file(filename, text_content):
  53.     try:
  54.         # Append ".txt" extension if not already provided
  55.         if not filename.endswith(".txt"):
  56.             filename += ".txt"
  57.  
  58.         # Open the file in write mode
  59.         with open(filename, 'w') as file:
  60.             # Write the text content to the file
  61.             file.write(text_content)
  62.         print(f"Text file '{filename}' created and text saved successfully.")
  63.     except Exception as e:
  64.         print(f"Error while creating and saving the text file: {str(e)}")
  65.  
  66.  
  67.  
  68. # Set up Chrome driver service
  69. service = Service(ChromeDriverManager(version='114.0.5735.90').install())
  70.  
  71. # Configure Chrome options
  72. options = webdriver.ChromeOptions()
  73. # options.add_argument("--headless")  # Run Chrome in headless mode (without GUI)
  74.  
  75. # Initialize Chrome driver
  76. driver = webdriver.Chrome(service=service, options=options)
  77.  
  78. # Specify the URL of the website you want to scrape
  79. url = "https://www.9gag.com"
  80.  
  81. # Navigate to the website
  82. driver.get(url)
  83.  
  84.  
  85. while True:
  86.     # Scrape elements using XPath
  87.     articles = driver.find_elements(By.XPATH, "//article")
  88.     # Scroll down to the bottom of the page
  89.     driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
  90.     time.sleep(2)
  91.  
  92.     if len(articles) >= 10:
  93.         break
  94.  
  95. # driver.execute_script("window.scrollTo(0, 0);")
  96. time.sleep(2)
  97.  
  98. for i,article in enumerate(articles):
  99.     try:
  100.         # Scroll to the current article
  101.         driver.execute_script("arguments[0].scrollIntoView();", article)
  102.        
  103.         # Add a short delay to let the page scroll and load any dynamic content (adjust the time as needed)
  104.         time.sleep(1)
  105.         title =  article.find_element(By.XPATH,'./header/a').text
  106.         id =  article.find_element(By.XPATH,'./header/a/@data-entry-id').text
  107.         mediaSrc =  article.find_element(By.XPATH,'.//video/source/@src | .//picture/img/@src').text
  108.         print("Article " +str(i)+' : '+ article.find_element(By.XPATH,'./header/a').text)
  109.         download_file(mediaSrc,id)
  110.         create_and_save_text_file(id,title)
  111.     except Exception as e:
  112.         continue
  113. # Quit the driver and close the browser
  114. driver.quit()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement