Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import os
import time

import requests
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
- # # Path to the Chrome driver executable
- # driver_path = ChromeDriverManager().install()
- # # Check if the driver executable already exists
- # if os.path.exists(driver_path):
- # # Delete the existing driver
- # os.remove(driver_path)
def download_file(url, filename_without_extension, timeout=30):
    """Download ``url`` into the ``output`` folder.

    The saved file is named ``<filename_without_extension>.<ext>``, where
    ``<ext>`` is the subtype of the response's ``Content-Type`` header
    (``image/jpeg`` -> ``jpeg``). Any header parameters such as
    ``; charset=utf-8`` are ignored, and ``bin`` is used when the header is
    missing or has no subtype.

    This is a best-effort helper: network and filesystem failures are
    reported on stdout instead of being raised.

    Args:
        url: Address of the resource to fetch with an HTTP GET.
        filename_without_extension: Base name for the saved file.
        timeout: Seconds to wait for the server before giving up.
    """
    output_folder = "output"
    try:
        # Without a timeout a single stalled download blocks forever.
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print("Failed to download file.")
            return

        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(output_folder, exist_ok=True)

        # Keep only the media type, e.g. "image/jpeg; charset=x" -> "jpeg".
        content_type = response.headers.get('content-type') or ''
        media_type = content_type.split(';', 1)[0].strip()
        file_extension = media_type.split('/')[-1] or 'bin'

        filename_with_extension = f"{filename_without_extension}.{file_extension}"
        output_path = os.path.join(output_folder, filename_with_extension)
        with open(output_path, 'wb') as file:
            file.write(response.content)
        print(f"File downloaded and saved as '{output_path}'")
    except (requests.RequestException, OSError) as e:
        print(f"Error while downloading the file: {e}")
def create_and_save_text_file(filename, text_content):
    """Write ``text_content`` to a UTF-8 text file, overwriting any old one.

    ``.txt`` is appended to ``filename`` unless it already ends with it.
    This is a best-effort helper: write failures are reported on stdout
    instead of being raised.

    Args:
        filename: Target path, with or without the ``.txt`` suffix.
        text_content: String to store in the file.
    """
    if not filename.endswith(".txt"):
        filename += ".txt"

    try:
        # Explicit UTF-8: the platform default encoding (e.g. cp1252 on
        # Windows) cannot represent emoji or non-Latin text.
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(text_content)
    except (OSError, UnicodeError) as e:
        print(f"Error while creating and saving the text file: {e}")
    else:
        print(f"Text file '{filename}' created and text saved successfully.")
# Scrolling stops once this many articles are present on the page, or after
# this many scroll attempts, whichever comes first (prevents an endless loop
# when the page never yields enough articles).
MIN_ARTICLES = 10
MAX_SCROLL_ATTEMPTS = 30

# Set up Chrome driver service (driver version is pinned)
service = Service(ChromeDriverManager(version='114.0.5735.90').install())

# Configure Chrome options
options = webdriver.ChromeOptions()
# options.add_argument("--headless")  # Run Chrome in headless mode (without GUI)

# Initialize Chrome driver
driver = webdriver.Chrome(service=service, options=options)

# Specify the URL of the website you want to scrape
url = "https://www.9gag.com"

try:
    # Navigate to the website
    driver.get(url)

    # Keep scrolling to the bottom so the page lazy-loads more articles.
    articles = []
    for _ in range(MAX_SCROLL_ATTEMPTS):
        articles = driver.find_elements(By.XPATH, "//article")
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        if len(articles) >= MIN_ARTICLES:
            break
    time.sleep(2)

    for i, article in enumerate(articles):
        try:
            # Scroll to the current article and give dynamic content a
            # moment to load (adjust the delay as needed).
            driver.execute_script("arguments[0].scrollIntoView();", article)
            time.sleep(1)

            # find_element can only return elements, so attribute values
            # are read with get_attribute() rather than an "/@attr" XPath.
            header_link = article.find_element(By.XPATH, './header/a')
            title = header_link.text
            entry_id = header_link.get_attribute('data-entry-id')
            media = article.find_element(
                By.XPATH, './/video/source | .//picture/img'
            )
            media_src = media.get_attribute('src')
        except WebDriverException as e:
            # One unreadable article should not abort the whole run.
            print(f"Skipping article {i}: {e}")
            continue

        if not entry_id or not media_src:
            print(f"Skipping article {i}: missing entry id or media source")
            continue

        print(f"Article {i} : {title}")
        download_file(media_src, entry_id)
        create_and_save_text_file(entry_id, title)
finally:
    # Quit the driver and close the browser, even if scraping failed.
    driver.quit()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement