Advertisement
Abhisek92

scrape_market.py

Apr 12th, 2024 (edited)
704
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.56 KB | None | 0 0
  1. import warnings
  2. import argparse
  3. from pathlib import Path
  4. from selenium import webdriver
  5. from datetime import datetime, timedelta
  6. from selenium.webdriver.common.by import By
  7. from selenium.webdriver.support.ui import WebDriverWait
  8. from webdriver_manager.chrome import ChromeDriverManager
  9. from webdriver_manager.core.os_manager import ChromeType
  10. from selenium.webdriver.support import expected_conditions as EC
  11. from selenium.webdriver.chrome.service import Service as ChromiumService
  12.  
  13.  
  14. def wait_and_click(driver, locator, wait_time=20):
  15.     """
  16.    Waits for an element to be clickable and clicks it.
  17.    Accepts both XPATH and ID locators.
  18.    """
  19.     element = WebDriverWait(driver, wait_time).until(
  20.         EC.element_to_be_clickable(locator)
  21.     )
  22.     element.click()
  23.  
  24. def setup_driver(download_dir):
  25.     """
  26.    Initializes and returns a Chrome WebDriver with options.
  27.    """
  28.     options = webdriver.ChromeOptions()
  29.     options.add_argument('--headless')
  30.     options.add_argument('--enable-logging')
  31.     options.add_experimental_option("prefs", {"download.default_directory": download_dir})
  32.     options.add_experimental_option("detach", True)
  33.     driver = webdriver.Chrome(service=ChromiumService(ChromeDriverManager(chrome_type=ChromeType.CHROMIUM).install()), options=options)
  34.     return driver
  35.  
  36. def select_date(driver, target_date):
  37.     today = date.today()
  38.     month_difference = ((target_date.year -today.year) * 12) - today.month + target_date.month
  39.     day = target_date.strftime("%d")
  40.  
  41.     try:
  42.         wait_and_click(driver, (By.ID, "data-end-date"))
  43.         if month_difference < 0:
  44.             for _ in range(abs(month_difference)):
  45.                 wait_and_click(driver, (By.CSS_SELECTOR, ".fa.fa-chevron-left"))
  46.         elif month_difference > 0:
  47.             for _ in range(month_difference):
  48.                 wait_and_click(driver, (By.CSS_SELECTOR, ".fa.fa-chevron-right"))
  49.         wait_and_click(driver, (By.ID, f'//span[text()="{day}"]'))
  50.     except Exception as e:
  51.         warnings.warn(f"An error occurred: {e}")
  52.  
  53. def navigate_and_export_data(driver, target_date):
  54.     """
  55.    Navigates through the UI and exports data for a given month and day.
  56.    """
  57.     try:
  58.         # Open date picker
  59.         select_date(driver=driver, target_date=target_date)
  60.  
  61.         # Export data to Excel
  62.         wait_and_click(driver, (By.XPATH, '//span[@title="Export data to Excel"]'))
  63.  
  64.     except Exception as e:
  65.         warnings.warn(f"An error occurred: {e}")
  66.  
  67. def scrape(driver, target_date):
  68.     driver.get("https://www.nordpoolgroup.com/en/market-data12/Intraday/Market-data1/Market-data1/Overview/?dd=SE3&view=table")
  69.     navigate_and_export_data(driver=driver, target_date=target_date)
  70.  
  71.  
  72. if __name__ == "__main__":
  73.     parser = argparse.ArgumentParser(description='Scrape NordPoolGroup Market Data')
  74.     parser.add_argument('-d', '--date_format', default="%d-%m-%Y", type=str, help="Set date format", dest="date_format")
  75.     parser.add_argument('-s', '--start_date', type=str, help='Start date', dest="start_date")
  76.     parser.add_argument('-e', '--end_date', type=str, help='End date', dest="end_date")
  77.     parser.add_argument('-f', '--file', type=str, help="Path to file containing dates", dest="file_path")
  78.     parser.add_argument('-o', '--dst_dir', type=str, help='Destination directory for download', dest="dst_dir")
  79.  
  80.     args = parser.parse_args()
  81.  
  82.     if args.dst_dir is None:
  83.         dst_dir = Path()
  84.     else:
  85.         dst_dir = Path(args.dst_dir)
  86.  
  87.     dst_dir.mkdir(parents=True, exist_ok=True)
  88.  
  89.     date_format = args.date_format
  90.  
  91.     if args.start_date is not None:
  92.         try:
  93.             start_date = datetime.strptime(args.start_date, date_format)
  94.         except ValueError:
  95.             warnings.warn(f"Invalid date or Illformed date wrt {date_format} format: {args.start_date}")
  96.             start_date = None
  97.  
  98.     if args.start_date is not None:
  99.         try:
  100.             end_date = datetime.strptime(args.end_date, date_format)
  101.         except ValueError:
  102.             warnings.warn(f"Invalid date or Illformed date wrt {date_format} format: {args.end_date}")
  103.             end_date = None
  104.  
  105.     if (start_date is not None) and (end_date is not None):
  106.         assert start_date <= end_date, "Start date ({start_date}) cannot be later than end date ({end_date}) !!"
  107.  
  108.     dates = None
  109.     if args.file_path is not None:
  110.         file_path = Path(args.file_path)
  111.         if file_path.is_file():
  112.             with open(file_path, 'r') as src:
  113.                 lines = src.readlines()
  114.  
  115.             df = lines[0]
  116.             dates = list()
  117.             for ds in lines[1:]:
  118.                 try:
  119.                     dt = datetime.strptime(ds, df)
  120.                 except ValueError as ve:
  121.                     warnings.warn(f"Invalid date or Illformed date wrt {df} format: {ds}")
  122.                 if ((start_date is None) or (start_date <= dt)) and ((end_date is None) or (end_date >= dt)):
  123.                     dates.append(dt)
  124.             dates = list(set(dates))
  125.  
  126.     if dates is None:
  127.         if start_date is None:
  128.             raise ValueError("Start date must be set when file with dates are not specified!")
  129.         if end_date is None:
  130.             raise ValueError("End date must be set when file with dates are not specified!")
  131.  
  132.         dates = list()
  133.         current_date = start_date
  134.         while current_date <= end_date:
  135.             dates.append(current_date)
  136.             current_date += timedelta(days=1)
  137.  
  138.     driver = setup_driver(str(dst_dir))
  139.     for date in dates:
  140.         scrape(driver=driver, target_date=date)
  141.     driver.quit()
  142.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement