Advertisement
Guest User

Untitled

a guest
Apr 13th, 2024
114
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.94 KB | None | 0 0
  1. import requests
  2. from bs4 import BeautifulSoup
  3. from urllib.parse import urljoin
  4. from urllib.parse import urlparse, urlunparse
  5.  
  6. def get_video_urls(url):
  7.     try:
  8.         response = requests.get(url)
  9.         soup = BeautifulSoup(response.text, 'html.parser')
  10.         video_urls = set()  # Using a set to store unique URLs
  11.  
  12.         # Find all anchor tags with href containing "/video-"
  13.         anchor_tags = soup.find_all('a', href=lambda href: href and '/video-' in href)
  14.  
  15.         # Get unique URLs from anchor tags
  16.         for tag in anchor_tags:
  17.             video_urls.add(urljoin(response.url, tag['href']))
  18.  
  19.         return list(video_urls)  # Convert set to list
  20.     except Exception as e:
  21.         print("An error occurred:", e)
  22.         return []
  23.  
  24. def write_urls_to_file(urls, output_file="output.txt"):
  25.     try:
  26.         with open(output_file, 'a') as file:
  27.             for url in urls:
  28.                 file.write(url + '\n')
  29.         print(f"Appended {len(urls)} unique URLs to {output_file}")
  30.     except Exception as e:
  31.         print("An error occurred while writing URLs to file:", e)
  32.  
  33.  
  34.  
  35. def get_page_urls(url, num_pages):
  36.     page_urls = [url]
  37.     parsed_url = urlparse(url)
  38.     for i in range(2, num_pages + 1):
  39.         page_path = parsed_url.path.rstrip('/') + f'/{i}'
  40.         page_url = urlunparse(parsed_url._replace(path=page_path))
  41.         page_urls.append(page_url)
  42.     return page_urls
  43.  
  44.  
  45.  
  46. if __name__ == "__main__":
  47.     webpage_url = input("Enter the URL of the webpage: ")
  48.     num_pages = int(input("How many additional pages would you like to check? "))
  49.  
  50.     all_video_urls = []
  51.  
  52.     page_urls = get_page_urls(webpage_url, num_pages)
  53.  
  54.     print("Fetching page URLs:")
  55.     print(page_urls)
  56.  
  57.     for page_url in page_urls:
  58.         print(f"Fetching video URLs from page: {page_url}")
  59.         video_urls = get_video_urls(page_url)
  60.         all_video_urls.extend(video_urls)
  61.  
  62.     write_urls_to_file(all_video_urls)
  63.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement