Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import requests
- from bs4 import BeautifulSoup
- from urllib.parse import urljoin
- from urllib.parse import urlparse, urlunparse
def get_video_urls(url):
    """Fetch *url* and return the unique absolute URLs of all links whose
    href contains '/video-'.

    Parameters:
        url: page URL to download and scan.

    Returns:
        list[str] of de-duplicated absolute video URLs (order unspecified),
        or an empty list if the request or parsing fails.
    """
    try:
        # Timeout keeps the script from hanging forever on a dead host.
        response = requests.get(url, timeout=30)
        # Fail fast on HTTP 4xx/5xx instead of scraping an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        video_urls = set()  # Using a set to store unique URLs
        # Find all anchor tags with href containing "/video-"
        anchor_tags = soup.find_all('a', href=lambda href: href and '/video-' in href)
        # Get unique URLs from anchor tags; urljoin resolves relative hrefs
        # against the final (post-redirect) URL of the response.
        for tag in anchor_tags:
            video_urls.add(urljoin(response.url, tag['href']))
        return list(video_urls)  # Convert set to list
    except Exception as e:
        # Broad by design: callers rely on an empty list on any failure.
        print("An error occurred:", e)
        return []
def write_urls_to_file(urls, output_file="output.txt"):
    """Append each URL in *urls* to *output_file*, one per line.

    Parameters:
        urls: iterable of URL strings to write.
        output_file: path of the file to append to (created if missing).

    Prints a summary on success; prints the error (without raising) if the
    file cannot be written.
    """
    try:
        # Explicit encoding avoids platform-dependent defaults.
        with open(output_file, 'a', encoding='utf-8') as file:
            # One batched call instead of a write() per URL.
            file.writelines(url + '\n' for url in urls)
        print(f"Appended {len(urls)} unique URLs to {output_file}")
    except OSError as e:
        # OSError covers all plausible open/write failures; anything else
        # would be a programming error and should surface.
        print("An error occurred while writing URLs to file:", e)
def get_page_urls(url, num_pages):
    """Return *url* followed by its paginated variants '.../2' .. '.../num_pages'.

    Parameters:
        url: the first-page URL (returned unchanged as element 0).
        num_pages: total number of pages; values < 2 yield just [url].

    Returns:
        list[str] of page URLs in ascending page order.
    """
    base = urlparse(url)
    # Strip any trailing slash once so '/list/' and '/list' both paginate
    # to '/list/2', '/list/3', ...
    trimmed_path = base.path.rstrip('/')
    pages = [url]
    pages.extend(
        urlunparse(base._replace(path=f"{trimmed_path}/{page}"))
        for page in range(2, num_pages + 1)
    )
    return pages
if __name__ == "__main__":
    webpage_url = input("Enter the URL of the webpage: ")
    num_pages = int(input("How many additional pages would you like to check? "))
    page_urls = get_page_urls(webpage_url, num_pages)
    print("Fetching page URLs:")
    print(page_urls)
    # get_video_urls de-duplicates within a single page only; the same video
    # link frequently appears on several listing pages, so de-duplicate
    # across pages here (first-seen order preserved) before writing —
    # otherwise the "unique URLs" written to the file contain repeats.
    seen = set()
    all_video_urls = []
    for page_url in page_urls:
        print(f"Fetching video URLs from page: {page_url}")
        for video_url in get_video_urls(page_url):
            if video_url not in seen:
                seen.add(video_url)
                all_video_urls.append(video_url)
    write_urls_to_file(all_video_urls)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement