import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


def get_video_urls(url):
    """Return absolute URLs of links whose text contains a resolution label such as "1080p"."""
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        video_urls = []
        # Find all anchor tags
        anchor_tags = soup.find_all('a')
        # Keep anchors whose link text contains a resolution format such as "720p" or "1080p"
        for tag in anchor_tags:
            href = tag.get('href')
            if href and re.search(r'\b\d{3,4}p\b', tag.text):
                # Resolve relative links against the final (post-redirect) page URL
                video_urls.append(urljoin(response.url, href))
        return video_urls
    except Exception as e:
        print("An error occurred:", e)
        return []


def write_urls_to_file(video_urls, output_file="output.txt"):
    """Append the collected URLs to output_file, one per line."""
    try:
        with open(output_file, 'a') as file:
            for url in video_urls:
                file.write(url + '\n')
        print(f"Appended {len(video_urls)} URLs to {output_file}")
    except Exception as e:
        print("An error occurred while writing URLs to file:", e)


def get_page_urls(url, num_pages):
    """Build the list of page URLs: the base URL plus pages 1..num_pages-1 via a "p" query parameter."""
    page_urls = [url]
    # Use "&p=" if the URL already carries a query string, otherwise "?p=".
    # Plain concatenation is used here because urljoin would replace the last
    # path segment (for "&p=...") or drop the existing query (for "?p=...").
    separator = '&' if '?' in url else '?'
    for i in range(1, num_pages):
        page_urls.append(f"{url}{separator}p={i}")
    return page_urls


if __name__ == "__main__":
    webpage_url = input("Enter the URL of the webpage: ")
    num_pages = int(input("How many additional pages would you like to check? "))
    all_video_urls = []
    page_urls = get_page_urls(webpage_url, num_pages + 1)
    for page_url in page_urls:
        video_urls = get_video_urls(page_url)
        all_video_urls.extend(video_urls)
    write_urls_to_file(all_video_urls)
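
# For reference, a quick standalone illustration of how the resolution filter
# behaves. The sample link texts below are made up for demonstration only.

import re

pattern = re.compile(r'\b\d{3,4}p\b')

samples = ["Download 1080p", "Watch in 720p", "Episode 12", "4000px banner"]
for text in samples:
    print(text, "->", bool(pattern.search(text)))
# "1080p" and "720p" match; "Episode 12" has no trailing "p" after digits,
# and "4000px" fails because \b requires "p" to end the word.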