Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # Shell command, run once before the script: pip install requests beautifulsoup4
- import requests
- from bs4 import BeautifulSoup
- import re
def fetch_filtered_links(url, keyword, *, timeout=10):
    """Return direct post links whose thread title contains *keyword*.

    Fetches *url*, looks at every ``<article class="post">`` element, follows
    each post's "Link to this post" anchor, and keeps the link when the
    element with class ``post_title`` on the linked page contains *keyword*
    (case-insensitive). For every match the link and the text of the element
    whose ``id`` equals the post number are printed.

    Args:
        url: Address of the listing/search page to scan.
        keyword: Text to look for in the thread title.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of post URLs, in page order, whose thread title matched.

    Raises:
        requests.RequestException: If the initial request for *url* fails.
            Failures on individual posts are reported and skipped instead.
    """
    # Local import so this function does not rely on the file's import block.
    from urllib.parse import urljoin

    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    posts = soup.find_all('article', class_='post')

    needle = keyword.lower()
    # Direct links of posts with the keyword in the thread title.
    filtered_links = []
    # Threads whose title is already known not to match; skipping them
    # avoids one HTTP request per further post in the same thread.
    known_bad_threads = set()

    for post in posts:
        try:
            link_button = post.find('a', title='Link to this post')
            # .get() instead of ['href']: a missing attribute must not raise
            # KeyError, which the handlers below do not cover.
            href = link_button.get('href') if link_button else None
            if not href:
                continue

            post_id = extract_second_number(href)
            thread = extract_first_number(href)
            if thread in known_bad_threads:
                continue

            # Resolve relative hrefs against the listing page; an absolute
            # href is returned unchanged.
            link = urljoin(url, href)
            post_response = requests.get(link, timeout=timeout)
            if not post_response.ok:
                # An error page says nothing about the thread title, so do
                # not blacklist the thread because of it.
                print("Error fetching post page; skipping.",
                      post_response.status_code, link)
                continue

            post_soup = BeautifulSoup(post_response.content, 'html.parser')
            post_title = post_soup.find(class_="post_title")
            if post_title and needle in post_title.text.lower():
                filtered_links.append(link)
                print("Keyword found, link added:", link)
                # Only look the post up when there is an id to look for;
                # find(id=None) would match tags that have no id at all.
                post_data = (
                    post_soup.find(id=post_id) if post_id is not None else None
                )
                if post_data is not None:
                    print(post_data.get_text())
            elif thread is not None:
                # Never record None: it would suppress every later link
                # that has no thread number.
                known_bad_threads.add(thread)
        except requests.RequestException as e:
            # One unreachable post should not abort the whole scan.
            print("Error fetching post page; skipping.", e)
        except (TypeError, AttributeError) as e:
            print("Error processing a post; missing elements or attributes.", e)

    return filtered_links
def extract_first_number(url):
    """Return the first run of digits in *url* as a string, or None if absent."""
    found = re.search(r'\d+', url)
    if found is None:
        return None
    return found.group(0)
def extract_second_number(url):
    """Return the second run of digits in *url* as a string, or None.

    None is returned when *url* contains fewer than two digit runs.
    """
    numbers = re.findall(r'\d+', url)
    if len(numbers) < 2:
        return None
    return numbers[1]
# Search-results page to scan, and the keyword to look for in thread titles.
url = "https://archive.palanq.win/vt/search/text/michi/"
keyword = "pcg"

# Collect the matching post links and show them.
filtered_links = fetch_filtered_links(url=url, keyword=keyword)
print("Filtered links:", filtered_links)
Advertisement
Add Comment
Please, Sign In to add comment