# Good michi post getter

# Requires: pip install requests beautifulsoup4


import requests
from bs4 import BeautifulSoup
import re

def fetch_filtered_links(url, keyword, timeout=10):
    """Collect direct post links whose thread title contains *keyword*.

    Fetches *url*, walks every ``<article class="post">`` on that page,
    follows each post's "Link to this post" anchor and keeps the link when
    the element with class ``post_title`` on the linked page contains
    *keyword* (case-insensitive). The text of each matching post is printed.

    Args:
        url: Address of the page that lists the posts.
        keyword: Substring to look for in the thread title.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        List of matching post links, in the order they appear on the page.

    Raises:
        requests.RequestException: If the request for the listing page
            fails (connection error or timeout). Failures on individual
            post pages are reported and skipped instead.
    """
    # Local import so the block does not depend on a new file-level import.
    from urllib.parse import urljoin

    # Without a timeout, requests waits forever on a stalled server.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    posts = soup.find_all('article', class_='post')

    wanted = keyword.lower()  # invariant across posts, so compute it once
    filtered_links = []
    # Threads whose title was checked and did not contain the keyword.
    known_bad_threads = set()

    for post in posts:
        try:
            link_button = post.find('a', title='Link to this post')
            if not link_button:
                continue
            # .get() avoids an uncaught KeyError on an anchor without href.
            href = link_button.get('href')
            if not href:
                continue

            # Numbers are taken from the raw href so that digits in the base
            # URL cannot shift which number is "first" and "second".
            post_id = extract_second_number(href)
            thread = extract_first_number(href)
            if thread in known_bad_threads:
                continue

            # Resolve relative hrefs; absolute ones pass through.
            link = urljoin(url, href)
            try:
                post_response = requests.get(link, timeout=timeout)
                # An error page has no title; do not let it blacklist the
                # thread as if the title had been checked.
                post_response.raise_for_status()
            except requests.RequestException as e:
                print("Error fetching post page; skipping.", link, e)
                continue

            post_soup = BeautifulSoup(post_response.content, 'html.parser')
            post_title = post_soup.find(class_="post_title")
            if not (post_title and wanted in post_title.text.lower()):
                # Only remember real thread ids; a None id would otherwise
                # suppress every later link that has no number in it.
                if thread is not None:
                    known_bad_threads.add(thread)
                continue

            filtered_links.append(link)
            print("Keyword found, link added:", link)
            post_data = post_soup.find(id=post_id) if post_id else None
            if post_data is not None:
                print(post_data.get_text())
            else:
                print("Post element not found on page for id:", post_id)
        except (TypeError, AttributeError) as e:
            # Best-effort: unexpected markup in one post must not stop the scan.
            print("Error processing a post; missing elements or attributes.", e)
            continue

    return filtered_links

def extract_first_number(url):
    """Return the first run of digits in *url* as a string.

    Returns None when *url* contains no digits.
    """
    found = re.search(r'\d+', url)
    return found.group() if found else None

def extract_second_number(url):
    """Return the second run of digits in *url* as a string.

    Returns None when *url* contains fewer than two runs of digits.
    """
    digit_runs = re.findall(r'\d+', url)
    try:
        return digit_runs[1]
    except IndexError:
        # Zero or one number present.
        return None

# Search-results page to scan and the keyword a thread title must contain.
url = "https://archive.palanq.win/vt/search/text/michi/"
keyword = "pcg"

if __name__ == "__main__":
    # Only hit the network when run as a script, not when imported.
    filtered_links = fetch_filtered_links(url, keyword)
    print("Filtered links:", filtered_links)