Guest User

good michi post getter

a guest
Apr 22nd, 2024
65
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.37 KB | None | 0 0
# Dependencies (install from a shell first): pip install requests beautifulsoup4
  2.  
  3.  
import re

import requests
from bs4 import BeautifulSoup
  7.  
  8. def fetch_filtered_links(url, keyword):
  9. response = requests.get(url)
  10. soup = BeautifulSoup(response.content, 'html.parser')
  11. posts = soup.find_all('article', class_='post')
  12.  
  13. # List to store the direct links of posts with the keyword in the thread title
  14. filtered_links = []
  15. known_bad_threads = set() # Use a set to track known bad threads
  16.  
  17. # Iterate through each post to find the direct link and check the thread title
  18. for post in posts:
  19. try:
  20. linkButton = post.find('a', title='Link to this post')
  21. if linkButton:
  22. link = linkButton['href']
  23. postId = extract_second_number(link)
  24. thread = extract_first_number(link)
  25. #print("Checking thread:", thread)
  26. if thread not in known_bad_threads:
  27. post_response = requests.get(link)
  28. post_soup = BeautifulSoup(post_response.content, 'html.parser')
  29. post_title = post_soup.find(class_="post_title")
  30. if post_title and keyword.lower() in post_title.text.lower():
  31. filtered_links.append(link)
  32. print("Keyword found, link added:", link)
  33. postData = post_soup.find(id=postId)
  34. print(postData.get_text())
  35. else:
  36. known_bad_threads.add(thread)
  37. except (TypeError, AttributeError) as e:
  38. print("Error processing a post; missing elements or attributes.", e)
  39. continue
  40.  
  41. return filtered_links
  42.  
  43. def extract_first_number(url):
  44. match = re.search(r'\d+', url)
  45. if match:
  46. return match.group() # Return the first match found
  47. else:
  48. return None # Adjusted to return None if no number is found
  49.  
  50. def extract_second_number(url):
  51. matches = re.findall(r'\d+', url)
  52. if len(matches) >= 2:
  53. return matches[1] # Return the second match
  54. else:
  55. return None # Return None if there are less than two numbers
  56.  
  57. # URL and keyword to search
  58. url = "https://archive.palanq.win/vt/search/text/michi/"
  59. keyword = "pcg"
  60.  
  61. # Get the filtered links
  62. filtered_links = fetch_filtered_links(url, keyword)
  63. print("Filtered links:", filtered_links)
Advertisement
Add Comment
Please, Sign In to add comment