Advertisement
Guest User

Untitled

a guest
Apr 19th, 2023
5,220
1
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.77 KB | None | 1 0
import os
import subprocess
import time

import requests
from bs4 import BeautifulSoup
  4.  
  5. # set the starting number and maximum number of URLs to parse
  6. start = 1
  7. max_num = 17047267 # Highest post ID I could find
  8. terms = ["replacemewithsubreddits", "orwordslikegonewild"]
  9.  
  10. # create the "originals" folder if it doesn't exist
  11. if not os.path.exists("originals"):
  12. os.mkdir("originals")
  13.  
  14. # loop through the range of numbers and make a request to each URL
  15. for i in reversed(range(max_num)):
  16. response = None
  17. while response is None:
  18. print(f"Getting number {i} ...")
  19. url = f"https://jizz2.com/pics/{i}"
  20. try:
  21. response = requests.get(url)
  22. except:
  23. pass
  24.  
  25. soup = BeautifulSoup(response.content, "html.parser")
  26.  
  27. tags = soup.find("p", class_="tags")
  28. if tags is None:
  29. continue
  30. found = False
  31. print(tags.text)
  32. for term in terms:
  33. if term in str(tags.text).lower():
  34. print(f"{i} contains {term}")
  35. found = True
  36.  
  37. if not found:
  38. continue
  39.  
  40. # parse the HTML content with BeautifulSoup
  41. # find the original URL in the <p> tag with class "desc"
  42. desc = soup.find("p", class_="desc")
  43. if desc is not None:
  44. original_url = desc.text.split('original: ')[1]
  45. print(original_url)
  46. if 'jpg' in original_url or 'png' in original_url or 'gif' in original_url :
  47. # download the original URL and save it to a file
  48. os.system(f"wget {original_url} -P originals/")
  49. os.system(f"echo {i}:{original_url} >> url_list")
  50. # The following was too computationally intensive - wanted to drop removed pages and HTML files
  51. #os.system("grep -Rli \"DOCTYPE html\" * | grep -v py | xargs rm; find . -type f -size 503c -delete")
  52.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement