import os
import shlex
import time

import requests
from bs4 import BeautifulSoup

# set the starting number and maximum number of URLs to parse
start = 1
max_num = 17047267  # Highest post ID I could find
terms = ["replacemewithsubreddits", "orwordslikegonewild"]

# create the "originals" folder if it doesn't exist
if not os.path.exists("originals"):
    os.mkdir("originals")
# loop from the highest post ID down to the starting number and request each URL
for i in reversed(range(start, max_num + 1)):
    response = None
    while response is None:
        print(f"Getting number {i} ...")
        url = f"https://jizz2.com/pics/{i}"
        try:
            response = requests.get(url, timeout=30)
        except requests.exceptions.RequestException:
            time.sleep(1)  # back off briefly before retrying a failed request

    # parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")

    # skip posts whose <p class="tags"> is missing or doesn't mention any search term
    tags = soup.find("p", class_="tags")
    if tags is None:
        continue
    print(tags.text)
    found = False
    for term in terms:
        if term in tags.text.lower():
            print(f"{i} contains {term}")
            found = True
    if not found:
        continue
    # find the original URL in the <p> tag with class "desc"
    desc = soup.find("p", class_="desc")
    if desc is not None and "original: " in desc.text:
        original_url = desc.text.split("original: ")[1]
        print(original_url)
        if "jpg" in original_url or "png" in original_url or "gif" in original_url:
            # download the original URL into originals/ and log the post ID -> URL mapping
            os.system(f"wget {shlex.quote(original_url)} -P originals/")
            with open("url_list", "a") as url_list:
                url_list.write(f"{i}:{original_url}\n")
# The cleanup below was too computationally intensive - the goal was to drop
# removed pages and stray HTML files that ended up among the downloads
#os.system("grep -Rli \"DOCTYPE html\" * | grep -v py | xargs rm; find . -type f -size 503c -delete")