import os
import shlex
import time

import requests
from bs4 import BeautifulSoup

# set the starting number and maximum number of URLs to parse
start = 1
max_num = 17047267  # Highest post ID I could find
terms = ["replacemewithsubreddits", "orwordslikegonewild"]

# create the "originals" folder if it doesn't exist
if not os.path.exists("originals"):
    os.mkdir("originals")
# loop from the highest post ID down to the starting number and request each URL
for i in reversed(range(start, max_num + 1)):
    response = None
    while response is None:
        print(f"Getting number {i} ...")
        url = f"https://jizz2.com/pics/{i}"
        try:
            response = requests.get(url, timeout=30)
        except requests.exceptions.RequestException:
            time.sleep(1)  # back off briefly before retrying a failed request

    # parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")

    # skip posts whose <p class="tags"> is missing or doesn't mention any search term
    tags = soup.find("p", class_="tags")
    if tags is None:
        continue
    print(tags.text)
    found = False
    for term in terms:
        if term in tags.text.lower():
            print(f"{i} contains {term}")
            found = True
    if not found:
        continue
    # find the original URL in the <p> tag with class "desc"
    desc = soup.find("p", class_="desc")
    if desc is not None and "original: " in desc.text:
        original_url = desc.text.split("original: ")[1]
        print(original_url)
        if "jpg" in original_url or "png" in original_url or "gif" in original_url:
            # download the original URL into originals/ and log the post ID -> URL mapping
            os.system(f"wget {shlex.quote(original_url)} -P originals/")
            with open("url_list", "a") as url_list:
                url_list.write(f"{i}:{original_url}\n")
# The cleanup below was too computationally intensive - the goal was to drop
# removed pages and stray HTML files that ended up among the downloads
#os.system("grep -Rli \"DOCTYPE html\" * | grep -v py | xargs rm; find . -type f -size 503c -delete")