Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/env python3
"""Scrape image sets from amateurblog.tv.

Walks index pages 1-299, follows each ".more-link" to a set page, extracts
wp-content upload paths with a regex, and downloads every image into
scraped/amateur-tv/<set-title>/.  Images whose HTTP status is not 200 are
recorded in bad_status_image_list.txt.
"""
import os
import re
import shutil
from pathlib import Path

import bs4
import requests

# Constants
SCRAPE_DIR = Path.cwd() / 'scraped'
PARENT = "amateur-tv"
HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/75.0.3770.90 Chrome/75.0.3770.90 Safari/537.36'}
CSS_MORE_LINK = ".more-link"
CSS_TITLE = ".entry-title"
# Bug fix: original url_base had no scheme, so requests.get raised
# MissingSchemaError on the very first page fetch.
URL_BASE = "http://amateurblog.tv/page/"
UPLOADS_HEAD = "http://amateurblog.tv/wp-content/"

# Matches relative wp-content upload paths; extensions are not captured,
# so ".jpg" is appended when building the full URL.
UPLOADS_RE = re.compile(r"uploads/[0-9a-zA-Z/_]*")


def _get_set_links(page_index):
    """Return the set-page URLs linked from one index page.

    Raises requests.HTTPError on a bad status.
    """
    response = requests.get(URL_BASE + str(page_index), headers=HEADERS)
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    return [elem['href'] for elem in soup.select(CSS_MORE_LINK)]


def _save_image(url, dest_dir, bad_images):
    """Download one image into dest_dir; append url to bad_images on non-200.

    Skips files that already exist (the original checked one name but saved
    under another — name + ".jpg" twice — so its dedupe never worked).
    """
    filename = url.rsplit('/', 1)[-1]
    target = dest_dir / filename
    if target.exists():
        print("Already Saved Image")
        return
    response = requests.get(url, stream=True, headers=HEADERS)
    if response.status_code != 200:
        bad_images.append(url)
        return
    # stream=True + copyfileobj avoids holding the whole image in memory.
    with open(target, 'wb') as fo:
        shutil.copyfileobj(response.raw, fo)
    print("Saving " + filename)


def _download_set(link, parent_dir, bad_images):
    """Fetch one set page and download all of its images."""
    res = requests.get(link, headers=HEADERS)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    # Squash spaces out of the <title> text to get a directory name.
    title = ''.join(str(soup.find('title').contents[0]).split(' '))
    set_dir = parent_dir / title
    if not set_dir.exists():
        print("Making " + str(set_dir))
        set_dir.mkdir(parents=True)
    # De-duplicate regex hits; 'background' paths are site chrome, not set images.
    tails = set(UPLOADS_RE.findall(res.text))
    for tail in tails:
        if 'background' not in tail:
            _save_image(UPLOADS_HEAD + tail + ".jpg", set_dir, bad_images)


def main():
    """Crawl every index page and record failed image URLs afterwards."""
    parent_dir = SCRAPE_DIR / PARENT
    # Original used a bare `except: pass` around mkdir and a broken
    # Path + str concatenation; exist_ok/parents handles both cleanly.
    parent_dir.mkdir(parents=True, exist_ok=True)
    bad_images = []
    for page in range(1, 300):
        for link in _get_set_links(page):
            _download_set(link, parent_dir, bad_images)
    # Bug fix: original did open(bad_status_image_list + ".txt") — list + str
    # TypeError. Write the report under the parent directory instead.
    with open(parent_dir / "bad_status_image_list.txt", 'a') as fo:
        for line in bad_images:
            fo.write(line + "\n")


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement