Untitled
a guest · Dec 9th, 2019 · Python
#!/usr/bin/env python3

import re
import shutil

import bs4
import requests
from pathlib import Path


# Constants
scrape_dir = Path.cwd() / 'scraped'
scrape_dir.mkdir(exist_ok=True)  # no error if it already exists
parent = "amateur-tv"
# A plausible desktop User-Agent so the blog serves the normal pages
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/75.0.3770.90 Chrome/75.0.3770.90 Safari/537.36'}
css = ".more-link"          # per-post "Read more" links on the index pages
css_title = ".entry-title"  # unused below
url_base = "http://amateurblog.tv/page/"  # requests needs the scheme
uploads_head = "http://amateurblog.tv/wp-content/"

# Make Parent Directory
parent_dir = scrape_dir / parent
parent_dir.mkdir(exist_ok=True)

# Regex and Requests Setup
bad_status_image_list = []
regex = re.compile(r"uploads/[0-9a-zA-Z/_]*")
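# For instance, markup containing
#   src="http://amateurblog.tv/wp-content/uploads/2019/12/photo_01.jpg"
# matches as "uploads/2019/12/photo_01" (the "." stops the match), which is
# why ".jpg" is re-appended when the image URLs are rebuilt below. The path
# here is hypothetical; it just follows WordPress's uploads/YYYY/MM/ layout.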


# Get Pages
for i in range(1, 300):
    response = requests.get(url_base + str(i), headers=headers)
    response.raise_for_status()

    # Get URLs per page
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    sets = [elem['href'] for elem in soup.select(css)]

    # Open Set Pages
    for link in sets:
        res = requests.get(link, headers=headers)
        res.raise_for_status()
        soup2 = bs4.BeautifulSoup(res.text, 'html.parser')
        # Use the page <title>, with spaces stripped, as the directory name
        title = soup2.find('title').contents[0]
        title = ''.join(title.split(' '))
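        # e.g. a hypothetical <title> of "Some Set Name" becomes the
        # directory name "SomeSetName"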

        # Make Directory
        set_dir = parent_dir / title
        if not set_dir.exists():
            set_dir.mkdir()
            print(f"Making {set_dir}")

        # Parse for Images (deduplicated with a set)
        mo = list(set(regex.findall(res.text)))

        # Build Image URLs, skipping theme background images
        image_urls = [uploads_head + tail + ".jpg" for tail in mo if 'background' not in tail]
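        # Every upload is assumed to be a .jpg; anything else (a .png, or a
        # path the regex truncated) should come back non-200 below and be
        # logged rather than saved.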


        for image in image_urls:
            response = requests.get(image, stream=True, headers=headers)
            if response.status_code == 200:
                # The URL already ends in ".jpg", so its last path segment
                # is the filename
                filename = image.split('/')[-1]
                image_path = set_dir / filename
                if not image_path.exists():
                    with open(image_path, 'wb') as fo:
                        shutil.copyfileobj(response.raw, fo)
                    print("Saving " + filename)
                else:
                    print("Already Saved Image")
            else:
                bad_status_image_list.append(image)

# Log any image URLs that did not return HTTP 200
with open(parent_dir / "bad_status_image_list.txt", 'a') as fo:
    for line in bad_status_image_list:
        fo.write(line + "\n")
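
To run this, only requests and beautifulsoup4 (imported as bs4) need to be installed. Output lands under ./scraped/amateur-tv/ relative to wherever the script is launched, and image URLs that failed are appended to bad_status_image_list.txt in that directory. The page range (1 to 300) is hard-coded, so adjust it to however many index pages the blog actually has.

If the blog rate-limits or drops connections, one optional tweak, sketched here and not part of the original script, is to send every request through a retrying Session (urllib3 ships with requests, so there is nothing extra to install):

# A minimal retrying session; swap it in by replacing requests.get(...)
# with session.get(...) in the loops above.
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
session.headers.update(headers)  # reuse the User-Agent defined above
retries = Retry(total=3, backoff_factor=1,
                status_forcelist=[429, 500, 502, 503, 504])
session.mount('http://', HTTPAdapter(max_retries=retries))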