Advertisement
Guest User

Untitled

a guest
May 22nd, 2018
129
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.32 KB | None | 0 0
  1.  
  2. from urllib.request import urlopen
  3. from urllib.error import HTTPError
  4. from bs4 import BeautifulSoup
  5. from tqdm import tqdm
  6. import urllib.request
  7. import re
  8. import os
  9. import sys
  10.  
  11.  
  12. def get_page(url):
  13.     '''Request a page.
  14.  
  15.    If successfull, would return BeautifulSoup object, if not,
  16.    would return None'''
  17.     try:
  18.         html = urlopen(url)
  19.     except HTTPError:
  20.         sys.error("Can't load the page: \n{}".format(url))
  21.         return None
  22.  
  23.     bs_obj = BeautifulSoup(html, "html.parser")
  24.     try:
  25.         bs_obj.h2
  26.     except AttributeError:
  27.         sys.error("Can't load the page: \n{}".format(url))
  28.         return None
  29.  
  30.     return bs_obj
  31.  
  32.  
  33. def get_image_name(thumbnail_url, folder_id=False):
  34.     '''Get name of an image from URL.
  35.  
  36.    folder_id -- if set to True, would return ID of the folder, in
  37.    which image was stored on server.'''
  38.     if folder_id is True:
  39.         result = re.search(
  40.             r'\/[0-9]*\/([A-Za-z0-9_]*)\.(png|gif|jpg|jpeg)', thumbnail_url)
  41.         return re.sub(r'thumbnail_', '', result.group(0))
  42.     else:
  43.         return re.search(
  44.             r'([A-Za-z0-9]*)\.(png|gif|jpg|jpeg)', thumbnail_url).group(0)
  45.  
  46.  
  47. def download_image(url, directory='./'):
  48.     '''Download image from url and save it to the directory.
  49.  
  50.    default directory -- script folder.'''
  51.     return urllib.request.urlretrieve(
  52.         url, "{}{}".format(directory, get_image_name(url)))
  53.  
  54.  
  55. booru_url = 'https://safebooru.org/'
  56.  
  57. if __name__ == "__main__":
  58.     print('Please, enter a set of tags to search for (seperated by spaces): ',
  59.           end='')
  60.     tags = input()
  61.  
  62.     tags = '+'.join([tag for tag in tags.split(' ')])
  63.     bs_obj = get_page(booru_url + 'index.php?page=post&s=list&tags=' + tags)
  64.  
  65.     # If there's no result for that set of tags.
  66.     if bs_obj.find("h1"):
  67.         if bs_obj.find("h1").get_text() == "Nothing found, try google? ":
  68.             sys.error("No result for \"" + tags + " tags.")
  69.  
  70.     print('Choose name for the folder (/images/YOUR_NAME): ', end='')
  71.     folder = input()
  72.  
  73.     try:
  74.         os.mkdir('./images/')
  75.     except FileExistsError:
  76.         pass
  77.  
  78.     print('Trying to create \"' + './images/' + folder + '/' + '\"...')
  79.     try:
  80.         os.mkdir('./images/' + folder + '/')
  81.         print('Directory has been successfully created.')
  82.     except FileExistsError:
  83.         print('Directory already exist.')
  84.  
  85.     last_page = bs_obj.find("a", {"alt": "last page"})
  86.  
  87.     # last_page would be None, if there's only 1 page of images.
  88.     if last_page is None:
  89.         print("There is only 1 page of images.")
  90.  
  91.         for preview_image in tqdm(bs_obj.find_all("img", {"class": "preview"}),
  92.                                   desc="Downloading..."):
  93.             download_image(booru_url + "images" +
  94.                            get_image_name(preview_image.attrs['src'], True),
  95.                            './images/' + folder + '/')
  96.  
  97.     else:
  98.         last_page = re.sub(r'^\?.*&pid=', '', last_page.attrs['href'])
  99.         print("There's {} pages, ~{} files. "
  100.               .format(int(last_page) / 40, last_page))
  101.  
  102.         pages = 0
  103.         while True:
  104.             print('How many pages do you want to download? (1 - {}): '
  105.                   .format(int(last_page) / 40), end='')
  106.             pages = int(input())
  107.  
  108.             if pages > (int(last_page) / 40) or pages < 1:
  109.                 print('Invalid number of pages. Please, choose between 1 - {}'
  110.                       .format(int(last_page) / 40))
  111.                 continue
  112.             else:
  113.                 break
  114.  
  115.         for page_id in tqdm(range(0, (pages * 40), 40), desc="Downloading..."):
  116.  
  117.             page = get_page(booru_url +
  118.                             'index.php?page=post&s=list&tags=' +
  119.                             tags +
  120.                             "&pid=" + str(page_id)
  121.                             )
  122.  
  123.             for preview_image in tqdm(page.find_all(
  124.                     "img", {"class": "preview"}),
  125.                     desc="Page #{}/{}".format(int(page_id / 40) + 1, pages),
  126.                     unit_scale=True):
  127.                 download_image(booru_url + "images" +
  128.                                get_image_name(preview_image.attrs['src'],
  129.                                               True),
  130.                                './images/' + folder + '/')
  131.  
  132.         print('\n')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement