#webscrape_images.py
# Crawls DeviantArt pages starting from one gallery URL, follows internal
# links, and saves any image larger than 500x500 px to a local folder.
import urllib.request
import os
import re
import time
import random
from PIL import Image

internal = []    # queue of pages still to visit
keep = []        # every link seen so far, to avoid re-queueing
links_only = 1   # unused flag, kept from the original
def get_data(link):
    # Parse width/height out of a DeviantArt thumbnail URL of the form
    # '.../fill/w_WIDTH,h_HEIGHT,...' and return the link only if the
    # image is larger than 500x500. (Not called anywhere below;
    # download_image() checks the real size instead.)
    size = link.replace('h_', '')
    try:
        _, size = size.split('/fill/w_')
        w, h = [int(i) for i in size.split(',')[:2]]
        if w > 500 and h > 500:
            return link
    except (ValueError, IndexError):
        return None
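# A worked example of the parse above, on an invented URL in the usual
# wixmp thumbnail shape (host, path, and token are placeholders):
#   get_data('https://images-wixmp-xxx.wixmp.com/f/pic.jpg/v1/fill/w_1024,h_768,q_70,strp/pic.jpg?token=abc')
#   -> returns the link, since 1024 > 500 and 768 > 500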
def addlinks(keyURL):
    # Fetch one page and queue every acceptable href found on it.
    def trim(target):
        # Strip trailing slashes and backslashes.
        return target.rstrip('/\\')

    url = trim(keyURL)
    try:
        html = urllib.request.urlopen(url).read().decode('utf-8')
    except Exception:
        return

    def filterHTML():
        # Collect hrefs, dropping DeviantArt links that are neither
        # art pages nor galleries.
        links = re.findall(r'href="([^"]+)"', html)
        for z in links[:]:
            if 'deviantart.' in z and '/art/' not in z and '/gallery/' not in z:
                links.remove(z)
        return sorted(set(links))

    avoid = 'java. mailto. about. shop. welcome. adverti /apply/'.split()
    for link in filterHTML():
        # Drop links containing quotes, '#', '=', or a blacklisted fragment.
        for i in list('"\'#=') + avoid:
            if i in link:
                link = None
                break
        if link:
            link = trim(link)
            if link not in keep:
                keep.append(link)
                internal.append(link)
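# The filter in action, on two made-up links:
#   'https://www.deviantart.com/about/policy'          -> dropped
#   'https://www.deviantart.com/someartist/art/X-1234' -> kept and queued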
base_folder = r'c:\Z-DL-TEMP'
if not os.path.exists(base_folder):
    os.makedirs(base_folder)

# Duplicate checker: URLs of images that were already saved.
dupl = set()

def download_image(url):
    # Download one image and save it as JPEG if it is larger than
    # 500x500. Returns 1 when a tiny (<200x200) image is seen, which
    # from_page() takes as a signal to stop scanning the current page.
    if url in dupl:
        return
    try:
        file = urllib.request.urlopen(url)
    except Exception:
        return
    try:
        # CDN URLs carry the file name between 'strp/' and '?token'.
        file_name = url.split('strp/')[1]
        file_name = file_name.split('?token')[0]
    except IndexError:
        return
    print("Image:", file_name)
    try:
        # Open the downloaded bytes with Pillow to read the real size.
        img = Image.open(file)
        width, height = img.size
        print([width, height])
        if width > 500 and height > 500:
            dupl.add(url)
            if img.mode != 'RGB':
                img = img.convert('RGB')
            # Everything is re-encoded as JPEG, so normalize the extension.
            file_name = os.path.splitext(file_name)[0] + '.jpg'
            img.save(os.path.join(base_folder, file_name),
                     format='JPEG', quality=90, optimize=True)
            print('+'*20, 'IMAGE SAVED', '+'*20, '\n')
        elif width < 200 and height < 200:
            print('')
            return 1
        else:
            print('__too small__')
    except Exception:
        print('__unable to process file__')
    print('')
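# How the file name is derived, on an invented URL in the usual CDN shape:
#   url = 'https://images-wixmp-xxx.wixmp.com/.../strp/pic_by_artist-d1abc2.png?token=eyJ0...'
#   url.split('strp/')[1].split('?token')[0] -> 'pic_by_artist-d1abc2.png'
# which is then re-encoded and saved as 'pic_by_artist-d1abc2.jpg'.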
- exts = ".jpg .jpeg .png .webp".split()
- def extract_image_links(html):
- urls = []
- parts = html.split('src="')
- for part in parts:
- if 'http' not in part:
- continue
- url = part = 'http' + part.split('http', 1)[1]
- if 'image' not in part:
- continue
- part = part.split('image')[1]
- for ext in exts:
- if ext in part:
- url = url.split('"')[0]
- url = url.split(' ')[0]
- urls.append(url)
- break
- return urls
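# Quick check of extract_image_links() on a hand-written snippet (the
# markup below is made up for illustration, not a real DeviantArt page):
#   sample = '<img src="https://example.test/image/v1/pic.jpg" alt="">'
#   extract_image_links(sample) -> ['https://example.test/image/v1/pic.jpg']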
def from_page(u):
    # Fetch one page and try to download every image linked from it.
    print('='*20)
    print("+++", u)
    print('='*20)
    print("Please Wait...")
    print('')
    try:
        in_page = urllib.request.urlopen(u).read().decode('utf-8')
    except Exception:
        return
    for link in extract_image_links(in_page):
        # download_image() returns 1 on a tiny image; stop scanning then.
        if download_image(link):
            break
def main(z):
    # Crawl: start from z and keep visiting random queued pages until
    # the queue is empty.
    internal.append(z)
    keep.append(z)
    while internal:
        random.shuffle(internal)
        link = internal.pop(0)
        if '.deviantart.com' in link:
            addlinks(link)
            from_page(link)

z = 'https://***.deviantart.com/'
main(z)

print('\n', '_'*10, 'DONE!', '_'*10, '\n')
print('Downloaded Images:')
for d in dupl:
    print(d)
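# The script imports time but never pauses between requests. A polite
# crawl would wait a little before each fetch; a minimal sketch, with an
# arbitrary delay range and a hypothetical helper name (fetch_html is
# not part of the original script):

def fetch_html(url, lo=1.0, hi=3.0):
    # Sleep 1-3 s, then fetch; return '' on any error.
    time.sleep(random.uniform(lo, hi))
    try:
        return urllib.request.urlopen(url).read().decode('utf-8')
    except Exception:
        return ''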