here2share

#webscrape_images.py

Aug 21st, 2023
#webscrape_images.py

import urllib.request
import os
import re
import random
from PIL import Image

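# Starting from a seed profile URL, this script crawls DeviantArt pages,
# follows /art/ and /gallery/ links, and saves every image larger than
# 500x500 into base_folder as quality-90 JPEG.
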
internal = []   # queue of pages still to visit
keep = []       # every link seen so far, to avoid revisiting

def get_data(link):
    # (Unused helper.) Reads the w_/h_ size hints encoded in a DeviantArt
    # CDN URL and returns the link only if both dimensions exceed 500px.
    size = link.replace('h_', '')
    try:
        _, size = size.split('/fill/w_')
        w, h = [int(i) for i in size.split(',')[:2]]
        if w > 500 and h > 500:
            return link
    except (ValueError, IndexError):
        return None

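# Example (hypothetical URL, for illustration only):
#   get_data('https://img.example/v1/fill/w_1024,h_1280,q_80,strp/pic.jpg')
#   -> returns the link, since the encoded 1024x1280 passes the 500px check
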
def addlinks(keyURL):
    # Fetch a page and queue any new DeviantArt /art/ or /gallery/ links.

    def trim(target):
        # Drop trailing slashes/backslashes so URLs compare consistently.
        return target.rstrip('/\\')

    url = trim(keyURL)

    try:
        html = urllib.request.urlopen(url).read().decode('utf-8')
    except Exception:
        return

    def filterHTML():
        # Collect every href, then drop DeviantArt links that are neither
        # /art/ pages nor /gallery/ pages.
        links = re.findall(r'href="([^"]+)"', html)
        links = [z for z in links
                 if not ('deviantart.' in z
                         and '/art/' not in z
                         and '/gallery/' not in z)]
        return sorted(set(links))

    links = filterHTML()

    avoid = 'java. mailto. about. shop. welcome. adverti /apply/'.split()
    for link in links:
        if any(i in link for i in list('"\'#=') + avoid):
            continue
        link = trim(link)
        if link not in keep:
            keep.append(link)
            internal.append(link)

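# Example of the filtering (hypothetical hrefs, for illustration only):
#   'https://www.deviantart.com/users/login'    -> dropped (not /art/ or /gallery/)
#   'https://someuser.deviantart.com/gallery/'  -> trimmed and queued
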
base_folder = r'c:\Z-DL-TEMP'
if not os.path.exists(base_folder):
    os.makedirs(base_folder)

# URLs of images already saved, so repeat links are skipped
dupl = set()

def download_image(url):
    # Save the image at url if it is bigger than 500x500. Returns 1 when a
    # tiny (<200x200) image is reached, which the caller treats as the start
    # of the page's thumbnail section and stops scanning.
    if url in dupl:
        return
    try:
        file = urllib.request.urlopen(url)
    except Exception:
        return
    try:
        # DeviantArt CDN URLs carry the filename between 'strp/' and '?token'.
        file_name = url.split('strp/')[1]
        file_name = file_name.split('?token')[0]
    except IndexError:
        return
    print("Image:", file_name)
    try:
        # Open the response stream with Pillow to check the real dimensions.
        img = Image.open(file)
        width, height = img.size

        print([width, height])
        if width > 500 and height > 500:
            dupl.add(url)
            if img.mode != 'RGB':
                img = img.convert('RGB')
            # Re-encode as quality-90 JPEG to keep files compact.
            img.save(os.path.join(base_folder, file_name),
                     format='JPEG', quality=90, optimize=True)
            print('+' * 20, 'IMAGE SAVED', '+' * 20, '\n')
        elif width < 200 and height < 200:
            print()
            return 1
        else:
            print('__too small__')
    except Exception:
        print('__unable to process file__')
    print()

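# Filename extraction example (hypothetical URL, for illustration only):
#   'https://cdn.example/a/b/strp/my_art.jpg?token=abc123'
#   -> saved locally as 'my_art.jpg'
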
exts = ".jpg .jpeg .png .webp".split()

def extract_image_links(html):
    # Pull image URLs out of src="..." attributes, keeping only candidates
    # that mention 'image' (e.g. an images CDN host) and carry a known
    # image extension.
    urls = []
    parts = html.split('src="')
    for part in parts:
        if 'http' not in part:
            continue
        url = part = 'http' + part.split('http', 1)[1]
        if 'image' not in part:
            continue
        part = part.split('image')[1]
        for ext in exts:
            if ext in part:
                # Cut the URL off at the closing quote (or first space).
                url = url.split('"')[0]
                url = url.split(' ')[0]
                urls.append(url)
                break
    return urls

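# Quick check (hypothetical snippet, for illustration only):
#   extract_image_links('<img src="https://example.com/images/cat.jpg">')
#   -> ['https://example.com/images/cat.jpg']
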
def from_page(u):
    # Download every qualifying image found on one page.
    print('=' * 20)
    print("+++", u)
    print('=' * 20)
    print("Please Wait...")
    print()
    try:
        in_page = urllib.request.urlopen(u).read().decode('utf-8')
    except Exception:
        return
    links = extract_image_links(in_page)
    for link in links:
        t = download_image(link)
        if t:
            # A tiny image signals the thumbnail section; stop early.
            break

def main(z):
    # Crawl outward from the seed, visiting queued DeviantArt pages in
    # random order until the queue is exhausted.
    internal.append(z)
    keep.append(z)
    while internal:
        random.shuffle(internal)
        link = internal.pop(0)
        if '.deviantart.com' in link:
            addlinks(link)
            from_page(link)

z = 'https://***.deviantart.com/'  # '***' is a placeholder for the target account
main(z)

print('\n', '_' * 10, 'DONE!', '_' * 10, '\n')
print('Downloaded Images:')

for d in dupl:
    print(d)