Advertisement
JamesTan

Untitled

Apr 9th, 2020
383
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Bash 3.31 KB | None | 0 0
  1. import datetime
  2. import os
  3. import re
  4. import random
  5. from urllib import request
  6.  
  7.  
  8. class ImageCrawler(object):
  9.  
  10.     @staticmethod
  11.     def get_url(url):
  12.         headers = {
  13.             'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
  14.         }
  15.         url = request.Request(url, headers=headers)
  16.         target_html = request.urlopen(url)
  17.         html_content = target_html.read()
  18.         html_content = html_content.decode('utf-8', "ignore")
  19.         return html_content
  20.  
  21.     @staticmethod
  22.     def get_specific_img(target_html, input_array):
  23.         img_list = []
  24.         for input1 in input_array:
  25.             reg = r'https:\/\/[^\s,"]*\%s.jpg' % input1
  26.             img_reg = re.compile(reg)
  27.             try:
  28.                 img_list.append(re.findall(img_reg, target_html)[0])
  29.             except:
  30.                 print('cannot find picture %s' % input1)
  31.         return img_list
  32.  
  33.     @staticmethod
  34.     def get_img(target_html):
  35.         reg = r'https:\/\/[^\s,"]*\.jpg'
  36.         img_reg = re.compile(reg)
  37.         img_list = re.findall(img_reg, target_html)
  38.         return img_list
  39.  
  40.     @staticmethod
  41.     def save_img(target_img_list):
  42.         file_dir = os.getcwd()
  43.         file_path = os.path.join(file_dir, 'downlosd_image')
  44.         check_exists = os.path.exists(file_path)
  45.         if not check_exists:
  46.             os.makedirs(file_path)
  47.         count_num = 0
  48.         for img in target_img_list:
  49.             count_num = count_num + 1
  50.             now_time = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
  51.             image_name = r"{0}\{1}_image_{2}.jpg".format(file_path, now_time, count_num)
  52.             request.urlretrieve(img, image_name)
  53.             print("Save image:{0}".format(image_name))
  54.  
  55. # input_array = ['DSC01533', 'DSC01536', 'DSC01543']
  56.  
  57. if __name__ == '__main__':
  58.     imageCrawler = ImageCrawler()
  59.     test_target_html = imageCrawler.get_url(
  60.         "https://www.ecu.edu.au/service-centres/MACSC/gallery/gallery.php?folder=152")
  61.     option = input("choose mosde:"
  62.                    "1.Download specific thumbnails  "
  63.                    "2.Download ALL thumbnails  "
  64.                    "3.Download images in a range  "
  65.                    "4.Download random n pictures  ")
  66.     if option == '1':
  67.         n = input("input number of pictures:")
  68.         input_array = [input("input picture name:") for i in range(int(n))]
  69.         test_img_list = imageCrawler.get_specific_img(test_target_html, input_array)
  70.  
  71.     elif option == '2':
  72.         test_img_list = imageCrawler.get_img(test_target_html)
  73.  
  74.     elif option == '3':
  75.         range_left = int(input("input range left:")[3:])
  76.         range_right = int(input("input range right:")[3:])
  77.         alphabet = 'DSC0'
  78.  
  79.         input_array = []
  80.         for picture_digit in range(range_left, range_right, 1):
  81.             picture_code = alphabet + str(picture_digit)
  82.             input_array.append(picture_code)
  83.         test_img_list = imageCrawler.get_specific_img(test_target_html, input_array)
  84.  
  85.     elif option == '4':
  86.         n = int(input("input number of random pictures:"))
  87.         test_img_list = imageCrawler.get_img(test_target_html)
  88.         test_img_list = random.choices(test_img_list, k=n)
  89.  
  90.     else:
  91.         print('wrong command')
  92.  
  93.     imageCrawler.save_img(test_img_list)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement