Advertisement
Guest User

Untitled

a guest
Dec 10th, 2019
270
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.55 KB | None | 0 0
  1. #------------------------------------------------------------------------------
  2. # The famous "cats vs dogs" data set is used in this example. The data set
  3. # contains 12500 dog pictures and 12500 cat pictures. All the images are shuffled
  4. # randomly; 20000 images are used to train and 5000 images are used to test. The
  5. # images can be resized to different sizes, but the size of the .hdf5 file varies
  6. # greatly with the image size: the file is 1.14G when the images are (128,128),
  7. # 4.57G for (256,256) and 18.3G for (512,512).
  8. #------------------------------------------------------------------------------
  9.  
  10. ########################## first part: prepare data ###########################
  11. from random import shuffle
  12. import glob
  13.  
  14. shuffle_data = True  # shuffle the addresses
  15.  
  16. hdf5_path = '/Users/fathansatriaanandika/Documents/SistemCerdas/code-week13/datasets/cat.h5'  # file path for the created .hdf5 file
  17.  
  18. cat_dog_train_path = '/Users/fathansatriaanandika/Documents/SistemCerdas/code-week13/datasets/Cat/*.jpg' # the original data path
  19.  
  20. # get all the image paths
  21. addrs = glob.glob(cat_dog_train_path)
  22.  
  23. # label the data as 0=cat, 1=dog
  24. labels = [0 if 'cat' in addr else 1 for addr in addrs]
  25.  
  26. # shuffle data
  27. if shuffle_data:
  28.     c = list(zip(addrs, labels)) # use zip() to bind the images and labels together
  29.     shuffle(c)
  30.  
  31.     (addrs, labels) = zip(*c)  # *c is used to separate all the tuples in the list c,  
  32.                                # "addrs" then contains all the shuffled paths and
  33.                                # "labels" contains all the shuffled labels.
  34.                                
  35. # Divide the data into 80% for train and 20% for test
  36. train_addrs = addrs[0:int(0.8*len(addrs))]
  37. train_labels = labels[0:int(0.8*len(labels))]
  38.  
  39. test_addrs = addrs[int(0.8*len(addrs)):]
  40. test_labels = labels[int(0.8*len(labels)):]
  41.  
  42.  
  43.  
  44. ##################### second part: create the h5py object #####################
  45. import numpy as np
  46. import h5py
  47.  
  48. train_shape = (len(train_addrs), 128, 128, 3)
  49. test_shape = (len(test_addrs), 128, 128, 3)
  50.  
  51. # open a hdf5 file and create earrays
  52. f = h5py.File(hdf5_path, mode='w')
  53.  
  54. # PIL.Image: the pixels range is 0-255,dtype is uint.
  55. # matplotlib: the pixels range is 0-1,dtype is float.
  56. f.create_dataset("train_img", train_shape, np.uint8)
  57. f.create_dataset("test_img", test_shape, np.uint8)  
  58.  
  59. # the ".create_dataset" object is like a dictionary, the "train_labels" is the key.
  60. f.create_dataset("train_labels", (len(train_addrs),), np.uint8)
  61. f["train_labels"][...] = train_labels
  62.  
  63. f.create_dataset("test_labels", (len(test_addrs),), np.uint8)
  64. f["test_labels"][...] = test_labels
  65.  
  66. ######################## third part: write the images #########################
  67.  
  68.  
  69. import cv2
  70.  
  71. # loop over train paths
  72. for i in range(len(train_addrs)):
  73.  
  74.     if i % 1000 == 0 and i > 1:
  75.         print ('Train data: {}/{}'.format(i, len(train_addrs)) )
  76.  
  77.     addr = train_addrs[i]
  78.     img = cv2.imread(addr)
  79.     img = cv2.resize(img, (128, 128), interpolation=cv2.INTER_CUBIC)# resize to (128,128)
  80.     img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # cv2 load images as BGR, convert it to RGB
  81.     f["train_img"][i, ...] = img[None]
  82.  
  83. # loop over test paths
  84. for i in range(len(test_addrs)):
  85.  
  86.     if i % 1000 == 0 and i > 1:
  87.         print ('Test data: {}/{}'.format(i, len(test_addrs)) )
  88.  
  89.     addr = test_addrs[i]
  90.     img = cv2.imread(addr)
  91.     img = cv2.resize(img, (128, 128), interpolation=cv2.INTER_CUBIC)
  92.     img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
  93.     f["test_img"][i, ...] = img[None]
  94.  
  95. f.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement