Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import time
- from typing import List, Tuple, Dict
- import os
- import random
- import time
- from typing import List, Tuple, Dict
- import PIL
- import matplotlib.pyplot as plt
- import numpy
- from PIL import Image, ImageEnhance, ImageFilter
- from sklearn.ensemble import RandomForestClassifier
def normalize_and_ravel(image):
    """Flatten an image to a 1-D vector and scale pixel values to [0, 1].

    :param image: a PIL image (or anything ``numpy.asarray`` accepts)
    :return: 1-D ``numpy.ndarray`` of floats — pixels divided by 255
    """
    # asarray avoids an extra copy when the input is already an ndarray;
    # the original also had a pointless `del image` and a lambda assignment.
    pixels = numpy.asarray(image)
    # C-order ravel matches the original row-major flattening.
    return numpy.ravel(pixels, order='C') / 255
def normalize_and_ravel_set(set_pictures):
    """Apply ``normalize_and_ravel`` to every picture, class by class.

    :param set_pictures: mapping of class label -> list of images
    :return: new mapping of class label -> list of normalized 1-D vectors
    """
    return {label: [normalize_and_ravel(picture) for picture in pictures]
            for label, pictures in set_pictures.items()}
def find_freq(data: Dict[str, List[str]]):
    """Count how many samples each class has.

    :param data: mapping of class label -> list of samples
    :return: dict mapping ``int(label)`` -> number of samples
    """
    return {int(label): len(samples) for label, samples in data.items()}
def plot_freq(freq):
    """Draw a bar chart of class frequencies on a new matplotlib figure.

    :param freq: mapping of class label -> sample count
    """
    # dict keys() and values() iterate in the same (insertion) order,
    # so the two lists stay aligned.
    labels = [int(label) for label in freq.keys()]
    counts = list(freq.values())
    plt.figure()
    plt.bar(labels, counts)
def return_top_level_dirs(path_to_dir: str) -> List[str]:
    """
    :param path_to_dir: path to the directory
    :return: names of the immediate (top-level) sub-directories of *path_to_dir*
    """
    # os.walk yields (root, dirnames, filenames); the very first tuple's
    # dirnames are exactly the top-level sub-directories.
    _root, top_level_dirs, _files = next(os.walk(path_to_dir))
    return top_level_dirs
def stack_vectors_prepare_labels(training_dict):
    """Flatten a class->vectors mapping into parallel data/label arrays.

    :param training_dict: mapping of class label -> list of feature vectors
    :return: tuple ``(data, labels)`` of numpy arrays, aligned element-wise
    """
    data = [vector for vectors in training_dict.values() for vector in vectors]
    labels = [label for label, vectors in training_dict.items() for _ in vectors]
    return numpy.array(data), numpy.array(labels)
def split_vids(data: List[List[str]]) -> Tuple[List[List[str]], List[List[str]]]:
    """Randomly split a list of videos into a ~20% / ~80% partition.

    :param data: list of videos, each a list of frame paths
    :return: tuple ``(testing, training)`` — last 20% and first 80% of a
             shuffled copy of *data*
    """
    # Shuffle a copy: the original shuffled the caller's list in place,
    # a surprising side effect.
    shuffled = list(data)
    random.shuffle(shuffled)
    # Elements with index >= 80% cutoff went to the first returned list
    # (testing) in the original; slicing preserves that split exactly.
    split_point = int(len(shuffled) * 0.8)
    training = shuffled[:split_point]
    testing = shuffled[split_point:]
    return testing, training
def change_picture(image):
    """Return a randomly perturbed copy of *image* for data augmentation.

    A brightness factor and a contrast factor are drawn from [0.5, 2] and a
    Gaussian-blur radius from [0, 1.25]; they are applied in that order.
    """
    # Draw all three parameters up front (same random-call order as before).
    bright_factor = random.uniform(0.5, 2)
    contrast_factor = random.uniform(0.5, 2)
    blur_radius = random.uniform(0, 1.25)
    image = PIL.ImageEnhance.Brightness(image).enhance(bright_factor)
    image = PIL.ImageEnhance.Contrast(image).enhance(contrast_factor)
    return image.filter(ImageFilter.GaussianBlur(radius=blur_radius))
def augment(training_set_pictures, freq, support_pick=None):
    """Oversample every class up to the size of the largest class.

    New samples are random perturbations (``change_picture``) of pictures
    already in the class, or, when *support_pick* is given, of pictures it
    supplies.  With *support_pick*, classes 0-42 that are missing from the
    mapping are also created and filled.

    :param training_set_pictures: mapping class -> list of images (mutated
        in place and also returned)
    :param freq: mapping class -> current number of pictures
    :param support_pick: optional callable ``class_ -> image`` that fetches an
        extra source picture for a class
    :return: the (mutated) *training_set_pictures* mapping
    """
    # The original used `max` as a variable, shadowing the builtin.
    target_count = max(freq.values(), default=0)
    for class_, count in freq.items():
        # Sample sources only among pictures present before augmentation:
        # the length is captured before any append.
        original_count = len(training_set_pictures[class_])
        for _ in range(target_count - count):
            if support_pick:
                source = support_pick(class_)
            else:
                index = random.randint(0, original_count - 1)
                source = training_set_pictures[class_][index]
            training_set_pictures[class_].append(change_picture(source))
    if support_pick:
        # Hard-coded 43 classes (GTSRB-style labels 0..42 — TODO confirm):
        # make sure every class exists, filled with picked pictures.
        for class_ in range(43):
            if class_ not in training_set_pictures:
                training_set_pictures[class_] = [
                    support_pick(class_) for _ in range(target_count)
                ]
    return training_set_pictures
def format_set(set, size):
    """Run ``change_to_format`` over every picture in a class->pictures map.

    NOTE(review): the parameter name ``set`` shadows the builtin; kept
    unchanged for caller compatibility.

    :param set: mapping of class label -> list of PIL images
    :param size: target (width, height) passed to ``change_to_format``
    :return: new mapping with every picture formatted
    """
    return {label: [change_to_format(picture, size) for picture in pictures]
            for label, pictures in set.items()}
# Pad rectangular images to square shape - add zero pixels (see change_to_format below)
def return_files_in_dir(path_to_dir: str) -> List[str]:
    """
    :param path_to_dir: path to the directory
    :return: paths of all files anywhere under *path_to_dir* (recursive walk)
    """
    return [os.path.join(root, filename)
            for root, _dirs, filenames in os.walk(path_to_dir)
            for filename in filenames]
def create_images_set(files_set):
    """Open every image file in a class->paths mapping.

    :param files_set: mapping of class label -> list of image file paths
    :return: mapping of ``int(label)`` -> list of loaded PIL images
    """
    images_set = {}
    for label, paths in files_set.items():
        loaded = []
        for path in paths:
            picture = Image.open(path)
            # Image.open is lazy; load() pulls the pixel data in immediately.
            picture.load()
            loaded.append(picture)
        images_set[int(label)] = loaded
    return images_set
def change_to_format(old_im, size):
    """Pad *old_im* to a square with black pixels, then resize to *size*.

    The paste offset is (side - width, side - height): one offset is always 0
    (the square side equals the longer edge), so the image sits anchored to
    the bottom-right of the canvas, as in the original implementation.

    :param old_im: source PIL image
    :param size: target (width, height) of the final resize
    :return: new RGB PIL image of dimensions *size*
    """
    width, height = old_im.size
    side = max(width, height)
    canvas = Image.new("RGB", (side, side))
    canvas.paste(old_im, (side - width, side - height))
    # Image.ANTIALIAS was deprecated and removed in Pillow 10; LANCZOS is the
    # identical filter and has been available for many releases.
    return canvas.resize(size, Image.LANCZOS)
def get_num_of_vids_for_class(files_in_dict: List[str]) -> int:
    """Infer how many videos (tracks) a class has from its frame filenames.

    Frames are named ``<track>_<frame>.ppm``; the number of videos is the
    highest track id seen plus one.  Returns 1 when no .ppm file is present,
    matching the original ``max = 0`` starting value.

    :param files_in_dict: file paths belonging to one class
    :return: highest track id among the .ppm files, plus one
    """
    # The original used `max` as a variable, shadowing the builtin.
    track_ids = (int(path.split("/")[-1].split("_")[0])
                 for path in files_in_dict
                 if path.split(".")[-1] == "ppm")
    return max(track_ids, default=0) + 1
def get_classes(path_to_dir):
    """Class names are simply the top-level directory names under the root."""
    return return_top_level_dirs(path_to_dir)
def get_images_of_class(class_name, path_to_dir):
    """List the .ppm image files of one class.

    :param class_name: class directory name
    :param path_to_dir: root directory containing one sub-directory per class
    :return: paths of all .ppm files under ``path_to_dir/class_name``
    """
    class_dir = path_to_dir + "/" + class_name
    # Same extension test as the original (last dot-separated component).
    return [path for path in return_files_in_dir(class_dir)
            if path.split(".")[-1] == "ppm"]
def split_vids_within_one_class(cls, path_to_dir):
    """Split one class's frames into training and testing sets, by video.

    Frames of the same video (track) always end up on the same side of the
    split — presumably to keep near-duplicate frames out of the test set.

    :param cls: class directory name
    :param path_to_dir: dataset root directory
    :return: tuple ``(training_files, testing_files)`` of frame paths
    """
    cls_images = get_images_of_class(cls, path_to_dir)
    num_of_vids = get_num_of_vids_for_class(cls_images)
    vids_list: List[List[str]] = [[] for _ in range(num_of_vids)]
    for image in cls_images:
        # filenames look like <track>_<frame>.ppm
        track_id = int(image.split("/")[-1].split("_")[0])
        vids_list[track_id].append(image)
    testing_data, training_data = split_vids(vids_list)
    testing_set = [image for vid in testing_data for image in vid]
    training_set_files = [image for vid in training_data for image in vid]
    return training_set_files, testing_set
def get_track_files(file_set, track):
    """Select, per class, only the files that belong to one track (video).

    :param file_set: mapping of class -> list of frame paths
    :param track: track id to keep (frames are named ``<track>_<frame>.ppm``)
    :return: ``defaultdict(list)`` mapping class -> matching paths; empty
             (falsy) when no class has frames for *track*
    """
    # Imports hoisted out of the loops: the original executed
    # `from pathlib import Path` inside the inner loop, once per file.
    from collections import defaultdict
    from pathlib import Path

    result = defaultdict(list)
    for class_, files in file_set.items():
        for path_str in files:
            current_track = int(Path(path_str).parts[-1].split("_")[0])
            if current_track == track:
                result[class_].append(path_str)
    return result
def prepare_batch_data(path_to_dir, size, augment_flag):
    """
    Like `prepare_data`, but produces a generator which will give batches of
    training data to better fit in memory.

    :param path_to_dir: dataset root (one sub-directory per class)
    :param size: target (width, height) each image is resized to
    :param augment_flag: when true, each batch is oversampled via `augment`
    :return: ``(training_set_generator, validation_data, validation_labels)``
    """
    classes: List[str] = get_classes(path_to_dir)
    training_set_files: Dict[str, List[str]] = dict()
    testing_set: Dict[str, List[str]] = dict()
    # Per class: split the frame files into train/test, keeping whole
    # videos on one side of the split.
    for cls in classes:
        training_set_files[cls], testing_set[cls] = split_vids_within_one_class(cls, path_to_dir)

    def support_pick(class_):
        # Fetch one random training image of `class_`, already formatted to
        # `size`.  Class directories appear to be zero-padded to 5 digits
        # (e.g. 3 -> "00003") — hence the zfill; TODO confirm on disk layout.
        class_str = str(class_).zfill(5)
        random_file = random.choice(training_set_files[class_str])
        # Reuse the pipeline helpers on a one-file pseudo-set keyed by 0.
        wtf = create_images_set({0: [random_file]})
        wtf = format_set(wtf, size)
        return wtf[0][0]

    def training_set_generator():
        # taking one track per class
        track = 0
        first_images = None  # NOTE(review): appears unused — candidate for removal
        while True:  # breaks when there are no more actual images
            images = get_track_files(training_set_files, track)
            if not images:
                break
            # Pipeline per batch: load -> square/resize -> (augment) ->
            # normalize -> stack into numpy arrays.
            images = create_images_set(images)
            images = format_set(images, size)
            freq = find_freq(images)
            if augment_flag:
                images = augment(images, freq, support_pick)
            images = normalize_and_ravel_set(images)
            data, labels = stack_vectors_prepare_labels(images)
            yield data, labels
            track += 1

    # The validation set is prepared eagerly, in full, outside the generator.
    validation_set_pictures = create_images_set(testing_set)
    validation_set_pictures = format_set(validation_set_pictures, size)
    validation_set_pictures = normalize_and_ravel_set(validation_set_pictures)
    validation_data, validation_labels = stack_vectors_prepare_labels(validation_set_pictures)
    return training_set_generator, validation_data, validation_labels
def prepare_data(path_to_dir, size, augument_flag):
    """Load the whole dataset into memory: split, format and normalize it.

    :param path_to_dir: dataset root (one sub-directory per class)
    :param size: target (width, height) each image is resized to
    :param augument_flag: when true, oversample classes via `augment`
    :return: ``(training_data, training_labels, validation_data, validation_labels)``
    """
    training_set_files: Dict[str, List[str]] = {}
    testing_set: Dict[str, List[str]] = {}
    for cls in get_classes(path_to_dir):
        training_set_files[cls], testing_set[cls] = split_vids_within_one_class(cls, path_to_dir)

    freq = find_freq(training_set_files)
    plot_freq(freq)

    training_set_pictures = format_set(create_images_set(training_set_files), size)
    if augument_flag:
        augment(training_set_pictures, freq)
        # Re-plot the class distribution after augmentation.
        # NOTE(review): source indentation was lost in transit; this re-plot
        # is assumed to belong inside the augmentation branch — confirm.
        plot_freq(find_freq(training_set_pictures))
    training_set_pictures = normalize_and_ravel_set(training_set_pictures)

    validation_set_pictures = normalize_and_ravel_set(
        format_set(create_images_set(testing_set), size))

    training_data, training_labels = stack_vectors_prepare_labels(training_set_pictures)
    validation_data, validation_labels = stack_vectors_prepare_labels(validation_set_pictures)
    return training_data, training_labels, validation_data, validation_labels
def main():
    """Train/evaluate a random forest at several image sizes and plot
    accuracy and wall-clock time, with and without augmentation."""
    sizes = [(5, 5), (15, 15), (30, 30), (50, 50), (100, 100)]
    # One scalar x-value per size: a list of 2-tuples cannot be passed to
    # plt.plot as x-values (it is treated as a 2-D array); images are square,
    # so the width is a faithful label.
    edge_lengths = [width for width, _height in sizes]
    for augument_flag in [True, False]:
        to_plot_acc = []
        to_plot_time = []
        for size in sizes:
            begin = time.monotonic()
            print(size, augument_flag)
            data_tr, labels_tr, data_val, labels_val = prepare_data("../Images/", size, augument_flag)
            clf = RandomForestClassifier(n_jobs=1, random_state=0)
            clf.fit(data_tr, labels_tr)
            guessed_data = clf.predict(data_val)
            print(labels_val)
            print(guessed_data)
            # Accuracy = fraction of validation samples predicted correctly.
            guesses_ok = sum(1 for guess, truth in zip(guessed_data, labels_val)
                             if guess == truth)
            to_plot_acc.append(guesses_ok / len(guessed_data))
            to_plot_time.append(time.monotonic() - begin)
        plt.figure()
        plt.title(f"Time vs. Image size with augmented: {augument_flag}")
        plt.xlabel("sizes (px)")
        plt.ylabel("time (sec)")
        plt.plot(edge_lengths, to_plot_time)
        plt.figure()
        plt.title(f"Accuracy vs. Image size with augmented: {augument_flag}")
        plt.xlabel("sizes (px)")
        plt.ylabel("accuracy")
        plt.plot(edge_lengths, to_plot_acc)
    # NOTE(review): original indentation was lost; show() assumed to run once
    # at the end, after all figures are built — confirm.
    plt.show()
def batch_main():
    """Like `main`, but trains the forest incrementally: one batch of tracks
    at a time, growing the ensemble with warm_start between batches."""
    sizes = [(5, 5), (15, 15), (30, 30), (50, 50), (100, 100)]
    for augument_flag in [True, False]:
        to_plot_acc = []
        to_plot_time = []
        for size in sizes:
            begin = time.monotonic()
            print(size, augument_flag)
            training_gen, data_val, labels_val = prepare_batch_data("../Images/",
                                                                    size,
                                                                    augument_flag)
            # warm_start makes repeated fit() calls ADD trees instead of
            # retraining from scratch; n_estimators is raised before each
            # batch so 10 new trees are fit per batch.
            clf = RandomForestClassifier(n_jobs=1, random_state=0,
                                         warm_start=True, n_estimators=0)
            n_estimators = 10
            for data_tr, labels_tr in training_gen():
                clf.set_params(n_estimators=n_estimators)
                clf.fit(data_tr, labels_tr)
                n_estimators += 10
            guessed_data = clf.predict(data_val)
            print(labels_val)
            print(guessed_data)
            # Count correct predictions to compute accuracy.
            guesses_ok = 0
            for a, b in zip(guessed_data, labels_val):
                if a == b:
                    guesses_ok = guesses_ok + 1
            to_plot_acc.append(guesses_ok / len(guessed_data))
            total = time.monotonic() - begin
            to_plot_time.append(total)
        plt.figure()
        plt.title(f"Time vs. Image size with augmented: {augument_flag}")
        plt.xlabel(f"sizes (px)")
        plt.ylabel(f"time (sec)")
        # NOTE(review): x-values here are 2-tuples; matplotlib likely rejects
        # a list of tuples against a 1-D y — confirm and plot widths instead.
        plt.plot(sizes, to_plot_time)
        plt.figure()
        plt.title(f"Accuracy vs. Image size with augmented: {augument_flag}")
        plt.xlabel(f"sizes (px)")
        plt.ylabel(f"accuracy")
        plt.plot(sizes, to_plot_acc)
    # NOTE(review): original indentation was lost; show() assumed to run once
    # at the end — confirm.
    plt.show()
- batch_main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement