Guest User

Untitled

a guest
Oct 19th, 2017
96
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 12.17 KB | None | 0 0
  1. from itertools import cycle
  2. import numpy as np
  3. import os
  4. import pandas as pd
  5. import skvideo.io as skv
  6.  
  7. from warnings import filterwarnings
  8. filterwarnings('ignore')
  9.  
  10. from multilabel import multilabel_train_test_split
  11.  
  12.  
  13. class Dataset(object):
  14.  
  15. def __init__(self, datapath, dataset_type='nano', reduce_frames=True, val_size=0.3, batch_size=16, test=False):
  16.  
  17. self.datapath = datapath
  18. self.dataset_type = dataset_type
  19. self.reduce_frames = reduce_frames
  20. self.val_size = val_size
  21. self.batch_size = batch_size
  22.  
  23. # boolean for test mode
  24. self.test = test
  25.  
  26. # params based on dataset type
  27. if self.dataset_type == 'nano':
  28. self.height = 16
  29. self.width = 16
  30. elif self.dataset_type == 'micro':
  31. self.height = 64
  32. self.width = 64
  33. elif self.dataset_type == 'raw':
  34. print("\nRaw videos have variable size... \nsetting height and width to None... \nfirst video in test will determine size (test must be True ")
  35. self.height = None
  36. self.weidth = None
  37. else:
  38. raise NotImplementedError("Please set dataset_type as raw, micro, or nano.")
  39.  
  40. # params based on frame reduction
  41. if self.reduce_frames:
  42. self.num_frames = 15
  43. else:
  44. self.num_frames = 30
  45.  
  46. # for tracking errors
  47. self.bad_videos = []
  48.  
  49. # training and validation
  50. self.X_train, self.X_val, self.y_train, self.y_val = self.split_training_into_validation()
  51.  
  52. # params of data based on training data
  53. self.num_classes = self.y_train.shape[1]
  54. self.class_names = self.y_train.columns.values
  55. assert self.num_classes == self.y_val.shape[1]
  56. self.num_samples = self.y_train.shape[0]
  57. self.num_batches = self.num_samples // self.batch_size
  58.  
  59. # test paths and prediction matrix
  60. self.X_test_ids, self.predictions = self.prepare_test_data_and_prediction()
  61.  
  62. # variables to make batch generating easier
  63. self.batch_idx = cycle(range(self.num_batches))
  64. self.batch_num = next(self.batch_idx)
  65.  
  66. self.num_val_batches = self.y_val.shape[0] // self.batch_size
  67. self.val_batch_idx = cycle(range(self.num_val_batches))
  68. self.val_batch_num = next(self.val_batch_idx)
  69.  
  70. self.num_test_samples = self.X_test_ids.shape[0]
  71. self.num_test_batches = self.num_test_samples // self.batch_size
  72. self.test_batch_idx = cycle(range(self.num_test_batches))
  73. self.test_batch_num = next(self.test_batch_idx)
  74.  
  75. # for testing iterator in test_mode
  76. self.train_data_seen = pd.DataFrame(data={'seen': 0}, index=self.y_train.index)
  77.  
  78. # test the generator
  79. if test:
  80. self._test_batch_generator()
  81.  
  82. def prepare_test_data_and_prediction(self):
  83. """
  84. Returns paths to test data indexed by subject_id
  85. and preallocates prediction dataframe.
  86. """
  87.  
  88. predpath = os.path.join(self.datapath, 'submission_format.csv')
  89. predictions = pd.read_csv(predpath, index_col='filename')
  90. test_idx = predictions.index
  91. subjpath = os.path.join(self.datapath, self.dataset_type)
  92. #subject_ids = pd.read_csv(subjpath, index_col=0)
  93. subject_ids = pd.DataFrame(data=subjpath, columns=['filepath'], index=test_idx)
  94. for row in subject_ids.itertuples():
  95. subject_ids.loc[row.Index] = os.path.join(row.filepath, row.Index)
  96.  
  97. return test_idx, predictions
  98.  
  99.  
  100. def split_training_into_validation(self):
  101. """
  102. Uses the multilabel_train_test_split function
  103. to maintain class distributions between train
  104. and validation sets.
  105. """
  106.  
  107. datapath = self.datapath
  108. dataset_type = self.dataset_type
  109. val_size = self.val_size
  110.  
  111. # load training labels
  112. labelpath = os.path.join(datapath, 'train_labels.csv')
  113. labels = pd.read_csv(labelpath, index_col='filename')
  114.  
  115. # load subject labels (assumed to have same index as training labels)
  116. subjpath = os.path.join(datapath, dataset_type)
  117. #subject_ids = pd.read_csv(subjpath, index_col=0)
  118. subject_ids = pd.DataFrame(data=subjpath, columns=['filepath'], index=labels.index)
  119. for row in subject_ids.itertuples():
  120. subject_ids.loc[row.Index] = os.path.join(row.filepath, row.Index)
  121.  
  122. # split
  123. X_train, X_val, y_train, y_val = multilabel_train_test_split(subject_ids, labels, size=val_size, min_count=1, seed=0)
  124.  
  125. # check distribution is maintained
  126. dist_diff = (y_train.sum()/y_train.shape[0] - y_val.sum() / y_val.shape[0]).sum()
  127. #print(dist_diff)
  128. assert np.isclose(dist_diff, 0, rtol=1e-04, atol=1e-02)
  129.  
  130. return X_train, X_val, y_train, y_val
  131.  
  132. def batches(self, verbose=False):
  133. """This method yields the next batch of videos for training."""
  134.  
  135. reduce_frames = self.reduce_frames
  136. batch_size = self.batch_size
  137. num_train = self.y_train.shape[0]
  138.  
  139.  
  140.  
  141. while 1:
  142. # get videos
  143. start = self.batch_size*self.batch_num
  144. stop = self.batch_size*(self.batch_num + 1)
  145.  
  146. # print batch ranges if testing
  147. if self.test:
  148. print(f"batch {self.batch_num}:\t{start} --> {stop-1}")
  149.  
  150. x_paths = self.X_train.iloc[start:stop]
  151. x, failed = self._get_video_batch(x_paths,
  152. reduce_frames=reduce_frames,
  153. verbose=verbose)
  154. x_paths = x_paths.drop(failed)
  155. self.bad_videos += failed
  156.  
  157. # get labels
  158. y = self.y_train.iloc[start:stop]
  159. y = y.drop(failed)
  160.  
  161. # check match for labels and videos
  162. assert (x_paths.index==y.index).all()
  163. assert x.shape[0] == y.shape[0]
  164.  
  165. # report failures if verbose
  166. if len(failed) != 0 and verbose==True:
  167. print(f"\t\t\t*** ERROR FETCHING BATCH {self.batch_num}/{self.num_batches} ***")
  168. print(f"Dropped {len(failed)} videos:")
  169. for failure in failed:
  170. print(f"\t{failure}\n\n")
  171.  
  172. # increment batch number
  173. self.batch_num = next(self.batch_idx)
  174.  
  175. # update dataframe of seen training indices for testing
  176. self.train_data_seen.loc[y.index.values] = 1
  177. yield (x, y)
  178.  
  179.  
  180. def val_batches(self, verbose=False):
  181. """This method yields the next batch of videos for validation."""
  182.  
  183. reduce_frames = self.reduce_frames
  184. batch_size = self.batch_size
  185. num_train = self.y_train.shape[0]
  186.  
  187.  
  188.  
  189. while 1:
  190. # get videos
  191. start = self.batch_size*self.val_batch_num
  192. stop = self.batch_size*(self.val_batch_num + 1)
  193.  
  194. x_paths = self.X_train.iloc[start:stop]
  195. x, failed = self._get_video_batch(x_paths,
  196. reduce_frames=reduce_frames,
  197. verbose=verbose)
  198. x_paths = x_paths.drop(failed)
  199. self.bad_videos += failed
  200.  
  201. # get labels
  202. y = self.y_train.iloc[start:stop]
  203. y = y.drop(failed)
  204.  
  205. # check match for labels and videos
  206. assert (x_paths.index==y.index).all()
  207. assert x.shape[0] == y.shape[0]
  208.  
  209. # report failures if verbose
  210. if len(failed) != 0 and verbose==True:
  211. print(f"\t\t\t*** ERROR FETCHING BATCH {self.batch_num}/{self.num_batches} ***")
  212. print(f"Dropped {len(failed)} videos:")
  213. for failure in failed:
  214. print(f"\t{failure}\n\n")
  215.  
  216. # increment batch number
  217. self.val_batch_num = next(self.val_batch_idx)
  218.  
  219. yield (x, y)
  220.  
  221. def test_batches(self, verbose=False):
  222. """This method yields the next batch of videos for testing."""
  223.  
  224. reduce_frames = self.reduce_frames
  225. batch_size = self.batch_size
  226. num_test = self.num_test_samples
  227.  
  228. test_dir = os.path.join(self.datapath, self.dataset_type)
  229.  
  230.  
  231. while 1:
  232. # get videos
  233. start = self.batch_size*self.test_batch_num
  234. stop = self.batch_size*(self.test_batch_num + 1)
  235.  
  236. x_ids = self.X_test_ids[start:stop]
  237. x_paths = pd.DataFrame(data=[os.path.join(test_dir, f"{filename}") for filename in x_ids],
  238. columns=['filepath'],
  239. index=x_ids)
  240. #print(x_paths)
  241. x, failed = self._get_video_batch(x_paths,
  242. reduce_frames=reduce_frames,
  243. verbose=verbose)
  244.  
  245. self.test_batch_ids = x_ids.values
  246.  
  247. # increment batch number
  248. self.test_batch_num = next(self.test_batch_idx)
  249.  
  250. yield x
  251.  
  252.  
  253. def _get_video_batch(self, x_paths, as_grey=True, reduce_frames=True, verbose=False):
  254. """
  255. Returns ndarray of shape (batch_size, num_frames, width, height, channels).
  256. If as_grey, then channels dimension is squeezed out.
  257. """
  258.  
  259. videos = []
  260. failed = []
  261.  
  262. for row in x_paths.itertuples():
  263. filepath = row.filepath
  264. obf_id = row.Index
  265.  
  266. # load
  267. video = skv.vread(filepath, as_grey=as_grey)
  268.  
  269. # fill video if neccessary
  270. if video.shape[0] < self.num_frames:
  271. video = self._fill_video(video)
  272.  
  273. # reduce
  274. if reduce_frames:
  275. frames = np.arange(0, video.shape[0], 2)
  276. try:
  277. video = video[frames, :, :] #.squeeze()
  278. videos.append(video)
  279.  
  280. except IndexError:
  281. if verbose:
  282. print(f"FAILED TO REDUCE: {filepath}")
  283. print(f"id:\t{obf_id}")
  284. failed.append(obf_id)
  285.  
  286. return np.array(videos), failed
  287.  
  288. def _fill_video(self, video):
  289. """Returns a video with self.num_frames given at least one frame."""
  290.  
  291. # establish boundaries
  292. target_num_frames = self.num_frames
  293. num_to_fill = target_num_frames - video.shape[0]
  294.  
  295. # preallocate array for filler
  296. filler_frames = np.zeros(shape=(num_to_fill, self.width, self.height, 1)) # assumes grey
  297.  
  298. # fill frames
  299. source_frame = cycle(np.arange(0, video.shape[0]))
  300. for i in range(num_to_fill):
  301. filler_frames[i, :, :] = video[next(source_frame), :, :]
  302.  
  303. return np.concatenate((video, filler_frames), axis=0)
  304.  
  305.  
  306. def _test_batch_generator(self):
  307.  
  308. print('Testing train batch generation...')
  309.  
  310. for i in range(self.num_batches):
  311. if self.batch_num % 10 == 0:
  312. print(f"\n\t\t\tBATCH \t{self.batch_num}/{self.num_batches}\n")
  313.  
  314. batch = self.batches(verbose=True)
  315. x,y = next(batch)
  316.  
  317. # same batches for videos and labels
  318. assert x.shape[0] == y.shape[0]
  319.  
  320. # square videos
  321. assert x.shape[2] == x.shape[3]
  322.  
  323. # black and white
  324. assert x.shape[4] == 1
  325.  
  326.  
  327. # assert we've seen all data up to remainder of a batch
  328. assert (self.y_train.shape[0] - self.train_data_seen.sum().values[0]) < self.batch_size
  329.  
  330. # check that batch_num is reset
  331. assert self.batch_num == 0
  332.  
  333. # turn off test mode
  334. if self.test == True:
  335. self.test = False
  336.  
  337. print('Test passed.')
  338.  
  339. def update_predictions(self, results):
  340. self.predictions.loc[self.test_batch_ids] = results
Add Comment
Please, Sign In to add comment