Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # classify_video.py will classify a video using
- # (1) singleFrame RGB model
- # (2) singleFrame flow model
- # (3) 0.5/0.5 singleFrame RGB/singleFrame flow fusion
- # (4) 0.33/0.67 singleFrame RGB/singleFrame flow fusion
- # (5) LRCN RGB model
- # (6) LRCN flow model
- # (7) 0.5/0.5 LRCN RGB/LRCN flow model
- # (8) 0.33/0.67 LRCN RGB/LRCN flow model
- # Before using, change RGB_video_path and flow_video_path.
- # Usage: classify_video.py video, where video is the name of the video you wish to classify.
- # If no video is specified, the video "v_Archery_g01_c01" will be classified.
- import numpy as np
- import glob
- import sys; sys.path.insert(0, '/opt/caffe/python')
- import caffe
- import pickle
# Run every network created below on the GPU.
caffe.set_mode_gpu()

# Root directories that are globbed below as <root><video>/*.jpg.
RGB_video_path = '/data/frames/'
flow_video_path = '/data/flow_images/'

# The video name is the first command-line argument; a default clip is used
# when no argument is given.
video = sys.argv[1] if len(sys.argv) > 1 else 'v_Archery_g01_c01'
# Initialize transformers
def initialize_transformer(image_mean, is_flow):
    """Build a ``caffe.io.Transformer`` for the ``'data'`` input.

    Args:
        image_mean: iterable of per-channel mean values; entry ``k`` is
            broadcast over the whole 227x227 plane of channel ``k``.
        is_flow: flag forwarded unchanged to ``Transformer.set_is_flow``
            (NOTE(review): this setter is not part of stock Caffe; it is
            presumably provided by the Caffe build on ``sys.path`` -- confirm).

    Returns:
        The configured transformer.
    """
    # NOTE(review): 10 * 16 presumably stands for 10 oversampled crops times
    # 16 frames per batch -- confirm against the deploy prototxt.
    input_shape = (10 * 16, 3, 227, 227)

    # Expand the per-channel means into one full-size plane per channel.
    mean_planes = np.zeros(input_shape[1:])
    for idx, value in enumerate(image_mean):
        mean_planes[idx, ...] = value

    transformer = caffe.io.Transformer({'data': input_shape})
    transformer.set_mean('data', mean_planes)
    transformer.set_raw_scale('data', 255)
    transformer.set_channel_swap('data', (2, 1, 0))
    transformer.set_transpose('data', (2, 0, 1))
    transformer.set_is_flow('data', is_flow)
    return transformer
# Per-channel mean values, shaped (3, 1, 1): one value per channel index.
ucf_mean_RGB = np.array([103.939, 116.779, 128.68]).reshape(3, 1, 1)
# The flow images use the same mean (128) for all three channels.
ucf_mean_flow = np.full((3, 1, 1), 128.0)

# One transformer per input modality; the second argument is the is_flow flag.
transformer_RGB = initialize_transformer(ucf_mean_RGB, False)
transformer_flow = initialize_transformer(ucf_mean_flow, True)
# Extract list of frames in video
# glob.glob() returns paths in arbitrary (filesystem) order, so every listing
# is sorted to make the frame order deterministic.
# NOTE(review): lexicographic order equals temporal order only if the frame
# file names are zero-padded -- confirm against the frame extraction script.
RGB_frames = sorted(glob.glob('%s%s/*.jpg' % (RGB_video_path, video)))
flow_frames = sorted(glob.glob('%s%s/*.jpg' % (flow_video_path, video)))

# Every entry directly under the RGB root, and for each one the sorted list
# of .jpg files it contains (empty when the entry holds no .jpg files).
RGB_videos = sorted(glob.glob('%s/*' % RGB_video_path))
RGB_v = [sorted(glob.glob('%s/*.jpg' % video_dir)) for video_dir in RGB_videos]
# classify video with LRCN model
def LRCN_classify_video(frames, net, transformer, is_flow):
    """Classify a video with the LRCN model.

    The frames are grouped into 16-frame clips whose start indices are 8
    frames apart, each clip is oversampled into 227x227 crops, preprocessed
    and run through ``net`` together with its clip markers.

    Args:
        frames: paths of the video's frame images, in the order to be used.
        net: Caffe network taking ``data`` and ``clip_markers`` inputs and
            producing a ``probs`` output.
        transformer: object whose ``preprocess('data', crop)`` converts one
            crop into network input layout.
        is_flow: currently unused; the code that consulted it is commented
            out in this function.

    Returns:
        ``(label, output_predictions)``: ``output_predictions`` has one
        101-wide row per frame fed to the network, and ``label`` is the
        argmax of the mean over those rows.
    """
    clip_length = 16
    offset = 8

    def load_frame(path):
        # Frames shorter than 240 rows are resized to 240x320.
        image = caffe.io.load_image(path)
        if image.shape[0] < 240:
            image = caffe.io.resize_image(image, (240, 320))
        return image

    input_images = [load_frame(path) for path in frames]
    vid_length = len(input_images)

    # Collect the clips back to back in one flat list of frames.
    input_data = []
    for start in range(0, vid_length, offset):
        stop = start + clip_length
        if stop < vid_length:
            window = input_images[start:stop]
        else:  # video may not be divisible by clip_length
            window = input_images[-clip_length:]
        input_data.extend(window)

    total_frames = len(input_data)
    output_predictions = np.zeros((total_frames, 101))
    for start in range(0, total_frames, clip_length):
        stop = start + clip_length
        crops = caffe.io.oversample(input_data[start:stop], [227, 227])

        # Markers are 1 everywhere except for the first 10 entries.
        # NOTE(review): presumably the 10 crops of the clip's first frame,
        # flagging the start of each sequence -- confirm.
        markers = np.ones((crops.shape[0], 1, 1, 1))
        markers[:10] = 0

        # if is_flow: #need to negate the values when mirroring
        # crops[5:,:,:,0] = 1 - crops[5:,:,:,0]

        # Crops arrive as (N, H, W, C); the network input is (N, C, H, W).
        count, height, width, channels = crops.shape
        caffe_in = np.zeros((count, channels, height, width), dtype=np.float32)
        for idx, crop in enumerate(crops):
            caffe_in[idx] = transformer.preprocess('data', crop)

        out = net.forward_all(data=caffe_in, clip_markers=np.array(markers))
        output_predictions[start:stop] = np.mean(out['probs'], 1)

    return np.mean(output_predictions, 0).argmax(), output_predictions
# classify video with singleFrame model
def singleFrame_classify_video(frames, net, transformer, is_flow):
    """Classify a video with the single-frame model.

    Frames are processed in batches of 16; each batch is oversampled into
    227x227 crops, preprocessed and run through ``net``.

    Args:
        frames: paths of the video's frame images.
        net: Caffe network with a ``data`` input blob and a ``probs`` output.
        transformer: object whose ``preprocess('data', crop)`` converts one
            crop into network input layout.
        is_flow: when true, channel 0 of the crops from index 5 onwards is
            replaced by ``1 - value`` before preprocessing.

    Returns:
        ``(label, output_predictions)``: ``output_predictions`` has shape
        ``(number of frames, 101)`` and ``label`` is the argmax of the mean
        over its rows.

    Raises:
        ValueError: if ``frames`` is empty; the mean over zero rows would be
            NaN and the argmax would silently report class 0.
    """
    batch_size = 16
    input_images = []
    for im in frames:
        input_im = caffe.io.load_image(im)
        # Frames shorter than 240 rows are resized to 240x320.
        if input_im.shape[0] < 240:
            input_im = caffe.io.resize_image(input_im, (240, 320))
        input_images.append(input_im)

    if not input_images:
        raise ValueError('no frames to classify')

    output_predictions = np.zeros((len(input_images), 101))
    for i in range(0, len(input_images), batch_size):
        # Slicing clamps at the end of the list, so the last batch may be short.
        clip_input = input_images[i:i + batch_size]
        clip_input = caffe.io.oversample(clip_input, [227, 227])
        if is_flow:  # need to negate the values when mirroring
            # NOTE(review): this touches every crop from index 5 on, not only
            # the mirrored crops of each frame -- confirm this is intended.
            clip_input[5:, :, :, 0] = 1 - clip_input[5:, :, :, 0]

        # Crops arrive as (N, H, W, C); the network input is (N, C, H, W).
        caffe_in = np.zeros(np.array(clip_input.shape)[[0, 3, 1, 2]], dtype=np.float32)
        for ix, inputs in enumerate(clip_input):
            caffe_in[ix] = transformer.preprocess('data', inputs)

        # Resize the input blob so a short final batch is accepted as well.
        net.blobs['data'].reshape(*caffe_in.shape)
        out = net.forward_all(data=caffe_in)

        # Floor division keeps the reshape dimension an int on Python 2 and 3.
        # NOTE(review): this grouping assumes the 10 crops of one frame are
        # strided through the batch; stock caffe.io.oversample emits them
        # contiguously per frame. The video-level mean is the same either
        # way, but the per-frame rows may mix frames -- confirm.
        frames_in_batch = caffe_in.shape[0] // 10
        output_predictions[i:i + batch_size] = np.mean(
            out['probs'].reshape(10, frames_in_batch, 101), 0)

    return np.mean(output_predictions, 0).argmax(), output_predictions
# Models and weights
singleFrame_model = 'deploy_singleFrame.prototxt'
lstm_model = 'deploy_lstm.prototxt'
RGB_singleFrame = 'single_frame_all_layers_hyb_RGB_iter_5000.caffemodel'
flow_singleFrame = 'single_frame_all_layers_hyb_flow_iter_50000.caffemodel'
RGB_lstm = 'RGB_lstm_model_iter_30000.caffemodel'
flow_lstm = 'flow_lstm_model_iter_50000.caffemodel'

RGB_singleFrame_net = caffe.Net(singleFrame_model, RGB_singleFrame, caffe.TEST)

# Classify each video found under RGB_video_path with the single-frame RGB
# model. Each iteration passes that video's own frame list (the loop used to
# pass RGB_frames, re-classifying the same video every time).
for v in RGB_v:
    if not v:
        # Entry without any .jpg frames: nothing to classify.
        continue
    label, predictions = singleFrame_classify_video(v, RGB_singleFrame_net, transformer_RGB, False)

# Debugging aid: open an interactive console with all module-level names.
# NOTE(review): the pasted source lost its indentation, so it is unclear
# whether this ran once after the loop or once per video -- confirm.
import code
code.interact(local=dict(globals(), **locals()))
- # class_RGB_singleFrame, predictions_RGB_singleFrame = \
- # singleFrame_classify_video(RGB_frames, RGB_singleFrame_net, transformer_RGB, False)
- # del RGB_singleFrame_net
- # flow_singleFrame_net = caffe.Net(singleFrame_model, flow_singleFrame, caffe.TEST)
- # class_flow_singleFrame, predictions_flow_singleFrame = \
- # singleFrame_classify_video(flow_frames, flow_singleFrame_net, transformer_flow, True)
- # del flow_singleFrame_net
- #
- # RGB_lstm_net = caffe.Net(lstm_model, RGB_lstm, caffe.TEST)
- # class_RGB_LRCN, predictions_RGB_LRCN = \
- # LRCN_classify_video(RGB_frames, RGB_lstm_net, transformer_RGB, False)
- # del RGB_lstm_net
- #
- # flow_lstm_net = caffe.Net(lstm_model, flow_lstm, caffe.TEST)
- # class_flow_LRCN, predictions_flow_LRCN = \
- # LRCN_classify_video(flow_frames, flow_lstm_net, transformer_flow, True)
- # del flow_lstm_net
- #
- #
- # def compute_fusion(RGB_pred, flow_pred, p):
- # return np.argmax(p * np.mean(RGB_pred, 0) + (1 - p) * np.mean(flow_pred, 0))
- # Load activity label hash
- # action_hash = pickle.load(open('action_hash_rev.p', 'rb'))
- # print "RGB single frame model classified video as: %s.\n" % (action_hash[class_RGB_singleFrame])
- # print "Flow single frame model classified video as: %s.\n" % (action_hash[class_flow_singleFrame])
- # print "RGB LRCN model classified video as: %s.\n" % (action_hash[class_RGB_LRCN])
- # print "Flow LRCN frame model classified video as: %s.\n" % (action_hash[class_flow_LRCN])
- # print "0.5/0.5 single frame fusion model classified video as: %s. \n" % (
- # action_hash[compute_fusion(predictions_RGB_singleFrame, predictions_flow_singleFrame, 0.5)])
- # print "0.33/0.67 single frame fusion model classified video as: %s. \n" % (
- # action_hash[compute_fusion(predictions_RGB_singleFrame, predictions_flow_singleFrame, 0.33)])
- # print "0.5/0.5 LRCN fusion model classified video as: %s. \n" % (
- # action_hash[compute_fusion(predictions_RGB_LRCN, predictions_flow_LRCN, 0.5)])
- # print "0.33/0.67 LRCN fusion model classified video as: %s. \n" % (
- # action_hash[compute_fusion(predictions_RGB_LRCN, predictions_flow_LRCN, 0.33)])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement