# Untitled

import os
import numpy as np
import tensorflow as tf
#from train.dataprocessor import DataProcessor
from tensorflow.data import Iterator
import os.path


# Output locations for the tf.summary.FileWriter logs and the model checkpoints.
# NOTE(review): FLAGS is not defined in the visible part of this file; it must
# expose `project_saves_path` before this module-level code runs - confirm
# where it is supposed to come from.
train_filewriter_path = os.path.join(FLAGS.project_saves_path, "TCNN_train_tensorboard")
val_filewriter_path = os.path.join(FLAGS.project_saves_path, "TCNN_val_tensorboard")
checkpoint_path = os.path.join(FLAGS.project_saves_path, "TCNN_checkpoints")


# Create the output directories if they don't exist yet. os.makedirs is used
# instead of os.mkdir so that a missing FLAGS.project_saves_path parent is
# created as well rather than raising.
for _output_dir in (checkpoint_path, train_filewriter_path, val_filewriter_path):
    if not os.path.isdir(_output_dir):
        os.makedirs(_output_dir)

# Prefix of the per-GPU name scopes ("tower_0", "tower_1", ...). train() also
# relies on it when looking tensors up by name.
TOWER_NAME = "tower"

# Input parameters
# Per-channel mean subtracted from every decoded frame in parse_img().
IMAGENET_MEAN = tf.constant([123.68, 116.779, 103.939], dtype=tf.float32)
# NOTE(review): HEIGHT and WIDTH are not referenced in the visible code;
# parse_img() resizes every frame to 227x227 - confirm whether they are needed.
HEIGHT = 480
WIDTH = 640
CHANNELS = 3 # RGB

# Number of sequences per batch.
BATCH_SIZE = 4
# Number of context frames placed in front of every sequence (see add_depth()).
DEPTH = 5
# Number of frames (and labels) per sequence.
SEQ_LEN = 10
# Number of model towers built by train(), one per GPU device.
NUM_GPUS = 1

# Learning parameters
LEARN_RATE = 1e-4
NUM_EPOCHS = 2

# Network params
# NOTE(review): DROPOUT_RATE is not referenced in the visible code; the value
# actually fed to the dropout layers during training is KEEP_PROB_TRAIN.
DROPOUT_RATE = 0.5
KEEP_PROB_TRAIN = 0.25
OUTPUT_DIMS = 1 # only steering angles

# The parameters of the LSTM that keeps the model state.
RNN_NUM_UNITS = 32
RNN_NUM_PROJ = 32


def parse_img(img_path):
    """Load one PNG frame and turn it into a mean-centered BGR float image.

    Args:
        img_path: path of the PNG file (Python string or scalar string tensor).

    Returns:
        A float tensor of shape [227, 227, 3] with channels in BGR order.
    """
    raw_png = tf.read_file(img_path)
    rgb_frame = tf.image.resize_images(
        tf.image.decode_png(raw_png, channels=3), [227, 227])

    # The mean is subtracted while the channels are still in RGB order.
    centered_frame = tf.subtract(rgb_frame, IMAGENET_MEAN)

    # Reverse the channel axis: RGB -> BGR.
    return centered_frame[:, :, ::-1]


def get_optimizer(loss, learn_rate):
    """Build an Adam update op for `loss` with global-norm gradient clipping.

    Args:
        loss: scalar loss tensor to minimize.
        learn_rate: learning rate handed to the Adam optimizer.

    Returns:
        The op that applies the clipped gradients to their variables.
    """
    adam = tf.train.AdamOptimizer(learning_rate=learn_rate)
    raw_grads, variables = zip(*adam.compute_gradients(loss))

    # Rescale all gradients together so that their global norm is at most 15.
    clipped_grads, _ = tf.clip_by_global_norm(raw_grads, 15.0)

    return adam.apply_gradients(zip(clipped_grads, variables))


def add_depth(prev_img_batch, curr_img_batch):
    """Prefix every sequence of a flat frame batch with DEPTH context frames.

    `curr_img_batch` holds BATCH_SIZE consecutive sequences of SEQ_LEN frames
    each, flattened along axis 0. For every sequence the DEPTH frames that
    precede it are placed in front of it. Frames that precede the start of
    `curr_img_batch` are taken from the end of `prev_img_batch`; when there is
    no previous batch the first frame of `curr_img_batch` is repeated instead.

    Args:
        prev_img_batch: frames of the previous batch (only its trailing frames
            are read), or None for the very first batch.
        curr_img_batch: tensor of BATCH_SIZE * SEQ_LEN frames along axis 0.

    Returns:
        A tensor with BATCH_SIZE * (DEPTH + SEQ_LEN) frames along axis 0.
    """
    pieces = []
    for seq_idx in range(BATCH_SIZE):
        seq_start_id = seq_idx * SEQ_LEN

        # How many context frames lie before the start of the current batch,
        # and how many can be taken from the current batch itself. Working
        # with Python ints avoids depending on the tensors' static shapes.
        from_prev = max(DEPTH - seq_start_id, 0)
        from_curr = DEPTH - from_prev

        if from_prev > 0:
            if prev_img_batch is None:
                # No previous batch: pad by repeating the first frame. The
                # 0:1 slice keeps the leading axis, whatever the image size.
                first_img = curr_img_batch[0:1, :, :, :]
                pieces.extend([first_img] * from_prev)
            else:
                pieces.append(prev_img_batch[-from_prev:, :, :, :])

        if from_curr > 0:
            pieces.append(
                curr_img_batch[seq_start_id - from_curr:seq_start_id, :, :, :])

        # The sequence itself.
        pieces.append(
            curr_img_batch[seq_start_id:seq_start_id + SEQ_LEN, :, :, :])

    return tf.concat(pieces, 0)

################################ Temporal Convolution part ########################################

# expects data of shape = [BATCH_SIZE, DEPTH + SEQ_LEN, HEIGHT, WIDTH, CHANNELS]
def temporal_cnn(images, keep_prob):
    """Temporal (3-D) convolutional feature extractor with shortcut branches.

    Four VALID-padded 3-D convolutions are followed by four fully connected
    layers. After every convolution a linear "shortcut" projects the last
    SEQ_LEN time steps of that layer's output to 128 features; the four
    shortcuts are added to the output of the last fully connected layer.

    Each shortcut reads the output of its own convolution. Previously aux2,
    aux3 and aux4 all re-read the conv-1 features, which made them redundant
    copies of aux1. Variable names are unchanged, but the weight shapes of
    those three layers differ, so checkpoints written before this change
    cannot be restored into them.

    Args:
        images: float tensor of shape
            [BATCH_SIZE, DEPTH + SEQ_LEN, height, width, channels].
        keep_prob: keep probability used by every dropout layer.

    Returns:
        A layer-normalized tensor of shape [BATCH_SIZE, SEQ_LEN, 128].
    """

    def shortcut(feature_maps):
        # Linear projection of the last SEQ_LEN time steps to 128 features.
        last_steps = tf.reshape(feature_maps[:, -SEQ_LEN:, :, :, :],
                                [BATCH_SIZE, SEQ_LEN, -1])
        return tf.contrib.layers.fully_connected(last_steps,
                                                 128, activation_fn=None)

    with tf.variable_scope("TCNN", reuse=None) as scope:
        #conv 1
        # NOTE(review): the stride is 1 along height but 6 along width, which
        # yields strongly non-square feature maps - confirm this is intended
        # (and not meant to be [1,6,6]).
        net = tf.contrib.layers.conv3d(images,
                                       num_outputs=64,
                                       kernel_size=[3,12,12],
                                       stride=[1,1,6],
                                       padding='VALID',
                                       rate=1,
                                       activation_fn=tf.nn.relu,
                                       normalizer_fn=None,
                                       normalizer_params=None,
                                       weights_initializer=tf.contrib.layers.xavier_initializer(),
                                       weights_regularizer=None,
                                       biases_initializer=tf.zeros_initializer(),
                                       biases_regularizer=None)
        net = tf.nn.dropout(x=net, keep_prob=keep_prob)
        aux1 = shortcut(net)

        #conv 2
        net = tf.contrib.layers.conv3d(net,
                                       num_outputs=64,
                                       kernel_size=[2,5,5],
                                       stride=[1,2,2],
                                       padding='VALID')
        net = tf.nn.dropout(x=net, keep_prob=keep_prob)
        aux2 = shortcut(net)

        #conv 3
        net = tf.contrib.layers.conv3d(net,
                                       num_outputs=64,
                                       kernel_size=[2,5,5],
                                       stride=[1,1,1],
                                       padding='VALID')
        net = tf.nn.dropout(x=net, keep_prob=keep_prob)
        aux3 = shortcut(net)

        #conv 4
        net = tf.contrib.layers.conv3d(net,
                                       num_outputs=64,
                                       kernel_size=[2,5,5],
                                       stride=[1,1,1],
                                       padding='VALID')
        net = tf.nn.dropout(x=net, keep_prob=keep_prob)
        aux4 = shortcut(net)

        # fc 1
        # The four VALID convolutions shrink the time axis by 2 + 1 + 1 + 1
        # frames, so with DEPTH = 5 exactly SEQ_LEN time steps are left here.
        net = tf.reshape(net, [BATCH_SIZE, SEQ_LEN, -1])
        net = tf.contrib.layers.fully_connected(net,
                                    1024, activation_fn=tf.nn.relu)
        net = tf.nn.dropout(x=net, keep_prob=keep_prob)

        # fc 2
        net = tf.contrib.layers.fully_connected(net,
                                    512, activation_fn=tf.nn.relu)
        net = tf.nn.dropout(x=net, keep_prob=keep_prob)

        # fc 3
        net = tf.contrib.layers.fully_connected(net,
                                    256, activation_fn=tf.nn.relu)
        net = tf.nn.dropout(x=net, keep_prob=keep_prob)

        # fc 4
        net = tf.contrib.layers.fully_connected(net,
                                    128, activation_fn=None)

    # aux[1-4] are the shortcut connections; the sum goes through an ELU and
    # is then layer-normalized with variables living in the "TCNN" scope.
    combined = tf.nn.elu(net + aux1 + aux2 + aux3 + aux4)
    return tf.contrib.layers.layer_norm(inputs=combined,
                                        center=True,
                                        scale=True,
                                        reuse=tf.AUTO_REUSE,
                                        scope=scope,
                                        activation_fn=None,
                                        trainable=True)

#######################################################################################################
def get_rnn_initial_state(complex_state_tuple_sizes):
    """Create a trainable initial RNN state, tiled over the batch.

    One variable of shape [1, size] is created (or fetched) per entry of the
    flattened state-size structure and repeated BATCH_SIZE times along axis 0.

    Args:
        complex_state_tuple_sizes: (possibly nested) structure of state sizes.

    Returns:
        A structure shaped like `complex_state_tuple_sizes` whose leaves are
        tensors of shape [BATCH_SIZE, size].
    """
    nest = tf.contrib.framework.nest

    tiled_states = []
    for index, size in enumerate(nest.flatten(complex_state_tuple_sizes)):
        single_state = tf.get_variable("controller_initial_state_%d" % index,
                                       initializer=tf.zeros_initializer,
                                       shape=([1, size]),
                                       dtype=tf.float32)
        tiled_states.append(tf.tile(input=single_state,
                                    multiples=[BATCH_SIZE, 1]))

    # Restore the original (nested) structure.
    return nest.pack_sequence_as(structure=complex_state_tuple_sizes,
                                 flat_sequence=tiled_states)

def deep_copy_initial_state(complex_state_tuple):
    """Return a copy of a nested state in which every leaf is a tf.identity.

    The identity ops are deliberately left unnamed and created in flatten
    order: train() looks them up through their auto-generated names.

    Args:
        complex_state_tuple: (possibly nested) structure of state tensors.

    Returns:
        A structure of the same shape whose leaves are identity copies.
    """
    nest = tf.contrib.framework.nest

    copied_leaves = []
    for leaf in nest.flatten(complex_state_tuple):
        copied_leaves.append(tf.identity(leaf))

    return nest.pack_sequence_as(structure=complex_state_tuple,
                                 flat_sequence=copied_leaves)


class DualRNNCell(tf.nn.rnn_cell.RNNCell):
    """RNN cell that feeds the previous output back into an internal cell.

    The cell state is the pair (previous output, internal cell state). At each
    step the previous output is concatenated with the visual features and run
    through the internal cell; a linear layer then produces the new output.
    The "previous output" stored for the next step is either the ground truth
    of the current step or the cell's own prediction.
    """

    def __init__(self, OUTPUT_DIMS, use_ground_truth, internal_cell):
        # Wrapped recurrent cell (e.g. LSTM or GRU).
        self._internal_cell = internal_cell
        # True: feed back the ground truth; False: feed back the prediction.
        self._use_ground_truth = use_ground_truth
        # Width of the predictions.
        self._OUTPUT_DIMS = OUTPUT_DIMS

    @property
    def state_size(self):
        # (previous output, state of the internal cell)
        return (self._OUTPUT_DIMS, self._internal_cell.state_size)

    @property
    def output_size(self):
        return self._OUTPUT_DIMS

    def __call__(self, data, prev_state, scope=None):
        """Run one time step.

        Args:
            data: pair (visual features, ground truth) for this step.
            prev_state: pair (previous output, previous internal state).
            scope: unused.

        Returns:
            (new output, (value fed back at the next step, new internal state)).
        """
        visual_feats, current_ground_truth = data
        prev_output, prev_state_internal = prev_state

        # Input of the internal cell: previous output next to the features.
        cell_input = tf.concat([prev_output, visual_feats], axis=1)
        new_output_internal, new_state_internal = self._internal_cell(
            cell_input, prev_state_internal)

        # Linear projection of (cell output, previous output, features).
        projection_input = tf.concat(
            [new_output_internal, prev_output, visual_feats], axis=1)
        new_output = tf.contrib.layers.fully_connected(
            inputs=projection_input,
            num_outputs=self._OUTPUT_DIMS,
            activation_fn=None,
            scope="OutputProjection")

        if self._use_ground_truth:
            fed_back = current_ground_truth
        else:
            fed_back = new_output

        return new_output, (fed_back, new_state_internal)

###############################################################################################

def model_losses(output_with_gt, output_with_pred, targets, aux_cost_weight):
    """Register the combined loss and RMSE statistics; return the summed loss.

    The combined loss is the MSE of output channel 0 of the prediction-fed
    network plus `aux_cost_weight` times the sum of the full MSEs of both
    networks. It is added to the 'losses' collection; the square roots of the
    three MSEs are added to 'rmse_collection'.

    Args:
        output_with_gt: outputs of the ground-truth-fed network.
        output_with_pred: outputs of the prediction-fed network.
        targets: target values, same shape as the outputs.
        aux_cost_weight: weight of the auxiliary MSE terms.

    Returns:
        The sum of everything currently in the 'losses' collection.
    """
    def mean_squared_error(predictions, labels):
        return tf.reduce_mean(tf.squared_difference(predictions, labels))

    # The three errors are created in this fixed order because the summaries
    # are later tagged with the auto-generated op names.
    gt_mse = mean_squared_error(output_with_gt, targets)
    pred_mse = mean_squared_error(output_with_pred, targets)
    steering_mse = mean_squared_error(output_with_pred[:, :, 0], targets[:, :, 0])

    auxiliary_term = aux_cost_weight * (gt_mse + pred_mse)
    tf.add_to_collection('losses', steering_mse + auxiliary_term)

    # additional stats
    for error in (gt_mse, pred_mse, steering_mse):
        tf.add_to_collection('rmse_collection', tf.sqrt(error))

    # No weight decay terms are registered yet, so this is just the sum of the
    # collected combined losses.
    return tf.add_n(tf.get_collection('losses'), name='total_tower_loss')


def inference(images, labels, keep_prob):
    """Build the TCNN + dual-RNN model and return both output sequences.

    The visual features of the temporal CNN are fed to two unrolled RNNs that
    share their variables: one receives the ground-truth labels as feedback,
    the other one its own predictions. Both final states are stored in the
    'cell_final_states' collection.

    Args:
        images: frames, [BATCH_SIZE, DEPTH + SEQ_LEN, height, width, channels].
        labels: targets, [BATCH_SIZE, SEQ_LEN, OUTPUT_DIMS].
        keep_prob: keep probability of the dropout layers.

    Returns:
        (output_with_gt, output_with_pred)
    """
    # ---------------------------- TCNN part ----------------------------
    features = temporal_cnn(images=images, keep_prob=keep_prob)
    features = tf.reshape(features, [BATCH_SIZE, SEQ_LEN, -1])
    features = tf.nn.dropout(x=features, keep_prob=keep_prob)

    # ---------------------------- LSTM part ----------------------------
    # The prediction-fed network gets zeros in place of the labels.
    no_labels = tf.zeros(shape=(BATCH_SIZE, SEQ_LEN, OUTPUT_DIMS), dtype=tf.float32)

    # One internal LSTM cell shared by both custom cells.
    lstm_cell = tf.nn.rnn_cell.LSTMCell(num_units=RNN_NUM_UNITS,
                                        use_peepholes=False,
                                        num_proj=RNN_NUM_PROJ,
                                        forget_bias=1.0,
                                        state_is_tuple=True)

    gt_cell = DualRNNCell(OUTPUT_DIMS=OUTPUT_DIMS,
                          use_ground_truth=True,
                          internal_cell=lstm_cell)
    pred_cell = DualRNNCell(OUTPUT_DIMS=OUTPUT_DIMS,
                            use_ground_truth=False,
                            internal_cell=lstm_cell)

    # Both networks start from separate copies of the same learned state. The
    # ground-truth copy must be created first: train() addresses the copies
    # through their auto-generated op names.
    shared_initial_state = get_rnn_initial_state(pred_cell.state_size)
    initial_state_gt = deep_copy_initial_state(shared_initial_state)
    initial_state_pred = deep_copy_initial_state(shared_initial_state)

    def unroll(cell, inputs, initial_state):
        # Unroll `cell` over the whole sequence of every batch element.
        return tf.nn.dynamic_rnn(cell=cell,
                                 inputs=inputs,
                                 sequence_length=[SEQ_LEN] * BATCH_SIZE,
                                 initial_state=initial_state,
                                 dtype=tf.float32,
                                 swap_memory=True,
                                 time_major=False)

    with tf.variable_scope("predictor"):
        output_with_gt, final_state_gt = unroll(
            gt_cell, (features, labels), initial_state_gt)

    # Same variables, second unrolling.
    with tf.variable_scope("predictor", reuse=True):
        output_with_pred, final_state_pred = unroll(
            pred_cell, (features, no_labels), initial_state_pred)

    tf.add_to_collection('cell_final_states', (final_state_gt, final_state_pred))

    return output_with_gt, output_with_pred


def tower_loss(scope, img_batch, label_batch, keep_prob, aux_cost_weight):
    """Build the whole model for one tower and return its total loss.

    Besides building the model this attaches two scalar summaries (raw value
    and moving average) to every loss / RMSE tensor of the tower.

    Args:
        scope: unique name-scope prefix of the tower, e.g. 'tower_0/'.
        img_batch: image input of the tower.
        label_batch: label input of the tower.
        keep_prob: keep probability of the dropout layers.
        aux_cost_weight: weight of the auxiliary loss terms.

    Returns:
        The tower's total loss; evaluating it also updates the moving averages.
    """
    output_with_gt, output_with_pred = inference(img_batch, label_batch, keep_prob)

    # Called for its side effect of filling the 'losses' and
    # 'rmse_collection' collections; its return value is not used.
    model_losses(output_with_gt, output_with_pred, label_batch, aux_cost_weight)

    # Only the entries that belong to this tower.
    losses = tf.get_collection('losses', scope)
    rmse_collection = tf.get_collection('rmse_collection', scope)
    total_tower_loss = tf.add_n(losses, name='total_tower_loss')

    tracked = losses + rmse_collection + [total_tower_loss]

    # Moving averages of all tracked values.
    ema = tf.train.ExponentialMovingAverage(0.9, name='avg')
    with tf.variable_scope("ema", reuse=tf.AUTO_REUSE):
        ema_op = ema.apply(tracked)

    for tracked_tensor in tracked:
        tag = tracked_tensor.op.name
        # The raw value gets the ' (raw)' suffix, the averaged value the
        # plain op name.
        tf.summary.scalar(tag + ' (raw)', tracked_tensor)
        tf.summary.scalar(tag, ema.average(tracked_tensor))

    # Make sure the averages are updated whenever the loss is evaluated.
    with tf.control_dependencies([ema_op]):
        return tf.identity(total_tower_loss)


def average_gradients(tower_grads):
    """Average the gradient of every shared variable over all towers.

    Note that this function provides a synchronization point across all towers.

    Args:
      tower_grads: one entry per tower; each entry is that tower's sequence of
        (gradient, variable) pairs, with the variables in the same order for
        every tower.
    Returns:
      List of (gradient, variable) pairs where the gradient is the mean over
      all towers and the variable is the first tower's reference to it.
    """
    averaged_pairs = []
    # zip(*...) regroups the pairs per variable:
    #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
    for per_tower_pairs in zip(*tower_grads):
        # Give every gradient a leading 'tower' axis, join them along it and
        # average that axis away again.
        with_tower_axis = [tf.expand_dims(tower_grad, 0)
                           for tower_grad, _ in per_tower_pairs]
        mean_grad = tf.reduce_mean(tf.concat(with_tower_axis, axis=0), 0)

        # The variables are shared between towers, so the first tower's
        # reference stands for all of them.
        shared_variable = per_tower_pairs[0][1]
        averaged_pairs.append((mean_grad, shared_variable))

    return averaged_pairs


# CSV files with the training sequences; tower i reads the i-th file.
# Images in dataset: 25,172
# interpolated 1 -> 4401 - val ~ 12%
# interpolated 2 -> 15,796
# interpolated 4 -> 1974
# interpolated 5 -> 4235 - test ~ 12%
# interpolated 6 -> 7402
# total -> 33,808
_TRAIN_CSV_FILES = (
    "interpolated_2_4_6_ONE.csv",
    "interpolated_2_4_6_TWO.csv",
    "interpolated_2_4_6_THREE.csv",
    "interpolated_2_4_6_FOUR.csv",
)


def _build_csv_iterators(csv_names):
    """Create a single-pass CSV dataset and a reinitializable iterator per file.

    Returns two lists aligned with `csv_names`: the iterator initializer ops
    and the get_next ops.
    """
    init_ops = []
    next_ops = []
    for csv_name in csv_names:
        dataset = tf.data.experimental.make_csv_dataset(
            [os.path.join(FLAGS.data_path, csv_name)],
            batch_size=BATCH_SIZE * SEQ_LEN,
            select_columns=[5, 6],
            label_name='angle',
            shuffle=False,
            # One pass per initialization. With the default (None) the dataset
            # repeats forever and the end of an epoch is never signalled.
            num_epochs=1,
            column_defaults=[tf.string, tf.float32])

        iterator = Iterator.from_structure(dataset.output_types,
                                           dataset.output_shapes)
        init_ops.append(iterator.make_initializer(dataset))
        next_ops.append(iterator.get_next())
    return init_ops, next_ops


def _build_frame_loader():
    """Build, once, the ops that decode one batch of frames and add context.

    Returns:
        (paths_ph, prev_tail_ph, first_clip, next_clip, tail) where
        paths_ph takes the BATCH_SIZE * SEQ_LEN image paths of a batch,
        prev_tail_ph takes the last DEPTH decoded frames of the previous batch,
        first_clip is the model input when there is no previous batch,
        next_clip is the model input when prev_tail_ph is fed, and
        tail holds the last DEPTH decoded frames of the current batch.
    """
    paths_ph = tf.placeholder(tf.string, shape=(BATCH_SIZE * SEQ_LEN,),
                              name="frame_paths")
    prev_tail_ph = tf.placeholder(tf.float32,
                                  shape=(DEPTH, 227, 227, CHANNELS),
                                  name="prev_frames_tail")

    frames = tf.stack([parse_img(path) for path in tf.unstack(paths_ph)])

    clip_shape = [BATCH_SIZE, DEPTH + SEQ_LEN, 227, 227, CHANNELS]
    first_clip = tf.reshape(add_depth(None, frames), clip_shape)
    next_clip = tf.reshape(add_depth(prev_tail_ph, frames), clip_shape)
    return paths_ph, prev_tail_ph, first_clip, next_clip, frames[-DEPTH:]


def _tower_state_inputs(tower_index):
    """Look up the initial-state tensors of one tower's two RNNs.

    The tensors are the unnamed identity copies made in inference(): the
    first three belong to the ground-truth-fed RNN, the next three to the
    prediction-fed RNN, each in the order (previous output, LSTM c, LSTM h).
    NOTE(review): the names are auto-generated; a KeyError here means the
    graph construction order changed.
    """
    graph = tf.get_default_graph()

    def lookup(suffixes):
        return [graph.get_tensor_by_name(
                    '%s_%d/Identity%s:0' % (TOWER_NAME, tower_index, suffix))
                for suffix in suffixes]

    return lookup(('', '_1', '_2')), lookup(('_3', '_4', '_5'))


def _state_feed(state_inputs, state):
    """Map initial-state tensors to the values of a fetched final state."""
    if state is None:
        return {}
    prev_output, internal_state = state[0], state[1]
    return dict(zip(state_inputs,
                    (prev_output, internal_state[0], internal_state[1])))


def _read_batch(sess, next_batch_op):
    """Fetch one batch; return (image paths, labels) or None at end of data.

    A trailing batch with fewer than BATCH_SIZE * SEQ_LEN rows cannot be
    reshaped into full sequences and is treated as the end of the data.
    """
    try:
        features, labels = sess.run(next_batch_op)
    except tf.errors.OutOfRangeError:
        return None
    if len(labels) < BATCH_SIZE * SEQ_LEN:
        return None

    paths = [os.path.join(FLAGS.data_path, name.decode('UTF-8')).rstrip()
             for name in features["filename"]]
    return paths, np.reshape(labels, [BATCH_SIZE, SEQ_LEN, OUTPUT_DIMS])


def train():
    """Build the multi-tower training graph and train for NUM_EPOCHS epochs.

    Every tower reads its own CSV file. Within an epoch the last DEPTH frames
    and the final RNN states of a batch are carried over to the next batch of
    the same tower; both are reset at the start of every epoch. After every
    epoch a summary is written and a checkpoint is saved inside
    `checkpoint_path`, from where the latest checkpoint is restored on start.

    Differences from the previous version: the datasets are read once per
    epoch (they used to repeat forever); the image-loading ops are built once
    instead of on every step; the ground-truth RNN state is fed into its
    initial-state tensors (it used to be fed into the final-state outputs);
    the stray `break` that skipped saving and all epochs after the first is
    gone; checkpoints are written inside `checkpoint_path`.
    """
    with tf.device('/cpu:0'):
        ####################### Data input pipeline ###############################
        train_iter_init_ops, next_batch_ops = _build_csv_iterators(_TRAIN_CSV_FILES)
        (paths_ph, prev_tail_ph,
         first_clip, next_clip, frames_tail) = _build_frame_loader()

        #############################################################################

        # learning placeholders
        keep_prob = tf.placeholder_with_default(input=1.0, shape=())
        aux_cost_weight = tf.placeholder_with_default(input=0.1, shape=())
        learn_rate = tf.placeholder_with_default(input=1e-4, shape=())

        optimizer = tf.train.AdamOptimizer(learning_rate=learn_rate)

        global_step = tf.get_variable(
            'global_step', [],
            initializer=tf.constant_initializer(0), trainable=False)

        # Calculate the gradients for each model tower.
        tower_grads = []
        # Per tower: (image placeholder, label placeholder,
        #             gt initial-state tensors, pred initial-state tensors)
        tower_inputs = []
        with tf.variable_scope(tf.get_variable_scope()):
            for i in range(NUM_GPUS):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
                        img_batch = tf.placeholder(tf.float32, shape=(BATCH_SIZE, DEPTH + SEQ_LEN, 227, 227, 3), name="img_batch_placeholder_%d" % i)
                        label_batch = tf.placeholder(tf.float32, shape=(BATCH_SIZE, SEQ_LEN, OUTPUT_DIMS), name="label_batch_placeholder_%d" % i)

                        # Constructs the entire model for this tower; the
                        # variables are shared across all towers.
                        loss = tower_loss(scope, img_batch, label_batch, keep_prob, aux_cost_weight)

                        gt_state_inputs, pred_state_inputs = _tower_state_inputs(i)
                        tower_inputs.append((img_batch, label_batch,
                                             gt_state_inputs, pred_state_inputs))

                        # Reuse variables for the next tower.
                        tf.get_variable_scope().reuse_variables()

                        # Retain the summaries from the final tower.
                        summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)

                        # Gradients of this tower; variables the loss does not
                        # depend on (gradient None) are dropped.
                        gradvars = [(grad, var) for grad, var
                                    in optimizer.compute_gradients(loss, tf.trainable_variables())
                                    if grad is not None]

                        # grad clipping
                        grads, variables = zip(*gradvars)
                        grads, _ = tf.clip_by_global_norm(grads, 15.0)

                        # Keep track of the gradients across all towers.
                        tower_grads.append(list(zip(grads, variables)))

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        gradvars = average_gradients(tower_grads)

        # Add a summary to track the learning rate.
        summaries.append(tf.summary.scalar('learning_rate', learn_rate))

        # Add histograms for gradients.
        for grad, var in gradvars:
            summaries.append(tf.summary.histogram(var.op.name + '/gradients', grad))

        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = optimizer.apply_gradients(gradvars, global_step=global_step)

        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            summaries.append(tf.summary.histogram(var.op.name, var))

        # Track the moving averages of all trainable variables.
        variable_averages = tf.train.ExponentialMovingAverage(0.9999, global_step)
        variables_averages_op = variable_averages.apply(tf.trainable_variables())

        # Group all updates to into a single train op.
        train_op = tf.group(apply_gradient_op, variables_averages_op)

        # Final states of the RNNs: one (gt, pred) pair per tower.
        final_cell_states = tf.get_collection('cell_final_states')

        saver = tf.train.Saver()

        # Build the summary operation from the last tower summaries.
        summary_op = tf.summary.merge(summaries)

        init_op = tf.global_variables_initializer()

    ##################### Session ##################################

    # allow_soft_placement must be set to True to build towers on GPU, as some
    # of the ops do not have GPU implementations.
    config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth = True),
                            allow_soft_placement=True,
                            log_device_placement=False)

    # Checkpoint files go inside checkpoint_path so that latest_checkpoint()
    # below can find them again.
    checkpoint_prefix = os.path.join(checkpoint_path, "model.ckpt")

    train_writer = tf.summary.FileWriter(train_filewriter_path)
    try:
        with tf.Session(config=config) as sess:
            sess.run(init_op)

            # Load previous model version
            model_checkpoint = tf.train.latest_checkpoint(checkpoint_path)
            if model_checkpoint:
                print("Restoring from", model_checkpoint)
                saver.restore(sess=sess, save_path=model_checkpoint)

            print("Training...")
            for epoch in range(1, NUM_EPOCHS + 1):
                print("Epoch number: %d" %epoch)

                # Restart the data and forget everything carried over from
                # the previous epoch.
                for i in range(NUM_GPUS):
                    sess.run(train_iter_init_ops[i])
                prev_tails = [None] * NUM_GPUS
                cell_states = [(None, None)] * NUM_GPUS
                summary = None
                curr_step = None

                # go through all batches in sets
                while True:
                    feed_dict = {learn_rate: LEARN_RATE,
                                 keep_prob: KEEP_PROB_TRAIN}

                    data_exhausted = False
                    for i in range(NUM_GPUS):
                        batch = _read_batch(sess, next_batch_ops[i])
                        if batch is None:
                            data_exhausted = True
                            break
                        img_paths, labels = batch

                        (img_batch_placeholder, label_batch_placeholder,
                         gt_state_inputs, pred_state_inputs) = tower_inputs[i]

                        # Continue both RNNs from where the previous batch
                        # of this tower left off.
                        state_gt, state_pred = cell_states[i]
                        feed_dict.update(_state_feed(gt_state_inputs, state_gt))
                        feed_dict.update(_state_feed(pred_state_inputs, state_pred))

                        # Decode the frames and add DEPTH preceding frames to
                        # every sequence.
                        loader_feed = {paths_ph: img_paths}
                        if prev_tails[i] is None:
                            clip_op = first_clip
                        else:
                            clip_op = next_clip
                            loader_feed[prev_tail_ph] = prev_tails[i]
                        clip, prev_tails[i] = sess.run([clip_op, frames_tail],
                                                       feed_dict=loader_feed)

                        # feed data
                        feed_dict[img_batch_placeholder] = clip
                        feed_dict[label_batch_placeholder] = labels

                    if data_exhausted:
                        break

                    # perform training
                    print("Running graph...")
                    _, loss_value, summary, curr_step, cell_states = sess.run(
                        [train_op, loss, summary_op, global_step, final_cell_states],
                        feed_dict=feed_dict)
                    print('global_step %d, loss = %.2f' % (curr_step, loss_value))

                # save model after epoch (only if at least one step was run)
                if summary is not None:
                    train_writer.add_summary(summary, curr_step)
                    saver.save(sess, checkpoint_prefix, global_step=curr_step)
    finally:
        train_writer.close()


train()