import numpy as np
import tensorflow as tf

params = {}

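# `utils` is an external module that is not part of this paste; it is assumed
# to provide get_available_gpu_names, which main() uses to pick the GPU
# devices the towers run on (assumed behaviour: take a list of GPU indices and
# return device name strings). The fallback below is only a hypothetical
# sketch built on tensorflow.python.client.device_lib, so the script can still
# run when utils is unavailable; drop it if you have the real module.
try:
    from utils import get_available_gpu_names
except ImportError:
    from tensorflow.python.client import device_lib

    def get_available_gpu_names(gpu_ids=None):
        # Collect the names of the local GPU devices, keep only the requested
        # indices if given, and fall back to the CPU when no GPU is visible.
        names = [d.name for d in device_lib.list_local_devices()
                 if d.device_type == "GPU"]
        if gpu_ids is not None:
            names = [name for i, name in enumerate(names) if i in gpu_ids]
        return names or ["/cpu:0"]
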
def create_variable(scope, name, shape, trainable=True, on_cpu=True, **kwargs) -> tf.Variable:
    """Create a variable under `scope`, optionally pinned to the CPU, and cache it in `params`."""
    def _create_variable():
        with tf.variable_scope(scope):
            _w = tf.get_variable(name, shape, trainable=trainable, **kwargs)
            params[_w.name] = _w
            return _w

    if on_cpu:
        # Keeping the shared weights on the CPU lets every GPU tower reuse them.
        with tf.device("/cpu:0"):
            w = _create_variable()
    else:
        w = _create_variable()

    return w

def get_variable(scope, name, trainable=True) -> tf.Variable:
    """Fetch an already-created variable from `scope` (reuse mode) and cache it in `params`."""
    with tf.variable_scope(scope, reuse=True):
        w = tf.get_variable(name, trainable=trainable)
        params[w.name] = w
        return w

def get_toy_data(n, xd):
    """Build a toy two-class set: n points per class, class 0 in [0, 0.5)^xd, class 1 in [0.5, 1)^xd."""
    xs = np.concatenate([np.random.random((n, xd)) / 2, np.random.random((n, xd)) / 2 + 0.5])
    ys = np.concatenate([np.zeros((n,), dtype=int), np.ones((n,), dtype=int)])
    permut = np.random.permutation(len(xs))
    xs = xs[permut]
    ys = ys[permut]
    return xs, np.eye(2)[ys]  # features and one-hot labels

def average_gradients(tower_grads):
    """Calculate the average gradient for each shared variable across all towers.

    Note that this function provides a synchronization point across all towers.

    Args:
        tower_grads: List of lists of (gradient, variable) tuples. The outer
            list is over the individual towers; each inner list holds the
            (gradient, variable) pairs computed by that tower.

    Returns:
        List of pairs of (gradient, variable) where the gradient has been averaged
        across all towers.
    """
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # Note that each grad_and_vars looks like the following:
        #   ((grad0_gpu0, var0_gpu0), ..., (grad0_gpuN, var0_gpuN))
        grads = []
        for g, _ in grad_and_vars:
            # Add 0 dimension to the gradients to represent the tower.
            expanded_g = tf.expand_dims(g, 0)

            # Append on a 'tower' dimension which we will average over below.
            grads.append(expanded_g)

        # Average over the 'tower' dimension.
        grad = tf.concat(axis=0, values=grads)
        grad = tf.reduce_mean(grad, 0)

        # Keep in mind that the Variables are redundant because they are shared
        # across towers. So we will just return the first tower's pointer to
        # the Variable.
        v = grad_and_vars[0][1]
        grad_and_var = (grad, v)
        average_grads.append(grad_and_var)
    return average_grads

def main():
    n = 6000          # examples per class (the toy set below has 2 * n rows)
    xd = 14 * 14      # input dimension
    hd = 100          # hidden dimension

    xs, ys = get_toy_data(n, xd)

    X = tf.placeholder(tf.float32, [None, xd], name="X")
    Y = tf.placeholder(tf.float32, [None, 2], name="Y")

    # Three-layer MLP; the weights live on the CPU (see create_variable) and
    # are shared by every GPU tower.
    w1 = create_variable("layer1", "weight", (xd, hd))
    h = tf.nn.relu(tf.matmul(X, w1))

    w2 = create_variable("layer2", "weight", (hd, hd))
    h = tf.nn.relu(tf.matmul(h, w2))

    w3 = create_variable("layer3", "weight", (hd, 2))
    h = tf.matmul(h, w3)
    hhat = tf.nn.softmax(h)  # class probabilities (not used during training)

    opt = tf.train.AdamOptimizer(learning_rate=0.001, name="opt")

    gpu_names = get_available_gpu_names([1])

    batch_size = 300
    batch_size_per_gpu = batch_size // len(gpu_names)

    grad_list = []
    loss_list = []

    with tf.variable_scope(tf.get_variable_scope()):
        for i, gpu_name in enumerate(gpu_names):
            with tf.device(gpu_name):
                # Each GPU tower computes the loss and gradients for its own
                # slice of the fed batch.
                idx_start = i * batch_size_per_gpu
                idx_end = (i + 1) * batch_size_per_gpu

                # One-hot labels over two mutually exclusive classes, so use
                # softmax cross-entropy (matching the softmax prediction hhat).
                loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(
                    logits=h[idx_start:idx_end], labels=Y[idx_start:idx_end],
                ))
                # The variables were created above; reuse them in every tower.
                tf.get_variable_scope().reuse_variables()
                grad = opt.compute_gradients(loss)

                loss_list.append(loss)
                grad_list.append(grad)

    # Average the per-tower gradients and apply one synchronous update.
    grads = average_gradients(grad_list)
    train_op = opt.apply_gradients(grads)

    sess = tf.Session(config=tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=True))
    sess.run(tf.global_variables_initializer())

    # Each epoch visits the first n of the 2 * n shuffled examples, in
    # mini-batches that get split across the towers.
    num_batch = n // batch_size
    for epoch in range(100):
        total_loss = 0
        for batch_idx in range(num_batch):
            idx_start = batch_idx * batch_size
            idx_end = (batch_idx + 1) * batch_size

            xs_b = xs[idx_start:idx_end]
            ys_b = ys[idx_start:idx_end]

            # Run one synchronous step; loss_list holds the per-tower losses.
            _, loss_value = sess.run([train_op, loss_list], feed_dict={
                X: xs_b,
                Y: ys_b,
            })
            total_loss += np.mean(loss_value)

        print(total_loss)


if __name__ == '__main__':
    main()