InvalidArgumentError (see above for traceback): Cannot assign a device to node 'save/RestoreV2_14': Could not satisfy explicit device specification '/job:ps/task:0/device:CPU:0' because no devices matching that specification are registered in this process; available devices: /job:local/replica:0/task:0/cpu:0, /job:local/replica:0/task:1/cpu:0, /job:worker/replica:0/task:1/cpu:0
    [[Node: save/RestoreV2_14 = RestoreV2[dtypes=[DT_INT32], _device="/job:ps/task:0/device:CPU:0"](save/Const, save/RestoreV2_14/tensor_names, save/RestoreV2_14/shape_and_slices)]]
import argparse

import tensorflow as tf

parser = argparse.ArgumentParser(description='tensorflow')
parser.add_argument('--job_name', dest='job_name')
parser.add_argument('--task_index', dest='task_index', default=0)
args = parser.parse_args()

ps_hosts = ['localhost:2222']
worker_hosts = ['localhost:2223', 'localhost:2224']
job_name = args.job_name
task_index = int(args.task_index)

# Create a cluster from the parameter server and worker hosts.
cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})

# Create and start a server for the local task.
server = tf.train.Server(cluster, job_name=job_name, task_index=task_index)

if job_name == "ps":
    server.join()
elif job_name == "worker":
    # Pin variables to the parameter server and ops to this worker.
    with tf.device(tf.train.replica_device_setter(
            worker_device="/job:worker/task:%d" % task_index,
            cluster=cluster)):
        # train_x, train_y, neural_network_model and the layer-size /
        # activation settings are defined elsewhere in the script.
        total_input_features = len(train_x[0])
        x = tf.placeholder('float', [None, total_input_features])
        y = tf.placeholder('float')
        global_step = tf.Variable(0, name="global_step", trainable=False)
        is_chief = (task_index == 0)

        prediction = neural_network_model(x, total_input_features, n_nodes_hl1,
                                           first_layer_activation,
                                           n_nodes_hl2,
                                           second_layer_activation)
        total_loss = tf.reduce_mean(tf.square(prediction - y))
        optimizer = tf.train.AdamOptimizer()
        train_op = optimizer.minimize(total_loss, global_step=global_step)
        init_op = tf.global_variables_initializer()

    sv = tf.train.Supervisor(
        is_chief=is_chief,
        logdir="/tmp/train_logs",
        init_op=init_op,
        global_step=global_step)
    print('******** ALL CREATED ********')

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    with sv.managed_session(server.target) as sess:
        # Loop until the supervisor shuts down or 1000000 steps have completed.
        step = 0
        while not sv.should_stop() and step < 1000000:
            # Run a training step asynchronously.
            # See `tf.train.SyncReplicasOptimizer` for additional details on how to
            # perform *synchronous* training.
            train_feed = {x: train_x, y: train_y}
            _, step = sess.run([train_op, global_step], feed_dict=train_feed)
            if step % 100 == 0:
                print("Done step %d" % step)
    sv.stop()
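
A minimal launch sketch (the filename trainer.py is an assumption; the flags and ports come from the script above). The "Could not satisfy explicit device specification '/job:ps/task:0'" error typically means the tf.train.Server processes that are actually running were created with a ClusterSpec whose job names do not match the {"ps", "worker"} cluster used by the graph (the available devices listed in the error belong to jobs "local" and "worker" only), so every process should be started from the same script with matching flags:

python trainer.py --job_name=ps --task_index=0
python trainer.py --job_name=worker --task_index=0
python trainer.py --job_name=worker --task_index=1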