# --- alteragents.py: DQN agent variants built on Dopamine. (File boundary
# inferred: the runner script below does "import alteragents".) ---
import tensorflow as tf

from alternets import *
from dopamine.agents.dqn import dqn_agent
class DCAgent(dqn_agent.DQNAgent):
  """DQN agent whose training op also minimizes any auxiliary losses
  registered in the graph's LOSSES collection (e.g. by an alternets network)."""

  def _build_train_op(self):
    replay_action_one_hot = tf.one_hot(
        self._replay.actions, self.num_actions, 1., 0., name='action_one_hot')
    replay_chosen_q = tf.reduce_sum(
        self._replay_net_outputs.q_values * replay_action_one_hot,
        axis=1,
        name='replay_chosen_q')
    target = tf.stop_gradient(self._build_target_q_op())
    # tf.losses.huber_loss registers itself in the LOSSES collection, so it
    # is picked up by tf.losses.get_losses() below.
    huber_loss = tf.losses.huber_loss(
        target, replay_chosen_q, reduction=tf.losses.Reduction.NONE)
    # Total loss: the mean of every registered loss (Huber + auxiliaries).
    total_loss = tf.math.add_n(
        [tf.reduce_mean(loss) for loss in tf.losses.get_losses()])
    if self.summary_writer is not None:
      with tf.variable_scope('Losses'):
        tf.summary.scalar('HuberLoss', tf.reduce_mean(huber_loss))
        tf.summary.scalar('Total_loss', total_loss)
    return self.optimizer.minimize(total_loss)

  def bundle_and_checkpoint(self, checkpoint_dir, iteration_number):
    # Checkpointing is intentionally disabled for this agent.
    return None
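# Note on total_loss above (an assumption about how auxiliary losses enter,
# since alternets is not shown in this paste): a network can contribute a
# term simply by registering it, e.g.
#
#   tf.losses.add_loss(aux_loss)
#
# after which tf.losses.get_losses() returns it and DCAgent adds its mean to
# the Huber loss.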
class fbAgent(dqn_agent.DQNAgent):
  """DQN agent that periodically blends the online weights with a snapshot
  of their earlier values ('feedback' averaging)."""

  def __init__(self, *args, **kwargs):
    super(fbAgent, self).__init__(*args, **kwargs)
    self.fb_data = {}        # Last weight snapshot, keyed by variable name.
    self.fb_assign_ops = {}  # Assign ops used to write blended weights back.
    self.fb_a = 0.5          # Blend factor: new = fb_a * current + (1 - fb_a) * snapshot.
    self.fb_k = 30           # Blend every fb_k * update_period training steps.
    self._fb_init()

  def _fb_init(self):
    # One shared placeholder feeds every assign op; shape=None accepts any
    # weight tensor.
    self._fb_ph = tf.placeholder(tf.float32, shape=None, name='FBPH')
    for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                 scope='Online'):
      self.fb_assign_ops[var.name] = tf.assign(var, self._fb_ph)

  def _fb_back(self):
    for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                 scope='Online'):
      if var.name in self.fb_data:
        target_value = (self._sess.run(var) * self.fb_a +
                        self.fb_data[var.name] * (1 - self.fb_a))
        self._sess.run(self.fb_assign_ops[var.name],
                       feed_dict={self._fb_ph: target_value})
      # Refresh the snapshot with the (possibly blended) current weights.
      self.fb_data[var.name] = self._sess.run(var)

  def _train_step(self):
    if (self._replay.memory.add_count > self.min_replay_history and
        self.training_steps % (self.fb_k * self.update_period) == 0):
      self._fb_back()
    super(fbAgent, self)._train_step()

  def bundle_and_checkpoint(self, checkpoint_dir, iteration_number):
    # Checkpointing is intentionally disabled for this agent.
    return None
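# Worked example of the blend in _fb_back (illustration only): with
# fb_a = 0.5, a stored snapshot of 0.0, and a current weight of 2.0, the
# assign writes 0.5 * 2.0 + 0.5 * 0.0 = 1.0, and the snapshot is then
# refreshed to 1.0 for the next feedback step, which happens
# fb_k * update_period training steps later.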
class MIIMAgent(dqn_agent.DQNAgent):
  """DQN agent whose sync op moves the online and target networks to their
  midpoint instead of copying online into target."""

  def _build_sync_op(self):
    """Builds ops that move both networks to the average of their weights.

    Returns:
      ops: A list of ops assigning the midpoint of the online and target
        weights to both networks.
    """
    # Get trainable variables from online and target DQNs.
    sync_qt_ops = []
    trainables_online = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='Online')
    trainables_target = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='Target')
    for (w_online, w_target) in zip(trainables_online, trainables_target):
      # Assign the midpoint of the two networks' weights to both of them.
      middle = (w_online + w_target) / 2
      sync_qt_ops.append(w_target.assign(middle, use_locking=True))
      sync_qt_ops.append(w_online.assign(middle, use_locking=True))
    return sync_qt_ops
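# Toy illustration of the midpoint sync (illustration only): for a scalar
# weight with w_online = 1.0 and w_target = 3.0, one sync assigns
# middle = (1.0 + 3.0) / 2 = 2.0 to both networks, whereas the standard DQN
# sync would instead copy the online value into the target (w_target = 1.0).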
# --- Runner script (inferred to be a separate file: it imports alteragents,
# the module defined above). ---
import argparse
import os
import sys

import numpy as np
import tensorflow as tf
from absl import flags
import gin.tf

# Importing these dopamine modules also registers their gin configurables.
from dopamine.agents.dqn import dqn_agent
from dopamine.colab import utils as colab_utils
from dopamine.discrete_domains import atari_lib
from dopamine.discrete_domains import run_experiment

import alteragents
from alternets import *

# import setGPU

parser = argparse.ArgumentParser()
parser.add_argument('-m', '--mod', action='store')
parser.add_argument('-g', '--game', action='store')
parser.add_argument('-f', '--folder', action='store')
args = parser.parse_args()
print(args)
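# Example invocation (script name hypothetical):
#   python run_alteragents.py --mod fb --game Pong --folder exp1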
config_str = args.mod
tf.logging.set_verbosity(tf.logging.INFO)
GAME = args.game
BASE_PATH = os.path.join('ae', str(args.folder), str(config_str), 'data',
                         str(GAME), str(np.random.randint(1000000)))
metaAgent_config = """
# Hyperparameters follow the classic Nature DQN, but we modify as necessary to
# match those used in Rainbow (Hessel et al., 2018), to ensure apples-to-apples
# comparison.
import dopamine.discrete_domains.atari_lib
import dopamine.discrete_domains.run_experiment
import dopamine.replay_memory.circular_replay_buffer
import gin.tf.external_configurables

atari_lib.create_atari_environment.game_name = '{}'
# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017).
atari_lib.create_atari_environment.sticky_actions = True
Runner.num_iterations = 40
Runner.training_steps = 250000  # agent steps
Runner.evaluation_steps = 125000  # agent steps
Runner.max_steps_per_episode = 27000  # agent steps

WrappedReplayBuffer.replay_capacity = 1000000
WrappedReplayBuffer.batch_size = 32
""".format(GAME)
gin.parse_config(metaAgent_config, skip_unknown=False)
LOG_PATH = BASE_PATH

settings = {}
settings['base'] = {'network': atari_lib.nature_dqn_network,
                    'agent': alteragents.DCAgent,
                    'target_update_period': 8000}
settings['fb'] = {'network': atari_lib.nature_dqn_network,
                  'agent': alteragents.fbAgent,
                  'target_update_period': 80}

if args.mod in settings:
  network = settings[args.mod]['network']
  agent = settings[args.mod]['agent']
  target_update_period = settings[args.mod]['target_update_period']
else:
  raise ValueError('Unknown --mod {!r}; expected one of {}'.format(
      args.mod, sorted(settings)))
def create_agent(sess, environment, summary_writer=None):
  return agent(
      sess,
      num_actions=environment.action_space.n,
      summary_writer=summary_writer,
      gamma=0.99,
      update_horizon=1,
      min_replay_history=20000,
      update_period=1,
      target_update_period=target_update_period,
      epsilon_train=0.01,
      epsilon_eval=0.001,
      epsilon_decay_period=250000,
      tf_device='/gpu:0',
      max_tf_checkpoints_to_keep=0,
      optimizer=tf.train.RMSPropOptimizer(
          learning_rate=0.00025,
          decay=0.95,
          momentum=0.0,
          epsilon=0.00001,
          centered=True),
      network=network)


runner = run_experiment.TrainRunner(LOG_PATH, create_agent)
print(LOG_PATH)
runner.run_experiment()
print(GAME + ' Done training!')
sys.exit()