import gym
import os
import random
import time
import pickle
import tensorflow as tf
import numpy as np

microtime = lambda: int(round(time.time() * 1000))
start_t = microtime()
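
# Overview (added note): tabular Q-learning on FrozenLake-v0 expressed as a tiny
# TensorFlow graph. A 16x4 weight matrix W acts as the Q-table: states are fed
# as integers, one-hot encoded, and the greedy action is argmax_a Q(s, a).
# Episodes are also stored in an experience buffer so that, once enough steps
# have been collected (pre_train_steps, defined below), updates can additionally
# be made from replayed traces.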

class ExperienceBuffer():
    def __init__(self, buf_size):
        self.buf_size = buf_size
        self.buffer = []

    # Store an episode's experiences, dropping the oldest episode when full
    def add(self, experience):
        if len(self.buffer) + 1 >= self.buf_size:
            self.buffer.pop(0)
        self.buffer.append(experience)

    # Retrieve a random sample of experience traces
    def sample(self, batch_size, trace_length):
        sampled_episodes = random.sample(self.buffer, batch_size)
        sampled_traces = []
        for episode in sampled_episodes:
            # Pick a random starting point so the trace fits inside the episode
            p = len(episode) + 1 - trace_length
            if p <= 0:
                p = len(episode)
            pt = np.random.randint(0, p)
            sampled_traces = sampled_traces + episode[pt:pt + trace_length]
        return np.array(sampled_traces)
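
# Note on the QNetwork below: it is a linear "network". one_hot(state) @ W simply
# selects row W[state], so W is effectively a Q-table of shape
# [num_states, num_actions]. best_action is argmax_a Q(s, a), and the loss is the
# squared error between Q(s, .) and a target vector, minimised with plain
# gradient descent.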
class QNetwork():
    def __init__(self, num_states, num_actions, save_file=None, gamma=0.99, lr=0.1):
        self.num_states = num_states
        self.num_actions = num_actions
        self.save_file = save_file
        self.gamma = gamma
        self.lr = lr
        # Load previously saved weights if a save file exists,
        # otherwise start from random values
        if save_file is not None:
            try:
                load = pickle.load(open(save_file, "rb"))
                self.W = tf.Variable(load)
                print("Loaded %s" % save_file)
                print(load)
                print("")
            except FileNotFoundError:
                self.W = tf.Variable(
                    tf.random_uniform([self.num_states, self.num_actions], 0, 0.01),
                    dtype=tf.float32
                )
            except Exception as _e:
                print(_e)
        else:
            self.W = tf.Variable(tf.random_uniform([self.num_states, self.num_actions]), dtype=tf.float32)
        # Q(s, .) is a row of W, selected by one-hot encoding the integer state
        self.input_state = tf.placeholder(shape=[None], dtype=tf.int32, name="input_state")
        self.input_state_one_hot = tf.one_hot(
            indices=tf.cast(self.input_state, tf.int32),
            depth=self.num_states
        )
        self.Q = tf.matmul(self.input_state_one_hot, self.W)
        self.Q_target = tf.placeholder(
            shape=[None, self.num_actions],
            dtype=tf.float32,
            name="Q_target"
        )
        self.best_action = tf.argmax(self.Q, 1)
        # Squared error between predicted and target Q-values, trained with SGD
        self.loss = tf.reduce_sum(tf.square(self.Q_target - self.Q), 1)
        self.trainer = tf.train.GradientDescentOptimizer(learning_rate=lr)
        self.train_op = self.trainer.minimize(self.loss)

    def save(self, val):
        if self.save_file is None:
            return
        pickle.dump(val, open(self.save_file, "wb"))

# Setup
train = True
batch_train = True
test = True
pre_train_steps = 50000  # environment steps to take before replay training starts
train_freq = 25          # train from the buffer only when total steps is a multiple of this
num_episodes = 10000
num_episodes_test = 100
num_steps = 100          # step limit per episode
e_start = 0.1            # initial epsilon for epsilon-greedy exploration
e_end = 0.01             # final epsilon
#QN1 = QNetwork(16, 4, save_file="FrozenLake-v0.p", gamma=0.99, lr=0.1)
QN1 = QNetwork(16, 4, gamma=0.99, lr=0.1)

# Variables
env = gym.make("FrozenLake-v0")
env = gym.wrappers.Monitor(env, "tmp/FrozenLake-0.1", force=True)
exp_buf = ExperienceBuffer(1000)
e_factor = 2.0 * ((e_start - e_end) / num_episodes)  # epsilon decrement applied on successes
e = e_start
bench = [[], [], [], [], []]  # timings (ms): save, get_act, step, get_new_Qs, train

# Add an operation to initialize global variables.
init_op = tf.global_variables_initializer()
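
# Q-learning update used in the training loop below:
#     target[a] = r + gamma * max_a' Q(s', a')
# computed either from the single step just taken, or (once pre_train_steps
# steps have been collected) from short traces sampled out of the experience
# buffer. Exploration is epsilon-greedy, with e decayed towards e_end on
# successful episodes.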
# Training
with tf.Session() as sess:
    sess.run(init_op)
    if train == True:
        print("Training started\n")
        batch_training_started = False
        total_batch_trained = 0
        all_rewards = []
        all_steps = []
        for episode in range(num_episodes):
            os.system("title \"Training... Episode %i/%i\"" % (episode, num_episodes))
            # Every 100 episodes, save the weights and print a progress summary
            if episode % 100 == 0 and episode != 0:
                t = microtime()
                W_val = sess.run(QN1.W)
                QN1.save(W_val)
                print("Episodes %04d - %04d: %i succeeded, %.2f avg steps/episode, e=%.4f" % (
                    episode - 100,
                    episode,
                    sum(all_rewards[-100:]),
                    np.mean(all_steps[-100:]),
                    e
                ))
                bench[0].append((microtime() - t))
            # Reset episode-specific parameters
            state = env.reset()
            steps = 0
            episode_reward = 0
            episode_buffer = []  # s, a, r, s', d
            done = False
            # Do steps in the game
            while steps <= num_steps:
                if done == True:
                    break
                # Obtain the best action and current Q-values for this state
                t = microtime()
                act, curr_Qs = sess.run([QN1.best_action, QN1.Q], feed_dict={
                    QN1.input_state: [state]
                })
                bench[1].append((microtime() - t))
                act = act[0]
                # An e chance of randomly selecting an action
                if np.random.rand(1) < e:
                    act = env.action_space.sample()
                # Advance a state
                t = microtime()
                new_state, reward, done, _ = env.step(act)
                bench[2].append((microtime() - t))
                # Store this experience
                episode_buffer.append([state, act, reward, new_state, done])
                # Train from memory
                total_steps = sum(all_steps)
                if (batch_train == True) and (total_steps > pre_train_steps) and ((total_steps % train_freq) == 0):
                    if batch_training_started == False:
                        batch_training_started = True
                        print("Batch training started")
                    training_batch = exp_buf.sample(4, 4)
                    t = microtime()
                    training_states = [int(x[3]) for x in training_batch]  # s'
                    batch_new_Qs = sess.run(QN1.Q, feed_dict={
                        QN1.input_state: training_states
                    })  # Q(s', a')
                    training_states = [int(x[0]) for x in training_batch]  # s
                    batch_curr_Qs = sess.run(QN1.Q, feed_dict={
                        QN1.input_state: training_states
                    })  # Q(s, a)
                    bench[3].append((microtime() - t))
                    # Best possible outcome of the new states (per state)
                    new_Qs_max = np.max(batch_new_Qs, 1)  # max a' for Q(s', a')
                    target_Qs = batch_curr_Qs.copy()
                    for i, experience in enumerate(training_batch):
                        s, a, r, ss, d = experience  # s a r s' d
                        if int(r) == 1:
                            e -= e_factor
                            if e < e_end:
                                e = e_end
                        # target for a = r + y*maxa'Q(s', a')
                        target_Qs[i][int(a)] = r + QN1.gamma * new_Qs_max[i]
                    total_batch_trained += len(training_batch)
                else:
                    # Obtain Q-values for the new state if we couldn't use the buffer
                    t = microtime()
                    new_Qs = sess.run(QN1.Q, feed_dict={
                        QN1.input_state: [new_state]
                    })
                    bench[3].append((microtime() - t))
                    # Best possible outcome of the new state
                    new_Qs_max = np.max(new_Qs)
                    # Set target_Qs for the old state
                    target_Qs = curr_Qs.copy()
                    target_Qs[0, act] = reward + QN1.gamma * new_Qs_max
                    training_states = [state]
                # Train with the given state(s) and target_Qs
                t = microtime()
                sess.run(QN1.train_op, feed_dict={
                    QN1.input_state: training_states,
                    QN1.Q_target: target_Qs
                })  # train with target and s
                bench[4].append((microtime() - t))
                steps += 1
                episode_reward += reward
                state = new_state
            # Decrease the random % for every successful run
            if episode_reward > 0:
                e -= e_factor
                if e < e_end:
                    e = e_end
            all_rewards.append(episode_reward)
            all_steps.append(steps)
            # Store this episode's experiences
            exp_buf.add(episode_buffer)
        W_val = sess.run(QN1.W)
        QN1.save(W_val)
        print("\nCompleted %i organic steps" % sum(all_steps))
        print("Completed %i batch-trained steps" % total_batch_trained)

    if test == True:
        # Testing
        print("\nTesting...")
        all_rewards = []
        all_steps = []
        for episode in range(num_episodes_test):
            os.system("title \"Testing... Episode %i/%i\"" % (episode, num_episodes_test))
            # Reset episode-specific parameters
            state = env.reset()
            steps = 0
            episode_reward = 0
            done = False
            # Do steps in the game
            while steps <= num_steps:
                if done == True:
                    break
                act = sess.run(QN1.best_action, feed_dict={
                    QN1.input_state: [state]
                })
                act = act[0]
                new_state, reward, done, _ = env.step(act)
                steps += 1
                episode_reward += reward
                state = new_state
            all_rewards.append(episode_reward)
            all_steps.append(steps)
        print("Finished. %i/%i succeeded, avg. steps %.2f" % (
            sum(all_rewards),
            num_episodes_test,
            np.mean(all_steps)
        ))

    print("\nTimes:\nsave, get_act, step, get_new_Qs, train:")
    print(", ".join([str(sum(t)) for t in bench]))
    print("\nTotal took %i ms" % (microtime() - start_t))

env.close()
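
# Optional sketch (commented out, not part of the original run): one way to
# inspect the learned greedy policy from the pickled weights. This assumes
# save_file was set (e.g. the commented-out "FrozenLake-v0.p" constructor
# above); the names below are illustrative only.
#
# w = pickle.load(open("FrozenLake-v0.p", "rb"))  # 16x4 array of Q-values
# greedy_policy = np.argmax(w, axis=1)            # best action per state
# print(greedy_policy.reshape(4, 4))              # laid out as the 4x4 FrozenLake grid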