Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
##################### imports & hyper parameters ####################
import tensorflow as tf
import numpy as np
import gym
import time
import myenv  # presumably registers 'myenv-v0' with gym — confirm
import threading
from colorama import init, Fore, Back, Style
import collections
import itertools
import pickle

# Training hyper-parameters for the DDPG agent.
MAX_EPISODES = 200       # number of training episodes
MAX_EP_STEPS = 200       # max environment steps per episode
LR_A = 0.001             # learning rate for actor
LR_C = 0.002             # learning rate for critic
GAMMA = 0.9              # reward discount
TAU = 0.01               # soft replacement
MEMORY_CAPACITY = 10000  # replay buffer rows
BATCH_SIZE = 32          # minibatch size for learn()
RENDER = True
ENV_NAME = 'myenv-v0'

init(convert=True)  # Colors Related: colorama ANSI conversion (Windows consoles)
- ############################### DDPG ####################################
############################### DDPG ####################################
class DDPG(object):
    """Deep Deterministic Policy Gradient agent (TF1 graph mode).

    Builds actor/critic networks plus EMA-smoothed target copies, and keeps a
    replay buffer that is persisted to the 'Training_Memory' pickle file.
    Relies on the module-level ``env`` (for its ``shapingsize`` attribute and
    an initial ``reset()``) and on the module-level hyperparameter constants.
    """

    def __init__(self, a_dim, s_dim, a_bound,):
        """Build the TF graph.

        a_dim   -- action dimensionality
        s_dim   -- state dimensionality (used for replay-buffer row layout)
        a_bound -- action bound (stored but unused; actor output is a softmax)
        """
        # Restore the replay buffer from disk when available; otherwise start
        # with an all-zero buffer of rows [s, a, r, s_] (width s_dim*2+a_dim+1).
        # NOTE(review): pickle.load on an on-disk file is unsafe if the file
        # can be written by an untrusted party.
        try:
            with open('Training_Memory', 'rb') as f:
                self.memory = pickle.load(f)
        except FileNotFoundError:
            self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1), dtype=np.float32)
        print(self.memory)
        self.pointer = 0  # count of stored transitions; wraps via modulo in store_transition
        self.sess = tf.Session()
        self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound,
        s = env.reset() #!!!!!!!!!!!!! resets the module-level env as a side effect
        # Debug tracing of the environment's reported observation shape.
        print(env.shapingsize)
        print("printed env.shapingsize")
        print(type(env.shapingsize))
        print("printed type shapingsize")
        shapingsize = env.shapingsize
        shapingsize1 = shapingsize[0]
        shapingsize2 = shapingsize[1]
        print(shapingsize1)
        print("printed shapingsize1")
        print(type(shapingsize1))
        print("printed type of new shapingsize1")
        print(shapingsize2)
        print("printed shapingsize2")
        print(type(shapingsize2))
        print("printed type of new shapingsize2")
        # State placeholders use the env's 2-D observation shape.
        # NOTE(review): learn() feeds batches of shape [BATCH_SIZE, s_dim] into
        # self.S — that only works if it matches [shapingsize2, shapingsize1];
        # compare against the commented-out [None, s_dim] variant below. Confirm.
        self.S = tf.placeholder(np.float32, [shapingsize2, shapingsize1], 's')
        self.S_ = tf.placeholder(np.float32, [shapingsize2, shapingsize1], 's_')
        # self.S = tf.placeholder(tf.float32, [None, s_dim], 's')
        # self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_')
        self.R = tf.placeholder(tf.float32, [None, 1], 'r')
        print(self.S)
        print("printed self.S in placeholder")
        print(type(self.S))
        print("printed type of self.S in placeholder")
        print(s)
        print("Printed s before a build ")
        print(s.shape)
        print("Printed s shape before a build ")
        # Main (online) actor and critic networks.
        self.a = self._build_a(self.S,)
        q = self._build_c(self.S, self.a, )
        a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Actor')
        c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Critic')
        ema = tf.train.ExponentialMovingAverage(decay=1 - TAU)  # soft replacement

        def ema_getter(getter, name, *args, **kwargs):
            # Custom getter: return the EMA shadow copy of each variable so the
            # target networks reuse the same structure with smoothed weights.
            return ema.average(getter(name, *args, **kwargs))

        target_update = [ema.apply(a_params), ema.apply(c_params)]  # soft update operation
        a_ = self._build_a(self.S_, reuse=True, custom_getter=ema_getter)  # replaced target parameters
        q_ = self._build_c(self.S_, a_, reuse=True, custom_getter=ema_getter)
        a_loss = - tf.reduce_mean(q)  # maximize the q
        self.atrain = tf.train.AdamOptimizer(LR_A).minimize(a_loss, var_list=a_params)
        with tf.control_dependencies(target_update):  # soft replacement happened at here
            q_target = self.R + GAMMA * q_
            td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q)
            self.ctrain = tf.train.AdamOptimizer(LR_C).minimize(td_error, var_list=c_params)
        self.sess.run(tf.global_variables_initializer())

    def choose_action(self, s):
        """Return the (deterministic) actor output for a single state ``s``."""
        # s = np.array(s, dtype=object)
        print(s)
        print("printed state in choose action")
        # Adds a leading batch axis, runs the actor, strips the axis again.
        return self.sess.run(self.a, {self.S: s[np.newaxis, :]})[0]

    def learn(self):
        """Sample a random minibatch and run one actor and one critic update."""
        indices = np.random.choice(MEMORY_CAPACITY, size=BATCH_SIZE)
        bt = self.memory[indices, :]
        bs = bt[:, :self.s_dim]                          # states
        ba = bt[:, self.s_dim: self.s_dim + self.a_dim]  # actions
        br = bt[:, -self.s_dim - 1: -self.s_dim]         # rewards (one column)
        bs_ = bt[:, -self.s_dim:]                        # next states
        self.sess.run(self.atrain, {self.S: bs})
        self.sess.run(self.ctrain, {self.S: bs, self.a: ba, self.R: br, self.S_: bs_})

    def store_transition(self, s, a, r, s_):
        """Store one (s, a, r, s_) row, overwriting the oldest slot, and
        persist the whole buffer to 'Training_Memory'."""
        # s_ = np.array(s_)
        transition = np.hstack((s, a, [r], s_))
        index = self.pointer % MEMORY_CAPACITY  # replace the old memory with new memory
        self.memory[index, :] = transition
        # NOTE(review): re-pickling the full MEMORY_CAPACITY-row buffer on
        # every single step is very expensive I/O; consider saving periodically.
        with open('Training_Memory', 'wb') as f:
            pickle.dump(self.memory, f)
        print(self.memory)
        self.pointer += 1

    def _build_a(self, s, reuse=None, custom_getter=None):
        """Actor network: state -> softmax distribution over the 3 actions.

        With reuse/custom_getter set, builds the EMA target copy instead
        (non-trainable).
        """
        trainable = True if reuse is None else False
        with tf.variable_scope('Actor', reuse=reuse, custom_getter=custom_getter):
            s = tf.reshape(s, [1, -1])  # flatten the 2-D observation into one row
            print(s)
            print("Printed s in build a ")
            print(s.get_shape())
            print("Printed s shape in build a ")
            net = tf.layers.dense(s, 5, activation=tf.nn.relu, name='l1', trainable=trainable) #Neurons
            a = tf.layers.dense(net, 3, activation=tf.nn.tanh, name='a', trainable=trainable)
            # return tf.multiply(a, self.a_bound, name='scaled_a')
            return tf.nn.softmax(a)

    def _build_c(self, s, a, reuse=None, custom_getter=None):
        """Critic network: (state, action) -> scalar Q(s, a).

        Layer widths are derived from the module-level env's ``shapingsize``.
        """
        # s = env.reset()
        shapingsize = env.shapingsize  # reads the module-level env, not an argument
        shapingsize1 = shapingsize[0]
        shapingsize2 = shapingsize[1]
        print(shapingsize1)
        print("Printed shapingsize1 in build c")
        trainable = True if reuse is None else False
        with tf.variable_scope('Critic', reuse=reuse, custom_getter=custom_getter):
            n_l1 = shapingsize1  # hidden-layer width
            w1_s = tf.get_variable('w1_s', [shapingsize1 * shapingsize2, n_l1], trainable=trainable)
            w1_a = tf.get_variable('w1_a', [3, n_l1], trainable=trainable)
            print(w1_s)
            print("printed w1_s")
            print(w1_a)
            print("printed w1_a")
            print(s)
            print("printed s in build c")
            print(a)
            print("printed a")
            print(s.get_shape())
            print("printed s tf shape in build c")
            print(w1_s.get_shape())
            print("printed w1 s tf shape")
            print(w1_a.get_shape())
            print("printed w1 a tf shape")
            b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable)
            print(b1)
            print("printed b1")
            print(b1.get_shape())
            print("printed b1 shape")
            s = tf.reshape(s, [1, -1])  # flatten state to [1, shapingsize1*shapingsize2] for the matmul
            net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
            return tf.layers.dense(net, 1, trainable=trainable)  # Q(s,a)
############################### training ####################################
# Build the custom environment once at import time and fix the RNG seed
# so runs are reproducible. The DDPG class reads this module-level `env`.
env = gym.make(ENV_NAME)
env = env.unwrapped
env.seed(1)
def startpoint():
    """Run the full training session.

    Shows a startup banner (~25 s of sleeps), builds a fresh environment,
    constructs the DDPG agent, then runs the episode/step training loop.

    Side effects: creates a gym environment, instantiates DDPG (which
    persists its replay buffer to disk), and prints progress/debug output.
    """
    # Animated startup banner.
    LoadingNumber = 0
    print("")
    print(Fore.CYAN + "#====================#")
    print("")
    while True:
        print("Starting Main Code...")
        print("")
        LoadingNumber = LoadingNumber + 1
        time.sleep(5)
        if LoadingNumber > 4:
            break
    print(Fore.CYAN + "#====================#")
    print("")

    # Fresh local environment (shadows the module-level one built at import).
    env = gym.make(ENV_NAME)
    env = env.unwrapped
    env.seed(1)
    s = env.reset()
    print(env.shapingsize)
    print("printed env.shapingsize")
    print(type(env.shapingsize))
    print("printed type shapingsize")
    shapingsize = env.shapingsize
    shapingsize1 = shapingsize[0]
    shapingsize2 = shapingsize[1]
    print(shapingsize1)
    print("printed shapingsize1")
    print(type(shapingsize1))
    print("printed type of new shapingsize1")
    print(shapingsize2)
    print("printed shapingsize2")
    print(type(shapingsize2))
    print("printed type of new shapingsize2")

    # Agent dimensions; actions are a fixed 3-way choice (actor outputs softmax over 3).
    s_dim = shapingsize2
    a_dim = 3
    a_bound = 3
    a_bound = np.array(a_bound)
    print(s)
    print("printed s in the begin")
    print(s_dim)
    print("printed s_dim in the begin")

    ddpg = DDPG(a_dim, s_dim, a_bound)

    var = 3  # control exploration
    # NOTE(review): `var` is decayed below but never applied as action noise —
    # exploration noise appears to be missing; confirm intent.
    t1 = time.time()
    CurrentEpisode = 0
    for i in range(MAX_EPISODES):
        CurrentEpisode += 1
        print("Currently in Episode : " + str(CurrentEpisode))
        # NOTE(review): env.reset() is only called once, before the first
        # episode — later episodes continue from wherever the previous one
        # stopped; confirm the custom env expects that.
        ep_reward = 0
        for j in range(MAX_EP_STEPS):
            a = ddpg.choose_action(s)
            s_, r, done, info = env.step(a)
            ddpg.store_transition(s, a, r / 10, s_)
            if ddpg.pointer > MEMORY_CAPACITY:
                var *= .9995  # decay the action randomness
                ddpg.learn()
                print(s)
                print("printed s in loop")
                # BUG FIX: the original called s.getshape(), which does not
                # exist — ndarrays expose the `.shape` attribute (TF tensors
                # use get_shape()), so this line raised AttributeError.
                # Assumes s is an ndarray from env.step() — TODO confirm.
                print(s.shape)
                print("printed s shape in loop")
            s = s_
            ep_reward += r
            print(s)
            print("printed s in loop after s thing")
            print(s.shape)  # was s.getshape() — same AttributeError fix as above
            print("printed s shape in loop after s thing")
            if env.StopEpisode:
                print("")
                print("Next Episode")
                print("")
                break
        # if j == MAX_EP_STEPS-1:
        #     print('Episode:', i, ' Reward: %i' % int(ep_reward), 'Explore: %.2f' % var, )
        #     # if ep_reward > -300: RENDER = True
        #     break
    # print('Running time: ', time.time() - t1)


startpoint()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement