Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
##################### imports & hyper parameters ####################
import tensorflow as tf
import numpy as np
import gym
import time
import myenv  # presumably registers 'myenv-v0' with gym — confirm
import threading
from colorama import init, Fore, Back, Style
import collections
import itertools
import pickle

# Training hyper-parameters for the DDPG agent.
MAX_EPISODES = 200       # number of training episodes
MAX_EP_STEPS = 200       # max environment steps per episode
LR_A = 0.001             # learning rate for actor
LR_C = 0.002             # learning rate for critic
GAMMA = 0.9              # reward discount
TAU = 0.01               # soft replacement
MEMORY_CAPACITY = 10000  # replay buffer rows
BATCH_SIZE = 32          # minibatch size for learn()
RENDER = True
ENV_NAME = 'myenv-v0'

init(convert=True)  # Colors Related: colorama ANSI conversion (Windows consoles)
- ############################### DDPG ####################################
############################### DDPG ####################################
class DDPG(object):
    """Deep Deterministic Policy Gradient agent (TF1 graph mode).

    Builds actor/critic networks plus EMA-smoothed target copies, and keeps a
    replay buffer that is persisted to the 'Training_Memory' pickle file.
    Relies on the module-level ``env`` (for its ``shapingsize`` attribute and
    an initial ``reset()``) and on the module-level hyperparameter constants.
    """

    def __init__(self, a_dim, s_dim, a_bound,):
        """Build the TF graph.

        a_dim   -- action dimensionality
        s_dim   -- state dimensionality (used for replay-buffer row layout)
        a_bound -- action bound (stored but unused; actor output is a softmax)
        """
        # Restore the replay buffer from disk when available; otherwise start
        # with an all-zero buffer of rows [s, a, r, s_] (width s_dim*2+a_dim+1).
        # NOTE(review): pickle.load on an on-disk file is unsafe if the file
        # can be written by an untrusted party.
        try:
            with open('Training_Memory', 'rb') as f:
                self.memory = pickle.load(f)
        except FileNotFoundError:
            self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1), dtype=np.float32)
        print(self.memory)
        self.pointer = 0  # count of stored transitions; wraps via modulo in store_transition
        self.sess = tf.Session()
        self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound,
        s = env.reset() #!!!!!!!!!!!!! resets the module-level env as a side effect
        # Debug tracing of the environment's reported observation shape.
        print(env.shapingsize)
        print("printed env.shapingsize")
        print(type(env.shapingsize))
        print("printed type shapingsize")
        shapingsize = env.shapingsize
        shapingsize1 = shapingsize[0]
        shapingsize2 = shapingsize[1]
        print(shapingsize1)
        print("printed shapingsize1")
        print(type(shapingsize1))
        print("printed type of new shapingsize1")
        print(shapingsize2)
        print("printed shapingsize2")
        print(type(shapingsize2))
        print("printed type of new shapingsize2")
        # State placeholders use the env's 2-D observation shape.
        # NOTE(review): learn() feeds batches of shape [BATCH_SIZE, s_dim] into
        # self.S — that only works if it matches [shapingsize2, shapingsize1];
        # compare against the commented-out [None, s_dim] variant below. Confirm.
        self.S = tf.placeholder(np.float32, [shapingsize2, shapingsize1], 's')
        self.S_ = tf.placeholder(np.float32, [shapingsize2, shapingsize1], 's_')
        # self.S = tf.placeholder(tf.float32, [None, s_dim], 's')
        # self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_')
        self.R = tf.placeholder(tf.float32, [None, 1], 'r')
        print(self.S)
        print("printed self.S in placeholder")
        print(type(self.S))
        print("printed type of self.S in placeholder")
        print(s)
        print("Printed s before a build ")
        print(s.shape)
        print("Printed s shape before a build ")
        # Main (online) actor and critic networks.
        self.a = self._build_a(self.S,)
        q = self._build_c(self.S, self.a, )
        a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Actor')
        c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Critic')
        ema = tf.train.ExponentialMovingAverage(decay=1 - TAU)  # soft replacement

        def ema_getter(getter, name, *args, **kwargs):
            # Custom getter: return the EMA shadow copy of each variable so the
            # target networks reuse the same structure with smoothed weights.
            return ema.average(getter(name, *args, **kwargs))

        target_update = [ema.apply(a_params), ema.apply(c_params)]  # soft update operation
        a_ = self._build_a(self.S_, reuse=True, custom_getter=ema_getter)  # replaced target parameters
        q_ = self._build_c(self.S_, a_, reuse=True, custom_getter=ema_getter)
        a_loss = - tf.reduce_mean(q)  # maximize the q
        self.atrain = tf.train.AdamOptimizer(LR_A).minimize(a_loss, var_list=a_params)
        with tf.control_dependencies(target_update):  # soft replacement happened at here
            q_target = self.R + GAMMA * q_
            td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q)
            self.ctrain = tf.train.AdamOptimizer(LR_C).minimize(td_error, var_list=c_params)
        self.sess.run(tf.global_variables_initializer())

    def choose_action(self, s):
        """Return the (deterministic) actor output for a single state ``s``."""
        # s = np.array(s, dtype=object)
        print(s)
        print("printed state in choose action")
        # Adds a leading batch axis, runs the actor, strips the axis again.
        return self.sess.run(self.a, {self.S: s[np.newaxis, :]})[0]

    def learn(self):
        """Sample a random minibatch and run one actor and one critic update."""
        indices = np.random.choice(MEMORY_CAPACITY, size=BATCH_SIZE)
        bt = self.memory[indices, :]
        bs = bt[:, :self.s_dim]                          # states
        ba = bt[:, self.s_dim: self.s_dim + self.a_dim]  # actions
        br = bt[:, -self.s_dim - 1: -self.s_dim]         # rewards (one column)
        bs_ = bt[:, -self.s_dim:]                        # next states
        self.sess.run(self.atrain, {self.S: bs})
        self.sess.run(self.ctrain, {self.S: bs, self.a: ba, self.R: br, self.S_: bs_})

    def store_transition(self, s, a, r, s_):
        """Store one (s, a, r, s_) row, overwriting the oldest slot, and
        persist the whole buffer to 'Training_Memory'."""
        # s_ = np.array(s_)
        transition = np.hstack((s, a, [r], s_))
        index = self.pointer % MEMORY_CAPACITY  # replace the old memory with new memory
        self.memory[index, :] = transition
        # NOTE(review): re-pickling the full MEMORY_CAPACITY-row buffer on
        # every single step is very expensive I/O; consider saving periodically.
        with open('Training_Memory', 'wb') as f:
            pickle.dump(self.memory, f)
        print(self.memory)
        self.pointer += 1

    def _build_a(self, s, reuse=None, custom_getter=None):
        """Actor network: state -> softmax distribution over the 3 actions.

        With reuse/custom_getter set, builds the EMA target copy instead
        (non-trainable).
        """
        trainable = True if reuse is None else False
        with tf.variable_scope('Actor', reuse=reuse, custom_getter=custom_getter):
            s = tf.reshape(s, [1, -1])  # flatten the 2-D observation into one row
            print(s)
            print("Printed s in build a ")
            print(s.get_shape())
            print("Printed s shape in build a ")
            net = tf.layers.dense(s, 5, activation=tf.nn.relu, name='l1', trainable=trainable) #Neurons
            a = tf.layers.dense(net, 3, activation=tf.nn.tanh, name='a', trainable=trainable)
            # return tf.multiply(a, self.a_bound, name='scaled_a')
            return tf.nn.softmax(a)

    def _build_c(self, s, a, reuse=None, custom_getter=None):
        """Critic network: (state, action) -> scalar Q(s, a).

        Layer widths are derived from the module-level env's ``shapingsize``.
        """
        # s = env.reset()
        shapingsize = env.shapingsize  # reads the module-level env, not an argument
        shapingsize1 = shapingsize[0]
        shapingsize2 = shapingsize[1]
        print(shapingsize1)
        print("Printed shapingsize1 in build c")
        trainable = True if reuse is None else False
        with tf.variable_scope('Critic', reuse=reuse, custom_getter=custom_getter):
            n_l1 = shapingsize1  # hidden-layer width
            w1_s = tf.get_variable('w1_s', [shapingsize1 * shapingsize2, n_l1], trainable=trainable)
            w1_a = tf.get_variable('w1_a', [3, n_l1], trainable=trainable)
            print(w1_s)
            print("printed w1_s")
            print(w1_a)
            print("printed w1_a")
            print(s)
            print("printed s in build c")
            print(a)
            print("printed a")
            print(s.get_shape())
            print("printed s tf shape in build c")
            print(w1_s.get_shape())
            print("printed w1 s tf shape")
            print(w1_a.get_shape())
            print("printed w1 a tf shape")
            b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable)
            print(b1)
            print("printed b1")
            print(b1.get_shape())
            print("printed b1 shape")
            s = tf.reshape(s, [1, -1])  # flatten state to [1, shapingsize1*shapingsize2] for the matmul
            net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
            return tf.layers.dense(net, 1, trainable=trainable)  # Q(s,a)
############################### training ####################################
# Build the custom environment once at import time and fix the RNG seed
# so runs are reproducible. The DDPG class reads this module-level `env`.
env = gym.make(ENV_NAME)
env = env.unwrapped
env.seed(1)
def startpoint():
    """Run the full training session.

    Shows a startup banner (~25 s of sleeps), builds a fresh environment,
    constructs the DDPG agent, then runs the episode/step training loop.

    Side effects: creates a gym environment, instantiates DDPG (which
    persists its replay buffer to disk), and prints progress/debug output.
    """
    # Animated startup banner.
    LoadingNumber = 0
    print("")
    print(Fore.CYAN + "#====================#")
    print("")
    while True:
        print("Starting Main Code...")
        print("")
        LoadingNumber = LoadingNumber + 1
        time.sleep(5)
        if LoadingNumber > 4:
            break
    print(Fore.CYAN + "#====================#")
    print("")

    # Fresh local environment (shadows the module-level one built at import).
    env = gym.make(ENV_NAME)
    env = env.unwrapped
    env.seed(1)
    s = env.reset()
    print(env.shapingsize)
    print("printed env.shapingsize")
    print(type(env.shapingsize))
    print("printed type shapingsize")
    shapingsize = env.shapingsize
    shapingsize1 = shapingsize[0]
    shapingsize2 = shapingsize[1]
    print(shapingsize1)
    print("printed shapingsize1")
    print(type(shapingsize1))
    print("printed type of new shapingsize1")
    print(shapingsize2)
    print("printed shapingsize2")
    print(type(shapingsize2))
    print("printed type of new shapingsize2")

    # Agent dimensions; actions are a fixed 3-way choice (actor outputs softmax over 3).
    s_dim = shapingsize2
    a_dim = 3
    a_bound = 3
    a_bound = np.array(a_bound)
    print(s)
    print("printed s in the begin")
    print(s_dim)
    print("printed s_dim in the begin")

    ddpg = DDPG(a_dim, s_dim, a_bound)

    var = 3  # control exploration
    # NOTE(review): `var` is decayed below but never applied as action noise —
    # exploration noise appears to be missing; confirm intent.
    t1 = time.time()
    CurrentEpisode = 0
    for i in range(MAX_EPISODES):
        CurrentEpisode += 1
        print("Currently in Episode : " + str(CurrentEpisode))
        # NOTE(review): env.reset() is only called once, before the first
        # episode — later episodes continue from wherever the previous one
        # stopped; confirm the custom env expects that.
        ep_reward = 0
        for j in range(MAX_EP_STEPS):
            a = ddpg.choose_action(s)
            s_, r, done, info = env.step(a)
            ddpg.store_transition(s, a, r / 10, s_)
            if ddpg.pointer > MEMORY_CAPACITY:
                var *= .9995  # decay the action randomness
                ddpg.learn()
                print(s)
                print("printed s in loop")
                # BUG FIX: the original called s.getshape(), which does not
                # exist — ndarrays expose the `.shape` attribute (TF tensors
                # use get_shape()), so this line raised AttributeError.
                # Assumes s is an ndarray from env.step() — TODO confirm.
                print(s.shape)
                print("printed s shape in loop")
            s = s_
            ep_reward += r
            print(s)
            print("printed s in loop after s thing")
            print(s.shape)  # was s.getshape() — same AttributeError fix as above
            print("printed s shape in loop after s thing")
            if env.StopEpisode:
                print("")
                print("Next Episode")
                print("")
                break
        # if j == MAX_EP_STEPS-1:
        #     print('Episode:', i, ' Reward: %i' % int(ep_reward), 'Explore: %.2f' % var, )
        #     # if ep_reward > -300: RENDER = True
        #     break
    # print('Running time: ', time.time() - t1)


startpoint()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement