import os
# Theano reads THEANO_FLAGS when it is first imported, so set them before Keras loads the backend.
os.environ["THEANO_FLAGS"] = "mode=FAST_RUN,device=gpu,floatX=float32"

from keras.layers import Dense, Activation
from keras.models import Sequential
from keras.optimizers import RMSprop

import numpy as np
import gym
from itertools import count
from collections import deque
import matplotlib.pyplot as plt

env = gym.make('CartPole-v0')

# Hyperparameters
REPLAY_MEMORY_SIZE = 100000
NUM_EPISODES = 200
LEARNING_RATE = 0.001
DISCOUNT_FACTOR = 0.9   # gamma in the one-step TD target
BATCH_SIZE = 32
EPSILON_MIN = 0.1
EPSILON_DECAY = 0.9     # multiplied into epsilon after every episode
epsilon = 0.5           # initial exploration rate
replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)

# Q-network: maps the 4-dimensional CartPole state to one Q-value per action (2 actions).
Q_network = Sequential()
Q_network.add(Dense(20, activation='relu', input_dim=4))
Q_network.add(Dense(20, activation='relu', init='uniform'))  # 'init' is the Keras 1 argument name; Keras 2 uses kernel_initializer
Q_network.add(Dense(2))
Q_network.add(Activation('linear'))
Q_network.compile(loss='mse', optimizer=RMSprop(lr=LEARNING_RATE))


def epsilon_greedy_policy(state):
    """Return a random action with probability epsilon, otherwise the greedy action."""
    if np.random.uniform() < epsilon:
        return env.action_space.sample()
    action_values = Q_network.predict(state)[0]
    return np.argmax(action_values)


t_values = []  # episode lengths, for the plot at the end
for i_episode in range(NUM_EPISODES):
    state = np.reshape(env.reset(), [1, 4])

    # Run one episode, storing every transition in the replay memory.
    for t in count():
        env.render()  # rendering every step slows training; remove it to train faster
        action = epsilon_greedy_policy(state)
        next_state, reward, done, _ = env.step(action)
        reward = reward if not done else -5  # penalize the terminal transition
        next_state = np.reshape(next_state, [1, 4])
        replay_memory.append((state, action, reward, next_state, done))
        if done:
            print('Episode %d finished. Episode length: %d' % (i_episode, t))
            t_values.append(t)
            break
        state = next_state

    # Experience replay: sample a random minibatch and fit the Q-network
    # towards the one-step TD targets.
    replay_memory_len = len(replay_memory)
    batch_size = min(BATCH_SIZE, replay_memory_len)
    X = np.zeros((batch_size, 4))
    Y = np.zeros((batch_size, 2))
    sample_indices = np.random.choice(replay_memory_len, size=batch_size)
    for i, idx in enumerate(sample_indices):
        state, action, reward, next_state, done = replay_memory[idx]
        Q_target = reward if done else reward + DISCOUNT_FACTOR * np.max(Q_network.predict(next_state))
        # Only the taken action's Q-value is pushed towards the target; the other
        # action keeps its current prediction, so it contributes no error.
        Q = Q_network.predict(state)
        Q[0][action] = Q_target
        X[i] = state
        Y[i] = Q

    Q_network.fit(X, Y, verbose=0)

    # Decay exploration after each episode, but never below the floor.
    epsilon = max(EPSILON_MIN, epsilon * EPSILON_DECAY)

# Plot episode length over training.
plt.plot(t_values)
plt.xlabel('Episode')
plt.ylabel('Episode length')
plt.show()
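
# A minimal sketch of checking the learned policy after training: act greedily
# (epsilon = 0) for a few episodes, without storing transitions or fitting the
# network. NUM_EVAL_EPISODES is an illustrative name, not defined above.
NUM_EVAL_EPISODES = 10
for i_episode in range(NUM_EVAL_EPISODES):
    state = np.reshape(env.reset(), [1, 4])
    for t in count():
        env.render()
        action = np.argmax(Q_network.predict(state)[0])  # always the greedy action
        state, reward, done, _ = env.step(action)
        state = np.reshape(state, [1, 4])
        if done:
            print('Greedy episode %d length: %d' % (i_episode, t))
            break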