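# Deep Q-learning (DQN) on CartPole-v0: a small Keras MLP approximates Q(s, a),
# actions are chosen epsilon-greedily, and transitions are replayed from a
# memory buffer. Written against the legacy Keras 1.x / Theano / early gym
# APIs that the imports below assume.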
import os
# THEANO_FLAGS must be set before Keras/Theano are imported to take effect.
os.environ["THEANO_FLAGS"] = "mode=FAST_RUN,device=gpu,floatX=float32"

from keras.layers import Dense, Activation
from keras.models import Sequential
from keras.optimizers import RMSprop
import numpy as np
import gym
from itertools import count
from collections import deque
import matplotlib.pyplot as plt
env = gym.make('CartPole-v0')

REPLAY_MEMORY_SIZE = 100000
NUM_EPISODES = 200
LEARNING_RATE = 0.001
DISCOUNT_FACTOR = 0.9
BATCH_SIZE = 32
EPSILON_MIN = 0.1    # floor for the exploration rate
EPSILON_DECAY = 0.9  # multiplicative decay applied after each training step
epsilon = 0.5        # initial exploration rate

replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)
# Q-network: 4-dim observation -> two ReLU hidden layers -> 2 linear Q-values,
# one per action (push cart left / right).
Q_network = Sequential()
Q_network.add(Dense(20, activation='relu', input_dim=4))
Q_network.add(Dense(20, activation='relu', init='uniform'))  # Keras 1.x arg; 'kernel_initializer' in Keras 2
Q_network.add(Dense(2))
Q_network.add(Activation('linear'))
Q_network.compile(loss='mse', optimizer=RMSprop(lr=LEARNING_RATE))
def epsilon_greedy_policy(state):
    # Explore with probability epsilon, otherwise act greedily on the Q-values.
    if np.random.uniform() < epsilon:
        return env.action_space.sample()
    action_values = Q_network.predict(state)[0]
    return np.argmax(action_values)
t_values = []  # episode lengths, for plotting
for i_episode in range(NUM_EPISODES):
    state = np.reshape(env.reset(), [1, 4])
    for t in count():
        env.render()
        action = epsilon_greedy_policy(state)
        next_state, reward, done, _ = env.step(action)
        reward = reward if not done else -5  # penalize letting the pole fall
        next_state = np.reshape(next_state, [1, 4])
        replay_memory.append((state, action, reward, next_state, done))
        if done:
            print('Episode {} finished. Episode length: {}'.format(i_episode, t))
            t_values.append(t)
            break
        state = next_state
        # Train on a random minibatch sampled from the replay memory.
        replay_memory_len = len(replay_memory)
        n_samples = min(BATCH_SIZE, replay_memory_len)
        X = np.zeros((n_samples, 4))
        Y = np.zeros((n_samples, 2))
        sample_indices = np.random.choice(replay_memory_len, size=n_samples)
        for i, i_sample in enumerate(sample_indices):
            # Fresh names here so the outer loop's `state`/`next_state`
            # are not clobbered by the replayed transitions.
            s, a, r, s_next, d = replay_memory[i_sample]
            # Bellman target: r at terminal states, else r + gamma * max_a' Q(s', a').
            Q_target = r if d else r + DISCOUNT_FACTOR * np.max(Q_network.predict(s_next))
            Q = Q_network.predict(s)
            Q[0][a] = Q_target  # only the taken action's value is updated
            X[i] = s
            Y[i] = Q
        Q_network.fit(X, Y, nb_epoch=1, verbose=0)  # one gradient pass per minibatch
        if epsilon >= EPSILON_MIN:
            epsilon *= EPSILON_DECAY
plt.plot(t_values)
plt.show()
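
# --- Optional greedy evaluation (a sketch, not part of the original paste) ---
# Rolls out the trained Q_network with epsilon = 0 for a few episodes to see
# how long the pole stays balanced; uses only names defined above.
for _ in range(5):
    state = np.reshape(env.reset(), [1, 4])
    for t in count():
        action = np.argmax(Q_network.predict(state)[0])
        state, _, done, _ = env.step(action)
        state = np.reshape(state, [1, 4])
        if done:
            print('Greedy episode length: {}'.format(t))
            break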