Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import random
import time

import gym
def get_states(dst_observation):
    """Map a continuous observation onto a single discrete state index.

    Three components of the observation are scaled, clamped to
    [-10, 10], shifted to [0, 20] and truncated to an integer bin, then
    packed into one base-21 number.

    Args:
        dst_observation: Indexable sequence of at least four numbers.
            Index 2 is treated as an angle in radians (it is multiplied
            by ``180 / 3.14``); indices 1 and 3 are scaled by 5.
            Index 0 is not used.  Presumably this is the CartPole
            observation (position, velocity, angle, angular velocity) --
            confirm against the environment in use.

    Returns:
        int: State index in ``range(21 * 21 * 21)``.
    """

    def to_bin(value):
        # Clamp to [-10, 10], shift into [0, 20], drop the fraction.
        return int(min(max(value, -10.0), 10.0) + 10.0)

    # 3.14 (not math.pi) and the extra / 1.2 are kept from the original so
    # the bin boundaries stay exactly where they were.
    deg = dst_observation[2] * 180 / 3.14 / 1.2
    v1 = dst_observation[1] * 5
    v2 = dst_observation[3] * 5

    return to_bin(deg) * 21 * 21 + to_bin(v1) * 21 + to_bin(v2)
alpha = 0.05  # learning rate: step size of each Q-table update
gamma = 0.99  # discount factor applied to future rewards
epsilon = 0.1  # probability of taking a random action (epsilon-greedy)


def select_action(q_row, explore_prob):
    """Choose an action index epsilon-greedily from one row of the Q-table.

    Args:
        q_row: List of action values for the current state.
        explore_prob: Probability of ignoring the values and picking a
            uniformly random action instead.

    Returns:
        int: Index into ``q_row``.  Ties in the greedy case resolve to
        the lowest index.
    """
    if random.random() < explore_prob:
        return random.randint(0, len(q_row) - 1)
    return q_row.index(max(q_row))


def discounted_returns(rewards, discount):
    """Compute the discounted return-to-go for every step of an episode.

    ``result[j]`` is ``sum(rewards[k] * discount ** (k - j) for k >= j)``,
    built in a single backward pass instead of re-accumulating every
    earlier entry after each step.

    Args:
        rewards: Per-step rewards in the order they were received.
        discount: Multiplier applied once per step of delay.

    Returns:
        list: One return value per entry of ``rewards``.
    """
    returns = [0.0] * len(rewards)
    running = 0.0
    for step in range(len(rewards) - 1, -1, -1):
        running = rewards[step] + discount * running
        returns[step] = running
    return returns


def train(episodes=10001, max_steps=200):
    """Train a tabular agent on CartPole-v0 and return its Q-table.

    Every 2000th episode is rendered at roughly 20 frames per second.
    After each episode a line ``episode:  <n> <last step index>`` is
    printed.

    Args:
        episodes: Number of episodes to run.
        max_steps: Upper bound on steps taken in a single episode.

    Returns:
        list: Q-table with ``21 * 21 * 21`` rows of two action values.
    """
    # NOTE(review): written against the classic Gym API where reset()
    # returns the observation and step() returns a 4-tuple -- confirm the
    # installed gym version matches.
    env = gym.make('CartPole-v0')
    Q = [[random.random(), random.random()] for _ in range(21 * 21 * 21)]

    try:
        for episode in range(episodes):
            observation = env.reset()
            history = []  # (state, action) for every step taken
            rewards = []
            show = episode % 2000 == 0

            for _ in range(max_steps):
                state = get_states(observation)
                action = select_action(Q[state], epsilon)
                observation, reward, done, _ = env.step(action)

                history.append((state, action))
                rewards.append(reward)

                if show:
                    env.render()
                    time.sleep(1.0 / 20)

                # Always stop once the environment reports the episode is
                # over; stepping past that point is not meaningful and
                # previously left every 1000th episode with no update.
                if done:
                    break

            i_end = len(history) - 1
            returns = discounted_returns(rewards, gamma)

            # Update Q.
            # NOTE(review): the target combines the return-to-go from the
            # next step with a bootstrapped max over the next state, and
            # the loop stops one transition short of the final one.  Both
            # are kept exactly as in the original; confirm they are
            # intended before changing the learning rule.
            for i in range(i_end - 1):
                S_t, A_t = history[i]
                S_t_1 = history[i + 1][0]
                R_t_1 = returns[i + 1]
                Q_max = max(Q[S_t_1])
                Q[S_t][A_t] += alpha * (gamma * Q_max - Q[S_t][A_t] + R_t_1)

            print("episode: ", episode, i_end)
    finally:
        # Release the render window / simulator even if training is
        # interrupted.
        env.close()

    return Q


if __name__ == "__main__":
    Q = train()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement