!pip install cmake 'gym[atari]' scipy
!pip install git+https://github.com/jsh9/python-plot-utilities@v0.6.1

import sys
import gym
import numpy as np
import random
import math
from collections import defaultdict, deque
import matplotlib.pyplot as plt
#from plot_utils import plot_values
%matplotlib inline
env = gym.make('CliffWalking-v0')
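# Quick sanity check (illustrative, not in the original paste): CliffWalking-v0
# is a 4x12 gridworld, so we expect 48 discrete states and 4 discrete actions.
print(env.observation_space.n)  # 48
print(env.action_space.n)       # 4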
def update_Q(alpha, gamma, Q, state, action, reward, next_state=None):
    """Returns updated Q-value for the most recent experience."""
    current = Q[state][action]  # estimate in Q-table (for current state, action pair)
    Qsa_next = np.max(Q[next_state]) if next_state is not None else 0  # value of next state
    target = reward + (gamma * Qsa_next)  # construct TD target
    new_value = current + (alpha * (target - current))  # get updated value
    return new_value
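# A minimal hand-worked check of the update rule above (hypothetical values,
# not part of the original notebook). The rule implements
# Q(S,A) <- Q(S,A) + alpha * (R + gamma * max_a Q(S',a) - Q(S,A)).
_Q_demo = defaultdict(lambda: np.zeros(2))
_Q_demo[0][:] = [1.0, 2.0]
_Q_demo[1][:] = [3.0, 4.0]
# target = -1 + 1.0 * max(3.0, 4.0) = 3.0; new value = 1.0 + 0.5 * (3.0 - 1.0) = 2.0
assert update_Q(0.5, 1.0, _Q_demo, 0, 0, -1, next_state=1) == 2.0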
def epsilon_greedy(Q, state, nA, eps):
    """Selects epsilon-greedy action for supplied state.

    Params
    ======
        Q (dictionary): action-value function
        state (int): current state
        nA (int): number of actions in the environment
        eps (float): epsilon
    """
    if random.random() > eps:  # select greedy action with probability 1 - epsilon
        return np.argmax(Q[state])
    else:  # otherwise, select an action uniformly at random
        return random.choice(np.arange(nA))
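# Illustrative check with a hypothetical Q-table (not in the original paste):
# with eps=0 the greedy action is returned almost surely; with eps=1 actions
# are sampled uniformly at random.
_Q_demo = defaultdict(lambda: np.zeros(4))
_Q_demo[0][:] = [0.1, 0.9, 0.2, 0.3]
assert epsilon_greedy(_Q_demo, 0, 4, eps=0.0) == 1  # index of the max Q-value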
def q_learning(env, num_episodes, alpha, gamma=1.0, plot_every=100):
    """Q-Learning - TD Control

    Params
    ======
        num_episodes (int): number of episodes to run the algorithm
        alpha (float): learning rate
        gamma (float): discount factor
        plot_every (int): number of episodes to use when calculating average score
    """
    nA = env.action_space.n                # number of actions
    Q = defaultdict(lambda: np.zeros(nA))  # initialize empty dictionary of arrays

    # monitor performance
    tmp_scores = deque(maxlen=plot_every)    # deque for keeping track of scores
    avg_scores = deque(maxlen=num_episodes)  # average scores over every plot_every episodes

    for i_episode in range(1, num_episodes + 1):
        # monitor progress
        if i_episode % 100 == 0:
            print("\rEpisode {}/{}".format(i_episode, num_episodes), end="")
            sys.stdout.flush()
        score = 0                # initialize score
        state = env.reset()      # start episode
        eps = 1.0 / i_episode    # set value of epsilon
        while True:
            action = epsilon_greedy(Q, state, nA, eps)         # epsilon-greedy action selection
            next_state, reward, done, info = env.step(action)  # take action A, observe R, S'
            score += reward                                    # add reward to agent's score
            Q[state][action] = update_Q(alpha, gamma, Q,
                                        state, action, reward, next_state)
            state = next_state   # S <- S'
            if done:
                tmp_scores.append(score)  # append score
                break
        if (i_episode % plot_every == 0):
            avg_scores.append(np.mean(tmp_scores))

    # plot performance
    plt.plot(np.linspace(0, num_episodes, len(avg_scores), endpoint=False), np.asarray(avg_scores))
    plt.xlabel('Episode Number')
    plt.ylabel('Average Reward (Over Next %d Episodes)' % plot_every)
    plt.show()

    # print best plot_every-episode performance
    print(('Best Average Reward over %d Episodes: ' % plot_every), np.max(avg_scores))
    return Q
# obtain the estimated optimal policy and corresponding action-value function
Q_sarsamax = q_learning(env, 5000, .01)

# print the estimated optimal policy
policy_sarsamax = np.array([np.argmax(Q_sarsamax[key]) if key in Q_sarsamax else -1 for key in np.arange(48)]).reshape((4, 12))
print("\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):")
print(policy_sarsamax)
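# Optional, more readable rendering (a sketch, not in the original paste):
# map action indices to arrow characters; unvisited states (-1) print as '.'.
arrows = {0: '^', 1: '>', 2: 'v', 3: '<', -1: '.'}
for row in policy_sarsamax:
    print(' '.join(arrows[a] for a in row))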
# print the estimated optimal state-value function
V_sarsamax = np.array([np.max(Q_sarsamax[key]) if key in Q_sarsamax else 0 for key in np.arange(48)]).reshape(4, 12)
for line in V_sarsamax:
    print(*line)
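# Optional extra visualization (a sketch, not in the original paste): show the
# estimated state-value function as a heatmap over the 4x12 grid.
plt.imshow(V_sarsamax, cmap='viridis')
plt.colorbar()
plt.title('Estimated Optimal State-Value Function')
plt.show()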