Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import numpy as np
- import gym
- import random
# Setup Env
# NOTE(review): 'Taxi-v2' has been removed in recent gym releases (Taxi-v3 is
# current) — string kept as-is to preserve behavior; confirm the installed
# gym version still registers it.
env = gym.make('Taxi-v2')
# Params
maxEpisodes = 1000   # number of training episodes
maxSteps = 5000      # hard cap on steps per episode
epsilon = 1.0        # exploration probability, decayed each episode
degradeRate = 0.01   # per-episode epsilon decay amount
gamma = 0.9          # discount factor on future reward
alpha = 1.0          # learning rate; at 1.0 each update fully overwrites the old Q value
# Setup the qTable: one row per discrete state, one column per action, all zeros.
qTable = np.zeros((env.observation_space.n, env.action_space.n))
# Update qTable
def updateQTable(state, action, reward, statePrime, Terminal):
    """Apply one tabular Q-learning update to the global qTable in place.

    Q(s,a) <- (1 - alpha) * Q(s,a) + alpha * (reward + gamma * max_a' Q(s',a'))

    state, action: indices of the transition just taken.
    reward: immediate reward observed for that transition.
    statePrime: index of the successor state.
    Terminal: True when the episode ended at statePrime; the bootstrap
        term is then zero, since no future reward can follow.
    """
    # Bootstrap from the best known value of the successor state.
    maxTerm = max(qTable[statePrime])
    if Terminal:
        # Terminal transitions carry no future value.
        # (Removed leftover debug print("Terminal") that fired every episode end.)
        maxTerm = 0
    qTable[state][action] = ((1 - alpha) * qTable[state][action]) + alpha * (reward + gamma * maxTerm)
# Generate Action
def generateAction(currentState):
    """Choose an action for currentState with an epsilon-greedy policy.

    With probability epsilon a uniformly random action is sampled from the
    environment's action space; otherwise the greedy action (argmax over the
    Q-table row for currentState) is returned.
    """
    shouldExplore = random.random() < epsilon
    if shouldExplore:
        # Explore: uniform random action.
        return env.action_space.sample()
    # Exploit: best known action for this state.
    return np.argmax(qTable[currentState])
# Run Learner: tabular Q-learning over the Taxi environment.
for episode in range(maxEpisodes):  # Loop for each episode
    state = env.reset()  # Initialize S
    # Decay exploration rate, clamped at 0 so it never drifts negative.
    epsilon = max(epsilon - degradeRate, 0.0)
    terminal = False
    for step in range(maxSteps):  # Loop for each step of episode
        action = generateAction(state)  # Choose A from S using policy derived from Q
        statePrime, reward, done, info = env.step(action)  # Take action A, observe R, S'
        # Fix: use the environment's own done flag instead of the Taxi-specific
        # "reward == 20" magic number, which breaks under reward shaping or
        # time-limit termination (done was previously returned but unused).
        terminal = done
        updateQTable(state, action, reward, statePrime, terminal)  # Update Q table
        if terminal:
            break
        state = statePrime
print(qTable)
print(qTable[462][4])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement