import numpy as np

# Define the environment (grid world) with named cells:
# "S" = start, "G" = goal, "#" = wall, other letters = ordinary cells
env = np.array([["S", "A", "#"],
                ["B", "C", "G"]])
num_rows, num_cols = env.shape

# Define actions (up, down, left, right)
actions = ["UP", "DOWN", "LEFT", "RIGHT"]
num_actions = len(actions)

# Q-learning hyperparameters
learning_rate = 0.1
discount_factor = 0.9
epsilon = 0.1          # exploration rate for the epsilon-greedy policy
num_episodes = 1000

# Flatten the grid so each cell index is a state
env_flat = env.flatten()
num_states = env_flat.shape[0]

# Q-table (one Q-value per state-action pair)
q_table = np.zeros((num_states, num_actions))


def step(state, action):
    """Apply an action to a state index and return (next_state, reward)."""
    row, col = divmod(state, num_cols)
    if action == 0:      # UP
        row = max(row - 1, 0)
    elif action == 1:    # DOWN
        row = min(row + 1, num_rows - 1)
    elif action == 2:    # LEFT
        col = max(col - 1, 0)
    else:                # RIGHT
        col = min(col + 1, num_cols - 1)
    next_state = row * num_cols + col
    # Walls are impassable: stay in place if the move hits "#"
    if env_flat[next_state] == "#":
        next_state = state
    reward = 1 if env_flat[next_state] == "G" else 0
    return next_state, reward


# Q-learning algorithm
for episode in range(num_episodes):
    state = 0  # Starting state (index of "S")
    done = False
    while not done:
        # Choose an action using an epsilon-greedy policy
        if np.random.rand() < epsilon:
            action = np.random.choice(num_actions)
        else:
            action = np.argmax(q_table[state, :])

        # Perform the action and observe the next state and reward
        next_state, reward = step(state, action)

        # Update the Q-value using the Q-learning update rule
        q_table[state, action] += learning_rate * \
            (reward + discount_factor * np.max(q_table[next_state, :]) - q_table[state, action])

        state = next_state
        done = (env_flat[state] == "G")

print("Training complete.\n")

# Print the final Q-value for each state-action pair
for state in range(num_states):
    state_name = env_flat[state]
    for action in range(num_actions):
        action_name = actions[action]
        q_value = q_table[state, action]
        print(f"State: {state_name}, Action: {action_name}, Q-value: {q_value:.3f}")