import numpy as np
# Define the environment (grid world)
# S: Start, G: Goal, #: Obstacle
# The agent's goal is to reach G while avoiding the obstacle
# -------------
# | S |   | # |
# |   |   | G |
# -------------
# Define the grid world as a numpy array
env = np.array([["S", " ", "#"],
                [" ", " ", "G"]])
num_rows, num_cols = env.shape
# Define actions (up, down, left, right)
actions = ["UP", "DOWN", "LEFT", "RIGHT"]
num_actions = len(actions)
# Define Q-learning parameters
learning_rate = 0.1
discount_factor = 0.9
epsilon = 0.1  # exploration rate for the epsilon-greedy policy
num_episodes = 1000
# Convert the environment to a flat array (each cell index is a state)
env_flat = env.flatten()
num_states = env_flat.shape[0]

# Q-learning table (Q-values for each state-action pair)
q_table = np.zeros((num_states, num_actions))
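# Optional sanity check on the state indexing: the flattened index of cell
# (row, col) is row * num_cols + col, so the goal "G" at (1, 2) is state 5.
assert env_flat[1 * num_cols + 2] == "G"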
# Apply an action on the grid: moves that leave the board or enter the
# obstacle "#" leave the agent where it is; reaching "G" yields reward 1
def step(state, action):
    row, col = divmod(state, num_cols)
    if action == 0 and row > 0:               # UP
        row -= 1
    elif action == 1 and row < num_rows - 1:  # DOWN
        row += 1
    elif action == 2 and col > 0:             # LEFT
        col -= 1
    elif action == 3 and col < num_cols - 1:  # RIGHT
        col += 1
    next_state = row * num_cols + col
    if env_flat[next_state] == "#":           # blocked cell: stay in place
        next_state = state
    reward = 1 if env_flat[next_state] == "G" else 0
    return next_state, reward

# Q-learning algorithm
for episode in range(num_episodes):
    state = 0  # starting state (index of "S")
    done = False
    while not done:
        # Choose an action using an epsilon-greedy policy
        if np.random.rand() < epsilon:
            action = np.random.choice(num_actions)
        else:
            action = np.argmax(q_table[state, :])
        # Perform the action and observe the next state and reward
        next_state, reward = step(state, action)
        # Update the Q-value using the Q-learning update rule
        q_table[state, action] += learning_rate * \
            (reward + discount_factor * np.max(q_table[next_state, :]) - q_table[state, action])
        state = next_state
        done = (env_flat[state] == "G")
- print("Q-table:")
- print(q_table)
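# A minimal sketch of how the learned Q-table might be used: print the
# greedy (highest-Q) action for each free cell; the goal and the obstacle
# have no meaningful action.
print("Greedy policy:")
for s in range(num_states):
    if env_flat[s] in ("G", "#"):
        print(f"state {s} ({env_flat[s]}): -")
    else:
        print(f"state {s}: {actions[int(np.argmax(q_table[s, :]))]}")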