import numpy as np
# Define the environment (grid world)
# S: Start, G: Goal, #: Obstacle
# The agent's goal is to reach G while avoiding the obstacle
# -------------
# | S |   | # |
# |   |   | G |
# -------------
# Define the grid world as a numpy array
env = np.array([["S", " ", "#"],
                [" ", " ", "G"]])
num_rows, num_cols = env.shape
# Define actions (up, down, left, right)
actions = ["UP", "DOWN", "LEFT", "RIGHT"]
num_actions = len(actions)
# Define Q-learning parameters
learning_rate = 0.1
discount_factor = 0.9
epsilon = 0.1  # exploration rate for the epsilon-greedy policy
num_episodes = 1000
# Convert the environment to a flat array (each cell index is a state)
env_flat = env.flatten()
num_states = env_flat.shape[0]

# Q-learning table (Q-values for each state-action pair)
q_table = np.zeros((num_states, num_actions))
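# Optional sanity check on the state indexing: the flattened index of cell
# (row, col) is row * num_cols + col, so the goal "G" at (1, 2) is state 5.
assert env_flat[1 * num_cols + 2] == "G"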
# Apply an action on the grid: moves that leave the board or enter the
# obstacle "#" leave the agent where it is; reaching "G" yields reward 1
def step(state, action):
    row, col = divmod(state, num_cols)
    if action == 0 and row > 0:               # UP
        row -= 1
    elif action == 1 and row < num_rows - 1:  # DOWN
        row += 1
    elif action == 2 and col > 0:             # LEFT
        col -= 1
    elif action == 3 and col < num_cols - 1:  # RIGHT
        col += 1
    next_state = row * num_cols + col
    if env_flat[next_state] == "#":           # blocked cell: stay in place
        next_state = state
    reward = 1 if env_flat[next_state] == "G" else 0
    return next_state, reward

# Q-learning algorithm
for episode in range(num_episodes):
    state = 0  # starting state (index of "S")
    done = False
    while not done:
        # Choose an action using an epsilon-greedy policy
        if np.random.rand() < epsilon:
            action = np.random.choice(num_actions)
        else:
            action = np.argmax(q_table[state, :])
        # Perform the action and observe the next state and reward
        next_state, reward = step(state, action)
        # Update the Q-value using the Q-learning update rule
        q_table[state, action] += learning_rate * \
            (reward + discount_factor * np.max(q_table[next_state, :]) - q_table[state, action])
        state = next_state
        done = (env_flat[state] == "G")
- print("Q-table:")
- print(q_table)
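# A minimal sketch of how the learned Q-table might be used: print the
# greedy (highest-Q) action for each free cell; the goal and the obstacle
# have no meaningful action.
print("Greedy policy:")
for s in range(num_states):
    if env_flat[s] in ("G", "#"):
        print(f"state {s} ({env_flat[s]}): -")
    else:
        print(f"state {s}: {actions[int(np.argmax(q_table[s, :]))]}")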