import numpy as np

# Define the environment (grid world) with named cells:
# "S" = start, "G" = goal, "#" = wall, other letters = ordinary cells
env = np.array([["S", "A", "#"],
                ["B", "C", "G"]])
num_rows, num_cols = env.shape

# Define actions (up, down, left, right)
actions = ["UP", "DOWN", "LEFT", "RIGHT"]
num_actions = len(actions)

# Q-learning hyperparameters
learning_rate = 0.1
discount_factor = 0.9
epsilon = 0.1          # exploration rate for the epsilon-greedy policy
num_episodes = 1000

# Flatten the grid so each cell index is a state
env_flat = env.flatten()
num_states = env_flat.shape[0]

# Q-table (one Q-value per state-action pair)
q_table = np.zeros((num_states, num_actions))


def step(state, action):
    """Apply an action to a state index and return (next_state, reward)."""
    row, col = divmod(state, num_cols)
    if action == 0:      # UP
        row = max(row - 1, 0)
    elif action == 1:    # DOWN
        row = min(row + 1, num_rows - 1)
    elif action == 2:    # LEFT
        col = max(col - 1, 0)
    else:                # RIGHT
        col = min(col + 1, num_cols - 1)
    next_state = row * num_cols + col
    # Walls are impassable: stay in place if the move hits "#"
    if env_flat[next_state] == "#":
        next_state = state
    reward = 1 if env_flat[next_state] == "G" else 0
    return next_state, reward


# Q-learning algorithm
for episode in range(num_episodes):
    state = 0  # Starting state (index of "S")
    done = False
    while not done:
        # Choose an action using an epsilon-greedy policy
        if np.random.rand() < epsilon:
            action = np.random.choice(num_actions)
        else:
            action = np.argmax(q_table[state, :])

        # Perform the action and observe the next state and reward
        next_state, reward = step(state, action)

        # Update the Q-value using the Q-learning update rule
        q_table[state, action] += learning_rate * \
            (reward + discount_factor * np.max(q_table[next_state, :]) - q_table[state, action])

        state = next_state
        done = (env_flat[state] == "G")

print("Training complete.\n")

# Print the final Q-value for each state-action pair
for state in range(num_states):
    state_name = env_flat[state]
    for action in range(num_actions):
        action_name = actions[action]
        q_value = q_table[state, action]
        print(f"State: {state_name}, Action: {action_name}, Q-value: {q_value:.3f}")