Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- """Assignment_2_Part_2_tic_tac_toe_RL
- Automatically generated by Colab.
- Original file is located at
- https://colab.research.google.com/drive/12OFLrrG-isI6BCcZ9jx7SCMd44aoHq5F
- Here is an implementation of a tic tac toe game that uses Reinforcement Learning (RL)
- RL Steps
- Selecting the RL method
- Define states
- Define actions
- Define Rewards
- TIC TAC TOE RL
- I will use Q-Learning for this implementation
- There are 9 squares on a game board and each square has 3 possible states - X, O, or empty. So, there are 3^9 = 19683 possible states in this game. We can create a q-table to help us organize this.
- You can potentially place a piece in any square, so that's 9 possible actions. In our Q-table that gives us 19683 × 9 = 177147 cells!
- We will award a winning game with 1, a draw with a 0, and a loss with -1
- So where to start?
- Well lets create a game with a player that just plays any random empty space, then we can use that to train our agent.
- Tic Tac Toe implementation
- the next few cells have to do with the basic game
- """
- import numpy as np
- import time
# Board representation constants.
EMPTY = '_'  # marker for an unoccupied square
GRID_SIZE = 3  # the board is GRID_SIZE x GRID_SIZE
PLAYER = ['X', 'O']  # X always moves first
def show_board(board):
    """Pretty-print the board: one row per line, each cell framed by '|'."""
    grid = np.array(board)
    print()
    for r in range(GRID_SIZE):
        for c in range(GRID_SIZE):
            print('|', end='')
            print(grid[r, c], end='')
        print('|')
    print()
def get_legal_moves(board):
    """Return a list of (row, col) tuples for every empty square on *board*.

    Works for nested lists and numpy arrays alike, and derives the bounds
    from the board itself rather than assuming a fixed size.
    """
    # Comprehension replaces the manual append loop (same order: row-major).
    return [(i, j)
            for i in range(len(board))
            for j in range(len(board[i]))
            if board[i][j] == EMPTY]
def get_human_move(player):
    """Prompt *player* for a num-pad square (1-9) and return it as (row, col).

    Re-prompts (via recursion) until the input maps to a valid square.
    """
    human = input(f"Player {player}, enter a square: ")
    valid = map_index(human)
    # Fix: the original used `while not valid:` around an unconditional
    # `return`, which only ever ran once — a plain `if` states the intent.
    if not valid:  # map_index returns False for anything outside '1'-'9'
        print("Invalid input! Please enter a number between 1 and 9.")
        return get_human_move(player)
    return valid[1]  # valid is (True, (row, col))
def map_index(int_choice):
    """Translate a num-pad key (given as a string) into board coordinates.

    Returns (True, (row, col)) for '1'-'9' — mirroring the physical num-pad
    layout, with 7-8-9 on the top row — or False for any other input.
    """
    num_pad_map = {
        '7': (0, 0), '8': (0, 1), '9': (0, 2),
        '4': (1, 0), '5': (1, 1), '6': (1, 2),
        '1': (2, 0), '2': (2, 1), '3': (2, 2),
    }
    coords = num_pad_map.get(int_choice)
    return (True, coords) if coords is not None else False
def make_move(move, board, player):
    # Place *player*'s mark at *move* = (row, col) on *board*, then return the board.
    # NOTE(review): board is indexed with board[row, col], so this expects a
    # numpy array rather than a plain nested list — confirm against callers.
    row, col = move
    if board[row, col] == EMPTY:
        board[row, col] = player
    else:
        # Occupied square: prompt the human for a new square and retry.
        print("That square is already taken!")
        move = get_human_move(player) # Ask for input again
        # The recursive call mutates the same board object; its return value
        # is discarded because the shared board is returned below.
        make_move(move, board, player)
    return board
- """Returns a tuple with bool and player char"""
- def check_win(board):
- # Check rows
- for row in board:
- if row[0] == row[1] == row[2] and row[0] != EMPTY:
- return True, row[0]
- # Check columns
- for col in range(3):
- if board[0][col] == board[1][col] == board[2][col] and board[0][col] != EMPTY:
- return True, board[0][col]
- # Check diagonals
- if board[0][0] == board[1][1] == board[2][2] and board[0][0] != EMPTY:
- return True, board[0][0]
- if board[0][2] == board[1][1] == board[2][0] and board[0][2] != EMPTY:
- return True, board[0][2]
- # Check draw
- if all(board[i][j] != EMPTY for i in range(3) for j in range(3)):
- return True, 'Draw'
- """ok, that works for now, lets implement q learning
- parameters for model
- """
- # Initialize Q-learning parameters
- alpha = .7 # Learning rate
- gamma = .7 # Discount factor
- epsilon = .2 # Exploration rate
- decay_rate = 0.0001 # Rate of epsilon decay
- episodes = 177147 # Number of training episodes
def get_action(legal_moves, Q_table, state_index, epsilon):
    """Epsilon-greedy move selection.

    With probability *epsilon*, return a uniformly random legal move
    (exploration); otherwise return the legal move whose Q-value for
    *state_index* is highest (exploitation), ties going to the earliest
    entry in *legal_moves*. Either way the result is a (row, col) tuple.
    """
    if np.random.rand() < epsilon:
        # Explore: uniform random pick among the legal moves.
        choice = np.random.choice(len(legal_moves))
        return legal_moves[choice]
    # Exploit: linear scan for the highest Q-value among legal moves only.
    best_move, best_q = None, float('-inf')
    for move in legal_moves:
        # Flatten (row, col) into the Q-table's 1-D action index.
        q = Q_table[state_index][move[0] * GRID_SIZE + move[1]]
        if q > best_q:
            best_q, best_move = q, move
    return best_move
- """this is supposed to dynamically change the epsilon value but i found it made the program incredibly slow!"""
def decay_epsilon(epsilon, episode, decay_rate):
    """Return epsilon shrunk by inverse decay: epsilon / (1 + rate * episode)."""
    return epsilon / (1 + decay_rate * episode)
- """This function changes the characters to integers so the computer can understand them"""
def state_to_index(state):
    """Map a flattened board state (9 cell characters) to a unique row index.

    Each cell becomes a base-3 digit ('X' -> 1, 'O' -> 2, anything else,
    i.e. an empty square, -> 0), so the 9-cell board maps bijectively onto
    0 .. 3**9 - 1 and every distinct state owns its own Q-table row.

    Bug fix: the previous mapping used {'X': 1, 'O': -1, empty: 0} followed
    by abs() on the signed base-3 sum, which collapsed many different states
    onto the same index (e.g. a lone 'X' and a lone 'O' in the same square
    produced identical rows), silently corrupting the learned Q-values.
    """
    mapping = {'X': 1, 'O': 2}
    index = 0
    for i, value in enumerate(state):
        # Non-player marks (the empty-square character) contribute digit 0.
        index += (3 ** i) * mapping.get(value, 0)
    return index
- """Pick random move"""
def opponent(moves):
    """Random player: return one entry of *moves* chosen uniformly at random."""
    return moves[np.random.randint(len(moves))]
- """This is the training loop.
- We create an empty Q Table and initialize it with all zeros, as training happens these zeros will be replaced with values between -1 and 1 depending on the rewards.
- We enter the training loop, which runs for a set number of episodes
- we create an empty board and enter the game loop.
- As the game plays the agent will play moves based on the q table.
- which is influenced by the configurable parameters we initialised at the beginning.
- the epsilon is used to alter the amount of exploration the agent does. the agent will either play it safe and pick the best move from the given legal moves, or pick a random move from the legal moves. if no exploration is allowed, the agent will never fill the table and will be very short-sighted in its play strategy. if the agent is allowed full exploration, training will take a very long time. a balance needs to be found; there are algorithms that can change the value of the epsilon during the episodes, which could allow for exploration at first and conservative play later. there are lots of variables to experiment with here.
- lots of the code in this section is simply casting data into the right format for the various functions.
- that actual Q learning algorithm is
- Q(s,a) = Q(s,a) + α * (r + γ * max(Q(s',a')) - Q(s,a))
- or
- Q_value[current_state][action] = Q_value[current_state][action] +
- learning_rate * (reward + discount_factor * maximum_future_reward - Q_value[current_state][action])
- Each game plays to completion, the rewards are given, we update the q table using the formula above, and we start the next episode.
- training is finished when the set number of episodes have been reached.
- """
# Q-learning training loop.
# The Q-table has one row per board state (see state_to_index) and one column
# per square. Entries start at zero and are filled in by the update rule
#   Q(s,a) += alpha * (reward + gamma * max(Q(s',a')) - Q(s,a))
q_table = np.zeros((3 ** (GRID_SIZE * GRID_SIZE), GRID_SIZE * GRID_SIZE))

start_time = time.time()  # for the timing stats printed after training
print("Training started...")
for episode in range(episodes):
    winner = False
    board = [[EMPTY] * GRID_SIZE for _ in range(GRID_SIZE)]
    # Game loop: the agent ('X') moves, then the random opponent ('O') replies.
    while not winner:
        legal_moves = get_legal_moves(board)
        state_index = state_to_index(tuple(np.array(board).flatten()))
        # Select the agent's action with the epsilon-greedy policy.
        action = get_action(legal_moves, q_table, state_index, epsilon)
        row, col = action
        board[row][col] = PLAYER[0]
        # Bug fix: capture the AGENT's action index now. Previously row/col
        # were overwritten by the opponent's move below, so whenever the game
        # continued the Q-update credited the opponent's cell, not the move
        # the agent actually chose.
        action_index = row * GRID_SIZE + col
        winner = check_win(board)  # single check (was duplicated)
        if not winner:
            # Opponent plays a random legal square.
            opp_row, opp_col = opponent(get_legal_moves(board))
            board[opp_row][opp_col] = PLAYER[1]
            # Bug fix: re-check so an opponent win ends the episode here,
            # instead of only being noticed one agent-move later.
            winner = check_win(board)
        if winner:
            if winner[1] == 'X':
                reward = 1    # agent won
            elif winner[1] == 'O':
                reward = -1   # agent lost
            else:
                reward = -.2  # draw: mildly discouraged so wins are preferred
        else:
            reward = 0        # game still in progress
        next_state_index = state_to_index(tuple(np.array(board).flatten()))
        # Q-learning update rule (the per-turn whole-table mean that used to
        # be computed here was unused and has been dropped).
        q_table[state_index][action_index] += alpha * (
            reward
            + gamma * np.max(q_table[next_state_index])
            - q_table[state_index][action_index]
        )
    # Epsilon decay intentionally disabled — it slowed training noticeably:
    # epsilon = decay_epsilon(epsilon, episode, decay_rate)

elapsed_time = time.time() - start_time
print(f"Training time: {elapsed_time} seconds")
print(epsilon)
- """checking to see how full the table is"""
- total_elements = np.size(q_table)
- explored = np.count_nonzero(q_table)
- unexplored = total_elements - explored
- values = f"{'Total Elements':>15}|{total_elements:<10}\n" \
- f"{'Explored':>15}|{explored:<10}\n" \
- f"{'Unexplored':>15}|{unexplored:<10}"
- print(f"The agent explored {(explored / total_elements) * 100}% of the q table")
- print(values)
- """Once we have trained the agent, lets save the table because training takes a loooooong time!
- Python has a built in pickle function to save variables and such.
- """
- import pickle
- # Assuming q_table is your Q-table numpy array
def save_q_table(q_table, filename):
    """Serialize *q_table* to *filename* as a binary pickle."""
    with open(filename, 'wb') as f:
        pickle.dump(q_table, f)
# Save Q-table to a file
# (pickled next to the script so the long training run isn't wasted)
save_q_table(q_table, 'q_table.pkl')
- """But exporting it as an excel file will allow us to inspect the table in speadsheet software"""
- import numpy as np
- import pandas as pd
- # Assuming q_table is your Q-table numpy array
def save_q_table_to_excel(q_table, filename):
    """Write *q_table* to an .xlsx file so it can be inspected in a spreadsheet,
    and echo the DataFrame to stdout."""
    frame = pd.DataFrame(q_table)
    frame.to_excel(filename, index=False)
    print(frame)
# Save Q-table to an Excel file
# (named after the episode count so runs with different settings can be compared)
save_q_table_to_excel(q_table,f"q_table_{episodes}_episodes.xlsx")
- """Load the table if nessesery"""
def load_q_table(filename):
    """Read a pickled Q-table back from *filename* and return it.

    NOTE: pickle.load executes arbitrary code from the file — only load
    files this program wrote itself.
    """
    with open(filename, 'rb') as f:
        return pickle.load(f)
# Load Q-table from the file
loaded_q_table = load_q_table('q_table.pkl')
# Visualize or analyze the loaded Q-table
# (sanity check: should match the q_table saved above)
print(loaded_q_table)
- """Finally we have the game interface, we actually play against our agent here! It would be fun to implement different algorithms and play them agains eachother here"""
- print("Welcome to Tic Tac Toe!")
- print("The board is numbered for use with a num pad, as follows:")
- print("|7|8|9|")
- print("|4|5|6|")
- print("|1|2|3|")
- print("Player X goes first.")
- board = np.full((GRID_SIZE, GRID_SIZE), EMPTY, dtype=str)
- show_board(board)
- game_over = False
- round = 0
- while not game_over:
- legal_moves = get_legal_moves(board)
- if PLAYER[round %2] == 'X':
- move = opponent(legal_moves) #random opponent
- #move = get_human_move(PLAYER[round %2])
- print(move)
- else:
- #move = opponent(legal_moves) #random opponent
- state = tuple(np.array(board).flatten())
- state_index = state_to_index(state)
- move = get_action(legal_moves, q_table, state_index, epsilon)
- print(move)
- board = make_move(move, board, PLAYER[round %2])
- show_board(board)
- game_over = check_win(board)
- round += 1
- if game_over[1] == 'Draw':
- print("It's a draw!")
- else:
- print(f"Player {game_over[1]} wins!")
- """ok, now lets, build some kind of table to see if we can see any interesting results, lets play 100 games and keep a tally of who won, the random player or our agent. Then we will swtch sides and try again, surly we will see our agent win most games! 🤞"""
- print("Welcome to Tic Tac Toe!")
- print("The board is numbered for use with a num pad, as follows:")
- print("|7|8|9|")
- print("|4|5|6|")
- print("|1|2|3|")
- starting_player = 'X'
- print(f"Player {starting_player} goes first.")
- #show_board(board)
- rounds = 1000
- round = 0
- wins = 0
- losses = 0
- draws = 0
- for i in range(rounds):
- board = np.full((GRID_SIZE, GRID_SIZE), EMPTY, dtype=str)
- game_over = False
- while not game_over:
- legal_moves = get_legal_moves(board)
- if PLAYER[round %2] == starting_player:
- #move = opponent(legal_moves) #random opponent
- state = tuple(np.array(board).flatten())
- state_index = state_to_index(state)
- best_action = None
- best_q_value = float('-inf') # Initialize with negative infinity
- # Iterate over legal moves and find the action with the highest Q-value
- for action in legal_moves:
- # Convert (row, col) to 1D index for the Q-table
- action_index = action[0] * GRID_SIZE + action[1]
- q_value = q_table[state_index][action_index]
- if q_value > best_q_value:
- best_q_value = q_value
- best_action = action
- move = action
- #print(move)
- else:
- move = opponent(legal_moves) #random opponent
- #move = get_human_move(PLAYER[round %2])
- #print(move)
- board = make_move(move, board, PLAYER[round %2])
- show_board(board)
- game_over = check_win(board)
- round += 1
- if game_over[1] == 'Draw':
- draws +=1
- print("It's a draw!")
- else:
- print(f"Player {game_over[1]} wins!")
- if game_over[1] == 'X':
- wins +=1
- else:
- losses +=1
- # Create a DataFrame
- df = pd.DataFrame({
- 'Result': ['Wins', 'Losses', 'Draws'],
- 'Count': [wins, losses, draws]
- })
- # Print the DataFrame
- print(df.to_string(index=False))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement