Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- """Assignment_2_Part_2_tic_tac_toe_RL
- Automatically generated by Colab.
- Original file is located at
- https://colab.research.google.com/drive/12OFLrrG-isI6BCcZ9jx7SCMd44aoHq5F
- Here is an implementation of a tic tac toe game that uses Reinforcement Learning (RL)
- RL Steps
- Selecting the RL method
- Define states
- Define actions
- Define Rewards
- TIC TAC TOE RL
- I will use Q-Learning for this implementation
- There are 9 squares on a game board and each square has 3 possible states - X, O, or empty. So, there are 3^9 = 19683 possible states in this game. We can create a q-table to help us organize this.
- You can potentially place a piece in any square, so that's 9 possible actions. In our Q-table that gives us 19683 × 9 = 177147 cells!
- We will award a winning game with 1, a draw with a 0, and a loss with -1
- So where to start?
- Well lets create a game with a player that just plays any random empty space, then we can use that to train our agent.
- Tic Tac Toe implementation
- the next few cells have to do with the basic game
- """
- import numpy as np
- import time
# Board representation constants.
EMPTY = '_'  # marker for an unoccupied square
GRID_SIZE = 3  # the board is GRID_SIZE x GRID_SIZE
PLAYER = ['X', 'O']  # X always moves first
def show_board(board):
    """Pretty-print the board: one row per line, each cell framed by '|'."""
    grid = np.array(board)
    print()
    for r in range(GRID_SIZE):
        for c in range(GRID_SIZE):
            print('|', end='')
            print(grid[r, c], end='')
        print('|')
    print()
def get_legal_moves(board):
    """Return a list of (row, col) tuples for every empty square on *board*.

    Works for nested lists and numpy arrays alike, and derives the bounds
    from the board itself rather than assuming a fixed size.
    """
    # Comprehension replaces the manual append loop (same order: row-major).
    return [(i, j)
            for i in range(len(board))
            for j in range(len(board[i]))
            if board[i][j] == EMPTY]
def get_human_move(player):
    """Prompt *player* for a num-pad square (1-9) and return it as (row, col).

    Re-prompts (via recursion) until the input maps to a valid square.
    """
    human = input(f"Player {player}, enter a square: ")
    valid = map_index(human)
    # Fix: the original used `while not valid:` around an unconditional
    # `return`, which only ever ran once — a plain `if` states the intent.
    if not valid:  # map_index returns False for anything outside '1'-'9'
        print("Invalid input! Please enter a number between 1 and 9.")
        return get_human_move(player)
    return valid[1]  # valid is (True, (row, col))
def map_index(int_choice):
    """Translate a num-pad key (given as a string) into board coordinates.

    Returns (True, (row, col)) for '1'-'9' — mirroring the physical num-pad
    layout, with 7-8-9 on the top row — or False for any other input.
    """
    num_pad_map = {
        '7': (0, 0), '8': (0, 1), '9': (0, 2),
        '4': (1, 0), '5': (1, 1), '6': (1, 2),
        '1': (2, 0), '2': (2, 1), '3': (2, 2),
    }
    coords = num_pad_map.get(int_choice)
    return (True, coords) if coords is not None else False
def make_move(move, board, player):
    # Place *player*'s mark at *move* = (row, col) on *board*, then return the board.
    # NOTE(review): board is indexed with board[row, col], so this expects a
    # numpy array rather than a plain nested list — confirm against callers.
    row, col = move
    if board[row, col] == EMPTY:
        board[row, col] = player
    else:
        # Occupied square: prompt the human for a new square and retry.
        print("That square is already taken!")
        move = get_human_move(player) # Ask for input again
        # The recursive call mutates the same board object; its return value
        # is discarded because the shared board is returned below.
        make_move(move, board, player)
    return board
- """Returns a tuple with bool and player char"""
- def check_win(board):
- # Check rows
- for row in board:
- if row[0] == row[1] == row[2] and row[0] != EMPTY:
- return True, row[0]
- # Check columns
- for col in range(3):
- if board[0][col] == board[1][col] == board[2][col] and board[0][col] != EMPTY:
- return True, board[0][col]
- # Check diagonals
- if board[0][0] == board[1][1] == board[2][2] and board[0][0] != EMPTY:
- return True, board[0][0]
- if board[0][2] == board[1][1] == board[2][0] and board[0][2] != EMPTY:
- return True, board[0][2]
- # Check draw
- if all(board[i][j] != EMPTY for i in range(3) for j in range(3)):
- return True, 'Draw'
- """ok, that works for now, lets implement q learning
- parameters for model
- """
- # Initialize Q-learning parameters
- alpha = .7 # Learning rate
- gamma = .7 # Discount factor
- epsilon = .2 # Exploration rate
- decay_rate = 0.0001 # Rate of epsilon decay
- episodes = 177147 # Number of training episodes
def get_action(legal_moves, Q_table, state_index, epsilon):
    """Epsilon-greedy move selection.

    With probability *epsilon*, return a uniformly random legal move
    (exploration); otherwise return the legal move whose Q-value for
    *state_index* is highest (exploitation), ties going to the earliest
    entry in *legal_moves*. Either way the result is a (row, col) tuple.
    """
    if np.random.rand() < epsilon:
        # Explore: uniform random pick among the legal moves.
        choice = np.random.choice(len(legal_moves))
        return legal_moves[choice]
    # Exploit: linear scan for the highest Q-value among legal moves only.
    best_move, best_q = None, float('-inf')
    for move in legal_moves:
        # Flatten (row, col) into the Q-table's 1-D action index.
        q = Q_table[state_index][move[0] * GRID_SIZE + move[1]]
        if q > best_q:
            best_q, best_move = q, move
    return best_move
- """this is supposed to dynamically change the epsilon value but i found it made the program incredibly slow!"""
def decay_epsilon(epsilon, episode, decay_rate):
    """Return epsilon shrunk by inverse decay: epsilon / (1 + rate * episode)."""
    return epsilon / (1 + decay_rate * episode)
- """This function changes the characters to integers so the computer can understand them"""
def state_to_index(state):
    """Map a flattened board state (9 cell characters) to a unique row index.

    Each cell becomes a base-3 digit ('X' -> 1, 'O' -> 2, anything else,
    i.e. an empty square, -> 0), so the 9-cell board maps bijectively onto
    0 .. 3**9 - 1 and every distinct state owns its own Q-table row.

    Bug fix: the previous mapping used {'X': 1, 'O': -1, empty: 0} followed
    by abs() on the signed base-3 sum, which collapsed many different states
    onto the same index (e.g. a lone 'X' and a lone 'O' in the same square
    produced identical rows), silently corrupting the learned Q-values.
    """
    mapping = {'X': 1, 'O': 2}
    index = 0
    for i, value in enumerate(state):
        # Non-player marks (the empty-square character) contribute digit 0.
        index += (3 ** i) * mapping.get(value, 0)
    return index
- """Pick random move"""
def opponent(moves):
    """Random player: return one entry of *moves* chosen uniformly at random."""
    return moves[np.random.randint(len(moves))]
- """This is the training loop.
- We create an empty Q Table and initialize it with all zeros, as training happens these zeros will be replaced with values between -1 and 1 depending on the rewards.
- We enter the training loop, which runs for a set number of episodes
- we create an empty board and enter the game loop.
- As the game plays the agent will play moves based on the q table.
- which is influenced by the configurable parameters we initialised at the beginning.
- the epsilon is used to alter the amount of exploration the agent does. the agent will either play it safe and pick the best move from the given legal moves, or pick a random move from the legal moves. if no exploration is allowed, the agent will never fill the table and will be very short-sighted in its play strategy. if the agent is allowed full exploration, training will take a very long time. a balance needs to be found; there are algorithms that can change the value of the epsilon during the episodes, which could allow for exploration at first and conservative play later. there are lots of variables to experiment with here.
- lots of the code in this section is simply casting data into the right format for the various functions.
- that actual Q learning algorithm is
- Q(s,a) = Q(s,a) + α * (r + γ * max(Q(s',a')) - Q(s,a))
- or
- Q_value[current_state][action] = Q_value[current_state][action] +
- learning_rate * (reward + discount_factor * maximum_future_reward - Q_value[current_state][action])
- Each game plays to completion, the rewards are given, we update the q table using the formula above, and we start the next episode.
- training is finished when the set number of episodes have been reached.
- """
# Q-learning training loop.
# The Q-table has one row per board state (see state_to_index) and one column
# per square. Entries start at zero and are filled in by the update rule
#   Q(s,a) += alpha * (reward + gamma * max(Q(s',a')) - Q(s,a))
q_table = np.zeros((3 ** (GRID_SIZE * GRID_SIZE), GRID_SIZE * GRID_SIZE))

start_time = time.time()  # for the timing stats printed after training
print("Training started...")
for episode in range(episodes):
    winner = False
    board = [[EMPTY] * GRID_SIZE for _ in range(GRID_SIZE)]
    # Game loop: the agent ('X') moves, then the random opponent ('O') replies.
    while not winner:
        legal_moves = get_legal_moves(board)
        state_index = state_to_index(tuple(np.array(board).flatten()))
        # Select the agent's action with the epsilon-greedy policy.
        action = get_action(legal_moves, q_table, state_index, epsilon)
        row, col = action
        board[row][col] = PLAYER[0]
        # Bug fix: capture the AGENT's action index now. Previously row/col
        # were overwritten by the opponent's move below, so whenever the game
        # continued the Q-update credited the opponent's cell, not the move
        # the agent actually chose.
        action_index = row * GRID_SIZE + col
        winner = check_win(board)  # single check (was duplicated)
        if not winner:
            # Opponent plays a random legal square.
            opp_row, opp_col = opponent(get_legal_moves(board))
            board[opp_row][opp_col] = PLAYER[1]
            # Bug fix: re-check so an opponent win ends the episode here,
            # instead of only being noticed one agent-move later.
            winner = check_win(board)
        if winner:
            if winner[1] == 'X':
                reward = 1    # agent won
            elif winner[1] == 'O':
                reward = -1   # agent lost
            else:
                reward = -.2  # draw: mildly discouraged so wins are preferred
        else:
            reward = 0        # game still in progress
        next_state_index = state_to_index(tuple(np.array(board).flatten()))
        # Q-learning update rule (the per-turn whole-table mean that used to
        # be computed here was unused and has been dropped).
        q_table[state_index][action_index] += alpha * (
            reward
            + gamma * np.max(q_table[next_state_index])
            - q_table[state_index][action_index]
        )
    # Epsilon decay intentionally disabled — it slowed training noticeably:
    # epsilon = decay_epsilon(epsilon, episode, decay_rate)

elapsed_time = time.time() - start_time
print(f"Training time: {elapsed_time} seconds")
print(epsilon)
- """checking to see how full the table is"""
- total_elements = np.size(q_table)
- explored = np.count_nonzero(q_table)
- unexplored = total_elements - explored
- values = f"{'Total Elements':>15}|{total_elements:<10}\n" \
- f"{'Explored':>15}|{explored:<10}\n" \
- f"{'Unexplored':>15}|{unexplored:<10}"
- print(f"The agent explored {(explored / total_elements) * 100}% of the q table")
- print(values)
- """Once we have trained the agent, lets save the table because training takes a loooooong time!
- Python has a built in pickle function to save variables and such.
- """
- import pickle
- # Assuming q_table is your Q-table numpy array
def save_q_table(q_table, filename):
    """Serialize *q_table* to *filename* as a binary pickle."""
    with open(filename, 'wb') as f:
        pickle.dump(q_table, f)
# Save Q-table to a file
# (pickled next to the script so the long training run isn't wasted)
save_q_table(q_table, 'q_table.pkl')
- """But exporting it as an excel file will allow us to inspect the table in speadsheet software"""
- import numpy as np
- import pandas as pd
- # Assuming q_table is your Q-table numpy array
def save_q_table_to_excel(q_table, filename):
    """Write *q_table* to an .xlsx file so it can be inspected in a spreadsheet,
    and echo the DataFrame to stdout."""
    frame = pd.DataFrame(q_table)
    frame.to_excel(filename, index=False)
    print(frame)
# Save Q-table to an Excel file
# (named after the episode count so runs with different settings can be compared)
save_q_table_to_excel(q_table,f"q_table_{episodes}_episodes.xlsx")
- """Load the table if nessesery"""
def load_q_table(filename):
    """Read a pickled Q-table back from *filename* and return it.

    NOTE: pickle.load executes arbitrary code from the file — only load
    files this program wrote itself.
    """
    with open(filename, 'rb') as f:
        return pickle.load(f)
# Load Q-table from the file
loaded_q_table = load_q_table('q_table.pkl')
# Visualize or analyze the loaded Q-table
# (sanity check: should match the q_table saved above)
print(loaded_q_table)
- """Finally we have the game interface, we actually play against our agent here! It would be fun to implement different algorithms and play them agains eachother here"""
- print("Welcome to Tic Tac Toe!")
- print("The board is numbered for use with a num pad, as follows:")
- print("|7|8|9|")
- print("|4|5|6|")
- print("|1|2|3|")
- print("Player X goes first.")
- board = np.full((GRID_SIZE, GRID_SIZE), EMPTY, dtype=str)
- show_board(board)
- game_over = False
- round = 0
- while not game_over:
- legal_moves = get_legal_moves(board)
- if PLAYER[round %2] == 'X':
- move = opponent(legal_moves) #random opponent
- #move = get_human_move(PLAYER[round %2])
- print(move)
- else:
- #move = opponent(legal_moves) #random opponent
- state = tuple(np.array(board).flatten())
- state_index = state_to_index(state)
- move = get_action(legal_moves, q_table, state_index, epsilon)
- print(move)
- board = make_move(move, board, PLAYER[round %2])
- show_board(board)
- game_over = check_win(board)
- round += 1
- if game_over[1] == 'Draw':
- print("It's a draw!")
- else:
- print(f"Player {game_over[1]} wins!")
- """ok, now lets, build some kind of table to see if we can see any interesting results, lets play 100 games and keep a tally of who won, the random player or our agent. Then we will swtch sides and try again, surly we will see our agent win most games! 🤞"""
- print("Welcome to Tic Tac Toe!")
- print("The board is numbered for use with a num pad, as follows:")
- print("|7|8|9|")
- print("|4|5|6|")
- print("|1|2|3|")
- starting_player = 'X'
- print(f"Player {starting_player} goes first.")
- #show_board(board)
- rounds = 1000
- round = 0
- wins = 0
- losses = 0
- draws = 0
- for i in range(rounds):
- board = np.full((GRID_SIZE, GRID_SIZE), EMPTY, dtype=str)
- game_over = False
- while not game_over:
- legal_moves = get_legal_moves(board)
- if PLAYER[round %2] == starting_player:
- #move = opponent(legal_moves) #random opponent
- state = tuple(np.array(board).flatten())
- state_index = state_to_index(state)
- best_action = None
- best_q_value = float('-inf') # Initialize with negative infinity
- # Iterate over legal moves and find the action with the highest Q-value
- for action in legal_moves:
- # Convert (row, col) to 1D index for the Q-table
- action_index = action[0] * GRID_SIZE + action[1]
- q_value = q_table[state_index][action_index]
- if q_value > best_q_value:
- best_q_value = q_value
- best_action = action
- move = action
- #print(move)
- else:
- move = opponent(legal_moves) #random opponent
- #move = get_human_move(PLAYER[round %2])
- #print(move)
- board = make_move(move, board, PLAYER[round %2])
- show_board(board)
- game_over = check_win(board)
- round += 1
- if game_over[1] == 'Draw':
- draws +=1
- print("It's a draw!")
- else:
- print(f"Player {game_over[1]} wins!")
- if game_over[1] == 'X':
- wins +=1
- else:
- losses +=1
- # Create a DataFrame
- df = pd.DataFrame({
- 'Result': ['Wins', 'Losses', 'Draws'],
- 'Count': [wins, losses, draws]
- })
- # Print the DataFrame
- print(df.to_string(index=False))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement