# -*- coding: utf-8 -*-
"""Assignment_2_Part_2_tic_tac_toe_RL

Automatically generated by Colab.

Original file is located at
   https://colab.research.google.com/drive/12OFLrrG-isI6BCcZ9jx7SCMd44aoHq5F

Here is an implementation of a tic-tac-toe game that uses Reinforcement Learning (RL).

RL steps:

Select the RL method
Define states
Define actions
Define rewards

TIC TAC TOE RL

I will use Q-learning for this implementation.
There are 9 squares on the game board and each square has 3 possible states - X, O, or empty. So there are 3^9 = 19683 possible states in this game. We can create a Q-table to help us organize this.
You can potentially place a piece in any square, so that's 9 possible actions. In our Q-table that gives us 19683 * 9 = 177147 cells! (See the quick sanity check below.)
We will reward a winning game with 1, a draw with 0, and a loss with -1.

So where to start?

Well, let's create a game with a player that just plays any random empty square; then we can use that to train our agent.

Tic Tac Toe implementation

The next few cells implement the basic game.
"""

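"""A quick sanity check of the state and action counts above (a small added sketch, not part of the original assignment code): each of the 9 squares holds one of 3 symbols, so the flattened board can be read as a 9-digit base-3 number."""

# Hypothetical illustration of the Q-table dimensions described above
n_states = 3 ** 9   # 19683 possible board layouts
n_actions = 9       # one action per square
print(n_states, n_actions, n_states * n_actions)  # 19683 9 177147
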
import numpy as np
import time

EMPTY = '_'
GRID_SIZE = 3
PLAYER = ['X', 'O']

def show_board(board):
    board = np.array(board)
    print()
    for i in range(GRID_SIZE):
        for j in range(GRID_SIZE):
            print('|', end='')
            print(board[i, j], end='')
        print('|')
    print()

def get_legal_moves(board):
    legal_moves = []
    for i in range(len(board)):
        for j in range(len(board[i])):
            if board[i][j] == EMPTY:
                legal_moves.append((i, j))
    return legal_moves

def get_human_move(player):
    human = input(f"Player {player}, enter a square: ")
    valid = map_index(human)
    if not valid:
        print("Invalid input! Please enter a number between 1 and 9.")
        return get_human_move(player)
    return valid[1]

def map_index(int_choice):
    # Map num-pad keys to (row, col) board coordinates
    num_pad_map = {'7': (0, 0), '8': (0, 1), '9': (0, 2), '4': (1, 0), '5': (1, 1), '6': (1, 2), '1': (2, 0), '2': (2, 1), '3': (2, 2)}
    if int_choice in num_pad_map:
        return True, num_pad_map[int_choice]
    else:
        return False

def make_move(move, board, player):
    row, col = move
    if board[row, col] == EMPTY:
        board[row, col] = player
    else:
        print("That square is already taken!")
        move = get_human_move(player)  # Ask for input again
        make_move(move, board, player)  # board is modified in place
    return board

  85. """Returns a tuple with bool and player char"""
  86.  
  87. def check_win(board):
  88.     # Check rows
  89.     for row in board:
  90.         if row[0] == row[1] == row[2] and row[0] != EMPTY:
  91.             return True, row[0]
  92.  
  93.     # Check columns
  94.     for col in range(3):
  95.         if board[0][col] == board[1][col] == board[2][col] and board[0][col] != EMPTY:
  96.             return True, board[0][col]
  97.  
  98.     # Check diagonals
  99.     if board[0][0] == board[1][1] == board[2][2] and board[0][0] != EMPTY:
  100.         return True, board[0][0]
  101.     if board[0][2] == board[1][1] == board[2][0] and board[0][2] != EMPTY:
  102.         return True, board[0][2]
  103.  
  104.     # Check draw
  105.     if all(board[i][j] != EMPTY for i in range(3) for j in range(3)):
  106.         return True, 'Draw'
  107.  
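"""A quick added usage sketch of check_win (not part of the original assignment): X has completed the top row, so the function reports an X win."""

# Hypothetical sample board for illustration
sample_board = [['X', 'X', 'X'],
                ['O', 'O', EMPTY],
                [EMPTY, EMPTY, EMPTY]]
print(check_win(sample_board))  # (True, 'X')
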
  108. """ok, that works for now, lets implement q learning
  109.  
  110. parameters for model
  111. """
  112.  
  113. # Initialize Q-learning parameters
  114. alpha = .7  # Learning rate
  115. gamma = .7 # Discount factor
  116. epsilon = .2  # Exploration rate
  117. decay_rate = 0.0001  # Rate of epsilon decay
  118. episodes = 177147  # Number of training episodes
  119.  
def get_action(legal_moves, Q_table, state_index, epsilon):
    rand = np.random.rand()
    #print(f"{rand = }")
    if rand < epsilon:
        # Explore: select a random legal action
        action = np.random.choice(len(legal_moves))  # choose a random index into legal_moves
        return legal_moves[action]  # return the (row, col) tuple directly
    else:
        # Exploit: select the legal action with the highest Q-value
        # Keep track of the best action and its Q-value
        best_action = None
        best_q_value = float('-inf')  # Initialize with negative infinity

        # Iterate over legal moves and find the action with the highest Q-value
        for action in legal_moves:
            # Convert (row, col) to a 1D index for the Q-table
            action_index = action[0] * GRID_SIZE + action[1]
            q_value = Q_table[state_index][action_index]
            if q_value > best_q_value:
                best_q_value = q_value
                best_action = action

        #print(f"Best action selected: {best_action}")
        return best_action

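"""A tiny added check (not part of the original assignment) that get_action is purely greedy when epsilon is 0: it picks the legal move whose cell in a made-up one-row Q-table is largest."""

# Hypothetical one-row Q-table just for illustration
demo_q = np.zeros((1, GRID_SIZE * GRID_SIZE))
demo_q[0][4] = 0.9  # the centre square (1, 1) flattens to index 4
print(get_action([(0, 0), (1, 1)], demo_q, 0, 0.0))  # -> (1, 1)
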
  149. """this is supposed to dynamically change the epsilon value but i found it made the program incredibly slow!"""
  150.  
  151. def decay_epsilon(epsilon, episode, decay_rate):
  152.     epsilon = epsilon / (1 + decay_rate * episode)
  153.     return epsilon
  154.  
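"""A small added illustration (not part of the original assignment) of how this schedule shrinks epsilon over training, starting from epsilon = 0.2 with decay_rate = 0.0001."""

# Hypothetical check of the decay schedule at a few episode counts
for ep in (0, 1000, 10000, 100000):
    print(ep, decay_epsilon(0.2, ep, 0.0001))
# 0 -> 0.2, 1000 -> ~0.18, 10000 -> 0.1, 100000 -> ~0.018
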
  155. """This function changes the characters to integers so the computer can understand them"""
  156.  
  157. def state_to_index(state):
  158.     # Map string values to integers: 'X' -> 1, 'O' -> -1, '_' -> 0
  159.     mapping = {'X': 1, 'O': -1, EMPTY: 0}
  160.     # Convert state tuple to an index
  161.     index = 0
  162.     for i, value in enumerate(state):
  163.         index += (3 ** i) * mapping[value]
  164.     return abs(index)
  165.  
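"""A quick added usage sketch (not part of the original assignment): the empty board maps to index 0, and a board with a single X in the top-left corner maps to index 1."""

# Hypothetical illustration of state_to_index on two simple states
empty_state = tuple([EMPTY] * 9)
one_x_state = ('X',) + (EMPTY,) * 8
print(state_to_index(empty_state), state_to_index(one_x_state))  # 0 1
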
  166. """Pick random move"""
  167.  
  168. def opponent(moves):
  169.     #print(moves)
  170.     move = np.random.randint(len(moves))
  171.     return moves[move]
  172.  
  173. """This is the training loop.
  174.  
  175. We create an empty Q Table and initialize it with all zeros, as training happens these zeros will be replaced with values between -1 and 1 depending on the rewards.
  176.  
  177. We enter the trainnig loop which is a certain number of episodes long
  178. we create an empty board and enter the game loop.
  179.  
  180. As the game plays the agent will play moves based on the q table.
  181. which is influenced by the configurables we initialised ata the beggining.
  182.  
  183. the epsilon is used to alter the amount of exploration the agent does. the agent will either play if safe and pick the best move from the given legal moves or pick a random move from the legal moves. if no exploration is allowed the agent will never fill the table and will be very short sighted in its play strategy. if the agent is allowed full explotation, training will take a very long time. a balance needs to be found, there are algorithms that can change the value of the epsilon during the episodes, this could allow for exploration at first and playing conservative later. there are lots of variables to experiment with here.
  184.  
  185. lots of the code in thsi section is simply casting data into the right format for the various function.
  186.  
  187. that actual Q learning algorithm is
  188.  
  189. Q(s,a) = Q(s,a) + α * (r + γ * max(Q(s',a')) - Q(s,a))
  190.  
  191. or
  192.  
  193. Q_value[current_state][action] = Q_value[current_state][action] +
  194.                                 learning_rate * (reward + discount_factor * maximum_future_reward - Q_value[current_state][action])
  195.  
  196.  
  197. Each game plays to completion, the rewards are given, we update the q table using the formular above and we start the next episode.
  198.  
  199. training is finished when the set number of episodes have been reached.
  200. """

# Q-learning loop

# Initialize the Q-table to all 0s
q_table = np.zeros((3 ** (GRID_SIZE * GRID_SIZE), GRID_SIZE * GRID_SIZE))
#print(q_table.shape)
# Start time for stats
start_time = time.time()
print("Training started...")

for episode in range(episodes):
    winner = False
    # Initialize the board
    board = [[EMPTY] * GRID_SIZE for _ in range(GRID_SIZE)]
    #print("-------------------------------")
    #print(f"Episode {episode}/{episodes} ({(episode / episodes) * 100:.2f}% completed)")
    #print("-------------------------------")
    turn = 0
    # Game loop
    while not winner:

        # Get legal moves and the current state
        legal_moves = get_legal_moves(board)
        #print(f"{legal_moves = }")
        state = tuple(np.array(board).flatten())
        state_index = state_to_index(state)

        # Agent (X) selects an action using the epsilon-greedy policy
        action = get_action(legal_moves, q_table, state_index, epsilon)
        row, col = action
        board[row][col] = PLAYER[0]

        # Check if the agent's move ended the game
        winner = check_win(board)
        #print(f"{winner = }")
        if winner:
            if winner[1] == 'X':
                reward = 1  # Reward for winning
            elif winner[1] == 'O':
                reward = -1  # Punishment for losing
            else:
                reward = -.2  # Small punishment for a draw
        else:
            # Random opponent (O) responds
            legal_moves = get_legal_moves(board)
            opp_row, opp_col = opponent(legal_moves)
            board[opp_row][opp_col] = PLAYER[1]
            # The opponent's reply may also end the game
            winner = check_win(board)
            if winner:
                reward = -1 if winner[1] == 'O' else -.2
            else:
                reward = 0

        # Get the next state (after both moves)
        next_state = tuple(np.array(board).flatten())
        next_state_index = state_to_index(next_state)
        # Convert the agent's (row, col) move to a 1D index for the Q-table
        action_index = row * GRID_SIZE + col
        # Q-learning update: Q(s,a) += alpha * (r + gamma * max(Q(s',a')) - Q(s,a))
        q_table[state_index][action_index] += alpha * (reward + gamma * np.max(q_table[next_state_index]) - q_table[state_index][action_index])

        # Average Q-value, only used by the commented-out diagnostics
        flattened_q_table = q_table.flatten()
        average_q_table = np.mean(flattened_q_table)
        #print(f"{average_q_table = }")

        #show_board(board)
        turn += 1
        #time.sleep(.2)
        #print(f"{turn = }")
    # Decay epsilon (disabled - it slowed training down considerably)
    #epsilon = epsilon - decay_rate

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Training time: {elapsed_time} seconds")
print(epsilon)

"""Checking to see how full the table is."""

total_elements = np.size(q_table)
explored = np.count_nonzero(q_table)
unexplored = total_elements - explored

values = f"{'Total Elements':>15}|{total_elements:<10}\n" \
         f"{'Explored':>15}|{explored:<10}\n" \
         f"{'Unexplored':>15}|{unexplored:<10}"
print(f"The agent explored {(explored / total_elements) * 100:.2f}% of the Q-table")
print(values)

"""Once we have trained the agent, let's save the table, because training takes a looooong time!

Python has a built-in pickle module for saving variables like this.
"""

import pickle

# Assuming q_table is your Q-table numpy array

def save_q_table(q_table, filename):
    with open(filename, 'wb') as f:
        pickle.dump(q_table, f)

# Save Q-table to a file
save_q_table(q_table, 'q_table.pkl')

"""But exporting it as an Excel file will allow us to inspect the table in spreadsheet software."""

import pandas as pd

# Assuming q_table is your Q-table numpy array

def save_q_table_to_excel(q_table, filename):
    # Convert Q-table to DataFrame
    df = pd.DataFrame(q_table)

    # Save DataFrame to Excel
    df.to_excel(filename, index=False)

    print(df)

# Save Q-table to an Excel file
save_q_table_to_excel(q_table, f"q_table_{episodes}_episodes.xlsx")

  349. """Load the table if nessesery"""
  350.  
  351. def load_q_table(filename):
  352.     with open(filename, 'rb') as f:
  353.         q_table = pickle.load(f)
  354.     return q_table
  355.  
  356. # Load Q-table from the file
  357. loaded_q_table = load_q_table('q_table.pkl')
  358.  
  359. # Visualize or analyze the loaded Q-table
  360. print(loaded_q_table)
  361.  
  362. """Finally we have the game interface, we actually play against our agent here! It would be fun to implement different algorithms and play them agains eachother here"""
  363.  
  364. print("Welcome to Tic Tac Toe!")
  365. print("The board is numbered for use with a num pad, as follows:")
  366. print("|7|8|9|")
  367. print("|4|5|6|")
  368. print("|1|2|3|")
  369. print("Player X goes first.")
  370.  
  371. board = np.full((GRID_SIZE, GRID_SIZE), EMPTY, dtype=str)
  372.  
  373. show_board(board)
  374.  
  375. game_over = False
  376. round = 0
  377.  
  378. while not game_over:
  379.     legal_moves = get_legal_moves(board)
  380.     if PLAYER[round %2] == 'X':
  381.         move = opponent(legal_moves) #random opponent
  382.         #move = get_human_move(PLAYER[round %2])
  383.         print(move)
  384.     else:
  385.         #move = opponent(legal_moves) #random opponent
  386.         state = tuple(np.array(board).flatten())
  387.         state_index = state_to_index(state)
  388.         move = get_action(legal_moves, q_table, state_index, epsilon)
  389.         print(move)
  390.     board = make_move(move, board, PLAYER[round %2])
  391.     show_board(board)
  392.     game_over = check_win(board)
  393.     round += 1
  394.  
  395. if game_over[1] == 'Draw':
  396.     print("It's a draw!")
  397. else:
  398.     print(f"Player {game_over[1]} wins!")
  399.  
  400. """ok, now lets, build some kind of table to see if we can see any interesting results, lets play 100 games and keep a tally of who won, the random player or our agent. Then we will swtch sides and try again, surly we will see our agent win most games! 🤞"""
  401.  
  402. print("Welcome to Tic Tac Toe!")
  403. print("The board is numbered for use with a num pad, as follows:")
  404. print("|7|8|9|")
  405. print("|4|5|6|")
  406. print("|1|2|3|")
  407. starting_player = 'X'
  408. print(f"Player {starting_player} goes first.")
  409.  
  410.  
  411.  
  412. #show_board(board)
  413. rounds = 1000
  414. round = 0
  415. wins = 0
  416. losses = 0
  417. draws = 0
  418. for i in range(rounds):
  419.     board = np.full((GRID_SIZE, GRID_SIZE), EMPTY, dtype=str)
  420.     game_over = False
  421.     while not game_over:
  422.         legal_moves = get_legal_moves(board)
  423.         if PLAYER[round %2] == starting_player:
  424.             #move = opponent(legal_moves) #random opponent
  425.             state = tuple(np.array(board).flatten())
  426.             state_index = state_to_index(state)
  427.             best_action = None
  428.             best_q_value = float('-inf')  # Initialize with negative infinity
  429.  
  430.             # Iterate over legal moves and find the action with the highest Q-value
  431.             for action in legal_moves:
  432.                 # Convert (row, col) to 1D index for the Q-table
  433.                 action_index = action[0] * GRID_SIZE + action[1]
  434.                 q_value = q_table[state_index][action_index]
  435.                 if q_value > best_q_value:
  436.                     best_q_value = q_value
  437.                     best_action = action
  438.             move = action
  439.             #print(move)
  440.         else:
  441.             move = opponent(legal_moves) #random opponent
  442.             #move = get_human_move(PLAYER[round %2])
  443.             #print(move)
  444.         board = make_move(move, board, PLAYER[round %2])
  445.         show_board(board)
  446.         game_over = check_win(board)
  447.         round += 1
  448.  
  449.     if game_over[1] == 'Draw':
  450.         draws +=1
  451.         print("It's a draw!")
  452.     else:
  453.         print(f"Player {game_over[1]} wins!")
  454.         if game_over[1] == 'X':
  455.             wins +=1
  456.         else:
  457.             losses +=1
  458.  
  459.  
  460. # Create a DataFrame
  461. df = pd.DataFrame({
  462.     'Result': ['Wins', 'Losses', 'Draws'],
  463.     'Count': [wins, losses, draws]
  464. })
  465.  
  466. # Print the DataFrame
  467. print(df.to_string(index=False))