#!/usr/bin/env python3
import os
import sys
import random
import numpy as np
from tqdm import tqdm

verbose = False

def newBoard():
    return np.zeros((3, 3), np.uint8)

# helper functions
def getState(board):
    # encode the 3x3 board as a base-3 integer in [0, 3**9)
    board = board.reshape(9)
    ret = 0
    for i in range(9):
        ret *= 3
        ret += board[i]
    return ret
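# illustrative examples of the encoding: an empty board maps to state 0, and a
# board whose only nonzero cell is an x at [0, 0] maps to 3**8 == 6561, since
# the top-left square is the most significant base-3 digit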
# value[state, action] = p(x winning | state, action), initialized to 0.5
value = np.zeros((3**9, 9))
value += 0.5
if os.path.isfile("values.npy"):
    value = np.load("values.npy")
def isGameOver(board):
    # returns 1 or 2 if that player has won, -1 for a draw, 0 if the game is still going
    for x in range(3):
        if np.all(board[x] == 1) or np.all(board[x] == 2):
            return board[x, 0]
        if np.all(board[:, x] == 1) or np.all(board[:, x] == 2):
            return board[0, x]
    d1 = np.array([board[0, 0], board[1, 1], board[2, 2]])
    d2 = np.array([board[0, 2], board[1, 1], board[2, 0]])
    if np.all(d1 == 1) or np.all(d1 == 2):
        return board[1, 1]
    if np.all(d2 == 1) or np.all(d2 == 2):
        return board[1, 1]
    if np.all(board != 0):
        return -1
    return 0
def sample(a, temperature=1.0):
    a = np.array(a)**(1/temperature)
    p_sum = a.sum()
    sample_temp = a/p_sum
    return np.argmax(np.random.multinomial(1, sample_temp, 1))

temp = 10.0
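# note on the temperature: a**(1/temperature) flattens the move distribution when
# temperature is large (more exploration) and sharpens it toward argmax as
# temperature approaches 0; train() below multiplies temp by 0.995 every 100 games,
# so the self-play policy gradually becomes greedier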
def makeMove(board, turn, argmax=False):
    state = getState(board)
    # remove illegal moves
    mask = np.zeros(9) + 1.0
    board9 = board.reshape(9)
    for j in range(9):
        if board9[j] != 0:
            mask[j] = 0
    value_norm = np.copy(value[state])
    # value is p(x wins), so player o (turn 2) prefers low values
    if turn == 2:
        value_norm = 1.0 - value_norm
    value_norm *= mask
    value_norm /= np.sum(value_norm)
    if argmax: #or random.randint(0,4) != 0:
        move_choice = np.argmax(value_norm)
    else:
        move_choice = sample(value_norm, temp)
    return move_choice
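# e.g. (illustrative) makeMove(newBoard(), 1, argmax=True) returns an index into
# the flattened board (0..8) for x's opening move; with argmax=False the move is
# sampled through sample() at the current temperature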
def agentPlay():
    board = newBoard()
    turn = 1
    moves = []
    while isGameOver(board) == 0:
        state = getState(board)
        move_choice = makeMove(board, turn)
        board.reshape(9)[move_choice] = turn
        moves.append((state, move_choice, turn))
        turn = 2 if turn == 1 else 1
    gg = isGameOver(board)
    adj = 0.01
    #for move in moves[::-1]:
    for move in [random.choice(moves)]:
        if gg == -1:
            # draw: nudge the estimate back toward 0.5
            if value[move[0], move[1]] > 0.5:
                value[move[0], move[1]] -= adj
            elif value[move[0], move[1]] < 0.5:
                value[move[0], move[1]] += adj
        if gg == 1:
            value[move[0], move[1]] += adj
        if gg == 2:
            value[move[0], move[1]] -= adj
        value[move[0], move[1]] = np.clip(value[move[0], move[1]], 0.0001, 0.9999)
        #adj *= 0.9
    return gg
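# the learning rule, as written above: play one self-play game, pick one random
# (state, action) pair from it, and move its value 0.01 toward the outcome
# (up if x won, down if o won, back toward 0.5 on a draw), clipped to
# (0.0001, 0.9999) so every legal move keeps a nonzero sampling probability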
def player():
    board = newBoard()
    while isGameOver(board) == 0:
        print(board)
        try:
            x, y = input("Move? ").split(",")
            x, y = int(x), int(y)
        except Exception:
            continue
        if board[y, x] == 0:
            board[y, x] = 1
        else:
            print("illegal move")
            continue
        if isGameOver(board) != 0:
            break
        # computer is o
        move_choice = makeMove(board, 2, argmax=True)
        board.reshape(9)[move_choice] = 2
    print(board)
    print("gg", isGameOver(board))
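# interactive play: run this script with the "play" argument; the human is x and
# enters moves as "x,y" (0-indexed column,row into the numpy board), and the
# computer replies as o using the greedy (argmax) policy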
test = [None] * (3**9)
possible_boards = []

def testValue(board, turn):
    global test, possible_boards
    state = getState(board)
    # memoize
    if test[state] is not None:
        return test[state]
    possible_boards.append(board)
    if isGameOver(board) != 0:
        # no more moves
        #print(board)
        gg = isGameOver(board)
        test[state] = gg
        return test[state]
    # moves
    next_turn = 2 if turn == 1 else 1
    possible = []
    for move_choice in range(9):
        # if legal move
        if board.reshape(9)[move_choice] == 0:
            tboard = board.copy()
            tboard.reshape(9)[move_choice] = turn
            possible.append(testValue(tboard, next_turn))
    if state == 0:
        print(possible)
    if turn == 1:
        if 1 in possible:
            test[state] = 1
        elif -1 in possible:
            test[state] = -1
        else:
            test[state] = 2
    if turn == 2:
        if 2 in possible:
            test[state] = 2
        elif -1 in possible:
            test[state] = -1
        else:
            test[state] = 1
    return test[state]
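# testValue is a memoized minimax over the full game tree: test[state] ends up as
# the result of the game under perfect play from that position (1 = x wins,
# 2 = o wins, -1 = draw), with each player preferring a win, then a draw, then a loss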
def pval(xx):
    ret = []
    for x in xx:
        ret.append("%.2f" % x)
    return ' '.join(ret)
def runTest():
    # roll out the greedy policy from every reachable position and compare the
    # result against the minimax outcome computed by testValue
    if test[0] is None:
        testValue(newBoard(), 1)
    wrong = 0
    for nn, board in tqdm(enumerate(possible_boards)):
        tboard = np.copy(board)
        #print(getState(tboard))
        # x is to move when both players have equal piece counts, otherwise o
        turn = 2
        if np.sum(tboard == 1) == np.sum(tboard == 2):
            turn = 1
        while isGameOver(tboard) == 0:
            move_choice = makeMove(tboard, turn, argmax=True)
            tboard.reshape(9)[move_choice] = turn
            turn = 2 if turn == 1 else 1
        if test[getState(board)] != isGameOver(tboard):
            wrong += 1
            if verbose:
                print("BAD STATE at", nn)
                print(board)
                print(getState(board))
                print(value[getState(board)])
                print("it should be:", test[getState(board)])
                print(tboard)
                print("it is w argmax policy:", isGameOver(tboard))
    print("wrong: %d/%d" % (wrong, len(possible_boards)))
def train():
    global temp
    try:
        games = []
        while 1:
            games.append(agentPlay())
            tg = games[-1000:]
            if len(games) % 100 == 0:
                # log the draw rate over the last 1000 games and the values of a probe state
                state = getState(np.array([[1, 2, 0], [0, 0, 0], [0, 0, 1]]))
                print("running: %d/%d played %d with temp %f" % (np.sum(np.array(tg) == -1), len(tg), len(games), temp), pval(value[state]))
                temp *= 0.995
            if len(games) % 10000 == 0:
                runTest()
    except KeyboardInterrupt:
        print("saving")
        np.save("values.npy", value)
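# command-line interface: run this script with "play" to play against the learned
# policy, with "test" to check the greedy policy against minimax (verbose), and
# with no argument to train by self-play until interrupted with Ctrl-C, which
# saves the value table to values.npy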
if len(sys.argv) > 1 and sys.argv[1] == "play":
    player()
elif len(sys.argv) > 1 and sys.argv[1] == "test":
    verbose = True
    runTest()
else:
    train()