#!/usr/bin/env python3
import os
import sys
import random
import numpy as np
from tqdm import tqdm

verbose = False

def newBoard():
    return np.zeros((3, 3), np.uint8)

# helper functions
def getState(board):
    # encode the 3x3 board as a base-3 integer in [0, 3**9)
    board = board.reshape(9)
    ret = 0
    for i in range(9):
        ret *= 3
        ret += board[i]
    return ret
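# illustrative examples of the encoding: an empty board maps to state 0, and a
# board whose only nonzero cell is an x at [0, 0] maps to 3**8 == 6561, since
# the top-left square is the most significant base-3 digit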
# value[state, action] = p(x winning | state, action), initialized to 0.5
value = np.zeros((3**9, 9))
value += 0.5
if os.path.isfile("values.npy"):
    value = np.load("values.npy")
def isGameOver(board):
    # returns 1 or 2 if that player has won, -1 for a draw, 0 if the game is still going
    for x in range(3):
        if np.all(board[x] == 1) or np.all(board[x] == 2):
            return board[x, 0]
        if np.all(board[:, x] == 1) or np.all(board[:, x] == 2):
            return board[0, x]
    d1 = np.array([board[0, 0], board[1, 1], board[2, 2]])
    d2 = np.array([board[0, 2], board[1, 1], board[2, 0]])
    if np.all(d1 == 1) or np.all(d1 == 2):
        return board[1, 1]
    if np.all(d2 == 1) or np.all(d2 == 2):
        return board[1, 1]
    if np.all(board != 0):
        return -1
    return 0
def sample(a, temperature=1.0):
    a = np.array(a)**(1/temperature)
    p_sum = a.sum()
    sample_temp = a/p_sum
    return np.argmax(np.random.multinomial(1, sample_temp, 1))

temp = 10.0
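# note on the temperature: a**(1/temperature) flattens the move distribution when
# temperature is large (more exploration) and sharpens it toward argmax as
# temperature approaches 0; train() below multiplies temp by 0.995 every 100 games,
# so the self-play policy gradually becomes greedier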
def makeMove(board, turn, argmax=False):
    state = getState(board)
    # remove illegal moves
    mask = np.zeros(9) + 1.0
    board9 = board.reshape(9)
    for j in range(9):
        if board9[j] != 0:
            mask[j] = 0
    value_norm = np.copy(value[state])
    # value is p(x wins), so player o (turn 2) prefers low values
    if turn == 2:
        value_norm = 1.0 - value_norm
    value_norm *= mask
    value_norm /= np.sum(value_norm)
    if argmax: #or random.randint(0,4) != 0:
        move_choice = np.argmax(value_norm)
    else:
        move_choice = sample(value_norm, temp)
    return move_choice
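# e.g. (illustrative) makeMove(newBoard(), 1, argmax=True) returns an index into
# the flattened board (0..8) for x's opening move; with argmax=False the move is
# sampled through sample() at the current temperature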
def agentPlay():
    board = newBoard()
    turn = 1
    moves = []
    while isGameOver(board) == 0:
        state = getState(board)
        move_choice = makeMove(board, turn)
        board.reshape(9)[move_choice] = turn
        moves.append((state, move_choice, turn))
        turn = 2 if turn == 1 else 1
    gg = isGameOver(board)
    adj = 0.01
    #for move in moves[::-1]:
    for move in [random.choice(moves)]:
        if gg == -1:
            # draw: nudge the estimate back toward 0.5
            if value[move[0], move[1]] > 0.5:
                value[move[0], move[1]] -= adj
            elif value[move[0], move[1]] < 0.5:
                value[move[0], move[1]] += adj
        if gg == 1:
            value[move[0], move[1]] += adj
        if gg == 2:
            value[move[0], move[1]] -= adj
        value[move[0], move[1]] = np.clip(value[move[0], move[1]], 0.0001, 0.9999)
        #adj *= 0.9
    return gg
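# the learning rule, as written above: play one self-play game, pick one random
# (state, action) pair from it, and move its value 0.01 toward the outcome
# (up if x won, down if o won, back toward 0.5 on a draw), clipped to
# (0.0001, 0.9999) so every legal move keeps a nonzero sampling probability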
def player():
    board = newBoard()
    while isGameOver(board) == 0:
        print(board)
        try:
            x, y = input("Move? ").split(",")
            x, y = int(x), int(y)
        except Exception:
            continue
        if board[y, x] == 0:
            board[y, x] = 1
        else:
            print("illegal move")
            continue
        if isGameOver(board) != 0:
            break
        # computer is o
        move_choice = makeMove(board, 2, argmax=True)
        board.reshape(9)[move_choice] = 2
    print(board)
    print("gg", isGameOver(board))
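# interactive play: run this script with the "play" argument; the human is x and
# enters moves as "x,y" (0-indexed column,row into the numpy board), and the
# computer replies as o using the greedy (argmax) policy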
test = [None] * (3**9)
possible_boards = []

def testValue(board, turn):
    global test, possible_boards
    state = getState(board)
    # memoize
    if test[state] is not None:
        return test[state]
    possible_boards.append(board)
    if isGameOver(board) != 0:
        # no more moves
        #print(board)
        gg = isGameOver(board)
        test[state] = gg
        return test[state]
    # moves
    next_turn = 2 if turn == 1 else 1
    possible = []
    for move_choice in range(9):
        # if legal move
        if board.reshape(9)[move_choice] == 0:
            tboard = board.copy()
            tboard.reshape(9)[move_choice] = turn
            possible.append(testValue(tboard, next_turn))
    if state == 0:
        print(possible)
    if turn == 1:
        if 1 in possible:
            test[state] = 1
        elif -1 in possible:
            test[state] = -1
        else:
            test[state] = 2
    if turn == 2:
        if 2 in possible:
            test[state] = 2
        elif -1 in possible:
            test[state] = -1
        else:
            test[state] = 1
    return test[state]
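# testValue is a memoized minimax over the full game tree: test[state] ends up as
# the result of the game under perfect play from that position (1 = x wins,
# 2 = o wins, -1 = draw), with each player preferring a win, then a draw, then a loss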
def pval(xx):
    ret = []
    for x in xx:
        ret.append("%.2f" % x)
    return ' '.join(ret)
def runTest():
    # roll out the greedy policy from every reachable position and compare the
    # result against the minimax outcome computed by testValue
    if test[0] is None:
        testValue(newBoard(), 1)
    wrong = 0
    for nn, board in tqdm(enumerate(possible_boards)):
        tboard = np.copy(board)
        #print(getState(tboard))
        # x is to move when both players have equal piece counts, otherwise o
        turn = 2
        if np.sum(tboard == 1) == np.sum(tboard == 2):
            turn = 1
        while isGameOver(tboard) == 0:
            move_choice = makeMove(tboard, turn, argmax=True)
            tboard.reshape(9)[move_choice] = turn
            turn = 2 if turn == 1 else 1
        if test[getState(board)] != isGameOver(tboard):
            wrong += 1
            if verbose:
                print("BAD STATE at", nn)
                print(board)
                print(getState(board))
                print(value[getState(board)])
                print("it should be:", test[getState(board)])
                print(tboard)
                print("it is w argmax policy:", isGameOver(tboard))
    print("wrong: %d/%d" % (wrong, len(possible_boards)))
def train():
    global temp
    try:
        games = []
        while 1:
            games.append(agentPlay())
            tg = games[-1000:]
            if len(games) % 100 == 0:
                # log the draw rate over the last 1000 games and the values of a probe state
                state = getState(np.array([[1, 2, 0], [0, 0, 0], [0, 0, 1]]))
                print("running: %d/%d played %d with temp %f" % (np.sum(np.array(tg) == -1), len(tg), len(games), temp), pval(value[state]))
                temp *= 0.995
            if len(games) % 10000 == 0:
                runTest()
    except KeyboardInterrupt:
        print("saving")
        np.save("values.npy", value)
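# command-line interface: run this script with "play" to play against the learned
# policy, with "test" to check the greedy policy against minimax (verbose), and
# with no argument to train by self-play until interrupted with Ctrl-C, which
# saves the value table to values.npy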
if len(sys.argv) > 1 and sys.argv[1] == "play":
    player()
elif len(sys.argv) > 1 and sys.argv[1] == "test":
    verbose = True
    runTest()
else:
    train()