import random
from fractions import Fraction
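
# This script models a simplified game of blackjack as a Markov Decision
# Process, solves it exactly with value iteration, and then simulates 1000
# hands under the resulting greedy policy.
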
# An abstract class representing a Markov Decision Process (MDP).
class MDP:
    def __init__(self):
        self.computeStates()

    # discount factor
    discountFactor = 0.9

    # Return the start state.
    def startState(self): raise NotImplementedError("Override me")

    # Return set of actions possible from |state|.
    def actions(self, state): raise NotImplementedError("Override me")

    # Return a list of (newState, prob, reward) tuples corresponding to edges
    # coming out of |state|.
    # Mapping to notation from class:
    #   state = s, action = a, newState = s', reward = r, prob = p(s', r | s, a)
    # If state is a terminal state, return the empty list.
    def succAndProbReward(self, state, action): raise NotImplementedError("Override me")

    # Compute set of states reachable from startState.  Helper function
    # to know which states to compute values and policies for.
    # This function sets |self.states| to be the set of all states.
    def computeStates(self):
        self.states = set()
        queue = []
        self.states.add(self.startState())
        queue.append(self.startState())
        while len(queue) > 0:
            state = queue.pop()
            for action in self.actions(state):
                for newState, prob, reward in self.succAndProbReward(state, action):
                    if newState not in self.states:
                        self.states.add(newState)
                        queue.append(newState)

        print("%d reachable states" % len(self.states))

        #for state in self.states:
        #    print(state, "\n")
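
# Note: computeStates uses the worklist as a stack (queue.pop() removes the
# last element), so the traversal is depth-first; any exhaustive order works
# here because only the reachable set is needed.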

class Participant:

    def __init__(self, hand=0, ace=False):
        self.hand = hand
        self.hasPlayableAce = ace
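
# A Participant is just a hand total plus a flag recording whether an ace is
# currently counted as 11 (a "soft"/playable ace that can drop to 1 on a bust).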

class BlackjackMDP(MDP):

    # Blackjack hands are short, episodic games, so future rewards are left
    # undiscounted.
    discountFactor = 1
    # (value, probability) pairs for a single draw from an infinite deck.
    cards = [(2, Fraction(1, 13)), (3, Fraction(1, 13)), (4, Fraction(1, 13)), (5, Fraction(1, 13)),
             (6, Fraction(1, 13)), (7, Fraction(1, 13)), (8, Fraction(1, 13)), (9, Fraction(1, 13)),
             (10, Fraction(4, 13)), (11, Fraction(1, 13))]
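    # The ten entries cover the values 2-9 (1/13 each), the merged ten-valued
    # cards 10/J/Q/K (hence Fraction(4, 13)), and the ace counted as 11 (1/13):
    # 9 * 1/13 + 4/13 = 13/13 = 1, so each draw is a complete distribution.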
    # Return the start state.
    def startState(self):
        player = Participant()
        dealer = Participant()
        # A state is (phase, playerHand, playerHasPlayableAce, dealerHand,
        # dealerHasPlayableAce), where phase is one of 'init', 'player',
        # 'dealer', or 'terminal'.
        return ('init', player.hand, player.hasPlayableAce, dealer.hand, dealer.hasPlayableAce)

    # Return set of actions possible from |state|.
    def actions(self, state):
        if state[0] == 'init':
            return ['Init']
        elif state[0] == 'player':
            return ['Hit', 'Stand', 'Double down']
        elif state[0] == 'dealer':
            return ['Noop']
        else:
            return []

    # Compare the final hands and return the reward from the player's perspective.
    def rewardCalc(self, player, dealer):
        if player > 21:
            reward = -1
        elif dealer > 21:
            reward = 1
        elif player > dealer:
            reward = 1
        elif dealer > player:
            reward = -1
        else:
            reward = 0
        return reward

    # An ace is dealt with value 11 and may later be demoted to 1.
    def aceCheck(self, card):
        return card == 11
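
    # For example, rewardCalc(20, 19) == 1 (player wins), rewardCalc(22, 18) == -1
    # (a busted player loses regardless of the dealer's hand), and
    # rewardCalc(18, 18) == 0 (push).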
    # Return a list of (newState, prob, reward) tuples corresponding to edges
    # coming out of |state|.
    # Mapping to notation from class:
    #   state = s, action = a, newState = s', reward = r, prob = p(s', r | s, a)
    # If state is a terminal state, return the empty list.
    def succAndProbReward(self, state, action):
        newstates = []  # all possible successors under this action
        # Player and dealer state variables
        player = Participant(state[1], state[2])
        dealer = Participant(state[3], state[4])
        if state[0] == 'init':
            # Deal two cards to the player and one up-card to the dealer.  The
            # draws are independent, so their probabilities multiply.
            for card1 in self.cards:
                for card2 in self.cards:
                    for dcard in self.cards:
                        if self.aceCheck(card1[0]) and self.aceCheck(card2[0]):
                            # Two aces: one counts as 1, the other stays playable.
                            tempPlayer = Participant(12, True)
                        else:
                            ace = self.aceCheck(card1[0]) or self.aceCheck(card2[0])
                            tempPlayer = Participant(card1[0] + card2[0], ace)

                        tempDealer = Participant(dcard[0], self.aceCheck(dcard[0]))
                        newstates.append(
                            (('player', tempPlayer.hand, tempPlayer.hasPlayableAce, tempDealer.hand, tempDealer.hasPlayableAce),
                             card1[1] * card2[1] * dcard[1], 0)
                        )
        elif state[0] == 'player':
            if action == 'Stand':
                newstates.append(
                    (('dealer', player.hand, player.hasPlayableAce, dealer.hand, dealer.hasPlayableAce), 1, 0)
                )
            elif action in ('Hit', 'Double down'):
                # Doubling down takes exactly one more card, hands control to the
                # dealer, and doubles the stake on an immediate bust.  Note that
                # this state representation does not carry the doubled stake into
                # the dealer phase, so wins and losses resolved there are not doubled.
                multiplier = 2 if action == 'Double down' else 1
                nextPhase = 'dealer' if action == 'Double down' else 'player'
                for card in self.cards:
                    tmpPlayer = Participant(player.hand + card[0],
                                            player.hasPlayableAce or self.aceCheck(card[0]))
                    if tmpPlayer.hand > 21 and tmpPlayer.hasPlayableAce:
                        # Demote the playable ace from 11 to 1 (only one playable
                        # ace is tracked, a simplification).
                        tmpPlayer.hand -= 10
                        tmpPlayer.hasPlayableAce = False
                    if tmpPlayer.hand > 21:
                        reward = self.rewardCalc(tmpPlayer.hand, dealer.hand) * multiplier
                        newstates.append(
                            (('terminal', tmpPlayer.hand, False, dealer.hand, False), card[1], reward)
                        )
                    else:
                        newstates.append(
                            ((nextPhase, tmpPlayer.hand, tmpPlayer.hasPlayableAce, dealer.hand, dealer.hasPlayableAce), card[1], 0)
                        )
        elif state[0] == 'dealer':
            if dealer.hand < 17:
                # The dealer must hit below 17.
                for card in self.cards:
                    tmpDealer = Participant(dealer.hand + card[0],
                                            dealer.hasPlayableAce or self.aceCheck(card[0]))
                    if tmpDealer.hand > 21 and tmpDealer.hasPlayableAce:
                        tmpDealer.hand -= 10
                        tmpDealer.hasPlayableAce = False
                    if tmpDealer.hand > 21:
                        reward = self.rewardCalc(player.hand, tmpDealer.hand)
                        newstates.append(
                            (('terminal', player.hand, False, tmpDealer.hand, False), card[1], reward)
                        )
                    else:
                        newstates.append(
                            (('dealer', player.hand, player.hasPlayableAce, tmpDealer.hand, tmpDealer.hasPlayableAce), card[1], 0)
                        )
            else:
                # The dealer stands on 17 or more; the hand is resolved.
                reward = self.rewardCalc(player.hand, dealer.hand)
                newstates.append(
                    (('terminal', player.hand, False, dealer.hand, False), 1, reward)
                )
        else:
            return []

        return newstates
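
# A small optional sanity check: from every non-terminal state, the outgoing
# probabilities of each action should sum to exactly 1 (they are Fractions,
# so the comparison is exact).  A debugging aid, not part of the solver.
def checkTransitionProbabilities(mdp):
    for s in mdp.states:
        for a in mdp.actions(s):
            total = sum(prob for _, prob, _ in mdp.succAndProbReward(s, a))
            assert total == 1, (s, a, total)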

def value_iteration(mdp):
    # Standard value iteration: sweep over all states, applying the Bellman
    # optimality backup until the largest change in a sweep drops below |margin|.
    margin = 0.000001
    V = {state: 0 for state in mdp.states}

    # Expected return of every action available in |s| under the current V.
    def actionValues(s):
        Q = {}
        for a in mdp.actions(s):
            Q[a] = sum(prob * (reward + mdp.discountFactor * V[newState])
                       for newState, prob, reward in mdp.succAndProbReward(s, a))
        return Q

    delta = margin
    while delta >= margin:
        delta = 0
        for s in mdp.states:
            Q = actionValues(s)
            if Q:  # terminal states have no actions; their value stays 0
                v = V[s]
                V[s] = max(Q.values())
                delta = max(delta, abs(v - V[s]))

    # Extract the greedy policy once the values have converged.
    P = {}
    for s in mdp.states:
        Q = actionValues(s)
        if Q:
            P[s] = max(Q, key=Q.get)
    return P, V
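
# The backup above is the Bellman optimality update
#   V(s) <- max_a sum_{s', r} p(s', r | s, a) * (r + discountFactor * V(s')),
# and P is the greedy policy with respect to the converged V.  For instance,
# the action chosen for a hard 16 against a dealer ten up-card (reachable via,
# e.g., 9+7) can be read off as P[('player', 16, False, 10, False)].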

mdp = BlackjackMDP()
P, V = value_iteration(mdp)

# Enumerate the possible opening deals once, then sample hands from them.
startstates = []
startprobs = []
for state, prob, reward in mdp.succAndProbReward(mdp.startState(), 'Init'):
    startstates.append(state)
    startprobs.append(prob)

# Play 1000 hands under the greedy policy P, sampling every transition
# according to its probability; the terminal transition carries the reward.
totalreward = 0
for _ in range(1000):
    st = random.choices(startstates, weights=startprobs)[0]
    reward = 0
    while st[0] != 'terminal':
        transitions = mdp.succAndProbReward(st, P[st])
        probs = [prob for _, prob, _ in transitions]
        st, _, reward = random.choices(transitions, weights=probs)[0]
    totalreward += reward

# With +1/-1 rewards, wins - losses == totalreward, so the number of wins is
# (games + totalreward) / 2.
print("Win percentage after 1000 games: {0}%".format((1000 + totalreward) / 2 / 1000 * 100))
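
# Note: this estimate treats a push as half a win and a double-down result
# (+2/-2) as two wins or losses, so it is a rough proxy for the true win rate
# rather than an exact percentage.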