• API
• FAQ
• Tools
• Archive
SHARE
TWEET

# Untitled

a guest Oct 15th, 2019 87 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
1. import random
2. import numpy as np
3. from fractions import Fraction
# An abstract class representing a Markov Decision Process (MDP).
class MDP:
    def __init__(self):
        self.computeStates()

    # Discount factor (gamma) applied to future rewards.
    discountFactor = 0.9

    # Return the start state.
    def startState(self): raise NotImplementedError("Override me")

    # Return set of actions possible from |state|.
    def actions(self, state): raise NotImplementedError("Override me")

    # Return a list of (newState, prob, reward) tuples corresponding to edges
    # coming out of |state|.
    # Mapping to notation from class:
    #   state = s, action = a, newState = s', reward = r, prob = p(s', r | s, a)
    # If state is a terminal state, return the empty list.
    def succAndProbReward(self, state, action): raise NotImplementedError("Override me")

    # Compute the set of states reachable from startState.  Helper function
    # to know which states to compute values and policies for.
    # This function sets |self.states| to be the set of all states.
    def computeStates(self):
        self.states = set()
        queue = []
        # BUG FIX: discovered states must be recorded in self.states as they
        # are found; otherwise `newState not in self.states` is always true
        # and the search never terminates (and self.states stays empty).
        start = self.startState()
        self.states.add(start)
        queue.append(start)
        while queue:
            state = queue.pop()
            for action in self.actions(state):
                for newState, prob, reward in self.succAndProbReward(state, action):
                    if newState not in self.states:
                        self.states.add(newState)
                        queue.append(newState)

        print("%d reachable states" % len(self.states))
45.
class Participant():
    """One blackjack participant: a hand total plus a usable-ace flag."""

    def __init__(self, hand=0, ace=False):
        # Current hand value, and whether an ace is currently counted as 11.
        self.hand, self.hasPlayableAce = hand, ace
52.
class BlackjackMDP(MDP):
    """Blackjack formulated as an MDP.

    A state is the tuple
        (phase, playerHand, playerHasPlayableAce, dealerHand, dealerHasPlayableAce)
    where phase is one of 'init', 'player', 'dealer', 'terminal'.
    """

    # The discount factor for future rewards.
    discountFactor = 0.9
    # (value, probability) of one card draw; 10, J, Q, K all count as 10,
    # an ace is initially counted as 11.
    cards = [(2, Fraction(1, 13)), (3, Fraction(1, 13)), (4, Fraction(1, 13)), (5, Fraction(1, 13)),
            (6, Fraction(1, 13)), (7, Fraction(1, 13)), (8, Fraction(1, 13)), (9, Fraction(1, 13)),
            (10, Fraction(4, 13)), (11, Fraction(1, 13))]

    # Return the start state (nothing dealt yet).
    def startState(self):
        player = Participant()
        dealer = Participant()
        return ('init', player.hand, player.hasPlayableAce, dealer.hand, dealer.hasPlayableAce)

    # Return set of actions possible from |state|.
    def actions(self, state):
        if state[0] == 'init':
            return ['Init']
        elif state[0] == 'player':
            return ['Hit', 'Stand', 'Double down']
        elif state[0] == 'dealer':
            return ['Noop']
        else:
            return []

    # Reward from the player's perspective: +1 win, -1 loss, 0 push.
    def rewardCalc(self, player, dealer):
        if player > 21:
            reward = -1
        elif dealer > 21:
            reward = 1
        elif player > dealer:
            reward = 1
        elif dealer > player:
            reward = -1
        else:
            reward = 0
        return reward

    # True iff |card| is an ace (dealt counting as 11).
    def aceCheck(self, card):
        return card == 11

    # Return a list of (newState, prob, reward) tuples corresponding to edges
    # coming out of |state|.
    # Mapping to notation from class:
    #   state = s, action = a, newState = s', reward = r, prob = p(s', r | s, a)
    # If state is a terminal state, return the empty list.
    def succAndProbReward(self, state, action):
        newstates = []  # all (newState, prob, reward) successors for this action
        # Player and dealer as seen in |state|.
        player = Participant(state[1], state[2])
        dealer = Participant(state[3], state[4])
        if state[0] == 'init':
            # Deal two player cards and one dealer up-card.
            for card1 in self.cards:
                for card2 in self.cards:
                    for dcard in self.cards:
                        if self.aceCheck(card1[0]) and self.aceCheck(card2[0]):
                            # Two aces: count one as 1 (11 + 1 = 12), keep one playable.
                            tempPlayer = Participant(12, True)
                        else:
                            ace = (self.aceCheck(card1[0]) or self.aceCheck(card2[0]))
                            tempPlayer = Participant(card1[0] + card2[0], ace)

                        tempDealer = Participant(dcard[0], self.aceCheck(dcard[0]))
                        # BUG FIX: the three draws are independent, so their
                        # probabilities multiply; the original added them, so
                        # the successor probabilities did not sum to 1.
                        prob = card1[1] * card2[1] * dcard[1]
                        newstates.append(
                            (('player', tempPlayer.hand, tempPlayer.hasPlayableAce, tempDealer.hand, tempDealer.hasPlayableAce), prob, 0)
                        )
        elif state[0] == 'player':
            if action == 'Stand':
                # Deterministic hand-off to the dealer phase; no reward yet.
                newstates.append(
                    (('dealer', player.hand, player.hasPlayableAce, dealer.hand, dealer.hasPlayableAce), 1, 0)
                )
            elif action == 'Hit':
                for card in self.cards:
                    tmpPlayer = Participant(player.hand + card[0])
                    if self.aceCheck(card[0]) or player.hasPlayableAce:
                        tmpPlayer.hasPlayableAce = True
                    if tmpPlayer.hand > 21:
                        if tmpPlayer.hasPlayableAce:
                            # Demote a playable ace from 11 to 1 instead of busting.
                            tmpPlayer.hand -= 10
                            tmpPlayer.hasPlayableAce = False
                        else:
                            # Bust: terminal, reward settled here.
                            reward = self.rewardCalc(tmpPlayer.hand, dealer.hand)
                            newstates.append(
                                (('terminal', tmpPlayer.hand, False, dealer.hand, False), card[1], reward)
                            )
                    if tmpPlayer.hand <= 21:
                        # BUG FIX: nonterminal edges carry reward 0; the original
                        # reused the `reward` variable left over from an earlier
                        # bust card in this same loop.
                        newstates.append(
                            (('player', tmpPlayer.hand, tmpPlayer.hasPlayableAce, dealer.hand, dealer.hasPlayableAce), card[1], 0)
                        )
            elif action == 'Double down':
                # One card at doubled stakes, then play passes to the dealer.
                for card in self.cards:
                    tmpPlayer = Participant(player.hand + card[0])
                    if self.aceCheck(card[0]) or player.hasPlayableAce:
                        tmpPlayer.hasPlayableAce = True
                    if tmpPlayer.hand > 21:
                        if tmpPlayer.hasPlayableAce:
                            # Demote a playable ace from 11 to 1 instead of busting.
                            tmpPlayer.hand -= 10
                            tmpPlayer.hasPlayableAce = False
                        else:
                            # Bust while doubled: terminal, twice the usual reward.
                            reward = self.rewardCalc(tmpPlayer.hand, dealer.hand) * 2
                            newstates.append(
                                (('terminal', tmpPlayer.hand, False, dealer.hand, False), card[1], reward)
                            )
                    if tmpPlayer.hand <= 21:
                        # BUG FIX: nonterminal edges carry reward 0 (the original
                        # could leak a stale bust reward, doubled, onto this edge).
                        # NOTE(review): the state does not record that the stakes
                        # were doubled, so rewards settled later in the 'dealer'
                        # phase are NOT doubled — confirm whether that is intended.
                        newstates.append(
                            (('dealer', tmpPlayer.hand, tmpPlayer.hasPlayableAce, dealer.hand, dealer.hasPlayableAce), card[1], 0)
                        )
        elif state[0] == 'dealer':
            if dealer.hand < 17:
                # Dealer must draw below 17 (the action is ignored here).
                for card in self.cards:
                    tmpDealer = Participant(dealer.hand + card[0])
                    tmpDealer.hasPlayableAce = dealer.hasPlayableAce or self.aceCheck(card[0])
                    if tmpDealer.hand > 21:
                        if tmpDealer.hasPlayableAce:
                            # Demote a playable ace from 11 to 1 instead of busting.
                            tmpDealer.hand -= 10
                            tmpDealer.hasPlayableAce = False
                        else:
                            # Dealer bust: terminal, reward settled here.
                            reward = self.rewardCalc(player.hand, tmpDealer.hand)
                            newstates.append(
                                (('terminal', player.hand, False, tmpDealer.hand, False), card[1], reward)
                            )
                    if tmpDealer.hand <= 21:
                        # BUG FIX: nonterminal edges carry reward 0 (stale-reward
                        # leak in the original, as in the player branches).
                        newstates.append(
                            (('dealer', player.hand, player.hasPlayableAce, tmpDealer.hand, tmpDealer.hasPlayableAce), card[1], 0)
                        )
            else:
                # Dealer stands on 17+: compare hands and settle the game.
                reward = self.rewardCalc(player.hand, dealer.hand)
                newstates.append(
                    (('terminal', player.hand, False, dealer.hand, False), 1, reward)
                )
        else:
            # 'terminal' (or unknown) phase: no outgoing edges.
            return []

        return newstates
192.
def _action_values(mdp, V, state):
    """Return Q(state, a) under value function V for the four blackjack actions."""
    q = {'Hit': 0, 'Stand': 0, 'Double down': 0}
    # 'Noop' is only meaningful from dealer states; elsewhere make it unpickable.
    q['Noop'] = 0 if state[0] == 'dealer' else float('-inf')
    for a in mdp.actions(state):
        if a not in q:
            continue  # e.g. 'Init' contributed to no tracked action in the original
        for newState, prob, reward in mdp.succAndProbReward(state, a):
            q[a] += prob * (reward + mdp.discountFactor * V[newState])
    return q


def value_iteration(mdp):
    """Solve |mdp| by value iteration.

    Returns (P, V): P maps each state to the greedy action name
    ('Hit'/'Stand'/'Double down'/'Noop'), V maps each state to its value.
    Iterates until the largest per-state change is below a small margin.
    """
    margin = 0.000001
    V = {state: 0 for state in mdp.states}
    delta = 1
    while delta >= margin:
        delta = 0
        for s in mdp.states:
            v = V[s]
            q = _action_values(mdp, V, s)
            V[s] = max(q.values())
            delta = max(delta, abs(v - V[s]))

    # Extract the greedy policy once after convergence.  (The original
    # recomputed P inside every sweep — wasted work with an identical
    # final result.)  Tie-break order preserved: Hit, Stand, Double down, Noop.
    P = {}
    for state in mdp.states:
        q = _action_values(mdp, V, state)
        hit, stand, dd, noop = q['Hit'], q['Stand'], q['Double down'], q['Noop']
        if hit >= stand and hit >= noop and hit >= dd:
            best = 'Hit'
        elif stand >= hit and stand >= noop and stand >= dd:
            best = 'Stand'
        # BUG FIX: the original's Double-down branch tested `stand >= noop`
        # instead of `dd >= noop`.
        elif dd >= hit and dd >= stand and dd >= noop:
            best = 'Double down'
        else:
            best = 'Noop'
        P[state] = best
    return P, V
253.
# --- Driver script ---------------------------------------------------------
# Build the blackjack MDP (enumerates all reachable states on construction),
# solve it with value iteration, then Monte-Carlo-simulate 1000 games under
# the resulting greedy policy P.
mdp = BlackjackMDP()
startstates = []
startprobs = []
startrewards = []
P, V = value_iteration(mdp)

# Expand the 'init' pseudo-state once: each successor is a possible dealt
# starting hand together with its probability and (initial) reward.
for state, prob, reward in mdp.succAndProbReward(mdp.startState(), 'Init'):
    startstates.append(state)
    startprobs.append(prob)
    startrewards.append(reward)



totalreward = 0
for i in range(1000):
    # Sample a starting hand according to the deal probabilities.
    st = random.choices(startstates, weights=startprobs)[0]
    while st[0] != 'terminal':
        states = []
        probs = []
        rewards = []
        R = {}
        # Follow the greedy policy: enumerate the successors of (st, P[st])
        # and sample the next state by its transition probability.
        for state, prob, reward in mdp.succAndProbReward(st, P[st]):
            states.append(state)
            probs.append(prob)
            rewards.append(reward)
            R[state] = reward
        st = random.choices(states, weights=probs)[0]
    # Only the reward on the final transition into the terminal state is
    # counted; rewards on intermediate transitions are discarded.
    totalreward += R[st]
# NOTE(review): this formula treats totalreward as wins-minus-losses over
# rewards in {-1, 0, 1} and counts ties as half a win; 'Double down' rewards
# of +/-2 would skew the percentage — confirm this is the intended metric.
print("Win percentage after 1000 games: {0}%".format((1000/2 + totalreward) / 1000 * 100))
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy.

Top