import random
from fractions import Fraction
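
# This script models a simplified game of blackjack as a Markov Decision
# Process, solves it exactly with value iteration, and then simulates 1000
# hands under the resulting greedy policy.
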
# An abstract class representing a Markov Decision Process (MDP).
class MDP:
    def __init__(self):
        self.computeStates()

    # discount factor
    discountFactor = 0.9

    # Return the start state.
    def startState(self): raise NotImplementedError("Override me")

    # Return set of actions possible from |state|.
    def actions(self, state): raise NotImplementedError("Override me")

    # Return a list of (newState, prob, reward) tuples corresponding to edges
    # coming out of |state|.
    # Mapping to notation from class:
    #   state = s, action = a, newState = s', reward = r, prob = p(s', r | s, a)
    # If state is a terminal state, return the empty list.
    def succAndProbReward(self, state, action): raise NotImplementedError("Override me")

    # Compute set of states reachable from startState.  Helper function
    # to know which states to compute values and policies for.
    # This function sets |self.states| to be the set of all states.
    def computeStates(self):
        self.states = set()
        queue = []
        self.states.add(self.startState())
        queue.append(self.startState())
        while len(queue) > 0:
            state = queue.pop()
            for action in self.actions(state):
                for newState, prob, reward in self.succAndProbReward(state, action):
                    if newState not in self.states:
                        self.states.add(newState)
                        queue.append(newState)

        print("%d reachable states" % len(self.states))

        #for state in self.states:
        #    print(state, "\n")
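
# Note: computeStates uses the worklist as a stack (queue.pop() removes the
# last element), so the traversal is depth-first; any exhaustive order works
# here because only the reachable set is needed.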

class Participant:

    def __init__(self, hand=0, ace=False):
        self.hand = hand
        self.hasPlayableAce = ace
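
# A Participant is just a hand total plus a flag recording whether an ace is
# currently counted as 11 (a "soft"/playable ace that can drop to 1 on a bust).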

class BlackjackMDP(MDP):

    # Blackjack hands are short, episodic games, so future rewards are left
    # undiscounted.
    discountFactor = 1
    # (value, probability) pairs for a single draw from an infinite deck.
    cards = [(2, Fraction(1, 13)), (3, Fraction(1, 13)), (4, Fraction(1, 13)), (5, Fraction(1, 13)),
             (6, Fraction(1, 13)), (7, Fraction(1, 13)), (8, Fraction(1, 13)), (9, Fraction(1, 13)),
             (10, Fraction(4, 13)), (11, Fraction(1, 13))]
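    # The ten entries cover the values 2-9 (1/13 each), the merged ten-valued
    # cards 10/J/Q/K (hence Fraction(4, 13)), and the ace counted as 11 (1/13):
    # 9 * 1/13 + 4/13 = 13/13 = 1, so each draw is a complete distribution.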
    # Return the start state.
    def startState(self):
        player = Participant()
        dealer = Participant()
        # A state is (phase, playerHand, playerHasPlayableAce, dealerHand,
        # dealerHasPlayableAce), where phase is one of 'init', 'player',
        # 'dealer', or 'terminal'.
        return ('init', player.hand, player.hasPlayableAce, dealer.hand, dealer.hasPlayableAce)

    # Return set of actions possible from |state|.
    def actions(self, state):
        if state[0] == 'init':
            return ['Init']
        elif state[0] == 'player':
            return ['Hit', 'Stand', 'Double down']
        elif state[0] == 'dealer':
            return ['Noop']
        else:
            return []

    # Compare the final hands and return the reward from the player's perspective.
    def rewardCalc(self, player, dealer):
        if player > 21:
            reward = -1
        elif dealer > 21:
            reward = 1
        elif player > dealer:
            reward = 1
        elif dealer > player:
            reward = -1
        else:
            reward = 0
        return reward

    # An ace is dealt with value 11 and may later be demoted to 1.
    def aceCheck(self, card):
        return card == 11
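
    # For example, rewardCalc(20, 19) == 1 (player wins), rewardCalc(22, 18) == -1
    # (a busted player loses regardless of the dealer's hand), and
    # rewardCalc(18, 18) == 0 (push).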
    # Return a list of (newState, prob, reward) tuples corresponding to edges
    # coming out of |state|.
    # Mapping to notation from class:
    #   state = s, action = a, newState = s', reward = r, prob = p(s', r | s, a)
    # If state is a terminal state, return the empty list.
    def succAndProbReward(self, state, action):
        newstates = []  # all possible successors under this action
        # Player and dealer state variables
        player = Participant(state[1], state[2])
        dealer = Participant(state[3], state[4])
        if state[0] == 'init':
            # Deal two cards to the player and one up-card to the dealer.  The
            # draws are independent, so their probabilities multiply.
            for card1 in self.cards:
                for card2 in self.cards:
                    for dcard in self.cards:
                        if self.aceCheck(card1[0]) and self.aceCheck(card2[0]):
                            # Two aces: one counts as 1, the other stays playable.
                            tempPlayer = Participant(12, True)
                        else:
                            ace = self.aceCheck(card1[0]) or self.aceCheck(card2[0])
                            tempPlayer = Participant(card1[0] + card2[0], ace)

                        tempDealer = Participant(dcard[0], self.aceCheck(dcard[0]))
                        newstates.append(
                            (('player', tempPlayer.hand, tempPlayer.hasPlayableAce, tempDealer.hand, tempDealer.hasPlayableAce),
                             card1[1] * card2[1] * dcard[1], 0)
                        )
        elif state[0] == 'player':
            if action == 'Stand':
                newstates.append(
                    (('dealer', player.hand, player.hasPlayableAce, dealer.hand, dealer.hasPlayableAce), 1, 0)
                )
            elif action in ('Hit', 'Double down'):
                # Doubling down takes exactly one more card, hands control to the
                # dealer, and doubles the stake on an immediate bust.  Note that
                # this state representation does not carry the doubled stake into
                # the dealer phase, so wins and losses resolved there are not doubled.
                multiplier = 2 if action == 'Double down' else 1
                nextPhase = 'dealer' if action == 'Double down' else 'player'
                for card in self.cards:
                    tmpPlayer = Participant(player.hand + card[0],
                                            player.hasPlayableAce or self.aceCheck(card[0]))
                    if tmpPlayer.hand > 21 and tmpPlayer.hasPlayableAce:
                        # Demote the playable ace from 11 to 1 (only one playable
                        # ace is tracked, a simplification).
                        tmpPlayer.hand -= 10
                        tmpPlayer.hasPlayableAce = False
                    if tmpPlayer.hand > 21:
                        reward = self.rewardCalc(tmpPlayer.hand, dealer.hand) * multiplier
                        newstates.append(
                            (('terminal', tmpPlayer.hand, False, dealer.hand, False), card[1], reward)
                        )
                    else:
                        newstates.append(
                            ((nextPhase, tmpPlayer.hand, tmpPlayer.hasPlayableAce, dealer.hand, dealer.hasPlayableAce), card[1], 0)
                        )
        elif state[0] == 'dealer':
            if dealer.hand < 17:
                # The dealer must hit below 17.
                for card in self.cards:
                    tmpDealer = Participant(dealer.hand + card[0],
                                            dealer.hasPlayableAce or self.aceCheck(card[0]))
                    if tmpDealer.hand > 21 and tmpDealer.hasPlayableAce:
                        tmpDealer.hand -= 10
                        tmpDealer.hasPlayableAce = False
                    if tmpDealer.hand > 21:
                        reward = self.rewardCalc(player.hand, tmpDealer.hand)
                        newstates.append(
                            (('terminal', player.hand, False, tmpDealer.hand, False), card[1], reward)
                        )
                    else:
                        newstates.append(
                            (('dealer', player.hand, player.hasPlayableAce, tmpDealer.hand, tmpDealer.hasPlayableAce), card[1], 0)
                        )
            else:
                # The dealer stands on 17 or more; the hand is resolved.
                reward = self.rewardCalc(player.hand, dealer.hand)
                newstates.append(
                    (('terminal', player.hand, False, dealer.hand, False), 1, reward)
                )
        else:
            return []

        return newstates
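
# A small optional sanity check: from every non-terminal state, the outgoing
# probabilities of each action should sum to exactly 1 (they are Fractions,
# so the comparison is exact).  A debugging aid, not part of the solver.
def checkTransitionProbabilities(mdp):
    for s in mdp.states:
        for a in mdp.actions(s):
            total = sum(prob for _, prob, _ in mdp.succAndProbReward(s, a))
            assert total == 1, (s, a, total)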

def value_iteration(mdp):
    # Standard value iteration: sweep over all states, applying the Bellman
    # optimality backup until the largest change in a sweep drops below |margin|.
    margin = 0.000001
    V = {state: 0 for state in mdp.states}

    # Expected return of every action available in |s| under the current V.
    def actionValues(s):
        Q = {}
        for a in mdp.actions(s):
            Q[a] = sum(prob * (reward + mdp.discountFactor * V[newState])
                       for newState, prob, reward in mdp.succAndProbReward(s, a))
        return Q

    delta = margin
    while delta >= margin:
        delta = 0
        for s in mdp.states:
            Q = actionValues(s)
            if Q:  # terminal states have no actions; their value stays 0
                v = V[s]
                V[s] = max(Q.values())
                delta = max(delta, abs(v - V[s]))

    # Extract the greedy policy once the values have converged.
    P = {}
    for s in mdp.states:
        Q = actionValues(s)
        if Q:
            P[s] = max(Q, key=Q.get)
    return P, V
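
# The backup above is the Bellman optimality update
#   V(s) <- max_a sum_{s', r} p(s', r | s, a) * (r + discountFactor * V(s')),
# and P is the greedy policy with respect to the converged V.  For instance,
# the action chosen for a hard 16 against a dealer ten up-card (reachable via,
# e.g., 9+7) can be read off as P[('player', 16, False, 10, False)].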

mdp = BlackjackMDP()
P, V = value_iteration(mdp)

# Enumerate the possible opening deals once, then sample hands from them.
startstates = []
startprobs = []
for state, prob, reward in mdp.succAndProbReward(mdp.startState(), 'Init'):
    startstates.append(state)
    startprobs.append(prob)

# Play 1000 hands under the greedy policy P, sampling every transition
# according to its probability; the terminal transition carries the reward.
totalreward = 0
for _ in range(1000):
    st = random.choices(startstates, weights=startprobs)[0]
    reward = 0
    while st[0] != 'terminal':
        transitions = mdp.succAndProbReward(st, P[st])
        probs = [prob for _, prob, _ in transitions]
        st, _, reward = random.choices(transitions, weights=probs)[0]
    totalreward += reward

# With +1/-1 rewards, wins - losses == totalreward, so the number of wins is
# (games + totalreward) / 2.
print("Win percentage after 1000 games: {0}%".format((1000 + totalreward) / 2 / 1000 * 100))
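
# Note: this estimate treats a push as half a win and a double-down result
# (+2/-2) as two wins or losses, so it is a rough proxy for the true win rate
# rather than an exact percentage.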