• API
• FAQ
• Tools
• Archive
SHARE
TWEET

# Untitled

a guest Oct 15th, 2019 87 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
1. import random
2. import numpy as np
3. from fractions import Fraction
# An abstract class representing a Markov Decision Process (MDP).
class MDP:
    def __init__(self):
        self.computeStates()

    # Discount factor (gamma) applied to future rewards.
    discountFactor = 0.9

    # Return the start state.
    def startState(self): raise NotImplementedError("Override me")

    # Return set of actions possible from |state|.
    def actions(self, state): raise NotImplementedError("Override me")

    # Return a list of (newState, prob, reward) tuples corresponding to edges
    # coming out of |state|.
    # Mapping to notation from class:
    #   state = s, action = a, newState = s', reward = r, prob = p(s', r | s, a)
    # If state is a terminal state, return the empty list.
    def succAndProbReward(self, state, action): raise NotImplementedError("Override me")

    # Compute the set of states reachable from startState.  Helper function
    # to know which states to compute values and policies for.
    # This function sets |self.states| to be the set of all states.
    def computeStates(self):
        self.states = set()
        queue = []
        # BUG FIX: discovered states must be recorded in self.states as they
        # are found; otherwise `newState not in self.states` is always true
        # and the search never terminates (and self.states stays empty).
        start = self.startState()
        self.states.add(start)
        queue.append(start)
        while queue:
            state = queue.pop()
            for action in self.actions(state):
                for newState, prob, reward in self.succAndProbReward(state, action):
                    if newState not in self.states:
                        self.states.add(newState)
                        queue.append(newState)

        print("%d reachable states" % len(self.states))
45.
class Participant():
    """One blackjack participant: a hand total plus a usable-ace flag."""

    def __init__(self, hand=0, ace=False):
        # Current hand value, and whether an ace is currently counted as 11.
        self.hand, self.hasPlayableAce = hand, ace
52.
class BlackjackMDP(MDP):
    """Blackjack formulated as an MDP.

    A state is the tuple
        (phase, playerHand, playerHasPlayableAce, dealerHand, dealerHasPlayableAce)
    where phase is one of 'init', 'player', 'dealer', 'terminal'.
    """

    # The discount factor for future rewards.
    discountFactor = 0.9
    # (value, probability) of one card draw; 10, J, Q, K all count as 10,
    # an ace is initially counted as 11.
    cards = [(2, Fraction(1, 13)), (3, Fraction(1, 13)), (4, Fraction(1, 13)), (5, Fraction(1, 13)),
            (6, Fraction(1, 13)), (7, Fraction(1, 13)), (8, Fraction(1, 13)), (9, Fraction(1, 13)),
            (10, Fraction(4, 13)), (11, Fraction(1, 13))]

    # Return the start state (nothing dealt yet).
    def startState(self):
        player = Participant()
        dealer = Participant()
        return ('init', player.hand, player.hasPlayableAce, dealer.hand, dealer.hasPlayableAce)

    # Return set of actions possible from |state|.
    def actions(self, state):
        if state[0] == 'init':
            return ['Init']
        elif state[0] == 'player':
            return ['Hit', 'Stand', 'Double down']
        elif state[0] == 'dealer':
            return ['Noop']
        else:
            return []

    # Reward from the player's perspective: +1 win, -1 loss, 0 push.
    def rewardCalc(self, player, dealer):
        if player > 21:
            reward = -1
        elif dealer > 21:
            reward = 1
        elif player > dealer:
            reward = 1
        elif dealer > player:
            reward = -1
        else:
            reward = 0
        return reward

    # True iff |card| is an ace (dealt counting as 11).
    def aceCheck(self, card):
        return card == 11

    # Return a list of (newState, prob, reward) tuples corresponding to edges
    # coming out of |state|.
    # Mapping to notation from class:
    #   state = s, action = a, newState = s', reward = r, prob = p(s', r | s, a)
    # If state is a terminal state, return the empty list.
    def succAndProbReward(self, state, action):
        newstates = []  # all (newState, prob, reward) successors for this action
        # Player and dealer as seen in |state|.
        player = Participant(state[1], state[2])
        dealer = Participant(state[3], state[4])
        if state[0] == 'init':
            # Deal two player cards and one dealer up-card.
            for card1 in self.cards:
                for card2 in self.cards:
                    for dcard in self.cards:
                        if self.aceCheck(card1[0]) and self.aceCheck(card2[0]):
                            # Two aces: count one as 1 (11 + 1 = 12), keep one playable.
                            tempPlayer = Participant(12, True)
                        else:
                            ace = (self.aceCheck(card1[0]) or self.aceCheck(card2[0]))
                            tempPlayer = Participant(card1[0] + card2[0], ace)

                        tempDealer = Participant(dcard[0], self.aceCheck(dcard[0]))
                        # BUG FIX: the three draws are independent, so their
                        # probabilities multiply; the original added them, so
                        # the successor probabilities did not sum to 1.
                        prob = card1[1] * card2[1] * dcard[1]
                        newstates.append(
                            (('player', tempPlayer.hand, tempPlayer.hasPlayableAce, tempDealer.hand, tempDealer.hasPlayableAce), prob, 0)
                        )
        elif state[0] == 'player':
            if action == 'Stand':
                # Deterministic hand-off to the dealer phase; no reward yet.
                newstates.append(
                    (('dealer', player.hand, player.hasPlayableAce, dealer.hand, dealer.hasPlayableAce), 1, 0)
                )
            elif action == 'Hit':
                for card in self.cards:
                    tmpPlayer = Participant(player.hand + card[0])
                    if self.aceCheck(card[0]) or player.hasPlayableAce:
                        tmpPlayer.hasPlayableAce = True
                    if tmpPlayer.hand > 21:
                        if tmpPlayer.hasPlayableAce:
                            # Demote a playable ace from 11 to 1 instead of busting.
                            tmpPlayer.hand -= 10
                            tmpPlayer.hasPlayableAce = False
                        else:
                            # Bust: terminal, reward settled here.
                            reward = self.rewardCalc(tmpPlayer.hand, dealer.hand)
                            newstates.append(
                                (('terminal', tmpPlayer.hand, False, dealer.hand, False), card[1], reward)
                            )
                    if tmpPlayer.hand <= 21:
                        # BUG FIX: nonterminal edges carry reward 0; the original
                        # reused the `reward` variable left over from an earlier
                        # bust card in this same loop.
                        newstates.append(
                            (('player', tmpPlayer.hand, tmpPlayer.hasPlayableAce, dealer.hand, dealer.hasPlayableAce), card[1], 0)
                        )
            elif action == 'Double down':
                # One card at doubled stakes, then play passes to the dealer.
                for card in self.cards:
                    tmpPlayer = Participant(player.hand + card[0])
                    if self.aceCheck(card[0]) or player.hasPlayableAce:
                        tmpPlayer.hasPlayableAce = True
                    if tmpPlayer.hand > 21:
                        if tmpPlayer.hasPlayableAce:
                            # Demote a playable ace from 11 to 1 instead of busting.
                            tmpPlayer.hand -= 10
                            tmpPlayer.hasPlayableAce = False
                        else:
                            # Bust while doubled: terminal, twice the usual reward.
                            reward = self.rewardCalc(tmpPlayer.hand, dealer.hand) * 2
                            newstates.append(
                                (('terminal', tmpPlayer.hand, False, dealer.hand, False), card[1], reward)
                            )
                    if tmpPlayer.hand <= 21:
                        # BUG FIX: nonterminal edges carry reward 0 (the original
                        # could leak a stale bust reward, doubled, onto this edge).
                        # NOTE(review): the state does not record that the stakes
                        # were doubled, so rewards settled later in the 'dealer'
                        # phase are NOT doubled — confirm whether that is intended.
                        newstates.append(
                            (('dealer', tmpPlayer.hand, tmpPlayer.hasPlayableAce, dealer.hand, dealer.hasPlayableAce), card[1], 0)
                        )
        elif state[0] == 'dealer':
            if dealer.hand < 17:
                # Dealer must draw below 17 (the action is ignored here).
                for card in self.cards:
                    tmpDealer = Participant(dealer.hand + card[0])
                    tmpDealer.hasPlayableAce = dealer.hasPlayableAce or self.aceCheck(card[0])
                    if tmpDealer.hand > 21:
                        if tmpDealer.hasPlayableAce:
                            # Demote a playable ace from 11 to 1 instead of busting.
                            tmpDealer.hand -= 10
                            tmpDealer.hasPlayableAce = False
                        else:
                            # Dealer bust: terminal, reward settled here.
                            reward = self.rewardCalc(player.hand, tmpDealer.hand)
                            newstates.append(
                                (('terminal', player.hand, False, tmpDealer.hand, False), card[1], reward)
                            )
                    if tmpDealer.hand <= 21:
                        # BUG FIX: nonterminal edges carry reward 0 (stale-reward
                        # leak in the original, as in the player branches).
                        newstates.append(
                            (('dealer', player.hand, player.hasPlayableAce, tmpDealer.hand, tmpDealer.hasPlayableAce), card[1], 0)
                        )
            else:
                # Dealer stands on 17+: compare hands and settle the game.
                reward = self.rewardCalc(player.hand, dealer.hand)
                newstates.append(
                    (('terminal', player.hand, False, dealer.hand, False), 1, reward)
                )
        else:
            # 'terminal' (or unknown) phase: no outgoing edges.
            return []

        return newstates
192.
def _action_values(mdp, V, state):
    """Return Q(state, a) under value function V for the four blackjack actions."""
    q = {'Hit': 0, 'Stand': 0, 'Double down': 0}
    # 'Noop' is only meaningful from dealer states; elsewhere make it unpickable.
    q['Noop'] = 0 if state[0] == 'dealer' else float('-inf')
    for a in mdp.actions(state):
        if a not in q:
            continue  # e.g. 'Init' contributed to no tracked action in the original
        for newState, prob, reward in mdp.succAndProbReward(state, a):
            q[a] += prob * (reward + mdp.discountFactor * V[newState])
    return q


def value_iteration(mdp):
    """Solve |mdp| by value iteration.

    Returns (P, V): P maps each state to the greedy action name
    ('Hit'/'Stand'/'Double down'/'Noop'), V maps each state to its value.
    Iterates until the largest per-state change is below a small margin.
    """
    margin = 0.000001
    V = {state: 0 for state in mdp.states}
    delta = 1
    while delta >= margin:
        delta = 0
        for s in mdp.states:
            v = V[s]
            q = _action_values(mdp, V, s)
            V[s] = max(q.values())
            delta = max(delta, abs(v - V[s]))

    # Extract the greedy policy once after convergence.  (The original
    # recomputed P inside every sweep — wasted work with an identical
    # final result.)  Tie-break order preserved: Hit, Stand, Double down, Noop.
    P = {}
    for state in mdp.states:
        q = _action_values(mdp, V, state)
        hit, stand, dd, noop = q['Hit'], q['Stand'], q['Double down'], q['Noop']
        if hit >= stand and hit >= noop and hit >= dd:
            best = 'Hit'
        elif stand >= hit and stand >= noop and stand >= dd:
            best = 'Stand'
        # BUG FIX: the original's Double-down branch tested `stand >= noop`
        # instead of `dd >= noop`.
        elif dd >= hit and dd >= stand and dd >= noop:
            best = 'Double down'
        else:
            best = 'Noop'
        P[state] = best
    return P, V
253.
# --- Driver script ---------------------------------------------------------
# Build the blackjack MDP (enumerates all reachable states on construction),
# solve it with value iteration, then Monte-Carlo-simulate 1000 games under
# the resulting greedy policy P.
mdp = BlackjackMDP()
startstates = []
startprobs = []
startrewards = []
P, V = value_iteration(mdp)

# Expand the 'init' pseudo-state once: each successor is a possible dealt
# starting hand together with its probability and (initial) reward.
for state, prob, reward in mdp.succAndProbReward(mdp.startState(), 'Init'):
    startstates.append(state)
    startprobs.append(prob)
    startrewards.append(reward)



totalreward = 0
for i in range(1000):
    # Sample a starting hand according to the deal probabilities.
    st = random.choices(startstates, weights=startprobs)[0]
    while st[0] != 'terminal':
        states = []
        probs = []
        rewards = []
        R = {}
        # Follow the greedy policy: enumerate the successors of (st, P[st])
        # and sample the next state by its transition probability.
        for state, prob, reward in mdp.succAndProbReward(st, P[st]):
            states.append(state)
            probs.append(prob)
            rewards.append(reward)
            R[state] = reward
        st = random.choices(states, weights=probs)[0]
    # Only the reward on the final transition into the terminal state is
    # counted; rewards on intermediate transitions are discarded.
    totalreward += R[st]
# NOTE(review): this formula treats totalreward as wins-minus-losses over
# rewards in {-1, 0, 1} and counts ties as half a win; 'Double down' rewards
# of +/-2 would skew the percentage — confirm this is the intended metric.
print("Win percentage after 1000 games: {0}%".format((1000/2 + totalreward) / 1000 * 100))
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy.

Top