import random
import numpy as np
from fractions import Fraction


# An abstract class representing a Markov Decision Process (MDP).
class MDP:
    def __init__(self):
        self.computeStates()

    # discount factor
    discountFactor = 0.9

    # Return the start state.
    def startState(self): raise NotImplementedError("Override me")

    # Return set of actions possible from |state|.
    def actions(self, state): raise NotImplementedError("Override me")

    # Return a list of (newState, prob, reward) tuples corresponding to edges
    # coming out of |state|.
    # Mapping to notation from class:
    # state = s, action = a, newState = s', reward = r, prob = p(s', r | s, a)
    # If state is a terminal state, return the empty list.
    def succAndProbReward(self, state, action): raise NotImplementedError("Override me")

    # Compute set of states reachable from startState. Helper function
    # to know which states to compute values and policies for.
    # This function sets |self.states| to be the set of all states.
    def computeStates(self):
        self.states = set()
        queue = []
        self.states.add(self.startState())
        queue.append(self.startState())
        while len(queue) > 0:
            state = queue.pop()
            for action in self.actions(state):
                for newState, prob, reward in self.succAndProbReward(state, action):
                    if newState not in self.states:
                        self.states.add(newState)
                        queue.append(newState)

        print("%d reachable states" % len(self.states))

        #for state in self.states:
        #    print(state, "\n")


class Participant():

    def __init__(self, hand=0, ace=False):
        self.hand = hand
        self.hasPlayableAce = ace

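# Note on this representation: Participant.hand is the running total with any
# ace counted as 11, and hasPlayableAce records whether the hand holds an ace
# currently counted as 11 (see aceCheck and the bust handling in BlackjackMDP
# below). For example, an ace and a six is Participant(17, True).
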
class BlackjackMDP(MDP):

    # the discount factor for future rewards
    discountFactor = 0.9  # TODO: set this to the correct value
    # Card values with their draw probabilities: 10, J, Q and K all count as 10,
    # so a ten-valued card has probability 4/13; an ace initially counts as 11.
    cards = [(2, Fraction(1, 13)), (3, Fraction(1, 13)), (4, Fraction(1, 13)), (5, Fraction(1, 13)),
             (6, Fraction(1, 13)), (7, Fraction(1, 13)), (8, Fraction(1, 13)), (9, Fraction(1, 13)),
             (10, Fraction(4, 13)), (11, Fraction(1, 13))]

    # Return the start state.
    # A state is represented as the tuple
    # (phase, playerHand, playerHasPlayableAce, dealerHand, dealerHasPlayableAce),
    # where phase is one of 'init', 'player', 'dealer' or 'terminal'.
    def startState(self):
        player = Participant()
        dealer = Participant()
        return ('init', player.hand, player.hasPlayableAce, dealer.hand, dealer.hasPlayableAce)

    # Return set of actions possible from |state|.
    def actions(self, state):
        if state[0] == 'init':
            return ['Init']
        elif state[0] == 'player':
            return ['Hit', 'Stand', 'Double down']
        elif state[0] == 'dealer':
            return ['Noop']
        else:
            return []

    def rewardCalc(self, player, dealer):
        if player > 21:
            reward = -1
        elif dealer > 21:
            reward = 1
        elif player > dealer:
            reward = 1
        elif dealer > player:
            reward = -1
        else:
            reward = 0
        return reward

    def aceCheck(self, card):
        if card == 11:
            return True
        return False

    # Return a list of (newState, prob, reward) tuples corresponding to edges
    # coming out of |state|.
    # Mapping to notation from class:
    # state = s, action = a, newState = s', reward = r, prob = p(s', r | s, a)
    # If state is a terminal state, return the empty list.
    def succAndProbReward(self, state, action):
        newstates = []  # All possible states with this action
        # Player state variables
        player = Participant(state[1], state[2])
        # Dealer state variables
        dealer = Participant(state[3], state[4])
        # Final reward
        reward = 0
        if state[0] == 'init':
            # Deal two cards to the player and one to the dealer. The three
            # draws are independent, so their probabilities multiply.
            for card1 in self.cards:
                for card2 in self.cards:
                    for dcard in self.cards:
                        if self.aceCheck(card1[0]) and self.aceCheck(card2[0]):
                            tempPlayer = Participant(12, True)
                        else:
                            ace = (self.aceCheck(card1[0]) or self.aceCheck(card2[0]))
                            tempPlayer = Participant(card1[0] + card2[0], ace)

                        tempDealer = Participant(dcard[0], self.aceCheck(dcard[0]))
                        newstates.append(
                            (('player', tempPlayer.hand, tempPlayer.hasPlayableAce, tempDealer.hand, tempDealer.hasPlayableAce), card1[1] * card2[1] * dcard[1], reward)
                        )
        elif state[0] == 'player':
            if action == 'Stand':
                newstates.append(
                    (('dealer', player.hand, player.hasPlayableAce, dealer.hand, dealer.hasPlayableAce), 1, reward)
                )
            elif action == 'Hit':
                for card in self.cards:
                    reward = 0  # reset so a bust reward from an earlier card cannot leak into this edge
                    tmpPlayer = Participant(player.hand + card[0])
                    if self.aceCheck(card[0]) or player.hasPlayableAce:
                        tmpPlayer.hasPlayableAce = True
                    if tmpPlayer.hand > 21:
                        if tmpPlayer.hasPlayableAce:
                            tmpPlayer.hand -= 10
                            tmpPlayer.hasPlayableAce = False
                        else:
                            reward = self.rewardCalc(tmpPlayer.hand, dealer.hand)
                            newstates.append(
                                (('terminal', tmpPlayer.hand, False, dealer.hand, False), card[1], reward)
                            )
                    if tmpPlayer.hand <= 21:
                        newstates.append(
                            (('player', tmpPlayer.hand, tmpPlayer.hasPlayableAce, dealer.hand, dealer.hasPlayableAce), card[1], reward)
                        )
            elif action == 'Double down':
                for card in self.cards:
                    reward = 0  # reset so a bust reward from an earlier card cannot leak into this edge
                    tmpPlayer = Participant(player.hand + card[0])
                    if self.aceCheck(card[0]) or player.hasPlayableAce:
                        tmpPlayer.hasPlayableAce = True
                    if tmpPlayer.hand > 21:
                        if tmpPlayer.hasPlayableAce:
                            tmpPlayer.hand -= 10
                            tmpPlayer.hasPlayableAce = False
                        else:
                            # Double down doubles the stake, so a bust costs twice as much
                            reward = self.rewardCalc(tmpPlayer.hand, dealer.hand) * 2
                            newstates.append(
                                (('terminal', tmpPlayer.hand, False, dealer.hand, False), card[1], reward)
                            )
                    if tmpPlayer.hand <= 21:
                        newstates.append(
                            (('dealer', tmpPlayer.hand, tmpPlayer.hasPlayableAce, dealer.hand, dealer.hasPlayableAce), card[1], reward * 2)
                        )
        elif state[0] == 'dealer':
            if dealer.hand < 17:
                # Dealer hits until reaching 17 or more
                for card in self.cards:
                    reward = 0  # reset so a bust reward from an earlier card cannot leak into this edge
                    tmpDealer = Participant(dealer.hand + card[0])
                    tmpDealer.hasPlayableAce = dealer.hasPlayableAce or self.aceCheck(card[0])
                    if tmpDealer.hand > 21:
                        if tmpDealer.hasPlayableAce:
                            tmpDealer.hand -= 10
                            tmpDealer.hasPlayableAce = False
                        else:
                            reward = self.rewardCalc(player.hand, tmpDealer.hand)
                            newstates.append(
                                (('terminal', player.hand, False, tmpDealer.hand, False), card[1], reward)
                            )

                    if tmpDealer.hand <= 21:
                        newstates.append(
                            (('dealer', player.hand, player.hasPlayableAce, tmpDealer.hand, tmpDealer.hasPlayableAce), card[1], reward)
                        )
            else:
                # Dealer stands on 17 or more; the hand is over and the outcome is decided
                reward = self.rewardCalc(player.hand, dealer.hand)
                newstates.append(
                    (('terminal', player.hand, False, dealer.hand, False), 1, reward)
                )
        else:
            return []

        return newstates

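# For reference: value_iteration below performs the Bellman optimality update
#     V(s) <- max_a  sum_{s', r} p(s', r | s, a) * (r + discountFactor * V(s'))
# sweeping over the reachable states until the largest change in V falls below
# `margin`, and then extracts the greedy policy P with respect to the converged V.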
def value_iteration(mdp):
    delta = 1
    margin = 0.000001
    V = {}
    for state in mdp.states:
        V[state] = 0
    while delta >= margin:
        delta = 0
        for s in mdp.states:
            v = V[s]
            # Q-value of every action that is actually available in s;
            # states without actions (terminal states) keep their current value.
            Q = {}
            for a in mdp.actions(s):
                Q[a] = 0
                for newState, prob, reward in mdp.succAndProbReward(s, a):
                    Q[a] += prob * (reward + mdp.discountFactor * V[newState])
            if Q:
                V[s] = max(Q.values())
            delta = max(delta, np.abs(v - V[s]))

    # Extract the greedy policy with respect to the converged values.
    P = {}
    for state in mdp.states:
        Q = {}
        for a in mdp.actions(state):
            Q[a] = 0
            for newState, prob, reward in mdp.succAndProbReward(state, a):
                Q[a] += prob * (reward + mdp.discountFactor * V[newState])
        if Q:
            # Ties are broken in favour of the action listed first by mdp.actions
            P[state] = max(Q, key=Q.get)
    return P, V

mdp = BlackjackMDP()
startstates = []
startprobs = []
startrewards = []
P, V = value_iteration(mdp)

# Enumerate the possible hands after the initial deal, with their probabilities
for state, prob, reward in mdp.succAndProbReward(mdp.startState(), 'Init'):
    startstates.append(state)
    startprobs.append(prob)
    startrewards.append(reward)


# Simulate 1000 games following the computed policy P
totalreward = 0
for i in range(1000):
    st = random.choices(startstates, weights=startprobs)[0]
    while st[0] != 'terminal':
        states = []
        probs = []
        rewards = []
        R = {}
        for state, prob, reward in mdp.succAndProbReward(st, P[st]):
            states.append(state)
            probs.append(prob)
            rewards.append(reward)
            R[state] = reward
        st = random.choices(states, weights=probs)[0]
    # R[st] is the reward received on the final transition into the terminal state
    totalreward += R[st]
# This estimate assumes each game pays roughly +1 for a win and -1 for a loss;
# pushes (reward 0) and doubled-down games make it an approximation.
print("Win percentage after 1000 games: {0}%".format((1000/2 + totalreward) / 1000 * 100))
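
# A minimal sketch of how the computed policy could be inspected (an illustrative
# addition): print the recommended action for every reachable player state,
# sorted purely for readability.
for s in sorted(s for s in mdp.states if s[0] == 'player'):
    print(s, '->', P[s])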