import numpy as np

# R matrix
R = np.matrix([[-1, -1, -1, -1,  0,  -1],
               [-1, -1, -1,  0, -1, 100],
               [-1, -1, -1,  0, -1,  -1],
               [-1,  0,  0, -1,  0,  -1],
               [-1,  0,  0, -1, -1, 100],
               [-1,  0, -1, -1,  0, 100]])
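
# Rows of R index the current state and columns index the action (the state
# moved to): -1 marks a move that is not allowed, 0 an allowed move, and 100
# an allowed move into the goal state (state 5).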

# Q matrix
Q = np.matrix(np.zeros([6, 6]))

# Gamma (discount factor).
gamma = 0.8

# Initial state. (Usually chosen at random.)
initial_state = 1

# This function returns all available actions in the state given as an argument.
def available_actions(state):
    current_state_row = R[state,]
    av_act = np.where(current_state_row >= 0)[1]
    return av_act
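
# Note: because R is an np.matrix, R[state,] is still 2-D, so np.where
# returns a (row_indices, column_indices) pair; [1] picks out the column
# indices, i.e. the indices of the allowed actions.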

# Get available actions in the current state.
available_act = available_actions(initial_state)
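
# For example, with initial_state = 1, row 1 of R is [-1, -1, -1, 0, -1, 100],
# so available_act is [3, 5].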

# This function chooses at random which action to perform within the range
# of all the available actions.
def sample_next_action(available_actions_range):
    next_action = int(np.random.choice(available_actions_range, 1))
    return next_action

# Sample next action to be performed.
action = sample_next_action(available_act)

# This function updates the Q matrix according to the path selected and the
# Q-learning algorithm.
def update(current_state, action, gamma):
    # Taking `action` lands the agent in state `action`, so the best
    # follow-up value is the maximum of that row of Q.
    max_index = np.where(Q[action,] == np.max(Q[action,]))[1]

    if max_index.shape[0] > 1:
        # Break ties between equally good follow-up actions at random.
        max_index = int(np.random.choice(max_index, size=1))
    else:
        max_index = int(max_index)
    max_value = Q[action, max_index]

    # Q learning formula
    Q[current_state, action] = R[current_state, action] + gamma * max_value

# Update Q matrix
update(initial_state, action, gamma)
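
# The update above is one-step Q-learning with a learning rate of 1:
#     Q(s, a) <- R(s, a) + gamma * max over a' of Q(s', a')
# where the next state s' is simply the chosen action a in this formulation.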

#-------------------------------------------------------------------------------
# Training

# Train over 10,000 iterations. (Re-iterate the process above.)
for i in range(10000):
    current_state = np.random.randint(0, int(Q.shape[0]))
    available_act = available_actions(current_state)
    action = sample_next_action(available_act)
    update(current_state, action, gamma)

# Normalize the "trained" Q matrix.
print("Trained Q matrix:")
print(Q / np.max(Q) * 100)
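
# Dividing by np.max(Q) only rescales the entries to a 0-100 range for
# display; it does not change which action has the largest value in each row.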

#-------------------------------------------------------------------------------
# Testing

# Goal state = 5
# Best sequence path starting from 2 -> 2, 3, 1, 5

current_state = 2
steps = [current_state]

while current_state != 5:
    next_step_index = np.where(Q[current_state,] == np.max(Q[current_state,]))[1]

    if next_step_index.shape[0] > 1:
        next_step_index = int(np.random.choice(next_step_index, size=1))
    else:
        next_step_index = int(next_step_index)

    steps.append(next_step_index)
    current_state = next_step_index

# Print selected sequence of steps.
print("Selected path:")
print(steps)

#-------------------------------------------------------------------------------
# OUTPUT
#-------------------------------------------------------------------------------
#
# Trained Q matrix:
# [[   0.     0.     0.     0.    80.     0. ]
#  [   0.     0.     0.    64.     0.   100. ]
#  [   0.     0.     0.    64.     0.     0. ]
#  [   0.    80.    51.2    0.    80.     0. ]
#  [   0.    80.    51.2    0.     0.   100. ]
#  [   0.    80.     0.     0.    80.   100. ]]
#
# Selected path:
# [2, 3, 1, 5]
#
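
# Optional sanity check (a sketch, not part of the original script): repeat
# the same greedy walk from every start state; with the trained Q above, each
# walk should reach the goal state 5. The helper name `greedy_path` is made
# up for this sketch.
def greedy_path(start, goal=5):
    path = [start]
    state = start
    while state != goal:
        candidates = np.where(Q[state,] == np.max(Q[state,]))[1]
        if candidates.shape[0] > 1:
            state = int(np.random.choice(candidates, size=1))
        else:
            state = int(candidates)
        path.append(state)
    return path

for s in range(6):
    print(s, "->", greedy_path(s))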