Advertisement
Hitesh_jadhav

error_in_python_reinforcement_learning

May 7th, 2024
44
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.56 KB | None | 0 0
  1. import numpy as np
  2.  
  3. # R matrix
  4. R = np.matrix([[-1,-1,-1,-1,0,-1],
  5. [-1,-1,-1,0,-1,100],
  6. [-1,-1,-1,0,-1,-1],
  7. [-1,0,0,-1,0,-1],
  8. [-1,0,0,-1,-1,100],
  9. [-1,0,-1,-1,0,100]])
  10.  
  11. # Q matrix
  12. Q = np.matrix(np.zeros([6,6]))
  13.  
  14. # Gamma (learning parameter)
  15. gamma = 0.8
  16.  
  17. # initial state. (Usually to be chosen at random)
  18. initial_state = 1
  19.  
  20. # this function return all available actions in the state given as argument
  21. def available_actions(state):
  22. current_state_row = R[state]
  23. av_act = np.where(current_state_row >= 0)[1]
  24. return av_act
  25.  
  26. # get available actions in the current state
  27. available_act = available_actions(initial_state)
  28.  
  29. # This function chooses at random which action to be performed within the range
  30. # of all the available actions
  31. def sample_next_action(available_action_range):
  32. next_action = int(np.random.choice(available_act,1))
  33.  
  34. # sample next action to be performed
  35. action = sample_next_action(available_act)
  36.  
  37. # This function updates the Q matrix according to the path selected and the Q
  38. # learning method
  39. def update(current_state, action, gamma):
  40. max_index = np.where(Q[action] == np.max(Q[action]))
  41.  
  42. if max_index.shape[0] > 1:
  43. max_index = int(np.random.choice(max_index, size=1))
  44. else:
  45. max_index = int(max_index)
  46. max_value = Q[action, max_index]
  47. # Q learning formula
  48. Q[current_state, action] = R[current_state, action] + gamma * max_value
  49.  
  50.  
  51. # update Q matrix
  52. update(initial_state, action, gamma)
  53.  
  54. # Training
  55.  
  56. # Trainn over 10 000 iterations. (reiterate the process above).
  57. for i in range(10000):
  58. current_state = np.random.radint(0, int(Q.shape[0]))
  59. available_act = available_actions(current_state)
  60. action = sample_next_action(available_act)
  61. update(current_state, action, gamma)
  62.  
  63. # Normalize the "trained" Q matrix
  64. print("Trined Q matrix:")
  65. print(Q/np.max(Q) * 100)
  66.  
  67.  
  68. # Testing
  69.  
  70. # goal state = 5
  71. # best sequence path starting from 2 -> 2, 3, 1, 5
  72.  
  73. current_state = 1
  74. steps = [current_state]
  75.  
  76. while current_state !=5:
  77.  
  78. next_step_index = np.where(Q[current_state] == np.max(Q[current_state]))[1]
  79.  
  80.  
  81. if next_step_index.shape[0] > 1:
  82. next_step_index = int(np.random.choice(next_step_index, size = 1))
  83. else:
  84. next_step_index = int(next_step_index)
  85.  
  86. steps.append(next_step_index)
  87. current_state = next_step_index
  88.  
  89.  
  90. # print selected sequence of steps
  91. print("selected path:")
  92. print(steps)
  93.  
  94.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement