Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import matplotlib.pyplot as plt
- import numpy as np
- import math
# Experiment configuration.
n = 10          # number of bandit arms
plays = 1000    # pulls per independent run
epoch = 2000    # number of independent runs that are averaged together

# Per-play running-average reward, summed over all runs, one accumulator per
# agent. The length is derived from `plays` so the accumulators stay in step
# with the number of pulls if that setting is changed.
average_rewards_1 = np.zeros(plays)
average_rewards_2 = np.zeros(plays)
average_rewards_3 = np.zeros(plays)
def action(epsilon, Q):
    """Choose an arm epsilon-greedily and sample a reward for it.

    Args:
        epsilon: Exploration threshold. A uniform draw from [0, 1) is
            compared against it; a draw greater than ``epsilon`` selects the
            greedy arm, anything else selects an arm uniformly at random.
        Q: Array of per-arm values, indexed by arm.

    Returns:
        A ``(arm, reward)`` tuple, where ``reward`` is drawn uniformly from
        ``[Q[arm], 1)``.
    """
    # Random probability deciding between exploiting and exploring.
    p = np.random.random()
    if p > epsilon:
        # Exploit: take the arm with the highest current value.
        chosen = np.argmax(Q)
    else:
        # Explore: sample over however many arms Q actually holds instead of
        # a fixed 10, so the function works for any table size.
        chosen = np.random.randint(0, len(Q))
    # The reward's lower bound is the chosen arm's value.
    reward = np.random.uniform(Q[chosen], 1)
    return chosen, reward
# Exploration rates of the three agents being compared. They never change
# between runs, so they are set once instead of on every iteration.
epsilon_1 = 0.1
epsilon_2 = 0.01
epsilon_3 = 0.0

for i in range(epoch):
    # Fresh value tables and pull counters for every run. Each Q is chosen
    # randomly by a uniform distribution.
    # The arrays are deliberately 1-D: with (n, 1) columns every reward would
    # be a shape-(1,) array, the per-run averages would stack to (plays, 1),
    # and adding them to the (plays,) accumulators would broadcast the result
    # to (plays, plays).
    # NOTE(review): the same Q array is used by action() to draw rewards and
    # is updated below as the running estimate -- confirm this is intended.
    Q1 = np.random.uniform(0, 1, size=n)
    Q2 = np.random.uniform(0, 1, size=n)
    Q3 = np.random.uniform(0, 1, size=n)
    N1 = np.zeros(n)
    N2 = np.zeros(n)
    N3 = np.zeros(n)

    total_reward_1 = 0
    total_reward_2 = 0
    total_reward_3 = 0
    average_reward_1 = []
    average_reward_2 = []
    average_reward_3 = []

    for k in range(plays):
        a1, reward1 = action(epsilon_1, Q1)
        a2, reward2 = action(epsilon_2, Q2)
        a3, reward3 = action(epsilon_3, Q3)

        # Count the pull, then apply the incremental sample-average update
        # Q <- Q + (reward - Q) / N.
        N1[a1] += 1
        N2[a2] += 1
        N3[a3] += 1
        Q1[a1] += (reward1 - Q1[a1]) / N1[a1]
        Q2[a2] += (reward2 - Q2[a2]) / N2[a2]
        Q3[a3] += (reward3 - Q3[a3]) / N3[a3]

        # Running mean reward over the k + 1 pulls made so far in this run.
        pulls = k + 1
        total_reward_1 += reward1
        total_reward_2 += reward2
        total_reward_3 += reward3
        average_reward_1.append(total_reward_1 / pulls)
        average_reward_2.append(total_reward_2 / pulls)
        average_reward_3.append(total_reward_3 / pulls)

    # Accumulate this run's curve; the sum is normalised after the loop.
    average_rewards_1 = np.add(average_rewards_1, average_reward_1)
    average_rewards_2 = np.add(average_rewards_2, average_reward_2)
    average_rewards_3 = np.add(average_rewards_3, average_reward_3)

# Turn the per-play sums into means over all runs.
average_rewards_1 = average_rewards_1 / epoch
average_rewards_2 = average_rewards_2 / epoch
average_rewards_3 = average_rewards_3 / epoch

plt.plot(range(plays), average_rewards_1, label='Epsilon=0.1', color='b')
plt.plot(range(plays), average_rewards_2, label='Epsilon=0.01', color='k')
plt.plot(range(plays), average_rewards_3, label='Epsilon=0.0', color='r')
plt.xlabel('Iterations')
plt.ylabel('Average reward')
plt.legend(loc='best')
plt.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement