Advertisement
Not a member of Pastebin yet? Sign up — it unlocks many cool features!
class BanditSimulation(object):
    """Monte-Carlo comparison of UCB and Thompson Sampling on Bernoulli bandits.

    Each run draws fresh arm means uniformly from [0, 1), then plays both
    algorithms side by side for T steps on the same set of arms, recording
    the cumulative regret of each algorithm per timestep and per run.

    Relies on ``BernoulliArm``, ``UCB`` and ``ThompsonSampling`` being
    defined elsewhere in this module (not visible in this chunk).

    Parameters
    ----------
    k_arms : int
        Number of bandit arms per simulated problem.
    T : int
        Horizon (number of timesteps) per run.
    number_of_runs : int
        Number of independently sampled bandit problems to simulate.
    """

    def __init__(self, k_arms, T, number_of_runs):
        self.T = T
        self.number_of_runs = number_of_runs
        self.k_arms = k_arms
        # cumulative_regret[algo][t, r] = total regret of `algo` up to and
        # including step t of run r.
        self.cumulative_regret = {
            "Thompson Sampling": np.zeros((T, number_of_runs)),
            "UCB": np.zeros((T, number_of_runs)),
        }

    def run(self):
        """Simulate all runs, filling ``self.cumulative_regret`` in place."""
        for run_number in range(self.number_of_runs):
            mu_vector = np.random.uniform(size=self.k_arms)
            arms = [BernoulliArm(mu) for mu in mu_vector]
            ucb = UCB(self.k_arms)
            thompson_sampling = ThompsonSampling(self.k_arms)
            # Loop-invariant per run: best achievable mean reward.
            # (Original recomputed max(mu_vector) twice per timestep.)
            best_mu = mu_vector.max()
            for t in range(self.T):
                ucb_chosen_arm = ucb.select_arm()
                ucb_reward = arms[ucb_chosen_arm].draw()
                ucb.update(ucb_chosen_arm, ucb_reward)
                # NOTE(review): this is realized regret (best mean minus the
                # sampled reward), not expected regret
                # (best_mu - mu_vector[chosen_arm]); preserved as-is.
                ucb_regret = best_mu - ucb_reward

                thompson_sampling_chosen_arm = thompson_sampling.select_arm()
                thompson_sampling_reward = arms[thompson_sampling_chosen_arm].draw()
                thompson_sampling.update(thompson_sampling_chosen_arm, thompson_sampling_reward)
                thompson_sampling_regret = best_mu - thompson_sampling_reward

                # Accumulate: cum[t] = regret[t] + cum[t-1] (cum[-1] treated as 0).
                prev_ucb = self.cumulative_regret["UCB"][t - 1, run_number] if t > 0 else 0.0
                prev_ts = self.cumulative_regret["Thompson Sampling"][t - 1, run_number] if t > 0 else 0.0
                self.cumulative_regret["UCB"][t, run_number] = ucb_regret + prev_ucb
                self.cumulative_regret["Thompson Sampling"][t, run_number] = thompson_sampling_regret + prev_ts
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement