# Restless Bandit problem

import numpy as np
import matplotlib.pyplot as plt

class RestlessBandit:
    """Upper-confidence-bound agent for a multi-armed bandit.

    The agent keeps a running sample-mean reward and a play count for every
    arm. ``play`` picks the arm with the highest score
    ``mean + alpha * sqrt(log(t) / plays)``; ``update`` feeds an observed
    reward back into the chosen arm's statistics.

    Args:
        n_arms: Number of arms the agent can choose from.
        alpha: Weight of the exploration bonus added to each arm's mean.
    """

    def __init__(self, n_arms: int, alpha: float = 1.0) -> None:
        self.n_arms = n_arms
        self.alpha = alpha
        self.t = 0  # number of play() calls made so far
        self.rewards = [0.0] * n_arms  # running mean reward per arm
        self.plays = [0] * n_arms  # number of update() calls per arm

    def play(self) -> int:
        """Advance the round counter and return the index of the arm to pull.

        Arms that have never been updated are returned first, in index order.
        Once every arm has at least one observation, the arm with the largest
        upper-confidence-bound score is returned (lowest index on ties).

        Returns:
            Index of the selected arm.

        Raises:
            ValueError: If the bandit has no arms (raised by ``np.argmax``).
        """
        self.t += 1

        # The exploration bonus divides by the play count, so it is undefined
        # for an unplayed arm. Selecting such arms explicitly avoids the
        # 0/0 and x/0 divisions (nan/inf plus NumPy runtime warnings).
        for arm, count in enumerate(self.plays):
            if count == 0:
                return arm

        log_t = np.log(self.t)  # identical for every arm; compute once
        ucb = [
            mean + self.alpha * np.sqrt(log_t / count)
            for mean, count in zip(self.rewards, self.plays)
        ]
        return int(np.argmax(ucb))

    def update(self, arm: int, reward: float) -> None:
        """Record ``reward`` for ``arm`` and refresh that arm's running mean.

        Args:
            arm: Index of the arm that was played.
            reward: Reward observed for that play.
        """
        self.plays[arm] += 1
        # Incremental form of the sample mean: new = old + (x - old) / n.
        self.rewards[arm] += (reward - self.rewards[arm]) / self.plays[arm]

# Experiment size: how many arms exist and how many rounds are simulated
n_arms = 5
n_rounds = 5000

# Agent that decides which arm to pull each round
bandit = RestlessBandit(n_arms)

# Current mean reward of every arm; these values drift as the rounds go by
true_rewards = [0.1, 0.2, 0.3, 0.4, 0.5]

# Reward actually received in each round, in play order
results = []

for _ in range(n_rounds):
    # Ask the agent for its choice
    arm = bandit.play()

    # Draw a noisy reward (unit standard deviation) around the arm's current mean
    reward = np.random.normal(loc=true_rewards[arm], scale=1.0)
    results.append(reward)

    # Feed the observation back to the agent
    bandit.update(arm, reward)

    # Nudge every arm's mean by a small Gaussian step before the next round
    true_rewards = [
        mean + np.random.normal(loc=0, scale=0.01) for mean in true_rewards
    ]

# Show the per-round reward trace
plt.plot(results)
plt.title("Restless Bandit Problem")
plt.xlabel("Round")
plt.ylabel("Reward")
plt.show()