Pythonでバンディットアルゴリズム関連のサンプルコード一覧
ε-greedy方策
import numpy as np
class Agent:
    """Epsilon-greedy agent for a multi-armed bandit.

    Attributes are NumPy arrays indexed by arm:
      * ``Qs`` - running mean of the rewards observed for each arm.
      * ``ns`` - number of times each arm has been updated.
    """

    def __init__(self, epsilon, num_trials):
        """Initialise the estimates to zero.

        Args:
            epsilon: Probability with which ``get_action`` explores, i.e.
                returns a uniformly random arm instead of the greedy one.
            num_trials: Length of the per-arm arrays, i.e. the number of
                arms. (The name is kept for backward compatibility.)
        """
        self.epsilon = epsilon
        self.Qs = np.zeros(num_trials)
        self.ns = np.zeros(num_trials)

    def update(self, select_arm, reward):
        """Fold ``reward`` into the running mean estimate of ``select_arm``."""
        self.ns[select_arm] += 1
        # Incremental mean: Q_n = Q_{n-1} + (r_n - Q_{n-1}) / n
        step = (reward - self.Qs[select_arm]) / self.ns[select_arm]
        self.Qs[select_arm] += step

    def get_action(self):
        """Return the index of the arm to play next.

        With probability ``epsilon`` a uniformly random arm is returned;
        otherwise the arm with the highest current estimate (the first one
        on ties, as given by ``np.argmax``).
        """
        if np.random.rand() < self.epsilon:
            return np.random.randint(0, len(self.Qs))
        return np.argmax(self.Qs)


# This import was fused onto the end of the ``return`` line above; it is kept
# on its own line so the code that follows still has ``np`` in scope.
import numpy as np
class Bandit:
    """Multi-armed bandit whose arms pay 0 or 1.

    ``rates`` holds one win probability per arm, drawn with
    ``np.random.rand`` when the bandit is created.
    """

    def __init__(self, arms):
        """Draw a random win rate for each of ``arms`` arms and print them.

        Args:
            arms: Number of arms.
        """
        self.rates = np.random.rand(arms)
        # Show the hidden rates (rounded) so a run can be inspected by eye.
        print(np.round(self.rates, 3))

    def play(self, arm):
        """Pull ``arm`` once.

        Returns:
            1 if the arm's rate exceeds a fresh ``np.random.rand()`` draw,
            otherwise 0.
        """
        return int(self.rates[arm] > np.random.rand())


# This import was fused onto the end of the ``return`` line above; it is kept
# on its own line so the code that follows still has ``np`` in scope.
import numpy as np
import matplotlib.pyplot as plt
from bandit import Bandit
from agent import Agent
# Experiment settings for the epsilon-greedy agent.
runs = 200          # independent repetitions of the experiment
arms = 100          # arms per bandit
num_trials = 1000   # plays per run
epsilon = 0.4       # exploration probability handed to the agent

# all_rates[run, n] is the mean reward per play of `run` after n + 1 plays.
all_rates = np.zeros((runs, num_trials))

for run in range(runs):
    # Every run gets a fresh bandit and a fresh agent.
    bandit = Bandit(arms)
    agent = Agent(epsilon, arms)
    total_reward = 0
    for n in range(num_trials):
        # Select an arm, play it, and feed the reward back to the agent.
        select_arm = agent.get_action()
        reward = bandit.play(select_arm)
        agent.update(select_arm, reward)
        total_reward += reward
        all_rates[run, n] = total_reward / (n + 1)

# Average the per-step rate over all runs and report the final value.
avg_rates = np.average(all_rates, axis=0)
print(avg_rates[num_trials-1])

# Plot every run faintly and the across-run average in red.
plt.figure(figsize=(8, 6))
for run in range(runs):
    plt.plot(all_rates[run], alpha=0.1)
plt.plot(avg_rates, color='red', label='average')
plt.xlabel('Steps')
plt.ylabel('Rates')
# Raw strings keep the backslash of the mathtext command \epsilon literal
# without relying on an invalid escape sequence.
plt.title(r'$runs:' + str(runs) + r', \epsilon=' + str(epsilon) + '$', loc='left')
plt.suptitle('Average reward', fontsize=20)
plt.grid()
plt.legend()
plt.ylim(0, 1)
plt.show()

UCB方策
import numpy as np
class Agent:
    """UCB agent for a multi-armed bandit.

    Attributes are NumPy arrays indexed by arm:
      * ``Qs`` - running mean of the rewards passed to ``update``.
      * ``ns`` - number of times each arm has been updated.
      * ``ucb_values`` - most recent upper-confidence scores.
    """

    def __init__(self, num_trials):
        """Initialise all per-arm arrays to zero.

        Args:
            num_trials: Length of the per-arm arrays, i.e. the number of
                arms. (The name is kept for backward compatibility.)
        """
        self.Qs = np.zeros(num_trials)
        self.ns = np.zeros(num_trials)
        self.ucb_values = np.zeros(num_trials)

    def update(self, select_arm, reward):
        """Fold ``reward`` into the running mean estimate of ``select_arm``."""
        self.ns[select_arm] += 1
        # Incremental mean: Q_n = Q_{n-1} + (r_n - Q_{n-1}) / n
        step = (reward - self.Qs[select_arm]) / self.ns[select_arm]
        self.Qs[select_arm] += step

    def get_action(self):
        """Return the index of the arm to play next.

        Any arm that has never been updated is returned first (lowest index
        first). Once every arm has a count, the arm maximising
        ``Qs + sqrt(2 * ln(total plays) / ns)`` is returned.
        """
        # The counts and estimates live in ``ns`` and ``Qs``; this class never
        # defines ``counts`` or ``values`` attributes.
        untried = self.ns == 0
        if untried.any():
            # argmax of a boolean array is the index of its first True.
            return int(np.argmax(untried))
        total_counts = self.ns.sum()
        bonus = np.sqrt(2 * np.log(total_counts) / self.ns)
        self.ucb_values[:] = self.Qs + bonus
        return np.argmax(self.ucb_values)


# This import was fused onto the end of the ``return`` line above; it is kept
# on its own line so the code that follows still has ``np`` in scope.
import numpy as np
class Bandit:
    """Multi-armed bandit whose arms pay 0 or 1.

    ``rates`` holds one win probability per arm, drawn with
    ``np.random.rand`` when the bandit is created.
    """

    def __init__(self, arms):
        """Draw a random win rate for each of ``arms`` arms and print them.

        Args:
            arms: Number of arms.
        """
        self.rates = np.random.rand(arms)
        # Show the hidden rates (rounded) so a run can be inspected by eye.
        print(np.round(self.rates, 3))

    def play(self, arm):
        """Pull ``arm`` once.

        Returns:
            1 if the arm's rate exceeds a fresh ``np.random.rand()`` draw,
            otherwise 0.
        """
        won = self.rates[arm] > np.random.rand()
        return 1 if won else 0


# This import was fused onto the end of the ``return`` line above; it is kept
# on its own line so the code that follows still has ``np`` in scope.
import numpy as np
import matplotlib.pyplot as plt
from bandit import Bandit
from agent import Agent
# Experiment settings for the UCB agent.
runs = 200          # independent repetitions of the experiment
arms = 100          # arms per bandit
num_trials = 1000   # plays per run

# all_rates[run, n] is the mean reward per play of `run` after n + 1 plays.
all_rates = np.zeros((runs, num_trials))

for run in range(runs):
    # Every run gets a fresh bandit and a fresh agent.
    bandit = Bandit(arms)
    agent = Agent(arms)
    total_reward = 0
    for n in range(num_trials):
        # Score all arms at once. Arms that have never been played get +inf
        # so they are tried first; the rest get mean reward plus the
        # exploration bonus sqrt(2 * ln(n) / plays).
        tried = agent.ns > 0
        agent.ucb_values[:] = np.inf
        # `tried` is empty while n == 0, so max(n, 1) only avoids evaluating
        # log(0); for n >= 1 it is exactly log(n).
        agent.ucb_values[tried] = (
            agent.Qs[tried] / agent.ns[tried]
            + np.sqrt(2 * np.log(max(n, 1)) / agent.ns[tried])
        )
        select_arm = np.argmax(agent.ucb_values)

        # Play the chosen arm.
        reward = bandit.play(select_arm)

        # NOTE: this script does not call agent.update(); it stores the SUM
        # of rewards in agent.Qs and divides by agent.ns when scoring.
        agent.Qs[select_arm] += reward
        agent.ns[select_arm] += 1

        total_reward += reward
        all_rates[run, n] = total_reward / (n + 1)

# Average the per-step rate over all runs and report the final value.
avg_rates = np.average(all_rates, axis=0)
print(avg_rates[num_trials-1])

# Plot every run faintly and the across-run average in red.
plt.figure(figsize=(8, 6))
for run in range(runs):
    plt.plot(all_rates[run], alpha=0.1)
plt.plot(avg_rates, color='red', label='average')
plt.xlabel('Steps')
plt.ylabel('Rates')
plt.title('$runs:'+str(runs) + '$', loc='left')
plt.suptitle('Average reward', fontsize=20)
plt.grid()
plt.legend()
plt.ylim(0, 1)
plt.show()

