[Python]バンディットアルゴリズム[強化学習]

Pythonによるバンディットアルゴリズム関連のサンプルコード一覧です。

ε-greedy方策
UCB方策

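ε-greedy方策

以下のコードは、順に agent.py、bandit.py、実行スクリプトの3つのファイルに分けて保存します（実行スクリプトが `from bandit import Bandit` と `from agent import Agent` で読み込むため）。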

import numpy as np

class Agent:
    """Epsilon-greedy agent that tracks a running mean reward per arm.

    Attributes:
        epsilon: Probability of choosing a uniformly random arm.
        Qs: Per-arm running mean of the rewards passed to ``update``.
        ns: Per-arm number of ``update`` calls.
    """

    def __init__(self, epsilon, num_trials):
        """Create the agent.

        Args:
            epsilon: Exploration probability compared against a uniform draw.
            num_trials: Length of the per-arm arrays ``Qs`` and ``ns``
                (despite the name, this is what sizes the arm statistics).
        """
        size = num_trials
        self.epsilon = epsilon
        self.Qs = np.zeros(size)
        self.ns = np.zeros(size)

    def update(self, select_arm, reward):
        """Fold ``reward`` into the running mean estimate of ``select_arm``."""
        self.ns[select_arm] += 1
        pulls = self.ns[select_arm]
        error = reward - self.Qs[select_arm]
        # Incremental mean: Q <- Q + (r - Q) / n.
        self.Qs[select_arm] += error / pulls

    def get_action(self):
        """Return a random arm with probability ``epsilon``, else the greedy arm."""
        explore = np.random.rand() < self.epsilon
        if not explore:
            # Exploit: index of the largest estimate (first one on ties).
            return np.argmax(self.Qs)
        return np.random.randint(0, len(self.Qs))

import numpy as np

class Bandit:
    """Multi-armed bandit whose arms pay 0 or 1.

    Attributes:
        rates: Per-arm thresholds drawn uniformly from [0, 1); an arm pays 1
            whenever its threshold exceeds a fresh uniform draw.
    """

    def __init__(self, arms):
        """Draw one random rate per arm and print them rounded to 3 decimals."""
        self.rates = np.random.rand(arms)
        rounded = np.round(self.rates, 3)
        print(rounded)

    def play(self, arm):
        """Pull ``arm`` once and return the reward (1 or 0)."""
        threshold = self.rates[arm]
        draw = np.random.rand()
        return 1 if threshold > draw else 0

import numpy as np
import matplotlib.pyplot as plt
from bandit import Bandit
from agent import Agent

# Experiment settings.
runs = 200          # number of independent bandit/agent pairs
arms = 100          # number of arms per bandit
num_trials = 1000   # number of plays per run
epsilon = 0.4       # exploration probability handed to the agent

# all_rates[run, n] is the average reward per play after n + 1 plays of `run`.
all_rates = np.zeros((runs, num_trials))

for run in range(runs):
    # Fresh environment and agent for every run.
    bandit = Bandit(arms)
    agent = Agent(epsilon, arms)

    total_reward = 0
    rates = []

    for n in range(num_trials):
        # Select an arm, play it, and feed the reward back to the agent.
        select_arm = agent.get_action()
        reward = bandit.play(select_arm)
        agent.update(select_arm, reward)

        total_reward += reward
        rates.append(total_reward / (n + 1))

    all_rates[run] = rates

# Mean curve over all runs; print its value after the final play.
avg_rates = np.average(all_rates, axis=0)
print(avg_rates[num_trials - 1])

plt.figure(figsize=(8, 6))
for run in range(runs):
    plt.plot(all_rates[run], alpha=0.1)
plt.plot(avg_rates, color='red', label='average')
plt.xlabel('Steps')
plt.ylabel('Rates')
# Raw literal: "\e" is not a valid escape sequence in a normal string, so the
# mathtext command \epsilon must be written in a raw string.
plt.title('$runs:' + str(runs) + r', \epsilon=' + str(epsilon) + '$', loc='left')
plt.suptitle('Average reward', fontsize=20)
plt.grid()
plt.legend()
plt.ylim(0, 1)
plt.show()

UCB方策

import numpy as np

class Agent:
    """UCB1 agent that tracks a running mean reward and pull count per arm.

    Attributes:
        Qs: Per-arm running mean of the rewards passed to ``update``.
        ns: Per-arm number of ``update`` calls.
        ucb_values: Per-arm upper-confidence scores from the most recent
            ``get_action`` call that had to compare scores.
    """

    def __init__(self, num_trials):
        """Create the agent.

        Args:
            num_trials: Length of the per-arm arrays (despite the name, this
                is what sizes the arm statistics).
        """
        self.Qs = np.zeros(num_trials)
        self.ns = np.zeros(num_trials)
        self.ucb_values = np.zeros(num_trials)

    def update(self, select_arm, reward):
        """Fold ``reward`` into the running mean estimate of ``select_arm``."""
        self.ns[select_arm] += 1
        # Incremental mean: Q <- Q + (r - Q) / n.
        self.Qs[select_arm] += (reward - self.Qs[select_arm]) / self.ns[select_arm]

    def get_action(self):
        """Return the index of the arm to play next.

        Any arm that has never been updated is returned first (lowest index
        first). Once every arm has a count, the arm maximising
        ``Q + sqrt(2 * ln(total pulls) / pulls)`` is returned.
        """
        # Untried arms first: their bonus would otherwise divide by zero.
        untried = np.flatnonzero(self.ns == 0)
        if untried.size:
            return int(untried[0])

        total_counts = self.ns.sum()
        bonus = np.sqrt(2 * np.log(total_counts) / self.ns)
        self.ucb_values = self.Qs + bonus
        return int(np.argmax(self.ucb_values))

import numpy as np

class Bandit:
    """Slot machine with several arms, each paying out either 1 or 0."""

    def __init__(self, arms):
        """Sample a rate in [0, 1) for each of ``arms`` arms and print them.

        The printed values are the rates rounded to three decimals.
        """
        sampled = np.random.rand(arms)
        self.rates = sampled
        print(np.round(self.rates, 3))

    def play(self, arm):
        """Play ``arm`` once.

        Returns:
            1 if the arm's rate is greater than a fresh uniform draw, else 0.
        """
        rate = self.rates[arm]
        won = rate > np.random.rand()
        if not won:
            return 0
        return 1

import numpy as np
import matplotlib.pyplot as plt
from bandit import Bandit
from agent import Agent

# Experiment settings.
runs = 200          # number of independent bandit/agent pairs
arms = 100          # number of arms per bandit
num_trials = 1000   # number of plays per run

# all_rates[run, n] is the average reward per play after n + 1 plays of `run`.
all_rates = np.zeros((runs, num_trials))

for run in range(runs):
    # Fresh environment and agent for every run.
    bandit = Bandit(arms)
    agent = Agent(arms)

    total_reward = 0
    rates = []

    for n in range(num_trials):
        # Select action. In this script agent.Qs holds the *sum* of rewards
        # per arm (see the update below), so Qs / ns is the mean reward.
        # Untried arms get an infinite score so each is played once first.
        tried = agent.ns > 0
        agent.ucb_values[~tried] = np.inf
        if tried.any():
            # Only reached once at least one play has happened, so n >= 1
            # and log(n) is well defined.
            agent.ucb_values[tried] = (
                agent.Qs[tried] / agent.ns[tried]
                + np.sqrt(2 * np.log(n) / agent.ns[tried])
            )
        select_arm = np.argmax(agent.ucb_values)

        # Play the slot game.
        reward = bandit.play(select_arm)

        # Update the per-arm reward sum and pull count.
        agent.Qs[select_arm] += reward
        agent.ns[select_arm] += 1
        total_reward += reward

        rates.append(total_reward / (n + 1))

    all_rates[run] = rates

# Mean curve over all runs; print its value after the final play.
avg_rates = np.average(all_rates, axis=0)
print(avg_rates[num_trials - 1])

plt.figure(figsize=(8, 6))
for run in range(runs):
    plt.plot(all_rates[run], alpha=0.1)
plt.plot(avg_rates, color='red', label='average')
plt.xlabel('Steps')
plt.ylabel('Rates')
plt.title('$runs:' + str(runs) + '$', loc='left')
plt.suptitle('Average reward', fontsize=20)
plt.grid()
plt.legend()
plt.ylim(0, 1)
plt.show()