用简单平均法强化学习

"""Epsilon-greedy agents on a k-armed Gaussian bandit testbed.

Compares the sample-average update rule against a constant step-size
("weighted average") rule for several exploration rates, averaging the
results over many independently drawn bandit problems.
"""

from abc import ABC
from multiprocessing.pool import Pool
from typing import List

import numpy as np
import pandas as pd

# NOTE: matplotlib is imported lazily inside main(); it is only needed for
# plotting, so the simulation code (and spawned worker processes) can be
# imported without it.


class Strategy(ABC):
    """Rule that turns one observed reward into a new action-value estimate."""

    def update_estimates(self, step: int, estimates: np.ndarray, action: int, reward: float):
        """Return the new estimate for ``action``.

        Args:
            step: Number of times ``action`` has been selected so far,
                including the current selection (1-based).
            estimates: Current value estimates, indexed by action.
            action: Index of the action that was just taken.
            reward: Reward observed for that action.

        Raises:
            NotImplementedError: Always; subclasses must override.
        """
        raise NotImplementedError()


class Averaging(Strategy):
    """Incremental sample average: ``Q <- Q + (R - Q) / N(a)``."""

    def __str__(self):
        return 'avg'

    def update_estimates(self, step: int, estimates: np.ndarray, action: int, reward: float):
        """Return the running mean of all rewards seen for ``action``.

        ``step`` must be the per-action selection count, not the global
        simulation step, otherwise the result is not the sample mean.
        """
        current = estimates[action]
        return current + 1.0 / step * (reward - current)


class WeightedAveraging(Strategy):
    """Constant step-size update: ``Q <- Q + alpha * (R - Q)``."""

    def __init__(self, alpha):
        # Step size applied to every update, independent of the visit count.
        self.alpha = alpha

    def __str__(self):
        return 'weighted-avg_alpha=%.2f' % self.alpha

    def update_estimates(self, step: int, estimates: np.ndarray, action: int, reward: float):
        """Return the exponentially recency-weighted estimate for ``action``.

        ``step`` is accepted for interface compatibility and is not used.
        """
        current = estimates[action]
        return current + self.alpha * (reward - current)


class Agent:
    """Epsilon-greedy agent that delegates value updates to a ``Strategy``."""

    def __init__(self, nb_actions, epsilon, strategy: Strategy):
        self.nb_actions = nb_actions
        self.epsilon = epsilon
        self.strategy = strategy
        self.estimates = np.zeros(self.nb_actions)
        # How often each action has been taken; the sample-average rule
        # divides by this per-action count rather than by the global step.
        self.action_counts = np.zeros(self.nb_actions, dtype=int)

    def __str__(self):
        return ','.join(['eps=%.2f' % self.epsilon, str(self.strategy)])

    def get_action(self):
        """Pick the greedy action, or with probability epsilon another one.

        When exploring, the currently best-known action is excluded, so an
        exploratory move always differs from the greedy choice.
        """
        best_known = np.argmax(self.estimates)
        if np.random.rand() < self.epsilon and len(self.estimates) > 1:
            explore = best_known
            # Rejection-sample until we hit an action other than the greedy one.
            while explore == best_known:
                explore = np.random.randint(0, len(self.estimates))
            return explore
        return best_known

    def update_estimates(self, step, action, reward):
        """Record ``reward`` for ``action`` and refresh its value estimate.

        Args:
            step: Global simulation step. Kept for backward compatibility;
                the strategy is given the per-action selection count
                instead, which is what the sample-average rule requires.
            action: Index of the action that was taken.
            reward: Reward observed for that action.
        """
        self.action_counts[action] += 1
        self.estimates[action] = self.strategy.update_estimates(
            int(self.action_counts[action]), self.estimates, action, reward)

    def reset(self):
        """Forget everything learned so the agent can start a new task."""
        self.estimates = np.zeros(self.nb_actions)
        self.action_counts = np.zeros(self.nb_actions, dtype=int)


def play_bandit(agent, nb_arms, nb_steps):
    """Run ``agent`` on one freshly drawn bandit problem.

    True arm values are drawn from N(0, 1); each pull returns the arm's true
    value plus N(0, 1) noise. The agent is reset before the run.

    Returns:
        A DataFrame with one row per step and the columns ``rewards``
        (observed reward) and ``optimal_actions`` (whether the arm with the
        highest true value was chosen).
    """
    agent.reset()
    bandit_rewards = np.random.normal(0, 1, nb_arms)
    # The best arm is fixed for the whole task, so look it up only once.
    optimal_action = np.argmax(bandit_rewards)

    rewards: List[float] = []
    optimal_actions: List[bool] = []
    for step in range(1, nb_steps + 1):
        action = agent.get_action()
        reward = bandit_rewards[action] + np.random.normal(0, 1)
        agent.update_estimates(step, action, reward)
        rewards.append(reward)
        optimal_actions.append(optimal_action == action)

    return pd.DataFrame(dict(
        optimal_actions=optimal_actions,
        rewards=rewards
    ))


def _reseed_worker():
    """Give a pool worker its own NumPy random stream.

    With the fork start method every worker inherits a copy of the parent's
    global NumPy RNG state; without reseeding, workers would replay the same
    random numbers and the "independent" tasks would be correlated.
    """
    np.random.seed()


def main():
    """Simulate all agent configurations and plot the averaged learning curves."""
    import matplotlib.pyplot as plt

    nb_tasks = 2000
    nb_steps = 1000
    nb_arms = 10

    fig, (ax_rewards, ax_optimal) = plt.subplots(2, 1, sharex='col', figsize=(8, 9))

    agents = [
        Agent(nb_actions=nb_arms, epsilon=0.00, strategy=Averaging()),
        Agent(nb_actions=nb_arms, epsilon=0.01, strategy=Averaging()),
        Agent(nb_actions=nb_arms, epsilon=0.10, strategy=Averaging()),
        Agent(nb_actions=nb_arms, epsilon=0.00, strategy=WeightedAveraging(0.5)),
        Agent(nb_actions=nb_arms, epsilon=0.01, strategy=WeightedAveraging(0.5)),
        Agent(nb_actions=nb_arms, epsilon=0.10, strategy=WeightedAveraging(0.5)),
    ]

    # The context manager shuts the worker processes down when we are done.
    with Pool(initializer=_reseed_worker) as pool:
        for agent in agents:
            print('Agent: %s' % str(agent))

            args = [(agent, nb_arms, nb_steps) for _ in range(nb_tasks)]
            results = pool.starmap(play_bandit, args)

            # Element-wise mean over all tasks, per step.
            df_result = sum(results) / nb_tasks
            df_result.rewards.plot(ax=ax_rewards, label=str(agent))
            # The axis is labelled in percent, so scale the 0..1 fraction.
            (df_result.optimal_actions * 100).plot(ax=ax_optimal)

    ax_rewards.set_title('Rewards')
    ax_rewards.set_ylabel('Average reward')
    ax_rewards.legend()
    ax_optimal.set_title('Optimal action')
    ax_optimal.set_ylabel('% optimal action')
    ax_optimal.set_xlabel('steps')
    plt.xlim([0, nb_steps])
    plt.show()


if __name__ == '__main__':
    main()

1条回答

网友
1楼 · 发布于 2024-06-26 01:53:29

在更新规则的公式中
new_estimate = current_estimate + 1.0 / step * (reward - current_estimate)
参数step应该是特定action被执行的次数，而不是模拟的总步数。因此，您需要为每个action单独记录其被执行的次数，并将该计数与对应的动作价值估计一起存储，以便在更新时使用。
这也可以从第2.4章增量实现末尾的伪代码框中看到：
（来源：Richard S. Sutton 和 Andrew G. Barto，《强化学习导论》（Reinforcement Learning: An Introduction），第二版，2018年，第2.4章“增量实现”）

代码（MVP）

相关问题更多 >

编程相关推荐

热门问题

热门文章