Commit 75a6f326 authored by tuhe

Week8

parent edcb2b87
Showing 889 additions and 29 deletions
@@ -4,11 +4,11 @@ exam_tabular_examples
 #solutions/ex01
 #solutions/ex02
 #solutions/ex03
-solutions/ex04
-solutions/ex05
-solutions/ex06
-solutions/ex07
-solutions/ex08
+#solutions/ex04
+#solutions/ex05
+#solutions/ex06
+#solutions/ex07
+#solutions/ex08
 solutions/ex09
 solutions/ex10
 solutions/ex11
@@ -31,10 +31,10 @@ solutions/ex13
 #irlc/tests/tests_week02.py
 #irlc/tests/tests_week03.py
 #irlc/tests/tests_week04.py
-irlc/tests/tests_week05.py
-irlc/tests/tests_week06.py
-irlc/tests/tests_week07.py
-irlc/tests/tests_week08.py
+#irlc/tests/tests_week05.py
+#irlc/tests/tests_week06.py
+#irlc/tests/tests_week07.py
+#irlc/tests/tests_week08.py
 irlc/tests/tests_week09.py
 irlc/tests/tests_week10.py
 irlc/tests/tests_week11.py
@@ -68,10 +68,10 @@ irlc/exam/exam20*/solution
 # irlc/lectures/lec02
 #irlc/lectures/lec03
 #irlc/lectures/lec04
-irlc/lectures/lec05
-irlc/lectures/lec06
-irlc/lectures/lec07
-irlc/lectures/lec08
+#irlc/lectures/lec05
+#irlc/lectures/lec06
+#irlc/lectures/lec07
+#irlc/lectures/lec08
 irlc/lectures/lec09
 irlc/lectures/lec10
 irlc/lectures/lec11
...
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
"""This directory contains the exercises for week 8."""
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
import numpy as np
import matplotlib.pyplot as plt
if __name__ == "__main__":
from irlc import Agent, train, savepdf
from irlc.ex08.bandits import StationaryBandit
bandit = StationaryBandit(k=10) # A 10-armed bandit
agent = Agent(bandit) # Recall the agent takes random actions
_, trajectories = train(bandit, agent, return_trajectory=True, num_episodes=1, max_steps=500)
plt.plot(trajectories[0].reward)
plt.xlabel("Time step")
plt.ylabel("Reward per time step")
savepdf("dumbitA")
plt.show()
agent = Agent(bandit) # Recall the agent takes random actions
for i in range(10):
_, trajectories = train(bandit, agent, return_trajectory=True, num_episodes=1, max_steps=500)
regret = np.asarray([r['gab'] for r in trajectories[0].env_info[1:]])
cum_regret = np.cumsum(regret)
plt.plot(cum_regret, label=f"Episode {i}")
plt.legend()
plt.xlabel("Time step")
plt.ylabel("Accumulated Regret")
savepdf("dumbitB")
plt.show()
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
"""
References:
[SB18] Richard S. Sutton and Andrew G. Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online).
"""
import numpy as np
import matplotlib.pyplot as plt
from gymnasium import Env
from gymnasium.spaces import Discrete
from irlc import train
from tqdm import tqdm
import sys
from irlc import cache_read, cache_write, cache_exists
class BanditEnvironment(Env):
r"""
A helper class for defining bandit problems similar to, e.g., the 10-armed testbed discussed in (SB18).
We are going to implement the bandit problems as greatly simplified gym environments, as this will allow us to
implement the bandit agents as the familiar ``Agent``. I hope this way of doing it will make it clearer that bandits
are in fact a sort of reinforcement learning method.
The following code shows an example of how to use a bandit environment:
.. runblock:: pycon
>>> from irlc.ex08.bandits import StationaryBandit
>>> env = StationaryBandit(k=10) # 10-armed testbed.
>>> env.reset() # Reset env.q_star
>>> s, r, _, _, info = env.step(3)
>>> print(f"The reward we got from taking arm a=3 was {r=}")
"""
def __init__(self, k : int):
r"""
Initialize a bandit problem. The observation space is given a dummy value, since bandit problems of the sort
discussed in (SB18) don't have observations.
:param k: The number of arms.
"""
super().__init__()
self.observation_space = Discrete(1) # Dummy observation space with a single observation.
self.action_space = Discrete(k) # The arms labelled 0,1,...,k-1.
self.k = k # Number of arms
def reset(self):
r"""
Use this function to reset all the internal parameters of the environment and get ready for a new episode.
In the (SB18) 10-armed bandit testbed, this would involve resetting the expected return
.. math::
q^*_a
The function must return a dummy state and info dictionary to agree with the gym ``Env`` class, but their values are
irrelevant
:return:
- s - a state, for instance 0
- info - the info dictionary, for instance {}
"""
raise NotImplementedError("Implement the reset method")
def bandit_step(self, a):
r"""This helper function simplify the definition of the environments ``step``-function.
Given an action :math:`r`, this function computes the reward obtained by taking that action :math:`r_t`
and the gab. This is defined as the expected reward we miss out on by taking the potentially suboptimal action :math:`a`
and is defined as:
.. math::
\Delta = \max_{a'} q^*_{a'} - q_a
Once implemented, the reward and regret enters into the ``step`` function as follows:
.. runblock:: pycon
>>> from irlc.ex08.bandits import StationaryBandit
>>> env = StationaryBandit(k=4) # 4-armed testbed.
>>> env.reset() # Reset all parameters.
>>> _, r, _, _, info = env.step(2) # Take action a=2
>>> print(f"Reward from a=2 was {r=}, the gab was {info['gab']=}")
:param a: The current action we take
:return:
- r - The reward we thereby incur
- gab - The regret incurred by taking this action (0 for an optimal action)
"""
reward = 0 # Compute the reward associated with arm a
gab = 0 # Compute the gab by comparing to the optimal arm's reward.
return reward, gab
def step(self, action):
r"""You do not have to edit this function.
In a bandit environment, the step function is simplified greatly since there are no
states to keep track of. It should simply return the reward incurred by the action ``a``
and (for convenience) also return the gab in the ``info``-dictionary.
:param action: The current action we take :math:`a_t`
:return:
- next_state - This is always ``None``
- reward - The reward obtained by taking the given action. In (SB18) this is defined as :math:`r_t`
- terminated - Always ``False``. Bandit problems don't terminate.
- truncated - Always ``False``
- info - For convenience, this includes the gab (used by the plotting methods)
"""
reward, gab = self.bandit_step(action)
info = {'gab': gab}
return None, reward, False, False, info
class StationaryBandit(BanditEnvironment):
r"""Implement the 'stationary bandit environment' which is described in (SB18, Section 2.3)
and used as a running example throughout the chapter.
We will implement a version with a constant mean offset (q_star_mean), so that
q* = x + q_star_mean, x ~ Normal(0,1)
q_star_mean can just be considered to be zero at first.
"""
def __init__(self, k, q_star_mean=0):
super().__init__(k)
self.q_star_mean = q_star_mean
def reset(self):
""" Set q^*_k = N(0,1) + mean_value. The mean_value is 0 in most examples. I.e., implement the 10-armed testbed environment. """
self.q_star = np.random.randn(self.k) + self.q_star_mean
self.optimal_action = np.argmax(self.q_star) # Optimal action is the one with the largest q^*-value.
return 0, {} # The reset method in a gym Env must return a (dummy) state and a dictionary.
def bandit_step(self, a):
""" Return the reward/gab for action a for the simple bandit. Use self.q_star (see reset-function above).
To implement it, implement the reward (see the description of the 10-armed testbed for more information.
How is it computed from q^*_k?) and also compute the gab.
As a small hint, since we are computing the gab, it will in fact be the difference between the
value of q^* corresponding to the current arm, and the q^* value for the optimal arm.
Remember it is 0 if the optimal action is selected.
"""
# TODO: 2 lines missing.
raise NotImplementedError("Insert your solution and remove this error.")
# Actual logic goes here. Use self.q_star[a] to get mean reward and np.random.randn() to generate random numbers.
return reward, gab
def __str__(self):
return f"{type(self).__name__}_{self.q_star_mean}"
"""
Helper function for running a bunch of bandit experiments and plotting the results.
The function will run the agents in 'agents' (a list of bandit agents)
on the bandit environment 'bandit' and plot the result.
Each agent will be evaluated for num_episodes episodes, and one episode consists of 'steps' steps.
However, to speed things up you can use the cache, in which case the bandit will not be evaluated for more than
'max_episodes' episodes in total across all cached runs.
"""
def eval_and_plot(bandit, agents, num_episodes=2000, max_episodes=2000, steps=1000, labels=None, use_cache=True):
if labels is None:
labels = [str(agent) for agent in agents]
f, axs = plt.subplots(nrows=3, ncols=1)
f.set_size_inches(10,7)
(ax1, ax2, ax3) = axs
for i,agent in enumerate(agents):
rw, oa, regret, num_episodes = run_agent(bandit, agent, episodes=num_episodes, max_episodes=max_episodes, steps=steps, use_cache=use_cache)
ax1.plot(rw, label=labels[i])
ax2.plot(oa, label=labels[i])
ax3.plot(regret, label=labels[i])
for ax in axs:
ax.grid()
ax.set_xlabel("Steps")
ax1.set_ylabel("Average Reward")
ax2.set_ylabel("% optimal action")
ax3.set_ylabel("Regret $L_t$")
ax3.legend()
f.suptitle(f"Evaluated on {str(bandit)} for {num_episodes} episodes")
def run_agent(env, agent, episodes=2000, max_episodes=2000, steps=1000, use_cache=False, verbose=True):
"""
Helper function. Most of the work involves the cache; the actual training is done by 'train'.
"""
C_regrets_cum_sum, C_oas_sum, C_rewards_sum, C_n_episodes = 0, 0, 0, 0
if use_cache:
cache = f"cache/{str(env)}_{str(agent)}_{steps}.pkl"
if cache_exists(cache):
print("> Reading from cache", cache)
C_regrets_cum_sum, C_oas_sum, C_rewards_sum, C_n_episodes = cache_read(cache)
regrets = []
rewards = []
cruns = max(0, min(episodes, max_episodes - C_n_episodes)) # Missing runs.
for _ in tqdm(range(cruns), file=sys.stdout, desc=str(agent),disable=not verbose):
stats, traj = train(env, agent, max_steps=steps, verbose=False, return_trajectory=True)
regret = np.asarray([r['gab'] for r in traj[0].env_info[1:]])
regrets.append(regret)
rewards.append(traj[0].reward)
regrets_cum_sum = C_regrets_cum_sum
oas_sum = C_oas_sum
rewards_sum = C_rewards_sum
episodes = C_n_episodes
if len(regrets) > 0:
regrets_cum_sum += np.cumsum(np.sum(np.stack(regrets), axis=0))
oas_sum += np.sum(np.stack(regrets) == 0, axis=0)
rewards_sum += np.sum(np.stack(rewards), axis=0)
episodes += cruns
if use_cache and cruns > 0:
cache_write((regrets_cum_sum, oas_sum, rewards_sum, episodes), cache, protocol=4)
return rewards_sum/episodes, oas_sum/episodes, regrets_cum_sum/episodes, episodes
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
from irlc import savepdf
import numpy as np
import matplotlib.pyplot as plt
from irlc.ex08.bandits import eval_and_plot, StationaryBandit
from irlc import Agent
class GradientAgent(Agent):
def __init__(self, env, alpha=None, use_baseline=True):
self.k = env.action_space.n
self.alpha = alpha
self.baseline=use_baseline
self.H = np.zeros((self.k,))
super().__init__(env)
def Pa(self):
""" This helper method returns the probability distribution P(A=a) of chosing the
arm a as a vector
"""
pi_a = np.exp(self.H)
return pi_a / np.sum(pi_a)
def pi(self, s, t, info_s=None):
if t == 0:
self.R_bar = 0 # average reward baseline
self.H *= 0 # Reset H to all-zeros.
self.t = t # Store the current time step.
return np.random.choice( self.k, p=self.Pa() )
def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
# TODO: 9 lines missing.
raise NotImplementedError("Implement function body")
def __str__(self):
return f"{type(self).__name__}_{self.alpha}_{'baseline' if self.baseline else 'no_baseline'}"
if __name__ == "__main__":
baseline_bandit = StationaryBandit(k=10, q_star_mean=4)
alphas = [0.1, 0.4]
agents = [GradientAgent(baseline_bandit, alpha=alpha, use_baseline=False) for alpha in alphas]
agents += [GradientAgent(baseline_bandit, alpha=alpha, use_baseline=True) for alpha in alphas]
labels = [f'Gradient Bandit alpha={alpha}' for alpha in alphas ]
labels += [f'With baseline: Gradient Bandit alpha={alpha}' for alpha in alphas ]
use_cache = False
eval_and_plot(baseline_bandit, agents, max_episodes=2000, num_episodes=100, labels=labels, use_cache=use_cache)
savepdf("gradient_baseline")
plt.show()
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
import matplotlib.pyplot as plt
from irlc.ex08.simple_agents import BasicAgent
from irlc.ex08.bandits import StationaryBandit, eval_and_plot
from irlc.ex08.nonstationary import MovingAverageAgent, NonstationaryBandit
from irlc.ex08.gradient_agent import GradientAgent
from irlc.ex08.ucb_agent import UCBAgent
from irlc import savepdf
import time
if __name__ == "__main__":
print("Ladies and gentlemen. It is time for the graaand bandit race")
def intro(bandit, agents):
print("We are live from the beautiful surroundings where they will compete in:")
print(bandit)
print("Who will win? who will have the most regret? we are about to find out")
print("in a minute after a brief word from our sponsors")
time.sleep(1)
print("And we are back. Let us introduce todays contestants:")
for a in agents:
print(a)
print("And they are off!")
epsilon = 0.1
alpha = 0.1
c = 2
# TODO: 1 lines missing.
raise NotImplementedError("Define the bandit here: bandit1 = ...")
# TODO: 5 lines missing.
raise NotImplementedError("define agents list here")
labels = ["Basic", "Moving avg.", "gradient", "Gradient+baseline", "UCB"]
'''
Stationary, no offset. Vanilla setting.
'''
intro(bandit1, agents)
# TODO: 1 lines missing.
raise NotImplementedError("Call eval_and_plot here")
plt.suptitle("Stationary bandit (no offset)")
savepdf("grand_race_1")
plt.show()
'''
Stationary, but with offset
'''
print("Whew what a race. Let's get ready to next round:")
# TODO: 1 lines missing.
raise NotImplementedError("Define bandit2 = ... here")
intro(bandit2, agents)
# TODO: 1 lines missing.
raise NotImplementedError("Call eval_and_plot here")
plt.suptitle("Stationary bandit (with offset)")
savepdf("grand_race_2")
plt.show()
'''
Long (nonstationary) simulations
'''
print("Whew what a race. Let's get ready to next round which will be a long one.")
# TODO: 1 lines missing.
raise NotImplementedError("define bandit3 here")
intro(bandit3, agents)
# TODO: 1 lines missing.
raise NotImplementedError("call eval_and_plot here")
plt.suptitle("Non-stationary bandit (no offset)")
savepdf("grand_race_3")
plt.show()
'''
Stationary, no offset, long run. Exclude stupid bandits.
'''
agents2 = []
agents2 += [GradientAgent(bandit1, alpha=alpha, use_baseline=False)]
agents2 += [GradientAgent(bandit1, alpha=alpha, use_baseline=True)]
agents2 += [UCBAgent(bandit1, c=2)]
labels = ["Gradient", "Gradient+baseline", "UCB"]
intro(bandit1, agents2)
# TODO: 1 lines missing.
raise NotImplementedError("Call eval_and_plot here")
plt.suptitle("Stationary bandit (no offset)")
savepdf("grand_race_4")
plt.show()
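# A hedged sketch of the kind of setup the TODOs above ask for (one possible choice, kept as comments so
# it does not interfere with the exercise). The concrete constructor arguments are assumptions:
#   bandit1 = StationaryBandit(k=10)
#   agents = [BasicAgent(bandit1, epsilon=epsilon),
#             MovingAverageAgent(bandit1, epsilon=epsilon, alpha=alpha),
#             GradientAgent(bandit1, alpha=alpha, use_baseline=False),
#             GradientAgent(bandit1, alpha=alpha, use_baseline=True),
#             UCBAgent(bandit1, c=c)]
#   eval_and_plot(bandit1, agents, steps=1000, labels=labels, use_cache=use_cache)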
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
"""
References:
[SB18] Richard S. Sutton and Andrew G. Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online).
"""
import numpy as np
import matplotlib.pyplot as plt
from irlc.ex08.simple_agents import BasicAgent
from irlc.ex08.bandits import StationaryBandit, eval_and_plot
from irlc import savepdf
class NonstationaryBandit(StationaryBandit):
def __init__(self, k, q_star_mean=0, reward_change_std=0.01):
self.reward_change_std = reward_change_std
super().__init__(k, q_star_mean)
def bandit_step(self, a):
r""" Implement the non-stationary bandit environment (as described in (SB18)).
Hint: use reward_change_std * np.random.randn() to generate a single random number with the given std,
and add one such number to each coordinate of q_star. Remember you have to compute the regret as well; see
StationaryBandit for ideas (remember the optimal arm will change when you add noise to q_star). """
# TODO: 2 lines missing.
raise NotImplementedError("Implement function body")
return super().bandit_step(a)
def __str__(self):
return f"{type(self).__name__}_{self.q_star_mean}_{self.reward_change_std}"
class MovingAverageAgent(BasicAgent):
r"""
The simple bandit from (SB18, Section 2.4), but with moving average alpha
as described in (SB18, Eqn. (2.3))
"""
def __init__(self, env, epsilon, alpha):
# TODO: 2 lines missing.
raise NotImplementedError("Implement function body")
def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
# TODO: 1 lines missing.
raise NotImplementedError("Implement function body")
def __str__(self):
return f"{type(self).__name__}_{self.epsilon}_{self.alpha}"
if __name__ == "__main__":
plt.figure(figsize=(10, 10))
epsilon = 0.1
alphas = [0.15, 0.1, 0.05]
# TODO: 4 lines missing.
raise NotImplementedError("Insert your solution and remove this error.")
labels = [f"Basic agent, epsilon={epsilon}"]
# TODO: 1 lines missing.
raise NotImplementedError("Insert your solution and remove this error.")
use_cache = False # Set this to True to use cache (after code works!)
eval_and_plot(bandit, agents, steps=10000, num_episodes=200, labels=labels, use_cache=use_cache)
savepdf("nonstationary_bandits")
plt.show()
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
"""
References:
[SB18] Richard S. Sutton and Andrew G. Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online).
"""
import numpy as np
import matplotlib.pyplot as plt
from irlc.ex08.bandits import StationaryBandit, eval_and_plot
from irlc import Agent
from irlc import savepdf
class BasicAgent(Agent):
r"""
Simple bandit as described on (SB18, Section 2.4).
"""
def __init__(self, env, epsilon):
super().__init__(env)
self.k = env.action_space.n
self.epsilon = epsilon
def pi(self, s, t, info=None):
""" Since this is a bandit, s=None and can be ignored, while t refers to the time step in the current episode """
if t == 0:
# At step 0 of episode. Re-initialize data structure.
# TODO: 2 lines missing.
raise NotImplementedError("Insert your solution and remove this error.")
# compute action here
# TODO: 1 lines missing.
raise NotImplementedError("Insert your solution and remove this error.")
def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
""" Since this is a bandit, done, s, sp, info_s, info_sp can all be ignored.
From the input arguments you should only need the action a and the reward r (see the sketch after this class).
"""
# TODO: 2 lines missing.
raise NotImplementedError("Implement function body")
def __str__(self):
return f"BasicAgent_{self.epsilon}"
if __name__ == "__main__":
N = 100000
S = [np.max( np.random.randn(10) ) for _ in range(100000) ]
print( np.mean(S), np.std(S)/np.sqrt(N) )
use_cache = False # Set this to True to use cache (after code works!)
from irlc.utils.timer import Timer
timer = Timer(start=True)
R = 100
steps = 1000
env = StationaryBandit(k=10)
agents = [BasicAgent(env, epsilon=.1), BasicAgent(env, epsilon=.01), BasicAgent(env, epsilon=0) ]
eval_and_plot(env, agents, num_episodes=100, steps=1000, max_episodes=150, use_cache=use_cache)
savepdf("bandit_epsilon")
plt.show()
print(timer.display())
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
"""
References:
[SB18] Richard S. Sutton and Andrew G. Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online).
"""
import numpy as np
import matplotlib.pyplot as plt
from irlc.ex08.simple_agents import BasicAgent
from irlc import savepdf
from irlc import Agent
class UCBAgent(Agent):
def __init__(self, env, c=2):
self.c = c
super().__init__(env)
def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
# TODO: 2 lines missing.
raise NotImplementedError("Train agent here")
def pi(self, s, k, info=None):
if k == 0:
""" Initialize the agent"""
# TODO: 3 lines missing.
raise NotImplementedError("Reset agent (i.e., make it ready to learn in a new episode with a new optimal action)")
# TODO: 1 lines missing.
raise NotImplementedError("Compute (and return) optimal action")
def __str__(self):
return f"{type(self).__name__}_{self.c}"
from irlc.ex08.bandits import StationaryBandit, eval_and_plot
if __name__ == "__main__":
r"""Reproduce (SB18, Fig. 2.4) comparing UCB agent to epsilon greedy """
runs, use_cache = 100, False
c = 2
eps = 0.1
steps = 1000
env = StationaryBandit(k=10)
agents = [UCBAgent(env,c=c), BasicAgent(env, epsilon=eps)]
eval_and_plot(bandit=env, agents=agents, num_episodes=runs, steps=steps, max_episodes=2000, use_cache=use_cache)
savepdf("UCB_agent")
plt.show()
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
from irlc.utils.bandit_graphics_environment import GraphicalBandit
import time
from irlc import train
from irlc.ex08.simple_agents import BasicAgent
from irlc import interactive
def bandit_eps(autoplay=False):
env = GraphicalBandit(10, render_mode='human',frames_per_second=30)
env.reset()
agent = BasicAgent(env, epsilon=0.1)
agent.method = 'Epsilon-greedy'
env, agent = interactive(env, agent, autoplay=autoplay)
t0 = time.time()
n = 3000
stats, _ = train(env, agent, max_steps=n, num_episodes=10, return_trajectory=False, verbose=False)
tpf = (time.time()-t0)/ n
print("tpf", tpf, 'fps', 1/tpf)
env.close()
if __name__ == "__main__":
bandit_eps()
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
from irlc.utils.bandit_graphics_environment import GraphicalBandit
from irlc import interactive, train
# import numpy as np
import time
def bandit_ucb(autoplay=False):
env = GraphicalBandit(10, render_mode='human', frames_per_second=30)
env.reset()
#env.viewer.show_q_star = True
#env.viewer.show_q_ucb = True
from irlc.ex08.ucb_agent import UCBAgent
agent = UCBAgent(env, c=1)
agent.method = 'UCB'
env, agent = interactive(env, agent, autoplay=autoplay)
t0 = time.time()
n = 500
stats, _ = train(env, agent, max_steps=n, num_episodes=10, return_trajectory=False, verbose=False)
tpf = (time.time() - t0) / n
print("tpf", tpf, 'fps', 1 / tpf)
env.close()
if __name__ == "__main__":
bandit_ucb()
@@ -79,20 +79,20 @@ class CartpoleCostQuestion(DirectSolverQuestion):
         from irlc.ex05.direct_cartpole_kelly import compute_solutions
         return compute_solutions()[1]
-class BrachistochroneQuestion(DirectSolverQuestion):
-    """ Brachistochrone (unconstrained) """
-
-    @classmethod
-    def compute_solution(cls):
-        from irlc.ex05.direct_brachistochrone import compute_constrained_solutions
-        return compute_constrained_solutions()[1]
-
-class BrachistochroneConstrainedQuestion(DirectSolverQuestion):
-    """ Brachistochrone (constrained) """
-    @classmethod
-    def compute_solution(cls):
-        from irlc.ex05.direct_brachistochrone import compute_constrained_solutions
-        return compute_constrained_solutions()[1]
+# class BrachistochroneQuestion(DirectSolverQuestion):
+#     """ Brachistochrone (unconstrained) """
+#
+#     @classmethod
+#     def compute_solution(cls):
+#         from irlc.ex05.direct_brachistochrone import compute_constrained_solutions
+#         return compute_constrained_solutions()[1]
+#
+# class BrachistochroneConstrainedQuestion(DirectSolverQuestion):
+#     """ Brachistochrone (constrained) """
+#     @classmethod
+#     def compute_solution(cls):
+#         from irlc.ex05.direct_brachistochrone import compute_constrained_solutions
+#         return compute_constrained_solutions()[1]
 class Week05Tests(Report):
     title = "Tests for week 05"
@@ -105,8 +105,8 @@ class Week05Tests(Report):
         (DirectAgentPendulum, 10),  # ok
         (CartpoleTimeQuestion, 5),  # ok
         (CartpoleCostQuestion, 5),  # ok
-        (BrachistochroneQuestion, 5),  # ok
-        (BrachistochroneConstrainedQuestion, 10),  # ok
+        # (BrachistochroneQuestion, 5),  # ok
+        # (BrachistochroneConstrainedQuestion, 10),  # ok
     ]
 if __name__ == '__main__':
...
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
from unitgrade import UTestCase, Report, cache
import numpy as np
from irlc import train
def train_recording(env, agent, trajectories):
for t in trajectories:
env.reset()
for k in range(len(t.action)):
s = t.state[k]
r = t.reward[k]
a = t.action[k]
sp = t.state[k+1]
agent.pi(s,k)
agent.train(s, a, r, sp, done=k == len(t.action)-1)
class BanditQuestion(UTestCase):
""" Value (Q) function estimate """
tol = 1e-2 # tie-breaking in the gradient bandit is ill-defined.
# testfun = QPrintItem.assertL2
# def setUpClass(cls) -> None:
# from irlc.ex08.simple_agents import BasicAgent
# from irlc.ex08.bandits import StationaryBandit
# env = StationaryBandit(k=10, )
# agent = BasicAgent(env, epsilon=0.1)
# _, cls.trajectories = train(env, agent, return_trajectory=True, num_episodes=1, max_steps=100)
# cls.Q = agent.Q
# cls.env = env
# cls.agent = agent
def get_env_agent(self):
from irlc.ex08.simple_agents import BasicAgent
from irlc.ex08.bandits import StationaryBandit
env = StationaryBandit(k=10)
agent = BasicAgent(env, epsilon=0.1)
return env, agent
@cache
def get_trajectories(self):
env, agent = self.get_env_agent()
_, trajectories = train(env, agent, return_trajectory=True, num_episodes=1, max_steps=100)
return trajectories
# def precompute_payload(self):
# env, agent = self.get_env_agent()
# _, trajectories = train(env, agent, return_trajectory=True, num_episodes=1, max_steps=100)
# return trajectories, agent.Q
def test_agent(self):
trajectories = self.get_trajectories()
env, agent = self.get_env_agent()
train_recording(env, agent, trajectories)
self.assertL2(agent.Q, tol=1e-5)
# return agent.Q
# self.Q = Q
# self.question.agent = agent
# return agent.Q
# testfun = QPrintItem.assertL2
def test_action_distributin(self):
T = 10000
tol = 1 / np.sqrt(T) * 5
trajectories = self.get_trajectories()
env, agent = self.get_env_agent()
train_recording(env, agent, trajectories)
# for k in self._cache.keys(): print(k)
from collections import Counter
counts = Counter([agent.pi(None, k) for k in range(T)])
distrib = [counts[k] / T for k in range(env.k)]
self.assertL2(np.asarray(distrib), tol=tol)
# def process_output(self, res, txt, numbers):
# return res
# def process_output(self, res, txt, numbers):
# return res
#
# def test(self, computed, expected):
# super().test(computed, self.Q)
# class BanditQuestion(QPrintItem):
# # tol = 1e-6
# tol = 1e-2 # tie-breaking in the gradient bandit is ill-defined.
# title = "Value (Q) function estimate"
# testfun = QPrintItem.assertL2
#
# def get_env_agent(self):
# from irlc.ex08.simple_agents import BasicAgent
# from irlc.ex08.bandits import StationaryBandit
# env = StationaryBandit(k=10, )
# agent = BasicAgent(env, epsilon=0.1)
# return env, agent
#
# def precompute_payload(self):
# env, agent = self.get_env_agent()
# _, trajectories = train(env, agent, return_trajectory=True, num_episodes=1, max_steps=100)
# return trajectories, agent.Q
#
# def compute_answer_print(self):
# trajectories, Q = self.precomputed_payload()
# env, agent = self.get_env_agent()
# train_recording(env, agent, trajectories)
# self.Q = Q
# self.question.agent = agent
# return agent.Q
#
# def process_output(self, res, txt, numbers):
# return res
#
# def test(self, computed, expected):
# super().test(computed, self.Q)
#
# class BanditItemActionDistribution(QPrintItem):
# # Assumes setup has already been done.
# title = "Action distribution test"
# T = 10000
# tol = 1/np.sqrt(T)*5
# testfun = QPrintItem.assertL2
#
# def compute_answer_print(self):
# # print("In agent print code")
# from collections import Counter
# counts = Counter( [self.question.agent.pi(None, k) for k in range(self.T)] )
# distrib = [counts[k] / self.T for k in range(self.question.agent.env.k)]
# return np.asarray(distrib)
#
# def process_output(self, res, txt, numbers):
# return res
#
# class BanditQuestion(QuestionGroup):
# title = "Simple bandits"
# class SimpleBanditItem(BanditItem):
# #title = "Value function estimate"
# def get_env_agent(self):
# from irlc.ex08.simple_agents import BasicAgent
# from irlc.ex08.bandits import StationaryBandit
# env = StationaryBandit(k=10, )
# agent = BasicAgent(env, epsilon=0.1)
# return env, agent
# class SimpleBanditActionDistribution(BanditItemActionDistribution):
# pass
class GradientBanditQuestion(BanditQuestion):
""" Gradient agent """
# class SimpleBanditItem(BanditItem):
# title = "Simple agent question"
def get_env_agent(self):
from irlc.ex08.bandits import StationaryBandit
from irlc.ex08.gradient_agent import GradientAgent
env = StationaryBandit(k=10)
agent = GradientAgent(env, alpha=0.05)
return env, agent
# def precompute_payload(self):
# env, agent = self.get_env_agent()
# _, trajectories = train(env, agent, return_trajectory=True, num_episodes=1, max_steps=100)
# return trajectories
def test_agent(self):
trajectories = self.get_trajectories()
env, agent = self.get_env_agent()
train_recording(env, agent, trajectories)
self.assertL2(agent.H, tol=1e-5)
# def test(self, computed, expected):
# self.testfun(computed, self.H)
#
# class SimpleBanditActionDistribution(BanditItemActionDistribution):
# pass
# class GradientBanditQuestion(QuestionGroup):
# title = "Gradient agent"
# class SimpleBanditItem(BanditItem):
# # title = "Simple agent question"
# def get_env_agent(self):
# from irlc.ex08.bandits import StationaryBandit
# from irlc.ex08.gradient_agent import GradientAgent
# env = StationaryBandit(k=10)
# agent = GradientAgent(env, alpha=0.05)
# return env, agent
#
# def precompute_payload(self):
# env, agent = self.get_env_agent()
# _, trajectories = train(env, agent, return_trajectory=True, num_episodes=1, max_steps=100)
# return trajectories, agent.H
#
# def compute_answer_print(self):
# trajectories, H = self.precomputed_payload()
# env, agent = self.get_env_agent()
# train_recording(env, agent, trajectories)
# self.H = H
# self.question.agent = agent
# return agent.H
#
# def test(self, computed, expected):
# self.testfun(computed, self.H)
#
# class SimpleBanditActionDistribution(BanditItemActionDistribution):
# pass
class UCBAgentQuestion(BanditQuestion):
""" UCB agent """
# class UCBAgentItem(BanditItem):
def get_env_agent(self):
from irlc.ex08.bandits import StationaryBandit
from irlc.ex08.ucb_agent import UCBAgent
env = StationaryBandit(k=10)
agent = UCBAgent(env)
return env, agent
# class UCBAgentActionDistribution(BanditItemActionDistribution):
# pass
# class UCBAgentQuestion(QuestionGroup):
# title = "UCB agent"
# class UCBAgentItem(BanditItem):
# def get_env_agent(self):
# from irlc.ex08.bandits import StationaryBandit
# from irlc.ex08.ucb_agent import UCBAgent
# env = StationaryBandit(k=10)
# agent = UCBAgent(env)
# return env, agent
#
# class UCBAgentActionDistribution(BanditItemActionDistribution):
# pass
# class NonstatiotnaryAgentQuestion(QuestionGroup):
# title = "Nonstationary bandit environment"
# class NonstationaryItem(BanditItem):
# def get_env_agent(self):
# epsilon = 0.1
# from irlc.ex08.nonstationary import NonstationaryBandit, MovingAverageAgent
# bandit = NonstationaryBandit(k=10)
# agent = MovingAverageAgent(bandit, epsilon=epsilon, alpha=0.15)
# return bandit, agent
#
# class NonstationaryActionDistribution(BanditItemActionDistribution):
# pass
class NonstatiotnaryAgentQuestion(BanditQuestion):
""" UCB agent """
# class UCBAgentItem(BanditItem):
def get_env_agent(self):
epsilon = 0.1
from irlc.ex08.nonstationary import NonstationaryBandit, MovingAverageAgent
bandit = NonstationaryBandit(k=10)
agent = MovingAverageAgent(bandit, epsilon=epsilon, alpha=0.15)
return bandit, agent
import irlc
class Week08Tests(Report):
title = "Tests for week 08"
pack_imports = [irlc]
individual_imports = []
questions = [
(BanditQuestion, 10),
(GradientBanditQuestion, 10),
(UCBAgentQuestion, 5),
(NonstatiotnaryAgentQuestion, 5)
]
if __name__ == '__main__':
from unitgrade import evaluate_report_student
evaluate_report_student(Week08Tests())