Commit 75a6f326 authored by tuhe

Week8

parent edcb2b87
Showing 889 additions and 29 deletions
@@ -4,11 +4,11 @@ exam_tabular_examples
 #solutions/ex01
 #solutions/ex02
 #solutions/ex03
-solutions/ex04
-solutions/ex05
-solutions/ex06
-solutions/ex07
-solutions/ex08
+#solutions/ex04
+#solutions/ex05
+#solutions/ex06
+#solutions/ex07
+#solutions/ex08
 solutions/ex09
 solutions/ex10
 solutions/ex11
@@ -31,10 +31,10 @@ solutions/ex13
 #irlc/tests/tests_week02.py
 #irlc/tests/tests_week03.py
 #irlc/tests/tests_week04.py
-irlc/tests/tests_week05.py
-irlc/tests/tests_week06.py
-irlc/tests/tests_week07.py
-irlc/tests/tests_week08.py
+#irlc/tests/tests_week05.py
+#irlc/tests/tests_week06.py
+#irlc/tests/tests_week07.py
+#irlc/tests/tests_week08.py
 irlc/tests/tests_week09.py
 irlc/tests/tests_week10.py
 irlc/tests/tests_week11.py
@@ -68,10 +68,10 @@ irlc/exam/exam20*/solution
 # irlc/lectures/lec02
 #irlc/lectures/lec03
 #irlc/lectures/lec04
-irlc/lectures/lec05
-irlc/lectures/lec06
-irlc/lectures/lec07
-irlc/lectures/lec08
+#irlc/lectures/lec05
+#irlc/lectures/lec06
+#irlc/lectures/lec07
+#irlc/lectures/lec08
 irlc/lectures/lec09
 irlc/lectures/lec10
 irlc/lectures/lec11
...
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
"""This directory contains the exercises for week 8."""
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
import numpy as np
import matplotlib.pyplot as plt
if __name__ == "__main__":
from irlc import Agent, train, savepdf
from irlc.ex08.bandits import StationaryBandit
bandit = StationaryBandit(k=10) # A 10-armed bandit
agent = Agent(bandit) # Recall the agent takes random actions
_, trajectories = train(bandit, agent, return_trajectory=True, num_episodes=1, max_steps=500)
plt.plot(trajectories[0].reward)
plt.xlabel("Time step")
plt.ylabel("Reward per time step")
savepdf("dumbitA")
plt.show()
agent = Agent(bandit) # Recall the agent takes random actions
for i in range(10):
_, trajectories = train(bandit, agent, return_trajectory=True, num_episodes=1, max_steps=500)
regret = np.asarray([r['gab'] for r in trajectories[0].env_info[1:]])
cum_regret = np.cumsum(regret)
plt.plot(cum_regret, label=f"Episode {i}")
plt.legend()
plt.xlabel("Time step")
plt.ylabel("Accumulated Regret")
savepdf("dumbitB")
plt.show()
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
"""
References:
[SB18] Richard S. Sutton and Andrew G. Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online).
"""
import numpy as np
import matplotlib.pyplot as plt
from gymnasium import Env
from gymnasium.spaces import Discrete
from irlc import train
from tqdm import tqdm
import sys
from irlc import cache_read, cache_write, cache_exists
class BanditEnvironment(Env):
r"""
A helper class for defining bandit problems similar to, e.g., the 10-armed testbed discussed in (SB18).
We are going to implement the bandit problems as greatly simplified gym environments, as this will allow us to
implement the bandit agents as the familiar ``Agent``. I hope this way of doing it will make it clearer that bandits
are in fact a sort of reinforcement learning method.
The following code shows an example of how to use a bandit environment:
.. runblock:: pycon
>>> from irlc.ex08.bandits import StationaryBandit
>>> env = StationaryBandit(k=10) # 10-armed testbed.
>>> env.reset() # Reset env.q_star
>>> s, r, _, _, info = env.step(3)
>>> print(f"The reward we got from taking arm a=3 was {r=}")
"""
def __init__(self, k : int):
r"""
Initialize a bandit problem. The observation space is given a dummy value, since bandit problems of the sort
discussed in (SB18) don't have observations.
:param k: The number of arms.
"""
super().__init__()
self.observation_space = Discrete(1) # Dummy observation space with a single observation.
self.action_space = Discrete(k) # The arms labelled 0,1,...,k-1.
self.k = k # Number of arms
def reset(self):
r"""
Use this function to reset all the internal parameters of the environment and get ready for a new episode.
In the (SB18) 10-armed bandit testbed, this would involve resetting the expected return
.. math::
q^*_a
The function must return a dummy state and info dictionary to agree with the gym ``Env`` class, but their values are
irrelevant
:return:
- s - a state, for instance 0
- info - the info dictionary, for instance {}
"""
raise NotImplementedError("Implement the reset method")
def bandit_step(self, a):
r"""This helper function simplify the definition of the environments ``step``-function.
Given an action :math:`r`, this function computes the reward obtained by taking that action :math:`r_t`
and the gab. This is defined as the expected reward we miss out on by taking the potentially suboptimal action :math:`a`
and is defined as:
.. math::
\Delta = \max_{a'} q^*_{a'} - q_a
Once implemented, the reward and regret enters into the ``step`` function as follows:
.. runblock:: pycon
>>> from irlc.ex08.bandits import StationaryBandit
>>> env = StationaryBandit(k=4) # 4-armed testbed.
>>> env.reset() # Reset all parameters.
>>> _, r, _, _, info = env.step(2) # Take action a=2
>>> print(f"Reward from a=2 was {r=}, the gab was {info['gab']=}")
:param a: The current action we take
:return:
- r - The reward we thereby incur
- gab - The regret incurred by taking this action (0 for an optimal action)
"""
reward = 0 # Compute the reward associated with arm a
gab = 0 # Compute the gab by comparing to the optimal arm's reward.
return reward, gab
def step(self, action):
r"""You do not have to edit this function.
In a bandit environment, the step function is simplified greatly since there are no
states to keep track of. It should simply return the reward incurred by the action ``a``
and (for convenience) also return the gab in the ``info``-dictionary.
:param action: The current action we take :math:`a_t`
:return:
- next_state - This is always ``None``
- reward - The reward obtained by taking the given action. In (SB18) this is defined as :math:`r_t`
- terminated - Always ``False``. Bandit problems don't terminate.
- truncated - Always ``False``
- info - For convenience, this includes the gab (used by the plotting methods)
"""
reward, gab = self.bandit_step(action)
info = {'gab': gab}
return None, reward, False, False, info
class StationaryBandit(BanditEnvironment):
r"""Implement the 'stationary bandit environment' which is described in (SB18, Section 2.3)
and used as a running example throughout the chapter.
We will implement a version with a constant mean offset (q_star_mean), so that
q* = x + q_star_mean, x ~ Normal(0,1)
q_star_mean can just be considered to be zero at first.
"""
def __init__(self, k, q_star_mean=0):
super().__init__(k)
self.q_star_mean = q_star_mean
def reset(self):
""" Set q^*_k = N(0,1) + mean_value. The mean_value is 0 in most examples. I.e., implement the 10-armed testbed environment. """
self.q_star = np.random.randn(self.k) + self.q_star_mean
self.optimal_action = np.argmax(self.q_star) # Optimal action is the one with the largest q^*-value.
return 0, {} # The reset method in a gym Env must return a (dummy) state and a dictionary.
def bandit_step(self, a):
""" Return the reward/gab for action a for the simple bandit. Use self.q_star (see reset-function above).
To implement it, implement the reward (see the description of the 10-armed testbed for more information.
How is it computed from q^*_k?) and also compute the gab.
As a small hint, since we are computing the gab, it will in fact be the difference between the
value of q^* corresponding to the current arm, and the q^* value for the optimal arm.
Remember it is 0 if the optimal action is selected.
"""
# TODO: 2 lines missing.
raise NotImplementedError("Insert your solution and remove this error.")
# Actual logic goes here. Use self.q_star[a] to get mean reward and np.random.randn() to generate random numbers.
return reward, gab
def __str__(self):
return f"{type(self).__name__}_{self.q_star_mean}"
"""
Helper function for running a bunch of bandit experiments and plotting the results.
The function will run the agents in 'agents' (a list of bandit agents)
on the bandit environment 'bandit' and plot the result.
Each agent will be evaluated for num_episodes episodes, and one episode consists of 'steps' steps.
However, to speed things up you can use the cache, in which case the bandit will not be evaluated for more than
'max_episodes' episodes in total across all cached runs.
"""
def eval_and_plot(bandit, agents, num_episodes=2000, max_episodes=2000, steps=1000, labels=None, use_cache=True):
if labels is None:
labels = [str(agent) for agent in agents]
f, axs = plt.subplots(nrows=3, ncols=1)
f.set_size_inches(10,7)
(ax1, ax2, ax3) = axs
for i,agent in enumerate(agents):
rw, oa, regret, num_episodes = run_agent(bandit, agent, episodes=num_episodes, max_episodes=max_episodes, steps=steps, use_cache=use_cache)
ax1.plot(rw, label=labels[i])
ax2.plot(oa, label=labels[i])
ax3.plot(regret, label=labels[i])
for ax in axs:
ax.grid()
ax.set_xlabel("Steps")
ax1.set_ylabel("Average Reward")
ax2.set_ylabel("% optimal action")
ax3.set_ylabel("Regret $L_t$")
ax3.legend()
f.suptitle(f"Evaluated on {str(bandit)} for {num_episodes} episodes")
def run_agent(env, agent, episodes=2000, max_episodes=2000, steps=1000, use_cache=False, verbose=True):
"""
Helper function. Most of the work involves the cache; the actual training is done by 'train'.
"""
C_regrets_cum_sum, C_oas_sum, C_rewards_sum, C_n_episodes = 0, 0, 0, 0
if use_cache:
cache = f"cache/{str(env)}_{str(agent)}_{steps}.pkl"
if cache_exists(cache):
print("> Reading from cache", cache)
C_regrets_cum_sum, C_oas_sum, C_rewards_sum, C_n_episodes = cache_read(cache)
regrets = []
rewards = []
cruns = max(0, min(episodes, max_episodes - C_n_episodes)) # Missing runs.
for _ in tqdm(range(cruns), file=sys.stdout, desc=str(agent),disable=not verbose):
stats, traj = train(env, agent, max_steps=steps, verbose=False, return_trajectory=True)
regret = np.asarray([r['gab'] for r in traj[0].env_info[1:]])
regrets.append(regret)
rewards.append(traj[0].reward)
regrets_cum_sum = C_regrets_cum_sum
oas_sum = C_oas_sum
rewards_sum = C_rewards_sum
episodes = C_n_episodes
if len(regrets) > 0:
regrets_cum_sum += np.cumsum(np.sum(np.stack(regrets), axis=0))
oas_sum += np.sum(np.stack(regrets) == 0, axis=0)
rewards_sum += np.sum(np.stack(rewards), axis=0)
episodes += cruns
if use_cache and cruns > 0:
cache_write((regrets_cum_sum, oas_sum, rewards_sum, episodes), cache, protocol=4)
return rewards_sum/episodes, oas_sum/episodes, regrets_cum_sum/episodes, episodes
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
from irlc import savepdf
import numpy as np
import matplotlib.pyplot as plt
from irlc.ex08.bandits import eval_and_plot, StationaryBandit
from irlc import Agent
class GradientAgent(Agent):
def __init__(self, env, alpha=None, use_baseline=True):
self.k = env.action_space.n
self.alpha = alpha
self.baseline=use_baseline
self.H = np.zeros((self.k,))
super().__init__(env)
def Pa(self):
""" This helper method returns the probability distribution P(A=a) of chosing the
arm a as a vector
"""
pi_a = np.exp(self.H)
return pi_a / np.sum(pi_a)
def pi(self, s, t, info_s=None):
if t == 0:
self.R_bar = 0 # average reward baseline
self.H *= 0 # Reset H to all-zeros.
self.t = t # Store the current time step.
return np.random.choice( self.k, p=self.Pa() )
def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
# TODO: 9 lines missing.
raise NotImplementedError("Implement function body")
def __str__(self):
return f"{type(self).__name__}_{self.alpha}_{'baseline' if self.baseline else 'no_baseline'}"
if __name__ == "__main__":
baseline_bandit = StationaryBandit(k=10, q_star_mean=4)
alphas = [0.1, 0.4]
agents = [GradientAgent(baseline_bandit, alpha=alpha, use_baseline=False) for alpha in alphas]
agents += [GradientAgent(baseline_bandit, alpha=alpha, use_baseline=True) for alpha in alphas]
labels = [f'Gradient Bandit alpha={alpha}' for alpha in alphas ]
labels += [f'With baseline: Gradient Bandit alpha={alpha}' for alpha in alphas ]
use_cache = False
eval_and_plot(baseline_bandit, agents, max_episodes=2000, num_episodes=100, labels=labels, use_cache=use_cache)
savepdf("gradient_baseline")
plt.show()
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
import matplotlib.pyplot as plt
from irlc.ex08.simple_agents import BasicAgent
from irlc.ex08.bandits import StationaryBandit, eval_and_plot
from irlc.ex08.nonstationary import MovingAverageAgent, NonstationaryBandit
from irlc.ex08.gradient_agent import GradientAgent
from irlc.ex08.ucb_agent import UCBAgent
from irlc import savepdf
import time
if __name__ == "__main__":
print("Ladies and gentlemen. It is time for the graaand bandit race")
def intro(bandit, agents):
print("We are live from the beautiful surroundings where they will compete in:")
print(bandit)
print("Who will win? who will have the most regret? we are about to find out")
print("in a minute after a brief word from our sponsors")
time.sleep(1)
print("And we are back. Let us introduce todays contestants:")
for a in agents:
print(a)
print("And they are off!")
epsilon = 0.1
alpha = 0.1
c = 2
# TODO: 1 lines missing.
raise NotImplementedError("Define the bandit here: bandit1 = ...")
# TODO: 5 lines missing.
raise NotImplementedError("define agents list here")
labels = ["Basic", "Moving avg.", "gradient", "Gradient+baseline", "UCB"]
'''
Stationary, no offset. Vanilla setting.
'''
intro(bandit1, agents)
# TODO: 1 lines missing.
raise NotImplementedError("Call eval_and_plot here")
plt.suptitle("Stationary bandit (no offset)")
savepdf("grand_race_1")
plt.show()
'''
Stationary, but with offset
'''
print("Whew what a race. Let's get ready to next round:")
# TODO: 1 lines missing.
raise NotImplementedError("Define bandit2 = ... here")
intro(bandit2, agents)
# TODO: 1 lines missing.
raise NotImplementedError("Call eval_and_plot here")
plt.suptitle("Stationary bandit (with offset)")
savepdf("grand_race_2")
plt.show()
'''
Long (nonstationary) simulations
'''
print("Whew what a race. Let's get ready to next round which will be a long one.")
# TODO: 1 lines missing.
raise NotImplementedError("define bandit3 here")
intro(bandit3, agents)
# TODO: 1 lines missing.
raise NotImplementedError("call eval_and_plot here")
plt.suptitle("Non-stationary bandit (no offset)")
savepdf("grand_race_3")
plt.show()
'''
Stationary, no offset, long run. Exclude stupid bandits.
'''
agents2 = []
agents2 += [GradientAgent(bandit1, alpha=alpha, use_baseline=False)]
agents2 += [GradientAgent(bandit1, alpha=alpha, use_baseline=True)]
agents2 += [UCBAgent(bandit1, c=2)]
labels = ["Gradient", "Gradient+baseline", "UCB"]
intro(bandit1, agents2)
# TODO: 1 lines missing.
raise NotImplementedError("Call eval_and_plot here")
plt.suptitle("Stationary bandit (no offset)")
savepdf("grand_race_4")
plt.show()
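# A hedged sketch of the kind of setup the TODOs above ask for (one possible choice, kept as comments so
# it does not interfere with the exercise). The concrete constructor arguments are assumptions:
#   bandit1 = StationaryBandit(k=10)
#   agents = [BasicAgent(bandit1, epsilon=epsilon),
#             MovingAverageAgent(bandit1, epsilon=epsilon, alpha=alpha),
#             GradientAgent(bandit1, alpha=alpha, use_baseline=False),
#             GradientAgent(bandit1, alpha=alpha, use_baseline=True),
#             UCBAgent(bandit1, c=c)]
#   eval_and_plot(bandit1, agents, steps=1000, labels=labels, use_cache=use_cache)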
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
"""
References:
[SB18] Richard S. Sutton and Andrew G. Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online).
"""
import numpy as np
import matplotlib.pyplot as plt
from irlc.ex08.simple_agents import BasicAgent
from irlc.ex08.bandits import StationaryBandit, eval_and_plot
from irlc import savepdf
class NonstationaryBandit(StationaryBandit):
def __init__(self, k, q_star_mean=0, reward_change_std=0.01):
self.reward_change_std = reward_change_std
super().__init__(k, q_star_mean)
def bandit_step(self, a):
r""" Implement the non-stationary bandit environment (as described in (SB18)).
Hint: use reward_change_std * np.random.randn() to generate a single random number with the given std,
and add one such number to each coordinate of q_star. Remember you have to compute the regret as well; see
StationaryBandit for ideas (remember the optimal arm will change when you add noise to q_star). """
# TODO: 2 lines missing.
raise NotImplementedError("Implement function body")
return super().bandit_step(a)
def __str__(self):
return f"{type(self).__name__}_{self.q_star_mean}_{self.reward_change_std}"
class MovingAverageAgent(BasicAgent):
r"""
The simple bandit from (SB18, Section 2.4), but with moving average alpha
as described in (SB18, Eqn. (2.3))
"""
def __init__(self, env, epsilon, alpha):
# TODO: 2 lines missing.
raise NotImplementedError("Implement function body")
def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
# TODO: 1 lines missing.
raise NotImplementedError("Implement function body")
def __str__(self):
return f"{type(self).__name__}_{self.epsilon}_{self.alpha}"
if __name__ == "__main__":
plt.figure(figsize=(10, 10))
epsilon = 0.1
alphas = [0.15, 0.1, 0.05]
# TODO: 4 lines missing.
raise NotImplementedError("Insert your solution and remove this error.")
labels = [f"Basic agent, epsilon={epsilon}"]
# TODO: 1 lines missing.
raise NotImplementedError("Insert your solution and remove this error.")
use_cache = False # Set this to True to use cache (after code works!)
eval_and_plot(bandit, agents, steps=10000, num_episodes=200, labels=labels, use_cache=use_cache)
savepdf("nonstationary_bandits")
plt.show()
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
"""
References:
[SB18] Richard S. Sutton and Andrew G. Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online).
"""
import numpy as np
import matplotlib.pyplot as plt
from irlc.ex08.bandits import StationaryBandit, eval_and_plot
from irlc import Agent
from irlc import savepdf
class BasicAgent(Agent):
r"""
Simple bandit as described on (SB18, Section 2.4).
"""
def __init__(self, env, epsilon):
super().__init__(env)
self.k = env.action_space.n
self.epsilon = epsilon
def pi(self, s, t, info=None):
""" Since this is a bandit, s=None and can be ignored, while t refers to the time step in the current episode """
if t == 0:
# At step 0 of episode. Re-initialize data structure.
# TODO: 2 lines missing.
raise NotImplementedError("Insert your solution and remove this error.")
# compute action here
# TODO: 1 lines missing.
raise NotImplementedError("Insert your solution and remove this error.")
def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
""" Since this is a bandit, done, s, sp, info_s, info_sp can all be ignored.
From the input arguments you should only need the action a and the reward r (see the sketch after this class).
"""
# TODO: 2 lines missing.
raise NotImplementedError("Implement function body")
def __str__(self):
return f"BasicAgent_{self.epsilon}"
if __name__ == "__main__":
N = 100000
S = [np.max( np.random.randn(10) ) for _ in range(100000) ]
print( np.mean(S), np.std(S)/np.sqrt(N) )
use_cache = False # Set this to True to use cache (after code works!)
from irlc.utils.timer import Timer
timer = Timer(start=True)
R = 100
steps = 1000
env = StationaryBandit(k=10)
agents = [BasicAgent(env, epsilon=.1), BasicAgent(env, epsilon=.01), BasicAgent(env, epsilon=0) ]
eval_and_plot(env, agents, num_episodes=100, steps=1000, max_episodes=150, use_cache=use_cache)
savepdf("bandit_epsilon")
plt.show()
print(timer.display())
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
"""
References:
[SB18] Richard S. Sutton and Andrew G. Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online).
"""
import numpy as np
import matplotlib.pyplot as plt
from irlc.ex08.simple_agents import BasicAgent
from irlc import savepdf
from irlc import Agent
class UCBAgent(Agent):
def __init__(self, env, c=2):
self.c = c
super().__init__(env)
def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
# TODO: 2 lines missing.
raise NotImplementedError("Train agent here")
def pi(self, s, k, info=None):
if k == 0:
""" Initialize the agent"""
# TODO: 3 lines missing.
raise NotImplementedError("Reset agent (i.e., make it ready to learn in a new episode with a new optimal action)")
# TODO: 1 lines missing.
raise NotImplementedError("Compute (and return) optimal action")
def __str__(self):
return f"{type(self).__name__}_{self.c}"
from irlc.ex08.bandits import StationaryBandit, eval_and_plot
if __name__ == "__main__":
r"""Reproduce (SB18, Fig. 2.4) comparing UCB agent to epsilon greedy """
runs, use_cache = 100, False
c = 2
eps = 0.1
steps = 1000
env = StationaryBandit(k=10)
agents = [UCBAgent(env,c=c), BasicAgent(env, epsilon=eps)]
eval_and_plot(bandit=env, agents=agents, num_episodes=runs, steps=steps, max_episodes=2000, use_cache=use_cache)
savepdf("UCB_agent")
plt.show()
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
from irlc.utils.bandit_graphics_environment import GraphicalBandit
import time
from irlc import train
from irlc.ex08.simple_agents import BasicAgent
from irlc import interactive
def bandit_eps(autoplay=False):
env = GraphicalBandit(10, render_mode='human',frames_per_second=30)
env.reset()
agent = BasicAgent(env, epsilon=0.1)
agent.method = 'Epsilon-greedy'
env, agent = interactive(env, agent, autoplay=autoplay)
t0 = time.time()
n = 3000
stats, _ = train(env, agent, max_steps=n, num_episodes=10, return_trajectory=False, verbose=False)
tpf = (time.time()-t0)/ n
print("tpf", tpf, 'fps', 1/tpf)
env.close()
if __name__ == "__main__":
bandit_eps()
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
from irlc.utils.bandit_graphics_environment import GraphicalBandit
from irlc import interactive, train
# import numpy as np
import time
def bandit_ucb(autoplay=False):
env = GraphicalBandit(10, render_mode='human', frames_per_second=30)
env.reset()
#env.viewer.show_q_star = True
#env.viewer.show_q_ucb = True
from irlc.ex08.ucb_agent import UCBAgent
agent = UCBAgent(env, c=1)
agent.method = 'UCB'
env, agent = interactive(env, agent, autoplay=autoplay)
t0 = time.time()
n = 500
stats, _ = train(env, agent, max_steps=n, num_episodes=10, return_trajectory=False, verbose=False)
tpf = (time.time() - t0) / n
print("tpf", tpf, 'fps', 1 / tpf)
env.close()
if __name__ == "__main__":
bandit_ucb()
@@ -79,20 +79,20 @@ class CartpoleCostQuestion(DirectSolverQuestion):
         from irlc.ex05.direct_cartpole_kelly import compute_solutions
         return compute_solutions()[1]
-class BrachistochroneQuestion(DirectSolverQuestion):
-    """ Brachistochrone (unconstrained) """
-
-    @classmethod
-    def compute_solution(cls):
-        from irlc.ex05.direct_brachistochrone import compute_constrained_solutions
-        return compute_constrained_solutions()[1]
-
-class BrachistochroneConstrainedQuestion(DirectSolverQuestion):
-    """ Brachistochrone (constrained) """
-    @classmethod
-    def compute_solution(cls):
-        from irlc.ex05.direct_brachistochrone import compute_constrained_solutions
-        return compute_constrained_solutions()[1]
+# class BrachistochroneQuestion(DirectSolverQuestion):
+#     """ Brachistochrone (unconstrained) """
+#
+#     @classmethod
+#     def compute_solution(cls):
+#         from irlc.ex05.direct_brachistochrone import compute_constrained_solutions
+#         return compute_constrained_solutions()[1]
+#
+# class BrachistochroneConstrainedQuestion(DirectSolverQuestion):
+#     """ Brachistochrone (constrained) """
+#     @classmethod
+#     def compute_solution(cls):
+#         from irlc.ex05.direct_brachistochrone import compute_constrained_solutions
+#         return compute_constrained_solutions()[1]
 class Week05Tests(Report):
     title = "Tests for week 05"
@@ -105,8 +105,8 @@ class Week05Tests(Report):
         (DirectAgentPendulum, 10),  # ok
         (CartpoleTimeQuestion, 5),  # ok
         (CartpoleCostQuestion, 5),  # ok
-        (BrachistochroneQuestion, 5),  # ok
-        (BrachistochroneConstrainedQuestion, 10),  # ok
+        # (BrachistochroneQuestion, 5),  # ok
+        # (BrachistochroneConstrainedQuestion, 10),  # ok
     ]
 if __name__ == '__main__':
...
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
from unitgrade import UTestCase, Report, cache
import numpy as np
from irlc import train
def train_recording(env, agent, trajectories):
for t in trajectories:
env.reset()
for k in range(len(t.action)):
s = t.state[k]
r = t.reward[k]
a = t.action[k]
sp = t.state[k+1]
agent.pi(s,k)
agent.train(s, a, r, sp, done=k == len(t.action)-1)
class BanditQuestion(UTestCase):
""" Value (Q) function estimate """
tol = 1e-2 # tie-breaking in the gradient bandit is ill-defined.
# testfun = QPrintItem.assertL2
# def setUpClass(cls) -> None:
# from irlc.ex08.simple_agents import BasicAgent
# from irlc.ex08.bandits import StationaryBandit
# env = StationaryBandit(k=10, )
# agent = BasicAgent(env, epsilon=0.1)
# _, cls.trajectories = train(env, agent, return_trajectory=True, num_episodes=1, max_steps=100)
# cls.Q = agent.Q
# cls.env = env
# cls.agent = agent
def get_env_agent(self):
from irlc.ex08.simple_agents import BasicAgent
from irlc.ex08.bandits import StationaryBandit
env = StationaryBandit(k=10)
agent = BasicAgent(env, epsilon=0.1)
return env, agent
@cache
def get_trajectories(self):
env, agent = self.get_env_agent()
_, trajectories = train(env, agent, return_trajectory=True, num_episodes=1, max_steps=100)
return trajectories
# def precompute_payload(self):
# env, agent = self.get_env_agent()
# _, trajectories = train(env, agent, return_trajectory=True, num_episodes=1, max_steps=100)
# return trajectories, agent.Q
def test_agent(self):
trajectories = self.get_trajectories()
env, agent = self.get_env_agent()
train_recording(env, agent, trajectories)
self.assertL2(agent.Q, tol=1e-5)
# return agent.Q
# self.Q = Q
# self.question.agent = agent
# return agent.Q
# testfun = QPrintItem.assertL2
def test_action_distributin(self):
T = 10000
tol = 1 / np.sqrt(T) * 5
trajectories = self.get_trajectories()
env, agent = self.get_env_agent()
train_recording(env, agent, trajectories)
# for k in self._cache.keys(): print(k)
from collections import Counter
counts = Counter([agent.pi(None, k) for k in range(T)])
distrib = [counts[k] / T for k in range(env.k)]
self.assertL2(np.asarray(distrib), tol=tol)
# def process_output(self, res, txt, numbers):
# return res
# def process_output(self, res, txt, numbers):
# return res
#
# def test(self, computed, expected):
# super().test(computed, self.Q)
# class BanditQuestion(QPrintItem):
# # tol = 1e-6
# tol = 1e-2 # tie-breaking in the gradient bandit is ill-defined.
# title = "Value (Q) function estimate"
# testfun = QPrintItem.assertL2
#
# def get_env_agent(self):
# from irlc.ex08.simple_agents import BasicAgent
# from irlc.ex08.bandits import StationaryBandit
# env = StationaryBandit(k=10, )
# agent = BasicAgent(env, epsilon=0.1)
# return env, agent
#
# def precompute_payload(self):
# env, agent = self.get_env_agent()
# _, trajectories = train(env, agent, return_trajectory=True, num_episodes=1, max_steps=100)
# return trajectories, agent.Q
#
# def compute_answer_print(self):
# trajectories, Q = self.precomputed_payload()
# env, agent = self.get_env_agent()
# train_recording(env, agent, trajectories)
# self.Q = Q
# self.question.agent = agent
# return agent.Q
#
# def process_output(self, res, txt, numbers):
# return res
#
# def test(self, computed, expected):
# super().test(computed, self.Q)
#
# class BanditItemActionDistribution(QPrintItem):
# # Assumes setup has already been done.
# title = "Action distribution test"
# T = 10000
# tol = 1/np.sqrt(T)*5
# testfun = QPrintItem.assertL2
#
# def compute_answer_print(self):
# # print("In agent print code")
# from collections import Counter
# counts = Counter( [self.question.agent.pi(None, k) for k in range(self.T)] )
# distrib = [counts[k] / self.T for k in range(self.question.agent.env.k)]
# return np.asarray(distrib)
#
# def process_output(self, res, txt, numbers):
# return res
#
# class BanditQuestion(QuestionGroup):
# title = "Simple bandits"
# class SimpleBanditItem(BanditItem):
# #title = "Value function estimate"
# def get_env_agent(self):
# from irlc.ex08.simple_agents import BasicAgent
# from irlc.ex08.bandits import StationaryBandit
# env = StationaryBandit(k=10, )
# agent = BasicAgent(env, epsilon=0.1)
# return env, agent
# class SimpleBanditActionDistribution(BanditItemActionDistribution):
# pass
class GradientBanditQuestion(BanditQuestion):
""" Gradient agent """
# class SimpleBanditItem(BanditItem):
# title = "Simple agent question"
def get_env_agent(self):
from irlc.ex08.bandits import StationaryBandit
from irlc.ex08.gradient_agent import GradientAgent
env = StationaryBandit(k=10)
agent = GradientAgent(env, alpha=0.05)
return env, agent
# def precompute_payload(self):
# env, agent = self.get_env_agent()
# _, trajectories = train(env, agent, return_trajectory=True, num_episodes=1, max_steps=100)
# return trajectories
def test_agent(self):
trajectories = self.get_trajectories()
env, agent = self.get_env_agent()
train_recording(env, agent, trajectories)
self.assertL2(agent.H, tol=1e-5)
# def test(self, computed, expected):
# self.testfun(computed, self.H)
#
# class SimpleBanditActionDistribution(BanditItemActionDistribution):
# pass
# class GradientBanditQuestion(QuestionGroup):
# title = "Gradient agent"
# class SimpleBanditItem(BanditItem):
# # title = "Simple agent question"
# def get_env_agent(self):
# from irlc.ex08.bandits import StationaryBandit
# from irlc.ex08.gradient_agent import GradientAgent
# env = StationaryBandit(k=10)
# agent = GradientAgent(env, alpha=0.05)
# return env, agent
#
# def precompute_payload(self):
# env, agent = self.get_env_agent()
# _, trajectories = train(env, agent, return_trajectory=True, num_episodes=1, max_steps=100)
# return trajectories, agent.H
#
# def compute_answer_print(self):
# trajectories, H = self.precomputed_payload()
# env, agent = self.get_env_agent()
# train_recording(env, agent, trajectories)
# self.H = H
# self.question.agent = agent
# return agent.H
#
# def test(self, computed, expected):
# self.testfun(computed, self.H)
#
# class SimpleBanditActionDistribution(BanditItemActionDistribution):
# pass
class UCBAgentQuestion(BanditQuestion):
""" UCB agent """
# class UCBAgentItem(BanditItem):
def get_env_agent(self):
from irlc.ex08.bandits import StationaryBandit
from irlc.ex08.ucb_agent import UCBAgent
env = StationaryBandit(k=10)
agent = UCBAgent(env)
return env, agent
# class UCBAgentActionDistribution(BanditItemActionDistribution):
# pass
# class UCBAgentQuestion(QuestionGroup):
# title = "UCB agent"
# class UCBAgentItem(BanditItem):
# def get_env_agent(self):
# from irlc.ex08.bandits import StationaryBandit
# from irlc.ex08.ucb_agent import UCBAgent
# env = StationaryBandit(k=10)
# agent = UCBAgent(env)
# return env, agent
#
# class UCBAgentActionDistribution(BanditItemActionDistribution):
# pass
# class NonstatiotnaryAgentQuestion(QuestionGroup):
# title = "Nonstationary bandit environment"
# class NonstationaryItem(BanditItem):
# def get_env_agent(self):
# epsilon = 0.1
# from irlc.ex08.nonstationary import NonstationaryBandit, MovingAverageAgent
# bandit = NonstationaryBandit(k=10)
# agent = MovingAverageAgent(bandit, epsilon=epsilon, alpha=0.15)
# return bandit, agent
#
# class NonstationaryActionDistribution(BanditItemActionDistribution):
# pass
class NonstatiotnaryAgentQuestion(BanditQuestion):
""" UCB agent """
# class UCBAgentItem(BanditItem):
def get_env_agent(self):
epsilon = 0.1
from irlc.ex08.nonstationary import NonstationaryBandit, MovingAverageAgent
bandit = NonstationaryBandit(k=10)
agent = MovingAverageAgent(bandit, epsilon=epsilon, alpha=0.15)
return bandit, agent
import irlc
class Week08Tests(Report):
title = "Tests for week 08"
pack_imports = [irlc]
individual_imports = []
questions = [
(BanditQuestion, 10),
(GradientBanditQuestion, 10),
(UCBAgentQuestion, 5),
(NonstatiotnaryAgentQuestion, 5)
]
if __name__ == '__main__':
from unitgrade import evaluate_report_student
evaluate_report_student(Week08Tests())