Commit bb039ad1 authored by tuhe's avatar tuhe

Week 10 final

parent 23ff9694
@@ -73,7 +73,7 @@ irlc/exam/exam20*/solution
 #irlc/lectures/lec07
 #irlc/lectures/lec08
 # irlc/lectures/lec09
-irlc/lectures/lec10
+#irlc/lectures/lec10
 irlc/lectures/lec11
 irlc/lectures/lec12
 irlc/lectures/lec13
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
"""This directory contains the exercises for week 10."""
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
import gym
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt
from irlc import main_plot
from irlc import savepdf
from irlc.ex01.agent import train
from irlc.ex10.mc_evaluate_blackjack import plot_blackjack_value, plot_blackjack_policy
from irlc.ex10.mc_agent import MCAgent
def run_experiment(episodes, first_visit=True, **kwargs):
env_name = 'Blackjack-v1'
env = gym.make(env_name)
agent = MCAgent(env, **kwargs)
lbl = "_".join(map(str, kwargs.values()))
fvl = "First" if first_visit else "Every"
title = f"MC agent ({fvl} visit)"
expn = f"experiments/{env_name}_MCagent_{episodes}_{first_visit}_{lbl}" # Name the experiment. Pass the label to the train function to store intermediate results. See the online documentation for more information.
# TODO: 1 lines missing.
raise NotImplementedError("call the train(...) function here.")
# Matplotlib with seaborn is for some reason very slow.
# This code re-samples the curve to just 400 points:
main_plot(expn, smoothing_window=episodes//100, resample_ticks=400)
plt.title("Estimated returns in blackjack using " + title)
plt.ylim([-0.3, 0])
savepdf(f"blackjack_MC_agent_{episodes}_{first_visit}")
plt.show()
V = defaultdict(lambda: 0)
A = defaultdict(lambda: 0)
for s, av in agent.Q.to_dict().items():
A[s] = agent.pi(s, 0)
V[s] = max(av.values() )
plot_blackjack_value(V, title=title, pdf_out=f"blackjack_mcagent_policy{fvl}_valfun_{episodes}")
plt.show()
plot_blackjack_policy(A, title=title)
savepdf(f"blackjack_mcagent_policy{fvl}_{episodes}")
plt.show()
if __name__ == "__main__":
episodes = 1000000
# episodes = 1000 # Uncomment to run far fewer episodes during debugging.
run_experiment(episodes, epsilon=0.05, first_visit=True)
run_experiment(episodes, epsilon=0.05, first_visit=False)
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
import matplotlib.pyplot as plt
import numpy as np
def get_by_ace(V,ace=False):
dd = V.copy()
dd.clear()
for (p,d,ac),val in V.items():
if ac == ace:
dd[ (p,d)] = val
return dd
def plot_surface_2(X,Y,Z,fig=None, ax=None, **kwargs):
if fig is None and ax is None:
fig = plt.figure(figsize=(20, 10))
if ax is None:
ax = fig.add_subplot(projection='3d')
surf = ax.plot_surface(X, Y, Z, cmap=plt.cm.coolwarm, linewidth=1, edgecolors='k', **kwargs)
ax.view_init(ax.elev, -120)
if fig is not None:
fig.colorbar(surf, shrink=0.5, aspect=5)
return ax
def to_matrix(V):
min_x = min(k[0] for k in V.keys())
max_x = max(k[0] for k in V.keys())
min_y = min(k[1] for k in V.keys())
max_y = max(k[1] for k in V.keys())
x_range = np.arange(min_x, max_x + 1)
y_range = np.arange(min_y, max_y + 1)
X, Y = np.meshgrid(x_range, y_range)
Z_ace = np.zeros_like(X, dtype=float)
for j,(x, y) in enumerate( zip( X.flat, Y.flat)):
Z_ace.flat[j] = float(V[(x,y)])
return X, Y, Z_ace
def plot_blackjack_value(V, title="Value Function", pdf_out=None):
"""
Plots the value function as a surface plot.
"""
for lbl, ac in zip(["Usable ace", "No usable ace"], [True, False]):
w = get_by_ace(V,ace=ac)
X,Y,Z = to_matrix(w)
ax = plot_surface_2(X, Y, Z)
ax.set_zlabel("Value")
ax.set_title(title)
if pdf_out is not None:
savepdf(pdf_out+"_"+lbl.replace(" ", "_"))
def plot_blackjack_policy(V, title):
plt.figure(figsize=(18, 12))
for lbl, ac in zip(["Usable ace", "No usable ace"], [True, False]):
w = get_by_ace(V,ace=ac)
X, Y, Z = to_matrix(w)
plt.subplot(1,2,1+ac)
plt.imshow(Z.T)
plt.title(f"{title} ({lbl})")
plt.gca().invert_yaxis()
plt.ylabel('Player Sum')
plt.xlabel('Dealer Showing')
plt.colorbar()
def policy20(s):
# TODO: 1 lines missing.
raise NotImplementedError("Implement the rule where we stick if we have a score of 20 or more.")
if __name__ == "__main__":
from irlc.ex10.mc_evaluate import MCEvaluationAgent
from irlc.ex01.agent import train
import gym
from irlc import main_plot, savepdf
nenv = "Blackjack-v1"
env = gym.make(nenv)
episodes = 50000
gamma = 1
experiment = f"experiments/{nenv}_first_{episodes}"
""" Instantiate the agent and call the training method here. Make sure to pass the policy=policy20 function to the MCEvaluationAgent
and set gamma=1. """
# TODO: 2 lines missing.
raise NotImplementedError("Insert your solution and remove this error.")
main_plot(experiment, smoothing_window=episodes//100, resample_ticks=200)
plt.ylim([-0.5, 0])
plt.title("Blackjack using first-visit MC")
savepdf("blackjack_stick20_first")
plt.show()
pdf = "blackjack_stick20_valuefun"
plot_blackjack_value(agent.v, title="MC first-visit value function", pdf_out=pdf)
savepdf("blackjack_stick20_valuefun")
plt.show()
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
"""
References:
[SB18] Richard S. Sutton and Andrew G. Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online).
"""
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from irlc import savepdf
from irlc.ex10.td0_evaluate import TD0ValueAgent
from irlc.ex10.mc_evaluate import MCEvaluationAgent
import seaborn as sns
import pandas as pd
from irlc.ex01.agent import train
from irlc.ex09.mdp import MDP2GymEnv, MDP
class ChainMRP(MDP):
def __init__(self, length=6):
"""
Build the "Chain MRP" yafcport from (SB18). Terminal states are [0,6],
all states are [0,1,2,3,4,5,6] and initial state is 3. (default settings).
"""
self.max_states = length
super().__init__(initial_state=length // 2)
def is_terminal(self, state):
return state == 0 or state == self.max_states
def A(self, s): # 0: left, 1: right.
return [0,1]
def Psr(self, s, a):
# TODO: 1 lines missing.
raise NotImplementedError("Return the P(s', r | s,a) values here. See e.g. the gampler problem from previous week for help.")
return {(sp, 1 if sp == self.max_states else 0): 1.0}
class ChainEnvironment(MDP2GymEnv):
def __init__(self, *args, **kwargs):
super().__init__(mdp=ChainMRP(*args, **kwargs))
if __name__ == "__main__":
""" plot results as in (SB18, Example 6.2) """
env = ChainEnvironment()
V_init = np.array([0.5, 0.5, 0.5, 0.5, 0.5])
V_true = np.array([1 / 6, 2 / 6, 3 / 6, 4 / 6, 5 / 6])
states = range(1,6)
"""
    This is a bit janky. The value function is initialized at 0.5 in the example; however (see (SB18)),
    it must be initialized at 0 in terminal states. We therefore make a function that initializes the
    value function and pass it along to the ValueAgent; the ValueAgent then uses a subclassed
    defaultdict which can handle a parameterized default value. """
v_init_fun = lambda x: 0.5
fig, ax = plt.subplots(figsize=(15, 6), ncols=2)
""" Make TD plot """
td_episodes = [0, 1, 10, 100]
V_current = np.copy(V_init)
xticks = ['A', 'B', 'C', 'D', 'E']
for i, episodes in enumerate(td_episodes):
agent = TD0ValueAgent(env, v_init_fun=v_init_fun)
train(env, agent, num_episodes=episodes,verbose=False, return_trajectory=False)
vs = [agent.value(s) for s in states]
ax[0].plot(vs, label=f"{episodes} episodes", marker='o')
ax[0].plot(V_true, label='true values', marker='o')
ax[0].set(xlabel='State', ylabel='Estimated Value', title='Estimated Values TD(0)',
xticks=np.arange(5), xticklabels=['A','B','C','D','E'])
ax[0].legend()
""" Make TD vs. MC plot """
td_alphas = [0.05, 0.15, 0.1]
mc_alphas = [0.01, 0.03]
episodes = 100
runs = 200
def eval_mse(agent):
errors = []
for i in range(episodes):
V_ = [agent.value(s) for s in states]
train(env, agent, num_episodes=1, verbose=False, return_trajectory=False)
z = np.sqrt(np.sum(np.power(V_ - V_true, 2)) / 5.0)
errors.append(z)
return errors
methods = [(TD0ValueAgent, 'TD', alpha) for alpha in td_alphas]
methods += [(MCEvaluationAgent, 'MC', alpha) for alpha in mc_alphas]
dfs = []
for AC,method,alpha in tqdm(methods):
TD_mse = []
for r in range(runs):
agent = AC(env, alpha=alpha, gamma=1, v_init_fun=v_init_fun)
err_ = eval_mse(agent)
TD_mse.append( np.asarray(err_))
            # Happy times with pandas. Let's up the production value by also plotting a confidence band.
for u,mse in enumerate(TD_mse):
df = pd.DataFrame(mse, columns=['rmse'])
df.insert(len(df.columns), 'Unit', u)
df.insert(len(df.columns), 'Episodes', range(episodes))
df.insert(len(df.columns), 'Condition', f"{method} $\\alpha$={alpha}")
dfs.append(df)
data = pd.concat(dfs, ignore_index=True)
sns.lineplot(data=data, x='Episodes', y='rmse', hue="Condition", errorbar=('ci', 95), estimator='mean')
plt.ylabel("RMS error (averaged over states)")
plt.title("Empirical RMS error, averaged over states")
savepdf("random_walk_example")
plt.show()
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
import gymnasium as gym
gym.envs.register(
id='Gambler-v0',
entry_point='irlc.ex09.gambler:GamblerEnv',
)
gym.envs.register(
id='Tenv-v0',
entry_point='irlc.ex09.gambler:TEnv',
max_episode_steps=100,
)
gym.envs.register(
id='JackRental4-v0',
entry_point='irlc.ex09.jacks_car_rental:RentalEnv',
max_episode_steps=1000,
kwargs={"max_cars": 4,
"poisson_truncation": 4,
"cache_str": "jack_rental_environment_4"},
)
gym.envs.register(
id='JackRental-v0',
entry_point='irlc.ex09.jacks_car_rental:RentalEnv',
max_episode_steps=1000,
kwargs={"cache_str": "jack_rental_environment"},
) # "compress_tol": 0.01
gym.envs.register(
id='SmallGridworld-v0',
entry_point='irlc.gridworld.gridworld_environments:SuttonCornerGridEnvironment',
# max_episode_steps=100, # Stop trying to make it happen
)
gym.envs.register( # Like MountainCar-v0, but time limit increased from 200 to 500.
id='MountainCar500-v0',
entry_point='gymnasium.envs.classic_control:MountainCarEnv',
max_episode_steps=500,
reward_threshold=-110.0,
)
if __name__ == "__main__":
print("Testing...")
mc = gym.make('MountainCar500-v0')
# j4 = gym.make("JackRental4-v0")
# jack = gym.make("JackRental-v0")
sg = gym.make("SmallGridworld-v0")
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
from collections import defaultdict
import matplotlib.pyplot as plt
from irlc.ex09.rl_agent import TabularAgent
from irlc import main_plot, savepdf, train
from irlc import interactive
def get_MC_return_SA(episode, gamma, first_visit=True):
""" Helper method for computing the MC returns.
    Given an episode in the form [(s0, a0, r1), (s1, a1, r2), ...]
this function computes (if first_visit=True) a new list
> [((s,a), G) , ... ]
    consisting of the unique $(s_t, a_t)$ pairs in the episode along with their return G_t (computed from their first occurrence).
    Alternatively, if first_visit=False, the method returns a list of the same length as the episode,
    with all (s, a) pairs and their returns.
"""
sa = [(s, a) for s, a, r in episode] # Get all state/action pairs. Useful for checking if we have visited a state/action before.
G = 0
returns = []
for t in reversed(range(len(episode))):
# TODO: 2 lines missing.
raise NotImplementedError("Insert your solution and remove this error.")
if sa_t not in sa[:t] or not first_visit:
# TODO: 1 lines missing.
raise NotImplementedError("Implement function body")
return returns
class MCAgent(TabularAgent):
def __init__(self, env, gamma=1.0, epsilon=0.05, alpha=None, first_visit=True):
if alpha is None:
self.returns_sum_S = defaultdict(float)
self.returns_count_N = defaultdict(float)
self.alpha = alpha
self.first_visit = first_visit
self.episode = []
super().__init__(env, gamma, epsilon)
def pi(self, s,k, info=None):
"""
Compute the policy of the MC agent. Remember the agent is epsilon-greedy. You can use the pi_eps(s,info)-function defined
in the TabularAgent class.
"""
# TODO: 1 lines missing.
raise NotImplementedError("Compute action here using the Q-values. (remember to be epsilon-greedy)")
def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
"""
        Consult your implementation of the value-estimation agent for ideas. Note you can index the Q-values as
        >> self.Q[s, a] = new_q_value
        See the comments in the Agent class for more details; for now, you can consider them as simply a nested
        structure where ``self.Q[s, a]`` defaults to 0 unless the Q-value has been updated.
"""
# TODO: 12 lines missing.
raise NotImplementedError("Train the agent here.")
def __str__(self):
return f"MC_{self.gamma}_{self.epsilon}_{self.alpha}_{self.first_visit}"
if __name__ == "__main__":
""" Load environment but make sure it is time-limited. Can you tell why? """
envn = "SmallGridworld-v0"
from irlc.gridworld.gridworld_environments import SuttonCornerGridEnvironment, BookGridEnvironment
env = SuttonCornerGridEnvironment(uniform_initial_state=True)
# env = BookGridEnvironment(living_reward=-0.05) # Uncomment to test an alternative environment with a negative living reward.
gamma = 1
episodes = 20000
experiment="experiments/mcagent_smallgrid"
agent = MCAgent(env, gamma=gamma, first_visit=True)
train(env, agent, experiment_name=experiment, num_episodes=episodes, return_trajectory=False)
main_plot(experiments=[experiment], resample_ticks=200)
plt.title("Smallgrid MC agent value function")
plt.ylim([-10, 0])
savepdf("mcagent_smallgrid")
plt.show()
env, agent = interactive(env, agent)
env.reset()
env.plot()
plt.title(f"MC on-policy control of {envn} using first-visit")
savepdf("MC_agent_value_smallgrid")
plt.show(block=False)
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
from irlc import savepdf
import matplotlib.pyplot as plt
from irlc.ex09.rl_agent import ValueAgent
from collections import defaultdict
from irlc.ex01.agent import train
import numpy as np
import matplotlib
#matplotlib.use('qtagg') # Fix crash on linux with default backend.
def get_MC_return_S(episode, gamma, first_visit=True):
""" Helper method for computing the MC returns.
    Given an episode in the form ``[(s0, a0, r1), (s1, a1, r2), ...]``
this function computes (if first_visit=True) a new list::
[(s0, G0), (s1, G1), ...]
    consisting of the unique s_t values in the episode along with their return G_t (computed from their first occurrence).
    Alternatively, if first_visit=False, the method returns a list of the same length as the episode,
    with all s values and their returns.
"""
ss = [s for s, a, r in episode]
G = 0
returns = []
for t in reversed(range(len(episode))):
# TODO: 2 lines missing.
raise NotImplementedError("Insert your solution and remove this error.")
if s_t not in ss[:t] or not first_visit:
# TODO: 1 lines missing.
raise NotImplementedError("Implement function body")
return returns
class MCEvaluationAgent(ValueAgent):
def __init__(self, env, policy=None, gamma=1, alpha=None, first_visit=True, v_init_fun=None):
self.episode = []
self.first_visit = first_visit
self.alpha = alpha
if self.alpha is None:
self.returns_sum_S = defaultdict(float)
self.returns_count_N = defaultdict(float)
super().__init__(env, gamma, policy, v_init_fun=v_init_fun)
def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
self.episode.append((s, a, r)) # Gather the episode
if done: # Only train when the episode has stopped
returns = get_MC_return_S(self.episode, self.gamma, self.first_visit)
for s, G in returns:
if self.alpha:
# TODO: 1 lines missing.
raise NotImplementedError("Implement function body")
else:
# TODO: 3 lines missing.
raise NotImplementedError("Implement function body")
self.episode = []
def __str__(self):
return f"MCeval_{self.gamma}_{self.alpha}_{self.first_visit}"
if __name__ == "__main__":
envn = "SmallGridworld-v0"
from irlc import interactive
from irlc.gridworld.gridworld_environments import SuttonCornerGridEnvironment
env = SuttonCornerGridEnvironment(render_mode=None)
gamma = 1
episodes = 200
agent = MCEvaluationAgent(env, gamma=gamma)
train(env, agent, num_episodes=episodes)
env.render_mode = 'human'
env, agent = interactive(env, agent, autoplay=True)
env.plot()
plt.title(f"MC evaluation of {envn} using first-visit")
savepdf("MC_value_random_smallgrid")
plt.show(block=False)
env.close()
env = SuttonCornerGridEnvironment(render_mode=None)
agent_every = MCEvaluationAgent(env, gamma=gamma, first_visit=False)
train(env, agent_every, num_episodes=episodes)
env.render_mode = 'human'
    env, agent_every = interactive(env, agent_every, autoplay=True)
env.plot()
plt.title(f"MC evaluation of {envn} using every-visit")
savepdf("MC_value_random_smallgrid_every")
plt.show(block=False)
env.close()
s0 = (1, 1)
print(f"Estimated value functions v_pi(s0) for first visit {agent.v[(1,1)]:3}")
print(f"Estimated value functions v_pi(s0) for every visit {agent_every.v[(1,1)]:3}")
## Second part:
repeats = 5000 # increase to e.g. 20'000.
episodes = 1
ev, fv = [], []
env = SuttonCornerGridEnvironment()
print(f"Repeating experiment {repeats} times, this may take a while.")
for _ in range(repeats):
"""
        Instantiate two agents with first_visit=True and first_visit=False.
        Train the agents using the train function for `episodes` episodes. You might want to pass verbose=False to the
        'train'-method to suppress output.
        When done, compute the mean of agent.values() and add it to the lists ev / fv; the means of these lists
        are the desired results.
"""
agent = MCEvaluationAgent(env, gamma=gamma)
# TODO: 1 lines missing.
raise NotImplementedError("Create and train an every-visit agent.")
train(env, agent, num_episodes=episodes, verbose=False)
# TODO: 1 lines missing.
raise NotImplementedError("Create and train an every-visit agent.")
        ev.append(agent_every.v[(1,1)])
        fv.append(agent.v[(1,1)])
print(f"First visit: Mean of value functions E[v_pi(s0)] after {repeats} repeats {np.mean(fv):3}")
print(f"Every visit: Mean of value functions E[v_pi(s0)] after {repeats} repeats {np.mean(ev):3}")
env.close()
plt.close()
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
def a_compute_deltas(v: dict, states: list, rewards: list, gamma: float) -> list:
# TODO: Code has been removed from here.
raise NotImplementedError("Insert your solution and remove this error.")
return deltas
def b_perform_td0(v: dict, states: list, rewards: list, gamma: float, alpha: float) -> dict:
# TODO: Code has been removed from here.
raise NotImplementedError("Insert your solution and remove this error.")
return v
def c_perform_td0_batched(v: dict, states: list, rewards: list, gamma: float, alpha: float) -> dict:
# TODO: Code has been removed from here.
raise NotImplementedError("Insert your solution and remove this error.")
return v
if __name__ == "__main__":
states = [1, 0, 2, -1, 2, 4, 5, 4, 3, 2, 1, -1]
rewards = [1, 0.5, -1, 0, 1, 2, 2, 0, 0, -1, 0.5]
# In the notation of the problem: T = len(rewards).
v = {s: 0 for s in states} # Initialize the value function v.
gamma = 0.9
alpha = 0.2
deltas = a_compute_deltas(v, states, rewards, gamma)
print(f"The first value of delta should be 1, your value is {deltas[0]=}")
v = b_perform_td0(v, states, rewards, gamma, alpha)
print(f"The value function v(s=1) should be 0.25352, your value is {v[1]=}")
v_batched = {s: 0 for s in states} # Initialize the value function anew
v_batched = c_perform_td0_batched(v_batched, states, rewards, gamma, alpha)
print(f"The batched value function in v(s=1) should be 0.3, your value is {v_batched[1]=}")
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
import numpy as np
import matplotlib.pyplot as plt
from irlc.ex09.rl_agent import ValueAgent
from irlc import savepdf
from irlc.ex01.agent import train
class TD0ValueAgent(ValueAgent):
def __init__(self, env, policy=None, gamma=0.99, alpha=0.05, v_init_fun=None):
self.alpha = alpha
super().__init__(env, gamma=gamma, policy=policy, v_init_fun=v_init_fun)
def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
# TODO: 3 lines missing.
raise NotImplementedError("Implement function body")
def __str__(self):
return f"TD0Value_{self.gamma}_{self.alpha}"
def value_function_test(env, agent, v_true, episodes=200):
err = []
for t in range(episodes):
train(env, agent, num_episodes=1, verbose=False)
err.append( np.mean( [(v_true - v0) ** 2 for k, v0 in agent.v.items()] ) )
return np.asarray(err)
if __name__ == "__main__":
envn = "SmallGridworld-v0"
from irlc.gridworld.gridworld_environments import SuttonCornerGridEnvironment
from irlc import interactive
env = SuttonCornerGridEnvironment() # Make the gridworld environment itself
gamma = 1
agent = TD0ValueAgent(env, gamma=gamma, alpha=0.05) # Make a TD(0) agent
train(env, agent, num_episodes=2000, return_trajectory=False) # Train for 2000 episodes
env = SuttonCornerGridEnvironment(render_mode='human') # Re-make the gridworld to get rendering.
env, agent = interactive(env, agent) # Add a video monitor, the environment will now show an animation
train(env,agent,num_episodes=1) # Train for a (single) new episode
env.plot() # Plot the current state of the environment/agent
plt.title(f"TD0 evaluation of {envn}")
savepdf("TD_value_random_smallgrid")
plt.show(block=False)
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
from irlc.gridworld.gridworld_environments import BookGridEnvironment
from irlc.ex10.mc_agent import MCAgent
from irlc import interactive, train
class MCControlAgentOneState(MCAgent):
def __init__(self, *args, state_action=None, **kwargs):
super().__init__(*args, **kwargs)
if state_action is None:
state_action = (self.env.mdp.initial_state, self.env.mdp.A(self.env.mdp.initial_state)[0])
self.state_action = state_action
self._clear_states()
def _clear_states(self, val=None):
for s in self.env.mdp.nonterminal_states:
for a in self.env.mdp.A(s):
if (s,a) != self.state_action:
                    self.returns_sum_S[s, a] = val
                    self.returns_count_N[s, a] = val
k = next(self.env.mdp.Psr(s, self.env.mdp.A(s)[0]).keys().__iter__() )[0]
if not self.env.mdp.is_terminal(k):
self.Q[s,a] = 0
def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
# self.episode = [e for e in self.episode if e[0] == self.state]
self._clear_states(0)
super().train(s, a, r, sp, done)
# Clear out many of the state, actions:
self._clear_states(None)
# for s in self.env.mdp.nonterminal_states:
# if s != self.state:
# self.v[s] = None
pass
if __name__ == "__main__":
env = BookGridEnvironment(render_mode='human', living_reward=-0.05, print_states=True, zoom=2)
agent = MCControlAgentOneState(env, gamma=1, alpha=None, first_visit=True)
method_label = 'MC (gamma=1)'
agent.label = method_label
autoplay = False
env, agent = interactive(env, agent, autoplay=autoplay)
# agent = PlayWrapper(agent, env,autoplay=autoplay)
# env = VideoMonitor(env, agent=agent, fps=100, agent_monitor_keys=('pi', 'Q'), render_kwargs={'method_label': method_label})
num_episodes = 1000
train(env, agent, num_episodes=num_episodes)
env.close()
# keyboard_play(env,agent,method_label='MC (alpha=0.5)')
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
from irlc.lectures.lec10.lecture_10_mc_q_estimation import keyboard_play
from irlc.gridworld.gridworld_environments import SuttonCornerGridEnvironment, BookGridEnvironment
from irlc.ex10.mc_agent import MCAgent
from irlc.lectures.lec10.lecture_10_mc_action_value_first_one_state import MCControlAgentOneState
from irlc.ex10.mc_evaluate import MCEvaluationAgent
import numpy as np
from irlc import interactive, train
if __name__ == "__main__":
env = BookGridEnvironment(render_mode='human', living_reward=-0.05, print_states=True, zoom=2)
agent = MCControlAgentOneState(env, gamma=1, alpha=None, first_visit=True, state_action=( (0,2), 2))
method_label = 'MC control (gamma=1)'
agent.label = method_label
autoplay = False
env, agent = interactive(env, agent, autoplay=autoplay)
num_episodes = 1000
train(env, agent, num_episodes=num_episodes)
env.close()
# keyboard_play(env,agent,method_label='MC (alpha=0.5)')
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
from irlc.lectures.lec10.lecture_10_mc_q_estimation import keyboard_play
from irlc.gridworld.gridworld_environments import BookGridEnvironment
from irlc.ex10.mc_agent import MCAgent
import numpy as np
if __name__ == "__main__":
np.random.seed(433)
env = BookGridEnvironment(render_mode='human',zoom=2)
# agent = MCAgent(env, gamma=0.9, epsilon=0.15, alpha=0.1, first_visit=True)
agent = MCAgent(env, gamma=1.0, epsilon=0.15, alpha=None, first_visit=True)
# env, agent = interactive(env, agent)
keyboard_play(env,agent,method_label='MC control')
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
from irlc.lectures.lec10.lecture_10_mc_q_estimation import keyboard_play
from irlc.gridworld.gridworld_environments import SuttonCornerGridEnvironment
from irlc.ex10.mc_agent import MCAgent
import numpy as np
if __name__ == "__main__":
env = SuttonCornerGridEnvironment(render_mode='human')
agent = MCAgent(env, gamma=1, epsilon=1, alpha=.5, first_visit=False)
keyboard_play(env,agent,method_label='MC (alpha=0.5)')
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
from irlc.exam_tabular_examples.helper import keyboard_play_value
from irlc.ex10.mc_evaluate import MCEvaluationAgent
from irlc.lectures.lec10.lecture_10_mc_onestate_first import CaughtGrid
if __name__ == "__main__":
    env = CaughtGrid(view_mode=1, render_mode='human')
agent = MCEvaluationAgent(env, gamma=1, alpha=None, first_visit=False)
keyboard_play_value(env,agent,method_label='MC (every visit)')
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
from irlc.exam_tabular_examples.helper import keyboard_play_value
# from irlc.gridworld_pyglet.gridworld_environments import BookGridEnvironment
from irlc.ex10.mc_evaluate import MCEvaluationAgent
from irlc.gridworld.gridworld_environments import GridworldEnvironment
map = [['#', '#', '#', '#'],
['#','S',0,'#'],
['#','#','#','#']]
class CaughtGrid(GridworldEnvironment):
def __init__(self, **kwargs):
super().__init__(map, living_reward=1, zoom=1.5, **kwargs)
if __name__ == "__main__":
env = CaughtGrid(view_mode=1, render_mode='human')
agent = MCEvaluationAgent(env, gamma=1, alpha=None)
keyboard_play_value(env,agent,method_label='MC (first visit)')
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
from irlc.gridworld.gridworld_environments import BookGridEnvironment
from irlc import train, interactive
def keyboard_play(env, agent, method_label='MC',autoplay=False, num_episodes=1000):
agent.label = method_label
env, agent = interactive(env, agent, autoplay=autoplay)
# agent = PlayWrapper(agent, env,autoplay=autoplay)
# env = VideoMonitor(env, agent=agent, fps=100, agent_monitor_keys=('pi', 'Q'), render_kwargs={'method_label': method_label})
train(env, agent, num_episodes=num_episodes)
env.close()
def automatic_play(env, agent, method_label='MC'):
    agent.label = method_label
    env, agent = interactive(env, agent, autoplay=True)  # the legacy PlayWrapper/VideoMonitor wrappers are replaced by interactive(...)
train(env, agent, num_episodes=1000)
env.close()
def automatic_play_value(env, agent, method_label='MC'):
agent.label = method_label
env, agent = interactive(env, agent)
# env = VideoMonitor(env, agent=agent, fps=40, continious_recording=True, agent_monitor_keys=('v'), render_kwargs={'method_label': method_label})
# agent = PlayWrapper(agent, env)
train(env, agent, num_episodes=1000)
env.close()
if __name__ == "__main__":
env = BookGridEnvironment(render_mode='human', zoom=2, living_reward=-0.05)
from irlc.ex10.mc_agent import MCAgent
agent = MCAgent(env, gamma=0.9, epsilon=1., first_visit=True, alpha=None)
# agent.label =
# env, agent = interactive(env, agent)
keyboard_play(env, agent, method_label='MC Q-estimation (First visit)')
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
from irlc.exam_tabular_examples.helper import keyboard_play_value
# from irlc.berkley.rl.feature_encoder import SimplePacmanExtractor
from irlc.gridworld.gridworld_environments import BookGridEnvironment
from irlc.ex10.mc_evaluate import MCEvaluationAgent
if __name__ == "__main__":
env = BookGridEnvironment(view_mode=1, render_mode='human', living_reward=-0.05)
agent = MCEvaluationAgent(env, gamma=.9, alpha=None, first_visit=False)
keyboard_play_value(env,agent,method_label='MC every')