Commit bb039ad1 authored by tuhe's avatar tuhe

Week 10 final

parent 23ff9694
@@ -73,7 +73,7 @@ irlc/exam/exam20*/solution
 #irlc/lectures/lec07
 #irlc/lectures/lec08
 # irlc/lectures/lec09
-irlc/lectures/lec10
+#irlc/lectures/lec10
 irlc/lectures/lec11
 irlc/lectures/lec12
 irlc/lectures/lec13
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
"""This directory contains the exercises for week 10."""
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
import gym
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt
from irlc import main_plot
from irlc import savepdf
from irlc.ex01.agent import train
from irlc.ex10.mc_evaluate_blackjack import plot_blackjack_value, plot_blackjack_policy
from irlc.ex10.mc_agent import MCAgent
def run_experiment(episodes, first_visit=True, **kwargs):
env_name = 'Blackjack-v1'
env = gym.make(env_name)
agent = MCAgent(env, **kwargs)
lbl = "_".join(map(str, kwargs.values()))
fvl = "First" if first_visit else "Every"
title = f"MC agent ({fvl} visit)"
expn = f"experiments/{env_name}_MCagent_{episodes}_{first_visit}_{lbl}" # Name the experiment. Pass the label to the train function to store intermediate results. See the online documentation for more information.
# TODO: 1 lines missing.
raise NotImplementedError("call the train(...) function here.")
# Matplotlib with seaborn is for some reason very slow.
# This code re-samples the curve to just 400 points:
main_plot(expn, smoothing_window=episodes//100, resample_ticks=400)
plt.title("Estimated returns in blackjack using " + title)
plt.ylim([-0.3, 0])
savepdf(f"blackjack_MC_agent_{episodes}_{first_visit}")
plt.show()
V = defaultdict(lambda: 0)
A = defaultdict(lambda: 0)
for s, av in agent.Q.to_dict().items():
A[s] = agent.pi(s, 0)
V[s] = max(av.values() )
plot_blackjack_value(V, title=title, pdf_out=f"blackjack_mcagent_policy{fvl}_valfun_{episodes}")
plt.show()
plot_blackjack_policy(A, title=title)
savepdf(f"blackjack_mcagent_policy{fvl}_{episodes}")
plt.show()
if __name__ == "__main__":
episodes = 1000000
# episodes = 1000 # Uncomment to run far fewer episodes during debugging.
run_experiment(episodes, epsilon=0.05, first_visit=True)
run_experiment(episodes, epsilon=0.05, first_visit=False)
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
import matplotlib.pyplot as plt
import numpy as np
def get_by_ace(V,ace=False):
dd = V.copy()
dd.clear()
for (p,d,ac),val in V.items():
if ac == ace:
dd[ (p,d)] = val
return dd
def plot_surface_2(X,Y,Z,fig=None, ax=None, **kwargs):
if fig is None and ax is None:
fig = plt.figure(figsize=(20, 10))
if ax is None:
ax = fig.add_subplot(projection='3d')
surf = ax.plot_surface(X, Y, Z, cmap=plt.cm.coolwarm, linewidth=1, edgecolors='k', **kwargs)
ax.view_init(ax.elev, -120)
if fig is not None:
fig.colorbar(surf, shrink=0.5, aspect=5)
return ax
def to_matrix(V):
min_x = min(k[0] for k in V.keys())
max_x = max(k[0] for k in V.keys())
min_y = min(k[1] for k in V.keys())
max_y = max(k[1] for k in V.keys())
x_range = np.arange(min_x, max_x + 1)
y_range = np.arange(min_y, max_y + 1)
X, Y = np.meshgrid(x_range, y_range)
Z_ace = np.zeros_like(X, dtype=float)
for j,(x, y) in enumerate( zip( X.flat, Y.flat)):
Z_ace.flat[j] = float(V[(x,y)])
return X, Y, Z_ace
def plot_blackjack_value(V, title="Value Function", pdf_out=None):
"""
Plots the value function as a surface plot.
"""
for lbl, ac in zip(["Usable ace", "No usable ace"], [True, False]):
w = get_by_ace(V,ace=ac)
X,Y,Z = to_matrix(w)
ax = plot_surface_2(X, Y, Z)
ax.set_zlabel("Value")
ax.set_title(title)
if pdf_out is not None:
savepdf(pdf_out+"_"+lbl.replace(" ", "_"))
def plot_blackjack_policy(V, title):
plt.figure(figsize=(18, 12))
for lbl, ac in zip(["Usable ace", "No usable ace"], [True, False]):
w = get_by_ace(V,ace=ac)
X, Y, Z = to_matrix(w)
plt.subplot(1,2,1+ac)
plt.imshow(Z.T)
plt.title(f"{title} ({lbl})")
plt.gca().invert_yaxis()
plt.ylabel('Player Sum')
plt.xlabel('Dealer Showing')
plt.colorbar()
def policy20(s):
# TODO: 1 lines missing.
raise NotImplementedError("Implement the rule where we stick if we have a score of 20 or more.")
if __name__ == "__main__":
from irlc.ex10.mc_evaluate import MCEvaluationAgent
from irlc.ex01.agent import train
import gym
from irlc import main_plot, savepdf
nenv = "Blackjack-v1"
env = gym.make(nenv)
episodes = 50000
gamma = 1
experiment = f"experiments/{nenv}_first_{episodes}"
""" Instantiate the agent and call the training method here. Make sure to pass the policy=policy20 function to the MCEvaluationAgent
and set gamma=1. """
# TODO: 2 lines missing.
raise NotImplementedError("Insert your solution and remove this error.")
main_plot(experiment, smoothing_window=episodes//100, resample_ticks=200)
plt.ylim([-0.5, 0])
plt.title("Blackjack using first-visit MC")
savepdf("blackjack_stick20_first")
plt.show()
pdf = "blackjack_stick20_valuefun"
plot_blackjack_value(agent.v, title="MC first-visit value function", pdf_out=pdf)
savepdf("blackjack_stick20_valuefun")
plt.show()
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
"""
References:
[SB18] Richard S. Sutton and Andrew G. Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online).
"""
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from irlc import savepdf
from irlc.ex10.td0_evaluate import TD0ValueAgent
from irlc.ex10.mc_evaluate import MCEvaluationAgent
import seaborn as sns
import pandas as pd
from irlc.ex01.agent import train
from irlc.ex09.mdp import MDP2GymEnv, MDP
class ChainMRP(MDP):
def __init__(self, length=6):
"""
Build the "Chain MRP" yafcport from (SB18). Terminal states are [0,6],
all states are [0,1,2,3,4,5,6] and initial state is 3. (default settings).
"""
self.max_states = length
super().__init__(initial_state=length // 2)
def is_terminal(self, state):
return state == 0 or state == self.max_states
def A(self, s): # 0: left, 1: right.
return [0,1]
def Psr(self, s, a):
# TODO: 1 lines missing.
raise NotImplementedError("Return the P(s', r | s,a) values here. See e.g. the gampler problem from previous week for help.")
return {(sp, 1 if sp == self.max_states else 0): 1.0}
class ChainEnvironment(MDP2GymEnv):
def __init__(self, *args, **kwargs):
super().__init__(mdp=ChainMRP(*args, **kwargs))
if __name__ == "__main__":
""" plot results as in (SB18, Example 6.2) """
env = ChainEnvironment()
V_init = np.array([0.5, 0.5, 0.5, 0.5, 0.5])
V_true = np.array([1 / 6, 2 / 6, 3 / 6, 4 / 6, 5 / 6])
states = range(1,6)
"""
    This is a bit janky. The value function is initialized at 0.5 in the example; however (see (SB18)),
    it must be initialized at 0 in terminal states. We therefore make a function that initializes the
    value function and pass it along to the ValueAgent; the ValueAgent then uses a subclassed
    defaultdict which can handle a parameterized default value. """
v_init_fun = lambda x: 0.5
fig, ax = plt.subplots(figsize=(15, 6), ncols=2)
""" Make TD plot """
td_episodes = [0, 1, 10, 100]
V_current = np.copy(V_init)
xticks = ['A', 'B', 'C', 'D', 'E']
for i, episodes in enumerate(td_episodes):
agent = TD0ValueAgent(env, v_init_fun=v_init_fun)
train(env, agent, num_episodes=episodes,verbose=False, return_trajectory=False)
vs = [agent.value(s) for s in states]
ax[0].plot(vs, label=f"{episodes} episodes", marker='o')
ax[0].plot(V_true, label='true values', marker='o')
ax[0].set(xlabel='State', ylabel='Estimated Value', title='Estimated Values TD(0)',
xticks=np.arange(5), xticklabels=['A','B','C','D','E'])
ax[0].legend()
""" Make TD vs. MC plot """
td_alphas = [0.05, 0.15, 0.1]
mc_alphas = [0.01, 0.03]
episodes = 100
runs = 200
def eval_mse(agent):
errors = []
for i in range(episodes):
V_ = [agent.value(s) for s in states]
train(env, agent, num_episodes=1, verbose=False, return_trajectory=False)
z = np.sqrt(np.sum(np.power(V_ - V_true, 2)) / 5.0)
errors.append(z)
return errors
methods = [(TD0ValueAgent, 'TD', alpha) for alpha in td_alphas]
methods += [(MCEvaluationAgent, 'MC', alpha) for alpha in mc_alphas]
dfs = []
for AC,method,alpha in tqdm(methods):
TD_mse = []
for r in range(runs):
agent = AC(env, alpha=alpha, gamma=1, v_init_fun=v_init_fun)
err_ = eval_mse(agent)
TD_mse.append( np.asarray(err_))
            # Happy times with pandas. Let's up the production value by also plotting a confidence band.
for u,mse in enumerate(TD_mse):
df = pd.DataFrame(mse, columns=['rmse'])
df.insert(len(df.columns), 'Unit', u)
df.insert(len(df.columns), 'Episodes', range(episodes))
df.insert(len(df.columns), 'Condition', f"{method} $\\alpha$={alpha}")
dfs.append(df)
data = pd.concat(dfs, ignore_index=True)
sns.lineplot(data=data, x='Episodes', y='rmse', hue="Condition", errorbar=('ci', 95), estimator='mean')
plt.ylabel("RMS error (averaged over states)")
plt.title("Empirical RMS error, averaged over states")
savepdf("random_walk_example")
plt.show()
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
import gymnasium as gym
gym.envs.register(
id='Gambler-v0',
entry_point='irlc.ex09.gambler:GamblerEnv',
)
gym.envs.register(
id='Tenv-v0',
entry_point='irlc.ex09.gambler:TEnv',
max_episode_steps=100,
)
gym.envs.register(
id='JackRental4-v0',
entry_point='irlc.ex09.jacks_car_rental:RentalEnv',
max_episode_steps=1000,
kwargs={"max_cars": 4,
"poisson_truncation": 4,
"cache_str": "jack_rental_environment_4"},
)
gym.envs.register(
id='JackRental-v0',
entry_point='irlc.ex09.jacks_car_rental:RentalEnv',
max_episode_steps=1000,
kwargs={"cache_str": "jack_rental_environment"},
) # "compress_tol": 0.01
gym.envs.register(
id='SmallGridworld-v0',
entry_point='irlc.gridworld.gridworld_environments:SuttonCornerGridEnvironment',
# max_episode_steps=100, # Stop trying to make it happen
)
gym.envs.register( # Like MountainCar-v0, but time limit increased from 200 to 500.
id='MountainCar500-v0',
entry_point='gymnasium.envs.classic_control:MountainCarEnv',
max_episode_steps=500,
reward_threshold=-110.0,
)
if __name__ == "__main__":
print("Testing...")
mc = gym.make('MountainCar500-v0')
# j4 = gym.make("JackRental4-v0")
# jack = gym.make("JackRental-v0")
sg = gym.make("SmallGridworld-v0")
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
from collections import defaultdict
import matplotlib.pyplot as plt
from irlc.ex09.rl_agent import TabularAgent
from irlc import main_plot, savepdf, train
from irlc import interactive
def get_MC_return_SA(episode, gamma, first_visit=True):
""" Helper method for computing the MC returns.
    Given an episode in the form [(s0, a0, r1), (s1, a1, r2), ...]
this function computes (if first_visit=True) a new list
> [((s,a), G) , ... ]
    consisting of the unique $(s_t, a_t)$ pairs in the episode along with their return G_t (computed from their first occurrence).
    Alternatively, if first_visit=False, the method returns a list of the same length as the episode,
    with all (s, a) pairs and their returns.
"""
sa = [(s, a) for s, a, r in episode] # Get all state/action pairs. Useful for checking if we have visited a state/action before.
G = 0
returns = []
for t in reversed(range(len(episode))):
# TODO: 2 lines missing.
raise NotImplementedError("Insert your solution and remove this error.")
if sa_t not in sa[:t] or not first_visit:
# TODO: 1 lines missing.
raise NotImplementedError("Implement function body")
return returns
class MCAgent(TabularAgent):
def __init__(self, env, gamma=1.0, epsilon=0.05, alpha=None, first_visit=True):
if alpha is None:
self.returns_sum_S = defaultdict(float)
self.returns_count_N = defaultdict(float)
self.alpha = alpha
self.first_visit = first_visit
self.episode = []
super().__init__(env, gamma, epsilon)
def pi(self, s,k, info=None):
"""
Compute the policy of the MC agent. Remember the agent is epsilon-greedy. You can use the pi_eps(s,info)-function defined
in the TabularAgent class.
"""
# TODO: 1 lines missing.
raise NotImplementedError("Compute action here using the Q-values. (remember to be epsilon-greedy)")
def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
"""
        Consult your implementation of the value-estimation agent for ideas. Note you can index the Q-values as
        >> self.Q[s, a] = new_q_value
        See the comments in the Agent class for more details; for now, you can consider them as simply a nested
        structure where ``self.Q[s, a]`` defaults to 0 unless the Q-value has been updated.
"""
# TODO: 12 lines missing.
raise NotImplementedError("Train the agent here.")
def __str__(self):
return f"MC_{self.gamma}_{self.epsilon}_{self.alpha}_{self.first_visit}"
if __name__ == "__main__":
""" Load environment but make sure it is time-limited. Can you tell why? """
envn = "SmallGridworld-v0"
from irlc.gridworld.gridworld_environments import SuttonCornerGridEnvironment, BookGridEnvironment
env = SuttonCornerGridEnvironment(uniform_initial_state=True)
# env = BookGridEnvironment(living_reward=-0.05) # Uncomment to test an alternative environment with a negative living reward.
gamma = 1
episodes = 20000
experiment="experiments/mcagent_smallgrid"
agent = MCAgent(env, gamma=gamma, first_visit=True)
train(env, agent, experiment_name=experiment, num_episodes=episodes, return_trajectory=False)
main_plot(experiments=[experiment], resample_ticks=200)
plt.title("Smallgrid MC agent value function")
plt.ylim([-10, 0])
savepdf("mcagent_smallgrid")
plt.show()
env, agent = interactive(env, agent)
env.reset()
env.plot()
plt.title(f"MC on-policy control of {envn} using first-visit")
savepdf("MC_agent_value_smallgrid")
plt.show(block=False)
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
from irlc import savepdf
import matplotlib.pyplot as plt
from irlc.ex09.rl_agent import ValueAgent
from collections import defaultdict
from irlc.ex01.agent import train
import numpy as np
import matplotlib
#matplotlib.use('qtagg') # Fix crash on linux with default backend.
def get_MC_return_S(episode, gamma, first_visit=True):
""" Helper method for computing the MC returns.
    Given an episode in the form ``[(s0, a0, r1), (s1, a1, r2), ...]``
this function computes (if first_visit=True) a new list::
[(s0, G0), (s1, G1), ...]
    consisting of the unique s_t values in the episode along with their return G_t (computed from their first occurrence).
    Alternatively, if first_visit=False, the method returns a list of the same length as the episode,
    with all s values and their returns.
"""
ss = [s for s, a, r in episode]
G = 0
returns = []
for t in reversed(range(len(episode))):
# TODO: 2 lines missing.
raise NotImplementedError("Insert your solution and remove this error.")
if s_t not in ss[:t] or not first_visit:
# TODO: 1 lines missing.
raise NotImplementedError("Implement function body")
return returns
class MCEvaluationAgent(ValueAgent):
def __init__(self, env, policy=None, gamma=1, alpha=None, first_visit=True, v_init_fun=None):
self.episode = []
self.first_visit = first_visit
self.alpha = alpha
if self.alpha is None:
self.returns_sum_S = defaultdict(float)
self.returns_count_N = defaultdict(float)
super().__init__(env, gamma, policy, v_init_fun=v_init_fun)
def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
self.episode.append((s, a, r)) # Gather the episode
if done: # Only train when the episode has stopped
returns = get_MC_return_S(self.episode, self.gamma, self.first_visit)
for s, G in returns:
if self.alpha:
# TODO: 1 lines missing.
raise NotImplementedError("Implement function body")
else:
# TODO: 3 lines missing.
raise NotImplementedError("Implement function body")
self.episode = []
def __str__(self):
return f"MCeval_{self.gamma}_{self.alpha}_{self.first_visit}"
if __name__ == "__main__":
envn = "SmallGridworld-v0"
from irlc import interactive
from irlc.gridworld.gridworld_environments import SuttonCornerGridEnvironment
env = SuttonCornerGridEnvironment(render_mode=None)
gamma = 1
episodes = 200
agent = MCEvaluationAgent(env, gamma=gamma)
train(env, agent, num_episodes=episodes)
env.render_mode = 'human'
env, agent = interactive(env, agent, autoplay=True)
env.plot()
plt.title(f"MC evaluation of {envn} using first-visit")
savepdf("MC_value_random_smallgrid")
plt.show(block=False)
env.close()
env = SuttonCornerGridEnvironment(render_mode=None)
agent_every = MCEvaluationAgent(env, gamma=gamma, first_visit=False)
train(env, agent_every, num_episodes=episodes)
env.render_mode = 'human'
    env, agent_every = interactive(env, agent_every, autoplay=True)
env.plot()
plt.title(f"MC evaluation of {envn} using every-visit")
savepdf("MC_value_random_smallgrid_every")
plt.show(block=False)
env.close()
s0 = (1, 1)
print(f"Estimated value functions v_pi(s0) for first visit {agent.v[(1,1)]:3}")
print(f"Estimated value functions v_pi(s0) for every visit {agent_every.v[(1,1)]:3}")
## Second part:
repeats = 5000 # increase to e.g. 20'000.
episodes = 1
ev, fv = [], []
env = SuttonCornerGridEnvironment()
print(f"Repeating experiment {repeats} times, this may take a while.")
for _ in range(repeats):
"""
        Instantiate two agents with first_visit=True and first_visit=False.
        Train the agents using the train function for `episodes` episodes. You might want to pass verbose=False to the
        'train'-method to suppress output.
        When done, compute the mean of agent.values() and add it to the lists ev / fv; the means of these lists
        are the desired results.
"""
agent = MCEvaluationAgent(env, gamma=gamma)
# TODO: 1 lines missing.
raise NotImplementedError("Create and train an every-visit agent.")
train(env, agent, num_episodes=episodes, verbose=False)
# TODO: 1 lines missing.
raise NotImplementedError("Create and train an every-visit agent.")
        ev.append(agent_every.v[(1,1)])
        fv.append(agent.v[(1,1)])
print(f"First visit: Mean of value functions E[v_pi(s0)] after {repeats} repeats {np.mean(fv):3}")
print(f"Every visit: Mean of value functions E[v_pi(s0)] after {repeats} repeats {np.mean(ev):3}")
env.close()
plt.close()
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
def a_compute_deltas(v: dict, states: list, rewards: list, gamma: float) -> list:
# TODO: Code has been removed from here.
raise NotImplementedError("Insert your solution and remove this error.")
return deltas
def b_perform_td0(v: dict, states: list, rewards: list, gamma: float, alpha: float) -> dict:
# TODO: Code has been removed from here.
raise NotImplementedError("Insert your solution and remove this error.")
return v
def c_perform_td0_batched(v: dict, states: list, rewards: list, gamma: float, alpha: float) -> dict:
# TODO: Code has been removed from here.
raise NotImplementedError("Insert your solution and remove this error.")
return v
if __name__ == "__main__":
states = [1, 0, 2, -1, 2, 4, 5, 4, 3, 2, 1, -1]
rewards = [1, 0.5, -1, 0, 1, 2, 2, 0, 0, -1, 0.5]
# In the notation of the problem: T = len(rewards).
v = {s: 0 for s in states} # Initialize the value function v.
gamma = 0.9
alpha = 0.2
deltas = a_compute_deltas(v, states, rewards, gamma)
print(f"The first value of delta should be 1, your value is {deltas[0]=}")
v = b_perform_td0(v, states, rewards, gamma, alpha)
print(f"The value function v(s=1) should be 0.25352, your value is {v[1]=}")
v_batched = {s: 0 for s in states} # Initialize the value function anew
v_batched = c_perform_td0_batched(v_batched, states, rewards, gamma, alpha)
print(f"The batched value function in v(s=1) should be 0.3, your value is {v_batched[1]=}")
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
import numpy as np
import matplotlib.pyplot as plt
from irlc.ex09.rl_agent import ValueAgent
from irlc import savepdf
from irlc.ex01.agent import train
class TD0ValueAgent(ValueAgent):
def __init__(self, env, policy=None, gamma=0.99, alpha=0.05, v_init_fun=None):
self.alpha = alpha
super().__init__(env, gamma=gamma, policy=policy, v_init_fun=v_init_fun)
def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
# TODO: 3 lines missing.
raise NotImplementedError("Implement function body")
def __str__(self):
return f"TD0Value_{self.gamma}_{self.alpha}"
def value_function_test(env, agent, v_true, episodes=200):
err = []
for t in range(episodes):
train(env, agent, num_episodes=1, verbose=False)
err.append( np.mean( [(v_true - v0) ** 2 for k, v0 in agent.v.items()] ) )
return np.asarray(err)
if __name__ == "__main__":
envn = "SmallGridworld-v0"
from irlc.gridworld.gridworld_environments import SuttonCornerGridEnvironment
from irlc import interactive
env = SuttonCornerGridEnvironment() # Make the gridworld environment itself
gamma = 1
agent = TD0ValueAgent(env, gamma=gamma, alpha=0.05) # Make a TD(0) agent
train(env, agent, num_episodes=2000, return_trajectory=False) # Train for 2000 episodes
env = SuttonCornerGridEnvironment(render_mode='human') # Re-make the gridworld to get rendering.
env, agent = interactive(env, agent) # Add a video monitor, the environment will now show an animation
train(env,agent,num_episodes=1) # Train for a (single) new episode
env.plot() # Plot the current state of the environment/agent
plt.title(f"TD0 evaluation of {envn}")
savepdf("TD_value_random_smallgrid")
plt.show(block=False)
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
from irlc.gridworld.gridworld_environments import BookGridEnvironment
from irlc.ex10.mc_agent import MCAgent
from irlc import interactive, train
class MCControlAgentOneState(MCAgent):
def __init__(self, *args, state_action=None, **kwargs):
super().__init__(*args, **kwargs)
if state_action is None:
state_action = (self.env.mdp.initial_state, self.env.mdp.A(self.env.mdp.initial_state)[0])
self.state_action = state_action
self._clear_states()
def _clear_states(self, val=None):
for s in self.env.mdp.nonterminal_states:
for a in self.env.mdp.A(s):
if (s,a) != self.state_action:
                    self.returns_sum_S[s, a] = val
                    self.returns_count_N[s, a] = val
k = next(self.env.mdp.Psr(s, self.env.mdp.A(s)[0]).keys().__iter__() )[0]
if not self.env.mdp.is_terminal(k):
self.Q[s,a] = 0
def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
# self.episode = [e for e in self.episode if e[0] == self.state]
self._clear_states(0)
super().train(s, a, r, sp, done)
# Clear out many of the state, actions:
self._clear_states(None)
# for s in self.env.mdp.nonterminal_states:
# if s != self.state:
# self.v[s] = None
pass
if __name__ == "__main__":
env = BookGridEnvironment(render_mode='human', living_reward=-0.05, print_states=True, zoom=2)
agent = MCControlAgentOneState(env, gamma=1, alpha=None, first_visit=True)
method_label = 'MC (gamma=1)'
agent.label = method_label
autoplay = False
env, agent = interactive(env, agent, autoplay=autoplay)
# agent = PlayWrapper(agent, env,autoplay=autoplay)
# env = VideoMonitor(env, agent=agent, fps=100, agent_monitor_keys=('pi', 'Q'), render_kwargs={'method_label': method_label})
num_episodes = 1000
train(env, agent, num_episodes=num_episodes)
env.close()
# keyboard_play(env,agent,method_label='MC (alpha=0.5)')
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
from irlc.lectures.lec10.lecture_10_mc_q_estimation import keyboard_play
from irlc.gridworld.gridworld_environments import SuttonCornerGridEnvironment, BookGridEnvironment
from irlc.ex10.mc_agent import MCAgent
from irlc.lectures.lec10.lecture_10_mc_action_value_first_one_state import MCControlAgentOneState
from irlc.ex10.mc_evaluate import MCEvaluationAgent
import numpy as np
from irlc import interactive, train
if __name__ == "__main__":
env = BookGridEnvironment(render_mode='human', living_reward=-0.05, print_states=True, zoom=2)
agent = MCControlAgentOneState(env, gamma=1, alpha=None, first_visit=True, state_action=( (0,2), 2))
method_label = 'MC control (gamma=1)'
agent.label = method_label
autoplay = False
env, agent = interactive(env, agent, autoplay=autoplay)
num_episodes = 1000
train(env, agent, num_episodes=num_episodes)
env.close()
# keyboard_play(env,agent,method_label='MC (alpha=0.5)')
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
from irlc.lectures.lec10.lecture_10_mc_q_estimation import keyboard_play
from irlc.gridworld.gridworld_environments import BookGridEnvironment
from irlc.ex10.mc_agent import MCAgent
import numpy as np
if __name__ == "__main__":
np.random.seed(433)
env = BookGridEnvironment(render_mode='human',zoom=2)
# agent = MCAgent(env, gamma=0.9, epsilon=0.15, alpha=0.1, first_visit=True)
agent = MCAgent(env, gamma=1.0, epsilon=0.15, alpha=None, first_visit=True)
# env, agent = interactive(env, agent)
keyboard_play(env,agent,method_label='MC control')
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
from irlc.lectures.lec10.lecture_10_mc_q_estimation import keyboard_play
from irlc.gridworld.gridworld_environments import SuttonCornerGridEnvironment
from irlc.ex10.mc_agent import MCAgent
import numpy as np
if __name__ == "__main__":
env = SuttonCornerGridEnvironment(render_mode='human')
agent = MCAgent(env, gamma=1, epsilon=1, alpha=.5, first_visit=False)
keyboard_play(env,agent,method_label='MC (alpha=0.5)')
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
from irlc.exam_tabular_examples.helper import keyboard_play_value
from irlc.ex10.mc_evaluate import MCEvaluationAgent
from irlc.lectures.lec10.lecture_10_mc_onestate_first import CaughtGrid
if __name__ == "__main__":
    env = CaughtGrid(view_mode=1, render_mode='human')
agent = MCEvaluationAgent(env, gamma=1, alpha=None, first_visit=False)
keyboard_play_value(env,agent,method_label='MC (every visit)')
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
from irlc.exam_tabular_examples.helper import keyboard_play_value
# from irlc.gridworld_pyglet.gridworld_environments import BookGridEnvironment
from irlc.ex10.mc_evaluate import MCEvaluationAgent
from irlc.gridworld.gridworld_environments import GridworldEnvironment
map = [['#', '#', '#', '#'],
['#','S',0,'#'],
['#','#','#','#']]
class CaughtGrid(GridworldEnvironment):
def __init__(self, **kwargs):
super().__init__(map, living_reward=1, zoom=1.5, **kwargs)
if __name__ == "__main__":
env = CaughtGrid(view_mode=1, render_mode='human')
agent = MCEvaluationAgent(env, gamma=1, alpha=None)
keyboard_play_value(env,agent,method_label='MC (first visit)')
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
from irlc.gridworld.gridworld_environments import BookGridEnvironment
from irlc import train, interactive
def keyboard_play(env, agent, method_label='MC',autoplay=False, num_episodes=1000):
agent.label = method_label
env, agent = interactive(env, agent, autoplay=autoplay)
# agent = PlayWrapper(agent, env,autoplay=autoplay)
# env = VideoMonitor(env, agent=agent, fps=100, agent_monitor_keys=('pi', 'Q'), render_kwargs={'method_label': method_label})
train(env, agent, num_episodes=num_episodes)
env.close()
def automatic_play(env, agent, method_label='MC'):
    agent.label = method_label
    env, agent = interactive(env, agent, autoplay=True)  # the legacy PlayWrapper/VideoMonitor wrappers are replaced by interactive(...)
train(env, agent, num_episodes=1000)
env.close()
def automatic_play_value(env, agent, method_label='MC'):
agent.label = method_label
env, agent = interactive(env, agent)
# env = VideoMonitor(env, agent=agent, fps=40, continious_recording=True, agent_monitor_keys=('v'), render_kwargs={'method_label': method_label})
# agent = PlayWrapper(agent, env)
train(env, agent, num_episodes=1000)
env.close()
if __name__ == "__main__":
env = BookGridEnvironment(render_mode='human', zoom=2, living_reward=-0.05)
from irlc.ex10.mc_agent import MCAgent
agent = MCAgent(env, gamma=0.9, epsilon=1., first_visit=True, alpha=None)
# agent.label =
# env, agent = interactive(env, agent)
keyboard_play(env, agent, method_label='MC Q-estimation (First visit)')
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
from irlc.exam_tabular_examples.helper import keyboard_play_value
# from irlc.berkley.rl.feature_encoder import SimplePacmanExtractor
from irlc.gridworld.gridworld_environments import BookGridEnvironment
from irlc.ex10.mc_evaluate import MCEvaluationAgent
if __name__ == "__main__":
env = BookGridEnvironment(view_mode=1, render_mode='human', living_reward=-0.05)
agent = MCEvaluationAgent(env, gamma=.9, alpha=None, first_visit=False)
keyboard_play_value(env,agent,method_label='MC every')