# ---------------------------------------------------------------------------
# Reconstructed from a unified diff whose line breaks had been collapsed.
# Original patch header for this file:
#   diff --git a/irlc/__init__.py b/irlc/__init__.py
#   new file mode 100644
#   index 0000000000000000000000000000000000000000..a5273e7ad3ca01ceff509fd51046bd4f52d95482
#   --- /dev/null
#   +++ b/irlc/__init__.py
#   @@ -0,0 +1,261 @@
# ---------------------------------------------------------------------------
""" Source code for 02466, Introduction to reinforcement learning and control, offered at DTU """
__version__ = "0.0.1"

# Do not import Matplotlib (or imports which import matplotlib) in case you have to run in headless mode.
import shutil
import inspect
import lzma, pickle

import gymnasium
import numpy as np
import os

# Global imports from across the API. Allows imports like
# > from irlc import Agent, train
from irlc.utils.irlc_plot import main_plot as main_plot
from irlc.utils.irlc_plot import plot_trajectory as plot_trajectory
try:
    from irlc.ex01.agent import Agent as Agent, train as train
    from irlc.ex09.rl_agent import TabularAgent, ValueAgent
except ImportError:
    # Deliberate best-effort: these names are simply not re-exported when the import fails.
    pass
from irlc.utils.player_wrapper import interactive as interactive
from irlc.utils.lazylog import LazyLog # This one is unclear. Is it required?
from irlc.utils.timer import Timer


def get_irlc_base():
    """Return the directory that contains this file (symlinks resolved)."""
    dir_path = os.path.dirname(os.path.realpath(__file__))
    return dir_path


def get_students_base():
    """Return the (un-normalized) path ``<irlc base>/../../../02465students/``."""
    return os.path.join(get_irlc_base(), "../../../02465students/")


def pd2latex_(pd, index=False, escape=False, column_spec=None, **kwargs): # You can add column specs.
    """
    Convert a DataFrame to a LaTeX table and post-process its first line with :func:`fix_bookstabs_latex_`.

    Columns of dtype ``float64`` whose values are all whole numbers are converted to ``int`` before export.
    The conversion is done on a copy, so the caller's DataFrame is left untouched.

    :param pd: The DataFrame to export (the name is kept for backward compatibility).
    :param index: Forwarded to ``DataFrame.to_latex``.
    :param escape: Forwarded to ``DataFrame.to_latex``.
    :param column_spec: Optional column specification; when given it replaces the generated one.
    :param kwargs: Further keyword arguments forwarded to ``DataFrame.to_latex``.
    :return: The LaTeX source as a string.
    """
    pd = pd.copy()  # The original code overwrote columns of the caller's DataFrame in place.
    for c in pd.columns:
        values = pd[c].values
        if values.dtype == 'float64' and all(values - np.round(values) == 0):
            pd[c] = pd[c].astype(int)
    ss = pd.to_latex(index=index, escape=escape, **kwargs)
    return fix_bookstabs_latex_(ss, column_spec=column_spec)


def fix_bookstabs_latex_(ss, linewidth=True, first_column_left=True, column_spec=None):
    """
    Rewrite the ``\\begin{tabular}{...}`` line of a LaTeX table.

    :param ss: LaTeX source; its first line is expected to end with the column specification ``{...}``.
    :param linewidth: If true, every occurrence of ``tabular`` is replaced by ``tabularx`` and the table is
        given width ``\\linewidth``; otherwise a plain ``tabular`` with a lower-cased column spec is emitted.
    :param first_column_left: Use ``l`` (instead of ``C``) for the first generated column.
    :param column_spec: Explicit column specification. If ``None``, one is generated with as many columns
        as there are characters in the original specification.
    :return: The modified LaTeX source.
    """
    if linewidth:
        ss = ss.replace("tabular", "tabularx")
    lines = ss.split("\n")
    hd = lines[0].split("{")
    if column_spec is None:
        # hd[-1] is the original column spec followed by the closing brace, e.g. "lrr}".
        n_columns = len(hd[-1][:-1])
        adj = ('l' if first_column_left else 'C') + "C" * (n_columns - 1)
    else:
        adj = column_spec

    if linewidth:
        lines[0] = "\\begin{tabularx}{\\linewidth}{" + adj + "}"
    else:
        lines[0] = "\\begin{tabular}{" + adj.lower() + "}"

    return '\n'.join(lines)


def _render_rgb_frame(env):
    """
    Render ``env`` once in ``'rgb_array'`` mode and return whatever ``env.render()`` returns.

    If ``env`` has a ``render_mode`` attribute that is not already ``'rgb_array'``, the attribute is switched
    for the duration of the call and restored afterwards, also when ``env.render()`` raises.
    (The previous inline code re-tested ``render_mode`` after overwriting it, so the original mode was
    never restored.)
    """
    if not hasattr(env, 'render_mode') or env.render_mode == 'rgb_array':
        return env.render()
    previous_mode = env.render_mode
    env.render_mode = 'rgb_array'
    try:
        return env.render()
    finally:
        env.render_mode = previous_mode


def plotenv(env : gymnasium.Env):
    """
    Given a Gymnasium environment instance, this function will plot the environment as a matplotlib image. Remember to call ``plt.show()`` to actually see the image.

    For this function to work, you must create the environment with :python:`render_mode='human'`.

    .. note::

        This function may not work for all gymnasium environments, however, it will work for most environments we use in this course.

    :param env: The environment to plot.
    """
    # Imported lazily so that importing irlc does not pull in matplotlib (headless mode).
    from PIL import Image
    import matplotlib.pyplot as plt

    frame = _render_rgb_frame(env)
    im = Image.fromarray(frame)

    plt.figure(figsize=(16, 16))
    plt.imshow(im)
    plt.axis('off')
    plt.tight_layout()


def _savepdf_env(file, env):
    """
    Save a snapshot of ``env`` to ``file``.

    A name ending in ``.png`` is written as a PNG image (and copied to the shared output directory on the
    author's machine); any other name is written as a PDF through :func:`savepdf`.

    :param file: Destination file name, with or without a ``.png``/``.pdf`` extension.
    :param env: The environment to render.
    :return: Absolute path of the PNG, or the value returned by :func:`savepdf` for PDFs.
    """
    from PIL import Image
    import matplotlib.pyplot as plt

    frame = _render_rgb_frame(env)
    im = Image.fromarray(frame)

    if file.endswith(".png"):
        sf, fext = file[:-4], 'png'
    elif file.endswith(".pdf"):
        sf, fext = file[:-4], 'pdf'
    else:
        sf, fext = file, 'pdf'

    sf = f"{sf}.{fext}"
    dn = os.path.dirname(sf)
    if len(dn) > 0 and not os.path.isdir(dn):
        os.makedirs(dn)
    print("Saving snapshot of environment to", os.path.abspath(sf))
    if fext == 'png':
        im.save(sf)
        _move_to_output_directory(sf)
        return os.path.abspath(sf)

    plt.figure(figsize=(16, 16))
    plt.imshow(im)
    plt.axis('off')
    plt.tight_layout()
    return savepdf(sf, verbose=True)


def savepdf(pdf, verbose=False, watermark=False, env=None):
    """
    Convenience function for saving PDFs. Just call it after you have created your plot as ``savepdf('my_file.pdf')``
    to save a PDF of the plot.
    You can also pass an environment, in which case the environment will be stored to a pdf file.

    A file name without a directory component is placed in a ``pdf`` sub-directory of the current working
    directory; the target directory is created if needed.

    :param pdf: The file to save to, for instance ``"my_pdf.pdf"``
    :param verbose: Print output destination (optional)
    :param watermark: Include a watermark (optional; currently unused by this function)
    :param env: If given, a snapshot of this environment is saved instead of the current figure.
    :return: Full path of the created PDF. ``None`` when called from a ``RUN_OUTPUT_CAPTURE.py`` script,
        in which case nothing is saved.
    """
    if env is not None:
        # Previously this path returned None even though a file had been written.
        return _savepdf_env(pdf, env)

    import matplotlib.pyplot as plt
    pdf = os.path.normpath(pdf.strip())
    pdf = pdf + ".pdf" if not pdf.endswith(".pdf") else pdf

    if os.sep in pdf:
        pdf = os.path.abspath(pdf)
    else:
        pdf = os.path.join(os.getcwd(), "pdf", pdf)
    os.makedirs(os.path.dirname(pdf), exist_ok=True)

    # Skip saving when any frame on the call stack belongs to an output-capture script.
    # Modules without a usable __file__ (e.g. built-ins) are ignored instead of raising.
    modules = [inspect.getmodule(s[0]) for s in inspect.stack()]
    files = [getattr(m, "__file__", None) for m in modules if m is not None]
    if any(f is not None and f.endswith("RUN_OUTPUT_CAPTURE.py") for f in files):
        return

    plt.savefig(fname=pdf)
    outf = os.path.normpath(os.path.abspath(pdf))
    print("> [savepdf]", pdf + (f" [full path: {outf}]" if verbose else ""))

    return outf


def _move_to_output_directory(file):
    """
    Hidden function: Copy the given file to the static (shared) output dir.

    Does nothing unless :func:`is_this_my_computer` returns true. The source file is left in place.
    """
    if not is_this_my_computer():
        return
    CDIR = os.path.dirname(os.path.realpath(__file__)).replace('\\', '/')
    shared_output_dir = CDIR + "/../../shared/output"
    shutil.copy(file, shared_output_dir + "/" + os.path.basename(file))


def bmatrix(a):
    """Returns a LaTeX bmatrix

    :a: numpy array
    :returns: LaTeX bmatrix as a string
    :raises ValueError: if ``a`` has more than two dimensions.
    """
    # NOTE: this changes NumPy's global print options (scientific notation suppressed) as a side effect;
    # kept as-is because callers may rely on it.
    np.set_printoptions(suppress=True)
    if len(a.shape) > 2:
        raise ValueError('bmatrix can at most display two dimensions')
    lines = str(a).replace('[', '').replace(']', '').splitlines()
    rv = [r'\begin{bmatrix}']
    rv += [' ' + ' & '.join(l.split()) + r'\\' for l in lines]
    rv += [r'\end{bmatrix}']
    return '\n'.join(rv)


def is_this_my_computer():
    """Return true if ``<this directory>/../../Exercises`` exists."""
    CDIR = os.path.dirname(os.path.realpath(__file__)).replace('\\', '/')
    return os.path.exists(CDIR + "/../../Exercises")


def cache_write(object, file_name, only_on_professors_computer=False, verbose=True, protocol=-1): # -1 selects the highest pickle protocol.
    """
    Pickle ``object`` into the LZMA-compressed file ``file_name``.

    :param object: The object to store (name kept for backward compatibility).
    :param file_name: Destination file. Missing parent directories are created.
    :param only_on_professors_computer: If true, do nothing unless :func:`is_this_my_computer` is true.
    :param verbose: Print progress messages.
    :param protocol: Pickle protocol passed to ``pickle.dump`` (it was previously accepted but ignored).
    """
    if only_on_professors_computer and not is_this_my_computer():
        # Probably for your own good :-).
        return

    dn = os.path.dirname(file_name)
    if dn:
        # A bare file name has no directory part; os.mkdir('') would raise. makedirs also handles nesting.
        os.makedirs(dn, exist_ok=True)
    if verbose:
        print("Writing cache...", file_name)
    with lzma.open(file_name, 'wb') as f:
        pickle.dump(object, f, protocol=protocol)
    if verbose:
        print("Done!")


def cache_exists(file_name):
    """Return true if ``file_name`` exists."""
    return os.path.exists(file_name)


def cache_read(file_name):
    """
    Load an object written by :func:`cache_write`; return ``None`` if ``file_name`` does not exist.

    SECURITY NOTE: this unpickles the file content. Only read cache files you created yourself or trust.
    """
    if os.path.exists(file_name):
        with lzma.open(file_name, 'rb') as f:
            return pickle.load(f)
    else:
        return None


# ---------------------------------------------------------------------------
# Remaining patch entries found on these lines (no Python code):
#   diff --git a/irlc/__pycache__/__init__.cpython-311.pyc b/irlc/__pycache__/__init__.cpython-311.pyc
#   new file mode 100644
#   index 0000000000000000000000000000000000000000..97a0dcea2a082392c4fe14d86fedd5de91689215
#   Binary files /dev/null and b/irlc/__pycache__/__init__.cpython-311.pyc differ
#
#   diff --git a/irlc/car/__init__.py b/irlc/car/__init__.py
#   new file mode 100644
#   index 0000000000000000000000000000000000000000..a56057c84d0ceac54aab1d40ba0f370c77fe10be
#   --- /dev/null
#   +++ b/irlc/car/__init__.py
#   @@ -0,0 +1 @@
#   +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
#
#   diff --git a/irlc/car/car_model.py b/irlc/car/car_model.py
#   new file mode 100644
#   index 0000000000000000000000000000000000000000..d99168458fe1e71303acb48540bead0f934357a9
#   --- /dev/null
#   +++ b/irlc/car/car_model.py
#   @@ -0,0 +1,304 @@
# ---------------------------------------------------------------------------
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
# from irlc.car.car_viewer import CarViewer
from irlc.car.car_viewer import CarViewerPygame
import numpy as np
import sympy as sym
from scipy.optimize import Bounds
from gymnasium.spaces import Box
from irlc.car.sym_map import SymMap, wrap_angle
from irlc.ex03.control_model import ControlModel
from irlc.ex03.control_cost import SymbolicQRCost
from irlc.ex04.discrete_control_model import DiscreteControlModel
from irlc.ex04.control_environment import ControlEnvironment
# from irlc.ex03.control_specification import ControlSpecification

# Design sketch kept from the original file (it was a bare, unused string literal):
#
# class MySpecification():
#     def get_bounds(self):
#         return bounds
#
#     def get_cost(self):
#         pass
#
#     def sym_f(self):
#         return ...
#
#     def simulate(self):
#         # Simulate using RK4.
#         pass
#
# spec = MySpecification()
# model = Model(spec)
# model.simulate(...)


class SymbolicBicycleModel(ControlModel):
    """
    Symbolic (sympy) bicycle model of a race car driving on a :class:`SymMap` track.

    The state and action layout is described by the text printed from ``__init__`` when ``verbose`` is true.
    """
    metadata = {
        'render.modes': ['human', 'rgb_array'],
        'video.frames_per_second': 30
    }

    def __init__(self, map_width=0.8, simple_bounds=None, cost=None, hot_start=False, verbose=True):
        """
        :param map_width: Passed to ``SymMap(width=...)``; also used as the bound on ``x[5]`` in :meth:`x_bound`.
        :param simple_bounds: Unused by this class (kept for backward compatibility).
        :param cost: Unused by this class (kept for backward compatibility); see :meth:`get_cost`.
        :param hot_start: Stored on the model; ``CarEnvironment`` uses it to start with a non-zero velocity.
        :param verbose: Print the description of the coordinate system.
        """
        s = """
        Coordinate system of the car:
        State x consist of
        x[0] = Vx (speed in direction of the car body)
        x[1] = Vy (speed perpendicular to car body)
        x[2] = wz (Yaw rate; how fast the car is turning)
        x[3] = e_psi (Angle of rotation between car body and centerline)
        x[4] = s (How far we are along the track)
        x[5] = e_y (Distance between car body and closest point on centerline)

        Meanwhile the actions are
        u[0] : Angle between wheels and car body (i.e. are we steering to the right or to the left)
        u[1] : Engine force (applied to the rear wheels, i.e. accelerates car)
        """
        if verbose:
            print(s)
        self.map = SymMap(width=map_width)
        self.v_max = 3.0

        self.viewer = None # rendering
        self.hot_start = hot_start
        super().__init__()

    def get_cost(self) -> SymbolicQRCost:
        """Return a quadratic cost that only penalizes the action (``R = 10 * I``, ``Q = 0``, ``qc = 0``)."""
        return SymbolicQRCost(Q=np.zeros((6,6)), R=np.eye(2)*10, qc=1.*0)

    def x_bound(self) -> Box:
        """State bounds: ``x[0] <= v_max`` and ``|x[5]| <= map.width``; all other components are unbounded."""
        return Box(np.asarray([-np.inf, -np.inf, -np.inf, -np.inf, -np.inf, -self.map.width]),
                   np.asarray([self.v_max, np.inf, np.inf, np.inf, np.inf, self.map.width]))

    def u_bound(self) -> Box:
        """Action bounds: ``u[0]`` in ``[-0.5, 0.5]`` and ``u[1]`` in ``[-1, 1]``."""
        return Box(np.asarray([-0.5, -1]),np.asarray([0.5, 1]))

    def render(self, x, render_mode='human'):
        """Draw the car at curvilinear state ``x``; the viewer is created on first use."""
        if self.viewer is None:
            self.viewer = CarViewerPygame(self)

        self.viewer.update(self.x_curv2x_XY(x))
        return self.viewer.blit(render_mode=render_mode)

    def close(self):
        """Close the viewer if one has been created."""
        if self.viewer is not None:
            self.viewer.close()

    def x_curv2x_XY(self, x_curv):
        '''
        Utility function for converting x (including velocities, etc.) from local (curvilinear) coordinates to global XY position.

        :return: ``[x_curv[0], x_curv[1], x_curv[2], heading angle, X, Y]`` as a numpy array.
        '''
        Xc, Yc, vangle = self.map.getGlobalPosition(s=x_curv[4], ey=x_curv[5], epsi=x_curv[3])
        dglob = np.asarray([x_curv[0], x_curv[1], x_curv[2], vangle, Xc, Yc])
        return dglob

    def sym_f(self, x, u, t=None, curvelinear_coordinates=True, curvature_s=None):
        r'''
        Create derivative function

        \dot{x} = f(x, u)

        We will both create it in curvelinear coordinates or normal (global) coordinates.

        :param x: Symbolic state (6 components).
        :param u: Symbolic action (2 components).
        :param t: Unused.
        :param curvelinear_coordinates: If true, ``x[3:6]`` is ``(e_psi, s, e_y)``; otherwise ``x[3]`` is the
            global heading and the last two derivatives are those of the global X and Y position.
        :param curvature_s: Unused.
        :return: List with the six components of the time derivative of ``x``.
        '''
        # Vehicle Parameters
        m = 1.98
        lf = 0.125
        lr = 0.125
        Iz = 0.024
        Df = 0.8 * m * 9.81 / 2.0
        Cf = 1.25
        Bf = 1.0
        Dr = 0.8 * m * 9.81 / 2.0
        Cr = 1.25
        Br = 1.0

        vx = x[0]
        vy = x[1]
        wz = x[2]
        if curvelinear_coordinates:
            epsi = x[3]
            s = x[4]
            ey = x[5]
        else:
            psi = x[3]

        delta = u[0]
        a = u[1]

        alpha_f = delta - sym.atan2(vy + lf * wz, vx)
        # The original expression used lf here; lf == lr, so using lr leaves the result unchanged.
        alpha_r = -sym.atan2(vy - lr * wz, vx)

        # Compute lateral force at front and rear tire
        Fyf = 2 * Df * sym.sin(Cf * sym.atan(Bf * alpha_f))
        Fyr = 2 * Dr * sym.sin(Cr * sym.atan(Br * alpha_r))

        d_vx = (a - 1 / m * Fyf * sym.sin(delta) + wz * vy)
        d_vy = (1 / m * (Fyf * sym.cos(delta) + Fyr) - wz * vx)
        d_wz = (1 / Iz * (lf * Fyf * sym.cos(delta) - lr * Fyr))

        if curvelinear_coordinates:
            cur = self.map.sym_curvature(s)
            d_epsi = (wz - (vx * sym.cos(epsi) - vy * sym.sin(epsi)) / (1 - cur * ey) * cur)
            d_s = ((vx * sym.cos(epsi) - vy * sym.sin(epsi)) / (1 - cur * ey))
            # ODE governing ey (distance from the center of the road) in curvilinear coordinates.
            d_ey = (vx * sym.sin(epsi) + vy * sym.cos(epsi))
            xp = [d_vx, d_vy, d_wz, d_epsi, d_s, d_ey]
        else:
            d_psi = wz
            d_X = ((vx * sym.cos(psi) - vy * sym.sin(psi)))
            d_Y = (vx * sym.sin(psi) + vy * sym.cos(psi))

            xp = [d_vx, d_vy, d_wz, d_psi, d_X, d_Y]
        return xp

    def fix_angles(self, x):
        """Wrap the angular component (index 3) of a state vector, or of each row of a 2-D array, in place."""
        # fix angular component of x
        if x.size == self.state_size:
            x[3] = wrap_angle(x[3])
        elif x.shape[1] == self.state_size:
            x[:,3] = wrap_angle(x[:,3])
        return x


class DiscreteCarModel(DiscreteControlModel):
    """Discretized version of :class:`SymbolicBicycleModel`."""

    def __init__(self, dt=0.1, cost=None, **kwargs):
        """
        :param dt: Discretization time step passed to ``DiscreteControlModel``.
        :param cost: Passed to ``DiscreteControlModel``.
        :param kwargs: Forwarded to :class:`SymbolicBicycleModel`.
        """
        model = SymbolicBicycleModel(**kwargs)
        super().__init__(model=model, dt=dt, cost=cost)
        self.map = model.map


class CarEnvironment(ControlEnvironment):
    """Gymnasium-style environment built around :class:`DiscreteCarModel` that counts completed laps."""

    def __init__(self, Tmax=10, noise_scale=1.0, cost=None, max_laps=10, hot_start=False, render_mode=None, **kwargs):
        """
        :param Tmax: Passed to ``ControlEnvironment``.
        :param noise_scale: Stored, but currently not used when noise is added in :meth:`step`.
        :param cost: Passed to :class:`DiscreteCarModel` and stored as ``self.cost``.
        :param max_laps: :meth:`step` reports ``done`` once this many laps are completed.
        :param hot_start: Forwarded to the continuous model; see :meth:`_get_initial_state`.
        :param render_mode: Passed to ``ControlEnvironment``.
        :param kwargs: Forwarded to :class:`DiscreteCarModel`.
        """
        discrete_model = DiscreteCarModel(cost=cost, hot_start=hot_start, **kwargs)
        super().__init__(discrete_model, Tmax=Tmax, render_mode=render_mode)
        self.map = discrete_model.map
        self.noise_scale = noise_scale
        self.cost = cost
        self.completed_laps = 0
        self.max_laps = max_laps

    def simple_bounds(self):
        """Return ``scipy.optimize.Bounds`` for state (``'x'``), start time (``'t0'``) and action (``'u'``)."""
        simple_bounds = {'x': Bounds(self.observation_space.low, self.observation_space.high),
                         't0': Bounds([0], [0]),
                         'u': Bounds(self.action_space.low, self.action_space.high)}
        return simple_bounds

    def step(self, u):
        """
        Take one step, add a small amount of noise to the velocities and handle lap completion.

        We add a bit of noise for backward compatibility.

        :param u: The action.
        :return: ``(x, cost, done, False, info)`` where ``done`` is true once ``max_laps`` laps are completed.
            The ``terminated``/``truncated`` flags of the parent class are not propagated.
        :raises AssertionError: if the track position ``x[4]`` becomes negative.
        """
        # We don't want to render the car before we have added jitter (below). These lines therefore disable rendering.
        # try/finally makes sure the render mode is restored even if the parent step raises.
        self.render_mode, rmt_ = None, self.render_mode
        try:
            xp, cost, terminated, truncated, info = super().step(u)
        finally:
            self.render_mode = rmt_

        x = xp  # NOTE: the returned state is modified in place below.
        if hasattr(self, 'seed') and self.seed is not None and not callable(self.seed):
            np.random.seed(self.seed)

        noise_vx = np.maximum(-0.05, np.minimum(np.random.randn() * 0.01, 0.05))
        noise_vy = np.maximum(-0.1, np.minimum(np.random.randn() * 0.01, 0.1))
        noise_wz = np.maximum(-0.05, np.minimum(np.random.randn() * 0.005, 0.05))
        x[0] = x[0] + 0.03 * noise_vx #* self.noise_scale
        x[1] = x[1] + 0.03 * noise_vy #* self.noise_scale
        x[2] = x[2] + 0.03 * noise_wz #* self.noise_scale

        if x[4] > self.map.TrackLength:
            self.completed_laps += 1
            x[4] -= self.map.TrackLength

        done = self.completed_laps >= self.max_laps
        if x[4] < 0:
            # Was `assert(False)`; raised explicitly so the check also runs under `python -O`.
            raise AssertionError(f"Track position x[4] must be non-negative, got {x[4]}")
        if self.render_mode == 'human':
            self.render()
        return x, cost, done, False, info

    def L(self, x):
        '''
        Implement whether we have obtained the terminal condition. see eq. 4 in "Autonomous Racing using LMPC"

        :param x: The state.
        :return: True if the track position ``x[4]`` exceeds the track length.
        '''
        return x[4] > self.map.TrackLength

    def epoch_reset(self, x):
        '''
        After completing one epoch, i.e. when L(x) == True, reset the x-vector using this method to
        restart the epoch. In practice, take one more lap on the track.

        :param x: The state (not modified).
        :return: A copy of ``x`` with the track length subtracted from ``x[4]``.
        '''
        x = x.copy()
        x[4] -= self.map.TrackLength
        return x

    def _get_initial_state(self):
        """Return the all-zero state; with ``hot_start`` the forward velocity ``x[0]`` is 0.5."""
        x0 = np.zeros((6,))
        if self.discrete_model.continuous_model.hot_start:
            x0[0] = 0.5 # Start velocity is 0.5
        return x0


if __name__ == "__main__":
    # Small demo/benchmark: drive straight with a tiny constant engine force and report time per frame.
    env = CarEnvironment(render_mode='human')
    env.metadata['video.frames_per_second'] = 10000
    env.reset()
    import time
    t0 = time.time()
    n = 300
    for _ in range(n):
        u = env.action_space.sample()
        u[0] = 0
        u[1] = 0.01
        s, cost, done, truncated, info = env.step(u)
    env.close()
    tpf = (time.time()- t0)/n
    print("TPF", tpf, "fps", 1/tpf)

# ---------------------------------------------------------------------------
# Next patch entry found on these lines:
#   diff --git a/irlc/car/car_viewer.py b/irlc/car/car_viewer.py
#   new file mode 100644
#   index 0000000000000000000000000000000000000000..0952d40241b9eda29df6319d8482f7d9abb697b8
#   --- /dev/null
#   +++ b/irlc/car/car_viewer.py
#   @@ -0,0 +1,51 @@
# ---------------------------------------------------------------------------
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
+# from pyglet.shapes import Rectangle, Circle +# from irlc.utils.pyglet_rendering import PygletViewer, PolygonOutline, GroupedElement +import pygame +from irlc.utils.graphics_util_pygame import UpgradedGraphicsUtil +import numpy as np + +track_outline = (0, 0, 0) +track_middle = (220, 25, 25) + +class CarViewerPygame(UpgradedGraphicsUtil): + def __init__(self, car): + + n = int(10 * (car.map.PointAndTangent[-1, 3] + car.map.PointAndTangent[-1, 4])) + center = [car.map.getGlobalPosition(i * 0.1, 0) for i in range(n)] + outer = [car.map.getGlobalPosition(i * 0.1, -car.map.width) for i in range(n)] + inner = [car.map.getGlobalPosition(i * 0.1, car.map.width) for i in range(n)] + fudge = 0.2 + xs, ys = zip(*outer) + super().__init__(screen_width=1000, xmin=min(xs) - fudge, xmax=max(xs) + fudge, + ymax=min(ys) - fudge, ymin=max(ys) + fudge, title="Racecar environment") + self.center = center + self.outer = outer + self.inner = inner + # Load ze sprite. + from irlc.utils.graphics_util_pygame import Object + self.car = Object("car.png", image_width=90) + + + def render(self): + green = (126, 200, 80) + track = (144,)*3 + self.draw_background(background_color=green) + + self.polygon("safd", self.outer, fillColor=track, outlineColor=track_outline, width=3) + self.polygon("in", self.inner, fillColor=green, outlineColor=track_outline, width=3) + self.polygon("in", self.center, fillColor=None, filled=False, outlineColor=(100, 100, 100), width=5) + # Now draw the pretty car. + x, y, psi = self.xglob[4], self.xglob[5], self.xglob[3] + xy = self.fixxy((x,y)) + # self.car.rect.move() + self.car.rect.center = xy + # self.car.rect.center = xy[1] + + self.car.rotate(psi / (2*np.pi) * 360) + # self.car.rotate(45) + self.car.blit(self.surf) + self.circle("in", (x,y), 4, fillColor=(255, 0, 0)) # drawn on the center of the car. 
+ + def update(self, xglob): + self.xglob = xglob diff --git a/irlc/car/sym_map.py b/irlc/car/sym_map.py new file mode 100644 index 0000000000000000000000000000000000000000..0142042dd9d769c2456009e2706edf6826f30221 --- /dev/null +++ b/irlc/car/sym_map.py @@ -0,0 +1,450 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +import pdb +import matplotlib.pyplot as plt +import numpy as np +import numpy.linalg as la +import sympy as sym + +""" +This is a bunch of pretty awful code to define a map and compute useful quantities like tangents, etc. +Defining a map is pretty straight forward (it consist of circle archs and lines), but +don't try to read on. +""" +class SymMap: + def plot(self, show=False): + PointAndTangent, TrackLength, extra = self.spec2PointAndTangent(self.spec) + for i in range(PointAndTangent.shape[0]-1): + extra_ = extra[i] + if 'CenterX' in extra_: + CenterX, CenterY = extra_['CenterX'], extra_['CenterY'] + angle, spanAng = extra_['angle'], extra_['spanAng'] + r = self.spec[i,1] + direction = 1 if r >= 0 else -1 + + # Plotting. Ignore this + plt.plot(CenterX, CenterY, 'ro') + tt = np.linspace(angle, angle + direction * spanAng) + plt.plot(CenterX + np.cos(tt) * np.abs(r), CenterY + np.abs(r) * np.sin(tt), 'r-') + + x, y = PointAndTangent[:, 0], PointAndTangent[:, 1] + plt.plot(x, y, '.-') + print(np.sum(np.sum(np.abs(self.PointAndTangent - PointAndTangent)))) + + if show: + plt.show() + ''' + Format: + PointAndTangent = [x, + y, + psi: angle of tangent vector at the last point of segment, + total-distance-travelled, + segment-length, curvature] + + Also creates a symbolic expression to evaluate track position. + ''' + def spec2PointAndTangent(self, spec): + # also create a symbolic piecewise expression to evaluate the curvature as a function of track length location. 
+ + # spec = self.spec + # PointAndTangent = self.PointAndTangent.copy() + PointAndTangent = np.zeros((spec.shape[0] + 1, 6)) + extra = [] + + N = spec.shape[0] + segment_s_cur = 0 # Distance travelled to start of segment (s-coordinate). + angle_prev = 0 # Angle of the tangent vector at the starting point of the segment + x_prev, y_prev = 0, 0 # x,y coordinate of last point of previous segment. + for i in range(N): + l, r = spec[i,0], spec[i,1] # Length of segment and radius of curvature + ang = angle_prev # Angle of the tangent vector at the starting point of the segment + + if r == 0.0: # If the current segment is a straight line + x = x_prev + l * np.cos(ang) # x coordinate of the last point of the segment + y = y_prev + l * np.sin(ang) # y coordinate of the last point of the segment + psi = ang # Angle of the tangent vector at the last point of the segment + curvature = 0 + extra_ = {} + else: + direction = 1 if r >= 0 else -1 + CenterX = x_prev + np.abs(r) * np.cos(ang + direction * np.pi / 2) # x coordinate center of circle + CenterY = y_prev + np.abs(r) * np.sin(ang + direction * np.pi / 2) # y coordinate center of circle + spanAng = l / np.abs(r) # Angle spanned by the circle + psi = wrap_angle(ang + spanAng * np.sign(r)) # Angle of the tangent vector at the last point of the segment + angleNormal = wrap_angle((direction * np.pi / 2 + ang)) + angle = -(np.pi - np.abs(angleNormal)) * (sign(angleNormal)) + x = CenterX + np.abs(r) * np.cos(angle + direction * spanAng) # x coordinate of the last point of the segment + y = CenterY + np.abs(r) * np.sin(angle + direction * spanAng) # y coordinate of the last point of the segment + curvature = 1/r + + extra_ = {'CenterX': CenterX, + 'CenterY': CenterY, + 'angle': angle, + 'direction': direction, + 'spanAng': spanAng} + + extra.append(extra_) + NewLine = np.array([x, y, psi, segment_s_cur, l, curvature]) + PointAndTangent[i, :] = NewLine # Write the new info + x_prev, y_prev, angle_prev = PointAndTangent[i, 0], 
PointAndTangent[i, 1], PointAndTangent[i, 2] + segment_s_cur += l + + xs = PointAndTangent[-2, 0] + ys = PointAndTangent[-2, 1] + xf = 0 + yf = 0 + psif = 0 + + l = np.sqrt((xf - xs) ** 2 + (yf - ys) ** 2) + + NewLine = np.array([xf, yf, psif, PointAndTangent[-2, 3] + PointAndTangent[-2, 4], l, 0]) + PointAndTangent[-1, :] = NewLine + TrackLength = PointAndTangent[-1, 3] + PointAndTangent[-1, 4] + + return PointAndTangent, TrackLength, extra + + + """map object + Attributes: + getGlobalPosition: convert position from (s, ey) to (X,Y) + """ + def __init__(self, width): + """Initialization + width: track width + Modify the vector spec to change the geometry of the track + """ + self.width = width + self.halfWidth = 0.4 + self.slack = 0.45 + lengthCurve = 3.5 # 3.0 + straight = 1.0 + spec = np.array([[1.0, 0], + [lengthCurve, lengthCurve / np.pi], + # Note s = 1 * np.pi / 2 and r = -1 ---> Angle spanned = np.pi / 2 + [straight, 0], + [lengthCurve / 2, -lengthCurve / np.pi], + [straight, 0], + [lengthCurve, lengthCurve / np.pi], + [lengthCurve / np.pi * 2 + 1.0, 0], + [lengthCurve / 2, lengthCurve / np.pi]]) + + + PointAndTangent, TrackLength, extra = self.spec2PointAndTangent(spec) + self.PointAndTangent = PointAndTangent + self.TrackLength = TrackLength + self.spec = spec + + + ''' + Creates a symbolic expression for the curvature + +def Curvature(s, PointAndTangent): + """curvature computation + s: curvilinear abscissa at which the curvature has to be evaluated + PointAndTangent: points and tangent vectors defining the map (these quantities are initialized in the map object) + """ + TrackLength = PointAndTangent[-1,3]+PointAndTangent[-1,4] + + # In case on a lap after the first one + while (s > TrackLength): + s = s - TrackLength + + # Given s \in [0, TrackLength] compute the curvature + # Compute the segment in which system is evolving + index = np.all([[s >= PointAndTangent[:, 3]], [s < PointAndTangent[:, 3] + PointAndTangent[:, 4]]], axis=0) + + i = 
int(np.where(np.squeeze(index))[0]) + curvature = PointAndTangent[i, 5] + + return curvature + + ''' + def sym_curvature(self, s): + s = s - self.TrackLength * sym.floor(s / self.TrackLength) + n = self.PointAndTangent.shape[0] + pw = [] + for i in range(n): + pw.append( (self.PointAndTangent[i,5], s - (self.PointAndTangent[i, 3] + self.PointAndTangent[i, 4]) <= 0) ) + p = sym.Piecewise(*pw) + return p + + def getGlobalPosition(self, s, ey, epsi=None, vangle_true=None): + """coordinate transformation from curvilinear reference frame (e, ey) to inertial reference frame (X, Y) + (s, ey): position in the curvilinear reference frame + """ + # wrap s along the track + # while (s > self.TrackLength): + # s = s - self.TrackLength + s = np.mod(s, self.TrackLength) + + # Compute the segment in which system is evolving + PointAndTangent = self.PointAndTangent + + index = np.all([[s >= PointAndTangent[:, 3]], [s < PointAndTangent[:, 3] + PointAndTangent[:, 4]]], axis=0) + dx = np.where(np.squeeze(index)) + if len(dx) < 1: + a = 234 + raise Exception("bad") + try: + i = int(np.where(np.squeeze(index))[0]) + except Exception as e: + print(e) + + + if PointAndTangent[i, 5] == 0.0: # If segment is a straight line + # Extract the first final and initial point of the segment + xf = PointAndTangent[i, 0] + yf = PointAndTangent[i, 1] + xs = PointAndTangent[i - 1, 0] + ys = PointAndTangent[i - 1, 1] + psi = PointAndTangent[i, 2] + + # Compute the segment length + deltaL = PointAndTangent[i, 4] + reltaL = s - PointAndTangent[i, 3] + + # Do the linear combination + x = (1 - reltaL / deltaL) * xs + reltaL / deltaL * xf + ey * np.cos(psi + np.pi / 2) + y = (1 - reltaL / deltaL) * ys + reltaL / deltaL * yf + ey * np.sin(psi + np.pi / 2) + if epsi is not None: + vangle = psi + epsi + else: + r = 1 / PointAndTangent[i, 5] # Extract curvature + ang = PointAndTangent[i - 1, 2] # Extract angle of the tangent at the initial point (i-1) + # Compute the center of the arc + direction = 1 if r >= 0 
else -1 + # if r >= 0: + # direction = 1 + # else: + # direction = -1 + + CenterX = PointAndTangent[i - 1, 0] + np.abs(r) * np.cos(ang + direction * np.pi / 2) # x coordinate center of circle + CenterY = PointAndTangent[i - 1, 1] + np.abs(r) * np.sin(ang + direction * np.pi / 2) # y coordinate center of circle + + spanAng = (s - PointAndTangent[i, 3]) / (np.pi * np.abs(r)) * np.pi + + angleNormal = wrap_angle(direction * np.pi / 2 + ang) + + angle = -(np.pi - np.abs(angleNormal)) * (sign(angleNormal)) + + x = CenterX + (np.abs(r) - direction * ey) * np.cos(angle + direction * spanAng) # x coordinate of the last point of the segment + y = CenterY + (np.abs(r) - direction * ey) * np.sin(angle + direction * spanAng) # y coordinate of the last point of the segment + + if epsi is not None: + vangle = epsi + direction * spanAng + PointAndTangent[i - 1, 2] + + if epsi is None: + return x,y + else: + vangle = wrap_angle(vangle) + if vangle_true is not None: + vangle_true = wrap_angle(vangle_true) + # vangle, vangle_true = np.unwrap([vangle, vangle_true]) + if err(vangle - vangle_true, exception=False) > 1e-3: # debug code + print([vangle_true, vangle]) + print("Bad angle, delta: ", vangle - vangle_true) + raise Exception("bad angle") + return x, y, vangle + + def getLocalPosition(self, x, y, psi): + """coordinate transformation from inertial reference frame (X, Y) to curvilinear reference frame (s, ey) + (X, Y): position in the inertial reference frame + """ + PointAndTangent = self.PointAndTangent + CompletedFlag = 0 + + for i in range(0, PointAndTangent.shape[0]): + if CompletedFlag == 1: + break + + if PointAndTangent[i, 5] == 0.0: # If segment is a straight line + # Extract the first final and initial point of the segment + xf = PointAndTangent[i, 0] + yf = PointAndTangent[i, 1] + xs = PointAndTangent[i - 1, 0] + ys = PointAndTangent[i - 1, 1] + + psi_unwrap = np.unwrap([PointAndTangent[i - 1, 2], psi])[1] + epsi = psi_unwrap - PointAndTangent[i - 1, 2] + # Check if on 
the segment using angles + if (la.norm(np.array([xs, ys]) - np.array([x, y]))) == 0: + s = PointAndTangent[i, 3] + ey = 0 + CompletedFlag = 1 + + elif (la.norm(np.array([xf, yf]) - np.array([x, y]))) == 0: + s = PointAndTangent[i, 3] + PointAndTangent[i, 4] + ey = 0 + CompletedFlag = 1 + else: + if np.abs(computeAngle( [x,y] , [xs, ys], [xf, yf])) <= np.pi/2 and np.abs(computeAngle( [x,y] , [xf, yf], [xs, ys])) <= np.pi/2: + v1 = np.array([x,y]) - np.array([xs, ys]) + angle = computeAngle( [xf,yf] , [xs, ys], [x, y]) + s_local = la.norm(v1) * np.cos(angle) + s = s_local + PointAndTangent[i, 3] + ey = la.norm(v1) * np.sin(angle) + + if np.abs(ey)<= self.width: + CompletedFlag = 1 + + else: + xf = PointAndTangent[i, 0] + yf = PointAndTangent[i, 1] + xs = PointAndTangent[i - 1, 0] + ys = PointAndTangent[i - 1, 1] + + r = 1 / PointAndTangent[i, 5] # Extract curvature + direction = 1 if r >= 0 else -1 + # if r >= 0: + # direction = 1 + # else: + # direction = -1 + ang = PointAndTangent[i - 1, 2] # Extract angle of the tangent at the initial point (i-1) + + # Compute the center of the arc + CenterX = xs + np.abs(r) * np.cos(ang + direction * np.pi / 2) # x coordinate center of circle + CenterY = ys + np.abs(r) * np.sin(ang + direction * np.pi / 2) # y coordinate center of circle + + # Check if on the segment using angles + if (la.norm(np.array([xs, ys]) - np.array([x, y]))) == 0: + ey = 0 + psi_unwrap = np.unwrap([ang, psi])[1] + epsi = psi_unwrap - ang + s = PointAndTangent[i, 3] + CompletedFlag = 1 + elif (la.norm(np.array([xf, yf]) - np.array([x, y]))) == 0: + s = PointAndTangent[i, 3] + PointAndTangent[i, 4] + ey = 0 + psi_unwrap = np.unwrap([PointAndTangent[i, 2], psi])[1] + epsi = psi_unwrap - PointAndTangent[i, 2] + CompletedFlag = 1 + else: + arc1 = PointAndTangent[i, 4] * PointAndTangent[i, 5] + arc2 = computeAngle([xs, ys], [CenterX, CenterY], [x, y]) + if np.sign(arc1) == np.sign(arc2) and np.abs(arc1) >= np.abs(arc2): + v = np.array([x, y]) - 
def curvature_and_angle(self, s):
    """Look up the track segment that contains the curvilinear abscissa ``s``.

    Each row of ``self.PointAndTangent`` describes one segment; this method reads
    column 3 (cumulative start of the segment), column 4 (segment length) and
    column 5 (curvature).

    :param s: Curvilinear abscissa at which the curvature is evaluated. Values of one
        full track length or more are wrapped back onto the first lap.
    :return: Tuple ``(curvature, angle, i)`` where ``i`` is the row index of the segment
        containing ``s``, ``curvature`` is column 5 of that row and ``angle`` is column 4
        of that row (see the review note in the body).
    :raises ValueError: If no segment contains ``s`` (for instance a negative ``s``).
    """
    PointAndTangent = self.PointAndTangent
    TrackLength = PointAndTangent[-1, 3] + PointAndTangent[-1, 4]

    # In case on a lap after the first one. ``>=`` (rather than ``>``) makes s == TrackLength
    # wrap to the start of the track; segments are half-open, so otherwise no segment matches.
    while TrackLength > 0 and s >= TrackLength:
        s = s - TrackLength

    # Given s in [0, TrackLength) find the segment in which the system is evolving.
    seg_start = PointAndTangent[:, 3]
    seg_end = seg_start + PointAndTangent[:, 4]
    hits = np.flatnonzero((s >= seg_start) & (s < seg_end))
    if hits.size == 0:
        raise ValueError(f"s={s} does not lie on any track segment (track length {TrackLength})")
    i = int(hits[0])  # index a scalar explicitly; int() on a 1-d array is deprecated in NumPy

    curvature = PointAndTangent[i, 5]
    # NOTE(review): column 4 is used as the segment *length* elsewhere in this file, while the
    # tangent angle is read from column 2. The value is kept as-is to avoid changing what callers
    # receive -- confirm whether ``PointAndTangent[i, 2]`` was intended here.
    angle = PointAndTangent[i, 4]  # tangent angle of path
    return curvature, angle, i
def computeAngle(point1, origin, point2):
    """Return the signed angle (radians) from ``origin->point1`` to ``origin->point2``.

    The orientation of the angle matches that of the coordinate system: the result is
    ``arctan2(det, dot)`` of the two difference vectors. Only the first two components
    of the points are used.
    """
    first = np.array(point1) - np.array(origin)
    second = np.array(point2) - np.array(origin)

    # arctan2(sin-like term, cos-like term) of the two planar vectors.
    cos_term = first[0] * second[0] + first[1] * second[1]  # dot product
    sin_term = first[0] * second[1] - first[1] * second[0]  # determinant
    return np.arctan2(sin_term, cos_term)


def sign(a):
    """Return 1 when ``a >= 0`` and -1 otherwise.

    Used instead of ``np.sign`` because ``np.sign(0)`` is 0, whereas a zero input must
    count as positive here.
    """
    if a >= 0:
        return 1
    return -1


def wrap_angle(angle):
    """Map ``angle`` (radians, scalar or array) into the interval [-pi, pi)."""
    full_turn = 2 * np.pi
    shifted = np.mod(angle + np.pi, full_turn)
    return shifted - np.pi


def xy_diff(x, y):
    """Return ``x - y`` with the angular component (index 3) wrapped by ``wrap_angle``.

    For a 1-d difference the entry at index 3 is wrapped; otherwise column 3 is wrapped.
    """
    dx = x - y
    if dx.ndim == 1:
        dx[3] = wrap_angle(dx[3])
        return dx
    dx[:, 3] = wrap_angle(dx[:, 3])
    return dx
def err(x, exception=True, tol=1e-5, message="Error too large!"):
    """Return the mean absolute value of ``x`` and complain when it exceeds ``tol``.

    :param x: Scalar or array of residuals.
    :param exception: If true, raise an ``Exception`` when the mean absolute value exceeds ``tol``.
    :param tol: Largest mean absolute value that is accepted silently.
    :param message: Text that is printed (and used as the exception message) on failure.
    :return: The mean absolute value of ``x``.
    """
    mean_abs = np.mean(np.abs(x).flat)
    within_tolerance = not (mean_abs > tol)
    if within_tolerance:
        return mean_abs

    # Report the message, the offending values and the resulting error, one per line.
    print(message, x, mean_abs, sep="\n")
    if exception:
        raise Exception(message)
    return mean_abs
class OnlineFruitShop(BasicFruitShop):
    def price_of_order(self, order):
        """
        Return the total price of an order placed in this shop.

        order = {'apple': 5, 'pear': 2, ...} where the numbers are the quantity (pounds) ordered.

        Hints: Iterate over the order like:
        > for fruit, pounds in order.items()
        > self.cost(fruit) allows you to get the cost of a fruit (inherited from BasicFruitShop)
        > the total is the sum of {pounds} * {cost_per_pound}
        """
        # TODO: 1 lines missing.
        raise NotImplementedError("return the total cost of the order")
+ raise NotImplementedError("Implement function body") + return best_shop + + +if __name__ == '__main__': + "This code runs when you invoke the script from the command line (but not otherwise)" + + """ Quesion 1: Lists and basic data types """ + print("add(2,5) function should return 7, and it returned", add(2, 5)) + + animals = ["cat", "giraffe", "wolf"] + print("The nice animals are", misterfy(animals)) + + """ + This problem represents the probabilities of a loaded die as a dictionary such that + > p(roll=3) = p_dict[3] = 0.15. + """ + p_die = {1: 0.20, + 2: 0.10, + 3: 0.15, + 4: 0.05, + 5: 0.10, + 6: 0.40} + print("Mean roll of die, sum_{i=1}^6 i * p(i) =", mean_value(p_die)) + + order = {'apples': 1.0, + 'oranges': 3.0} + print("The different fruits in the fruit-order is", fruits_ordered(order)) + + """ Part B: A simple class """ + price1 = {"apple": 4, "pear": 8, 'orange': 10} + shop1 = BasicFruitShop("Alis Funky Fruits", price1) + + price2 = {'banana': 9, "apple": 5, "pear": 7, 'orange': 11} + shop2 = BasicFruitShop("Hansen Fruit Emporium", price2) + + fruit = "apple" + print("The cost of", fruit, "in", shop1.name, "is", shop1.cost(fruit)) + print("The cost of", fruit, "in", shop2.name, "is", shop2.cost(fruit)) + + """ Part C: Class inheritance """ + price_of_fruits = {'apples': 2, 'oranges': 1, 'pears': 1.5, 'mellon': 10} + shopA = OnlineFruitShop('shopA', price_of_fruits) + print("The price of the given order in shopA is", shopA.price_of_order(order)) + + """ Part C: Using classes """ + shopB = OnlineFruitShop('shopB', {'apples': 1.0, 'oranges': 5.0}) + + shops = [shopA, shopB] + print("For the order", order, " the best shop is", shop_smart(order, shops).name) + order = {'apples': 3.0} # test with a new order. 
+ print("For the order", order, " the best shop is", shop_smart(order, shops).name) diff --git a/irlc/ex01/__init__.py b/irlc/ex01/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..51d06d4927f23b2f23bf8b39f2b235f268d55ca8 --- /dev/null +++ b/irlc/ex01/__init__.py @@ -0,0 +1,2 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +"""This directory contains the exercises for week 1.""" diff --git a/irlc/ex01/__pycache__/__init__.cpython-311.pyc b/irlc/ex01/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..31afb3e3eb30d987d8b13ffdca6fad503c42634e Binary files /dev/null and b/irlc/ex01/__pycache__/__init__.cpython-311.pyc differ diff --git a/irlc/ex01/__pycache__/agent.cpython-311.pyc b/irlc/ex01/__pycache__/agent.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..38d6cdd2e0550830a1f3c3198566c6e9aa1b77b5 Binary files /dev/null and b/irlc/ex01/__pycache__/agent.cpython-311.pyc differ diff --git a/irlc/ex01/__pycache__/inventory_environment.cpython-311.pyc b/irlc/ex01/__pycache__/inventory_environment.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c931bc5d63994b85ecd853b1a4743d0e84ee3b9c Binary files /dev/null and b/irlc/ex01/__pycache__/inventory_environment.cpython-311.pyc differ diff --git a/irlc/ex01/agent.py b/irlc/ex01/agent.py new file mode 100644 index 0000000000000000000000000000000000000000..093e841165a2d21e3cd6ad8f78e11fb66f34ff52 --- /dev/null +++ b/irlc/ex01/agent.py @@ -0,0 +1,385 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +"""The Agent class. + +References: + [Her24] Tue Herlau. Sequential decision making. (Freely available online), 2024. 
def pi(self, s, k : int, info : typing.Optional[dict] =None):
    r"""Return the action the agent wants to take in state ``s`` at time step ``k``.

    In the notation of the course the environment is in state :math:`x_k`, and this method computes
    the next action

    .. math::
        u_k = \mu_k(x_k)

    so ``s`` = :math:`x_k` and ``k`` = :math:`k =\{0, 1, ...\}`. The returned action must lie in the
    action space of the environment.

    The info dictionary:
        ``info`` carries extra information returned from the environment, for instance by
        ``s, info = env.reset()``. In control it contains ``info['time_seconds']`` (the simulation time
        :math:`t` in seconds). In reinforcement learning the key ``info['mask']`` is used to tell the agent
        that certain actions are unavailable; see the reinforcement learning exercises for details.

    The default behavior is to return a random action. An example:

    .. runblock:: pycon

        >>> from irlc.pacman.pacman_environment import PacmanEnvironment
        >>> from irlc import Agent
        >>> env = PacmanEnvironment()
        >>> s, info = env.reset()
        >>> agent = Agent(env)
        >>> agent.pi(s, k=0, info=info) # get a random action
        >>> agent.pi(s, k=0)            # If info is not specified, all actions are assumed permissible.

    :param s: Current state the environment is in.
    :param k: Current time step.
    :param info: Optional information dictionary belonging to ``s``; may contain a ``'mask'`` entry.
    :return: The action the agent wants to take. By default a random sample from ``self.env.action_space``.
    """
    action_space = self.env.action_space
    has_mask = info is not None and 'mask' in info
    if not has_mask:
        return action_space.sample()

    # When the available actions differ between states, the environment puts a ``mask`` entry in the
    # info dictionary. It is forwarded to ``action_space.sample`` so no illegal action is sampled.
    mask = info['mask']
    if mask.max() > 1:
        raise Exception("Bad mask!")
    return action_space.sample(mask=mask)
def train(env,
          agent=None,
          experiment_name=None,
          num_episodes=1,
          verbose=True,
          reset=True,  # If True we will call env.reset() upon episode start.
          max_steps=1e10,
          max_runs=None,
          return_trajectory=True,  # Return the current trajectories as a list
          resume_stats=None,  # Resume stat collection from last save.
          log_interval=1,  # Only log every log_interval steps. Reduces size of log files.
          delete_old_experiments=False,  # Remove the old experiments folder. Useful while debugging a model (or to conserve disk space)
          seed=None,  # Attempt to set the seed of the random number generator to produce reproducible results.
          ):
    """This function implements the main training loop as described in (Her24, Subsection 4.4.4).

    The loop will simulate the interaction between agent `agent` and the environment `env`.
    The function has a lot of special functionality, so it is useful to consider the common cases. An example:

    >>> stats, _ = train(env, agent, num_episodes=2)

    Simulate interaction for two episodes (i.e. environment terminates two times and is reset).
    `stats` will be a list of length two containing information from each run

    >>> stats, trajectories = train(env, agent, num_episodes=2, return_trajectory=True)

    `trajectories` will be a list of length two containing information from the two trajectories.

    >>> stats, _ = train(env, agent, experiment_name='experiments/my_run', num_episodes=2)

    Save `stats`, and trajectories, to a file which can easily be loaded/plotted (see course software for examples of this).
    The file will be time-stamped so using several calls you can repeat the same experiment (run) many times.

    >>> stats, _ = train(env, agent, experiment_name='experiments/my_run', num_episodes=2, max_runs=10)

    As above, but do not perform more than 10 runs. Useful for repeated experiments.

    :param env: A gymnasium ``Env`` instance (the environment)
    :param agent: An ``Agent`` instance. If ``None``, an ``Agent(env)`` (random actions) is created.
    :param experiment_name: The outcome of this experiment will be saved in a folder with this name. This will allow you to run multiple (repeated) experiment and visualize the results in a single plot, which is very important in reinforcement learning.
    :param num_episodes: Number of episodes to simulate
    :param verbose: Display progress bar
    :param reset: Call ``env.reset()`` before simulation start. Default is ``True``. Setting it to ``False`` is only useful in very rare cases.
    :param max_steps: Terminate if this many steps have elapsed (for non-terminating environments)
    :param max_runs: Maximum number of repeated experiments (requires ``experiment_name``)
    :param return_trajectory: Record and return the trajectories (on by default; turn off if memory consumption is a concern)
    :param resume_stats: Resume stat collection from last run (this requires the ``experiment_name`` variable to be set)
    :param log_interval: Log stats less frequently than each episode. Useful if you want to run really long experiments.
    :param delete_old_experiments: If true, old saved experiments will be deleted. This is useful during debugging.
    :param seed: An integer. The random number generator of the environment will be reset to this seed (on the first reset only) allowing for reproducible results.
    :return: A tuple ``(stats, trajectories)``. ``stats`` is a list with one dictionary of statistics per logged episode; ``trajectories`` is a list of ``Trajectory`` tuples (empty if ``return_trajectory`` is false).
    """
    from collections import defaultdict
    from irlc import cache_write
    from irlc import cache_read
    saveload_model = False
    save_stats = True
    if agent is None:
        print("[train] No agent was specified. Using irlc.Agent(env) (this agent selects actions at random)")
        agent = Agent(env)

    if delete_old_experiments and experiment_name is not None and os.path.isdir(experiment_name):
        shutil.rmtree(experiment_name)

    # If enough runs have already been saved, return the most recent one instead of simulating again.
    if experiment_name is not None and max_runs is not None and existing_runs(experiment_name) >= max_runs:
        stats, recent = load_time_series(experiment_name=experiment_name)
        if return_trajectory:
            trajectories = cache_read(recent+"/trajectories.pkl")
        else:
            trajectories = []
        return stats, trajectories

    stats = []
    steps = 0
    ep_start = 0
    resume_stats = saveload_model if resume_stats is None else resume_stats

    recent = None
    if resume_stats:
        stats, recent = load_time_series(experiment_name=experiment_name)
        if recent is not None:
            ep_start, steps = stats[-1]['Episode']+1, stats[-1]['Steps']

    trajectories = []
    break_outer = False
    rate = int(num_episodes / 100)  # update the progress-bar postfix roughly 100 times in total

    with tqdm(total=num_episodes, disable=not verbose, file=sys.stdout, mininterval=int(num_episodes/100) if num_episodes>100 else None) as tq:
        for i_episode in range(num_episodes):
            if break_outer:
                break
            info_s = {}
            if reset or i_episode > 0:
                if seed is not None:
                    s, info_s = env.reset(seed=seed)
                    seed = None  # only seed the very first reset
                else:
                    s, info_s = env.reset()
            elif hasattr(env, "s"):  # reset=False: continue from whatever state the environment exposes
                s = env.s
            elif hasattr(env, 'state'):
                s = env.state
            else:
                s = env.model.s

            reward = []
            trajectory = Trajectory(time=[], state=[], action=[], reward=[], env_info=[])
            k = 0  # time step within the episode
            for _ in itertools.count():
                # policy is always temporal
                a = agent.pi(s, k, info_s)
                k = k + 1
                sp, r, terminated, truncated, info_sp = env.step(a)
                done = terminated or truncated

                if info_sp is not None and 'mask' in info_sp and info_sp['mask'].max() > 1:
                    print("bad")

                agent.train(s, a, r, sp, done, info_s, info_sp)

                if return_trajectory:
                    trajectory.time.append(np.asarray(info_s['time_seconds'] if 'time_seconds' in info_s else steps))
                    trajectory.state.append(s)
                    trajectory.action.append(a)
                    trajectory.reward.append(np.asarray(r))
                    trajectory.env_info.append(info_s)

                reward.append(r)
                steps += 1

                if done or steps >= max_steps:
                    trajectory.state.append(sp)
                    trajectory.env_info.append(info_sp)
                    # The terminal time belongs to ``sp``, so both the membership test and the lookup
                    # must use ``info_sp`` (the test previously inspected ``info_s``).
                    has_time = info_sp is not None and 'time_seconds' in info_sp
                    trajectory.time.append(np.asarray(info_sp['time_seconds'] if has_time else steps))
                    break_outer = steps >= max_steps
                    break
                s = sp
                info_s = info_sp

            if return_trajectory:
                try:
                    from irlc.ex04.control_environment import ControlEnvironment
                    if isinstance(env, ControlEnvironment):  # TODO: this is too hacky. States/actions should be lists, and subsequent methods should stack.
                        trajectory = Trajectory(**{field: np.stack([np.asarray(x_) for x_ in getattr(trajectory, field)]) for field in fields}, env_info=trajectory.env_info)
                except Exception:
                    # Deliberately best-effort: the control module may be unavailable, or the entries
                    # may not stack. In either case the list-based trajectory is kept.
                    pass

                trajectories.append(trajectory)

            if (i_episode + 1) % log_interval == 0:
                stats.append({"Episode": i_episode + ep_start,
                              "Accumulated Reward": sum(reward),
                              "Length": len(reward),
                              "Steps": steps,  # Useful for deep learning applications. This should be kept, or week 13 will have issues.
                              **agent.extra_stats()})

            if rate > 0 and i_episode % rate == 0 and len(stats) > 0:
                tq.set_postfix(ordered_dict=OrderedDict(list(OrderedDict(stats[-1]).items())[:5]))
            tq.update()

    sys.stderr.flush()

    if resume_stats and save_stats and recent is not None:
        os.remove(recent+"/log.txt")

    if experiment_name is not None and save_stats:
        path = log_time_series(experiment=experiment_name, list_obs=stats)
        if return_trajectory:
            cache_write(trajectories, path+"/trajectories.pkl")

        # ``stats`` is empty when log_interval > num_episodes; do not index into it in that case.
        logged_keys = ', '.join(stats[0].keys()) if stats else ''
        print(f"Training completed. Logging {experiment_name}: '{logged_keys}'")

    # If the environment attached finer-grained ('supersample') trajectories to the info dictionaries,
    # replace each coarse trajectory by the concatenation of its supersampled pieces.
    for i, t in enumerate(trajectories):
        if t.env_info is None or len(t.env_info) < 2:
            continue
        if t.env_info[1] is None or "supersample" not in t.env_info[1]:
            continue
        nt = defaultdict(list)
        for f in fields:
            for j, ei in enumerate(t.env_info):
                if 'supersample' not in ei:
                    continue
                z = getattr(ei['supersample'], f).T
                if j > 0:
                    z = z[1:]  # presumably drops the sample shared with the previous piece -- TODO confirm
                nt[f].append(z)

        for f in fields:
            nt[f] = np.concatenate(nt[f], axis=0)
        trajectories[i] = Trajectory(**nt, env_info=[])

    return stats, trajectories
class BobFriendEnvironment(gymnasium.Env):
    """Exercise environment with two actions; the bodies of ``reset`` and ``step`` are left to be implemented.

    The state is stored in ``self.s`` (returned by ``reset``), and ``x0`` is the initial value.
    """

    def __init__(self, x0=20):
        """Store the initial value ``x0`` and declare the two-element action space."""
        self.x0 = x0
        self.action_space = Discrete(2)  # Possible actions {0, 1}

    def reset(self):
        """Reset the environment and return ``(self.s, {})``, i.e. the state and an empty info dictionary."""
        # TODO: 1 lines missing.
        raise NotImplementedError("Insert your solution and remove this error.")
        return self.s, {}

    def step(self, a):
        """Apply action ``a`` and return ``(s_next, reward, terminated, False, {})``.

        The fourth entry (truncated) is always ``False`` and the info dictionary is empty.
        """
        # TODO: 9 lines missing.
        raise NotImplementedError("Insert your solution and remove this error.")
        return s_next, reward, terminated, False, {}
class ChessTournament(Env):
    """The ChessTournament gymnasium-environment which simulates a chess tournament.

    In the problem, a chess tournament ends when a player wins two games in a row. The results
    of each game are -1, 0, 1 corresponding to a loss, draw and win for player 1. See:
    https://www.youtube.com/watch?v=5UQU1oBpAic

    To implement this, we define the step-function such that one episode of the environment corresponds to playing
    a chess tournament to completion. Once the environment completes, it returns a reward of +1 if the player won
    the tournament, and otherwise 0.

    Each step therefore corresponds to playing a single game in the tournament.
    To implement this, we use a state corresponding to the sequence of games in the tournament:

    >>> self.s = [0, -1, 1, 0, 0, 1]

    In the self.step(action)-function, we ignore the action, simulate the outcome of a single game,
    and append the outcome to self.s. We then compute whether the tournament has completed, and if so
    a reward of 1 if we won.
    """

    def __init__(self, p_draw=3 / 4, p_win=2 / 3):
        """Store the draw/win probabilities and start with an empty tournament.

        :param p_draw: Probability compared against in the (to be uncommented) draw test of ``step``.
        :param p_win: Probability compared against in the (to be uncommented) win test of ``step``.
        """
        self.action_space = Discrete(1)
        self.p_draw = p_draw
        self.p_win = p_win
        self.s = []  # A chess tournament is a sequence of won/lost games s = [0, -1, 1, 0, ...]

    def reset(self):
        """Reset the tournament environment to begin to simulate a new tournament.

        After each episode is complete, this function will reset :python:`self.s` and return the current state s and an empty dictionary.
        :return:
            - s - The initial state (what is it?)
            - info - An empty dictionary, ``{}``
        """
        # TODO: 1 lines missing.
        raise NotImplementedError("Implement function body")
        return self.s, {}

    def step(self, action):
        """Play a single game in the current tournament

        The variable action is required by gymnasium but it is not used since no (player) actions occur in this problem.

        The step-method should update ``self.s`` to be the next (new) state, compute the reward, and determine whether
        the environment has terminated (:python:`done`).

        :param action: This input is required by gymnasium but it is not used in this case.
        :return: A tuple of the form :python:`(new_state, reward, done, False, {})`
        """
        game_outcome = None  # should be -1, 0, or 1 depending on outcome of single game.
        ## TODO: Oy veh, the following 7 lines below have been permuted. Uncomment, rearrange to the correct order and remove the error.
        #-------------------------------------------------------------------------------------------------------------------------------
        # else:
        # else:
        # game_outcome = 1
        # if np.random.rand() < self.p_win:
        # game_outcome = -1
        # game_outcome = 0
        # if np.random.rand() < self.p_draw:
        raise NotImplementedError("Compute game_outcome here")
        self.s.append(game_outcome)

        #done = True if the tournament has ended otherwise false. Compute using s.
        # TODO: 1 lines missing.
        raise NotImplementedError("Compute 'done', whether the tournament has ended.")
        # r = ... . Compute reward. Let r=1 if we won the tournament otherwise 0.
        # TODO: 1 lines missing.
        raise NotImplementedError("Compute the reward 'r' here.")
        return self.s, r, done, False, {}
class InventoryEnvironment(Env):
    """Gymnasium environment for the inventory exercise.

    The state ``self.s`` and the time step ``self.k`` are both set to 0 by :meth:`reset`. Actions and
    observations are declared as ``Discrete(3)``, i.e. the integers 0, 1 and 2.
    """

    def __init__(self, N=2):
        """Store the planning horizon and declare the action/observation spaces.

        :param N: The planning horizon, stored as ``self.N``.
        """
        self.N = N                               # planning horizon
        self.action_space      = Discrete(3)     # Possible actions {0, 1, 2}
        self.observation_space = Discrete(3)     # Possible observations {0, 1, 2}

    def reset(self):
        """Reset the state and the time-step counter to zero.

        :return: The tuple ``(self.s, {})``, i.e. the initial state 0 and an empty info dictionary.
        """
        self.s = 0                               # reset initial state x0=0
        self.k = 0                               # reset time step k=0
        return self.s, {}                        # Return the state we reset to (and an empty dict)

    def step(self, a):
        """Take action ``a`` in the environment (exercise stub).

        A disturbance ``w`` is drawn from {0, 1, 2} with probabilities (0.1, 0.7, 0.2). The rest of the transition
        is left for the student: as the code stands the method raises :class:`NotImplementedError` and the
        ``return`` statement is never reached.

        :param a: The action to take.
        :return: Intended to be ``(s_next, reward, terminated, False, {})`` once implemented.
        """
        w = np.random.choice(3, p=(.1, .7, .2))       # Generate random disturbance
        # TODO: 5 lines missing.
        raise NotImplementedError("Insert your solution and remove this error.")
        return s_next, reward, terminated, False, {}  # return transition information
def run_inventory():
    """Demonstrate three ways of estimating the average cost of a policy on the inventory environment.

    1. One rollout, then 1000 rollouts, of ``RandomAgent`` using ``train``.
    2. 1000 rollouts of the base ``Agent`` class using ``train``.
    3. 1000 rollouts of ``RandomAgent`` using ``simplified_train``.

    The accumulated rewards are averaged and printed with the sign flipped, i.e. as a cost.
    """
    env = InventoryEnvironment()
    agent = RandomAgent(env)
    stats, _ = train(env,agent,num_episodes=1,verbose=False)  # Perform one rollout.
    print("Accumulated reward of first episode", stats[0]['Accumulated Reward'])
    # I recommend inspecting 'stats' in a debugger; why do you think it is a list of length 1?

    stats, _ = train(env, agent, num_episodes=1000,verbose=False)  # do 1000 rollouts
    avg_reward = np.mean([stat['Accumulated Reward'] for stat in stats])
    print("[RandomAgent class] Average cost of random policy J_pi_random(0)=", -avg_reward)
    # Try to inspect stats again in a debugger here. How long is the list now?

    stats, _ = train(env, Agent(env), num_episodes=1000,verbose=False)  # Perform 1000 rollouts using Agent class
    avg_reward = np.mean([stat['Accumulated Reward'] for stat in stats])
    print("[Agent class] Average cost of random policy J_pi_random(0)=", -avg_reward)

    """ Second part: Using the simplified training method. I.e. do not use train() below.
    You can find some pretty strong hints about what goes on in simplified_train in the lecture slides for today. """
    avg_reward_simplified_train = np.mean( [simplified_train(env, agent) for i in range(1000)])
    print("[simplified train] Average cost of random policy J_pi_random(0) =", -avg_reward_simplified_train)
+ raise NotImplementedError("Implement function body") + return 'West' + +if __name__ == "__main__": + # Create an environment with the given layout. animate_movement is just for a nicer visualization. + env = PacmanEnvironment(layout_str=layout, render_mode='human') + # This creates a visualization (Note this makes the environment slower) which can help us see what Pacman does + # This create the GoAroundAgent-instance + agent = GoAroundAgent(env) + # Uncomment the following line to input actions instead of the agent using the keyboard: + # env, agent = interactive(env, agent) + s, info = env.reset() # Reset (and start) the environment + + savepdf("pacman_roundabout.pdf", env=env) # Saves a snapshot of the start layout + # The next two lines display two ways to get the available actions. The 'canonical' way using the + # env.action_space, and a way particular to Pacman by using the s.A() function on the state. + # You can read more about the functions in the state in project 1. + # print("Available actions at start:", env.action_space.actions) # This will list the available actions. + print("Alternative way of getting actions:", s.A()) # See also project description + + # Simulate the agent for one episode + stats, _ = train(env, agent, num_episodes=1) + # Print your obtained score. + print("Your obtained score was", stats[0]['Accumulated Reward']) + env.close() # When working with visualizations, call env.close() to close windows it may have opened. " diff --git a/irlc/ex02/__init__.py b/irlc/ex02/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..97bfecdc3daf77e00a1987adf3a7d3dba98c5aa4 --- /dev/null +++ b/irlc/ex02/__init__.py @@ -0,0 +1,2 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
+"""This directory contains the exercises for week 2.""" diff --git a/irlc/ex02/__pycache__/__init__.cpython-311.pyc b/irlc/ex02/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4c0e4230c872e1b4fd8be12ef0dc7e0d21ce5642 Binary files /dev/null and b/irlc/ex02/__pycache__/__init__.cpython-311.pyc differ diff --git a/irlc/ex02/__pycache__/dp.cpython-311.pyc b/irlc/ex02/__pycache__/dp.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4a183932fe284d48d4142290e953c73af60cfb31 Binary files /dev/null and b/irlc/ex02/__pycache__/dp.cpython-311.pyc differ diff --git a/irlc/ex02/__pycache__/dp_model.cpython-311.pyc b/irlc/ex02/__pycache__/dp_model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4d9cd5bd2a1e51fd4f90d8072be63d1e1bea7167 Binary files /dev/null and b/irlc/ex02/__pycache__/dp_model.cpython-311.pyc differ diff --git a/irlc/ex02/__pycache__/graph_traversal.cpython-311.pyc b/irlc/ex02/__pycache__/graph_traversal.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..539851022997c8a7f7a92f9e87749c2a08229cad Binary files /dev/null and b/irlc/ex02/__pycache__/graph_traversal.cpython-311.pyc differ diff --git a/irlc/ex02/__pycache__/inventory.cpython-311.pyc b/irlc/ex02/__pycache__/inventory.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d5f7d3211a68b3c18faebe51fd3aa2a340afe6df Binary files /dev/null and b/irlc/ex02/__pycache__/inventory.cpython-311.pyc differ diff --git a/irlc/ex02/dp.py b/irlc/ex02/dp.py new file mode 100644 index 0000000000000000000000000000000000000000..853d188f68d0127c1dfc228704825364868b5ed2 --- /dev/null +++ b/irlc/ex02/dp.py @@ -0,0 +1,71 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
def DP_stochastic(model: DPModel):
    r"""
    Implement the stochastic DP algorithm. The implementation follows (Her24, Algorithm 1).
    Once you are done, you should be able to call the function as:

    .. runblock:: pycon

        >>> from irlc.ex02.graph_traversal import SmallGraphDP
        >>> from irlc.ex02.dp import DP_stochastic
        >>> model = SmallGraphDP(t=5)  # Instantiate the small graph with target node 5
        >>> J, pi = DP_stochastic(model)
        >>> print(pi[0][2]) # Action taken in state ``x=2`` at time step ``k=0``.

    .. note::
        This is an exercise stub: the inner update is missing, so the function currently raises
        :class:`NotImplementedError` as soon as the first state is visited.

    :param model: An instance of :class:`irlc.ex02.dp_model.DPModel` class. This represents the problem we wish to solve.
    :return:
        - ``J`` - A list of cost functions so that ``J[k][x]`` represents :math:`J_k(x)`
        - ``pi`` - A list of dictionaries so that ``pi[k][x]`` represents :math:`\mu_k(x)`
    """
    # In case you run into problems, I recommend following the hints in (Her24, Subsection 6.2.1) and focus on the
    # case without a noise term; once it works, you can add the w-terms. When you don't loop over noise terms, just
    # specify them as w = None in env.f and env.g.
    N = model.N
    J = [{} for _ in range(N + 1)]   # J[k][x] will hold the cost-to-go J_k(x), for k = 0, ..., N
    pi = [{} for _ in range(N)]      # pi[k][x] will hold the action chosen in state x at step k
    J[N] = {x: model.gN(x) for x in model.S(model.N)}  # Terminal cost initializes the backward recursion.
    for k in range(N-1, -1, -1):     # Backward in time: k = N-1, ..., 0
        for x in model.S(k):
            # Update pi[k][x] and J[k][x] using the general DP algorithm given in (Her24, Algorithm 1).
            # If you implement it using the pseudo-code, I recommend you define Q (from the algorithm) as a
            # dictionary like the J-function such that
            #
            # > Q[u] = Q_u (for all u in model.A(x,k))
            #
            # Then you find the u with the lowest value of Q_u, i.e.
            #
            # > umin = arg_min_u Q[u]
            #
            # (for help, google: `python find key in dictionary with minimum value').
            # Then you can use this to update J[k][x] = Q_umin and pi[k][x] = umin.

            # TODO: 4 lines missing.
            raise NotImplementedError("Insert your solution and remove this error.")
            # After the above update it should be the case that:
            #
            # J[k][x] = J_k(x)
            # pi[k][x] = pi_k(x)
    return J, pi
class DPModel:
    r""" The Dynamical Programming model class

    The purpose of this class is to translate a dynamical programming problem, defined by the equations,

    .. math::

        x_{k+1} & = f_k(x_k, u_k, w_k) \\
        \text{cost} & = g_k(x_k, u_k, w_k) \\
        \text{terminal cost} & = g_N(x_N) \\
        \text{Noise disturbances:} \quad w_k & \sim P_W(w_k | x_k, u_k) \\
        \text{State/action spaces:} \quad & \mathcal A_k(x_k), \mathcal S_k

    into a single python object which we can then use for planning.

    .. Note::

        This is the first time many of you encounter a class. If so, you might wonder why you can't just implement
        the functions as usual, i.e. ``def f(x, k, ...):``, ``def g(x, k, ...):``,
        as regular python function and just let that be it?

        The reason is that we want to pass all these function (which taken together represents a planning problem)
        to planning methods such as the DP-algorithm (see the function :func:`~irlc.ex02.dp.DP_stochastic`)
        all at once.
        It is not very convenient to pass the functions one at a time -- instead we collect them into a class and simply call the function as

        >>> from irlc.ex02.inventory import InventoryDPModel
        >>> from irlc.ex02.dp import DP_stochastic
        >>> model = InventoryDPModel()      # Initialize the model
        >>> J, pi = DP_stochastic(model)    # All functions are passed to DP_stochastic



    To actually use the model, you need to extend it and implement the methods. The basic recipe for this is something like::

        class MyDPModel(DPModel):
            def f(self, x, u, w, k): # Note the `self`-variable. You can use it to access class variables such as`self.N`.
                return x + u - w     # Just an example
            def S(self, k):
                return [0, 1, 2]    # State space S_k = {0, 1, 2}
                # Implement the other functions A, g, gN and Pw here.


    You should take a look at :func:`~irlc.ex02.inventory.InventoryDPModel` for a concrete example.
    Once the functions have been implemented, you can call them as:

    .. runblock:: pycon

        >>> from irlc.ex02.inventory import InventoryDPModel
        >>> model = InventoryDPModel(N=5) # Plan on a horizon of 5
        >>> print("State space S_2", model.S(2))
        >>> model.f(x=1, u=2, w=1, k=0)   # Just an example. You don't have to use named arguments, although it helps on readability.
        >>> model.A(1, k=2)               # Action space A_2(1), i.e. the actions available at time step k=2 in state x=1.

    """
    def __init__(self, N):
        """
        Called when the DP Model is initialized. By default, it simply stores the planning horizon ``N``

        :param N: The planning horizon in the DP problem :math:`N`
        """
        self.N = N  # Store the planning horizon.

    def f(self, x, u, w, k: int):
        """
        Implements the transition function :math:`x_{k+1} = f_k(x, u, w)` and returns the next state :math:`x_{k+1}`

        :param x: The state :math:`x_k`
        :param u: The action taken :math:`u_k`
        :param w: The random noise disturbance :math:`w_k`
        :param k: The current time step :math:`k`
        :return: The state the environment (deterministically) transitions to, i.e. :math:`x_{k+1}`
        :raises NotImplementedError: Always, until a subclass overrides this method.
        """
        raise NotImplementedError("Return f_k(x,u,w)")

    def g(self, x, u, w, k: int) -> float:
        """
        Implements the cost function :math:`c = g_k(x, u, w)` and returns the cost :math:`c`

        :param x: The state :math:`x_k`
        :param u: The action taken :math:`u_k`
        :param w: The random noise disturbance :math:`w_k`
        :param k: The current time step :math:`k`
        :return: The cost (as a ``float``) incurred by the environment, i.e. :math:`g_k(x, u, w)`
        :raises NotImplementedError: Always, until a subclass overrides this method.
        """
        raise NotImplementedError("Return g_k(x,u,w)")

    def gN(self, x) -> float:
        """
        Implements the terminal cost function :math:`c = g_N(x)` and returns the terminal cost :math:`c`.

        :param x: A state seen at the last time step :math:`x_N`
        :return: The terminal cost (as a ``float``) incurred by the environment, i.e. :math:`g_N(x)`
        :raises NotImplementedError: Always, until a subclass overrides this method.
        """
        raise NotImplementedError("Return g_N(x)")

    def S(self, k: int):
        r"""
        Computes the state space :math:`\mathcal S_k` at time step :math:`k`.
        In other words, this function returns a set of all states the system can possibly be in at time step :math:`k`.

        .. Note::
            I think the cleanest implementation is one where this function returns a python ``set``. However, it won't matter
            if the function returns a ``list`` or ``tuple`` instead.

        :param k: The current time step :math:`k`
        :return: The state space (as a ``list`` or ``set``) available at time step ``k``, i.e. :math:`\mathcal S_k`
        :raises NotImplementedError: Always, until a subclass overrides this method.
        """
        raise NotImplementedError("Return state space as set S_k = {x_1, x_2, ...}")

    def A(self, x, k: int):
        r"""
        Computes the action space :math:`\mathcal A_k(x)` at time step :math:`k` in state `x`.

        In other words, this function returns a ``set`` of all actions the agent can take in time step :math:`k`.

        .. Note::
            An example where the actions depend on the state is chess (in this case, the state is board position, and the actions are the legal moves)

        :param k: The current time step :math:`k`
        :param x: The state we want to compute the actions in :math:`x_k`
        :return: The action space (as a ``list`` or ``set``) available at time step ``k``, i.e. :math:`\mathcal A_k(x_k)`
        :raises NotImplementedError: Always, until a subclass overrides this method.
        """
        raise NotImplementedError("Return action space as set A(x_k) = {u_1, u_2, ...}")

    def Pw(self, x, u, k: int):
        """
        Returns the random noise disturbances and their probability. In other words, this function implements the distribution:

        .. math::

            P_k(w_k | x_k, u_k)

        To implement this distribution, we must keep track of both the possible values of the noise disturbances :math:`w_k`
        as well as the (numerical) value of their probability :math:`p(w_k| ...)`.

        To do this, the function returns a dictionary of the form ``P = {w1: p_w1, w2: p_w2, ...}`` where

        - The keys ``w`` represents random noise disturbances
        - the values ``P[w]`` represents their probability (i.e. a ``float``)

        This can hopefully be made more clear with the Inventory environment:

        .. runblock:: pycon

            >>> from irlc.ex02.inventory import InventoryDPModel
            >>> model = InventoryDPModel(N=5) # Plan on a horizon of 5
            >>> print("Random noise disturbances in state x=1 using action u=0 is:", model.Pw(x=1, u=0, k=0))
            >>> for w, pw in model.Pw(x=1, u=0, k=0).items(): # Iterate and print:
            ...     print(f"p_k({w}|x, u) =", pw)


        :param x: The state :math:`x_k`
        :param u: The action taken :math:`u_k`
        :param k: The current time step :math:`k`
        :return: A dictionary representing the distribution of random noise disturbances :math:`P_k(w |x_k, u_k)` of the form ``{..., w_i: pw_i, ...}`` such that ``pw_i = P_k(w_i | x, u)``
        """
        # Compute and return the random noise disturbances here.
        # As an example:
        return {'w_dummy': 1/3, 42: 2/3}  # P(w_k="w_dummy") = 1/3, P(w_k =42)=2/3.

    def w_rnd(self, x, u, k):
        r"""
        This helper function generates a random noise disturbance using the function
        :func:`irlc.ex02.dp_model.DPModel.Pw`, i.e. it returns a sample:

        .. math::
            w \sim P_k(x_k, u_k)

        This will be useful for simulating the model.

        .. Note::
            You don't have to implement or change this function.

        :param x: The state :math:`x_k`
        :param u: The action taken :math:`u_k`
        :param k: The current time step :math:`k`
        :return: A random noise disturbance :math:`w` distributed as :math:`P_k(x_k, u_k)`. The returned object is
            one of the keys of the dictionary returned by :meth:`Pw`, unchanged.
        """
        pW = self.Pw(x, u, k)
        w, pw = zip(*pW.items())  # separate w and p(w)
        # Sample an *index* rather than passing ``w`` itself to numpy: ``np.random.choice(a=w, ...)`` first converts
        # ``w`` to an array, which turns mixed keys such as ('w_dummy', 42) into strings and fails on tuple-valued
        # disturbances. Indexing the original tuple returns the key exactly as it appears in ``Pw``.
        index = np.random.choice(len(w), p=pw)
        return w[index]
def symG(G):
    """Make the graph ``G`` symmetric, in place.

    ``G`` is a dictionary mapping edges ``(a, b)`` to their cost. For every edge ``(a, b)`` with cost ``c`` the
    reverse edge ``(b, a)`` is added with the same cost ``c``, unless ``G`` already contains ``(b, a)``: an
    explicitly given reverse edge keeps its own cost instead of being overwritten.

    :param G: Dictionary of the form ``{(a, b): cost, ...}``. It is modified in place.
    :return: ``None``
    """
    # Iterate over a snapshot since we insert into G while looping.
    for (a, b), cost in list(G.items()):
        G.setdefault((b, a), cost)
""" + def __init__(self, t, G=None): + self.G = G.copy() if G is not None else G222.copy() + self.G[(t,t)] = 0 # make target vertex absorbing + self.t = t # target vertex in graph + self.nodes = {node for edge in self.G for node in edge} # set of all nodes + super(SmallGraphDP, self).__init__(N=len(self.nodes)-1) + + def f(self, x, u, w, k): + if (x,u) in self.G: + # TODO: 1 lines missing. + raise NotImplementedError("Implement function body") + else: + raise Exception("Nodes are not connected") + + def g(self, x, u, w, k): + # TODO: 1 lines missing. + raise NotImplementedError("Implement function body") + + def gN(self, x): + # TODO: 1 lines missing. + raise NotImplementedError("Implement function body") + + def S(self, k): + return self.nodes + + def A(self, x, k): + return {j for (i,j) in self.G if i == x} + +def main(): + t = 5 # target node + model = SmallGraphDP(t=t) + x0 = 1 # starting node + k = 0 + w = 0 # irrelevant. + u = 2 # as an example. + print(f"{model.f(x0, u, w, k)=} (should be 2)") + print(f"{model.g(x0, u, w, k)=} (should be 6)") + print(f"{model.gN(x0)=} (should be np.inf)") + print(f"{model.S(k)=}", "(should be {1, 2, 3, 4, 5})") + print(f"{model.A(x0, k)=}", "(should be {2, 3, 4, 5})") + print("Run the tests to check your implementation.") + +if __name__ == '__main__': + main() diff --git a/irlc/ex02/inventory.py b/irlc/ex02/inventory.py new file mode 100644 index 0000000000000000000000000000000000000000..74c8eb8a8c8b2fb0580d6d2d6e71f3223f5f6940 --- /dev/null +++ b/irlc/ex02/inventory.py @@ -0,0 +1,44 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +""" +Implements the inventory-control problem from (Her24, Subsection 5.1.2). + +References: + [Her24] Tue Herlau. Sequential decision making. (Freely available online), 2024. 
class InventoryDPModel(DPModel):
    """DP model of the inventory-control problem from (Her24, Subsection 5.1.2).

    States and actions are both the integers 0, 1 and 2. The disturbance distribution ``Pw`` is left
    as an exercise.
    """

    def __init__(self, N=3):
        """Create the model with planning horizon ``N`` (default 3)."""
        super().__init__(N=N)

    def A(self, x, k):
        """Action space A_k(x): the set {0, 1, 2}, independent of ``x`` and ``k``."""
        return set(range(3))

    def S(self, k):
        """State space S_k: the set {0, 1, 2}, independent of ``k``."""
        return set(range(3))

    def g(self, x, u, w, k):
        """Cost function g_k(x, u, w) = u + (x + u - w)^2."""
        balance = x + u - w
        return u + balance ** 2

    def f(self, x, u, w, k):
        """Dynamics f_k(x, u, w): the value x + u - w clamped to the interval [0, 2]."""
        balance = x + u - w
        capped = min(2, balance)
        return max(0, capped)

    def Pw(self, x, u, k):
        """Distribution over random disturbances (exercise stub; raises ``NotImplementedError``)."""
        # TODO: 1 lines missing.
        raise NotImplementedError("Implement function body")

    def gN(self, x):
        """Terminal cost g_N(x), which is 0 for every state."""
        return 0
+"""This directory contains the exercises for week 3.""" diff --git a/irlc/ex03/__pycache__/__init__.cpython-311.pyc b/irlc/ex03/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e5b4f16c809b017449ec5929b058171c20265711 Binary files /dev/null and b/irlc/ex03/__pycache__/__init__.cpython-311.pyc differ diff --git a/irlc/ex03/__pycache__/basic_pendulum.cpython-311.pyc b/irlc/ex03/__pycache__/basic_pendulum.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8aea6b565bd95c52b88ab61c24199f4f2fc19207 Binary files /dev/null and b/irlc/ex03/__pycache__/basic_pendulum.cpython-311.pyc differ diff --git a/irlc/ex03/__pycache__/control_cost.cpython-311.pyc b/irlc/ex03/__pycache__/control_cost.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fbd4fa86d1447a69fa82333373f93802475e67a2 Binary files /dev/null and b/irlc/ex03/__pycache__/control_cost.cpython-311.pyc differ diff --git a/irlc/ex03/__pycache__/control_model.cpython-311.pyc b/irlc/ex03/__pycache__/control_model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..30de5a19f87513667a5d380652c9bb051c97ccc2 Binary files /dev/null and b/irlc/ex03/__pycache__/control_model.cpython-311.pyc differ diff --git a/irlc/ex03/basic_pendulum.py b/irlc/ex03/basic_pendulum.py new file mode 100644 index 0000000000000000000000000000000000000000..817e511daec05c0a99bb422443c2b04c72a7369b --- /dev/null +++ b/irlc/ex03/basic_pendulum.py @@ -0,0 +1,39 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
class BasicPendulumModel(ControlModel):
    """A pendulum model with state ``x = [theta, theta']`` and a one-dimensional action ``u``."""

    def sym_f(self, x, u, t=None):
        """Return the symbolic dynamics ``[theta', theta'']``.

        The angular acceleration is ``g / l * sin(theta) + u / (m * l^2)`` with ``g = 9.82``, ``l = 1`` and ``m = 2``.

        :param x: The state; ``x[0]`` is the angle and ``x[1]`` the angular velocity.
        :param u: The action; only ``u[0]`` is used.
        :param t: Not used.
        :return: A list with the two entries ``[theta', theta'']``.
        """
        gravity, length, mass = 9.82, 1, 2
        angle = x[0]
        angular_velocity = x[1]  # Parameterization: x = [theta, theta']
        angular_acceleration = gravity / length * sym.sin(angle) + 1 / (mass * length ** 2) * u[0]
        return [angular_velocity, angular_acceleration]

    def get_cost(self) -> SymbolicQRCost:
        """Return a quadratic cost with ``Q`` the 2x2 identity and ``R`` the 1x1 identity."""
        state_weight = np.eye(2)
        action_weight = np.eye(1)
        return SymbolicQRCost(Q=state_weight, R=action_weight)

    def u_bound(self) -> Box:
        """Return the action bounds: a ``Box`` from -10 to 10."""
        lower = np.asarray([-10])
        upper = np.asarray([10])
        return Box(lower, upper)

    def x0_bound(self) -> Box:
        """Return the initial-state bounds: lower and upper bound are both ``[pi, 0]``."""
        lower = np.asarray([np.pi, 0])
        upper = np.asarray([np.pi, 0])
        return Box(lower, upper)
class SymbolicQRCost:
    r"""
    This class represents the cost function for a continuous-time model. In the simulations, we are going to assume
    that the cost function takes the form:

    .. math::
        \int_{t_0}^{t_F} c(x(t), u(t)) dt + c_F(x_F)

    And this class will specifically implement the two functions :math:`c` and :math:`c_F`. They will be assumed to have the quadratic form:

    .. math::
        c(x, u) & = \frac{1}{2} x^T Q x + \frac{1}{2} u^T R u + u^T H x + q^T x + r^T u + q_0, \\
        c_F(x_F) & = \frac{1}{2} x_F^T Q_F x_F + q_F^T x_F + q_{0,F}.

    So what all of this boils down to is that the class just needs to store a bunch of matrices and vectors.

    You can add and scale cost-functions
    **********************************************************

    A slightly smart thing about the cost functions is that you can add and scale them. The following provides an
    example:

    .. runblock:: pycon

        >>> from irlc.ex03.control_cost import SymbolicQRCost
        >>> import numpy as np
        >>> cost1 = SymbolicQRCost(np.eye(2), np.zeros(1) ) # Set Q = I, R = 0
        >>> cost2 = SymbolicQRCost(np.ones((2,2)), np.zeros(1) ) # Set Q = 2x2 matrices of 1's, R = 0
        >>> print(cost1.Q) # Will be the identity matrix.
        >>> cost = cost1 * 3 + cost2 * 2
        >>> print(cost.Q) # Will be 3 x I + 2

    """

    def __init__(self, Q, R, q=None, qc=None, r=None, H=None, QN=None, qN=None, qcN=None):
        r"""
        The constructor can be used to manually create a cost function. You will rarely want to call the constructor
        directly but instead use the helper methods (see class documentation).
        What the class basically does is that it stores the input parameters as fields. In other words, you can access the quadratic
        term of the cost function, :math:`\frac{1}{2}x^T Q x`, as ``cost.Q``.

        Terms that are not supplied default to zeros of the size implied by ``Q`` and ``R``.

        :param Q: The matrix :math:`Q`
        :param R: The matrix :math:`R`
        :param q: The vector :math:`q`
        :param qc: The constant :math:`q_0`
        :param r: The vector :math:`r`
        :param H: The matrix :math:`H`
        :param QN: The terminal cost matrix :math:`Q_N`
        :param qN: The terminal cost vector :math:`q_N`
        :param qcN: The terminal cost constant :math:`q_{0,N}`
        """
        n = Q.shape[0]
        d = R.shape[0]
        self.Q = Q
        self.R = R
        self.q = np.zeros((n,)) if q is None else q
        # Identity test rather than ``== None``: the latter is evaluated element-wise when the
        # constant is a numpy array and then fails when used as a condition.
        self.qc = 0 if qc is None else qc
        self.r = np.zeros((d,)) if r is None else r
        self.H = np.zeros((d, n)) if H is None else H
        self.QN = np.zeros((n, n)) if QN is None else QN
        self.qN = np.zeros((n,)) if qN is None else qN
        self.qcN = 0 if qcN is None else qcN
        # Names of all stored terms, and the subset that belongs to the terminal cost.
        self.flds = ('Q', 'R', 'q', 'qc', 'r', 'H', 'QN', 'qN', 'qcN')
        self.flds_term = ('QN', 'qN', 'qcN')

        self.c_numpy = None
        self.cF_numpy = None

    @classmethod
    def zero(cls, state_size, action_size):
        """
        Creates an all-zero cost function, i.e. all terms :math:`Q`, :math:`R` are set to zero.

        .. runblock:: pycon

            >>> from irlc.ex03.control_cost import SymbolicQRCost
            >>> cost = SymbolicQRCost.zero(2, 1)
            >>> cost.Q # 2x2 zero matrix
            >>> cost.R # 1x1 zero matrix.

        :param state_size: Dimension of the state vector :math:`n`
        :param action_size: Dimension of the action vector :math:`d`
        :return: A ``SymbolicQRCost`` with all zero terms.
        """
        return cls(Q=np.zeros((state_size, state_size)), R=np.zeros((action_size, action_size)))

    def sym_c(self, x, u, t=None):
        """
        Evaluate the (instantaneous) part of the function :math:`c(x,u, t)`. An example:

        .. runblock:: pycon

            >>> from irlc.ex03.control_cost import SymbolicQRCost
            >>> import numpy as np
            >>> cost = SymbolicQRCost(np.eye(2), np.eye(1)) # Set Q = I, R = I
            >>> cost.sym_c(x = np.asarray([1,2]), u=np.asarray([0])) # should return 0.5 * x^T Q x = 0.5 * (1 + 4)

        :param x: The state :math:`x(t)`
        :param u: The action :math:`u(t)`
        :param t: The current time step :math:`t` (this will be ignored)
        :return: A ``sympy`` symbolic expression corresponding to the instantaneous cost.
        """
        u = sym.Matrix(u)
        x = sym.Matrix(x)
        c = 1 / 2 * (x.transpose() @ self.Q @ x) + 1 / 2 * (u.transpose() @ self.R @ u) + u.transpose() @ self.H @ x + sym.Matrix(self.q).transpose() @ x + sym.Matrix(self.r).transpose() @ u + sym.Matrix([[self.qc]])
        assert c.shape == (1, 1)
        return c[0, 0]

    def sym_cf(self, t0, tF, x0, xF):
        """
        Evaluate the terminal (constant) term in the cost function :math:`c_F(t_0, t_F, x_0, x_F)`. An example:

        .. runblock:: pycon

            >>> from irlc.ex03.control_cost import SymbolicQRCost
            >>> import numpy as np
            >>> cost = SymbolicQRCost(np.eye(2), np.zeros(1), QN=np.eye(2)) # Set Q = I, R = 0
            >>> cost.sym_cf(0, 0, 0, xF=2*np.ones((2,))) # should return 0.5 * xF^T * xF = 0.5 * 8

        :param t0: Starting time :math:`t_0` (not used)
        :param tF: Stopping time :math:`t_F` (not used)
        :param x0: Initial state :math:`x_0` (not used)
        :param xF: Terminal state :math:`x_F` (**this one is used**)
        :return: A ``sympy`` symbolic expression corresponding to the terminal cost.
        """
        xF = sym.Matrix(xF)
        c = 0.5 * xF.transpose() @ self.QN @ xF + xF.transpose() @ sym.Matrix(self.qN) + sym.Matrix([[self.qcN]])
        assert c.shape == (1, 1)
        return c[0, 0]

    def discretize(self, dt):
        r"""
        Discretize the cost function so it is suitable for a discrete control problem. See (Her24, Subsection 13.1.5) for more information.

        All running-cost terms are multiplied by ``dt``; the terminal terms are passed on unchanged.

        :param dt: The discretization time step :math:`\Delta`
        :return: An :class:`~irlc.ex04.discrete_control_cost.DiscreteQRCost` instance corresponding to a discretized version of this cost function.
        """
        # Imported here (not at module level) exactly as in the original implementation.
        from irlc.ex04.discrete_control_cost import DiscreteQRCost
        return DiscreteQRCost(**{f: getattr(self, f) * (1 if f in self.flds_term else dt) for f in self.flds})

    def __add__(self, c):
        """Return a new cost whose terms are the term-wise sums of ``self`` and ``c``."""
        return SymbolicQRCost(**{k: getattr(self, k) + getattr(c, k) for k in self.flds})

    def __mul__(self, c):
        """Return a new cost with every term multiplied by the scalar ``c``."""
        return SymbolicQRCost(**{k: getattr(self, k) * c for k in self.flds})

    def __str__(self):
        """Return a summary listing the non-zero terms of the running and the terminal cost."""
        title = "Continuous-time cost function"
        label1 = "Non-zero terms in c(x, u)"
        label2 = "Non-zero terms in c_F(x)"
        terms1 = [s for s in self.flds if s not in self.flds_term]
        terms2 = self.flds_term
        return _repr_cost(self, title, label1, label2, terms1, terms2)

    def goal_seeking_terminal_cost(self, xF_target, QF=None):
        r"""
        Create a cost function which is minimal when the terminal state :math:`x_F` is equal to a goal state :math:`x_F^*`.
        Concretely, it will return a cost function of the form

        .. math::
            c_F(x_F) = \frac{1}{2} (x_F^* - x_F)^\top Q_F (x_F^* - x_F)

        .. runblock:: pycon

            >>> from irlc.ex03.control_cost import SymbolicQRCost
            >>> import numpy as np
            >>> cost = SymbolicQRCost.zero(2, 1)
            >>> cost += cost.goal_seeking_terminal_cost(xF_target=np.ones((2,)))
            >>> print(cost.qN)
            >>> print(cost)

        :param xF_target: Target state :math:`x_F^*`
        :param QF: Cost matrix :math:`Q_F` (defaults to the identity matrix)
        :return: A ``SymbolicQRCost`` object corresponding to the goal-seeking cost function
        """
        if QF is None:
            QF = np.eye(xF_target.size)
        QF, qN, qcN = targ2matrices(xF_target, Q=QF)
        # ``self.Q*0`` / ``self.R*0`` give zero matrices of the right sizes for the running cost.
        return SymbolicQRCost(Q=self.Q*0, R=self.R*0, QN=QF, qN=qN, qcN=qcN)

    def goal_seeking_cost(self, x_target, Q=None):
        r"""
        Create a cost function which is minimal when the state :math:`x` is equal to a goal state :math:`x^*`.
        Concretely, it will return a cost function of the form

        .. math::
            c(x, u) = \frac{1}{2} (x^* - x)^\top Q (x^* - x)

        .. runblock:: pycon

            >>> from irlc.ex03.control_cost import SymbolicQRCost
            >>> import numpy as np
            >>> cost = SymbolicQRCost.zero(2, 1)
            >>> cost += cost.goal_seeking_cost(x_target=np.ones((2,)))
            >>> print(cost.q)
            >>> print(cost)

        :param x_target: Target state :math:`x^*`
        :param Q: Cost matrix :math:`Q` (defaults to the identity matrix)
        :return: A ``SymbolicQRCost`` object corresponding to the goal-seeking cost function
        """
        if Q is None:
            Q = np.eye(x_target.size)
        Q, q, qc = targ2matrices(x_target, Q=Q)
        return SymbolicQRCost(Q=Q, R=self.R*0, q=q, qc=qc)

    def term(self, Q=None, R=None, r=None):
        """
        Create a cost function holding only the supplied terms; every other term is a zero of the same size as in ``self``.

        :param Q: Optional replacement for the matrix :math:`Q`
        :param R: Optional replacement for the matrix :math:`R`
        :param r: Optional replacement for the vector :math:`r`
        :return: A new ``SymbolicQRCost``
        """
        supplied = {'Q': Q, 'R': R, 'r': r}
        dd = {}
        for f in self.flds:
            value = supplied.get(f)
            dd[f] = value if value is not None else getattr(self, f) * 0
        return SymbolicQRCost(**dd)

    @property
    def state_size(self):
        """Dimension :math:`n` of the state vector, read from the number of rows of ``Q``."""
        return self.Q.shape[0]

    @property
    def action_size(self):
        """Dimension :math:`d` of the action vector, read from the number of rows of ``R``."""
        return self.R.shape[0]


def _repr_cost(cost, title, label1, label2, terms1, terms2):
    """
    Build the text summary used by the ``__str__`` method of a cost object.

    :param cost: Object whose attributes hold the cost terms
    :param title: Heading of the summary
    :param label1: Heading for the first group of terms
    :param label2: Heading for the second group of terms
    :param terms1: Attribute names in the first group
    :param terms2: Attribute names in the second group
    :return: The summary as a single string
    """
    def _get(flds, label):
        # Keep only the terms with at least one non-zero entry.
        d = {s: getattr(cost, s) for s in flds if np.any(getattr(cost, s) != 0)}
        out = ""
        if len(d) > 0:
            out += f"> {label}:\n"
            for s, m in d.items():
                mm = f"{m}"
                if len(mm.splitlines()) > 1:
                    mm = "\n" + mm  # Multi-line values (matrices) start on their own line.
                out += f" * {s} = {mm}\n"
        return d, out

    nz_c, o1 = _get(list(terms1), label1)
    out = f"{title}:\n"
    out += o1
    nz_term, o2 = _get(terms2, label2)
    out += o2
    if len(nz_c) + len(nz_term) == 0:
        # NOTE(review): this message goes to stdout and is not part of the returned string; kept as-is.
        print("All terms in the cost-function are zero.")
    return out


def targ2matrices(t, Q=None):  # Helper function
    r"""
    Given a target vector :math:`t` and a matrix :math:`Q` this function returns cost-matrices suitable for implementing:

    .. math::
        \frac{1}{2} (x - t)^T Q (x - t) = \frac{1}{2} x^T Q x - \frac{1}{2} t^T (Q + Q^T) x + \frac{1}{2} t^T Q t

    :param t: The target vector (one-dimensional ``ndarray``)
    :param Q: The weight matrix (defaults to the identity matrix)
    :return: A tuple ``(Q, q, qc)`` with the quadratic, linear and constant term of the expansion above
    """
    n = t.size
    if Q is None:
        Q = np.eye(n)

    # For a 1-d ``t``: ``Q @ t`` is Q t and ``t @ Q`` is Q^T t, so the linear term is -1/2 (Q + Q^T) t.
    # This agrees with the previous result -Q t whenever Q is symmetric.
    return Q, -1/2 * (Q @ t + t @ Q), 1/2 * t @ Q @ t
def __init__(self):
    """
    Set up default labels and compile the symbolic dynamics into a numeric function.

    The constructor takes no arguments. The state and action dimensions are read from ``self.state_size`` and
    ``self.action_size``. Subclasses supply the dynamics by overriding ``sym_f``.

    The steps performed are:

    - If ``state_labels`` / ``action_labels`` are ``None``, they are set to ``['x0', 'x1', ...]`` / ``['u0', 'u1', ...]``.
    - Symbols for ``t``, ``x`` and ``u`` are created and passed to ``sym_f``. If ``sym_f`` raises, a hint is printed
      and the exception is re-raised.
    - The output of ``sym_f`` must have one entry per state coordinate; otherwise a hint is printed and an
      ``AssertionError`` is raised.
    - The symbolic dynamics are compiled with ``sym.lambdify`` and stored in ``self._f_np``.
    """
    if self.state_labels is None:
        self.state_labels = [f'x{i}' for i in range(self.state_size)]
    if self.action_labels is None:
        self.action_labels = [f'u{i}' for i in range(self.action_size)]

    t = sym.symbols("t")
    x = sym.symbols(f"x0:{self.state_size}")
    u = sym.symbols(f"u0:{self.action_size}")
    try:
        # Trial evaluation: surfaces problems in a user-supplied sym_f with a readable hint.
        f = self.sym_f(x, u, t)
    except Exception as e:
        print("control_model.py> There is a problem with the way you have specified the dynamics. The function sym_f must accept lists as inputs")
        raise e
    if len(f) != len(x):
        print("control_model.py> Your function ControlModel.sym_f must output a list of symbolic expressions.")
        assert len(f) == len(x)

    # Note: sym_f is evaluated a second time here rather than re-using ``f`` from the trial evaluation above.
    self._f_np = sym.lambdify((x, u, t), self.sym_f(x, u, t))
def u_bound(self) -> Box:
    r"""The bound on the action :math:`\mathbf{u}(t)`.

    The default is unbounded: ``Box(-np.inf, np.inf, shape=(self.action_size,))``.

    :return: An appropriate gymnasium Box instance.
    """
    return Box(-np.inf, np.inf, shape=(self.action_size,))
def simulate(self, x0, u_fun, t0, tF, N_steps=1000, method='rk4'):
    """
    Used to simulate the effect of a policy on the model. By default, it uses
    Runge-Kutta 4 (RK4) with a fine discretization -- this is slow, but in nearly all cases exact. See (Her24, Algorithm 18) for more information.

    The input argument ``u_fun`` should be a function which returns a list or tuple with same dimension as
    ``model.action_space``, :math:`d`.

    How the control is sampled: within one step starting at state :math:`x_n`, every call to the policy uses
    :math:`x_n` as the state argument and only varies the time (``tn``, ``tn + Delta/2``, ``tn + Delta``).
    The action stored at index ``k+1`` of ``us`` is therefore ``u_fun(xs[k], tt[k+1])``.

    :param x0: The initial state of the simulation. Must be a list of floats of same dimension as ``env.observation_space``, :math:`n`.
    :param u_fun: Can be either:
        - Either a policy function that can be called as ``u_fun(x, t)`` and returns an action ``u`` in the ``action_space``
        - A single action (i.e. a list of floats of same length as the action space). The model will be simulated with a constant action in this case.
    :param float t0: Starting time :math:`t_0`
    :param float tF: Stopping time :math:`t_F`; the model will be simulated for :math:`t_F - t_0` seconds
    :param int N_steps: Steps :math:`N` in the RK4 simulation
    :param str method: Simulation method. Either ``'rk4'`` (default) or ``'euler'``
    :raises Exception: If ``method`` is neither ``'rk4'`` nor ``'euler'`` (raised inside the first simulation step).
    :return:
        - xs - A numpy ``ndarray`` of dimension :math:`(N+1)\\times n` containing the observations :math:`x`
        - us - A numpy ``ndarray`` of dimension :math:`(N+1)\\times d` containing the actions :math:`u`
        - ts - A numpy ``ndarray`` of dimension :math:`(N+1)` containing the corresponding times :math:`t` (seconds)
    """

    # Wrap u_fun so it can always be called as u_fun(x, t), also when a constant action was given.
    u_fun = ensure_policy(u_fun)
    tt = np.linspace(t0, tF, N_steps+1)  # Time grid t_k = tt[k] between t0 and tF.
    xs = [ np.asarray(x0) ]
    us = [ u_fun(x0, t0 )]
    for k in range(N_steps):
        Delta = tt[k+1] - tt[k]  # Length of the current step.
        tn = tt[k]
        xn = xs[k]
        un = us[k]  # Action at the start of the step (computed in the previous iteration).
        unp = u_fun(xn, tn + Delta)  # Action recorded for the end of the step; evaluated at the start-of-step state xn.
        if method == 'rk4':
            """ Implementation of RK4 here. See: (Her24, Algorithm 18) """
            k1 = np.asarray(self.f(xn, un, tn))
            k2 = np.asarray(self.f(xn + Delta * k1/2, u_fun(xn, tn+Delta/2), tn+Delta/2))
            k3 = np.asarray(self.f(xn + Delta * k2/2, u_fun(xn, tn+Delta/2), tn+Delta/2))
            k4 = np.asarray(self.f(xn + Delta * k3, u_fun(xn, tn + Delta), tn+Delta))
            xnp = xn + 1/6 * Delta * (k1 + 2*k2 + 2*k3 + k4)
        elif method == 'euler':
            xnp = xn + Delta * np.asarray(self.f(xn, un, tn))
        else:
            raise Exception("Bad integration method", method)
        xs.append(xnp)
        us.append(unp)
    xs = np.stack(xs, axis=0)
    us = np.stack(us, axis=0)
    return xs, us, tt
def phi_u(self, u: list) -> list:
    r"""Coordinate transformation of the action when the model is discretized.

    This function specifies the coordinate transformation :math:`u_k = \Phi_u(u(t_k))` which is applied to the environment when it is
    discretized. It should accept a list of symbols, corresponding to :math:`u`, and return a new list
    of symbols corresponding to the (discrete) coordinates.

    The default implementation is the identity, i.e. it returns ``u`` unchanged.

    :param u: A list of symbols ``[u0, u1, ..., ud]`` corresponding to :math:`\mathbf{u}(t)`
    :return: A new list of symbols corresponding to the discrete coordinates :math:`\mathbf{u}_k`.
    """
    return u
+ """ + split = "-"*20 + s = [f"{self.__class__}"] + ['='*50] + s += ["Dynamics:", split] + t = sym.symbols("t") + x = sym.symbols(f"x0:{self.state_size}") + u = sym.symbols(f"u0:{self.action_size}") + + s += [typeset_eq(x, u, self.sym_f(x, u, t) )] + + s += ["Cost:", split, str(self.get_cost())] + + dd = defaultdict(list) + bounds = [ ('x', self.x_bound()), ('x0', self.x0_bound()), ('xF', self.xF_bound()), + ('u', self.u_bound()), + ('t0', self.t0_bound()), ('tF', self.tF_bound())] + + for v, box in bounds: + if (box.low == -np.inf).all() and (box.high == np.inf).all(): + continue + dd['low'].append(box.low_repr) + dd['variable'].append("<= " + v + " <=") + dd['high'].append(box.high_repr) + + if len(dd) > 0: + s += ["Bounds:", split] + s += [tabulate.tabulate(dd, headers='keys')] + else: + s += ['No bounds are applied to the x and u-variables.'] + return "\n".join(s) + + +def symv(s, n): + """ + Returns a vector of symbolic functions. For instance if s='x' and n=3 then it will return + [x0,x1,x2] + where x0,..,x2 are symbolic variables. 
def typeset_eq(x, u, f):
    """
    Lay out the equation ``f(x, u) = f`` as ASCII art with ``x``, ``u`` and ``f`` drawn as bracketed column vectors.

    :param x: Sequence of state symbols (anything ``str`` can format)
    :param u: Sequence of action symbols
    :param f: Sequence of expressions, one per row of the right-hand side
    :return: A multi-line string; shorter columns are centred vertically against the tallest one
    """
    def _bracketed(entries):
        # Right-align all entries to a common width and wrap each one in square brackets.
        width = max(map(len, entries))
        return "\n".join("[" + entry.rjust(width) + "]" for entry in entries).splitlines()

    def _hstack(columns):
        # Pad every column with blank rows to the common height (any extra row goes on top), then glue rows together.
        tallest = max(len(col) for col in columns)
        padded = []
        for col in columns:
            blank = " " * len(col[0])
            below = (tallest - len(col)) // 2
            above = tallest - len(col) - below
            padded.append([blank] * above + col + [blank] * below)
        return ["".join(cells) for cells in zip(*padded)]

    rhs_text = [str(expr) for expr in f]
    state_col = _bracketed([str(symbol) for symbol in x])
    action_col = _bracketed([str(symbol) for symbol in u])

    arguments = _hstack([state_col, [", "], action_col])
    call = _hstack([["f("], arguments, [")"]])
    rows = _hstack([[" "], call, [" = "], _bracketed(rhs_text)])
    return "\n".join(rows)
class KuramotoModel(ControlModel):
    r"""
    The Kuramoto model. It implements the following dynamics:

    .. math::

        \dot{x}(t) = u(t) +\cos(x(t))

    I.e. the state and control variables are both one-dimensional. The cost function returned by ``get_cost`` is a
    QR cost with :math:`Q=0` and :math:`R=1`, i.e.:

    .. math::

        c(t) = \frac{1}{2}u(t)^2

    NOTE(review): an earlier version of this text stated :math:`c(t) = \frac{1}{2}x(t)^2 + \frac{1}{2}u(t)^2`
    (:math:`Q=R=1`), which does not match ``get_cost`` -- confirm which of the two is intended.

    The action is bounded to :math:`[-2, 2]` and the initial state is fixed at :math:`x_0 = 0`.
    """
    def u_bound(self) -> Box:
        # The single action is restricted to [-2, 2].
        return Box(-2, 2, shape=(1,))

    def x0_bound(self) -> Box:
        # Lower and upper bound are both 0, i.e. an equality constraint x0 = 0.
        return Box(0, 0, shape=(1,))

    def get_cost(self) -> SymbolicQRCost:
        """
        Create a cost-object. The code defines a quadratic cost (with the given matrices) and allows easy computation
        of derivatives, etc. There are automatic ways to discretize the cost so you don't have to bother with that.
        See the online documentation for further details.

        :return: A ``SymbolicQRCost`` with :math:`Q = 0` (1x1) and :math:`R = 1` (1x1).
        """
        return SymbolicQRCost(Q=np.zeros((1, 1)), R=np.ones((1,1)))

    def sym_f(self, x: list, u: list, t=None):
        r""" Return a symbolic expression representing the Kuramoto model.
        The inputs x, u are themselves *lists* of symbolic variables (insert breakpoint and check their value).
        you have to use them to create a symbolic object representing f, and return it as a list. That is, you are going to return

        .. code-block:: python

            return [f_val]

        where ``f_val`` is the symbolic expression corresponding to the dynamics, i.e. :math:`u(t) + \cos( x(t))`.
        Note you can use trigonometric functions like ``sym.cos``.

        This is an exercise placeholder: until the solution is inserted it raises ``NotImplementedError``.
        """
        # TODO: 1 lines missing.
        raise NotImplementedError("Implement symbolic expression as a singleton list here")
        # define the symbolic expression
        return symbolic_f_list
+ + Hints: + * Call f as in f(x, u). You defined f earlier in this exercise. + """ + tt = np.linspace(t0, tF, N+1) # Time grid t_k = tt[k] between t0 and tF. + xs = [ x0 ] + f(x0, u) # This is how you can call f. + for k in range(N): + x_next = None # Obtain x_next = x_{k+1} using a single RK4 step. + # Remember to insert breakpoints and use the console to examine what the various variables are. + # TODO: 7 lines missing. + raise NotImplementedError("Insert your solution and remove this error.") + xs.append(x_next) + xs = np.stack(xs, axis=0) + return xs, tt + +if __name__ == "__main__": + # Create a symbolic model corresponding to the Kuramoto model: + # Evaluate the dynamics dx / dt = f(x, u). + + print("Value of f(x,u) in x=2, u=0.3", f([2], [0.3])) + print("Value of f(x,u) in x=0, u=1", f([0], [1])) + + cmodel = KuramotoModel() + print(cmodel) + x0 = cmodel.x0_bound().low # Get the starting state x0. We exploit that the bound on x0 is an equality constraint. + u = 1.3 + xs, ts = rk4_simulate(x0, [u], t0=0, tF=20, N=100) + xs_true, us_true, ts_true = cmodel.simulate(x0, u_fun=u, t0=0, tF=20, N_steps=100) + """You should generally use cmodel.simulate(...) to simulate the environment. Note that u_fun in the simulate + function can be set to a constant. Use this compute numpy ndarrays corresponding to the time, x and u values. + """ + # Plot the exact simulation of the environment + import matplotlib.pyplot as plt + plt.plot(ts_true, xs_true, 'k.-', label='RK4 state sequence x(t) (using model.simulate)') + plt.plot(ts, xs, 'r-', label='RK4 state sequence x(t) (using your code)') + plt.legend() + #savepdf('kuramoto_rk4') + plt.show(block=False) diff --git a/irlc/ex03/toy_2d_control.py b/irlc/ex03/toy_2d_control.py new file mode 100644 index 0000000000000000000000000000000000000000..187dd0369b3f8077cce422d58ec975a79d889144 --- /dev/null +++ b/irlc/ex03/toy_2d_control.py @@ -0,0 +1,23 @@ +# This file may not be shared/redistributed without permission. 
Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +import sympy as sym +from irlc.ex03.control_model import ControlModel +from irlc.ex03.control_cost import SymbolicQRCost +import numpy as np + +class Toy2DControl(ControlModel): + def get_cost(self): + # You get the cost-function for free because it can be anything as far as this problem is concerned. + return SymbolicQRCost(Q=np.eye(2), R=np.eye(1)) + + # TODO: 2 lines missing. + raise NotImplementedError("Insert your solution and remove this error.") + +def toy_simulation(u0 : float, T : float) -> float: + # TODO: 4 lines missing. + raise NotImplementedError("Create a Toy2dControl instance and use model.simulate(..) to get the final state.") + return wT + +if __name__ == "__main__": + x0 = np.asarray([np.pi/2, 0]) + wT = toy_simulation(u0=0.4, T=5) + print(f"Starting in x0=[pi/2, 0], after T=5 seconds the system is an an angle {wT=} (should be 1.265)") diff --git a/irlc/ex04/__init__.py b/irlc/ex04/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d084853c8794e0bebd41758f6e27fbf152bc134f --- /dev/null +++ b/irlc/ex04/__init__.py @@ -0,0 +1,20 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +"""This directory contains the exercises for week 4.""" + +speech = """ +Fate has ordained that the men who went to the moon to explore in peace will stay on the moon to rest in peace. + +These brave men, Neil Armstrong and Edwin Aldrin, know that there is no hope for their recovery. But they also know that there is hope for mankind in their sacrifice. + +These two men are laying down their lives in mankind’s most noble goal: the search for truth and understanding. 
+ +They will be mourned by their families and friends; they will be mourned by their nation; they will be mourned by the people of the world; they will be mourned by a Mother Earth that dared send two of her sons into the unknown. + +In their exploration, they stirred the people of the world to feel as one; in their sacrifice, they bind more tightly the brotherhood of man. + +In ancient days, men looked at stars and saw their heroes in the constellations. In modern times, we do much the same, but our heroes are epic men of flesh and blood. + +Others will follow, and surely find their way home. Man’s search will not be denied. But these men were the first, and they will remain the foremost in our hearts. + +For every human being who looks up at the moon in the nights to come will know that there is some corner of another world that is forever mankind. +""" diff --git a/irlc/ex04/__pycache__/__init__.cpython-311.pyc b/irlc/ex04/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3164efb4748bc0ed4f1b2e1ee9f97c20e21d294b Binary files /dev/null and b/irlc/ex04/__pycache__/__init__.cpython-311.pyc differ diff --git a/irlc/ex04/__pycache__/control_environment.cpython-311.pyc b/irlc/ex04/__pycache__/control_environment.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d907d0d5b866124103b4679a0addcac96838cba2 Binary files /dev/null and b/irlc/ex04/__pycache__/control_environment.cpython-311.pyc differ diff --git a/irlc/ex04/__pycache__/discrete_control_cost.cpython-311.pyc b/irlc/ex04/__pycache__/discrete_control_cost.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a34b7db9927788c8e5d7bb03c37ab974e150ae1d Binary files /dev/null and b/irlc/ex04/__pycache__/discrete_control_cost.cpython-311.pyc differ diff --git a/irlc/ex04/__pycache__/discrete_control_model.cpython-311.pyc b/irlc/ex04/__pycache__/discrete_control_model.cpython-311.pyc new file mode 
100644 index 0000000000000000000000000000000000000000..5c4477150f85d1c7b05871d85cef6908932f7aee Binary files /dev/null and b/irlc/ex04/__pycache__/discrete_control_model.cpython-311.pyc differ diff --git a/irlc/ex04/__pycache__/model_linear_quadratic.cpython-311.pyc b/irlc/ex04/__pycache__/model_linear_quadratic.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..421272b3d5c7dc0c25f34823cd4b2c1862685ce6 Binary files /dev/null and b/irlc/ex04/__pycache__/model_linear_quadratic.cpython-311.pyc differ diff --git a/irlc/ex04/__pycache__/model_pendulum.cpython-311.pyc b/irlc/ex04/__pycache__/model_pendulum.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fdff94ba6b6693e07f04de94d8bd44222e68cf01 Binary files /dev/null and b/irlc/ex04/__pycache__/model_pendulum.cpython-311.pyc differ diff --git a/irlc/ex04/control_environment.py b/irlc/ex04/control_environment.py new file mode 100644 index 0000000000000000000000000000000000000000..ad44fe94c41231d206d591a981b9b91199f4ee6a --- /dev/null +++ b/irlc/ex04/control_environment.py @@ -0,0 +1,171 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +import gymnasium as gym +import numpy as np +from irlc.ex03.control_model import ensure_policy +from irlc.ex04.discrete_control_model import DiscreteControlModel + + +class ControlEnvironment(gym.Env): + """ + Helper class to convert a discretized model into an environment. + See the ``__init__`` function for how to create a new environment using this class. Once an environment has been + created, you can use it as any other gym environment: + + .. 
runblock:: pycon + + >>> from irlc.ex04.model_pendulum import GymSinCosPendulumEnvironment + >>> env = GymSinCosPendulumEnvironment(Tmax=4) # Specify we want it to run for a maximum of 4 seconds + >>> env.reset() # Reset both the time and state variable + >>> u = env.action_space.sample() + >>> next_state, cost, done, truncated, info = env.step(u) + >>> print("Current state: ", env.state) + >>> print("Current time", env.time) + + In this example, we tell the environment to terminate after 4 seconds using ``Tmax`` (after which ``done=True``) + + .. Note:: + The ``step``-method will use the (nearly exact) RK4 method to integrate the environment over a timespan of ``dt``, + and **not** use the approximate ``model.f(x_k,u_k, k)``-method in the discrete environment which is based on + Euler discretization. + This is the correct behavior since we want the environment to reflect what happens in the real world and not + our approximation method. + """ + metadata = { + 'render.modes': ['human', 'rgb_array'], + 'video.frames_per_second': 30 + } + action_space = None + observation_space = None + + def __init__(self, discrete_model: DiscreteControlModel, Tmax=None, supersample_trajectory=False, render_mode=None): + """ + Creates a new instance. You should use this in conjunction with a discrete model to build a new class. An example: + + .. runblock:: pycon + + >>> from irlc.ex04.model_pendulum import DiscreteSinCosPendulumModel + >>> from irlc.ex04.control_environment import ControlEnvironment + >>> from gymnasium.spaces import Box + >>> import numpy as np + >>> class MyGymSinCosEnvironment(ControlEnvironment): + ... def __init__(self, Tmax=5): + ... discrete_model = DiscreteSinCosPendulumModel() + ... self.action_space = Box(low=-np.inf, high=np.inf, shape=(1,), dtype=np.float64) + ... self.observation_space = Box(low=-np.inf, high=np.inf, shape=(3,), dtype=np.float64) + ...
super().__init__(discrete_model, Tmax=Tmax) + >>> + >>> env = MyGymSinCosEnvironment() + >>> env.reset() + >>> env.step(env.action_space.sample()) + + :param discrete_model: The discrete model the environment is based on + :param Tmax: Time in seconds until the environment terminates (``step`` returns ``done=True``) + :param supersample_trajectory: Used to create nicer (smooth) trajectories. Don't worry about it. + :param render_mode: If ``human`` the environment will be rendered (inherited from ``Env``) + """ + self.dt = discrete_model.dt # Discretization time + self.state = None # the current state + self.time = 0 # Current global time index + self.discrete_model = discrete_model + self.Tmax = Tmax + + # Try to guess action/observation spaces unless they are already defined. + if self.observation_space is None: + self.observation_space = gym.spaces.Box(-np.inf, np.inf, shape=(discrete_model.state_size,) ) + + if self.action_space is None: + u_bound = self.discrete_model.continuous_model.u_bound() + self.action_space = gym.spaces.Box(low=np.asarray(self.discrete_model.phi_u(u_bound.low)), + high=np.asarray(self.discrete_model.phi_u(u_bound.high)), + dtype=np.float64, + ) + self.state_labels = discrete_model.state_labels + self.action_labels = discrete_model.action_labels + self.supersample_trajectory = supersample_trajectory + self.render_mode = render_mode + + + def step(self, u): + """ + This works similar to the gym ``Env.step``-function. ``u`` is an action in the action-space, + and the environment will then assume we (constantly) apply the action ``u`` from the current time step, :math:`t_k`, until + the next time step :math:`t_{k+1} = t_k + \Delta`, where :math:`\Delta` is equal to ``env.model.dt``. + + During this period, the next state is computed using the relatively exact RK4 simulation, and the incurred cost will be + computed using Riemann integration. + + .. math:: + \int_{t_k}^{t_k+\Delta} c(x(t), u(t)) dt + + .. 
Note:: + The gym environment requires that we return a reward. The reward will therefore be equal to minus the integral of the cost function. + + In case the environment terminates, the reward will include the terminal cost :math:`c_F`. + + :param u: The action we apply :math:`u` + :return: + - ``state`` - the state we arrive in + - ``reward`` - (minus) the total (integrated) cost incurred in this time period. + - ``done`` - ``True`` if the environment has finished, i.e. we reached ``env.Tmax``. + - ``truncated`` - ``True`` if the environment was forced to terminate prematurely. Assume it is ``False`` and ignore it. + - ``info`` - A dictionary of potential extra information. Includes ``info['time_seconds']``, which is the current time after the step function has completed. + """ + + def clip_action(self, u): + return np.clip(u, a_max=self.action_space.high, a_min=self.action_space.low) + + u = clip_action(self, u) + self.discrete_model.continuous_model._u_prev = u # for rendering. + if not ((self.action_space.low <= u).all() and (u <= self.action_space.high).all()): # u not in self.action_space: + raise Exception("Action", u, "not contained in action space", self.action_space) + # N=20 is a bit arbitrary; should probably be a parameter to the environment. + xx, uu, tt = self.discrete_model.simulate2(x0=self.state, policy=ensure_policy(u), t0=self.time, tF=self.time + self.discrete_model.dt, N=20) + self.state = xx[-1] + self.time = tt[-1] + cc = [self.discrete_model.cost.c(x, u, k=None) for x, u in zip(xx[:-1], uu[:-1])] + done = False + if self.time + self.discrete_model.dt/2 > self.Tmax: + cc[-1] += self.discrete_model.cost.cN(xx[-1]) + done = True + info = {'dt': self.discrete_model.dt, 'time_seconds': self.time} # Allow the train() function to figure out the simulation time step size + if self.supersample_trajectory: # This is only for nice visualizations.
+ from irlc.ex01.agent import Trajectory + traj = Trajectory(time=tt, state=xx.T, action=uu.T, reward=np.asarray(cc), env_info=[]) + info['supersample'] = traj # Supersample the trajectory + reward = -sum(cc) # To be compatible with openai gym we return the reward as -cost. + if not ( (self.observation_space.low <= self.state).all() and (self.state <= self.observation_space.high).all() ): #self.state not in self.observation_space: + print("> state", self.state) + print("> observation space", self.observation_space) + raise Exception("State no longer in observation space", self.state) + if self.render_mode == "human": # as in gym's carpole + self.render() + + return self.state, reward, done, False, info + + def reset(self): + """ + Reset the environment to the initial state. This will by default be `the value computed using `self.discrete_model.reset()``. + + :return: + - ``state`` - The initial state the environment has been reset to + - ``info`` - A dictionary with extra information, in this case that time begins at 0 seconds. + """ + self.state = self._get_initial_state() + self.time = 0 # Reset internal time (seconds) + if self.render_mode == "human": + self.render() + return self.state, {'time_seconds': self.time} + + def _get_initial_state(self) -> np.ndarray: + # This helper function returns an initial state. It will be used by the reset() function, and it is this function + # you should overwrite if you want to reset to a state which is not implied by the bounds. 
+ if (self.discrete_model.continuous_model.x0_bound().low == self.discrete_model.continuous_model.x0_bound().high).all(): + return np.asarray(self.discrete_model.phi_x(self.discrete_model.continuous_model.x0_bound().low)) + else: + raise Exception("Since bounds do not agree I cannot return initial state.") + + def render(self): + return self.discrete_model.render(x=self.state, render_mode=self.render_mode) + + def close(self): + self.discrete_model.close() diff --git a/irlc/ex04/discrete_control_cost.py b/irlc/ex04/discrete_control_cost.py new file mode 100644 index 0000000000000000000000000000000000000000..f9ad90dc6958aed6b8d57bdac22355c8c3b14612 --- /dev/null +++ b/irlc/ex04/discrete_control_cost.py @@ -0,0 +1,195 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +""" +Quadratic cost functions +""" +import numpy as np +from irlc.ex03.control_cost import targ2matrices + +def nz(X,a,b=None): + return np.zeros((a,) if b is None else (a,b)) if X is None else X + +class DiscreteQRCost: #(DiscreteCost): + """ + This class represents the cost function for a discrete-time model. In the simulations, we are going to assume + that the cost function takes the form: + + .. math:: + \sum_{k=0}^{N-1} c_k(x_k, u_k) + c_N(x_N) + + + And this class will specifically implement the two functions :math:`c` and :math:`c_N`. + They will be assumed to have the quadratic form: + + .. math:: + c_k(x_k, u_k) & = \\frac{1}{2} x_k^T Q x_k + \\frac{1}{2} u_k^T R u_k + u^T_k H x_k + q^T x_k + r^T u_k + q_0, \\\\ + c_N(x_N) & = \\frac{1}{2} x_N^T Q_N x_N + q_N^T x_N + q_{0,N}. + + So what all of this boils down to is that the class just need to store a bunch of matrices and vectors. 
+ + You can add and scale cost-functions + ********************************************************** + + A slightly smart thing about the cost functions is that you can add and scale them. The following provides an + example: + + .. runblock:: pycon + + >>> from irlc.ex04.discrete_control_cost import DiscreteQRCost + >>> import numpy as np + >>> cost1 = DiscreteQRCost(np.eye(2), np.zeros(1) ) # Set Q = I, R = 0 + >>> cost2 = DiscreteQRCost(np.ones((2,2)), np.zeros(1) ) # Set Q = 2x2 matrix of 1's, R = 0 + >>> print(cost1.Q) # Will be the identity matrix. + >>> cost = cost1 * 3 + cost2 * 2 + >>> print(cost.Q) # Will be 3 x I + 2 + + """ + def __init__(self, Q, R, H=None,q=None,r=None,qc=0, QN=None, qN=None,qcN=0): + n, d = Q.shape[0], R.shape[0] + self.QN, self.qN = nz(QN,n,n), nz(qN,n) + self.Q, self.q = nz(Q, n, n), nz(q, n) + self.R, self.H, self.r = nz(R, d, d), nz(H, d, n), nz(r, d) + self.qc, self.qcN = qc, qcN + self.flds_term = ['QN', 'qN', 'qcN'] + self.flds = ['Q', 'q', 'R', 'H', 'r', 'qc'] + self.flds_term + + def c(self, x, u, k=None, compute_gradients=False): + """ + Evaluate the (instantaneous) part of the function :math:`c_k(x_k,u_k)`. An example: + + .. runblock:: pycon + + >>> from irlc.ex04.discrete_control_cost import DiscreteQRCost + >>> import numpy as np + >>> cost = DiscreteQRCost(np.eye(2), np.eye(1)) # Set Q = I, R = I + >>> cost.c(x = np.asarray([1,2]), u=np.asarray([0]), compute_gradients=False) # should return 0.5 * x^T Q x = 0.5 * (1 + 4) + + The function can also return the derivatives of the cost function if ``compute_gradients=True`` + + :param x: The state :math:`x_k` + :param u: The action :math:`u_k` + :param k: The time step :math:`k` (this will be ignored) + :param compute_gradients: if ``True`` the function will compute gradients and Hessians.
+ :return: + - ``c`` - The cost as a ``float`` + - ``c_x`` - The derivative with respect to :math:`x` + """ + c = 1/2 * (x @ self.Q @ x) + 1/2 * (u @ self.R @ u) + u @ self.H @ x + self.q @ x + self.r @ u + self.qc + c_x = 1/2 * (self.Q + self.Q.T) @ x + self.q + c_u = 1 / 2 * (self.R + self.R.T) @ u + self.r + c_ux = self.H + c_xx = self.Q + c_uu = self.R + if compute_gradients: + # this is useful for MPC when we apply an optimizer rather than LQR (iLQR) + return c, c_x, c_u, c_xx, c_ux, c_uu + else: + return c + + def cN(self, x, compute_gradients=False): + """ + Evaluate the terminal (constant) term in the cost function :math:`c_N(x_N)`. An example: + + .. runblock:: pycon + + >>> from irlc.ex04.discrete_control_cost import DiscreteQRCost + >>> import numpy as np + >>> cost = DiscreteQRCost(np.eye(2), np.zeros(1), QN=np.eye(2)) # Set Q = I, R = 0 + >>> c, Jx, Jxx = cost.cN(x=2*np.ones((2,)), compute_gradients=True) + >>> c # should return 0.5 * x_N^T * x_N = 0.5 * 8 + + :param x: Terminal state :math:`x_N` + :param compute_gradients: if ``True`` the function will compute gradients and Hessians of the cost function. 
+ :return: The last (terminal) part of the cost-function :math:`c_N` + """ + J = 1/2 * (x @ self.QN @ x) + self.qN @ x + self.qcN + if compute_gradients: + J_x = 1 / 2 * (self.QN + self.QN.T) @ x + self.qN + return J, J_x, self.QN + else: + return J + + def __add__(self, c): + return DiscreteQRCost(**{k: self.__dict__[k] + c.__dict__[k] for k in self.flds}) + + def __mul__(self, c): + return DiscreteQRCost(**{k: self.__dict__[k] * c for k in self.flds}) + + def __str__(self): + title = "Discrete-time cost function" + label1 = "Non-zero terms in c_k(x_k, u_k)" + label2 = "Non-zero terms in c_N(x_N)" + terms1 = [s for s in self.flds if s not in self.flds_term] + terms2 = self.flds_term + from irlc.ex03.control_cost import _repr_cost + return _repr_cost(self, title, label1, label2, terms1, terms2) + + @classmethod + def zero(cls, state_size, action_size): + """ + Creates an all-zero cost function, i.e. all terms :math:`Q`, :math:`R` are set to zero. + + .. runblock:: pycon + + >>> from irlc.ex04.discrete_control_cost import DiscreteQRCost + >>> cost = DiscreteQRCost.zero(2, 1) + >>> cost.Q # 2x2 zero matrix + >>> cost.R # 1x1 zero matrix. + + :param state_size: Dimension of the state vector :math:`n` + :param action_size: Dimension of the action vector :math:`d` + :return: A ``DiscreteQRCost`` with all zero terms. + """ + return cls(Q=np.zeros((state_size, state_size)), R=np.zeros((action_size, action_size))) + + def goal_seeking_terminal_cost(self, xN_target, QN=None): + """ + Create a discrete cost function which is minimal when the final state :math:`x_N` is equal to a goal state :math:`x_N^*`. + Concretely, it will return a cost function of the form + + .. math:: + c_N(x_N) = \\frac{1}{2} (x^*_N - x_N)^\\top Q (x^*_N - x_N) + + .. 
runblock:: pycon + + >>> from irlc.ex04.discrete_control_cost import DiscreteQRCost + >>> import numpy as np + >>> cost = DiscreteQRCost.zero(2, 1) + >>> cost += cost.goal_seeking_terminal_cost(xN_target=np.ones((2,))) + >>> print(cost.qN) + >>> print(cost) + + :param xN_target: Target state :math:`x_N^*` + :param Q: Cost matrix :math:`Q` + :return: A ``DiscreteQRCost`` object corresponding to the goal-seeking cost function + """ + + if QN is None: + QN = np.eye(xN_target.size) + QN, qN, qcN = targ2matrices(xN_target, Q=QN) + return DiscreteQRCost(Q=QN*0, R=self.R*0, QN=QN, qN=qN, qcN=qcN) + + def goal_seeking_cost(self, x_target, Q=None): + """ + Create a discrete cost function which is minimal when the state :math:`x_k` is equal to a goal state :math:`x_k^*`. + Concretely, it will return a cost function of the form + + .. math:: + c_k(x_k, u_k) = \\frac{1}{2} (x^*_k - x_k)^\\top Q (x^*_k - x_k) + + .. runblock:: pycon + + >>> from irlc.ex04.discrete_control_cost import DiscreteQRCost + >>> import numpy as np + >>> cost = DiscreteQRCost.zero(2, 1) + >>> cost += cost.goal_seeking_cost(x_target=np.ones((2,))) + >>> print(cost.q) + >>> print(cost) + + :param x_target: Target state :math:`x_k^*` + :param Q: Cost matrix :math:`Q` + :return: A ``DiscreteQRCost`` object corresponding to the goal-seeking cost function + """ + if Q is None: + Q = np.eye(x_target.size) + Q, q, qc = targ2matrices(x_target, Q=Q) + return DiscreteQRCost(Q=Q, R=self.R*0, q=q, qc=qc) diff --git a/irlc/ex04/discrete_control_model.py b/irlc/ex04/discrete_control_model.py new file mode 100644 index 0000000000000000000000000000000000000000..085a78243784661039f6950079d67259aa774a1d --- /dev/null +++ b/irlc/ex04/discrete_control_model.py @@ -0,0 +1,346 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +""" + +References: + [Her24] Tue Herlau. Sequential decision making. 
(Freely available online), 2024. +""" +from irlc.ex03.control_model import ControlModel +import sympy as sym +import numpy as np +import sys +from irlc.ex03.control_model import ensure_policy +# Patch sympy with mapping to numpy functions. +sympy_modules_ = ['numpy', {'atan': np.arctan, 'atan2': np.arctan2, 'atanh': np.arctanh}, 'sympy'] + +class DiscreteControlModel: + """ + A discretized model. To create a model of this type, first specify a symbolic model, then pass it along to the constructor. + Since the symbolic model will specify the dynamics as a symbolic function, the discretized model can automatically discretize it + and create functions for computing derivatives. + + The class will also discretize the cost. Note that it is possible to specify coordinate transformations. + """ + state_labels = None + action_labels = None + + "This field represents the :class:`~irlc.ex04.continuous_time_model.ContinuousSymbolicModel` the discrete model is derived from." + continuous_model = None + + def __init__(self, model: ControlModel, dt: float, cost=None, discretization_method=None): + """ + Create the discretized model. + + :param model: The continuous-time model to discretize. + :param dt: Discretization timestep :math:`\Delta` + :param cost: If this parameter is not specified, the cost will be derived (discretized) automatically from ``model`` + :param discretization_method: Can be either ``'Euler'`` (default) or ``'ei'`` (exponential integration). The later will assume that the model is a linear. + """ + self.dt = dt + self.continuous_model = model + if discretization_method is None: + from irlc.ex04.model_linear_quadratic import LinearQuadraticModel + if isinstance(model, LinearQuadraticModel): + discretization_method = 'Ei' + else: + discretization_method = 'Euler' + self.discretization_method = discretization_method.lower() + + """ Initialize symbolic variables representing inputs and actions. 
""" + + uc = sym.symbols(f"uc:{model.action_size}") + xc = sym.symbols(f"xc:{model.state_size}") + + # xd, ud = self.sym_continious_xu2discrete_xu(xc, uc) + xd, ud = model.phi_x(xc), model.phi_u(uc) + + x = sym.symbols(f"x:{len(xd)}") + u = sym.symbols(f"u:{len(ud)}") + + """ x_next is a symbolic variable representing x_{k+1} = f_k(x_k, u_k) """ + x_next = self._f_discrete_sym(x, u, dt=dt) + """ compute the symbolic derivate of x_next wrt. z = (x,u): d x_{k+1}/dz """ + dy_dz = sym.Matrix([[sym.diff(f, zi) for zi in list(x) + list(u)] for f in x_next]) + """ Define (numpy) functions giving next state and the derivatives """ + self._f_z_np = sym.lambdify((tuple(x), tuple(u)), dy_dz, modules=sympy_modules_) + # Create a numpy function corresponding to the discretized model x_{k+1} = f_discrete(x_k, u_k) + self._f_np = sym.lambdify((tuple(x), tuple(u)), x_next, modules=sympy_modules_) + self._n = len(x) + self._d = len(u) + + # Make action/state transformation + # xc_, uc_ = self.sym_discrete_xu2continious_xu(x, u) + # self.discrete_states2continious_states = sym.lambdify( (x,), xc_, modules=sympy_modules_) # probably better to make these individual + # self.discrete_actions2continious_actions = sym.lambdify( (u,), uc_, modules=sympy_modules_) # probably better to make these individual + + self.phi_x_inv = sym.lambdify( (x,), model.phi_x_inv(x), modules=sympy_modules_) + self.phi_u_inv = sym.lambdify( (u,), model.phi_u_inv(u), modules=sympy_modules_) + + # xd, ud = self.sym_continious_xu2discrete_xu(xc, uc) + # self.continious_states2discrete_states = sym.lambdify((xc,), xd, modules=sympy_modules_) + # self.continious_actions2discrete_actions = sym.lambdify((uc,), ud, modules=sympy_modules_) + + self.phi_x = sym.lambdify((xc,), model.phi_x(xc), modules=sympy_modules_) + self.phi_u = sym.lambdify((uc,), model.phi_u(uc), modules=sympy_modules_) + + # set labels + if self.state_labels is None: + self.state_labels = self.continuous_model.state_labels + + if 
self.action_labels is None: + self.action_labels = self.continuous_model.action_labels + + if cost is None: + self.cost = model.get_cost().discretize(dt=dt) + else: + self.cost = cost + + @property + def state_size(self): + """ + The dimension of the state vector :math:`x`, i.e. :math:`n` + :return: Dimension of the state vector :math:`n` + """ + return self._n + + @property + def action_size(self): + """ + The dimension of the action vector :math:`u`, i.e. :math:`d` + :return: Dimension of the action vector :math:`d` + """ + return self._d + + def _f_discrete_sym(self, xs, us, dt): + """ + This is a helper function. It computes the discretized dynamics as a symbolic object: + + .. math:: + x_{k+1} = f_k(x_k, u_k, t_k) + + The parameters corresponds to states and actions and are lists of the form ``[x0, x1, ..]`` and ``[u0, u1, ..]`` + where each element is a symbolic expression. The function returns a list of the form ``[f0, f1, ..]`` where + each element is a symbolic expression corresponding to a coordinate of :math:`f_k`. + + :param xs: List of symbolic expressions corresponding to the coordinates of :math:`x_k` + :param us: List of symbolic expressions corresponding to the coordinates of :math:`x_u` + :param dt: A symbolic expressions corresponding to :math:`t_k` + :return: A list of symbolic expressions corresponding to the coordinates of :math:`f_k` + """ + # xc, uc = self.sym_discrete_xu2continious_xu(xs, us) + xc, uc = self.continuous_model.phi_x_inv(xs), self.continuous_model.phi_u_inv(us) + if self.discretization_method == 'euler': + xdot = self.continuous_model.sym_f(x=xc, u=uc) + xnext = [x_ + xdot_ * dt for x_, xdot_ in zip(xc, xdot)] + elif self.discretization_method == 'ei': # Assume the continuous model is linear; a bit hacky, but use exact Exponential integration in that case + A = self.continuous_model.A + B = self.continuous_model.B + d = self.continuous_model.d + """ These are the matrices of the continuous-time problem. 
+ > dx/dt = Ax + Bu + d + and should be discretized using the exact integration technique (see (Her24, Subsection 13.1.3) and (Her24, Subsection 13.1.6)); + the precise formula you should implement is given in (Her24, eq. (13.19)) + + Remember the output matrix should be symbolic (see Euler integration for examples) but you can assume there are no variable transformations for simplicity. + """ + from scipy.linalg import expm, inv + """ + expm computes the matrix exponential: + > expm(A) = exp(A) + inv computes the inverse of a matrix inv(A) = A^{-1}. + """ + Ad = expm(A * dt) + n = Ad.shape[0] + d = d.reshape( (len(B),1) ) if d is not None else np.zeros( (n, 1) ) + Bud = B @ sym.Matrix(uc) + (sym.zeros(len(B),1) if d is None else d) + x_next = sym.Matrix(Ad) @ sym.Matrix(xc) + dt * phi1(A * dt) @ Bud + xnext = list(x_next) + else: + raise Exception("Unknown discreetization method", self.discretization_method) + # xnext, _ = self.sym_continious_xu2discrete_xu(xnext, uc) + xnext = self.continuous_model.phi_x(xnext) + return xnext + + def simulate2(self, x0, policy, t0, tF, N=1000): + policy3 = lambda x, t: self.phi_u_inv(ensure_policy(policy)(x, t)) + x, u, t = self.continuous_model.simulate(self.phi_x_inv(x0), policy3, t0, tF, N_steps=N, method='rk4') + # transform to discrete representations using phi. + xd = np.stack( [np.asarray(self.phi_x(x_)).reshape((-1,)) for x_ in x ] ) + ud = np.stack( [np.asarray(self.phi_u(u_)).reshape((-1,)) for u_ in u] ) + return xd, ud, t + + def f(self, x, u, k=0): + """ + This function implements the dynamics :math:`f_k(x_k, u_k)` of the model. They can be evaluated as: + + .. runblock:: pycon + + >>> from irlc.ex04.model_pendulum import DiscreteSinCosPendulumModel + >>> model = DiscreteSinCosPendulumModel() + >>> x = [0, 1, 0.4] + >>> u = [1] + >>> print(model.f(x,u) ) # Computes x_{k+1} = f_k(x_k, u_k) + + The model will by default be Euler discretized: + + .. 
math:: + + x_{k+1} = f_k(x_k, u_k) = x_k + \Delta f(x_k, u_k) + + except :python:`LinearQuadraticModel` which will be discretized using Exponential Integration by default. + + + :param x: The state as a numpy array + :param u: The action as a numpy array + :param k: The time step as an integer (currently this has no effect) + :return: The next state :math:`x_{k+1}` as a numpy array. + """ + fx = np.asarray( self._f_np(x, u) ) + return fx + # if compute_jacobian: + # assert False + # # J = self._f_z_np(x, u) + # return fx, J[:, :self.state_size], J[:, self.state_size:] + # else: + # return fx + + + def f_jacobian(self, x, u, k=0): + """Compute the Jacobians of the discretized dynamics. + + The function will compute the two Jacobians of the discrete dynamics :math:`f_k` with respect to :math:`x` and :math:`u`: + + .. math:: + J_x f_k(x,u), \quad J_u f_k(x, u) + + .. runblock:: pycon + + >>> from irlc.ex04.model_pendulum import DiscreteSinCosPendulumModel + >>> model = DiscreteSinCosPendulumModel() + >>> x = [0, 1, 0.4] + >>> u = [0] + >>> x_next = model.f(x,u) # f only returns the next state; the Jacobians come from f_jacobian + >>> Jx, Ju = model.f_jacobian(x,u) + >>> print("Jacobian Jx is\\n", Jx) + >>> print("Jacobian Ju is\\n", Ju) + + + :param x: The state as a numpy array + :param u: The action as a numpy array + :param k: The time step as an integer (currently this has no effect) + :return: The two Jacobians computed wrt. :math:`x` and :math:`u`. + """ + J = self._f_z_np(x, u) + return J[:, :self.state_size], J[:, self.state_size:] + + + def render(self, x=None, render_mode="human"): + return self.continuous_model.render(x=self.phi_x_inv(x), render_mode=render_mode) + + # def sym_continious_xu2discrete_xu(self, x, u): + # """ + # This (optional) function handle coordinate transformations. + # ``x`` and ``u`` are lists of symbolic expressions (the state and action), and the function then computes and return + # the forward coordinate transformation (from continuous coordinates to discrete): + # + # ..
math:: + # x_k & = \phi_x(x) \\\\ + # u_k & = \phi_u(u) + # + # :param x: Continuous state + # :param u: Continuous action + # :return: + # - ``x_k`` - Transformed (discrete) state + # - ``u_k`` - Transformed (discrete) action + # """ + # return x, u + + # def sym_discrete_xu2continious_xu(self, x_k, u_k): + # """ + # This (optional) function handle coordinate transformations. + # ``x_k`` and ``u_k`` are lists of symbolic expressions (the state and action), and the function then computes and return + # the **backward** coordinate transformation (from discrete coordinates to continuous coordinates): + # + # .. math:: + # x & = \phi^{-1}_x(x_k) \\\\ + # u & = \phi^{-1}_u(u_k) + # + # :param x_k: discrete state + # :param u_k: discrete action + # :return: + # - ``x`` - Transformed (Continuous) state + # - ``u`` - Transformed (Continuous) action + # """ + # return x_k, u_k + + def close(self): + self.continuous_model.close() + + def __str__(self): + """ + Return a string representation of the model. This is a potentially helpful way to summarize the content of the + model. You can use it as: + + .. runblock:: pycon + + >>> from irlc.ex04.model_pendulum import DiscreteSinCosPendulumModel + >>> model = DiscreteSinCosPendulumModel() + >>> print(model) + + :return: A string containing the details of the model. 
def phi1(A):
    r""" Compute the matrix function

    .. math::
        \varphi_1(A) = A^{-1} (e^A - I) = \sum_{k=1}^{\infty} \frac{A^{k-1}}{k!}

    in a way that remains well defined when :math:`A` is singular or badly conditioned.

    If :math:`A` is numerically invertible the closed-form expression is evaluated with a linear solve.
    Otherwise the power series is summed term by term until the terms are negligible.

    :param A: Square matrix as a 2d array-like. Integer (or boolean) input is promoted to floating point.
    :return: :math:`\varphi_1(A)` as a numpy ndarray of the same shape as ``A``.
    :raises AssertionError: If the power series has not converged after 100 terms.
    """
    from scipy.linalg import expm

    min_terms, max_terms, tol = 19, 100, 1e-10

    A = np.asarray(A)
    if not np.issubdtype(A.dtype, np.inexact):
        # An integer matrix cannot accumulate the fractional series terms in place.
        A = A.astype(float)

    # np.finfo(float).eps is the same machine epsilon as sys.float_info.epsilon, but does not
    # depend on ``sys`` having been imported by the surrounding module.
    if np.linalg.cond(A) < 1 / np.finfo(float).eps:
        return np.linalg.solve(A, expm(A) - np.eye(len(A)))

    # Series branch. ``term`` holds A^{k-1} / k!, updated by one multiplication per step.
    term = np.eye(len(A), dtype=A.dtype)  # k = 1: A^0 / 1!
    C = term.copy()
    for k in range(2, max_terms + 1):
        term = term @ A / k
        C += term
        # Always sum at least ``min_terms`` terms (the number used historically), then stop as
        # soon as the most recent term no longer contributes.
        if k >= min_terms and np.abs(term).sum() < tol:
            return C
    # Kept as AssertionError for backward compatibility, but raised explicitly so the check is
    # not stripped when Python runs with -O.
    raise AssertionError(f"phi1: power series did not converge within {max_terms} terms")
if __name__ == "__main__":
    # Exercise script: prints the continuous and discretized Kuramoto model, evaluates the
    # student-implemented fk / dfk_dx, and compares a step-by-step environment simulation
    # with a simulation of the continuous model.

    # Part 1: Making a model
    cmodel = KuramotoModel()
    print(cmodel)
    # Computing f_k
    dmodel = DiscreteControlModel(KuramotoModel(), dt=0.5)
    print(dmodel)  # This will print details about the discrete model.

    print("The Euler-discretized version, f_k(x,u) = x + Delta f(x,u), is")
    print("f_k(x=0,u=0) =", fk([0], [0]))
    print("f_k(x=1,u=0.3) =", fk([1], [0.3]))

    # Computing df_k / dx (The Jacobian).
    print("The derivative of the Euler discretized version wrt. x is:")
    print("df_k/dx(x=0,u=0) =", dfk_dx([0], [0]))

    # Part 2: The environment and simulation:
    env = ControlEnvironment(dmodel, Tmax=20)  # An environment that runs for 20 seconds.
    u = 1.3  # Action to take in each time step.

    ts_step = []  # Current time (according to the environment, i.e. in increments of dt).
    xs_step = []  # x_k using the env.step-function in the environment.

    x, _ = env.reset()  # Get starting state.
    ts_step.append(env.time)  # env.time keeps track of the clock-time in the environment.
    xs_step.append(x)  # Initialize with first state

    # Use
    # > next_x, cost, terminated, truncated, metadata = env.step([u])
    # to simulate a single step.
    # The names `next_x` and `terminated` used below must be defined by the line you insert.
    for _ in range(10000):
        # TODO: 1 lines missing.
        raise NotImplementedError("Insert your solution and remove this error.")
        xs_step.append(next_x)
        ts_step.append(env.time)  # This is how you get the current time (in seconds) from the environment.

        if terminated:  # Obtain 'terminated' from the step-function. It will be true when Tmax=20 seconds has passed.
            break

    x0 = cmodel.x0_bound().low  # Get the starting state x0. We exploit that the bound on x0 is an equality constraint.
    xs_rk4, us_rk4, ts_rk4 = cmodel.simulate(x0, u_fun=u, t0=0, tF=20, N_steps=100)

    plt.plot(ts_rk4, xs_rk4, 'k-', label='RK4 (nearly exact)')
    plt.plot(ts_step, xs_step, 'ro', label='RK4 (step-function in environment)')

    # Use the train-function to plot the result of simulating a random agent.
    stats, trajectories = train(env, Agent(env), return_trajectory=True)
    plt.plot(trajectories[0].time, trajectories[0].state, label='x(t) when using a random action sequence from agent')
    plt.legend()
    savepdf('kuramoto_step')
    plt.show(block=False)
    # The sign flip converts the accumulated reward reported by train into a cost.
    print("The total cost obtained using random actions", -stats[0]['Accumulated Reward'])
class LocomotiveViewer(UpgradedGraphicsUtil):
    """Viewer for the locomotive model.

    Draws a gray track polygon, the locomotive image and two red triangular markers: one at the
    current position ``x[0]`` and one at ``train.target``. Everything is rotated by the slope of
    the model via ``rotate_around``.
    """
    def __init__(self, train):
        """
        :param train: The model being visualized; ``render`` reads its ``slope`` and ``target`` attributes.
        """
        self.train = train
        width = 1100
        self.scale = width / 4  # Pixels per unit of position.
        self.dw = self.scale * 0.1  # Basic length unit used for the track thickness and the markers.
        super().__init__(screen_width=width, xmin=-width / 2, xmax=width / 2, ymin=-width / 5, ymax=width / 5, title='Locomotive environment')
        from irlc.utils.graphics_util_pygame import Object
        self.locomotive = Object("locomotive.png", image_width=90, graphics=self)

    def render(self):
        """Draw one frame from the state most recently stored by ``update``.

        The drawing order matters: background first, then the track, the locomotive and the markers.
        """
        # fugly rendering code.
        dw = self.dw
        scale = self.scale
        train = self.train
        red = (200, 40, 40)
        from irlc.utils.graphics_util_pygame import rotate_around
        # Track outline. The `*0` factors zero out two of the y-coordinates.
        ptrack = [(-2 * scale, -dw / 2*0),
                  (-2 * scale, dw / 2),
                  (2 * scale, dw / 2),
                  (2 * scale, -dw / 2*0)]
        ptrack.append( ptrack[-1])
        # NOTE(review): presumably rotate_around(points, center, angle) rotates about `center`; units of slope not visible here -- confirm.
        ptrack = rotate_around(ptrack,(0,0), -self.train.slope)
        self.draw_background(background_color=(255, 255, 255))
        self.polygon("asdf", coords=ptrack, fillColor=(int(.7 * 255),) * 3, filled=True)
        self.locomotive.surf.get_height()  # NOTE(review): the result is discarded; this looks like a leftover statement.
        self.locomotive.rotate(self.train.slope)
        p0 = (0,0)
        # Center the image at the (rotated) position, lifted by half the image height.
        self.locomotive.move_center_to_xy( *rotate_around( (self.scale * self.x[0], -self.locomotive.surf.get_height()//2), p0, -self.train.slope))
        self.locomotive.blit(self.surf)
        xx = 0*self.scale * self.x[0]  # Multiplied by 0: the target marker does not follow the train.
        triangle = [(train.target * scale - dw / 2+ xx, dw/2), (train.target * scale + xx, -0*dw / 2),
                    (train.target * scale + dw / 2 + xx, dw/2)]
        triangle = rotate_around(triangle, p0, -self.train.slope)
        ddw = dw/2
        xx = self.scale * self.x[0]  # Horizontal pixel coordinate of the train.
        trainloc = [(xx- ddw / 2, -ddw / 2), ( xx, -0 * ddw / 2), (xx + ddw / 2, -ddw / 2)]
        trainloc = rotate_around(trainloc, p0, -self.train.slope)
        # Both polygons are assigned to self.trg; only the second (target marker) handle is retained.
        self.trg = self.polygon("", coords=trainloc, fillColor=red, filled=True)
        self.trg = self.polygon("", coords=triangle, fillColor=red, filled=True)

    def update(self, x, xstar):
        """Store the state ``x`` and target ``xstar`` for the next call to ``render``.

        Note that ``render`` reads the target from ``self.train.target``, not from ``self.xstar``.
        """
        self.x = x #*self.scale
        self.xstar = xstar
class HarmonicOscilatorModel(LinearQuadraticModel):
    """
    Harmonic oscillator expressed as a linear-quadratic model.

    With spring constant ``k``, mass ``m`` and constant term ``drag``, the constructor passes

        A = [[0, 1], [-k/m, 0]],   B = [[0], [1/m]],   d = [[0], [drag/m]]

    to :class:`LinearQuadraticModel`. The state is [position, velocity].
    """
    metadata = {
        'render.modes': ['human', 'rgb_array'],
        'video.frames_per_second': 20
    }
    # NOTE(review): the original kept the reference below as a stray string literal after
    # `metadata`. It points at a Boeing 747 flight model and looks unrelated to this class.
    # See: https://books.google.dk/books?id=tXZDAAAAQBAJ&pg=PA147&lpg=PA147&dq=boeing+747+flight+0.322+model+longitudinal+flight&source=bl&ots=L2RpjCAWiZ&sig=ACfU3U2m0JsiHmUorwyq5REcOj2nlxZkuA&hl=en&sa=X&ved=2ahUKEwir7L3i6o3qAhWpl4sKHQV6CdcQ6AEwAHoECAoQAQ#v=onepage&q=boeing%20747%20flight%200.322%20model%20longitudinal%20flight&f=false

    def __init__(self, k=1., m=1., drag=0.0, Q=None, R=None):
        """
        :param k: Spring constant
        :param m: Mass
        :param drag: Constant term; enters the velocity equation as ``drag/m``
        :param Q: State cost matrix (defaults to the 2x2 identity)
        :param R: Action cost matrix (defaults to the 1x1 identity)
        """
        self.k, self.m = k, m
        dynamics = np.asarray([[0, 1],
                               [-k/m, 0]])
        gain = np.asarray([[0], [1/m]])
        offset = np.asarray([[0], [drag/m]])
        state_cost = np.eye(2) if Q is None else Q
        action_cost = np.eye(1) if R is None else R
        self.viewer = None  # Created lazily by render().
        super().__init__(A=dynamics, B=gain, Q=state_cost, R=action_cost, d=offset)

    def render(self, x, render_mode="human"):
        """ Render the environment. You don't have to understand this code. """
        import time
        if self.viewer is None:
            self.viewer = HarmonicViewer(xstar=0)  # target: x=0.
        viewer = self.viewer
        viewer.update(x)
        time.sleep(0.05)
        return viewer.blit(render_mode=render_mode)

    def close(self):
        """ Close the viewer, if one was ever created. """
        viewer = self.viewer
        if viewer is None:
            return
        viewer.close()
class LinearQuadraticModel(ControlModel):
    """
    Implements a model with update equations

    dx/dt = Ax + Bu + d
    Cost = integral_0^{t_F} (1/2 x^T Q x + 1/2 u^T R u + q' x + qc) dt
    """
    def __init__(self, A, B, Q, R, q=None, qc=None, d=None):
        """
        :param A: Matrix multiplying the state in the dynamics
        :param B: Matrix multiplying the action in the dynamics
        :param Q: Quadratic state cost
        :param R: Quadratic action cost
        :param q: Optional linear state cost
        :param qc: Optional constant cost term
        :param d: Optional constant term in the dynamics
        """
        # The cost and the matrices are stored before the parent constructor runs.
        self._cost = SymbolicQRCost(R=R, Q=Q, q=q, qc=qc)
        self.A = A
        self.B = B
        self.d = d
        super().__init__()

    def sym_f(self, x, u, t=None):
        """ Return the symbolic entries of A x + B u (+ d) as a flat list. """
        state, action = sym.Matrix(x), sym.Matrix(u)
        derivative = sym.Matrix(self.A) * state + sym.Matrix(self.B) * action
        if self.d is not None:
            derivative = derivative + sym.Matrix(self.d)
        flat = []
        for row in derivative.tolist():
            flat.extend(row)
        return flat

    def x0_bound(self) -> Box:
        """ The initial state is fixed at the origin (lower and upper bound are both 0). """
        n = self.state_size
        return Box(0, 0, shape=(n,))

    def get_cost(self):
        """ Return the cost object created in the constructor. """
        return self._cost
class PendulumModel(ControlModel):
    """
    Pendulum with state x = [theta, d theta/dt] and a single torque action.

    The class constants place the upright position at theta = 0 and the downward position at theta = pi.
    """
    state_labels = [r"$\theta$", r"$\frac{d \theta}{dt}$"]
    action_labels = ['Torque $u$']
    x_upright, x_down = np.asarray([0.0, 0.0]), np.asarray([np.pi, 0.0])

    def __init__(self, l=1., m=.8, friction=0.0, max_torque=6.0, transform_coordinates=False):
        """
        :param l: Length of the pendulum
        :param m: Mass of the pendulum
        :param friction: Must be 0.0 (anything else fails an assertion)
        :param max_torque: Magnitude of the bound on the action, see ``u_bound``
        :param transform_coordinates: Must be falsy (anything else fails an assertion)
        """
        self.l = l
        self.m = m
        self.max_torque = max_torque
        assert not transform_coordinates
        super().__init__()
        self.friction = friction
        self._u_prev = None  # For rendering
        self.cp_render = {}  # One gymnasium environment per render mode, created on demand.
        assert friction == 0.0

    def sym_f(self, x, u, t=None):
        """ Symbolic dynamics: returns [theta', theta''] for the state x = [theta, theta']. """
        g = 9.82
        length, mass = self.l, self.m
        angular_velocity = x[1]
        angular_acceleration = g/length * sym.sin(x[0]) + 1/(mass*length**2) * u[0]
        return [angular_velocity, angular_acceleration]

    def get_cost(self) -> SymbolicQRCost:
        """ Quadratic cost with Q the 2x2 identity and R = [[1]]. """
        return SymbolicQRCost(R=np.ones((1, 1)), Q=np.eye(2))

    def tF_bound(self) -> Box:
        """ The final time lies between 0.5 and 4. """
        return Box(0.5, 4, shape=(1,))

    def t0_bound(self) -> Box:
        """ The start time is fixed at 0. """
        return Box(0, 0, shape=(1,))

    def x_bound(self) -> Box:
        """ The angle is limited to [-2 pi, 2 pi]; the angular velocity is unbounded. """
        lower = np.asarray( [-2 * np.pi, -np.inf])
        upper = np.asarray( [2 * np.pi, np.inf])
        return Box(lower, upper)

    def u_bound(self) -> Box:
        """ The torque is limited to [-max_torque, max_torque]. """
        lower = np.asarray([-self.max_torque])
        upper = np.asarray([self.max_torque])
        return Box(lower, upper)

    def x0_bound(self) -> Box:
        """ The initial state is fixed at [pi, 0]. """
        return Box(np.asarray( [np.pi, 0] ), np.asarray( [np.pi, 0]))

    def xF_bound(self) -> Box:
        """ The final state is fixed at [0, 0]. """
        return Box(np.asarray([0, 0]), np.asarray([0, 0]))

    def close(self):
        """ Close every rendering environment created so far. """
        for viewer in self.cp_render.values():
            viewer.close()

    def render(self, x, render_mode="human"):
        """ Render the state x using a gymnasium "Pendulum-v1" environment (used for rendering only). """
        if render_mode not in self.cp_render:
            # First request for this mode: create, register and reset a dedicated environment.
            created = self.cp_render[render_mode] = gym.make("Pendulum-v1", render_mode=render_mode)
            created.max_time_limit = 10000
            created.reset()
        viewer = self.cp_render[render_mode]
        viewer.unwrapped.state = np.asarray(x)  # environment is wrapped
        previous = self._u_prev
        viewer.unwrapped.last_u = None if previous is None else previous[0]
        return viewer.render()
class DiscreteSinCosPendulumModel(DiscreteControlModel):
    """
    Discretized version of :class:`SinCosPendulumModel`.

    The three state labels are sin(theta), cos(theta) and the angular velocity. If no cost is
    supplied, the cost returned by ``_pendulum_cost`` is used.
    """
    # Raw strings: '\s' and '\c' are not valid escape sequences in ordinary string literals
    # (SyntaxWarning on Python 3.12+). The resulting label text is unchanged.
    state_labels = [r'$\sin(\theta)$', r'$\cos(\theta)$', r'$\dot{\theta}$']
    action_labels = ['Torque $u$']

    def __init__(self, dt=0.02, cost=None, **kwargs):
        """
        :param dt: Discretization time step
        :param cost: Optional cost; when ``None`` the default from ``_pendulum_cost`` is used
        :param kwargs: Forwarded to :class:`SinCosPendulumModel`
        """
        model = SinCosPendulumModel(**kwargs)
        self.max_torque = model.max_torque
        super().__init__(model=model, dt=dt, cost=cost)
        # The upright position expressed in the transformed coordinates.
        self.x_upright = np.asarray(self.phi_x(model.x_upright))
        self.l = model.l  # Pendulum length
        if cost is None:
            # The default cost is built from this instance, so it is created only after
            # x_upright and l have been set above.
            cost = _pendulum_cost(self)
        self.cost = cost
class PID:
    """ PID controller with gains Kp, Ki, Kd, time step dt and a scalar target. """

    def __init__(self, dt, Kp, Ki, Kd, target):
        self.Kp, self.Ki, self.Kd = Kp, Ki, Kd
        self.dt = dt  # discretization time
        self.target = target  # target, in our case just a number.
        # Internal variables for the integral/derivative terms; use these or define your own.
        # e_prior is the previous value of the error (used in the derivative term). Remember to
        # update it in the pi-function.
        self.I, self.e_prior = 0, 0

    def reset(self):
        """ Set the integral term and the previous error back to zero. """
        self.I, self.e_prior = 0, 0

    def pi(self, x):
        """
        Policy for the PID class. x is always a scalar (float) and the output u is a scalar.
        Should implement (Her24, Algorithm 19)

        :param x: Input state (float)
        :return: Action to take (float)
        """
        # TODO: 6 lines missing.
        raise NotImplementedError("Compute u here.")
        return u
if __name__ == "__main__":
    # Run the PID car agent for a single episode and plot velocity and distance to the centerline.
    from irlc.ex01.agent import train
    from irlc.car.car_model import CarEnvironment
    import matplotlib.pyplot as plt

    env = CarEnvironment(noise_scale=0,Tmax=30, max_laps=1, render_mode='human')
    agent = PIDCarAgent(env, v_target=1, use_both_x5_x3=True)  # I recommend lowering v_target to make the problem simpler.

    stats, trajectories = train(env, agent, num_episodes=1, return_trajectory=True)
    env.close()
    t = trajectories[0]
    plt.clf()
    # Plot against the recorded time so the x-axis matches its "Time/seconds" label
    # (previously the curves were drawn against the sample index).
    plt.plot(t.time, t.state[:,0], label="velocity" )
    plt.plot(t.time, t.state[:,5], label="s (distance to center)" )
    plt.xlabel("Time/seconds")
    plt.legend()
    savepdf("pid_car_agent")
    plt.show()
def pid_locomotive():
    """
    Run three PID experiments on the locomotive environment and save one figure for each:

    - ``pid_locomotive_Kp``: proportional control only.
    - ``pid_locomotive_Kd``: proportional control with different derivative gains.
    - ``pid_locomotive_Ki``: uphill slope and a new target, with and without an integral term.
    """
    dt = .08
    m = 70
    Tmax = 15

    env = LocomotiveEnvironment(m=m, slope=0, dt=dt, Tmax=Tmax, render_mode='human')
    Kp = 40
    agent = PIDLocomotiveAgent(env, dt=dt, Kp=Kp, Ki=0, Kd=0, target=0)
    stats, traj = train(env, agent, return_trajectory=True)
    plt.plot(traj[0].time, traj[0].state[:, 0], '-', label=f"$K_p={Kp}$")
    fixplt()
    savepdf('pid_locomotive_Kp')
    plt.show()

    # Now include a derivative term:
    for Kd in [10, 50, 100]:
        agent = PIDLocomotiveAgent(env, dt=dt, Kp=Kp, Ki=0, Kd=Kd, target=0)
        stats, traj = train(env, agent, return_trajectory=True)
        plt.plot(traj[0].time, traj[0].state[:, 0], '-', label=f"$K_p={Kp}, K_d={Kd}$")
    fixplt()
    savepdf('pid_locomotive_Kd')
    plt.show()
    env.close()

    # Integral test: Include a slope term. For fun, let's also change the target.
    env = LocomotiveEnvironment(m=m, slope=2, dt=dt, Tmax=20, target=1, render_mode='human')
    # Fix the derivative gain explicitly. The legend previously reused the loop variable left
    # over from the experiment above (100) although the agent runs with Kd=50.
    Kd = 50
    for Ki in [0, 10]:
        agent = PIDLocomotiveAgent(env, dt=dt, Kp=Kp, Ki=Ki, Kd=Kd, target=1)
        stats, traj = train(env, agent, return_trajectory=True)
        x = traj[0].state
        tt = traj[0].time
        plt.plot(tt, x[:, 0], '-', label=f"$K_p={Kp}, K_i={Ki}, K_d={Kd}$")
    fixplt()
    savepdf('pid_locomotive_Ki')
    plt.show()
    env.close()
class ApolloLunarAgent(Agent):
    # Agent that steers the lunar lander with two PID controllers: one for the altitude and
    # one for the angle. The controllers are (re)created whenever pi is called with k == 0.
    # The error lists are created once in __init__ and only appended to afterwards.
    def __init__(self, env, dt, Kp_altitude=18, Kd_altitude=13, Kp_angle=-18, Kd_angle=-18):
        """ Set up PID parameters for the two controllers (one controlling the altitude, another the angle of the lander)

        :param env: The environment (passed on to the Agent base class)
        :param dt: Discretization time handed to both PID controllers
        :param Kp_altitude: Proportional gain of the altitude controller
        :param Kd_altitude: Derivative gain of the altitude controller
        :param Kp_angle: Proportional gain of the angle controller
        :param Kd_angle: Derivative gain of the angle controller
        """
        self.Kp_altitude = Kp_altitude
        self.Kd_altitude = Kd_altitude
        self.Kp_angle = Kp_angle
        self.Kd_angle = Kd_angle
        self.error_angle = []  # History of pid_ang.e_prior, one entry per call to pi.
        self.error_altitude = []  # History of pid_alt.e_prior, one entry per call to pi.
        self.dt = dt
        super().__init__(env)

    def pi(self, x, k, info=None):
        """ From documentation: https://github.com/openai/gym/blob/master/gym/envs/box2d/lunar_lander.py
             x (list): The state. Attributes:
              x[0] is the horizontal coordinate
              x[1] is the vertical coordinate
              x[2] is the horizontal speed
              x[3] is the vertical speed
              x[4] is the angle
              x[5] is the angular speed
              x[6] 1 if first leg has contact, else 0
              x[7] 1 if second leg has contact, else 0

        Your implementation should follow what happens in:

        https://github.com/wfleshman/PID_Control/blob/master/pid.py

        I.e. you have to compute the target for the angle and altitude as done in the code (and explained in the documentation).
        Note the target for the PID controllers is 0.

        :param x: The state (see above)
        :param k: The time step; the PID controllers are created when k == 0
        :param info: Unused
        :return: The action [alt_adj, ang_adj] as a numpy array, clipped to [-1, 1] and set to zero once a leg has contact.
        """
        if k == 0:
            """ At time t=0 we set up the two PID controllers. You don't have to change these lines. """
            self.pid_alt = PID(dt=self.dt, Kp=self.Kp_altitude, Kd=self.Kd_altitude, Ki=0, target=0)
            self.pid_ang = PID(dt=self.dt, Kp=self.Kp_angle, Kd=self.Kd_angle, Ki=0, target=0)

        """ Compute the PID control signals using two calls to the PID controllers such as: """
        # alt_adj = self.pid_alt.pi(...)
        # ang_adj = self.pid_ang.pi(...)
        """ You need to specify the inputs to the controllers. Look at the code in the link above and implement a comparable control rule.
        The inputs you give to the controller will be simple functions of the coordinates of x, i.e. x[0], x[1], and so on.
        """
        # The two lines you insert must define `alt_adj` and `ang_adj`, which are used below.
        # TODO: 2 lines missing.
        raise NotImplementedError("Compute the alt_adj and ang_adj as in the gitlab repo (see code comment).")

        u = np.array([alt_adj, ang_adj])
        u = np.clip(u, -1, +1)

        # If the legs are on the ground we made it, kill engines
        if (x[6] or x[7]):
            u[:] = 0
        # Record stats.
        self.error_altitude.append(self.pid_alt.e_prior)
        self.error_angle.append(self.pid_ang.e_prior)
        return u
+ + agent = get_lunar_lander(env) + stats, traj = train(env, agent, return_trajectory=True, num_episodes=1) + env.close() + if traj[0].reward[-1] == 100: + print("A small step for man, a giant leap for mankind!") + elif traj[0].reward[-1] == -100: + print(speech) + else: + print("Environment timed out and the lunar module is just kind of floating around") + + states = np.stack(traj[0].state) + plt.plot(states[:, 0], label='x') + plt.plot(states[:, 1], label='y') + plt.plot(states[:, 2], label='vx') + plt.plot(states[:, 3], label='vy') + plt.plot(states[:, 4], label='theta') + plt.plot(states[:, 5], label='vtheta') + plt.legend() + plt.grid() + plt.ylim(-1.1, 1.1) + plt.title('PID Control') + plt.ylabel('Value') + plt.xlabel('Steps') + savepdf("pid_lunar_trajectory") + plt.show(block=False) + +def lunar_average_performance(): + env = gym.make('LunarLanderContinuous-v2', render_mode=None) # Set render_mode = 'human' to see what it does. + env._max_episode_steps = 1000 # To avoid the environment timing out after just 200 steps + + agent = get_lunar_lander(env) + stats, traj = train(env, agent, return_trajectory=True, num_episodes=20) + env.close() + + n_won = sum([np.sum(t.reward[-1] == 100) for t in traj]) + n_lost = sum([np.sum(t.reward[-1] == -100) for t in traj]) + print("Successfull landings: ", n_won, "of 20") + print("Unsuccessfull landings: ", n_lost, "of 20") + +if __name__ == "__main__": + lunar_single_mission() + lunar_average_performance() diff --git a/irlc/ex04/pid_pendulum.py b/irlc/ex04/pid_pendulum.py new file mode 100644 index 0000000000000000000000000000000000000000..82e865b853b734b3003f42a04df6cba43f3b9a2e --- /dev/null +++ b/irlc/ex04/pid_pendulum.py @@ -0,0 +1,74 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(32)
from irlc import Agent, savepdf
from irlc.ex04.pid import PID
from irlc.ex01.agent import train

class PIDPendulumAgent(Agent):
    """Agent that controls the pendulum with a single PID controller."""

    def __init__(self, env, dt, Kp=1.0, Ki=0.0, Kd=0.0, target_angle=0):
        """ Create the PID controller used by the agent.

        :param env: The environment the agent acts in.
        :param dt: Time discretization passed on to the PID controller.
        :param Kp: Proportional gain.
        :param Ki: Integral gain.
        :param Kd: Derivative gain.
        :param target_angle: The target handed to the PID controller.
        """
        self.pid = PID(dt=dt, Kp = Kp, Ki=Ki, Kd=Kd, target=target_angle)
        super().__init__(env)

    def pi(self, x, k, info=None):
        """ Compute the action using self.pid. You have to think about which input to give the controller. """
        # TODO: 2 lines missing.
        raise NotImplementedError("Implement function body")
        return u


def get_offbalance_pendulum(waiting_steps=30):
    """Return a pendulum environment that has been pushed away from its initial state.

    The environment is reset, both state coordinates are set to 0, and the action ``1`` is then applied
    for ``waiting_steps`` steps.
    """
    from irlc.ex04.model_pendulum import ThetaPendulumEnvironment
    env = ThetaPendulumEnvironment(Tmax=10, render_mode='human')

    env.reset()
    env.state[0] = 0
    env.state[1] = 0
    for _ in range(waiting_steps):  # Simulate the environment for waiting_steps steps to get things out of balance.
        env.step(1)
    return env

def plot_trajectory(trajectory):
    """Plot the two state coordinates of ``trajectory`` (angle and angular speed) against time."""
    t = trajectory
    plt.plot(t.time, t.state[:,0], label="Angle $\\theta$" )
    # \dot{\theta} renders theta with a dot accent; the previous \cdot{\theta} rendered a product dot followed by theta.
    plt.plot(t.time, t.state[:,1], label="Angular speed $\\dot{\\theta}$")
    plt.xlabel("Time")
    plt.legend()


target_angle = np.pi/6  # The target angle for the second task in the pendulum problem.
if __name__ == "__main__":
    """
    First task: Bring the balance upright from a slightly off-center position.
    For this task, we do not care about the x-position, only the angle theta which should be 0 (upright)
    """
    env = get_offbalance_pendulum(30)
    ## TODO: Half of each line of code in the following 1 lines have been replaced by garbage. Make it work and remove the error.
    #----------------------------------------------------------------------------------------------------------------------------
    # agent = PIDPendulumAgent(env, dt=env.??????????????????????????????????????
    raise NotImplementedError("Define your agent here (including parameters)")
    _, trajectories = train(env, agent, num_episodes=1, return_trajectory=True, reset=False)  # Note reset=False to maintain initial conditions.
    env.close()
    plot_trajectory(trajectories[0])
    savepdf("pid_pendulumA")
    plt.show()

    """
    Second task: We will now try to get to a target angle of target_angle=np.pi/6.
    """
    env = get_offbalance_pendulum(30)
    ## TODO: Half of each line of code in the following 1 lines have been replaced by garbage. Make it work and remove the error.
    #----------------------------------------------------------------------------------------------------------------------------
    # agent = PIDPendulumAgent(env, dt=env.dt,?????????????????????????????????????????
    raise NotImplementedError("Define your agent here (include the target_angle parameter to the agent!)")
    _, trajectories = train(env, agent, num_episodes=1, return_trajectory=True, reset=False)  # Note reset=False to maintain initial conditions.
    env.close()
    plot_trajectory(trajectories[0])
    print("Final state is x(t_F) =", trajectories[0].state[-1], f"goal [{target_angle:.2f}, 0]")
    savepdf("pid_pendulumB")
    plt.show()

# diff --git a/irlc/ex05/__init__.py b/irlc/ex05/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..23f7751213a745f38295d78a273a61b6a1ce7ffc --- /dev/null +++ b/irlc/ex05/__init__.py @@ -0,0 +1,2 @@
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
"""This directory contains the exercises for week 5."""
# diff --git a/irlc/ex05/__pycache__/__init__.cpython-311.pyc b/irlc/ex05/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1b9bf1d95f19da97a2af54288ce60d0d9100ed11 Binary files /dev/null and b/irlc/ex05/__pycache__/__init__.cpython-311.pyc differ
# diff --git a/irlc/ex05/__pycache__/direct.cpython-311.pyc b/irlc/ex05/__pycache__/direct.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..76f0a425273c0209db1eb6913f557682f56014ee Binary files /dev/null and b/irlc/ex05/__pycache__/direct.cpython-311.pyc differ
# diff --git a/irlc/ex05/direct.py b/irlc/ex05/direct.py new file mode 100644 index 0000000000000000000000000000000000000000..b38379afda4c16bc99e554214e5bff7e8f96d3c6 --- /dev/null +++ b/irlc/ex05/direct.py @@ -0,0 +1,370 @@
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
"""

References:
  [Her24] Tue Herlau. Sequential decision making. (Freely available online), 2024.
"""
from irlc.ex03.control_model import ControlModel
import numpy as np
import sympy as sym
import sys
from scipy.optimize import Bounds, minimize
from scipy.interpolate import interp1d
from irlc.ex03.control_model import symv
from irlc.ex04.discrete_control_model import sympy_modules_
from irlc import Timer
from tqdm import tqdm

def bounds2fun(t0 : float, tF : float, bounds : Bounds):
    """
    Given start and end times [t0, tF] and a scipy Bounds object with upper/lower bounds on some variable x, i.e. so that:

    > bounds.lb <= x <= bounds.ub

    this function returns a new function f such that f(t0) equals bounds.lb and f(tF) = bounds.ub and
    f(t) interpolates between the upper/lower bounds linearly, i.e.

    > bounds.lb <= f(t) <= bounds.ub

    The function will return a numpy ``ndarray``.

    NOTE(review): the implementation iterates over ``bounds``, and ``direct_solver`` only calls this
    function when the value is a ``list``; presumably ``bounds`` is a two-element list ``[lower, upper]``
    in practice -- confirm against callers.
    """
    # Each element of bounds is flattened to a vector; the vectors become the columns that interp1d interpolates between.
    return interp1d(np.asarray([t0, tF]), np.stack([np.reshape(b, (-1,)) for b in bounds], axis=1))

def direct_solver(model, options):
    """
    Main direct solver method, see (Her24, Algorithm 21). Given a list of options of length S, the solver performs collocation
    using the settings found in the dictionary options[i], and use the result of options[i] to initialize collocation on options[i+1].

    This iterative refinement scheme is required to obtain good overall solutions.

    :param model: A ContinuousTimeModel instance
    :param options: An options-structure. This is a list of dictionaries of options for each collocation iteration
    :return: A list of solutions, one for each collocation step. The last will be the 'best' solution (highest N)

    """
    if isinstance(options, dict):
        options = [options]  # A single options-dictionary is treated as a list of length one.
    solutions = []  # re-use result of current solutions to initialize next with a higher value of N
    for i, opt in enumerate(options):
        optimizer_options = opt['optimizer_options']  # to be passed along to minimize()
        if i == 0 or "guess" in opt:
            # No solutions-function is given. Re-calculate by linearly interpreting bounds (see (Her24, Subsection 15.3.4))
            # Note: the first options-dictionary must contain a 'guess' key, and the guess dictionary is updated in place.
            guess = opt['guess']
            guess['u'] = bounds2fun(guess['t0'],guess['tF'],guess['u']) if isinstance(guess['u'], list) else guess['u']
            guess['x'] = bounds2fun(guess['t0'],guess['tF'],guess['x']) if isinstance(guess['x'], list) else guess['x']
        else:
            """ For an iterative solver ((Her24, Subsection 15.3.4)), initialize the guess at iteration i to be the solution at iteration i-1.
            The guess consists of a guess for t0, tF (just numbers) as well as x, u (state/action trajectories),
            the later two being functions. The format of the guess is just a dictionary (you have seen several examples)
            i.e.

            > guess = {'t0': (number), 'tF': (number), 'x': (function), 'u': (function)}

            and you can get the solution by using solutions[i - 1]['fun']. (insert a breakpoint and check the fields)
            """
            # TODO: 1 lines missing.
            raise NotImplementedError("Define guess = {'t0': ..., ...} here.")
        N = opt['N']
        print(f"{i}> Collocation starting with grid-size N={N}")
        sol = collocate(model, N=N, optimizer_options=optimizer_options, guess=guess, verbose=opt.get('verbose', False))
        solutions.append(sol)

    print("Was collocation success full at each iteration?")
    for i, s in enumerate(solutions):
        print(f"{i}> Success? {s['solver']['success']}")
    return solutions

def collocate(model : ControlModel, N=25, optimizer_options=None, guess : dict = None, verbose=True):
    r"""
    Performs collocation by discretizing the model using a grid-size of N and optimize to find the optimal solution.
    The 'model' should be a ControlModel instance, optimizer_options contains options for the optimizer, and guess
    is a dictionary used to initialize the optimizer containing keys::

        guess = {'t0': Start time (float),
                 'tF': Terminal time (float),
                 'x': A *function* which takes time as input and return a guess for x(t),
                 'u': A *function* which takes time as input and return a guess for u(t),
                }

    So for instance

    .. code-block:: python

        guess['x'](0.5)

    will return the state :math:`\mathbf x(0.5)` as a numpy ndarray.

    The overall structure of the optimization procedure is as follows:

    #. Define the following variables. They will all be lists:
        - ``z``: Variables to be optimized over. Each element ``z[k]`` is a symbolic variable. This will allow us to compute derivatives.
        - ``z0``: A list of numbers representing the initial guess. Computed using 'guess' (above)
        - ``z_lb``, ``z_ub``: Lists of numbers representing the upper/lower bounds on z. Use bound-methods in :class:`irlc.ex03.control_model.ControlModel` to get these.
    #. Create a symbolic expression representing the cost-function J
        This is defined using the symbolic variables similar to the toy-problem we saw last week. This allows us to compute derivatives of the cost
    #. Create *symbolic* expressions representing all constraints
        The lists ``Iineq`` and ``Ieq`` contains *lists* of constraints. The solver will ensure that for any i::

            Ieq[i] == 0

        and::

            Iineq[i] <= 0

        This allows us to just specify each element in 'eqC' and 'ineqC' as a single symbolic expression. Once more, we use symbolic expressions so
        derivatives can be computed automatically. The most important constraints are in 'eqC', as these must include the collocation-constraints (see algorithm in notes)
    #. Compile all symbolic expressions into a format useful for the optimizer
        The optimizer accepts numpy functions, so we turn all symbolic expressions and derivatives into numpy (similar to the example last week).
        It is then fed into the optimizer and, fingers crossed, the optimizer spits out a value 'z*', which represents the optimal values.

    #. Unpack z:
        The value 'z*' then has to be unpacked and turned into function u*(t) and x*(t) (as in the notes). These functions can then be put into the
        solution-dictionary and used to initialize the next guess (or assuming we terminate, these are simply our solution).

    :param model: A :class:`irlc.ex03.control_model.ControlModel` instance
    :param N: The number of collocation knot points :math:`N`
    :param optimizer_options: Options for the scipy optimizer. You can ignore this.
    :param guess: A dictionary containing the initial guess. See the online documentation.
    :param verbose: Whether to print out extra details during the run. Useful only for debugging.
    :return: A dictionary containing the solution. It is compatible with the :python:`guess` datastructure .
    """
    timer = Timer(start=True)
    cost = model.get_cost()
    t0, tF = sym.symbols("t0"), sym.symbols("tF")
    ts = t0 + np.linspace(0, 1, N) * (tF-t0)   # N points linearly spaced between [t0, tF] TODO: Convert this to a list.
    xs, us = [], []
    for i in range(N):
        # One list of symbolic state/action variables per knot point.
        xs.append(list(symv("x_%i_" % i, model.state_size)))
        us.append(list(symv("u_%i_" % i, model.action_size)))

    ''' (1) Construct guess z0, all simple bounds [z_lb, z_ub] for the problem and collect all symbolic variables as z '''
    # sb = model.simple_bounds()  # get simple inequality boundaries in problem (v_lb <= v <= v_ub)
    z = []  # list of all *symbolic* variables in the problem
    # These lists contain the guess z0 and lower/upper bounds (list-of-numbers): z_lb[k] <= z0[k] <= z_ub[k].
    # They should be lists of *numbers*.
    z0, z_lb, z_ub = [], [], []
    ts_eval = sym.lambdify((t0, tF), ts.tolist(), modules='numpy')  # numeric knot times as a function of (t0, tF)
    for k in range(N):
        # The first knot uses the x0-bounds, the last knot the xF-bounds, and all others the general x-bounds.
        x_low = list(model.x0_bound().low if k == 0 else (model.xF_bound().low if k == N - 1 else model.x_bound().low))
        x_high = list(model.x0_bound().high if k == 0 else (model.xF_bound().high if k == N - 1 else model.x_bound().high))
        u_low, u_high = list(model.u_bound().low), list(model.u_bound().high)

        tk = ts_eval(guess['t0'], guess['tF'])[k]
        """ In these lines, update z, z0, z_lb, and z_ub with values corresponding to xs[k], us[k].
        The values are all lists; i.e. z[j] (symbolic) has guess z0[j] (float) and bounds z_lb[j], z_ub[j] (floats) """
        # TODO: 2 lines missing.
        raise NotImplementedError("Updates for x_k, u_k")

    """ Update z, z0, z_lb, and z_ub with bounds/guesses corresponding to t0 and tF (same format as above). """
    # z, z0, z_lb, z_ub = z+[t0], z0+[guess['t0']], z_lb+[model.bounds['t0_low']], z_ub+[model.bounds['t0_high']]
    # TODO: 2 lines missing.
    raise NotImplementedError("Updates for t0, tF")
    assert len(z) == len(z0) == len(z_lb) == len(z_ub)
    if verbose:
        print(f"z={z}\nz0={np.asarray(z0).round(1).tolist()}\nz_lb={np.asarray(z_lb).round(1).tolist()}\nz_ub={np.asarray(z_ub).round(1).tolist()}")
    print(">>> Trapezoid collocation of problem")  # problem in this section
    fs, cs = [], []  # lists of symbolic variables corresponding to f_k and c_k, see (Her24, Algorithm 20).
    for k in range(N):
        """ Update both fs and cs; these are lists of symbolic expressions such that fs[k] corresponds to f_k and cs[k] to c_k in the slides.
        Use the functions env.sym_f and env.sym_c """
        # fs.append( symbolic variable corresponding to f_k; see env.sym_f). similarly update cs.append(env.sym_c(...) ).
        ## TODO: Half of each line of code in the following 2 lines have been replaced by garbage. Make it work and remove the error.
        #----------------------------------------------------------------------------------------------------------------------------
        # fs.append(model.sym_f(x=?????????????????????????
        # cs.append(cost.sym_c(x=x????????????????????????
        raise NotImplementedError("Compute f[k] and c[k] here (see slides) and add them to above lists")

    J = cost.sym_cf(x0=xs[0], t0=t0, xF=xs[-1], tF=tF)  # terminal cost; you need to update this variable with all the cs[k]'s.
    Ieq, Iineq = [], []  # all symbolic equality/inequality constraints are stored in these lists
    for k in range(N - 1):
        # Update cost function ((Her24, eq. (15.15))). Use the above defined symbolic expressions ts, hk and cs.
        # TODO: 2 lines missing.
        raise NotImplementedError("Update J here")
        # Set up equality constraints. See (Her24, eq. (15.18)).
        for j in range(model.state_size):
            """Create all collocation equality-constraints here and add them to Ieq. I.e.

            xs[k+1] - xs[k] = 0.5 h_k (f_{k+1} + f_k)

            Note we have to create these coordinate-wise which is why we loop over j.
            """
            ## TODO: Half of each line of code in the following 1 lines have been replaced by garbage. Make it work and remove the error.
            #----------------------------------------------------------------------------------------------------------------------------
            # Ieq.append((xs[k+1][j] - xs[k][j])??????????????????????????????????
            raise NotImplementedError("Update collocation constraints here")
        """
        To solve problems with dynamical path constriants like Brachiostone, update Iineq here to contain the
        inequality constraint model.sym_h(...) <= 0. For the other problems this can simply be left blank """
        if hasattr(model, 'sym_h'):
            # TODO: 1 lines missing.
            raise NotImplementedError("Update symbolic path-dependent constraint h(x,u,t)<=0 here")

    print(">>> Creating objective and derivative...")
    timer.tic("Building symbolic objective")
    J_fun = sym.lambdify([z], J, modules='numpy')  # create a python function from symbolic expression
    # To compute the Jacobian, you can use sym.derive_by_array(J, z) to get the correct symbolic expression, then use sym.lambdify (as above) to get a numpy function.
    ## TODO: Half of each line of code in the following 1 lines have been replaced by garbage. Make it work and remove the error.
    #----------------------------------------------------------------------------------------------------------------------------
    # J_jac = sym.lambdify([z], sym.deri???????????????????????????????????
    raise NotImplementedError("Jacobian of J. See how this is computed for equality/inequality constratins for help.")
    if verbose:
        print(f"{Ieq=}\n{Iineq=}\n{J=}")
    timer.toc()
    print(">>> Differentiating equality constraints..."), timer.tic("Differentiating equality constraints")
    constraints = []
    for eq in tqdm(Ieq, file=sys.stdout):  # don't write to error output.
        constraints.append(constraint2dict(eq, z, type='eq'))
    timer.toc()
    print(">>> Differentiating inequality constraints"), timer.tic("Differentiating inequality constraints")
    constraints += [constraint2dict(ineq, z, type='ineq') for ineq in Iineq]
    timer.toc()

    # Total amount by which the initial guess z0 lies outside the simple bounds [z_lb, z_ub].
    c_viol = sum(abs(np.minimum(z_ub - np.asarray(z0), 0))) + sum(abs(np.maximum(np.asarray(z_lb) - np.asarray(z0), 0)))
    if c_viol > 0:  # check if: z_lb <= z0 <= z_ub. Violations only serious if large
        print(f">>> Warning! Constraint violations found of total magnitude: {c_viol:4} before optimization")

    print(">>> Running optimizer..."), timer.tic("Optimizing")
    z_B = Bounds(z_lb, z_ub)
    res = minimize(J_fun, x0=z0, method='SLSQP', jac=J_jac, constraints=constraints, options=optimizer_options, bounds=z_B)
    # Compute value of equality constraints to check violations
    timer.toc()
    eqC_fun = sym.lambdify([z], Ieq)
    eqC_val_ = eqC_fun(res.x)
    eqC_val = np.zeros((N - 1, model.state_size))

    x_res = np.zeros((N, model.state_size))
    u_res = np.zeros((N, model.action_size))
    # The unpacking below reads t0 and tF from the last two entries of the optimized vector.
    t0_res = res.x[-2]
    tF_res = res.x[-1]

    # Each knot point occupies m consecutive entries of res.x: first the state, then the action.
    m = model.state_size + model.action_size
    for k in range(N):
        dx = res.x[k * m:(k + 1) * m]
        if k < N - 1:
            eqC_val[k, :] = eqC_val_[k * model.state_size:(k + 1) * model.state_size]
        x_res[k, :] = dx[:model.state_size]
        u_res[k, :] = dx[model.state_size:]

    # Generate solution structure
    ts_numpy = ts_eval(t0_res, tF_res)
    # make linear interpolation similar to (Her24, eq. (15.22))
    ufun = interp1d(ts_numpy, np.transpose(u_res), kind='linear')
    # Evaluate function values fk points (useful for debugging but not much else):
    f_eval = sym.lambdify((t0, tF, xs, us), fs)
    fs_numpy = f_eval(t0_res, tF_res, x_res, u_res)
    fs_numpy = np.asarray(fs_numpy)

    r""" Interpolate to get x(t) as described in (Her24, eq. (15.26)). The function should accept both lists and numbers for t."""
    x_fun = lambda t_new: np.stack([trapezoid_interpolant(ts_numpy, np.transpose(x_res), np.transpose(fs_numpy), t_new=t) for t in np.reshape(np.asarray(t_new), (-1,))], axis=1)

    if verbose:
        newt = np.linspace(ts_numpy[0], ts_numpy[-1], len(ts_numpy)-1)
        print( x_fun(newt) )

    sol = {
        'grid': {'x': x_res, 'u': u_res, 'ts': ts_numpy, 'fs': fs_numpy},
        'fun': {'x': x_fun, 'u': ufun, 'tF': tF_res, 't0': t0_res},
        'solver': res,
        'eqC_val': eqC_val,
        'inputs': {'z': z, 'z0': z0, 'z_lb': z_lb, 'z_ub': z_ub},
    }
    print(timer.display())
    return sol

def trapezoid_interpolant(ts : list, xs : list, fs : list, t_new=None):
    r"""
    This function implements (Her24, eq. (15.26)) to evaluate :math:`\mathbf{x}(t)` at a point :math:`t =` ``t_new``.

    The other inputs represent the output of the direct optimization procedure. I.e., ``ts`` is a list of length
    :math:`N+1` corresponding to :math:`t_k`, ``xs`` is a list of :math:`\mathbf x_k`, and ``fs`` is a list corresponding
    to :math:`\mathbf f_k`. To implement the method, you should first determine which :math:`k` the new time point ``t_new``
    corresponds to, i.e. where :math:`t_k \leq t_\text{new} < t_{k+1}`.


    :param ts: List of time points ``[.., t_k, ..]``
    :param xs: List of numpy ndarrays ``[.., x_k, ...]``
    :param fs: List of numpy ndarrays ``[.., f_k, ...]``
    :param t_new: The time point we should evaluate the function in.
    :return: The state evaluated at time ``t_new``, i.e. :math:`\mathbf x(t_\text{new})`.
    """
    # TODO: 3 lines missing.
    raise NotImplementedError("Determine the time index k here so that ts[k] <= t_new < ts[k+1].")

    ts = np.asarray(ts)
    tau = t_new - ts[k]  # time elapsed since knot point k
    hk = ts[k + 1] - ts[k]  # length of the interval [t_k, t_{k+1}]
    r"""
    Make interpolation here. Should be a numpy array of dimensions [xs.shape[0], len(I)]
    What the code does is that for each t in ts, we work out which knot-point interval the code falls within. I.e.
    insert a breakpoint and make sure you understand what e.g. the code tau = t_new - ts[I] does.

    Given this information, we can recover the relevant (evaluated) knot-points as for instance
    fs[:,I] and those at the next time step as fs[:,I]. With this information, the problem is simply an
    implementation of (Her24, eq. (15.26)), i.e.

    > x_interp = xs[:,I] + tau * fs[:,I] + (...)

    """
    ## TODO: Half of each line of code in the following 1 lines have been replaced by garbage. Make it work and remove the error.
    #----------------------------------------------------------------------------------------------------------------------------
    # x_interp = xs[:, k] + tau * fs[:, k] + (tau ????????????????????????????????????????????
    raise NotImplementedError("Insert your solution and remove this error.")
    return x_interp


def constraint2dict(symb, all_vars, type='eq'):
    ''' Turn constraints into a dict with type, fun, and jacobian field.

    :param symb: Symbolic expression of the constraint.
    :param all_vars: List of all symbolic variables; the returned functions take one vector argument in this order.
    :param type: Either ``'eq'`` or ``'ineq'``; inequality expressions are negated before compilation.
    :return: A dictionary with keys ``'type'``, ``'fun'`` and ``'jac'``.
    '''
    if type == "ineq": symb = -1 * symb  # To agree with sign convention in optimizer

    f = sym.lambdify([all_vars], symb, modules=sympy_modules_)
    # np.atan = np.arctan  # Monkeypatch numpy to contain atan. Passing "numpy" does not seem to fix this.
    jac = sym.lambdify([all_vars], sym.derive_by_array(symb, all_vars), modules=sympy_modules_)
    eq_cons = {'type': type,
               'fun': f,
               'jac': jac}
    return eq_cons

def get_opts(N, ftol=1e-6, guess=None, verbose=False):  # helper function to instantiate options object.
    """Build the options-dictionary for one collocation iteration.

    :param N: Grid size stored under ``'N'``.
    :param ftol: Tolerance stored in ``optimizer_options['ftol']``.
    :param guess: Optional initial guess; the ``'guess'`` key is only added when this is truthy.
    :param verbose: Stored under ``'verbose'``.
    :return: The options-dictionary.
    """
    d = {'N': N,
         'optimizer_options': {'maxiter': 1000,
                               'ftol': ftol,
                               'iprint': 1,
                               'disp': True,
                               'eps': 1.5e-8},  # 'eps': 1.4901161193847656e-08,
         'verbose': verbose}
    if guess:
        d['guess'] = guess
    return d

def guess(model : ControlModel):
    """Construct a simple initial guess from the bounds of ``model``.

    Non-finite bound entries are replaced by 0. ``t0`` is 0 and ``tF`` is the upper bound on the terminal
    time (or 10 if that bound is not finite). ``'x'`` and ``'u'`` are two-element lists of bound vectors.
    """
    def mfin(z):
        # Replace every non-finite entry by 0.
        return [z_ if np.isfinite(z_) else 0 for z_ in z]
    xL = mfin(model.x0_bound().low)
    xU = mfin(model.xF_bound().high)
    tF = 10 if not np.isfinite(model.tF_bound().high[0]) else model.tF_bound().high[0]
    gs = {'t0': 0,
          'tF': tF,
          'x': [xL, xU],
          'u': [mfin(model.u_bound().low), mfin(model.u_bound().high)]}
    return gs


def run_direct_small_problem():
    """Solve the sin/cos pendulum problem on a grid of N=5 points and return ``(model, solutions)``."""
    from irlc.ex04.model_pendulum import SinCosPendulumModel
    model = SinCosPendulumModel()
    """
    Test out implementation on a very small grid. The overall solution will be fairly bad,
    but we can print out the various symbolic expressions

    We use verbose=True to get debug-information.
    """
    print("Solving with a small grid, N=5")
    options = [get_opts(N=5, ftol=1e-3, guess=guess(model), verbose=True)]
    solutions = direct_solver(model, options)
    return model, solutions


if __name__ == "__main__":
    from irlc.ex05.direct_plot import plot_solutions
    model, solutions = run_direct_small_problem()
    plot_solutions(model, solutions, animate=False, pdf="direct_pendulum_small")

# diff --git a/irlc/ex05/direct_agent.py b/irlc/ex05/direct_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..e8cbca2f99735e27da5518d5f686194c82211f71 --- /dev/null +++ b/irlc/ex05/direct_agent.py @@ -0,0 +1,77 @@
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
from irlc.ex05.direct import direct_solver, get_opts, guess
from irlc.ex04.model_pendulum import SinCosPendulumModel
from irlc.ex04.discrete_control_model import DiscreteControlModel
from irlc.ex04.control_environment import ControlEnvironment
from irlc import train
from irlc import Agent
import numpy as np
import matplotlib.pyplot as plt
from irlc import savepdf
from irlc.ex05.direct_plot import plot_solutions

class DirectAgent(Agent):
    """Agent that plans once with the direct solver and then follows the planned action function."""

    def __init__(self, env: ControlEnvironment, options=None):
        """Run the direct solver on the continuous-time model behind ``env`` and store the result.

        :param env: The control environment; its ``discrete_model.continuous_model`` is used for planning.
        :param options: Options list for ``direct_solver``. If ``None``, a two-stage schedule (N=10, then N=60) is used.
        """
        discrete = env.discrete_model
        continuous = discrete.continuous_model  # Get the continuous-time model for planning

        if options is None:
            coarse = get_opts(N=10, ftol=1e-3, guess=guess(continuous), verbose=False)
            fine = get_opts(N=60, ftol=1e-6, verbose=False)
            options = [coarse, fine]
        solutions = direct_solver(continuous, options)

        # The next 3 assignments are for plotting purposes. You can ignore them.
        final_grid = solutions[-1]['grid']
        self.x_grid = np.stack([discrete.phi_x(x_k) for x_k in final_grid['x']])
        self.u_grid = np.stack([discrete.phi_u(u_k) for u_k in final_grid['u']])
        self.ts_grid = np.stack(final_grid['ts'])
        # set self.ufun equal to the solution (policy) function. You can get it by looking at `solutions` computed above
        self.solutions = solutions
        # TODO: 1 lines missing.
        raise NotImplementedError("set self.ufun = solutions[....][somethingsomething] (insert a breakpoint, it should be self-explanatory).")
        super().__init__(env)

    def pi(self, x, k, info=None):
        """ Return the action given x and t. As a hint, you will only use t, and self.ufun computed a few lines above"""
        # TODO: 7 lines missing.
        raise NotImplementedError("Implement function body")
        return u

def train_direct_agent(animate=True, plot=False):
    """Plan a pendulum trajectory with the direct solver and simulate it for one episode.

    :param animate: If true the environment is created with ``render_mode='human'``.
    :param plot: If true the simulated trajectory is plotted and saved.
    :return: The tuple ``(stats, traj, agent)``.
    """
    from irlc.ex04.model_pendulum import PendulumModel
    pendulum = PendulumModel()
    """
    Test out implementation on a fairly small grid. Note this will work fairly terribly.
    """
    initial_guess = {
        't0': 0,
        'tF': 4,
        'x': [np.asarray([0, 0]), np.asarray([np.pi, 0])],
        'u': [np.asarray([0]), np.asarray([0])],
    }

    schedule = [
        get_opts(N=10, ftol=1e-3, guess=initial_guess),
        get_opts(N=20, ftol=1e-3),
        get_opts(N=80, ftol=1e-6),
    ]

    # Discretize the pendulum model. Used for creating the environment.
    discrete = DiscreteControlModel(model=pendulum, dt=0.1)
    render_mode = 'human' if animate else None
    environment = ControlEnvironment(discrete_model=discrete, Tmax=4, render_mode=render_mode)
    agent = DirectAgent(environment, options=schedule)
    # Specify max runtime of the environment. Must be based on the Agent's solution.
    environment.Tmax = agent.solutions[-1]['fun']['tF']
    stats, traj = train(environment, agent=agent, num_episodes=1, return_trajectory=True)

    if plot:
        from irlc import plot_trajectory
        plot_trajectory(traj[0], env=environment)
        savepdf("direct_agent_pendulum")
        plt.show()

    return stats, traj, agent

if __name__ == "__main__":
    stats, traj, agent = train_direct_agent(animate=True, plot=True)
    print("Obtained cost", -stats[0]['Accumulated Reward'])

    # Let's try to plot the state-vectors for the two models. They are not going to agree that well.
    plt.plot(agent.ts_grid, agent.x_grid, 'r-', label="Direct solver prediction")
    plt.plot(traj[0].time, traj[0].state, 'k-', label='Simulation')
    plt.legend()
    plt.show()

# diff --git a/irlc/ex05/direct_brachistochrone.py b/irlc/ex05/direct_brachistochrone.py new file mode 100644 index 0000000000000000000000000000000000000000..2aaf14e8dc0257fe29f5c5beebaac14ef659b5cd --- /dev/null +++ b/irlc/ex05/direct_brachistochrone.py @@ -0,0 +1,59 @@
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
+import numpy as np +import matplotlib.pyplot as plt +from irlc import savepdf +from irlc.ex05.model_brachistochrone import ContiniouBrachistochrone +from irlc.ex05.direct import direct_solver, get_opts, guess +from irlc.ex05.direct_plot import plot_solutions + +def plot_brachistochrone_solutions(model, solutions, out=None): + plot_solutions(model, solutions, animate=False, pdf=out) + for index, sol in enumerate(solutions): + x_res = sol['grid']['x'] + plt.figure(figsize=(5,5)) + plt.plot( x_res[:,0], x_res[:,1]) + xF = model.bounds['xF_low'] + plt.plot([0, 0], [0, xF[1]], 'r-') + plt.plot([0, xF[0]], [xF[1], xF[1]], 'r-') + # plt.title("Curve in x/y plane") + plt.xlabel("$x$-position") + plt.ylabel("$y$-position") + if model.h is not None: + # add dynamical constraint. + xc = np.linspace(0, model.x_dist) + yc = -xc/2 - model.h + plt.plot(xc, yc, 'k-', linewidth=2) + plt.grid() + # plt.gca().invert_yaxis() + plt.gca().axis('equal') + if out: + savepdf(f"{out}_{index}") + plt.show() + pass + +def compute_unconstrained_solutions(): + model = ContiniouBrachistochrone(h=None, x_dist=1) + options = [get_opts(N=10, ftol=1e-3, guess=guess(model)), + get_opts(N=30, ftol=1e-6)] + # solve without constraints + solutions = direct_solver(model, options) + return model, solutions + +def compute_constrained_solutions(): + model_h = ContiniouBrachistochrone(h=0.1, x_dist=1) + options = [get_opts(N=10, ftol=1e-3, guess=guess(model_h)), + get_opts(N=30, ftol=1e-6)] + solutions_h = direct_solver(model_h, options) + return model_h, solutions_h + +if __name__ == "__main__": + """ + For further information see: + http://www.hep.caltech.edu/~fcp/math/variationalCalculus/variationalCalculus.pdf + """ + model, solutions = compute_unconstrained_solutions() + plot_brachistochrone_solutions(model, solutions[-1:], out="brachi") + + # solve with dynamical (sloped planc) constraint at height of h. 
class KellyCartpoleModel(CartpoleModel):
    """Completes the Cartpole swingup task in exactly 2 seconds.

    The only changes to the original cartpole model are the inclusion of a new bound in ``tF_bound(self)``,
    to limit the end-time to :math:`t_F = 2`, and an updated cost function so that :math:`Q=0` and :math:`R=I`.

    Both overrides are exercise placeholders and currently raise ``NotImplementedError``.
    """
    def get_cost(self) -> SymbolicQRCost:
        """Return the cost function for this problem (intended: :math:`Q=0`, :math:`R=I`); not yet implemented."""
        # TODO: 2 lines missing.
        raise NotImplementedError("Construct and return a new cost-function here.")

    def tF_bound(self) -> Box:
        """Return the bound on the end time (intended: :math:`t_F = 2`); not yet implemented."""
        # TODO: 2 lines missing.
        raise NotImplementedError("Implement the bound on tF here")
def direct_cartpole():
    """
    Solve the Kelly cartpole problem, report whether the solver succeeded, and plot/animate the solutions.

    The model is always closed afterwards, also when plotting or animation raises, so that any render
    environments opened by the model during animation are released.
    """
    model, solutions = compute_solutions()
    try:
        from irlc.ex05.direct_plot import plot_solutions
        print("Did we succeed?", solutions[-1]['solver']['success'])
        plot_solutions(model, solutions, animate=True, pdf="direct_cartpole_force")
    finally:
        # Release the renderers on every exit path, not only on success.
        model.close()
def compute_pendulum_solutions():
    """
    Solve the pendulum problem with the direct solver on three successively finer grids.

    The first grid is fairly small (N=10) and uses a hand-written initial guess, so it works fairly
    terribly on its own; the later grids (N=20, N=80) are passed without an explicit guess.

    :return: The tuple ``(model, solutions)``.
    """
    model = SinCosPendulumModel()

    x_start, x_end = np.asarray([0, 0]), np.asarray([np.pi, 0])
    u_start, u_end = np.asarray([0]), np.asarray([0])
    initial_guess = dict(t0=0, tF=4, x=[x_start, x_end], u=[u_start, u_end])

    options = [get_opts(N=10, ftol=1e-3, guess=initial_guess)]
    options.append(get_opts(N=20, ftol=1e-3))
    options.append(get_opts(N=80, ftol=1e-6))

    return model, direct_solver(model, options)
If this file contains other copyright notices disregard this text. +import matplotlib.pyplot as plt +import numpy as np +from irlc.ex03.control_model import plot_trajectory +from irlc import savepdf + +""" +Helper function for plotting. +""" +def plot_solutions(model, solutions, animate=True, pdf=None, plot_defects=True, Ix=None, animate_repeats=1, animate_all=False, plot=True): + + for k, sol in enumerate(solutions): + grd = sol['grid'] + x_res = sol['grid']['x'] + u_res = sol['grid']['u'] + ts = sol['grid']['ts'] + u_fun = lambda x, t: sol['fun']['u'](t) + N = len(ts) + if pdf is not None: + pdf_out = f"{pdf}_sol{N}" + + x_sim, u_sim, t_sim = model.simulate(x0=grd['x'][0, :], u_fun=u_fun, t0=grd['ts'][0], tF=grd['ts'][-1], N_steps=1000) + if animate and (k == len(solutions)-1 or animate_all): + for _ in range(animate_repeats): + animate_rollout(model, x0=grd['x'][0, :], u_fun=u_fun, t0=grd['ts'][0], tF=grd['ts'][-1], N_steps=1000, fps=30) + + eqC_val = sol['eqC_val'] + labels = model.state_labels + + if Ix is not None: + labels = [l for k, l in enumerate(labels) if k in Ix] + x_res = x_res[:,np.asarray(Ix)] + x_sim = x_sim[:,np.asarray(Ix)] + + print("Initial State: " + ",".join(labels)) + print(x_res[0]) + print("Final State:") + print(x_res[-1]) + if plot: + ax = plot_trajectory(x_res, ts, lt='ko-', labels=labels, legend="Direct state prediction $x(t)$") + plot_trajectory(x_sim, t_sim, lt='-', ax=ax, labels=labels, legend="RK4 exact simulation") + # plt.suptitle("State", fontsize=14, y=0.98) + # make_space_above(ax, topmargin=0.5) + + if pdf is not None: + savepdf(pdf_out +"_x") + plt.show(block=False) + # print("plotting...") + plot_trajectory(u_res, ts, lt='ko-', labels=model.action_labels, legend="Direct action prediction $u(t)$") + # print("plotting... B") + # plt.suptitle("Action", fontsize=14, y=0.98) + # print("plotting... C") + # make_space_above(ax, topmargin=0.5) + # print("plotting... 
def animate_rollout(model, x0, u_fun, t0, tF, N_steps = 1000, fps=10):
    """
    Animate a policy by simulating the model and rendering evenly spaced states.

    :param model: Object providing ``simulate(x0, u_fun, t0, tF, N_steps=...)`` and ``render(x=..., render_mode=...)``.
    :param x0: Initial state passed to ``simulate``.
    :param u_fun: Policy function passed to ``simulate``.
    :param t0: Start time.
    :param tF: End time; ``ceil((tF - t0) * fps)`` frames are shown.
    :param N_steps: Number of simulation steps; frames are picked among row indices ``0 .. N_steps-1``.
    :param fps: Frames per second; the function sleeps ``1/fps`` seconds after each rendered frame.
    """
    import time

    states, _, _ = model.simulate(x0, u_fun, t0, tF, N_steps=N_steps)
    n_frames = int(np.ceil((tF - t0) * fps))
    # Rounded, evenly spaced row indices into the simulated trajectory.
    picks = np.round(np.linspace(0, N_steps - 1, n_frames)).astype(int)

    for state in states[picks, :]:
        model.render(x=state, render_mode="human")
        time.sleep(1 / fps)
class ContiniouBrachistochrone(ControlModel):
    """
    Control model for the Brachistochrone problem (see the module docstring for references).

    The state has three components (``x``, ``y`` and the bead speed, see ``state_labels``) and the single
    action is the tangent angle. ``get_cost``, ``sym_f`` and the constrained branch of ``sym_h`` are
    exercise placeholders and currently raise ``NotImplementedError``.

    :param g: Constant stored as ``self.g`` (presumably the gravitational acceleration; not used in the visible code).
    :param h: Offset of the optional dynamical constraint; ``None`` disables the constraint (``sym_h`` returns ``[]``).
    :param x_dist: The x-coordinate the final state is pinned to in ``xF_bound``.
    """
    state_labels= ["$x$", "$y$", "bead speed"]
    action_labels = ['Tangent angle']

    def __init__(self, g=9.82, h=None, x_dist=1):
        self.g = g
        self.h = h
        self.x_dist = x_dist # or x_B
        super().__init__()

    def get_cost(self) -> SymbolicQRCost:
        """Return the (minimum time) cost function; not yet implemented."""
        # TODO: 1 lines missing.
        raise NotImplementedError("Instantiate cost=SymbolicQRCost(...) here corresponding to minimum time.")
        return cost  # unreachable until the exercise line above replaces the raise

    def x0_bound(self) -> Box:
        """The initial state is fixed to zero in every coordinate."""
        return Box(0, 0, shape=(self.state_size,))

    def xF_bound(self) -> Box:
        """The final x-coordinate is fixed to ``x_dist``; the other two coordinates are unbounded."""
        return Box(np.array([self.x_dist, -np.inf, -np.inf]), np.array([self.x_dist, np.inf, np.inf]))

    def sym_f(self, x, u, t=None):
        """Return the symbolic dynamics; not yet implemented."""
        # TODO: 3 lines missing.
        raise NotImplementedError("Implement function body")
        return xp  # unreachable until the exercise lines above replace the raise

    def sym_h(self, x, u, t):
        r"""
        Add a dynamical constraint of the form

        .. math::

            h(x, u, t) \leq 0

        Returns an empty list when ``self.h`` is ``None`` (no constraint); the constrained case is
        not yet implemented.
        """
        if self.h is None:
            return []
        else:
            # compute a single dynamical constraint as in (Bet10, Example (4.10)) (Note y-axis is reversed in the example)
            # TODO: 1 lines missing.
            raise NotImplementedError("Insert your solution and remove this error.")
class CartpoleModel(ControlModel):
    """
    Continuous-time cart-pole model with a minimum-time cost.

    The state is ``[x, dx/dt, theta, dtheta/dt]`` (see ``state_labels``) and the single action is the
    force applied to the cart.

    :param mc: Stored as ``self.mc`` and used as the cart mass in ``sym_f``.
    :param mp: Stored as ``self.mp`` and used as the pole mass in ``sym_f``.
    :param l: Stored as ``self.l`` and used as the pole length parameter in ``sym_f``.
    :param max_force: The action is bounded to ``[-max_force, max_force]``.
    :param dist: Target cart position; the cart position is bounded to ``[-2*dist, 2*dist]``.
    """
    state_labels = ["$x$", r"$\frac{dx}{dt}$", r"$\theta$", r"$\frac{d \theta}{dt}$"]
    action_labels = ["Cart force $u$"]

    def __init__(self, mc=2,
                 mp=0.5,
                 l=0.5,
                 max_force=50, dist=1.0):
        self.mc = mc
        self.mp = mp
        self.l = l
        self.max_force = max_force
        self.dist = dist
        self.cp_render = {}  # render_mode -> gymnasium environment used only for drawing (filled lazily by render)
        super().__init__()


    def tF_bound(self) -> Box:
        """The end time is free but at least 0.01."""
        return Box(0.01, np.inf, shape=(1,))

    def x_bound(self) -> Box:
        """Cart position within ``+/- 2*dist``, angle within ``+/- 2*pi``; both velocities are unbounded."""
        return Box(np.asarray([-2 * self.dist, -np.inf, -2 * np.pi, -np.inf]), np.asarray([2 * self.dist, np.inf, 2 * np.pi, np.inf]))

    def x0_bound(self) -> Box:
        """The initial state is fixed to ``[0, 0, pi, 0]``."""
        return Box(np.asarray([0, 0, np.pi, 0]), np.asarray([0, 0, np.pi, 0]))

    def xF_bound(self) -> Box:
        """The final state is fixed to ``[dist, 0, 0, 0]``."""
        return Box(np.asarray([self.dist, 0, 0, 0]), np.asarray([self.dist, 0, 0, 0]))

    def u_bound(self) -> Box:
        """The force is limited to ``[-max_force, max_force]``."""
        return Box(np.asarray([-self.max_force]), np.asarray([self.max_force]))

    def get_cost(self) -> SymbolicQRCost:
        """Cost with zero ``Q`` and ``R`` and constant term ``qc=1``."""
        return SymbolicQRCost(R=np.eye(1) * 0, Q=np.eye(4) * 0, qc=1)  # just minimum time

    def sym_f(self, x, u, t=None):
        """
        Symbolic dynamics: returns ``[dx/dt, d^2x/dt^2, dtheta/dt, d^2theta/dt^2]`` as a list of expressions.

        :param x: State ``[x, dx/dt, theta, dtheta/dt]`` (symbolic).
        :param u: Action; ``u[0]`` is the force on the cart.
        :param t: Unused.
        """
        mp = self.mp
        l = self.l
        mc = self.mc
        g = 9.81  # Gravity on earth.

        x_dot = x[1]
        theta = x[2]
        sin_theta = sym.sin(theta)
        cos_theta = sym.cos(theta)
        theta_dot = x[3]
        F = u[0]
        # Define dynamics model as per Razvan V. Florian's
        # "Correct equations for the dynamics of the cart-pole system".
        # Friction is neglected.

        # Eq. (23)
        temp = (F + mp * l * theta_dot ** 2 * sin_theta) / (mc + mp)
        numerator = g * sin_theta - cos_theta * temp
        denominator = l * (4.0 / 3.0 - mp * cos_theta ** 2 / (mc + mp))
        theta_dot_dot = numerator / denominator

        # Eq. (24)
        x_dot_dot = temp - mp * l * theta_dot_dot * cos_theta / (mc + mp)
        xp = [x_dot,
              x_dot_dot,
              theta_dot,
              theta_dot_dot]
        return xp

    def close(self):
        """Close every rendering environment that ``render`` has created."""
        for r in self.cp_render.values():
            r.close()

    def render(self, x, render_mode="human"):
        """
        Draw the state ``x`` using a gymnasium ``CartPole-v1`` environment.

        One environment per ``render_mode`` is created on first use and cached in ``self.cp_render``;
        its internal state is overwritten with ``x`` before rendering.

        :return: Whatever the environment's ``render()`` returns.
        """
        if render_mode not in self.cp_render:
            self.cp_render[render_mode] = gym.make("CartPole-v1", render_mode=render_mode)  # environment only used for rendering. Change to v1 in gym 0.26.
            self.cp_render[render_mode].max_time_limit = 10000
            self.cp_render[render_mode].reset()
        self.cp_render[render_mode].unwrapped.state = np.asarray(x)  # environment is wrapped
        return self.cp_render[render_mode].render()
class GymSinCosCartpoleModel(DiscreteControlModel):
    """
    Discretized cart-pole model built on ``SinCosCartpoleModel``.

    :param dt: Discretization time step passed to ``DiscreteControlModel``.
    :param cost: Optional discrete cost. When ``None``, the default from ``_cartpole_discrete_cost`` is
        installed after the base class has been initialized (the default cost needs ``self.x_upright``).
    :param transform_actions: Stored as ``self.transform_actions``.
    :param kwargs: Forwarded to ``SinCosCartpoleModel`` (e.g. ``mc``, ``mp``, ``l``, ``max_force``, ``dist``).
    """
    # Raw strings: '\s' and '\c' are invalid escape sequences in ordinary literals. The runtime values are unchanged.
    state_labels = ['x', 'd_x', r'$\sin(\theta)$', r'$\cos(\theta)$', r'$d\theta/dt$']
    action_labels = ['Torque $u$']

    def __init__(self, dt=0.02, cost=None, transform_actions=True, **kwargs):
        model = SinCosCartpoleModel(**kwargs)
        self.transform_actions = transform_actions
        super().__init__(model=model, dt=dt, cost=cost)
        # Target state (lower final-state bound of the continuous model) in transformed coordinates.
        self.x_upright = np.asarray(self.phi_x(model.xF_bound().low))
        if cost is None:
            cost = _cartpole_discrete_cost(self)
        self.cost = cost

    @property
    def max_force(self):
        """Maximum force of the underlying continuous model."""
        # The continuous cart-pole model stores this as ``max_force``; it has no ``maxForce`` attribute.
        return self.continuous_model.max_force
env = GymSinCosCartpoleEnvironment() + agent = Agent(env) + env = VideoMonitor(env) + stats, traj = train(env, agent, num_episodes=1, max_steps=100) + env.close() diff --git a/irlc/ex06/__init__.py b/irlc/ex06/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6e26755e5ec4fe79d8350778babe173741127191 --- /dev/null +++ b/irlc/ex06/__init__.py @@ -0,0 +1,2 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +"""This directory contains the exercises for week 6.""" diff --git a/irlc/ex06/__pycache__/__init__.cpython-311.pyc b/irlc/ex06/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2005a99d71b80389f53d84cdab39fae60aa41248 Binary files /dev/null and b/irlc/ex06/__pycache__/__init__.cpython-311.pyc differ diff --git a/irlc/ex06/__pycache__/dlqr.cpython-311.pyc b/irlc/ex06/__pycache__/dlqr.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e3b9ef8646190a661fbefabe8b43de0c6663fce1 Binary files /dev/null and b/irlc/ex06/__pycache__/dlqr.cpython-311.pyc differ diff --git a/irlc/ex06/boeing_lqr.py b/irlc/ex06/boeing_lqr.py new file mode 100644 index 0000000000000000000000000000000000000000..e06cf3f4efdc2ef72a4f38b20e4df647e85e145d --- /dev/null +++ b/irlc/ex06/boeing_lqr.py @@ -0,0 +1,85 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +""" + +References: + [Her24] Tue Herlau. Sequential decision making. (Freely available online), 2024. 
def boeing_simulation():
    """
    Simulate an LQR agent on the Boeing environment (exercise skeleton).

    The discretized system matrices are computed with ``compute_A_B_d``; the cost matrices and the
    agent still have to be filled in, so the function currently raises ``NotImplementedError``.

    :return: (once completed) the tuple ``(stats, trajectories, env)`` from ``train``.
    """
    env = BoeingEnvironment(Tmax=10)
    model = env.discrete_model.continuous_model # get the model from the Boeing environment
    dt = env.dt # Get the discretization time.
    A, B, d = compute_A_B_d(model, dt)
    # Use compute_Q_R_q to get the Q, R, and q matrices in the discretized system
    # TODO: 1 lines missing.
    raise NotImplementedError("Compute Q, R and q here")
    ## TODO: Half of each line of code in the following 1 lines have been replaced by garbage. Make it work and remove the error.
    #----------------------------------------------------------------------------------------------------------------------------
    # agent = LQRAgent(env, A=A??????????????????????????
    raise NotImplementedError("Use your LQRAgent to plan using the system matrices.")
    stats, trajectories = train(env, agent, return_trajectory=True)
    return stats, trajectories, env
def boeing_experiment():
    """
    Run the Boeing LQR simulation, print the original and discretized q-vectors, and plot the results.

    Two figures are produced from the last trajectory: the two outputs (state projected by ``model.P``)
    over time, and the two control actions over time. Each is saved with ``savepdf`` before being shown.
    """
    _, trajectories, env = boeing_simulation()
    model = env.discrete_model.continuous_model

    dt = env.dt
    _, _, q = compute_Q_R_q(model, dt)
    print("Discretization time is", dt)
    print("Original q-vector was:", model.get_cost().q)
    print("Discretized version is:", q)

    last = trajectories[-1]
    outputs = last.state @ model.P.T

    # Figure 1: the two output channels.
    for channel in (0, 1):
        plt.plot(last.time, outputs[:, channel], '-', label=env.observation_labels[channel])
    plt.grid()
    plt.legend()
    plt.xlabel("Time/seconds")
    plt.ylabel("Output")
    savepdf("boing_lqr_output")
    plt.show(block=False)
    plt.close()

    # Figure 2: the two control inputs (one fewer action than time stamps).
    action_times = last.time[:-1]
    for channel in (0, 1):
        plt.plot(action_times, last.action[:, channel], '-', label=env.action_labels[channel])
    plt.xlabel("Time/seconds")
    plt.ylabel("Control action")
    plt.grid()
    plt.legend()
    savepdf("boing_lqr_action")
    plt.show()
+""" +import numpy as np +import matplotlib.pyplot as plt +from irlc import bmatrix +from irlc import savepdf + + + +def LQR(A : list, # Dynamic + B : list, # Dynamics + d : list =None, # Dynamics (optional) + Q : list=None, + R: list=None, + H : list=None, + q : list=None, + r : list=None, + qc : list=None, + QN : np.ndarray =None, # Terminal cost term + qN : np.ndarray=None, # Terminal cost term + qcN : np.ndarray =None, # Terminal cost term. + mu : float =0 # regularization parameter which will only be relevant next week. + ): + r""" + Implement the LQR as defined in (Her24, Algorithm 22). I recommend viewing this documentation online (documentation for week 6). + + When you solve this exercise, look at the algorithm in the book. Since the LQR problem is on the form: + + .. math:: + + x_{k+1} = A_k x_k + B_k u_k + d_k + + For :math:`k=0,\dots,N-1` this means there are :math:`N` matrices :math:`A_k`. This is implemented by assuming that + :python:`A` (i.e., the input argument) is a :python:`list` of length :math:`N` so that :python:`A[k]` corresponds + to :math:`A_k`. + + Similar conventions are used for the cost term (please see the lecture notes or the online documentation for their meaning). Recall it has the form: + + .. math:: + + c(x_k, u_k) = \frac{1}{2} \mathbf x_k^\top Q_k \mathbf x_k + \frac{1}{2} \mathbf q_k^\top \mathbf x_k + q_k + \cdots + + When the function is called, the vector :math:`\textbf{q}_k` corresponds to :python:`q` and the constant :math:`q_k` correspond to :python:`qc` (q-constant). + + .. note:: + + Only the terms :python:`A` and :python:`B` are required. The rest of the terms will default to 0-matrices. + + The LQR algorithm will ultimately compute a control law of the form: + + .. math:: + + \mathbf u_k = L_k \mathbf x_k + \mathbf l_k + + And a cost-to-go function as: + + .. math:: + + J_k(x_k) = \frac{1}{2} \mathbf x_k^\top V_k \mathbf x_k + v_k^\top \mathbf x_k + v_k + + Again there are :math:`N-1` terms. 
The function then return :python:`return (L, l), (V, v, vc)` so that :python:`L[k]` corresponds to :math:`L_k`. + + :param A: A list of :python:`np.ndarray` containing all terms :math:`A_k` + :param B: A list of :python:`np.ndarray` containing all terms :math:`B_k` + :param d: A list of :python:`np.ndarray` containing all terms :math:`\mathbf d_k` (optional) + :param Q: A list of :python:`np.ndarray` containing all terms :math:`Q_k` (optional) + :param R: A list of :python:`np.ndarray` containing all terms :math:`R_k` (optional) + :param H: A list of :python:`np.ndarray` containing all terms :math:`H_k` (optional) + :param q: A list of :python:`np.ndarray` containing all terms :math:`\mathbf q_k` (optional) + :param r: A list of :python:`np.ndarray` containing all terms :math:`\mathbf r_k` (optional) + :param qc: A list of :python:`float` containing all terms :math:`q_k` (i.e., constant terms) (optional) + :param QN: A :python:`np.ndarray` containing the terminal cost term :math:`Q_N` (optional) + :param qN: A :python:`np.ndarray` containing the terminal cost term :math:`\mathbf q_N` (optional) + :param qcN: A :python:`np.ndarray` containing the terminal cost term :math:`q_N` + :param mu: A regularization term which is useful for iterative-LQR (next week). Default to 0. + :return: A tuple of the form :python:`(L, l), (V, v, vc)` corresponding to the control and cost-matrices. + """ + N = len(A) + n,m = B[0].shape + # Initialize empty lists for control matrices and cost terms + L, l = [None]*N, [None]*N + V, v, vc = [None]*(N+1), [None]*(N+1), [None]*(N+1) + # Initialize constant cost-function terms to zero if not specified. + # They will be initialized to zero, meaning they have no effect on the update rules. 
+ QN = np.zeros((n,n)) if QN is None else QN + qN = np.zeros((n,)) if qN is None else qN + qcN = 0 if qcN is None else qcN + H, q, qc, r = init_mat(H,m,n,N=N), init_mat(q,n,N=N), init_mat(qc,1,N=N), init_mat(r,m,N=N) + d = init_mat(d,n, N=N) + """ In the next line, you should initialize the last cost-term. This is similar to how we in DP had the initialization step + > J_N(x_N) = g_N(x_N) + Except that since x_N is no longer discrete, we store it as matrices/vectors representing a second-order polynomial, i.e. + > J_N(X_N) = 1/2 * x_N' V[N] x_N + v[N]' x_N + vc[N] + """ + # TODO: 1 lines missing. + raise NotImplementedError("Initialize V[N], v[N], vc[N] here") + + In = np.eye(n) + for k in range(N-1,-1,-1): + # When you update S_uu and S_ux remember to add regularization as the terms ... (V[k+1] + mu * In) ... + # Note that that to find x such that + # >>> x = A^{-1} y this + # in a numerically stable manner this should be done as + # >>> x = np.linalg.solve(A, y) + # The terms you need to update will be, in turn: + # Suu = ... + # Sux = ... + # Su = ... + # L[k] = ... + # l[k] = ... + # V[k] = ... + # V[k] = ... + # v[k] = ... + # vc[k] = ... + ## TODO: Half of each line of code in the following 4 lines have been replaced by garbage. Make it work and remove the error. + #---------------------------------------------------------------------------------------------------------------------------- + # Suu = R[k] + B[k].T @ (???????????????????????? + # Sux = H[k] + B[k].T @ (???????????????????????? + # Su = r[k] + B[k].T @ v[k + 1????????????????????????????? + # L[k] = -np.linal????????????????? + raise NotImplementedError("Insert your solution and remove this error.") + l[k] = -np.linalg.solve(Suu, Su) # You get this for free. 
def init_mat(X, a, b=None, N=None):
    """
    Helper function that fills in missing (``None``) terms with zero arrays.

    * If ``X`` is ``None``, return a list of ``N`` zero arrays.
    * Otherwise return a list with the entries of ``X``, where each entry that is ``None`` is replaced
      by a zero array (``N`` is ignored in this case).

    The zero arrays have shape ``(a,)`` when ``b`` is ``None`` and ``(a, b)`` otherwise. Every default
    entry is a separate array, so modifying one entry in place never affects another.

    :param X: ``None`` or an iterable of arrays (entries may be ``None``).
    :param a: First dimension of the zero arrays.
    :param b: Optional second dimension of the zero arrays.
    :param N: Length of the returned list when ``X`` is ``None``.
    :return: A list of arrays.
    """
    shape = (a,) if b is None else (a, b)
    if X is None:
        return [np.zeros(shape) for _ in range(N)]
    return [np.zeros(shape) if m is None else m for m in X]
def check_LQR():
    """
    Build a fixed random LQR problem and return the result of ``LQR`` on it.

    The random seed is set to 42 and all terms are drawn with ``urnd`` in a fixed order, so the
    returned value is reproducible. Do not reorder the draws below: the order determines which
    random numbers each term receives.
    """
    np.random.seed(42)
    n,m,N = 3,2,4  # state dimension, action dimension, horizon
    """
    Create a randomized, nonsense control problem and solve it. Since seed is fixed we can expect same solution.
    """
    # system terms
    A = [urnd((n, n)) for _ in range(N)]
    B = [urnd((n, m)) for _ in range(N)]
    d = [urnd((n,)) for _ in range(N)]
    # cost terms
    Q = [urnd((n, n)) for _ in range(N)]
    R = [urnd((m, m)) for _ in range(N)]
    H = [urnd((m, n)) for _ in range(N)]
    q = [urnd((n,)) for _ in range(N)]
    qc = [urnd(()) for _ in range(N)]  # scalar (0-dimensional) constant terms
    r = [urnd((m,)) for _ in range(N)]
    # terminal costs
    QN = urnd((n, n))
    qN = urnd((n,))
    qcN = urnd(())
    return LQR(A=A, B=B, d=d, Q=Q, R=R, H=H, q=q, r=r, qc=qc, QN=QN, qN=qN, qcN=qcN, mu=0)
+ You should use the function model.f to do this, which has build-in functionality to compute Jacobians which will be equal to A, B """ + """ Define self.L, self.l here as the (lists of) control matrices. """ + ## TODO: Half of each line of code in the following 1 lines have been replaced by garbage. Make it work and remove the error. + #---------------------------------------------------------------------------------------------------------------------------- + # (self.L, self.l), _ = LQR(A=[A]*N, B=[B]*N, d=[d]*N if d is not No??????????????????????????????????????????????????????????????????? + raise NotImplementedError("Insert your solution and remove this error.") + self.dt = env.dt + super().__init__(env) + + def pi(self,x, k, info=None): + """ + Compute the action here using u = L_k x + l_k. + You should use self.L, self.l to get the control matrices (i.e. L_k = self.L[k] ), + """ + # TODO: 1 lines missing. + raise NotImplementedError("Compute current action here") + return u + + +if __name__ == "__main__": + # Make a guess at the system matrices for planning. We will return on how to compute these exactly in a later exercise. + A = np.ones((2, 2)) + A[1, 0] = 0 + B = np.asarray([[0], [1]]) + Q = np.eye(2)*3 + R = np.ones((1, 1))*2 + q = np.asarray([-1.1, 0 ]) + + # Create and test our LQRAgent. + env = LocomotiveEnvironment(render_mode='human', Tmax=10, slope=1) + agent = LQRAgent(env, A=A, B=B, Q=Q, R=R, q=q) + stats, traj = train(env, agent, num_episodes=1) + + env.reset() + savepdf("locomotive_snapshot.pdf", env=env) # Make a plot for the exercise file. 
+ env.state_labels = ["x(t)", "v(t)"] + env.action_labels = ["u(t)"] + plot_trajectory(traj[0], env) + plt.show(block=True) + savepdf("lqr_agent") + plt.show() + env.close() diff --git a/irlc/ex06/lqr_pid.py b/irlc/ex06/lqr_pid.py new file mode 100644 index 0000000000000000000000000000000000000000..136cae2ba5a2bebcf45880daec13e853a7bee9b4 --- /dev/null +++ b/irlc/ex06/lqr_pid.py @@ -0,0 +1,79 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +import matplotlib.pyplot as plt +import numpy as np +from irlc import savepdf, train +from irlc.ex04.pid_locomotive_agent import PIDLocomotiveAgent +from irlc.ex06.lqr_agent import LQRAgent +from irlc.ex04.model_harmonic import HarmonicOscilatorEnvironment +from irlc.ex06.boeing_lqr import compute_A_B_d, compute_Q_R_q + +class ConstantLQRAgent(LQRAgent): + # TODO: 3 lines missing. + raise NotImplementedError("Complete this agent here. You need to update the policy-function: def pi(self, ..).") + +def get_Kp_Kd(L0): + # TODO: 1 lines missing. + raise NotImplementedError("Use lqr_agent.L to define Kp and Kd.") + return Kp, Kd + + +if __name__ == "__main__": + Delta = 0.06 # Time discretization constant + # Define a harmonic osscilator environment. Use .., render_mode='human' to see a visualization. + env = HarmonicOscilatorEnvironment(Tmax=8, dt=Delta, m=0.5, R=np.eye(1) * 8, render_mode=None) # set render_mode='human' to see the oscillator. + model = env.discrete_model.continuous_model # Get the ControlModel corresponding to this environment. + + + # Compute the discretized A, B and d matrices using the helper functions we defined in the Boeing problem. 
+ # Note that these are for the discrete environment: x_{k+1} = A x_k + B u_k + d + A, B, d = compute_A_B_d(model, Delta) + Q, R, q = compute_Q_R_q(model, Delta) + + # Run the LQR agent + lqr_agent = LQRAgent(env, A=A, B=B, d=d, Q=Q, R=R, q=q) + _, traj1 = train(env, lqr_agent, return_trajectory=True) + + # Part 1. Build an agent that always takes actions u_k = L_0 x_k + l_0 + constant_agent = ConstantLQRAgent(env, A=A, B=B, d=d, Q=Q, R=R, q=q) + # Check that its policy is independent of $k$: + x0, _ = env.reset() + print(f"Initial state is {x0=}") + print(f"Action at time step k=0 {constant_agent.pi(x0, k=0)=}") + print(f"Action at time step k=5 (should be the same) {constant_agent.pi(x0, k=0)=}") + + _, traj2 = train(env, constant_agent, return_trajectory=True) + + # Part 2. Use the L and l matrices (see lqr_agent.L and lqr_agent.l) + # to select Kp and Kd in a PID agent. Then let's use the Locomotive agent to see the effect of the controller. + # Use render_mode='human' to see its effect. + # We only need to use L. + # Hint: compare the form of the LQR and PID controller and use that to select Kp and Kd. + Kp, Kd = get_Kp_Kd(lqr_agent.L[0]) # Use lqr_agent.L to define Kp and Kd. + + # Define and run the PID agent. + pid_agent = PIDLocomotiveAgent(env, env.dt, Kp=Kp, Kd=Kd) + _, traj3 = train(env, pid_agent, return_trajectory=True) + + # Plot all actions and state sequences. 
+ plt.figure(figsize=(10,5)) + plt.grid() + plt.plot(traj1[0].time[:-1], traj1[0].action, label="Optimal LQR action sequence") + plt.plot(traj2[0].time[:-1], traj2[0].action, '.-', label="Constant LQR action sequence") + plt.plot(traj3[0].time[:-1], traj3[0].action, label="PID agent action sequence") + plt.xlabel("Time / Seconds") + plt.ylabel("Action / Newtons") + plt.ylim([-.2, .2]) + plt.legend() + savepdf("pid_lqr_actions") + plt.show(block=True) + + plt.figure(figsize=(10, 5)) + plt.grid() + plt.plot(traj1[0].time, traj1[0].state[:, 0], label="Optimal LQR states x(t)") + plt.plot(traj2[0].time, traj2[0].state[:, 0], label="Constant LQR states x(t)") + plt.plot(traj3[0].time, traj3[0].state[:, 0], label="PID agent states x(t)") + plt.xlabel("Time / Seconds") + plt.ylabel("Position x(t) / Meters") + plt.ylim([-1, 1]) + plt.legend() + savepdf("pid_lqr_states") diff --git a/irlc/ex06/model_boeing.py b/irlc/ex06/model_boeing.py new file mode 100644 index 0000000000000000000000000000000000000000..57e0a0c7a3664a45005437b4576038021c60f4ef --- /dev/null +++ b/irlc/ex06/model_boeing.py @@ -0,0 +1,62 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +import numpy as np +from irlc.ex04.discrete_control_model import DiscreteControlModel +from irlc.ex04.control_environment import ControlEnvironment +from irlc.ex04.model_linear_quadratic import LinearQuadraticModel + +class BoeingModel(LinearQuadraticModel): + """ + Boeing 747 level flight example. 
+ + See: https://books.google.dk/books?id=tXZDAAAAQBAJ&pg=PA147&lpg=PA147&dq=boeing+747+flight+0.322+model+longitudinal+flight&source=bl&ots=L2RpjCAWiZ&sig=ACfU3U2m0JsiHmUorwyq5REcOj2nlxZkuA&hl=en&sa=X&ved=2ahUKEwir7L3i6o3qAhWpl4sKHQV6CdcQ6AEwAHoECAoQAQ#v=onepage&q=boeing%20747%20flight%200.322%20model%20longitudinal%20flight&f=false + Also: https://web.stanford.edu/~boyd/vmls/vmls-slides.pdf + """ + state_labels = ["Longitudinal velocity (x) ft/sec", "Velocity in y-axis ft/sec", "Angular velocity", + "angle wrt. horizontal"] + action_labels = ['Elevator', "Throttle"] + observation_labels = ["Airspeed", "Climb rate"] + + def __init__(self, output=None): + if output is None: + output = [10, 0] + # output = [10, 0] + A = [[-0.003, 0.039, 0, -0.322], + [-0.065, -.319, 7.74, 0], + [.02, -.101, -0.429, 0], + [0, 0, 1, 0]] + B = [[.01, 1], + [-.18, -.04], + [-1.16, .598], + [0, 0]] + + A, B = np.asarray(A), np.asarray(B) + self.u0 = 7.74 # speed in hundred feet/seconds + self.P = np.asarray([[1, 0, 0, 0], [0, -1, 0, 7.74]]) # Projection of state into airspeed + + dt = 0.1 # Scale the cost by this factor. 
+ + # Set up the cost: + self.Q_obs = np.eye(2) + Q = self.P.T @ self.Q_obs @ self.P / dt + R = np.eye(2) / dt + q = -np.asarray(output) @ self.Q_obs @ self.P / dt + super().__init__(A=A, B=B, Q=Q, R=R, q=q) + + def state2outputs(self, x): + return self.P @ x + +class DiscreteBoeingModel(DiscreteControlModel): + def __init__(self, output=None): + model = BoeingModel(output=output) + dt = 0.1 + super().__init__(model=model, dt=dt) + + +class BoeingEnvironment(ControlEnvironment): + @property + def observation_labels(self): + return self.discrete_model.continuous_model.observation_labels + + def __init__(self, Tmax=10): + model = DiscreteBoeingModel() + super().__init__(discrete_model=model, Tmax=Tmax) diff --git a/irlc/ex06/model_rendevouz.py b/irlc/ex06/model_rendevouz.py new file mode 100644 index 0000000000000000000000000000000000000000..c6a98291f8a6a8e282e7a7ae4c2cf99b4c779d27 --- /dev/null +++ b/irlc/ex06/model_rendevouz.py @@ -0,0 +1,95 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +import numpy as np +from irlc.utils.graphics_util_pygame import UpgradedGraphicsUtil +from irlc.ex04.discrete_control_model import DiscreteControlModel +from irlc.ex04.control_environment import ControlEnvironment +from irlc.ex04.model_linear_quadratic import LinearQuadraticModel +from gymnasium.spaces import Box + +""" +SEE: https://github.com/anassinator/ilqr/blob/master/examples/rendezvous.ipynb +""" +class ContiniousRendevouzModel(LinearQuadraticModel): + state_labels= ["x0", "y0", "x1", "y1", 'Vx0', "Vy0", "Vx1", "Vy1"] + action_labels = ['Fx0', 'Fy0', "Fx1", "Fy1"] + x0 = np.array([0, 0, 10, 10, 0, -5, 5, 0]) # Initial state. 
+ + def __init__(self, m=10.0, alpha=0.1, simple_bounds=None, cost=None): + m00 = np.zeros((4,4)) + mI = np.eye(4) + A = np.block( [ [m00, mI], [m00, -alpha/m*mI] ] ) + B = np.block( [ [m00], [mI/m]] ) + state_size = len(self.x0) + action_size = 4 + self.m = m + self.alpha = alpha + Q = np.eye(state_size) + Q[0, 2] = Q[2, 0] = -1 + Q[1, 3] = Q[3, 1] = -1 + R = 0.1 * np.eye(action_size) + self.viewer = None + super().__init__(A=A, B=B, Q=Q*20, R=R*20) + + def x0_bound(self) -> Box: + return Box(self.x0, self.x0) # self.bounds['x0_low'] = self.bounds['x0_high'] = list(self.x0) + + def render(self, x, render_mode="human"): + """ Render the environment. You don't have to understand this code. """ + if self.viewer is None: + self.viewer = HarmonicViewer(xstar=0, x0=self.x0) # target: x=0. + self.viewer.update(x) + import time + time.sleep(0.05) + return self.viewer.blit(render_mode=render_mode) + + def close(self): + pass + + +class DiscreteRendevouzModel(DiscreteControlModel): + def __init__(self, dt=0.1, cost=None, transform_actions=True, **kwargs): + model = ContiniousRendevouzModel(**kwargs) + super().__init__(model=model, dt=dt, cost=cost) + +class RendevouzEnvironment(ControlEnvironment): + def __init__(self, Tmax=20, render_mode=None, **kwargs): + discrete_model = DiscreteRendevouzModel(**kwargs) + super().__init__(discrete_model, Tmax=Tmax, render_mode=render_mode) + +class HarmonicViewer(UpgradedGraphicsUtil): + def __init__(self, xstar = 0, x0=None): + self.xstar = xstar + width = 800 + self.x0 = x0 + sz = 20 + self.scale = width/(2*sz) + self.p1h = [] + self.p2h = [] + super().__init__(screen_width=width, xmin=-sz, xmax=sz, ymin=-sz, ymax=sz, title='Rendevouz environment') + + def render(self): + self.draw_background(background_color=(255, 255, 255)) + # dw = self.dw + p1 = self.x[:2] + p2 = self.x[2:4] + self.p1h.append(p1) + self.p2h.append(p2) + self.circle("asdf", pos=p1, r=.5 * self.scale, fillColor=(200, 0, 0)) + self.circle("asdf", pos=p2, r=.5 * 
self.scale, fillColor=(0, 0, 200) ) + if len(self.p1h) > 2: + self.polyline('...', np.stack(self.p1h)[:,0], np.stack(self.p1h)[:,1], width=1, color=(200, 0, 0)) + self.polyline('...', np.stack(self.p2h)[:,0], np.stack(self.p2h)[:,1], width=1, color=(0, 0, 200)) + + if tuple(self.x) == tuple(self.x0): + self.p1h = [] + self.p2h = [] + + + def update(self, x): + self.x = x + + +if __name__ == "__main__": + from irlc import Agent, train + env = RendevouzEnvironment(render_mode='human') + train(env, Agent(env), num_episodes=4) diff --git a/irlc/ex07/__init__.py b/irlc/ex07/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1e8044b811c2884a4486857534492a8d7a83575b --- /dev/null +++ b/irlc/ex07/__init__.py @@ -0,0 +1,2 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +"""This directory contains the exercises for week 7.""" diff --git a/irlc/ex07/__pycache__/__init__.cpython-311.pyc b/irlc/ex07/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..31f12480f2dcfe61ecbfb39cae46bc4ec87cffd0 Binary files /dev/null and b/irlc/ex07/__pycache__/__init__.cpython-311.pyc differ diff --git a/irlc/ex07/__pycache__/ilqr.cpython-311.pyc b/irlc/ex07/__pycache__/ilqr.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7a751e7129fe8878e0d83ad2613ed7b7c560ad85 Binary files /dev/null and b/irlc/ex07/__pycache__/ilqr.cpython-311.pyc differ diff --git a/irlc/ex07/ilqr.py b/irlc/ex07/ilqr.py new file mode 100644 index 0000000000000000000000000000000000000000..8e33a8f7a0ba13fcbe07df05a76ed6f096743b86 --- /dev/null +++ b/irlc/ex07/ilqr.py @@ -0,0 +1,273 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
+""" +This implements two methods: The basic ILQR method, described in (Her24, Algorithm 24), and the linesearch-based method +described in (Her24, Algorithm 25). + +If you are interested, you can consult (TET12) (which contains generalization to DDP) and (Har20, Alg 1). + +References: + [Her24] Tue Herlau. Sequential decision making. (Freely available online), 2024. + [TET12] Yuval Tassa, Tom Erez, and Emanuel Todorov. Synthesis and stabilization of complex behaviors through online trajectory optimization. In 2012 IEEE/RSJ International Conference on Intelligent Robots and Systems, 4906–4913. IEEE, 2012. (See tassa2012.pdf). + [Har20] James Harrison. Optimal and learning-based control combined course notes. (See AA203combined.pdf), 2020. +""" +import warnings +import numpy as np +from irlc.ex06.dlqr import LQR +from irlc.ex04.discrete_control_model import DiscreteControlModel + +def ilqr_basic(model : DiscreteControlModel, N, x0, us_init : list = None, n_iterations=500, verbose=True): + """ + Implements the basic ilqr algorithm, i.e. (Her24, Algorithm 24). Our notation (x_bar, etc.) will be consistent with the lecture slides + """ + mu, alpha = 1, 1 # Hyperparameters. For now, just let them have defaults and don't change them + # Create a random initial state-sequence + n, m = model.state_size, model.action_size + u_bar = [np.random.uniform(-1, 1,(model.action_size,)) for _ in range(N)] if us_init is None else us_init + x_bar = [x0] + [np.zeros(n, )] * N + """ + Initialize nominal trajectory xs, us using us and x0 (i.e. simulate system from x0 using action sequence us). + The simplest way to do this is to call forward_pass with all-zero sequence of control vector/matrix l, L. + """ + # TODO: 2 lines missing. + raise NotImplementedError("Initialize x_bar, u_bar here") + J_hist = [] + for i in range(n_iterations): + """ + Compute derivatives around trajectory and cost estimate J of trajectory. To do so, use the get_derivatives + function. 
Remember the functions will return lists of derivatives. + """ + # TODO: 2 lines missing. + raise NotImplementedError("Compute J and derivatives A_k = f_x, B_k = f_u, ....") + """ Backward pass: Obtain feedback law matrices l, L using the backward_pass function. + """ + # TODO: 1 lines missing. + raise NotImplementedError("Compute L, l = .... here") + """ Forward pass: Given L, l matrices computed above, simulate new (optimal) action sequence. + In the lecture slides, this is similar to how we compute u^*_k and x_k + Once they are computed, iterate the iLQR algorithm by setting x_bar, u_bar equal to these values + """ + # TODO: 1 lines missing. + raise NotImplementedError("Compute x_bar, u_bar = ...") + if verbose: + print(f"{i}> J={J:4g}, change in cost since last iteration {0 if i == 0 else J-J_hist[-1]:4g}") + J_hist.append(J) + return x_bar, u_bar, J_hist, L, l + +def ilqr_linesearch(model : DiscreteControlModel, N, x0, n_iterations, us_init=None, tol=1e-6, verbose=True): + """ + For linesearch implement method described in (Her24, Algorithm 25) (we will use regular iLQR, not DDP!) + """ + # The range of alpha-values to try out in the linesearch + # plus parameters relevant for regularization scheduling. + alphas = 1.1 ** (-np.arange(10) ** 2) # alphas = [1, 1.1^{-2}, ...] + mu_min = 1e-6 + mu_max = 1e10 + Delta_0 = 2 + mu = 1.0 + Delta = Delta_0 + + n, m = model.state_size, model.action_size + u_bar = [np.random.uniform(-1, 1, (model.action_size,)) for _ in range(N)] if us_init is None else us_init + x_bar = [x0] + [np.zeros(n, )] * (N) + # Initialize nominal trajectory xs, us (same as in basic linesearch) + # TODO: 2 lines missing. + raise NotImplementedError("Copy-paste code from previous solution") + J_hist = [] + + converged = False + for i in range(n_iterations): + alpha_was_accepted = False + """ Step 1: Compute derivatives around trajectory and cost estimate of trajectory. + (copy-paste from basic implementation). 
In our implementation, J_bar = J_{u^star}(x_0) """ + # TODO: 2 lines missing. + raise NotImplementedError("Obtain derivatives f_x, f_u, ... as well as cost of trajectory J_bar = ...") + try: + """ + Step 2: Backward pass to obtain control law (l, L). Same as before so more copy-paste + """ + # TODO: 1 lines missing. + raise NotImplementedError("Obtain l, L = ... in backward pass") + """ + Step 3: Forward pass and alpha scheduling. + Decrease alpha and check condition |J^new < J'|. Apply the regularization scheduling as needed. """ + for alpha in alphas: + x_hat, u_hat = forward_pass(model, x_bar, u_bar, L=L, l=l, alpha=alpha) # Simulate trajectory using this alpha + # TODO: 1 lines missing. + raise NotImplementedError("Compute J_new = ... as the cost of trajectory x_hat, u_hat") + + if J_new < J_prime: + """ Linesearch proposed trajectory accepted! Set current trajectory equal to x_hat, u_hat. """ + if np.abs((J_prime - J_new) / J_prime) < tol: + converged = True # Method does not seem to decrease J; converged. Break and return. + + J_prime = J_new + x_bar, u_bar = x_hat, u_hat + ''' + The update was accepted and you should change the regularization term mu, + and the related scheduling term Delta. + ''' + # TODO: 1 lines missing. + raise NotImplementedError("Delta, mu = ...") + alpha_was_accepted = True # accept this alpha + break + except np.linalg.LinAlgError as e: + # Matrix in dlqr was not positive-definite and this diverged + warnings.warn(str(e)) + + if not alpha_was_accepted: + ''' No alphas were accepted, which is not too hot. Regularization should change + ''' + # TODO: 1 lines missing. 
+ raise NotImplementedError("Delta, mu = ...") + + if mu_max and mu >= mu_max: + raise Exception("Exceeded max regularization term; we are stuffed.") + + dJ = 0 if i == 0 else J_prime-J_hist[-1] + info = "converged" if converged else ("accepted" if alpha_was_accepted else "failed") + if verbose: + print(f"{i}> J={J_prime:4g}, decrease in cost {dJ:4g} ({info}).\nx[N]={x_bar[-1].round(2)}") + J_hist.append(J_prime) + if converged: + break + return x_bar, u_bar, J_hist, L, l + +def backward_pass(A : list, B : list, c_x : list, c_u : list, c_xx : list, c_ux : list, c_uu : list, mu=1): + r"""Given all derivatives, apply the LQR algorithm to get the control law. + + The input arguments are described in the online documentation and the lecture notes. You should use them to call your + implementation of the :func:`~irlc.ex06.dlqr.LQR` method. Note that you should give a value of all inputs except for the ``d``-term. + + :param A: linearization of the dynamics matrices :math:`A_k`. + :param B: linearization of the dynamics matrices :math:`B_k`. + :param c_x: Cost terms corresponding to :math:`\mathbf{q}_k` + :param c_u: Cost terms corresponding to :math:`\mathbf{r}_k` + :param c_xx: Cost terms corresponding to :math:`Q_k` + :param c_ux: Cost terms corresponding to :math:`H_k` + :param c_uu: Cost terms corresponding to :math:`R_k` + :param mu: Regularization parameter for the LQR method + :return: The control law :math:`L_k, \mathbf{l}_k` as two lists. + """ + Q, QN = c_xx[:-1], c_xx[-1] # An example. + # TODO: 4 lines missing. + raise NotImplementedError("Insert your solution and remove this error.") + # Define the inputs using the linearization inputs. + (L, l), (V, v, vc) = LQR(A=A, B=B, R=R, Q=Q, QN=QN, H=H, q=q, qN=qN, r=r, mu=mu) + return L, l + +def cost_of_trajectory(model : DiscreteControlModel, xs : list, us : list) -> float: + r"""Helper function which computes the cost of the trajectory. + + The cost is defined as: + + .. 
math:: + + c_N( \bar {\mathbf x}_N) + \sum_{k=0}^{N-1} c_k(\bar {\mathbf x}_k, \bar {\mathbf u}_k) + + and to compute it, you should use the two helper methods ``model.cost.c`` and ``model.cost.cN`` + (see :func:`~irlc.ex04.discrete_control_cost.DiscreteQRCost.c` and :func:`~irlc.ex04.discrete_control_cost.DiscreteQRCost.cN`). + + :param model: The control model used to compute the cost. + :param xs: A list of length :math:`N+1` of the form :math:`\begin{bmatrix}\bar {\mathbf x}_0 & \dots & \bar {\mathbf x}_N \end{bmatrix}` + :param us: A list of length :math:`N` of the form :math:`\begin{bmatrix}\bar {\mathbf u}_0 & \dots & \bar {\mathbf u}_{N-1} \end{bmatrix}` + :return: The cost as a number. + """ + N = len(us) + JN = model.cost.cN(xs[-1]) + return sum(map(lambda args: model.cost.c(*args), zip(xs[:-1], us, range(N)))) + JN + +def get_derivatives(model : DiscreteControlModel, x_bar : list, u_bar : list): + """Compute all the derivatives used in the model. + + The return type should match the meaning in (Her24, Subequation 17.8) and in the online documentation. + + - ``c`` should be a list of length :math:`N+1` + - ``c_x`` should be a list of length :math:`N+1` + - ``c_xx`` should be a list of length :math:`N+1` + - ``c_u`` should be a list of length :math:`N` + - ``c_uu`` should be a list of length :math:`N` + - ``c_ux`` should be a list of length :math:`N` + - ``A`` should be a list of length :math:`N` + - ``B`` should be a list of length :math:`N` + + Use the model to compute these terms.
For instance, this will compute the first terms ``A[0]`` and ``B[0]``:: + + A0, B0 = model.f_jacobian(x_bar[0], u_bar[0], 0) + + Meanwhile, to compute the first terms of the cost-functions you should use:: + + c[0], c_x[0], c_u[0], c_xx[0], c_ux[0], c_uu[0] = model.cost.c(x_bar[0], u_bar[0], k=0, compute_gradients=True) + + :param model: The model to use when computing the derivatives of the cost + :param x_bar: The nominal state-trajectory + :param u_bar: The nominal action-trajectory + :return: Lists of all derivatives computed around the nominal trajectory (see the lecture notes). + """ + N = len(u_bar) + """ Compute A_k, B_k (lists of matrices of length N) as the jacobians of the dynamics. To do so, + recall from the online documentation that: + x, f_x, f_u = model.f(x, u, k, compute_jacobian=True) + """ + A = [None]*N + B = [None]*N + c = [None] * (N+1) + c_x = [None] * (N + 1) + c_xx = [None] * (N + 1) + + c_u = [None] * (N+1) + c_ux = [None] * (N + 1) + c_uu = [None] * (N + 1) + # Now update each entry correctly (i.e., ensure there are no None elements left). + # TODO: 4 lines missing. + raise NotImplementedError("Insert your solution and remove this error.") + """ Compute derivatives of the cost function. For terms not including u these should be of length N+1 + (because of gN!), for the other lists of length N + recall model.cost.c has output: + c[i], c_x[i], c_u[i], c_xx[i], c_ux[i], c_uu[i] = model.cost.c(x, u, i, compute_gradients=True) + """ + # TODO: 2 lines missing. + raise NotImplementedError("Insert your solution and remove this error.") + # Concatenate the derivatives associated with the last time point N. + cN, c_xN, c_xxN = model.cost.cN(x_bar[N], compute_gradients=True) + # TODO: 3 lines missing. 
+ raise NotImplementedError("Update c, c_x and c_xx with the terminal terms.") + return A, B, c, c_x, c_u, c_xx, c_ux, c_uu + +def forward_pass(model : DiscreteControlModel, x_bar : list, u_bar : list, L : list, l : list, alpha=1.0): + r"""Simulates the effect of the controller on the model + + We assume the system starts in :math:`\mathbf{x}_0 = \bar {\mathbf x}_0`, and then simulate the effect of + generating actions using the closed-loop policy + + .. math:: + + \mathbf{u}_k = \bar {\mathbf u}_k + \alpha \mathbf{l}_k + L_k (\mathbf{x}_k - \bar { \mathbf x}_k) + + (see (Her24, eq. (17.16))). + + :param model: The model used to compute the dynamics. + :param x_bar: A nominal list of states + :param u_bar: A nominal list of actions (the :math:`\bar {\mathbf u}_k` terms in the policy above; its length defines :math:`N`) + :param L: A list of control matrices :math:`L_k` + :param l: A list of control vectors :math:`\mathbf{l}_k` + :param alpha: The linesearch parameter. + :return: A list of length :math:`N+1` of simulated states and a list of length :math:`N` of simulated actions. + """ + N = len(u_bar) + x = [None] * (N+1) + u_star = [None] * N + x[0] = x_bar[0].copy() + + for i in range(N): + r""" Compute using (Her24, eq. (17.16)) + u_{i} = ... + """ + # TODO: 1 lines missing. + raise NotImplementedError("u_star[i] = ....") + """ Remember to compute + x_{i+1} = f_k(x_i, u_i^*) + here: + """ + # TODO: 1 lines missing. + raise NotImplementedError("x[i+1] = ...") + return x, u_star diff --git a/irlc/ex07/ilqr_agent.py b/irlc/ex07/ilqr_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..9280fc7d9f14e4ffa9b8763762bbe747506ad9f1 --- /dev/null +++ b/irlc/ex07/ilqr_agent.py @@ -0,0 +1,56 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +""" + +References: + [Her24] Tue Herlau. Sequential decision making. (Freely available online), 2024.
+""" +from irlc.ex06.model_rendevouz import RendevouzEnvironment +from irlc.ex07.ilqr_rendovouz_basic import ilqr +from irlc import train +from irlc import Agent +import numpy as np + +class ILQRAgent(Agent): + def __init__(self, env, discrete_model, N=250, ilqr_iterations=10, use_ubar=False, use_linesearch=True): + super().__init__(env) + self.dt = discrete_model.dt + # x0 = discrete_model.reset() + x0,_ = env.reset() + x0 = np.asarray(x0) # Get the initial state. We will take this from the environment. + xs, us, self.J_hist, L, l = ilqr(discrete_model, N, x0, n_iter=ilqr_iterations, use_linesearch=use_linesearch) + self.ubar = us + self.xbar = xs + self.L = L + self.l = l + self.use_ubar = use_ubar # Should policy use open-loop u-bar (suboptimal) or closed-loop L_k, l_k? + + def pi(self, x, k, info=None): + if self.use_ubar: + u = self.ubar[k] + else: + if k >= len(self.ubar): + print(k, len(self.ubar)) + k = len(self.ubar)-1 + # See (Her24, eq. (17.16)) + # TODO: 1 lines missing. + raise NotImplementedError("Generate action using the control matrices.") + return u + +def solve_rendevouz(): + env = RendevouzEnvironment() + N = int(env.Tmax / env.dt) + agent = ILQRAgent(env, env.discrete_model, N=N) + stats, trajectories = train(env, agent, num_episodes=1, return_trajectory=True) + env.close() + return stats, trajectories, agent + +if __name__ == "__main__": + from irlc.ex07.ilqr_rendovouz_basic import plot_vehicles + import matplotlib.pyplot as plt + stats, trajectories, agent = solve_rendevouz() + t =trajectories[0].state + xb = agent.xbar + plot_vehicles(t[:,0], t[:,1], t[:,2], t[:,3], linespec=':', legend=("RK4 policy simulation", "RK4 policy simulation")) + plot_vehicles(xb[:,0], xb[:,1], xb[:,2], xb[:,3], linespec='-') + plt.legend() + plt.show() diff --git a/irlc/ex07/ilqr_cartpole.py b/irlc/ex07/ilqr_cartpole.py new file mode 100644 index 0000000000000000000000000000000000000000..d2463a56ff438c600d4e4047d1e349757716dfc7 --- /dev/null +++ 
b/irlc/ex07/ilqr_cartpole.py @@ -0,0 +1,83 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +import matplotlib.pyplot as plt +import numpy as np +from irlc.ex05.model_cartpole import GymSinCosCartpoleEnvironment +import time +from irlc.ex07.ilqr_rendovouz_basic import ilqr +from irlc import savepdf + +# Number of steps. +N = 100 +def cartpole(use_linesearch): + env = GymSinCosCartpoleEnvironment(render_mode='human') + x0, info = env.reset() + xs, us, J_hist, L, l = ilqr(env.discrete_model, N, x0, n_iter=300, use_linesearch=use_linesearch) + plot_cartpole(env, xs, us, use_linesearch=use_linesearch) + +def plot_cartpole(env, xs, us, J_hist=None, use_linesearch=True): + animate(xs, env) + env.close() + # Transform actions/states using build-in functions. + def gapply(f, xm): + usplit = np.split(xm, len(xm)) + u2 = [f(u.flat) for u in usplit] + us = np.stack(u2) + return us + + us = gapply(env.discrete_model.phi_u_inv, us) + xs = gapply(env.discrete_model.phi_x_inv, xs) + + t = np.arange(N + 1) * env.dt + x = xs[:, 0] + theta = np.unwrap(xs[:, 2]) # Makes for smoother plots. 
+ theta_dot = xs[:, 3] + pdf_ex = '_linesearch' if use_linesearch else '' + ev = 'cartpole_' + + plt.plot(theta, theta_dot) + plt.xlabel("theta (rad)") + plt.ylabel("theta_dot (rad/s)") + plt.title("Orientation Phase Plot") + plt.grid() + savepdf(f"{ev}theta{pdf_ex}") + plt.show() + + _ = plt.plot(t[:-1], us) + _ = plt.xlabel("time (s)") + _ = plt.ylabel("Force (N)") + _ = plt.title("Action path") + plt.grid() + savepdf(f"{ev}action{pdf_ex}") + plt.show() + + _ = plt.plot(t, x) + _ = plt.xlabel("time (s)") + _ = plt.ylabel("Position (m)") + _ = plt.title("Cart position") + plt.grid() + savepdf(f"{ev}position{pdf_ex}") + plt.show() + if J_hist is not None: + _ = plt.plot(J_hist) + _ = plt.xlabel("Iteration") + _ = plt.ylabel("Total cost") + _ = plt.title("Total cost-to-go") + plt.grid() + savepdf(f"{ev}J{pdf_ex}") + plt.show() + +def animate(xs0, env): + render = True + if render: + for i in range(2): + render_(xs0, env.discrete_model) + time.sleep(1) + # env.viewer.close() + +def render_(xs, env): + for i in range(xs.shape[0]): + x = xs[i] + env.render(x=x) + +if __name__ == "__main__": + cartpole(use_linesearch=True) diff --git a/irlc/ex07/ilqr_cartpole_agent.py b/irlc/ex07/ilqr_cartpole_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..cd82bd29a258b4eaf04b3f4f2ee3e64b74bf8af2 --- /dev/null +++ b/irlc/ex07/ilqr_cartpole_agent.py @@ -0,0 +1,43 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
+import numpy as np +from irlc.ex07.ilqr_agent import ILQRAgent +from irlc import train +from irlc import savepdf +import matplotlib.pyplot as plt +from irlc.ex05.model_cartpole import GymSinCosCartpoleEnvironment + +def cartpole_experiment(N=12, use_linesearch=True, figex="", animate=True): + np.random.seed(2) + Tmax = .9 + dt = Tmax/N + env = GymSinCosCartpoleEnvironment(dt=dt, Tmax=Tmax, supersample_trajectory=True, render_mode='human' if animate else None) + agent = ILQRAgent(env, env.discrete_model, N=N, ilqr_iterations=200, use_linesearch=use_linesearch) + stats, trajectories = train(env, agent, num_episodes=1, return_trajectory=True) + + agent.use_ubar = True + stats2, trajectories2 = train(env, agent, num_episodes=1, return_trajectory=True) + env.close() + + xb = agent.xbar + tb = np.arange(N+1)*dt + plt.figure(figsize=(8,6)) + F = 3 + plt.plot(trajectories[0].time, trajectories[0].state[:,F], 'k-', label='Closed-loop $\\pi$') + plt.plot(trajectories2[0].time, trajectories2[0].state[:,F], '-', label='Open-loop $\\bar{u}_k$') + + plt.plot(tb, xb[:,F], '.-', label="iLQR rediction $\\bar{x}_k$") + plt.xlabel("Time/seconds") + plt.ylabel("$\cos(\\theta)$") + plt.title(f"Cartpole environment $T={N}$") + + plt.grid() + plt.legend() + ev = "pendulum" + savepdf(f"irlc_cartpole_theta_N{N}_{use_linesearch}{figex}") + plt.show() + +def plt_cartpole(): + cartpole_experiment(N=50, use_linesearch=True, animate=True) + +if __name__ == '__main__': + plt_cartpole() diff --git a/irlc/ex07/ilqr_pendulum.py b/irlc/ex07/ilqr_pendulum.py new file mode 100644 index 0000000000000000000000000000000000000000..5bcc82e7d94d072bf5ce529e08a99bb6f0647895 --- /dev/null +++ b/irlc/ex07/ilqr_pendulum.py @@ -0,0 +1,68 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
+import numpy as np +from irlc.ex04.model_pendulum import DiscreteSinCosPendulumModel +import matplotlib.pyplot as plt +import time +from irlc.ex07.ilqr_rendovouz_basic import ilqr +from irlc import savepdf + +def pendulum(use_linesearch): + print("> Using iLQR to solve Pendulum swingup task. Using linesearch?", use_linesearch) + dt = 0.02 + model = DiscreteSinCosPendulumModel(dt, cost=None) + N = 250 + # This rather clunky line gets us the initial state; we transform the bound by the variable transformation. + x0 = np.asarray(model.phi_x(model.continuous_model.x0_bound().low)) + + n_iter = 200 # Use 200 iLQR iterations. + # xs, us, J_hist, L, l = ilqr(model, ...) Write a function call like this, but with the correct parameters + # TODO: 1 lines missing. + raise NotImplementedError("Call iLQR here (see hint above).") + + render = True + if render: + for i in range(2): + render_(xs, model) + time.sleep(2) # Sleep for two seconds between simulations. + model.close() + xs = np.asarray([model.phi_x_inv(x) for x, u in zip(xs, us)]) # Convert to Radians. We use the built-in functions to change coordinates. + xs, us = np.asarray(xs), np.asarray(us) + + t = np.arange(N) * dt + theta = np.unwrap(xs[:, 0]) # Makes for smoother plots. 
+ theta_dot = xs[:, 1] + + pdf_ex = '_linesearch' if use_linesearch else '' + stitle = "(using linesearch)" if use_linesearch else "(not using linesearch) " + ev = 'pendulum_' + _ = plt.plot(theta, theta_dot) + _ = plt.xlabel("$\\theta$ (rad)") + _ = plt.ylabel("$d\\theta/dt$ (rad/s)") + _ = plt.title(f"Phase Plot {stitle}") + plt.grid() + savepdf(f"{ev}theta{pdf_ex}") + plt.show() + + _ = plt.plot(t, us) + _ = plt.xlabel("time (s)") + _ = plt.ylabel("Force (N)") + _ = plt.title(f"Action path {stitle}") + plt.grid() + savepdf(f"{ev}action{pdf_ex}") + plt.show() + + _ = plt.plot(J_hist) + _ = plt.xlabel("Iteration") + _ = plt.ylabel("Total cost") + _ = plt.title(f"Total cost-to-go {stitle}") + plt.grid() + savepdf(f"{ev}J{pdf_ex}") + plt.show() + +def render_(xs, env): + for i in range(xs.shape[0]): + env.render(xs[i]) + +if __name__ == "__main__": + pendulum(use_linesearch=False) + pendulum(use_linesearch=True) diff --git a/irlc/ex07/ilqr_pendulum_agent.py b/irlc/ex07/ilqr_pendulum_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..a52b0237cd6669973b0dabe95c4b31d6556eb1c3 --- /dev/null +++ b/irlc/ex07/ilqr_pendulum_agent.py @@ -0,0 +1,63 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
+import numpy as np +from irlc.ex04.model_pendulum import GymSinCosPendulumEnvironment +from irlc.ex07.ilqr_agent import ILQRAgent +from irlc import train +from irlc import savepdf +import matplotlib.pyplot as plt + +Tmax = 3 +def pen_experiment(N=12, use_linesearch=True,figex="", animate=True): + dt = Tmax / N + env = GymSinCosPendulumEnvironment(dt, Tmax=Tmax, supersample_trajectory=True, render_mode="human" if animate else None) + agent = ILQRAgent(env, env.discrete_model, N=N, ilqr_iterations=200, use_linesearch=use_linesearch) + + stats, trajectories = train(env, agent, num_episodes=1, return_trajectory=True) + + agent.use_ubar = True + stats2, trajectories2 = train(env, agent, num_episodes=1, return_trajectory=True) + env.close() + + plot_pendulum_trajectory(env, trajectories[0], label='Closed-loop $\\pi$') + xb = agent.xbar + tb = np.arange(N+1)*dt + plt.figure(figsize=(12, 6)) + plt.plot(trajectories[0].time, trajectories[0].state[:,1], '-', label='Closed-loop $\\pi(x_k)$') + + plt.plot(trajectories2[0].time, trajectories2[0].state[:,1], '-', label='Open-loop $\\bar{u}_k$') + plt.plot(tb, xb[:,1], 'o-', label="iLQR prediction $\\bar{x}_k$") + plt.grid() + plt.legend() + ev = "pendulum" + savepdf(f"irlc_pendulum_theta_N{N}_{use_linesearch}{figex}") + plt.show() + + ## Plot J + plt.figure(figsize=(6, 6)) + plt.semilogy(agent.J_hist, 'k.-') + plt.xlabel("iLQR Iterations") + plt.ylabel("Cost function estimate $J$") + # plt.title("Last value: {") + plt.grid() + savepdf(f"irlc_pendulum_J_N{N}_{use_linesearch}{figex}") + plt.show() + +def plot_pendulum_trajectory(env, traj, style='k.-', label=None, action=False, **kwargs): + if action: + y = traj.action[:, 0] + y = np.clip(y, env.action_space.low[0], env.action_space.high[0]) + else: + y = traj.state[:, 1] + + plt.plot(traj.time[:-1] if action else traj.time, y, style, label=label, **kwargs) + plt.xlabel("Time/seconds") + if action: + plt.ylabel("Torque $u$") + else: + plt.ylabel("$\cos(\\theta)$") + plt.grid() + 
pass + +N = 50 +if __name__ == "__main__": + pen_experiment(N=N, use_linesearch=True) diff --git a/irlc/ex07/ilqr_rendevoyz.py b/irlc/ex07/ilqr_rendevoyz.py new file mode 100644 index 0000000000000000000000000000000000000000..8cd6cdc30de69f057cb7d024bbdb46f6530e146e --- /dev/null +++ b/irlc/ex07/ilqr_rendevoyz.py @@ -0,0 +1,5 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +from irlc.ex07.ilqr_rendovouz_basic import plot_rendevouz + +if __name__ == "__main__": + plot_rendevouz(use_linesearch=True) diff --git a/irlc/ex07/ilqr_rendovouz_basic.py b/irlc/ex07/ilqr_rendovouz_basic.py new file mode 100644 index 0000000000000000000000000000000000000000..255103bd8d5c77baa8d9dfc850885aed8c8259d0 --- /dev/null +++ b/irlc/ex07/ilqr_rendovouz_basic.py @@ -0,0 +1,97 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
+import numpy as np +import matplotlib.pyplot as plt +from irlc import savepdf +from irlc.ex07.ilqr import ilqr_basic, ilqr_linesearch +from irlc.ex06.model_rendevouz import DiscreteRendevouzModel +from irlc.ex04.control_environment import ControlEnvironment +from irlc.ex04.discrete_control_model import DiscreteControlModel + + +def ilqr(model : DiscreteControlModel, N, x0, n_iter, use_linesearch, verbose=True): + if not use_linesearch: + xs, us, J_hist, L, l = ilqr_basic(model, N, x0, n_iterations=n_iter,verbose=verbose) + else: + xs, us, J_hist, L, l = ilqr_linesearch(model, N, x0, n_iterations=n_iter, tol=1e-6,verbose=verbose) + xs, us = np.stack(xs), np.stack(us) + return xs, us, J_hist, L, l + +def plot_vehicles(x_0, y_0, x_1, y_1, linespec='-', legend=("Vehicle 1", "Vehicle 2")): + _ = plt.title("Trajectory of the two omnidirectional vehicles") + _ = plt.plot(x_0, y_0, "r"+linespec, label=legend[0]) + _ = plt.plot(x_1, y_1, "b"+linespec, label=legend[1]) + +Tmax = 20 +def solve_rendovouz(use_linesearch=False): + model = DiscreteRendevouzModel() + x0 = np.asarray(model.continuous_model.x0_bound().low) # Starting position + N = int(Tmax/model.dt) + return ilqr(model, N, x0, n_iter=10, use_linesearch=use_linesearch), model + +def plot_rendevouz(use_linesearch=False): + (xs, us, J_hist, _, _), env = solve_rendovouz(use_linesearch=use_linesearch) + N = int(Tmax / env.dt) + dt = env.dt + x_0 = xs[:, 0] + y_0 = xs[:, 1] + x_1 = xs[:, 2] + y_1 = xs[:, 3] + x_0_dot = xs[:, 4] + y_0_dot = xs[:, 5] + x_1_dot = xs[:, 6] + y_1_dot = xs[:, 7] + + pdf_ex = '_linesearch' if use_linesearch else '' + ev = 'rendevouz_' + plot_vehicles(x_0, y_0, x_1, y_1, linespec='-', legend=("Vehicle 1", "Vehicle 2")) + plt.legend() + savepdf(f'{ev}trajectory{pdf_ex}') + plt.show() + + t = np.arange(N + 1) * dt + _ = plt.plot(t, x_0, "r") + _ = plt.plot(t, x_1, "b") + _ = plt.xlabel("Time (s)") + _ = plt.ylabel("x (m)") + _ = plt.title("X positional paths") + _ = plt.legend(["Vehicle 1", 
"Vehicle 2"]) + savepdf(f'{ev}vehicles_x_pos{pdf_ex}') + plt.show() + + _ = plt.plot(t, y_0, "r") + _ = plt.plot(t, y_1, "b") + _ = plt.xlabel("Time (s)") + _ = plt.ylabel("y (m)") + _ = plt.title("Y positional paths") + _ = plt.legend(["Vehicle 1", "Vehicle 2"]) + savepdf(f'{ev}vehicles_y_pos{pdf_ex}') + plt.show() + + _ = plt.plot(t, x_0_dot, "r") + _ = plt.plot(t, x_1_dot, "b") + _ = plt.xlabel("Time (s)") + _ = plt.ylabel("x_dot (m)") + _ = plt.title("X velocity paths") + _ = plt.legend(["Vehicle 1", "Vehicle 2"]) + savepdf(f'{ev}vehicles_vx{pdf_ex}') + plt.show() + + _ = plt.plot(t, y_0_dot, "r") + _ = plt.plot(t, y_1_dot, "b") + _ = plt.xlabel("Time (s)") + _ = plt.ylabel("y_dot (m)") + _ = plt.title("Y velocity paths") + _ = plt.legend(["Vehicle 1", "Vehicle 2"]) + savepdf(f'{ev}vehicles_vy{pdf_ex}') + plt.show() + + _ = plt.plot(J_hist) + _ = plt.xlabel("Iteration") + _ = plt.ylabel("Total cost") + _ = plt.title("Total cost-to-go") + savepdf(f'{ev}cost_to_go{pdf_ex}') + plt.show() + + +if __name__ == "__main__": + plot_rendevouz(use_linesearch=False) diff --git a/irlc/ex07/linearization_agent.py b/irlc/ex07/linearization_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..e06916139a0acaa5ceb36cb5f60aaf62ce55bbb3 --- /dev/null +++ b/irlc/ex07/linearization_agent.py @@ -0,0 +1,67 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +""" + +References: + [Her24] Tue Herlau. Sequential decision making. (Freely available online), 2024. 
+""" +from irlc.ex06.dlqr import LQR +from irlc import Agent +from irlc.ex05.model_cartpole import GymSinCosCartpoleEnvironment +from irlc import train, savepdf +import matplotlib.pyplot as plt +import numpy as np +from irlc.ex04.control_environment import ControlEnvironment +from irlc.ex04.discrete_control_model import DiscreteControlModel + +class LinearizationAgent(Agent): + """ Implement the simple linearization procedure described in (Her24, Algorithm 23) which expands around a single fixed point. """ + def __init__(self, env: ControlEnvironment, model : DiscreteControlModel, xbar=None, ubar=None): + self.model = model + N = 50 # Plan on this horizon. The control matrices will converge fairly quickly. + """ Define A, B, d as the list of A/B matrices here. I.e. x[t+1] = A x[t] + B u[t] + d. + You should use the function model.f to do this, which has build-in functionality to compute Jacobians which will be equal to A, B. + It is important that you linearize around xbar, ubar. See (Her24, Section 17.1) for further details. """ + # TODO: 4 lines missing. + raise NotImplementedError("Insert your solution and remove this error.") + Q, q, R = self.model.cost.Q, self.model.cost.q, self.model.cost.R + """ Define self.L, self.l here as the (lists of) control matrices. """ + # TODO: 1 lines missing. + raise NotImplementedError("Compute control matrices L, l here using LQR(...)") + super().__init__(env) + + def pi(self, x, k, info=None): + """ + Compute the action here using u_k = L_0 x_k + l_0. The control matrix/vector L_0 can be found as the output from LQR, i.e. + L_0 = L[0] and l_0 = l[0]. + + The reason we use L_0, l_0 (and not L_k, l_k) is because the LQR problem itself is an approximation of the true dynamics + and this controller will be able to balance the pendulum for an infinite amount of time. + """ + # TODO: 1 lines missing. 
+ raise NotImplementedError("Compute current action here") + return u + + +def get_offbalance_cart(waiting_steps=30, sleep_time=0.1): + env = GymSinCosCartpoleEnvironment(Tmax=3, render_mode='human') + env.reset() + import time + time.sleep(sleep_time) + env.state = env.discrete_model.x_upright + env.state[-1] = 0.01 # a bit of angular speed. + for _ in range(waiting_steps): # Simulate the environment for waiting_steps steps (30 by default) to get things out of balance. + env.step(1) + time.sleep(sleep_time) + return env + + +if __name__ == "__main__": + np.random.seed(42) # I don't think these results are seed-dependent but let's make sure. + from irlc import plot_trajectory + env = get_offbalance_cart(4) # Simulate for 4 steps to get the cart off-balance. Same idea as PID control. + agent = LinearizationAgent(env, model=env.discrete_model, xbar=env.discrete_model.x_upright, ubar=env.action_space.sample()*0) + _, trajectories = train(env, agent, num_episodes=1, return_trajectory=True, reset=False) # Note reset=False to maintain initial conditions. + plot_trajectory(trajectories[0], env, xkeys=[0,2, 3], ukeys=[0]) + env.close() + savepdf("linearization_cartpole") + plt.show() diff --git a/irlc/ex08/__init__.py b/irlc/ex08/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..28514114cf38978975fea28d6e6670715223cfb8 --- /dev/null +++ b/irlc/ex08/__init__.py @@ -0,0 +1,2 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
+"""This directory contains the exercises for week 8.""" diff --git a/irlc/ex08/__pycache__/__init__.cpython-311.pyc b/irlc/ex08/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0cf56d19154df1c345a7e784ebd4018c06211464 Binary files /dev/null and b/irlc/ex08/__pycache__/__init__.cpython-311.pyc differ diff --git a/irlc/ex08/__pycache__/bandits.cpython-311.pyc b/irlc/ex08/__pycache__/bandits.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c02bb467c268e6303ac30d6dbf56ff684aeef2ec Binary files /dev/null and b/irlc/ex08/__pycache__/bandits.cpython-311.pyc differ diff --git a/irlc/ex08/__pycache__/simple_agents.cpython-311.pyc b/irlc/ex08/__pycache__/simple_agents.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..07b42a3efde578976cf2c50196bdb81b6934b371 Binary files /dev/null and b/irlc/ex08/__pycache__/simple_agents.cpython-311.pyc differ diff --git a/irlc/ex08/bandit_example.py b/irlc/ex08/bandit_example.py new file mode 100644 index 0000000000000000000000000000000000000000..fa5412b7920ab3f503ab0a5b3f9136ae8e5db32c --- /dev/null +++ b/irlc/ex08/bandit_example.py @@ -0,0 +1,27 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
+import numpy as np +import matplotlib.pyplot as plt + +if __name__ == "__main__": + from irlc import Agent, train, savepdf + from irlc.ex08.bandits import StationaryBandit + bandit = StationaryBandit(k=10) # A 10-armed bandit + agent = Agent(bandit) # Recall the agent takes random actions + _, trajectories = train(bandit, agent, return_trajectory=True, num_episodes=1, max_steps=500) + plt.plot(trajectories[0].reward) + plt.xlabel("Time step") + plt.ylabel("Reward per time step") + savepdf("dumbitA") + plt.show() + + agent = Agent(bandit) # Recall the agent takes random actions + for i in range(10): + _, trajectories = train(bandit, agent, return_trajectory=True, num_episodes=1, max_steps=500) + regret = np.asarray([r['average_regret'] for r in trajectories[0].env_info[1:]]) + cum_regret = np.cumsum(regret) + plt.plot(cum_regret, label=f"Episode {i}") + plt.legend() + plt.xlabel("Time step") + plt.ylabel("Accumulated Regret") + savepdf("dumbitB") + plt.show() diff --git a/irlc/ex08/bandits.py b/irlc/ex08/bandits.py new file mode 100644 index 0000000000000000000000000000000000000000..7b3b9577c41fe40f4ee1735e8055f46120a32212 --- /dev/null +++ b/irlc/ex08/bandits.py @@ -0,0 +1,216 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +""" + +References: + [SB18] Richard S. Sutton and Andrew G. Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online). +""" +import numpy as np +import matplotlib.pyplot as plt +from gymnasium import Env +from gymnasium.spaces import Discrete +from irlc import train +from tqdm import tqdm +import sys +from irlc import cache_read, cache_write, cache_exists + +class BanditEnvironment(Env): + r""" + A helper class for defining bandit problems similar to e.g. the 10-armed testbed discsused in (SB18). 
+ We are going to implement the bandit problems as greatly simplified gym environments, as this will allow us to + implement the bandit agents as the familiar ``Agent``. I hope this way of doing it will make it clearer that bandits + are in fact a sort of reinforcement learning method. + + The following code shows an example of how to use a bandit environment: + + .. runblock:: pycon + + >>> from irlc.ex08.bandits import StationaryBandit + >>> env = StationaryBandit(k=10) # 10-armed testbed. + >>> env.reset() # Reset env.q_star + >>> s, r, _, _, info = env.step(3) + >>> print(f"The reward we got from taking arm a=3 was {r=}") + + """ + def __init__(self, k : int): + r""" + Initialize a bandit problem. The observation space is given a dummy value since bandit problems of the sort + (SB18) discuss don't have observations. + + :param k: The number of arms. + """ + super().__init__() + self.observation_space = Discrete(1) # Dummy observation space with a single observation. + self.action_space = Discrete(k) # The arms labelled 0,1,...,k-1. + self.k = k # Number of arms + + def reset(self): + """ + Use this function to reset all the internal parameters of the environment and get ready for a new episode. + In the (SB18) 10-armed bandit testbed, this would involve resetting the expected return + + .. math:: + q^*_a + + The function must return a dummy state and info dictionary to agree with the gym ``Env`` class, but their values are + irrelevant + + :return: + - s - a state, for instance 0 + - info - the info dictionary, for instance {} + """ + raise NotImplementedError("Implement the reset method") + + def bandit_step(self, a): + """ + This helper function simplifies the definition of the environment's ``step``-function. + + Given an action :math:`a`, this function computes the reward obtained by taking that action :math:`r_t` + and the average regret. 
This is defined as the expected reward we miss out on by taking the potentially suboptimal action :math:`a` + and is defined as: + + .. math:: + \max_{a'} q^*_{a'} - q^*_a + + Once implemented, the reward and regret enter into the ``step`` function as follows: + + .. runblock:: pycon + + >>> from irlc.ex08.bandits import StationaryBandit + >>> env = StationaryBandit(k=4) # 4-armed testbed. + >>> env.reset() # Reset all parameters. + >>> _, r, _, _, info = env.step(2) # Take action a=2 + >>> print(f"Reward from a=2 was {r=}, the regret was {info['average_regret']=}") + + :param a: The current action we take + :return: + - r - The reward we thereby incur + - regret - The average regret incurred by taking this action (0 for an optimal action) + """ + reward = 0 # Compute the reward associated with arm a + regret = 0 # Compute the regret, by comparing to the optimal arm's reward. + return reward, regret + + def step(self, action): + """ + This function is automatically defined and you do not have to edit it. + In a bandit environment, the step function is simplified greatly since there are no + states to keep track of. It should simply return the reward incurred by the action ``a`` + and (for convenience) also returns the average regret in the ``info``-dictionary. + + :param action: The current action we take :math:`a_t` + :return: + - next_state - This is always ``None`` + - reward - The reward obtained by taking the given action. In (SB18) this is defined as :math:`r_t` + - terminated - Always ``False``. Bandit problems don't terminate. 
+ - truncated - Always ``False`` + - info - For convenience, this includes the average regret (used by the plotting methods) + + """ + reward, average_regret = self.bandit_step(action) + info = {'average_regret': average_regret} + return None, reward, False, False, info + +class StationaryBandit(BanditEnvironment): + """ + Implement the 'stationary bandit environment' which is described in (SB18, Section 2.3) + and used as a running example throughout the chapter. + + We will implement a version with a constant mean offset (q_star_mean), so that + + q* = x + q_star_mean, x ~ Normal(0,1) + + q_star_mean can just be considered to be zero at first. + """ + def __init__(self, k, q_star_mean=0): + super().__init__(k) + self.q_star_mean = q_star_mean + + def reset(self): + """ Set q^*_k = N(0,1) + mean_value. The mean_value is 0 in most examples. I.e., implement the 10-armed testbed environment. """ + self.q_star = np.random.randn(self.k) + self.q_star_mean + self.optimal_action = np.argmax(self.q_star) # Optimal action is the one with the largest q^*-value. + return 0, {} # The reset method in a gym Env must return a (dummy) state and a dictionary. + + def bandit_step(self, a): + """ Return the reward/regret for action a for the simple bandit. Use self.q_star (see reset-function above). + To implement it, implement the reward (see the description of the 10-armed testbed for more information. + How is it computed from from q^*_k?) and also compute the regret. + + As a small hint, since we are computing the average regret, it will in fact be the difference between the + value of q^* corresponding to the current arm, and the q^* value for the optimal arm. + Remember it is 0 if the optimal action is selected. + """ + # TODO: 2 lines missing. + raise NotImplementedError("Insert your solution and remove this error.") + # Actual logic goes here. Use self.q_star[a] to get mean reward and np.random.randn() to generate random numbers. 
+ return reward, regret + + def __str__(self): + return f"{type(self).__name__}_{self.q_star_mean}" + +""" +Helper function for running a bunch of bandit experiments and plotting the results. + +The function will run the agents in 'agents' (a list of bandit agents) +on the bandit environment 'bandit' and plot the result. + +Each agent will be evaluated for num_episodes episodes, and one episode consist of 'steps' steps. +However, to speed things up you can use cache, and the bandit will not be evaluated for more than +'max_episodes' over all cache runs. + +""" +def eval_and_plot(bandit, agents, num_episodes=2000, max_episodes=2000, steps=1000, labels=None, use_cache=True): + if labels is None: + labels = [str(agent) for agent in agents] + + f, axs = plt.subplots(nrows=3, ncols=1) + f.set_size_inches(10,7) + (ax1, ax2, ax3) = axs + for i,agent in enumerate(agents): + rw, oa, regret, num_episodes = run_agent(bandit, agent, episodes=num_episodes, max_episodes=max_episodes, steps=steps, use_cache=use_cache) + ax1.plot(rw, label=labels[i]) + ax2.plot(oa, label=labels[i]) + ax3.plot(regret, label=labels[i]) + + for ax in axs: + ax.grid() + ax.set_xlabel("Steps") + + ax1.set_ylabel("Average Reward") + ax2.set_ylabel("% optimal action") + ax3.set_ylabel("Regret $L_t$") + ax3.legend() + f.suptitle(f"Evaluated on {str(bandit)} for {num_episodes} episodes") + +def run_agent(env, agent, episodes=2000, max_episodes=2000, steps=1000, use_cache=False, verbose=True): + """ + Helper function. most of the work involves the cache; the actual training is done by 'train'. + """ + C_regrets_cum_sum, C_oas_sum, C_rewards_sum, C_n_episodes = 0, 0, 0, 0 + if use_cache: + cache = f"cache/{str(env)}_{str(agent)}_{steps}.pkl" + if cache_exists(cache): + print("> Reading from cache", cache) + C_regrets_cum_sum, C_oas_sum, C_rewards_sum, C_n_episodes = cache_read(cache) + + regrets = [] + rewards = [] + cruns = max(0, min(episodes, max_episodes - C_n_episodes)) # Missing runs. 
+ for _ in tqdm(range(cruns), file=sys.stdout, desc=str(agent),disable=not verbose): + stats, traj = train(env, agent, max_steps=steps, verbose=False, return_trajectory=True) + regret = np.asarray([r['average_regret'] for r in traj[0].env_info[1:]]) + regrets.append(regret) + rewards.append(traj[0].reward) + + regrets_cum_sum = C_regrets_cum_sum + oas_sum = C_oas_sum + rewards_sum = C_rewards_sum + episodes = C_n_episodes + if len(regrets) > 0: + regrets_cum_sum += np.cumsum(np.sum(np.stack(regrets), axis=0)) + oas_sum += np.sum(np.stack(regrets) == 0, axis=0) + rewards_sum += np.sum(np.stack(rewards), axis=0) + episodes += cruns + if use_cache and cruns > 0: + cache_write((regrets_cum_sum, oas_sum, rewards_sum, episodes), cache, protocol=4) + return rewards_sum/episodes, oas_sum/episodes, regrets_cum_sum/episodes, episodes diff --git a/irlc/ex08/gradient_agent.py b/irlc/ex08/gradient_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..34b296b0cd1dbcea66b194b63d16422b423df98e --- /dev/null +++ b/irlc/ex08/gradient_agent.py @@ -0,0 +1,48 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +from irlc import savepdf +import numpy as np +import matplotlib.pyplot as plt +from irlc.ex08.bandits import eval_and_plot, StationaryBandit +from irlc import Agent + +class GradientAgent(Agent): + def __init__(self, env, alpha=None, use_baseline=True): + self.k = env.action_space.n + self.alpha = alpha + self.baseline=use_baseline + self.H = np.zeros((self.k,)) + super().__init__(env) + + def Pa(self): + """ This helper method returns the probability distribution P(A=a) of chosing the + arm a as a vector + """ + pi_a = np.exp(self.H) + return pi_a / np.sum(pi_a) + + def pi(self, s, t, info_s=None): + if t == 0: + self.R_bar = 0 # average reward baseline + self.H *= 0 # Reset H to all-zeros. 
+ self.t = t # Store the current time step. + return np.random.choice( self.k, p=self.Pa() ) + + def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None): + # TODO: 9 lines missing. + raise NotImplementedError("Implement function body") + + def __str__(self): + return f"{type(self).__name__}_{self.alpha}_{'baseline' if self.baseline else 'no_baseline'}" + +if __name__ == "__main__": + baseline_bandit = StationaryBandit(k=10, q_star_mean=4) + alphas = [0.1, 0.4] + agents = [GradientAgent(baseline_bandit, alpha=alpha, use_baseline=False) for alpha in alphas] + agents += [GradientAgent(baseline_bandit, alpha=alpha, use_baseline=True) for alpha in alphas] + + labels = [f'Gradient Bandit alpha={alpha}' for alpha in alphas ] + labels += [f'With baseline: Gradient Bandit alpha={alpha}' for alpha in alphas ] + use_cache = False + eval_and_plot(baseline_bandit, agents, max_episodes=2000, num_episodes=100, labels=labels, use_cache=use_cache) + savepdf("gradient_baseline") + plt.show() diff --git a/irlc/ex08/grand_bandit_race.py b/irlc/ex08/grand_bandit_race.py new file mode 100644 index 0000000000000000000000000000000000000000..ad466aaaffc88b0b4aa43375b55640aa17dc096a --- /dev/null +++ b/irlc/ex08/grand_bandit_race.py @@ -0,0 +1,78 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +import matplotlib.pyplot as plt +from irlc.ex08.simple_agents import BasicAgent +from irlc.ex08.bandits import StationaryBandit, eval_and_plot +from irlc.ex08.nonstationary import MovingAverageAgent, NonstationaryBandit +from irlc.ex08.gradient_agent import GradientAgent +from irlc.ex08.ucb_agent import UCBAgent +from irlc import savepdf +import time + +if __name__ == "__main__": + print("Ladies and gentlemen. 
It is time for the graaand bandit race") + def intro(bandit, agents): + print("We are live from the beautiful surroundings where they will compete in:") + print(bandit) + print("Who will win? who will have the most regret? we are about to find out") + print("in a minute after a brief word from our sponsors") + time.sleep(1) + print("And we are back. Let us introduce todays contestants:") + for a in agents: + print(a) + print("And they are off!") + epsilon = 0.1 + alpha = 0.1 + c = 2 + # TODO: 1 lines missing. + raise NotImplementedError("Define the bandit here: bandit1 = ...") + # TODO: 5 lines missing. + raise NotImplementedError("define agents list here") + labels = ["Basic", "Moving avg.", "gradient", "Gradient+baseline", "UCB"] + ''' + Stationary, no offset. Vanilla setting. + ''' + intro(bandit1, agents) + # TODO: 1 lines missing. + raise NotImplementedError("Call eval_and_plot here") + plt.suptitle("Stationary bandit (no offset)") + savepdf("grand_race_1") + plt.show() + ''' + Stationary, but with offset + ''' + print("Whew what a race. Let's get ready to next round:") + # TODO: 1 lines missing. + raise NotImplementedError("Define bandit2 = ... here") + intro(bandit2, agents) + # TODO: 1 lines missing. + raise NotImplementedError("Call eval_and_plot here") + plt.suptitle("Stationary bandit (with offset)") + savepdf("grand_race_2") + plt.show() + ''' + Long (nonstationary) simulations + ''' + print("Whew what a race. Let's get ready to next round which will be a long one.") + # TODO: 1 lines missing. + raise NotImplementedError("define bandit3 here") + intro(bandit3, agents) + # TODO: 1 lines missing. + raise NotImplementedError("call eval_and_plot here") + plt.suptitle("Non-stationary bandit (no offset)") + savepdf("grand_race_3") + plt.show() + + ''' + Stationary, no offset, long run. Exclude stupid bandits. 
+ ''' + agents2 = [] + agents2 += [GradientAgent(bandit1, alpha=alpha, use_baseline=False)] + agents2 += [GradientAgent(bandit1, alpha=alpha, use_baseline=True)] + agents2 += [UCBAgent(bandit1, c=2)] + labels = ["Gradient", "Gradient+baseline", "UCB"] + intro(bandit1, agents2) + # TODO: 1 lines missing. + raise NotImplementedError("Call eval_and_plot here") + plt.suptitle("Stationary bandit (no offset)") + savepdf("grand_race_4") + plt.show() diff --git a/irlc/ex08/nonstationary.py b/irlc/ex08/nonstationary.py new file mode 100644 index 0000000000000000000000000000000000000000..1128f0a0bd24b8a8d487c2f8c79ac4f38a94d58f --- /dev/null +++ b/irlc/ex08/nonstationary.py @@ -0,0 +1,62 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +""" + +References: + [SB18] Richard S. Sutton and Andrew G. Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online). +""" +import numpy as np +import matplotlib.pyplot as plt +from irlc.ex08.simple_agents import BasicAgent +from irlc.ex08.bandits import StationaryBandit, eval_and_plot +from irlc import savepdf + +class NonstationaryBandit(StationaryBandit): + def __init__(self, k, q_star_mean=0, reward_change_std=0.01): + self.reward_change_std = reward_change_std + super().__init__(k, q_star_mean) + + def bandit_step(self, a): + """ Implement the non-stationary bandit environment (as described in (SB18)). + Hint: use reward_change_std * np.random.randn() to generate a single random number with the given std. + then add one to each coordinate. Remember you have to compute the regret as well, see StationaryBandit for ideas. + (remember the optimal arm will change when you add noise to q_star) """ + # TODO: 2 lines missing. 
+ raise NotImplementedError("Implement function body") + return super().bandit_step(a) + + def __str__(self): + return f"{type(self).__name__}_{self.q_star_mean}_{self.reward_change_std}" + + +class MovingAverageAgent(BasicAgent): + """ + The simple bandit from (SB18, Section 2.4), but with moving average alpha + as described in (SB18, Eqn. (2.3)) + """ + def __init__(self, env, epsilon, alpha): + # TODO: 2 lines missing. + raise NotImplementedError("Implement function body") + + def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None): + # TODO: 1 lines missing. + raise NotImplementedError("Implement function body") + + def __str__(self): + return f"{type(self).__name__}_{self.epsilon}_{self.alpha}" + + +if __name__ == "__main__": + plt.figure(figsize=(10, 10)) + epsilon = 0.1 + alphas = [0.15, 0.1, 0.05] + + # TODO: 4 lines missing. + raise NotImplementedError("Insert your solution and remove this error.") + + labels = [f"Basic agent, epsilon={epsilon}"] + # TODO: 1 lines missing. + raise NotImplementedError("Insert your solution and remove this error.") + use_cache = False # Set this to True to use cache (after code works!) + eval_and_plot(bandit, agents, steps=10000, num_episodes=200, labels=labels, use_cache=use_cache) + savepdf("nonstationary_bandits") + plt.show() diff --git a/irlc/ex08/simple_agents.py b/irlc/ex08/simple_agents.py new file mode 100644 index 0000000000000000000000000000000000000000..8c51d02312a2150d5d58c2166210befc2c366fca --- /dev/null +++ b/irlc/ex08/simple_agents.py @@ -0,0 +1,57 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +""" + +References: + [SB18] Richard S. Sutton and Andrew G. Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online). 
class BasicAgent(Agent):
    """
    Simple bandit as described on (SB18, Section 2.4).

    Exercise template: ``pi`` and ``train`` raise ``NotImplementedError`` until the
    missing lines are filled in. The constructor and ``__str__`` are complete.
    """
    def __init__(self, env, epsilon):
        """
        :param env: The bandit environment; its ``action_space.n`` is stored as ``self.k``.
        :param epsilon: Stored as ``self.epsilon`` (presumably the exploration rate of the epsilon-greedy rule in (SB18) -- confirm).
        """
        super().__init__(env)
        self.k = env.action_space.n  # Number of discrete actions reported by the environment.
        self.epsilon = epsilon

    def pi(self, s, t, info=None):
        """ Since this is a bandit, s=None and can be ignored, while t refers to the time step in the current episode.

        :param s: The state (unused for a bandit).
        :param t: Time step in the current episode; ``t == 0`` marks the start of a new episode.
        :param info: Optional info dictionary (not used by the visible code).
        :return: The chosen action, once the stub is implemented.
        """
        if t == 0:
            # At step 0 of episode. Re-initialize data structure.
            # TODO: 2 lines missing.
            raise NotImplementedError("Insert your solution and remove this error.")
        # compute action here
        # TODO: 1 lines missing.
        raise NotImplementedError("Insert your solution and remove this error.")

    def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
        """ Since this is a bandit, done, s, sp, info_s, info_sp can all be ignored.
        From the input arguments you should only use a
        (NOTE(review): the sentence above is cut off in the original; presumably the reward ``r`` is needed as well -- confirm).

        Exercise stub: raises ``NotImplementedError`` until implemented.
        """
        # TODO: 2 lines missing.
        raise NotImplementedError("Implement function body")

    def __str__(self):
        # Identifier used to tell agents with different epsilon apart.
        return f"BasicAgent_{self.epsilon}"
class UCBAgent(Agent):
    """
    Exercise template for the UCB bandit agent. The script at the bottom of the
    file compares it to an epsilon-greedy agent (SB18, Fig. 2.4).

    ``train`` and ``pi`` raise ``NotImplementedError`` until the missing lines are filled in.
    """
    def __init__(self, env, c=2):
        """
        :param env: The bandit environment, forwarded to ``Agent.__init__``.
        :param c: Stored as ``self.c`` (presumably the exploration constant of the UCB rule in (SB18) -- confirm).
        """
        self.c = c
        super().__init__(env)

    def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
        """ Update the agent after taking action ``a`` and observing reward ``r`` (exercise stub). """
        # TODO: 2 lines missing.
        raise NotImplementedError("Train agent here")

    def pi(self, s, k, info=None):
        """
        Select an action (exercise stub).

        :param s: The state (not used by the visible code).
        :param k: Step counter; ``k == 0`` is the point where the agent must be re-initialized.
        :param info: Optional info dictionary (not used by the visible code).
        """
        if k == 0:
            """ Initialize the agent"""
            # TODO: 3 lines missing.
            raise NotImplementedError("Reset agent (i.e., make it ready to learn in a new episode with a new optimal action)")
        # TODO: 1 lines missing.
        raise NotImplementedError("Compute (and return) optimal action")

    def __str__(self):
        # Identifier of the form <ClassName>_<c>.
        return f"{type(self).__name__}_{self.c}"
2.4) comparing UCB agent to epsilon greedy """ + runs, use_cache = 100, False + c = 2 + eps = 0.1 + + steps = 1000 + env = StationaryBandit(k=10) + agents = [UCBAgent(env,c=c), BasicAgent(env, epsilon=eps)] + eval_and_plot(bandit=env, agents=agents, num_episodes=runs, steps=steps, max_episodes=2000, use_cache=use_cache) + savepdf("UCB_agent") + plt.show() diff --git a/irlc/ex09/__init__.py b/irlc/ex09/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e753a4f8165d8230bf25dad15c13bb55af050a60 --- /dev/null +++ b/irlc/ex09/__init__.py @@ -0,0 +1,2 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +"""This directory contains the exercises for week 9.""" diff --git a/irlc/ex09/__pycache__/__init__.cpython-311.pyc b/irlc/ex09/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..770ea7a94070c53deff8bab18a19af41044593a6 Binary files /dev/null and b/irlc/ex09/__pycache__/__init__.cpython-311.pyc differ diff --git a/irlc/ex09/__pycache__/mdp.cpython-311.pyc b/irlc/ex09/__pycache__/mdp.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e6c15cd660ced299b47841c52203bdccca264409 Binary files /dev/null and b/irlc/ex09/__pycache__/mdp.cpython-311.pyc differ diff --git a/irlc/ex09/__pycache__/mdp_warmup.cpython-311.pyc b/irlc/ex09/__pycache__/mdp_warmup.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..48b1298531f6c58a39f743d79eeb976a262d212c Binary files /dev/null and b/irlc/ex09/__pycache__/mdp_warmup.cpython-311.pyc differ diff --git a/irlc/ex09/__pycache__/rl_agent.cpython-311.pyc b/irlc/ex09/__pycache__/rl_agent.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5540d8734650d6d4c0b057b272c9ec92e443e436 Binary files /dev/null and 
b/irlc/ex09/__pycache__/rl_agent.cpython-311.pyc differ diff --git a/irlc/ex09/__pycache__/value_iteration.cpython-311.pyc b/irlc/ex09/__pycache__/value_iteration.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..824c674ad5ff89eefd1e4abc43e24567ffb038f5 Binary files /dev/null and b/irlc/ex09/__pycache__/value_iteration.cpython-311.pyc differ diff --git a/irlc/ex09/__pycache__/value_iteration_agent.cpython-311.pyc b/irlc/ex09/__pycache__/value_iteration_agent.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2f0351567c0782f22dfae996b4cd3f6b35a9c91f Binary files /dev/null and b/irlc/ex09/__pycache__/value_iteration_agent.cpython-311.pyc differ diff --git a/irlc/ex09/gambler.py b/irlc/ex09/gambler.py new file mode 100644 index 0000000000000000000000000000000000000000..c45a7e5cb6726b808b857c951752923b56c99cbb --- /dev/null +++ b/irlc/ex09/gambler.py @@ -0,0 +1,81 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +""" + +References: + [SB18] Richard S. Sutton and Andrew G. Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online). +""" +from irlc import savepdf +import matplotlib.pyplot as plt +from irlc.ex09.value_iteration import value_iteration +from irlc.ex09.mdp import MDP + +class GamblerEnv(MDP): + """ + The gamler problem (see description given in (SB18, Example 4.3)) + + See the MDP class for more information about the methods. In summary: + > the state is the amount of money you have. if state = goal or state = 0 the game ends (use this for is_terminal) + > A are the available actions (a list). Note that these depends on the state; see below or example for details. 
+ > Psr are the transitions (see MDP class for documentation) + """ + def __init__(self, goal=100, p_heads=0.4): + super().__init__(initial_state=goal//2) + self.goal = goal + self.p_heads = p_heads + + def is_terminal(self, state): + """ Implement if the state is terminal (0 or self.goal) """ + # TODO: 1 lines missing. + raise NotImplementedError("Return true only if state is terminal.") + + def A(self, s): + """ Action is the amount you choose to gamle. + You can gamble from 0 and up to the amount of money you have (state), + but not so much you will exceed the goal amount (see (SB18) for details). + In other words, return this as a list, and the number of elements should depend on the state s. """ + # TODO: 1 lines missing. + raise NotImplementedError("Implement function body") + + def Psr(self, s, a): + """ Implement transition probabilities here. + the reward is 1 if you win (obtain goal amount) and otherwise 0. Remember the format should + return a dictionary with entries: + > { (sp, r) : probability } + + You can see the small-gridworld example (see exercise description) for an example of how to use this function, + but now you should keep in mind that since you can win (or not) the dictionary you return should have two entries: + one with a probability of self.p_heads (winning) and one with a probability of 1-self.p_heads (loosing). + """ + # TODO: 4 lines missing. 
class MDP:
    r"""
    This class represents a Markov Decision Process. It defines the following main components:

    - The actions available in a given state :math:`A(s)`
    - The transition probabilities :math:`p(s', r | s, a)`
    - A terminal check to determine if a state :math:`s` is terminal
    - A way to specify the initial state:

      - As a single state the MDP always begins in (most common)
      - As a general distribution :math:`p(s_0)`.

    In addition to this it allows you to access either

    - The set of all states (including terminal states) as ``mdp.states``
    - The set of all non-terminal states as ``mdp.nonterminal_states``

    .. note::
        The ``states`` and ``nonterminal_states`` are computed lazily. This means that if you don't access them, they won't use memory.
        This allows you to specify MDPs with an infinite number of states without running out of memory.
    """
    def __init__(self, initial_state=None, verbose=False):
        """
        Initialize the MDP. In the case where ``initial_state`` is set to a value :math:`s_0`, the initial state distribution will be

        .. math::
            p(s_0) = 1

        :param initial_state: An optional initial state.
        :param verbose: If ``True``, the class will print out debug information (useful for very large MDPs)
        """
        self.verbose = verbose
        self.initial_state = initial_state  # Starting state s_0 of the MDP.
        # The following variables that begin with _ are used to cache computations. The reason why we don't compute them
        # up-front is because their computation may be time-consuming and they might not be needed.
        self._states = None
        self._nonterminal_states = None
        self._terminal_states = None

    def is_terminal(self, state) -> bool:
        r"""
        Determines if a state is terminal (i.e., the environment/model is complete). In (SB18), the terminal
        state is written as :math:`s_T`.

        .. runblock:: pycon

            >>> from irlc.gridworld.gridworld_environments import FrozenLake
            >>> mdp = FrozenLake().mdp
            >>> mdp.is_terminal(mdp.initial_state) # False, obviously.


        :param state: The state :math:`s` to check
        :return: ``True`` if the state is terminal and otherwise ``False``.
        """
        return False  # Default: no state is terminal. Subclasses override this.

    def Psr(self, state, action) -> dict:
        r"""
        Represents the transition probabilities:

        .. math::
            P(s', r | s, a)

        When called with state ``state`` and action ``action``, the function returns a dictionary of the form
        ``{(s1, r1): p1, (s2, r2): p2, ...}``, so that ``p2`` is the probability of transitioning to ``s2`` (and obtaining
        reward ``r2``) given we are in state ``state`` and take action ``action``:

        .. math::
            P(s_2, r_2 | s,a) = p_2

        An example:

        .. runblock:: pycon

            >>> from irlc.gridworld.gridworld_environments import FrozenLake
            >>> mdp = FrozenLake().mdp
            >>> transitions = mdp.Psr(mdp.initial_state, 0) # P( ... | s0, a=0)
            >>> for (sp, r), p in transitions.items():
            ...     print(f"P(s'={sp}, r={r} | s={mdp.initial_state}, a=0) = {p}")

        :param state: The state to compute the transition probabilities in
        :param action: The action to compute the transition probabilities in
        :return: A dictionary where the keys are state, reward pairs we will transition to, :math:`p(s', r | ...)`, and the values are their probability.
        :raises NotImplementedError: Always, unless overridden by a subclass.
        """
        raise NotImplementedError("Return state distribution as a dictionary (see class documentation)")

    def A(self, state) -> list:
        r"""
        Returns a list of actions available in the given state:

        .. math::
            A(s)

        An example to get the actions in the initial state:

        .. runblock:: pycon

            >>> from irlc.gridworld.gridworld_environments import FrozenLake
            >>> mdp = FrozenLake().mdp
            >>> mdp.A(mdp.initial_state)

        :param state: State to compute the actions in :math:`s`
        :return: The list of available actions :math:`\mathcal A(s) = \{0, 1, ..., n-1\}`
        :raises NotImplementedError: Always, unless overridden by a subclass.
        """
        raise NotImplementedError("Return set/list of actions in given state A(s) = {a1, a2, ...}")

    def initial_state_distribution(self):
        """
        (**Optional**) specify the initial state distribution. Should return a dictionary of the form:
        ``{s0: p0, s1: p1, ..., sn: pn}``, in which case :math:`p(S_0 = s_k) = p_k`.

        You will typically not overwrite this function but just set the initial state. In that case the initial state distribution
        is deterministic:


        .. runblock:: pycon

            >>> from irlc.gridworld.gridworld_environments import FrozenLake
            >>> mdp = FrozenLake().mdp
            >>> mdp.initial_state_distribution()



        :return: An initial state distribution as a dictionary, where the keys are states, and the values are their probability.
        :raises Exception: If no initial state was given and the method has not been overridden.
        """
        if self.initial_state is not None:
            return {self.initial_state: 1}
        else:
            raise Exception("Either specify the initial state, or implement this method.")

    @property
    def nonterminal_states(self):
        r"""
        The list of non-terminal states, i.e. :math:`\mathcal{S}` in (SB18)


        .. runblock:: pycon

            >>> from irlc.gridworld.gridworld_environments import FrozenLake
            >>> mdp = FrozenLake().mdp
            >>> mdp.nonterminal_states

        :return: The list of non-terminal states :math:`\mathcal{S}`
        """
        if self._nonterminal_states is None:
            # Computed once from ``self.states`` and cached.
            self._nonterminal_states = [s for s in self.states if not self.is_terminal(s)]
        return self._nonterminal_states

    @property
    def states(self):
        r"""
        The list of all states including terminal ones, i.e. :math:`\mathcal{S}^+` in (SB18).
        The terminal states are those where ``is_terminal(state)`` is true.

        The states are found by a breadth-first expansion: starting from the states of the initial state
        distribution, every successor state reachable through ``A`` and ``Psr`` is added until no new state
        appears. Terminal states are included but not expanded. The result is cached; its order is arbitrary
        (it is produced from a set).

        .. runblock:: pycon

            >>> from irlc.gridworld.gridworld_environments import FrozenLake
            >>> mdp = FrozenLake().mdp
            >>> mdp.states

        :return: The list all states :math:`\mathcal{S}^+`
        """
        if self._states is None:
            frontier = set(self.initial_state_distribution().keys())
            # ``seen`` is a set so each membership test is O(1); the states are hashable anyway
            # since they are used as dictionary keys by ``Psr``.
            seen = set(frontier)
            while frontier:
                reached = set()
                for s in tqdm(frontier, file=sys.stdout) if self.verbose else frontier:
                    if self.is_terminal(s):
                        continue  # Terminal states have no outgoing transitions to explore.
                    for a in self.A(s):
                        reached.update(sp for sp, r in self.Psr(s, a))
                # Only states that were never seen before need to be expanded in the next round.
                frontier = reached - seen
                seen |= frontier
            self._states = list(seen)

        return self._states
class GymEnv2MDP(MDP):
    """
    Adapter which exposes a tabular gym environment as an :class:`MDP`.

    The environment must provide a transition table ``env.P`` such that ``env.P[s][a]`` is an iterable of
    ``(probability, next_state, reward, done)`` tuples, and a discrete observation space
    (``env.observation_space.n``); the states are taken to be ``0, 1, ..., n-1``.
    """
    def __init__(self, env):
        """
        :param env: The gym environment (possibly wrapped) to convert.
        """
        super().__init__()
        self._states = list(range(env.observation_space.n))
        # The transition table ``P`` lives on the innermost environment. ``gym.make`` may stack several wrappers
        # around it, so peeling off a single layer via ``env.env`` is not enough in general and attribute
        # forwarding through wrappers cannot be relied upon. ``unwrapped`` returns the base environment directly.
        if hasattr(env, 'unwrapped'):
            env = env.unwrapped
        elif hasattr(env, 'env'):
            env = env.env
        # A state is terminal if some transition that ends in it is flagged as ``done``.
        self._terminal_states = {
            sp
            for s in env.P
            for a in env.P[s]
            for (pr, sp, reward, done) in env.P[s][a]
            if done
        }
        self.env = env

    def is_terminal(self, state):
        """ Return ``True`` if ``state`` was reached by a ``done`` transition in the table. """
        return state in self._terminal_states

    def A(self, state):
        """ Return the list of actions listed for ``state`` in the transition table. """
        return list(self.env.P[state].keys())

    def Psr(self, state, action):
        """
        Return the transition probabilities as a dictionary ``{(next_state, reward): probability}``.
        Table entries with identical ``(next_state, reward)`` are merged by adding their probabilities.
        """
        d = defaultdict(float)
        for (pr, sp, reward, done) in self.env.P[state][action]:
            d[(sp, reward)] += pr
        return d
def expected_reward(mdp : MDP, s, a) -> float:
    """
    Compute the expected immediate reward E[r | s, a] (exercise stub).

    Until the missing line is filled in, the function raises ``NotImplementedError``.

    :param mdp: An MDP instance (presumably ``mdp.Psr(s, a)`` supplies the probabilities needed -- confirm against the exercise text).
    :param s: A state
    :param a: An action
    :return: The expected reward as a float.
    """
    # TODO: 1 lines missing.
    raise NotImplementedError("Insert your solution and remove this error.")
    # NOTE: the solution must assign a local variable named ``expected_reward``;
    # otherwise this line returns the function object itself.
    return expected_reward
def policy_evaluation(pi, mdp, gamma=.99, theta=0.00001):
    """ Implements the iterative policy-evaluation algorithm ((SB18, Section 4.1)).

    The algorithm is given a policy pi which is represented as a dictionary so that

    > pi[s][a] = p

    is the probability p of taking action a in state s. The 'mdp' is a MDP-instance and the other terms have the
    same meaning as in the algorithm. It should return a dictionary v so that

    > v[s]

    is the value-function evaluated in state s. I recommend using the function ``value_function2q_function``
    (imported from ``irlc.ex09.mdp_warmup``).

    This is an exercise template: as long as the missing lines are not filled in, the function raises
    ``NotImplementedError`` as soon as the MDP has at least one non-terminal state.

    :param pi: The policy to evaluate, ``pi[s][a]`` is the probability of action ``a`` in state ``s``.
    :param mdp: The MDP; only ``mdp.nonterminal_states`` is read by the code visible here.
    :param gamma: The discount factor.
    :param theta: Stop when the largest change of the value function during a sweep is below this threshold.
    :return: The value function as a ``defaultdict`` (states never written have value 0).
    """
    v = defaultdict(float)
    Delta = theta  # Start at the threshold itself so the condition below holds and the first sweep always runs.
    while Delta >= theta:  # Outer loop in (SB18)
        Delta = 0  # Remember to update Delta (same meaning as in (SB18))
        # Remember that 'S' in (SB18) is actually just the set of non-terminal states (NOT including terminal states!)
        for s in mdp.nonterminal_states:  # See the MDP class if you are curious about how this variable is defined.
            # Implement the main body of the policy evaluation algorithm here. You can do this directly,
            # or implement (and use) the value_function2q_function-function (consider what it does and compare to the algorithm).
            # If you do so, note that value_function2q_function(mdp, s, gamma, v) computes the equivalent of Q(s,a) (as a dictionary),
            # and in the algorithm, you then need to compute the expectation over pi:
            #   > sum_a pi(a|s) Q(s,a)
            # In code it would be more akin to
            #   q = value_function2q_function(...)
            #   sum_a pi[s][a] * q[a]
            #
            # Don't be afraid to use a few more lines than I do.
            # TODO: 2 lines missing.
            raise NotImplementedError("Insert your solution and remove this error.")
            # Stop condition. v_ is the old value of the value function, i.e. v[s] as it was before you
            # updated it (see algorithm listing in (SB18)); v[s] is the value you need to update.
            Delta = max(Delta, np.abs(v_ - v[s]))
    return v
def policy_iteration(mdp, gamma=1.0):
    r"""
    Implement policy iteration (see (SB18, Section 4.3)).

    Note that policy iteration only considers deterministic policies. We will therefore use the shortcut of
    representing the policy pi as a dictionary (similar to the DP-problem in week 2!) so that

    > a = pi[s]

    is the action in state s.

    This is an exercise template: the policy-improvement step is missing and the function raises
    ``NotImplementedError`` after the first policy evaluation until it is filled in.

    :param mdp: The MDP. The visible code uses ``mdp.nonterminal_states`` and ``mdp.A(s)``.
    :param gamma: The discount factor, forwarded to ``policy_evaluation``.
    :return: A tuple ``(pi, V)`` of the final policy and the value function from its last evaluation.
    """
    pi = {s: np.random.choice(mdp.A(s)) for s in mdp.nonterminal_states}
    policy_stable = False
    V = None  # Sutton has an initialization-step, but it can actually be skipped if we initialize the policy randomly.
    while not policy_stable:
        # Evaluate the current policy using your code from the previous exercise.
        # The main complication is that we need to transform our deterministic policy, pi[s], into a stochastic one pi[s][a].
        # It will be defined as:
        # >>> pi_prob[s][a] = 1 if a = pi[s] and otherwise 0.
        pi_prob = {s: {a: 1 if pi[s] == a else 0 for a in mdp.A(s)} for s in mdp.nonterminal_states}
        # The policy is evaluated exactly once per sweep. pi_prob lists every action of A(s), so it works
        # no matter whether policy_evaluation iterates over the policy's actions or over all Q-values.
        V = policy_evaluation(pi_prob, mdp, gamma)
        # Implement the method. This is step (3) in (SB18).
        policy_stable = True  # Will be set to False if the policy pi changes
        # Implement the steps for policy improvement here. Start by writing a for-loop over all non-terminal states
        # you can see the policy_evaluation function for how to do this, but
        # I recommend looking at the property mdp.nonterminal_states (see MDP class for more information).
        # Hints:
        #   * In the algorithm in (SB18), you need to perform an argmax_a over what is actually Q-values. The function
        #     value_function2q_function(mdp, s, gamma, V) can compute these.
        #   * The argmax itself, assuming you follow the above procedure, involves a dictionary. It can be computed
        #     using methods similar to those we saw in week2 of the DP problem.
        # It is not a coincidence these algorithms are very similar -- if you think about it, the maximization step
        # closely resembles the DP algorithm!
        # TODO: 6 lines missing.
        raise NotImplementedError("Insert your solution and remove this error.")
    return pi, V
+ >>> agent.Q.get_optimal_action(state, info) # Note we pass along the info-dictionary corresopnding to this state + + .. note:: + The ``get_optimal_action``-function requires an ``info`` dictionary. This is required since the info dictionary + contains information about which actions are available. To read more about the Q-values, see :class:`~irlc.ex09.rl_agent.TabularQ`. + """ + def __init__(self, env, gamma=0.99, epsilon=0): + """ + Initialize a tabular environment. For convenience it stores the discount factor :math:`\gamma` and + exploration parameter :math:`\\varepsilon` for epsilon-greedy exploration. Access them as e.g. ``self.gamma`` + + When you implement an agent and overwrite the ``__init__``-method, you should include a call such as ``super( + ).__init__(gamma, epsilon)``. + + :param env: The gym environment + :param gamma: The discount factor :math:`\gamma` + :param epsilon: Exploration parameter :math:`\\varepsilon` for epsilon-greedy exploration + """ + super().__init__(env) + self.gamma, self.epsilon = gamma, epsilon + self.Q = TabularQ(env) + + def pi_eps(self, s, info): + """ + Performs :math:`\\varepsilon`-greedy exploration with :math:`\\varepsilon =` ``self.epsilon`` and returns the + action. Recall this means that with probability :math:`\\varepsilon` it returns a random action, and otherwise + it returns an action associated with a maximal Q-value (:math:`\\arg\\max_a Q(s,a)`). An example: + + .. runblock:: pycon + + >>> from irlc.ex09.rl_agent import TabularAgent + >>> from irlc.gridworld.gridworld_environments import BookGridEnvironment + >>> env = BookGridEnvironment() + >>> agent = TabularAgent(env) + >>> state, info = env.reset() + >>> agent.pi_eps(state, info) # Note we pass along the info-dictionary corresopnding to this state + + .. note:: + The ``info`` dictionary is used to mask (exclude) actions that are not possible in the state. + It is similar to the info dictionary in ``agent.pi(s,info)``. 
def _masked_actions(action_space, mask):
    """
    Helper function which applies a mask to the action space.

    For a ``DiscreteTextActionSpace`` the actions are ``0, ..., n-1`` and ``mask[a]`` tells whether action
    ``a`` is available. For any other space, entry ``i`` of the mask refers to action
    ``action_space.start + i``, so the returned actions are offset by ``start``.

    :param action_space: The action space of the environment; must expose ``n`` (and ``start`` unless it is a
        ``DiscreteTextActionSpace``).
    :param mask: An indexable of length ``n`` where a value of 1 marks an available action.
    :return: A list of the available actions.
    """
    from irlc.utils.common import DiscreteTextActionSpace
    if isinstance(action_space, DiscreteTextActionSpace):
        return [a for a in range(action_space.n) if mask[a] == 1]
    # Enumerate mask positions and shift them by ``start``. Enumerating ``range(n)`` as action ids (as was
    # done before) only works when ``start == 0``: otherwise it indexes the mask at negative positions and
    # returns ids outside ``start, ..., start + n - 1``.
    start = action_space.start
    return [start + i for i in range(action_space.n) if mask[i] == 1]
It requires a gym environment to know how many actions there are for each state. + :param env: A gym environment. + """ + self._known_masks = {} # Cache the known action masks. + + def q_default(s): + if s in self._known_masks: + return {a: 0 for a in range(self.env.action_space.n) if self._known_masks[s][a- self.env.action_space.start] == 1} + else: + return {a: 0 for a in range(self.env.action_space.n)} + + # qfun = lambda s: OrderedDict({a: 0 for a in (env.P[s] if hasattr(env, 'P') else range(env.action_space.n))}) + self.q_ = defaultdict2(lambda s: q_default(s)) + self.env = env + + def get_Qs(self, state, info_s=None): + """ + Get a list of all known Q-values for this particular state. That is, in a given state, it will return the two + lists: + + .. math:: + \\begin{bmatrix} a_1 \\\\ a_2 \\\\ \\vdots \\\\ a_k \\end{bmatrix}, \\quad + \\begin{bmatrix} Q(s,a_1) \\\\ Q(s,a_1) \\\\ \\vdots \\\\ Q(s,a_k) \\end{bmatrix} \\\\ + + the ``info_s`` parameter will ensure actions are correctly masked. An example of how to use this function from + a policy: + + .. runblock:: pycon + + >>> from irlc.ex09.rl_agent import TabularAgent + >>> class MyAgent(TabularAgent): + ... def pi(self, s, k, info=None): + ... actions, q_values = self.Q.get_Qs(s, info) + + :param state: The state to query + :param info_s: The info-dictionary returned by the environment for this state. Used for action-masking. + :return: + - actions - A tuple containing all actions available in this state ``(a_1, a_2, ..., a_k)`` + - Qs - A tuple containing all Q-values available in this state ``(Q[s,a1], Q[s, a2], ..., Q[s,ak])`` + """ + if info_s is not None and 'mask' in info_s: + if state not in self._known_masks: + self._known_masks[state] = info_s['mask'] + # Probably a good idea to check the Q-values are okay... 
+ avail_actions = _masked_actions(self.env.action_space, info_s['mask']) + self.q_[state] = {a: self.q_[state][a] for a in avail_actions} + + (actions, Qa) = zip(*self.q_[state].items()) + return tuple(actions), tuple(Qa) + + def get_optimal_action(self, state, info_s): + """ + For a given state ``state``, this function returns the optimal action for that state. + + .. math:: + a^* = \\arg\\max_a Q(s,a) + + An example: + .. runblock:: pycon + + >>> from irlc.ex09.rl_agent import TabularAgent + >>> class MyAgent(TabularAgent): + ... def pi(self, s, k, info=None): + ... a_star = self.Q.get_optimal_action(s, info) + + + :param state: State to find the optimal action in :math:`s` + :param info_s: The ``info``-dictionary corresponding to this state + :return: The optimal action according to the Q-table :math:`a^*` + """ + actions, Qa = self.get_Qs(state, info_s) + a_ = np.argmax(np.asarray(Qa) + np.random.rand(len(Qa)) * 1e-8) + return actions[a_] + + def _chk_mask(self, s, a): + if s in self._known_masks: + mask = self._known_masks[s] + if mask[a - self.env.action_space.start] == 0: + raise Exception(f" Invalid action. You tried to access Q[{s}, {a}], however the action {a} has been previously masked and therefore cannot exist in this state. The mask for {s} is mask={mask}.") + + def __getitem__(self, state_comma_action): + s, a = state_comma_action + self._chk_mask(s, a) + return self.q_[s][a] + + def __setitem__(self, state_comma_action, q_value): + s, a = state_comma_action + self._chk_mask(s, a) + self.q_[s][a] = q_value + + def to_dict(self): + """ + This helper function converts the known Q-values to a dictionary. This function is only used for + visualization purposes in some of the examples. 
class SmallGridworldMDP(MDP):
    """
    A deterministic ``rows x cols`` gridworld. States are ``(row, col)`` tuples, every move gives a reward of -1,
    and the top-left and bottom-right corners are terminal.
    """
    def __init__(self, rows=4, cols=4):
        """
        :param rows: Number of rows in the grid
        :param cols: Number of columns in the grid
        """
        self.rows, self.cols = rows, cols  # Number of rows, columns.
        super().__init__(initial_state=(rows//2, cols//2))  # Initial state is in the middle of the board.

    def A(self, state):
        """ Return the actions available in ``state``; all four directions are always available. """
        return [UP, DOWN, RIGHT, LEFT]

    def Psr(self, state, action):
        """
        Return the transition probabilities as a dictionary ``{(next_state, reward): probability}``.

        Rows grow downwards and columns grow to the right, so UP/DOWN change the row by -1/+1 and
        LEFT/RIGHT change the column by -1/+1. (LEFT and RIGHT used to be swapped.) A move that would
        leave the grid keeps the agent at the border.

        :param state: The current state ``(row, col)``
        :param action: One of ``UP``, ``RIGHT``, ``DOWN``, ``LEFT``
        :return: A dictionary with a single entry, since the environment is deterministic
        """
        row, col = state  # state is in the format state = (row, col)
        if action == UP:
            row -= 1
        elif action == DOWN:
            row += 1
        elif action == LEFT:
            col -= 1
        elif action == RIGHT:
            col += 1

        col = min(self.cols-1, max(col, 0))  # Check boundary conditions.
        row = min(self.rows-1, max(row, 0))
        reward = -1  # Always get a reward of -1
        next_state = (row, col)
        # Note that P(next_state, reward | state, action) = 1 because environment is deterministic
        return {(next_state, reward): 1}

    def is_terminal(self, state):
        """ Return True when ``state`` is the top-left or the bottom-right corner of the grid. """
        row, col = state
        return (row == 0 and col == 0) or (row == self.rows-1 and col == self.cols-1)
def value_iteration(mdp, gamma=.99, theta=0.0001, max_iters=10 ** 6, verbose=False):
    """ Implement the value-iteration algorithm defined in (SB18, Section 4.4).
    The inputs should be self-explanatory given the pseudo-code.

    I have also included a max_iters variable which represents an upper bound on the total number of iterations. This is useful
    if you want to check what the algorithm does after a certain number of steps (e.g. 1 or 2).

    The verbose-variable makes the algorithm print out the biggest change in the value-function in a single step.
    This is useful if you run it on a large problem and want to know how much time remains, or simply get an idea of
    how quickly it converges.

    :param mdp: The MDP to solve; the code iterates over ``mdp.nonterminal_states``
    :param gamma: The discount factor
    :param theta: Stop once the change ``Delta`` recorded during a sweep is below this threshold
    :param max_iters: Upper bound on the number of sweeps over the states
    :param verbose: If True, print the sweep number and ``Delta`` after each sweep
    :return: A tuple ``(pi, V)`` with the policy computed by ``values2policy`` and the value function ``V``
    """
    V = defaultdict(lambda: 0)  # value function; states that have not been updated have value 0
    for i in range(max_iters):
        Delta = 0  # reset for every sweep; the missing lines below are expected to update it
        for s in mdp.nonterminal_states:
            """ Perform the update the value-function V[s] here for the given state.
            Note that this has a lot of similarity to the policy-evaluation algorithm, and you can re-use
            a lot of that solution, including value_function2q_function(...) (assuming you used that function). """
            # TODO: 2 lines missing.
            raise NotImplementedError("Complete the algorithm here.")
        if verbose:
            print(i, Delta)
        if Delta < theta:
            break
    # Turn the value-function into a policy. It implements the last line of the algorithm.
    pi = values2policy(mdp, V, gamma)
    return pi, V
class ValueIterationAgent(TabularAgent):
    """
    Agent which follows a policy computed by value iteration and explores with probability ``epsilon``.
    The planning call in ``__init__`` and the greedy action in ``pi`` are left as exercises.
    """
    def __init__(self, env, mdp=None, gamma=1, epsilon=0, **kwargs):
        """
        :param env: The gym environment
        :param mdp: The MDP to plan in (the example at the bottom of the file passes ``env.mdp``)
        :param gamma: The discount factor
        :param epsilon: Probability of taking a random action in ``pi``
        :param kwargs: Accepted for compatibility; not used by the code shown here
        """
        # Forward gamma and epsilon so that ``self.gamma`` and ``self.epsilon`` hold the values passed here.
        # Calling ``super().__init__(env)`` alone left ``self.gamma`` at the TabularAgent default of 0.99.
        super().__init__(env, gamma=gamma, epsilon=epsilon)
        # TODO: 1 lines missing.
        raise NotImplementedError("Call the value_iteration function and store the policy for later.")

    def pi(self, s, k, info=None):
        """ With probability (1-epsilon), take the optimal action as computed using value iteration.
        With probability epsilon, take a random action. You can do this using ``super().pi(s, k, info)``.

        :param s: The current state
        :param k: The current time step
        :param info: The ``info``-dictionary corresponding to ``s``
        :return: The action to take in ``s``
        """
        if np.random.rand() < self.epsilon:
            return super().pi(s, k, info)  # Recall that by default the policy takes random actions.
        else:
            # Return the optimal action here. This should be computed using value-iteration.
            # To speed things up, I recommend calling value-iteration from the __init__-method and store the policy.
            # TODO: 1 lines missing.
            raise NotImplementedError("Compute and return optimal action according to value-iteration.")
        return action

    def __str__(self):
        return f"ValueIteration(epsilon={self.epsilon})"
def run_experiment(episodes, first_visit=True, **kwargs):
    """
    Train an MC agent on blackjack and plot the training curve, the value function and the policy.

    :param episodes: Number of episodes to train for
    :param first_visit: If True use first-visit MC, otherwise every-visit MC
    :param kwargs: Further keyword arguments passed on to ``MCAgent`` (e.g. ``epsilon``); their values are also
        used to label the experiment
    """
    env_name = 'Blackjack-v1'
    env = gym.make(env_name)
    # ``first_visit`` is a named parameter and therefore never part of ``kwargs``; it has to be passed on
    # explicitly, otherwise the agent always uses its default (first-visit) regardless of the argument.
    agent = MCAgent(env, first_visit=first_visit, **kwargs)
    lbl = "_".join(map(str, kwargs.values()))
    fvl = "First" if first_visit else "Every"
    title = f"MC agent ({fvl} visit)"

    expn = f"experiments/{env_name}_MCagent_{episodes}_{first_visit}_{lbl}"  # Name the experiment. Pass the label to the train function to store intermediate results. See the online documentation for more information.
    # TODO: 1 lines missing.
    raise NotImplementedError("call the train(...) function here.")

    # Matplotlib with seaborn is for some reason very slow.
    # This code re-samples the curve to just 400 points:
    main_plot(expn, smoothing_window=episodes//100, resample_ticks=400)
    plt.title("Estimated returns in blackjack using " + title)
    plt.ylim([-0.3, 0])
    savepdf(f"blackjack_MC_agent_{episodes}_{first_visit}")
    plt.show()

    # Collect, for every state with known Q-values, the action chosen by the agent and the largest Q-value.
    V = defaultdict(lambda: 0)
    A = defaultdict(lambda: 0)
    for s, av in agent.Q.to_dict().items():
        A[s] = agent.pi(s, 0)
        V[s] = max(av.values())

    plot_blackjack_value(V, title=title, pdf_out=f"blackjack_mcagent_policy{fvl}_valfun_{episodes}")
    plt.show()
    plot_blackjack_policy(A, title=title)
    savepdf(f"blackjack_mcagent_policy{fvl}_{episodes}")
    plt.show()
def plot_blackjack_value(V, title="Value Function", pdf_out=None):
    """
    Plots the value function as a surface plot. One figure is made for states with a usable ace and one for
    states without.

    :param V: Dictionary of values with keys of the form ``(player, dealer, usable_ace)``
    :param title: Title used for both figures
    :param pdf_out: If given, each figure is saved using ``savepdf`` under ``pdf_out`` followed by
        ``_Usable_ace`` / ``_No_usable_ace``
    """
    if pdf_out is not None:
        # ``savepdf`` is otherwise only imported under the ``__main__`` guard of this file, so it would be
        # undefined when this function is called from another module.
        from irlc import savepdf
    for lbl, ac in zip(["Usable ace", "No usable ace"], [True, False]):
        w = get_by_ace(V, ace=ac)
        X, Y, Z = to_matrix(w)
        ax = plot_surface_2(X, Y, Z)
        ax.set_zlabel("Value")
        ax.set_title(title)
        if pdf_out is not None:
            savepdf(pdf_out+"_"+lbl.replace(" ", "_"))
if __name__ == "__main__":
    # Exercise script: evaluate the fixed policy ``policy20`` on blackjack using MC evaluation,
    # then plot the training curve and the estimated value function.
    from irlc.ex10.mc_evaluate import MCEvaluationAgent
    from irlc.ex01.agent import train
    import gym
    from irlc import main_plot, savepdf

    nenv = "Blackjack-v1"
    env = gym.make(nenv)
    episodes = 50000
    gamma = 1
    experiment = f"experiments/{nenv}_first_{episodes}"
    """ Instantiate the agent and call the training method here. Make sure to pass the policy=policy20 function to the MCEvaluationAgent
    and set gamma=1. """
    # TODO: 2 lines missing.
    raise NotImplementedError("Insert your solution and remove this error.")
    # The code below relies on the missing lines above to define ``agent``.
    main_plot(experiment, smoothing_window=episodes//100, resample_ticks=200)
    plt.ylim([-0.5, 0])
    plt.title("Blackjack using first-visit MC")
    savepdf("blackjack_stick20_first")
    plt.show()

    pdf = "blackjack_stick20_valuefun"
    plot_blackjack_value(agent.v, title="MC first-visit value function", pdf_out=pdf)
    savepdf("blackjack_stick20_valuefun")
    plt.show()
class ChainMRP(MDP):
    # A chain of states 0, 1, ..., length where both end states are terminal.
    def __init__(self, length=6):
        """
        Build the "Chain MRP" example from (SB18). With the default settings, terminal states are 0 and 6,
        all states are [0,1,2,3,4,5,6] and the initial state is 3.

        :param length: Index of the right-most state; the left-most state is 0
        """
        self.max_states = length
        super().__init__(initial_state=length // 2)

    def is_terminal(self, state):
        # Both ends of the chain are terminal.
        return state == 0 or state == self.max_states

    def A(self, s): # 0: left, 1: right.
        return [0,1]

    def Psr(self, s, a):
        # The missing line is expected to define the next state ``sp``. The return statement below gives
        # reward 1 when ``sp`` is the right-most state and 0 otherwise, with probability 1.
        # TODO: 1 lines missing.
        raise NotImplementedError("Return the P(s', r | s,a) values here. See e.g. the gampler problem from previous week for help.")
        return {(sp, 1 if sp == self.max_states else 0): 1.0}
""" + v_init_fun = lambda x: 0.5 + + fig, ax = plt.subplots(figsize=(15, 6), ncols=2) + """ Make TD plot """ + td_episodes = [0, 1, 10, 100] + V_current = np.copy(V_init) + xticks = ['A', 'B', 'C', 'D', 'E'] + + for i, episodes in enumerate(td_episodes): + agent = TD0ValueAgent(env, v_init_fun=v_init_fun) + train(env, agent, num_episodes=episodes,verbose=False, return_trajectory=False) + vs = [agent.value(s) for s in states] + ax[0].plot(vs, label=f"{episodes} episodes", marker='o') + + ax[0].plot(V_true, label='true values', marker='o') + ax[0].set(xlabel='State', ylabel='Estimated Value', title='Estimated Values TD(0)', + xticks=np.arange(5), xticklabels=['A','B','C','D','E']) + ax[0].legend() + + """ Make TD vs. MC plot """ + td_alphas = [0.05, 0.15, 0.1] + mc_alphas = [0.01, 0.03] + episodes = 100 + runs = 200 + + def eval_mse(agent): + errors = [] + for i in range(episodes): + V_ = [agent.value(s) for s in states] + train(env, agent, num_episodes=1, verbose=False, return_trajectory=False) + z = np.sqrt(np.sum(np.power(V_ - V_true, 2)) / 5.0) + errors.append(z) + return errors + + methods = [(TD0ValueAgent, 'TD', alpha) for alpha in td_alphas] + methods += [(MCEvaluationAgent, 'MC', alpha) for alpha in mc_alphas] + + dfs = [] + for AC,method,alpha in tqdm(methods): + TD_mse = [] + for r in range(runs): + agent = AC(env, alpha=alpha, gamma=1, v_init_fun=v_init_fun) + err_ = eval_mse(agent) + TD_mse.append( np.asarray(err_)) + + # Happy times with pandas. Let's up the production value by also plotting 1 std. 
+ for u,mse in enumerate(TD_mse): + df = pd.DataFrame(mse, columns=['rmse']) + df.insert(len(df.columns), 'Unit', u) + df.insert(len(df.columns), 'Episodes', range(episodes)) + df.insert(len(df.columns), 'Condition', f"{method} $\\alpha$={alpha}") + dfs.append(df) + + data = pd.concat(dfs, ignore_index=True) + sns.lineplot(data=data, x='Episodes', y='rmse', hue="Condition", errorbar=('ci', 95), estimator='mean') + plt.ylabel("RMS error (averaged over states)") + plt.title("Empirical RMS error, averaged over states") + savepdf("random_walk_example") + plt.show() diff --git a/irlc/ex10/envs.py b/irlc/ex10/envs.py new file mode 100644 index 0000000000000000000000000000000000000000..bd341256a496d21dd28bcff38cc2a172e06ce9b1 --- /dev/null +++ b/irlc/ex10/envs.py @@ -0,0 +1,50 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +import gymnasium as gym + +gym.envs.register( + id='Gambler-v0', + entry_point='irlc.ex09.gambler:GamblerEnv', +) + +gym.envs.register( + id='Tenv-v0', + entry_point='irlc.ex09.gambler:TEnv', + max_episode_steps=100, +) + +gym.envs.register( + id='JackRental4-v0', + entry_point='irlc.ex09.jacks_car_rental:RentalEnv', + max_episode_steps=1000, + kwargs={"max_cars": 4, + "poisson_truncation": 4, + "cache_str": "jack_rental_environment_4"}, +) + +gym.envs.register( + id='JackRental-v0', + entry_point='irlc.ex09.jacks_car_rental:RentalEnv', + max_episode_steps=1000, + kwargs={"cache_str": "jack_rental_environment"}, +) # "compress_tol": 0.01 + +gym.envs.register( + id='SmallGridworld-v0', + entry_point='irlc.gridworld.gridworld_environments:SuttonCornerGridEnvironment', + # max_episode_steps=100, # Stop trying to make it happen +) + +gym.envs.register( # Like MountainCar-v0, but time limit increased from 200 to 500. 
def get_MC_return_SA(episode, gamma, first_visit=True):
    """ Helper method for computing the MC returns.
    Given an episode in the form [ (s0,a0,r1), (s1,a1,r2), ...]
    this function computes (if first_visit=True) a new list

    > [((s,a), G) , ... ]

    consisting of the unique $(s_t,a_t)$ pairs in the episode along with their return G_t (computed from their first occurrence).
    Alternatively, if first_visit=False, the method returns a list of the same length as the episode
    with all (s,a) pairs and their return.

    :param episode: List of ``(state, action, reward)`` tuples
    :param gamma: The discount factor
    :param first_visit: If True, only keep the first occurrence of each state-action pair
    :return: The list of ``((s, a), G)`` pairs described above
    """
    sa = [(s, a) for s, a, r in episode]  # Get all state/action pairs. Useful for checking if we have visited a state/action before.
    G = 0
    returns = []
    # The episode is traversed backwards, from the last step to the first.
    for t in reversed(range(len(episode))):
        # The missing lines are expected to define ``sa_t``, which is used in the check below.
        # TODO: 2 lines missing.
        raise NotImplementedError("Insert your solution and remove this error.")
        # True if (s_t, a_t) does not occur before time t, or if every visit should be included.
        if sa_t not in sa[:t] or not first_visit:
            # TODO: 1 lines missing.
            raise NotImplementedError("Implement function body")
    return returns
class MCAgent(TabularAgent):
    # Epsilon-greedy Monte Carlo control agent; the policy and the training update are left as exercises.
    def __init__(self, env, gamma=1.0, epsilon=0.05, alpha=None, first_visit=True):
        """
        :param env: The gym environment
        :param gamma: The discount factor
        :param epsilon: Exploration parameter for epsilon-greedy exploration
        :param alpha: Step size. When None, the dictionaries ``returns_sum`` and ``returns_count`` are created
        :param first_visit: Stored as ``self.first_visit``; selects first-visit or every-visit MC
        """
        if alpha is None:
            # Only allocated when no step size is given -- presumably for averaging the observed returns.
            self.returns_sum = defaultdict(float)
            self.returns_count = defaultdict(float)
        self.alpha = alpha
        self.first_visit = first_visit
        self.episode = []
        super().__init__(env, gamma, epsilon)

    def pi(self, s,k, info=None):
        """
        Compute the policy of the MC agent. Remember the agent is epsilon-greedy. You can use the pi_eps(s,info)-function defined
        in the TabularAgent class.

        :param s: The current state
        :param k: The current time step
        :param info: The ``info``-dictionary corresponding to ``s``
        """
        # TODO: 1 lines missing.
        raise NotImplementedError("Compute action here using the Q-values. (remember to be epsilon-greedy)")

    def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
        """
        Consult your implementation of value estimation agent for ideas. Note you can index the Q-values as

        >> self.Q[s, a] = new_q_value

        see comments in the Agent class for more details, however for now you can consider them as simply a nested
        structure where ``self.Q[s, a]`` defaults to 0 unless the Q-value has been updated.
        """
        # TODO: 12 lines missing.
        raise NotImplementedError("Train the agent here.")

    def __str__(self):
        return f"MC_{self.gamma}_{self.epsilon}_{self.alpha}_{self.first_visit}"
def get_MC_return_S(episode, gamma, first_visit=True):
    """ Helper method for computing the MC returns.
    Given an episode in the form ``[ (s0,a0,r1), (s1,a1,r2), ...]``
    this function computes (if first_visit=True) a new list::

        [(s0, G0), (s1, G1), ...]

    consisting of the unique s_t values in the episode along with their return G_t (computed from their first occurrence).

    Alternatively, if first_visit=False, the method returns a list of the same length as the episode
    with all s values and their return.

    :param episode: List of ``(state, action, reward)`` tuples
    :param gamma: The discount factor
    :param first_visit: If True, only keep the first occurrence of each state
    :return: The list of ``(s, G)`` pairs described above
    """
    ss = [s for s, a, r in episode]  # All states in the episode, in order.
    G = 0
    returns = []
    # The episode is traversed backwards, from the last step to the first.
    for t in reversed(range(len(episode))):
        # The missing lines are expected to define ``s_t``, which is used in the check below.
        # TODO: 2 lines missing.
        raise NotImplementedError("Insert your solution and remove this error.")
        # True if s_t does not occur before time t, or if every visit should be included.
        if s_t not in ss[:t] or not first_visit:
            # TODO: 1 lines missing.
            raise NotImplementedError("Implement function body")
    return returns
class MCEvaluationAgent(ValueAgent):
    """
    Monte Carlo policy evaluation. The agent stores the current episode and, once it is done, updates the
    value estimates from the returns computed by ``get_MC_return_S``. The updates are left as exercises.
    """
    def __init__(self, env, policy=None, gamma=1, alpha=None, first_visit=True, v_init_fun=None):
        """
        :param env: The gym environment
        :param policy: The policy to evaluate, passed on to ``ValueAgent``
        :param gamma: The discount factor
        :param alpha: Step size. When None, the dictionaries ``returns_sum_S`` and ``returns_count_N`` are created
        :param first_visit: If True use first-visit MC, otherwise every-visit MC
        :param v_init_fun: Optional function giving the initial value of a state, passed on to ``ValueAgent``
        """
        self.episode = []
        self.first_visit = first_visit
        self.alpha = alpha
        if self.alpha is None:
            self.returns_sum_S = defaultdict(float)
            self.returns_count_N = defaultdict(float)
        super().__init__(env, gamma, policy, v_init_fun=v_init_fun)

    def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
        """ Store the transition, and update the value estimates once the episode is done. """
        self.episode.append((s, a, r))  # Gather the episode
        if done:  # Only train when the episode has stopped
            returns = get_MC_return_S(self.episode, self.gamma, self.first_visit)
            for s, G in returns:
                # Test against None, exactly as in __init__: a plain truthiness test would send alpha=0 to the
                # other branch, where the accumulators created for alpha=None do not exist.
                if self.alpha is not None:
                    # TODO: 1 lines missing.
                    raise NotImplementedError("Implement function body")
                else:
                    # TODO: 3 lines missing.
                    raise NotImplementedError("Implement function body")

            self.episode = []

    def __str__(self):
        return f"MCeval_{self.gamma}_{self.alpha}_{self.first_visit}"
if __name__ == "__main__":
    # Demo script: evaluate the policy with first-visit and every-visit Monte Carlo on the small
    # gridworld, plot both value functions, and then compare the two estimators of v_pi(s0)
    # when each is trained on a single episode many times over.
    envn = "SmallGridworld-v0"
    from irlc import interactive
    from irlc.gridworld.gridworld_environments import SuttonCornerGridEnvironment

    gamma = 1
    episodes = 200
    s0 = (1, 1)  # The state whose value estimate is reported below.

    # First-visit Monte Carlo evaluation.
    env = SuttonCornerGridEnvironment(render_mode=None)
    agent = MCEvaluationAgent(env, gamma=gamma)
    train(env, agent, num_episodes=episodes)
    env.render_mode = 'human'
    env, agent = interactive(env, agent, autoplay=True)
    env.plot()
    plt.title(f"MC evaluation of {envn} using first-visit")
    savepdf("MC_value_random_smallgrid")
    plt.show(block=False)
    env.close()

    # Every-visit Monte Carlo evaluation.
    env = SuttonCornerGridEnvironment(render_mode=None)
    agent_every = MCEvaluationAgent(env, gamma=gamma, first_visit=False)
    train(env, agent_every, num_episodes=episodes)
    env.render_mode = 'human'
    # Wrap the every-visit agent (not the first-visit one) so that the plot below matches its
    # title, and so that `agent` keeps referring to the first-visit agent.
    env, agent_every = interactive(env, agent_every, autoplay=True)
    env.plot()
    plt.title(f"MC evaluation of {envn} using every-visit")
    savepdf("MC_value_random_smallgrid_every")
    plt.show(block=False)
    env.close()
    print(f"Estimated value functions v_pi(s0) for first visit {agent.v[s0]:3}")
    print(f"Estimated value functions v_pi(s0) for every visit {agent_every.v[s0]:3}")

    ## Second part:
    repeats = 5000  # increase to e.g. 20'000.
    episodes = 1
    ev, fv = [], []  # every-visit / first-visit estimates of v_pi(s0), one entry per repeat.
    env = SuttonCornerGridEnvironment()
    print(f"Repeating experiment {repeats} times, this may take a while.")
    for _ in range(repeats):
        # Instantiate two agents with first_visit=True and first_visit=False.
        # Train the agents using the train function for `episodes` episodes. You might want to pass
        # verbose=False to the 'train'-method to suppress output.
        # When done, add the estimated value of s0 to the lists fv / ev; the means of these lists
        # are the desired result.
        agent = MCEvaluationAgent(env, gamma=gamma)  # first_visit defaults to True
        # TODO: 1 lines missing.
        raise NotImplementedError("Create and train an every-visit agent.")

        train(env, agent, num_episodes=episodes, verbose=False)
        # TODO: 1 lines missing.
        raise NotImplementedError("Create and train an every-visit agent.")

        # `agent` is the first-visit agent and `agent_every` the every-visit agent; store each
        # estimate in the list that the summary below reports it under.
        fv.append(agent.v[s0])
        ev.append(agent_every.v[s0])

    print(f"First visit: Mean of value functions E[v_pi(s0)] after {repeats} repeats {np.mean(fv):3}")
    print(f"Every visit: Mean of value functions E[v_pi(s0)] after {repeats} repeats {np.mean(ev):3}")
    env.close()
    plt.close()
+ gamma = 0.9 + alpha = 0.2 + + deltas = a_compute_deltas(v, states, rewards, gamma) + print(f"The first value of delta should be 1, your value is {deltas[0]=}") + + v = b_perform_td0(v, states, rewards, gamma, alpha) + print(f"The value function v(s=1) should be 0.25352, your value is {v[1]=}") + + v_batched = {s: 0 for s in states} # Initialize the value function anew + v_batched = c_perform_td0_batched(v_batched, states, rewards, gamma, alpha) + print(f"The batched value function in v(s=1) should be 0.3, your value is {v_batched[1]=}") diff --git a/irlc/ex10/td0_evaluate.py b/irlc/ex10/td0_evaluate.py new file mode 100644 index 0000000000000000000000000000000000000000..98aa5fc2547852c39135c62e642d917baa5c8a3d --- /dev/null +++ b/irlc/ex10/td0_evaluate.py @@ -0,0 +1,43 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +import numpy as np +import matplotlib.pyplot as plt +from irlc.ex09.rl_agent import ValueAgent +from irlc import savepdf +from irlc.ex01.agent import train + +class TD0ValueAgent(ValueAgent): + def __init__(self, env, policy=None, gamma=0.99, alpha=0.05, v_init_fun=None): + self.alpha = alpha + super().__init__(env, gamma=gamma, policy=policy, v_init_fun=v_init_fun) + + def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None): + # TODO: 3 lines missing. 
def value_function_test(env, agent, v_true, episodes=200):
    """
    Record how far the agent's value estimates are from ``v_true`` while it trains.

    The agent is trained one episode at a time. After each episode the mean of
    ``(v_true - v) ** 2`` over all values currently stored in ``agent.v`` is recorded.

    :param env: Environment handed to ``train``
    :param agent: Agent to train; its value estimates are read from ``agent.v``
    :param v_true: Reference value that every stored estimate is compared against
    :param episodes: Number of single-episode training rounds
    :return: Numpy array with one mean squared error per training round
    """
    # NOTE(review): the same v_true is subtracted from the estimate of every state, so it is
    # presumably a scalar (or something that broadcasts) - confirm against the callers.
    mse_history = []
    for _ in range(episodes):
        train(env, agent, num_episodes=1, verbose=False)
        squared_errors = [(v_true - estimate) ** 2 for _state, estimate in agent.v.items()]
        mse_history.append(np.mean(squared_errors))
    return np.asarray(mse_history)
+"""This directory contains the exercises for week 11.""" diff --git a/irlc/ex11/__pycache__/__init__.cpython-311.pyc b/irlc/ex11/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..af11ce3f70498da4527edc9a5bc56e55d7a5bcca Binary files /dev/null and b/irlc/ex11/__pycache__/__init__.cpython-311.pyc differ diff --git a/irlc/ex11/__pycache__/feature_encoder.cpython-311.pyc b/irlc/ex11/__pycache__/feature_encoder.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1d8b22f7b9271cf3563787f709c9e3f17b64afa7 Binary files /dev/null and b/irlc/ex11/__pycache__/feature_encoder.cpython-311.pyc differ diff --git a/irlc/ex11/__pycache__/q_agent.cpython-311.pyc b/irlc/ex11/__pycache__/q_agent.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4e5fe50d5559fca176eeb75f5ace797e16509486 Binary files /dev/null and b/irlc/ex11/__pycache__/q_agent.cpython-311.pyc differ diff --git a/irlc/ex11/feature_encoder.py b/irlc/ex11/feature_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..79f6bb01e4505115910ad6b4069f5d233f0dacdb --- /dev/null +++ b/irlc/ex11/feature_encoder.py @@ -0,0 +1,402 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +""" + +References: + [SB18] Richard S. Sutton and Andrew G. Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online). 
class FeatureEncoder:
    r"""
    Base class for linearly encoded :math:`Q`-values.

    The idea behind linear function approximation of :math:`Q`-values is that

    - We initialize (and eventually learn) a :math:`d`-dimensional weight vector :math:`w \in \mathbb{R}^d`
    - We assume there exists a function to compute a :math:`d`-dimensional feature vector :math:`x(s,a) \in \mathbb{R}^d`
    - The :math:`Q`-values are then represented as

    .. math::
        Q(s,a) = x(s,a)^\top w

    Learning is therefore entirely about updating :math:`w`. Subclasses must implement the
    property :attr:`d` and the method :meth:`x`.

    The following example shows how you initialize the linear :math:`Q`-values and compute them in a given state:

    .. runblock:: pycon

        >>> import gymnasium as gym
        >>> from irlc.ex11.feature_encoder import LinearQEncoder
        >>> env = gym.make('MountainCar-v0')
        >>> Q = LinearQEncoder(env, tilings=8)
        >>> s, _ = env.reset()
        >>> a = env.action_space.sample()
        >>> Q(s,a)                  # Compute a Q-value.
        >>> Q.d                     # Get the number of dimensions
        >>> Q.x(s,a)[:4]            # Get the first four coordinates of the x-vector
        >>> Q.w[:4]                 # Get the first four coordinates of the w-vector

    """
    def __init__(self, env):
        """
        Initialize the feature encoder. It requires an environment to know the number of actions and dimension of the state space.

        The weight vector is allocated with ``self.d`` entries, so a subclass must make sure
        that :attr:`d` can be evaluated before it calls this constructor.

        :param env: An openai Gym ``Env``.
        """
        self.env = env
        self.w = np.zeros((self.d, ))
        self._known_masks = {}  # state -> action mask, filled in by get_Qs

        def q_default(s):
            # Default action -> 0 dictionary for a state; restricted to the actions whose
            # mask entry is 1 when a mask has been recorded for the state.
            from irlc.utils.common import DiscreteTextActionSpace
            space = self.env.action_space
            if s not in self._known_masks:
                return {a: 0 for a in range(space.n)}
            mask = self._known_masks[s]
            # For a DiscreteTextActionSpace the mask is indexed by a directly, otherwise by a - start.
            offset = 0 if isinstance(space, DiscreteTextActionSpace) else space.start
            return {a: 0 for a in range(space.n) if mask[a - offset] == 1}

        self.q_ = defaultdict2(q_default)

    @property
    def d(self):
        """ Get the number of dimensions of :math:`w`

        .. runblock:: pycon

            >>> import gymnasium as gym
            >>> from irlc.ex11.feature_encoder import LinearQEncoder
            >>> env = gym.make('MountainCar-v0')
            >>> Q = LinearQEncoder(env, tilings=8) # as in (SB18)
            >>> Q.d

        :raises NotImplementedError: Always; subclasses must override this property.
        """
        raise NotImplementedError()

    def x(self, s, a):
        """
        Computes the :math:`d`-dimensional feature vector :math:`x(s,a)`

        .. runblock:: pycon

            >>> import gymnasium as gym
            >>> from irlc.ex11.feature_encoder import LinearQEncoder
            >>> env = gym.make('MountainCar-v0')
            >>> Q = LinearQEncoder(env, tilings=8) # as in (SB18)
            >>> s, info = env.reset()
            >>> x = Q.x(s, env.action_space.sample())

        :param s: A state :math:`s`
        :param a: An action :math:`a`
        :return: Feature vector :math:`x(s,a)`
        :raises NotImplementedError: Always; subclasses must override this method.
        """
        raise NotImplementedError()

    def get_Qs(self, state, info_s=None):
        """
        This is a helper function, it is only for internal use.

        Returns the actions considered in ``state`` together with their Q-values. The first
        time a non-array state is seen together with a ``'mask'`` entry in ``info_s``, the mask
        is cached and the default action dictionary of the state is restricted to the actions
        the mask allows.

        :param state: The state to compute Q-values in
        :param info_s: The ``info``-dictionary of the state (may contain an action ``'mask'``)
        :return: A tuple ``(actions, Qs)`` of two equally long tuples
        """
        if info_s is not None and 'mask' in info_s and not isinstance(state, np.ndarray):
            if state not in self._known_masks:
                self._known_masks[state] = info_s['mask']
                # Probably a good idea to check the Q-values are okay...
                avail_actions = _masked_actions(self.env.action_space, info_s['mask'])
                self.q_[state] = {a: self.q_[state][a] for a in avail_actions}

        # Imported here rather than at module level, as in the original code.
        from irlc.pacman.pacman_environment import PacmanEnvironment
        if isinstance(state, np.ndarray):
            actions = tuple(range(self.env.action_space.n))
        elif isinstance(self.env, PacmanEnvironment):
            # NOTE(review): this branch reads info_s['mask'] unconditionally, so it presumably
            # relies on Pacman always supplying a mask - confirm.
            actions = _masked_actions(self.env.action_space, info_s['mask'])
            actions = tuple([self.env.action_space.actions[n] for n in actions])
        else:
            actions = tuple(self.q_[state].keys())

        Qs = tuple([self(state, a) for a in actions])
        # TODO: Implement masking and masking-cache.
        return actions, Qs

    def get_optimal_action(self, state, info=None):
        r"""
        For a given state ``state``, this function returns the optimal action for that state.

        .. math::
            a^* = \arg\max_a Q(s,a)

        Ties are broken at random by adding a tiny amount of noise to the Q-values.

        An example:

        .. runblock:: pycon

            >>> from irlc.ex09.rl_agent import TabularAgent
            >>> class MyAgent(TabularAgent):
            ...     def pi(self, s, k, info=None):
            ...         a_star = self.Q.get_optimal_action(s, info)

        :param state: State to find the optimal action in :math:`s`
        :param info: The ``info``-dictionary corresponding to this state
        :return: The optimal action according to the Q-values :math:`a^*`
        :raises ValueError: If no actions are available in ``state``
        """
        actions, Qa = self.get_Qs(state, info)
        if len(actions) == 0:
            # np.argmax would raise a ValueError on the empty list anyway; fail with a
            # message that says what went wrong instead of printing and crashing later.
            raise ValueError(f"Bad actions list: no actions are available in state {state!r}")
        a_ = np.argmax(np.asarray(Qa) + np.random.rand(len(Qa)) * 1e-8)
        return actions[a_]

    def __call__(self, s, a):
        """
        Evaluate the Q-values for the given state and action. An example:

        .. runblock:: pycon

            >>> import gymnasium as gym
            >>> from irlc.ex11.feature_encoder import LinearQEncoder
            >>> env = gym.make('MountainCar-v0')
            >>> Q = LinearQEncoder(env, tilings=8) # as in (SB18)
            >>> s, info = env.reset()
            >>> Q(s, env.action_space.sample()) # Compute Q(s,a)

        :param s: A state :math:`s`
        :param a: An action :math:`a`
        :return: The Q-value :math:`Q(s,a) = x(s,a)^T w`
        """
        return self.x(s, a) @ self.w

    def __getitem__(self, item):
        # Table-style read access is rejected on purpose; see the message.
        raise Exception("Hi! You tried to access linear Q-values as Q[s,a]. You need to use Q(s,a). This choice signifies they are not represented as a table, but as a linear combination x(s,a)^T w")

    def __setitem__(self, key, value):
        # Table-style assignment is rejected on purpose; see the message.
        raise Exception("Oy! You tried to set a linearly encoded Q-value as in Q[s, a] = new_q_value.\n This is not possible since they are represented as x(s,a)^T w. Rewrite the expression to update Q.w.")
ap).items(): + xx[self.fields.index(k)] = v + return xx + + @property + def d(self): + return len(self.fields) + +class LinearQEncoder(FeatureEncoder): + def __init__(self, env, tilings=8, max_size=2048): + """ + Implements the tile-encoder described by (SB18) + + :param env: The openai Gym environment we wish to solve. + :param tilings: Number of tilings (translations). Typically 8. + :param max_size: Maximum number of dimensions. + """ + if isinstance(env.observation_space, Box): + os = env.observation_space + low = os.low + high = os.high + scale = tilings / (high - low) + hash_table = IHT(max_size) + self.max_size = max_size + def tile_representation(s, action): + s_ = list( (s*scale).flat ) + active_tiles = tiles(hash_table, tilings, s_, [action]) # (s * scale).tolist() + # if 0 not in active_tiles: + # active_tiles.append(0) + return active_tiles + self.get_active_tiles = tile_representation + else: + # raise Exception("Implement in new class") + # + # Use Fixed Sparse Representation. 
class IHT:
    """
    Index hash table: hands out consecutive indices ``0, 1, 2, ...`` to the distinct objects
    it is asked about, and falls back to hashing (allowing collisions) once ``size`` indices
    have been handed out.
    """

    def __init__(self, size_val):
        """
        :param size_val: Maximum number of distinct indices the table hands out
        """
        self.size = size_val
        self.overfull_count = 0  # number of lookups answered by hashing because the table was full
        self.dictionary = {}     # object -> index

    def count(self):
        """Return the number of objects that currently have an index."""
        return len(self.dictionary)

    def full(self):
        """Return True when all ``size`` indices have been handed out."""
        return len(self.dictionary) >= self.size

    def get_index(self, obj, read_only=False):
        """
        Return the index of ``obj``, assigning the next free index if it is new.

        :param obj: A hashable object (``tiles`` passes tuples of coordinates)
        :param read_only: If True, never modify the table; unknown objects give ``None``
        :return: The index of ``obj``, or ``None`` for an unknown object in read-only mode
        """
        known = self.dictionary
        if obj in known:
            return known[obj]
        if read_only:
            return None
        next_index = self.count()
        if next_index >= self.size:
            # The table is full: announce it once, then fall back to plain hashing.
            if self.overfull_count == 0:
                print('IHT full, starting to allow collisions')
            self.overfull_count += 1
            return hash(obj) % self.size
        known[obj] = next_index
        return next_index


def hash_coords(coordinates, m, read_only=False):
    """
    Map a list of tile coordinates to a tile index.

    :param coordinates: The coordinates of one tile
    :param m: An ``IHT`` (index looked up in the table), an ``int`` (hash modulo ``m``) or
        ``None`` (the coordinates are returned unchanged)
    :param read_only: Passed on to ``IHT.get_index``; ignored for the other table types
    :return: The tile index (or the coordinates themselves when ``m`` is ``None``);
        ``None`` when ``m`` is of any other type
    """
    if isinstance(m, IHT):
        return m.get_index(tuple(coordinates), read_only)
    if isinstance(m, int):
        return hash(tuple(coordinates)) % m
    if m is None:
        return coordinates
    return None  # unsupported table type; kept as the original (implicit) result


def tiles(iht_or_size, num_tilings, floats, ints=None, read_only=False):
    """
    Returns ``num_tilings`` tile indices corresponding to the floats and ints.

    :param iht_or_size: An ``IHT``, an ``int`` table size or ``None`` (see ``hash_coords``)
    :param num_tilings: Number of tilings; one index is produced per tiling
    :param floats: The real-valued coordinates to tile
    :param ints: Optional extra integer coordinates appended to the coordinates of every tile
    :param read_only: If True, an ``IHT`` is not modified and unknown tiles give ``None``
    :return: A list with one entry per tiling
    """
    if ints is None:
        ints = []
    qfloats = [floor(f * num_tilings) for f in floats]
    indices = []
    for tiling in range(num_tilings):
        step = tiling * 2
        coords = [tiling]
        offset = tiling
        # Each tiling is displaced; the displacement grows by `step` from one dimension to the next.
        for q in qfloats:
            coords.append((q + offset) // num_tilings)
            offset += step
        coords.extend(ints)
        indices.append(hash_coords(coords, iht_or_size, read_only))
    return indices
+ self.A[self.t] = self.pi_eps(s, info) + return self.A[self.t % (self.n+1)] + + def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None): + # Recall we are given S_t, A_t, R_{t+1}, S_{t+1} and done is whether t=T+1. + n = self.n # n as in n-step sarsa. + t = self.t # Current time step t as in s_t. + if t == 0: # We are in the initial state. Reset buffer. + self.S[0], self.A[0] = s, a + # Store current observations in buffer. + self.S[(t+1)%(n+1)] = sp + self.R[(t+1)%(n+1)] = r + self.A[(t+1)%(n+1)] = self.pi_eps(sp, info_sp) if not done else -1 + + if done: + T = t+1 + tau_steps_to_train = range(t - n + 1, T) + else: + T = 1e10 + tau_steps_to_train = [t - n + 1] + # Tau represent the current tau-steps which are to be updated. The notation is compatible with that in Sutton. + for tau in tau_steps_to_train: + if tau >= 0: + """ + Compute the return for this tau-step and perform the relevant Q-update. + The first step is to compute the expected return G in the below section. + """ + # TODO: 4 lines missing. + raise NotImplementedError("Compute G= (expected return) here.") + + S_tau, A_tau = self.S[tau%(n+1)], self.A[tau%(n+1)] + delta = (G - self._q(S_tau, A_tau)) + if n == 1: # Check your implementation is correct when n=1 by comparing it with regular Sarsa learning. + delta_Sarsa = (r + (0 if done else self.gamma * self._q(sp,A_tau_n)) - self._q(S_tau,A_tau)) + if abs(delta-delta_Sarsa) > 1e-10: + raise Exception("n=1 agreement with Sarsa learning failed. You have at least one bug!") + self._upd_q(S_tau, A_tau, delta) + + def _q(self, s, a): return self.Q[s,a] # Using these helper methods will come in handy when we work with function approximators, but it is optional. 
+ def _upd_q(self, s, a, delta): self.Q[s,a] += self.alpha * delta + + def __str__(self): + return f"SarsaN_{self.gamma}_{self.epsilon}_{self.alpha}_{self.n}" + + +if __name__ == "__main__": + envn = 'CliffWalking-v0' + env = gym.make(envn) + from irlc.ex11.sarsa_agent import sarsa_exp + from irlc.ex11.q_agent import q_exp + + agent = SarsaNAgent(env, n=5, epsilon=0.1,alpha=0.5) + exp = f"experiments/{envn}_{agent}" + for _ in range(10): # Train 10 times to get an idea about the average performance. + train(env, agent, exp, num_episodes=200, max_runs=10) + main_plot([q_exp, sarsa_exp, exp], smoothing_window=10) # plot with results from Q/Sarsa simulations. + plt.ylim([-100,0]) + from irlc import savepdf + savepdf("n_step_sarsa_cliff") + plt.show() diff --git a/irlc/ex11/q_agent.py b/irlc/ex11/q_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..906e873472810f97593e76f5e0c9343abc6389c7 --- /dev/null +++ b/irlc/ex11/q_agent.py @@ -0,0 +1,85 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +""" + +References: + [SB18] Richard S. Sutton and Andrew G. Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online). +""" +from irlc.ex09.mdp import GymEnv2MDP +from irlc.ex09.rl_agent import TabularAgent +from irlc import train +import gymnasium as gym +from irlc import main_plot +import matplotlib.pyplot as plt +from irlc import savepdf +from irlc.ex09.value_iteration_agent import ValueIterationAgent + +class QAgent(TabularAgent): + r""" + Implement the Q-learning agent (SB18, Section 6.5) + Note that the Q-datastructure already exist, as do helper functions useful to compute an epsilon-greedy policy. + You can access these as + + > self.Q[s,a] = 31 # Set a Q-value. + + See the TabularAgent class for more information. 
def cliffwalk():
    """
    Train a Q-learning agent and two value-iteration baselines on ``CliffWalking-v0``.

    The runs use the module-level settings ``epsilon``, ``alpha`` and ``max_runs`` and are
    passed the experiment names ``q_exp``, ``"experiments/cliffwalk_VI"`` and
    ``"experiments/cliffwalk_VI_optimal"``.

    :return: A tuple ``(env, exp_names)`` with the environment and the list of the three experiment names
    """
    env = gym.make('CliffWalking-v0')
    agent = QAgent(env, epsilon=epsilon, alpha=alpha)
    train(env, agent, q_exp, num_episodes=200, max_runs=max_runs)

    # As a baseline, we set up/evaluate a value-iteration agent to get an idea about the optimal performance.
    # To do so, we need an MDP object. We create an MDP object out of the gym environment below.
    # You can look at the code if you like, but it is simply a helper function to convert from one datastructure to another,
    # and all it does is to give a MDP object which is needed for our value-iteration implementation from the previous
    # week.
    mdp = GymEnv2MDP(env)
    vi_exp = "experiments/cliffwalk_VI"
    Vagent = ValueIterationAgent(env, mdp=mdp, epsilon=epsilon)
    train(env, Vagent, vi_exp, num_episodes=200, max_runs=max_runs)

    vi_exp_opt = "experiments/cliffwalk_VI_optimal"
    Vagent_opt = ValueIterationAgent(env, mdp=mdp, epsilon=0) # Same, but with epsilon=0
    train(env, Vagent_opt, vi_exp_opt, num_episodes=200, max_runs=max_runs)

    exp_names = [q_exp, vi_exp, vi_exp_opt]
    return env, exp_names
Generate a by being epsilon-greedy""" + # TODO: 1 lines missing. + raise NotImplementedError("Implement function body") + else: + """ Return the action self.a you generated during the train where you know s_{t+1} """ + # TODO: 1 lines missing. + raise NotImplementedError("Implement function body") + + def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None): + """ + generate A' as self.a by being epsilon-greedy. Re-use code from the Agent class. + """ + # TODO: 1 lines missing. + raise NotImplementedError("self.a = ....") + """ now that you know A' = self.a, perform the update to self.Q[s,a] here """ + # TODO: 2 lines missing. + raise NotImplementedError("Insert your solution and remove this error.") + + def __str__(self): + return f"Sarsa{self.gamma}_{self.epsilon}_{self.alpha}" + +sarsa_exp = f"experiments/cliffwalk_Sarsa" +if __name__ == "__main__": + env, q_experiments = cliffwalk() # get results from Q-learning + agent = SarsaAgent(env, epsilon=epsilon, alpha=alpha) + for _ in range(10): + train(env, agent, sarsa_exp, num_episodes=200, max_runs=10) + main_plot(q_experiments + [sarsa_exp], smoothing_window=10) + plt.ylim([-100, 0]) + plt.title("Q and Sarsa learning on " + env.spec.name) + savepdf("QSarsa_learning_cliff") + plt.show() diff --git a/irlc/ex11/semi_grad_q.py b/irlc/ex11/semi_grad_q.py new file mode 100644 index 0000000000000000000000000000000000000000..0910717159ff25dd4e6b0a5c5b22423c2840ef3d --- /dev/null +++ b/irlc/ex11/semi_grad_q.py @@ -0,0 +1,45 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
class LinearSemiGradQAgent(QAgent):
    # Q-learning agent whose Q-values are stored in a linear encoder (self.Q) rather than a table.
    # The update rule in train() is left as an exercise.
    def __init__(self, env, gamma=1.0, alpha=0.5, epsilon=0.1, q_encoder=None):
        """ The Q-values, as implemented using a function approximator, can now be accessed as follows:

        >> self.Q(s,a) # Compute q-value
        >> self.Q.x(s,a) # Compute gradient of the above expression wrt. w
        >> self.Q.w # get weight-vector.

        I would recommend inserting a breakpoint and investigating the above expressions yourself;
        you can of course also check the class LinearQEncoder if you want to see how it is done in practice.

        :param env: The environment; passed on to ``QAgent`` and to the default encoder
        :param gamma: Passed on to ``QAgent``
        :param alpha: Passed on to ``QAgent``
        :param epsilon: Passed on to ``QAgent``
        :param q_encoder: Encoder to use as ``self.Q``; if ``None`` a ``LinearQEncoder`` with 8 tilings is created
        """
        super().__init__(env, gamma, epsilon=epsilon, alpha=alpha)
        # Replace the Q attribute after the parent constructor has run with the linear encoder.
        self.Q = LinearQEncoder(env, tilings=8) if q_encoder is None else q_encoder

    def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
        # Exercise stub: the update of self.Q.w is to be implemented here.
        # TODO: 4 lines missing.
        raise NotImplementedError("Implement function body")

    def __str__(self):
        # Identifier built from gamma, epsilon and alpha.
        return f"LinearSemiGradQ{self.gamma}_{self.epsilon}_{self.alpha}"
+ raise NotImplementedError("Implement function body") + return action + + def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None): + # TODO: 4 lines missing. + raise NotImplementedError("Insert your solution and remove this error.") + + if sum(np.abs(self.Q.w)) > 1e5: raise Exception("Weights diverged. Decrease alpha") + + def __str__(self): + return f"LinSemiGradSarsa{self.gamma}_{self.epsilon}_{self.alpha}" + +experiment_sarsa = "experiments/mountaincar_Sarsa" + +if __name__ == "__main__": + from irlc.ex11.semi_grad_q import experiment_q, alpha, x + from irlc.ex10 import envs + + env = gym.make("MountainCar500-v0") + for _ in range(10): + agent = LinearSemiGradSarsa(env, gamma=1, alpha=alpha, epsilon=0) + train(env, agent, experiment_sarsa, num_episodes=300, max_runs=10) + + main_plot(experiments=[experiment_q, experiment_sarsa], x_key=x, y_key='Length', smoothing_window=30) + savepdf("semigrad_q_sarsa") + plt.show() + + # Turn off averaging + main_plot(experiments=[experiment_q, experiment_sarsa], x_key=x, y_key='Length', smoothing_window=30, units="Unit", estimator=None) + savepdf("semigrad_q_sarsa_individual") + plt.show() diff --git a/irlc/ex12/__init__.py b/irlc/ex12/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6bf40e6e933a260431d40475e289b2c184c861ac --- /dev/null +++ b/irlc/ex12/__init__.py @@ -0,0 +1,2 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +"""This directory contains the exercises for week 12.""" diff --git a/irlc/ex12/mountain_car.py b/irlc/ex12/mountain_car.py new file mode 100644 index 0000000000000000000000000000000000000000..7483bdfceded62b90c718923a18f43cd251cb0d4 --- /dev/null +++ b/irlc/ex12/mountain_car.py @@ -0,0 +1,155 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. 
def plot_mountaincar_value_function(env, value_function, ax):
    """
    Draw ``value_function`` as a 3d surface over the environment's observation box.

    The two observation coordinates are sampled on a 40 x 40 grid between the bounds of
    ``env.unwrapped.observation_space``; ``value_function`` is called once per grid point
    with an ``(x, y)`` tuple, and the result is plotted on ``ax`` via ``plot_surface_2``.

    :param env: Environment whose unwrapped observation space provides ``low``/``high`` bounds
    :param value_function: Callable mapping a state tuple ``(x, y)`` to a scalar
    :param ax: 3d matplotlib axes to draw on
    """
    grid_size = 40
    space = env.unwrapped.observation_space
    low, high = space.low, space.high
    first_axis = np.linspace(low[0], high[0], grid_size)
    second_axis = np.linspace(low[1], high[1], grid_size)
    X, Y = np.meshgrid(first_axis, second_axis)
    Z = X*0  # same shape as the grid, filled point by point below
    for idx, state in enumerate(zip(X.flat, Y.flat)):
        Z.flat[idx] = value_function(state)

    plot_surface_2(X, Y, Z, ax=ax)
    ax.set_xlabel('Position')
    ax.set_ylabel('Velocity')
    ax.set_zlabel('Cost to go')
axes[plot_episodes.index(ep)] + plot_mountaincar_value_function(env, v, ax=ax) + ax.set_title(f'Episode {ep+1}') + + from irlc import savepdf + savepdf("semigrad_sarsa_10-1") + plt.show() + +def figure_10_2(): + episodes = 500 + num_of_tilings = 8 + alphas = [0.1, 0.2, 0.5] + env = gym.make("MountainCar500-v0") + + experiments = [] + for alpha in alphas: + agent = LinearSemiGradSarsa(env, gamma=1, alpha=alpha / num_of_tilings, epsilon=0) + experiment = f"experiments/mountaincar_10-2_{agent}_{episodes}" + train(env, agent, experiment_name=experiment, num_episodes=episodes,max_runs=10) + experiments.append(experiment) + + main_plot(experiments=experiments, y_key="Length") + plt.xlabel('Episode') + plt.ylabel('Steps per episode') + plt.title(env.spec.name + " - Semigrad Sarsa - Figure 10.2") + savepdf("mountaincar_10-2") + plt.show() + +def figure_10_3(): + from irlc.ex12.semi_grad_sarsa_lambda import LinearSemiGradSarsaLambda + from irlc.ex11.semi_grad_q import LinearSemiGradQAgent + + max_runs = 10 + episodes = 500 + num_of_tilings = 8 + alphas = [0.5, 0.3] + n_steps = [1, 8] + + env = gym.make("MountainCar500-v0") + experiments = [] + + """ Plot results of experiments here. """ + # TODO: 16 lines missing. 
+ raise NotImplementedError("Insert your solution and remove this error.") + + main_plot(experiments=experiments, y_key="Length") + plt.xlabel('Episode') + plt.ylabel('Steps per episode') + plt.title(env.spec.name + " - Semigrad N-step Sarsa - Figure 10.3") + savepdf("mountaincar_10-3") + plt.show() + +def figure_10_4(): + alphas = np.arange(0.25, 1.75, 0.25) + n_steps = np.power(2, np.arange(0, 5)) + episodes = 50 + env = gym.make("MountainCar500-v0") + experiments = [] + num_of_tilings = 8 + max_asteps = 500 + run = True + for n_step_index, n_step in enumerate(n_steps): + aexp = [] + did_run = False + for alpha_index, alpha in enumerate(alphas): + if not run: + continue + if (n_step == 8 and alpha > 1) or (n_step == 16 and alpha > 0.75): + # In these cases it won't converge, so ignore them + asteps = max_asteps #max_steps * episodes + else: + n = n_step + agent = LinearSemiGradSarsaN(env, gamma=1, alpha=alpha / num_of_tilings, epsilon=0, n=n) + _, stats, _ = train(env, agent, num_episodes=episodes) + asteps = np.mean( [s['Length'] for s in stats] ) + did_run = did_run or stats is not None + + aexp.append({'alpha': alpha, 'average_steps': asteps}) + + experiment = f"experiments/mc_10-4_lsgn_{n_step}" + experiments.append(experiment) + if did_run: + log_time_series(experiment, aexp) + + main_plot(experiments, x_key="alpha", y_key="average_steps", ci=None) + plt.xlabel('alpha') + plt.ylabel('Steps per episode') + plt.title("Figure 10.4: Semigrad n-step Sarsa on mountain car") + plt.ylim([150, 300]) + savepdf("mountaincar_10-4") + plt.show() + +if __name__ == '__main__': + figure_10_1() + figure_10_2() + figure_10_3() + figure_10_4() diff --git a/irlc/ex12/sarsa_lambda_agent.py b/irlc/ex12/sarsa_lambda_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..9cd7baf7955fbfa16c6032acdaaf0dc213eb1c56 --- /dev/null +++ b/irlc/ex12/sarsa_lambda_agent.py @@ -0,0 +1,68 @@ +# This file may not be shared/redistributed without permission. 
class SarsaLambdaAgent(SarsaAgent):
    def __init__(self, env, gamma=0.99, epsilon=0.1, alpha=0.5, lamb=0.9):
        """
        Implementation of Sarsa(Lambda) in the tabular version, see
        http://incompleteideas.net/book/first/ebook/node77.html
        for details. Remember to reset the
        eligibility trace E after each episode, i.e. set E(s,a) = 0.

        Note 'lamb' is an abbreviation of lambda, because lambda is a reserved keyword in Python.

        The constructor initializes e, the eligibility trace. Since we want to easily be able to find the non-zero
        elements it will be convenient to use a dictionary. I.e.

        self.e[(s,a)] is the eligibility trace e(s,a) (or E(s,a) if you prefer).

        Note that Sarsa(Lambda) generalizes Sarsa. This means that we again must generate the next action A' from S' in the train method and
        store it for when we take actions in the policy method pi. I.e. we can re-use the Sarsa agent's code for the policy (self.pi).

        :param env: Environment the agent acts in
        :param gamma: Discount factor, forwarded to ``SarsaAgent``
        :param epsilon: Exploration rate, forwarded to ``SarsaAgent``
        :param alpha: Learning rate, forwarded to ``SarsaAgent``
        :param lamb: The lambda parameter, stored as ``self.lamb``
        """
        super().__init__(env, gamma=gamma, alpha=alpha, epsilon=epsilon)
        self.lamb = lamb
        # We use a dictionary to store the eligibility trace. It can be indexed as self.e[s,a].
        # defaultdict(float) makes unseen (s,a) pairs read as 0.0.
        self.e = defaultdict(float)

    def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
        # Exercise scaffold: each NotImplementedError marks a line the student must replace.
        # Until the first one is replaced, this method raises immediately.
        # TODO: 1 lines missing.
        raise NotImplementedError("a_prime = ... (get action for S'=sp using self.pi_eps; see Sarsa)")
        # TODO: 1 lines missing.
        raise NotImplementedError("delta = ... (The ordinary Sarsa learning signal)")
        # TODO: 1 lines missing.
        raise NotImplementedError("Update the eligibility trace e(s,a) += 1")
        # NOTE(review): the loop variables below rebind the parameters s and a.
        for (s,a), ee in self.e.items():
            # TODO: 2 lines missing.
            raise NotImplementedError("Update Q values and eligibility trace")
        if done: # Clear eligibility trace after each episode and update variables for Sarsa
            self.e.clear()
        else:
            # Remember the action chosen for sp so the policy can return it on the next step.
            self.a = a_prime

    def __str__(self):
        # The name embeds the hyper-parameters so that different settings get distinct labels.
        return f"SarsaLambda_{self.gamma}_{self.epsilon}_{self.alpha}_{self.lamb}"
def keyboard_play(Agent, method_label='MC', num_episodes=1000, alpha=0.5, autoplay=False, **args):
    """
    Train an agent on the open gridworld environment with keyboard interaction enabled.

    The agent is first constructed with an ``epsilon`` argument; if the constructor rejects
    that call with a ``TypeError`` (e.g. a value agent that takes no ``epsilon``), it is
    constructed again without it. The environment is closed even if training fails.

    :param Agent: Agent class; called as ``Agent(env, gamma=0.99, [epsilon=0.1,] alpha=alpha, **args)``
    :param method_label: Accepted for backward compatibility; not used by this function
    :param num_episodes: Number of episodes passed to ``train``
    :param alpha: Learning rate passed to the agent constructor
    :param autoplay: Forwarded to ``interactive``
    :param args: Extra keyword arguments forwarded to the agent constructor
    :return: ``None``
    """
    print("Evaluating", Agent, "on the open gridworld environment.")
    print("Press p to follow the agents policy or use the keyboard to input actions")
    print("(Please be aware that Sarsa, N-step Sarsa, and Sarsa(Lambda) do not always make the right updates when you input actions with the keyboard)")

    env = OpenGridEnvironment(render_mode='human', frames_per_second=10)
    try:
        try:
            agent = Agent(env, gamma=0.99, epsilon=0.1, alpha=alpha, **args)
        except TypeError:  # If it is a value agent without the epsilon.
            # Only a signature mismatch triggers the retry; other constructor errors propagate.
            agent = Agent(env, gamma=0.99, alpha=alpha, **args)
        env, agent = interactive(env, agent, autoplay=autoplay)
        train(env, agent, num_episodes=num_episodes)
    finally:
        # Closes the wrapped environment if interactive() succeeded, else the raw one.
        env.close()
""" + keyboard_play(SarsaLambdaAgent, method_label="Sarsa(Lambda)", lamb=0.8) diff --git a/irlc/ex12/semi_grad_nstep_sarsa.py b/irlc/ex12/semi_grad_nstep_sarsa.py new file mode 100644 index 0000000000000000000000000000000000000000..c7f6ac23a708b4a7529af7097185afe11fb60c7b --- /dev/null +++ b/irlc/ex12/semi_grad_nstep_sarsa.py @@ -0,0 +1,53 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +from irlc.ex01.agent import train +import gymnasium as gym +from irlc.ex11.semi_grad_sarsa import LinearSemiGradSarsa +from irlc.ex11.nstep_sarsa_agent import SarsaNAgent + +class LinearSemiGradSarsaN(SarsaNAgent, LinearSemiGradSarsa): + def __init__(self, env, gamma=0.99, alpha=0.5, epsilon=0.1, q_encoder=None, n=1): + """ + Note you can access the super-classes as: + >> SarsaNAgent.pi(self, s) # Call the pi(s) as implemented in SarsaNAgent + Alternatively, just inherit from Agent and set up data structure as required. + """ + SarsaNAgent.__init__(self, env, gamma, alpha=alpha, epsilon=epsilon, n=n) + LinearSemiGradSarsa.__init__(self, env, gamma, alpha=alpha, epsilon=epsilon, q_encoder=q_encoder) + + def pi(self, s, k, info=None): + return SarsaNAgent.pi(self, s, k, info) + + def _q(self, s, a): + """ + Return Q(s,a) using the linear function approximator with weights self.w; i.e. use self.q + """ + # TODO: 1 lines missing. + raise NotImplementedError("Implement function body") + + def _upd_q(self, s, a, delta): + """ + Update the weight-vector w using the appropriate rule (see exercise description). I.e. the update + should be of the form + + self.w += self.alpha * delta * (gradient of Q(s,a;w) + + where + delta = (G^n - Q(s,a;w) + """ + # TODO: 1 lines missing. 
class LinearSemiGradSarsaLambda(LinearSemiGradSarsa):
    def __init__(self, env, gamma=0.99, epsilon=0.1, alpha=0.5, lamb=0.9, q_encoder=None):
        """
        Sarsa(Lambda) with linear feature approximators (see (SB18, Section 12.7)).

        :param env: Environment the agent acts in
        :param gamma: Discount factor, forwarded to the parent class
        :param epsilon: Exploration rate, forwarded to the parent class
        :param alpha: Learning rate, forwarded to the parent class
        :param lamb: The lambda parameter, stored as ``self.lamb``
        :param q_encoder: Optional Q-encoder, forwarded to the parent class
        """
        super().__init__(env, gamma, alpha=alpha, epsilon=epsilon, q_encoder=q_encoder)
        self.z = np.zeros(self.Q.d) # Vector to store eligibility trace (same dimension as self.w)
        self.lamb = lamb # lambda in Sarsa(lambda). We cannot use the reserved keyword 'lambda'.

    def pi(self, s, k, info=None):
        """
        Return the action for state ``s`` at step ``k``.

        At ``k == 0`` a fresh action is drawn with ``self.pi_eps`` and the per-episode
        quantities (``self.x``, ``self.Q_old``) are initialised; on later steps the action
        stored in ``self.a`` is returned.
        """
        if k == 0: # If beginning of episode.
            self.a = self.pi_eps(s, info)
            self.x = self.Q.x(s,self.a)
            self.Q_old = 0
        return self.a

    def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
        # When the episode has ended there is no next action/feature vector; placeholders are used.
        a_prime = self.pi_eps(sp, info_sp) if not done else -1
        x_prime = self.Q.x(sp, a_prime) if not done else None
        """
        Update the eligibility trace self.z and the weights self.w here.
        Note Q-values are approximated as Q = w @ x.
        We use Q_prime = w * x(s', a') to denote the new q-values for (stored for next iteration as in the pseudo code)
        """
        # Exercise stub: the update of z and w (which must also define Q_prime) is left for the student.
        # TODO: 5 lines missing.
        raise NotImplementedError("Update z, w")
        if done: # Reset eligibility trace and time step t as in Sarsa.
            self.z = self.z * 0
        else:
            # Carry the next-step quantities over so pi() can return self.a on the next step.
            self.Q_old, self.x, self.a = Q_prime, x_prime, a_prime

    def __str__(self):
        # The name embeds the hyper-parameters so that different settings get distinct labels.
        return f"LinearSarsaLambda_{self.gamma}_{self.epsilon}_{self.alpha}_{self.lamb}"
class BasicBuffer:
    """
    The buffer class is used to keep track of past experience and sample it for learning.

    Experience is stored in a bounded ``deque``: once ``max_size`` elements are held, pushing a
    new element discards the oldest one.
    """
    def __init__(self, max_size=2000):
        """
        Creates a new (empty) buffer.

        :param max_size: Maximum number of elements in the buffer. This should be a large number like 100'000.
        """
        self.buffer = deque(maxlen=max_size)

    def push(self, state, action, reward, next_state, done):
        """
        Add information from a single step, :math:`(s_t, a_t, r_{t+1}, s_{t+1}, \\text{done})` to the buffer.

        .. runblock:: pycon

            >>> import gymnasium as gym
            >>> from irlc.ex13.buffer import BasicBuffer
            >>> env = gym.make("CartPole-v1")
            >>> b = BasicBuffer()
            >>> s, info = env.reset()
            >>> a = env.action_space.sample()
            >>> sp, r, done, _, info = env.step(a)
            >>> b.push(s, a, r, sp, done)
            >>> len(b) # Get number of elements in buffer

        :param state: A state :math:`s_t`
        :param action: Action taken :math:`a_t`
        :param reward: Reward obtained :math:`r_{t+1}`
        :param next_state: Next state transitioned to :math:`s_{t+1}`
        :param done: ``True`` if the environment terminated else ``False``
        :return: ``None``
        """
        # The reward is wrapped in a length-1 array so sampled rewards stack to ``batch_size x 1``.
        experience = (state, action, np.array([reward]), next_state, done)
        self.buffer.append(experience)

    def sample(self, batch_size):
        """
        Sample ``batch_size`` elements from the buffer for use in training a deep Q-learning method.
        Sampling is uniform and *with replacement*, so ``batch_size`` may exceed ``len(self)``.
        The returned elements are all numpy ``ndarray`` where the first dimension is the batch dimension,
        i.e. of size ``batch_size``.

        .. runblock:: pycon

            >>> import gymnasium as gym
            >>> from irlc.ex13.buffer import BasicBuffer
            >>> env = gym.make("CartPole-v1")
            >>> b = BasicBuffer()
            >>> s, info = env.reset()
            >>> a = env.action_space.sample()
            >>> sp, r, done, _, _ = env.step(a)
            >>> b.push(s, a, r, sp, done)
            >>> S, A, R, SP, DONE = b.sample(batch_size=32)
            >>> S.shape # Dimension batch_size x n
            >>> R.shape # Dimension batch_size x 1

        :param batch_size: Number of elements to sample
        :return: A tuple ``(S, A, R, SP, DONE)``:
            - S - Array of sampled states, stacked along the first axis (``batch_size x n`` for vector states)
            - A - Array of sampled actions (first axis of size ``batch_size``)
            - R - Array of size ``batch_size x 1`` of sampled rewards
            - SP - Array of sampled next states, same layout as ``S``
            - DONE - Array of ``batch_size`` flags indicating if the environment terminated

        """
        assert len(self.buffer) > 0, "The replay buffer must be non-empty in order to sample a batch: Use push()"
        batch = random.choices(self.buffer, k=batch_size)
        # One list per field of the experience tuple: state, action, reward, next_state, done.
        columns = ([], [], [], [], [])
        for experience in batch:
            for column, value in zip(columns, experience):
                column.append(value)
        # Return a real tuple (not a lazy one-shot ``map``) so the result can be indexed and reused.
        return tuple(np.asarray(column) for column in columns)

    def __len__(self):
        """Return the number of elements currently stored."""
        return len(self.buffer)

    def save(self, path):
        """
        Use this to save the content of the buffer to a file

        :param path: Path where to save (use same argument with ``load``)
        :return: ``None``
        """
        cache_write(self.buffer, path)

    def load(self, path):
        """
        Use this to load buffer content from a file

        :param path: Path to load from (use same argument with ``save``)
        :return: ``None``
        """
        self.buffer = cache_read(path)
class DeepQAgent(TabularAgent):
    """
    Deep Q-learning agent with epsilon-greedy exploration and a replay buffer.

    Transitions are pushed to ``self.memory``; once it holds more than
    ``replay_buffer_minreplay`` elements, every call to ``train`` triggers ``experience_replay``.
    """
    def __init__(self, env, network=None, buffer=None, gamma=0.99, epsilon=None, alpha=0.001, batch_size=32,
                 replay_buffer_size=2000, replay_buffer_minreplay=500):
        """
        :param env: Environment the agent acts in
        :param network: Optional network class, called as ``network(env, trainable=True)``; defaults to ``QNetwork``
        :param buffer: Optional replay buffer; defaults to ``BasicBuffer(replay_buffer_size)``
        :param gamma: Discount factor
        :param epsilon: Exploration rate: either a number, or a callable ``f(steps, episodes)`` returning one
        :param alpha: Learning rate handed to the default ``QNetwork``
        :param batch_size: Number of transitions sampled per replay step
        :param replay_buffer_size: Capacity of the default buffer
        :param replay_buffer_minreplay: Buffer size that must be exceeded before learning starts
        """
        # Ensure 'epsilon' is a function to allow gradually decreasing exploration rate.
        # The constant is bound to its own name: a lambda that refers to `epsilon` would see the
        # rebound name (i.e. itself) and return a function instead of the number.
        # NOTE(review): the default epsilon=None yields a schedule returning None, which `pi`
        # cannot compare against a float -- callers should pass a number or a callable.
        if callable(epsilon):
            epsilon_schedule = epsilon
        else:
            constant_epsilon = epsilon
            epsilon_schedule = lambda steps, episodes: constant_epsilon
        super().__init__(env, gamma=gamma, epsilon=epsilon_schedule)
        self.memory = BasicBuffer(replay_buffer_size) if buffer is None else buffer
        # All the 'deep' stuff is handled by a separate class. For instance
        #     self.Q(s)
        # will return a [batch_size x actions] matrix of Q-values
        self.Q = network(env, trainable=True) if network else QNetwork(env, trainable=True, learning_rate=alpha)
        self.batch_size = batch_size
        self.replay_buffer_minreplay = replay_buffer_minreplay
        self.steps, self.episodes = 0, 0

    def pi(self, s, k, info_s=None):
        """Return an action for state ``s`` by epsilon-greedy exploration on ``self.Q``."""
        eps_ = self.epsilon(self.steps, self.episodes)  # get the current exploration rate
        # return action by regular epsilon-greedy exploration
        # (s gets a leading batch axis because the network is evaluated on batches)
        return self.env.action_space.sample() if np.random.rand() < eps_ else np.argmax(self.Q(s[np.newaxis,...]))

    def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
        """Store the transition, run one replay step if enough data is available, and update the counters."""
        self.memory.push(s, a, r, sp, done)  # save current observation
        if len(self.memory) > self.replay_buffer_minreplay:
            self.experience_replay()  # do the actual training step
        # `done` is a bool, so adding it increments the episode counter exactly when an episode ends.
        self.steps, self.episodes = self.steps + 1, self.episodes + done

    def experience_replay(self):
        """
        Perform the actual deep-Q learning step.

        The actual learning is handled by calling self.Q.fit(s,target)
        where s is defined as below (i.e. all states from the replay buffer)
        and target is the desired value of self.Q(s).

        Note that target must therefore be of size Batch x Actions. In other words fit minimize

        |Q(s) - target|^2

        which must implement the proper cost. This can be done by setting most entries of target equal to self.Q(s)
        and the other equal to y, which is Q-learning target for Q(s,a). """
        # First we sample from replay buffer. Returns numpy arrays whose first dimension is
        # self.batch_size; e.g. with BasicBuffer 'r' will be of dimension [self.batch_size x 1].
        s,a,r,sp,done = self.memory.sample(self.batch_size)
        # Exercise stub: computing `target` is left for the student.
        # TODO: 3 lines missing.
        raise NotImplementedError("Insert your solution and remove this error.")
        self.Q.fit(s, target)

    def save(self, path):  # allows us to save/load model
        """Save the network, the step/episode counters and the replay buffer into directory ``path``."""
        if not os.path.isdir(path):
            os.makedirs(path)
        self.Q.save(os.path.join(path, "Q"))
        cache_write(dict(steps=self.steps, episodes=self.episodes), os.path.join(path, "agent.pkl"))
        mpath = os.path.join(path, "memory.pkl")
        import shutil
        if os.path.isfile(mpath):
            # Keep the previous buffer file as a backup before writing the new one.
            shutil.move(mpath, mpath +".backup")  # shuffle file
        self.memory.save(mpath)

    def load(self, path):  # allows us to save/load model
        """
        Restore the state written by ``save``.

        :return: ``False`` if ``agent.pkl`` is not present in ``path`` (nothing is loaded), else ``True``
        """
        if not cache_exists(os.path.join(path, "agent.pkl")):
            return False
        # Restore the saved counters (steps, episodes) as attributes.
        for k, v in cache_read(os.path.join(path, "agent.pkl")).items():
            self.__dict__[k] = v
        self.Q.load(os.path.join(path, "Q"))
        self.memory.load(os.path.join(path, "memory.pkl"))
        return True

    def __str__(self):
        return f"basic_DQN{self.gamma}"
class DoubleQAgent(DeepQAgent):
    """
    Double deep Q-learning agent: a ``DeepQAgent`` with an additional, non-trainable target network.
    """
    def __init__(self, env, network=None, buffer=None, gamma=0.99, epsilon=0.2, alpha=0.001, tau=0.1, batch_size=32,
                 replay_buffer_size=2000, replay_buffer_minreplay=500):
        """
        :param env: Environment the agent acts in
        :param network: Optional network class used for both the online and the target network
        :param buffer: Optional replay buffer, forwarded to ``DeepQAgent``
        :param gamma: Discount factor
        :param epsilon: Exploration rate (number or callable), forwarded to ``DeepQAgent``
        :param alpha: Learning rate handed to the networks
        :param tau: Rate at which the target network weights are updated
        :param batch_size: Number of transitions sampled per replay step
        :param replay_buffer_size: Capacity of the default buffer
        :param replay_buffer_minreplay: Buffer size that must be exceeded before learning starts
        """
        super().__init__(env, network=network, buffer=buffer, gamma=gamma,epsilon=epsilon, alpha=alpha, batch_size=batch_size,
                         replay_buffer_size=replay_buffer_size, replay_buffer_minreplay=replay_buffer_minreplay)
        # The target network play the role of q_{phi'} in the slides.
        self.target = QNetwork(env, learning_rate=alpha, trainable=False) if network is None else network(env, learning_rate=alpha, trainable=False)
        self.tau = tau  # Rate at which the weights in the target network is updated (see slides)

    def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
        """Store the transition; once enough data is available run a replay step and update the target network."""
        self.memory.push(s, a, r, sp, done)
        if len(self.memory) > self.replay_buffer_minreplay:
            self.experience_replay()
            # Exercise stub: the target-network update is left for the student.
            # TODO: 1 lines missing.
            raise NotImplementedError("update Phi here in the self.target network")
        self.steps, self.episodes = self.steps + 1, self.episodes + done

    def experience_replay(self):
        """ Update the double-Q method, i.e. make sure to select actions a' using self.Q
        but evaluate the Q-values using the target network (see slides).
        In other words,
        > self.target(s)
        is a Q-function network which evaluates
        > q-hat_{\\phi'}(s,:).
        Asides this, the code will be nearly identical to the basic DQN agent """
        s,a,r,sp,done = self.memory.sample(self.batch_size)
        # Exercise stub: computing `target` is left for the student.
        # TODO: 5 lines missing.
        raise NotImplementedError("Insert your solution and remove this error.")
        self.Q.fit(s, target=target)

    def save(self, path):
        """Save everything ``DeepQAgent.save`` stores plus the target network (as ``Q_target``)."""
        super().save(path)
        self.target.save(os.path.join(path, "Q_target"))  # also save target network

    def load(self, path):
        """
        Restore the state written by ``save``.

        :return: The result of ``DeepQAgent.load`` (``False`` when nothing was loaded)
        """
        loaded = super().load(path)
        if loaded:
            # The checkpoint written by save() belongs to the target network; loading it into
            # self.Q would overwrite the online network and leave the target un-restored.
            self.target.load(os.path.join(path, "Q_target"))  # also load target network
        return loaded

    def __str__(self):
        return f"doubleDQN_{self.gamma}"
    def fit(self, s, target):
        """
        Fit the network weights by minimizing

        .. math::
            \\frac{1}{B}\\sum_{i=1}^B \\sum_{a=1}^K \\| q_\\phi(s_i)_a - y_{i,a} \\|^2

        where ``target`` corresponds to :math:`y` and is a ``[batch_size x actions]`` matrix of target Q-values.

        This base-class method is abstract: it always raises, and a concrete network must override it.

        :param s: The (batched) states, a matrix of size ``batch_size x n`` where :math:`n` is the state dimension.
        :param target: The target Q-values :math:`y`, a ``[batch_size x actions]`` matrix.
        :return: Nothing is returned by this base implementation.
        :raises NotImplementedError: Always.
        """
        raise NotImplementedError
a/irlc/ex13/dyna_q.py b/irlc/ex13/dyna_q.py new file mode 100644 index 0000000000000000000000000000000000000000..a764bef575e35b50f993f7f4e7502520df0caac6 --- /dev/null +++ b/irlc/ex13/dyna_q.py @@ -0,0 +1,89 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +""" + +References: + [SB18] Richard S. Sutton and Andrew G. Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online). +""" +import numpy as np +from irlc.ex01.agent import train +import gymnasium as gym +from irlc import main_plot +import matplotlib.pyplot as plt +from irlc import savepdf +from irlc.ex11.sarsa_agent import SarsaAgent +from irlc.ex11.q_agent import QAgent +from irlc.ex12.sarsa_lambda_agent import SarsaLambdaAgent +from irlc.ex13.maze_dyna_environment import MazeEnvironment + +class DynaQ(QAgent): + """ + Implement the tabular dyna-Q agent (SB18, Section 8.7). + """ + def __init__(self, env, gamma=1.0, alpha=0.5, epsilon=0.1, n=5): + super().__init__(env, gamma, alpha=alpha, epsilon=epsilon) + """ + Model is a list of experience, i.e. of the form + Model = [ (s_t, a_t, r_{t+1}, s_{t+1}, done_t), ...] + """ + self.Model = [] + self.n = n # number of planning steps + + def q_update(self, s, a, r, sp, done=False, info_s=None, info_sp=None): + """ + Update the Q-function self.Q[s,a] as in regular Q-learning + """ + # TODO: 1 lines missing. + raise NotImplementedError("Implement function body") + + def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None): + self.q_update(s,a,r,sp,done, info_s, info_sp) + self.Model.append( (s,a, r,sp, done)) + for _ in range(self.n): + """ Obtain a random transition from the replay buffer. You can use np.random.randint + then call self.q_update on the random sample. """ + # TODO: 2 lines missing. 
+ raise NotImplementedError("Implement function body") + + def __str__(self): + return f"DynaQ_{self.gamma}_{self.epsilon}_{self.alpha}_{self.n}" + + +def dyna_experiment(env, env_name='maze',num_episodes=50,epsilon=0.1, alpha=0.1, gamma=.95, runs=2): + for _ in range(runs): # Increase runs for nicer error bars + agents = [QAgent(env, epsilon=epsilon, alpha=alpha,gamma=gamma), + SarsaAgent(env, epsilon=epsilon, alpha=alpha, gamma=gamma), + SarsaLambdaAgent(env, epsilon=epsilon, alpha=alpha, gamma=gamma,lamb=0.9), + DynaQ(env, epsilon=epsilon, alpha=alpha,gamma=gamma,n=5), + DynaQ(env, epsilon=epsilon, alpha=alpha,gamma=gamma, n=50), + ] + + experiments = [] + for agent in agents: + expn = f"experiments/b{env_name}_{str(agent)}" + train(env, agent, expn, num_episodes=num_episodes, max_runs=100) + experiments.append(expn) + return experiments + +if __name__ == "__main__": + from irlc.ex09.mdp import MDP2GymEnv + """ The maze-environment is created as an MDP, and we then convert it to a Gym environment. + Alternatively, use the irlc.gridworld.gridworld_environments.py - method to specify the layout as in the other gridworld examples. """ + env = MDP2GymEnv(MazeEnvironment()) + experiments = dyna_experiment(env, env_name='maze',num_episodes=50,epsilon=0.1, alpha=0.1, gamma=.95, runs=4) + main_plot(experiments, smoothing_window=None, y_key="Length") + plt.ylim([0, 500]) + plt.title("Dyna Q on simple Maze (Figure 8.2)") + savepdf("dynaq_maze_8_2") + plt.show() + + # Part 2: Cliffwalking as reference. + env = gym.make('CliffWalking-v0') + gamma, alpha, epsilon = 1, 0.5, 0.1 + # Call the dyna_experiment(...) function here similar to the previous call but using new parameters. + # TODO: 1 lines missing. 
class MaximizationBiasEnvironment(MDP):
    """
    The Maximization Bias example from (SB18, Example 6.7).

    The MDP has a start state A (``0``), a state B (``1``) and a terminal state (``2``). In A the
    actions are ``LEFT`` and ``RIGHT``; in B there are ``B_actions`` interchangeable actions (the
    number is not specified in the example).

    For easy implementation, the number of possible transitions from state B to the terminal state
    is fixed to ``normal_transitions``. Each time ``Psr`` is called for state B, the rewards are drawn
    from a standard normal distribution and shifted so that their sample mean is exactly -0.1, i.e.
    no action in B is preferred over another.
    """
    def __init__(self, B_actions=10, normal_transitions=100, **kwargs):
        """
        :param B_actions: Number of actions available in state B.
        :param normal_transitions: Number of equally likely (terminal state, reward) outcomes generated for state B.
        :param kwargs: Passed on unchanged to the ``MDP`` constructor.
        """
        self.state_A = 0
        self.state_B = 1
        self.LEFT = 0
        self.RIGHT = 1
        self.B_actions = B_actions
        self.n_transitions = normal_transitions
        super().__init__(initial_state=self.state_A, **kwargs)

    def is_terminal(self, state):
        """ Return ``True`` when ``state`` is the terminal state (``2``). """
        return state == 2

    def A(self, s):
        """ Return the list of actions available in state ``s``. """
        if s == self.state_A:
            return [self.LEFT, self.RIGHT]
        elif s == self.state_B:  # in state B
            return list(range(self.B_actions))
        else:
            return [0]  # terminal; return a dummy action 0 which does nothing (some code is sensitive to empty action spaces)

    def Psr(self, s, a):
        """
        Return the transition model for taking action ``a`` in state ``s``.

        :param s: The current state.
        :param a: The action taken in ``s``.
        :return: A dictionary mapping ``(next_state, reward)`` pairs to their probability.
        """
        t = 2  # terminal state
        if s == self.state_A:
            if a == self.RIGHT:
                # TODO: 1 lines missing.
                raise NotImplementedError("Implement what the environment does in state A with a RIGHT action")
            else:
                # TODO: 1 lines missing.
                raise NotImplementedError("Implement what the environment does in state A with a LEFT action")
        else:  # s is in state B (this branch is also reached for any other state that is not A)
            p = 1/self.n_transitions  # transition probability
            rewards = [np.random.randn() for _ in range(self.n_transitions)]
            # The sample mean is the same for every element, so compute it once instead of once per reward.
            mean_reward = np.mean(rewards)
            rewards = [r - mean_reward - 0.1 for r in rewards]  # shift so that the sample mean is exactly -0.1
            return {(t, r): p for r in rewards}
+ """ + env = MDP2GymEnv(MaximizationBiasEnvironment()) + + for _ in range(100): + epsilon = 0.1 + alpha = 0.1 + gamma = 1 + agents = [QAgent(env, epsilon=epsilon, alpha=alpha), + SarsaAgent(env, epsilon=epsilon, alpha=alpha), + TabularDoubleQ(env, epsilon=epsilon, alpha=alpha)] + + experiments = [] + for agent in agents: + expn = f"experiments/bias_{str(agent)}" + train(env, agent, expn, num_episodes=300, max_runs=100) + experiments.append(expn) + + main_plot(experiments, smoothing_window=10, y_key="Length") + plt.ylim([1, 2]) + plt.title("Double-Q learning on Maximization-Bias ex. (Figure 6.5)") + savepdf("maximization_bias_6_5") + plt.show() + + main_plot(experiments, smoothing_window=10) + savepdf("maximization_bias_6_5_reward") + plt.show() diff --git a/irlc/ex13/maze_dyna_environment.py b/irlc/ex13/maze_dyna_environment.py new file mode 100644 index 0000000000000000000000000000000000000000..771af4903e4ee3bb81e097e540c3b4146ee5c1cc --- /dev/null +++ b/irlc/ex13/maze_dyna_environment.py @@ -0,0 +1,118 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +""" +The DynaQ Maze environment. + +References: + [SB18] Richard S. Sutton and Andrew G. Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online). +""" + +from irlc.ex09.mdp import MDP + +class MazeEnvironment(MDP): + """ + The Maze environment from (SB18, Example 8.1) + """ + def __init__(self, **kwargs): + self.maze_ = HiddenMaze() + super().__init__(initial_state=tuple(self.maze_.START_STATE), **kwargs) + + def is_terminal(self, state): + return state == tuple(self.maze_.GOAL_STATES[0]) + + def A(self, s): + return self.maze_.actions + + def Psr(self, s, a): + xy, r = self.maze_.step(list(s), a) + return { (tuple(xy), r): 1 } + +# A wrapper class for a maze, containing all the information about the maze. 
class HiddenMaze:
    """
    Container for all information about a maze: its size, the actions, start/goal cells and obstacles.

    By default it is initialized to the Dyna maze, but it can easily be adapted to other mazes.
    A state is a pair ``[x, y]`` where ``x`` is bounded by ``WORLD_HEIGHT`` and ``y`` by ``WORLD_WIDTH``.
    """
    def __init__(self):
        # Dimensions of the maze (width, height)
        self.WORLD_WIDTH = 9
        self.WORLD_HEIGHT = 6

        # The four possible actions
        self.ACTION_UP = 0
        self.ACTION_DOWN = 1
        self.ACTION_LEFT = 2
        self.ACTION_RIGHT = 3
        self.actions = [self.ACTION_UP, self.ACTION_DOWN, self.ACTION_LEFT, self.ACTION_RIGHT]

        # Where an episode starts and where it ends
        self.START_STATE = [2, 0]
        self.GOAL_STATES = [[0, 8]]

        # Cells that cannot be entered
        self.obstacles = [[1, 2], [2, 2], [3, 2], [0, 7], [1, 7], [2, 7], [4, 5]]
        self.old_obstacles = None
        self.new_obstacles = None

        # Time at which the obstacles change
        self.obstacle_switch_time = None

        # Shape of a table of q-values for this maze
        self.q_size = (self.WORLD_HEIGHT, self.WORLD_WIDTH, len(self.actions))

        # Maximum number of steps
        self.max_steps = float('inf')

        # Resolution of this maze
        self.resolution = 1

    def extend_state(self, state, factor):
        """
        Extend a state to a higher-resolution maze.

        :param state: State in the lower-resolution maze.
        :param factor: Extension factor; one state becomes ``factor**2`` states.
        :return: List of the corresponding states in the higher-resolution maze.
        """
        base_x, base_y = state[0] * factor, state[1] * factor
        return [[base_x + dx, base_y + dy] for dx in range(factor) for dy in range(factor)]

    def extend_maze(self, factor):
        """
        Build a higher-resolution copy of this maze.

        :param factor: Extension factor; each state of this maze becomes ``factor**2`` states.
        :return: A new ``HiddenMaze`` with scaled size, start state, goal states and obstacles.
        """
        scaled = HiddenMaze()
        scaled.WORLD_WIDTH = self.WORLD_WIDTH * factor
        scaled.WORLD_HEIGHT = self.WORLD_HEIGHT * factor
        scaled.START_STATE = [self.START_STATE[0] * factor, self.START_STATE[1] * factor]
        scaled.GOAL_STATES = self.extend_state(self.GOAL_STATES[0], factor)
        scaled.obstacles = [cell for blocked in self.obstacles for cell in self.extend_state(blocked, factor)]
        scaled.q_size = (scaled.WORLD_HEIGHT, scaled.WORLD_WIDTH, len(scaled.actions))
        scaled.resolution = factor
        return scaled

    def step(self, state, action):
        """
        Take ``action`` in ``state``.

        Moves are clipped at the border of the maze, and a move into an obstacle leaves the state unchanged.

        :param state: The current state ``[x, y]``.
        :param action: One of the four ``ACTION_*`` constants; any other value results in no movement.
        :return: A pair ``([x, y], reward)`` where the reward is 1.0 if the new state is a goal state and 0.0 otherwise.
        """
        x, y = state
        if action == self.ACTION_UP:
            x = max(x - 1, 0)
        elif action == self.ACTION_DOWN:
            x = min(x + 1, self.WORLD_HEIGHT - 1)
        elif action == self.ACTION_LEFT:
            y = max(y - 1, 0)
        elif action == self.ACTION_RIGHT:
            y = min(y + 1, self.WORLD_WIDTH - 1)

        # Bumping into an obstacle: stay where we were.
        if [x, y] in self.obstacles:
            x, y = state

        reward = 1.0 if [x, y] in self.GOAL_STATES else 0.0
        return [x, y], reward
+ """ + def __init__(self, env, gamma=1.0, alpha=0.5, epsilon=0.1): + super().__init__(env, gamma, epsilon) + self.alpha = alpha + # The two Q-value functions. These are of the same type as the regular self.Q function + from irlc.ex09.rl_agent import TabularQ + self.Q1 = TabularQ(env) + self.Q2 = TabularQ(env) + self.Q = None # remove self.Q (we will not use it in double Q) + + def pi(self, s, k, info=None): + """ + Implement the epsilon-greedy action. The implementation is nearly identical to pi_eps in the Agent class + which can be used for inspiration, however we should use Q1+Q2 as the Q-value. + """ + a1, Q1 = self.Q1.get_Qs(s, info) + a2, Q2 = self.Q2.get_Qs(s, info) + Q = np.asarray(Q1) + np.asarray(Q2) + + # TODO: 1 lines missing. + raise NotImplementedError("Return epsilon-greedy action using Q") + + + def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None): + """ + Implement the double-Q learning rule, i.e. with probability np.random.rand() < 0.5 switch + the role of the two Q networks Q1 and Q2. Use the code for the regular Q-agent as inspiration. + """ + # TODO: 4 lines missing. 
+ raise NotImplementedError("Implement function body") + + def __str__(self): + return f"TabularDoubleQ_{self.gamma}_{self.epsilon}_{self.alpha}" + +if __name__ == "__main__": + """ Part 1: Cliffwalking """ + env = gym.make('CliffWalking-v0') + epsilon = 0.1 + alpha = 0.25 + gamma = 1.0 + for _ in range(20): + agents = [QAgent(env, gamma=1, epsilon=epsilon, alpha=alpha), + SarsaAgent(env, gamma=1, epsilon=epsilon, alpha=alpha), + TabularDoubleQ(env, gamma=1, epsilon=epsilon, alpha=alpha)] + + experiments = [] + for agent in agents: + expn = f"experiments/doubleq_cliffwalk_{str(agent)}" + train(env, agent, expn, num_episodes=500, max_runs=20) + experiments.append(expn) + + main_plot(experiments, smoothing_window=10) + plt.ylim([-100, 0]) + plt.title("Double-Q learning on " + env.spec.name) + savepdf("double_Q_learning_cliff") + plt.show() diff --git a/irlc/ex13/torch_networks.py b/irlc/ex13/torch_networks.py new file mode 100644 index 0000000000000000000000000000000000000000..9ea56b5b32b2d92f7a67fcae747107eb4b14d656 --- /dev/null +++ b/irlc/ex13/torch_networks.py @@ -0,0 +1,131 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
class TorchNetwork(nn.Module,DQNNetwork):
    """
    A fully connected Q-network implemented in torch.

    The network maps a batch of states to one Q-value per action. The number of inputs and outputs
    is read from ``env.observation_space`` and ``env.action_space``.
    """
    def __init__(self, env, trainable=True, learning_rate=0.001, hidden=30):
        """
        :param env: Environment used to determine the input size and the number of actions.
        :param trainable: If true, an Adam optimizer is created so ``fit`` can be called.
        :param learning_rate: Learning rate of the optimizer.
        :param hidden: Number of units in each hidden layer.
        """
        nn.Module.__init__(self)
        DQNNetwork.__init__(self)
        self.env = env
        self.hidden = hidden
        self.actions = env.action_space.n
        self.build_model_()
        # Move the parameters to the GPU before the optimizer is constructed from them.
        if USE_CUDA:
            self.cuda()
        if trainable:
            self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def build_feature_network(self):
        """ Return the layers (as a tuple) shared by all heads: two hidden layers with ReLU activations. """
        num_observations = int(np.prod(self.env.observation_space.shape))
        return (nn.Linear(num_observations, self.hidden),
                nn.ReLU(),
                nn.Linear(self.hidden, self.hidden),
                nn.ReLU())

    def build_model_(self):
        """ Create ``self.model``: the feature network followed by a linear layer with one output per action. """
        num_actions = self.env.action_space.n
        self.model = nn.Sequential(*self.build_feature_network(), nn.Linear(self.hidden,num_actions))

    def forward(self, s):
        """
        Evaluate the network and return the Q-values as a torch tensor (gradients are tracked).

        :param s: The (batched) states.
        :return: Tensor of Q-values, one row per state and one column per action.
        """
        s = Variable(torch.FloatTensor(s))
        s = self.model(s)
        return s

    def __call__(self, s):
        """
        Evaluate the Q-values in the given (batched) states and return them as a numpy array.

        :param s: The (batched) states.
        :return: Numpy array of Q-values, one row per state and one column per action.
        """
        # .cpu() is a no-op on the CPU but is required before .numpy() when the network lives on the GPU.
        return self.forward(s).detach().cpu().numpy()

    def fit(self, s, target):
        """
        Perform a single gradient step on the squared error between the Q-values in ``s`` and ``target``.

        The loss is the squared difference summed over actions and averaged over the batch.

        :param s: The (batched) states.
        :param target: Target Q-values, one row per state and one column per action.
        """
        q_value = self.forward(s)
        # Variable(...) moves the targets to the GPU when CUDA is used, so both operands are on the same device.
        y = Variable(torch.FloatTensor(target))
        loss = (q_value - y).pow(2).sum(dim=1).mean()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_Phi(self, source, tau=1):
        """
        Polyak-adapt the weights of this network towards those of ``source``.

        Each weight ``w`` of this network is replaced by ``w*(1 - tau) + w_source*tau``. Hence ``tau=1``
        (the default) overwrites the weights with those of ``source`` in one step, whereas a small
        value such as ``tau=0.001`` adopts them very slowly.

        :param source: The network whose weights are adopted. Its ``state_dict`` must have the same keys as this network.
        :param tau: Update rate.
        """
        state = self.state_dict()
        source_state = source.state_dict()  # fetch once rather than once per weight
        for k, wa in state.items():
            wb = source_state[k]
            state[k] = wa*(1 - tau) + wb * tau
        self.load_state_dict(state)

    def save(self, path):
        """
        Save the weights to the file ``path + ".torchsave"``; missing parent directories are created.

        :param path: File name (without extension) to save to.
        """
        directory = os.path.dirname(path)
        if directory:  # an empty dirname means the current directory; there is nothing to create
            os.makedirs(directory, exist_ok=True)
        torch.save(self.state_dict(), path+".torchsave")

    def load(self, path):
        """
        Load weights previously written by ``save`` from the file ``path + ".torchsave"``.

        :param path: File name (without extension) to load from.
        """
        # Read the tensors onto the CPU so that a file written on a GPU machine can be loaded without CUDA;
        # load_state_dict copies the values into the parameters on whatever device they live.
        self.load_state_dict(torch.load(path+".torchsave", map_location="cpu"))
        self.eval()  # set batch norm layers, dropout, other stuff we don't use
if __name__ == "__main__":
    # Small demonstration of how a Q-network is evaluated and trained on a batch of states.
    import gymnasium as gym

    env = gym.make("CartPole-v0")
    # DQNNetwork is the abstract interface and takes no constructor arguments; use the torch implementation.
    Q = TorchNetwork(env, trainable=True, learning_rate=0.001)

    """ Assuming s has dimension [batch_dim x d] this returns a float numpy Array
    array of Q-values of [batch_dim x actions], such that qvals[i,a] = Q(s_i,a) """
    batch_size = 32  # As an example
    # Creates some dummy input. A list has no .shape, so stack the observations into a single array.
    states = np.asarray([env.reset()[0] for _ in range(batch_size)])
    print(states.shape)  # batch_size x n

    qvals = Q(states)
    print(qvals.shape)  # batch_size x actions
    print(qvals[0,1])  # Get Q(s_0, 1)

    # Generate target Q-values (training data). The targets must have one column per action,
    # and np.random.rand takes the dimensions as separate arguments rather than as a tuple.
    Y = np.random.rand(batch_size, env.action_space.n)
    Q.fit(states, Y)  # Train the Q-network.
diff --git a/irlc/exam/exam2023spring/__init__.py b/irlc/exam/exam2023spring/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a56057c84d0ceac54aab1d40ba0f370c77fe10be --- /dev/null +++ b/irlc/exam/exam2023spring/__init__.py @@ -0,0 +1 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. diff --git a/irlc/exam/exam2023spring/readme.md b/irlc/exam/exam2023spring/readme.md new file mode 100644 index 0000000000000000000000000000000000000000..c041c5216f5a12754b844c5d316e1d0e835ec09a --- /dev/null +++ b/irlc/exam/exam2023spring/readme.md @@ -0,0 +1,2 @@ +This directory is purposefully left empty. During the exam, you will be given a `.zip` file with the content of this directory. +Replace this directory with the corresponding directory from the `.zip` file to begin working on the exam. diff --git a/irlc/exam/exam2023spring/solution/readme.md b/irlc/exam/exam2023spring/solution/readme.md new file mode 100644 index 0000000000000000000000000000000000000000..8d673296bd5e370246c88be8ff6eff946d4ce4f1 --- /dev/null +++ b/irlc/exam/exam2023spring/solution/readme.md @@ -0,0 +1 @@ +I will make the solution to the exam available in this directory. diff --git a/irlc/exam/exam2024spring/__init__.py b/irlc/exam/exam2024spring/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a56057c84d0ceac54aab1d40ba0f370c77fe10be --- /dev/null +++ b/irlc/exam/exam2024spring/__init__.py @@ -0,0 +1 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
diff --git a/irlc/exam/exam2024spring/readme.md b/irlc/exam/exam2024spring/readme.md new file mode 100644 index 0000000000000000000000000000000000000000..c041c5216f5a12754b844c5d316e1d0e835ec09a --- /dev/null +++ b/irlc/exam/exam2024spring/readme.md @@ -0,0 +1,2 @@ +This directory is purposefully left empty. During the exam, you will be given a `.zip` file with the content of this directory. +Replace this directory with the corresponding directory from the `.zip` file to begin working on the exam. diff --git a/irlc/exam/midterm2023a/__init__.py b/irlc/exam/midterm2023a/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a56057c84d0ceac54aab1d40ba0f370c77fe10be --- /dev/null +++ b/irlc/exam/midterm2023a/__init__.py @@ -0,0 +1 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. diff --git a/irlc/exam/midterm2023a/readme.md b/irlc/exam/midterm2023a/readme.md new file mode 100644 index 0000000000000000000000000000000000000000..c041c5216f5a12754b844c5d316e1d0e835ec09a --- /dev/null +++ b/irlc/exam/midterm2023a/readme.md @@ -0,0 +1,2 @@ +This directory is purposefully left empty. During the exam, you will be given a `.zip` file with the content of this directory. +Replace this directory with the corresponding directory from the `.zip` file to begin working on the exam. diff --git a/irlc/exam/midterm2023a/solution/readme.md b/irlc/exam/midterm2023a/solution/readme.md new file mode 100644 index 0000000000000000000000000000000000000000..8d673296bd5e370246c88be8ff6eff946d4ce4f1 --- /dev/null +++ b/irlc/exam/midterm2023a/solution/readme.md @@ -0,0 +1 @@ +I will make the solution to the exam available in this directory. 
diff --git a/irlc/exam/midterm2023b/__init__.py b/irlc/exam/midterm2023b/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a56057c84d0ceac54aab1d40ba0f370c77fe10be --- /dev/null +++ b/irlc/exam/midterm2023b/__init__.py @@ -0,0 +1 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. diff --git a/irlc/exam/midterm2023b/readme.md b/irlc/exam/midterm2023b/readme.md new file mode 100644 index 0000000000000000000000000000000000000000..c041c5216f5a12754b844c5d316e1d0e835ec09a --- /dev/null +++ b/irlc/exam/midterm2023b/readme.md @@ -0,0 +1,2 @@ +This directory is purposefully left empty. During the exam, you will be given a `.zip` file with the content of this directory. +Replace this directory with the corresponding directory from the `.zip` file to begin working on the exam. diff --git a/irlc/exam/midterm2023b/solution/readme.md b/irlc/exam/midterm2023b/solution/readme.md new file mode 100644 index 0000000000000000000000000000000000000000..8d673296bd5e370246c88be8ff6eff946d4ce4f1 --- /dev/null +++ b/irlc/exam/midterm2023b/solution/readme.md @@ -0,0 +1 @@ +I will make the solution to the exam available in this directory. diff --git a/irlc/exam/readme.md b/irlc/exam/readme.md new file mode 100644 index 0000000000000000000000000000000000000000..c189b31a4b85f0ec947ad1761de0e72448513e77 --- /dev/null +++ b/irlc/exam/readme.md @@ -0,0 +1,15 @@ +# Folder for the exam and midterms + +Before the exam: + - Ensure that the `irlc`-code generally works (you can run exercises, the packages we use such as `gymnasium` or `numpy` are installed, etc.) 
def keyboard_play_value(env, agent, method_label='MC', q=False):
    """
    Let the user control ``agent`` in ``env`` from the keyboard for 100 episodes while the value function is shown.

    The environment and agent are wrapped by ``interactive``, the agent is labelled with ``method_label``,
    the environment is switched to view mode 1, and it is closed once training has finished.

    :param env: The environment to play in.
    :param agent: The agent whose value estimates are displayed.
    :param method_label: Label assigned to ``agent.label``.
    :param q: Currently unused; kept so existing callers continue to work.
    """
    env, agent = interactive(env, agent)
    # Use the label supplied by the caller. It used to be overwritten by the hard-coded
    # 'MC (first visit)', which mislabelled e.g. the every-visit example.
    agent.label = method_label
    env.view_mode = 1  # Set value-function view-mode.
    train(env, agent, num_episodes=100)
    env.close()
class SarsaLambdaDelayAgent(QAgent):
    """Sarsa(lambda) with the Q-update for step t postponed to step t+1.

    ``train`` buffers the latest transition in ``s_prev``/``a_prev``/``r_prev`` and
    applies its update on the following call, when the action that was actually
    taken next is passed in as ``a``. Eligibility traces live in ``self.e`` and are
    keyed by ``(state, action)``.
    """

    def __init__(self, env, gamma=0.99, epsilon=0.1, alpha=0.5, lamb=0.9):
        super().__init__(env, gamma=gamma, alpha=alpha, epsilon=epsilon)
        self.lamb = lamb  # trace-decay parameter
        self.method = 'Sarsa(Lambda)'
        self.e = defaultdict(float)

    def pi(self, s, k, info=None):
        """Remember the step index ``k`` for ``train`` and return an epsilon-greedy action."""
        self.t = k
        return self.pi_eps(s, info=info)

    def lmb_update(self, s, a, r, sp, ap, done):
        """Apply one TD-error to every traced pair, then decay all traces.

        The bootstrap term ``Q[sp, ap]`` is replaced by 0 when ``done`` is true.
        """
        bootstrap = 0 if done else self.Q[sp, ap]
        td_error = r + self.gamma * bootstrap - self.Q[s, a]
        for (state, action), trace in self.e.items():
            self.Q[state, action] += self.alpha * td_error * trace
            self.e[(state, action)] = self.gamma * self.lamb * trace

    def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
        """Flush the buffered transition (if any), bump the trace of ``(s, a)``, buffer this one."""
        if self.t > 0:
            # The previous transition can be completed now that (s, a) is known.
            self.lmb_update(self.s_prev, self.a_prev, self.r_prev, s, a, done=False)
        self.e[(s, a)] += 1

        if done:
            # No later call will arrive for this episode: update immediately, without bootstrap.
            self.lmb_update(s, a, r, sp, ap=None, done=True)
            self.e.clear()

        self.s_prev, self.a_prev, self.r_prev = s, a, r

    def __str__(self):
        return f"SarsaLambdaDelay_{self.gamma}_{self.epsilon}_{self.alpha}_{self.lamb}"
def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
    """Store the transition and perform the n-step Sarsa update(s) that are now possible.

    ``self.S``, ``self.A`` and ``self.R`` are lists of length ``n+1`` that are indexed
    modulo ``n+1``; ``self.t`` is the step index saved by ``pi``. While the episode is
    running, only time step ``t - n`` is updated, and its bootstrap term uses the pair
    ``(s, a)`` passed to this call. When ``done`` is true, every step from ``t - n``
    up to and including ``t`` is updated.

    :param s: State S_t.
    :param a: Action A_t taken in ``s``.
    :param r: Reward R_{t+1}.
    :param sp: Next state S_{t+1}; not read by this method.
    :param done: If true the episode length is taken to be T = t+1.
    :param info_s: Not used.
    :param info_sp: Not used.
    :raises Exception: If ``n == 1`` and the computed TD-error disagrees with the one-step Sarsa error.
    """
    # Recall we are given S_t, A_t, R_{t+1}, S_{t+1}; done means the episode ends at T = t+1.
    n, t = self.n, self.t
    # Store current observations in buffer.
    self.S[t%(n+1)] = s
    self.A[t%(n+1)] = a # self.pi_eps(sp) if not done else -1
    self.R[(t+1)%(n+1)] = r
    if done:
        T = t+1
        tau_steps_to_train = range(t - n, T)
    else:
        T = 1e10  # stand-in for "episode end not known yet"
        tau_steps_to_train = [t - n ] if t > 0 else []

    # Tau represent the current tau-steps which are to be updated. The notation is compatible with that in Sutton.
    for tau in tau_steps_to_train:
        if tau >= 0:  # negative tau: fewer than n steps have been taken so far
            """
            Compute the return for this tau-step and perform the relevant Q-update.
            The first step is to compute the expected return G in the below section.
            """
            # Discounted sum of the rewards R_{tau+1}, ..., R_{min(tau+n, T)}.
            G = sum([self.gamma**(i-tau-1)*self.R[i%(n+1)] for i in range(tau+1, min(tau+n, T)+1)])
            S_tau_n, A_tau_n = self.S[(tau+n)%(n+1)], self.A[(tau+n)%(n+1)]
            if tau+n < T:
                # Step tau+n lies before the end of the episode: add the bootstrap term.
                G += self.gamma**n * self._q(S_tau_n, A_tau_n)
            S_tau, A_tau = self.S[tau%(n+1)], self.A[tau%(n+1)]
            delta = G - self._q(S_tau, A_tau)

            if n == 1: # Check your implementation is correct when n=1 by comparing it with regular Sarsa learning.
                delta_Sarsa = (self.R[ (tau+1)%(n+1) ] + (0 if tau+n==T else self.gamma * self._q(S_tau_n,A_tau_n)) - self._q(S_tau,A_tau))
                if abs(delta-delta_Sarsa) > 1e-10:
                    raise Exception("n=1 agreement with Sarsa learning failed. You have at least one bug!")
            self._upd_q(S_tau, A_tau, delta)
def open_snapshop(Agent, method_label="Unknown method", file_name=None, alpha=0.5, autoplay=False, **kwargs):
    """Run an agent for three episodes in the open gridworld and optionally save a plot.

    :param Agent: Agent class; instantiated as ``Agent(env, gamma=0.99, epsilon=0.1, alpha=alpha, **kwargs)``.
    :param method_label: Stored in ``agent.label`` and used as the plot title.
    :param file_name: If not ``None``, the environment is plotted and saved via
        ``savepdf("exam_tabular_" + file_name)``.
    :param alpha: Forwarded to the agent constructor.
    :param autoplay: Forwarded to ``interactive``.
    :param kwargs: Extra keyword arguments for the agent constructor.
    """
    env = OpenGridEnvironment(render_mode='human')
    try:
        agent = Agent(env, gamma=0.99, epsilon=0.1, alpha=alpha, **kwargs)
        agent.label = method_label
        print("Running", agent)
        env, agent = interactive(env, agent, autoplay=autoplay)
        train(env, agent, num_episodes=3)
        if file_name is not None:
            env.plot()
            plt.title(method_label)
            savepdf("exam_tabular_"+file_name)
    finally:
        # Close the environment even when one of the steps above raises.
        env.close()
""" + keyboard_play(MCEvaluationAgent, "Monte-Carlo value-estimation (first visit)", alpha=None) + keyboard_play(MCEvaluationAgent, "Monte-Carlo value-estimation (every visit)", alpha=None, first_visit=False) + + """ Control methods: + Play with the agents (using keyboard input) """ + keyboard_play(MCAgent, "Monte-Carlo control (first visit)", alpha=None) + keyboard_play(MCAgent, "Monte-Carlo control (every visit)", alpha=None, first_visit=False) + keyboard_play(QAgent, "Q-learning") + + """ These agents also accept keyboard input, but they are not guaranteed to update the Q-values correctly because the next state A' (in Suttons notation) + is generated in the train() method; i.e. we cannot easily overwrite it using the keyboard. I have included them for completeness, but + be a little careful with them. """ + keyboard_play(SarsaAgent, "Sarsa") + keyboard_play(SarsaNAgent, "n-step Sarsa (n=8)", n=8) + keyboard_play(SarsaLambdaAgent, "Sarsa(Lambda)") + + """ Bonus keyboard input agents: These agents implement the same methods as their counterparts above, however they 'wait' with updating + Q(S_t, A_t) until time t+1 when the (actual) next action A_{t+1} is available. This means that when they are used in conjunction with keyboard inputs, + the Q-values will be updated correctly since we can actually set A_{t+1} equal to the keyboard input. + This also mean the updates to the Q-values appear to lag one step behind the methods above. + I have included them in the case some find it useful to test the Q-values using the keyboard, + however, the implementations/delay-idea is not part of the exam pensum: only use them if you find them useful for studying, and otherwise just rely on the + description of the methods in the lecture material. + """ + keyboard_play(SarsaDelayNAgent, "Sarsa (delayed)", n=1) # We use that Sarsa is equal to n-step sarsa with n=1. 
+ keyboard_play(SarsaDelayNAgent, "n-step Sarsa (n=8, delayed)", n=8) + keyboard_play(SarsaLambdaDelayAgent, "Sarsa(Lambda) (delayed)") diff --git a/irlc/gridworld/__init__.py b/irlc/gridworld/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a56057c84d0ceac54aab1d40ba0f370c77fe10be --- /dev/null +++ b/irlc/gridworld/__init__.py @@ -0,0 +1 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. diff --git a/irlc/gridworld/__pycache__/__init__.cpython-311.pyc b/irlc/gridworld/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..51fdb1aad155ed02d6e2e424253ad9f79f46fc88 Binary files /dev/null and b/irlc/gridworld/__pycache__/__init__.cpython-311.pyc differ diff --git a/irlc/gridworld/__pycache__/gridworld_environments.cpython-311.pyc b/irlc/gridworld/__pycache__/gridworld_environments.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df934112e5c0902a93095143b4af40c837a6e60b Binary files /dev/null and b/irlc/gridworld/__pycache__/gridworld_environments.cpython-311.pyc differ diff --git a/irlc/gridworld/__pycache__/gridworld_graphics_display.cpython-311.pyc b/irlc/gridworld/__pycache__/gridworld_graphics_display.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7e17825632aa1704e500dad60aa5ca905581874d Binary files /dev/null and b/irlc/gridworld/__pycache__/gridworld_graphics_display.cpython-311.pyc differ diff --git a/irlc/gridworld/__pycache__/gridworld_mdp.cpython-311.pyc b/irlc/gridworld/__pycache__/gridworld_mdp.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..402d55ea6f792f614d4f20b2f76dd75b30781609 Binary files /dev/null and b/irlc/gridworld/__pycache__/gridworld_mdp.cpython-311.pyc differ diff --git a/irlc/gridworld/demo_agents/__init__.py 
class ValueIterationAgent2(TabularAgent):
    """Demo agent that performs one synchronous value-iteration sweep per ``train`` call.

    State values are kept in ``self.v``. After each sweep ``self.Q`` is refreshed from
    the new values and ``self.delta`` holds the largest absolute change of any value.
    """

    def __init__(self, env, gamma=.99, epsilon=0, theta=1e-5, only_current_state=False):
        """
        :param env: Environment; the model is read from ``env.mdp`` and the states from ``env.P``.
        :param gamma: Discount factor.
        :param epsilon: Forwarded to ``TabularAgent``; ``pi`` compares it with a uniform random number.
        :param theta: Not used by this class.
        :param only_current_state: Stored on the instance; not read by this class.
        """
        self.v = defaultdict(lambda: 0)
        self.steps = 0
        self.mdp = env.mdp
        self.only_current_state = only_current_state
        super().__init__(env, gamma, epsilon=epsilon)

    def pi(self, s, k, info=None):
        # TODO: 2 lines missing.
        # The code computing the greedy action ``a`` has been removed here, so the
        # method raises and the return statement below is never reached.
        raise NotImplementedError("Implement function body")
        return self.random_pi(s) if np.random.rand() < self.epsilon else a

    @property
    def label(self):
        """Caption containing the number of sweeps performed so far."""
        return f"Value iteration after {self.steps} steps"

    def v2Q(self, s):  # used for rendering right now
        """Return the Q-values of state ``s`` computed from the current ``self.v``."""
        return value_function2q_function(self.mdp, s, self.gamma, self.v)

    def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
        """Perform one sweep over all states in ``self.env.P``.

        None of the arguments are read; the update is computed from the model alone.
        ``info_s`` is accepted so the signature matches the other agents in this file.
        """
        delta = 0
        new_v = {}
        for state in self.env.P.keys():
            old_value = self.v[state]
            if len(self.mdp.A(state)) > 0:
                new_v[state] = max(value_function2q_function(self.mdp, state, self.gamma, self.v).values())
            else:
                new_v[state] = 0  # no actions available in this state
            # Compare old and NEW value. Comparing ``self.v[state]`` with itself
            # (as was done before) always gives 0.
            delta = max(delta, np.abs(old_value - new_v[state]))

        self.v = new_v

        for state in self.mdp.nonterminal_states:
            q = self.v2Q(state)  # computed once per state instead of once per action
            for action in self.mdp.A(state):
                self.Q[state, action] = q[action]

        self.delta = delta
        self.steps += 1

    def __str__(self):
        return f"VIAgent_{self.gamma}"
def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
    """Perform one policy-evaluation step and, every k-th call, a policy-improvement step.

    If ``self.only_update_current`` is false, the values of all nonterminal states are
    recomputed under ``self.policy`` and ``self.Q`` is refreshed for all of them.
    Otherwise only ``self.Q[s, a]`` and ``self.v[s]`` are recomputed from the model.
    When ``self.steps_between_policy_improvement`` is not ``None`` and ``self.steps + 1``
    is a multiple of it, the policy is replaced by one that is uniform over the
    maximising actions of each state.

    :param s: Current state (read only in the single-state mode).
    :param a: Action taken in ``s`` (read only in the single-state mode).
    :param r: Not used; rewards are taken from ``self.mdp.Psr``.
    :param sp: Not used; successor states are taken from ``self.mdp.Psr``.
    :param done: Not used.
    :param info_s: Not used.
    :param info_sp: Not used.
    """
    if not self.only_update_current:
        new_v = {}
        for state in self.mdp.nonterminal_states:
            q = value_function2q_function(self.mdp, state, self.gamma, self.v)
            # Expected Q-value under the current policy (0 when there are no actions).
            new_v[state] = sum(qv * self.policy[state][action] for action, qv in q.items())

        # Q is refreshed from the values of the previous sweep, i.e. before self.v is overwritten.
        for state in self.mdp.nonterminal_states:
            for action, qv in self.v2Q(state).items():
                self.Q[state, action] = qv

        for state, value in new_v.items():
            self.v[state] = value
    else:
        # Only update Q-value in current state:
        q_sa = 0
        for (next_state, reward), prob in self.mdp.Psr(s, a).items():
            if self.mdp.is_terminal(next_state):
                expected_next_q = 0
            else:
                expected_next_q = sum(self.Q[next_state, ap] * pa for ap, pa in self.policy[next_state].items())
            # The successor term is discounted, matching the update of self.v below.
            q_sa += prob * (reward + self.gamma * expected_next_q)
        self.Q[s, a] = q_sa

        v_s = 0
        for action in self.mdp.A(s):
            for (next_state, reward), prob in self.mdp.Psr(s, action).items():
                v_s += self.policy[s][action] * (self.v[next_state] * self.gamma + reward) * prob
        self.v[s] = v_s

    if self.steps_between_policy_improvement is not None and (self.steps+1) % self.steps_between_policy_improvement == 0:
        self.policy = {}
        for state in self.mdp.nonterminal_states:
            q = value_function2q_function(self.mdp, state, self.gamma, self.v)
            if len(q) == 0:
                continue
            best = max(q.values())
            # Indicator of the maximising actions, normalised to a distribution below.
            self.policy[state] = {action: (1 if q[action] == best else 0) for action in self.mdp.A(state)}
            n = sum(self.policy[state].values())
            for action in self.policy[state]:
                self.policy[state][action] *= 1/n

        self.imp_steps += 1
    self.steps += 1
state: + # s = s_ + qq = value_function2q_function(self.mdp, s, self.gamma, self.v) + self.v[s] = max(qq.values()) + self.Q[s, a] = self.vi_q(s,a) + + + for s in self.mdp.nonterminal_states: + # q = qs_(self.mdp, s, self.gamma, self.v) + # if len(q) == 0: + # continue + # a_ = max(q, key=q.get) # optimal action + self.policy[s] = {} + qs = [self.Q[s,a] for a in self.mdp.A(s)] + + for a in self.mdp.A(s): + self.policy[s][a] = 1 if self.Q[s,a] >= max(qs)-1e-6 else 0 #if a == a_ else 0 + S = sum(self.policy[s].values()) + for a in self.mdp.A(s): + self.policy[s][a] = self.policy[s][a] / S + if not self.only_update_current: + self.v[s] = max([self.Q[s, a_] for a_ in self.mdp.A(s)]) + + self.steps += 1 + + def __str__(self): + return f"PIAgent_{self.gamma}" diff --git a/irlc/gridworld/gridworld_environments.py b/irlc/gridworld/gridworld_environments.py new file mode 100644 index 0000000000000000000000000000000000000000..d58b21b5c7546ff1281aa1905b00b3e65567fb9c --- /dev/null +++ b/irlc/gridworld/gridworld_environments.py @@ -0,0 +1,362 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +""" + +References: + [SB18] Richard S. Sutton and Andrew G. Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online). 
+""" +import numpy as np +from collections import defaultdict +from gymnasium.envs.toy_text.frozen_lake import FrozenLakeEnv +from gymnasium.spaces.discrete import Discrete +from irlc.ex09.mdp import MDP2GymEnv +from irlc.gridworld.gridworld_mdp import GridworldMDP, FrozenGridMDP +from irlc import Timer +from gymnasium.spaces.multi_discrete import MultiDiscrete +import pygame + +grid_cliff_grid = [[' ',' ',' ',' ',' ', ' ',' ',' ',' ',' ', ' '], + [' ',' ',' ',' ',' ', ' ',' ',' ',' ',' ', ' '], + ['S',-100, -100, -100, -100,-100, -100, -100, -100, -100, 0]] + +grid_cliff_grid2 = [[' ',' ',' ',' ',' '], + ['S',' ',' ',' ',' '], + [-100,-100, -100, -100, 0]] + +grid_discount_grid = [[' ',' ',' ',' ',' '], + [' ','#',' ',' ',' '], + [' ','#', 1,'#', 10], + ['S',' ',' ',' ',' '], + [-10,-10, -10, -10, -10]] + +grid_bridge_grid = [[ '#',-100, -100, -100, -100, -100, '#'], + [ 1, 'S', ' ', ' ', ' ', ' ', 10], + [ '#',-100, -100, -100, -100, -100, '#']] + +grid_book_grid = [[' ',' ',' ',+1], + [' ','#',' ',-1], + ['S',' ',' ',' ']] + +grid_maze_grid = [[' ',' ',' ', +1], + ['#','#',' ','#'], + [' ','#',' ',' '], + [' ','#','#',' '], + ['S',' ',' ',' ']] + +sutton_corner_maze = [[ 1, ' ', ' ', ' '], + [' ', ' ', ' ', ' '], + [' ', 'S', ' ', ' '], + [' ', ' ', ' ', 1]] + +# A big yafcport open maze. 
+grid_open_grid = [[' ']*8 for _ in range(5)] +grid_open_grid[0][0] = 'S' +grid_open_grid[-1][-1] = 1 + + +class GridworldEnvironment(MDP2GymEnv): + metadata = { + 'render.modes': ['human', 'rgb_array'], + 'video.frames_per_second': 1000, + } + def get_keys_to_action(self): + return {(pygame.K_LEFT,): GridworldMDP.WEST, (pygame.K_RIGHT,): GridworldMDP.EAST, + (pygame.K_UP,): GridworldMDP.NORTH, (pygame.K_DOWN,): GridworldMDP.SOUTH} + + # return {(key.LEFT,): GridworldMDP.WEST, (key.RIGHT,): GridworldMDP.EAST, (key.UP,): GridworldMDP.NORTH, (key.DOWN,): GridworldMDP.SOUTH} + + def _get_mdp(self, grid, uniform_initial_state=False): + return GridworldMDP(grid, living_reward=self.living_reward) + + def __init__(self, grid=None, uniform_initial_state=True, living_reward=0,zoom=1, view_mode=0, render_mode=None, print_states=False, + frames_per_second=None, + **kwargs): + self.print_states = print_states + self.living_reward = living_reward + mdp = self._get_mdp(grid) + self.render_mode = render_mode + super().__init__(mdp, render_mode=render_mode) + self.action_space = Discrete(4) + # self.observation_space = MultiDiscrete([mdp.height, mdp.width]) # N.b. the state space does not contain the terminal state. + self.render_episodes = 0 + self.render_steps = 0 + self.timer = Timer() + self.view_mode = view_mode + self.agent = None # If this is set, the environment will try to render the internal state of the agent. + # It is a little hacky, it allows us to make the visualizations etc. + # Set up rendering if required. + self.display_pygame = None + # self.screen = None + self.zoom = zoom # Save zoom level. 
def keypress(self, key):
    """Advance to the next view mode and re-render when the mode key is pressed.

    ``key`` may be an event object whose ``unicode`` attribute is ``'m'``, or a
    value that compares equal to 116. Anything else is ignored.

    :param key: Key event or key code.
    """
    # ``getattr`` keeps the second test reachable for values without a ``unicode``
    # attribute (e.g. a plain integer key code); reading ``key.unicode`` directly
    # would raise AttributeError for those.
    if getattr(key, 'unicode', None) == 'm' or key == 116:
        self.view_mode += 1
        self.render()
avail_modes.append("v") + elif v is not None: + avail_modes.append("v") + + if len(avail_modes) > 0: + self.view_mode = self.view_mode % len(avail_modes) + if avail_modes[self.view_mode] == 'v': + preferred_actions = None + + if v == None: + preferred_actions = {} + v = {s: max(Q.get_Qs(s)[1]) for s in self.mdp.nonterminal_states} + + for s in self.mdp.nonterminal_states: + acts, values = Q.get_Qs(s) + preferred_actions[s] = [a for (a,w) in zip(acts, values) if np.round(w, 2) == np.round(v[s], 2)] + + if v2Q is not None: + preferred_actions = {} + for s in self.mdp.nonterminal_states: + q = v2Q(s) + mv = np.round( max( q.values() ), 2) + preferred_actions[s] = [k for k, v in q.items() if np.round(v, 2) == mv] + + if agent != None and hasattr(agent, 'policy') and agent.policy is not None and state in agent.policy and isinstance(agent.policy[state], dict): + for s in self.mdp.nonterminal_states: + preferred_actions[s] = [a for a, v in agent.policy[s].items() if v == max(agent.policy[s].values()) ] + + if hasattr(agent, 'returns_count_N'): + returns_count = agent.returns_count_N + else: + returns_count = None + if hasattr(agent, 'returns_sum_S'): + returns_sum = agent.returns_sum_S + else: + returns_sum = None + + self.display_pygame.displayValues(mdp=self.mdp, v=v, preferred_actions=preferred_actions, currentState=state, message=label, returns_count=returns_count, returns_sum=returns_sum) + + elif avail_modes[self.view_mode] == 'Q': + + if hasattr(agent, 'e') and isinstance(agent.e, defaultdict): + eligibility_trace = defaultdict(float) + for k, v in agent.e.items(): + eligibility_trace[k] = v + + else: + eligibility_trace = None + + if hasattr(agent, 'returns_count_N'): + returns_count = agent.returns_count_N + elif hasattr(agent, 'returns_count'): + returns_count = agent.returns_count + else: + returns_count = None + if hasattr(agent, 'returns_sum_S'): + returns_sum = agent.returns_sum_S + elif hasattr(agent, 'returns_sum'): + returns_sum = agent.returns_sum + 
class CliffGridEnvironment(GridworldEnvironment):
    """Gridworld using the ``grid_cliff_grid`` layout with a default living reward of -1."""

    def __init__(self, *args, living_reward=-1, **kwargs):
        """
        :param args: Positional arguments forwarded to ``GridworldEnvironment`` after the grid.
        :param living_reward: Forwarded to ``GridworldEnvironment``. Exposed as a keyword
            so callers can override it; previously it was hard-coded in the ``super()`` call
            and passing ``living_reward`` raised ``TypeError``.
        :param kwargs: Keyword arguments forwarded to ``GridworldEnvironment``.
        """
        super().__init__(grid_cliff_grid, *args, living_reward=living_reward, **kwargs)
class FrozenLake(GridworldEnvironment):
    """Gridworld whose layout is copied from gymnasium's ``FrozenLakeEnv``.

    Tiles 'F', 'G' and 'H' of the gymnasium map become ' ', 1 and 0 respectively;
    every other symbol is kept as it is.
    """

    def __init__(self, is_slippery=True, living_reward=0, *args, **kwargs):
        self.is_slippery = is_slippery  # read by _get_mdp, which the base constructor calls
        # Load frozen-lake game layout and convert to our format 'grid'
        reference_env = FrozenLakeEnv(is_slippery=is_slippery)
        tile_map = {'F': ' ', 'G': 1, 'H': 0}
        grid = []
        for row in reference_env.desc.tolist():
            converted = []
            for raw in row:
                symbol = raw.decode("ascii")
                converted.append(tile_map.get(symbol, symbol))
            grid.append(converted)
        reference_env.close()
        super().__init__(grid=grid, *args, living_reward=living_reward, **kwargs)

    def _get_mdp(self, grid, uniform_initial_state=False):
        """Build the ``FrozenGridMDP`` for ``grid`` using this instance's slipperiness and living reward."""
        return FrozenGridMDP(grid, is_slippery=self.is_slippery, living_reward=self.living_reward)
Agent, train + from irlc.ex11.q_agent import QAgent + from irlc.ex11.sarsa_agent import SarsaAgent + # env = SuttonMazeEnvironment(render_mode="human", zoom=0.75) + # env = OpenGridEnvironment(render_mode='human', zoom=0.75) + # env = OpenGridEnvironment() + env = CliffGridEnvironment() + agent = QAgent(env) + # env, agent = interactive(env, QAgent(env)) + # stats, trajectories = train(env, agent, num_episodes=100, experiment_name='q_learning') + stats, trajectories = train(env, SarsaAgent(env), num_episodes=100, experiment_name='sarsa') + + from irlc import main_plot + main_plot(experiments=['q_learning', 'sarsa']) + from matplotlib import pyplot as plt + plt.show() + # from irlc import VideoMonitor, train, Agent, PlayWrapper + # agent = Agent(env) + env.reset() + env.close() + + # agent = PlayWrapper(agent, env) + # env = VideoMonitor(env) + # env = Video + + # a = 234 + # for r in range(100): + # import time + # env.reset() + # time.sleep(1) + # train(env, agent, 2000) + a = 234 + # env.step(0) diff --git a/irlc/gridworld/gridworld_graphics_display.py b/irlc/gridworld/gridworld_graphics_display.py new file mode 100644 index 0000000000000000000000000000000000000000..f8fda14053d55a2cc09aab97e650bc5edcb9d27a --- /dev/null +++ b/irlc/gridworld/gridworld_graphics_display.py @@ -0,0 +1,543 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +# gridworld_graphics_display.py +# --------------------------- +# Licensing Information: You are free to use or extend these projects for +# educational purposes provided that (1) you do not distribute or publish +# solutions, (2) you retain this notice, and (3) you provide clear +# attribution to UC Berkeley, including a link to http://ai.berkeley.edu. +# +# Attribution Information: The Pacman AI projects were developed at UC Berkeley. 
def getEndpoints(direction, position=(0, 0)):
    """Return the ``(start, end)`` angles of the mouth arc for a facing direction.

    The arc is centred on 180 for ``'West'``, 90 for ``'North'``, 270 for
    ``'South'`` and 0 for any other ``direction``. Its width is
    ``30 + 80 * sin(pi * frac)``, where ``frac`` is the summed fractional part
    of the two coordinates in ``position``.

    :param direction: Facing direction; unrecognised values use centre 0.
    :param position: ``(x, y)`` position used to compute the arc width.
    :return: Tuple ``(centre + width / 2, centre - width / 2)``.
    """
    x, y = position
    frac = x - int(x) + y - int(y)
    half_width = (30 + 80 * math.sin(math.pi * frac)) / 2
    centres = (('West', 180), ('North', 90), ('South', 270))
    # First matching label wins; anything else falls back to centre 0.
    centre = next((angle for label, angle in centres if direction == label), 0)
    return (centre + half_width, centre - half_width)
def annotate_text(self, state, symbol='o', color=(200,50, 50), dx=0, dy=0, action=None, fontsize=30, bold=False):
    """Queue a text annotation for the cell ``state``.

    The annotation is appended to ``self.annotations``; it is rendered later
    by ``draw_annotation``.

    :param state: Grid state, converted to screen coordinates with ``to_screen``.
    :param symbol: Text stored as the annotation message.
    :param color: Colour stored with the annotation.
    :param dx: Horizontal offset in units of ``GRID_SIZE``.
    :param dy: Vertical offset in units of ``GRID_SIZE``.
    :param action: Optional gridworld action; when given, the text is shifted
        a further 0.2 cells in that action's screen direction.
    :param fontsize: Font size stored with the annotation.
    :param bold: Bold flag stored with the annotation.
    """
    px, py = self.to_screen(state)
    px += int(dx * self.GRID_SIZE)
    py += int(dy * self.GRID_SIZE)
    if action is not None:
        from irlc.gridworld.gridworld_mdp import GridworldMDP
        nudge = int(0.2 * self.GRID_SIZE)
        # Screen y grows downwards, so NORTH subtracts from y.
        shifts = ((GridworldMDP.NORTH, (0, -nudge)),
                  (GridworldMDP.SOUTH, (0, nudge)),
                  (GridworldMDP.EAST, (nudge, 0)),
                  (GridworldMDP.WEST, (-nudge, 0)))
        shift_x, shift_y = next((delta for candidate, delta in shifts if action == candidate), (0, 0))
        px += shift_x
        py += shift_y

    record = {'type': 'text', 'x': px, 'y': py, 'message': symbol, 'color': color, 'fontsize': fontsize, 'bold': bold}
    self.annotations.append(record)
def displayNullValues(self, mdp, currentState=None, message=''):
    """Draw the grid without value information, plus the player and a caption.

    :param mdp: MDP providing ``grid``, ``width`` and ``height``.
    :param currentState: Current state; the player is drawn only when this is a tuple.
    :param message: Caption text drawn via ``self.ga.text``.
    """
    self.ga.draw_background()
    cells = mdp.grid
    for x in range(mdp.width):
        for y in range(mdp.height):
            cell = cells[x, y]
            tag = f"sq_{x}_{y}"
            is_current = currentState == (x, y)
            if cell == '#':
                self.drawSquare(tag, x, y, 0, 0, 0, None, None, True, False, is_current)
                continue
            # A cell whose content is not a string is treated as an exit square.
            is_exit = str(cell) != cell
            self.drawNullSquare(tag, mdp.grid, x, y, False, is_exit, is_current)

    caption_pos = self.to_screen(((mdp.width - 1.0) / 2.0, - 0.8))
    if isinstance(currentState, tuple):
        player_x, player_y = self.to_screen(currentState)
        self.draw_player((player_x, player_y), 0.12 * self.GRID_SIZE)

    self.ga.text("bottom_text", caption_pos, TEXT_COLOR, message, "Courier", -32, "bold", "c")
    self.draw_annotation()
def drawNullSquare(self, name, grid, x, y, isObstacle, isTerminal, isCurrent):
    """Draw one grid cell without value information.

    Draws a filled square, an outline, an inner outline for terminal
    non-obstacle cells, the cell's grid content as text, and finally the
    queued annotations.

    :param name: Prefix for the names of the drawn primitives.
    :param grid: Grid mapping ``(x, y)`` to the cell content shown as text.
    :param x: Grid x-coordinate of the cell.
    :param y: Grid y-coordinate of the cell.
    :param isObstacle: Use the obstacle colour and suppress the inner outline.
    :param isTerminal: Draw the inner outline (unless the cell is an obstacle).
    :param isCurrent: Not used by this method.
    """
    fill = getColor(0, -1, 1)
    if isObstacle:
        fill = OBSTACLE_COLOR
    centre = self.to_screen((x, y))
    half_cell = 0.5 * self.GRID_SIZE

    self.square(name + "_s1", centre, half_cell, color=fill, filled=1, width=1)
    self.square(name + "_s2", centre, half_cell, color=EDGE_COLOR, filled=0, width=3)
    if isTerminal and not isObstacle:
        self.square(name + "_s3", centre, 0.4 * self.GRID_SIZE, color=EDGE_COLOR, filled=0, width=2)

    label = str(grid[x,y])
    self.ga.text(name + "_text", centre, TEXT_COLOR, label, "Courier", -24, "bold", "c")
    self.draw_annotation()
else 0) ), text_color, valStr, "Courier", -30, "bold", "c") + + if returns_count is not None: + self.ga.text(name + "_rc", (screen_x-GRID_SIZE/3, screen_y+GRID_SIZE/7), RED_TEXT_COLOR, f"N(s)={int(returns_count)}", "Courier", -20, "bold", "w") + if returns_sum is not None: + self.ga.text(name + "_rs", (screen_x-GRID_SIZE/3, screen_y+2*GRID_SIZE/7), RED_TEXT_COLOR, f"S(s)={returns_sum:.2f}", "Courier", -20, "bold", "w") + + # if returns_count is not None: + # self.ga.text(name + "_rs", (screen_x, screen_y), text_color, valStr, "Courier", -30, "bold", "c") + + + def drawSquareQ(self, name, x, y, qVals, minVal, maxVal, valStrs, bestActions, isCurrent, eligibility_trace=None, returns_sum=None, returns_count=None): + + GRID_SIZE = self.GRID_SIZE + (screen_x, screen_y) = self.to_screen((x, y)) + center = (screen_x, screen_y) + nw = (screen_x - 0.5 * GRID_SIZE, screen_y - 0.5 * GRID_SIZE) + ne = (screen_x + 0.5 * GRID_SIZE, screen_y - 0.5 * GRID_SIZE) + se = (screen_x + 0.5 * GRID_SIZE, screen_y + 0.5 * GRID_SIZE) + sw = (screen_x - 0.5 * GRID_SIZE, screen_y + 0.5 * GRID_SIZE) + + n = (screen_x, screen_y - 0.5 * GRID_SIZE + 5) + s = (screen_x, screen_y + 0.5 * GRID_SIZE - 5) + w = (screen_x - 0.5 * GRID_SIZE + 5, screen_y) + e = (screen_x + 0.5 * GRID_SIZE - 5, screen_y) + + actions = qVals.keys() + for action in actions: + wedge_color = getColor(qVals[action], minVal, maxVal) + if action == GridworldMDP.NORTH: + self.ga.polygon(name + "_s1", (center, nw, ne), wedge_color, filled=1, smoothed=False) + if action == GridworldMDP.SOUTH: + self.ga.polygon(name + "_s2", (center, sw, se), wedge_color, filled=1, smoothed=False) + if action == GridworldMDP.EAST: + self.ga.polygon(name + "_s3", (center, ne, se), wedge_color, filled=1, smoothed=False) + if action == GridworldMDP.WEST: + self.ga.polygon(name + "_s4", (center, nw, sw), wedge_color, filled=1, smoothed=False) + + self.square(name + "_base_square", (screen_x, screen_y), + 0.5 * GRID_SIZE, + color=EDGE_COLOR, + filled=0, 
+ width=3) + + self.ga.line(name + "_l1", ne, sw, color=EDGE_COLOR) + self.ga.line(name + "_l2", nw, se, color=EDGE_COLOR) + + for action in actions: + text_color = TEXT_COLOR + if qVals[action] < max(qVals.values()): text_color = MUTED_TEXT_COLOR + valStr = "" + if action in valStrs: + valStr = valStrs[action] + h = -20 # Font size (for reasons). + if eligibility_trace is not None: + estr = f'{eligibility_trace[action]:.2f}' + dh = 0.105 * GRID_SIZE + ECOL = RED_TEXT_COLOR if eligibility_trace[action] != 0 else getColor(qVals[action], minVal, maxVal) + esize = -16 + + NCOL = RED_TEXT_COLOR + NSIZE = int(GRID_SIZE/170 * 20) + S_str = '' + N_str = '' + + rca = None + if returns_sum is not None and returns_sum[action] is not None: + rca = returns_sum[action] + + rcc = None + if returns_count is not None and returns_count[action] is not None: + rcc = returns_count[action] + + if rca is not None: + S_str = f"S(s)={returns_sum[action]:.2f}" + if rcc is not None: + N_str = f"N(s)={int(returns_count[action])}" + dh = 0.105 * GRID_SIZE + + # self.ga.text(name + "_rc", (screen_x - GRID_SIZE / 3, screen_y + GRID_SIZE / 7), RED_TEXT_COLOR, + # f"N(s)={int(returns_count)}", "Courier", -20, "bold", "w") + # if returns_sum is not None: + # self.ga.text(name + "_rs", (screen_x - GRID_SIZE / 3, screen_y + 2 * GRID_SIZE / 7), RED_TEXT_COLOR, + # f"S(s)={returns_sum:.2f}", "Courier", -20, "bold", "w") + # dw = 0.095 * GRID_SIZE + + if action == GridworldMDP.NORTH: + self.ga.text(name + "_txt1", n, text_color, valStr, "Courier", h, "bold", "n") + if eligibility_trace is not None: + self.ga.text(name + "_txt1e", (n[0], n[1]+dh), ECOL, estr, "Courier", esize, "bold", "n") + if rca is not None: + self.ga.text(f"{name}_txt_s{action}", (n[0], n[1] + dh), NCOL, S_str, "Courier", 10, "bold", "n",fontsize=NSIZE) + if rcc is not None: + self.ga.text(f"{name}_txt_n{action}", (n[0], n[1] + 2*dh), NCOL, N_str, "Courier", 2, "bold", "n",fontsize=NSIZE) + + + if action == GridworldMDP.SOUTH: + 
def square(self, name, pos, size, color, filled, width):
    """Draw an axis-aligned square centred at ``pos`` via ``self.ga.polygon``.

    :param name: Name passed to the polygon primitive.
    :param pos: ``(x, y)`` centre in screen coordinates.
    :param size: Half the side length of the square.
    :param color: Used for both the outline and the fill colour.
    :param filled: Fill flag forwarded to ``polygon``.
    :param width: Outline width forwarded to ``polygon``.
    :return: Whatever ``self.ga.polygon`` returns.
    """
    cx, cy = pos
    x_lo, x_hi = cx - size, cx + size
    y_lo, y_hi = cy - size, cy + size
    corners = [(x_lo, y_lo), (x_lo, y_hi), (x_hi, y_hi), (x_hi, y_lo)]
    return self.ga.polygon(name, corners, outlineColor=color, fillColor=color,
                           filled=filled, width=width, smoothed=False, closed=True)
class GridworldMDP(MDP):
    """MDP for a rectangular gridworld given as a list of rows.

    Each cell of ``grid`` is ``'S'`` (initial state), ``'#'`` (wall), a number
    (an exit square whose value is the reward for leaving it) or any other
    value (an ordinary cell). States are ``(x, y)`` tuples; the first row of
    ``grid`` has the largest ``y``.
    """
    TERMINAL = "Terminal state"
    NORTH = 0   # These are the four available actions.
    EAST = 1
    SOUTH = 2
    WEST = 3
    actions2labels = {NORTH: 'North',
                      SOUTH: 'South',
                      EAST: 'East',
                      WEST: 'West'}  # Useful for labelling purposes; otherwise serves no purpose.

    def __init__(self, grid, living_reward=0.0, noise=0.0):
        """Index the grid by ``(x, y)`` and locate the initial state.

        :param grid: List of rows; all rows are assumed to be as long as the first.
        :param living_reward: Reward for every transition out of a non-exit cell.
        :param noise: Probability mass split evenly between the two actions
            adjacent to the chosen one (``(action - 1) % 4`` and ``(action + 1) % 4``).
        """
        self.grid = {}
        self.height = len(grid)
        self.width = len(grid[0])
        initial_state = None  # Stays None when the grid contains no 'S' cell.
        for dy, line in enumerate(grid):
            y = self.height - dy - 1  # Row 0 of the input is the top of the world.
            for x, el in enumerate(line):
                self.grid[x, y] = el
                if el == 'S':
                    initial_state = (x, y)
        self.noise = noise
        self.living_reward = living_reward
        super().__init__(initial_state=initial_state)

    def _is_exit(self, state):
        """Return True when the cell at ``state`` holds a number (an exit square)."""
        # Exact type match: subclasses of int/float (e.g. bool) are not exits.
        return type(self.grid[state]) in (int, float)

    def A(self, state):
        """
        Returns the valid actions available in 'state'.

        You can try to go into walls (but will stay in your location)
        and when you are on the exit-squares (i.e., the ones with numbers), you have a single action available
        'North' which will take you to the terminal square.
        """
        if self._is_exit(state):
            return (self.NORTH,)
        return (self.NORTH, self.EAST, self.SOUTH, self.WEST)

    def is_terminal(self, state):
        """Return True only for the special ``TERMINAL`` state."""
        return state == self.TERMINAL

    def Psr(self, state, action):
        """Return the transition distribution ``{(next_state, reward): probability}``.

        On an exit square every action leads to ``TERMINAL`` with the cell's
        number as reward. Otherwise the chosen action is applied with
        probability ``1 - noise`` and each adjacent action with ``noise / 2``;
        the reward is ``living_reward``. Outcomes that coincide have their
        probabilities summed.
        """
        if self._is_exit(state):
            return {(self.TERMINAL, self.grid[state]): 1.}

        probabilities = defaultdict(float)
        # Exit squares returned above, so the reward here is always the living reward.
        r = self.living_reward
        for a, pr in [(action, 1-self.noise), ((action - 1) % 4, self.noise/2), ((action + 1) % 4, self.noise/2)]:
            sp = self.f(state, a)
            probabilities[(sp, r)] += pr
        return probabilities

    def f(self, state, action):
        """Return the cell reached by ``action`` from ``state``, or ``state`` if the move is illegal."""
        x, y = state
        nxt = {self.NORTH: (x, y+1),
               self.WEST: (x-1, y),
               self.EAST: (x+1, y),
               self.SOUTH: (x, y-1)}
        return nxt[action] if self._legal(nxt[action]) else state

    def _legal(self, state):
        """Return True when ``state`` lies on the grid and is not a wall."""
        return state in self.grid and self.grid[state] != "#"
super().__init__(grid, noise=2/3 if is_slippery else 0, living_reward=living_reward) diff --git a/irlc/lectures/__init__.py b/irlc/lectures/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a56057c84d0ceac54aab1d40ba0f370c77fe10be --- /dev/null +++ b/irlc/lectures/__init__.py @@ -0,0 +1 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. diff --git a/irlc/lectures/lec01/__init__.py b/irlc/lectures/lec01/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a56057c84d0ceac54aab1d40ba0f370c77fe10be --- /dev/null +++ b/irlc/lectures/lec01/__init__.py @@ -0,0 +1 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. diff --git a/irlc/lectures/lec01/lecture_01_car_random.py b/irlc/lectures/lec01/lecture_01_car_random.py new file mode 100644 index 0000000000000000000000000000000000000000..e1ffe55a94a4e5d38bc558b2d94a205400feea2b --- /dev/null +++ b/irlc/lectures/lec01/lecture_01_car_random.py @@ -0,0 +1,10 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +from irlc.car.car_model import CarEnvironment +from irlc.ex01.agent import train, Agent + +if __name__ == "__main__": + env = CarEnvironment(render_mode='human') + env.action_space.low[1] = 0 # To ensure we do not drive backwards. 
def ppacman():
    """Run 100 interactive Pacman episodes on 'mediumClassic' and print the last episode's reward."""
    # smallGrid
    pacman_env = PacmanEnvironment(layout='mediumClassic', render_mode='human')
    env, agent = interactive(pacman_env, Agent(pacman_env))
    episode_stats, _ = train(env, agent, num_episodes=100, verbose=False)
    last_episode = episode_stats[-1]
    print("Accumulated reward", last_episode['Accumulated Reward'])
    env.close()
+from irlc.ex01.agent import train, Agent +from irlc.ex04.model_pendulum import GymSinCosPendulumEnvironment + +if __name__ == "__main__": + env = GymSinCosPendulumEnvironment(Tmax=100, render_mode='human') + agent = Agent(env) + stats, _ = train(env, agent, num_episodes=1, verbose=False) + env.close() diff --git a/irlc/lectures/lec02/__init__.py b/irlc/lectures/lec02/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a56057c84d0ceac54aab1d40ba0f370c77fe10be --- /dev/null +++ b/irlc/lectures/lec02/__init__.py @@ -0,0 +1 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. diff --git a/irlc/lectures/lec02/lecture_02_dp_gridworld_short.py b/irlc/lectures/lec02/lecture_02_dp_gridworld_short.py new file mode 100644 index 0000000000000000000000000000000000000000..d2831e64a18df333041cb383950e84b0d4ebc289 --- /dev/null +++ b/irlc/lectures/lec02/lecture_02_dp_gridworld_short.py @@ -0,0 +1,8 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +from irlc.lectures.chapter1.dp_planning_agent import dp_visualization +from irlc.gridworld.gridworld_environments import FrozenLake + +if __name__ == "__main__": + env = FrozenLake(render_mode='human') + dp_visualization(env, N=4, num_episodes=10) + env.close() diff --git a/irlc/lectures/lec02/lecture_02_frozen_lake.py b/irlc/lectures/lec02/lecture_02_frozen_lake.py new file mode 100644 index 0000000000000000000000000000000000000000..3a91f818af94df9f8b5dd3d7cbb7ce4b4b211012 --- /dev/null +++ b/irlc/lectures/lec02/lecture_02_frozen_lake.py @@ -0,0 +1,13 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
+from irlc.gridworld.gridworld_environments import FrozenLake +from gymnasium.wrappers import TimeLimit +from irlc import Agent, interactive, train + +if __name__ == "__main__": + env = FrozenLake(is_slippery=True, living_reward=-1e-4, render_mode="human") + N = 40 + env, agent = interactive(env, Agent(env)) + env = TimeLimit(env, max_episode_steps=N) + num_episodes = 100 + train(env, agent, num_episodes=num_episodes) + env.close() diff --git a/irlc/lectures/lec02/lecture_02_frozen_long_slippery.py b/irlc/lectures/lec02/lecture_02_frozen_long_slippery.py new file mode 100644 index 0000000000000000000000000000000000000000..217929b2a325c160cebed3363f23c0f5733f0e84 --- /dev/null +++ b/irlc/lectures/lec02/lecture_02_frozen_long_slippery.py @@ -0,0 +1,8 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +from irlc.lectures.chapter1.dp_planning_agent import dp_visualization +from irlc.gridworld.gridworld_environments import FrozenLake + +if __name__ == "__main__": + env = FrozenLake(is_slippery=True, living_reward=-1e-4, render_mode='human') + dp_visualization(env, N=40, num_episodes=100) + env.close() diff --git a/irlc/lectures/lec02/lecture_02_keyboard_pacman_g1.py b/irlc/lectures/lec02/lecture_02_keyboard_pacman_g1.py new file mode 100644 index 0000000000000000000000000000000000000000..06aa7e90647228a87d84bab423163a462b7d10f6 --- /dev/null +++ b/irlc/lectures/lec02/lecture_02_keyboard_pacman_g1.py @@ -0,0 +1,23 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
def ppac(layout_str, name="pac"):
    """Run 5 interactive Pacman episodes (at most 8 steps each) on ``layout_str``.

    :param layout_str: Layout description passed to ``PacmanEnvironment``.
    :param name: Not used by this function.
    """
    pacman_env = PacmanEnvironment(layout=None, layout_str=layout_str, animate_movement=True)
    env, agent = interactive(pacman_env, Agent(pacman_env))
    episode_stats, _ = train(env, agent, num_episodes=5, max_steps=8)
    rewards = []
    for episode in episode_stats:
        rewards.append(episode['Accumulated Reward'])
    print("Accumulated reward for all episodes:", rewards)
    env.close()
def simulate_1_game(layout_str):
    """Plan with ``DPPacmanModel`` and run 100 interactive episodes on ``layout_str``.

    The planning horizon and the per-episode step limit are both 30.

    :param layout_str: Layout description passed to ``PacmanEnvironment``.
    """
    horizon = 30
    env = PacmanEnvironment(layout=None, layout_str=layout_str, render_mode='human')

    planner = DPPacmanModel(env, N=horizon, verbose=True)
    env, agent = interactive(env, DynamicalProgrammingAgent(env, model=planner))
    # Wrapping order: the step limit first, then the win wrapper outermost.
    env = PacmanWinWrapper(TimeLimit(env, max_episode_steps=horizon))
    train(env, agent, num_episodes=100, verbose=False, return_trajectory=True)
    env.close()
+# from irlc.pacman.pacman_environment import GymPacmanEnvironment +# from irlc.ex02.dp_agent import DynamicalProgrammingAgent +# from gym.wrappers import TimeLimit +# from irlc.pacman.pacman_environment import PacmanWinWrapper +# from irlc.ex01.agent import train +# # from irlc import VideoMonitor +from irlc.lectures.chapter3dp.dp_pacman import DPPacmanModel, SS1tiny +from irlc import interactive +from irlc.lectures.lec02.lecture_02_optimal_dp_g0 import simulate_1_game + +# def simulate_1_game(layout_str): +# N = 8 +# env = GymPacmanEnvironment(layout=None, layout_str=layout_str, animate_movement=True) +# env = VideoMonitor(env, fps=3) +# model = DPPacmanModel(env, N=N, verbose=True) +# agent = DynamicalProgrammingAgent(env, model=model) +# agent = PlayWrapper(agent, env) +# env = TimeLimit(env, max_episode_steps=N) +# env = PacmanWinWrapper(env) +# stats, trajectories = train(env, agent, num_episodes=100, verbose=False, return_trajectory=True) +# env.close() + +if __name__ == "__main__": + simulate_1_game(layout_str=SS1tiny) diff --git a/irlc/lectures/lec02/lecture_02_optimal_dp_g2.py b/irlc/lectures/lec02/lecture_02_optimal_dp_g2.py new file mode 100644 index 0000000000000000000000000000000000000000..32c4b590116bcfd1c2eb52e55efb8fd1832dd371 --- /dev/null +++ b/irlc/lectures/lec02/lecture_02_optimal_dp_g2.py @@ -0,0 +1,6 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
+from irlc.lectures.chapter3dp.dp_pacman import SS2tiny +from irlc.lectures.lec02.lecture_02_optimal_dp_g1 import simulate_1_game + +if __name__ == "__main__": + simulate_1_game(layout_str=SS2tiny) diff --git a/irlc/lectures/lec03/__init__.py b/irlc/lectures/lec03/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a56057c84d0ceac54aab1d40ba0f370c77fe10be --- /dev/null +++ b/irlc/lectures/lec03/__init__.py @@ -0,0 +1 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. diff --git a/irlc/lectures/lec03/ex_03_search.py b/irlc/lectures/lec03/ex_03_search.py new file mode 100644 index 0000000000000000000000000000000000000000..7d5ce2ca57e2fb179f264be0f519d6d334287b12 --- /dev/null +++ b/irlc/lectures/lec03/ex_03_search.py @@ -0,0 +1,18 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
if __name__ == "__main__":
    # Render each search layout once and save it as a PDF figure.
    # The second figure previously re-used layout1 although it is saved as
    # "ex03_layout2"; it now uses the imported layout2.
    for figure_name, layout in (("ex03_layout1", layout1), ("ex03_layout2", layout2)):
        env = PacmanEnvironment(layout_str=layout, render_mode='human')
        env.reset()
        savepdf(figure_name, env=env)
        env.close()
+from irlc.lectures.lec03.lecture_03_dotsearch_dp import singledot +from irlc.lectures.chapter4search.yield_version.pacman_yield import AStarAgentYield +from irlc.ex03multisearch.pacman_problem_positionsearch_astar import manhattanHeuristic + +if __name__ == "__main__": + agent_args = dict(heuristic=manhattanHeuristic) + singledot(SAgent=AStarAgentYield, agent_args=agent_args) diff --git a/irlc/lectures/lec03/lecture_03_dotsearch_bfs.py b/irlc/lectures/lec03/lecture_03_dotsearch_bfs.py new file mode 100644 index 0000000000000000000000000000000000000000..2fafd77ced41a6c50ad917927cc70801ea29061a --- /dev/null +++ b/irlc/lectures/lec03/lecture_03_dotsearch_bfs.py @@ -0,0 +1,9 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +from irlc.lectures.lec03.lecture_03_dotsearch_dp import singledot +from irlc.lectures.chapter4search.yield_version.pacman_yield import BFSAgentYield + +if __name__ == "__main__": + # agent_args = dict(heuristic=manhattanHeuristic,N=30) + singledot(SAgent=BFSAgentYield) + + # singledot(SAgent=BFSAgentYield) diff --git a/irlc/lectures/lec03/lecture_03_dotsearch_dfs.py b/irlc/lectures/lec03/lecture_03_dotsearch_dfs.py new file mode 100644 index 0000000000000000000000000000000000000000..276aa6bee3f60db8a9172d3dab1ba3ee463918f4 --- /dev/null +++ b/irlc/lectures/lec03/lecture_03_dotsearch_dfs.py @@ -0,0 +1,9 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
def singledot(layout='smallMaze', SAgent=None, agent_args=None, layout_str=None):
    """Run ``stest`` on a position-search problem with the given search agent.

    :param layout: Name of the layout forwarded to ``stest``.
    :param SAgent: Search-agent class forwarded to ``stest``.
    :param agent_args: Optional keyword arguments for the agent, forwarded as-is.
    :param layout_str: Optional text layout forwarded to ``stest``.
    """
    position_problem = GymPositionSearchProblem()
    stest(
        layout=layout,
        layout_str=layout_str,
        SAgent=SAgent,
        prob=position_problem,
        agent_args=agent_args,
        zoom=2,
        **dargs,
        fps=30,
    )
def gminmax(layout='smallClassic', layout_str=layout_str, Agent=None, depth=3, **kwargs):
    """Run 30 episodes of a depth-limited multi-agent search agent on Pacman.

    :param layout: Layout name passed to ``GymPacmanEnvironment``.
    :param layout_str: Text layout passed to ``GymPacmanEnvironment``; defaults to
        the module-level ``layout_str``.
    :param Agent: Agent class; it is called as ``Agent(env, depth=depth)``.
    :param depth: Search depth handed to the agent.
    :param kwargs: Extra keyword arguments forwarded to ``GymPacmanEnvironment``.
    """
    pacman_env = GymPacmanEnvironment(layout=layout, layout_str=layout_str, zoom=2, **kwargs)
    search_agent = Agent(pacman_env, depth=depth)
    from irlc import PlayWrapper
    wrapped_agent = PlayWrapper(search_agent, pacman_env)

    monitored_env = VideoMonitor(pacman_env, agent=wrapped_agent, agent_monitor_keys=(), fps=10)
    train(monitored_env, wrapped_agent, num_episodes=30)
    monitored_env.close()
0000000000000000000000000000000000000000..ac1e0953cd661d8ae9d0c859b6470ff7b28af798 --- /dev/null +++ b/irlc/lectures/lec03/lecture_03_squaresearch_bfs.py @@ -0,0 +1,12 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +from irlc.lectures.chapter4search.yield_version.pacman_yield import BFSAgentYield +from irlc.lectures.chapter4search.search_tables import s_large + +# def tricksearchdot(layout='trickySearch', SAgent=None, agent_args=None, layout_str=None): +# stest(layout_str=layout_str, SAgent=SAgent, prob=GymFoodSearchProblem(), agent_args=agent_args, zoom=2, **dargs, fps=1000) # part 3 + +from irlc.lectures.lec03.lecture_03_tricksearch_bfs import tricksearchdot + +if __name__ == "__main__": + # agent_args = dict(heuristic=manhattanHeuristic,N=30) + tricksearchdot(SAgent=BFSAgentYield, agent_args=None, layout_str=s_large) diff --git a/irlc/lectures/lec03/lecture_03_tricksearch_astar.py b/irlc/lectures/lec03/lecture_03_tricksearch_astar.py new file mode 100644 index 0000000000000000000000000000000000000000..6c658491e286be118f4d82e429add412cd680b40 --- /dev/null +++ b/irlc/lectures/lec03/lecture_03_tricksearch_astar.py @@ -0,0 +1,10 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
+# from irlc.ex03.pacsearch_agents import GymPositionSearchProblem, manhattanHeuristic, GymCornersProblem, cornersHeuristic, foodHeuristic, GymFoodSearchProblem, GymAnyFoodSearchProblem +from irlc.lectures.chapter4search.yield_version.pacman_yield import AStarAgentYield + +from irlc.lectures.lec03.lecture_03_tricksearch_bfs import tricksearchdot +from irlc.ex03multisearch.pacman_problem_foodsearch_astar import foodHeuristic + +if __name__ == "__main__": + agent_args = dict(heuristic=foodHeuristic) + tricksearchdot(SAgent=AStarAgentYield, agent_args=agent_args) diff --git a/irlc/lectures/lec03/lecture_03_tricksearch_bfs.py b/irlc/lectures/lec03/lecture_03_tricksearch_bfs.py new file mode 100644 index 0000000000000000000000000000000000000000..89b776456aa03640d6e75721ff7804ca3dbf8b6a --- /dev/null +++ b/irlc/lectures/lec03/lecture_03_tricksearch_bfs.py @@ -0,0 +1,21 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +from irlc.lectures.chapter4search.yield_version.pacman_yield import stest, dargs +from irlc.ex03multisearch.pacman_problem_foodsearch import GymFoodSearchProblem +from irlc.lectures.chapter4search.yield_version.pacman_yield import BFSAgentYield + +layout_str = """ +%%%%%%%%%%%% +% % % +%.%.%.%% % % +% P % % +%%%%%%%%%% % +%. 
def tricksearchdot(layout_str=layout_str, SAgent=None, agent_args=None):
    """Run ``stest`` on a food-search problem with the given search agent.

    :param layout_str: Text layout to search; defaults to the module-level ``layout_str``.
    :param SAgent: Search-agent class forwarded to ``stest``.
    :param agent_args: Optional keyword arguments for the agent, forwarded as-is.
    """
    food_problem = GymFoodSearchProblem()
    stest(
        layout_str=layout_str,
        SAgent=SAgent,
        prob=food_problem,
        agent_args=agent_args,
        zoom=2,
        **dargs,
        fps=1000,
    )
file mode 100644 index 0000000000000000000000000000000000000000..17e5e5fbd204f4f1c8bf240b166ab0a318db4744 Binary files /dev/null and b/irlc/lectures/lec03/snapshot_base/openaigym.video.0.8068.video000000.mp4 differ diff --git a/irlc/lectures/lec04/__init__.py b/irlc/lectures/lec04/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a56057c84d0ceac54aab1d40ba0f370c77fe10be --- /dev/null +++ b/irlc/lectures/lec04/__init__.py @@ -0,0 +1 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. diff --git a/irlc/lectures/lec04/lecture_04_car_basic_pid.py b/irlc/lectures/lec04/lecture_04_car_basic_pid.py new file mode 100644 index 0000000000000000000000000000000000000000..8ed6d96ae433dfaefa0b74955808c02a544d5930 --- /dev/null +++ b/irlc/lectures/lec04/lecture_04_car_basic_pid.py @@ -0,0 +1,20 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +# from irlc.ex04.pid_lunar import lunar_single_mission, get_lunar_lander +# import gym +from irlc import train +from irlc.car.car_model import CarEnvironment +from irlc.ex04.pid_car import PIDCarAgent +from irlc import savepdf +from irlc import interactive, Agent + +if __name__ == "__main__": + env = CarEnvironment(noise_scale=0, Tmax=30, max_laps=1, render_mode='human') + agent = PIDCarAgent(env, v_target=.2, use_both_x5_x3=False) + stats, trajectories = train(env, agent, num_episodes=1, return_trajectory=True) + env.close() + + + + + # env = CarEnvironment(noise_scale=0,Tmax=30, max_laps=1, render_mode='human') + # agent = PIDCarAgent(env, v_target=1, use_both_x5_x3=True) # I recommend lowering v_target to make the problem simpler. 
diff --git a/irlc/lectures/lec04/lecture_04_cartpole_A.py b/irlc/lectures/lec04/lecture_04_cartpole_A.py new file mode 100644 index 0000000000000000000000000000000000000000..3f4a2899db5fb22821b12f645f4381f547e85eb9 --- /dev/null +++ b/irlc/lectures/lec04/lecture_04_cartpole_A.py @@ -0,0 +1,10 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +from irlc import train +from irlc.ex04.pid_cartpole import PIDCartpoleAgent, get_offbalance_cart + +if __name__ == "__main__": + env = get_offbalance_cart(30) + agent = PIDCartpoleAgent(env, dt=env.dt, Kp=120, Ki=0, Kd=10, balance_to_x0=False) + # agent = PlayWrapper(agent, env) + _, trajectories = train(env, agent, num_episodes=1, reset=False) + env.close() diff --git a/irlc/lectures/lec04/lecture_04_cartpole_B.py b/irlc/lectures/lec04/lecture_04_cartpole_B.py new file mode 100644 index 0000000000000000000000000000000000000000..a57e0950c43dd941464534354d377206b719097a --- /dev/null +++ b/irlc/lectures/lec04/lecture_04_cartpole_B.py @@ -0,0 +1,14 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +from irlc import train +from irlc.ex04.pid_cartpole import PIDCartpoleAgent, get_offbalance_cart + +if __name__ == "__main__": + """ + Second task: We will now also try to bring the cart towards x=0. + """ + env = get_offbalance_cart(30) + agent = PIDCartpoleAgent(env, env.dt, ...) + # TODO: 1 lines missing. + raise NotImplementedError("Define your agent here (including parameters)") + _, trajectories = train(env, agent, num_episodes=1, reset=False) # Note reset=False to maintain initial conditions. 
class NullAgent(Agent):
    """Agent that ignores its input and always outputs the action ``[0]``."""

    def pi(self, x, k, info=None):
        """Return a one-element integer array holding 0, regardless of ``x``, ``k`` or ``info``."""
        return np.zeros(1, dtype=int)
+ + agent = get_lunar_lander(env) + # agent = PlayWrapper(agent, env) + # env = VideoMonitor(env) + + stats, traj = train(env, agent, return_trajectory=True, num_episodes=10) + env.close() diff --git a/irlc/lectures/lec04/lecture_04_pendulum_random.py b/irlc/lectures/lec04/lecture_04_pendulum_random.py new file mode 100644 index 0000000000000000000000000000000000000000..58d084308b202ff91b5d7b4a332904b1f88979f6 --- /dev/null +++ b/irlc/lectures/lec04/lecture_04_pendulum_random.py @@ -0,0 +1,8 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +from irlc import Agent, train +from irlc.ex04.model_pendulum import GymSinCosPendulumEnvironment + +if __name__ == "__main__": + env = GymSinCosPendulumEnvironment(Tmax=20, render_mode='human') + train(env, Agent(env), num_episodes=1) + env.close() diff --git a/irlc/lectures/lec04/lecture_04_pid_d.py b/irlc/lectures/lec04/lecture_04_pid_d.py new file mode 100644 index 0000000000000000000000000000000000000000..8b05ff10e27da1bb65508aaf74b573559f5c15fd --- /dev/null +++ b/irlc/lectures/lec04/lecture_04_pid_d.py @@ -0,0 +1,5 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +from irlc.lectures.lec04.lecture_04_pid_p import pidplot + +if __name__ == "__main__": + pidplot(Kp=40, Kd=100, Ki=0) diff --git a/irlc/lectures/lec04/lecture_04_pid_iA.py b/irlc/lectures/lec04/lecture_04_pid_iA.py new file mode 100644 index 0000000000000000000000000000000000000000..fa350611daa8148a817edf063eae1d525b1f55ef --- /dev/null +++ b/irlc/lectures/lec04/lecture_04_pid_iA.py @@ -0,0 +1,6 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
+from irlc.lectures.lec04.lecture_04_pid_p import pidplot + + +if __name__ == "__main__": + pidplot(Kp=40, Kd=50, Ki=0, slope=2, target=0) diff --git a/irlc/lectures/lec04/lecture_04_pid_iB.py b/irlc/lectures/lec04/lecture_04_pid_iB.py new file mode 100644 index 0000000000000000000000000000000000000000..9fda178a6a643fc20b606d10f1705d042dd1146d --- /dev/null +++ b/irlc/lectures/lec04/lecture_04_pid_iB.py @@ -0,0 +1,6 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +from irlc.lectures.lec04.lecture_04_pid_p import pidplot + + +if __name__ == "__main__": + pidplot(Kp=40, Kd=50, Ki=10, slope=2, target=0) diff --git a/irlc/lectures/lec04/lecture_04_pid_p.py b/irlc/lectures/lec04/lecture_04_pid_p.py new file mode 100644 index 0000000000000000000000000000000000000000..ed3eb6b95ec5d348594a7f2d03d28b38428cbab7 --- /dev/null +++ b/irlc/lectures/lec04/lecture_04_pid_p.py @@ -0,0 +1,19 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
def pidplot(Kp=40, Kd=0, Ki=0, slope=0, target=0):
    """Simulate one episode of the locomotive controlled by a PID agent.

    The environment is created with ``m=70``, ``dt=0.04`` and ``Tmax=20`` and
    rendered with ``render_mode='human'``.

    :param Kp: Proportional gain passed to ``PIDLocomotiveAgent``.
    :param Kd: Derivative gain passed to ``PIDLocomotiveAgent``.
    :param Ki: Integral gain passed to ``PIDLocomotiveAgent``.
    :param slope: Slope passed to ``LocomotiveEnvironment``.
    :param target: Target passed to ``PIDLocomotiveAgent``. This argument was
        previously ignored (the agent always received ``target=0``).
    """
    dt = .04
    m = 70
    Tmax = 20
    env = LocomotiveEnvironment(m=m, slope=slope, dt=dt, Tmax=Tmax, render_mode='human')
    # Forward the caller's target instead of a hard-coded 0.
    agent = PIDLocomotiveAgent(env, dt=dt, Kp=Kp, Ki=Ki, Kd=Kd, target=target)
    train(env, agent, num_episodes=1)
    env.close()
+from irlc import Agent, train +from irlc.ex05.model_cartpole import GymSinCosCartpoleEnvironment + +if __name__ == "__main__": + + env = GymSinCosCartpoleEnvironment(Tmax=20, render_mode='human') + train(env, Agent(env), num_episodes=1) + env.close() diff --git a/irlc/lectures/lec05/lecture_05_cartpole_kelly.py b/irlc/lectures/lec05/lecture_05_cartpole_kelly.py new file mode 100644 index 0000000000000000000000000000000000000000..1bc3bcce247df45f93d74fdbb4cc731c8f7b539d --- /dev/null +++ b/irlc/lectures/lec05/lecture_05_cartpole_kelly.py @@ -0,0 +1,10 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +from irlc.ex05.direct_cartpole_kelly import compute_solutions +from irlc.ex05.direct_plot import plot_solutions +import matplotlib.pyplot as plt + +if __name__ == "__main__": + env, solutions = compute_solutions() + print("Did we succeed?", solutions[-1]['solver']['success']) + plot_solutions(env, solutions, animate=True, pdf=None, animate_all=True, animate_repeats=3) + env.close() diff --git a/irlc/lectures/lec05/lecture_05_cartpole_time.py b/irlc/lectures/lec05/lecture_05_cartpole_time.py new file mode 100644 index 0000000000000000000000000000000000000000..ebd6e873e6e2036dcd6f6e13c6ee24c124694813 --- /dev/null +++ b/irlc/lectures/lec05/lecture_05_cartpole_time.py @@ -0,0 +1,11 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
+from irlc.ex05.direct_cartpole_time import compute_solutions +from irlc.ex05.direct_plot import plot_solutions +import matplotlib.pyplot as plt + +if __name__ == "__main__": + env, solutions = compute_solutions() + print("Did we succeed?", solutions[-1]['solver']['success']) + plot_solutions(env, solutions, animate=True, pdf=None, animate_all=True, animate_repeats=3) + env.close() + pass diff --git a/irlc/lectures/lec06/__init__.py b/irlc/lectures/lec06/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a56057c84d0ceac54aab1d40ba0f370c77fe10be --- /dev/null +++ b/irlc/lectures/lec06/__init__.py @@ -0,0 +1 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. diff --git a/irlc/lectures/lec06/lecture6_lqr_locomotive.py b/irlc/lectures/lec06/lecture6_lqr_locomotive.py new file mode 100644 index 0000000000000000000000000000000000000000..2c9ddf3c2dec5a84476a66a11861d89a9ff08f7b --- /dev/null +++ b/irlc/lectures/lec06/lecture6_lqr_locomotive.py @@ -0,0 +1,37 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
+import matplotlib.pyplot as plt +import numpy as np +from irlc import savepdf, train +from irlc.ex04.pid_locomotive_agent import PIDLocomotiveAgent +from irlc.ex06.lqr_agent import LQRAgent +from irlc.ex04.model_harmonic import HarmonicOscilatorEnvironment +from irlc.ex06.boeing_lqr import compute_A_B_d, compute_Q_R_q +from irlc.ex07.linearization_agent import LinearizationAgent +from irlc.ex06.lqr_pid import ConstantLQRAgent +from irlc.ex04.locomotive import LocomotiveEnvironment +from irlc.ex04.pid_locomotive_agent import PIDLocomotiveAgent +from irlc.ex01.agent import train +from irlc.ex03.control_cost import SymbolicQRCost +import matplotlib +#matplotlib.use('qtagg') +dt = .04 +m = 70 +Tmax=10 +slope = 0 + +env = LocomotiveEnvironment(m=m, slope=slope, dt=dt, Tmax=Tmax, render_mode='human') + +model = env.discrete_model +model.cost = SymbolicQRCost(Q=np.eye(2)*100, R=np.eye(1)).discretize(dt=dt) +agent = LinearizationAgent(env, model=model, xbar=env.observation_space.sample(), ubar=env.action_space.sample()) +_, traj = train(env, agent, num_episodes=1) +env.close() +if False: + from irlc import plot_trajectory, savepdf + import matplotlib.pyplot as plt + plt.figure() + plot_trajectory(trajectory=traj[0], env=env, xkeys=[0, 1], ukeys=[]) + savepdf('lqr_pid_locomotive_state.pdf') + plot_trajectory(trajectory=traj[0], env=env, ukeys=[0], xkeys=[]) + savepdf('lqr_pid_locomotive_action.pdf') + env.close() diff --git a/irlc/lectures/lec06/lecture_06_cartpole_ilqr.py b/irlc/lectures/lec06/lecture_06_cartpole_ilqr.py new file mode 100644 index 0000000000000000000000000000000000000000..dc5633578e01195c85c72b5e6d29080d5029c038 --- /dev/null +++ b/irlc/lectures/lec06/lecture_06_cartpole_ilqr.py @@ -0,0 +1,47 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
def cartpole_experiment(N=12, use_linesearch=True, figex="", animate=True):
    """Run three episodes of an iLQR agent on the sin/cos cartpole environment.

    The global NumPy seed is reset to 2 at the start of every call. The
    environment uses ``Tmax=0.9`` and a time step of ``Tmax / N``.

    :param N: Number of discretization steps; also passed to ``ILQRAgent`` as ``N``.
    :param use_linesearch: Forwarded to ``ILQRAgent``.
    :param figex: Currently unused; kept so existing callers keep working.
    :param animate: If true the environment is created with ``render_mode='human'``,
        otherwise with ``render_mode=None``. This flag was previously ignored;
        it now mirrors how ``pen_experiment`` handles the same parameter.
    """
    np.random.seed(2)
    Tmax = .9
    dt = Tmax / N

    env = GymSinCosCartpoleEnvironment(dt=dt, Tmax=Tmax, supersample_trajectory=True,
                                       render_mode='human' if animate else None)
    agent = ILQRAgent(env, env.discrete_model, N=N, ilqr_iterations=200, use_linesearch=use_linesearch)
    # The returned statistics/trajectories are not used by this demo.
    train(env, agent, num_episodes=3, return_trajectory=True)
    env.close()


def plt_cartpole():
    """Run the animated cartpole experiment with ``N=50`` and line search enabled."""
    cartpole_experiment(N=50, use_linesearch=True, animate=True)
If this file contains other copyright notices disregard this text. +from irlc.ex07.linearization_agent import get_offbalance_cart + +if __name__ == "__main__": + env = get_offbalance_cart(waiting_steps=20, sleep_time=0.1) + env.close() diff --git a/irlc/lectures/lec06/lecture_06_linearize_b.py b/irlc/lectures/lec06/lecture_06_linearize_b.py new file mode 100644 index 0000000000000000000000000000000000000000..b582957f75442e2162edaccc09ff4af51cdfeb13 --- /dev/null +++ b/irlc/lectures/lec06/lecture_06_linearize_b.py @@ -0,0 +1,18 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +from irlc import plot_trajectory, train +from irlc.ex07.linearization_agent import get_offbalance_cart, LinearizationAgent +import numpy as np +import matplotlib +matplotlib.use("tkagg") +import matplotlib.pyplot as plt + + +if __name__ == "__main__": + np.random.seed(42) # I don't think these results are seed-dependent but let's make sure. + env = get_offbalance_cart(4, sleep_time=0.08) # Simulate for a little time to get an off-balance cart. Increase 4-->10 to get failure. + agent = LinearizationAgent(env, model=env.discrete_model, xbar=env.discrete_model.x_upright, ubar=env.action_space.sample()*0) + _, trajectories = train(env, agent, num_episodes=1, return_trajectory=True, reset=False) # Note reset=False to maintain initial conditions. + plt.figure() + plot_trajectory(trajectories[0], env, xkeys=[0, 2, 3], ukeys=[0]) + plt.show() + env.close() diff --git a/irlc/lectures/lec06/lecture_06_pendulum_bilqr_L.py b/irlc/lectures/lec06/lecture_06_pendulum_bilqr_L.py new file mode 100644 index 0000000000000000000000000000000000000000..e0cb2ca23b92b51c2b23db190b67ca172404f5de --- /dev/null +++ b/irlc/lectures/lec06/lecture_06_pendulum_bilqr_L.py @@ -0,0 +1,7 @@ +# This file may not be shared/redistributed without permission. 
Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +import numpy as np +from irlc.lectures.lec06.lecture_06_pendulum_bilqr_ubar import pen_experiment + +if __name__ == "__main__": + np.random.seed(2) # (!) + pen_experiment(N=50, use_linesearch=False, use_ubar=False) diff --git a/irlc/lectures/lec06/lecture_06_pendulum_bilqr_ubar.py b/irlc/lectures/lec06/lecture_06_pendulum_bilqr_ubar.py new file mode 100644 index 0000000000000000000000000000000000000000..d61a8ddc17ba23ba3c440108f5eaa4dba70a8530 --- /dev/null +++ b/irlc/lectures/lec06/lecture_06_pendulum_bilqr_ubar.py @@ -0,0 +1,66 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +import numpy as np +from irlc.ex04.model_pendulum import GymSinCosPendulumEnvironment +from irlc.ex07.ilqr_agent import ILQRAgent +from irlc import train +from irlc import savepdf +import matplotlib.pyplot as plt + +Tmax = 3 +def pen_experiment(N=12, use_linesearch=True,figex="", animate=True, use_ubar=False): + dt = Tmax / N + env = GymSinCosPendulumEnvironment(dt, Tmax=Tmax, supersample_trajectory=True, render_mode='human' if animate else None) + agent = ILQRAgent(env, env.discrete_model, N=N, ilqr_iterations=200, use_linesearch=use_linesearch) + # if animate: + # env = VideoMonitor(env) + + if use_ubar: + agent.use_ubar = True + stats2, trajectories = train(env, agent, num_episodes=1, return_trajectory=True) + env.close() + + plot_pendulum_trajectory(trajectories[0], label=f'Use linesearch? {use_linesearch}. Use u-bar? 
{use_ubar}') + plt.legend() + plt.show() + + plt.figure(figsize=(6, 6)) + plt.semilogy(agent.J_hist, 'k.-') + plt.xlabel("iLQR Iterations") + plt.ylabel("Cost function estimate $J$") + # plt.title("Last value: {") + plt.grid() + # savepdf(f"irlc_pendulum_J_N{N}_{use_linesearch}{figex}") + plt.show() + # + # plt.show() + # xb = agent.xbar + # tb = np.arange(N+1)*dt + # plt.figure(figsize=(12, 6)) + # plt.plot(trajectories2[0].time, trajectories2[0].state[:,1], '-', label='Open-loop $\\bar{u}_k$') + # plt.plot(tb, xb[:,1], 'o-', label="iLQR prediction $\\bar{x}_k$") + # plt.grid() + # plt.legend() + # ev = "pendulum" + # savepdf(f"irlc_pendulum_theta_N{N}_{use_linesearch}{figex}") + # plt.show() + + ## Plot J + +# +def plot_pendulum_trajectory(traj, style='k-', label=None, action=False, **kwargs): + y = traj.state[:, 1] if not action else traj.action[:,0] + plt.plot(traj.time[:-1] if action else traj.time, y, style, label=label, **kwargs) + + plt.xlabel("Time/seconds") + if action: + plt.ylabel("Torque $u$") + else: + plt.ylabel("$\cos(\\theta)$") + plt.grid() + pass + +N = 50 + +if __name__ == "__main__": + np.random.seed(2) # (!) + pen_experiment(N=N, use_linesearch=False, use_ubar=True) diff --git a/irlc/lectures/lec06/lecture_06_pendulum_ilqr_L.py b/irlc/lectures/lec06/lecture_06_pendulum_ilqr_L.py new file mode 100644 index 0000000000000000000000000000000000000000..6e475bf6335497c98536f9850ce9934c53d2d4fd --- /dev/null +++ b/irlc/lectures/lec06/lecture_06_pendulum_ilqr_L.py @@ -0,0 +1,5 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
+if __name__ == "__main__": + from irlc.lectures.lec06.lecture_06_pendulum_bilqr_ubar import pen_experiment + N = 50 + pen_experiment(N=N, use_linesearch=True, use_ubar=False) diff --git a/irlc/lectures/lec06/lecture_06_pendulum_ilqr_ubar.py b/irlc/lectures/lec06/lecture_06_pendulum_ilqr_ubar.py new file mode 100644 index 0000000000000000000000000000000000000000..b44a35cc127904eee0bb318cab2b383388cd4110 --- /dev/null +++ b/irlc/lectures/lec06/lecture_06_pendulum_ilqr_ubar.py @@ -0,0 +1,5 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +if __name__ == "__main__": + from irlc.lectures.lec06.lecture_06_pendulum_bilqr_ubar import pen_experiment + N = 50 + pen_experiment(N=N, use_linesearch=True, use_ubar=True) diff --git a/irlc/lectures/lec07/__init__.py b/irlc/lectures/lec07/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a56057c84d0ceac54aab1d40ba0f370c77fe10be --- /dev/null +++ b/irlc/lectures/lec07/__init__.py @@ -0,0 +1 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. diff --git a/irlc/lectures/lec07/lecture_07_boing_lqr.py b/irlc/lectures/lec07/lecture_07_boing_lqr.py new file mode 100644 index 0000000000000000000000000000000000000000..7a140752a73aa016366a0c2cd371f66504c2c08e --- /dev/null +++ b/irlc/lectures/lec07/lecture_07_boing_lqr.py @@ -0,0 +1,19 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
+from irlc.ex04.model_boeing import BoeingEnvironment +from irlc.ex07.lqr_learning_agents import learning_lqr, learning_lqr_mpc, learning_lqr_mpc_local +from irlc.ex07.learning_agent_mpc_optimize import learning_optimization_mpc_local + +if __name__ == "__main__": + env = BoeingEnvironment(output=[10, 0]) + + # Part A: LQR and global regression + learning_lqr(env) + + # Part B: LQR+MPC + # learning_lqr_mpc(env) + # + # # Part C: LQR+MPC and local regression + # learning_lqr_mpc_local(env) + # + # # Part D: Optimization+MPC and local regression + # learning_optimization_mpc_local(env) diff --git a/irlc/lectures/lec07/lecture_07_boing_lqr_mpc.py b/irlc/lectures/lec07/lecture_07_boing_lqr_mpc.py new file mode 100644 index 0000000000000000000000000000000000000000..2c4a72274d3f0fd68c713147bcc4910a9e1b879c --- /dev/null +++ b/irlc/lectures/lec07/lecture_07_boing_lqr_mpc.py @@ -0,0 +1,14 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +from irlc.ex04.model_boeing import BoeingEnvironment +from irlc.ex07.lqr_learning_agents import learning_lqr, learning_lqr_mpc, learning_lqr_mpc_local +from irlc.ex07.learning_agent_mpc_optimize import learning_optimization_mpc_local + +if __name__ == "__main__": + env = BoeingEnvironment(output=[10, 0]) + learning_lqr_mpc(env) + + # # Part C: LQR+MPC and local regression + # learning_lqr_mpc_local(env) + # + # # Part D: Optimization+MPC and local regression + # learning_optimization_mpc_local(env) diff --git a/irlc/lectures/lec07/lecture_07_boing_lqr_mpc_local.py b/irlc/lectures/lec07/lecture_07_boing_lqr_mpc_local.py new file mode 100644 index 0000000000000000000000000000000000000000..22376d13457883a8f9b9f89becbffa9e46aedbb9 --- /dev/null +++ b/irlc/lectures/lec07/lecture_07_boing_lqr_mpc_local.py @@ -0,0 +1,9 @@ +# This file may not be shared/redistributed without permission. 
Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +from irlc.ex04.model_boeing import BoeingEnvironment +from irlc.ex07.lqr_learning_agents import learning_lqr, learning_lqr_mpc, learning_lqr_mpc_local +from irlc.ex07.learning_agent_mpc_optimize import learning_optimization_mpc_local + +if __name__ == "__main__": + env = BoeingEnvironment(output=[10, 0]) + learning_lqr_mpc_local(env) + # learning_optimization_mpc_local(env) diff --git a/irlc/lectures/lec07/lecture_07_boing_lqr_mpc_optim.py b/irlc/lectures/lec07/lecture_07_boing_lqr_mpc_optim.py new file mode 100644 index 0000000000000000000000000000000000000000..4ed3f3e080238b559fb656e1f5640a1374109bfc --- /dev/null +++ b/irlc/lectures/lec07/lecture_07_boing_lqr_mpc_optim.py @@ -0,0 +1,8 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +from irlc.ex04.model_boeing import BoeingEnvironment +from irlc.ex07.lqr_learning_agents import learning_lqr, learning_lqr_mpc, learning_lqr_mpc_local +from irlc.ex07.learning_agent_mpc_optimize import learning_optimization_mpc_local + +if __name__ == "__main__": + env = BoeingEnvironment(output=[10, 0]) + learning_optimization_mpc_local(env) diff --git a/irlc/lectures/lec07/lecture_07_lmpc.py b/irlc/lectures/lec07/lecture_07_lmpc.py new file mode 100644 index 0000000000000000000000000000000000000000..5fff87cb5ed8a3581de2f2c8c6ec6ae94957aeb4 --- /dev/null +++ b/irlc/lectures/lec07/lecture_07_lmpc.py @@ -0,0 +1,5 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
+from irlc.ex07.lmpc_run import main + +if __name__ == "__main__": + main(show_episode=True) diff --git a/irlc/lectures/lec07/lecture_07_pendulum_mpc_lqr.py b/irlc/lectures/lec07/lecture_07_pendulum_mpc_lqr.py new file mode 100644 index 0000000000000000000000000000000000000000..8867c0afeac9588af42a796e93ec70def74d2a07 --- /dev/null +++ b/irlc/lectures/lec07/lecture_07_pendulum_mpc_lqr.py @@ -0,0 +1,4 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +if __name__ == "__main__": + from irlc.ex07.mpc_pendulum_experiment_lqr import main_pendulum_lqr + main_pendulum_lqr() diff --git a/irlc/lectures/lec07/lecture_07_pendulum_mpc_optm.py b/irlc/lectures/lec07/lecture_07_pendulum_mpc_optm.py new file mode 100644 index 0000000000000000000000000000000000000000..9eff242ac8034a7e3f4ce8aa79e95d15f86d822b --- /dev/null +++ b/irlc/lectures/lec07/lecture_07_pendulum_mpc_optm.py @@ -0,0 +1,4 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +if __name__ == "__main__": + from irlc.ex07.mpc_pendulum_experiment_optim import main_pendulum + main_pendulum() diff --git a/irlc/lectures/lec07/lecture_07_pendulum_simple.py b/irlc/lectures/lec07/lecture_07_pendulum_simple.py new file mode 100644 index 0000000000000000000000000000000000000000..337b16585b8cd3c0a709a49b5eb7b378a08c2757 --- /dev/null +++ b/irlc/lectures/lec07/lecture_07_pendulum_simple.py @@ -0,0 +1,41 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
+from irlc.ex04.model_pendulum import GymSinCosPendulumEnvironment +from irlc.utils.video_monitor import VideoMonitor +from irlc.ex04.discrete_control_cost import goal_seeking_qr_cost, DiscreteQRCost +from irlc.ex01.agent import train +from irlc.ex07.lqr_learning_agents import MPCLocalLearningLQRAgent, MPCLearningAgent +from irlc import plot_trajectory, main_plot +import matplotlib.pyplot as plt +import numpy as np +from irlc.ex07.mpc_pendulum_experiment_lqr import mk_mpc_pendulum_env + +L = 12 +def main_pendulum_lqr_simple(Tmax=10): + + """ Run Local LQR/MPC agent using the parameters + L = 12 + neighboorhood_size = 50 + min_buffer_size = 50 + """ + env_pendulum = mk_mpc_pendulum_env() + + # agent = .... (instantiate agent here) + # TODO: 1 lines missing. + raise NotImplementedError("Instantiate your agent here") + env_pendulum = VideoMonitor(env_pendulum) + + experiment_name = f"pendulum{L}_lqr" + stats, trajectories = train(env_pendulum, agent, experiment_name=experiment_name, num_episodes=16,return_trajectory=True) + plt.show() + for k in range(len(trajectories)): + plot_trajectory(trajectories[k], env_pendulum) + plt.title(f"Trajectory {k}") + plt.show() + + env_pendulum.close() + main_plot(experiment_name) + plt.show() + +if __name__ == "__main__": + np.random.seed(1) + main_pendulum_lqr_simple() diff --git a/irlc/lectures/lec07/pendulum12/2021-03-19_08-21-20.207/log.txt b/irlc/lectures/lec07/pendulum12/2021-03-19_08-21-20.207/log.txt new file mode 100644 index 0000000000000000000000000000000000000000..dc592cfe3c7ae897391e9ce88d0c1454848920e8 --- /dev/null +++ b/irlc/lectures/lec07/pendulum12/2021-03-19_08-21-20.207/log.txt @@ -0,0 +1,17 @@ +Episode Accumulated Reward Average Reward Length Steps +0 -4895.647575826604 -39.165180606612836 125 125 +1 -4401.3119492497035 -35.21049559399765 125 250 +2 -2909.118791375824 -23.272950331006577 125 375 +3 -1355.8374521458716 -10.846699617166973 125 500 +4 -1392.1376132555315 -11.13710090604426 125 625 +5 
-2377.229711438541 -19.017837691508326 125 750 +6 -1605.205601135333 -12.841644809082668 125 875 +7 -1142.6024308594363 -9.140819446875495 125 1000 +8 -1110.17238007953 -8.881379040636237 125 1125 +9 -1415.0153227915457 -11.320122582332363 125 1250 +10 -1084.806186007314 -8.678449488058519 125 1375 +11 -1204.58673322474 -9.636693865797913 125 1500 +12 -1582.8902992149344 -12.66312239371947 125 1625 +13 -1234.3968104401945 -9.875174483521556 125 1750 +14 -2290.3685781570866 -18.322948625256696 125 1875 +15 -1317.928485283048 -10.543427882264387 125 2000 diff --git a/irlc/lectures/lec07/pendulum12/2021-03-19_08-21-20.207/trajectories.pkl b/irlc/lectures/lec07/pendulum12/2021-03-19_08-21-20.207/trajectories.pkl new file mode 100644 index 0000000000000000000000000000000000000000..34930a77fba816ca18036b42921823ab18cefc24 Binary files /dev/null and b/irlc/lectures/lec07/pendulum12/2021-03-19_08-21-20.207/trajectories.pkl differ diff --git a/irlc/lectures/lec07/pendulum12/2022-03-17_14-16-10.758/log.txt b/irlc/lectures/lec07/pendulum12/2022-03-17_14-16-10.758/log.txt new file mode 100644 index 0000000000000000000000000000000000000000..3005052ea2c0aa4f28a74ca66b23b716fa4a925d --- /dev/null +++ b/irlc/lectures/lec07/pendulum12/2022-03-17_14-16-10.758/log.txt @@ -0,0 +1,17 @@ +Episode Accumulated Reward Average Reward Length Steps +0 -5062.646915554269 -40.50117532443415 125 125 +1 -4545.443228168109 -36.36354582534487 125 250 +2 -3992.451522701582 -31.939612181612656 125 375 +3 -2660.945115772302 -21.287560926178415 125 500 +4 -1089.641113544413 -8.71712890835529 125 625 +5 -1794.3862709577143 -14.35509016766171 125 750 +6 -1599.3332228782826 -12.79466578302626 125 875 +7 -1999.3347944176303 -15.994678355341035 125 1000 +8 -1240.1770407677993 -9.921416326142396 125 1125 +9 -1128.717151496786 -9.029737211974288 125 1250 +10 -1148.8528175884883 -9.19082254070789 125 1375 +11 -1199.5840778420286 -9.596672622736232 125 1500 +12 -1147.0703774473068 -9.176563019578447 125 1625 
+13 -1245.4139074019245 -9.963311259215399 125 1750 +14 -1257.9333517907346 -10.063466814325885 125 1875 +15 -1309.9607605947551 -10.479686084758042 125 2000 diff --git a/irlc/lectures/lec07/pendulum12/2022-03-17_14-16-10.758/trajectories.pkl b/irlc/lectures/lec07/pendulum12/2022-03-17_14-16-10.758/trajectories.pkl new file mode 100644 index 0000000000000000000000000000000000000000..c2b34f7154ed75b8dfa9e69bcd3de982d0cd0eb1 Binary files /dev/null and b/irlc/lectures/lec07/pendulum12/2022-03-17_14-16-10.758/trajectories.pkl differ diff --git a/irlc/lectures/lec07/pendulum12_lqr/2023-03-17_08-13-45.172/log.txt b/irlc/lectures/lec07/pendulum12_lqr/2023-03-17_08-13-45.172/log.txt new file mode 100644 index 0000000000000000000000000000000000000000..643cee39646b4d0a0ee0189cf724ae538a4cd508 --- /dev/null +++ b/irlc/lectures/lec07/pendulum12_lqr/2023-03-17_08-13-45.172/log.txt @@ -0,0 +1,17 @@ +Episode Accumulated Reward Length Steps +0 -5042.020051692956 125 0 +1 -4627.801003133058 125 1 +2 -3635.9503089227105 125 2 +3 -2459.2456085370436 125 3 +4 -1142.0454750510762 125 4 +5 -1563.5408392433658 125 5 +6 -1718.6689962603696 125 6 +7 -1215.2631997008277 125 7 +8 -1172.9345344478274 125 8 +9 -1108.3729746371948 125 9 +10 -1012.6787060036193 125 10 +11 -1715.9593847985013 125 11 +12 -1009.5943996400636 125 12 +13 -1082.3121757069966 125 13 +14 -1248.0530347172762 125 14 +15 -1496.6826680867007 125 15 diff --git a/irlc/lectures/lec07/pendulum12_lqr/2023-03-17_08-13-45.172/trajectories.pkl b/irlc/lectures/lec07/pendulum12_lqr/2023-03-17_08-13-45.172/trajectories.pkl new file mode 100644 index 0000000000000000000000000000000000000000..3e9f2d19af2a1f4bdcdbc7a8922852a440795c46 Binary files /dev/null and b/irlc/lectures/lec07/pendulum12_lqr/2023-03-17_08-13-45.172/trajectories.pkl differ diff --git a/irlc/lectures/lec07/tmp-pdfcrop-10536.tex b/irlc/lectures/lec07/tmp-pdfcrop-10536.tex new file mode 100644 index 
0000000000000000000000000000000000000000..ea5c21c82776ad1a4c43bc64861caa907b877528 --- /dev/null +++ b/irlc/lectures/lec07/tmp-pdfcrop-10536.tex @@ -0,0 +1,131 @@ +\catcode37 14 % percent +\catcode33 12 % exclam +\catcode34 12 % quote +\catcode35 6 % hash +\catcode39 12 % apostrophe +\catcode40 12 % left parenthesis +\catcode41 12 % right parenthesis +\catcode45 12 % minus +\catcode46 12 % period +\catcode60 12 % less +\catcode61 12 % equals +\catcode62 12 % greater +\catcode64 12 % at +\catcode91 12 % left square +\catcode93 12 % right square +\catcode96 12 % back tick +\catcode123 1 % left curly brace +\catcode125 2 % right curly brace +\catcode126 12 % tilde +\catcode`\#=6 % +\escapechar=92 % +\def\IfUndefined#1#2#3{% + \begingroup\expandafter\expandafter\expandafter\endgroup + \expandafter\ifx\csname#1\endcsname\relax + #2% + \else + #3% + \fi +} +\def\pdffilehex{746D702D70646663726F702D31303533362D696D672E706466} +\IfUndefined{pdfunescapehex}{% + \begingroup + \gdef\pdffile{}% + \def\do#1#2{% + \ifx\relax#2\relax + \ifx\relax#1\relax + \else + \errmessage{Invalid hex string, should not happen!}% + \fi + \else + \lccode`0="#1#2\relax + \lowercase{% + \xdef\pdffile{\pdffile0}% + }% + \expandafter\do + \fi + }% + \expandafter\do\pdffilehex\relax\relax + \endgroup +}{% + \edef\pdffile{\pdfunescapehex{\pdffilehex}}% +} +\immediate\write-1{Input file: \pdffile} +\pdfcompresslevel=9 \pdfoutput=1 % +\csname pdfmapfile\endcsname{} +\def\setpdfversion#1#2{% + \IfUndefined{pdfobjcompresslevel}{% + }{% + \ifnum#1=1 % + \ifnum#2<5 + \pdfobjcompresslevel=0 % + \else + \pdfobjcompresslevel=2 % + \fi + \fi + }% + \IfUndefined{pdfminorversion}{% + \IfUndefined{pdfoptionpdfminorversion}{% + }{% + \pdfoptionpdfminorversion=#2\relax + }% + }{% + \pdfminorversion=#2\relax + \IfUndefined{pdfmajorversion}{% + \ifnum#2=0 \pdfminorversion=5\fi} + {\pdfmajorversion=#1\relax}% + }% +} +\def\page #1 [#2 #3 #4 #5]{% + \count0=#1\relax + \setbox0=\hbox{% + \pdfximage page #1 
mediabox{\pdffile}% + \pdfrefximage\pdflastximage + }% + \pdfhorigin=-#2bp\relax + \pdfvorigin=#3bp\relax + \pdfpagewidth=#4bp\relax + \advance\pdfpagewidth by -#2bp\relax + \pdfpageheight=#5bp\relax + \advance\pdfpageheight by -#3bp\relax + \ht0=\pdfpageheight + \shipout\box0\relax +} +\def\pageclip #1 [#2 #3 #4 #5][#6 #7 #8 #9]{% + \count0=#1\relax + \dimen0=#4bp\relax \advance\dimen0 by -#2bp\relax + \edef\imagewidth{\the\dimen0}% + \dimen0=#5bp\relax \advance\dimen0 by -#3bp\relax + \edef\imageheight{\the\dimen0}% + \pdfximage page #1 mediabox{\pdffile}% + \setbox0=\hbox{% + \kern -#2bp\relax + \lower #3bp\hbox{\pdfrefximage\pdflastximage}% + }% + \wd0=\imagewidth\relax + \ht0=\imageheight\relax + \dp0=0pt\relax + \pdfhorigin=#6pt\relax + \pdfvorigin=#7bp\relax + \pdfpagewidth=\imagewidth + \advance\pdfpagewidth by #6bp\relax + \advance\pdfpagewidth by #8bp\relax + \pdfpageheight=\imageheight\relax + \advance\pdfpageheight by #7bp\relax + \advance\pdfpageheight by #9bp\relax + \pdfxform0\relax + \shipout\hbox{\pdfrefxform\pdflastxform}% +}% +\def\pageinclude#1{% + \pdfhorigin=0pt\relax + \pdfvorigin=0pt\relax + \pdfximage page #1 mediabox{\pdffile}% + \setbox0=\hbox{\pdfrefximage\pdflastximage}% + \pdfpagewidth=\wd0\relax + \pdfpageheight=\ht0\relax + \advance\pdfpageheight by \dp0\relax + \shipout\hbox{% + \raise\dp0\box0\relax + }% +} +\setpdfversion{1}{4} diff --git a/irlc/lectures/lec07/tmp-pdfcrop-12592.tex b/irlc/lectures/lec07/tmp-pdfcrop-12592.tex new file mode 100644 index 0000000000000000000000000000000000000000..479d43b09cb3943939732adbb8bf9fc8cecbc151 --- /dev/null +++ b/irlc/lectures/lec07/tmp-pdfcrop-12592.tex @@ -0,0 +1,131 @@ +\catcode37 14 % percent +\catcode33 12 % exclam +\catcode34 12 % quote +\catcode35 6 % hash +\catcode39 12 % apostrophe +\catcode40 12 % left parenthesis +\catcode41 12 % right parenthesis +\catcode45 12 % minus +\catcode46 12 % period +\catcode60 12 % less +\catcode61 12 % equals +\catcode62 12 % greater +\catcode64 12 % 
at +\catcode91 12 % left square +\catcode93 12 % right square +\catcode96 12 % back tick +\catcode123 1 % left curly brace +\catcode125 2 % right curly brace +\catcode126 12 % tilde +\catcode`\#=6 % +\escapechar=92 % +\def\IfUndefined#1#2#3{% + \begingroup\expandafter\expandafter\expandafter\endgroup + \expandafter\ifx\csname#1\endcsname\relax + #2% + \else + #3% + \fi +} +\def\pdffilehex{746D702D70646663726F702D31323539322D696D672E706466} +\IfUndefined{pdfunescapehex}{% + \begingroup + \gdef\pdffile{}% + \def\do#1#2{% + \ifx\relax#2\relax + \ifx\relax#1\relax + \else + \errmessage{Invalid hex string, should not happen!}% + \fi + \else + \lccode`0="#1#2\relax + \lowercase{% + \xdef\pdffile{\pdffile0}% + }% + \expandafter\do + \fi + }% + \expandafter\do\pdffilehex\relax\relax + \endgroup +}{% + \edef\pdffile{\pdfunescapehex{\pdffilehex}}% +} +\immediate\write-1{Input file: \pdffile} +\pdfcompresslevel=9 \pdfoutput=1 % +\csname pdfmapfile\endcsname{} +\def\setpdfversion#1#2{% + \IfUndefined{pdfobjcompresslevel}{% + }{% + \ifnum#1=1 % + \ifnum#2<5 + \pdfobjcompresslevel=0 % + \else + \pdfobjcompresslevel=2 % + \fi + \fi + }% + \IfUndefined{pdfminorversion}{% + \IfUndefined{pdfoptionpdfminorversion}{% + }{% + \pdfoptionpdfminorversion=#2\relax + }% + }{% + \pdfminorversion=#2\relax + \IfUndefined{pdfmajorversion}{% + \ifnum#2=0 \pdfminorversion=5\fi} + {\pdfmajorversion=#1\relax}% + }% +} +\def\page #1 [#2 #3 #4 #5]{% + \count0=#1\relax + \setbox0=\hbox{% + \pdfximage page #1 mediabox{\pdffile}% + \pdfrefximage\pdflastximage + }% + \pdfhorigin=-#2bp\relax + \pdfvorigin=#3bp\relax + \pdfpagewidth=#4bp\relax + \advance\pdfpagewidth by -#2bp\relax + \pdfpageheight=#5bp\relax + \advance\pdfpageheight by -#3bp\relax + \ht0=\pdfpageheight + \shipout\box0\relax +} +\def\pageclip #1 [#2 #3 #4 #5][#6 #7 #8 #9]{% + \count0=#1\relax + \dimen0=#4bp\relax \advance\dimen0 by -#2bp\relax + \edef\imagewidth{\the\dimen0}% + \dimen0=#5bp\relax \advance\dimen0 by -#3bp\relax + 
\edef\imageheight{\the\dimen0}% + \pdfximage page #1 mediabox{\pdffile}% + \setbox0=\hbox{% + \kern -#2bp\relax + \lower #3bp\hbox{\pdfrefximage\pdflastximage}% + }% + \wd0=\imagewidth\relax + \ht0=\imageheight\relax + \dp0=0pt\relax + \pdfhorigin=#6pt\relax + \pdfvorigin=#7bp\relax + \pdfpagewidth=\imagewidth + \advance\pdfpagewidth by #6bp\relax + \advance\pdfpagewidth by #8bp\relax + \pdfpageheight=\imageheight\relax + \advance\pdfpageheight by #7bp\relax + \advance\pdfpageheight by #9bp\relax + \pdfxform0\relax + \shipout\hbox{\pdfrefxform\pdflastxform}% +}% +\def\pageinclude#1{% + \pdfhorigin=0pt\relax + \pdfvorigin=0pt\relax + \pdfximage page #1 mediabox{\pdffile}% + \setbox0=\hbox{\pdfrefximage\pdflastximage}% + \pdfpagewidth=\wd0\relax + \pdfpageheight=\ht0\relax + \advance\pdfpageheight by \dp0\relax + \shipout\hbox{% + \raise\dp0\box0\relax + }% +} +\setpdfversion{1}{4} diff --git a/irlc/lectures/lec08/__init__.py b/irlc/lectures/lec08/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a56057c84d0ceac54aab1d40ba0f370c77fe10be --- /dev/null +++ b/irlc/lectures/lec08/__init__.py @@ -0,0 +1 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. diff --git a/irlc/lectures/lec08/demo_bandit.py b/irlc/lectures/lec08/demo_bandit.py new file mode 100644 index 0000000000000000000000000000000000000000..c1f61c38c14d03d20b91d95a773e31e0e5c9bfe1 --- /dev/null +++ b/irlc/lectures/lec08/demo_bandit.py @@ -0,0 +1,25 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
+from irlc.ex08.devel.bandit_graphics_environment import GraphicalBandit +import time +from irlc import train +from irlc.ex08.simple_agents import BasicAgent +from irlc import interactive + +def bandit_eps(autoplay=False): + env = GraphicalBandit(10, render_mode='human',frames_per_second=30) + env.reset() + #env.viewer.show_q_star = True + # env.show_q_ucb = True + agent = BasicAgent(env, epsilon=0.1) + agent.method = 'Epsilon-greedy' + env, agent = interactive(env, agent, autoplay=autoplay) + + t0 = time.time() + n = 3000 + stats, _ = train(env, agent, max_steps=n, num_episodes=10, return_trajectory=False, verbose=False) + tpf = (time.time()-t0)/ n + print("tpf", tpf, 'fps', 1/tpf) + env.close() + +if __name__ == "__main__": + bandit_eps() diff --git a/irlc/lectures/lec08/demo_bandit_ucb.py b/irlc/lectures/lec08/demo_bandit_ucb.py new file mode 100644 index 0000000000000000000000000000000000000000..8e596321207df550e83ae0c6176b19af0704249b --- /dev/null +++ b/irlc/lectures/lec08/demo_bandit_ucb.py @@ -0,0 +1,26 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
+from irlc.ex08.devel.bandit_graphics_environment import GraphicalBandit +from irlc import interactive, train +# import numpy as np +import time + +def bandit_ucb(autoplay=False): + env = GraphicalBandit(10, render_mode='human', frames_per_second=30) + env.reset() + #env.viewer.show_q_star = True + #env.viewer.show_q_ucb = True + from irlc.ex08.ucb_agent import UCBAgent + agent = UCBAgent(env, c=1) + agent.method = 'UCB' + + env, agent = interactive(env, agent, autoplay=autoplay) + t0 = time.time() + n = 500 + stats, _ = train(env, agent, max_steps=n, num_episodes=10, return_trajectory=False, verbose=False) + tpf = (time.time() - t0) / n + print("tpf", tpf, 'fps', 1 / tpf) + env.close() + + +if __name__ == "__main__": + bandit_ucb() diff --git a/irlc/lectures/lec09/__init__.py b/irlc/lectures/lec09/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a56057c84d0ceac54aab1d40ba0f370c77fe10be --- /dev/null +++ b/irlc/lectures/lec09/__init__.py @@ -0,0 +1 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. diff --git a/irlc/lectures/lec09/unf_frozenlake.py b/irlc/lectures/lec09/unf_frozenlake.py new file mode 100644 index 0000000000000000000000000000000000000000..421bf1b9a76fe0325ec334122297d155f8fe3ab1 --- /dev/null +++ b/irlc/lectures/lec09/unf_frozenlake.py @@ -0,0 +1,11 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
+from irlc.ex01.agent import Agent +from irlc.gridworld.gridworld_environments import BookGridEnvironment, FrozenLake, FrozenLakeEnv +from irlc import interactive, train + +if __name__ == "__main__": + env = FrozenLake(render_mode='human', print_states=True) + env, agent = interactive(env, Agent(env)) + agent.label = "Random agent" + train(env, agent, num_episodes=100, verbose=False) + env.close() diff --git a/irlc/lectures/lec09/unf_gridworld.py b/irlc/lectures/lec09/unf_gridworld.py new file mode 100644 index 0000000000000000000000000000000000000000..e5458d9db1ad9b31db7f7d9665173824c3408594 --- /dev/null +++ b/irlc/lectures/lec09/unf_gridworld.py @@ -0,0 +1,12 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +from irlc.ex01.agent import Agent +from irlc.gridworld.gridworld_environments import BookGridEnvironment, FrozenLakeEnv +from irlc import interactive, train + + +if __name__ == "__main__": + env = BookGridEnvironment(render_mode='human', print_states=True, living_reward=-0.05) + env, agent = interactive(env, Agent(env)) + agent.label = "Random agent" + train(env, agent, num_episodes=100, verbose=False) + env.close() diff --git a/irlc/lectures/lec09/unf_policy_evaluation_frozen.py b/irlc/lectures/lec09/unf_policy_evaluation_frozen.py new file mode 100644 index 0000000000000000000000000000000000000000..9adda9fd454d02ed5809cb122d70365c9130c148 --- /dev/null +++ b/irlc/lectures/lec09/unf_policy_evaluation_frozen.py @@ -0,0 +1,20 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
+from irlc.gridworld.gridworld_environments import FrozenLake +from irlc import interactive, train +from irlc.gridworld.demo_agents.hidden_agents import PolicyEvaluationAgent2 + +def policy_evaluation(env=None): + agent = PolicyEvaluationAgent2(env, gamma=1., steps_between_policy_improvement=None) + env, agent = interactive(env, agent) + train(env, agent, num_episodes=100) + env.close() + +def policy_improvement(env=None, q_mode=True): + agent = PolicyEvaluationAgent2(env, gamma=1.,steps_between_policy_improvement=20) + env, agent = interactive(env, agent) + train(env, agent, num_episodes=1000, verbose=False) + env.close() + +if __name__ == "__main__": + env = FrozenLake(render_mode='human', living_reward=-0.0) + policy_evaluation(env) diff --git a/irlc/lectures/lec09/unf_policy_evaluation_gridworld.py b/irlc/lectures/lec09/unf_policy_evaluation_gridworld.py new file mode 100644 index 0000000000000000000000000000000000000000..efec41198536780a74ac8db2c12bd325a703fd09 --- /dev/null +++ b/irlc/lectures/lec09/unf_policy_evaluation_gridworld.py @@ -0,0 +1,20 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
+from irlc.gridworld.gridworld_environments import BookGridEnvironment +from irlc import interactive, train +from irlc.gridworld.demo_agents.hidden_agents import PolicyEvaluationAgent2 + +def policy_evaluation(env=None): + agent = PolicyEvaluationAgent2(env, gamma=1., steps_between_policy_improvement=None) + env, agent = interactive(env, agent) + train(env, agent, num_episodes=100) + env.close() + +def policy_improvement(env=None, q_mode=True): + agent = PolicyEvaluationAgent2(env, gamma=1.,steps_between_policy_improvement=20) + env, agent = interactive(env, agent) + train(env, agent, num_episodes=1000) + env.close() + +if __name__ == "__main__": + env = BookGridEnvironment(render_mode='human', living_reward=-0.05) + policy_evaluation(env) diff --git a/irlc/lectures/lec09/unf_policy_evaluation_stepwise_gridworld.py b/irlc/lectures/lec09/unf_policy_evaluation_stepwise_gridworld.py new file mode 100644 index 0000000000000000000000000000000000000000..a438af8f6d67f7d80b09c1efffdda5e69af84fb4 --- /dev/null +++ b/irlc/lectures/lec09/unf_policy_evaluation_stepwise_gridworld.py @@ -0,0 +1,20 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
+from irlc.gridworld.gridworld_environments import BookGridEnvironment +from irlc import interactive, train +from irlc.gridworld.demo_agents.hidden_agents import PolicyEvaluationAgent2 + +def policy_evaluation_stepwise(env=None): + agent = PolicyEvaluationAgent2(env, gamma=1., steps_between_policy_improvement=None, only_update_current=True) + env, agent = interactive(env, agent) + train(env, agent, num_episodes=100) + env.close() + +def policy_improvement(env=None, q_mode=True): + agent = PolicyEvaluationAgent2(env, gamma=1.,steps_between_policy_improvement=20) + env, agent = interactive(env, agent) + train(env, agent, num_episodes=1000) + env.close() + +if __name__ == "__main__": + env = BookGridEnvironment(render_mode='human', living_reward=-0.05) + policy_evaluation_stepwise(env) diff --git a/irlc/lectures/lec09/unf_policy_improvement_frozenlake.py b/irlc/lectures/lec09/unf_policy_improvement_frozenlake.py new file mode 100644 index 0000000000000000000000000000000000000000..7242b00a8195c5c368bfb4d9abebc79498e35b1c --- /dev/null +++ b/irlc/lectures/lec09/unf_policy_improvement_frozenlake.py @@ -0,0 +1,7 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +from irlc.gridworld.gridworld_environments import BookGridEnvironment, FrozenLake +from irlc.lectures.unf.unf_policy_evaluation_gridworld import policy_improvement + +if __name__ == "__main__": + env = FrozenLake(render_mode='human', living_reward=-0) + policy_improvement(env) diff --git a/irlc/lectures/lec09/unf_policy_improvement_gridworld.py b/irlc/lectures/lec09/unf_policy_improvement_gridworld.py new file mode 100644 index 0000000000000000000000000000000000000000..eb6d7623bf8b566aed9e589d4586256d2b5b3fd5 --- /dev/null +++ b/irlc/lectures/lec09/unf_policy_improvement_gridworld.py @@ -0,0 +1,7 @@ +# This file may not be shared/redistributed without permission. 
def q1_vi(env):
    """
    Run the interactive value-iteration demo on ``env`` (FrozenLake script).

    Creates a ``ValueIterationAgent3`` (``epsilon=0``, ``gamma=1``,
    ``only_update_current=False``), wraps environment and agent with
    ``interactive``, resets the environment once, trains for 100 episodes and
    closes the environment.

    :param env: The environment to run value iteration on.
    """
    vi_agent = ValueIterationAgent3(env, epsilon=0, gamma=1, only_update_current=False)
    wrapped_env, wrapped_agent = interactive(env, vi_agent)
    wrapped_env.reset()
    train(wrapped_env, wrapped_agent, num_episodes=100)
    wrapped_env.close()
def q1_vi(env):
    """
    Run the interactive value-iteration demo on ``env`` (gridworld script).

    The agent is a ``ValueIterationAgent3`` with ``epsilon=0``, ``gamma=1`` and
    ``only_update_current=False``. After wrapping with ``interactive`` the
    environment is reset, trained on for 100 episodes, and closed.

    :param env: The environment to run value iteration on.
    """
    planner = ValueIterationAgent3(env, epsilon=0, gamma=1, only_update_current=False)
    live_env, live_agent = interactive(env, planner)
    live_env.reset()
    train(live_env, live_agent, num_episodes=100)
    live_env.close()
def q1_vi(env):
    """
    Run the step-wise interactive value-iteration demo on ``env``.

    Same flow as the other value-iteration lecture scripts, but the
    ``ValueIterationAgent3`` is created with ``only_update_current=True``.

    :param env: The environment to run value iteration on.
    """
    stepwise_planner = ValueIterationAgent3(env, epsilon=0, gamma=1, only_update_current=True)
    live_env, live_agent = interactive(env, stepwise_planner)
    live_env.reset()
    train(live_env, live_agent, num_episodes=100)
    live_env.close()
if __name__ == "__main__":
    # Demo: Monte-Carlo control where only one (state, action) pair keeps its statistics.
    env = BookGridEnvironment(render_mode='human', living_reward=-0.05, print_states=True, zoom=2)
    agent = MCControlAgentOneState(env, gamma=1, alpha=None, first_visit=True)
    agent.label = 'MC (gamma=1)'  # Label is set on the agent before it is wrapped.
    env, agent = interactive(env, agent, autoplay=False)
    train(env, agent, num_episodes=1000)
    env.close()
if __name__ == "__main__":
    # Seed numpy first so the run is repeatable.
    np.random.seed(433)
    book_env = BookGridEnvironment(render_mode='human', zoom=2)
    mc_agent = MCAgent(book_env, gamma=1.0, epsilon=0.15, alpha=None, first_visit=True)
    keyboard_play(book_env, mc_agent, method_label='MC control')
if __name__ == "__main__":
    # Every-visit Monte-Carlo evaluation on the small CaughtGrid environment.
    # The render mode was previously misspelled as 'humanp'; 'human' is the value
    # used by the first-visit sibling script and by every other lecture script.
    env = CaughtGrid(view_mode=1, render_mode='human')
    agent = MCEvaluationAgent(env, gamma=1, alpha=None, first_visit=False)
    keyboard_play_value(env, agent, method_label='MC (every visit)')
class CaughtGrid(GridworldEnvironment):
    """
    Gridworld built from the module-level ``map`` layout.

    The environment is always created with ``living_reward=1`` and ``zoom=1.5``;
    any further keyword arguments are forwarded unchanged to
    ``GridworldEnvironment``.
    """

    def __init__(self, **kwargs):
        # ``map`` here is the grid defined at module level, not the builtin function.
        layout = map
        super().__init__(layout, living_reward=1, zoom=1.5, **kwargs)
def automatic_play(env, agent, method_label='MC'):
    """
    Train ``agent`` on ``env`` for 1000 episodes with the interactive visualization.

    The previous implementation wrapped the environment in ``VideoMonitor``, but
    the import of that class is commented out in this module, so the call raised
    ``NameError``. The function now follows the same pattern as
    ``automatic_play_value``: label the agent, wrap with ``interactive``, train,
    and close the environment.

    :param env: The environment to train on.
    :param agent: The agent to train; its ``label`` attribute is set to ``method_label``.
    :param method_label: Text label assigned to the agent.
    """
    agent.label = method_label
    env, agent = interactive(env, agent)
    train(env, agent, num_episodes=1000)
    env.close()
if __name__ == "__main__":
    # Every-visit Monte-Carlo value estimation on the book gridworld.
    book_env = BookGridEnvironment(view_mode=1, render_mode='human', living_reward=-0.05)
    evaluator = MCEvaluationAgent(book_env, gamma=.9, alpha=None, first_visit=False)
    keyboard_play_value(book_env, evaluator, method_label='MC every')
+from irlc.lectures.lec10.lecture_10_mc_q_estimation import keyboard_play +from irlc.gridworld.gridworld_environments import SuttonCornerGridEnvironment, BookGridEnvironment +from irlc.lectures.lec10.lecture_10_mc_value_first_one_state import MCAgentOneState +from irlc.ex10.mc_agent import MCAgent +from irlc.ex10.mc_evaluate import MCEvaluationAgent +import numpy as np +from irlc import interactive, train + +# class MCAgentOneState(MCEvaluationAgent): +# def __init__(self, *args, state=None, **kwargs): +# a = 34 +# super().__init__(*args, **kwargs) +# if state is None: +# state = self.env.mdp.initial_state +# self.state = state +# self._clear_states() +# +# def _clear_states(self, val=None): +# for s in self.env.mdp.nonterminal_states: +# # for a in self.env.mdp.A(s): +# # self.Q[s,a] = 0 +# if s != self.state: +# self.returns_sum_S[s] = val +# self.returns_count_N[s] = val +# +# if s in self.v: +# k = next(self.env.mdp.Psr(s, self.env.mdp.A(s)[0]).keys().__iter__() )[0] +# if not self.env.mdp.is_terminal(k): +# +# del self.v[s] +# +# +# def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None): +# # self.episode = [e for e in self.episode if e[0] == self.state] +# self._clear_states(0) +# super().train(s, a, r, sp, done) +# # Clear out many of the state, actions: +# self._clear_states(None) +# # for s in self.env.mdp.nonterminal_states: +# # if s != self.state: +# # self.v[s] = None +# pass + +if __name__ == "__main__": + env = BookGridEnvironment(render_mode='human', living_reward=-0.05, print_states=True) + agent = MCAgentOneState(env, gamma=1, alpha=None, first_visit=False) + method_label = 'MC (gamma=1)' + agent.label = method_label + autoplay = False + env, agent = interactive(env, agent, autoplay=autoplay) + # agent = PlayWrapper(agent, env,autoplay=autoplay) + # env = VideoMonitor(env, agent=agent, fps=100, agent_monitor_keys=('pi', 'Q'), render_kwargs={'method_label': method_label}) + num_episodes = 1000 + train(env, agent, 
class BridgeGridEnvironment2(GridworldEnvironment):
    """
    Gridworld that uses the module-level ``grid_bridge_grid`` layout.

    All positional and keyword arguments are forwarded to
    ``GridworldEnvironment`` after the layout.
    """

    def __init__(self, *args, **kwargs):
        # The layout is looked up at call time; it is defined further down in the module.
        layout = grid_bridge_grid
        super().__init__(layout, *args, **kwargs)
if __name__ == "__main__":
    # Demo: first-visit Monte-Carlo evaluation where only one state keeps its statistics.
    env = BookGridEnvironment(render_mode='human', living_reward=-0.05, print_states=True, zoom=2)
    agent = MCAgentOneState(env, gamma=1, alpha=None, first_visit=True)
    method_label = 'MC (gamma=1)'
    agent.label = method_label
    autoplay = False
    env, agent = interactive(env, agent, autoplay=autoplay)
    num_episodes = 1000
    train(env, agent, num_episodes=num_episodes)
    env.close()

    # Matplotlib is imported here, after training, rather than at module import time.
    # The duplicated import pair was removed; ``np`` is already imported at the top of the file.
    import matplotlib.pyplot as plt

    # NOTE(review): scratch plot of two curves against log(t); its purpose is not
    # evident from this file. The no-op term ``+ 0*5000`` on the upper bound was dropped.
    lt = np.linspace(np.log(1000), np.log(2000))
    plt.plot(lt, 5 + 2 * np.sqrt(lt / 500), 'k-')
    plt.plot(lt, 10 + 2 * np.sqrt(lt / (np.exp(lt) - 500)), 'r-')
    plt.xlabel('log(t)')
    plt.show()
+from irlc.lectures.lec10.lecture_10_mc_q_estimation import keyboard_play +from irlc.gridworld.gridworld_environments import SuttonCornerGridEnvironment, BookGridEnvironment +from irlc.lectures.lec10.lecture_10_mc_value_first_one_state import MCAgentOneState +from irlc.ex10.mc_agent import MCAgent +from irlc.ex10.mc_evaluate import MCEvaluationAgent +import numpy as np +from irlc import interactive, train + +# class MCAgentOneState(MCEvaluationAgent): +# def __init__(self, *args, state=None, **kwargs): +# a = 34 +# super().__init__(*args, **kwargs) +# if state is None: +# state = self.env.mdp.initial_state +# self.state = state +# self._clear_states() +# +# def _clear_states(self, val=None): +# for s in self.env.mdp.nonterminal_states: +# # for a in self.env.mdp.A(s): +# # self.Q[s,a] = 0 +# if s != self.state: +# self.returns_sum_S[s] = val +# self.returns_count_N[s] = val +# if s in self.v: +# k = next(self.env.mdp.Psr(s, self.env.mdp.A(s)[0]).keys().__iter__() )[0] +# if not self.env.mdp.is_terminal(k): +# +# del self.v[s] +# +# def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None): +# # self.episode = [e for e in self.episode if e[0] == self.state] +# self._clear_states(0) +# super().train(s, a, r, sp, done) +# # Clear out many of the state, actions: +# self._clear_states(None) +# # for s in self.env.mdp.nonterminal_states: +# # if s != self.state: +# # self.v[s] = None +# +# pass + + +if __name__ == "__main__": + env = BookGridEnvironment(render_mode='human', living_reward=-0.05) + agent = MCAgentOneState(env, gamma=1, alpha=None, first_visit=True, state=(0,2)) + method_label = 'MC (gamma=1)' + agent.label = method_label + autoplay = False + env, agent = interactive(env, agent, autoplay=autoplay) + # agent = PlayWrapper(agent, env,autoplay=autoplay) + # env = VideoMonitor(env, agent=agent, fps=100, agent_monitor_keys=('pi', 'Q'), render_kwargs={'method_label': method_label}) + num_episodes = 1000 + train(env, agent, num_episodes=num_episodes) + 
if __name__ == "__main__":
    # TD(0) value estimation on the corner gridworld, driven from the keyboard.
    # render_mode='human' was missing here although the Monte-Carlo counterpart of
    # this script and all other interactive lecture scripts pass it.
    # NOTE(review): assumes the environment does not render without it — confirm.
    env = SuttonCornerGridEnvironment(render_mode='human')
    agent = TD0ValueAgent(env, gamma=1, alpha=0.5)
    keyboard_play(env, agent, method_label='TD(0) (alpha=0.5)')
class SingleActionValueAgent(MCAgent):
    """
    Monte-Carlo agent used to illustrate a single action-value.

    After every training step all entries of ``self.Q`` are reset to zero,
    except for the pair ``((0, 0), 1)`` and for states with exactly one
    available action.

    NOTE(review): the constructor accepts ``gamma``, ``epsilon``, ``alpha`` and
    ``first_visit`` but ignores them; the parent is always initialised with
    ``gamma=1.``, ``epsilon=1``, ``alpha=None``, ``first_visit=True``.
    """

    def __init__(self, env, gamma=1.0, epsilon=0.05, alpha=None, first_visit=True):
        super().__init__(env, gamma=1., epsilon=1, alpha=None, first_visit=True)

    def pi(self, s, k, info=None):
        """
        Return action ``1`` on the first step (``k == 0``), otherwise defer to ``pi_eps``.

        :param s: Current state.
        :param k: Step index within the episode.
        :param info: Accepted for interface compatibility; ``None`` is what gets passed on to ``pi_eps``.
        :return: The action to take.
        """
        if k == 0:
            return 1
        return super().pi_eps(s, info=None)

    def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
        """
        Perform the parent's training step, then zero all Q-values that are not being illustrated.

        :param s: State the action was taken in.
        :param a: Action taken.
        :param r: Reward received.
        :param sp: Successor state.
        :param done: Whether the episode terminated.
        :param info_s: Info for ``s``, forwarded to the parent.
        :param info_sp: Info for ``sp``, forwarded to the parent.
        """
        super().train(s, a, r, sp, done, info_s, info_sp)
        mdp = self.env.mdp
        for state in mdp.nonterminal_states:
            for action in mdp.A(state):
                is_tracked_pair = state == (0, 0) and action == 1
                # Tracked pair and single-action states keep their value.
                if is_tracked_pair or len(mdp.A(state)) == 1:
                    continue
                self.Q[state, action] = 0
class SingleActionValueAgent(MCEvaluationAgent):
    """
    Monte-Carlo evaluation agent used to illustrate the value of a single state.

    After every training step all entries of ``self.v`` are reset to zero,
    except for state ``(0, 0)`` and for states with exactly one available action.

    NOTE(review): the constructor accepts ``gamma``, ``epsilon``, ``alpha`` and
    ``first_visit`` but ignores them; the parent is always initialised with
    ``gamma=1.``, ``alpha=None``, ``first_visit=True``.
    """

    def __init__(self, env, gamma=1.0, epsilon=0.05, alpha=None, first_visit=True):
        super().__init__(env, gamma=1., alpha=None, first_visit=True)

    def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
        """
        Perform the parent's training step, then zero all state values that are not being illustrated.

        :param s: State the action was taken in.
        :param a: Action taken.
        :param r: Reward received.
        :param sp: Successor state.
        :param done: Whether the episode terminated.
        :param info_s: Info for ``s``, forwarded to the parent.
        :param info_sp: Info for ``sp``, forwarded to the parent.
        """
        super().train(s, a, r, sp, done, info_s, info_sp)
        mdp = self.env.mdp
        for state in mdp.nonterminal_states:
            # State (0, 0) and single-action states keep their value.
            if state == (0, 0) or len(mdp.A(state)) == 1:
                continue
            self.v[state] = 0
if __name__ == "__main__":
    env = BookGridEnvironment(render_mode='human')
    agent = SarsaDelayNAgent(env, gamma=1, epsilon=0.1, alpha=0.9, n=1)  # Exam problem.
    # agent = SarsaDelayNAgent(env, gamma=0.95, epsilon=0.1, alpha=.2, n=1)
    env, agent = interactive(env, agent)
    train(env, agent, num_episodes=10)
    # Close the environment after training, as the other lecture scripts do.
    env.close()
def open_play(Agent, method_label, frames_per_second=30, **args):
    """
    Create an open gridworld in ``'human'`` render mode and hand it, together with a
    freshly constructed agent, to ``keyboard_play``.

    :param Agent: Agent class. It is called as ``Agent(env, gamma=0.99, epsilon=0.1, alpha=.5, **args)``.
    :param method_label: Label passed on to ``keyboard_play``.
    :param frames_per_second: Frame rate passed to the environment constructor.
    :param args: Additional keyword arguments for the agent constructor.
    """
    hyperparameters = dict(gamma=0.99, epsilon=0.1, alpha=.5)
    grid_env = OpenGridEnvironment(frames_per_second=frames_per_second, render_mode='human')
    learner = Agent(grid_env, **hyperparameters, **args)
    keyboard_play(grid_env, learner, method_label=method_label)
def play_pacman(env, agent, layout='smallGrid'):
    """
    Train an agent on Pacman for 100 episodes, then run 100 more episodes interactively
    with exploration and learning switched off.

    :param env: The Pacman environment. It is used for the first training phase and closed at the end.
    :param agent: The agent to train. Its ``epsilon`` and ``alpha`` attributes are set to 0 before the second phase.
    :param layout: Not used by this function; kept so existing calls keep working.
    """
    train(env, agent, num_episodes=100)

    env2 = PacmanWinWrapper(env)
    # Wrap env2 (not env): previously the PacmanWinWrapper instance was created and then
    # immediately discarded because the unwrapped environment was passed to interactive().
    env2, agent = interactive(env2, agent)
    agent.epsilon = 0
    agent.alpha = 0
    train(env2, agent, num_episodes=100)
    plt.show()
    env.close()
def play_pacman(env, agent, layout = 'smallGrid'):
    """
    Two-phase Pacman demonstration: 100 training episodes on ``env``, followed by 100
    episodes on an interactive ``PacmanWinWrapper`` around it with ``epsilon`` and
    ``alpha`` set to zero.

    :param env: The Pacman environment; closed when the function finishes.
    :param agent: The agent to train and afterwards run.
    :param layout: Not used inside the function.
    """
    train(env, agent, num_episodes=100)

    wrapped_env, agent = interactive(PacmanWinWrapper(env), agent)
    for attribute in ("epsilon", "alpha"):
        setattr(agent, attribute, 0)

    train(wrapped_env, agent, num_episodes=100)
    plt.show()
    env.close()
/dev/null +++ b/irlc/lectures/lec11/lecture_11_q.py @@ -0,0 +1,10 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +# from irlc.berkley.rl.feature_encoder import SimplePacmanExtractor +from irlc.gridworld.gridworld_environments import BookGridEnvironment +from irlc.lectures.lec10.lecture_10_mc_q_estimation import keyboard_play +from irlc.ex11.q_agent import QAgent + +if __name__ == "__main__": + env = BookGridEnvironment(render_mode='human') + agent = QAgent(env, gamma=0.95, epsilon=0.1, alpha=.2) + keyboard_play(env, agent, method_label="Q-learning") diff --git a/irlc/lectures/lec11/lecture_11_q_cliff.py b/irlc/lectures/lec11/lecture_11_q_cliff.py new file mode 100644 index 0000000000000000000000000000000000000000..421db1fa16764a3b432bd03d4a072f2108dabe77 --- /dev/null +++ b/irlc/lectures/lec11/lecture_11_q_cliff.py @@ -0,0 +1,18 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
+from irlc.gridworld.gridworld_environments import CliffGridEnvironment, CliffGridEnvironment2 +from irlc.ex11.q_agent import QAgent + + +# def cliffwalk(env, agent, method_label="method"): +# agent = PlayWrapper(agent, env) + # env = VideoMonitor(env, agent=agent, fps=100, continious_recording=True, agent_monitor_keys=('pi', 'Q'), render_kwargs={'method_label': method_label}) + # train(env, agent, num_episodes=200) + # env.close() + +from irlc.lectures.lec11.lecture_11_sarsa_cliff import cliffwalk, gamma, alpha, epsi +if __name__ == "__main__": + import numpy as np + np.random.seed(1) + env = CliffGridEnvironment2(zoom=.8, render_mode='human') + agent = QAgent(env, gamma=gamma, epsilon=epsi, alpha=alpha) + cliffwalk(env, agent, method_label="Q-learning") diff --git a/irlc/lectures/lec11/lecture_11_q_open.py b/irlc/lectures/lec11/lecture_11_q_open.py new file mode 100644 index 0000000000000000000000000000000000000000..f0a35a5ba17fde85fb2b10da97413aba4879c5c6 --- /dev/null +++ b/irlc/lectures/lec11/lecture_11_q_open.py @@ -0,0 +1,12 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
def open_play(Agent, method_label, **args):
    """
    Build an ``OpenGridEnvironment`` with default settings, construct an agent for it
    and pass both to ``keyboard_play``.

    :param Agent: Agent class. It is called as ``Agent(env, gamma=0.99, epsilon=0.1, alpha=.5, **args)``.
    :param method_label: Label passed on to ``keyboard_play``.
    :param args: Additional keyword arguments for the agent constructor.
    """
    hyperparameters = dict(gamma=0.99, epsilon=0.1, alpha=.5)
    grid_env = OpenGridEnvironment()
    learner = Agent(grid_env, **hyperparameters, **args)
    keyboard_play(grid_env, learner, method_label=method_label)
def cliffwalk(env, agent, method_label="method"):
    """
    Label the environment and agent, make them interactive and train for 1000 episodes.

    The label is stored as ``env.label`` and under the agent attributes
    ``method_label``, ``label`` and ``method``.

    :param env: The environment to train on. The interactive wrapper returned by ``interactive`` is closed at the end.
    :param agent: The agent to train.
    :param method_label: Text assigned to the label attributes listed above.
    """
    env.label = method_label
    for attribute in ("method_label", "label", "method"):
        setattr(agent, attribute, method_label)

    env, agent = interactive(env, agent)
    train(env, agent, num_episodes=1000)
    env.close()
If this file contains other copyright notices disregard this text. +# from irlc.berkley.rl.feature_encoder import SimplePacmanExtractor +# from irlc.lectures.lecture_09_mc import keyboard_play + +# alpha = 0.5 +# gamma = + +# def open_play(Agent, method_label, **args): +# env = OpenGridEnvironment() +# agent = Agent(env, gamma=0.95, epsilon=0.1, alpha=.5, **args) +# keyboard_play(env, agent, method_label=method_label) + +from irlc.lectures.lec11.lecture_10_sarsa_open import open_play +from irlc.ex10.mc_agent import MCAgent +if __name__ == "__main__": + # env = OpenGridEnvironment() + # agent = (env, gamma=0.95, epsilon=0.1, alpha=.5) + open_play(MCAgent, method_label="MC agent") + # diff --git a/irlc/lectures/lec12/lecture_12_pacman.py b/irlc/lectures/lec12/lecture_12_pacman.py new file mode 100644 index 0000000000000000000000000000000000000000..3e3f9fbe04edc908086220f25715d09827db33c9 --- /dev/null +++ b/irlc/lectures/lec12/lecture_12_pacman.py @@ -0,0 +1,21 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
+from irlc.ex11.semi_grad_q import LinearSemiGradQAgent +from irlc.pacman.pacman_environment import PacmanEnvironment +from irlc.ex01.agent import train +from irlc import interactive +from irlc.lectures.chapter14lectures.lecture11pacman import layout, rns +# from irlc import VideoMonitor + +if __name__ == "__main__": + env = PacmanEnvironment(animate_movement=False, layout=layout) + + n, agent = rns[-1] + agent = agent(env) + # env, agent = interactive(env, agent) + + train(env, agent, num_episodes=100, max_runs=20) + env2 = PacmanEnvironment(animate_movement=True, layout=layout, render_mode='human') + # agent.env = env2 + env2, agent = interactive(env2, agent) + train(env2, agent, num_episodes=100, max_runs=20) + env2.close() diff --git a/irlc/lectures/lec12/lecture_12_sarsa_lamda_open.py b/irlc/lectures/lec12/lecture_12_sarsa_lamda_open.py new file mode 100644 index 0000000000000000000000000000000000000000..0e1a233749e000ec65dbdeacecd29cb443f122e4 --- /dev/null +++ b/irlc/lectures/lec12/lecture_12_sarsa_lamda_open.py @@ -0,0 +1,6 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +from irlc.lectures.lec11.lecture_10_sarsa_open import open_play +from irlc.exam_tabular_examples.sarsa_lambda_delay import SarsaLambdaDelayAgent + +if __name__ == "__main__": + open_play(SarsaLambdaDelayAgent, method_label="Sarsa(Lambda)", lamb=0.8) diff --git a/irlc/lectures/lec12/lecture_12_sarsa_nstep.py b/irlc/lectures/lec12/lecture_12_sarsa_nstep.py new file mode 100644 index 0000000000000000000000000000000000000000..0f04c1abc96ad14ca75d458ecbc882eb01354d3c --- /dev/null +++ b/irlc/lectures/lec12/lecture_12_sarsa_nstep.py @@ -0,0 +1,13 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
+from irlc.gridworld.gridworld_environments import OpenGridEnvironment +from irlc import train +from irlc.lectures.lec11.lecture_10_sarsa_open import open_play +from irlc.exam_tabular_examples.sarsa_nstep_delay import SarsaDelayNAgent + +if __name__ == "__main__": + n = 8 + env = OpenGridEnvironment() + agent = SarsaDelayNAgent(env, n=n) + train(env, agent, num_episodes=100) + + open_play(SarsaDelayNAgent, method_label=f"Sarsa n={n}", n=n) diff --git a/irlc/lectures/lec12/lecture_12_sarsa_open.py b/irlc/lectures/lec12/lecture_12_sarsa_open.py new file mode 100644 index 0000000000000000000000000000000000000000..dfba5b0e37f8668ef3b847155adcb15d52734e1c --- /dev/null +++ b/irlc/lectures/lec12/lecture_12_sarsa_open.py @@ -0,0 +1,11 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +from irlc import train +from irlc.gridworld.gridworld_environments import OpenGridEnvironment +from irlc.lectures.lec11.lecture_10_sarsa_open import open_play +from irlc.exam_tabular_examples.sarsa_nstep_delay import SarsaDelayNAgent + +if __name__ == "__main__": + env = OpenGridEnvironment() + agent = SarsaDelayNAgent(env, n=1) + train(env, agent, num_episodes=100) + open_play(SarsaDelayNAgent, method_label=f"Sarsa") diff --git a/irlc/lectures/lec13/double_q_viz.py b/irlc/lectures/lec13/double_q_viz.py new file mode 100644 index 0000000000000000000000000000000000000000..cca339d46fbdde4ffc94433b3e146b9ae3145a69 --- /dev/null +++ b/irlc/lectures/lec13/double_q_viz.py @@ -0,0 +1,71 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
class DoubleQVizAgent(TabularDoubleQ):
    """
    Double-Q agent that additionally maintains a single table ``self.Q`` holding the
    average of the two tables ``Q1`` and ``Q2`` for every state-action pair it has trained on.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.Q = TabularQ(self.env)

    def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
        """ Run the parent's update and refresh the averaged entry ``Q[s, a]``. """
        super().train(s, a, r, sp, done, info_s, info_sp)
        self.Q[s, a] = (self.Q1[s, a] + self.Q2[s, a]) / 2


def train_cliff(runs=4, extension="long", save_pdf=False, alpha=0.02, num_episodes=5000):
    """
    Part 1: Cliffwalking. Train Q-learning, Sarsa and double-Q agents on the cliff gridworld.

    :param runs: Number of times the three agents are re-created and trained.
    :param extension: Tag inserted into the experiment directory names, the plot title and the pdf name.
    :param save_pdf: If true, plot the experiments of the last run, save the figure and show it.
    :param alpha: Learning rate used for all three agents.
    :param num_episodes: Number of training episodes per agent.
    :return: The list of agents from the last run and the training environment.
    """
    env = CliffGridEnvironment(zoom=1)
    epsilon = 0.1
    for _ in range(runs):
        agents = [QAgent(env, gamma=1, epsilon=epsilon, alpha=alpha),
                  SarsaAgent(env, gamma=1, epsilon=epsilon, alpha=alpha),
                  DoubleQVizAgent(env, gamma=1, epsilon=epsilon, alpha=alpha)]

        experiments = []
        for agent in agents:
            expn = f"experiments/doubleq_cliffwalk_{extension}_{str(agent)}"
            train(env, agent, expn, num_episodes=num_episodes, max_runs=1e6)
            experiments.append(expn)
    # NOTE(review): the original nesting was lost in the flattened source; plotting is assumed
    # to happen once, after all runs have completed -- confirm against the repository.
    if save_pdf:
        main_plot(experiments, smoothing_window=20, resample_ticks=500)
        plt.ylim([-100, 50])
        plt.title(f"Double-Q learning on Cliffwalk ({extension})")
        savepdf(f"double_Q_learning_cliff_{extension}")
        plt.show()
    return agents, env


def grid_experiment(runs=20, extension="long", alpha=0.02, num_episodes=5000):
    """
    Train the three agents with :func:`train_cliff` and save one gridworld pdf per agent.

    :param runs: Forwarded to ``train_cliff``.
    :param extension: Forwarded to ``train_cliff`` and used in the pdf file names.
    :param alpha: Forwarded to ``train_cliff``.
    :param num_episodes: Forwarded to ``train_cliff``.
    """
    from irlc import interactive

    agents, env = train_cliff(runs=runs, extension=extension, save_pdf=True, alpha=alpha, num_episodes=num_episodes)
    for na, trained_agent in enumerate(agents):
        # NOTE(review): other scripts create environments with render_mode='human'; the
        # keyword view_mode='human' is kept unchanged here -- confirm that it is intended.
        env2 = CliffGridEnvironment(zoom=1, view_mode='human')
        env2, _ = interactive(env2, agent=trained_agent)
        env2.savepdf(f"doubleq_cliff_{extension}_agent_{na}")
        env2.close()

    env.close()
def sutton_maze_play(Agent, method_label="Q-learning agent", **kwargs):
    """
    Create a ``SuttonMazeEnvironment`` in ``'human'`` render mode, construct an agent for it
    and pass both to ``keyboard_play``.

    :param Agent: Agent class. It is called as ``Agent(env, gamma=0.98, epsilon=0.1, alpha=.5, **kwargs)``.
    :param method_label: Label passed on to ``keyboard_play``.
    :param kwargs: Additional keyword arguments for the agent constructor.
    """
    hyperparameters = dict(gamma=0.98, epsilon=0.1, alpha=.5)
    maze_env = SuttonMazeEnvironment(render_mode='human')
    learner = Agent(maze_env, **hyperparameters, **kwargs)
    keyboard_play(maze_env, learner, method_label=method_label)
+from irlc.lectures.lec11.lecture_10_sarsa_open import open_play +from irlc.ex11.q_agent import QAgent +from irlc.ex13.dyna_q import DynaQ +from irlc.lectures.lec10.lecture_10_mc_q_estimation import keyboard_play +from irlc.gridworld.gridworld_environments import SuttonMazeEnvironment +from irlc.lectures.lec13.lecture_13_Q_maze import sutton_maze_play + +if __name__ == "__main__": + sutton_maze_play(DynaQ, method_label="DynaQ (n=5)", n=5) diff --git a/irlc/lectures/lec13/lecture_13_sarsa_lambda_maze.py b/irlc/lectures/lec13/lecture_13_sarsa_lambda_maze.py new file mode 100644 index 0000000000000000000000000000000000000000..4336879fe4d49f99c6852847a968491829afc77f --- /dev/null +++ b/irlc/lectures/lec13/lecture_13_sarsa_lambda_maze.py @@ -0,0 +1,6 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +from irlc.lectures.lec13.lecture_13_Q_maze import sutton_maze_play +from irlc.ex12.sarsa_lambda_agent import SarsaLambdaAgent + +if __name__ == "__main__": + sutton_maze_play(SarsaLambdaAgent, method_label="Sarsa(Lambda=0.9)", lamb=0.9) diff --git a/irlc/lectures/readme.md b/irlc/lectures/readme.md new file mode 100644 index 0000000000000000000000000000000000000000..1d3ccdbfd57fe2976e3fb29c3256386f548da3ff --- /dev/null +++ b/irlc/lectures/readme.md @@ -0,0 +1,6 @@ +# In-class examples + +This folder contains various examples used throughout class. You should be able to run most of the examples +if you find it helpful (and many of the examples are simply running the exercise code), however, +in some instances I have made small changes to the exercises to provide additional visualizations etc. Also note that the code is sometimes not +well organized -- in other words, the folder is provided "as is" for those who find it helpful, and you are free to ignore it. 
diff --git a/irlc/pacman/__init__.py b/irlc/pacman/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..991666419eb8411137ce96f826e7d6883892af7b --- /dev/null +++ b/irlc/pacman/__init__.py @@ -0,0 +1,2 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +from irlc.pacman.pacman_environment import PacmanEnvironment diff --git a/irlc/pacman/__pycache__/__init__.cpython-311.pyc b/irlc/pacman/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..59a1fd223427c1d692a1f9db502ea71573779c82 Binary files /dev/null and b/irlc/pacman/__pycache__/__init__.cpython-311.pyc differ diff --git a/irlc/pacman/__pycache__/gamestate.cpython-311.pyc b/irlc/pacman/__pycache__/gamestate.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9aa794223cbc54858e8d02ec80d33c6986577e08 Binary files /dev/null and b/irlc/pacman/__pycache__/gamestate.cpython-311.pyc differ diff --git a/irlc/pacman/__pycache__/layout.cpython-311.pyc b/irlc/pacman/__pycache__/layout.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..03091d715903d53e6e467e17b514095681882ca2 Binary files /dev/null and b/irlc/pacman/__pycache__/layout.cpython-311.pyc differ diff --git a/irlc/pacman/__pycache__/pacman_environment.cpython-311.pyc b/irlc/pacman/__pycache__/pacman_environment.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b601df57ba77d362482407b7d03491083d23a656 Binary files /dev/null and b/irlc/pacman/__pycache__/pacman_environment.cpython-311.pyc differ diff --git a/irlc/pacman/__pycache__/pacman_graphics_display.cpython-311.pyc b/irlc/pacman/__pycache__/pacman_graphics_display.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6b583f2ce171807c07a0c845e957334a277a79ec Binary files 
/dev/null and b/irlc/pacman/__pycache__/pacman_graphics_display.cpython-311.pyc differ diff --git a/irlc/pacman/__pycache__/pacman_text_display.cpython-311.pyc b/irlc/pacman/__pycache__/pacman_text_display.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3cf59db02dff7911114b3da4081c6965dc0c4f08 Binary files /dev/null and b/irlc/pacman/__pycache__/pacman_text_display.cpython-311.pyc differ diff --git a/irlc/pacman/__pycache__/pacman_utils.cpython-311.pyc b/irlc/pacman/__pycache__/pacman_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8a7be403248e32e151ccbef8ab4ac2bfd4a8ff9b Binary files /dev/null and b/irlc/pacman/__pycache__/pacman_utils.cpython-311.pyc differ diff --git a/irlc/pacman/feature_extractor.py b/irlc/pacman/feature_extractor.py new file mode 100644 index 0000000000000000000000000000000000000000..7a409464496293476228ba0d699316eab62c143e --- /dev/null +++ b/irlc/pacman/feature_extractor.py @@ -0,0 +1,109 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +# feature_extractor.py +# -------------------- +# Licensing Information: You are free to use or extend these projects for +# educational purposes provided that (1) you do not distribute or publish +# solutions, (2) you retain this notice, and (3) you provide clear +# attribution to UC Berkeley, including a link to http://ai.berkeley.edu. +# +# Attribution Information: The Pacman AI projects were developed at UC Berkeley. +# The core projects and autograders were primarily created by John DeNero +# (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). +# Student side autograding was added by Brad Miller, Nick Hay, and +# Pieter Abbeel (pabbeel@cs.berkeley.edu). 
## Other classes
class FeatureExtractor:
    """ Base class for feature extractors; subclasses must implement :meth:`getFeatures`. """

    def getFeatures(self, state, action):
        """
        Returns a dict from features to counts
        Usually, the count will just be 1.0 for
        indicator functions.
        """
        raise NotImplementedError()


class IdentityExtractor(FeatureExtractor):
    """ Uses the ``(state, action)`` pair itself as the only feature. """

    def getFeatures(self, state, action):
        """ Return a mapping (default value 0) in which ``(state, action)`` maps to 1.0. """
        from collections import defaultdict
        feats = defaultdict(int)
        feats[(state, action)] = 1.0
        return feats


class CoordinateExtractor(FeatureExtractor):
    """ Indicator features for the state, its two coordinates and the action. """

    def getFeatures(self, state, action):
        """
        Return a mapping (default value 0) with the keys ``state``, ``'x=<state[0]>'``,
        ``'y=<state[1]>'`` and ``'action=<action>'``, each set to 1.0.
        """
        from collections import defaultdict
        feats = defaultdict(int)
        feats[state] = 1.0
        feats['x=%d' % state[0]] = 1.0
        # The y-feature is built from the second coordinate. It was previously built from
        # state[0], which made it a duplicate of the x-feature.
        feats['y=%d' % state[1]] = 1.0
        feats['action=%s' % action] = 1.0
        return feats


def closestFood(pos, food, walls):
    """
    closestFood -- this is similar to the function that we have
    worked on in the search project; here its all in one place

    Breadth-first search from ``pos`` over the neighbours returned by
    ``Actions.getLegalNeighbors``.

    :param pos: Start position; ``pos[0]`` and ``pos[1]`` are used as grid indices.
    :param food: Grid indexed as ``food[x][y]``; a truthy entry marks food.
    :param walls: Passed unchanged to ``Actions.getLegalNeighbors``.
    :return: Number of steps to the first food location found, or ``None`` if no food is reachable.
    """
    from collections import deque
    # A deque gives O(1) pops from the front; list.pop(0) is O(n) per call.
    fringe = deque([(pos[0], pos[1], 0)])
    expanded = set()
    while fringe:
        pos_x, pos_y, dist = fringe.popleft()
        if (pos_x, pos_y) in expanded:
            continue
        expanded.add((pos_x, pos_y))
        # if we find a food at this location then exit
        if food[pos_x][pos_y]:
            return dist
        # otherwise spread out from the location to its neighbours
        for nbr_x, nbr_y in Actions.getLegalNeighbors((pos_x, pos_y), walls):
            fringe.append((nbr_x, nbr_y, dist + 1))
    # no food found
    return None


class SimpleExtractor(FeatureExtractor):
    """
    Returns simple features for a basic reflex Pacman:
    - whether food will be eaten
    - how far away the next food is
    - whether a ghost collision is imminent
    - whether a ghost is one step away
    """

    def getFeatures(self, state, action):
        """
        Compute the features for taking ``action`` in ``state``.

        :param state: Game state providing ``getFood``, ``getWalls``, ``getGhostPositions`` and ``getPacmanPosition``.
        :param action: Direction understood by ``Actions.directionToVector``.
        :return: A plain ``dict`` of the features that were set, with every value divided by 10.
        """
        # extract the grid of food and wall locations and get the ghost locations
        food = state.getFood()
        walls = state.getWalls()
        ghosts = state.getGhostPositions()

        from collections import defaultdict
        features = defaultdict(int)

        features["bias"] = 1.0

        # compute the location of pacman after he takes the action
        x, y = state.getPacmanPosition()
        dx, dy = Actions.directionToVector(action)
        next_x, next_y = int(x + dx), int(y + dy)

        # count the number of ghosts 1-step away
        features["#-of-ghosts-1-step-away"] = sum((next_x, next_y) in Actions.getLegalNeighbors(g, walls) for g in ghosts)

        # if there is no danger of ghosts then add the food feature
        if not features["#-of-ghosts-1-step-away"] and food[next_x][next_y]:
            features["eats-food"] = 1.0

        dist = closestFood((next_x, next_y), food, walls)
        if dist is not None:
            # make the distance a number less than one otherwise the update
            # will diverge wildly
            features["closest-food"] = float(dist) / (walls.width * walls.height)
        # Scale all features down by a factor 10; note that the result is a regular dict.
        features = {k: v / 10.0 for k, v in features.items()}
        return features
###################################################
# YOUR INTERFACE TO THE PACMAN WORLD: A GameState #
###################################################

class GameState:
    r"""
    A `GameState` specifies the full game state, including the food, capsules,
    agent configurations and score changes.

    `GameState`\ s are used by the Game object to capture the actual state of the game and
    can be used by agents to reason about the game.

    Much of the information in a GameState is stored in a `GameStateData` object. We
    strongly suggest that you access that data via the accessor methods below rather
    than referring to the `GameStateData` object directly.

    Note that in classic Pacman, Pacman is always agent 0.

    To get you started, here are some examples.

    .. runblock:: pycon

        >>> from irlc.pacman.pacman_environment import PacmanEnvironment, very_small_haunted_maze
        >>> env = PacmanEnvironment(layout_str=very_small_haunted_maze)
        >>> state, _ = env.reset() # Get starting state
        >>> print(state)

    In the above code, `state` is a `GameState` instance -- i.e. has all the methods found in
    *this* class. So for instance to know if the game is won or lost you can do:

    .. runblock:: pycon

        >>> from irlc.pacman.pacman_environment import PacmanEnvironment, very_small_haunted_maze
        >>> env = PacmanEnvironment(layout_str=very_small_haunted_maze)
        >>> state, _ = env.reset() # Get starting state
        >>> print("Did we win?", state.is_won(), "did we loose?", state.is_lost())

    Or to get the available actions, and then the *next* state representing what occurs when you take an action `a`:

    .. runblock:: pycon

        >>> from irlc.pacman.pacman_environment import PacmanEnvironment, very_small_haunted_maze
        >>> env = PacmanEnvironment(layout_str=very_small_haunted_maze)
        >>> state, _ = env.reset() # Get starting state
        >>> actions = state.A()
        >>> print("Available actions are", actions)
        >>> next_state = state.f(actions[0]) # Take the first action
        >>> print(next_state) # Result of taking the first of the available actions.

    When a ghost moves, it will select randomly between the available actions. Thus, the chance of a single move is :python:`1/len(state.A())`.
    """
    #####################################################
    # 02465-relevant stuff: These methods allows you to #
    # interact with the game-state. See comments above. #
    #####################################################

    def player(self) -> int:
        """Return the current player.

        The players take turns. Initially ``player=0``, meaning it is Pacman (your) turn, and in case there are ghosts
        player will then increment until all ghosts have moved at which point ``player = 0`` again and the game is ready
        for the next step.

        :return: The id of the player who will make the next move.
        """
        return self._player

    def players(self):
        """Return the total number of players.

        :return: Return the number of ghosts + 1 (pacman).
        """
        return self.getNumAgents()

    def A(self):
        """Return the available actions for the current player in this state.

        If the state is won/lost, the actions will be just the stop-action: ``["Stop"]``.

        :return: Available actions as a list.
        """
        if self.is_won() or self.is_lost():
            return [Directions.STOP]
        else:
            return self.getLegalActions(self.player())

    def f(self, a: str) -> "GameState":
        """Let the current player take action ``a``.

        This will return a new GameState corresponding to the current player taking an action.
        In a won/lost state the state itself is returned unchanged.

        :param a: The action to take.
        :return: The next GameState.
        """
        if self.is_won() or self.is_lost():
            return self

        suc = self.generateSuccessor(self.player(), a)
        # generateSuccessor creates the successor with _player = 0; advance the turn here.
        suc._player = (self.player() + 1) % self.getNumAgents()
        return suc

    def is_lost(self):
        """Determine if this is a lost game.

        :return: ``True`` if this GameState corresponds to a lost game (a ghost ate pacman)
        """
        return self.data._lose

    def is_won(self):
        """Determine if this is a won game.

        :return: ``True`` if this GameState corresponds to a won game (all pellets eaten)
        """
        return self.data._win

    ##################################################################################################
    # End of 02465-related stuff. These methods are internal to the game and should **not** be used. #
    ##################################################################################################

    # static variable keeps track of which states have had generateSuccessor called on them
    # (and the successors produced).
    # NOTE(review): every state passing through generateSuccessor is retained here until
    # getAndResetExplored() is called, so this set grows for the lifetime of the process.
    explored = set()

    @staticmethod
    def getAndResetExplored():
        """Return a copy of the explored-state set and replace it with an empty set."""
        tmp = GameState.explored.copy()
        GameState.explored = set()
        return tmp

    def getLegalActions(self, agentIndex=0):
        """Return the legal actions for the agent specified (an empty list in a won/lost state)."""
        if self.is_won() or self.is_lost():
            return []

        if agentIndex == 0:  # Pacman is moving
            return PacmanRules.getLegalActions(self)
        else:
            return GhostRules.getLegalActions(self, agentIndex)

    def generateSuccessor(self, agentIndex, action):
        """Return the successor state after the specified agent takes the action.

        :param agentIndex: 0 for Pacman, otherwise the index of the ghost that moves.
        :param action: The action to apply; must be legal for that agent.
        :return: A new GameState (with ``_player`` reset to 0; see :meth:`f`).
        :raises Exception: If this state is already won or lost, or the action is illegal.
        """
        # Check that successors exist
        if self.is_won() or self.is_lost():
            raise Exception('Can\'t generate a successor of a terminal state.')

        # Copy current state
        state = GameState(self)

        # Let agent's logic deal with its action's effects on the board
        if agentIndex == 0:  # Pacman is moving
            state.data._eaten = [False for i in range(state.getNumAgents())]
            PacmanRules.applyAction(state, action)
        else:  # A ghost is moving
            GhostRules.applyAction(state, action, agentIndex)

        # Time passes
        if agentIndex == 0:
            state.data.scoreChange += -TIME_PENALTY  # Penalty for waiting around
        else:
            GhostRules.decrementTimer(state.data.agentStates[agentIndex])

        # Resolve multi-agent effects
        GhostRules.checkDeath(state, agentIndex)

        # Book keeping
        state.data._agentMoved = agentIndex
        state.data.score += state.data.scoreChange
        GameState.explored.add(self)
        GameState.explored.add(state)
        return state

    def getLegalPacmanActions(self):
        """Return the legal actions for Pacman (agent 0)."""
        return self.getLegalActions(0)

    def generatePacmanSuccessor(self, action):
        """Generates the successor state after the specified pacman move."""
        return self.generateSuccessor(0, action)

    def getPacmanState(self):
        """Return a copy of the AgentState object for pacman (in pacman_utils.py).

        state.pos gives the current position
        state.direction gives the travel vector
        """
        return self.data.agentStates[0].copy()

    def getPacmanPosition(self):
        """Return the position of agent 0 (Pacman)."""
        return self.data.agentStates[0].getPosition()

    def getGhostStates(self):
        """Return the agent states of all ghosts (every agent except index 0)."""
        return self.data.agentStates[1:]

    def getGhostState(self, agentIndex):
        """Return the agent state of ghost ``agentIndex``; raises if the index is 0 or out of range."""
        if agentIndex == 0 or agentIndex >= self.getNumAgents():
            raise Exception("Invalid index passed to getGhostState")
        return self.data.agentStates[agentIndex]

    def getGhostPosition(self, agentIndex):
        """Return the position of ghost ``agentIndex``; raises if the index is 0 (Pacman)."""
        if agentIndex == 0:
            raise Exception("Pacman's index passed to getGhostPosition")
        return self.data.agentStates[agentIndex].getPosition()

    def getGhostPositions(self):
        """Return the positions of all ghosts as a list."""
        return [s.getPosition() for s in self.getGhostStates()]

    def getNumAgents(self):
        """Return the number of agents (Pacman plus ghosts)."""
        return len(self.data.agentStates)

    def getScore(self):
        """Return the current score as a float."""
        return float(self.data.score)

    def getCapsules(self):
        """Returns a list of positions (x,y) of the remaining capsules."""
        return self.data.capsules

    def getNumFood(self):
        """Return the number of remaining food pellets."""
        return self.data.food.count()

    def getFood(self):
        """Returns a Grid of boolean food indicator variables.

        Grids can be accessed via list notation, so to check
        if there is food at (x,y), just call

        currentFood = state.getFood()
        if currentFood[x][y] == True: ...
        """
        return self.data.food

    def getWalls(self):
        """Returns a Grid of boolean wall indicator variables.

        Grids can be accessed via list notation, so to check
        if there is a wall at (x,y), just call

        walls = state.getWalls()
        if walls[x][y] == True: ...
        """
        return self.data.layout.walls

    def hasFood(self, x, y):
        """Return whether there is food at ``(x, y)``."""
        return self.data.food[x][y]

    def hasWall(self, x, y):
        """Return whether there is a wall at ``(x, y)``."""
        return self.data.layout.walls[x][y]

    #############################################
    #             Helper methods:               #
    # You shouldn't need to call these directly #
    #############################################

    def __init__(self, prevState=None):
        """Generates a new state by copying information from its predecessor.

        :param prevState: State to copy the data from, or ``None`` to create an empty initial state.
        """
        # Identity test: ``prevState != None`` would be routed through the custom __eq__ below.
        if prevState is not None:
            self.data = GameStateData(prevState.data)
        else:  # Initial state
            self.data = GameStateData()
        # The turn counter always starts at 0; f() sets it on the successor it returns.
        self._player = 0

    def deepCopy(self):
        """Return a copy of this state whose data has been deep-copied."""
        state = GameState(self)
        state.data = self.data.deepCopy()
        # Fixed: the constructor resets _player to 0, so without this a copy taken while it
        # is a ghost's turn would claim it is Pacman's turn.
        state._player = self._player
        return state

    def __eq__(self, other):
        """Allows two states to be compared (by their ``data`` only)."""
        return hasattr(other, 'data') and self.data == other.data

    def __hash__(self):
        """Allows states to be keys of dictionaries."""
        return hash(self.data)

    def __str__(self):
        return str(self.data)

    def initialize(self, layout, numGhostAgents=1000):
        """Creates an initial game state from a layout array (see layout.py)."""
        self.data.initialize(layout, numGhostAgents)

############################################################################
#                     THE HIDDEN SECRETS OF PACMAN                         #
#                                                                          #
# You shouldn't need to look through the code in this section of the file. #
############################################################################

SCARED_TIME = 40    # Moves ghosts are scared
COLLISION_TOLERANCE = 0.7 # How close ghosts must be to Pacman to kill
TIME_PENALTY = 1 # Number of points lost each round

class ClassicGameRules:
    """
    These game rules manage the control flow of a game, deciding when
    and how the game starts and ends.
    """
    def __init__(self, timeout=30):
        self.timeout = timeout
        # Default so win()/lose() work even if they are reached before newGame() sets it.
        self.quiet = False

    def newGame(self, layout, pacmanAgent, ghostAgents, quiet=False, catchExceptions=False, time_penalty=TIME_PENALTY):
        """Create a Game for ``layout`` and remember a deep copy of its initial state.

        :param layout: The layout to play on.
        :param pacmanAgent: The agent controlling Pacman.
        :param ghostAgents: List of ghost agents; at most ``layout.getNumGhosts()`` are used.
        :param quiet: If truthy, win/lose messages are not printed.
        :param catchExceptions: Forwarded to ``Game``.
        :param time_penalty: Accepted for compatibility; not used by this method.
        :return: The new ``Game`` with ``game.state`` set to the initial state.
        """
        agents = [pacmanAgent] + ghostAgents[:layout.getNumGhosts()]
        initState = GameState()  # Time penalty is my idea
        initState.initialize(layout, len(ghostAgents))
        game = Game(agents=agents, rules=self, catchExceptions=catchExceptions)
        game.state = initState
        self.initialState = initState.deepCopy()
        self.quiet = quiet
        return game

    def process(self, state, game):
        """
        Checks to see whether it is time to end the game.
        """
        if state.is_won(): self.win(state, game)
        if state.is_lost(): self.lose(state, game)

    def win(self, state, game):
        if not self.quiet: print("Pacman emerges victorious! Score: %d" % state.data.score)
        game.gameOver = True

    def lose(self, state, game):
        if not self.quiet: print("Pacman died! Score: %d" % state.data.score)
        game.gameOver = True

    def getProgress(self, game):
        """Return remaining food divided by the food count of the initial state."""
        return float(game.state.getNumFood()) / self.initialState.getNumFood()

    def agentCrash(self, game, agentIndex):
        if agentIndex == 0:
            print("Pacman crashed")
        else:
            print("A ghost crashed")

    def getMaxTotalTime(self, agentIndex):
        return self.timeout

    def getMaxStartupTime(self, agentIndex):
        return self.timeout

    def getMoveWarningTime(self, agentIndex):
        return self.timeout

    def getMoveTimeout(self, agentIndex):
        return self.timeout

    def getMaxTimeWarnings(self, agentIndex):
        return 0

class PacmanRules:
    """
    These functions govern how pacman interacts with his environment under
    the classic game rules.
    """
    PACMAN_SPEED=1

    @staticmethod
    def getLegalActions(state):
        """
        Returns a list of possible actions.
        """
        return Actions.getPossibleActions(state.getPacmanState().configuration, state.data.layout.walls)

    @staticmethod
    def applyAction(state, action):
        """
        Edits the state to reflect the results of the action.
        """
        legal = PacmanRules.getLegalActions(state)
        if action not in legal:
            raise Exception("Illegal action " + str(action))

        pacmanState = state.data.agentStates[0]

        # Update Configuration
        vector = Actions.directionToVector(action, PacmanRules.PACMAN_SPEED)
        pacmanState.configuration = pacmanState.configuration.generateSuccessor(vector)

        # Eat (local renamed from ``next`` so the builtin is not shadowed)
        next_pos = pacmanState.configuration.getPosition()
        nearest = nearestPoint(next_pos)
        if manhattanDistance(nearest, next_pos) <= 0.5:
            # Remove food
            PacmanRules.consume(nearest, state)

    @staticmethod
    def consume(position, state):
        """Eat whatever is at ``position``: food (+10, and +500 and a win on the last pellet) and/or a capsule."""
        x, y = position
        # Eat food
        if state.data.food[x][y]:
            state.data.scoreChange += 10
            # Copy before modifying so the food grid of the predecessor state is left untouched.
            state.data.food = state.data.food.copy()
            state.data.food[x][y] = False
            state.data._foodEaten = position
            # TODO: cache numFood?
            numFood = state.getNumFood()
            if numFood == 0 and not state.data._lose:
                state.data.scoreChange += 500
                state.data._win = True
        # Eat capsule
        if position in state.getCapsules():
            state.data.capsules.remove(position)
            state.data._capsuleEaten = position
            # Reset all ghosts' scared timers
            for index in range(1, len(state.data.agentStates)):
                state.data.agentStates[index].scaredTimer = SCARED_TIME

class GhostRules:
    """
    These functions dictate how ghosts interact with their environment.
    """
    GHOST_SPEED=1.0

    @staticmethod
    def getLegalActions(state, ghostIndex):
        """
        Ghosts cannot stop, and cannot turn around unless they
        reach a dead end, but can turn 90 degrees at intersections.
        """
        conf = state.getGhostState(ghostIndex).configuration
        possibleActions = Actions.getPossibleActions(conf, state.data.layout.walls)
        reverse = Actions.reverseDirection(conf.direction)
        if Directions.STOP in possibleActions:
            possibleActions.remove(Directions.STOP)
        if reverse in possibleActions and len(possibleActions) > 1:
            possibleActions.remove(reverse)
        return possibleActions

    @staticmethod
    def applyAction(state, action, ghostIndex):
        """Move ghost ``ghostIndex`` in direction ``action`` (at half speed while scared)."""
        legal = GhostRules.getLegalActions(state, ghostIndex)
        if action not in legal:
            raise Exception("Illegal ghost action " + str(action))

        ghostState = state.data.agentStates[ghostIndex]
        speed = GhostRules.GHOST_SPEED
        if ghostState.scaredTimer > 0: speed /= 2.0
        vector = Actions.directionToVector(action, speed)
        ghostState.configuration = ghostState.configuration.generateSuccessor(vector)

    @staticmethod
    def decrementTimer(ghostState):
        """Decrease the scared timer by one (not below 0), snapping the ghost to the grid when it is about to expire."""
        timer = ghostState.scaredTimer
        if timer == 1:
            ghostState.configuration.pos = nearestPoint(ghostState.configuration.pos)
        ghostState.scaredTimer = max(0, timer - 1)

    @staticmethod
    def checkDeath(state, agentIndex):
        """Resolve Pacman/ghost collisions after agent ``agentIndex`` has moved."""
        pacmanPosition = state.getPacmanPosition()
        if agentIndex == 0:  # Pacman just moved; Anyone can kill him
            for index in range(1, len(state.data.agentStates)):
                ghostState = state.data.agentStates[index]
                ghostPosition = ghostState.configuration.getPosition()
                if GhostRules.canKill(pacmanPosition, ghostPosition):
                    GhostRules.collide(state, ghostState, index)
        else:
            ghostState = state.data.agentStates[agentIndex]
            ghostPosition = ghostState.configuration.getPosition()
            if GhostRules.canKill(pacmanPosition, ghostPosition):
                GhostRules.collide(state, ghostState, agentIndex)

    @staticmethod
    def collide(state, ghostState, agentIndex):
        """Handle a collision: a scared ghost is eaten (+200) and reset, otherwise Pacman loses (-500)."""
        if ghostState.scaredTimer > 0:
            state.data.scoreChange += 200
            GhostRules.placeGhost(state, ghostState)
            ghostState.scaredTimer = 0
            # Added for first-person
            state.data._eaten[agentIndex] = True
        else:
            if not state.data._win:
                state.data.scoreChange -= 500
                state.data._lose = True

    @staticmethod
    def canKill(pacmanPosition, ghostPosition):
        """Return whether the two positions are within ``COLLISION_TOLERANCE`` (Manhattan distance)."""
        return manhattanDistance(ghostPosition, pacmanPosition) <= COLLISION_TOLERANCE

    @staticmethod
    def placeGhost(state, ghostState):
        """Send the ghost back to its start configuration."""
        ghostState.configuration = ghostState.start

#############################
# FRAMEWORK TO START A GAME #
#############################

def default(str):
    """Append the optparse default-value placeholder to a help string."""
    return str + ' [Default: %default]'

def parseAgentArgs(str):
    """Parse a comma separated option string such as ``"opt1=val1,opt2,opt3=val3"``.

    :param str: The option string, or ``None``. (The name shadows the builtin but is kept so
        existing keyword callers keep working.)
    :return: Dict mapping each key to its value (a string); keys without ``=`` map to ``1``.
    """
    if str is None:
        return {}
    opts = {}
    for p in str.split(','):
        if '=' in p:
            # Split on the first '=' only, so values that themselves contain '=' no longer
            # raise "too many values to unpack".
            key, val = p.split('=', 1)
        else:
            key, val = p, 1
        opts[key] = val
    return opts
+# """ +# from optparse import OptionParser +# usageStr = """ +# USAGE: python gamestate.py <options> +# EXAMPLES: (1) python gamestate.py +# - starts an interactive game +# (2) python gamestate.py --layout smallClassic --zoom 2 +# OR python gamestate.py -l smallClassic -z 2 +# - starts an interactive game on a smaller board, zoomed in +# """ +# parser = OptionParser(usageStr) +# +# parser.add_option('-n', '--numGames', dest='numGames', type='int', +# help=default('the number of GAMES to play'), metavar='GAMES', default=1) +# parser.add_option('-l', '--layout', dest='layout', +# help=default('the LAYOUT_FILE from which to load the map layout'), +# metavar='LAYOUT_FILE', default='mediumClassic') +# parser.add_option('-p', '--pacman', dest='pacman', +# help=default('the agent TYPE in the pacmanAgents module to use'), +# metavar='TYPE', default='KeyboardAgent') +# parser.add_option('-t', '--textGraphics', action='store_true', dest='textGraphics', +# help='Display output as text only', default=False) +# parser.add_option('-q', '--quietTextGraphics', action='store_true', dest='quietGraphics', +# help='Generate minimal output and no graphics', default=False) +# parser.add_option('-g', '--ghosts', dest='ghost', +# help=default('the ghost agent TYPE in the ghostAgents module to use'), +# metavar = 'TYPE', default='RandomGhost') +# parser.add_option('-k', '--numghosts', type='int', dest='numGhosts', +# help=default('The maximum number of ghosts to use'), default=4) +# parser.add_option('-z', '--zoom', type='float', dest='zoom', +# help=default('Zoom the size of the graphics window'), default=1.0) +# parser.add_option('-f', '--fixRandomSeed', action='store_true', dest='fixRandomSeed', +# help='Fixes the random seed to always play the same game', default=False) +# parser.add_option('-r', '--recordActions', action='store_true', dest='record', +# help='Writes game histories to a file (named by the time they were played)', default=False) +# parser.add_option('--replay', 
dest='gameToReplay', +# help='A recorded game file (pickle) to replay', default=None) +# parser.add_option('-a','--agentArgs',dest='agentArgs', +# help='Comma separated values sent to agent. e.g. "opt1=val1,opt2,opt3=val3"') +# parser.add_option('-x', '--numTraining', dest='numTraining', type='int', +# help=default('How many episodes are training (suppresses output)'), default=0) +# parser.add_option('--frameTime', dest='frameTime', type='float', +# help=default('Time to delay between frames; <0 means keyboard'), default=0.1) +# parser.add_option('-c', '--catchExceptions', action='store_true', dest='catchExceptions', +# help='Turns on exception handling and timeouts during games', default=False) +# parser.add_option('--timeout', dest='timeout', type='int', +# help=default('Maximum length of time an agent can spend computing in a single game'), default=30) +# +# options, otherjunk = parser.parse_args(argv) +# if len(otherjunk) != 0: +# raise Exception('Command line input not understood: ' + str(otherjunk)) +# args = dict() +# +# # Fix the random seed +# if options.fixRandomSeed: random.seed('cs188') +# +# # Choose a layout +# args['layout'] = layout.getLayout( options.layout ) +# if args['layout'] == None: raise Exception("The layout " + options.layout + " cannot be found") +# +# # Choose a Pacman agent +# noKeyboard = options.gameToReplay == None and (options.textGraphics or options.quietGraphics) +# pacmanType = loadAgent(options.pacman, noKeyboard) +# agentOpts = parseAgentArgs(options.agentArgs) +# if options.numTraining > 0: +# args['numTraining'] = options.numTraining +# if 'numTraining' not in agentOpts: agentOpts['numTraining'] = options.numTraining +# pacman = pacmanType(**agentOpts) # Instantiate Pacman with agentArgs +# args['pacman'] = pacman +# +# # Don't display training games +# if 'numTrain' in agentOpts: +# options.numQuiet = int(agentOpts['numTrain']) +# options.numIgnore = int(agentOpts['numTrain']) +# +# # Choose a ghost agent +# ghostType = 
loadAgent(options.ghost, noKeyboard) +# args['ghosts'] = [ghostType( i+1 ) for i in range( options.numGhosts )] +# +# # Choose a display format +# if options.quietGraphics: +# import text_display_pacman +# args['display'] = text_display_pacman.NullGraphics() +# elif options.textGraphics: +# import text_display_pacman +# text_display_pacman.SLEEP_TIME = options.frameTime +# args['display'] = text_display_pacman.PacmanGraphics() +# else: +# pass +# # from gympackman import ggraphicsDisplay +# # args['display'] = ggraphicsDisplay.PacmanGraphics(options.zoom, frameTime = options.frameTime) +# args['numGames'] = options.numGames +# args['record'] = options.record +# args['catchExceptions'] = options.catchExceptions +# args['timeout'] = options.timeout +# +# # Special case: recorded games don't use the runGames method or args structure +# if options.gameToReplay != None: +# print('Replaying recorded game %s.' % options.gameToReplay) +# import cPickle +# f = open(options.gameToReplay) +# try: recorded = cPickle.load(f) +# finally: f.close() +# recorded['display'] = args['display'] +# replayGame(**recorded) +# sys.exit(0) +# +# args['options'] = options +# return args + +# def loadAgent(pacman, nographics): +# # Looks through all pythonPath Directories for the right module, +# pythonPathStr = os.path.expandvars("$PYTHONPATH") +# if pythonPathStr.find(';') == -1: +# pythonPathDirs = pythonPathStr.split(':') +# else: +# pythonPathDirs = pythonPathStr.split(';') +# pythonPathDirs.append('.') +# from irlc.berkley import pacman as pcman +# pythonPathDirs.append(os.path.dirname(pcman.__file__)) +# if pacman == 'PacmanQAgent': +# from irlc.berkley.pacman.qlearningAgents import QLearningAgent +# return QLearningAgent +# if pacman == 'RandomGhost': +# from irlc.berkley.pacman.ghostAgents import RandomGhost +# return RandomGhost +# +# for moduleDir in pythonPathDirs: +# if not os.path.isdir(moduleDir): continue +# moduleNames = [f for f in os.listdir(moduleDir) if 
def replayGame(layout, actions, display):
    """Replay a recorded game on ``display``.

    :param layout: The layout the game was played on.
    :param actions: Iterable of argument tuples for ``GameState.generateSuccessor``
        (each one is unpacked with ``*action``).
    :param display: Object providing ``initialize``, ``update`` and ``finish``.
    """
    # NOTE(review): the ``irlc.berkley`` package path is legacy -- confirm it still exists.
    # The former ``import ghostAgents`` was unused (ghosts come from pacman_utils) and was removed.
    from irlc.berkley import pacmanAgents
    rules = ClassicGameRules()
    agents = [pacmanAgents.GreedyAgent()] + [irlc.pacman.pacman_utils.RandomGhost(i + 1) for i in range(layout.getNumGhosts())]
    # Fixed: ``display`` used to be passed as the 4th positional argument, which newGame
    # interprets as ``quiet``; newGame takes no display argument.
    game = rules.newGame(layout, agents[0], agents[1:])
    state = game.state
    display.initialize(state.data)

    for action in actions:
        # Execute the action
        state = state.generateSuccessor(*action)
        # Change the display
        display.update(state.data)
        # Allow for game specific conditions (winning, losing, etc.)
        rules.process(state, game)

    display.finish()

def runGames(layout, pacman, ghosts, display, numGames, record, numTraining=0, catchExceptions=False, timeout=30):
    """Play ``numGames`` games and print summary statistics for the non-training ones.

    :param layout: The layout to play on.
    :param pacman: The Pacman agent.
    :param ghosts: List of ghost agents.
    :param display: Kept for interface compatibility. ``ClassicGameRules.newGame`` takes no
        display argument, so it is not forwarded.
    :param numGames: Total number of games to play.
    :param record: If truthy, pickle the layout and move history of each game to a file.
    :param numTraining: The first ``numTraining`` games are quiet and excluded from the statistics.
    :param catchExceptions: Forwarded to ``newGame``.
    :param timeout: Passed to ``ClassicGameRules``.
    :return: List of the non-training games.
    """
    rules = ClassicGameRules(timeout)
    games = []

    for i in range(numGames):
        beQuiet = i < numTraining
        # Fixed: the arguments used to be passed positionally as (display, beQuiet,
        # catchExceptions), which newGame reads as (quiet, catchExceptions, time_penalty).
        game = rules.newGame(layout, pacman, ghosts, quiet=beQuiet, catchExceptions=catchExceptions)
        game.run()
        if not beQuiet: games.append(game)

        if record:
            # Fixed: cPickle only exists on Python 2, and pickle data must be written in binary mode.
            import pickle
            import time
            fname = ('recorded-game-%d' % (i + 1)) + '-'.join([str(t) for t in time.localtime()[1:6]])
            with open(fname, "wb") as f:
                components = {'layout': layout, 'actions': game.moveHistory}
                pickle.dump(components, f)

    if (numGames - numTraining) > 0:
        scores = [game.state.getScore() for game in games]
        wins = [game.state.is_won() for game in games]
        winRate = wins.count(True) / float(len(wins))
        print('Average Score:', sum(scores) / float(len(scores)))
        print('Scores:       ', ', '.join([str(score) for score in scores]))
        print('Win Rate:      %d/%d (%.2f)' % (wins.count(True), len(wins), winRate))
        print('Record:       ', ', '.join([['Loss', 'Win'][int(w)] for w in wins]))

    return games
def nearestPoint(pos):
    """
    Discretize a position: add 0.5 to each of the two coordinates and truncate with ``int``.

    :param pos: A pair ``(row, col)`` of numbers.
    :return: The pair of resulting integer grid coordinates.
    """
    row, col = pos
    return (int(row + 0.5), int(col + 0.5))

def manhattanDistance(xy1, xy2):
    "Returns the Manhattan distance between points xy1 and xy2"
    delta_x = xy1[0] - xy2[0]
    delta_y = xy1[1] - xy2[1]
    return abs(delta_x) + abs(delta_y)
# from irlc.berkley.util import manhattanDistance
import os
import random

# Kept for backward compatibility; nothing in this module reads or writes it.
VISIBILITY_MATRIX_CACHE = {}

# Layout characters that place a specifically numbered ghost.
_NUMBERED_GHOSTS = ('1', '2', '3', '4')


def manhattanDistance(xy1, xy2):
    """Returns the Manhattan distance between points xy1 and xy2."""
    return abs(xy1[0] - xy2[0]) + abs(xy1[1] - xy2[1])


class Layout:
    """
    A Layout manages the static information about the game board.
    """

    def __init__(self, layoutText):
        """
        Build the board from its textual description.

        :param layoutText: Sequence of equally long strings, one per row, top row first.
            See :meth:`processLayoutText` for the meaning of each character.
        """
        # Imported here so the helpers of this module can be imported without
        # pulling in the grid utilities; Grid is only needed to build a board.
        from irlc.pacman.pacman_utils import Grid

        self.width = len(layoutText[0])
        self.height = len(layoutText)
        self.walls = Grid(self.width, self.height, False)
        self.food = Grid(self.width, self.height, False)
        self.capsules = []
        self.agentPositions = []
        self.numGhosts = 0
        self.processLayoutText(layoutText)
        self.layoutText = layoutText
        self.totalFood = len(self.food.asList())

    def getNumGhosts(self):
        """Return the number of ghosts found in the layout text."""
        return self.numGhosts

    def isWall(self, pos):
        """Return the wall flag stored for the cell ``pos = (x, y)``."""
        x, y = pos
        return self.walls[x][y]

    def getRandomLegalPosition(self):
        """
        Sample cells uniformly until one that is not a wall is found.

        :return: The sampled ``(x, y)`` position.
        """
        # NOTE: never terminates if every cell of the board is a wall.
        while True:
            x = random.choice(range(self.width))
            y = random.choice(range(self.height))
            if not self.isWall((x, y)):
                return (x, y)

    def _corners(self):
        """Return the four cells that lie one step inside each corner of the board."""
        return [(1, 1), (1, self.height - 2), (self.width - 2, 1), (self.width - 2, self.height - 2)]

    def getRandomCorner(self):
        """Return one of the four inner corner cells, chosen at random."""
        return random.choice(self._corners())

    def getFurthestCorner(self, pacPos):
        """
        Return the inner corner cell with the largest Manhattan distance to ``pacPos``.

        Ties are resolved in favour of the largest ``(x, y)`` tuple.
        """
        _, pos = max((manhattanDistance(p, pacPos), p) for p in self._corners())
        return pos

    def __str__(self):
        return "\n".join(self.layoutText)

    def deepCopy(self):
        """Return a new Layout parsed from a copy of this layout's text."""
        return Layout(self.layoutText[:])

    def processLayoutText(self, layoutText):
        """
        Coordinates are flipped from the input format to the (x,y) convention here

        The shape of the maze.  Each character
        represents a different type of object.
         % - Wall
         . - Food
         o - Capsule
         G - Ghost
         P - Pacman
        Other characters are ignored.
        """
        maxY = self.height - 1
        for y in range(self.height):
            # Row 0 of the text is the top of the board, i.e. the largest y.
            row = layoutText[maxY - y]
            for x in range(self.width):
                self.processLayoutChar(x, y, row[x])
        # Sorting by agent number puts Pacman (number 0) first; afterwards only
        # an "is Pacman" flag is stored next to each position.
        self.agentPositions.sort()
        self.agentPositions = [(i == 0, pos) for i, pos in self.agentPositions]

    def processLayoutChar(self, x, y, layoutChar):
        """
        Record the object denoted by ``layoutChar`` at cell ``(x, y)``.

        Besides the characters listed in :meth:`processLayoutText`, the digits
        ``1``-``4`` place a ghost with that agent number.
        """
        if layoutChar == '%':
            self.walls[x][y] = True
        elif layoutChar == '.':
            self.food[x][y] = True
        elif layoutChar == 'o':
            self.capsules.append((x, y))
        elif layoutChar == 'P':
            self.agentPositions.append((0, (x, y)))
        elif layoutChar == 'G':
            self.agentPositions.append((1, (x, y)))
            self.numGhosts += 1
        elif layoutChar in _NUMBERED_GHOSTS:
            self.agentPositions.append((int(layoutChar), (x, y)))
            self.numGhosts += 1


def getLayout(name, back=2):
    """
    Load the layout called ``name``.

    The file ``layouts/<name>.lay`` is tried first and ``<name>.lay`` second
    (the ``.lay`` suffix is only appended when ``name`` lacks it). Both are
    resolved by :func:`tryToLoad`.

    :param name: Layout name, with or without the ``.lay`` suffix.
    :param back: Unused; accepted so existing calls keep working. It used to
        control how many parent working directories were retried, but
        :func:`tryToLoad` resolves paths against this module's directory, so
        changing the working directory could never change the outcome.
    :return: The loaded :class:`Layout`, or ``None`` if no such file exists.
    """
    filename = name if name.endswith('.lay') else name + '.lay'
    layout = tryToLoad('layouts/' + filename)
    if layout is None:
        layout = tryToLoad(filename)
    return layout


def tryToLoad(fullname):
    """
    Load a layout file, returning ``None`` if it does not exist.

    :param fullname: Path of the layout file. A relative path is interpreted
        relative to the directory containing this module; an absolute path is
        used unchanged.
    :return: A :class:`Layout` built from the stripped lines of the file, or ``None``.
    """
    import pathlib
    path = os.path.join(pathlib.Path(__file__).parent.absolute(), fullname)
    if not os.path.exists(path):
        return None
    with open(path) as f:
        return Layout([line.strip() for line in f])
b/irlc/pacman/layouts/bigHunt.lay @@ -0,0 +1,20 @@ +%%%%%%%%%%%%%%%%%%%%%%%%%%% +%P % +% %%%%%%%%%%%% %%% % +% %% % +% %% % +% % % % +% %%%%%% %%% %% % %G% +% %%%%%% +% %%%%%% % % % +% % % % % +% % G % % %%%%%%%% % +% % % % % +% % % % %%%%%%%% % +% % G % +% %% % %% %% % +% %% % % +% G% % +%%%%%%%%%%%%%%%%%%%%%%%%%%% +% % % % %%%%%%%%%%%%%%%%%%% +%%%%%%%%%%%%%%%%%%%%%%%%%%% \ No newline at end of file diff --git a/irlc/pacman/layouts/bigMaze.lay b/irlc/pacman/layouts/bigMaze.lay new file mode 100644 index 0000000000000000000000000000000000000000..e11fade6e907ee916bf9bedb4c8de3cbddd17c97 --- /dev/null +++ b/irlc/pacman/layouts/bigMaze.lay @@ -0,0 +1,37 @@ +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% % % % % % % % +% %%%%%%% % %%% % %%% %%% %%%%%%% % % +% % % % % % % % +%%%%% %%%%% %%% % % % %%% %%%%% % %%% +% % % % % % % % % % % % % % +% %%% % % % %%% %%%%% %%% % %%% %%% % +% % % % % % % % % +%%% %%%%%%%%% %%%%%%% %%% %%% % % % % +% % % % % % % +% % %%%%% % %%% % % %%% % %%% %%% % % +% % % % % % % % % % % % % % +% % % %%%%%%% % %%%%%%%%% %%% % %%% % +% % % % % % % % % % +%%% %%% % %%%%% %%%%% %%% %%% %%%%% % +% % % % % % % % % % % % +% % % % % %%% %%% %%% %%% % % % % % % +% % % % % % % % % +%%% %%%%%%% % % %%%%% %%% % %%% %%%%% +% % % % % % % % % % +%%%%% % % %%%%%%%%% %%%%%%%%%%% % %%% +% % % % % % % % % +% %%% %%%%% %%%%%%%%% %%%%% % % %%% % +% % % % % % % +% % % %%%%% %%% % % % % %%%%%%%%%%%%% +% % % % % % % % % % % % +% % %%% %%% % % % %%%%%%%%% %%% % % % +% % % % % % % % % % % % % +% %%% %%% %%%%% %%% % % %%%%% % %%%%% +% % % % % % % % % +%%% % %%%%% %%%%% %%% %%% % %%% % %%% +% % % % % % % % % % % % % % % +% % %%% % % % % %%%%%%%%% % % % % % % +% % % % % % +% % % % %%% %%% %%%%%%% %%% %%% %%% % +%.% % % % % % % % P% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \ No newline at end of file diff --git a/irlc/pacman/layouts/bigSafeSearch.lay b/irlc/pacman/layouts/bigSafeSearch.lay new file mode 100644 index 
0000000000000000000000000000000000000000..b5fd414060d7366e3ddf121ac7a0fa6cb325ed26 --- /dev/null +++ b/irlc/pacman/layouts/bigSafeSearch.lay @@ -0,0 +1,8 @@ +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%.%.........%% G % o%%%%.....% +%.%.%%%%%%%.%%%%%% %%%%%%%.%%.% +%............%...%............% +%%%%%...%%%.. ..%.%...%.%%% +%o%%%.%%%%%.%%%%%%%.%%%.%.%%%%% +% ..........Po...%...%. o% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% diff --git a/irlc/pacman/layouts/bigSearch.lay b/irlc/pacman/layouts/bigSearch.lay new file mode 100644 index 0000000000000000000000000000000000000000..bb59eb8db68c35789c625e8fe7c987f4293a84aa --- /dev/null +++ b/irlc/pacman/layouts/bigSearch.laydiff --git a/irlc/pacman/layouts/boxSearch.lay b/irlc/pacman/layouts/boxSearch.lay new file mode 100644 index 0000000000000000000000000000000000000000..4a113fcd9ea16537f99997244336a005c88bc9d2 --- /dev/null +++ b/irlc/pacman/layouts/boxSearch.lay @@ -0,0 +1,14 @@ +%%%%%%%%%%%%%% +%. . . . . % % +% % % +%. . . . . %G% +% % % +%. . . . . % % +% % % +%. . . . . % % +% P %G% +%. . . . . % % +% % % +%. . . . . % % +% % % +%%%%%%%%%%%%%% diff --git a/irlc/pacman/layouts/capsuleClassic.lay b/irlc/pacman/layouts/capsuleClassic.lay new file mode 100644 index 0000000000000000000000000000000000000000..06a5c51ad27818a5871869f2851e3f6229728e3b --- /dev/null +++ b/irlc/pacman/layouts/capsuleClassic.lay @@ -0,0 +1,7 @@ +%%%%%%%%%%%%%%%%%%% +%G. G ....% +%.% % %%%%%% %.%%.% +%.%o% % o% %.o%.% +%.%%%.% %%% %..%.% +%..... P %..%G% +%%%%%%%%%%%%%%%%%%%% diff --git a/irlc/pacman/layouts/contestClassic.lay b/irlc/pacman/layouts/contestClassic.lay new file mode 100644 index 0000000000000000000000000000000000000000..84c8733a8446bff05f8c1f1f377325c7ba2ae35e --- /dev/null +++ b/irlc/pacman/layouts/contestClassic.lay @@ -0,0 +1,9 @@ +%%%%%%%%%%%%%%%%%%%% +%o...%........%...o% +%.%%.%.%%..%%.%.%%.% +%...... 
G GG%......% +%.%.%%.%% %%%.%%.%.% +%.%....% ooo%.%..%.% +%.%.%%.% %% %.%.%%.% +%o%......P....%....% +%%%%%%%%%%%%%%%%%%%% diff --git a/irlc/pacman/layouts/contoursMaze.lay b/irlc/pacman/layouts/contoursMaze.lay new file mode 100644 index 0000000000000000000000000000000000000000..a06895692e8db8d90c85296885232a8d3f1c5829 --- /dev/null +++ b/irlc/pacman/layouts/contoursMaze.lay @@ -0,0 +1,11 @@ +%%%%%%%%%%%%%%%%%%%%% +% % +% % +% % +% % +% P % +% % +% % +% % +%. % +%%%%%%%%%%%%%%%%%%%%% \ No newline at end of file diff --git a/irlc/pacman/layouts/greedySearch.lay b/irlc/pacman/layouts/greedySearch.lay new file mode 100644 index 0000000000000000000000000000000000000000..4072363672fd50bf31b5cf5e10ef10f3f0e8617b --- /dev/null +++ b/irlc/pacman/layouts/greedySearch.lay @@ -0,0 +1,8 @@ +%%%%%% +%....% +% %%.% +% %%.% +%.P .% +%.%%%% +%....% +%%%%%% \ No newline at end of file diff --git a/irlc/pacman/layouts/mediumClassic.lay b/irlc/pacman/layouts/mediumClassic.lay new file mode 100644 index 0000000000000000000000000000000000000000..33c5db85a6b6b99130e1c357731bf152813626b5 --- /dev/null +++ b/irlc/pacman/layouts/mediumClassic.lay @@ -0,0 +1,11 @@ +%%%%%%%%%%%%%%%%%%%% +%o...%........%....% +%.%%.%.%%%%%%.%.%%.% +%.%..............%.% +%.%.%%.%% %%.%%.%.% +%......%G G%......% +%.%.%%.%%%%%%.%%.%.% +%.%..............%.% +%.%%.%.%%%%%%.%.%%.% +%....%...P....%...o% +%%%%%%%%%%%%%%%%%%%% diff --git a/irlc/pacman/layouts/mediumCorners.lay b/irlc/pacman/layouts/mediumCorners.lay new file mode 100644 index 0000000000000000000000000000000000000000..6a397568874a3e78a575b94fcd5aa307298d4eb1 --- /dev/null +++ b/irlc/pacman/layouts/mediumCorners.lay @@ -0,0 +1,14 @@ +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%. % % % %.% +% % % %%%%%% %%%%%%% % % +% % % % % % +%%%%% %%%%% %%% %% %%%%% % %%% +% % % % % % % % % +% %%% % % % %%%%%%%% %%% %%% % +% % %% % % % % +%%% % %%%%%%% %%%% %%% % % % % +% % %% % % % +% % %%%%% % %%%% % %%% %%% % % +% % % % % % %%% % +%. 
%P%%%%% % %%% % .% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \ No newline at end of file diff --git a/irlc/pacman/layouts/mediumDottedMaze.lay b/irlc/pacman/layouts/mediumDottedMaze.lay new file mode 100644 index 0000000000000000000000000000000000000000..103f818d75f1271e3aadf6e078d968676992823e --- /dev/null +++ b/irlc/pacman/layouts/mediumDottedMaze.layo newline at end of file diff --git a/irlc/pacman/layouts/mediumGrid.lay b/irlc/pacman/layouts/mediumGrid.lay new file mode 100644 index 0000000000000000000000000000000000000000..52b270754875523aa9144bfa10498784cb6e3c61 --- /dev/null +++ b/irlc/pacman/layouts/mediumGrid.lay @@ -0,0 +1,7 @@ +%%%%%%%% +%P % +% .% . % +% % % +% .% . % +% G% +%%%%%%%% diff --git a/irlc/pacman/layouts/mediumMaze.lay b/irlc/pacman/layouts/mediumMaze.lay new file mode 100644 index 0000000000000000000000000000000000000000..55c1236e1fd462408a7c9ebc9091c2001def89e1 --- /dev/null +++ b/irlc/pacman/layouts/mediumMaze.layo newline at end of file diff --git a/irlc/pacman/layouts/mediumSafeSearch.lay b/irlc/pacman/layouts/mediumSafeSearch.lay new file mode 100644 index 0000000000000000000000000000000000000000..e7d6b1cc444b6b5b13bbedcc8b4ab1cf575ff0c6 --- /dev/null +++ b/irlc/pacman/layouts/mediumSafeSearch.lay @@ -0,0 +1,6 @@ +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%.% ....%% G %%%%%% o%%.% +%.%o%%%%%%%.%%%%%%% %%%%%.% +% %%%.%%%%%.%%%%%%%.%%%.%.%%%.% +% ..........Po...%.........% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% diff --git a/irlc/pacman/layouts/mediumScaryMaze.lay b/irlc/pacman/layouts/mediumScaryMaze.lay new file mode 100644 index 0000000000000000000000000000000000000000..65d4c33d1a422fa6013289415a5269b0d5c8ccf2 --- /dev/null +++ b/irlc/pacman/layouts/mediumScaryMaze.layo newline at end of file diff --git a/irlc/pacman/layouts/mediumSearch.lay b/irlc/pacman/layouts/mediumSearch.lay new file mode 100644 index 0000000000000000000000000000000000000000..2f8af420ffda3e75db27f0129da5a145e91849f7 --- /dev/null +++ b/irlc/pacman/layouts/mediumSearch.lay @@ -0,0 +1,8 @@ 
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%............%%%%%............% +%%%.%...%%%.........%.%...%.%%% +%...%%%.%.%%%%.%.%%%%%%.%%%...% +%.%.....%......%......%.....%.% +%.%%%.%%%%%.%%%%%%%.%%%.%.%%%%% +%.....%........P....%...%.....% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% diff --git a/irlc/pacman/layouts/minimaxClassic.lay b/irlc/pacman/layouts/minimaxClassic.lay new file mode 100644 index 0000000000000000000000000000000000000000..a547397b1195dfbba589af8628d90f086470199f --- /dev/null +++ b/irlc/pacman/layouts/minimaxClassic.lay @@ -0,0 +1,5 @@ +%%%%%%%%% +%.P G% +% %.%G%%% +%G %%% +%%%%%%%%% diff --git a/irlc/pacman/layouts/oddSearch.lay b/irlc/pacman/layouts/oddSearch.lay new file mode 100644 index 0000000000000000000000000000000000000000..2ddbc9a7e196feb033ff6635ba2c3f83c43d4c7e --- /dev/null +++ b/irlc/pacman/layouts/oddSearch.lay @@ -0,0 +1,7 @@ +%%%%%%%%%%%%%%%%%%%% +%...%.........%%...% +%.%.%.%%%%%%%%%%.%.% +%..................% +%%%%%%%%.%.%%%%%%%P% +%%%%%%%%....... % +%%%%%%%%%%%%%%%%%%%% diff --git a/irlc/pacman/layouts/oneHunt.lay b/irlc/pacman/layouts/oneHunt.lay new file mode 100644 index 0000000000000000000000000000000000000000..45291a9195d9a3a34c0a3de6249c4aa23f69cbf3 --- /dev/null +++ b/irlc/pacman/layouts/oneHunt.lay @@ -0,0 +1,16 @@ +%%%%%%%%%%%%%%%%%%%% +% % +% % +% G G % +% % +% P % +% % +% % +% % +% G G % +% % +% % +% % +%%%%%%%%%%%%%%%%%%%% +% % % % %%%%%%%%%%%% +%%%%%%%%%%%%%%%%%%%% \ No newline at end of file diff --git a/irlc/pacman/layouts/openClassic.lay b/irlc/pacman/layouts/openClassic.lay new file mode 100644 index 0000000000000000000000000000000000000000..6760b427eea3e203e645a1c3922d9218b852644d --- /dev/null +++ b/irlc/pacman/layouts/openClassic.lay @@ -0,0 +1,9 @@ +%%%%%%%%%%%%%%%%%%%%%%%%% +%.. P .... .... % +%.. ... ... ... ... % +%.. ... ... ... ... % +%.. .... .... G % +%.. ... ... ... ... % +%.. ... ... ... ... % +%.. .... .... 
o% +%%%%%%%%%%%%%%%%%%%%%%%%% diff --git a/irlc/pacman/layouts/openHunt.lay b/irlc/pacman/layouts/openHunt.lay new file mode 100644 index 0000000000000000000000000000000000000000..45d33887be999ebea9360b78caa3906df1a188f8 --- /dev/null +++ b/irlc/pacman/layouts/openHunt.lay @@ -0,0 +1,13 @@ +%%%%%%%%%%%%%%%%%%%% +%P G % +% %%% %%% %% %%% % +% G % +% % % +% % % +% %%%%%% %%%G%%% % +% G % +% % % +% % % +%%%%%%%%%%%%%%%%%%%% +% % % % %%%%%%%%%%%% +%%%%%%%%%%%%%%%%%%%% diff --git a/irlc/pacman/layouts/openMaze.lay b/irlc/pacman/layouts/openMaze.lay new file mode 100644 index 0000000000000000000000000000000000000000..5dee6891b79b07e3afb256b71c8cc1dab68ca975 --- /dev/null +++ b/irlc/pacman/layouts/openMaze.lay @@ -0,0 +1,23 @@ +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% P% +% % % +% % % +% % % +% % % +% % % +% % % % +% % % % +% % % % +% % % % +% % % % +% % % % +% % % % +%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%% +% % % +% % % +% % % +% % +% % +% % +%. % +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \ No newline at end of file diff --git a/irlc/pacman/layouts/openSearch.lay b/irlc/pacman/layouts/openSearch.lay new file mode 100644 index 0000000000000000000000000000000000000000..f02d21d906d09e09f495c1fe37375fb7a89a6e07 --- /dev/null +++ b/irlc/pacman/layouts/openSearch.lay @@ -0,0 +1,7 @@ +%%%%%%%%%%%%%%%%%%%% +%..................% +%..................% +%........P.........% +%..................% +%..................% +%%%%%%%%%%%%%%%%%%%% diff --git a/irlc/pacman/layouts/originalClassic.lay b/irlc/pacman/layouts/originalClassic.lay new file mode 100644 index 0000000000000000000000000000000000000000..b2770c571cbaa3c2a556738028427a826fea3401 --- /dev/null +++ b/irlc/pacman/layouts/originalClassic.lay @@ -0,0 +1,27 @@ +%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%............%%............% +%.%%%%.%%%%%.%%.%%%%%.%%%%.% +%o%%%%.%%%%%.%%.%%%%%.%%%%o% +%.%%%%.%%%%%.%%.%%%%%.%%%%.% +%..........................% +%.%%%%.%%.%%%%%%%%.%%.%%%%.% +%.%%%%.%%.%%%%%%%%.%%.%%%%.% +%......%%....%%....%%......% 
+%%%%%%.%%%%% %% %%%%%.%%%%%% +%%%%%%.%%%%% %% %%%%%.%%%%%% +%%%%%%.% %.%%%%%% +%%%%%%.% %%%% %%%% %.%%%%%% +% . %G GG G% . % +%%%%%%.% %%%%%%%%%% %.%%%%%% +%%%%%%.% %.%%%%%% +%%%%%%.% %%%%%%%%%% %.%%%%%% +%............%%............% +%.%%%%.%%%%%.%%.%%%%%.%%%%.% +%.%%%%.%%%%%.%%.%%%%%.%%%%.% +%o..%%....... .......%%..o% +%%%.%%.%%.%%%%%%%%.%%.%%.%%% +%%%.%%.%%.%%%%%%%%.%%.%%.%%% +%......%%....%%....%%......% +%.%%%%%%%%%%.%%.%%%%%%%%%%.% +%.............P............% +%%%%%%%%%%%%%%%%%%%%%%%%%%%% diff --git a/irlc/pacman/layouts/powerClassic.lay b/irlc/pacman/layouts/powerClassic.lay new file mode 100644 index 0000000000000000000000000000000000000000..3f3d983a38b1acfbffd4ac77f0909ebc2d4fef42 --- /dev/null +++ b/irlc/pacman/layouts/powerClassic.lay @@ -0,0 +1,7 @@ +%%%%%%%%%%%%%%%%%%%% +%o....o%GGGG%o....o% +%..%...%% %%...%..% +%.%o.%........%.o%.% +%.o%.%.%%%%%%.%.%o.% +%........P.........% +%%%%%%%%%%%%%%%%%%%% diff --git a/irlc/pacman/layouts/smallClassic.lay b/irlc/pacman/layouts/smallClassic.lay new file mode 100644 index 0000000000000000000000000000000000000000..ce6c1d980029e70c7eb2bc5ce259067385007839 --- /dev/null +++ b/irlc/pacman/layouts/smallClassic.lay @@ -0,0 +1,7 @@ +%%%%%%%%%%%%%%%%%%%% +%......%G G%......% +%.%%...%% %%...%%.% +%.%o.%........%.o%.% +%.%%.%.%%%%%%.%.%%.% +%........P.........% +%%%%%%%%%%%%%%%%%%%% diff --git a/irlc/pacman/layouts/smallGrid.lay b/irlc/pacman/layouts/smallGrid.lay new file mode 100644 index 0000000000000000000000000000000000000000..4bbe2b6f630e2dabb95d0e223696e5759f4bd3ae --- /dev/null +++ b/irlc/pacman/layouts/smallGrid.lay @@ -0,0 +1,7 @@ +%%%%%%% +% P % +% %%% % +% %. % +% %%% % +%. 
G % +%%%%%%% diff --git a/irlc/pacman/layouts/smallHunt.lay b/irlc/pacman/layouts/smallHunt.lay new file mode 100644 index 0000000000000000000000000000000000000000..ef9059a6a5a958f91f9c8dc2a620d78f8e1911e1 --- /dev/null +++ b/irlc/pacman/layouts/smallHunt.lay @@ -0,0 +1,8 @@ +%%%%%%%%%%%%%%%%%%%% +%P G G % +% %%%%% %%%%%% % % % +% G % +% G % +%%%%%%%%%%%%%%%%%%%% +% % % % %%%%%%%%%%%% +%%%%%%%%%%%%%%%%%%%% diff --git a/irlc/pacman/layouts/smallMaze.lay b/irlc/pacman/layouts/smallMaze.lay new file mode 100644 index 0000000000000000000000000000000000000000..72d3ffc68bf290d1c83fc0c640b9e534666cbe54 --- /dev/null +++ b/irlc/pacman/layouts/smallMaze.lay @@ -0,0 +1,10 @@ +%%%%%%%%%%%%%%%%%%%%%% +% %% % % % +% %%%%%% % %%%%%% % +%%%%%% P % % +% % %%%%%% %% %%%%% +% %%%% % % % +% %%% %%% % % +%%%%%%%%%% %%%%%% % +%. %% % +%%%%%%%%%%%%%%%%%%%%%% \ No newline at end of file diff --git a/irlc/pacman/layouts/smallSafeSearch.lay b/irlc/pacman/layouts/smallSafeSearch.lay new file mode 100644 index 0000000000000000000000000000000000000000..b97feaa1419ed38709b989c7d086069c93a42b0b --- /dev/null +++ b/irlc/pacman/layouts/smallSafeSearch.lay @@ -0,0 +1,15 @@ +%%%%%%%%% +%.. % G % +%%% %%%%% +% % +%%%%%%% % +% % +% %%%%% % +% % % +%%%%% % % +% %o% +% %%%%%%% +% .% +%%%%%%%.% +%Po .% +%%%%%%%%% diff --git a/irlc/pacman/layouts/smallSearch.lay b/irlc/pacman/layouts/smallSearch.lay new file mode 100644 index 0000000000000000000000000000000000000000..c2321d4701bf5fa761407e8bc171c4d5fda66991 --- /dev/null +++ b/irlc/pacman/layouts/smallSearch.lay @@ -0,0 +1,5 @@ +%%%%%%%%%%%%%%%%%%%% +%. ...P .% +%.%%.%%.%%.%%.%% %.% +% %% %..... %.% +%%%%%%%%%%%%%%%%%%%% diff --git a/irlc/pacman/layouts/testClassic.lay b/irlc/pacman/layouts/testClassic.lay new file mode 100644 index 0000000000000000000000000000000000000000..4b3ffcabca09ecab04f7a6dfab205b459c3ac996 --- /dev/null +++ b/irlc/pacman/layouts/testClassic.lay @@ -0,0 +1,10 @@ +%%%%% +% . % +%.G.% +% . % +%. 
.% +% % +% .% +% % +%P .% +%%%%% diff --git a/irlc/pacman/layouts/testMaze.lay b/irlc/pacman/layouts/testMaze.lay new file mode 100644 index 0000000000000000000000000000000000000000..4d259a4fc7dcf78b904c13a369d40234935b6bfd --- /dev/null +++ b/irlc/pacman/layouts/testMaze.lay @@ -0,0 +1,3 @@ +%%%%%%%%%% +%. P% +%%%%%%%%%% diff --git a/irlc/pacman/layouts/testSearch.lay b/irlc/pacman/layouts/testSearch.lay new file mode 100644 index 0000000000000000000000000000000000000000..25bad237d8044018988ab6c0054d60317055207a --- /dev/null +++ b/irlc/pacman/layouts/testSearch.lay @@ -0,0 +1,5 @@ +%%%%% +%.P % +%%% % +%. % +%%%%% diff --git a/irlc/pacman/layouts/tinyCorners.lay b/irlc/pacman/layouts/tinyCorners.lay new file mode 100644 index 0000000000000000000000000000000000000000..526c88061163083aedb184e0ec6876e009b67bc0 --- /dev/null +++ b/irlc/pacman/layouts/tinyCorners.lay @@ -0,0 +1,8 @@ +%%%%%%%% +%. .% +% P % +% %%%% % +% % % +% % %%%% +%.% .% +%%%%%%%% diff --git a/irlc/pacman/layouts/tinyMaze.lay b/irlc/pacman/layouts/tinyMaze.lay new file mode 100644 index 0000000000000000000000000000000000000000..f7035a597d39ca648b0518077d608d1dceca98d7 --- /dev/null +++ b/irlc/pacman/layouts/tinyMaze.lay @@ -0,0 +1,7 @@ +%%%%%%% +% P% +% %%% % +% % % +%% %% +%. %%%% +%%%%%%% diff --git a/irlc/pacman/layouts/tinySafeSearch.lay b/irlc/pacman/layouts/tinySafeSearch.lay new file mode 100644 index 0000000000000000000000000000000000000000..fea686045d0012418eda37be684d513da8c15b28 --- /dev/null +++ b/irlc/pacman/layouts/tinySafeSearch.lay @@ -0,0 +1,7 @@ +%%%%%%%%% +% G %...% +%%%%%%% % +%Po % +%.%%.%%.% +%.%%....% +%%%%%%%%% diff --git a/irlc/pacman/layouts/tinySearch.lay b/irlc/pacman/layouts/tinySearch.lay new file mode 100644 index 0000000000000000000000000000000000000000..c51f4b0400295c0dfd21ad33a4c77a222046c481 --- /dev/null +++ b/irlc/pacman/layouts/tinySearch.lay @@ -0,0 +1,7 @@ +%%%%%%%%% +%.. ..% +%%%%.%% % +% P % +%.%% %%.% +%.%. 
.% +%%%%%%%%% diff --git a/irlc/pacman/layouts/trappedClassic.lay b/irlc/pacman/layouts/trappedClassic.lay new file mode 100644 index 0000000000000000000000000000000000000000..289557f7eb52d0355242220eac38fef8480a5ae2 --- /dev/null +++ b/irlc/pacman/layouts/trappedClassic.lay @@ -0,0 +1,5 @@ +%%%%%%%% +% P G% +%G%%%%%% +%.... % +%%%%%%%% diff --git a/irlc/pacman/layouts/trickyClassic.lay b/irlc/pacman/layouts/trickyClassic.lay new file mode 100644 index 0000000000000000000000000000000000000000..ffa156cca272d7e64af0180da7365e1b974cd7af --- /dev/null +++ b/irlc/pacman/layouts/trickyClassic.lay @@ -0,0 +1,13 @@ +%%%%%%%%%%%%%%%%%%%% +%o...%........%...o% +%.%%.%.%%..%%.%.%%.% +%.%.....%..%.....%.% +%.%.%%.%% %%.%%.%.% +%...... GGGG%.%....% +%.%....%%%%%%.%..%.% +%.%....% oo%.%..%.% +%.%....% %%%%.%..%.% +%.%...........%..%.% +%.%%.%.%%%%%%.%.%%.% +%o...%...P....%...o% +%%%%%%%%%%%%%%%%%%%% diff --git a/irlc/pacman/layouts/trickySearch.lay b/irlc/pacman/layouts/trickySearch.lay new file mode 100644 index 0000000000000000000000000000000000000000..4a607e648c8a4981875587e88355b1c2742375b6 --- /dev/null +++ b/irlc/pacman/layouts/trickySearch.lay @@ -0,0 +1,7 @@ +%%%%%%%%%%%%%%%%%%%% +%. ..% % +%.%%.%%.%%.%%.%% % % +% P % % +%%%%%%%%%%%%%%%%%% % +%..... % +%%%%%%%%%%%%%%%%%%%% diff --git a/irlc/pacman/pacman_environment.py b/irlc/pacman/pacman_environment.py new file mode 100644 index 0000000000000000000000000000000000000000..cc75de14e55314d70113815242c264a8f2b6bffe --- /dev/null +++ b/irlc/pacman/pacman_environment.py @@ -0,0 +1,243 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
import pygame
from irlc.pacman.gamestate import Directions, ClassicGameRules
from irlc.pacman.layout import getLayout
from irlc.pacman.pacman_text_display import PacmanTextDisplay
from irlc.pacman.pacman_graphics_display import PacmanGraphics, FirstPersonPacmanGraphics
from irlc.pacman.pacman_utils import PacAgent, RandomGhost
from irlc.pacman.layout import Layout
import gymnasium as gym
from gymnasium import RewardWrapper
from irlc.utils.common import ExplicitActionSpace, DiscreteTextActionSpace

datadiscs = """
%%%%%%%
%    .%
%.P%% %
%.   .%
%%%%%%%
"""

very_small_maze = """
%%%%%%
%P. .%
%  %%%
%%%%%%
"""

very_small_haunted_maze = """
%%%%%%
%P. .%
% %%%%
%   G%
%%%%%%
"""


class PacmanEnvironment(gym.Env):
    """
    A fairly messy pacman environment class. I do not recommend reading this code.
    """
    _unpack_search_state = True  # A hacky fix to set the search state.
    metadata = {
        'render.modes': ['human', 'rgb_array'],
        'video.frames_per_second': 20
    }

    def __init__(self, layout_str=None, render_mode=None, animate_movement=None, layout='mediumGrid', zoom=2.0, num_ghosts=4, frames_per_second=30, ghost_agent=None,
                 method_str='', allow_all_actions=False, verbose=False):
        """
        Create the environment.

        :param layout_str: Maze given directly as a multi-line string. When given, ``layout`` is ignored.
        :param render_mode: One of ``'human'``, ``'rgb_array'``, ``'ascii'`` or ``None``.
        :param animate_movement: Animate Pacman's moves in :meth:`step`. Defaults to ``True`` exactly when
            ``render_mode == 'human'``; a true value forces ``render_mode`` to ``'human'``.
        :param layout: Name of a layout file, loaded with ``getLayout``; used when ``layout_str`` is ``None``.
        :param zoom: Zoom factor passed to the graphics display.
        :param num_ghosts: Number of ghost agents to create.
        :param frames_per_second: Frame rate; its inverse is used as the frame time of the displays.
        :param ghost_agent: Callable mapping a ghost index (starting at 1) to a ghost agent.
            ``RandomGhost`` is used when it is ``None``.
        :param method_str: Text handed to the graphics display as ``method``.
        :param allow_all_actions: Accepted for compatibility; not used by this constructor.
        :param verbose: Accepted for compatibility; not used by this constructor.
        :raises Exception: If the layout file cannot be found.
        """
        # Write to a per-instance copy: ``metadata`` is a class attribute, so an
        # in-place update would leak the frame rate into every other instance.
        self.metadata = {**self.metadata, 'video_frames_per_second': frames_per_second}
        self.ghosts = [ghost_agent(i+1) if ghost_agent is not None else RandomGhost(i+1) for i in range(num_ghosts)]
        if animate_movement is None:
            animate_movement = render_mode == 'human'
        if animate_movement:
            render_mode = 'human'

        # self.action_space = ExplicitActionSpace(self) # Wrapper environments copy the action space.
        self.action_space = DiscreteTextActionSpace(seed=None, actions=[Directions.NORTH, Directions.EAST, Directions.SOUTH, Directions.WEST, Directions.STOP])

        # Load level layout
        if layout_str is not None:
            self.layout = Layout([line.strip() for line in layout_str.strip().splitlines()])
        else:
            self.layout = getLayout(layout)
            if self.layout is None:
                raise Exception("Layout file not found", layout)
        self.rules = ClassicGameRules(30)
        self.options_frametime = 1/frames_per_second
        self.game = None

        # Setup displays.
        self.first_person_graphics = False
        self.animate_movement = animate_movement
        self.options_zoom = zoom
        self.text_display = PacmanTextDisplay(1 / frames_per_second)
        self.graphics_display = None

        # temporary variables for animation/visualization. Don't remove.
        self.visitedlist = None
        self.ghostbeliefs = None
        self.path = None
        self.render_mode = render_mode
        self.method = method_str

    def reset(self, seed=None, options=None):
        """
        Reset the environment by starting a new game on the stored layout.

        :param seed: Forwarded to ``gym.Env.reset``, which seeds ``self.np_random`` when it is not ``None``.
        :param options: Unused.
        :return: A tuple ``(state, info)`` where ``info['mask']`` is the action mask built from the
            actions available in ``state``.
        """
        super().reset(seed=seed)
        self.game = self.rules.newGame(self.layout, PacAgent(index=0), self.ghosts, quiet=True, catchExceptions=False)
        self.game.numMoves = 0
        if self.render_mode == 'human':
            self.render()
        state = self.state  # The property deep-copies; take a single copy and reuse it.
        return state, {'mask': self.action_space._make_mask(state.A())}

    def close(self):
        """Close the graphics display if one was created."""
        if self.graphics_display is not None:
            self.graphics_display.close()
        return

    @property
    def state(self):
        """A deep copy of the current game state, or ``None`` before the first :meth:`reset`."""
        if self.game is None:
            return None
        return self.game.state.deepCopy()

    def get_keys_to_action(self):
        """Return the mapping from pygame key tuples to actions (arrow keys move, ``s`` stops)."""
        return {(pygame.K_LEFT,): Directions.WEST,
                (pygame.K_RIGHT,): Directions.EAST,
                (pygame.K_UP,): Directions.NORTH,
                (pygame.K_DOWN,): Directions.SOUTH,
                (pygame.K_s,): Directions.STOP,
                }

    def step(self, action):
        """
        Let Pacman play ``action`` and then let each ghost move in turn.

        :param action: The action for Pacman; must be among the actions available in the current state.
        :return: A tuple ``(state, reward, done, False, info)``. ``reward`` is the change in game score
            caused by this step and ``info['mask']`` is the action mask for the new state.
        :raises Exception: If ``action`` is not available in the current state.
        """
        r_ = self.game.state.getScore()
        done = False

        available = self.state.A()
        if action not in available:
            raise Exception(f"Agent tried {action=} available actions {available}")

        # Let player play `action`, then let the ghosts play their moves in sequence.
        for agent_index, agent in enumerate(self.game.agents):
            a = agent.getAction(self.game.state) if agent_index > 0 else action
            self.game.state = self.game.state.f(a)
            self.game.rules.process(self.game.state, self.game)

            # Only Pacman's own move (agent 0) is animated here.
            if self.graphics_display is not None and self.animate_movement and agent_index == 0:
                self.graphics_display.update(self.game.state, animate=self.animate_movement, ghostbeliefs=self.ghostbeliefs, path=self.path, visitedlist=self.visitedlist)

            done = self.game.gameOver or self.game.state.is_won() or self.game.state.is_lost()
            if done:
                break
        reward = self.game.state.getScore() - r_
        state = self.state  # Single deep copy shared by the return value and the mask.
        return state, reward, done, False, {'mask': self.action_space._make_mask(state.A())}

    def render(self):
        """
        Render the current game state according to ``self.render_mode``.

        For ``'human'`` and ``'rgb_array'`` the graphics display is created on first use and the result
        of its ``blit`` method is returned; for ``'ascii'`` the text display draws the state.

        :raises Exception: If ``self.render_mode`` is none of the above.
        """
        if hasattr(self, 'agent'):
            path = self.agent.__dict__.get('path', None)
            ghostbeliefs = self.agent.__dict__.get('ghostbeliefs', None)
            visitedlist = self.agent.__dict__.get('visitedlist', None)
        else:
            path, ghostbeliefs, visitedlist = None, None, None

        graphical = self.render_mode in ["human", 'rgb_array']

        # Initialize graphics adaptor.
        if self.graphics_display is None and graphical:
            if self.first_person_graphics:
                self.graphics_display = FirstPersonPacmanGraphics(self.game.state, self.options_zoom, showGhosts=True, frameTime=self.options_frametime, ghostbeliefs=self.ghostbeliefs)
            else:
                self.graphics_display = PacmanGraphics(self.game.state, self.options_zoom, frameTime=self.options_frametime, method=self.method)

        if graphical:
            if not hasattr(self.graphics_display, 'viewer'):
                self.graphics_display.initialize(self.game.state.data)

            # We save these because the animation code may need it in step()
            self.visitedlist = visitedlist
            self.path = path
            self.ghostbeliefs = ghostbeliefs
            self.graphics_display.master_render(self.game.state, ghostbeliefs=ghostbeliefs, path=path, visitedlist=visitedlist)

            return self.graphics_display.blit(render_mode=self.render_mode)
        elif self.render_mode in ['ascii']:
            return self.text_display.draw(self.game.state)
        else:
            raise Exception("Bad video mode", self.render_mode)

    @property
    def viewer(self):
        """The ``viewer`` attribute of the graphics display, or ``None`` if there is none."""
        if self.graphics_display is not None and hasattr(self.graphics_display, 'viewer'):
            return self.graphics_display.viewer
        else:
            return None


class PacmanWinWrapper(RewardWrapper):
    """Replace the reward by 1 when the game is in a won state after the step and by 0 otherwise."""

    def step(self, action):
        observation, reward, done, truncated, info = self.env.step(action)
        # Go through ``unwrapped`` so this keeps working when other wrappers sit
        # between this one and the PacmanEnvironment that owns ``game``.
        reward = 1 if self.env.unwrapped.game.state.is_won() else 0
        return observation, reward, done, truncated, info


if __name__ == "__main__":
    import time
    from irlc.ex01.agent import Agent, train
    from irlc import interactive

    env = PacmanEnvironment(layout='mediumClassic', animate_movement=True, render_mode='human')
    agent = Agent(env)
    env, agent = interactive(env, agent)

    train(env, agent, num_episodes=1)
    time.sleep(0.1)
    env.close()

# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
# graphicsDisplay.py
# ------------------
# Licensing Information: You are free to use or extend these projects for
# educational purposes provided that (1) you do not distribute or publish
# solutions, (2) you retain this notice, and (3) you provide clear
# attribution to UC Berkeley, including a link to http://ai.berkeley.edu.
#
# Attribution Information: The Pacman AI projects were developed at UC Berkeley.
+# The core projects and autograders were primarily created by John DeNero +# (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). +# Student side autograding was added by Brad Miller, Nick Hay, and +# Pieter Abbeel (pabbeel@cs.berkeley.edu). + +# Most code by Dan Klein and John Denero written or rewritten for cs188, UC Berkeley. +# Some code from a Pacman implementation by LiveWires, and used / modified with permission. + +# from irlc.utils.gym_graphics_utils import formatColor, GraphicsUtilGym, colorToVector +# from irlc.utils.gym_graphics_utils import formatColor, GraphicsUtilGym, colorToVector +from irlc.utils.graphics_util_pygame import formatColor, GraphicsUtilGym, colorToVector +from irlc.pacman.pacman_utils import Directions +import math +import time + +DEFAULT_GRID_SIZE = 30.0 +INFO_PANE_HEIGHT = 35 +BACKGROUND_COLOR = formatColor(0,0,0) +WALL_COLOR = formatColor(0.0/255.0, 51.0/255.0, 255.0/255.0) +INFO_PANE_COLOR = formatColor(.4,.4,0) +SCORE_COLOR = formatColor(.9, .9, .9) +PACMAN_OUTLINE_WIDTH = 2 +PACMAN_CAPTURE_OUTLINE_WIDTH = 4 + +GHOST_COLORS = [] +GHOST_COLORS.append(formatColor(.9,0,0)) # Red +GHOST_COLORS.append(formatColor(0,.3,.9)) # Blue +GHOST_COLORS.append(formatColor(.98,.41,.07)) # Orange +GHOST_COLORS.append(formatColor(.1,.75,.7)) # Green +GHOST_COLORS.append(formatColor(1.0,0.6,0.0)) # Yellow +GHOST_COLORS.append(formatColor(.4,0.13,0.91)) # Purple + +TEAM_COLORS = GHOST_COLORS[:2] + +GHOST_SHAPE = [ + ( 0, 0.3 ), + ( 0.25, 0.75 ), + ( 0.5, 0.3 ), + ( 0.75, 0.75 ), + ( 0.75, -0.5 ), + ( 0.5, -0.75 ), + (-0.5, -0.75 ), + (-0.75, -0.5 ), + (-0.75, 0.75 ), + (-0.5, 0.3 ), + (-0.25, 0.75 ) + ] +GHOST_SIZE = 0.65 +SCARED_COLOR = formatColor(1,1,1) + +GHOST_VEC_COLORS = [colorToVector(gc) for gc in GHOST_COLORS] + +PACMAN_COLOR = formatColor(255.0/255.0, 255.0/255.0, 61.0/255) +PACMAN_SCALE = 0.5 + +# Food +FOOD_COLOR = formatColor(1,1,1) +FOOD_SIZE = 0.1 + +# Laser +LASER_COLOR = formatColor(1,0,0) +LASER_SIZE = 0.02 + +# Capsule 
class InfoPane:
    """
    Text panel drawn below the maze: the score, an optional team label and optional ghost distances.

    Every piece of text is kept as a dict of keyword arguments; ``master_render`` forwards these
    dicts to ``ga.text`` each time the pane is drawn.
    """

    def __init__(self, ga, layout, gridSize):
        """
        :param ga: Graphics adaptor; this class only calls its ``text`` method.
        :param layout: Object exposing ``width`` and ``height`` (measured in grid cells).
        :param gridSize: Side length of one grid cell in screen units.
        """
        # Store the adaptor first so every later method can rely on it being present.
        self.ga = ga
        self.gridSize = gridSize
        self.width = (layout.width) * gridSize
        self.base = (layout.height + 1) * gridSize
        self.height = INFO_PANE_HEIGHT
        self.fontSize = 24
        self.textColor = PACMAN_COLOR
        self.drawPane()

    def toScreen(self, pos, y=None):
        """
        Translates a point relative from the bottom left of the info pane.

        Accepts either a single ``(x, y)`` pair or two separate coordinates.

        :return: The ``(x, y)`` screen position (one grid cell of left margin, offset by the pane base).
        """
        # Identity test: y == 0 is a valid coordinate and must not be mistaken for "not given".
        if y is None:
            x, y = pos
        else:
            x = pos
        x = self.gridSize + x  # Margin
        y = self.base + y
        return x, y

    def drawPane(self):
        """Create the score text description with an initial score of zero."""
        self.scoreText = {'pos': self.toScreen(0, 0),
                          'color': self.textColor,
                          'contents': "SCORE:    0",
                          'font': "Times",
                          'size': self.fontSize,
                          'style': "bold"}

    def initializeGhostDistances(self, distances):
        """
        Build one text description per entry of ``distances``.

        The font size shrinks for narrow panes. Colours are taken from ``GHOST_COLORS`` starting at
        index 1 and wrap around, so an arbitrary number of distances can be displayed without an
        ``IndexError``.
        """
        size = 20
        if self.width < 240:
            size = 12
        if self.width < 160:
            size = 10

        self.ghostDistanceText = [
            {'pos': self.toScreen(self.width/2 + self.width/8 * i, 0),
             'color': GHOST_COLORS[(i + 1) % len(GHOST_COLORS)],
             'contents': str(d),
             'font': "Times",
             'size': size,
             'style': "bold"}
            for i, d in enumerate(distances)
        ]

    def updateScore(self, score, method=''):
        """Replace the score text; ``method`` is appended after the score."""
        self.scoreText['contents'] = "SCORE: % 4d %s" % (score, method)

    def setTeam(self, isBlue):
        """Create the team label ("BLUE TEAM" when ``isBlue`` is truthy, otherwise "RED TEAM")."""
        txt = "BLUE TEAM" if isBlue else "RED TEAM"
        self.teamText = {'pos': self.toScreen(300, 0),
                         'color': self.textColor,
                         'contents': txt,
                         'font': "Times",
                         'size': self.fontSize,
                         'style': "bold"}

    def updateGhostDistances(self, distances):
        """Rebuild the ghost-distance texts; an empty ``distances`` leaves the previous texts untouched."""
        if len(distances) == 0:
            return
        self.initializeGhostDistances(distances)

    def master_render(self):
        """Draw the score and, when they have been created, the team label and ghost distances."""
        self.ga.text("master_test", **self.scoreText)
        if hasattr(self, 'teamText'):
            self.ga.text("team_test", **self.teamText)
        if hasattr(self, 'ghostDistanceText'):
            for d in self.ghostDistanceText:
                # NOTE(review): the element name embeds the repr of the whole dict, so it changes
                # whenever the text changes -- confirm this is what the graphics adaptor expects.
                self.ga.text(f"ghost_distance_text_{d}_", **d)

    # The methods below are intentionally empty placeholders.
    def drawGhost(self):
        pass

    def drawPacman(self):
        pass

    def drawWarning(self):
        pass

    def clearIcon(self):
        pass

    def updateMessage(self, message):
        pass

    def clearMessage(self):
        pass
def drawDistributions(self, state, ghostbeliefs=None):
    """
    Shade every grid cell according to the ghost belief distributions.

    :param state: Object whose ``layout.walls`` exposes ``width`` and ``height``.
    :param ghostbeliefs: Sequence of mappings indexed by ``(x, y)``, one per ghost. ``None`` or an
        empty sequence draws nothing.
    """
    # Guard first: the default value is None, and iterating over None would raise a TypeError.
    if ghostbeliefs is None or len(ghostbeliefs) == 0:
        return
    # Work on copies so that indexing below cannot alter the caller's objects
    # (the original comment notes that these are default dicts).
    ghostbeliefs = [gb.copy() for gb in ghostbeliefs]
    walls = state.layout.walls

    # The colour table does not depend on the cell, so pick it once.
    if self.capture:
        colors = GHOST_VEC_COLORS
    else:
        colors = list(GHOST_VEC_COLORS)[1:]  # With Pacman

    for x in range(walls.width):
        for y in range(walls.height):
            weights = [gb[(x, y)] for gb in ghostbeliefs]
            color = [0.0, 0.0, 0.0]
            # Accumulate each ghost's colour, scaled by weight ** 0.3, clamping channels at 1.0.
            for weight, gcolor in zip(weights, colors):
                color = [min(1.0, c + 0.95 * g * weight ** .3) for c, g in zip(color, gcolor)]
            color = formatColor(*color)
            (screen_x, screen_y) = self.to_screen((x, y))
            self.ga.square(f"_belif_{x}_{y}_", (screen_x, screen_y),
                           0.5 * self.gridSize,
                           color=color,
                           filled=1, behind=2)
def getEndpoints(self, direction, position=(0,0)):
    """
    Compute the two angles (in degrees) that delimit Pacman's mouth.

    The mouth opening depends on the fractional part of ``position`` and is centred on the
    heading given by ``direction`` ('West', 'North', 'South'; anything else is treated as 0 degrees).

    :return: Tuple ``(heading + half_opening, heading - half_opening)``.
    """
    px, py = position
    # Sum of the fractional parts of both coordinates drives the chomping animation.
    frac = px - int(px) + py - int(py)
    opening = 30 + 80 * math.sin(math.pi * frac)
    half = opening / 2

    heading = {'West': 180, 'North': 90, 'South': 270}.get(direction, 0)
    return (heading + half, heading - half)
def animatePacman(self, pacman, prevPacman, image, nframe=1, frames=4, state=None, ghostbeliefs=None, path=None, visitedlist=None):
    """
    Move Pacman from the position of ``prevPacman`` to that of ``pacman``.

    When ``self.frameTime`` is negative or larger than 0.01 the move is interpolated over ``frames``
    steps; each step updates the mouth, stores the remaining offset in ``pacman.draw_extra['delta_xy']``,
    sleeps, re-renders and blits. Otherwise only the mouth is updated once at the final position.

    The ``nframe`` argument is overwritten by the loop counter and therefore has no effect.
    """
    stepwise = self.frameTime < 0
    if stepwise:
        print('Press any key to step forward, "q" to play')

    if not (stepwise or self.frameTime > 0.01):
        # No animation requested: jump straight to the final position.
        self.movePacman(self.getPosition(pacman), self.getDirection(pacman), image, pacman=pacman)
        return

    start_x, start_y = self.getPosition(prevPacman)
    end_x, end_y = self.getPosition(pacman)
    for nframe in range(1, int(frames) + 1):
        remaining = frames - nframe
        pos = (end_x*nframe/frames + start_x*remaining/frames,
               end_y*nframe/frames + start_y*remaining/frames)
        self.movePacman(pos, self.getDirection(pacman), image, pacman=pacman)
        # Offset relative to the final position; consumed when Pacman is drawn.
        pacman.draw_extra['delta_xy'] = (pos[0]-end_x, pos[1]-end_y)
        time.sleep(self.frameTime/frames)
        self.master_render(state, ghostbeliefs=ghostbeliefs, path=path, visitedlist=visitedlist)
        self.blit(render_mode='human')
+"_s2",(screen_x+self.gridSize*GHOST_SIZE*(0.3+dx/1.5), screen_y-self.gridSize*GHOST_SIZE*(0.3-dy/1.5)), self.gridSize*GHOST_SIZE*0.2, WHITE, WHITE) + leftPupil = self.ga.circle(name +"_s3",(screen_x+self.gridSize*GHOST_SIZE*(-0.3+dx), screen_y-self.gridSize*GHOST_SIZE*(0.3-dy)), self.gridSize*GHOST_SIZE*0.08, BLACK, BLACK) + rightPupil = self.ga.circle(name +"_s4",(screen_x+self.gridSize*GHOST_SIZE*(0.3+dx), screen_y-self.gridSize*GHOST_SIZE*(0.3-dy)), self.gridSize*GHOST_SIZE*0.08, BLACK, BLACK) + ghostImageParts = [] + ghostImageParts.append(body) + ghostImageParts.append(leftEye) + ghostImageParts.append(rightEye) + ghostImageParts.append(leftPupil) + ghostImageParts.append(rightPupil) + return ghostImageParts + + def moveEyes(self, pos, dir, eyes): # does this do anything? + (screen_x, screen_y) = (self.to_screen(pos) ) + dx = 0 + dy = 0 + if dir == 'North': + dy = -0.2 + if dir == 'South': + dy = 0.2 + if dir == 'East': + dx = 0.2 + if dir == 'West': + dx = -0.2 + self.ga.moveCircle(eyes[0],(screen_x+self.gridSize*GHOST_SIZE*(-0.3+dx/1.5), screen_y-self.gridSize*GHOST_SIZE*(0.3-dy/1.5)), self.gridSize*GHOST_SIZE*0.2) + self.ga.moveCircle(eyes[1],(screen_x+self.gridSize*GHOST_SIZE*(0.3+dx/1.5), screen_y-self.gridSize*GHOST_SIZE*(0.3-dy/1.5)), self.gridSize*GHOST_SIZE*0.2) + self.ga.moveCircle(eyes[2],(screen_x+self.gridSize*GHOST_SIZE*(-0.3+dx), screen_y-self.gridSize*GHOST_SIZE*(0.3-dy)), self.gridSize*GHOST_SIZE*0.08) + self.ga.moveCircle(eyes[3],(screen_x+self.gridSize*GHOST_SIZE*(0.3+dx), screen_y-self.gridSize*GHOST_SIZE*(0.3-dy)), self.gridSize*GHOST_SIZE*0.08) + + def moveGhost(self, ghost, ghostIndex, prevGhost, ghostImageParts): + old_x, old_y = self.to_screen(self.getPosition(prevGhost)) + new_x, new_y = self.to_screen(self.getPosition(ghost)) + delta = new_x - old_x, new_y - old_y + + if ghost.scaredTimer > 0: + color = SCARED_COLOR + else: + color = GHOST_COLORS[ghostIndex] + self.ga.edit(ghostImageParts[0], ('fill', color), ('outline', color)) + 
def getPosition(self, agentState):
    """
    Return the position of ``agentState``.

    An agent without a configuration is reported at ``(-1000, -1000)``.
    """
    # Identity test: `== None` would dispatch to a user-defined __eq__ on the configuration object.
    if agentState.configuration is None:
        return (-1000, -1000)
    return agentState.getPosition()
(self.gridSize*WALL_RADIUS, 0)), add(screen, (self.gridSize*WALL_RADIUS, self.gridSize*(-0.5)-0)), wallColor) + if (not nIsWall) and (eIsWall): + # horizontal line + self.ga.line(name + "s3", add(screen, (0, self.gridSize*(-1)*WALL_RADIUS)), add(screen, (self.gridSize*0.5+0, self.gridSize*(-1)*WALL_RADIUS)), wallColor) + if (nIsWall) and (eIsWall) and (not neIsWall): + # outer circle + # self.ga.circle(name + "s4", add(screen2, (self.gridSize*2*WALL_RADIUS, self.gridSize*(-2)*WALL_RADIUS)), WALL_RADIUS * self.gridSize-1, wallColor, wallColor, (180,271), 'arc') + self.ga.centered_arc(wallColor, add(screen2, (self.gridSize * 2 * WALL_RADIUS, self.gridSize * (-2) * WALL_RADIUS)), WALL_RADIUS * self.gridSize- 0, 180, 270, width=2) + # centered_arc(self, color, pos, r, start_angle, stop_angle, width=1) + self.ga.line(name + "s5", add(screen, (self.gridSize*2*WALL_RADIUS-0, self.gridSize*(-1)*WALL_RADIUS)), add(screen, (self.gridSize*0.5+0, self.gridSize*(-1)*WALL_RADIUS)), wallColor) + self.ga.line(name + "s6", add(screen, (self.gridSize*WALL_RADIUS, self.gridSize*(-2)*WALL_RADIUS+0)), add(screen, (self.gridSize*WALL_RADIUS, self.gridSize*(-0.5))), wallColor) + + # NW quadrant + if (not nIsWall) and (not wIsWall): + # inner circle + # self.ga.circle(name + "s8", screen2, WALL_RADIUS * self.gridSize, wallColor, wallColor, (90,181), 'arc') + self.ga.centered_arc(wallColor, screen2, WALL_RADIUS * self.gridSize, 90,180, width=2) + + if (nIsWall) and (not wIsWall): + # vertical line + self.ga.line(name + "s10", add(screen, (self.gridSize*(-1)*WALL_RADIUS, 0)), add(screen, (self.gridSize*(-1)*WALL_RADIUS, self.gridSize*(-0.5)-0)), wallColor) + if (not nIsWall) and (wIsWall): + # horizontal line + self.ga.line(name + "s11", add(screen, (0, self.gridSize*(-1)*WALL_RADIUS)), add(screen, (self.gridSize*(-0.5)-0, self.gridSize*(-1)*WALL_RADIUS)), wallColor) + if (nIsWall) and (wIsWall) and (not nwIsWall): + # outer circle + # self.ga.circle(name + "s12", add(screen2, 
(self.gridSize*(-2)*WALL_RADIUS, self.gridSize*(-2)*WALL_RADIUS)), WALL_RADIUS * self.gridSize-1, wallColor, wallColor, (270,361), 'arc') + self.ga.centered_arc(wallColor, add(screen2, (self.gridSize*(-2)*WALL_RADIUS, self.gridSize*(-2)*WALL_RADIUS)), WALL_RADIUS * self.gridSize, 270,360, width=2) + + self.ga.line(name + "s13", add(screen, (self.gridSize*(-2)*WALL_RADIUS+0, self.gridSize*(-1)*WALL_RADIUS)), add(screen, (self.gridSize*(-0.5), self.gridSize*(-1)*WALL_RADIUS)), wallColor) + self.ga.line(name + "s14", add(screen, (self.gridSize*(-1)*WALL_RADIUS, self.gridSize*(-2)*WALL_RADIUS+1)), add(screen, (self.gridSize*(-1)*WALL_RADIUS, self.gridSize*(-0.5))), wallColor) + + # SE quadrant + if (not sIsWall) and (not eIsWall): + # inner circle + # self.ga.circle(name + "s18", screen2, WALL_RADIUS * self.gridSize, wallColor, wallColor, (270,361), 'arc') + self.ga.centered_arc(wallColor, screen2, WALL_RADIUS * self.gridSize, 270,360, width=2) + + if (sIsWall) and (not eIsWall): + # vertical line + self.ga.line(name + "s20", add(screen, (self.gridSize*WALL_RADIUS, 0)), add(screen, (self.gridSize*WALL_RADIUS, self.gridSize*(0.5)+0)), wallColor) + if (not sIsWall) and (eIsWall): + # horizontal line + self.ga.line(name + "s21", add(screen, (0, self.gridSize*(1)*WALL_RADIUS)), add(screen, (self.gridSize*0.5+1, self.gridSize*(1)*WALL_RADIUS)), wallColor) + if (sIsWall) and (eIsWall) and (not seIsWall): + # outer circle + # self.ga.circle(name + "s22", add(screen2, (self.gridSize*2*WALL_RADIUS, self.gridSize*(2)*WALL_RADIUS)), WALL_RADIUS * self.gridSize-1, wallColor, wallColor, (90,181), 'arc') + self.ga.centered_arc(wallColor, add(screen2, (self.gridSize*2*WALL_RADIUS, self.gridSize*(2)*WALL_RADIUS)), WALL_RADIUS * self.gridSize-0, 90,180, width=2) + self.ga.line(name + "s23", add(screen, (self.gridSize*2*WALL_RADIUS-0, self.gridSize*(1)*WALL_RADIUS)), add(screen, (self.gridSize*0.5, self.gridSize*(1)*WALL_RADIUS)), wallColor) + self.ga.line(name + "s24", add(screen, 
def isWall(self, x, y, walls):
    """
    Tell whether cell ``(x, y)`` of ``walls`` is a wall.

    Coordinates outside ``0 <= x < walls.width`` / ``0 <= y < walls.height`` count as "no wall";
    otherwise the stored value ``walls[x][y]`` is returned unchanged.
    """
    inside = (0 <= x < walls.width) and (0 <= y < walls.height)
    if not inside:
        return False
    return walls[x][y]
class FirstPersonPacmanGraphics(PacmanGraphics):
    """
    Variant of ``PacmanGraphics`` that can hide ghosts and never shows the scared colour.
    """

    def __init__(self, state, zoom = 1.0, showGhosts = True, capture = False, frameTime=0, ghostbeliefs=None):
        # The base constructor only receives zoom and frameTime; ``capture`` is applied afterwards.
        super().__init__(state, zoom=zoom, frameTime=frameTime)
        self.showGhosts = showGhosts
        self.capture = capture
        self.ghostbeliefs = ghostbeliefs

    def initialize(self, state, isBlue = False):
        """Restart the graphics for ``state`` and remember it as the previous state."""
        self.isBlue = isBlue
        PacmanGraphics.startGraphics(self, state)
        # NOTE(review): startGraphics reads state.data.layout while this line reads state.layout --
        # confirm which kind of state object callers pass here.
        self.layout = state.layout
        self.previousState = state

    def lookAhead(self, config, state):
        """Draw the ghosts reported visible by ``state``; does nothing when the direction is 'Stop'."""
        if config.getDirection() == 'Stop':
            return
        # Draw relevant ghosts
        ghosts = state.getGhostStates()
        visible = state.getVisibleGhosts()
        for index, ghost in enumerate(ghosts):
            if ghost in visible:
                self.drawGhost(ghost, index)
            else:
                self.currentGhostImages[index] = None

    def getGhostColor(self, ghost, ghostIndex):
        """Always use the ghost's own colour, regardless of its scared timer."""
        return GHOST_COLORS[ghostIndex]

    def getPosition(self, ghostState):
        """
        Return the agent position, or ``(-1000, -1000)`` for a ghost that should be hidden
        (ghosts disabled, agent is not Pacman, and its y-coordinate is greater than 1).
        """
        hidden = (not self.showGhosts) and (not ghostState.isPacman) and ghostState.getPosition()[1] > 1
        if hidden:
            return (-1000, -1000)
        return super().getPosition(ghostState)
+import math +import numpy as np +import pygame +from PIL import ImageColor +# from pyglet.shapes import Circle, Rectangle, Polygon, Sector +# from irlc.utils.pyglet_rendering import GroupedElement +from irlc.pacman.pacman_graphics_display import GHOST_COLORS, GHOST_SHAPE + +WHITE = (255, 255, 255) +BLACK = (0, 0, 0) + + +# class Eye(GroupedElement): +# normal, cross = None, None +# +# def render(self): +# self.normal = [Circle(0, 0, .2, color=WHITE, batch=self.batch, group=self.group), +# Circle(0, 0, 0.1, color=BLACK, batch=self.batch, group=self.group)] # radius was 0.08 +# ew = 0.6 +# rw = ew/6 +# self.cross = [Rectangle(x=-ew/2, y=-rw/2, width=ew, height=rw, color=BLACK, group=self.group, batch=self.batch), +# Rectangle(x=-rw/2, y=-ew/2, width=rw, height=ew, color=BLACK, group=self.group, batch=self.batch)] +# self.set_eye_dir('stop') +# +# def set_eye_dir(self, direction='stop'): +# dead = direction.lower() == 'dead' +# for n in self.normal: +# n.visible = not dead +# pp = (0, 0) +# if direction.lower() == 'stop': +# pass +# dd = 0.1 +# if direction.lower() == 'east': +# pp = (dd, 0) +# # self.group.translate(dd, 0) +# +# if direction.lower() == 'west': +# pp = (-dd, 0) +# # self.group.translate(-dd, 0) +# if direction.lower() == 'south': +# pp = (0, -dd) +# # self.group.translate(0, -dd) +# if direction.lower() == 'north': +# # self.group.translate(0, dd) +# pp = (0, dd) +# self.normal[1].x = pp[0] +# self.normal[1].y = pp[1] +# +# for e in self.cross: +# e.visible = dead +# self.group.rotate(np.pi/4 if dead else 0) + +from irlc.utils.graphics_util_pygame import rotate_around + +class Ghost: + body_, eyes_ = None, None + def __init__(self, graphics_adaptor, agent_index=1, order=1, scale=10.): + self.agentIndex = agent_index + + # GS = [(x*scale, y*scale) for x,y in GHOST_SHAPE] + # self.GS = GS + # xx, yy = zip(*GS) + # xmin, xmax = min(xx), max(xx) + # ymin, ymax = min(yy), max(yy) + # this creates a surface + # self.GS = GS + # self.surf = pygame.Surface( 
def kill(self):
    """
    Mark the ghost as dead by setting its direction to ``'dead'``.

    The statements that used to follow the early ``return`` were unreachable (and wrote to a
    misspelled ``body_color`` attribute), so they have been removed.
    """
    self.set_direction('dead')
== 0 else 1)*dx*self.scale, self.y + dy*self.scale) + self.ga.circle("asdfsF", rotate_around(pos, xy0, angle), 0.15*self.scale, None, WHITE) + # Eyes: + # continue + + direction = self.direction + + + # for n in self.normal: + # n.visible = not dead + pp = (0, 0) + if direction.lower() == 'stop': + pass + dd = 0.1 + if direction.lower() == 'east': + pp = (dd, 0) + # self.group.translate(dd, 0) + if direction.lower() == 'west': + pp = (-dd, 0) + # self.group.translate(-dd, 0) + if direction.lower() == 'south': + pp = (0, -dd) + # self.group.translate(0, -dd) + if direction.lower() == 'north': + # self.group.translate(0, dd) + pp = (0, dd) + # self.normal[1].x = pp[0] + # self.normal[1].y = pp[1] + if not dead: + self.ga.circle("asdfsF", rotate_around( (pos[0] + pp[0]*self.scale, pos[1] + pp[1]*self.scale), xy0, self.angle), + 0.05 * self.scale, None, BLACK) + else: + ew = 0.6 + rw = ew / 6 + for k in range(2): + cross = [(-rw/2, ew/2), + (rw / 2, ew / 2), + (rw / 2, -ew / 2), + (-rw / 2, -ew / 2), + ] + cross = cross + [cross[0]] + cross = [rotate_around(c, (0,0), 45 + 90*k) for c in cross] + cc = [rotate_around( (pos[0]+x *self.scale+ pp[0], pos[1]+y *self.scale+ pp[1]), xy0, angle) for (x,y) in cross] + self.ga.polygon("asdfasf", cc, None, filled=True, fillColor=BLACK) + + + + + + + # self.cross = [ + # Rectangle(x=-ew / 2, y=-rw / 2, width=ew, height=rw, color=BLACK, group=self.group, batch=self.batch), + # Rectangle(x=-rw / 2, y=-ew / 2, width=rw, height=ew, color=BLACK, group=self.group, batch=self.batch)] + + + pass + # Circle(0, 0, .2, color=WHITE, batch=self.batch, group=self.group) + # + # self.normal = [Circle(0, 0, .2, color=WHITE, batch=self.batch, group=self.group), + # Circle(0, 0, 0.1, color=BLACK, batch=self.batch, group=self.group)] # radius was 0.08 + # ew = 0.6 + # rw = ew / 6 + # self.cross = [ + # Rectangle(x=-ew / 2, y=-rw / 2, width=ew, height=rw, color=BLACK, group=self.group, batch=self.batch), + # Rectangle(x=-rw / 2, y=-ew / 2, width=rw, 
height=ew, color=BLACK, group=self.group, batch=self.batch)] + # + # for e in self.cross: + # e.visible = dead + # return + # self.ga.polygon() + # colour = ImageColor.getcolor(GHOST_COLORS[self.agentIndex], "RGB") + # self.body_ = Polygon(*ghost_shape, color=colour, batch=self.batch, group=self.group) + # self.eyes_ = [Eye(order=self.group.order+1+k, pg=self.group, batch=self.batch) for k in range(2)] + # for k, e in enumerate(self.eyes_): + # e.group.translate(-.3 if k == 0 else .3, .3) + + +PACMAN_COLOR = (255, 255, 61) + + +# class Pacman(GroupedElement): +# body = None +# +# def __init__(self, grid_size, batch, pg=None, parent=None, order=0): +# self.delta = 0 +# self.GRID_SIZE = grid_size +# super().__init__(batch, pg=pg, parent=parent, order=order) +# self.set_animation(0, 4) +# +# def set_animation(self, frame, frames): +# pos = frame/frames +# width = 30 + 80 * math.sin(math.pi * pos) +# delta = width / 2 +# self.delta = delta * np.pi / 180 +# self.body._angle = 2*np.pi-2*self.delta +# self.body._start_angle = self.delta +# self.body._update_position() +# +# def set_direction(self, direction): +# if direction == 'Stop': +# pass +# else: +# angle = 0 +# if direction == 'East': +# angle = 0 +# elif direction == 'North': +# angle = np.pi/2 +# elif direction == 'West': +# angle = np.pi +# elif direction == 'South': +# angle = np.pi*1.5 +# self.group.rotate(angle) +# +# def render(self): +# width = 30 +# delta = width/2 +# delta = delta/180 * np.pi +# self.body = Sector(0, 0, self.GRID_SIZE/2, angle=2*np.pi-2*delta, start_angle=delta, +# color=PACMAN_COLOR, batch=self.batch, group=self.group) diff --git a/irlc/pacman/pacman_text_display.py b/irlc/pacman/pacman_text_display.py new file mode 100644 index 0000000000000000000000000000000000000000..72d4b107a57610227a7202e904808507bb9305c2 --- /dev/null +++ b/irlc/pacman/pacman_text_display.py @@ -0,0 +1,64 @@ +# This file may not be shared/redistributed without permission. 
class PacmanTextDisplay:
    """Text-only game display that prints the state to stdout.

    The display keeps two counters: ``agentCounter`` cycles through the agents
    as ``update`` is called once per agent move, and ``turn`` counts completed
    rounds (one move by every agent).
    """

    def __init__(self, speed=None):
        """Create the display.

        :param speed: Optional pause (in seconds) after each drawn frame. When
            given, it overwrites the module-level ``SLEEP_TIME``.
        """
        global SLEEP_TIME
        # initialize() resets these as well; defining them here means update()
        # no longer fails with AttributeError on a display that was never
        # initialized.
        self.turn = 0
        self.agentCounter = 0
        if speed is not None:
            SLEEP_TIME = speed

    def initialize(self, state, isBlue = False):
        """Draw the initial state, pause, and reset the turn counters.

        :param state: Game state; only ``print(state)`` is applied to it.
        :param isBlue: Unused; accepted for interface compatibility.
        """
        self.draw(state)
        self.pause()
        self.turn = 0
        self.agentCounter = 0

    def update(self, state):
        """Register one agent move and redraw when a full round has finished.

        :param state: Game state exposing ``agentStates``, ``_win``, ``_lose``
            and, when ``DISPLAY_MOVES`` is enabled, ``score`` plus the
            position getters used below.
        """
        numAgents = len(state.agentStates)
        self.agentCounter = (self.agentCounter + 1) % numAgents
        if self.agentCounter == 0:
            # Every agent has moved once: a new turn starts.
            self.turn += 1
            if DISPLAY_MOVES:
                ghosts = [nearestPoint(state.getGhostPosition(i)) for i in range(1, numAgents)]
                print("%4d) P: %-8s" % (self.turn, str(nearestPoint(state.getPacmanPosition()))),'| Score: %-5d' % state.score,'| Ghosts:', ghosts)
            if self.turn % DRAW_EVERY == 0:
                self.draw(state)
                self.pause()
        # Always show the final position, even in the middle of a round.
        if state._win or state._lose:
            self.draw(state)

    def pause(self):
        """Sleep for the module-level ``SLEEP_TIME`` seconds."""
        time.sleep(SLEEP_TIME)

    def draw(self, state):
        """Print the textual representation of ``state``."""
        print(state)

    def finish(self):
        """Nothing to clean up for a text display."""
        pass
+ """ + ( current_row, current_col ) = pos + + grid_row = int( current_row + 0.5 ) + grid_col = int( current_col + 0.5 ) + return ( grid_row, grid_col ) diff --git a/irlc/pacman/pacman_utils.py b/irlc/pacman/pacman_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..1c55dfb8791176d05e2d05f504a1ba112670e3ed --- /dev/null +++ b/irlc/pacman/pacman_utils.py @@ -0,0 +1,680 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +# pacman_utils.py +# ------- +# Licensing Information: You are free to use or extend these projects for +# educational purposes provided that (1) you do not distribute or publish +# solutions, (2) you retain this notice, and (3) you provide clear +# attribution to UC Berkeley, including a link to http://ai.berkeley.edu. +# +# Attribution Information: The Pacman AI projects were developed at UC Berkeley. +# The core projects and autograders were primarily created by John DeNero +# (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). +# Student side autograding was added by Brad Miller, Nick Hay, and +# Pieter Abbeel (pabbeel@cs.berkeley.edu). + + +# pacman_utils.py +# ------- +# Licensing Information: Please do not distribute or publish solutions to this +# project. You are free to use and extend these projects for educational +# purposes. The Pacman AI projects were developed at UC Berkeley, primarily by +# John DeNero (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu). 
class Configuration:
    """
    A Configuration holds the (x,y) coordinate of a character, along with its
    traveling direction.

    The convention for positions, like a graph, is that (0,0) is the lower left corner, x increases
    horizontally and y increases vertically. Therefore, north is the direction of increasing y, or (0,1).
    """

    def __init__(self, pos, direction):
        """Store the position ``pos`` (an ``(x, y)`` pair) and the travel ``direction``."""
        self.pos = pos
        self.direction = direction

    def getPosition(self):
        """Return the stored ``(x, y)`` position."""
        return self.pos

    def getDirection(self):
        """Return the stored travel direction."""
        return self.direction

    def isInteger(self):
        """Return True when both coordinates are whole numbers (on a grid point)."""
        x, y = self.pos
        return x == int(x) and y == int(y)

    def __eq__(self, other):
        """Two configurations are equal when position and direction both match.

        Comparing with ``None`` or any non-Configuration object yields False
        (via ``NotImplemented``) instead of raising AttributeError.
        """
        if not isinstance(other, Configuration):
            return NotImplemented
        return self.pos == other.pos and self.direction == other.direction

    def __hash__(self):
        """Hash consistent with ``__eq__``: combines position and direction."""
        x = hash(self.pos)
        y = hash(self.direction)
        return hash(x + 13 * y)

    def __str__(self):
        return "(x,y)="+str(self.pos)+", "+str(self.direction)

    def generateSuccessor(self, vector):
        """
        Generates a new configuration reached by translating the current
        configuration by the action vector.  This is a low-level call and does
        not attempt to respect the legality of the movement.

        Actions are movement vectors.
        """
        x, y = self.pos
        dx, dy = vector
        direction = Actions.vectorToDirection(vector)
        if direction == Directions.STOP:
            direction = self.direction  # There is no stop direction: keep facing the same way
        return Configuration((x + dx, y + dy), direction)
class Grid:
    """
    A 2-dimensional array of objects backed by a list of lists.  Data is accessed
    via grid[x][y] where (x,y) are positions on a Pacman map with x horizontal,
    y vertical and the origin (0,0) in the bottom left corner.

    The __str__ method constructs an output that is oriented like a pacman board.
    """
    def __init__(self, width, height, initialValue=False, bitRepresentation=None):
        """Create a ``width`` x ``height`` grid filled with ``initialValue``.

        :param width: Number of columns (x direction).
        :param height: Number of rows (y direction).
        :param initialValue: Boolean every cell starts with.
        :param bitRepresentation: Optional sequence of packed ints (the part of
            the ``packBits`` output after width and height) used to fill the grid.
        :raises Exception: If ``initialValue`` is not a boolean.
        """
        if initialValue not in [False, True]:
            raise Exception('Grids can only contain booleans')
        self.CELLS_PER_INT = 30

        self.width = width
        self.height = height
        self.data = [[initialValue for y in range(height)] for x in range(width)]
        if bitRepresentation:
            self._unpackBits(bitRepresentation)

    def __getitem__(self, i):
        return self.data[i]

    def __setitem__(self, key, item):
        self.data[key] = item

    def __str__(self):
        # One character per cell (first letter of its str()), rows emitted
        # top-to-bottom so the output looks like the board.
        out = [[str(self.data[x][y])[0] for x in range(self.width)] for y in range(self.height)]
        out.reverse()
        return '\n'.join([''.join(x) for x in out])

    def __eq__(self, other):
        """Grids are equal when their cell data match; other types never compare equal."""
        if not isinstance(other, Grid):
            return NotImplemented
        return self.data == other.data

    def __hash__(self):
        # Interpret the cells (column by column) as the bits of one integer.
        base = 1
        h = 0
        for column in self.data:
            for cell in column:
                if cell:
                    h += base
                base *= 2
        return hash(h)

    def copy(self):
        """Return a new grid whose columns are copies of this grid's columns."""
        g = Grid(self.width, self.height)
        g.data = [x[:] for x in self.data]
        return g

    def deepCopy(self):
        return self.copy()

    def shallowCopy(self):
        """Return a new grid object that shares the underlying data lists."""
        g = Grid(self.width, self.height)
        g.data = self.data
        return g

    def count(self, item =True ):
        """Return the number of cells equal to ``item``."""
        return sum([x.count(item) for x in self.data])

    def asList(self, key = True):
        """Return the ``(x, y)`` positions whose cell equals ``key``."""
        return [(x, y)
                for x in range(self.width)
                for y in range(self.height)
                if self[x][y] == key]

    def packBits(self):
        """
        Returns an efficient int list representation

        (width, height, bitPackedInts...)
        """
        bits = [self.width, self.height]
        currentInt = 0
        for i in range(self.height * self.width):
            bit = self.CELLS_PER_INT - (i % self.CELLS_PER_INT) - 1
            x, y = self._cellIndexToPosition(i)
            if self[x][y]:
                currentInt += 2 ** bit
            if (i + 1) % self.CELLS_PER_INT == 0:
                bits.append(currentInt)
                currentInt = 0
        bits.append(currentInt)
        return tuple(bits)

    def _cellIndexToPosition(self, index):
        """Map a flat cell index to ``(x, y)``; cells are numbered column by column.

        Integer division is required here: true division (``/``) yields a float
        x, which cannot be used to index the backing lists.
        """
        x, y = divmod(index, self.height)
        return x, y

    def _unpackBits(self, bits):
        """
        Fills in data from a bit-level representation
        """
        cell = 0
        for packed in bits:
            for bit in self._unpackInt(packed, self.CELLS_PER_INT):
                # The last int is padded; stop once every cell is filled.
                if cell == self.width * self.height:
                    return
                x, y = self._cellIndexToPosition(cell)
                self[x][y] = bit
                cell += 1

    def _unpackInt(self, packed, size):
        """Decode the ``size`` most significant cell-bits of ``packed`` into booleans.

        :raises ValueError: If ``packed`` is negative.
        """
        bools = []
        if packed < 0: raise ValueError("must be a positive integer")
        for i in range(size):
            n = 2 ** (self.CELLS_PER_INT - i - 1)
            if packed >= n:
                bools.append(True)
                packed -= n
            else:
                bools.append(False)
        return bools
def getPossibleActions(config, walls):
    """Return the directions an agent in ``config`` may move in.

    :param config: Object exposing ``pos`` (an ``(x, y)`` pair) and ``getDirection()``.
    :param walls: Structure indexed as ``walls[x][y]``; a truthy entry blocks the cell.
    :return: List of direction names. An agent that is not on a grid point
        (further than ``Actions.TOLERANCE`` away) can only keep its direction.
    """
    x, y = config.pos
    col, row = int(x + 0.5), int(y + 0.5)

    # In between grid points, all agents must continue straight
    if abs(x - col) + abs(y - row) > Actions.TOLERANCE:
        return [config.getDirection()]

    # On a grid point: every direction whose neighbouring cell is not a wall.
    return [
        action
        for action, (dx, dy) in Actions._directionsAsList
        if not walls[col + dx][row + dy]
    ]

getPossibleActions = staticmethod(getPossibleActions)
class GameStateData:
    """
    Data packet describing one game position: food grid, capsules, agent
    states, layout and score, plus bookkeeping flags about the last move.
    """
    def __init__( self, prevState = None ):
        """
        Generates a new data packet by copying information from its predecessor.

        Without a predecessor only the bookkeeping flags are set; call
        ``initialize`` to populate food, capsules, agents, layout and score.
        """
        if prevState is not None:
            self.food = prevState.food.shallowCopy()
            self.capsules = prevState.capsules[:]
            self.agentStates = self.copyAgentStates( prevState.agentStates )
            self.layout = prevState.layout
            self._eaten = prevState._eaten
            self.score = prevState.score

        self._foodEaten = None
        self._foodAdded = None
        self._capsuleEaten = None
        self._agentMoved = None
        self._lose = False
        self._win = False
        self.scoreChange = 0

    def deepCopy( self ):
        """Return a copy with its own food grid and layout; move/win/lose flags are carried over."""
        state = GameStateData( self )
        state.food = self.food.deepCopy()
        state.layout = self.layout.deepCopy()
        state._agentMoved = self._agentMoved
        state._foodEaten = self._foodEaten
        state._foodAdded = self._foodAdded
        state._capsuleEaten = self._capsuleEaten

        # Tue: I added these. I got no idea if this will screw things up. But why should they not be deep copied?
        state._win = self._win
        state._lose = self._lose
        return state

    def copyAgentStates( self, agentStates ):
        """Return a list with ``copy()`` applied to every agent state."""
        return [agentState.copy() for agentState in agentStates]

    def __eq__( self, other ):
        """
        Allows two states to be compared.

        Agent states, food and capsules must match; the score is deliberately
        not compared. Objects of any other type never compare equal.
        """
        if not isinstance(other, GameStateData):
            return NotImplemented
        if not self.agentStates == other.agentStates: return False
        if not self.food == other.food: return False
        if not self.capsules == other.capsules: return False
        # if not self.score == other.score: return False # This i am very unsure about.
        return True

    def __hash__( self ):
        """
        Allows states to be keys of dictionaries.

        Built from the same fields ``__eq__`` compares (the score is excluded).
        """
        return int((hash(tuple(self.agentStates)) + 13*hash(self.food) + 113* hash(tuple(self.capsules))) % 1048575 )

    def __str__( self ):
        width, height = self.layout.width, self.layout.height
        board = Grid(width, height)
        # The food may still be in packed (tuple) form; unpack it in place.
        if type(self.food) is tuple:
            self.food = reconstituteGrid(self.food)
        food, walls = self.food, self.layout.walls
        for x in range(width):
            for y in range(height):
                board[x][y] = self._foodWallStr(food[x][y], walls[x][y])

        for agentState in self.agentStates:
            if agentState is None: continue
            if agentState.configuration is None: continue
            x,y = [int( i ) for i in nearestPoint( agentState.configuration.pos )]
            agent_dir = agentState.configuration.direction
            if agentState.isPacman:
                board[x][y] = self._pacStr( agent_dir )
            else:
                board[x][y] = self._ghostStr( agent_dir )

        for x, y in self.capsules:
            board[x][y] = 'o'

        return str(board) + ("\nScore: %d\n" % self.score)

    # def str_no_score(self): #
    #     return "\n".join(str(self).splitlines()[:-1])

    def _foodWallStr( self, hasFood, hasWall ):
        """Character for a cell without an agent: food '.', wall '%', otherwise blank."""
        if hasFood:
            return '.'
        elif hasWall:
            return '%'
        else:
            return ' '

    def _pacStr( self, dir ):
        """Character for Pacman facing ``dir`` (the symbol is the open mouth)."""
        if dir == Directions.NORTH:
            return 'v'
        if dir == Directions.SOUTH:
            return '^'
        if dir == Directions.WEST:
            return '>'
        return '<'

    def _ghostStr( self, dir ):
        """Character for a ghost; always 'G', the direction is ignored."""
        return 'G'

    def initialize( self, layout, numGhostAgents ):
        """
        Creates an initial game state from a layout array (see layout.py).

        At most ``numGhostAgents`` ghosts from the layout are kept.
        """
        self.food = layout.food.copy()
        self.capsules = layout.capsules[:]
        self.layout = layout
        self.score = 0
        self.scoreChange = 0

        self.agentStates = []
        numGhosts = 0
        for isPacman, pos in layout.agentPositions:
            if not isPacman:
                if numGhosts == numGhostAgents:
                    continue # Max ghosts reached already
                numGhosts += 1
            self.agentStates.append( AgentState( Configuration( pos, Directions.STOP), isPacman) )
        self._eaten = [False for a in self.agentStates]
def chooseFromDistribution( distribution ):
    """Sample one key from a mapping of outcome -> probability.

    ``distribution`` must provide ``items()`` (e.g. a dict). The values are
    handed to ``np.random.choice`` as the probability vector ``p``.
    """
    outcomes, weights = zip(*distribution.items())
    # A single draw is requested; unwrap it from the length-1 result.
    draw = np.random.choice(list(outcomes), 1, replace=True, p=list(weights))
    return draw[0]
def getDistribution( self, state ):
    """Return a distribution over legal actions that favours chasing Pacman.

    The actions that bring the ghost closest to Pacman (or furthest away when
    the ghost is scared) share ``prob_attack`` (resp. ``prob_scaredFlee``);
    the remaining probability mass is spread uniformly over all legal actions.

    :param state: Game state providing ``getGhostState``, ``getLegalActions``,
        ``getGhostPosition`` and ``getPacmanPosition``.
    :return: ``defaultdict`` mapping action -> probability (empty when the
        ghost has no legal action).
    """
    # Read variables from state
    ghostState = state.getGhostState( self.index )
    legalActions = state.getLegalActions( self.index )
    pos = state.getGhostPosition( self.index )
    isScared = ghostState.scaredTimer > 0

    dist = defaultdict(lambda: 0)
    if len(legalActions) == 0:
        # Nothing to choose from; min()/max() below would fail on an empty list.
        return dist

    speed = 0.5 if isScared else 1

    actionVectors = [Actions.directionToVector( a, speed ) for a in legalActions]
    newPositions = [( pos[0]+a[0], pos[1]+a[1] ) for a in actionVectors]
    pacmanPosition = state.getPacmanPosition()

    # Select best actions given the state. The Manhattan distance is computed
    # inline: no `manhattanDistance` helper is defined or imported in this module.
    distancesToPacman = [abs( x - pacmanPosition[0] ) + abs( y - pacmanPosition[1] )
                         for x, y in newPositions]
    if isScared:
        bestScore = max( distancesToPacman )
        bestProb = self.prob_scaredFlee
    else:
        bestScore = min( distancesToPacman )
        bestProb = self.prob_attack
    bestActions = [action for action, distance in zip( legalActions, distancesToPacman ) if distance == bestScore]

    # Construct distribution
    for a in bestActions: dist[a] = bestProb / len(bestActions)
    for a in legalActions: dist[a] += ( 1-bestProb ) / len(legalActions)

    # Normalize so the probabilities sum to one.
    total = sum(dist.values())
    for action in list(dist):
        dist[action] /= total
    return dist
class AdditionQuestion(UTestCase):
    """ Problem 1: Adding two numbers """
    def test_add(self):
        """ Adding two numbers together """
        # Two positive operands.
        self.assertEqual(add(2, 3), 5)
        # A negative operand: the sum must carry the sign correctly.
        self.assertEqual(add(2, -917), -915)
class MeanOfDie(UTestCase):
    """ Problem 3: Mean of die """
    def test_mean_value(self):
        """ Compute mean of two dice """
        # Outcome -> probability for a loaded six-sided die (the values sum to 1).
        p_die = {1: 0.20,
                 2: 0.10,
                 3: 0.15,
                 4: 0.05,
                 5: 0.10,
                 6: 0.40}
        # NOTE(review): assertL2 is called without an explicit expected value;
        # presumably unitgrade compares against a stored reference within
        # `tol` -- confirm against the unitgrade documentation.
        self.assertL2(mean_value(p_die), tol=0.0001)
        # Second distribution: outcomes -1 and +1 with equal probability.
        self.assertL2(mean_value({-1: 0.5, 1: 0.5}), tol=0.0001)
class ClassUse(UTestCase):
    title = "Problem 7: Using classes"

    # NOTE: the method name and docstring say "shop_smarter", but the function
    # exercised below is `shop_smart`.
    def test_shop_smarter(self):
        """ Testing the shop_smarter function """
        price_of_fruits = {'apples': 2, 'oranges': 1, 'pears': 1.5, 'mellon': 10}
        shopA = OnlineFruitShop('shopA', price_of_fruits)
        shopB = OnlineFruitShop('shopB', {'apples': 1.0, 'oranges': 5.0})

        shops = [shopA, shopB]
        # First order: apples and oranges, both sold by shopA and shopB.
        order = {'apples': 1.0,
                 'oranges': 3.0}
        # Only the name of the selected shop is checked.
        self.assertEqualC(shop_smart(order, shops).name)
        order = {'apples': 3.0} # test with a new order.
        self.assertEqualC(shop_smart(order, shops).name)
''' +import bz2, base64 +exec(bz2.decompress(base64.b64decode(''))) \ No newline at end of file diff --git a/irlc/project0/unitgrade_data/AdditionQuestion.pkl b/irlc/project0/unitgrade_data/AdditionQuestion.pkl new file mode 100644 index 0000000000000000000000000000000000000000..11c0af9d431d3f61b3f2af0fba319cf8b8bb1958 Binary files /dev/null and b/irlc/project0/unitgrade_data/AdditionQuestion.pkl differ diff --git a/irlc/project0/unitgrade_data/BasicClass.pkl b/irlc/project0/unitgrade_data/BasicClass.pkl new file mode 100644 index 0000000000000000000000000000000000000000..110aa845e0142c58c171373055b1d656633a26d1 Binary files /dev/null and b/irlc/project0/unitgrade_data/BasicClass.pkl differ diff --git a/irlc/project0/unitgrade_data/ClassUse.pkl b/irlc/project0/unitgrade_data/ClassUse.pkl new file mode 100644 index 0000000000000000000000000000000000000000..25c6e361e111f36205adfa0ef92620470f6a0198 Binary files /dev/null and b/irlc/project0/unitgrade_data/ClassUse.pkl differ diff --git a/irlc/project0/unitgrade_data/FruitsOrdered.pkl b/irlc/project0/unitgrade_data/FruitsOrdered.pkl new file mode 100644 index 0000000000000000000000000000000000000000..b55dba6dd8f3b84b5a4bdd0a9f85ba60a0ce7f29 Binary files /dev/null and b/irlc/project0/unitgrade_data/FruitsOrdered.pkl differ diff --git a/irlc/project0/unitgrade_data/Inheritance.pkl b/irlc/project0/unitgrade_data/Inheritance.pkl new file mode 100644 index 0000000000000000000000000000000000000000..32072c814e584f70b67d9d0895c4d07be7286c27 Binary files /dev/null and b/irlc/project0/unitgrade_data/Inheritance.pkl differ diff --git a/irlc/project0/unitgrade_data/MeanOfDie.pkl b/irlc/project0/unitgrade_data/MeanOfDie.pkl new file mode 100644 index 0000000000000000000000000000000000000000..27877f6a8e70ffb0d0ad3cac120b703d81d980fd Binary files /dev/null and b/irlc/project0/unitgrade_data/MeanOfDie.pkl differ diff --git a/irlc/project0/unitgrade_data/MisterfyQuestion.pkl b/irlc/project0/unitgrade_data/MisterfyQuestion.pkl new file 
mode 100644 index 0000000000000000000000000000000000000000..2359530f1174d72bd344f9add58dd05e577704fb Binary files /dev/null and b/irlc/project0/unitgrade_data/MisterfyQuestion.pkl differ diff --git a/irlc/project1/Latex/02465project1_handin.tex b/irlc/project1/Latex/02465project1_handin.tex new file mode 100644 index 0000000000000000000000000000000000000000..f59e1d27e2cf427513a83618a9f3df9d071bd70b --- /dev/null +++ b/irlc/project1/Latex/02465project1_handin.tex @@ -0,0 +1,107 @@ +\documentclass[12pt,twoside]{article} +%\usepackage[table]{xcolor} % important to avoid options clash. +%\input{02465shared_preamble} +%\usepackage{cleveref} +\usepackage{url} +\usepackage{graphics} +\usepackage{multicol} +\usepackage{rotate} +\usepackage{rotating} +\usepackage{booktabs} +\usepackage{hyperref} +\usepackage{pifont} +\usepackage{latexsym} +\usepackage[english]{babel} +\usepackage{epstopdf} +\usepackage{etoolbox} +\usepackage{amsmath} +\usepackage{amssymb} +\usepackage{multirow,epstopdf} +\usepackage{fancyhdr} +\usepackage{booktabs} +\usepackage{xcolor} +\newcommand\redt[1]{ {\textcolor[rgb]{0.60, 0.00, 0.00}{\textbf{ #1} } } } + + +\newcommand{\m}[1]{\boldsymbol{ #1}} +\newcommand{\yoursolution}{ \redt{(your solution here) } } + + + +\title{ Report 1 hand-in } +\date{ \today } +\author{Alice (\texttt{s000001})\and Bob (\texttt{s000002})\and Clara (\texttt{s000003}) } + +\begin{document} +\maketitle + +\begin{table}[ht!] +\caption{Attribution table. 
Feel free to add/remove rows and columns} +\begin{tabular}{llll} +\toprule + & Alice & Bob & Clara \\ +\midrule + 1: A basic blaster-business & 0-100\% & 0-100\% & 0-100\% \\ + 2: Warmup & 0-100\% & 0-100\% & 0-100\% \\ + 3: Manually computing $J_{N-1}$ & 0-100\% & 0-100\% & 0-100\% \\ + 4: Compute optimal policy and value function & 0-100\% & 0-100\% & 0-100\% \\ + 5: Kiosk2 & 0-100\% & 0-100\% & 0-100\% \\ + 6: Explaining the policy & 0-100\% & 0-100\% & 0-100\% \\ + 7: Policy explanation continued & 0-100\% & 0-100\% & 0-100\% \\ + 8: Go east & 0-100\% & 0-100\% & 0-100\% \\ + 9: Describe the go-east problem & 0-100\% & 0-100\% & 0-100\% \\ + 10: Predict consequence of actions & 0-100\% & 0-100\% & 0-100\% \\ + 11: Possible future states & 0-100\% & 0-100\% & 0-100\% \\ + 12: Shortest path & 0-100\% & 0-100\% & 0-100\% \\ + 13: Predict consequence of actions with one ghost & 0-100\% & 0-100\% & 0-100\% \\ + 14: Possible future states with one ghost & 0-100\% & 0-100\% & 0-100\% \\ + 15: Optimal one-ghost planning & 0-100\% & 0-100\% & 0-100\% \\ + 16: Predict consequence of actions with several ghosts & 0-100\% & 0-100\% & 0-100\% \\ + 17: Future states & 0-100\% & 0-100\% & 0-100\% \\ + 18: Optimal planning & 0-100\% & 0-100\% & 0-100\% \\ +\bottomrule +\end{tabular} +\end{table} + +%\paragraph{Statement about collaboration:} +%Please edit this section to reflect how you have used external resources. The following statement will in most cases suffice: +%\emph{The code in the irls/project1 directory is entirely} + +%\paragraph{Main report:} +Headings have been inserted in the document for readability. You only have to edit the part which says \yoursolution. 
+ +\section{The kiosk (\texttt{kiosk.py})} +\subsubsection*{{\color{red}Problem 1: A basic blaster-business}} + +\yoursolution +\redt{To get you started: \begin{align} + N & = 14 \\ + \mbox{for $k=0,\dots,N$: }\quad \mathcal{S}_k & = \dots \\ + \mbox{for $k=0,\dots,N-1$: }\quad \mathcal{A}_k(x_k) & = \dots \\ + & \vdots +\end{align} } + +\subsubsection*{{\color{red}Problem 3: Manually computing $J_{N-1}$}} + + \yoursolution + $$ + J_{N-1}(20) = ... + $$ + +\subsubsection*{{\color{red}Problem 6: Explaining the policy}} + + The first policy... this can be explained by noting ... \yoursolution + +\subsubsection*{{\color{red}Problem 7: Policy explanation continued}} + + $$\mu_{N-1}(0) = ...$$ +\yoursolution + +\section{Avoid the droid (\texttt{pacman.py})} +\subsubsection*{{\color{red}Problem 9: Describe the go-east problem}} + + The environment is an example of a .... \\ + The controller is an example of a ... + \yoursolution + +\end{document} \ No newline at end of file diff --git a/irlc/project1/Latex/figures/kiosk1.pdf b/irlc/project1/Latex/figures/kiosk1.pdf new file mode 100644 index 0000000000000000000000000000000000000000..54c179fa1703c83e77398a3f6382d3e685fc8fd9 Binary files /dev/null and b/irlc/project1/Latex/figures/kiosk1.pdf differ diff --git a/irlc/project1/Latex/figures/kiosk2.pdf b/irlc/project1/Latex/figures/kiosk2.pdf new file mode 100644 index 0000000000000000000000000000000000000000..07dd964485a357336d64c2393ce3fc97c8af1e14 Binary files /dev/null and b/irlc/project1/Latex/figures/kiosk2.pdf differ diff --git a/irlc/project1/Latex/figures/your_answer.pdf b/irlc/project1/Latex/figures/your_answer.pdf new file mode 100644 index 0000000000000000000000000000000000000000..d8c092974e20aaaf1165958a53bdce3a2ebdbf8f Binary files /dev/null and b/irlc/project1/Latex/figures/your_answer.pdf differ diff --git a/irlc/project1/__init__.py b/irlc/project1/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..a56057c84d0ceac54aab1d40ba0f370c77fe10be --- /dev/null +++ b/irlc/project1/__init__.py @@ -0,0 +1 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. diff --git a/irlc/project1/kiosk.py b/irlc/project1/kiosk.py new file mode 100644 index 0000000000000000000000000000000000000000..70f33719ab70e782558588bd3d336e4d112177fc --- /dev/null +++ b/irlc/project1/kiosk.py @@ -0,0 +1,70 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +""" +This project resembles the Inventory-control problem discussed in (Her24, Subsection 5.1.2) but with more complicated rules. +If you are stuck, the inventory-control problem will be a good place to start. + +I recommend to use the DP_stochastic function (as we did with the inventory-control example). This means +your main problem is to build appropriate DPModel-classes to represent the different problems. + +References: + [Her24] Tue Herlau. Sequential decision making. (Freely available online), 2024. +""" +from irlc.ex02.dp_model import DPModel +from irlc.ex02.dp import DP_stochastic +import matplotlib.pyplot as plt +from scipy.stats import binom +from irlc import savepdf +import numpy as np + +def plot_policy(pi, title, pdf): + """ Helper function to plot the policy functions pi, as generated by the DP_stochastic function. This function + can be used to visualize which actions are taken in which state (y-axis) at which time step (x-axis). """ + N = len(pi) + W = max(pi[0].keys()) + A = np.zeros((W, N)) + for i in range(W): + for j in range(N): + A[i, j] = pi[j][i] + plt.imshow(A) + plt.title(title) + savepdf(pdf) + plt.show() + +# TODO: 51 lines missing. 
+raise NotImplementedError("Insert your solution and remove this error.") + +def warmup_states(): + # TODO: 1 lines missing. + raise NotImplementedError("return state set") + +def warmup_actions(): + # TODO: 1 lines missing. + raise NotImplementedError("return action set") + +def solve_kiosk_1(): + # TODO: 1 lines missing. + raise NotImplementedError("Return cost and policy here (same format as DP_stochastic)") + +def solve_kiosk_2(): + # TODO: 1 lines missing. + raise NotImplementedError("Return cost and policy here (same format as DP_stochastic)") + + +def main(): + # Problem 14 + print("Available states S_0:", warmup_states()) + print("Available actions A_0(x_0):", warmup_actions()) + + J, pi = solve_kiosk_1() # Problem 16 + print("Kiosk1: Expected profits: ", -J[0][0], " imperial credits") + plot_policy(pi, "Kiosk1", "Latex/figures/kiosk1") + plt.show() + + J, pi = solve_kiosk_2() # Problem 17 + print("Kiosk 2: Expected profits: ", -J[0][0], " imperial credits") + plot_policy(pi, "Kiosk2", "Latex/figures/kiosk2") + plt.show() + + +if __name__ == "__main__": + main() diff --git a/irlc/project1/pacman.py b/irlc/project1/pacman.py new file mode 100644 index 0000000000000000000000000000000000000000..6ad08ecd50c9ce887f4e572d59e72cb21693d9b7 --- /dev/null +++ b/irlc/project1/pacman.py @@ -0,0 +1,169 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
+from collections import defaultdict +from irlc import train +from irlc.ex02.dp_model import DPModel +from irlc.ex02.dp import DP_stochastic +from irlc.ex02.dp_agent import DynamicalProgrammingAgent +from irlc.pacman.pacman_environment import PacmanEnvironment +from irlc.pacman.gamestate import GameState + +east = """ +%%%%%%%% +% P .% +%%%%%%%% """ + +east2 = """ +%%%%%%%% +% P.% +%%%%%%%% """ + +SS2tiny = """ +%%%%%% +%.P % +% GG.% +%%%%%% +""" + +SS0tiny = """ +%%%%%% +%.P % +% .% +%%%%%% +""" + +SS1tiny = """ +%%%%%% +%.P % +% G.% +%%%%%% +""" + +datadiscs = """ +%%%%%%% +% .% +%.P%% % +%. .% +%%%%%%% +""" + +# TODO: 30 lines missing. +raise NotImplementedError("Put your own code here") + +def p_next(x : GameState, u: str): + """ Given the agent is in GameState x and takes action u, the game will transition to a new state xp. + The state xp will be random when there are ghosts. This function should return a dictionary of the form + + {..., xp: p, ...} + + of all possible next states xp and their probability -- you need to compute this probability. + + Hints: + * In the above, xp should be a GameState, and p will be a float. These are generated using the functions in the GameState x. + * Start simple (zero ghosts). Then make it work with one ghosts, and then finally with any number of ghosts. + * Remember the ghosts move at random. I.e. if a ghost has 3 available actions, it will choose one with probability 1/3 + * The slightly tricky part is that when there are multiple ghosts, different actions by the individual ghosts may lead to the same final state + * Check the probabilities sum to 1. This will be your main way of debugging your code and catching issues relating to the previous point. + """ + # TODO: 8 lines missing. 
+ raise NotImplementedError("Return a dictionary {.., xp: p, ..} where xp is a possible next state and p the probability") + return states + + +def go_east(map): + """ Given a map-string map (see examples at the top of this file) that can be solved by only going east, this will return + a list of states Pacman will traverse. The list it returns should therefore be of the form: + + [s0, s1, s2, ..., sn] + + where each sk is a GameState object, the first element s0 is the start-configuration (corresponding to that in the Map), + and the last configuration sn is a won GameState obtained by going east. + + Note this function should work independently of the number of required east-actions. + + Hints: + * Use the PacmanEnvironment class. The report description will contain information about how to set it up, as will pacman_demo1.py + * Use this environment to get the first GameState, then use the recommended functions to go east + """ + # TODO: 5 lines missing. + raise NotImplementedError("Return the list of states pacman will traverse if he goes east until he wins the map") + return states + +def get_future_states(x, N): + # TODO: 4 lines missing. + raise NotImplementedError("return a list-of-list of future states [S_0,\dots,S_N]. Each S_k is a state space, i.e. a list of GameState objects.") + return state_spaces + +def win_probability(map, N=10): + """ Assuming you get a reward of -1 on winning (and otherwise zero), the win probability is -J_pi(x_0). """ + # TODO: 5 lines missing. + raise NotImplementedError("Return the chance of winning the given map within N steps or less.") + return win_probability + +def shortest_path(map, N=10): + """ If each move has a cost of 1, the shortest path is the path with the lowest cost. + The actions should be the list of actions taken. + The states should be a list of states the agent visits. The first should be the initial state and the last + should be the won state. """ + # TODO: 4 lines missing. 
+ raise NotImplementedError("Return the cost of the shortest path, the list of actions taken, and the list of states.") + return actions, states + + +def no_ghosts(): + # Check the pacman_demo.py file for help on the GameState class and how to get started. + # This function contains examples of calling your functions. However, you should use unitgrade to verify correctness. + + ## Problem 1: Lets try to go East. Run this code to see if the states you return looks sensible. + states = go_east(east) + for s in states: + print(str(s)) + + ## Problem 3: try the p_next function for a few empty environments. Does the result look sensible? + x, _ = PacmanEnvironment(layout_str=east).reset() + action = x.A()[0] + print(f"Transitions when taking action {action} in map: 'east'") + print(x) + print(p_next(x, action)) # use str(state) to get a nicer representation. + + print(f"Transitions when taking action {action} in map: 'east2'") + x, _ = PacmanEnvironment(layout_str=east2).reset() + print(x) + print(p_next(x, action)) + + ## Problem 4 + print(f"Checking states space S_1 for k=1 in SS0tiny:") + x, _ = PacmanEnvironment(layout_str=SS0tiny).reset() + states = get_future_states(x, N=10) + for s in states[1]: # Print all elements in S_1. + print(s) + print("States at time k=10, |S_10| =", len(states[10])) + + ## Problem 6 + N = 20 # Planning horizon + action, states = shortest_path(east, N) + print("east: Optimal action sequence:", action) + + action, states = shortest_path(datadiscs, N) + print("datadiscs: Optimal action sequence:", action) + + action, states = shortest_path(SS0tiny, N) + print("SS0tiny: Optimal action sequence:", action) + + +def one_ghost(): + # Win probability when planning using a single ghost. 
Notice this tends to increase with planning depth + wp = [] + for n in range(10): + wp.append(win_probability(SS1tiny, N=n)) + print(wp) + print("One ghost:", win_probability(SS1tiny, N=12)) + + +def two_ghosts(): + # Win probability when planning using two ghosts + print("Two ghosts:", win_probability(SS2tiny, N=12)) + +if __name__ == "__main__": + no_ghosts() + one_ghost() + two_ghosts() diff --git a/irlc/project1/pacman_demo1.py b/irlc/project1/pacman_demo1.py new file mode 100644 index 0000000000000000000000000000000000000000..bf74e07b095507c77acd59521cd892a40b11d1ac --- /dev/null +++ b/irlc/project1/pacman_demo1.py @@ -0,0 +1,53 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +from irlc.pacman.pacman_environment import PacmanEnvironment +from irlc.project1.pacman import east, datadiscs, SS1tiny, SS2tiny +from irlc import interactive, savepdf, Agent, train +import matplotlib +matplotlib.use('qtagg') + +count = """ +%%%% +%P % +%..% +%%%% +""" + + +if __name__ == "__main__": + # Example interaction with an environment: + # Instantiate the map 'east' and get a GameState instance: + env = PacmanEnvironment(layout_str=east, render_mode='human') + x, info = env.reset() # x is a irlc.pacman.gamestate.GameState object. See the online documentation for more examples. + print("Start configuration of board:") + print(x) + env.close() # If you use render_mode = 'human', I recommend you use env.close() at the end of the code to free up graphics resources. + # The GameState object `x` has a handful of useful functions. 
The important ones are: + # x.A() # Action space + # x.f(action) # State resulting in taking action 'action' in state 'x' + # x.players() # Number of agents on board (at least 1) + # x.player() # Whose turn it is (player = 0 is us) + # x.is_won() # True if we have won + # x.is_lost() # True if we have lost + # You can check if two GameState objects x1 and x2 are the same by simply doing x1 == x2. + # There are other functions in the GameState class, but I advise against using them. + from irlc.pacman.pacman_environment import PacmanEnvironment, datadiscs + env = PacmanEnvironment(layout_str=datadiscs, render_mode='human') + s, _ = env.reset() + + savepdf('pacman_east', env=env) + env.close() + + env = PacmanEnvironment(layout_str=datadiscs, render_mode='human') + env.reset() + savepdf('pacman_datadiscs', env=env) + env.close() + + env = PacmanEnvironment(layout_str=SS1tiny, render_mode='human') + env.reset() + savepdf('pacman_SS1tiny', env=env) + env.close() + + env = PacmanEnvironment(layout_str=SS2tiny, render_mode='human') + env.reset() + savepdf('pacman_SS2tiny', env=env) + env.close() diff --git a/irlc/project1/pacman_demo2.py b/irlc/project1/pacman_demo2.py new file mode 100644 index 0000000000000000000000000000000000000000..a3bf61d9756b70f810ff3f962ee59cb1f478bc11 --- /dev/null +++ b/irlc/project1/pacman_demo2.py @@ -0,0 +1,11 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
+from irlc.pacman.pacman_environment import PacmanEnvironment +from irlc.project1.pacman import east, datadiscs, SS1tiny, SS2tiny +from irlc import interactive, savepdf, Agent, train + +if __name__ == "__main__": + env = PacmanEnvironment(layout_str=datadiscs, render_mode='human') + env, agent = interactive(env, Agent(env)) + stats, trajectory = train(env, agent, num_episodes=1) + print("First state was\n", trajectory[0].state[0]) + env.close() diff --git a/irlc/project1/project1_grade.py b/irlc/project1/project1_grade.py new file mode 100644 index 0000000000000000000000000000000000000000..1321a9d1f6c241b906a94210aeae8fb43aa0d013 --- /dev/null +++ b/irlc/project1/project1_grade.py @@ -0,0 +1,4 @@ +# irlc/project1/project1_tests.py +''' WARNING: Modifying, decompiling or otherwise tampering with this script, it's data or the resulting .token file will be investigated as a cheating attempt. ''' +import bz2, base64 +exec(bz2.decompress(base64.b64decode(''))) \ No newline at end of file diff --git a/irlc/project1/project1_tests.py b/irlc/project1/project1_tests.py new file mode 100644 index 0000000000000000000000000000000000000000..dd846223725ae942bc57f2ead0768d4e3bb12de1 --- /dev/null +++ b/irlc/project1/project1_tests.py @@ -0,0 +1,377 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
+from unitgrade import UTestCase, Report +from irlc.pacman.gamestate import GameState +from irlc.pacman.pacman_environment import PacmanEnvironment +import numpy as np +from unitgrade import hide + +def get_starting_state(name): + s0, _ = PacmanEnvironment(layout_str=get_map(name)).reset() + return s0 + +def get_map(name): + from irlc.project1.pacman import east, east2, SS0tiny, datadiscs, SS1tiny, SS2tiny + names2maps = {'east': east, + 'east2': east2, + 'datadiscs': datadiscs, + 'SS0tiny': SS0tiny, + 'SS1tiny': SS1tiny, + 'SS2tiny': SS2tiny, + } + return names2maps[name] + +class Pacman1(UTestCase): + """ Problem 1: The go_east function """ + + def test_states_length(self): + from irlc.project1.pacman import go_east, east + self.title = "Checking number of states" + self.assertEqualC(len(go_east(east))) + # assert False + + + def test_first_state(self): + from irlc.project1.pacman import go_east, east + self.title = "Checking first state" + self.assertEqualC(str(go_east(east))[0]) # string representation of the first state. + + def test_all_states(self): + self.title = "Checking complete output" + from irlc.project1.pacman import go_east, east + self.assertEqualC(tuple(str(s) for s in go_east(east))) + + +class Pacman3(UTestCase): + """ Problem 3: the p_next function without droids """ + map = 'east' + action = 'East' + + def get_transitions(self): + from irlc.project1.pacman import p_next + + state = get_starting_state(self.map) + state_transitions = p_next(state, self.action) + self.assertIsInstance(state_transitions, dict) + for x in state_transitions: # Test if each new state is actually a GameState. + self.assertIsInstance(x, GameState) + dd = {s: np.round(p, 4) for s, p in state_transitions.items()} + return dd + + def test_dictionary_size(self): + """ Is the number of keys/values in the dictionary correct? 
""" + # print(self.get_expected_test_value()) + self.assertEqualC(len(self.get_transitions())) + # self.get_expected_value() + + + def test_probabilities(self): + """ Does the probabilities have the right value? """ + self.assertEqualC(set(self.get_transitions().values())) + + def test_states(self): + """ Does the dictionary contains the right states """ + self.assertEqualC(set(self.get_transitions().keys())) + + def test_everything(self): + """ Test both states and probabilities """ + self.assertEqualC(self.get_transitions()) + + +class Pacman4(UTestCase): + """ Problem 4: Compute the state spaces as a list [S_0, ..., S_N] on the map 'east' using N = 7 """ + map = 'east' + N = 7 + + @property + def states(self): + return self.__class__.states_ + + @property + def sizes(self): + return self.__class__.sizes_ + + @classmethod + def setUpClass(cls): + from irlc.project1.pacman import get_future_states + states = get_future_states(get_starting_state(cls.map), cls.N) + assert isinstance(states, list) + for S in states: + assert isinstance(S, list) + for s in S: + assert isinstance(s, GameState) + cls.sizes_ = [len(S) for S in states] + cls.states_ = [set(S) for S in states] + + def test_state_space_size_S0(self): + self.assertEqualC(self.sizes[0]) + + def test_state_space_size_S1(self): + self.assertEqualC(self.sizes[1]) + + def test_state_space_size_all(self): + self.assertEqualC(self.sizes) + + def test_number_of_spaces(self): + """ Check the list of state spaces has the right length. It should be N+1 long (S_0, ..., S_N) """ + self.assertEqualC(len(self.states)) + + def test_state_space_0(self): + """ Check the first element, the state space S0. + + Hints: + * It should be a list containning a single GameState object (the starting state) """ + self.assertEqualC(self.states[0]) + + def test_state_space_1(self): + """ Check the second element, the state space S1. + + Hints: + * It should be a list containing the GameState objects you can go to in one step. 
+ * You should be able to figure out what they are from the description of the game rules. Note pacman will not move if he walks into the walls. """ + self.assertEqualC(self.states[1]) + + def test_state_spaces(self): + """ Test all state spaces S_0, ..., S_N + + Hints: + * If this method breaks, find the first state space which is wrongly computed, and work out which states are missing or should not be there + * I anticipate the won/lost game configurations may become a source of problems. Note you don't have to specify these manually; they should follow by using the s.f(action)-function. """ + + self.assertEqualC(tuple(self.states)) + + +class Pacman6a(UTestCase): + """ Problem 6a: No ghost optimal path (get_shortest_path) in map 'east' using N=20 """ + map = 'east' + N = 20 + + def get_shortest_path(self): + from irlc.project1.pacman import shortest_path + layout = get_map(self.map) + actions, states = shortest_path(layout, self.N) + return actions, states + + def test_sequence_lengths(self): + """ Test the length of the state/action lists. 
""" + actions, states = self.get_shortest_path() + print("self.map", self.map, 'actions', actions) + self.assertEqualC(len(actions)) + self.assertEqualC(len(states)) + + def test_trajectory(self): + """ Test the state/action trajectory """ + actions, states = self.get_shortest_path() + self.assertTrue(states[-1].is_won()) + + x0 = states[0] + for k, u in enumerate(actions): + x0 = x0.f(u) + self.assertTrue(x0 == states[k + 1]) + self.assertEqualC(states[1]) + # self.assertEqualC(J) + +class Pacman6b(Pacman6a): + """ Problem 6b: No ghost optimal path (get_shortest_path) in map 'SS0tiny' using N=20 """ + map = 'SS0tiny' + +class Pacman6c(Pacman6a): + """ Problem 6c: No ghost optimal path (get_shortest_path) in map 'datadiscs' using N=20 """ + map = 'datadiscs' + +## ONE GHOST +class Pacman7a(Pacman3): + """ Problem 7a: the p_next function with one droid """ + map = 'SS1tiny' + action = 'East' + +class Pacman7b(Pacman3): + """ Problem 7b: the p_next function with one droid """ + map = 'SS1tiny' + action = 'West' + +class Pacman8a(Pacman4): + """ Problem 8a: Test the state spaces as a list [S_0, ..., S_N] on the map 'SS1tiny' using N = 4 """ + map = 'SS1tiny' + N = 4 + +class Pacman8b(Pacman4): + """ Problem 8b: Test the state spaces as a list [S_0, ..., S_N] 
on the map 'SS1tiny' using N = 6 """ + map = 'SS1tiny' + N = 6 + pass + +class Pacman9(UTestCase): + """ Problem 9: Testing winrate on the map SS1tiny (win_probability) """ + map = 'SS1tiny' + + def _win_rate(self, N): + self.title = f"Testing winrate in {N} steps" + from irlc.project1.pacman import win_probability + p = np.round(win_probability(get_map(self.map), N), 4) + print("win rate in N ", N, "steps was", p) + # print("Testing win rate", self.get_expected_test_value()) + self.assertEqualC(p) + + def test_win_rate_N4(self): + self._win_rate(N=4) + + def test_win_rate_N5(self): + self._win_rate(N=5) + + def test_win_rate_N6(self): + self._win_rate(N=6) + + +# ## TWO GHOSTS +class Pacman10(Pacman3): # p_next for two ghosts + """ Problem 10: Testing the p_next function using SS2tiny """ + map = 'SS2tiny' + N = 4 + +class Pacman11(Pacman4): # State-space lists + """ Problem 11: Test the state spaces as a list [S_0, ..., S_N]. on the map 'SS2tiny' using N = 3 """ + map = 'SS2tiny' + N = 3 + +class Pacman12(Pacman9): # Optimal planning for two ghost-droids. 
+ """ Problem 12: Testing winrate on the map SS2tiny (win_probability) """ + map = 'SS2tiny' + N = 2 + +class Kiosk1(UTestCase): + """ Problem 14: Warmup check of S_0 and A_0(x_0) """ + def test_warmup_states_length(self): + from irlc.project1.kiosk import warmup_states, warmup_actions + n = len(warmup_states()) + self.title = f"Checking length of state space is {n}" + self.assertEqualC(n) + + def test_warmup_actions_length(self): + from irlc.project1.kiosk import warmup_states, warmup_actions + n = len(warmup_actions()) + self.title = f"Checking length of action space is {n}" + self.assertEqualC(n) + + + def test_warmup_states(self): + self.title = "Checking state space" + from irlc.project1.kiosk import warmup_states, warmup_actions + self.assertEqualC(set(warmup_states())) + + def test_warmup_actions(self): + self.title = "Checking action space" + from irlc.project1.kiosk import warmup_states, warmup_actions + self.assertEqualC(set(warmup_actions())) + + +class Kiosk2(UTestCase): + """ Problem 16: solve_kiosk_1 """ + + @classmethod + def setUpClass(cls) -> None: + from irlc.project1.kiosk import solve_kiosk_1 + cls.J, cls.pi = solve_kiosk_1() + + def mk_title(self, k, x): + self.k = k + self.x = x + + if self.k is not None: + if self.k != -1: + sk = f"N-{-self.k - 1}" if self.k < 0 else str(self.k) + else: + sk = "N" + jp = "J_{" + sk + "}" if len(sk) > 1 else "J_"+sk + else: + jp = "J_k" + if self.x is not None: + xp = f"(x={self.x})" + else: + xp = "(x) for all x" + return "Checking cost-to-go " + jp + xp + + def check_J(self, k, x): + J = [{k: v for k, v in J_.items()} for J_ in self.__class__.J] + t = self.mk_title(k, x) + if k is not None and x is not None: + t += f" = {J[k][x]}" + self.title = t + + if k is not None: + J_ = J[k] + if x is not None: + self.assertAlmostEqualC(J_[x], msg=f"Failed test of J[{k}][{x}]", delta=1e-4) + # self.assertL2(J_[x], msg=f"Failed test of J[{k}][{x}]", tol=1e-5) + else: + for state in sorted(J_.keys()): + 
self.assertAlmostEqualC(J_[state], msg=f"Failed test of J[{k}][{state}]", delta=1e-4) + else: + for k, J_ in enumerate(J): + for state in sorted(J_.keys()): + self.assertAlmostEqualC(J_[state], msg=f"Failed test of J[{k}][{state}]", delta=1e-4) + + def test_case_1(self): + self.check_J(k=-1, x=10) + + def test_case_2(self): + self.check_J(k=-2, x=20) + + def test_case_3(self): + self.check_J(k=-2, x=0) + + def test_case_4(self): + self.check_J(k=0, x=0) + + def test_case_5(self): + self.check_J(k=1, x=4) + + def test_case_6(self): + self.check_J(k=None, x=None) + + +class Kiosk3(Kiosk2): + """ Problem 17: solve_kiosk_2 """ + @classmethod + def setUpClass(cls) -> None: + from irlc.project1.kiosk import solve_kiosk_2 + cls.J, cls.pi = solve_kiosk_2() + + +class Project1(Report): #240 total. + title = "02465 project part 1: Dynamical Programming" + remote_url = "https://02465material.pages.compute.dtu.dk/02465public/_static/evaluation/" + import irlc + pack_imports = [irlc] + abbreviate_questions = True + + pacman_questions = [ + (Pacman1, 10), # east + (Pacman3, 10), # p_next (g=0) + (Pacman4, 10), # future_states (g=0) + (Pacman6a, 4), # shortest_path (g=0) + (Pacman6b, 3), # shortest_path (g=0) + (Pacman6c, 3), # shortest_path (g=0) + (Pacman7a, 5), # p_next (g=1) + (Pacman7b, 5), # p_next (g=1) + (Pacman8a, 5), # future_states (g=1) + (Pacman8b, 5), # future_states (g=1) + (Pacman9, 10), # optimal planning (g=1) + (Pacman10, 10), # p_next (g=2) + (Pacman11, 10), # future_states (g=2) + (Pacman12, 10), # optimal planning (g=2) + ] + + kiosk_questions = [ + (Kiosk1, 10), + (Kiosk2, 25), + (Kiosk3, 25), + ] + + questions = [] + questions += pacman_questions + questions += kiosk_questions + +if __name__ == '__main__': + from unitgrade import evaluate_report_student + evaluate_report_student(Project1()) +# 448, 409 # 303 diff --git a/irlc/project1/project1_tests_complete_grade.py b/irlc/project1/project1_tests_complete_grade.py new file mode 100644 index 
0000000000000000000000000000000000000000..aac3b1bc52eb188ee8f3c57f872797baa428605c --- /dev/null +++ b/irlc/project1/project1_tests_complete_grade.py @@ -0,0 +1,4 @@ +# irlc/project1/project1_tests_complete.py +''' WARNING: Modifying, decompiling or otherwise tampering with this script, it's data or the resulting .token file will be investigated as a cheating attempt. ''' +import bz2, base64 +exec(bz2.decompress(base64.b64decode(''))) \ No newline at end of file diff --git a/irlc/project1/unitgrade_data/Kiosk1.pkl b/irlc/project1/unitgrade_data/Kiosk1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..9b5467cdfccda39aa44e3d37c721875ed038a5ee Binary files /dev/null and b/irlc/project1/unitgrade_data/Kiosk1.pkl differ diff --git a/irlc/project1/unitgrade_data/Kiosk2.pkl b/irlc/project1/unitgrade_data/Kiosk2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..066206a99d6cfaa3faf6427d526b8d872f6f8b4c Binary files /dev/null and b/irlc/project1/unitgrade_data/Kiosk2.pkl differ diff --git a/irlc/project1/unitgrade_data/Kiosk3.pkl b/irlc/project1/unitgrade_data/Kiosk3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..066206a99d6cfaa3faf6427d526b8d872f6f8b4c Binary files /dev/null and b/irlc/project1/unitgrade_data/Kiosk3.pkl differ diff --git a/irlc/project1/unitgrade_data/Pacman1.pkl b/irlc/project1/unitgrade_data/Pacman1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..f37fc807282017b9e603748155519d4e98aec43b Binary files /dev/null and b/irlc/project1/unitgrade_data/Pacman1.pkl differ diff --git a/irlc/project1/unitgrade_data/Pacman10.pkl b/irlc/project1/unitgrade_data/Pacman10.pkl new file mode 100644 index 0000000000000000000000000000000000000000..2d64b4d89e99ddca0c6962c11fd8ba8aa491825a Binary files /dev/null and b/irlc/project1/unitgrade_data/Pacman10.pkl differ diff --git a/irlc/project1/unitgrade_data/Pacman11.pkl b/irlc/project1/unitgrade_data/Pacman11.pkl new file mode 
100644 index 0000000000000000000000000000000000000000..78b2e18bbd93181f3b8ce15001c4913c34de1d6d Binary files /dev/null and b/irlc/project1/unitgrade_data/Pacman11.pkl differ diff --git a/irlc/project1/unitgrade_data/Pacman12.pkl b/irlc/project1/unitgrade_data/Pacman12.pkl new file mode 100644 index 0000000000000000000000000000000000000000..5a930f541bb5dcc56c4ffc4cd181f270c7f6cd6d Binary files /dev/null and b/irlc/project1/unitgrade_data/Pacman12.pkl differ diff --git a/irlc/project1/unitgrade_data/Pacman3.pkl b/irlc/project1/unitgrade_data/Pacman3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..2d64b4d89e99ddca0c6962c11fd8ba8aa491825a Binary files /dev/null and b/irlc/project1/unitgrade_data/Pacman3.pkl differ diff --git a/irlc/project1/unitgrade_data/Pacman4.pkl b/irlc/project1/unitgrade_data/Pacman4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..78b2e18bbd93181f3b8ce15001c4913c34de1d6d Binary files /dev/null and b/irlc/project1/unitgrade_data/Pacman4.pkl differ diff --git a/irlc/project1/unitgrade_data/Pacman6a.pkl b/irlc/project1/unitgrade_data/Pacman6a.pkl new file mode 100644 index 0000000000000000000000000000000000000000..3e711ce40b06ce356765ef1172d9f9524d665ea8 Binary files /dev/null and b/irlc/project1/unitgrade_data/Pacman6a.pkl differ diff --git a/irlc/project1/unitgrade_data/Pacman6b.pkl b/irlc/project1/unitgrade_data/Pacman6b.pkl new file mode 100644 index 0000000000000000000000000000000000000000..3e711ce40b06ce356765ef1172d9f9524d665ea8 Binary files /dev/null and b/irlc/project1/unitgrade_data/Pacman6b.pkl differ diff --git a/irlc/project1/unitgrade_data/Pacman6c.pkl b/irlc/project1/unitgrade_data/Pacman6c.pkl new file mode 100644 index 0000000000000000000000000000000000000000..3e711ce40b06ce356765ef1172d9f9524d665ea8 Binary files /dev/null and b/irlc/project1/unitgrade_data/Pacman6c.pkl differ diff --git a/irlc/project1/unitgrade_data/Pacman7a.pkl b/irlc/project1/unitgrade_data/Pacman7a.pkl new file 
mode 100644 index 0000000000000000000000000000000000000000..2d64b4d89e99ddca0c6962c11fd8ba8aa491825a Binary files /dev/null and b/irlc/project1/unitgrade_data/Pacman7a.pkl differ diff --git a/irlc/project1/unitgrade_data/Pacman7b.pkl b/irlc/project1/unitgrade_data/Pacman7b.pkl new file mode 100644 index 0000000000000000000000000000000000000000..2d64b4d89e99ddca0c6962c11fd8ba8aa491825a Binary files /dev/null and b/irlc/project1/unitgrade_data/Pacman7b.pkl differ diff --git a/irlc/project1/unitgrade_data/Pacman8a.pkl b/irlc/project1/unitgrade_data/Pacman8a.pkl new file mode 100644 index 0000000000000000000000000000000000000000..78b2e18bbd93181f3b8ce15001c4913c34de1d6d Binary files /dev/null and b/irlc/project1/unitgrade_data/Pacman8a.pkl differ diff --git a/irlc/project1/unitgrade_data/Pacman8b.pkl b/irlc/project1/unitgrade_data/Pacman8b.pkl new file mode 100644 index 0000000000000000000000000000000000000000..78b2e18bbd93181f3b8ce15001c4913c34de1d6d Binary files /dev/null and b/irlc/project1/unitgrade_data/Pacman8b.pkl differ diff --git a/irlc/project1/unitgrade_data/Pacman9.pkl b/irlc/project1/unitgrade_data/Pacman9.pkl new file mode 100644 index 0000000000000000000000000000000000000000..5a930f541bb5dcc56c4ffc4cd181f270c7f6cd6d Binary files /dev/null and b/irlc/project1/unitgrade_data/Pacman9.pkl differ diff --git a/irlc/project2/Latex/02465project2_handin.tex b/irlc/project2/Latex/02465project2_handin.tex new file mode 100644 index 0000000000000000000000000000000000000000..045ca4d444de31ffaabdd612bd38a7d7e207cab4 --- /dev/null +++ b/irlc/project2/Latex/02465project2_handin.tex @@ -0,0 +1,146 @@ +\documentclass[12pt,twoside]{article} +%\usepackage[table]{xcolor} % important to avoid options clash. 
+%\input{02465shared_preamble} +%\usepackage{cleveref} +\usepackage{url} +\usepackage{graphics} +\usepackage{multicol} +\usepackage{rotate} +\usepackage{rotating} +\usepackage{booktabs} +\usepackage{hyperref} +\usepackage{pifont} +\usepackage{latexsym} +\usepackage[english]{babel} +\usepackage{epstopdf} +\usepackage{etoolbox} +\usepackage{amsmath} +\usepackage{amssymb} +\usepackage{multirow,epstopdf} +\usepackage{fancyhdr} +\usepackage{booktabs} +\usepackage{xcolor} +\newcommand\redt[1]{ {\textcolor[rgb]{0.60, 0.00, 0.00}{\textbf{ #1} } } } + + +\newcommand{\m}[1]{\boldsymbol{ #1}} +\newcommand{\yoursolution}{ \redt{(your solution here) } } + + + +\title{ Report 2 hand-in } +\date{ \today } +\author{Alice (\texttt{s000001})\and Bob (\texttt{s000002})\and Clara (\texttt{s000003}) } + +\begin{document} +\maketitle + +\begin{table}[ht!] +\caption{Attribution table. Feel free to add/remove rows and columns} +\begin{tabular}{llll} +\toprule + & Alice & Bob & Clara \\ +\midrule + 1: Formulate Yodas pendulum as a linear problem & 0-100\% & 0-100\% & 0-100\% \\ + 2: State at a later time & 0-100\% & 0-100\% & 0-100\% \\ + 3: State at a later time II & 0-100\% & 0-100\% & 0-100\% \\ + 4: Eigenvalues and powers & 0-100\% & 0-100\% & 0-100\% \\ + 5: Analytical expression of Eigenvalues using Euler discretization & 0-100\% & 0-100\% & 0-100\% \\ + 6: Bound using Euler discretization & 0-100\% & 0-100\% & 0-100\% \\ + 7: Matrix norm of Exponential discretization (harder) & 0-100\% & 0-100\% & 0-100\% \\ + 8: Stability & 0-100\% & 0-100\% & 0-100\% \\ + 9: Discretization & 0-100\% & 0-100\% & 0-100\% \\ + 10: Linearization & 0-100\% & 0-100\% & 0-100\% \\ + 11: Unitgrade self-check & 0-100\% & 0-100\% & 0-100\% \\ + 12: Optimal planning & 0-100\% & 0-100\% & 0-100\% \\ + 13: Control using simple linearization & 0-100\% & 0-100\% & 0-100\% \\ + 14: MPC & 0-100\% & 0-100\% & 0-100\% \\ +\bottomrule +\end{tabular} +\end{table} + +%\paragraph{Statement about collaboration:} +%Please 
edit this section to reflect how you have used external resources. The following statement will in most cases suffice: +%\emph{The code in the irls/project1 directory is entirely} + +%\paragraph{Main report:} +Headings have been inserted in the document for readability. You only have to edit the part which says \yoursolution. + +\section{Master Yodas pendulum (\texttt{yoda.py})}\label{yoda1} +\subsubsection*{{\color{red}Problem 1: Formulate Yodas pendulum as a linear problem}} + + \begin{align} + A & = \begin{bmatrix} \cdots \end{bmatrix} \\ + B & = \begin{bmatrix} \cdots \end{bmatrix} + \end{align} + \yoursolution + +\subsubsection*{{\color{red}Problem 2: State at a later time}} + + To solve the first part, we can write $\m x_N = \begin{bmatrix} \cdots \end{bmatrix}$ + + As for the second part we get: +\begin{align} +\tilde A_0 & = \begin{bmatrix} \cdots \end{bmatrix}, \quad A_0 = \begin{bmatrix} \cdots \end{bmatrix} +\end{align} + \yoursolution +\subsubsection*{{\color{red}Problem 4: Eigenvalues and powers}} + +Assume $\lambda_1, \lambda_2$ are the eigenvalues ... then the Eigenvalues of $M$ is ... similarly for $\tilde M$ ... +\yoursolution + +\subsubsection*{{\color{red}Problem 5: Analytical expression of Eigenvalues using Euler discretization}} + +... we get a characteristic polynomial of ... and therefore it follows from Mat1 that the two Eigenvalues are ... 
+\yoursolution + +\subsubsection*{{\color{red}Problem 6: Bound using Euler discretization}} + + Using Euler discretization we get the upper bound: + $$ +\| \m x_N \| \leq \cdots +$$ +\yoursolution + +\subsubsection*{{\color{red}Problem 7: Matrix norm of Exponential discretization (harder)}} + +Using exponential discretization we get an upper bound of: + $$ + \| \m x_N \| \leq \cdots + $$ + \yoursolution + +\section{R2D2 and control (\texttt{r2d2.py})} +\subsubsection*{{\color{red}Problem 9: Discretization}} + + $$ + \m x_{k+1} = \m f_k(\m x_k, \m u_k) = \begin{bmatrix} \cdots \\ \cdots \\ \cdots \end{bmatrix}$$ + +\subsubsection*{{\color{red}Problem 10: Linearization}} + +$$ + \m x_{k+1} \approx \begin{bmatrix} \cdots \\ \cdots \\ \cdots \end{bmatrix} \m x_k + + \begin{bmatrix} \cdots \\ \cdots \\ \cdots \end{bmatrix} \m u_k + + \begin{bmatrix} \vdots \end{bmatrix} +$$ + +\subsubsection*{{\color{red}Problem 12: Optimal planning}} + + \begin{center}\includegraphics[width=.5\linewidth]{figures/your_answer}~ + \includegraphics[width=.5\linewidth]{figures/your_answer} \end{center} + +\subsubsection*{{\color{red}Problem 13: Control using simple linearization}} + + % Just generate the figures using the script and change the path below. + \begin{center}\includegraphics[width=.5\linewidth]{figures/your_answer}~ + \includegraphics[width=.5\linewidth]{figures/your_answer} \end{center} +Intuitively, the second case fails because... \yoursolution + +\subsubsection*{{\color{red}Problem 14: MPC}} + + \begin{center}\includegraphics[width=.6\linewidth]{figures/your_answer}%~ + % \includegraphics[width=.5\linewidth]{figures/your_answer} + \end{center} + Iterative linearization solves the problem because... 
\yoursolution + +\end{document} \ No newline at end of file diff --git a/irlc/project2/Latex/figures/your_answer.pdf b/irlc/project2/Latex/figures/your_answer.pdf new file mode 100644 index 0000000000000000000000000000000000000000..d8c092974e20aaaf1165958a53bdce3a2ebdbf8f Binary files /dev/null and b/irlc/project2/Latex/figures/your_answer.pdf differ diff --git a/irlc/project2/__init__.py b/irlc/project2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8794db4fc72b62ae50ebe61fd5ce31a77a77992e --- /dev/null +++ b/irlc/project2/__init__.py @@ -0,0 +1,2 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +"""This file is required for the test system but should otherwise be empty.""" diff --git a/irlc/project2/project2_grade.py b/irlc/project2/project2_grade.py new file mode 100644 index 0000000000000000000000000000000000000000..4dfffbd5d90249e2df10dbb2faf48011c61185a1 --- /dev/null +++ b/irlc/project2/project2_grade.py @@ -0,0 +1,4 @@ +# irlc/project2/project2_tests.py +''' WARNING: Modifying, decompiling or otherwise tampering with this script, it's data or the resulting .token file will be investigated as a cheating attempt. ''' +import bz2, base64 +exec(bz2.decompress(base64.b64decode(''))) \ No newline at end of file diff --git a/irlc/project2/project2_tests.py b/irlc/project2/project2_tests.py new file mode 100644 index 0000000000000000000000000000000000000000..8b43727d460d025580c4a71c45e6da655252f98a --- /dev/null +++ b/irlc/project2/project2_tests.py @@ -0,0 +1,184 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
+from unitgrade import UTestCase, Report +import irlc +import numpy as np + +class YodaProblem1(UTestCase): + """ Test the get_A_B() function (Section 1, Problem 1) """ + def test_A_B(self): + from irlc.project2.yoda import get_A_B + for g in [9.82, 5.1]: + for L in [0.2, 0.5, 1.1]: + A,B = get_A_B(g,L) + # To get the expected output of a test (in the cases where it is not specified manually), + # simply use the line self.get_expected_test_value() *right before* running the test-function itself. + # print("The expected value is", self.get_expected_test_value()) + # If the code does not work, you need to upgrade unitgrade to the most recent version: + # pip install unitgrade --upgrade --no-cache + print(A) + self.assertLinf(A) + print(B) + self.assertLinf(B) + +class YodaProblem2(UTestCase): + r""" Yodas pendulum: Problem 2 """ + def test_A0(self): + from irlc.project2.yoda import A_ei, A_euler + for g in [9.2, 10]: + for L in [0.2, 0.4]: + for Delta in [0.1, 0.2]: + self.assertLinf(A_euler(g, L, Delta)) # Test Euler discretization + self.assertLinf(A_ei(g, L, Delta)) # Test exponential discretization + + +class YodaProblem3(UTestCase): + r""" Yodas pendulum: Problem 3 """ + def test_M(self): + from irlc.project2.yoda import M_ei, M_euler + for g in [9.2, 10]: + for L in [0.2, 0.4]: + for Delta in [0.1, 0.2]: + for N in [3, 5]: + self.assertLinf(M_ei(g, L, Delta, N)) # Test exponential discretization + self.assertLinf(M_euler(g, L, Delta, N)) # Test Euler discretization + + +class YodaProblem6(UTestCase): + r""" Yodas pendulum: Bound using Euler discretization Problem 6 """ + def test_xN_euler_bound(self): + from irlc.project2.yoda import xN_bound_euler + for g in [9.2, 10]: + for L in [0.2, 0.4]: + for Delta in [0.1, 0.2]: + for N in [3, 5]: + self.assertLinf(xN_bound_euler(g, L, Delta, N)) + +class YodaProblem7(UTestCase): + r"""Yodas pendulum: Bound using exponential discretization Problem 7 """ + def test_xN_euler_bound(self): + from irlc.project2.yoda import
xN_bound_ei + for g in [9.2, 10]: + for L in [0.2, 0.4]: + for Delta in [0.1, 0.2]: + for N in [3, 5]: + self.assertLinf(xN_bound_ei(g, L, Delta, N)) + + +class R2D2Problem15(UTestCase): + r"""R2D2: Tests the linearization and discretization code in Problem 9 and Problem 10""" + def test_f_euler_zeros(self): + # Test in a simple case: + x = np.zeros((3,)) + u = np.asarray([1,0]) + from irlc.project2.r2d2 import f_euler + self.assertLinf(f_euler(x, u, Delta=0.05)) + self.assertLinf(f_euler(x, u, Delta=0.1)) + + def test_f_euler(self): + np.random.seed(42) + for _ in range(4): + x = np.random.randn(3) + u = np.random.randn(2) + from irlc.project2.r2d2 import f_euler + self.assertLinf(f_euler(x, u, Delta=0.05)) + self.assertLinf(f_euler(x, u, Delta=0.1)) + + def checklin(self, x_bar, u_bar): + from irlc.project2.r2d2 import linearize + A, B, d = linearize(x_bar, u_bar, Delta=0.05) + self.assertLinf(A) + self.assertLinf(B) + self.assertLinf(d) + + def test_linearization1(self): + x_bar = np.asarray([0, 0, 0]) + u_bar = np.asarray([1, 0]) + self.checklin(x_bar, u_bar) + + def test_linearization2(self): + x_bar = np.asarray([0, 0, 0.24]) + u_bar = np.asarray([1, 0]) + self.checklin(x_bar, u_bar) + + def test_linearization3(self): + np.random.seed(42) + for _ in range(10): + x_bar = np.random.randn(3) + u_bar = np.asarray([1, 0]) + self.checklin(x_bar, u_bar) + +class R2D2Direct(UTestCase): + r"""Problem 12: R2D2 and direct methods """ + def chk_direct(self, x_target): + from irlc.project2.r2d2 import drive_to_direct + states = drive_to_direct(x_target=x_target, plot=False) + self.assertIsInstance(states, np.ndarray) # Test states are an ndarray + self.assertEqualC(states.shape) # Test states have the right shape + self.assertL2(states, tol=0.03) + + def test_direct_1(self): + x_target = (2, 0, 0) + self.chk_direct(x_target) + + def test_direct_2(self): + x_target = (2, 2, np.pi / 2) + self.chk_direct(x_target) + + +class R2D2Linearization(UTestCase): + """Problem 13: 
R2D2 and simple linearization.""" + def chk_linearization(self, x_target): + from irlc.project2.r2d2 import drive_to_linearization + states = drive_to_linearization(x_target=x_target, plot=False) + self.assertIsInstance(states, np.ndarray) # Test states are an ndarray + self.assertEqualC(states.shape) # Test states have the right shape + self.assertL2(states, tol=0.03) + + def test_linearization_1(self): + x_target = (2, 0, 0) + self.chk_linearization(x_target) + + def test_linearization_2(self): + x_target = (2, 2, np.pi / 2) + self.chk_linearization(x_target) + +class R2D2_MPC(UTestCase): + r"""Problem 14: R2D2 and MPC.""" + def chk_mpc(self, x_target): + from irlc.project2.r2d2 import drive_to_mpc + states = drive_to_mpc(x_target=x_target, plot=False) + self.assertIsInstance(states, np.ndarray) # Test states are an ndarray + self.assertEqualC(states.shape) # Test states have the right shape + self.assertL2(states, tol=0.03) + + def test_mpc_1(self): + self.chk_mpc(x_target=(2,0,0) ) + + def test_mpc_2(self): + self.chk_mpc(x_target=(2, 2, np.pi / 2)) + +class Project2(Report): + title = "Project part 2: Control" + pack_imports = [irlc] + + yoda = [ + (YodaProblem1, 10), + (YodaProblem2, 10), + (YodaProblem3, 10), + (YodaProblem6, 8), + (YodaProblem7, 2) + ] + r2d2 = [ + (R2D2Problem15, 10), + (R2D2Direct, 10), + (R2D2Linearization, 10), + (R2D2_MPC, 10), + ] + + questions = [] + questions += yoda + questions += r2d2 + +if __name__ == '__main__': + from unitgrade import evaluate_report_student + evaluate_report_student(Project2() ) diff --git a/irlc/project2/project2_tests_complete_grade.py b/irlc/project2/project2_tests_complete_grade.py new file mode 100644 index 0000000000000000000000000000000000000000..70f9d0b9955521d888c23da7afff663de1a2ae20 --- /dev/null +++ b/irlc/project2/project2_tests_complete_grade.py @@ -0,0 +1,4 @@ +# irlc/project2/project2_tests_complete.py +''' WARNING: Modifying, decompiling or otherwise tampering with this script, it's data or 
the resulting .token file will be investigated as a cheating attempt. ''' +import bz2, base64 +exec(bz2.decompress(base64.b64decode(''))) \ No newline at end of file diff --git a/irlc/project2/r2d2.py b/irlc/project2/r2d2.py new file mode 100644 index 0000000000000000000000000000000000000000..624158bee199c654174d87db792fbd667de38d4c --- /dev/null +++ b/irlc/project2/r2d2.py @@ -0,0 +1,210 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +import time +import numpy as np +import sympy as sym +import matplotlib.pyplot as plt +from gymnasium.spaces import Box +# matplotlib.use('Qt5Agg') This line may be useful if you are having matplotlib problems on Linux. +from irlc.ex04.discrete_control_model import DiscreteControlModel +from irlc.ex04.control_environment import ControlEnvironment +from irlc.ex03.control_model import ControlModel +from irlc.ex03.control_cost import SymbolicQRCost +from irlc.ex05.direct_agent import DirectAgent +from irlc.ex05.direct import get_opts, guess +from irlc.ex07.linearization_agent import LinearizationAgent +from irlc.project2.utils import R2D2Viewer +from irlc import Agent, train, plot_trajectory, savepdf + +dt = 0.05 # Time discretization Delta +Tmax = 5 # Total simulation time (in all instances). This means that N = Tmax/dt = 100. +x22 = (2, 2, np.pi / 2) # Where we want to drive to: x_target + +class R2D2Model(ControlModel): # This may help you get started. + state_labels = ["$x$", "$y$", r"$\gamma$"] + action_labels = ["Cart velocity $v$", r'Yaw rate $\omega$'] # Define constants as needed here (look at other environments); Note there is an easy way to add labels! + + def __init__(self, x_target=(2,2,np.pi/2), Q0=1.): # This constructor is one possible choice. + # Q0: The Q-matrix for the cF-term in the cost function (see problem description) + # x_target: The state we will drive towards. 
+ self.x_target = np.asarray(x_target) + self.Q0 = Q0 + self.Tmax = 5 # Plan for a maximum of 5 seconds. + # Set up a variable for rendering (optional) and call superclass. + self.viewer = None + super().__init__() + + def get_cost(self) -> SymbolicQRCost: + # The cost function uses self.Q0 to define the appropriate cost. It has the same meaning as the lecture description + cost = SymbolicQRCost(Q=np.zeros(3), R=np.eye(2)) + cost += cost.goal_seeking_cost(x_target=self.x_target)*self.Q0 + return cost + + def tF_bound(self) -> Box: + return Box(self.Tmax, self.Tmax, shape=(1,)) + + def x0_bound(self) -> Box: + return Box(0, 0, shape=(self.state_size,)) + + def xF_bound(self) -> Box: + # TODO: 1 lines missing. + raise NotImplementedError("Complete this function to specify the target of R2D2.") + + # TODO: 3 lines missing. + raise NotImplementedError("Complete model dynamics here.") + + """ These are two helper functions. They add rendering functionality so you can eventually use the environment as + + > env = R2D2Environment(render_mode='human') + + and see a small animation. + """ + def close(self): + if self.viewer is not None: + self.viewer.close() + + def render(self, x, render_mode="human"): + if self.viewer is None: + self.viewer = R2D2Viewer(x_target=self.x_target) # Target is the red cross. + self.viewer.update(x) + time.sleep(0.05) + return self.viewer.blit(render_mode=render_mode) + + +class R2D2Environment(ControlEnvironment): + def __init__(self, Tmax=Tmax, Q0=0., x_target=x22, dt=None, render_mode=None): + assert dt is not None, "Remember to specify the discretization time!" + model = R2D2Model(Q0=Q0, x_target=x_target) # Create an R2D2 ControlModel with the given parameters. + dmodel = DiscreteControlModel(model, dt=dt) # Create a discrete version of the R2D2 ControlModel + super().__init__(dmodel, Tmax=Tmax, render_mode=render_mode) + +# TODO: 9 lines missing. 
+raise NotImplementedError("Your code here.") + +def f_euler(x : np.ndarray, u : np.ndarray, Delta=0.05) -> np.ndarray: + """ Solve Problem 9. The function should compute + > x_next = f_k(x, u) + """ + # TODO: 1 lines missing. + raise NotImplementedError("return next state") + return x_next + +def linearize(x_bar, u_bar, Delta=0.05): + """ Linearize R2D2's dynamics around the two vectors x_bar, u_bar + and return A, B, d so that + + x_{k+1} = A x_k + B u_k + d (approximately). + + The function should return linearization matrices A, B and d. + """ + # Create A, B, d as numpy ndarrays. + # TODO: 4 lines missing. + raise NotImplementedError("Insert your solution and remove this error.") + return A, B, d + +def drive_to_linearization(x_target, plot=True): + """ + Plan in a R2D2 model with specific value of x_target (in the cost function). We use Q0=1.0. + + this function will linearize the dynamics around xbar=0, ubar=0 to get a linear approximation of the model, + and then use that to plan on a horizon of N=50 steps to get a control law (L_0, l_0). This is then applied + to generate actions. + + Plot is an optional parameter to control plotting. the plot_trajectory(trajectory, env) method may be useful. + + The function should return the states visited as a (samples x state-dimensions) matrix, i.e. same format + as the default output of trajectories when you use train(...). + + Hints: + * The control method is identical to one we have seen in the exercises/notes. You can re-purpose the code from that week. + * Remember to set Q0=1 + """ + # TODO: 7 lines missing. + raise NotImplementedError("Implement function body") + return traj[0].state + +def drive_to_direct(x_target, plot=False): + """ + Optimal planning in the R2D2 model with specific value of x_target using the direct method. + Remember that for this problem we set Q0=0, and implement x_target as an end-point constraint (see examples from exercises). 
+ + Plot is an optional parameter to control plotting, and to (optionally) visualize the environment using code such as:: + + env = R2D2Environment(..., render_mode='human' if plot else None) + + For making the actual plot, the plot_trajectory(trajectory, env) method may be useful (see examples from exercises to see how labels can be specified) + + The function should return the states visited as a (samples x state-dimensions) matrix, i.e. same format + as the default output of trajectories when you use train(...). + + Hints: + * The control method (Direct method) is identical to what we did in the exercises, but you have to specify the options + to implement the correct grid-refinement of N=10, N=20 and N=40. + * Remember to set Q0=0. + """ + # TODO: 10 lines missing. + raise NotImplementedError("Implement function body") + return traj[0].state + +def drive_to_mpc(x_target, plot=True) -> np.ndarray: + """ + Plan in a R2D2 model with specific value of x_target (in the cost function) using iterative MPC (see problem text). + Use Q0 = 1. in the cost function (see the R2D2 model class) + + Plot is an optional parameter to control plotting. the plot_trajectory(trajectory, env) method may be useful. + + The function should return the states visited as a (samples x state-dimensions) matrix, i.e. same format + as the default output of trajectories when you use train(...). + + Hints: + * The control method is *nearly* identical to the linearization control method. Think about the differences, + and how a solution to one can be used in another. + * A bit more specific: Linearization is handled similarly to the LinearizationAgent, however, we need to update + (in each step) the xbar/ubar states/actions we are linearizing about, and then just use the immediate action computed + by the linearization agent. + * My approach was to implement a variant of the LinearizationAgent. + """ + # TODO: 6 lines missing. 
+ raise NotImplementedError("Implement function body") + return traj[0].state + +if __name__ == "__main__": + r2d2 = R2D2Model() + print(r2d2) # This will print out details of your R2D2 model. + + # Check Problem 9 (f_euler) and Problem 10 (linearize) + x = np.asarray( [0, 0, 0] ) + u = np.asarray( [1,0]) + print("x_k =", x, "u_k =", u, "x_{k+1} =", f_euler(x, u, dt)) + + A,B,d = linearize(x_bar=x, u_bar=u, Delta=dt) + print("x_{k+1} ~ A x_k + B u_k + d") + print("A:", A) + print("B:", B) + print("d:", d) + + # Test the direct method (Problem 12) + states = drive_to_direct(x22, plot=True) + savepdf('r2d2_direct') + plt.show() + # Build plot assuming that states is in the format (samples x coordinates-of-state). + plt.plot(states[:,0], states[:,1], 'k-', label="R2D2's (x, y) trajectory") + plt.legend() + plt.xlabel("x") + plt.ylabel("y") + savepdf('r2d2_direct_B') + plt.show() + + # Test the simple linearization method (Problem 13) + drive_to_linearization((2,0,0), plot=True) + savepdf('r2d2_linearization_1') + plt.show() + + drive_to_linearization(x22, plot=True) + savepdf('r2d2_linearization_2') + plt.show() + + # Test iterative MPC (Problem 14) + state = drive_to_mpc(x22, plot=True) + print(state[-1]) + savepdf('r2d2_iterative_1') + plt.show() diff --git a/irlc/project2/unitgrade_data/R2D2Direct.pkl b/irlc/project2/unitgrade_data/R2D2Direct.pkl new file mode 100644 index 0000000000000000000000000000000000000000..eb3973b93e24e5cfcc9b5ca7c2289a3a1d3a71e0 Binary files /dev/null and b/irlc/project2/unitgrade_data/R2D2Direct.pkl differ diff --git a/irlc/project2/unitgrade_data/R2D2Linearization.pkl b/irlc/project2/unitgrade_data/R2D2Linearization.pkl new file mode 100644 index 0000000000000000000000000000000000000000..1977f1e9fe2e3c37bbcd85d178b77df83561e8ed Binary files /dev/null and b/irlc/project2/unitgrade_data/R2D2Linearization.pkl differ diff --git a/irlc/project2/unitgrade_data/R2D2Problem15.pkl b/irlc/project2/unitgrade_data/R2D2Problem15.pkl new file mode 100644 index
0000000000000000000000000000000000000000..ba6d982fc7ad02f136ba25a1dfa8984db6233555 Binary files /dev/null and b/irlc/project2/unitgrade_data/R2D2Problem15.pkl differ diff --git a/irlc/project2/unitgrade_data/R2D2_MPC.pkl b/irlc/project2/unitgrade_data/R2D2_MPC.pkl new file mode 100644 index 0000000000000000000000000000000000000000..b3670d7e508ed0fda0dd3ecd811d09893c3234e8 Binary files /dev/null and b/irlc/project2/unitgrade_data/R2D2_MPC.pkl differ diff --git a/irlc/project2/unitgrade_data/YodaProblem1.pkl b/irlc/project2/unitgrade_data/YodaProblem1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e8d95ca14e3032826dca52c996e197de4ff290d5 Binary files /dev/null and b/irlc/project2/unitgrade_data/YodaProblem1.pkl differ diff --git a/irlc/project2/unitgrade_data/YodaProblem2.pkl b/irlc/project2/unitgrade_data/YodaProblem2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..472b8f3f9d46e309fb44636e16cab8ed585b5894 Binary files /dev/null and b/irlc/project2/unitgrade_data/YodaProblem2.pkl differ diff --git a/irlc/project2/unitgrade_data/YodaProblem3.pkl b/irlc/project2/unitgrade_data/YodaProblem3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..7cfd67e961d93d87a739c9b7d69d5c97da114a0b Binary files /dev/null and b/irlc/project2/unitgrade_data/YodaProblem3.pkl differ diff --git a/irlc/project2/unitgrade_data/YodaProblem6.pkl b/irlc/project2/unitgrade_data/YodaProblem6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..9978ddfe8f3a4fe1b09c0b73a0025163862b742d Binary files /dev/null and b/irlc/project2/unitgrade_data/YodaProblem6.pkl differ diff --git a/irlc/project2/unitgrade_data/YodaProblem7.pkl b/irlc/project2/unitgrade_data/YodaProblem7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e7d916b7cc20fd28de12c5da4fc1ece4dfd4be80 Binary files /dev/null and b/irlc/project2/unitgrade_data/YodaProblem7.pkl differ diff --git a/irlc/project2/utils.py 
b/irlc/project2/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..355be7a117d3babf8fc90c064fd7f4d67501a54f --- /dev/null +++ b/irlc/project2/utils.py @@ -0,0 +1,53 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +from irlc.utils.graphics_util_pygame import UpgradedGraphicsUtil, rotate_around +import numpy as np + +""" This file contains code you can either use (or not) to render the R2D2 robot. The class is already called correctly by your R2D2 class, +and you don't really have to think too carefully about what the code does unless you want R2D2 to look better. +""" + + +class R2D2Viewer(UpgradedGraphicsUtil): + def __init__(self, x_target = (0,0)): + self.x_target = x_target + width = 800 + self.scale = width / 1000 + xlim = 3 + self.dw = self.scale * 0.1 + super().__init__(screen_width=width, xmin=-xlim, xmax=xlim, ymin=xlim, ymax=-xlim, title='R2D2') + self.xlim = xlim + def render(self): + # self.
+ self.draw_background(background_color=(255, 255, 255)) + dw = self.dw + self.line("t1", (-self.xlim, 0), (self.xlim, 0), width=1, color=(0,) * 3) + self.line("t1", (0, -self.xlim), (0, self.xlim), width=1, color=(0,) * 3) + + + self.circle("r2d2", pos=(self.x[0], self.x[1]), r=24, outlineColor=(100, 100, 200), fillColor=(100, 100, 200)) + self.circle("r2d2", pos=(self.x[0], self.x[1]), r=20, outlineColor=(100, 100, 200), fillColor=(150, 150, 255)) + self.circle("r2d2", pos=(self.x[0], self.x[1]), r=2, outlineColor=(100, 100, 200), fillColor=(0,)*3) + + dx = 0.13 + dy = dx/2.5 + wheel = [(-dx, dy), (dx, dy), (dx, -dy), (-dx, -dy) ] + ddy = 0.20 + w1 = [ (x, y + ddy) for x, y in wheel] + w1 = rotate_around(w1, (0,0), angle=self.x[2] / np.pi * 180) + + w2 = [(x, y - ddy) for x, y in wheel] + w2 = rotate_around(w2, (0, 0), angle=self.x[2] / np.pi * 180) + + + self.polygon("wheel1", coords=[ (x + self.x[0], self.x[1] + y) for x, y in w1], filled=True, fillColor=(200,)*3, outlineColor=(100,)*3, closed=True) + self.polygon("wheel2", coords=[ (x + self.x[0], self.x[1] + y) for x, y in w2], filled=True, fillColor=(200,)*3, outlineColor=(100,)*3, closed=True) + + dc = 0.1 + xx = self.x_target[0] + yy = self.x_target[1] + self.line("t1", (xx-dc, yy+dc), (xx+dc, yy-dc), width=4, color=(200, 100, 100)) + self.line("t1", (xx-dc, yy-dc), (xx+dc, yy+dc), width=4, color=(200, 100, 100)) + + + def update(self, x): + self.x = x diff --git a/irlc/project2/yoda.py b/irlc/project2/yoda.py new file mode 100644 index 0000000000000000000000000000000000000000..dfb70a45d25ceb150827269a4abf0625aab29245 --- /dev/null +++ b/irlc/project2/yoda.py @@ -0,0 +1,97 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
+import numpy as np +from scipy.linalg import expm # Computes the matrix exponential e^A for a square matrix A +from numpy.linalg import matrix_power # Computes A^n for matrix A and integer n + + +def get_A_B(g : float, L: float, m=0.1): + r""" Compute the two matrices A, B (see Problem 1) here and return them. + The matrices should be numpy ndarrays. """ + # TODO: 2 lines missing. + raise NotImplementedError("Compute numpy matrices A and B here") + return A, B + + +def A_euler(g : float,L : float, Delta : float) -> np.ndarray: + r""" Compute \tilde{A}_0 (Euler discretization), see Problem 2. + + Hints: + * get_A_B can perhaps save you a line or two. + """ + # TODO: 2 lines missing. + raise NotImplementedError("Implement function body") + return A0_tilde + +def A_ei(g : float,L : float, Delta : float) -> np.ndarray: + r""" Compute A_0 (Exponential discretization), see Problem 2 + + Hints: + * The special function expm(X) computes the matrix exponential e^X. See the lecture notes for more information. + """ + # TODO: 2 lines missing. + raise NotImplementedError("Implement function body") + return A0 + +def M_euler(g : float, L : float, Delta : float, N : int) -> np.ndarray: + r""" Compute \tilde{M} (Euler discretization), see Problem 3 + Hints: + * the matrix_power(X,n) function can compute expressions such as X^n where X is a square matrix and n is a number + """ + # TODO: 1 lines missing. + raise NotImplementedError("Implement function body") + return M_tilde + +def M_ei(g : float,L : float, Delta : float, N : int) -> np.ndarray: + r""" Compute M (Exponential discretization), see Problem 3 """ + # TODO: 1 lines missing. + raise NotImplementedError("Implement function body") + return M + +def xN_bound_euler(g : float, L : float,Delta : float,N : int) -> float: + r""" Compute upper bound on |x_N| when using Euler discretization, see Problem 6. + The function should just return a number. + + Hints: + * This function uses all input arguments. 
+ """ + # TODO: 1 lines missing. + raise NotImplementedError("Implement function body") + return bound + +def xN_bound_ei(g: float,L : float,Delta : float,N : int) -> float: + r""" Compute upper bound on |x_N| when using exponential discretization, see Problem 7. + + Hints: + * This function does NOT use all input arguments. + * This will be the hardest problem to solve, but the easiest function to implement. + """ + # TODO: 1 lines missing. + raise NotImplementedError("Implement function body") + return bound + +if __name__ == '__main__': + g = 9.82 # gravitational constant + L = 5 # Length of string + m = 0.1 # Mass of pendulum (in kg) + Delta = 0.3 # Time-discretization constant Delta (in seconds) + N = 100 # Time steps + + # Solve Problem 2 + print("A0_euler") + print(A_euler(g, L, Delta)) + + print("A0_ei") + print(A_ei(g, L, Delta)) + + # Solve Problem 3 + print("M_euler") + print(M_euler(g, L, Delta, N)) + + print("M_ei") + print(M_ei(g, L, Delta, N)) + + # Solve Problem 6, upper bound on x_N using Euler discretization + print("|x_N| <= ", xN_bound_euler(g, L, Delta, N)) + + # Solve Problem 7, upper bound on x_N using Exponential discretization + print("|x_N| <= ", xN_bound_ei(g, L, Delta, N)) diff --git a/irlc/project3/Latex/02465project3_handin.tex b/irlc/project3/Latex/02465project3_handin.tex new file mode 100644 index 0000000000000000000000000000000000000000..b69b431236b95b067b2bb47f78dedcdc1c52d358 --- /dev/null +++ b/irlc/project3/Latex/02465project3_handin.tex @@ -0,0 +1,74 @@ +\documentclass[12pt,twoside]{article} +%\usepackage[table]{xcolor} % important to avoid options clash.
+%\input{02465shared_preamble} +%\usepackage{cleveref} +\usepackage{url} +\usepackage{graphics} +\usepackage{multicol} +\usepackage{rotate} +\usepackage{rotating} +\usepackage{booktabs} +\usepackage{hyperref} +\usepackage{pifont} +\usepackage{latexsym} +\usepackage[english]{babel} +\usepackage{epstopdf} +\usepackage{etoolbox} +\usepackage{amsmath} +\usepackage{amssymb} +\usepackage{multirow,epstopdf} +\usepackage{fancyhdr} +\usepackage{booktabs} +\usepackage{xcolor} +\newcommand\redt[1]{ {\textcolor[rgb]{0.60, 0.00, 0.00}{\textbf{ #1} } } } + + +\newcommand{\m}[1]{\boldsymbol{ #1}} +\newcommand{\yoursolution}{ \redt{(your solution here) } } + + + +\title{ Report 3 hand-in } +\date{ \today } +\author{Alice (\texttt{s000001})\and Bob (\texttt{s000002})\and Clara (\texttt{s000003}) } + +\begin{document} +\maketitle + +\begin{table}[ht!] +\caption{Attribution table. Feel free to add/remove rows and columns} +\begin{tabular}{llll} +\toprule + & Alice & Bob & Clara \\ +\midrule + 1: Optimal policy & 0-100\% & 0-100\% & 0-100\% \\ + 2: Simulating a finite approximation of the optimal action-value function & 0-100\% & 0-100\% & 0-100\% \\ + 3: Analytically computing the optimal action-value function & 0-100\% & 0-100\% & 0-100\% \\ + 4: Extend solution to all states and actions & 0-100\% & 0-100\% & 0-100\% \\ + 5: UCB-based exploration & 0-100\% & 0-100\% & 0-100\% \\ + 6: Sarlacc rules & 0-100\% & 0-100\% & 0-100\% \\ + 7: Escape the Sarlacc & 0-100\% & 0-100\% & 0-100\% \\ +\bottomrule +\end{tabular} +\end{table} + +%\paragraph{Statement about collaboration:} +%Please edit this section to reflect how you have used external resources. The following statement will in most cases suffice: +%\emph{The code in the irls/project1 directory is entirely} + +%\paragraph{Main report:} +Headings have been inserted in the document for readability. You only have to edit the part which says \yoursolution. 
+ +\section{Jar-Jar at the battle of Naboo (\texttt{jarjar.py})} +\subsubsection*{{\color{red}Problem 3: Analytically computing the optimal action-value function}} + + Using that ... we obtain + \begin{align} + Q^*(0,1) & = \cdots \\ + Q^*(1,-1) & = \cdots + \end{align} + therefore... + +\section{Finding the rebels using UCB-exploration (\texttt{rebels.py})} +\section{Individual contribution: The great sarlacc (\texttt{sarlacc.py})} +\end{document} \ No newline at end of file diff --git a/irlc/project3/Latex/figures/your_answer.pdf b/irlc/project3/Latex/figures/your_answer.pdf new file mode 100644 index 0000000000000000000000000000000000000000..d8c092974e20aaaf1165958a53bdce3a2ebdbf8f Binary files /dev/null and b/irlc/project3/Latex/figures/your_answer.pdf differ diff --git a/irlc/project3/__init__.py b/irlc/project3/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8794db4fc72b62ae50ebe61fd5ce31a77a77992e --- /dev/null +++ b/irlc/project3/__init__.py @@ -0,0 +1,2 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +"""This file is required for the test system but should otherwise be empty.""" diff --git a/irlc/project3/jarjar.py b/irlc/project3/jarjar.py new file mode 100644 index 0000000000000000000000000000000000000000..898d4b5246c1b7475ec3c85bd9da19f6b71e5f19 --- /dev/null +++ b/irlc/project3/jarjar.py @@ -0,0 +1,44 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +import matplotlib.pyplot as plt +import numpy as np + + +def pi_optimal(s : int) -> int: + """ Compute the optimal policy for Jar-Jar binks. Don't overthink this one! """ + # TODO: 1 lines missing. 
+ raise NotImplementedError("Return the optimal action in state s.") + return action + +def Q0_approximate(gamma : float, N : int) -> float: + """ Return the (estimate) of the optimal action-value function Q^*(0,1) based on + the first N rewards using a discount factor of gamma. Note the similarity to the n-step estimator. """ + # TODO: 1 lines missing. + raise NotImplementedError("Return N-term approximation of the optimal action-value function Q^*(0,1)") + return return_estimate + +def Q_exact(s : int,a : int, gamma : float) -> float: + """ + Return the exact optimal action-value function Q^*(s,a) in the Jar-Jar problem. + I recommend focusing on simple cases first, such as the two cases in the problem. + Then try to look at larger values of s (for instance, s=2), first using actions that 'point in the right direction' (a = -1) + and then actions that point in the 'wrong' direction a=1. + + There are several ways to solve the problem, but the simplest is probably to use recursions. + + *Don't* use your solution to Q0_approximate; it is an approximate (finite-horizon) approximation. + """ + # TODO: 6 lines missing. + raise NotImplementedError("return optimal action-value function Q^*(s,a) as a float.") + + +if __name__ == "__main__": + gamma = 0.8 + + ss = np.asarray(range(-10, 10)) + # Make a plot of your (exact) action-value function Q(s,-1) and Q(s,1). 
+ plt.plot(ss, [Q_exact(s, -1, gamma) for s in ss], 'k-', label='Exact, a=-1') + plt.plot(ss, [Q_exact(s, 1, gamma) for s in ss], 'r-', label='Exact, a=1') + plt.legend() + plt.grid() + plt.show() + print("All done") diff --git a/irlc/project3/project3_grade.py b/irlc/project3/project3_grade.py new file mode 100644 index 0000000000000000000000000000000000000000..46e8b69135822b4f49e77d24b1cd11d2cd4bd0d0 --- /dev/null +++ b/irlc/project3/project3_grade.py @@ -0,0 +1,4 @@ +# irlc/project3/project3_tests.py +''' WARNING: Modifying, decompiling or otherwise tampering with this script, it's data or the resulting .token file will be investigated as a cheating attempt. ''' +import bz2, base64 +exec(bz2.decompress(base64.b64decode(''))) \ No newline at end of file diff --git a/irlc/project3/project3_tests.py b/irlc/project3/project3_tests.py new file mode 100644 index 0000000000000000000000000000000000000000..a50927ecf35be0d1210892c9b0e2f6a116127f0b --- /dev/null +++ b/irlc/project3/project3_tests.py @@ -0,0 +1,142 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +from unitgrade import UTestCase, Report +import irlc + +class JarJarPiOptimal(UTestCase): + """ Problem 1: Compute optimal policy. """ + def test_pi_1(self): + from irlc.project3.jarjar import pi_optimal + self.assertLinf(pi_optimal(1), -1) + + def test_pi_all(self): + from irlc.project3.jarjar import pi_optimal + for s in range(-10, 10): + if s != 0: + self.assertLinf(pi_optimal(s)) + +class JarJarQ0Estimated(UTestCase): + """ Problem 2: Implement Q0_approximate to (approximate) the Q-function for the optimal policy. """ + def test_Q0_N1(self): + from irlc.project3.jarjar import Q0_approximate + import numpy as np + self.assertLinf(np.abs(Q0_approximate(gamma=0.8, N=1))) # TODO: Remove abs. This was added due to typo. 
+ + def test_Q0_N2(self): + from irlc.project3.jarjar import Q0_approximate + import numpy as np + self.assertLinf(np.abs(Q0_approximate(gamma=0.7, N=20))) # TODO: Remove abs. This was added due to typo. + + def test_Q0_N100(self): + from irlc.project3.jarjar import Q0_approximate + import numpy as np + self.assertLinf(np.abs(Q0_approximate(gamma=0.9, N=20))) # TODO: Remove abs. This was added due to typo. + + +class JarJarQExact(UTestCase): + """ Problem 4: Compute Q^*(s,a) exactly by extending analytical solution. """ + def test_Q_s0(self): + from irlc.project3.jarjar import Q_exact + self.assertLinf(Q_exact(0, gamma=0.8, a=1)) + self.assertLinf(Q_exact(0, gamma=0.8, a=-1)) + + def test_Q_s1(self): + from irlc.project3.jarjar import Q_exact + self.assertLinf(Q_exact(1, gamma=0.8, a=-1)) + self.assertLinf(Q_exact(1, gamma=0.95, a=-1)) + self.assertLinf(Q_exact(1, gamma=0.7, a=-1)) + + def test_Q_s_positive(self): + from irlc.project3.jarjar import Q_exact + for s in range(20): + self.assertLinf(Q_exact(s, gamma=0.75, a=-1)) + + def test_Q_all(self): + from irlc.project3.jarjar import Q_exact + for s in range(-20, 20): + self.assertLinf(Q_exact(s, gamma=0.75, a=-1)) + self.assertLinf(Q_exact(s, gamma=0.75, a=1)) + +class RebelsSimple(UTestCase): + """ Problem 5: Test the UCB-algorithm in the basic-environment with a single state """ + def test_simple_four_episodes(self): + """ Test the first four episodes in the simple grid problem. """ + from irlc.project3.rebels import get_ucb_actions, very_basic_grid + actions = get_ucb_actions(very_basic_grid, alpha=0.1, episodes=4, c=5, plot=False) + # Make sure we only have 4 actions (remember to truncate the action-sequences!) + self.assertEqual(len(actions), 4) # Check the number of actions are correct + self.assertEqual(actions[0], 0) # Check the first action is correct + self.assertEqualC(actions) # Check all actions. + + def test_simple_nine_episodes(self): + """ Test the first nine episodes in the simple grid problem. 
""" + from irlc.project3.rebels import get_ucb_actions, very_basic_grid + actions = get_ucb_actions(very_basic_grid, alpha=0.1, episodes=9, c=5, plot=False) + self.assertEqual(len(actions), 9) # Check the number of actions are correct + self.assertEqual(actions[0], 0) # Check the first action is correct + self.assertEqualC(actions) # Check all actions. + + def test_simple_environment(self): + from irlc.project3.rebels import get_ucb_actions, very_basic_grid + actions = get_ucb_actions(very_basic_grid, alpha=0.1, episodes=100, c=5, plot=False) + # Check the number of actions are correct + self.assertEqualC(len(actions)) + # Check the first action is correct + self.assertEqualC(actions[0]) + # Check all actions. + self.assertEqualC(actions) + + def test_bridge_environment(self): + from irlc.gridworld.gridworld_environments import grid_bridge_grid + from irlc.project3.rebels import get_ucb_actions, very_basic_grid + actions = get_ucb_actions(grid_bridge_grid, alpha=0.1, episodes=1000, c=2, plot=False) + self.assertEqualC(len(actions)) + # Check all actions. 
+ self.assertEqualC(actions) + +class RebelsBridge(UTestCase): + """ Problem 5: Test the UCB-algorithm in the bridge-environment """ + def test_bridge_environment_one(self): + from irlc.gridworld.gridworld_environments import grid_bridge_grid + from irlc.project3.rebels import get_ucb_actions + actions = get_ucb_actions(grid_bridge_grid, alpha=0.1, episodes=1, c=2, plot=False) + self.assertEqualC(len(actions)) + self.assertEqualC(actions) + + def test_bridge_environment_two(self): + from irlc.gridworld.gridworld_environments import grid_bridge_grid + from irlc.project3.rebels import get_ucb_actions + actions = get_ucb_actions(grid_bridge_grid, alpha=0.1, episodes=2, c=2, plot=False) + self.assertEqualC(len(actions)) + self.assertEqualC(actions) + + def test_bridge_environment_short(self): + from irlc.gridworld.gridworld_environments import grid_bridge_grid + from irlc.project3.rebels import get_ucb_actions + actions = get_ucb_actions(grid_bridge_grid, alpha=0.1, episodes=30, c=2, plot=False) + self.assertEqualC(len(actions)) + self.assertEqualC(actions) + + def test_bridge_environment_long(self): + from irlc.gridworld.gridworld_environments import grid_bridge_grid + from irlc.project3.rebels import get_ucb_actions + actions = get_ucb_actions(grid_bridge_grid, alpha=0.1, episodes=1000, c=2, plot=False) + self.assertEqualC(len(actions)) + self.assertEqualC(actions) + +class Project3(Report): + title = "Project part 3: Reinforcement Learning" + pack_imports = [irlc] + + jarjar1 = [(JarJarPiOptimal, 10), + (JarJarQ0Estimated, 10), + (JarJarQExact, 10) ] + + rebels = [(RebelsSimple, 20), + (RebelsBridge, 20) ] + questions = [] + questions += jarjar1 + questions += rebels + +if __name__ == '__main__': + from unitgrade import evaluate_report_student + evaluate_report_student(Project3()) diff --git a/irlc/project3/project3_tests_complete_grade.py b/irlc/project3/project3_tests_complete_grade.py new file mode 100644 index 
0000000000000000000000000000000000000000..17fda1138f74db71d0116934800cda3a05058c15 --- /dev/null +++ b/irlc/project3/project3_tests_complete_grade.py @@ -0,0 +1,4 @@ +# irlc/project3/project3_tests_complete.py +''' WARNING: Modifying, decompiling or otherwise tampering with this script, it's data or the resulting .token file will be investigated as a cheating attempt. ''' +import bz2, base64 +exec(bz2.decompress(base64.b64decode(''))) \ No newline at end of file diff --git a/irlc/project3/rebels.py b/irlc/project3/rebels.py new file mode 100644 index 0000000000000000000000000000000000000000..951a543d5327a1ae5202c14386498aaad6e47af2 --- /dev/null +++ b/irlc/project3/rebels.py @@ -0,0 +1,58 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +import numpy as np +from irlc.ex11.q_agent import QAgent +from irlc.gridworld.gridworld_environments import GridworldEnvironment, grid_bridge_grid +from irlc import train +from irlc.ex09.rl_agent import TabularQ + +# A simple UCB action-selection problem (basic problem) +very_basic_grid = [['#',1, '#'], + [1, 'S', 2], + ['#',1, '#']] + + +# TODO: 21 lines missing. +raise NotImplementedError("I wrote an agent that inherited from the Q-agent, and updated the self.pi and self.train-functions to do UCB-based exploration.") + +def get_ucb_actions(layout : list, alpha : float, c : float, episodes : int, plot=False) -> list: + """ Return the sequence of actions the agent tries in the environment with the given layout-string when trained over 'episodes' episodes. + To create an environment, you can use the line: + + > env = GridworldEnvironment(layout) + + See also the demo-file. + + The 'plot'-parameter is optional; you can use it to add visualization using a line such as: + + if plot: + env = GridworldEnvironment(layout, render_mode='human') + + Or you can just ignore it. 
Make sure to return the truncated action list (see the rebels_demo.py-file or project description). + In other words, the return value should be a long list of integers corresponding to actions: + actions = [0, 1, 2, ..., 1, 3, 2, 1, 0, ...] + """ + # TODO: 6 lines missing. + raise NotImplementedError("Implement function body") + return actions + +if __name__ == "__main__": + actions = get_ucb_actions(very_basic_grid, alpha=0.1, c=5, episodes=4, plot=False) + print("Number of actions taken", len(actions)) + print("List of actions taken over 4 episodes", actions) + + actions = get_ucb_actions(very_basic_grid, alpha=0.1, c=5, episodes=8, plot=False) + print("Number of actions taken", len(actions)) + print("Actions taken over 8 episodes", actions) + + actions = get_ucb_actions(very_basic_grid, alpha=0.1, c=5, episodes=9, plot=False) + print("Number of actions taken", len(actions)) + print("Actions taken over 9 episodes", actions) # In this particular case, you can also predict the 9th action. Why? + + # Simulate 100 episodes. This should solve the problem. + actions = get_ucb_actions(very_basic_grid, alpha=0.1, c=5, episodes=100, plot=False) + print("Basic: Actions taken over 100 episodes", actions) + + # Simulate 100 episodes for the bridge-environment. The UCB-based method should solve the environment without being overly sensitive to c. + # You can compare your result with the Q-learning agent in the demo, which performs horribly. + actions = get_ucb_actions(grid_bridge_grid, alpha=0.1, c=5, episodes=300, plot=False) + print("Bridge: Actions taken over 300 episodes. The agent should solve the environment:", actions) diff --git a/irlc/project3/rebels_demo.py b/irlc/project3/rebels_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..923c69fffbb2badb27fe581c4638516bc953577a --- /dev/null +++ b/irlc/project3/rebels_demo.py @@ -0,0 +1,50 @@ +# This file may not be shared/redistributed without permission. 
Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +import numpy as np +from irlc import train, Agent, interactive, savepdf +from irlc.gridworld.gridworld_environments import GridworldEnvironment, grid_bridge_grid +from irlc.project3.rebels import very_basic_grid +from irlc.ex11.q_agent import QAgent +import matplotlib +import matplotlib.pyplot as plt +matplotlib.use('qtagg') + + +if __name__ == "__main__": + np.random.seed(42) # Fix the seed for reproducibility + env = GridworldEnvironment(very_basic_grid, render_mode='human') # Create an environment + env.reset() # Reset (to set up the visualization) + savepdf("rebels_basic", env=env) # Save a snapshot of the starting state + env.close() + + # Create an interactive version. + env = GridworldEnvironment(very_basic_grid, render_mode='human') # Create an environment + agent = QAgent(env) # This agent will display the Q-values. + # agent = Agent(env) # A random agent. + # env, agent = interactive(env, agent) # Uncomment this line to play in 'env' environment. Use space to let the agent move. + stats, trajectories = train(env, agent, num_episodes=16, return_trajectory=True) + env.close() + print("Trajectory 0: States traversed", trajectories[0].state, "actions taken", trajectories[0].action) + print("Trajectory 1: States traversed", trajectories[1].state, "actions taken", trajectories[1].action) + all_actions = [t.action[:-1] for t in trajectories] # Collect the action sequence of every episode, excluding the last (dummy) action of each. + print("All actions taken in 16 episodes, excluding the terminal (dummy) action", all_actions) + # Note the last list is of length 20 -- this is because the environment will always terminate after two actions, + # and since we discard the last (dummy) action we get 20 actions. + # In general, the list of actions will be longer, as only the last action should be discarded (as in the code above).
+ + # A more minimalistic example to plot the bridge-grid environment + bridge_env = GridworldEnvironment(grid_bridge_grid, render_mode='human') + bridge_env.reset() + savepdf("rebels_bridge", env=bridge_env) + bridge_env.close() + + # The following code will simulate a Q-learning agent for 3000 (!) episodes and plot the Q-functions. + np.random.seed(42) # Fix the seed for reproducibility + env = GridworldEnvironment(grid_bridge_grid) + agent = QAgent(env, alpha=0.1, epsilon=0.2, gamma=1) + """ Uncomment the next line to play in the environment. + Use the space-bar to let the agent take an action, p to unpause, and otherwise use the keyboard arrows """ + train(env, agent, num_episodes=3000) # Train for 3000 episodes. Surely the rebels must be found by now! + bridge_env, agent = interactive(env, agent) + bridge_env.reset() + bridge_env.savepdf("rebels_bridge_Q") + bridge_env.close() diff --git a/irlc/project3/unitgrade_data/JarJarPiOptimal.pkl b/irlc/project3/unitgrade_data/JarJarPiOptimal.pkl new file mode 100644 index 0000000000000000000000000000000000000000..efc6383731dea50b9985335b11ba3c5bb47cc889 Binary files /dev/null and b/irlc/project3/unitgrade_data/JarJarPiOptimal.pkl differ diff --git a/irlc/project3/unitgrade_data/JarJarQ0Estimated.pkl b/irlc/project3/unitgrade_data/JarJarQ0Estimated.pkl new file mode 100644 index 0000000000000000000000000000000000000000..36e3c87cea22839893c43b95665bde11a166e771 Binary files /dev/null and b/irlc/project3/unitgrade_data/JarJarQ0Estimated.pkl differ diff --git a/irlc/project3/unitgrade_data/JarJarQExact.pkl b/irlc/project3/unitgrade_data/JarJarQExact.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0b7858760854f8e7e8dff5061a15a650a295130c Binary files /dev/null and b/irlc/project3/unitgrade_data/JarJarQExact.pkl differ diff --git a/irlc/project3/unitgrade_data/RebelsBridge.pkl b/irlc/project3/unitgrade_data/RebelsBridge.pkl new file mode 100644 index
0000000000000000000000000000000000000000..06affd8ab0a70532880f475bd1455f6f96f5da5b Binary files /dev/null and b/irlc/project3/unitgrade_data/RebelsBridge.pkl differ diff --git a/irlc/project3/unitgrade_data/RebelsSimple.pkl b/irlc/project3/unitgrade_data/RebelsSimple.pkl new file mode 100644 index 0000000000000000000000000000000000000000..3ca65445fc193f3aca2915c4959db9eb9afacd79 Binary files /dev/null and b/irlc/project3/unitgrade_data/RebelsSimple.pkl differ diff --git a/irlc/project3i/__init__.py b/irlc/project3i/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8794db4fc72b62ae50ebe61fd5ce31a77a77992e --- /dev/null +++ b/irlc/project3i/__init__.py @@ -0,0 +1,2 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +"""This file is required for the test system but should otherwise be empty.""" diff --git a/irlc/project3i/project3_individual_grade.py b/irlc/project3i/project3_individual_grade.py new file mode 100644 index 0000000000000000000000000000000000000000..f88cec94b4d102c20325699cc548f644e155011a --- /dev/null +++ b/irlc/project3i/project3_individual_grade.py @@ -0,0 +1,4 @@ +# irlc/project3i/project3_individual_tests.py +''' WARNING: Modifying, decompiling or otherwise tampering with this script, it's data or the resulting .token file will be investigated as a cheating attempt. ''' +import bz2, base64 +exec(bz2.decompress(base64.b64decode(''))) \ No newline at end of file diff --git a/irlc/project3i/project3_individual_tests.py b/irlc/project3i/project3_individual_tests.py new file mode 100644 index 0000000000000000000000000000000000000000..3bb6f833084908d9e4a03b8d7b90ddad150e5e2d --- /dev/null +++ b/irlc/project3i/project3_individual_tests.py @@ -0,0 +1,61 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. 
If this file contains other copyright notices disregard this text. +from unitgrade import UTestCase, Report +import irlc +import numpy as np + +class SarlaccGameRules(UTestCase): + def check_rules(self, rules): + from irlc.project3i.sarlacc import game_rules + # Test what happens at the starting square s=0 for roll 1 + self.assertEqualC(game_rules(rules, state=0, roll=1)) + # Test what happens at the starting square s=0 for other rolls + for roll in [2, 3, 4, 5, 6]: + self.assertEqualC(game_rules(rules, state=0, roll=roll)) + + # Test all states: + for s in range(max(rules.keys())): + if s not in rules: # We skip because s is not a legal state to be in. + for roll in [1, 2, 3, 4, 5, 6]: + self.assertEqualC(game_rules(rules, s, roll)) + + def test_empty_board_rules(self): + rules = {55: -1} + self.check_rules(rules) + + def test_rules(self): + from irlc.project3i.sarlacc import rules + self.check_rules(rules) + +class SarlacReturn(UTestCase): + def check_return(self, rules, gamma): + from irlc.project3i.sarlacc import sarlacc_return + v = sarlacc_return(rules, gamma) + # Check that the keys (states) that are included in v are correct. I.e., that the return is computed for the right states. 
+ states = list(sorted(v.keys())) + self.assertEqualC(states) + + for s in states: + self.assertL2(v[s], tol=1e-2) + + def test_sarlacc_return_empty_gamma1(self): + self.check_return({55: -1}, gamma=1) + + def test_sarlacc_return(self): + from irlc.project3i.sarlacc import rules + self.check_return(rules, gamma=.8) + + +class Project3Individual(Report): + title = "Project part 3: Reinforcement Learning (individual)" + pack_imports = [irlc] + + sarlacc = [(SarlaccGameRules, 20), + (SarlacReturn, 20)] + + questions = [] + questions += sarlacc + + +if __name__ == '__main__': + from unitgrade import evaluate_report_student + evaluate_report_student(Project3Individual()) diff --git a/irlc/project3i/project3_individual_tests_complete_grade.py b/irlc/project3i/project3_individual_tests_complete_grade.py new file mode 100644 index 0000000000000000000000000000000000000000..8cfcfa743853c5ae06a0d381b9a33d5c0262b9a2 --- /dev/null +++ b/irlc/project3i/project3_individual_tests_complete_grade.py @@ -0,0 +1,4 @@ +# irlc/project3i/project3_individual_tests_complete.py +''' WARNING: Modifying, decompiling or otherwise tampering with this script, it's data or the resulting .token file will be investigated as a cheating attempt. ''' +import bz2, base64 +exec(bz2.decompress(base64.b64decode(''))) \ No newline at end of file diff --git a/irlc/project3i/sarlacc.py b/irlc/project3i/sarlacc.py new file mode 100644 index 0000000000000000000000000000000000000000..55e2463837c91ddbc65a07f32a514e1c61eeddb3 --- /dev/null +++ b/irlc/project3i/sarlacc.py @@ -0,0 +1,120 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +""" + +References: + [SB18] Richard S. Sutton and Andrew G. Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online). 
+""" +from irlc import savepdf +from irlc.ex09.mdp import MDP +from irlc.ex09.value_iteration import value_iteration +import matplotlib.pyplot as plt +import numpy as np + +# These are the game rules of the Sarlacc: If you land on a state s in the dictionary, you are teleported to rules[s]. +rules = { + 2: 16, + 4: 8, + 7: 21, + 10: 3, + 12: 25, + 14: 1, + 17: 27, + 19: 5, + 22: 3, + 23: 32, + 24: 44, + 26: 44, + 28: 38, + 30: 18, + 33: 48, + 35: 11, + 36: 34, + 40: 53, + 41: 29, + 42: 9, + 45: 51, + 47: 31, + 50: 25, + 52: 38, + 55: -1, + } + +def game_rules(rules : dict, state : int, roll : int) -> int: + """ Compute the next state given the game rules in 'rules', the current state 'state', and the roll + which can be roll = 1, 2, 3, 4, 5, 6. + The output should be -1 in case the game terminates, and otherwise the function should return the next state + as an integer. Read the description of the project for examples of the rules. """ + # TODO: 4 lines missing. + raise NotImplementedError("Return the next state") + return state_next + +# TODO: 19 lines missing. +raise NotImplementedError("Put your code here.") + +def sarlacc_return(rules : dict, gamma : float) -> dict: + """ Compute the value-function using a discount of gamma and the game rules 'rules'. + The result should be reasonably accurate. + + The value you return should be a dictionary v, so that v[state] is the value function in that state. + (i.e., the standard output format of the value_iteration function). + + Hints: + * One way to solve this problem is to create an MDP class (see for instance the Gambler-problem in week 9) + and use the value_iteration function from week 9 to solve the problem. But I don't think the problem + is much harder to solve by just writing your own value-iteration method as in (SB18). + """ + # TODO: 2 lines missing.
+ raise NotImplementedError("Return the value function") + return v + + +if __name__ == "__main__": + """ + Rules for the snakes and ladders game: + The player starts in square s=0, and the game terminates when the player is in square s = 55. + When a player reaches the base of a ladder he/she climbs it, and when they reach the mouth of a snake they are moved down to its tail. + When a player overshoots the goal state they go backwards from the goal state by the number of squares they overshot it by. + + A few examples (using the rules in the 'rules' dictionary in this file): + If the player is in position s=0 (start) + > roll 2: Go to state s=16 (using the ladder) + > roll 3: Go to state s=3. + + Or if the player is in state s=54 + > Roll 1: Win the game + > Roll 2: stay in 54 + > Roll 3: Go to 53 + > Roll 4: Go to 38 + """ + # Test the game rules: + for roll in [1, 2, 3, 4, 5, 6]: + print(f"In state s=0 (start), using roll {roll}, I ended up in ", game_rules(rules, 0, roll)) + # Test the game rules again: + for roll in [1, 2, 3, 4, 5, 6]: + print(f"In state s=54, using roll {roll}, I ended up in ", game_rules(rules, 54, roll)) + + # Compute value function with the ordinary rules. + V_rules = sarlacc_return(rules, gamma=1) + # Compute value function with no rules, i.e. with an empty dictionary except for the winning state: + V_norule = sarlacc_return({55: -1}, gamma=1) + print("Time to victory when there are no snakes/ladders", V_norule[0]) + print("Time to victory when there are snakes/ladders", V_rules[0]) + + # Make a plot of the value-functions (optional).
+ width = .4 + def v2bar(V): + k, x = zip(*V.items()) + return np.asarray(k), np.asarray(x) + + plt.figure(figsize=(10,5)) + plt.grid() + k,x = v2bar(V_norule) + plt.bar(k-width/2, x, width=width, label="No rules") + + k, x = v2bar(V_rules) + plt.bar(k + width / 2, x, width=width, label="Rules") + plt.legend() + plt.xlabel("Current tile") + plt.ylabel("Moves remaining") + savepdf('sarlacc_value_function') + plt.show() diff --git a/irlc/project3i/unitgrade_data/SarlacReturn.pkl b/irlc/project3i/unitgrade_data/SarlacReturn.pkl new file mode 100644 index 0000000000000000000000000000000000000000..3ead9ae290d76fb309b7f682728dac309b6606f0 Binary files /dev/null and b/irlc/project3i/unitgrade_data/SarlacReturn.pkl differ diff --git a/irlc/project3i/unitgrade_data/SarlaccGameRules.pkl b/irlc/project3i/unitgrade_data/SarlaccGameRules.pkl new file mode 100644 index 0000000000000000000000000000000000000000..da00e5ccb061289943675d09fd862f10675071ab Binary files /dev/null and b/irlc/project3i/unitgrade_data/SarlaccGameRules.pkl differ diff --git a/irlc/tests/__init__.py b/irlc/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a56057c84d0ceac54aab1d40ba0f370c77fe10be --- /dev/null +++ b/irlc/tests/__init__.py @@ -0,0 +1 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. diff --git a/irlc/tests/tests_week01.py b/irlc/tests/tests_week01.py new file mode 100644 index 0000000000000000000000000000000000000000..812c8fa77f27109db9e9e46f821a97c43085a08f --- /dev/null +++ b/irlc/tests/tests_week01.py @@ -0,0 +1,132 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
+from unitgrade import Report +import irlc +# from irlc.ex01.frozen_lake import FrozenAgentDownRight +import gymnasium as gym +from unitgrade import UTestCase +from irlc.ex01.inventory_environment import InventoryEnvironment, simplified_train, RandomAgent +from unitgrade import Capturing2 +import numpy as np +from gymnasium.envs.toy_text.frozen_lake import RIGHT, DOWN # The down and right-actions; may be relevant. +from irlc.ex01.pacman_hardcoded import GoAroundAgent, layout +from irlc.pacman.pacman_environment import PacmanEnvironment +from irlc import Agent, train +from irlc.ex01.bobs_friend import BobFriendEnvironment, AlwaysAction_u1, AlwaysAction_u0 + + +class Problem1BobsFriend(UTestCase): + def test_a_env_basic(self): + env = BobFriendEnvironment() + s0, _ = env.reset() + self.assertEqual(s0, 20, msg="Reset must return the initial state, i.e. the amount of money we start out with") + + def test_a_env_u0(self): + env = BobFriendEnvironment() + env.reset() + s1, r, done, _, _ = env.step(0) + self.assertEqual(r, 2, msg="When taking action u0, we must get a reward of 2.") + self.assertEqual(s1, 22, msg="When taking action u0, we must end in state x1=22") + self.assertEqual(done, True, msg="After taking an action, the environment must terminate") + +class Problem2BobsPolicy(UTestCase): + def test_a_env_u1(self): + env = BobFriendEnvironment() + env.reset() + s1, r, done, _, _ = env.step(1) + print(r) + self.assertTrue(r == 12 or r == -20, msg="When taking action u1, we must get a reward of 0 or 12.") + self.assertTrue(s1 == 0 or s1 == 32, msg="When taking action u1, we must end in state x1=0 or x1 = 34") + self.assertEqual(done, True, msg="After taking an action, the environment must terminate") + + def test_b_always_action_u0(self): + env = BobFriendEnvironment() + stats, _ = train(env, AlwaysAction_u0(env), num_episodes=1000) + avg = np.mean( [stat['Accumulated Reward'] for stat in stats] ) + self.assertL2(avg, 2, msg="Average reward when we always take action 
u=0 must be 2.") + + def test_b_always_action_u1(self): + env = BobFriendEnvironment() + stats, _ = train(env, AlwaysAction_u1(env), num_episodes=10000) + avg = np.mean( [stat['Accumulated Reward'] for stat in stats] ) + self.assertL2(avg, 4, tol=0.5, msg="Average reward when we always take action u=0 must be about 4.") + + def test_b_always_action_u1_starting_200(self): + env = BobFriendEnvironment(x0=200) + stats, _ = train(env, AlwaysAction_u1(env), num_episodes=10000) + avg = np.mean( [stat['Accumulated Reward'] for stat in stats] ) + self.assertL2(avg, -42, tol=4, msg="Average reward when we always take action u=0 must be about 4.") + + def test_b_always_action_u0_starting_200(self): + env = BobFriendEnvironment(x0=200) + stats, _ = train(env, AlwaysAction_u0(env), num_episodes=10000) + avg = np.mean( [stat['Accumulated Reward'] for stat in stats] ) + self.assertL2(avg, 20, msg="Average reward when we always take action u=0 must be about 4.") + + + +class Problem5PacmanHardcoded(UTestCase): + """ Test the hardcoded pacman agent """ + def test_pacman(self): + env = PacmanEnvironment(layout_str=layout) + agent = GoAroundAgent(env) + stats, _ = train(env, agent, num_episodes=1) + self.assertEqual(stats[0]['Length'] < 100, True) + + +class Problem6ChessTournament(UTestCase): + def test_chess(self): + """ Test the correct result in the little chess-tournament """ + from irlc.ex01.chess import main + with Capturing2() as c: + main() + # Extract the numbers from the console output. 
+ print("Numbers extracted from console output was") + print(c.numbers) + self.assertLinf(c.numbers[-2], 26/33, tol=0.05) + +class Problem3InventoryInventoryEnvironment(UTestCase): + def test_environment(self): + env = InventoryEnvironment() + # agent = RandomAgent(env) + stats, _ = train(env, Agent(env), num_episodes=2000, verbose=False) + avg_reward = np.mean([stat['Accumulated Reward'] for stat in stats]) + self.assertLinf(avg_reward, tol=0.6) + + def test_random_agent(self): + env = InventoryEnvironment() + stats, _ = train(env, RandomAgent(env), num_episodes=2000, verbose=False) + avg_reward = np.mean([stat['Accumulated Reward'] for stat in stats]) + self.assertLinf(avg_reward, tol=0.6) + +class Problem4InventoryTrain(UTestCase): + def test_simplified_train(self): + env = InventoryEnvironment() + agent = Agent(env) + avg_reward_simplified_train = np.mean([simplified_train(env, agent) for i in range(1000)]) + self.assertLinf(avg_reward_simplified_train, tol=0.5) + +# class FrozenLakeTest(UTestCase): +# def test_frozen_lake(self): +# env = gym.make("FrozenLake-v1") +# agent = FrozenAgentDownRight(env) +# s = env.reset() +# for k in range(10): +# self.assertEqual(agent.pi(s, k), DOWN if k % 2 == 0 else RIGHT) + + +class Week01Tests(Report): #240 total. 
+ title = "Tests for week 01" + pack_imports = [irlc] + individual_imports = [] + questions = [ + (Problem1BobsFriend, 10), + (Problem2BobsPolicy, 10), + (Problem3InventoryInventoryEnvironment, 10), + (Problem4InventoryTrain, 10), + (Problem5PacmanHardcoded, 10), + (Problem6ChessTournament, 10), # Week 1: Everything + ] + +if __name__ == '__main__': + from unitgrade import evaluate_report_student + evaluate_report_student(Week01Tests()) diff --git a/irlc/tests/tests_week02.py b/irlc/tests/tests_week02.py new file mode 100644 index 0000000000000000000000000000000000000000..f8c474c657d24ff73716e5ff1d124808a4263c5e --- /dev/null +++ b/irlc/tests/tests_week02.py @@ -0,0 +1,270 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +# from irlc.ex02.graph_traversal import pi_inc, pi_smart, pi_silly, policy_rollout, SmallGraphDP +from irlc.ex02.graph_traversal import SmallGraphDP +from collections import defaultdict +# from irlc.ex02.chessmatch import ChessMatch +import gymnasium as gym +from unitgrade import Report +import irlc +from unitgrade import UTestCase +from irlc.ex02.inventory import InventoryDPModel, DP_stochastic +from irlc.ex02.graph_traversal import SmallGraphDP +# from irlc.ex02.frozen_lake_dp import Gym2DPModel + +def gN_dp(self, env): + for s in sorted(self.env.S(self.env.N)): + self.assertLinf(self.env.gN(s)) + +def f_dp(self, env): + self.assertEqualC(self.env.N) + + for k in range(self.env.N): + for s in sorted(self.env.S(k)): + for a in sorted(self.env.A(s,k)): + from collections import defaultdict + + dd_f = defaultdict(float) + # dd_g = defaultdict(float) + + for w, pw in self.env.Pw(s,a,k).items(): + dd_f[(s,a, self.env.f(s,a,w,k))] += pw + # dd_g[(s, a, self.env.g(s, a, w, k))] += pw + + # Check transition probabilities sum to 1. 
+ self.assertAlmostEqual(sum(dd_f.values()), 1, places=6) + # self.assertAlmostEqual(sum(dd_g.values()), 1, places=6) + + for key in sorted(dd_f.keys()): + self.assertEqualC(key) + self.assertLinf(dd_f[key], tol=1e-7) + + # for key in sorted(dd_g.keys()): + # self.assertEqualC(key) + # self.assertLinf(dd_g[key], tol=1e-7) + +def g_dp(self, env): + for k in range(self.env.N): + for s in sorted(self.env.S(k)): + for a in sorted(self.env.A(s, k)): + + # dd_f = defaultdict(float) + dd_g = defaultdict(float) + + for w, pw in self.env.Pw(s, a, k).items(): + # dd_f[(s, a, self.env.f(s, a, w, k))] += pw + dd_g[(s, a, self.env.g(s, a, w, k))] += pw + + # Check transition probabilities sum to 1. + # self.assertAlmostEqual(sum(dd_f.values()), 1, places=6) + self.assertAlmostEqual(sum(dd_g.values()), 1, places=6) + + # for key in sorted(dd_f.keys()): + # self.assertEqualC(key) + # self.assertLinf(dd_f[key], tol=1e-7) + + for key in sorted(dd_g.keys()): + self.assertEqualC(key) + self.assertLinf(dd_g[key], tol=1e-7) + + +class Problem1SmallGraph(UTestCase): + @property + def env(self): + return SmallGraphDP(t=5) + + # @classmethod + # def setUpClass(cls) -> None: + # cls.env = SmallGraphDP(t=5) + + # def test_N(self): + # self.assertEqualC(self.__class__.env.N) + + # def test_states(self): + # # for k in range(self.class.model.S): + # # self.assertEqualC(len(cls.model.S)) + # # self.assertEqualC() + # for k in range(self.env.N+1): + # self.assertEqualC(set(self.env.S(k))) + # + # def test_actions(self): + # for k in range(self.env.N): + # for s in sorted(self.env.S(k)): + # self.assertEqualC(set(self.env.A(s, k))) + + def test_f(self): + f_dp(self, self.env) + + def test_g(self): + g_dp(self, self.env) + + + def test_gN(self): + gN_dp(self, self.env) + + # def test_states(self): + # for k in range(self.env.N+1): + # self.assertEqualC(set(self.env.S(k))) + # + # def test_actions(self): + # for k in range(self.env.N): + # for s in sorted(self.env.S(k)): + # 
self.assertEqualC(set(self.env.A(s, k))) + + + +class Problem3StochasticDP(UTestCase): + """ Inventory control """ + def test_policy(self): + inv = InventoryDPModel() + J, pi = DP_stochastic(inv) + + # Test action at time step N-1 + self.assertEqual(pi[-1][0], 1) + self.assertEqual(pi[-1][1], 0) + self.assertEqual(pi[-1][2], 0) + + # test all actions at time step N-1 + self.assertEqualC(pi[-1]) + + # Test all actions at all time steps + self.assertEqualC(pi) + + def test_J(self): + inv = InventoryDPModel() + J, pi = DP_stochastic(inv) + + self.assertLinf(J[-1][0], tol=1e-8) + self.assertLinf(J[-1][1], tol=1e-8) + self.assertLinf(J[-1][2], tol=1e-8) + + for k in range(len(J)): + for x in [0,1,2]: + print("testing", J[k][x]) + self.assertLinf(J[k][x], tol=1e-8) + +class Problem4DPAgent(UTestCase): + def test_agent(self): + from irlc.ex01.inventory_environment import InventoryEnvironment + from irlc.ex02.inventory import InventoryDPModel + from irlc.ex02.dp_agent import DynamicalProgrammingAgent + env = InventoryEnvironment(N=3) + inventory = InventoryDPModel(N=3) + agent = DynamicalProgrammingAgent(env, model=inventory) + s0, _ = env.reset() + self.assertEqualC(agent.pi(s0, 0)) # We just test the first action. 
+ + +# class DPChessMatch(UTestCase): +# """ Chessmatch """ +# def test_J(self): +# N = 2 +# pw = 0.45 +# pd = 0.8 +# cm = ChessMatch(N, pw=pw, pd=pd) +# J, pi = DP_stochastic(cm) +# self.assertLinf(J[-1][0], tol=1e-4) +# self.assertLinf(J[-2][0], tol=1e-4) +# self.assertLinf(J[0][0], tol=1e-4) + + + + +# class SmallGraphPolicies(UTestCase): +# """ Test the policies in the small graph environment """ +# def test_pi_smart(self): +# self.assertEqual(pi_smart(1, 0), 5) +# +# def test_pi_inc(self): +# from irlc.ex02.graph_traversal import pi_inc, pi_smart, pi_silly +# for k in range(5): +# self.assertEqual(pi_inc(k+1, k), k+2) +# # self.assertEqual(pi_smart(k + 1, k), 5) +# # self.assertEqual(pi_smart(k + 1, k), 5) +# +# def test_rollout(self): +# # self.assertEqual(3, 1) +# t = 5 +# x0 = 1 # starting node +# model = SmallGraphDP(t=t) +# +# self.assertEqualC(policy_rollout(model, pi_silly, x0)[0]) +# self.assertEqualC(policy_rollout(model, pi_smart, x0)[0]) +# self.assertEqualC(policy_rollout(model, pi_inc, x0)[0]) + +class Problem2DeterministicDP(UTestCase): + def test_dp_deterministic(self): + model = SmallGraphDP(t=5) + J, pi = DP_stochastic(model) + + self.assertLinf(J[-1][1], tol=1e-5) + self.assertLinf(J[-1][2], tol=1e-5) + self.assertLinf(J[-1][3], tol=1e-5) + + self.assertLinf(J[0][1], tol=1e-5) + self.assertLinf(J[0][2], tol=1e-5) + self.assertLinf(J[0][3], tol=1e-5) + + +# class TestInventoryModel(UTestCase): +# @property +# def env(self): +# return InventoryDPModel() +# +# def test_gN(self): +# gN_dp(self, self.env) +# +# def test_f(self): +# f_dp(self, self.env) +# +# def test_g(self): +# g_dp(self, self.env) + + + +# class TestFrozenDP(UTestCase): +# @property +# def env(self): +# return Gym2DPModel(gym_env=gym.make("FrozenLake-v1")) +# +# def test_f(self): +# f_dp(self, self.env) +# +# def test_g(self): +# g_dp(self, self.env) + +class ExamQuestion7FlowersStore(UTestCase): + def test_a_get_policy(self): + from irlc.ex02.flower_store import a_get_policy + 
x0 = 0 + c = 0.5 + N = 3 + self.assertEqual(a_get_policy(N, c, x0), 1) + + def test_b_prob_empty(self): + from irlc.ex02.flower_store import b_prob_one + x0 = 0 + N = 3 + self.assertAlmostEqual(b_prob_one(N, x0), 0.492, places=2) + + +class Week02Tests(Report): + title = "Tests for week 02" + pack_imports = [irlc] + individual_imports = [] + questions = [ + (Problem1SmallGraph, 10), + (Problem2DeterministicDP, 10), + (Problem3StochasticDP, 10), + (Problem4DPAgent, 10), + (ExamQuestion7FlowersStore, 10), + ] + + +# (SmallGraphPolicies, 10), +# (TestInventoryModel, 10), +# (DPChessMatch, 10), +# (TestFrozenDP, 10), + +if __name__ == '__main__': + from unitgrade import evaluate_report_student + evaluate_report_student(Week02Tests() ) diff --git a/irlc/tests/tests_week03.py b/irlc/tests/tests_week03.py new file mode 100644 index 0000000000000000000000000000000000000000..403e29a53524dbc4250ec8b49a8dbd06bdc84e58 --- /dev/null +++ b/irlc/tests/tests_week03.py @@ -0,0 +1,88 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +from unitgrade import Report +import irlc +from unitgrade import UTestCase +from irlc.ex03.kuramoto import KuramotoModel, f +import sympy as sym +import numpy as np + +class Problem1Kuramoto(UTestCase): + """ Test the Kuromoto Osscilator """ + def test_continious_model(self): + cmodel = KuramotoModel() + x, u = sym.symbols("x u") + expr = cmodel.sym_f([x], [u]) + # Check the expression has the right type. + self.assertIsInstance(expr, list) + # Evaluate the expression and check the result in a given point. + self.assertEqualC(expr[0].subs([(x, 0.2), (u, 0.93)])) + + def test_f(self): + self.assertLinf(f([0.1], [0.4]), tol=1e-6) + + + def test_RK4(self): + from irlc.ex03.kuramoto import rk4_simulate + + cmodel = KuramotoModel() + x0 = np.asarray(cmodel.x0_bound().low) # Get the starting state x=0. 
+ u = 1.3 + xs, ts = rk4_simulate(x0, [u], t0=0, tF=20, N=100) + + # xs, us, ts = cmodel.simulate(x0, u_fun=u , t0=0, tF=20) + self.assertLinf(ts, tol=1e-6) + # self.assertLinf(us, tol=1e-6) + self.assertLinf(xs, tol=1e-6) + + # Test the same with a varying function: + xs, ts = rk4_simulate(x0, [u+1], t0=0, tF=10, N=50) + # xs, us, ts = cmodel.simulate(x0, u_fun=lambda x,t: np.sin(x + u) , t0=0, tF=10) + self.assertLinf(ts, tol=1e-6) + # self.assertLinf(us, tol=1e-6) + self.assertLinf(xs, tol=1e-6) + +class Exam5InventoryEvaluation(UTestCase): + def test_a_test_expected_items_next_day(self): + from irlc.ex03.inventory_evaluation import a_expected_items_next_day + self.assertAlmostEqual(a_expected_items_next_day(x=0, u=1), 0.1, places=5) + + def test_b_test_expected_items_next_day(self): + from irlc.ex03.inventory_evaluation import b_evaluate_policy + pi = self.get_pi() + self.assertAlmostEqual(b_evaluate_policy(pi, 1), 2.7, places=5) + + def get_pi(self): + from irlc.ex02.inventory import InventoryDPModel + model = InventoryDPModel() + pi = [{x: 1 if x == 0 else 0 for x in model.S(k)} for k in range(model.N)] + return pi + +class Exam6Toy2d(UTestCase): + def test_rk4_a(self): + from irlc.ex03.toy_2d_control import toy_simulation + w = toy_simulation(u0=0.4, T=5) + self.assertFalse(isinstance(w, np.ndarray), msg="Your toy_simulation function must return a float") + self.assertEqual(type(float(w)), float, msg="Your toy_simulation function must return a float") + self.assertLinf(w, tol=0.01, msg="Your simulation ended up at the wrong angle") + + def test_rk4_b(self): + from irlc.ex03.toy_2d_control import toy_simulation + w = toy_simulation(u0=-0.1, T=2) + self.assertFalse( isinstance(w, np.ndarray), msg="Your toy_simulation function must return a float") + self.assertEqual(type(float(w)), float, msg="Your toy_simulation function must return a float") + self.assertLinf(w, tol=0.01, msg="Your simulation ended up at the wrong angle") + + +class Week03Tests(Report): #240 
total. + title = "Tests for week 03" + pack_imports = [irlc] + individual_imports = [] + questions = [ + (Problem1Kuramoto, 10), + (Exam5InventoryEvaluation, 10), + (Exam6Toy2d, 10), + ] + +if __name__ == '__main__': + from unitgrade import evaluate_report_student + evaluate_report_student(Week03Tests()) diff --git a/irlc/tests/tests_week04.py b/irlc/tests/tests_week04.py new file mode 100644 index 0000000000000000000000000000000000000000..b032c0bc49423607e2113a55d244181aa7763ec7 --- /dev/null +++ b/irlc/tests/tests_week04.py @@ -0,0 +1,131 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +from unitgrade import Report +from unitgrade import UTestCase +import irlc +from irlc.car.car_model import CarEnvironment +from irlc.ex04.pid_car import PIDCarAgent +from irlc import train +from irlc.ex04.pid_locomotive_agent import LocomotiveEnvironment, PIDLocomotiveAgent +from irlc.ex03.kuramoto import KuramotoModel, f +from irlc.ex04.discrete_kuramoto import fk, dfk_dx +import sympy as sym +import numpy as np + +class Problem1DiscreteKuromoto(UTestCase): + """ Test the Kuromoto Osscilator """ + def test_continious_model(self): + cmodel = KuramotoModel() + x, u = sym.symbols("x u") + expr = cmodel.sym_f([x], [u]) + # Check the expression has the right type. + self.assertIsInstance(expr, list) + # Evaluate the expression and check the result in a given point. 
+ self.assertEqualC(expr[0].subs([(x, 0.2), (u, 0.93)])) + + def test_f(self): + self.assertLinf(f([0.1], [0.4]), tol=1e-6) + + def test_fk(self): + self.assertLinf(fk([0.1], [0.4]), tol=1e-6) + + def test_dfk_dx(self): + self.assertLinf(dfk_dx([0.1], [0.4]), tol=1e-6) + +class Problem3PID(UTestCase): + """ PID Control """ + + def test_pid_class(self, Kp=40, Ki=0, Kd=0, target=0, x=0): + dt = 0.08 + from irlc.ex04.pid import PID + pid = PID(Kp=Kp, Kd=Kd, Ki=Ki, target=target, dt=0.8) + u = pid.pi(x) + self.assertL2(u, tol=1e-4) + + def test_pid_Kp(self): + self.test_pid_class(40, 0, 0, 0, 1) + self.test_pid_class(10, 0, 0, 0, 2) + + + def test_pid_target(self): + self.test_pid_class(40, 0, 0, 3, 1) + self.test_pid_class(20, 0, 0, 0, 2) + + + def test_pid_all(self): + self.test_pid_class(4, 3, 8, 1, 1) + self.test_pid_class(40, 10, 3, 0, 2) + + +class Problem4PIDAgent(UTestCase): + """ PID Control """ + + def pid_locomotive(self, Kp=40, Ki=0, Kd=0, slope=0, target=0): + dt = 0.08 + env = LocomotiveEnvironment(m=10, slope=slope, dt=dt, Tmax=5) + agent = PIDLocomotiveAgent(env, dt=dt, Kp=Kp, Ki=Ki, Kd=Kd, target=target) + stats, traj = train(env, agent, return_trajectory=True, verbose=False) + self.assertL2(traj[0].state, tol=1e-4) + + def test_locomotive_flat(self): + self.pid_locomotive() + + def test_locomotive_Kd(self): + """ Test the derivative term """ + self.pid_locomotive(Kd = 10) + + def test_locomotive_Ki(self): + """ Test the integral term """ + self.pid_locomotive(Kd = 10, Ki=5, slope=5) + + + def test_locomotive_all(self): + """ Test all terms """ + self.pid_locomotive(Kp=35, Kd = 10, Ki=5, slope=5, target=1) + + + + +class Problem7PIDCar(UTestCase): + lt = -1 + + @classmethod + def setUpClass(cls) -> None: + env = CarEnvironment(noise_scale=0, Tmax=80, max_laps=2) + agent = PIDCarAgent(env, v_target=1.0) + stats, trajectories = train(env, agent, num_episodes=1, return_trajectory=True) + d = trajectories[0].state[:, 4] + lt = len(d) * env.dt / 2 + 
print("Lap time", lt) + cls.lt = lt + + def test_below_60(self): + """ Testing if lap time is < 60 """ + self.assertTrue(0 < self.__class__.lt < 60) + + def test_below_40(self): + """ Testing if lap time is < 60 """ + self.assertTrue(0 < self.__class__.lt < 40) + + + def test_below_30(self): + """ Testing if lap time is < 60 """ + self.assertTrue(0 < self.__class__.lt < 30) + + def test_below_22(self): + """ Testing if lap time is < 22 """ + self.assertTrue(0 < self.__class__.lt < 22) + +class Week04Tests(Report): + title = "Tests for week 04" + pack_imports = [irlc] + individual_imports = [] + questions = [ + (Problem1DiscreteKuromoto, 10), + (Problem3PID, 10), + (Problem4PIDAgent, 10), # ok + (Problem7PIDCar, 10), # ok + ] + +if __name__ == '__main__': + from unitgrade import evaluate_report_student + evaluate_report_student(Week04Tests()) diff --git a/irlc/tests/tests_week05.py b/irlc/tests/tests_week05.py new file mode 100644 index 0000000000000000000000000000000000000000..4a7f813840b6670d6caa99c16576d2b90ff7572c --- /dev/null +++ b/irlc/tests/tests_week05.py @@ -0,0 +1,114 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
+from unitgrade import Report +from irlc.ex05.direct_agent import train_direct_agent +from unitgrade import UTestCase +import irlc +from irlc.ex05.direct import run_direct_small_problem + + +class DirectMethods(UTestCase): + title = "Direct methods z, z0, z_lb/z_ub definitions+" + + @classmethod + def setUpClass(cls) -> None: + env, solution = run_direct_small_problem() + cls.solution = solution[-1] + + def test_z_variable_vector(self): + self.assertEqualC(str(DirectMethods.solution['inputs']['z'])) + + def test_z0_initial_state(self): + self.assertL2(DirectMethods.solution['inputs']['z0'], tol=1e-6) + + def test_zU_upper_bound(self): + self.assertL2(DirectMethods.solution['inputs']['z_ub'], tol=1e-6) + + def test_zL_lower_bound(self): + self.assertL2(DirectMethods.solution['inputs']['z_lb'], tol=1e-6) + + +class DirectAgentPendulum(UTestCase): + """ Direct agent: Test of pendulum environment """ + def test_pendulum(self): + stats,_,_ = train_direct_agent(animate=False) + return self.assertL2(stats[0]['Accumulated Reward'], tol=0.03) + +class DirectSolverQuestion(UTestCase): + """ Test the Direct solver on the Pendulum using run_direct_small_problem() """ + @classmethod + def setUpClass(cls): + cls.solution = cls.compute_solution() + + @classmethod + def compute_solution(cls): + from irlc.ex05.direct import run_direct_small_problem + env, solution = run_direct_small_problem() + return solution + # cls.solution = solution + + def test_solver_success(self): + self.assertTrue(self.__class__.solution[-1]['solver']['success']) + + def test_solver_fun(self): + self.assertL2(self.__class__.solution[-1]['solver']['fun'], tol=0.01) + + def test_constraint_violation(self): + self.assertL2(self.__class__.solution[-1]['eqC_val'], tol=0.01) + + +class PendulumQuestion(DirectSolverQuestion): + """ Direct solver on the pendulum problem """ + @classmethod + def compute_solution(cls): + from irlc.ex05.direct_pendulum import compute_pendulum_solutions + return 
compute_pendulum_solutions()[1] + + +class CartpoleTimeQuestion(DirectSolverQuestion): + """ Direct solver on the cartpole (minimum time) task """ + @classmethod + def compute_solution(cls): + from irlc.ex05.direct_cartpole_time import compute_solutions + return compute_solutions()[1] + + +class CartpoleCostQuestion(DirectSolverQuestion): + """ Direct solver on the cartpole (kelly) task """ + @classmethod + def compute_solution(cls): + from irlc.ex05.direct_cartpole_kelly import compute_solutions + return compute_solutions()[1] + +class BrachistochroneQuestion(DirectSolverQuestion): + """ Brachistochrone (unconstrained) """ + + @classmethod + def compute_solution(cls): + from irlc.ex05.direct_brachistochrone import compute_constrained_solutions + return compute_constrained_solutions()[1] + +class BrachistochroneConstrainedQuestion(DirectSolverQuestion): + """ Brachistochrone (constrained) """ + @classmethod + def compute_solution(cls): + from irlc.ex05.direct_brachistochrone import compute_constrained_solutions + return compute_constrained_solutions()[1] + +class Week05Tests(Report): + title = "Tests for week 05" + pack_imports = [irlc] + individual_imports = [] + questions = [ + (DirectMethods, 10), # ok + (DirectSolverQuestion, 10), # ok + (PendulumQuestion, 5), # ok + (DirectAgentPendulum, 10), # ok + (CartpoleTimeQuestion, 5), # ok + (CartpoleCostQuestion, 5), # ok + (BrachistochroneQuestion, 5), # ok + (BrachistochroneConstrainedQuestion, 10), # ok + ] + +if __name__ == '__main__': + from unitgrade import evaluate_report_student + evaluate_report_student(Week05Tests()) diff --git a/irlc/tests/tests_week06.py b/irlc/tests/tests_week06.py new file mode 100644 index 0000000000000000000000000000000000000000..a72463838f7fba08f8db8d4bf789532d313e7e2d --- /dev/null +++ b/irlc/tests/tests_week06.py @@ -0,0 +1,147 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. 
If this file contains other copyright notices disregard this text. +from irlc.ex06.model_boeing import BoeingEnvironment +from unitgrade import UTestCase, Report +import irlc +from irlc import train +import numpy as np +from irlc.ex04.locomotive import LocomotiveEnvironment +from irlc.ex04.model_harmonic import HarmonicOscilatorEnvironment + +matrices = ['L', 'l', 'V', 'v', 'vc'] + +class Problem3LQR(UTestCase): + title = "LQR, full check of implementation" + + @classmethod + def setUpClass(cls): + # def init(self): + from irlc.ex06.dlqr_check import check_LQR + (cls.L, cls.l), (cls.V, cls.v, cls.vc) = check_LQR() + # self.M = list(zip(matrices, [L, l, V, v, vc])) + + def chk_item(self, m_list): + self.assertIsInstance(m_list, list) + self.assertEqualC(len(m_list)) + for m in m_list: + self.assertIsInstance(m, np.ndarray) + self.assertEqualC(m.shape) + self.assertL2(m, tol=1e-6) + + def test_L(self): + self.chk_item(self.__class__.L) + + def test_l(self): + self.chk_item(self.__class__.l) + + def test_V(self): + self.chk_item(self.__class__.V) + + def test_v(self): + self.chk_item(self.__class__.v) + + def test_vc(self): + vc = self.__class__.vc + self.assertIsInstance(vc, list) + for d in vc: + self.assertL2(d, tol=1e-6) + + self.chk_item(self.__class__.l) + +class Problem4LQRAgent(UTestCase): + def _mkagent(self, val=0.): + A = np.ones((2, 2))* (1+val) + A[1, 0] = 0 + B = np.asarray([[0], [1]]) + Q = np.eye(2) * (3+val) + R = np.ones((1, 1)) * 2 + q = np.asarray([-1.1 + val, 0]) + from irlc.ex06.lqr_agent import LQRAgent + env = LocomotiveEnvironment(render_mode=None, Tmax=5, slope=1) + agent = LQRAgent(env, A=A, B=B, Q=Q, R=R, q=q) + return agent + + def test_policy_lqr_a(self): + agent = self._mkagent(0) + self.assertL2(agent.pi(np.asarray([1, 0]), k=0)) + self.assertL2(agent.pi(np.asarray([1, 0]), k=5)) + + def test_policy_lqr_b(self): + agent = self._mkagent(0.2) + self.assertL2(agent.pi(np.asarray([1, 0]), k=0)) + self.assertL2(agent.pi(np.asarray([1, 0]), 
k=5)) + +class Problem5_6_Boeing(UTestCase): + + def test_compute_A_B_d(self): + from irlc.ex06.boeing_lqr import compute_A_B_d, compute_Q_R_q + model = BoeingEnvironment(Tmax=10).discrete_model.continuous_model + A, B, d = compute_A_B_d(model, dt=0.2) + self.assertL2(A) + self.assertL2(B) + self.assertL2(d) + + def test_compute_Q_R_q(self): + from irlc.ex06.boeing_lqr import compute_A_B_d, compute_Q_R_q + model = BoeingEnvironment(Tmax=10).discrete_model.continuous_model + Q, R, q = compute_Q_R_q(model, dt=0.2) + self.assertL2(Q) + self.assertL2(R) + self.assertL2(q) + + def test_boing_path(self): + from irlc.ex06.boeing_lqr import boeing_simulation + stats, trajectories, env = boeing_simulation() + self.assertL2(trajectories[-1].state, tol=1e-6) + + +class Problem7_8_PidLQR(UTestCase): + def test_constant_lqr_agent(self): + Delta = 0.06 # Time discretization constant + # Define a harmonic osscilator environment. Use .., render_mode='human' to see a visualization. + env = HarmonicOscilatorEnvironment(Tmax=8, dt=Delta, m=0.5, R=np.eye(1) * 8, + render_mode=None) # set render_mode='human' to see the oscillator. + model = env.discrete_model.continuous_model # Get the ControlModel corresponding to this environment. + + from irlc.ex06.boeing_lqr import compute_A_B_d, compute_Q_R_q + from irlc.ex06.lqr_pid import ConstantLQRAgent + A, B, d = compute_A_B_d(model, Delta) + Q, R, q = compute_Q_R_q(model, Delta) + x0, _ = env.reset() + + # Run the LQR agent + lqr_agent = ConstantLQRAgent(env, A=A, B=B, d=d, Q=Q, R=R, q=q) + self.assertLinf(lqr_agent.pi(x0, k=0), tol=1e-3) + self.assertLinf(lqr_agent.pi(x0, k=10), tol=1e-3) + + + def test_KpKd(self): + Delta = 0.06 # Time discretization constant + # Define a harmonic osscilator environment. Use .., render_mode='human' to see a visualization. + env = HarmonicOscilatorEnvironment(Tmax=8, dt=Delta, m=0.5, R=np.eye(1) * 8, + render_mode=None) # set render_mode='human' to see the oscillator. 
+ model = env.discrete_model.continuous_model # Get the ControlModel corresponding to this environment. + from irlc.ex06.boeing_lqr import compute_A_B_d, compute_Q_R_q + from irlc.ex06.lqr_pid import ConstantLQRAgent, get_Kp_Kd + A, B, d = compute_A_B_d(model, Delta) + Q, R, q = compute_Q_R_q(model, Delta) + lqr_agent = ConstantLQRAgent(env, A=A, B=B, d=d, Q=Q, R=R, q=q) + Kp, Kd = get_Kp_Kd(lqr_agent.L[0]) + self.assertAlmostEqualC(Kp, places=3) + self.assertAlmostEqualC(Kd, places=3) + + + + +class Week06Tests(Report): + title = "Tests for week 06" + pack_imports = [irlc] + individual_imports = [] + questions = [ + (Problem3LQR, 10), + (Problem4LQRAgent, 10), + (Problem5_6_Boeing, 10), + (Problem7_8_PidLQR, 10), + ] +if __name__ == '__main__': + from unitgrade import evaluate_report_student + evaluate_report_student(Week06Tests()) diff --git a/irlc/tests/tests_week07.py b/irlc/tests/tests_week07.py new file mode 100644 index 0000000000000000000000000000000000000000..1f46427025ca7e635d340fafa678f4a7e2c309a7 --- /dev/null +++ b/irlc/tests/tests_week07.py @@ -0,0 +1,62 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
+from unitgrade import Report +import irlc +from unitgrade import UTestCase +import numpy as np +from irlc import Agent, train + +class RendevouzItem(UTestCase): + def test_rendevouz_without_linesearch(self): + """ Rendevouz with iLQR (no linesearch) """ + from irlc.ex07.ilqr_rendovouz_basic import solve_rendovouz + (xs, us, J_hist, l, L), env = solve_rendovouz(use_linesearch=False) + # print(J_hist[-1]) + self.assertL2(xs[-1], tol=1e-2) + + def test_rendevouz_with_linesearch(self): + """ Rendevouz with iLQR (with linesearch) """ + from irlc.ex07.ilqr_rendovouz_basic import solve_rendovouz + (xs, us, J_hist, l, L), env = solve_rendovouz(use_linesearch=True) + # print(J_hist[-1]) + self.assertL2(xs[-1], tol=1e-2) + # return l, L, xs + + + + + +class ILQRAgentQuestion(UTestCase): + """ iLQR Agent on Rendevouz """ + def test_ilqr_agent(self): + from irlc.ex07.ilqr_agent import solve_rendevouz + stats, trajectories, agent = solve_rendevouz() + self.assertL2(trajectories[-1].state[-1], tol=1e-2) + + +class ILQRPendulumQuestion(UTestCase): + """ iLQR Agent on Pendulum """ + + def test_ilqr_agent_pendulum(self): + from irlc.ex07.ilqr_pendulum_agent import Tmax, N + from irlc.ex04.model_pendulum import GymSinCosPendulumEnvironment + from irlc.ex07.ilqr_agent import ILQRAgent + dt = Tmax / N + env = GymSinCosPendulumEnvironment(dt, Tmax=Tmax, supersample_trajectory=True) + agent = ILQRAgent(env, env.discrete_model, N=N, ilqr_iterations=200, use_linesearch=True) + stats, trajectories = train(env, agent, num_episodes=1, return_trajectory=True) + state = trajectories[-1].state[-1] + self.assertL2(state, tol=2e-2) + +class Week07Tests(Report): #240 total. 
+ title = "Tests for week 07" + pack_imports = [irlc] + individual_imports = [] + questions = [ + (RendevouzItem, 10), # ok + (ILQRAgentQuestion, 10), # ok + (ILQRPendulumQuestion, 10), # ok + ] + +if __name__ == '__main__': + from unitgrade import evaluate_report_student + evaluate_report_student(Week07Tests()) diff --git a/irlc/tests/tests_week08.py b/irlc/tests/tests_week08.py new file mode 100644 index 0000000000000000000000000000000000000000..340d69c01c3ef2cae94901444ba52b9887a47bef --- /dev/null +++ b/irlc/tests/tests_week08.py @@ -0,0 +1,278 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +from unitgrade import UTestCase, Report, cache +import numpy as np +from irlc import train + + +def train_recording(env, agent, trajectories): + for t in trajectories: + env.reset() + for k in range(len(t.action)): + s = t.state[k] + r = t.reward[k] + a = t.action[k] + sp = t.state[k+1] + agent.pi(s,k) + agent.train(s, a, r, sp, done=k == len(t.action)-1) + + +class BanditQuestion(UTestCase): + """ Value (Q) function estimate """ + tol = 1e-2 # tie-breaking in the gradient bandit is ill-defined. 
+ # testfun = QPrintItem.assertL2 + + # def setUpClass(cls) -> None: + # from irlc.ex08.simple_agents import BasicAgent + # from irlc.ex08.bandits import StationaryBandit + # env = StationaryBandit(k=10, ) + # agent = BasicAgent(env, epsilon=0.1) + # _, cls.trajectories = train(env, agent, return_trajectory=True, num_episodes=1, max_steps=100) + # cls.Q = agent.Q + # cls.env = env + # cls.agent = agent + + def get_env_agent(self): + from irlc.ex08.simple_agents import BasicAgent + from irlc.ex08.bandits import StationaryBandit + env = StationaryBandit(k=10) + agent = BasicAgent(env, epsilon=0.1) + return env, agent + + @cache + def get_trajectories(self): + env, agent = self.get_env_agent() + _, trajectories = train(env, agent, return_trajectory=True, num_episodes=1, max_steps=100) + return trajectories + + # def precompute_payload(self): + # env, agent = self.get_env_agent() + # _, trajectories = train(env, agent, return_trajectory=True, num_episodes=1, max_steps=100) + # return trajectories, agent.Q + + + def test_agent(self): + trajectories = self.get_trajectories() + env, agent = self.get_env_agent() + train_recording(env, agent, trajectories) + self.assertL2(agent.Q, tol=1e-5) + # return agent.Q + # self.Q = Q + # self.question.agent = agent + # return agent.Q + + # testfun = QPrintItem.assertL2 + + def test_action_distributin(self): + T = 10000 + tol = 1 / np.sqrt(T) * 5 + trajectories = self.get_trajectories() + env, agent = self.get_env_agent() + train_recording(env, agent, trajectories) + # for k in self._cache.keys(): print(k) + + from collections import Counter + counts = Counter([agent.pi(None, k) for k in range(T)]) + distrib = [counts[k] / T for k in range(env.k)] + self.assertL2(np.asarray(distrib), tol=tol) + + + # def process_output(self, res, txt, numbers): + # return res + + # def process_output(self, res, txt, numbers): + # return res + # + # def test(self, computed, expected): + # super().test(computed, self.Q) + +# class 
BanditQuestion(QPrintItem): +# # tol = 1e-6 +# tol = 1e-2 # tie-breaking in the gradient bandit is ill-defined. +# title = "Value (Q) function estimate" +# testfun = QPrintItem.assertL2 +# +# def get_env_agent(self): +# from irlc.ex08.simple_agents import BasicAgent +# from irlc.ex08.bandits import StationaryBandit +# env = StationaryBandit(k=10, ) +# agent = BasicAgent(env, epsilon=0.1) +# return env, agent +# +# def precompute_payload(self): +# env, agent = self.get_env_agent() +# _, trajectories = train(env, agent, return_trajectory=True, num_episodes=1, max_steps=100) +# return trajectories, agent.Q +# +# def compute_answer_print(self): +# trajectories, Q = self.precomputed_payload() +# env, agent = self.get_env_agent() +# train_recording(env, agent, trajectories) +# self.Q = Q +# self.question.agent = agent +# return agent.Q +# +# def process_output(self, res, txt, numbers): +# return res +# +# def test(self, computed, expected): +# super().test(computed, self.Q) +# +# class BanditItemActionDistribution(QPrintItem): +# # Assumes setup has already been done. 
+# title = "Action distribution test" +# T = 10000 +# tol = 1/np.sqrt(T)*5 +# testfun = QPrintItem.assertL2 +# +# def compute_answer_print(self): +# # print("In agent print code") +# from collections import Counter +# counts = Counter( [self.question.agent.pi(None, k) for k in range(self.T)] ) +# distrib = [counts[k] / self.T for k in range(self.question.agent.env.k)] +# return np.asarray(distrib) +# +# def process_output(self, res, txt, numbers): +# return res +# +# class BanditQuestion(QuestionGroup): +# title = "Simple bandits" +# class SimpleBanditItem(BanditItem): +# #title = "Value function estimate" +# def get_env_agent(self): +# from irlc.ex08.simple_agents import BasicAgent +# from irlc.ex08.bandits import StationaryBandit +# env = StationaryBandit(k=10, ) +# agent = BasicAgent(env, epsilon=0.1) +# return env, agent +# class SimpleBanditActionDistribution(BanditItemActionDistribution): +# pass + + + +class GradientBanditQuestion(BanditQuestion): + """ Gradient agent """ + # class SimpleBanditItem(BanditItem): + # title = "Simple agent question" + def get_env_agent(self): + from irlc.ex08.bandits import StationaryBandit + from irlc.ex08.gradient_agent import GradientAgent + env = StationaryBandit(k=10) + agent = GradientAgent(env, alpha=0.05) + return env, agent + + # def precompute_payload(self): + # env, agent = self.get_env_agent() + # _, trajectories = train(env, agent, return_trajectory=True, num_episodes=1, max_steps=100) + # return trajectories + + def test_agent(self): + trajectories = self.get_trajectories() + env, agent = self.get_env_agent() + train_recording(env, agent, trajectories) + self.assertL2(agent.H, tol=1e-5) + + + # def test(self, computed, expected): + # self.testfun(computed, self.H) + # + # class SimpleBanditActionDistribution(BanditItemActionDistribution): + # pass + + +# class GradientBanditQuestion(QuestionGroup): +# title = "Gradient agent" +# class SimpleBanditItem(BanditItem): +# # title = "Simple agent question" +# def 
class UCBAgentQuestion(BanditQuestion):
    """ UCB agent """

    def get_env_agent(self):
        # Returns a fresh (environment, agent) pair: a StationaryBandit with k=10
        # and a UCBAgent constructed on it with its default arguments.
        from irlc.ex08.bandits import StationaryBandit
        from irlc.ex08.ucb_agent import UCBAgent
        bandit = StationaryBandit(k=10)
        return bandit, UCBAgent(bandit)
class NonstatiotnaryAgentQuestion(BanditQuestion):
    """ Nonstationary bandit environment """
    # The docstring previously read " UCB agent " (copied from UCBAgentQuestion); it now
    # matches the title of the legacy, commented-out version of this question.
    # NOTE: the misspelled class name is kept as-is because the Week08Tests report refers
    # to it by this name.

    def get_env_agent(self):
        # Returns a fresh (environment, agent) pair: a NonstationaryBandit with k=10 and a
        # MovingAverageAgent constructed on it with epsilon=0.1 and alpha=0.15.
        from irlc.ex08.nonstationary import NonstationaryBandit, MovingAverageAgent
        epsilon = 0.1
        bandit = NonstationaryBandit(k=10)
        agent = MovingAverageAgent(bandit, epsilon=epsilon, alpha=0.15)
        return bandit, agent
def test_part2_v2q(self):
    ## Part 2
    # Build a non-trivial value function on the non-terminal states of FrozenLake; the
    # specific numbers only need to be deterministic.
    mdp = FrozenLake(living_reward=0.3).mdp
    V = {state: 2 * (state[0] - state[1]) - 3.5 for state in sorted(mdp.nonterminal_states)}

    from irlc.ex09.mdp_warmup import value_function2q_function

    probe_states = [(0, 1), (2, 3), (0, 3), (1,3), (1, 2)]
    start = mdp.initial_state

    # Structural checks on the Q-values returned for the initial state.
    q_start = value_function2q_function(mdp, s=start, gamma=0.9, v=V)
    self.assertIsInstance(q_start, dict)
    self.assertEqual(sorted(q_start), [0, 1, 2, 3])
    self.assertEqual(len(q_start), 4)
    self.assertEqual(len(value_function2q_function(mdp, s=(1,2), gamma=0.9, v=V)), 1)

    # The *C assertions below are issued in exactly the same order as before, since
    # they are compared against stored reference values.
    self.assertAlmostEqualC(q_start[0], places=4)
    self.assertAlmostEqualC(q_start[2], places=4)

    for state in sorted(probe_states):
        q_state = value_function2q_function(mdp, s=state, gamma=0.9, v=V)
        for action in (0, 1, 2, 3):
            if action not in mdp.A(state):
                continue  # action not available in this state
            self.assertAlmostEqualC(q_state[action], places=4)
def train_recording(env, agent, trajectories):
    """Replay previously recorded trajectories through ``agent``.

    For every trajectory ``env.reset()`` is called once, and then, for each
    recorded step ``k``, ``agent.pi(s, k)`` is invoked (its return value is
    discarded) followed by ``agent.train`` with the recorded transition.
    ``done`` is True only on the last step of a trajectory.

    :param env: Environment; only its ``reset()`` method is used here.
    :param agent: Object providing ``pi(s, k)`` and
        ``train(s, a, r, sp, done=..., info_s=..., info_sp=...)``.
    :param trajectories: Iterable of records with indexable ``state``, ``action``,
        ``reward`` and ``info`` attributes; ``state`` and ``info`` are indexed one
        position past the last action.
    """
    for trajectory in trajectories:
        env.reset()
        final_step = len(trajectory.action) - 1
        for k, a in enumerate(trajectory.action):
            s, sp = trajectory.state[k], trajectory.state[k + 1]
            r = trajectory.reward[k]
            info_s, info_sp = trajectory.info[k], trajectory.info[k + 1]

            # The policy is queried before each update although the recorded action is
            # what gets trained on. NOTE(review): presumably this keeps any per-step
            # state inside pi() in sync with a real training run -- confirm.
            agent.pi(s, k)
            agent.train(s, a, r, sp, done=(k == final_step), info_s=info_s, info_sp=info_sp)
import GamblerEnv + env = GamblerEnv() + pi, v = value_iteration(env, gamma=0.91) + self.check_value_function(env, v) + +# class JackQuestion(ValueFunctionTest): +# """ Gambler's problem """ +# def test_jacks_rental_value_function(self): +# # from irlc.ex09.small_gridworld import SmallGridworldMDP, plot_value_function +# # from irlc.ex09.policy_iteration import policy_iteration +# # from irlc.ex09.value_iteration import value_iteration +# # from irlc.ex09.gambler import GamblerEnv +# from irlc.ex09.jacks_car_rental import JackRentalMDP +# max_cars = 5 +# env = JackRentalMDP(max_cars=max_cars, verbose=True) +# pi, V = value_iteration(env, gamma=.9, theta=1e-3, max_iters=1000, verbose=True) +# self.check_value_function(env, V) + +# class JackQuestion(QuestionGroup): +# title = "Jacks car rental problem" +# +# class JackItem(GridworldDPItem): +# title = "Value function test" +# max_cars = 5 +# tol = 0.01 +# +# def get_value_function(self): +# from irlc.ex09.value_iteration import value_iteration +# from irlc.ex09.jacks_car_rental import JackRentalMDP +# env = JackRentalMDP(max_cars=self.max_cars, verbose=True) +# pi, V = value_iteration(env, gamma=.9, theta=1e-3, max_iters=1000, verbose=True) +# return V, env + + + # return v, env + # pass +# class DynamicalProgrammingGroup(QuestionGroup): +# title = "Dynamical Programming test" +# +# class PolicyEvaluationItem(GridworldDPItem): +# title = "Iterative Policy evaluation" +# +# +# +# class PolicyIterationItem(GridworldDPItem): +# title = "policy iteration" +# def get_value_function(self): +# from irlc.ex09.small_gridworld import SmallGridworldMDP +# from irlc.ex09.policy_iteration import policy_iteration +# env = SmallGridworldMDP() +# pi, v = policy_iteration(env, gamma=0.91) +# return v, env +# class ValueIteartionItem(GridworldDPItem): +# title = "value iteration" +# +# def get_value_function(self): +# from irlc.ex09.value_iteration import value_iteration +# from irlc.ex09.small_gridworld import SmallGridworldMDP +# 
class Problem8ValueIterationAgent(UTestCase):
    """ Value-iteration agent test """

    def chk_mean_accumulated_reward(self, env, tol=1e-2):
        # Shared body of the two tests below: build a ValueIterationAgent from the
        # environment's own MDP, run 1000 episodes with train(), and compare the mean of
        # the per-episode 'Accumulated Reward' entries via assertL2.
        from irlc.ex09.value_iteration_agent import ValueIterationAgent
        agent = ValueIterationAgent(env, mdp=env.mdp)
        stats, _ = train(env, agent, num_episodes=1000)
        episode_returns = [episode['Accumulated Reward'] for episode in stats]
        self.assertL2(np.mean(episode_returns), tol=tol)

    def test_sutton_gridworld(self):
        from irlc.gridworld.gridworld_environments import SuttonCornerGridEnvironment
        self.chk_mean_accumulated_reward(SuttonCornerGridEnvironment(living_reward=-1))

    def test_bookgrid_gridworld(self):
        from irlc.gridworld.gridworld_environments import BookGridEnvironment
        self.chk_mean_accumulated_reward(BookGridEnvironment(living_reward=-1))
class Week09Tests(Report):
    # Report definition for week 09: each entry of `questions` pairs a test class
    # with the integer listed next to it.
    title = "Tests for week 09"
    pack_imports = [irlc]
    individual_imports = []
    questions = [
        (Problem1_to_3_Warmup, 10),
        (Problem4PolicyEvaluation, 10),
        (Problem5PolicyIteration, 10),
        (Problem6ValueIteration, 10),
        (Problem8ValueIterationAgent, 10),
        (Problem9Gambler, 10),
        # Currently disabled:
        # (JackQuestion, 10),
        # (ValueFunctionTest, 20),
    ]
class TD0Question(UTestCase):
    """ Test of TD(0) evaluation agent """
    gamma = 0.8

    def get_env_agent(self):
        # Returns a fresh (environment, agent) pair on "SmallGridworld-v0"; the agent is
        # built with this class's gamma.
        from irlc.ex10.td0_evaluate import TD0ValueAgent
        env = gym.make("SmallGridworld-v0")
        # env = TimeLimit(env, max_episode_steps=1000)
        return env, TD0ValueAgent(env, gamma=self.gamma)

    @cache
    def compute_trajectories(self):
        # Train one episode (at most 100 steps) and return the recorded trajectories
        # together with the value table (agent.v) the agent ended up with.
        env, agent = self.get_env_agent()
        _, recorded = train(env, agent, return_trajectory=True, num_episodes=1, max_steps=100)
        return recorded, agent.v

    def test_value_function(self):
        # Replay the recorded trajectories through a new agent and check that it arrives
        # at the same value for every state present in the reference table.
        trajectories, v = self.compute_trajectories()
        env, agent = self.get_env_agent()
        train_recording(env, agent, trajectories)
        expected = list(v.values())
        computed = [agent.v[state] for state in v]
        self.assertL2(expected, computed, tol=1e-5)
class Week10Tests(Report):
    # Report definition for week 10: each entry of `questions` pairs a test class
    # with the integer listed next to it.
    title = "Tests for week 10"
    pack_imports = [irlc]
    individual_imports = []
    questions = [
        (MCAgentQuestion, 10),
        (MCEvaluationQuestion, 10),
        # (BlackjackQuestion,5),
        (TD0Question, 10),
        (ExamQuestionTD0, 10),
    ]
class QAgentQuestion(MCAgentQuestion):
    """ Test of Q Agent """

    def get_env_agent(self):
        # Returns a fresh (environment, agent) pair: "SmallGridworld-v0" and a QAgent
        # constructed on it with gamma=.8.
        from irlc.ex11.q_agent import QAgent
        gridworld = gym.make("SmallGridworld-v0")
        return gridworld, QAgent(gridworld, gamma=.8)
class LinearAgentStub(UTestCase):
    # Base class for the linear function-approximation agent tests. Subclasses supply
    # get_env_agent() and call chk_Q_weight_vector_w() from a test method, typically
    # after overriding tol_w.
    tol = 1e-6
    alpha = 0.08
    num_episodes = 300
    gamma = 0.8
    tol_w = 1e-5  # tolerance passed to assertL2 when comparing weight vectors

    def get_env_agent(self):
        # Must be overridden to return an (environment, agent) pair.
        raise NotImplementedError()

    def get_env(self):
        return gym.make("MountainCar500-v0")

    @cache
    def compute_trajectories(self):
        # Train one episode (at most 100 steps) and return the recorded trajectories
        # together with the weight vector (agent.Q.w) the agent ended up with.
        env, agent = self.get_env_agent()
        _, recorded = train(env, agent, return_trajectory=True, num_episodes=1, max_steps=100)
        return recorded, agent.Q.w

    def chk_Q_weight_vector_w(self):
        # Replay the recorded trajectories through a new agent and compare its weight
        # vector with the reference one, within tol_w.
        trajectories, reference_w = self.compute_trajectories()
        env, agent = self.get_env_agent()
        train_recording(env, agent, trajectories)
        print(reference_w)
        print(agent.Q.w)
        self.assertL2(agent.Q.w, reference_w, tol=self.tol_w)
class LinearQAgentQuestion(LinearAgentStub):
    """ Test of Linear Q Agent """

    def get_env_agent(self):
        # Returns a fresh (environment, agent) pair; the agent is a LinearSemiGradQAgent
        # built with gamma=1, alpha=0.1 and epsilon=0.
        environment = self.get_env()
        from irlc.ex11.semi_grad_q import LinearSemiGradQAgent
        return environment, LinearSemiGradQAgent(environment, gamma=1, alpha=0.1, epsilon=0)

    def test_Q_weight_vector_w(self):
        # Override the stub's weight-vector tolerance for this agent before running
        # the shared check.
        self.tol_w = 7
        self.chk_Q_weight_vector_w()
class LinearSarsaNstepAgentQuestion(LinearAgentStub):
    """ Test of Linear n-step sarsa Agent """
    tol = 2200
    num_episodes = 150
    gamma = 1
    tol_w = 2.5

    def get_env_agent(self):
        # Returns a fresh (environment, agent) pair. The step size is the module-level
        # `alpha` defined in irlc.ex12.semi_grad_sarsa_lambda, not this class's attribute.
        environment = self.get_env()
        from irlc.ex12.semi_grad_nstep_sarsa import LinearSemiGradSarsaN
        from irlc.ex12.semi_grad_sarsa_lambda import alpha as step_size
        learner = LinearSemiGradSarsaN(environment, gamma=self.gamma, alpha=step_size, epsilon=0)
        return environment, learner

    def test_Q_weight_vector_w(self):
        self.chk_Q_weight_vector_w()
class DoubleQQuestion(TabularAgentStub):
    """ Double Q learning """

    def test_accumulated_reward(self):
        # Train for 5000 episodes, then check (a) the Q1-values of the state obtained from
        # a fresh env.reset() and (b) the mean per-episode 'Accumulated Reward'.
        # The method intentionally returns nothing: it used to return `stats`, but
        # returning a non-None value from a test method is deprecated by unittest.
        # NOTE(review): assumes UTestCase builds on unittest.TestCase -- confirm.
        env, agent = self.get_env_agent()
        stats, _ = train(env, agent, num_episodes=5000)
        s, info = env.reset()
        _, qs = agent.Q1.get_Qs(s, info)
        self.assertL2(qs, tol=10)
        self.assertL2(np.mean([episode['Accumulated Reward'] for episode in stats]), tol=self.tol)

    def get_env_agent(self):
        # Returns (environment, agent); the environment is the one held by the agent.
        from irlc.ex13.tabular_double_q import TabularDoubleQ
        agent = TabularDoubleQ(self.get_env(), gamma=self.gamma)
        return agent.env, agent
gamma=self.gamma) +# return agent.env, agent +# +# class DoubleQItem(SarsaTypeQItem): +# tol = 1 +# def compute_answer_print(self): +# s = self.question.env.reset() +# actions, qs = self.question.agent.Q1.get_Qs(s) +# return qs +# title = "Double Q action distribution" +# +# class DynaQQuestion(QuestionGroup): +# title = "Dyna Q learning" +# class DynaQReturnItem(SarsaReturnTypeItem): +# def get_env_agent(self): +# from irlc.ex13.dyna_q import DynaQ +# agent = DynaQ(self.get_env(), gamma=self.gamma) +# return agent.env, agent +# +# class DynaQItem(SarsaTypeQItem): +# title = "Dyna Q action distribution" diff --git a/irlc/tests/unitgrade_data/BanditQuestion.pkl b/irlc/tests/unitgrade_data/BanditQuestion.pkl new file mode 100644 index 0000000000000000000000000000000000000000..2c5a18391d06f2648ea043f218bd0c21e42e5bf7 Binary files /dev/null and b/irlc/tests/unitgrade_data/BanditQuestion.pkl differ diff --git a/irlc/tests/unitgrade_data/BrachistochroneConstrainedQuestion.pkl b/irlc/tests/unitgrade_data/BrachistochroneConstrainedQuestion.pkl new file mode 100644 index 0000000000000000000000000000000000000000..f89ebd0b17dcf615ea283960334caa2a4c4a402d Binary files /dev/null and b/irlc/tests/unitgrade_data/BrachistochroneConstrainedQuestion.pkl differ diff --git a/irlc/tests/unitgrade_data/BrachistochroneQuestion.pkl b/irlc/tests/unitgrade_data/BrachistochroneQuestion.pkl new file mode 100644 index 0000000000000000000000000000000000000000..f89ebd0b17dcf615ea283960334caa2a4c4a402d Binary files /dev/null and b/irlc/tests/unitgrade_data/BrachistochroneQuestion.pkl differ diff --git a/irlc/tests/unitgrade_data/CartpoleCostQuestion.pkl b/irlc/tests/unitgrade_data/CartpoleCostQuestion.pkl new file mode 100644 index 0000000000000000000000000000000000000000..f89ebd0b17dcf615ea283960334caa2a4c4a402d Binary files /dev/null and b/irlc/tests/unitgrade_data/CartpoleCostQuestion.pkl differ diff --git a/irlc/tests/unitgrade_data/CartpoleTimeQuestion.pkl 
b/irlc/tests/unitgrade_data/CartpoleTimeQuestion.pkl new file mode 100644 index 0000000000000000000000000000000000000000..f89ebd0b17dcf615ea283960334caa2a4c4a402d Binary files /dev/null and b/irlc/tests/unitgrade_data/CartpoleTimeQuestion.pkl differ diff --git a/irlc/tests/unitgrade_data/DirectAgentPendulum.pkl b/irlc/tests/unitgrade_data/DirectAgentPendulum.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0c8ed034d34e4f1249429c51e84cde358e9e0c53 Binary files /dev/null and b/irlc/tests/unitgrade_data/DirectAgentPendulum.pkl differ diff --git a/irlc/tests/unitgrade_data/DirectMethods.pkl b/irlc/tests/unitgrade_data/DirectMethods.pkl new file mode 100644 index 0000000000000000000000000000000000000000..7807f4c5c34901502f48afab8f9e4b5368ddb9c7 Binary files /dev/null and b/irlc/tests/unitgrade_data/DirectMethods.pkl differ diff --git a/irlc/tests/unitgrade_data/DirectSolverQuestion.pkl b/irlc/tests/unitgrade_data/DirectSolverQuestion.pkl new file mode 100644 index 0000000000000000000000000000000000000000..f89ebd0b17dcf615ea283960334caa2a4c4a402d Binary files /dev/null and b/irlc/tests/unitgrade_data/DirectSolverQuestion.pkl differ diff --git a/irlc/tests/unitgrade_data/DoubleQQuestion.pkl b/irlc/tests/unitgrade_data/DoubleQQuestion.pkl new file mode 100644 index 0000000000000000000000000000000000000000..338359a9b6b0183a8855b431d00c4230184b6531 Binary files /dev/null and b/irlc/tests/unitgrade_data/DoubleQQuestion.pkl differ diff --git a/irlc/tests/unitgrade_data/DynaQQuestion.pkl b/irlc/tests/unitgrade_data/DynaQQuestion.pkl new file mode 100644 index 0000000000000000000000000000000000000000..881f7e8445a02d321d4c116613de9ba2555be5b4 Binary files /dev/null and b/irlc/tests/unitgrade_data/DynaQQuestion.pkl differ diff --git a/irlc/tests/unitgrade_data/Exam5InventoryEvaluation.pkl b/irlc/tests/unitgrade_data/Exam5InventoryEvaluation.pkl new file mode 100644 index 0000000000000000000000000000000000000000..9fdf18dc7fa643011b14ff9347ca6e4d145fe2b1 
Binary files /dev/null and b/irlc/tests/unitgrade_data/Exam5InventoryEvaluation.pkl differ diff --git a/irlc/tests/unitgrade_data/Exam6Toy2d.pkl b/irlc/tests/unitgrade_data/Exam6Toy2d.pkl new file mode 100644 index 0000000000000000000000000000000000000000..1670e52ae3c857948adaa2e31f95386afc7141ed Binary files /dev/null and b/irlc/tests/unitgrade_data/Exam6Toy2d.pkl differ diff --git a/irlc/tests/unitgrade_data/ExamQuestion7FlowersStore.pkl b/irlc/tests/unitgrade_data/ExamQuestion7FlowersStore.pkl new file mode 100644 index 0000000000000000000000000000000000000000..cb028ef6008443edeaf2d59477e056c3f5c3435b Binary files /dev/null and b/irlc/tests/unitgrade_data/ExamQuestion7FlowersStore.pkl differ diff --git a/irlc/tests/unitgrade_data/ExamQuestionTD0.pkl b/irlc/tests/unitgrade_data/ExamQuestionTD0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..8058414ea4d73d7348da513847bf99750505a346 Binary files /dev/null and b/irlc/tests/unitgrade_data/ExamQuestionTD0.pkl differ diff --git a/irlc/tests/unitgrade_data/GradientBanditQuestion.pkl b/irlc/tests/unitgrade_data/GradientBanditQuestion.pkl new file mode 100644 index 0000000000000000000000000000000000000000..2c5a18391d06f2648ea043f218bd0c21e42e5bf7 Binary files /dev/null and b/irlc/tests/unitgrade_data/GradientBanditQuestion.pkl differ diff --git a/irlc/tests/unitgrade_data/ILQRAgentQuestion.pkl b/irlc/tests/unitgrade_data/ILQRAgentQuestion.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e59b83ccdc4b3f776c5a856eec67e5fd46e7d7d0 Binary files /dev/null and b/irlc/tests/unitgrade_data/ILQRAgentQuestion.pkl differ diff --git a/irlc/tests/unitgrade_data/ILQRPendulumQuestion.pkl b/irlc/tests/unitgrade_data/ILQRPendulumQuestion.pkl new file mode 100644 index 0000000000000000000000000000000000000000..4f0c03f218e5d0b490f478d91439da34bd99266e Binary files /dev/null and b/irlc/tests/unitgrade_data/ILQRPendulumQuestion.pkl differ diff --git 
a/irlc/tests/unitgrade_data/LinearQAgentQuestion.pkl b/irlc/tests/unitgrade_data/LinearQAgentQuestion.pkl new file mode 100644 index 0000000000000000000000000000000000000000..b33a50f38070a14eeec2fae6b109a09352122a5e Binary files /dev/null and b/irlc/tests/unitgrade_data/LinearQAgentQuestion.pkl differ diff --git a/irlc/tests/unitgrade_data/LinearSarsaAgentQuestion.pkl b/irlc/tests/unitgrade_data/LinearSarsaAgentQuestion.pkl new file mode 100644 index 0000000000000000000000000000000000000000..aa697f66dfdc784268f10c345ef0eedd1d3aec6e Binary files /dev/null and b/irlc/tests/unitgrade_data/LinearSarsaAgentQuestion.pkl differ diff --git a/irlc/tests/unitgrade_data/LinearSarsaLambdaAgentQuestion.pkl b/irlc/tests/unitgrade_data/LinearSarsaLambdaAgentQuestion.pkl new file mode 100644 index 0000000000000000000000000000000000000000..ac6a2adfc3d2faa7e4931cd49899e28973477304 Binary files /dev/null and b/irlc/tests/unitgrade_data/LinearSarsaLambdaAgentQuestion.pkl differ diff --git a/irlc/tests/unitgrade_data/LinearSarsaNstepAgentQuestion.pkl b/irlc/tests/unitgrade_data/LinearSarsaNstepAgentQuestion.pkl new file mode 100644 index 0000000000000000000000000000000000000000..8488e90d29bc64d169076721ebc84563b582102a Binary files /dev/null and b/irlc/tests/unitgrade_data/LinearSarsaNstepAgentQuestion.pkl differ diff --git a/irlc/tests/unitgrade_data/MCAgentQuestion.pkl b/irlc/tests/unitgrade_data/MCAgentQuestion.pkl new file mode 100644 index 0000000000000000000000000000000000000000..713e0329e51abbb76d789ee80671d47f60c6f853 Binary files /dev/null and b/irlc/tests/unitgrade_data/MCAgentQuestion.pkl differ diff --git a/irlc/tests/unitgrade_data/MCEvaluationQuestion.pkl b/irlc/tests/unitgrade_data/MCEvaluationQuestion.pkl new file mode 100644 index 0000000000000000000000000000000000000000..f4f0406a0c565589ce80ec15300a0a6fb8a40aa2 Binary files /dev/null and b/irlc/tests/unitgrade_data/MCEvaluationQuestion.pkl differ diff --git a/irlc/tests/unitgrade_data/NStepSarsaQuestion.pkl 
b/irlc/tests/unitgrade_data/NStepSarsaQuestion.pkl new file mode 100644 index 0000000000000000000000000000000000000000..c98a9b05f5d8467d392af2386aa023b3f1b6751b Binary files /dev/null and b/irlc/tests/unitgrade_data/NStepSarsaQuestion.pkl differ diff --git a/irlc/tests/unitgrade_data/NonstatiotnaryAgentQuestion.pkl b/irlc/tests/unitgrade_data/NonstatiotnaryAgentQuestion.pkl new file mode 100644 index 0000000000000000000000000000000000000000..2c5a18391d06f2648ea043f218bd0c21e42e5bf7 Binary files /dev/null and b/irlc/tests/unitgrade_data/NonstatiotnaryAgentQuestion.pkl differ diff --git a/irlc/tests/unitgrade_data/PendulumQuestion.pkl b/irlc/tests/unitgrade_data/PendulumQuestion.pkl new file mode 100644 index 0000000000000000000000000000000000000000..f89ebd0b17dcf615ea283960334caa2a4c4a402d Binary files /dev/null and b/irlc/tests/unitgrade_data/PendulumQuestion.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem1BobsFriend.pkl b/irlc/tests/unitgrade_data/Problem1BobsFriend.pkl new file mode 100644 index 0000000000000000000000000000000000000000..d78b291c68506ecf4fd989c63798af420f4a6796 Binary files /dev/null and b/irlc/tests/unitgrade_data/Problem1BobsFriend.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem1DiscreteKuromoto.pkl b/irlc/tests/unitgrade_data/Problem1DiscreteKuromoto.pkl new file mode 100644 index 0000000000000000000000000000000000000000..92cac8fbf95496ae843852ed47ffa0126de01034 Binary files /dev/null and b/irlc/tests/unitgrade_data/Problem1DiscreteKuromoto.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem1Kuramoto.pkl b/irlc/tests/unitgrade_data/Problem1Kuramoto.pkl new file mode 100644 index 0000000000000000000000000000000000000000..be8942b1ca5a38b7fc7729522a74b70bfb9ff86f Binary files /dev/null and b/irlc/tests/unitgrade_data/Problem1Kuramoto.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem1SmallGraph.pkl b/irlc/tests/unitgrade_data/Problem1SmallGraph.pkl new file mode 100644 index 
0000000000000000000000000000000000000000..457e6cfae3680e41113fb761612b2d6549e0f33a Binary files /dev/null and b/irlc/tests/unitgrade_data/Problem1SmallGraph.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem1_to_3_Warmup.pkl b/irlc/tests/unitgrade_data/Problem1_to_3_Warmup.pkl new file mode 100644 index 0000000000000000000000000000000000000000..5fe4c1c222730aae3e22f3624738a4c59c0eac1d Binary files /dev/null and b/irlc/tests/unitgrade_data/Problem1_to_3_Warmup.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem2BobsPolicy.pkl b/irlc/tests/unitgrade_data/Problem2BobsPolicy.pkl new file mode 100644 index 0000000000000000000000000000000000000000..6a3aeda8f5e65ae42b2ef46f1e849800365db6cd Binary files /dev/null and b/irlc/tests/unitgrade_data/Problem2BobsPolicy.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem2DeterministicDP.pkl b/irlc/tests/unitgrade_data/Problem2DeterministicDP.pkl new file mode 100644 index 0000000000000000000000000000000000000000..b2c9f862612e2ec027398436366b5a8529c55b9e Binary files /dev/null and b/irlc/tests/unitgrade_data/Problem2DeterministicDP.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem3InventoryInventoryEnvironment.pkl b/irlc/tests/unitgrade_data/Problem3InventoryInventoryEnvironment.pkl new file mode 100644 index 0000000000000000000000000000000000000000..2884379ef4ad61d5c1f40f6b0d358ed37807e00a Binary files /dev/null and b/irlc/tests/unitgrade_data/Problem3InventoryInventoryEnvironment.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem3LQR.pkl b/irlc/tests/unitgrade_data/Problem3LQR.pkl new file mode 100644 index 0000000000000000000000000000000000000000..dd9396d7727e03e60310dbeb194c6fa0e926ad71 Binary files /dev/null and b/irlc/tests/unitgrade_data/Problem3LQR.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem3PID.pkl b/irlc/tests/unitgrade_data/Problem3PID.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0a50350d1e3873dc5a0027ddef29807f8cb73567 Binary files /dev/null 
and b/irlc/tests/unitgrade_data/Problem3PID.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem3StochasticDP.pkl b/irlc/tests/unitgrade_data/Problem3StochasticDP.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a9e1f718b4a5fe6496c6ce865c9debb2532f36a8 Binary files /dev/null and b/irlc/tests/unitgrade_data/Problem3StochasticDP.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem4DPAgent.pkl b/irlc/tests/unitgrade_data/Problem4DPAgent.pkl new file mode 100644 index 0000000000000000000000000000000000000000..4803c3df36a3efdeb10bbaf156bb55a8cbaf8a78 Binary files /dev/null and b/irlc/tests/unitgrade_data/Problem4DPAgent.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem4InventoryTrain.pkl b/irlc/tests/unitgrade_data/Problem4InventoryTrain.pkl new file mode 100644 index 0000000000000000000000000000000000000000..1d5ec57b3873975837e96bcefd4032d25e793dff Binary files /dev/null and b/irlc/tests/unitgrade_data/Problem4InventoryTrain.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem4LQRAgent.pkl b/irlc/tests/unitgrade_data/Problem4LQRAgent.pkl new file mode 100644 index 0000000000000000000000000000000000000000..fdf20eabff7843729b6fc296dc17968c489684c2 Binary files /dev/null and b/irlc/tests/unitgrade_data/Problem4LQRAgent.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem4PIDAgent.pkl b/irlc/tests/unitgrade_data/Problem4PIDAgent.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0f93099ff362ede4c65df1a0b1d97713b51cd828 Binary files /dev/null and b/irlc/tests/unitgrade_data/Problem4PIDAgent.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem4PolicyEvaluation.pkl b/irlc/tests/unitgrade_data/Problem4PolicyEvaluation.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e508cb77936ac1c0cf998ccfc2e2c6ef195f1fae Binary files /dev/null and b/irlc/tests/unitgrade_data/Problem4PolicyEvaluation.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem5PacmanHardcoded.pkl 
b/irlc/tests/unitgrade_data/Problem5PacmanHardcoded.pkl new file mode 100644 index 0000000000000000000000000000000000000000..7456df2b732f41fded93f4777ebfbb7de7ac4635 Binary files /dev/null and b/irlc/tests/unitgrade_data/Problem5PacmanHardcoded.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem5PolicyIteration.pkl b/irlc/tests/unitgrade_data/Problem5PolicyIteration.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a735b4e1bfbb9327fef0896cac48c27181fd52e6 Binary files /dev/null and b/irlc/tests/unitgrade_data/Problem5PolicyIteration.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem5_6_Boeing.pkl b/irlc/tests/unitgrade_data/Problem5_6_Boeing.pkl new file mode 100644 index 0000000000000000000000000000000000000000..8962e84a52523dd962cccb029fe7420f69b17262 Binary files /dev/null and b/irlc/tests/unitgrade_data/Problem5_6_Boeing.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem6ChessTournament.pkl b/irlc/tests/unitgrade_data/Problem6ChessTournament.pkl new file mode 100644 index 0000000000000000000000000000000000000000..4f72ab97d7183e2c0a92e473bbc41b1995e7700e Binary files /dev/null and b/irlc/tests/unitgrade_data/Problem6ChessTournament.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem6ValueIteration.pkl b/irlc/tests/unitgrade_data/Problem6ValueIteration.pkl new file mode 100644 index 0000000000000000000000000000000000000000..11808b575d0b326f4d0f2f81886157ec91205e30 Binary files /dev/null and b/irlc/tests/unitgrade_data/Problem6ValueIteration.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem7PIDCar.pkl b/irlc/tests/unitgrade_data/Problem7PIDCar.pkl new file mode 100644 index 0000000000000000000000000000000000000000..c838b752379b807ec2bc13988d3e7a5185e68f1d Binary files /dev/null and b/irlc/tests/unitgrade_data/Problem7PIDCar.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem7_8_PidLQR.pkl b/irlc/tests/unitgrade_data/Problem7_8_PidLQR.pkl new file mode 100644 index 
0000000000000000000000000000000000000000..d72e621ef2aca6c202e903ee802dce60b23cabdd Binary files /dev/null and b/irlc/tests/unitgrade_data/Problem7_8_PidLQR.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem8ValueIterationAgent.pkl b/irlc/tests/unitgrade_data/Problem8ValueIterationAgent.pkl new file mode 100644 index 0000000000000000000000000000000000000000..945ae0a0ae941571d4533d2c71c2f01c82beb9cc Binary files /dev/null and b/irlc/tests/unitgrade_data/Problem8ValueIterationAgent.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem9Gambler.pkl b/irlc/tests/unitgrade_data/Problem9Gambler.pkl new file mode 100644 index 0000000000000000000000000000000000000000..edce9bc7f5b6c047adacf9bc2b090c35dead8b63 Binary files /dev/null and b/irlc/tests/unitgrade_data/Problem9Gambler.pkl differ diff --git a/irlc/tests/unitgrade_data/QAgentQuestion.pkl b/irlc/tests/unitgrade_data/QAgentQuestion.pkl new file mode 100644 index 0000000000000000000000000000000000000000..016ea7eaeaa3fb5d36c759909b4f43314d6ed9d9 Binary files /dev/null and b/irlc/tests/unitgrade_data/QAgentQuestion.pkl differ diff --git a/irlc/tests/unitgrade_data/RendevouzItem.pkl b/irlc/tests/unitgrade_data/RendevouzItem.pkl new file mode 100644 index 0000000000000000000000000000000000000000..c007adc309fa60055fc59ec8c3bba3ce8aab72a2 Binary files /dev/null and b/irlc/tests/unitgrade_data/RendevouzItem.pkl differ diff --git a/irlc/tests/unitgrade_data/SarsaLambdaQuestion.pkl b/irlc/tests/unitgrade_data/SarsaLambdaQuestion.pkl new file mode 100644 index 0000000000000000000000000000000000000000..5ba0d797f173e178b1d8c28c2beabf4941532788 Binary files /dev/null and b/irlc/tests/unitgrade_data/SarsaLambdaQuestion.pkl differ diff --git a/irlc/tests/unitgrade_data/SarsaQuestion.pkl b/irlc/tests/unitgrade_data/SarsaQuestion.pkl new file mode 100644 index 0000000000000000000000000000000000000000..157e90bfa38a07c4ecb0813c73f687ce88904ca7 Binary files /dev/null and b/irlc/tests/unitgrade_data/SarsaQuestion.pkl differ diff 
--git a/irlc/tests/unitgrade_data/TD0Question.pkl b/irlc/tests/unitgrade_data/TD0Question.pkl new file mode 100644 index 0000000000000000000000000000000000000000..6a18f61d92942a8ae3c89fcaa3a6e16b94322e4f Binary files /dev/null and b/irlc/tests/unitgrade_data/TD0Question.pkl differ diff --git a/irlc/tests/unitgrade_data/UCBAgentQuestion.pkl b/irlc/tests/unitgrade_data/UCBAgentQuestion.pkl new file mode 100644 index 0000000000000000000000000000000000000000..2c5a18391d06f2648ea043f218bd0c21e42e5bf7 Binary files /dev/null and b/irlc/tests/unitgrade_data/UCBAgentQuestion.pkl differ diff --git a/irlc/update_files.py b/irlc/update_files.py new file mode 100644 index 0000000000000000000000000000000000000000..783901432a68d39a1853059cf06e79caada6b778 --- /dev/null +++ b/irlc/update_files.py @@ -0,0 +1,109 @@ +import fnmatch +import requests +from io import BytesIO +import zipfile +import os +import sys + +print("Hello! This is an automatic updating script that will perform the following operations:") +print("1) Download the most current version of the course material from gitlab") +print("2) Check if you are missing any files and create them") +print("3) update this script to the most recent version") +print("4) Update certain files that you should not edit (_grade-scripts and so on) to the most recent version") + +url_install = "https://02465material.pages.compute.dtu.dk/02465public/information/installation.html" +sdir = os.path.dirname(__file__) +dry = False + +if "02465public" in sdir and "tuhe" in sdir: + dry = True + print("-"*100) + print("It has been detected that this script is running on the teachers computer.") + print("This means that your files will not be overwritten normally.") + print("In the highly unusual case this is a mistake, please change dry=False in the code.") + print("-"*100) + # raise Exception("(teachers not to himself: Don't run this on your own computer)") + + +print("The script is being run using python version:", sys.executable) + +if not 
def read_and_extract_zip(url):
    """Download the course archive at ``url`` and sync it into the local students folder.

    Every regular file in the archive is mapped to a path below the folder that
    contains the ``irlc`` package (two levels above this script). A file is written when

    * it does not exist locally, or
    * it matches one of the ``always_overwrite`` patterns and its bytes differ
      from the archived version.

    Existing files that match no pattern are treated as student-owned and left alone.
    When the module-level ``dry`` flag is set, the function only reports what it
    would write.

    :param url: URL of a GitLab ``.../archive/main/<name>.zip`` archive. The text between
        ``/main/`` and ``.zip`` is used as the archive's top-level folder name.
    :return: ``None``. Progress and download failures are reported on stdout.
    """
    # Top-level folder inside the archive, e.g. "02465students-main".
    base_dir = url.split("/main/")[-1].split(".zip")[0]
    response = requests.get(url)
    local_students_folder = os.path.dirname(os.path.dirname(__file__))
    always_overwrite = ['irlc/update_files.py', 'irlc/__init__.py', 'irlc/tests/*', '**/unitgrade_data/*.pkl', 'irlc/car/*', 'irlc/gridworld/*', 'irlc/pacman/*', 'irlc/utils/*', '*_grade.py', '*/project*_tests.py']

    if response.status_code != 200:
        print(f"Failed to download the zip file. Status code: {response.status_code}. The DTU Gitlab server may be overloaded, unavailable, or you have no network.")
        return

    with zipfile.ZipFile(BytesIO(response.content), 'r') as zip_ref:
        for member in zip_ref.infolist():
            if member.is_dir():
                continue

            rp = os.path.relpath(member.filename, base_dir)
            new_path = os.path.join(local_students_folder, rp)
            exists = os.path.isfile(new_path)
            managed = any(fnmatch.fnmatch(rp, pattern) for pattern in always_overwrite)
            if exists and not managed:
                # Student-owned file that is already present: never touch it.
                continue

            # Decompress the member once; it is needed for both the comparison and the write.
            payload = zip_ref.read(member.filename)

            if exists:
                try:
                    with open(new_path, 'rb') as local_file:
                        if local_file.read() == payload:
                            continue  # Already up to date.
                except OSError:
                    # An unreadable local copy is replaced rather than aborting the update.
                    print("Problem reading local file", new_path)

            print("> Overwriting...", new_path)
            if not dry:
                target_dir = os.path.dirname(new_path)
                if target_dir:
                    os.makedirs(target_dir, exist_ok=True)
                with open(new_path, 'wb') as f:
                    f.write(payload)
Your files should be up to date.") \ No newline at end of file diff --git a/irlc/utils/__init__.py b/irlc/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a56057c84d0ceac54aab1d40ba0f370c77fe10be --- /dev/null +++ b/irlc/utils/__init__.py @@ -0,0 +1 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. diff --git a/irlc/utils/__pycache__/__init__.cpython-311.pyc b/irlc/utils/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d927f6899883d10645c4359d8b581fd82bf24276 Binary files /dev/null and b/irlc/utils/__pycache__/__init__.cpython-311.pyc differ diff --git a/irlc/utils/__pycache__/common.cpython-311.pyc b/irlc/utils/__pycache__/common.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..117dbad6cb0d38b3aafcc6ba26318720288fab93 Binary files /dev/null and b/irlc/utils/__pycache__/common.cpython-311.pyc differ diff --git a/irlc/utils/__pycache__/graphics_util_pygame.cpython-311.pyc b/irlc/utils/__pycache__/graphics_util_pygame.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2a2de9b202c80da608b15438e243b1157f00709d Binary files /dev/null and b/irlc/utils/__pycache__/graphics_util_pygame.cpython-311.pyc differ diff --git a/irlc/utils/__pycache__/irlc_plot.cpython-311.pyc b/irlc/utils/__pycache__/irlc_plot.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7aa1b05105fae565302e536103d713e6c25f1e18 Binary files /dev/null and b/irlc/utils/__pycache__/irlc_plot.cpython-311.pyc differ diff --git a/irlc/utils/__pycache__/lazylog.cpython-311.pyc b/irlc/utils/__pycache__/lazylog.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..46db0cc630ee8ec0f509e98df741e7c06852dc91 Binary files /dev/null and 
b/irlc/utils/__pycache__/lazylog.cpython-311.pyc differ diff --git a/irlc/utils/__pycache__/player_wrapper.cpython-311.pyc b/irlc/utils/__pycache__/player_wrapper.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f57153c5e14540a2c97df79d5ceba88e28817fea Binary files /dev/null and b/irlc/utils/__pycache__/player_wrapper.cpython-311.pyc differ diff --git a/irlc/utils/__pycache__/ptext.cpython-311.pyc b/irlc/utils/__pycache__/ptext.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eb451879a9f6a5d67719c036754531e523e49a34 Binary files /dev/null and b/irlc/utils/__pycache__/ptext.cpython-311.pyc differ diff --git a/irlc/utils/__pycache__/timer.cpython-311.pyc b/irlc/utils/__pycache__/timer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..92b2dccac3ab657ffcb6961d6f22f710e8c4a9ea Binary files /dev/null and b/irlc/utils/__pycache__/timer.cpython-311.pyc differ diff --git a/irlc/utils/common.py b/irlc/utils/common.py new file mode 100644 index 0000000000000000000000000000000000000000..43c9d705fefb113279d5337f49235f3c268b33b5 --- /dev/null +++ b/irlc/utils/common.py @@ -0,0 +1,206 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
class defaultdict2(collections.defaultdict):
    """A ``defaultdict`` whose factory may optionally receive the missing key.

    If ``default_factory`` is a plain Python function (this includes lambdas) that
    declares exactly one positional argument, it is called as ``factory(key)``.
    Plain functions with any other argument count are called without arguments.
    Every other kind of factory (``list``, ``int``, other callables) is handled by
    the regular ``defaultdict`` machinery.
    """

    def __missing__(self, key):
        factory = self.default_factory
        if factory is None:
            raise KeyError((key,))

        if not isinstance(factory, types.FunctionType):
            # Builtins, classes and other callables: defer to the standard behaviour.
            return super().__missing__(key)

        wants_key = len(inspect.getfullargspec(factory).args) == 1
        value = factory(key) if wants_key else factory()
        self[key] = value
        return value
def log_time_series(experiment, list_obs, max_xticks_to_log=None, run_name=None):
    """Write a list of observation dictionaries to a new ``LazyLog`` run below ``experiment``.

    One row is logged per observation. The columns are the keys of the first
    observation followed by any additional keys found in the last one. If an
    observation lacks a column, the value from the next observation that has it is
    used, and ``nan`` if there is none. ``Steps`` and ``Episode`` columns are filled
    with the row index whenever the observation does not supply them.

    :param experiment: Directory of the experiment; the log is created below ``f"{experiment}/"``.
    :param list_obs: Non-empty list of dictionaries (one per row).
    :param max_xticks_to_log: If given and smaller than ``len(list_obs)``, only this many
        (approximately evenly spaced) rows are kept.
    :param run_name: Unused; kept for backward compatibility.
    :return: The ``experiment_name`` attribute of the ``LazyLog`` instance that was written to.
    """
    logdir = f"{experiment}/"

    if max_xticks_to_log is not None and len(list_obs) > max_xticks_to_log:
        # ``np.int`` was removed in NumPy 1.24; the builtin ``int`` is the supported spelling.
        # A set makes the membership test below O(1) per row.
        ticks = np.round(np.linspace(0, len(list_obs) - 1, max_xticks_to_log)).astype(int)
        keep = set(ticks.tolist())
        list_obs = [o for i, o in enumerate(list_obs) if i in keep]

    akeys = list(list_obs[0].keys())
    akeys += [k for k in list_obs[-1].keys() if k not in akeys]
    with LazyLog(logdir) as logz:
        for n, l in enumerate(list_obs):
            for k in akeys:
                if k in l:
                    v = l.get(k)
                else:
                    # Borrow the value from the next observation that has this column.
                    v = next((later[k] for later in list_obs[n:] if k in later), None)
                    if v is None:
                        v = np.nan
                logz.log_tabular(k, v)
            if "Steps" not in l:
                logz.log_tabular("Steps", n)
            if "Episode" not in l:
                logz.log_tabular("Episode", n)
            logz.dump_tabular(verbose=False)
        experiment_name = logz.experiment_name
    return experiment_name
+ + def __init__(self, env): + self.env = env + self.start = 0 + raise Exception() + # pass + # self.actions = actions + # super().__init__(len(actions)) + + @property + def actions(self): + return self.env.A(self.env.state) + + @property + def n(self): + return len(self.actions) + + def sample(self): + return np.random.choice(self.actions) diff --git a/irlc/utils/graphics/car.png b/irlc/utils/graphics/car.png new file mode 100644 index 0000000000000000000000000000000000000000..386a86e58b77fe213f662d638df86609c5294be2 Binary files /dev/null and b/irlc/utils/graphics/car.png differ diff --git a/irlc/utils/graphics/dtu_icon.png b/irlc/utils/graphics/dtu_icon.png new file mode 100644 index 0000000000000000000000000000000000000000..9bcea902ea9d3e647d7b73e3a90dbc194dfdfd8b Binary files /dev/null and b/irlc/utils/graphics/dtu_icon.png differ diff --git a/irlc/utils/graphics/locomotive.png b/irlc/utils/graphics/locomotive.png new file mode 100644 index 0000000000000000000000000000000000000000..95e5dc6682930e7ecff714937e2f021e5c26b98d Binary files /dev/null and b/irlc/utils/graphics/locomotive.png differ diff --git a/irlc/utils/graphics_util_pygame.py b/irlc/utils/graphics_util_pygame.py new file mode 100644 index 0000000000000000000000000000000000000000..379244046b13dd3d776b366074d5ef77bd29418e --- /dev/null +++ b/irlc/utils/graphics_util_pygame.py @@ -0,0 +1,415 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +# graphicsUtils.py +# ---------------- +# Licensing Information: You are free to use or extend these projects for +# educational purposes provided that (1) you do not distribute or publish +# solutions, (2) you retain this notice, and (3) you provide clear +# attribution to UC Berkeley, including a link to http://ai.berkeley.edu. +# +# Attribution Information: The Pacman AI projects were developed at UC Berkeley. 
def h2rgb(color):
    """Convert a hex color string to an ``(r, g, b)`` tuple of floats in ``[0, 1]``.

    :param color: ``"#rrggbb"`` or ``"rrggbb"``; ``None`` and tuples are returned unchanged.
    :return: Tuple of three floats, or the input itself if it is ``None`` or a tuple.
    """
    if color is None or isinstance(color, tuple):
        return color
    if color.startswith("#"):
        color = color[1:]
    return tuple(int(color[i:i + 2], 16) / 255 for i in (0, 2, 4))


def h2rgb255(color):
    """Convert a hex color string to an ``(r, g, b)`` tuple of integers in ``[0, 255]``.

    :param color: ``"#rrggbb"`` or ``"rrggbb"``; ``None`` and tuples are returned unchanged.
    :return: Tuple of three ints, or the input itself if it is ``None`` or a tuple.
    """
    # ``None`` must be handled here: ``h2rgb(None)`` returns ``None``, which cannot be iterated.
    if color is None or isinstance(color, tuple):
        return color
    # Round instead of truncating so floating-point error in the /255 * 255 round trip
    # can never lower a channel by one.
    return tuple(round(cc * 255) for cc in h2rgb(color))
self._items_in_viewer.items() if k in self._seen_things } + if self.verbose: + print("removed", len(self._items_in_viewer) - s0, "geom size", len(self._items_in_viewer)) + self.viewer.geoms = list( self._items_in_viewer.values() ) + self._seen_things = set() + + + def add_geometry(self, name, geom): + if self.break_cache: + if self._items_in_viewer == None: + self.viewer.geoms = [] + self._items_in_viewer = {} + + self._items_in_viewer[name] = geom + self._seen_things.add(name) + + + +class GraphicsUtilGym: + viewer = None + _canvas_xs = None # Size of canvas object + _canvas_ys = None + _canvas_x = None # Current position on canvas + _canvas_y = None + + def begin_graphics(self, width=640, height=480, color=formatColor(0, 0, 0), title="02465 environment", local_xmin_xmax_ymin_ymax=None, verbose=False, + frames_per_second=None): + """ Main interface for managing graphics. + The local_xmin_xmax_ymin_ymax controls the (local) coordinate system which is mapped onto screen coordinates. I.e. specify this + to work in a native x/y coordinate system. If not, it will default to screen coordinates familiar from Gridworld. + """ + width = int(width) + height = int(height) # For width/height to be integers to avoid crashes on some systems. + + icon = os.path.dirname(__file__) + "/../utils/graphics/dtu_icon.png" + pygame_icon = pygame.image.load(icon) + pygame.display.set_icon(pygame_icon) + screen_width = width + screen_height = height + pygame.init() + pygame.display.init() + self.frames_per_second = frames_per_second + + + self.screen = pygame.display.set_mode( + (screen_width, screen_height) + ) + self.screen_width = width + self.screen_height = height + + pygame.display.set_caption(title) + + if height % 2 == 1: + height += 1 # Must be divisible by 2. 
+ self._bg_color = color + # viewer = Viewer(width=int(width), height=int(height)) + # viewer.window.set_caption(title) + # self.viewer = viewer + # self.gc = GraphicsCache(viewer, verbose=verbose) + self._canvas_xs, self._canvas_ys = width - 1, height - 1 + self._canvas_x, self._canvas_y = 0, self._canvas_ys + if local_xmin_xmax_ymin_ymax is None: + # local_coordinates = [] + # This will align the coordinate system so it begins in the top-left corner. + # This is the default behavior of pygame. + local_xmin_xmax_ymin_ymax = (0, width, 0, height) + self._local_xmin_xmax_ymin_ymax = local_xmin_xmax_ymin_ymax + + self.demand_termination = threading.Event() + self.pause_refresh = False + self.ask_for_pause = False + self.is_paused = False + self.time_last_blit = -1 + + + def refresh_window(gutils): + refresh_interval_seconds = 0.1 # Miliseconds + t0 = time.time() + while not gutils.demand_termination.is_set(): + t1 = time.time() + if t1 - t0 > refresh_interval_seconds: + if not self.ask_for_pause: + self.is_paused = False + if not (sys.platform == 'darwin' and platform.processor() == 'i386'): + pass # Disable the thread startup. This causes problems on linux (segfaults). Must find better fix, perhaps win-only. + # pygame.display.update() + else: + self.is_paused = True + t0 = t1 + time.sleep(refresh_interval_seconds/100) + + self.refresh_thread = threading.Thread(target=refresh_window, args=(self, )) + self.refresh_thread.start() + + def close(self): + self.demand_termination.set() + self.refresh_thread.join(timeout=1000) + pygame.display.quit() + pygame.quit() + # TH 2023: These two lines are super important. + # pdraw cache the fonts. So when pygame is loaded/quites, + # the font cache is not flushed. This is not a problem + # when determining the width of strings the font has seen, + # but causes a segfault with NEW strings. 
+ from irlc.utils import ptext + ptext._font_cache = {} + self.isopen = False + + def render(self): + pass + + def blit(self, render_mode=None): + self.render() + self.screen.blit(self.surf, (0, 0)) + if render_mode == "human": + tc = time.time() + + if self.frames_per_second is not None: + + if tc - self.time_last_blit < 1/self.frames_per_second: + tw = 1/self.frames_per_second - (tc - self.time_last_blit ) + time.sleep(tw) + else: + tw = 0 + + self.time_last_blit = tc + + pygame.event.pump() + pygame.display.flip() + elif render_mode == "rgb_array": + return np.transpose(np.array(pygame.surfarray.pixels3d(self.screen)), axes=(1, 0, 2)) + + def rectangle(self, color, x, y, width, height, border=0, fill_color=None): + x2,y2 = self.fixxy((x+width, y+height)) + x, y = self.fixxy((x,y)) + + c1 = min([x, x2]) + c2 = min([y, y2]) + + w = abs(x-x2) + h = abs(y - y2) + + pygame.draw.rect(self.surf, color, pygame.Rect( int(c1), int(c2), int(w), int(h)), border) + + + def draw_background(self, background_color=None): + if background_color is None: + background_color = (0, 0, 0) + self._bg_color = background_color + x1, x2, y1, y2 = self._local_xmin_xmax_ymin_ymax + corners = [ (x1, y1), (x2, y1), (x2, y2), (x1, y2) ] + self.surf = pygame.Surface((self.screen_width, self.screen_height)) + self.polygon(name="background", coords=corners, outlineColor=self._bg_color, fillColor=self._bg_color, filled=True, smoothed=False) + + def fixxy(self, xy): + x,y = xy + x = (x - self._local_xmin_xmax_ymin_ymax[0]) / (self._local_xmin_xmax_ymin_ymax[1] - self._local_xmin_xmax_ymin_ymax[0]) * self.screen.get_width() + y = (y - self._local_xmin_xmax_ymin_ymax[2]) / (self._local_xmin_xmax_ymin_ymax[3] - self._local_xmin_xmax_ymin_ymax[2]) * self.screen.get_height() + return int(x), int(y) + + + def plot(self, name, x, y, color=None, width=1.0): + coords = [(x_,y_) for (x_, y_) in zip(x,y)] + if color is None: + color = "#000000" + return self.polygon(name, coords, outlineColor=color, 
filled=False, width=width) + + def polygon(self, name, coords, outlineColor=None, fillColor=None, filled=True, smoothed=1, behind=0, width=1.0, closed=False): + c = [] + for coord in coords: + c.append(coord[0]) + c.append(coord[1]) + + coords = [self.fixxy(c) for c in coords] + if fillColor == None: fillColor = outlineColor + poly = None + if not filled: fillColor = "" + + c = [self.fixxy(tuple(c[i:i+2])) for i in range(0, len(c), 2)] + if not filled: + gfxdraw.polygon(self.surf, coords, h2rgb255(outlineColor)) + pygame.draw.polygon(self.surf, h2rgb255(outlineColor), coords, width=int(width)) + + else: + gfxdraw.filled_polygon(self.surf, coords, h2rgb255(fillColor)) + + if outlineColor is not None and len(outlineColor) > 0 and filled: # Not sure why this cannot be merged with the filled case... + # gfxdraw.polygon(self.surf, coords, h2rgb255(outlineColor), width=int(width)) + pygame.draw.polygon(self.surf, h2rgb255(outlineColor), coords, width=int(width)) + + return poly + + def square(self, name, pos, r, color, filled=1, behind=0): + x, y = pos + coords = [(x - r, y - r), (x + r, y - r), (x + r, y + r), (x - r, y + r)] + return self.polygon(name, coords, color, color, filled, 0, behind=behind) + + def centered_arc(self, color, pos, r, start_angle, stop_angle, width=1): + # Draw a centered arc (pygame defaults to boxed arcs) + x, y = pos + tt = np.linspace(start_angle / 360 * 2 * np.pi,stop_angle / 360 * 2 * np.pi, int(r * 10)) + px = np.cos(tt) * r + py = -np.sin(tt) * r + pp = list(zip(px.tolist(), py.tolist())) + + pp = [((x + a, y + b)) for (a, b) in pp] + # if style == 'arc': # For pacman. I guess this one makes the rounded wall segments. 
+ pp = [self.fixxy(p_) for p_ in pp] + + pygame.draw.lines(self.surf, h2rgb255(color), False, pp, width) + + def circle(self, name, pos, r, outlineColor=None, fillColor=None, endpoints=None, style='pieslice', width=2): + pos = self.fixxy(pos) + x, y = pos + if endpoints == None: + e = [0, 359] + else: + e = list(endpoints) + while e[0] > e[1]: e[1] = e[1] + 360 + if endpoints is not None and len(endpoints) > 0: + tt = np.linspace(e[0]/360 * 2*np.pi, e[-1]/360 * 2*np.pi, int(r*20) ) + px = np.cos(tt) * r + py = -np.sin(tt) * r + pp = list(zip(px.tolist(), py.tolist())) + if style == 'pieslice': + pp = [(0,0),] + pp + [(0,0),] + pp = [( (x+a, y+b)) for (a,b) in pp ] + if style == 'arc': # For pacman. I guess this one makes the rounded wall segments. + pp = [self.fixxy(p_) for p_ in pp] + pygame.draw.lines(self.surf, outlineColor, False, pp, width) + elif style == 'pieslice': + self.polygon(name, pp, fillColor=fillColor, outlineColor=outlineColor, width=width) + else: + raise Exception("bad style", style) + else: + gfxdraw.filled_circle(self.surf, x, y, int(r), h2rgb255(fillColor)) + + def text(self, name, pos, color, contents, font='Helvetica', size=12, style='normal', anchor="w", fontsize=24, + bold=False): + pos = self.fixxy(pos) + ax = "center" + ax = "left" if anchor == "w" else ax + ay = "center" + ay = "baseline" if anchor == "s" else ay + + from irlc.utils.ptext import draw + if anchor == 'w': + opts = dict(midleft=pos) + elif anchor == 'e': + opts = dict(midright=pos) + elif anchor == 's': + opts = dict(midbottom=pos) + elif anchor == 'n': + opts = dict(midtop=pos) + elif anchor == 'c': + opts = dict(center=pos) + else: + raise Exception("Unknown anchor", anchor) + opts['fontsize'] = fontsize + opts['bold'] = bold + draw(contents, surf=self.surf, color=h2rgb255(color), pos=pos, **opts) + return + + + def line(self, name, here, there, color=formatColor(0, 0, 0), width=2): + + here, there = self.fixxy(here), self.fixxy(there) + pygame.draw.line(self.surf, 
h2rgb255(color), here, there, width) + + def polyline(self, name, xs, ys, color=formatColor(0, 0, 0), width=2): + for i in range(len(xs) - 1): + self.line("asfasf", here=(xs[i] , ys[i]), + there=(xs[i + 1], ys[i + 1]), + color=color, width=width) + + +def rotate_around(pos, xy0, angle): + if isinstance(pos, list) and isinstance(pos[0], tuple): + return [rotate_around(p, xy0, angle) for p in pos] + return ((pos[0] - xy0[0]) * np.cos(angle / 180 * np.pi) - (pos[1] - xy0[1]) * np.sin(angle / 180 * np.pi) + xy0[0], + (pos[0] - xy0[0]) * np.sin(angle / 180 * np.pi) + (pos[1] - xy0[1]) * np.cos(angle / 180 * np.pi) + xy0[1]) + +class Object(pygame.sprite.Sprite): + def __init__(self, file, image_width=None, graphics=None): + super(Object, self).__init__() + fpath = os.path.dirname(__file__) +"/graphics/"+file + image = pygame.image.load(fpath).convert_alpha() + if image_width is not None: + image_height = int( image_width / image.get_width() * image.get_height() ) + self.og_surf = pygame.transform.smoothscale(image, (image_width, image_height)) + # raise Exception("Implement this") + else: + self.og_surf = image + # self.og_surf = pygame.transform.smoothscale(image, (100, 100)) + self.surf = self.og_surf + self.rect = self.surf.get_rect(center=(400, 400)) + self.ga = graphics + + def move_center_to_xy(self, x, y): + # Note: These are in the local coordinate system coordinates. + x,y = self.ga.fixxy((x,y)) + self.rect.center = (x,y) + + def rotate(self, angle): + """ Rotate sprite around it's center. 
""" + self.angle = angle + self.surf = pygame.transform.rotate(self.og_surf, self.angle) + self.rect = self.surf.get_rect(center=self.rect.center) + + def blit(self, surf): + surf.blit(self.surf, self.rect) + + +class UpgradedGraphicsUtil(GraphicsUtilGym): + def __init__(self, screen_width=800, screen_height=None, xmin=0., xmax=800., ymin=0., ymax=600., title="Gym window"): + if screen_height is None: + screen_height = np.abs( int(screen_width / (xmax - xmin) * (ymax-ymin)) ) + elif xmin is None: + xmin = 0 + xmax = screen_width + ymin = 0 + ymax = screen_height + else: + raise Exception() + self.begin_graphics(width=screen_width, height=screen_height, local_xmin_xmax_ymin_ymax=(xmin, xmax, ymin, ymax), title=title) + + def get_sprite(self, name): + """ Load a sprite from the graphics directory. """ + pass diff --git a/irlc/utils/irlc_plot.py b/irlc/utils/irlc_plot.py new file mode 100644 index 0000000000000000000000000000000000000000..fcbb498578a9b67468f0b86c9be1bb402d114dca --- /dev/null +++ b/irlc/utils/irlc_plot.py @@ -0,0 +1,266 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +import os +import numpy as np + +""" +Using the plotter: + +Call it from the command line, and supply it with logdirs to experiments. +Suppose you ran an experiment with name 'test', and you ran 'test' for 10 +random seeds. The runner code stored it in the directory structure + + data + L test_EnvName_DateTime + L 0 + L log.txt + L params.json + L 1 + L log.txt + L params.json + . + . + . + L 9 + L log.txt + L params.json + +To plot learning curves from the experiment, averaged over all random +seeds, call + + python lmpc_plot.py data/test_EnvName_DateTime --value AverageReturn + +and voila. To see a different statistics, change what you put in for +the keyword --value. You can also enter /multiple/ values, and it will +make all of them in order. 
+
+
+Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried
+a different set of hyperparameters from 'test1', and now you would like
+to compare them -- see their learning curves side-by-side. Just call
+
+    python irlc_plot.py data/test1 data/test2
+
+and it will plot them both! They will be given titles in the legend according
+to their log directories. If you want to use custom legend titles, use
+the --legend flag and then provide a title for each logdir.
+
+"""
+
+def plot_data(data, y="accumulated_reward", x="Episode", ci=95, estimator='mean', **kwargs):
+    """Plot column ``y`` against ``x`` as one line per ``Condition``.
+
+    :param data: A data frame, or a list of data frames which are concatenated
+    :param y: Column plotted on the y-axis
+    :param x: Column plotted on the x-axis
+    :param ci: Size of the confidence interval in percent; ``None`` disables the error band
+    :param estimator: Aggregation across runs, passed to ``seaborn.lineplot``
+    :param kwargs: Additional arguments passed to ``seaborn.lineplot``
+    """
+    import seaborn as sns
+    import matplotlib.pyplot as plt
+    import pandas as pd
+    if isinstance(data, list):
+        data = pd.concat(data, ignore_index=True, axis=0)
+    plt.figure(figsize=(12, 6))
+    sns.set(style="darkgrid", font_scale=1.5)
+    # Use the requested interval size (it was previously hard-coded to 95).
+    errorbar = ('ci', ci) if ci is not None else None
+    sns.lineplot(data=data, x=x, y=y, hue="Condition", errorbar=errorbar, estimator=estimator, **kwargs)
+    plt.legend(loc='best')
+
+def existing_runs(experiment):
+    """Count the directories below ``experiment`` which contain a ``log.txt`` file."""
+    return sum(1 for _root, _dirs, files in os.walk(experiment) if 'log.txt' in files)
+
+def _get_most_recent_log_dir(fpath):
+    """Return the (alphabetically) last directory name below ``fpath`` containing ``log.txt``, or ``None``."""
+    names = [os.path.basename(root) for root, _dirs, files in os.walk(fpath) if 'log.txt' in files]
+    return max(names, default=None)
+
+def get_datasets(fpath, x, condition=None, smoothing_window=None, resample_key=None, resample_ticks=None, only_most_recent=False):
+    """Load every ``log.txt`` found below ``fpath`` into a data frame.
+
+    Each frame gets a ``Unit`` column (index of the run) and a ``Condition`` column of the form
+    ``"(<number of runs>x)<last path component of condition>"``.
+
+    :param fpath: Directory to search recursively
+    :param x: Column which is exempt from smoothing
+    :param condition: Legend title; defaults to ``fpath``
+    :param smoothing_window: If set, window of a rolling mean applied to all other columns
+    :param resample_key: If set, all runs are linearly interpolated onto a common grid of this column
+    :param resample_ticks: Upper bound on the number of grid points used when resampling
+    :param only_most_recent: Only load the run found by :func:`_get_most_recent_log_dir`
+    :return: A list of data frames, one per run
+    """
+    import json
+    import pandas as pd
+    unit = 0
+    if condition is None:
+        condition = fpath
+    datasets = []
+
+    most_recent = _get_most_recent_log_dir(fpath) if only_most_recent else None
+
+    for root, _dirs, files in os.walk(fpath):
+        if 'log.txt' not in files:
+            continue
+        if only_most_recent and most_recent is not None and os.path.basename(root) != most_recent:  # Skip this log.
+            continue
+        params_path = os.path.join(root, 'params.json')
+        if os.path.exists(params_path):
+            # The path must not be called `json`: that name is needed for the module.
+            with open(params_path) as f:
+                params = json.load(f)  # Currently unused (could supply e.g. params['exp_name']).
+
+        log_path = os.path.join(root, 'log.txt')
+        if os.stat(log_path).st_size == 0:
+            print("Bad plot file", log_path, "size is zero. Skipping")
+            continue
+        experiment_data = pd.read_table(log_path)
+
+        if smoothing_window:
+            ed_x = experiment_data[x]
+            experiment_data = experiment_data.rolling(smoothing_window, min_periods=1).mean()
+            experiment_data[x] = ed_x  # The x-axis itself is not smoothed.
+
+        experiment_data.insert(len(experiment_data.columns), 'Unit', unit)
+        experiment_data.insert(len(experiment_data.columns), 'Condition', condition)
+
+        datasets.append(experiment_data)
+        unit += 1
+
+    nc = f"({unit}x)"+condition[condition.rfind("/")+1:]
+    datasets = [d.assign(Condition=nc) for d in datasets]
+
+    if resample_key is not None and datasets:
+        nmax = 0
+        vmax = -np.inf
+        vmin = np.inf
+        for d in datasets:
+            nmax = max(d.shape[0], nmax)
+            vmax = max(d[resample_key].max(), vmax)
+            vmin = min(d[resample_key].min(), vmin)
+        if resample_ticks is not None:
+            nmax = min(resample_ticks, nmax)
+
+        new_datasets = []
+        tnew = np.linspace(vmin + 1e-6, vmax - 1e-6, nmax)
+        for d in datasets:
+            nd = {}
+            for c in d.columns.tolist():
+                if c == resample_key:
+                    y = tnew
+                elif d[c].dtype == 'O':
+                    # Non-numeric columns (such as Condition) are constant within a run.
+                    y = [d[c].iloc[0]] * len(tnew)
+                else:
+                    # NOTE(review): points outside this run's range become NaN before the cast
+                    # back to the column dtype; for integer columns the result looks undefined.
+                    y = np.interp(tnew, d[resample_key].tolist(), d[c], left=np.nan, right=np.nan)
+                    y = y.astype(d[c].dtype)
+                nd[c] = y
+
+            ndata = pd.DataFrame(nd)
+            ndata = ndata.dropna()
+            new_datasets.append(ndata)
+        datasets = new_datasets
+    return datasets
+
+
+def _load_data(experiments, legends=None, smoothing_window=None, resample_ticks=None,
+               x_key="Episode",
+               only_most_recent=False):
+    """Load the runs of one or more experiment directories; see :func:`get_datasets`.
+
+    ``experiments`` and ``legends`` may be single values or lists; legends default to the directories.
+    """
+    ensure_list = lambda x: x if isinstance(x, list) else [x]
+    experiments = ensure_list(experiments)
+    if legends is None:
+        legends = experiments
+    legends = ensure_list(legends)
+
+    data = []
+    for logdir, legend_title in zip(experiments, legends):
+        resample_key = x_key if resample_ticks is not None else None
+        data += get_datasets(logdir, x=x_key, condition=legend_title, smoothing_window=smoothing_window, resample_key=resample_key, resample_ticks=resample_ticks,
+                             only_most_recent=only_most_recent)
+    return data
+
+def main_plot(experiments, legends=None, smoothing_window=None, resample_ticks=None,
+              x_key="Episode",
+              y_key='Accumulated Reward',
+              no_shading=False,
+              **kwargs):
+    """Load the given experiments and plot ``y_key`` against ``x_key``.
+
+    :param experiments: A log directory or a list of log directories
+    :param legends: Legend title(s), one per experiment; defaults to the directories
+    :param smoothing_window: Optional rolling-mean window
+    :param resample_ticks: If set, runs are resampled to at most this many points along ``x_key``
+    :param x_key: Column on the x-axis
+    :param y_key: Column on the y-axis
+    :param no_shading: Plot every run as its own line instead of a mean with an error band
+    :param kwargs: Passed on to :func:`plot_data`
+    """
+    if no_shading:
+        kwargs['units'] = 'Unit'
+        kwargs['estimator'] = None
+
+    data = _load_data(experiments, legends=legends, smoothing_window=smoothing_window,
+                      resample_ticks=resample_ticks, x_key=x_key)
+    plot_data(data, y=y_key, x=x_key, **kwargs)
+
+
+def main():
+    """Command-line entry point: make one figure per ``--value`` for the given log directories."""
+    import argparse
+    import matplotlib.pyplot as plt
+    parser = argparse.ArgumentParser()
+    parser.add_argument('logdir', nargs='*')
+    parser.add_argument('--legend', nargs='*')
+    parser.add_argument('--value', default='AverageReturn', nargs='*')
+    parser.add_argument('--title', default="please specify title", help="The title to show")
+    parser.add_argument('--pdf_name', default=None, help="Name of pdf")
+
+    args = parser.parse_args()
+    # --value is a plain string when defaulted and a list when given on the command line.
+    values = args.value if isinstance(args.value, list) else [args.value]
+    for value in values:
+        # The value is the column to plot (y_key); the title belongs to the figure, not to lineplot.
+        main_plot(args.logdir, args.legend, y_key=value)
+        plt.title(args.title)
+    # NOTE(review): --pdf_name is parsed but not used.
+    plt.show()
+
+if __name__ == "__main__":
+    main()
+
+
+#### TRAJECTORY PLOTTING HERE ####
+def plot_trajectory(trajectory, env=None, xkeys=None, ukeys=None):
+    """
+    Used to visualize trajectories returned from the :func:`~irlc.ex01.agent.train`-function. An example:
+
+    .. plot::
+        :include-source:
+
+        import matplotlib.pyplot as plt
+        import numpy as np
+        from irlc import Agent, plot_trajectory, train
+        from irlc.ex04.model_pendulum import GymSinCosPendulumEnvironment
+        env = GymSinCosPendulumEnvironment()
+        stats, trajectories = train(env, Agent(env), num_episodes=1, return_trajectory=True)
+        plot_trajectory(trajectories[0], env)
+
+    Labels will be derived from the ``env`` if supplied. The parameters ``xkeys`` and ``ukeys`` can be used to limit which
+    coordinates are plotted. For instance, if you only want to plot the first two x-coordinates you can set ``xkeys=[0,1]``:
+
+
+    .. plot::
+
+        import matplotlib.pyplot as plt
+        import numpy as np
+        from irlc import Agent, plot_trajectory, train
+        from irlc.ex04.model_pendulum import GymSinCosPendulumEnvironment
+        env = GymSinCosPendulumEnvironment()
+        stats, trajectories = train(env, Agent(env), num_episodes=1, return_trajectory=True)
+        plot_trajectory(trajectories[0], env, xkeys=[0,1], ukeys=[])
+
+    :param trajectory: A single trajectory computed using ``train`` (see example above)
+    :param env: A gym control environment (optional)
+    :param xkeys: List of integers corresponding to the coordinates of :math:`x` we wish to plot
+    :param ukeys: List of integers corresponding to the coordinates of :math:`u` we wish to plot
+
+    .. tip::
+        If the plot does not show, you might want to import matplotlib as ``import matplotlib.pyplot as plt`` and call ``plt.show()``
+    """
+    if xkeys is None:
+        xkeys = list(range(trajectory.state.shape[1]))
+    if ukeys is None:  # all
+        ukeys = list(range(trajectory.action.shape[-1]))
+    import seaborn as sns
+    import matplotlib.pyplot as plt
+    plt.figure(figsize=(12, 6))
+    sns.set(style="darkgrid", font_scale=1.5)
+
+    def fp(time, X, keys, labels):
+        # One line per selected coordinate; labels are only available when an env was given.
+        for k in keys:
+            label = labels[k] if labels is not None else None
+            sns.lineplot(x=time, y=X[:, k], label=label)
+
+    time = trajectory.time.squeeze()
+    fp(time, trajectory.state, xkeys, labels=env.state_labels if env is not None else None)
+    # The action array is plotted against all but the last time point.
+    fp(time[:-1], trajectory.action, ukeys, labels=env.action_labels if env is not None else None)
+    plt.xlabel("Time / seconds")
+    if env is not None:
+        plt.legend()
diff --git a/irlc/utils/lazylog.py b/irlc/utils/lazylog.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b1fdb8c87320192abd2d022df45a2c37b8bfaf4
--- /dev/null
+++ b/irlc/utils/lazylog.py
@@ -0,0 +1,140 @@
+# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
+"""
+Inspired by logz from Berkeley's deep RL course but re-written as a context manager like God intended.
+
+To load the learning curves, you can do, for example
+
+A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True)
+A['EpRewMean']
+
+"""
+import json
+import os
+import time
+from datetime import datetime, timezone
+
+color2num = dict(
+    gray=30,
+    red=31,
+    green=32,
+    yellow=33,
+    blue=34,
+    magenta=35,
+    cyan=36,
+    white=37,
+    crimson=38)
+
+
+def colorize(string, color, bold=False, highlight=False):
+    """Wrap ``string`` in ANSI escape codes for ``color`` (a key of ``color2num``)."""
+    attr = []
+    num = color2num[color]
+    if highlight: num += 10
+    attr.append(str(num))
+    if bold: attr.append('1')
+    return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string)
+
+
+class LazyLog(object):
+    """Context manager which writes tabular diagnostics to ``<experiment_name>/<run_name>/log.txt``.
+
+    Call :meth:`log_tabular` once per quantity and :meth:`dump_tabular` once per iteration.
+    """
+    output_dir = None
+    output_file = None
+    first_row = True
+    log_headers = []
+    log_current_row = {}
+
+    def __init__(self, experiment_name, run_name=None, data=None):
+        """
+        :param experiment_name: Directory of the experiment
+        :param run_name: Name of the sub-directory of this run; defaults to the current UTC time
+        :param data: Optional parameters which are saved to ``params.json``
+        """
+        # Per-instance containers: the class-level list/dict would be shared by all loggers, so a
+        # second logger would inherit (and duplicate) the headers of the first one.
+        self.log_headers = []
+        self.log_current_row = {}
+        if run_name is None:
+            experiment_name += "/" + datetime.now(timezone.utc).strftime("%Y-%m-%d_%H-%M-%S.%f")[:-3]
+        else:
+            experiment_name += "/" + run_name
+        self.experiment_name = experiment_name
+        configure_output_dir(self, experiment_name)
+        if data is not None:
+            self.save_params(data)
+
+    def __enter__(self):
+        return self
+
+    def save_params(self, data):
+        """Write ``data`` as JSON to ``params.json`` in the output directory."""
+        save_params(self, data)
+
+    def dump_tabular(self, verbose=False):
+        """Write the values logged in the current iteration as one row of ``log.txt``."""
+        dump_tabular(self, verbose)
+
+    def log_tabular(self, key, value):
+        """Record ``value`` under ``key`` for the current iteration."""
+        log_tabular(self, key, value)
+
+    def __exit__(self, type, value, traceback):
+        self.output_file.close()
+
+
+def configure_output_dir(G, d=None):
+    """
+    Set output directory to d, or to /tmp/experiments/<current time> if d is None.
+    The directory must not exist already; ``log.txt`` is opened for writing inside it.
+    """
+    G.first_row = True
+    G.output_dir = d or "/tmp/experiments/%i" % int(time.time())
+    assert not os.path.exists(
+        G.output_dir), "Log dir %s already exists! Delete it first or use a different dir" % G.output_dir
+    os.makedirs(G.output_dir)
+    G.output_file = open(os.path.join(G.output_dir, "log.txt"), 'w')
+    print(colorize("Logging data to %s" % G.output_file.name, 'green', bold=True))
+
+def log_tabular(G, key, val):
+    """
+    Log a value of some diagnostic
+    Call this once for each diagnostic quantity, each iteration
+    """
+    if G.first_row:
+        G.log_headers.append(key)
+    else:
+        assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration" % key
+    assert key not in G.log_current_row, "You already set %s this iteration. Maybe you forgot to call dump_tabular()" % key
+    G.log_current_row[key] = val
+
+
+def save_params(G, params):
+    """Write ``params`` as JSON (keys sorted) to ``params.json`` in the output directory of ``G``."""
+    with open(os.path.join(G.output_dir, "params.json"), 'w') as out:
+        out.write(json.dumps(params, separators=(',\n', '\t:\t'), sort_keys=True))
+
+
+def dump_tabular(G, verbose=True):
+    """
+    Write all of the diagnostics from the current iteration
+    """
+    vals = []
+    # Including 15 in the list keeps max() well-defined when nothing has been logged yet.
+    max_key_len = max([15] + [len(key) for key in G.log_headers])
+    keystr = '%' + '%d' % max_key_len
+    fmt = "| " + keystr + "s | %15s |"
+    n_slashes = 22 + max_key_len
+    if verbose:
+        print("-" * n_slashes)
+    for key in G.log_headers:
+        val = G.log_current_row.get(key, "")
+        if hasattr(val, "__float__"):
+            valstr = "%8.3g" % val
+        else:
+            valstr = val
+        if verbose:
+            print(fmt % (key, valstr))
+        vals.append(val)
+    if verbose:
+        print("-" * n_slashes)
+    if G.output_file is not None:
+        if G.first_row:
+            # The header line is written once, together with the first row.
+            G.output_file.write("\t".join(G.log_headers))
+            G.output_file.write("\n")
+        G.output_file.write("\t".join(map(str, vals)))
+        G.output_file.write("\n")
+        G.output_file.flush()
+    G.log_current_row.clear()
+    G.first_row = False
diff --git a/irlc/utils/minigrid.py b/irlc/utils/minigrid.py
new file mode 100644
index 0000000000000000000000000000000000000000..3498ea1fcfbd85f401caa416d2de9db3b8e9e74e
--- /dev/null
+++ b/irlc/utils/minigrid.py
@@ -0,0 +1,102 @@
+# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
+import gymnasium as gym
+from gymnasium.spaces.discrete import Discrete
+from minigrid.core.constants import OBJECT_TO_IDX, COLOR_TO_IDX
+from minigrid.wrappers import FullyObsWrapper
+import numpy as np
+
+
+class ProjectObservationSpaceWrapper(gym.core.ObservationWrapper):
+    """
+    Keep only the channels ``dims`` (indices along the last axis) of the ``image`` observation.
+    """
+    def __init__(self, env, dims):
+        super().__init__(env)
+        image_space = self.observation_space.spaces['image']
+        # Restrict the bounds of the image space to the selected channels.
+        image_space.high = image_space.high[:, :, dims]
+        image_space.low = image_space.low[:, :, dims]
+
+        self.observation_space.spaces['image'] = image_space
+        self.dims = dims
+
+    def observation(self, obs):
+        obs['image'] = obs['image'][:, :, self.dims]
+        return obs
+
+
+class SaneBoundsWrapper(gym.core.ObservationWrapper):
+    """
+    Tighten the upper bounds of the ``image`` space: channel 0 is bounded by the largest object
+    index, channel 1 by the largest color index and channel 2 by 3. Observations are unchanged.
+    """
+    def __init__(self, env):
+        super().__init__(env)
+        image_space = self.observation_space.spaces['image']
+        image_space.high[:, :, 0] = max(OBJECT_TO_IDX.values())
+        if image_space.high.shape[2] >= 2:
+            image_space.high[:, :, 1] = max(COLOR_TO_IDX.values())
+        if image_space.high.shape[2] >= 3:
+            image_space.high[:, :, 2] = 3
+        self.observation_space.spaces['image'] = image_space
+
+    def observation(self, obs):
+        return obs
+
+class HashableImgObsWrapper(gym.core.ObservationWrapper):
+    """
+    Use the image as the only observation output, no language/mission.
+    The image is returned as a flat tuple so that observations are hashable.
+    """
+
+    def __init__(self, env, dims=None):
+        super().__init__(env)
+        self.observation_space = env.observation_space.spaces['image']
+
+    def observation(self, obs):
+        return tuple(obs['image'].flat)
+
+
+class LinearSpaceWrapper(gym.core.ObservationWrapper):
+    """
+    Fully observable gridworld using a compact grid encoding
+    """
+    def __init__(self, env):
+        super().__init__(env)
+        shape = self.observation_space.spaces['image'].shape
+        # Number of values of each channel: object type, color, and 4 values for the third channel.
+        channel_sizes = (max(OBJECT_TO_IDX.values())+1, max(COLOR_TO_IDX.values())+1, 4)
+        # dtype=object: the alias np.object is no longer available in current NumPy releases.
+        npo = np.zeros(shape, dtype=object)
+        for i in range(shape[0]):
+            for j in range(shape[1]):
+                for k in range(shape[2]):
+                    if k >= len(channel_sizes):
+                        raise Exception("Bad k")
+                    npo[i, j, k] = Discrete(channel_sizes[k])
+        ospace = tuple(npo.flat)
+
+        # Cumulative sizes shifted by the first size, i.e. the first entry is 0.
+        sz = np.cumsum([o.n for o in ospace])
+        sz = sz - sz[0]
+        self.sz = sz
+        self.observation_space = ospace
+
+    def observation(self, obs):
+        s = obs['image'].reshape((obs['image'].size,))
+        return s
+
+
+if __name__ == "__main__":
+    """ Example use: """
+    env = gym.make("MiniGrid-Empty-5x5-v0")
+    env = FullyObsWrapper(env)  # use this
+    env = LinearSpaceWrapper(env)
+    s, info = env.reset()  # gymnasium returns (observation, info)
+    print(s)
+    # Use with for instance:
+    # agent = LinearSemiGradSarsa(env, gamma=1, epsilon=0.1, alpha=0.5)
diff --git a/irlc/utils/player_wrapper.py b/irlc/utils/player_wrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d6a0b35c88d17e43e3f6ff15ea22ad99caf7939
--- /dev/null
+++ b/irlc/utils/player_wrapper.py
@@ -0,0 +1,370 @@
+# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
from gymnasium import logger
from irlc.ex01.agent import Agent
import time
import sys
import gymnasium as gym
import os

# Optional GUI dependencies. They are guarded separately so that a missing pygame does not also disable
# matplotlib plotting: with a single shared guard, a failure of either import reset ``plt`` to None and
# could leave ``pygame`` undefined.
try:
    # matplotlib.use('TkAgg') was forced here at some point, but that backend apparently clashes with
    # scientific mode, so the default backend is used.
    import matplotlib.pyplot as plt
except ImportError as e:
    logger.warn('failed to set matplotlib backend, plotting will not work: %s' % str(e))
    plt = None

try:
    import pygame
except ImportError as e:
    logger.warn('failed to import pygame, keyboard interaction will not work: %s' % str(e))
    pygame = None


class AgentWrapper(Agent):
    """Wrap an agent so its behavior can be changed without touching the agent's own code.

    The wrapper stores the wrapped ``agent`` and the ``env`` it acts in, forwards :meth:`pi` and
    :meth:`train` to the wrapped agent, and resolves every other public attribute on the wrapped agent.
    Subclasses override :meth:`pi` / :meth:`train` to alter the behavior.

    .. note::

        Subclasses that override :meth:`__init__` should call ``super().__init__(agent, env)``.

    """
    def __init__(self, agent, env):
        """Store the wrapped ``agent`` and its ``env``. Note that ``Agent.__init__`` is not called."""
        self.agent = agent
        self.env = env

    def __getattr__(self, name):
        """Resolve attributes missing on the wrapper by looking them up on the wrapped agent.

        :raises AttributeError: for names starting with an underscore, and for ``agent`` itself.
        """
        if name.startswith('_'):
            raise AttributeError("attempted to get missing private attribute '{}'".format(name))
        if name == 'agent':
            # Only reached when ``self.agent`` was never set; forwarding would recurse without end.
            raise AttributeError("wrapper has no wrapped agent (attribute 'agent' is not set)")
        return getattr(self.agent, name)

    @classmethod
    def class_name(cls):
        """Return the name of the wrapper class."""
        return cls.__name__

    def pi(self, state, k, info=None):
        """Return the action chosen by the wrapped agent."""
        return self.agent.pi(state, k, info)

    def train(self, *args, **kwargs):
        """Forward a training step to the wrapped agent."""
        return self.agent.train(*args, **kwargs)

    def __str__(self):
        return '<{}{}>'.format(type(self).__name__, self.agent)

    def __repr__(self):
        return str(self)

    @property
    def unwrapped(self):
        """The ``unwrapped`` attribute of the wrapped agent."""
        return self.agent.unwrapped


PAUSE_KEY = ord('p')
SPACEBAR = "_SPACE_BAR_PRESSED_"


class PlayWrapperPygame(AgentWrapper):
    """Agent wrapper that lets a human choose actions with the keyboard through pygame events.

    Keys listed in ``keys_to_action`` select an action directly, the space bar plays the wrapped agent's
    action once, and ``p`` toggles autoplay (the wrapped agent's action is then used at every step).
    """
    def __init__(self, agent : Agent, env : gym.Env, keys_to_action=None, autoplay=False):
        """
        :param agent: The agent to wrap. Its ``train`` method is patched to render ``env`` after each call.
        :param env: The environment the agent acts in. It receives an ``agent`` attribute.
        :param keys_to_action: Dictionary mapping 1-tuples ``(key,)`` to actions. If None, it is obtained
            from ``get_keys_to_action()`` of the environment.
        :param autoplay: If True, start in autoplay mode.
        :raises AssertionError: if no key-to-action mapping is given and none can be found.
        """
        super().__init__(agent, env)
        if keys_to_action is None:
            keys_to_action = self._lookup_keys_to_action(env)
        self.keys_to_action = keys_to_action
        self.env = env
        self.human_wants_restart = False
        self.human_sets_pause = False
        self.human_agent_action = -1
        self.human_demand_autoplay = autoplay

        # Patch the wrapped agent's train method so the environment is re-rendered after every training step.
        wrapped_train = agent.train
        def train_(s, a, r, sp, done, info1, info2):
            wrapped_train(s, a, r, sp, done, info1, info2)
            env.render()

        agent.train = train_
        env.agent = agent

    @staticmethod
    def _lookup_keys_to_action(env):
        """Return the key-to-action mapping offered by ``env``, its wrapped env, or its unwrapped env."""
        # ``getattr`` with a default is used because a plain (non-wrapper) environment need not have ``.env``.
        for candidate in (env, getattr(env, 'env', None), getattr(env, 'unwrapped', None)):
            if hasattr(candidate, 'get_keys_to_action'):
                return candidate.get_keys_to_action()
        spec = getattr(env, 'spec', None)
        env_id = spec.id if spec is not None else type(env).__name__
        message = env_id + " does not have explicit key to action mapping, please specify one manually"
        print(message)
        # Raised explicitly (instead of ``assert False``) so the check is not stripped under ``python -O``.
        raise AssertionError(message)

    # space bar: 0x0020
    def key_press(self,key, mod):
        """Keyboard callback taking raw key codes; it is not invoked by the code visible in this module.

        Stores the selected action in ``self.human_agent_action`` (-1 when the key maps to no action).
        """
        if key == 0xff0d: self.human_wants_restart = True
        if key == PAUSE_KEY:
            self.human_demand_autoplay = not self.human_demand_autoplay
            a = -1
        else:
            a = self.keys_to_action.get((key,), -1)

        if a == -1 and hasattr(self.env, 'keypress'):
            self.env.keypress(key)

        if key == 0x0020:
            a = SPACEBAR
        self.human_agent_action = a

    def key_release(self,key, mod):
        """Keyboard release callback; does nothing."""
        pass

    def pi(self,state, k, info=None):
        """Block until an action is available from the keyboard (or autoplay) and return it.

        :param state: Current state, passed on to the wrapped agent.
        :param k: Current time step, passed on to the wrapped agent.
        :param info: Optional info dictionary; if it contains ``'mask'``, masked-out keyboard actions are replaced.
        :return: The action to take.
        """
        pi_action = super().pi(state, k, info) # make sure super class pi method is called in case it has side effects.
        a = None
        while True:
            for event in pygame.event.get():
                if event.type == pygame.QUIT:
                    if hasattr(self, 'env'):
                        self.env.close()
                    time.sleep(0.1)
                    pygame.display.quit()
                    time.sleep(0.1)
                    pygame.quit()
                    time.sleep(0.1)
                    sys.exit()

                if event.type == pygame.KEYDOWN:
                    if event.key == pygame.K_SPACE:
                        # Got space: play the wrapped agent's action once.
                        a = pi_action
                        break
                    elif (event.key,) in self.keys_to_action:
                        a = self.keys_to_action[(event.key,)]
                        # NOTE(review): indentation was lost in the flattened source; the ``else: break`` below is
                        # taken to pair with this mask check -- confirm against the upstream file.
                        if info is not None and 'mask' in info:
                            # Consider refactoring the environment later.
                            from irlc.utils.common import DiscreteTextActionSpace

                            if isinstance(self.env.action_space, DiscreteTextActionSpace):
                                aint = self.env.action_space.actions.index(a)
                            else:
                                aint = a

                            if info['mask'][aint] == 0:
                                # The action was masked. This means that this action is unavailable, and we should select another.
                                # The default is to select one of the available actions from the mask.
                                a = info['mask'].argmax()
                                if isinstance(self.env.action_space, DiscreteTextActionSpace):
                                    a = self.env.action_space.actions[a]
                        else:
                            break
                    elif event.unicode == 'p':
                        # Toggle autoplay.
                        self.human_demand_autoplay = not self.human_demand_autoplay
                        break
                    else:
                        # Try to pass the event on to the game.
                        if hasattr(self.env, 'keypress'):
                            self.env.keypress(event)

            if self.human_demand_autoplay:
                a = pi_action

            if a is not None:
                # We don't care whether the action is in the action space, except for Pacman game states,
                # where an unavailable action is replaced by "Stop". This is best effort: any failure
                # (e.g. the pacman module is not installed) leaves the action unchanged.
                try:
                    from irlc.pacman.gamestate import GameState
                    if isinstance(state, GameState):
                        if a not in state.A():
                            a = "Stop"
                except Exception:
                    pass

                return a
            time.sleep(0.1)


def interactive(env : gym.Env, agent: Agent, autoplay=False) -> (gym.Env, Agent):
    """
    This function is used for visualizations. It can

    - Allow you to input keyboard commands to an environment
    - Allow you to save results
    - Visualize reinforcement-learning agents in the gridworld environment.

    by adding a single extra line ``env, agent = interactive(env,agent)``.
    The following shows an example:

    >>> from irlc.gridworld.gridworld_environments import BookGridEnvironment
    >>> from irlc import train, Agent, interactive
    >>> env = BookGridEnvironment(render_mode="human", zoom=0.8) # Pass render_mode='human' for visualization.
    >>> env, agent = interactive(env, Agent(env)) # Make the environment interactive. Note that it needs an agent.
    >>> train(env, agent, num_episodes=2) # You can train and use the agent and environment as usual.
    >>> env.close()

    It also enables you to visualize the environment at a matplotlib figure or save it as a pdf file using ``env.plot()`` and ``env.savepdf('my_file.pdf')``.

    All demos and figures in the notes are made using this function.

    :param env: A gym environment (an instance of the ``Env`` class)
    :param agent: An agent (an instance of the ``Agent`` class)
    :param autoplay: Whether the simulation should be unpaused automatically
    :return: An environment and agent which have been slightly updated to make them interact with each other. You can use them as usual with the ``train``-function.
    """
    from PIL import Image # Let's put this one here in case we run the code in headless mode.

    agent = PlayWrapperPygame(agent, env, autoplay=autoplay)

    def render_rgb_frame():
        # Temporarily switch to 'rgb_array' rendering; the previous mode is restored even if render() fails.
        previous_mode = env.render_mode
        env.render_mode = 'rgb_array'
        try:
            return env.render()
        finally:
            env.render_mode = previous_mode

    def plot():
        im = Image.fromarray(render_rgb_frame())
        plt.imshow(im)
        plt.axis('off')
        plt.tight_layout()

    def savepdf(file):
        im = Image.fromarray(render_rgb_frame())
        # A '.png' name saves a png image; anything else is saved as pdf (a '.pdf' suffix is optional).
        if file.endswith(".png"):
            sf, fext = file[:-4], 'png'
        else:
            fext = 'pdf'
            sf = file[:-4] if file.endswith(".pdf") else file

        sf = f"{sf}.{fext}"
        dn = os.path.dirname(sf)
        if dn:
            os.makedirs(dn, exist_ok=True)
        print("Saving snapshot of environment to", os.path.abspath(sf))
        if fext == 'png':
            im.save(sf)
            from irlc import _move_to_output_directory
            _move_to_output_directory(sf)
        else:
            plt.figure(figsize=(16, 16))
            plt.imshow(im)
            plt.axis('off')
            plt.tight_layout()
            from irlc import savepdf as save_figure_as_pdf
            save_figure_as_pdf(sf, verbose=True)
            # NOTE(review): indentation was lost in the flattened source; plt.show() is assumed to belong
            # to this branch, where the figure is created -- confirm against the upstream file.
            plt.show()

    env.plot = plot
    env.savepdf = savepdf
    return env, agent


def main():
    """Demo: plot a gridworld environment, then interact with a random agent in it."""
    from irlc.gridworld.gridworld_environments import BookGridEnvironment
    from irlc import train, Agent

    # First example: plotting.
    env = BookGridEnvironment(render_mode="human", zoom=0.8) # Pass render_mode='human' for visualization.
    env, agent = interactive(env, Agent(env)) # Make the environment interactive. Note that it needs an agent.
    env.reset() # We always need to call reset
    env.plot() # Plot the environment.
    env.close()

    # Second example: interaction with a random agent.
    env = BookGridEnvironment(render_mode="human", zoom=0.8) # Pass render_mode='human' for visualization.
    env, agent = interactive(env, Agent(env)) # Make the environment interactive. Note that it needs an agent.
    train(env, agent, num_episodes=100) # You can train and use the agent and environment as usual.
    env.close()


if __name__ == "__main__":
    main()

# --- flattened-patch boundary, kept verbatim (commented out so this span remains valid Python) ---
# diff --git a/irlc/utils/ptext.py b/irlc/utils/ptext.py new file mode 100644 index 0000000000000000000000000000000000000000..c552f09bf66657ae5936d520c9b7997f5da33b83 --- /dev/null +++ b/irlc/utils/ptext.py @@ -0,0 +1,991 @@
# This file may not be shared/redistributed without permission.
# Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
# ptext module: place this in your import directory.

# ptext.draw(text, pos=None, **options)

# Please see README.md for explanation of options.
# https://github.com/cosmologicon/pygame-text

from __future__ import division, print_function

from math import ceil, sin, cos, radians, exp
from collections import namedtuple
import pygame

# Global default values
DEFAULT_FONT_SIZE = 24  # used when no fontsize is given
REFERENCE_FONT_SIZE = 100  # font size at which wrapping is computed when a width is given in em (widthem)
DEFAULT_LINE_HEIGHT = 1.0
DEFAULT_PARAGRAPH_SPACE = 0.0
DEFAULT_FONT_NAME = None
DEFAULT_SYSFONT_NAME = None
FONT_NAME_TEMPLATE = "%s"  # %-format template applied to a font name to obtain the font file path
DEFAULT_COLOR = "white"
DEFAULT_BACKGROUND = None
DEFAULT_SHADE = 0
DEFAULT_OUTLINE_WIDTH = None
DEFAULT_OUTLINE_COLOR = "black"
OUTLINE_UNIT = 1 / 24  # outline width in pixels is ceil(owidth * fontsize * OUTLINE_UNIT)
DEFAULT_SHADOW_OFFSET = None
DEFAULT_SHADOW_COLOR = "black"
SHADOW_UNIT = 1 / 18  # shadow offset in pixels is ceil(offset * fontsize * SHADOW_UNIT), per component
DEFAULT_ALIGN = "left"  # left, center, or right
DEFAULT_ANCHOR = 0, 0  # 0, 0 = top left ; 1, 1 = bottom right
DEFAULT_STRIP = True
ALPHA_RESOLUTION = 16  # alpha values below 1 are rounded to multiples of 1 / ALPHA_RESOLUTION
ANGLE_RESOLUTION_DEGREES = 3  # rotation angles are rounded to multiples of this many degrees
DEFAULT_UNDERLINE_TAG = None
DEFAULT_BOLD_TAG = None
DEFAULT_ITALIC_TAG = None
DEFAULT_COLOR_TAG = {}

# NOTE(review): the three settings below are not used in the part of the file reviewed here; presumably
# they control eviction of cached surfaces -- confirm against the cache-cleaning code.
AUTO_CLEAN = True
MEMORY_LIMIT_MB = 64
MEMORY_REDUCTION_FACTOR = 0.5

# Initialize the pygame font module as soon as this module is imported.
pygame.font.init()


# Options objects encapsulate the keyword arguments to functions that take a lot of optional keyword
# arguments.

# Options object base class. Subclass for Options objects specific to different functions.
# Specify valid fields in the _fields list. All keyword fields are optional. Unspecified fields
# default to None, unless otherwise specified in the _defaults list.
class _Options(object):
    """Bundle of the keyword arguments accepted by one ptext function.

    Subclasses list accepted names in ``_fields`` and per-name defaults in ``_defaults``. Every accepted
    name becomes an instance attribute; a name that is not passed takes its ``_defaults`` entry, or None.
    """
    _fields = ()
    _defaults = {}

    def __init__(self, **kwargs):
        """Store every accepted option as an attribute; raise ValueError for unrecognized names."""
        known = self._allfields()
        unknown = set(kwargs) - known
        if unknown:
            raise ValueError("Unrecognized args: " + ", ".join(unknown))
        for name in known:
            setattr(self, name, kwargs.get(name, self._defaults.get(name)))

    @classmethod
    def _allfields(cls):
        """Return the set of accepted option names (``_fields`` plus the keys of ``_defaults``)."""
        return set(cls._fields).union(cls._defaults)

    def asdict(self):
        """Return the current option values as a dictionary keyed by option name."""
        return dict((name, getattr(self, name)) for name in self._allfields())

    def copy(self):
        """Return a new options object of the same class built from the current values."""
        return type(self)(**self.asdict())

    def keys(self):
        """Return the accepted option names (together with __getitem__ this allows ``**options``)."""
        return self._allfields()

    def __getitem__(self, field):
        return getattr(self, field)

    def update(self, **newkwargs):
        """Return a new options object with the given values replaced; ``self`` is not modified."""
        merged = self.asdict()
        merged.update(newkwargs)
        return type(self)(**merged)

    # For cached function calls, this is a hashable representation of the options object. Assumes
    # that all field values are either hashable, or dicts whose keys are comparable and values are
    # hashable.
    def key(self):
        parts = []
        for name in sorted(self._allfields()):
            item = getattr(self, name)
            parts.append(tuple(sorted(item.items())) if isinstance(item, dict) else item)
        return tuple(parts)

    def getsuboptions(self, optclass):
        """Return a dict with the values of those options of ``optclass`` that this object also has."""
        wanted = optclass._allfields()
        return {name: getattr(self, name) for name in wanted if hasattr(self, name)}

    # The following methods are just put here for code deduplication. A couple different functions
    # use a lot of the same code.
    def resolvetags(self):
        """Replace tag options still set to the sentinel by the module-level default tags."""
        unset = _default_sentinel
        if self.underlinetag is unset: self.underlinetag = DEFAULT_UNDERLINE_TAG
        if self.boldtag is unset: self.boldtag = DEFAULT_BOLD_TAG
        if self.italictag is unset: self.italictag = DEFAULT_ITALIC_TAG
        if self.colortag is unset: self.colortag = DEFAULT_COLOR_TAG


# Used as the default value for any argument for which (1) None is a valid value, and (2) there's a
# global default value.
_default_sentinel = ()


# Options argument for the draw function. Specifies both text styling and positioning.
class _DrawOptions(_Options):
    _fields = ("pos",
               "fontname", "fontsize", "sysfontname", "antialias", "bold", "italic", "underline",
               "color", "background",
               "top", "left", "bottom", "right", "topleft", "bottomleft", "topright", "bottomright",
               "midtop", "midleft", "midbottom", "midright", "center", "centerx", "centery",
               "width", "widthem", "lineheight", "pspace", "strip", "align",
               "owidth", "ocolor", "shadow", "scolor", "gcolor", "shade",
               "alpha", "anchor", "angle",
               "underlinetag", "boldtag", "italictag", "colortag",
               "surf", "cache")
    _defaults = {
        "fontname": _default_sentinel,
        "sysfontname": _default_sentinel,
        "antialias": True, "alpha": 1.0, "angle": 0,
        "owidth": _default_sentinel,
        "shadow": _default_sentinel,
        "underlinetag": _default_sentinel,
        "boldtag": _default_sentinel,
        "italictag": _default_sentinel,
        "colortag": _default_sentinel,
        "surf": _default_sentinel, "cache": True}

    # (two-coordinate option, horizontal option it sets, vertical option it sets), in application order.
    _COMPOUND_POSITIONS = (
        ("topleft", "left", "top"),
        ("bottomleft", "left", "bottom"),
        ("topright", "right", "top"),
        ("bottomright", "right", "bottom"),
        ("midtop", "centerx", "top"),
        ("midleft", "left", "centery"),
        ("midbottom", "centerx", "bottom"),
        ("midright", "right", "centery"),
        ("center", "centerx", "centery"),
    )

    def __init__(self, **kwargs):
        _Options.__init__(self, **kwargs)
        self.expandposition()
        self.expandanchor()
        self.resolvesurf()

    # Expand each 2-element position specifier and overwrite the corresponding 1-element
    # position specifiers.
    def expandposition(self):
        for compound, xname, yname in self._COMPOUND_POSITIONS:
            value = getattr(self, compound)
            if value:
                xvalue, yvalue = value
                setattr(self, xname, xvalue)
                setattr(self, yname, yvalue)

    # Update the pos and anchor fields, if unspecified, to be specified by the positional
    # keyword arguments.
    def expandanchor(self):
        x, y = self.pos or (None, None)
        hanchor, vanchor = self.anchor or (None, None)
        # Later entries take precedence over earlier ones, and all of them over pos/anchor.
        for name, fraction in (("left", 0), ("centerx", 0.5), ("right", 1)):
            coordinate = getattr(self, name)
            if coordinate is not None:
                x, hanchor = coordinate, fraction
        for name, fraction in (("top", 0), ("centery", 0.5), ("bottom", 1)):
            coordinate = getattr(self, name)
            if coordinate is not None:
                y, vanchor = coordinate, fraction
        if x is None:
            raise ValueError("Unable to determine horizontal position")
        if y is None:
            raise ValueError("Unable to determine vertical position")
        self.pos = x, y

        # The alignment defaults to the horizontal anchor as it stands before the anchor default is applied.
        if self.align is None:
            self.align = hanchor
        self.anchor = (
            DEFAULT_ANCHOR[0] if hanchor is None else hanchor,
            DEFAULT_ANCHOR[1] if vanchor is None else vanchor,
        )

    # Unspecified surf values default to the display surface.
    def resolvesurf(self):
        if self.surf is _default_sentinel:
            self.surf = pygame.display.get_surface()

    def togetsurfoptions(self):
        """Return the subset of these options accepted by ``_GetsurfOptions``, as a dict."""
        return self.getsuboptions(_GetsurfOptions)


# Options for the layout function. By design, this has the same options as draw, although some of
# them are silently ignored.
class _LayoutOptions(_DrawOptions):
    """Options for the layout function: draw options with position, anchor, spacing and tags resolved."""

    def __init__(self, **kwargs):
        # _DrawOptions.__init__ is bypassed on purpose of the original design: surf is not resolved here.
        _Options.__init__(self, **kwargs)
        self.expandposition()
        self.expandanchor()
        if self.lineheight is None:
            self.lineheight = DEFAULT_LINE_HEIGHT
        if self.pspace is None:
            self.pspace = DEFAULT_PARAGRAPH_SPACE
        self.resolvetags()

    def towrapoptions(self):
        """Return the subset of these options accepted by ``_WrapOptions``, as a dict."""
        return self.getsuboptions(_WrapOptions)

    def togetfontoptions(self):
        """Return the subset of these options accepted by ``_GetfontOptions``, as a dict."""
        return self.getsuboptions(_GetfontOptions)


class _DrawboxOptions(_Options):
    """Options for drawing text into a box; font names and spacing are resolved on construction."""
    _fields = (
        "fontname", "sysfontname", "antialias", "bold", "italic", "underline",
        "color", "background",
        "lineheight", "pspace", "strip", "align",
        "owidth", "ocolor", "shadow", "scolor", "gcolor", "shade",
        "underlinetag", "boldtag", "italictag", "colortag",
        "alpha", "anchor", "angle", "surf", "cache")
    _defaults = {
        "fontname": _default_sentinel,
        "sysfontname": _default_sentinel,
        "antialias": True, "alpha": 1.0, "angle": 0, "anchor": (0.5, 0.5),
        "owidth": _default_sentinel,
        "shadow": _default_sentinel,
        "underlinetag": _default_sentinel,
        "boldtag": _default_sentinel,
        "italictag": _default_sentinel,
        "colortag": _default_sentinel,
        "surf": _default_sentinel, "cache": True}

    def __init__(self, **kwargs):
        _Options.__init__(self, **kwargs)
        if self.fontname is _default_sentinel:
            self.fontname = DEFAULT_FONT_NAME
        if self.sysfontname is _default_sentinel:
            self.sysfontname = DEFAULT_SYSFONT_NAME
        if self.lineheight is None:
            self.lineheight = DEFAULT_LINE_HEIGHT
        if self.pspace is None:
            self.pspace = DEFAULT_PARAGRAPH_SPACE

    def todrawoptions(self):
        """Return the subset of these options accepted by ``_DrawOptions``, as a dict."""
        return self.getsuboptions(_DrawOptions)

    def tofitsizeoptions(self):
        """Return the subset of these options accepted by ``_FitsizeOptions``, as a dict."""
        return self.getsuboptions(_FitsizeOptions)


class _GetsurfOptions(_Options):
    """Options for getsurf; every styling value is resolved to its final form on construction."""
    _fields = ("fontname", "fontsize", "sysfontname", "bold", "italic", "underline", "width",
               "widthem", "strip", "color", "background", "antialias", "ocolor", "owidth", "scolor",
               "shadow", "gcolor", "shade", "alpha", "align", "lineheight", "pspace", "angle",
               "underlinetag", "boldtag", "italictag", "colortag", "cache")
    _defaults = {
        "fontname": _default_sentinel,
        "sysfontname": _default_sentinel,
        "antialias": True, "alpha": 1.0, "angle": 0,
        "owidth": _default_sentinel,
        "shadow": _default_sentinel,
        "underlinetag": _default_sentinel,
        "boldtag": _default_sentinel,
        "italictag": _default_sentinel,
        "colortag": _default_sentinel,
        "cache": True}

    def __init__(self, **kwargs):
        _Options.__init__(self, **kwargs)
        if self.fontname is _default_sentinel:
            self.fontname = DEFAULT_FONT_NAME
        if self.sysfontname is _default_sentinel:
            self.sysfontname = DEFAULT_SYSFONT_NAME

        # The font size is always stored as an integer.
        requested_size = DEFAULT_FONT_SIZE if self.fontsize is None else self.fontsize
        self.fontsize = int(round(requested_size))

        # Named alignments are converted to the fractions 0 (left), 0.5 (center) and 1 (right).
        if self.align is None:
            self.align = DEFAULT_ALIGN
        alignment_names = ["left", "center", "right"]
        if self.align in alignment_names:
            self.align = [0, 0.5, 1][alignment_names.index(self.align)]

        if self.lineheight is None:
            self.lineheight = DEFAULT_LINE_HEIGHT
        if self.pspace is None:
            self.pspace = DEFAULT_PARAGRAPH_SPACE

        self.color = _resolvecolor(self.color, DEFAULT_COLOR)
        self.background = _resolvecolor(self.background, DEFAULT_BACKGROUND)
        self.gcolor = _resolvecolor(self.gcolor, None)

        # A nonzero shade is turned into a gradient color, after which shade itself is reset to 0.
        if self.shade is None:
            self.shade = DEFAULT_SHADE
        if self.shade:
            self.gcolor = _applyshade(self.gcolor or self.color, self.shade)
            self.shade = 0

        self.resolveoutlineshadow()
        self.alpha = _resolvealpha(self.alpha)
        self.angle = _resolveangle(self.angle)
        if self.strip is None:
            self.strip = DEFAULT_STRIP
        self.resolvetags()

    def resolveoutlineshadow(self):
        """Resolve outline/shadow defaults and colors, and compute their pixel sizes (_opx, _spx)."""
        if self.owidth is _default_sentinel:
            self.owidth = DEFAULT_OUTLINE_WIDTH
        if self.shadow is _default_sentinel:
            self.shadow = DEFAULT_SHADOW_OFFSET
        has_outline = self.owidth is not None
        has_shadow = self.shadow is not None
        self.ocolor = _resolvecolor(self.ocolor, DEFAULT_OUTLINE_COLOR) if has_outline else None
        self.scolor = _resolvecolor(self.scolor, DEFAULT_SHADOW_COLOR) if has_shadow else None
        self._opx = ceil(self.owidth * self.fontsize * OUTLINE_UNIT) if has_outline else None
        self._spx = tuple(ceil(s * self.fontsize * SHADOW_UNIT) for s in self.shadow) if has_shadow else None

    def checkinline(self):
        """Raise ValueError if these options cannot be combined with inline styles."""
        incompatible = (
            self.angle is None
            or self._opx is not None
            or self._spx is not None
            or self.align != 0
            or self.gcolor
            or self.shade
        )
        if incompatible:
            raise ValueError(
                "Inline style not compatible with rotation, outline, drop shadow, gradient, or non-left-aligned text.")

    def towrapoptions(self):
        """Return the subset of these options accepted by ``_WrapOptions``, as a dict."""
        return self.getsuboptions(_WrapOptions)

    def togetfontoptions(self):
        """Return the subset of these options accepted by ``_GetfontOptions``, as a dict."""
        return self.getsuboptions(_GetfontOptions)


class _WrapOptions(_Options):
    """Options for wrapping text; the wrap width is given either in pixels (width) or in em (widthem)."""
    _fields = ("fontname", "fontsize", "sysfontname",
               "bold", "italic", "underline", "width", "widthem", "strip",
               "color",
               "underlinetag", "boldtag", "italictag", "colortag")
    _defaults = {
        "underlinetag": _default_sentinel,
        "boldtag": _default_sentinel,
        "italictag": _default_sentinel,
        "colortag": _default_sentinel,
    }

    def __init__(self, **kwargs):
        _Options.__init__(self, **kwargs)
        self.resolvetags()
        if self.widthem is not None:
            if self.width is not None:
                raise ValueError("Can't set both width and widthem")
            # A width in em is converted to pixels at the reference font size, which replaces fontsize.
            self.fontsize = REFERENCE_FONT_SIZE
            self.width = self.widthem * self.fontsize

        if self.strip is None:
            self.strip = DEFAULT_STRIP

    def togetfontoptions(self):
        """Return the subset of these options accepted by ``_GetfontOptions``, as a dict."""
        return self.getsuboptions(_GetfontOptions)


class _GetfontOptions(_Options):
    """Options identifying a font: a font file name or a system font name, plus size and style."""
    _fields = ("fontname", "fontsize", "sysfontname", "bold", "italic", "underline")
    _defaults = {
        "fontname": _default_sentinel,
        "sysfontname": _default_sentinel,
    }

    def __init__(self, **kwargs):
        _Options.__init__(self, **kwargs)
        if self.fontname is _default_sentinel:
            self.fontname = DEFAULT_FONT_NAME
        if self.sysfontname is _default_sentinel:
            self.sysfontname = DEFAULT_SYSFONT_NAME
        if not (self.fontname is None or self.sysfontname is None):
            raise ValueError("Can't set both fontname and sysfontname")
        if self.fontsize is None:
            self.fontsize = DEFAULT_FONT_SIZE

    def getfontpath(self):
        """Return the font file path built from fontname, or None if no fontname is set."""
        if self.fontname is None:
            return self.fontname
        return FONT_NAME_TEMPLATE % self.fontname


class _FitsizeOptions(_Options):
    """Options for finding the font size at which a text fits a box."""
    _fields = ("fontname", "sysfontname", "bold", "italic", "underline",
               "lineheight", "pspace", "strip",
               "underlinetag", "boldtag", "italictag", "colortag")
    _defaults = {
        "underlinetag": _default_sentinel,
        "boldtag": _default_sentinel,
        "italictag": _default_sentinel,
        "colortag": _default_sentinel,
    }

    def togetfontoptions(self):
        """Return the subset of these options accepted by ``_GetfontOptions``, as a dict."""
        return self.getsuboptions(_GetfontOptions)

    def towrapoptions(self):
        """Return the subset of these options accepted by ``_WrapOptions``, as a dict."""
        return self.getsuboptions(_WrapOptions)


# Fonts already created by getfont, keyed by the key() of their _GetfontOptions.
_font_cache = {}


def getfont(**kwargs):
    """Return the pygame font described by the keyword options, creating and caching it on first use.

    :raises IOError: if the font file cannot be read.
    """
    options = _GetfontOptions(**kwargs)
    key = options.key()
    try:
        return _font_cache[key]
    except KeyError:
        pass

    if options.sysfontname is None:
        path = options.getfontpath()
        try:
            font = pygame.font.Font(path, options.fontsize)
        except IOError:
            raise IOError("unable to read font filename: %s" % path)
    else:
        font = pygame.font.SysFont(options.sysfontname, options.fontsize, options.bold or False,
                                   options.italic or False)

    # Styles left at None keep whatever the freshly created font has.
    for value, apply_style in ((options.bold, font.set_bold),
                               (options.italic, font.set_italic),
                               (options.underline, font.set_underline)):
        if value is not None:
            apply_style(value)
    _font_cache[key] = font
    return font


# Return the largest integer in the range [xmin, xmax] such that f(x) is True.
def _binarysearch(f, xmin=1, xmax=256):
    """Binary search for the largest x in [xmin, xmax] with f(x) true.

    ``f`` is expected to be true up to some point and false afterwards. If ``f(xmin)`` is already false,
    ``xmin`` is returned anyway; if ``f(xmax)`` is true, ``xmax`` is returned.
    """
    if not f(xmin):
        return xmin
    if f(xmax):
        return xmax
    # Invariant: f(passing) is True and f(failing) is False.
    passing, failing = xmin, xmax
    while failing - passing > 1:
        middle = (failing + passing) // 2
        if f(middle):
            passing = middle
        else:
            failing = middle
    return passing


# Font sizes already computed by _fitsize, keyed by (text, size, options key).
_fit_cache = {}


def _fitsize(text, size, **kwargs):
    """Return the largest font size found by _binarysearch at which ``text`` fits in ``size``.

    :param text: The text to fit.
    :param size: ``(width, height)`` of the box; the text is wrapped at ``width``.
    :param kwargs: Options accepted by ``_FitsizeOptions``.
    """
    options = _FitsizeOptions(**kwargs)
    key = text, size, options.key()
    if key in _fit_cache:
        return _fit_cache[key]
    width, height = size

    def fits(fontsize):
        # Lay the text out at this font size and compare the extent of all spans with the box.
        opts = options.copy()
        widest, tallest = 0, 0
        for span in _wrap(text, fontsize=fontsize, width=width, **opts.towrapoptions()):
            top = span.font.get_linesize() * (opts.pspace * span.jpara + opts.lineheight * span.jline)
            span_height = span.font.size(span.text)[1]
            widest = max(widest, span.right)
            tallest = max(tallest, top + span_height)
        return widest <= width and tallest <= height

    fontsize = _binarysearch(fits)
    _fit_cache[key] = fontsize
    return fontsize


# Returns the color as a color RGB or RGBA tuple (i.e. 3 or 4 integers in the range 0-255)
# If color is None, fall back to the default. If default is also None, return None.
# Both color and default can be a list, tuple, a color name, an HTML color format string, a hex
# number string, or an integer pixel value. See pygame.Color constructor for specification.
def _resolvecolor(color, default):
    """Return ``color`` (or ``default`` if ``color`` is None) as a tuple, or None if both are None.

    The value is converted through ``pygame.Color``; if that raises ValueError, the value itself is
    turned into a tuple.
    """
    if color is None: color = default
    if color is None: return None
    try:
        return tuple(pygame.Color(color))
    except ValueError:
        return tuple(color)


def _applyshade(color, shade):
    """Return ``color`` with its first three components darkened (shade > 0) or lightened (shade < 0).

    Each of the first three components c becomes ``(c + 50) * exp(-0.4 * shade) - 50``, rounded and
    clamped to 0..255. Any further components (e.g. alpha) are kept unchanged.
    """
    f = exp(-0.4 * shade)
    r, g, b = [
        min(max(int(round((c + 50) * f - 50)), 0), 255)
        for c in color[:3]
    ]
    return (r, g, b) + tuple(color[3:])


def _resolvealpha(alpha):
    """Clamp ``alpha`` to at most 1 and at least 0, rounding values below 1 to steps of 1 / ALPHA_RESOLUTION."""
    if alpha >= 1:
        return 1
    return max(int(round(alpha * ALPHA_RESOLUTION)) / ALPHA_RESOLUTION, 0)


def _resolveangle(angle):
    """Normalize ``angle`` (degrees) to a multiple of ANGLE_RESOLUTION_DEGREES in the range [0, 360).

    A falsy angle (0 or None) gives 0.
    """
    if not angle:
        return 0
    angle %= 360
    rounded = int(round(angle / ANGLE_RESOLUTION_DEGREES)) * ANGLE_RESOLUTION_DEGREES
    # Rounding up can reach a full turn (e.g. 359.5 -> 360). Wrap it back to 0 so that it is treated as
    # "no rotation" instead of as a separate, truthy angle.
    return rounded % 360


# Return the set of points in the circle radius r, using Bresenham's circle algorithm
_circle_cache = {}


def _circlepoints(r):
    """Return the sorted list of integer (x, y) points on the circle of radius ``r`` around the origin.

    ``r`` is rounded to an integer; results are cached per radius.
    """
    r = int(round(r))
    if r in _circle_cache:
        return _circle_cache[r]
    x, y, e = r, 0, 1 - r
    _circle_cache[r] = points = []
    # First octant (x >= y >= 0) ...
    while x >= y:
        points.append((x, y))
        y += 1
        if e < 0:
            e += 2 * y - 1
        else:
            x -= 1
            e += 2 * (y - x) - 1
    # ... then mirrored across the diagonal and both axes, without duplicating points on the mirror lines.
    points += [(y, x) for x, y in points if x > y]
    points += [(-x, y) for x, y in points if x]
    points += [(x, -y) for x, y in points if y]
    points.sort()
    return points


# Rotate the given surface by the given angle, in degrees.
# If angle is an exact multiple of 90, use pygame.transform.rotate, otherwise fall back to
# pygame.transform.rotozoom.
def _rotatesurf(surf, angle):
    if angle in (90, 180, 270):
        return pygame.transform.rotate(surf, angle)
    else:
        return pygame.transform.rotozoom(surf, angle, 1.0)


# Apply the given alpha value to a copy of the Surface.
def _fadesurf(surf, alpha):
    surf = surf.copy()
    asurf = surf.copy()
    # Multiplying with white at the requested alpha scales only the alpha channel of the copy.
    asurf.fill((255, 255, 255, int(round(255 * alpha))))
    surf.blit(asurf, (0, 0), None, pygame.BLEND_RGBA_MULT)
    return surf


def _istransparent(color):
    """Return True if ``color`` has a fourth (alpha) component equal to 0."""
    return len(color) > 3 and color[3] == 0


# Produce a 1xh Surface with the given color gradient.
# Gradient surfaces already created by _gradsurf, keyed by its arguments.
_grad_cache = {}


def _gradsurf(h, y0, y1, color0, color1):
    """Return a cached 1 x h surface fading vertically from ``color0`` (at y0) to ``color1`` (at y1).

    Rows above y0 get ``color0`` and rows below y1 get ``color1``; every pixel is written with alpha 0.
    """
    key = h, y0, y1, color0, color1
    if key in _grad_cache:
        return _grad_cache[key]
    surf = pygame.Surface((1, h)).convert_alpha()
    r0, g0, b0 = color0[:3]
    r1, g1, b1 = color1[:3]
    channels = ((r0, r1), (g0, g1), (b0, b1))
    for y in range(h):
        f = min(max((y - y0) / (y1 - y0), 0), 1)
        g = 1 - f
        rgb = tuple(int(round(g * start + f * end)) for start, end in channels)
        surf.set_at((0, y), rgb + (0,))
    _grad_cache[key] = surf
    return surf


# Tracks everything that can be updated by tags.
class TagSpec(namedtuple("TagSpec", ["underline", "bold", "italic", "color"])):
    @staticmethod
    def fromoptions(options):
        """Build a TagSpec from the underline, bold, italic and color attributes of ``options``."""
        return TagSpec(options.underline, options.bold, options.italic, options.color)

    def updateoptions(self, options):
        """Write the four style values of this TagSpec onto ``options``."""
        for name in self._fields:
            setattr(options, name, getattr(self, name))

    def toggleunderline(self):
        """Return a copy with underline inverted."""
        return self._replace(underline=not self.underline)

    def togglebold(self):
        """Return a copy with bold inverted."""
        return self._replace(bold=not self.bold)

    def toggleitalic(self):
        """Return a copy with italic inverted."""
        return self._replace(italic=not self.italic)

    def setcolor(self, color):
        """Return a copy with the given color."""
        return self._replace(color=color)


# Splits a string into substrings with corresponding tag specs.
# Empty strings are skipped. Consecutive identical tag specs are not merged.
# e.g. if tagspec0.underline = False and underlinetag = "_" then:
# _splitbytags("_abc__def_ ghi_") yields three items:
# ("abc", TagSpec(underline=True))
# ("def", TagSpec(underline=True))
# (" ghi", TagSpec(underline=False))
def _splitbytags(text, tagspec0, color0, underlinetag, boldtag, italictag, colortag):
    colors = {tag: _resolvecolor(value, color0) for tag, value in colortag.items()}
    candidates = {underlinetag, boldtag, italictag}
    candidates.update(colors)
    candidates.discard(None)
    tags = sorted(candidates)
    if not tags:
        yield text, tagspec0
        return
    tagspec = tagspec0
    while text:
        present = [tag for tag in tags if tag in text]
        if not present:
            break
        # Earliest tag in the remaining text; ties are resolved by comparing the tag strings.
        start, tag = min((text.index(tag), tag) for tag in present)
        if start > 0:
            yield text[:start], tagspec
        text = text[start + len(tag):]
        # One tag string may be assigned to several roles, so every role is checked.
        if tag == underlinetag:
            tagspec = tagspec.toggleunderline()
        if tag == boldtag:
            tagspec = tagspec.togglebold()
        if tag == italictag:
            tagspec = tagspec.toggleitalic()
        if tag in colors:
            tagspec = tagspec.setcolor(colors[tag])
    if text:
        yield text, tagspec


# The _Span class tracks many attributes of a single span of text, i.e. a string of text within a
# single line that has a single font and TagSpec. That is, a single span corresponds to a single
# call to font.render.
# This is not a clean abstraction, and some of the state of this object only makes sense in the
# context of the overall draw call. At various stages of the call, some of the fields will not yet
# be populated.
class _Span:
    # Phase 1: set by _wrapline
    def __init__(self, text, tagspec, x, font):
        self.tagspec = tagspec
        self.x = x  # Offset from the beginning of the line
        self.font = font
        self.settext(text)

    # Phase 2: set by _wrap
    def setlayout(self, jpara, jline, linewidth):
        self.jpara, self.jline, self.linewidth = jpara, jline, linewidth

    # Phase 3: set by getsurf
    # These are not required to determine layout or position, only for rendering.
    def setdetails(self, antialias, gcolor, background):
        self.antialias, self.gcolor, self.background = antialias, gcolor, background

    def settext(self, text):
        """Set the text of the span and recompute its width and right edge."""
        self.text = text
        self.width = self.getwidth(text)
        self.right = self.x + self.width

    def getwidth(self, text):
        """Return the width of ``text`` in the font of this span."""
        return self.font.size(text)[0]

    def render(self):
        """Render the span into ``self.surf``, with a vertical gradient if gcolor is set."""
        if self.gcolor is not None:
            # Render in black and add the gradient colors on top of it.
            self.surf = self.font.render(self.text, self.antialias, (0, 0, 0)).convert_alpha()
            w, h = self.surf.get_size()
            asc = self.font.get_ascent()
            gsurf0 = _gradsurf(h, 0.5 * asc, asc, self.tagspec.color, self.gcolor)
            gsurf = pygame.transform.scale(gsurf0, (w, h))
            self.surf.blit(gsurf, (0, 0), None, pygame.BLEND_RGBA_ADD)
            return
        # Workaround: pygame.Font.render does not allow passing None as an argument value for
        # background. We have to call the 3-argument form to specify no background.
        args = [self.text, self.antialias, self.tagspec.color]
        if self.background is not None and not _istransparent(self.background):
            args.append(self.background)
        self.surf = self.font.render(*args).convert_alpha()


# Finds the last valid breakpoint in the line of text. A breakpoint is a position at which the line
# can be split without improperly breaking words.
# Returns (breaktext, breakpoint)
def _breaktext(text, width, font, canbreakatstart=False):
    # TODO: binary search
    # character -> (text added to the output, whether a break is allowed after it, text appended if
    # the line is broken there). Characters not listed are copied as they are and allow no break.
    special = {
        " ": (" ", True, ""),        # space
        "-": ("-", True, ""),        # hyphen
        "\u00A0": (" ", False, ""),  # non-breaking space: just add a space
        "\u2011": ("-", False, ""),  # non-breaking hyphen: just add a hyphen
        "\u200B": ("", True, ""),    # zero-width space: breakpoint, character removed
        "\u00AD": ("", True, "-"),   # soft hyphen: breakpoint that shows a hyphen when used
    }

    def isvalid(t):
        return width is None or font.size(t)[0] <= width

    # The text to be printed that actually comes from text. Does not include stripped characters,
    # e.g. soft hyphens, trailing or otherwise. Does include trailing spaces.
    btext = ""
    # Index of the first character in text that does not appear in btext.
    b = 0 if canbreakatstart else None
    # Any additional characters to be appended on return, i.e. hyphen generated by soft hyphens.
    bapp = ""
    # Partial buildup of btext.
    ptext = ""

    for j, char in enumerate(text):
        shown, atbreak, napp = special.get(char, (char, False, ""))
        ptext += shown
        if not atbreak:
            continue
        # The first breakpoint is always accepted; later ones only while the text still fits.
        if b is not None and not isvalid((ptext + napp).rstrip(" ")):
            break
        btext, b, bapp = ptext, j + 1, napp
    else:
        # One past the end of the line is always considered a breakpoint.
        if b is None or isvalid(ptext):
            return ptext, len(text)
    # Invalid breakpoint found. Take trailing spaces starting from the last valid breakpoint.
    while b < len(text) and text[b] == " ":
        b += 1
        bapp += " "
    return btext + bapp, b


# Split a single line of text.
# textandtags is the output of _splitbytags, i.e. a sequence of (string, tag spec) tuples.
def _wrapline(textandtags, width, getfontbytagspec):
    """Split a single line (paragraph) of tagged text into wrapped lines.

    :param textandtags: Sequence of ``(string, tag spec)`` tuples, i.e. the
        output of ``_splitbytags``.
    :param width: Maximum line width, or ``None`` for no wrapping.
    :param getfontbytagspec: Callable mapping a tag spec to the font used to
        measure and render text with that spec.
    :return: List of ``(spans, linewidth)`` tuples, one per wrapped line,
        where ``spans`` is a (possibly empty) list of ``_Span`` objects and
        ``linewidth`` is the accumulated width of those spans.
    """
    x = 0
    # A break at position 0 is only allowed once something is on the line;
    # otherwise an unbreakable word wider than the line would never be placed.
    canbreakatstart = False
    lines = []
    line = []
    for text, tagspec in textandtags:
        font = getfontbytagspec(tagspec)
        while text:
            # Width still available on the current line.
            rwidth = None if width is None else width - x
            btext, b = _breaktext(text, rwidth, font, canbreakatstart)
            if b == 0:
                # Nothing fits on the remainder of this line: start a new one.
                lines.append((line, x))
                line = []
                x = 0
                canbreakatstart = False
            else:
                span = _Span(btext, tagspec, x, font)
                line.append(span)
                x += span.width
                text = text[b:]
                canbreakatstart = True
    lines.append((line, x))
    return lines


def _wrap(text, **kwargs):
    """Break ``text`` into positioned spans according to the wrap options.

    Tabs are replaced by single spaces and the text is split into paragraphs
    at newlines. Each paragraph is split by tags and wrapped to
    ``options.width``. Lines that contain no spans still advance the line
    counter but contribute nothing to the result.

    :param text: The text to wrap; ``None`` is treated as the empty string.
    :param kwargs: Keyword options accepted by ``_WrapOptions``.
    :return: Flat list of spans, each with its layout (paragraph index, line
        index, line width) set. May be empty.
    """
    options = _WrapOptions(**kwargs)
    # Working copy of the options; each tag spec writes its settings into it
    # before the matching font is looked up.
    opts = options.copy()

    def getfontbytagspec(tagspec):
        tagspec.updateoptions(opts)
        return getfont(**opts.togetfontoptions())

    # Apparently Font.render accepts None for the text argument, in which case it's treated as the
    # empty string. We match that behavior here.
    if text is None:
        text = ""
    spans = []
    tagspec0 = TagSpec.fromoptions(options)
    jline = 0
    for jpara, para in enumerate(text.replace("\t", " ").split("\n")):
        if options.strip:
            para = para.rstrip(" ")
        tagargs = options.underlinetag, options.boldtag, options.italictag, options.colortag
        textandtags = list(_splitbytags(para, tagspec0, options.color, *tagargs))
        # Tag state carries over into the next paragraph.
        _, tagspec0 = textandtags[-1]
        for line, linewidth in _wrapline(textandtags, options.width, getfontbytagspec):
            if not line:
                jline += 1
                continue
            # Strip trailing spaces from the end of each line.
            span = line[-1]
            if options.strip:
                span.settext(span.text.rstrip(" "))
            elif options.width is not None:
                # Only trim as many spaces as needed to fit the width.
                # endswith() is used instead of text[-1] because the span
                # text can be (or become) empty, e.g. a span consisting only
                # of a zero-width space, which would raise IndexError.
                while span.text.endswith(" ") and span.right > options.width:
                    span.settext(span.text[:-1])
            linewidth = span.right
            for span in line:
                span.setlayout(jpara, jline, linewidth)
                spans.append(span)
            jline += 1
    return spans


# Rendered-surface cache, keyed by (text, options key).
_surf_cache = {}
# Tick at which each cached surface was last stored or returned.
_surf_tick_usage = {}
# Running estimate of cached surface memory in bytes (4 bytes per pixel).
_surf_size_total = 0
# Maps (rotated size, angle, text) to the size of the unrotated surface.
_unrotated_size = {}
# Monotonic counter incremented on every cache store or hit.
_tick = 0


def getsurf(text, **kwargs):
    """Return a surface with ``text`` rendered according to the options.

    Effects are applied by recursion, one at a time, in this order of
    precedence: rotation, alpha fade, drop shadow, outline. The innermost
    call renders the wrapped spans themselves. Results are memoised in the
    module-level surface cache when ``options.cache`` is true.

    :param text: The text to render.
    :param kwargs: Keyword options accepted by ``_GetsurfOptions``.
    :return: The rendered surface (possibly a shared, cached object).
    """
    global _tick, _surf_size_total
    options = _GetsurfOptions(**kwargs)
    key = text, options.key()
    if key in _surf_cache:
        _surf_tick_usage[key] = _tick
        _tick += 1
        return _surf_cache[key]

    if options.angle:
        surf0 = getsurf(text, **options.update(angle=0))
        surf = _rotatesurf(surf0, options.angle)
        # draw() requires the unrotated size for proper positioning, but the unrotated surface will
        # not necessarily be cached, so we add it to a global store here. In principle you could
        # compute it from surf.get_size() and options.angle, were it not for rounding issues.
        # NOTE(review): nothing in this block ever removes entries from _unrotated_size.
        _unrotated_size[(surf.get_size(), options.angle, text)] = surf0.get_size()
    elif options.alpha < 1.0:
        surf = _fadesurf(getsurf(text, **options.update(alpha=1.0)), options.alpha)
    elif options._spx is not None:
        # Drop shadow: render the text and a shadow-coloured copy separately,
        # then composite them with the shadow offset by _spx.
        color = (0, 0, 0) if _istransparent(options.color) else options.color
        surf0 = getsurf(text, **options.update(background=(0, 0, 0, 0), color=color, shadow=None, scolor=None))
        sopts = {
            "color": options.scolor,
            "shadow": None,
            "scolor": None,
            "background": (0, 0, 0, 0),
            "gcolor": None,
            "colortag": {k: None for k in options.colortag},
        }
        ssurf = getsurf(text, **options.update(**sopts))
        w0, h0 = surf0.get_size()
        sx, sy = options._spx
        surf = pygame.Surface((w0 + abs(sx), h0 + abs(sy))).convert_alpha()
        surf.fill(options.background or (0, 0, 0, 0))
        # A negative offset moves the text, not the shadow, away from the origin.
        dx, dy = max(sx, 0), max(sy, 0)
        surf.blit(ssurf, (dx, dy))
        x0, y0 = abs(sx) - dx, abs(sy) - dy
        if _istransparent(options.color):
            # Transparent text is cut out of the shadow instead of drawn over it.
            surf.blit(surf0, (x0, y0), None, pygame.BLEND_RGBA_SUB)
        else:
            surf.blit(surf0, (x0, y0))
    elif options._opx is not None:
        # Outline: stamp an outline-coloured copy at every offset returned by
        # _circlepoints, then draw the text in the middle.
        color = (0, 0, 0) if _istransparent(options.color) else options.color
        surf0 = getsurf(text, **options.update(color=color, ocolor=None, owidth=None))
        oopts = {
            "color": options.ocolor,
            "ocolor": None,
            "owidth": None,
            "background": (0, 0, 0, 0),
            "gcolor": None,
            "colortag": {k: None for k in options.colortag},
        }
        osurf = getsurf(text, **options.update(**oopts))
        w0, h0 = surf0.get_size()
        opx = options._opx
        surf = pygame.Surface((w0 + 2 * opx, h0 + 2 * opx)).convert_alpha()
        surf.fill(options.background or (0, 0, 0, 0))
        for dx, dy in _circlepoints(opx):
            surf.blit(osurf, (dx + opx, dy + opx))
        if _istransparent(options.color):
            surf.blit(surf0, (opx, opx), None, pygame.BLEND_RGBA_SUB)
        else:
            surf.blit(surf0, (opx, opx))
    else:
        # Each span is rendered separately into a Surface, and then the different spans' Surfaces
        # are blitted onto the final Surface.
        spans = _wrap(text, **options.towrapoptions())
        for span in spans:
            span.setdetails(options.antialias, options.gcolor, options.background)
            span.render()
        # Now to blit the span Surfaces together onto a single Surface. As an optimization, when
        # there is only one span Surface, just use that. (We can't use this optimization if there's
        # a gradient color, because the background color still needs to be applied.)
        if not spans:
            surf = pygame.Surface((0, 0)).convert_alpha()
        elif len(spans) == 1 and options.gcolor is None:
            surf = spans[0].surf
        else:
            font = spans[0].font
            w = max(span.linewidth for span in spans)
            linesize = font.get_linesize() * options.lineheight
            parasize = font.get_linesize() * options.pspace
            for span in spans:
                span.y = int(round(span.jline * linesize + span.jpara * parasize))
            h = max(span.y for span in spans) + font.get_height()
            surf = pygame.Surface((w, h)).convert_alpha()
            surf.fill(options.background or (0, 0, 0, 0))
            for span in spans:
                x = int(round(span.x + options.align * (w - span.linewidth)))
                surf.blit(span.surf, (x, span.y))
    if options.cache:
        w, h = surf.get_size()
        _surf_size_total += 4 * w * h
        _surf_cache[key] = surf
        _surf_tick_usage[key] = _tick
        _tick += 1
    return surf
def _blitpos(angle, pos, anchor, size, text):
    """Return the top-left position at which a text surface must be blitted.

    This is the actual position on the screen where the surf is to be
    blitted, rather than the specified anchor position.

    :param angle: Rotation angle, normalised through ``_resolveangle``.
    :param pos: ``(x, y)`` anchor position requested by the caller.
    :param anchor: ``(hanchor, vanchor)`` fractions of the text box that
        should coincide with ``pos``.
    :param size: ``(width, height)`` of the surface to blit.
    :param text: The rendered text; used together with ``size`` and the angle
        to look up the unrotated size recorded in ``_unrotated_size``.
    :return: Integer ``(x, y)`` blit position.
    :raises KeyError: If the angle is nonzero and no unrotated size has been
        recorded for this ``(size, angle, text)`` combination.
    """
    angle = _resolveangle(angle)
    x, y = pos
    sw, sh = size
    hanchor, vanchor = anchor
    if angle:
        w0, h0 = _unrotated_size[(size, angle, text)]
        S, C = sin(radians(angle)), cos(radians(angle))
        # Offset from the anchor to the centre of the unrotated box, rotated
        # by the angle, then shifted by half the rotated surface's size.
        dx, dy = (0.5 - hanchor) * w0, (0.5 - vanchor) * h0
        x += dx * C + dy * S - 0.5 * sw
        y += -dx * S + dy * C - 0.5 * sh
    else:
        x -= hanchor * sw
        y -= vanchor * sh
    x = int(round(x))
    y = int(round(y))
    return x, y


def layout(text, **kwargs):
    """Compute where each span of ``text`` would be drawn, without rendering.

    :param text: The text to lay out.
    :param kwargs: Keyword options accepted by ``_LayoutOptions``.
    :return: List of ``(span text, pygame.Rect, font)`` tuples, one per span.
        The list is empty when the text produces no spans (e.g. empty text).
    :raises ValueError: If a nonzero angle is requested.
    """
    options = _LayoutOptions(**kwargs)
    if options.angle != 0:
        raise ValueError("Nonzero angle not yet supported for ptext.layout")
    font = getfont(**options.togetfontoptions())
    fl = font.get_linesize()
    linesize = fl * options.lineheight
    parasize = fl * options.pspace

    spans = _wrap(text, **options.towrapoptions())
    # Text that yields no spans has nothing to lay out; without this guard
    # the max() calls below raise ValueError on an empty sequence.
    if not spans:
        return []

    rects = []
    sw = max(span.linewidth for span in spans)
    for span in spans:
        y = int(round(span.jpara * parasize + span.jline * linesize))
        rect = pygame.Rect(span.x, y, *font.size(span.text))
        # Horizontal alignment within the widest line.
        rect.x += int(round(options.align * (sw - span.linewidth)))
        rects.append(rect)
    sh = max(rect.bottom for rect in rects)

    x0, y0 = _blitpos(options.angle, options.pos, options.anchor, (sw, sh), None)

    # Adjust the rects as necessary to account for outline and shadow.
    # TODO: the following is duplicated from _GetsurfOptions.__init__
    dx, dy = 0, 0
    if options.owidth is not None:
        opx = ceil(options.owidth * options.fontsize * OUTLINE_UNIT)
        dx, dy = max(dx, abs(opx)), max(dy, abs(opx))
    if options.shadow is not None:
        spx, spy = (ceil(s * options.fontsize * SHADOW_UNIT) for s in options.shadow)
        # Only a negative shadow offset pushes the text away from the origin.
        dx, dy = max(dx, -spx), max(dy, -spy)
    rects = [rect.move(x0 + dx, y0 + dy) for rect in rects]

    return [(span.text, rect, span.font) for span, rect in zip(spans, rects)]


def draw(text, pos=None, **kwargs):
    """Render ``text`` and blit it onto the target surface, if one is set.

    :param text: The text to draw.
    :param pos: Anchor position, forwarded to ``_DrawOptions``.
    :param kwargs: Further keyword options accepted by ``_DrawOptions``.
    :return: ``(surface, blit position)`` for the rendered text.
    """
    options = _DrawOptions(pos=pos, **kwargs)
    tsurf = getsurf(text, **options.togetsurfoptions())
    pos = _blitpos(options.angle, options.pos, options.anchor, tsurf.get_size(), text)
    if options.surf is not None:
        options.surf.blit(tsurf, pos)
    if AUTO_CLEAN:
        clean()
    return tsurf, pos


def drawbox(text, rect, **kwargs):
    """Draw ``text`` at the font size chosen by ``_fitsize`` for ``rect``.

    :param text: The text to draw.
    :param rect: Anything accepted by ``pygame.Rect``; the target box.
    :param kwargs: Keyword options accepted by ``_DrawboxOptions``.
    :return: Whatever ``draw`` returns, i.e. ``(surface, blit position)``.
    """
    options = _DrawboxOptions(**kwargs)
    rect = pygame.Rect(rect)
    hanchor, vanchor = options.anchor
    # Anchor point inside the box, at the same fractions as the text anchor.
    x = rect.x + hanchor * rect.width
    y = rect.y + vanchor * rect.height
    fontsize = _fitsize(text, rect.size, **options.tofitsizeoptions())
    return draw(text, pos=(x, y), width=rect.width, fontsize=fontsize, **options.todrawoptions())


def clean():
    """Evict least-recently-used surfaces once the cache exceeds its limit.

    Does nothing while the estimated cache size is below
    ``MEMORY_LIMIT_MB``. Otherwise entries are removed, in order of oldest
    usage tick first, until the estimate drops below
    ``MEMORY_LIMIT_MB * MEMORY_REDUCTION_FACTOR``.
    """
    global _surf_size_total
    memory_limit = MEMORY_LIMIT_MB * (1 << 20)
    if _surf_size_total < memory_limit:
        return
    memory_limit *= MEMORY_REDUCTION_FACTOR
    # sorted() returns a new list, so deleting from the dict below is safe.
    keys = sorted(_surf_cache, key=_surf_tick_usage.get)
    for key in keys:
        w, h = _surf_cache[key].get_size()
        del _surf_cache[key]
        del _surf_tick_usage[key]
        _surf_size_total -= 4 * w * h
        if _surf_size_total < memory_limit:
            break
class Timer:
    """Accumulate wall-clock time spent in named sections of code.

    Typical use::

        timer = Timer(start=True)
        timer.tic("step")
        ...
        timer.toc()
        print(timer.display())

    Times are measured with ``datetime.datetime.now()``.
    """

    def __init__(self, show_time_per_tic=True, start=False):
        """Create a timer.

        :param show_time_per_tic: If true, ``display`` reports the average
            time per tic/toc pair of each section instead of its total.
        :param start: If true, start the overall clock immediately.
        """
        # Total seconds spent per section name.
        self.tspend = defaultdict(float)
        # Time of the most recent tic() per section name.
        self.t_start = {}
        # Number of completed tic/toc pairs per section name.
        self.n_tics = defaultdict(int)
        # Time at which start() was called, or None if never started.
        self.s_ = None
        # Name passed to the most recent tic(); used by toc() without a name.
        self.lst = None
        self.show_time_per_tic = show_time_per_tic
        if start:
            self.start()

    def start(self):
        """Start (or restart) the overall clock used for the total in ``display``."""
        self.s_ = datetime.datetime.now()

    def tic(self, name):
        """Begin timing the section ``name``."""
        self.lst = name
        self.t_start[name] = datetime.datetime.now()

    def toc(self, name=None):
        """Stop timing a section and add the elapsed time to its total.

        :param name: Section to stop; defaults to the section of the most
            recent ``tic``.
        :raises KeyError: If the section was never started with ``tic``.
        """
        name = name if name is not None else self.lst
        if name not in self.t_start:
            raise KeyError(f"toc({name!r}) called without a matching tic()")
        self.tspend[name] += (datetime.datetime.now() - self.t_start[name]).total_seconds()
        self.n_tics[name] += 1

    def display(self):
        """Return a one-line, human-readable summary of the recorded times.

        Each section is shown as ``name: seconds (share %)`` where the share
        is the section's part of the summed section time. If the overall
        clock was started, the summary is prefixed by the total elapsed time
        and the percentage of it covered by the sections.
        """
        known = sum(self.tspend.values())

        if self.show_time_per_tic:
            # A section can exist with zero recorded tics (e.g. the mapping
            # was merely read); report 0 rather than dividing by zero.
            spend = {
                k: (v / self.n_tics.get(k, 0) if self.n_tics.get(k, 0) else 0.0)
                for k, v in self.tspend.items()
            }
        else:
            spend = self.tspend

        def share(part, whole):
            # Elapsed times can legitimately be exactly zero on clocks with
            # coarse resolution; avoid ZeroDivisionError in that case.
            return part / whole * 100 if whole > 0 else 0.0

        s = ", ".join(
            f"{k}: {v:.2f} ({int(share(self.tspend[k], known))} %)" for k, v in spend.items()
        )

        if self.s_ is None:
            return s
        total = (datetime.datetime.now() - self.s_).total_seconds()
        return f"{total:.2f} ({share(known, total):.1f} %). " + s
+pyqt5 +pygame +numpy<=1.26.4 # Version 2 has a problem with gymnasium + diff --git a/requirements_pip.txt b/requirements_pip.txt new file mode 100644 index 0000000000000000000000000000000000000000..a375525d9963ced95672329d771550ff26c5d5ae --- /dev/null +++ b/requirements_pip.txt @@ -0,0 +1,3 @@ +# PyQt5>=5.15.9 # 5.15.8 has a problem with matplotlib; but newest version is 5.15.9 +unitgrade +-e . diff --git a/solutions/ex00/fruit_homework_TODO_1.py b/solutions/ex00/fruit_homework_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..b498ceb06d3fa5ab7a183da45428d83a52e1a5cb --- /dev/null +++ b/solutions/ex00/fruit_homework_TODO_1.py @@ -0,0 +1 @@ + return a+b \ No newline at end of file diff --git a/solutions/ex00/fruit_homework_TODO_2.py b/solutions/ex00/fruit_homework_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..f546843734ee5aa45c7345b9bd8d3bfdca5600ff --- /dev/null +++ b/solutions/ex00/fruit_homework_TODO_2.py @@ -0,0 +1 @@ + return ["mr " + a for a in animals] \ No newline at end of file diff --git a/solutions/ex00/fruit_homework_TODO_3.py b/solutions/ex00/fruit_homework_TODO_3.py new file mode 100644 index 0000000000000000000000000000000000000000..5be72c6f38c133f5b1a2f5a09acff6d2eefda3ee --- /dev/null +++ b/solutions/ex00/fruit_homework_TODO_3.py @@ -0,0 +1 @@ + return sum([x * p for x, p in p_dict.items()]) \ No newline at end of file diff --git a/solutions/ex00/fruit_homework_TODO_4.py b/solutions/ex00/fruit_homework_TODO_4.py new file mode 100644 index 0000000000000000000000000000000000000000..c836aa8e22e9b64e93d6c1874af9d506f1e36f22 --- /dev/null +++ b/solutions/ex00/fruit_homework_TODO_4.py @@ -0,0 +1 @@ + return list(order_dict.keys()) \ No newline at end of file diff --git a/solutions/ex00/fruit_homework_TODO_5.py b/solutions/ex00/fruit_homework_TODO_5.py new file mode 100644 index 0000000000000000000000000000000000000000..84c3b39c208f1eb0dbda3f5f8001c82e9af2cb4b --- /dev/null +++ 
b/solutions/ex00/fruit_homework_TODO_5.py @@ -0,0 +1 @@ + return self.prices[fruit] \ No newline at end of file diff --git a/solutions/ex00/fruit_homework_TODO_6.py b/solutions/ex00/fruit_homework_TODO_6.py new file mode 100644 index 0000000000000000000000000000000000000000..dc8b7ab2f626d208b2b5af689970745c6f663a09 --- /dev/null +++ b/solutions/ex00/fruit_homework_TODO_6.py @@ -0,0 +1 @@ + return sum([quantity * self.cost(fruit) for fruit, quantity in order.items()]) \ No newline at end of file diff --git a/solutions/ex00/fruit_homework_TODO_7.py b/solutions/ex00/fruit_homework_TODO_7.py new file mode 100644 index 0000000000000000000000000000000000000000..f02ea8d2bbf4b264771be3878c5caf72854b2666 --- /dev/null +++ b/solutions/ex00/fruit_homework_TODO_7.py @@ -0,0 +1,2 @@ + cs = [s.price_of_order(order) for s in fruit_shops] + best_shop = fruit_shops[cs.index(min(cs))] \ No newline at end of file diff --git a/solutions/ex01/bobs_friend_TODO_1.py b/solutions/ex01/bobs_friend_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..2d03d7c5a8beff870e8ced0000a122c5b9952d75 --- /dev/null +++ b/solutions/ex01/bobs_friend_TODO_1.py @@ -0,0 +1,3 @@ + + self.s = self.x0 + \ No newline at end of file diff --git a/solutions/ex01/bobs_friend_TODO_2.py b/solutions/ex01/bobs_friend_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..9caf28ad6988a09d81ea82bcebf257c4fd4caf0c --- /dev/null +++ b/solutions/ex01/bobs_friend_TODO_2.py @@ -0,0 +1,9 @@ + terminated = True + if a == 0: + s_next = self.s * 1.1 + else: + if np.random.rand() < 1/4: + s_next = 0 + else: + s_next = self.s + 12 + reward = s_next - self.s \ No newline at end of file diff --git a/solutions/ex01/bobs_friend_TODO_3.py b/solutions/ex01/bobs_friend_TODO_3.py new file mode 100644 index 0000000000000000000000000000000000000000..8399f7fba970e6acf6b370dadd567754b5f8bc7e --- /dev/null +++ b/solutions/ex01/bobs_friend_TODO_3.py @@ -0,0 +1 @@ + return 0 \ No newline at end of 
file diff --git a/solutions/ex01/bobs_friend_TODO_4.py b/solutions/ex01/bobs_friend_TODO_4.py new file mode 100644 index 0000000000000000000000000000000000000000..36a268f2fa289cbbb6a96ae75376e8e1cc5ea729 --- /dev/null +++ b/solutions/ex01/bobs_friend_TODO_4.py @@ -0,0 +1 @@ + return 1 \ No newline at end of file diff --git a/solutions/ex01/chess_TODO_1.py b/solutions/ex01/chess_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..f8752f92f9185d2b2e05cd157a1b5372b6b9560b --- /dev/null +++ b/solutions/ex01/chess_TODO_1.py @@ -0,0 +1 @@ + self.s = [] \ No newline at end of file diff --git a/solutions/ex01/chess_TODO_2.py b/solutions/ex01/chess_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..9b829905c94cfd57d43edfcfeecdfe8bd36039b6 --- /dev/null +++ b/solutions/ex01/chess_TODO_2.py @@ -0,0 +1,7 @@ + if np.random.rand() < self.p_draw: + game_outcome = 0 + else: + if np.random.rand() < self.p_win: + game_outcome = 1 + else: + game_outcome = -1 \ No newline at end of file diff --git a/solutions/ex01/chess_TODO_3.py b/solutions/ex01/chess_TODO_3.py new file mode 100644 index 0000000000000000000000000000000000000000..29e14434d6508e15f098119b60f2e0e6e15390d9 --- /dev/null +++ b/solutions/ex01/chess_TODO_3.py @@ -0,0 +1 @@ + done = len(self.s) >= 2 and self.s[-1] == self.s[-2] and self.s[-1] != 0 \ No newline at end of file diff --git a/solutions/ex01/chess_TODO_4.py b/solutions/ex01/chess_TODO_4.py new file mode 100644 index 0000000000000000000000000000000000000000..d45e38a4592d6c99a05d741916eb6655c2babebf --- /dev/null +++ b/solutions/ex01/chess_TODO_4.py @@ -0,0 +1 @@ + r = self.s[-1] == 1 if done else 0 \ No newline at end of file diff --git a/solutions/ex01/chess_TODO_5.py b/solutions/ex01/chess_TODO_5.py new file mode 100644 index 0000000000000000000000000000000000000000..c270359f1626954b313683214d93beb8a786521b --- /dev/null +++ b/solutions/ex01/chess_TODO_5.py @@ -0,0 +1 @@ + stats, _ = train(env, Agent(env), 
num_episodes=T) \ No newline at end of file diff --git a/solutions/ex01/inventory_environment_TODO_1.py b/solutions/ex01/inventory_environment_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..5f5a775b790f2dfb76f573b6c1b4ee7b4a8442fb --- /dev/null +++ b/solutions/ex01/inventory_environment_TODO_1.py @@ -0,0 +1,5 @@ + s_next = max(0, min(2, self.s-w+a)) # next state; x_{k+1} = f_k(x_k, u_k, w_k) + reward = -(a + (self.s + a - w)**2) # reward = -cost = -g_k(x_k, u_k, w_k) + terminated = self.k == self.N-1 # Have we terminated? (i.e. is k==N-1) + self.s = s_next # update environment state + self.k += 1 # update current time step \ No newline at end of file diff --git a/solutions/ex01/inventory_environment_TODO_2.py b/solutions/ex01/inventory_environment_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..bebe04bc25189f1dad2fbd10c30070cd823b5b9e --- /dev/null +++ b/solutions/ex01/inventory_environment_TODO_2.py @@ -0,0 +1 @@ + return np.random.choice(3) # Return a random action \ No newline at end of file diff --git a/solutions/ex01/inventory_environment_TODO_3.py b/solutions/ex01/inventory_environment_TODO_3.py new file mode 100644 index 0000000000000000000000000000000000000000..0855951dfbae56cfc204cab87b19954e2e4bf074 --- /dev/null +++ b/solutions/ex01/inventory_environment_TODO_3.py @@ -0,0 +1,7 @@ + a = agent.pi(s, k) + sp, r, terminated, truncated, metadata = env.step(a) + agent.train(s, a, sp, r, terminated) + s = sp + J += r + if terminated or truncated: + break \ No newline at end of file diff --git a/solutions/ex01/pacman_hardcoded_TODO_1.py b/solutions/ex01/pacman_hardcoded_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..5c532d7fae997f5c6a4d7e8383dbbb14af8a12e8 --- /dev/null +++ b/solutions/ex01/pacman_hardcoded_TODO_1.py @@ -0,0 +1,7 @@ + if k < 7: + return 'South' + elif k < 14: + return 'East' + elif k < 21: + return 'North' + elif k < 28: \ No newline at end of file 
class FlowerStoreModel(InventoryDPModel):
    """Inventory DP model variant with an order cost ``c`` per unit.

    With ``prob_empty=True`` the stage cost is zero and the terminal cost is
    ``-1`` exactly when the final state equals 1, and ``0`` otherwise.
    """

    def __init__(self, N=3, c=0., prob_empty=False):
        # Attributes are set before the base initialiser runs, as in the
        # original ordering.
        self.c = c
        self.prob_empty = prob_empty
        super().__init__(N=N)

    def g(self, x, u, w, k):
        """Cost function g_k(x, u, w)."""
        if not self.prob_empty:
            return u * self.c + np.abs(x + u - w)
        return 0

    def f(self, x, u, w, k):
        """Dynamics f_k(x, u, w): next state, clamped to [0, max(S(k))]."""
        largest_state = max(self.S(k))
        unclamped = x + u - w
        return max(0, min(largest_state, unclamped))

    def Pw(self, x, u, k):
        """Distribution over random disturbances, as ``{w: probability}``."""
        return {0: .1, 1: .3, 2: .6}

    def gN(self, x):
        """Terminal cost g_N(x)."""
        # NOTE(review): the flag is called prob_empty but the rewarded
        # terminal state is x == 1, not x == 0 -- confirm against the exercise.
        return -1 if self.prob_empty and x == 1 else 0
b/solutions/ex02/flower_store_TODO_2.py @@ -0,0 +1,3 @@ + model = FlowerStoreModel(N=N, c=c, prob_empty=False) + J, pi = DP_stochastic(model) + u = pi[0][x0] \ No newline at end of file diff --git a/solutions/ex02/flower_store_TODO_3.py b/solutions/ex02/flower_store_TODO_3.py new file mode 100644 index 0000000000000000000000000000000000000000..62bdb1dc5fd12c76ae2092df7cc21e98b72d55cc --- /dev/null +++ b/solutions/ex02/flower_store_TODO_3.py @@ -0,0 +1,3 @@ + model = FlowerStoreModel(N=N, prob_empty=True) + J, pi = DP_stochastic(model) + pr_empty = -J[0][x0] \ No newline at end of file diff --git a/solutions/ex02/graph_traversal_TODO_1.py b/solutions/ex02/graph_traversal_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..2eb7542942bd0d36aa3bfdc99abe428c6b1b52eb --- /dev/null +++ b/solutions/ex02/graph_traversal_TODO_1.py @@ -0,0 +1 @@ + return u \ No newline at end of file diff --git a/solutions/ex02/graph_traversal_TODO_2.py b/solutions/ex02/graph_traversal_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..cd59ca2ac4d2ef3ae0f0d2162383b16cdda1b597 --- /dev/null +++ b/solutions/ex02/graph_traversal_TODO_2.py @@ -0,0 +1 @@ + return self.G[(x,u)] \ No newline at end of file diff --git a/solutions/ex02/graph_traversal_TODO_3.py b/solutions/ex02/graph_traversal_TODO_3.py new file mode 100644 index 0000000000000000000000000000000000000000..3bdbfbc0087daff787fe6aad1152dde1401cf76b --- /dev/null +++ b/solutions/ex02/graph_traversal_TODO_3.py @@ -0,0 +1 @@ + return 0 if x == self.t else np.inf \ No newline at end of file diff --git a/solutions/ex02/inventory_TODO_1.py b/solutions/ex02/inventory_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..38ded4d6bdfbf7113936292bd1bb4857da6c48ac --- /dev/null +++ b/solutions/ex02/inventory_TODO_1.py @@ -0,0 +1 @@ + return {0:.1, 1:.7, 2:0.2} \ No newline at end of file diff --git a/solutions/ex03/inventory_evaluation_TODO_1.py 
b/solutions/ex03/inventory_evaluation_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..ec037ab6c0e4495d508b769cb3f844361bc27766 --- /dev/null +++ b/solutions/ex03/inventory_evaluation_TODO_1.py @@ -0,0 +1,2 @@ + k = 0 + expected_number_of_items = sum([p * model.f(x, u, w, k=0) for w, p in model.Pw(x, u, k).items()]) \ No newline at end of file diff --git a/solutions/ex03/inventory_evaluation_TODO_2.py b/solutions/ex03/inventory_evaluation_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..e2897b1bbe779fd70b1d103bc72fada72f0e465a --- /dev/null +++ b/solutions/ex03/inventory_evaluation_TODO_2.py @@ -0,0 +1,12 @@ + model = InventoryDPModel() + N = model.N + J = [{} for _ in range(N + 1)] + J[N] = {x: model.gN(x) for x in model.S(model.N)} + for k in range(N - 1, -1, -1): + for x in model.S(k): + Qu = {u: sum(pw * (model.g(x, u, w, k) + J[k + 1][model.f(x, u, w, k)]) for w, pw in model.Pw(x, u, k).items()) for u + in model.A(x, k)} + + umin = pi[k][x] # min(Qu, key=Qu.get) + J[k][x] = Qu[umin] # Compute the expected cost function + J_pi_x0 = J[0][x0] \ No newline at end of file diff --git a/solutions/ex03/kuramoto_TODO_1.py b/solutions/ex03/kuramoto_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..27e0fcd91a1912fcd9510bd5c42085b40f5f035d --- /dev/null +++ b/solutions/ex03/kuramoto_TODO_1.py @@ -0,0 +1 @@ + symbolic_f_list = [u[0] + sym.cos(x[0])] \ No newline at end of file diff --git a/solutions/ex03/kuramoto_TODO_2.py b/solutions/ex03/kuramoto_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..0f1d61147d8b54964f5a35d9493bd13ead61df13 --- /dev/null +++ b/solutions/ex03/kuramoto_TODO_2.py @@ -0,0 +1 @@ + f_value = cmodel.f(x, u, t=0) \ No newline at end of file diff --git a/solutions/ex03/kuramoto_TODO_3.py b/solutions/ex03/kuramoto_TODO_3.py new file mode 100644 index 0000000000000000000000000000000000000000..f28ac99d8a97bd8ed4423c5dbaff106350e41d07 --- 
def sym_f(self, x, u, t=None):
    """Return the symbolic dynamics as a two-element list.

    The first entry is ``x[1]`` and the second is ``cos(x[0] + u[0])``,
    built with ``sym.cos``. ``t`` is accepted but not used.
    """
    first_derivative = x[1]
    second_derivative = sym.cos(x[0] + u[0])
    return [first_derivative, second_derivative]
100644 index 0000000000000000000000000000000000000000..5b50fea70c4ba210d1e03515ff6168f1a468badd --- /dev/null +++ b/solutions/ex04/discrete_kuramoto_TODO_3.py @@ -0,0 +1 @@ + next_x, cost, terminated, _, metadata = env.step([u]) \ No newline at end of file diff --git a/solutions/ex04/model_pendulum_TODO_1.py b/solutions/ex04/model_pendulum_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..bbdc0738a7080bfbd4b7f03fb06eb5c882a56314 --- /dev/null +++ b/solutions/ex04/model_pendulum_TODO_1.py @@ -0,0 +1 @@ + x_dot = model.f([1, 2], [0], t=0) \ No newline at end of file diff --git a/solutions/ex04/model_pendulum_TODO_2.py b/solutions/ex04/model_pendulum_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..59b6a6b42c376a52c882f8b7b8de42ad7d5cb0e2 --- /dev/null +++ b/solutions/ex04/model_pendulum_TODO_2.py @@ -0,0 +1 @@ + x_dot_numpy = model.f([1, 2], [0], t=0) \ No newline at end of file diff --git a/solutions/ex04/pid_TODO_1.py b/solutions/ex04/pid_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..be1769bbc56e0ee03a1640f22d0d6728e541d7d9 --- /dev/null +++ b/solutions/ex04/pid_TODO_1.py @@ -0,0 +1,6 @@ + e = self.target - x + # if self.e_prior == 0 and self.I == 0: + # self.e_prior = e + self.I = self.I + e * self.dt + u = self.Kp * e + self.Ki * self.I + self.Kd * (e - self.e_prior)/self.dt + self.e_prior = e \ No newline at end of file diff --git a/solutions/ex04/pid_TODO_2.py b/solutions/ex04/pid_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..7468df372bdf3409cc75603b2e2b18ebf479674f --- /dev/null +++ b/solutions/ex04/pid_TODO_2.py @@ -0,0 +1 @@ + u = pid.pi(x_cur[0]) \ No newline at end of file diff --git a/solutions/ex04/pid_car_TODO_1.py b/solutions/ex04/pid_car_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..4201c479926696c912381e7ffe10823507f4a151 --- /dev/null +++ b/solutions/ex04/pid_car_TODO_1.py @@ -0,0 +1,2 @@ + 
self.pid_angle = PID(dt=env.discrete_model.dt, Kp=1.0, Ki=0, Kd=0, target=0) + self.pid_velocity = PID(dt=env.discrete_model.dt, Kp=1.5, Ki=0, Kd=0, target=v_target) \ No newline at end of file diff --git a/solutions/ex04/pid_car_TODO_2.py b/solutions/ex04/pid_car_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..d7b67d3b6753bb62a99e28f23c8be52076b35e9d --- /dev/null +++ b/solutions/ex04/pid_car_TODO_2.py @@ -0,0 +1,2 @@ + xx = x[5] + x[3] if self.use_both_x5_x3 else x[5] + u = np.asarray([self.pid_angle.pi(xx), self.pid_velocity.pi(x[0])]) \ No newline at end of file diff --git a/solutions/ex04/pid_locomotive_agent_TODO_1.py b/solutions/ex04/pid_locomotive_agent_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..e8912a077ba579f6c8bd90e1688686f353692c49 --- /dev/null +++ b/solutions/ex04/pid_locomotive_agent_TODO_1.py @@ -0,0 +1 @@ + self.pid = PID(dt=dt, Kp=Kp, Ki=Ki, Kd=Kd, target=target) \ No newline at end of file diff --git a/solutions/ex04/pid_locomotive_agent_TODO_2.py b/solutions/ex04/pid_locomotive_agent_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..a512e14a83f51a2d69a4afee52e59b3528ae20f0 --- /dev/null +++ b/solutions/ex04/pid_locomotive_agent_TODO_2.py @@ -0,0 +1 @@ + u = self.pid.pi(x[0]) \ No newline at end of file diff --git a/solutions/ex04/pid_lunar_TODO_1.py b/solutions/ex04/pid_lunar_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..7a4671f8db2f159d0993f2b077721380e744a042 --- /dev/null +++ b/solutions/ex04/pid_lunar_TODO_1.py @@ -0,0 +1,2 @@ + alt_adj = self.pid_alt.pi( -(np.abs(x[0])- x[1]) ) + ang_adj = self.pid_ang.pi( -((.25 * np.pi) * (x[0] + x[2]) - x[4]) ) \ No newline at end of file diff --git a/solutions/ex04/pid_pendulum_TODO_1.py b/solutions/ex04/pid_pendulum_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..d21532b4596dacb5efbeccce0781561304ad8891 --- /dev/null +++ 
b/solutions/ex04/pid_pendulum_TODO_1.py @@ -0,0 +1,2 @@ + u = self.pid.pi(x[0]) + u = np.clip(u, self.env.action_space.low, self.env.action_space.high) \ No newline at end of file diff --git a/solutions/ex04/pid_pendulum_TODO_2.py b/solutions/ex04/pid_pendulum_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..6ae32326084e9f78717c46a596b0dc8e7174086e --- /dev/null +++ b/solutions/ex04/pid_pendulum_TODO_2.py @@ -0,0 +1 @@ + agent = PIDPendulumAgent(env, dt=env.dt, Kp=12, Ki=0, Kd=2, target_angle=0) \ No newline at end of file diff --git a/solutions/ex04/pid_pendulum_TODO_3.py b/solutions/ex04/pid_pendulum_TODO_3.py new file mode 100644 index 0000000000000000000000000000000000000000..388da4755975d6e6772c294f26dc97a4d2f3d9ed --- /dev/null +++ b/solutions/ex04/pid_pendulum_TODO_3.py @@ -0,0 +1 @@ + agent = PIDPendulumAgent(env, dt=env.dt, Kp=12, Ki=2, Kd=2, target_angle=np.pi/6) \ No newline at end of file diff --git a/solutions/ex05/direct_TODO_1.py b/solutions/ex05/direct_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..943a62d27dbd778b5eddd0c82122cfa0799174ad --- /dev/null +++ b/solutions/ex05/direct_TODO_1.py @@ -0,0 +1 @@ + guess = {k: solutions[i - 1]['fun'][k] for k in ['t0', 'tF', 'x', 'u'] } \ No newline at end of file diff --git a/solutions/ex05/direct_TODO_10.py b/solutions/ex05/direct_TODO_10.py new file mode 100644 index 0000000000000000000000000000000000000000..30c2fff9faabb09f8331bb95e183db33f0751221 --- /dev/null +++ b/solutions/ex05/direct_TODO_10.py @@ -0,0 +1 @@ + x_interp = xs[:, k] + tau * fs[:, k] + (tau ** 2 / (2 * hk)) * (fs[:, k + 1] - fs[:, k]) \ No newline at end of file diff --git a/solutions/ex05/direct_TODO_2.py b/solutions/ex05/direct_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..93276c3bc9f36521881d60e44ed738dfe6636d77 --- /dev/null +++ b/solutions/ex05/direct_TODO_2.py @@ -0,0 +1,2 @@ + z, z0, z_lb, z_ub = z + xs[k], z0 + 
list(guess['x'](tk).flat), z_lb + x_low, z_ub + x_high + z, z0, z_lb, z_ub = z + us[k], z0 + list(guess['u'](tk).flat), z_lb + u_low, z_ub + u_high \ No newline at end of file diff --git a/solutions/ex05/direct_TODO_3.py b/solutions/ex05/direct_TODO_3.py new file mode 100644 index 0000000000000000000000000000000000000000..2a48c18f75f3e1558e571ef4067568ee77821f80 --- /dev/null +++ b/solutions/ex05/direct_TODO_3.py @@ -0,0 +1,2 @@ + z, z0, z_lb, z_ub = z+[t0], z0+[guess['t0']], z_lb+[model.t0_bound().low[0]], z_ub+[model.t0_bound().high[0]] + z, z0, z_lb, z_ub = z+[tF], z0+[guess['tF']], z_lb+[model.tF_bound().low[0]], z_ub+[model.tF_bound().high[0]] \ No newline at end of file diff --git a/solutions/ex05/direct_TODO_4.py b/solutions/ex05/direct_TODO_4.py new file mode 100644 index 0000000000000000000000000000000000000000..a17d399eead11f94af4e6bb3c2603a16a9d5cf30 --- /dev/null +++ b/solutions/ex05/direct_TODO_4.py @@ -0,0 +1,2 @@ + fs.append(model.sym_f(x=xs[k], u=us[k], t=ts[k])) + cs.append(cost.sym_c(x=xs[k], u=us[k], t=ts[k])) \ No newline at end of file diff --git a/solutions/ex05/direct_TODO_5.py b/solutions/ex05/direct_TODO_5.py new file mode 100644 index 0000000000000000000000000000000000000000..4503612468fbb19116e8be806e3d13bbe045f6b8 --- /dev/null +++ b/solutions/ex05/direct_TODO_5.py @@ -0,0 +1,2 @@ + hk = (ts[k + 1] - ts[k]) + J += .5 * hk * (cs[k] + cs[k + 1]) \ No newline at end of file diff --git a/solutions/ex05/direct_TODO_6.py b/solutions/ex05/direct_TODO_6.py new file mode 100644 index 0000000000000000000000000000000000000000..d44ee27290c29626ebdf058261237ee65daaa73a --- /dev/null +++ b/solutions/ex05/direct_TODO_6.py @@ -0,0 +1 @@ + Ieq.append((xs[k+1][j] - xs[k][j]) - 0.5*hk*(fs[k+1][j] + fs[k][j])) \ No newline at end of file diff --git a/solutions/ex05/direct_TODO_7.py b/solutions/ex05/direct_TODO_7.py new file mode 100644 index 0000000000000000000000000000000000000000..40f3af49c7238ac22b21afc9d978e7fb5c661c7d --- /dev/null +++ 
b/solutions/ex05/direct_TODO_7.py @@ -0,0 +1 @@ + Iineq += model.sym_h(x=xs[k], u=us[k], t=ts[k]) \ No newline at end of file diff --git a/solutions/ex05/direct_TODO_8.py b/solutions/ex05/direct_TODO_8.py new file mode 100644 index 0000000000000000000000000000000000000000..10dcc178f95c3c856a6bc23168aa7b781101a71d --- /dev/null +++ b/solutions/ex05/direct_TODO_8.py @@ -0,0 +1 @@ + J_jac = sym.lambdify([z], sym.derive_by_array(J, z), modules='numpy') \ No newline at end of file diff --git a/solutions/ex05/direct_TODO_9.py b/solutions/ex05/direct_TODO_9.py new file mode 100644 index 0000000000000000000000000000000000000000..5ed2dd8f65a38e268a96829e5d32aa0a9602df1a --- /dev/null +++ b/solutions/ex05/direct_TODO_9.py @@ -0,0 +1,3 @@ + for k in range(len(ts) - 1): + if ts[k] <= t_new and t_new <= ts[k + 1]: + break \ No newline at end of file diff --git a/solutions/ex05/direct_agent_TODO_1.py b/solutions/ex05/direct_agent_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..dca993fcb24b2e3f44b7c0229c15225af74e5867 --- /dev/null +++ b/solutions/ex05/direct_agent_TODO_1.py @@ -0,0 +1 @@ + self.ufun = solutions[-1]['fun']['u'] \ No newline at end of file diff --git a/solutions/ex05/direct_agent_TODO_2.py b/solutions/ex05/direct_agent_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..311c0af323b01f6295f9936644d401545dc02ba5 --- /dev/null +++ b/solutions/ex05/direct_agent_TODO_2.py @@ -0,0 +1,7 @@ + t = info['time_seconds'] + if t > self.ts_grid[-1]: + print("Simulation time is", t, "which exceeds the maximal planning horizon t_F =", self.ts_grid[-1]) + raise Exception("Time exceed agents planning horizon") + + u = self.ufun(t) + u = np.asarray(self.env.discrete_model.phi_u(u)) \ No newline at end of file diff --git a/solutions/ex05/direct_cartpole_kelly_TODO_1.py b/solutions/ex05/direct_cartpole_kelly_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..c3da19ea709346d5b01a715fd7ecb31c099908ad 
--- /dev/null +++ b/solutions/ex05/direct_cartpole_kelly_TODO_1.py @@ -0,0 +1,2 @@ + Q = np.zeros((4, 4)) + return SymbolicQRCost(Q=Q, R=np.asarray([[1.0]]) ) \ No newline at end of file diff --git a/solutions/ex05/direct_cartpole_kelly_TODO_2.py b/solutions/ex05/direct_cartpole_kelly_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..11a9b85305f2ee81c9b9a659a221a68f1633e215 --- /dev/null +++ b/solutions/ex05/direct_cartpole_kelly_TODO_2.py @@ -0,0 +1,2 @@ + duration = 2 + return Box(duration, duration, shape=(1,)) \ No newline at end of file diff --git a/solutions/ex05/model_brachistochrone_TODO_1.py b/solutions/ex05/model_brachistochrone_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..056e07343fbf43a6270238dadee15550cc74c256 --- /dev/null +++ b/solutions/ex05/model_brachistochrone_TODO_1.py @@ -0,0 +1 @@ + cost = SymbolicQRCost(Q=np.zeros((3,3)), R = np.zeros((1,1)), qc=1.0) \ No newline at end of file diff --git a/solutions/ex05/model_brachistochrone_TODO_2.py b/solutions/ex05/model_brachistochrone_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..bce8341a0d9a7d21cb435d75bbc2a441467c406d --- /dev/null +++ b/solutions/ex05/model_brachistochrone_TODO_2.py @@ -0,0 +1,3 @@ + v = x[2] + uu = u[0] + xp = [v * sym.sin(uu), -v * sym.cos(uu), self.g * sym.cos(uu)] \ No newline at end of file diff --git a/solutions/ex05/model_brachistochrone_TODO_3.py b/solutions/ex05/model_brachistochrone_TODO_3.py new file mode 100644 index 0000000000000000000000000000000000000000..1e993c335fc4ac387dd382abfa490a269a7b3237 --- /dev/null +++ b/solutions/ex05/model_brachistochrone_TODO_3.py @@ -0,0 +1 @@ + return [ -x[1] - x[0]/2 - self.h ] \ No newline at end of file diff --git a/solutions/ex06/boeing_lqr_TODO_1.py b/solutions/ex06/boeing_lqr_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..649101b863582d56a84037200d9af8aa6f4f240a --- /dev/null +++ 
b/solutions/ex06/boeing_lqr_TODO_1.py @@ -0,0 +1 @@ + Q, R, q = compute_Q_R_q(model, dt) \ No newline at end of file diff --git a/solutions/ex06/boeing_lqr_TODO_2.py b/solutions/ex06/boeing_lqr_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..ba3dbc6d413831fcf93881aaa3cc7c57de0faa8a --- /dev/null +++ b/solutions/ex06/boeing_lqr_TODO_2.py @@ -0,0 +1 @@ + agent = LQRAgent(env, A=A, B=B, d=d, Q=Q, R=R, q=q) \ No newline at end of file diff --git a/solutions/ex06/boeing_lqr_TODO_3.py b/solutions/ex06/boeing_lqr_TODO_3.py new file mode 100644 index 0000000000000000000000000000000000000000..e5328ff95c26a95435a5b895e55ae51f313de1b6 --- /dev/null +++ b/solutions/ex06/boeing_lqr_TODO_3.py @@ -0,0 +1,3 @@ + Q = cost.Q * dt + R = cost.R * dt + q = cost.q * dt \ No newline at end of file diff --git a/solutions/ex06/boeing_lqr_TODO_4.py b/solutions/ex06/boeing_lqr_TODO_4.py new file mode 100644 index 0000000000000000000000000000000000000000..51001722bf435248844d7ff1017d944e8cb76515 --- /dev/null +++ b/solutions/ex06/boeing_lqr_TODO_4.py @@ -0,0 +1,2 @@ + B_discrete = scipy.linalg.inv(model.A) @ (A_discrete - np.eye(model.A.shape[0])) @ model.B + d_discrete = scipy.linalg.inv(model.A) @ (A_discrete - np.eye(model.A.shape[0])) @ d \ No newline at end of file diff --git a/solutions/ex06/dlqr_TODO_1.py b/solutions/ex06/dlqr_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..7615471215df32bb0a15c21521405bc6c5a98620 --- /dev/null +++ b/solutions/ex06/dlqr_TODO_1.py @@ -0,0 +1,2 @@ +import matplotlib +matplotlib.use('agg') \ No newline at end of file diff --git a/solutions/ex06/dlqr_TODO_2.py b/solutions/ex06/dlqr_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..f34eec89ded8b2cd4c0e26f7ae7c591acee7983c --- /dev/null +++ b/solutions/ex06/dlqr_TODO_2.py @@ -0,0 +1 @@ + V[N], v[N], vc[N] = QN, qN, qcN \ No newline at end of file diff --git a/solutions/ex06/dlqr_TODO_3.py 
b/solutions/ex06/dlqr_TODO_3.py new file mode 100644 index 0000000000000000000000000000000000000000..14bb8adaaca89f614c42617c8fc365d0942ef10a --- /dev/null +++ b/solutions/ex06/dlqr_TODO_3.py @@ -0,0 +1,4 @@ + Suu = R[k] + B[k].T @ (V[k+1] + mu * In) @ B[k] + Sux = H[k] + B[k].T @ (V[k+1] + mu * In) @ A[k] + Su = r[k] + B[k].T @ v[k + 1] + B[k].T @ V[k + 1] @ d[k] + L[k] = -np.linalg.solve(Suu, Sux) \ No newline at end of file diff --git a/solutions/ex06/lqr_agent_TODO_1.py b/solutions/ex06/lqr_agent_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..f82d69ea2e0a0dd9908e4225fc5d91d15aeba58d --- /dev/null +++ b/solutions/ex06/lqr_agent_TODO_1.py @@ -0,0 +1 @@ + (self.L, self.l), _ = LQR(A=[A]*N, B=[B]*N, d=[d]*N if d is not None else None, Q=[Q]*N, q=[q]*N if q is not None else None, R=[R]*N) \ No newline at end of file diff --git a/solutions/ex06/lqr_agent_TODO_2.py b/solutions/ex06/lqr_agent_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..e8994b09a036a2f4a3fdc42140cef04aa9f56c86 --- /dev/null +++ b/solutions/ex06/lqr_agent_TODO_2.py @@ -0,0 +1 @@ + u = self.L[k] @ x + self.l[k] \ No newline at end of file diff --git a/solutions/ex06/lqr_pid_TODO_1.py b/solutions/ex06/lqr_pid_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..920d0e7e101dad7c87a2df2be8ecd3209a946a8d --- /dev/null +++ b/solutions/ex06/lqr_pid_TODO_1.py @@ -0,0 +1,3 @@ + def pi(self,x, k, info=None): + action = self.L[0] @ x + self.l[0] + return action \ No newline at end of file diff --git a/solutions/ex06/lqr_pid_TODO_2.py b/solutions/ex06/lqr_pid_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..2f1ad2ab157ba7524df0a89e73c93cdf860e398f --- /dev/null +++ b/solutions/ex06/lqr_pid_TODO_2.py @@ -0,0 +1 @@ + Kp, Kd = (-L0).flat \ No newline at end of file diff --git a/solutions/ex07/ilqr_TODO_1.py b/solutions/ex07/ilqr_TODO_1.py new file mode 100644 index 
0000000000000000000000000000000000000000..c9ada2c3a7adcd0a1d3995bd723716a749f4c08e --- /dev/null +++ b/solutions/ex07/ilqr_TODO_1.py @@ -0,0 +1,2 @@ + l, L = [np.zeros((m,))]*N, [np.zeros((m,n))]*N + x_bar, u_bar = forward_pass(model, x_bar, u_bar, L=L, l=l) \ No newline at end of file diff --git a/solutions/ex07/ilqr_TODO_10.py b/solutions/ex07/ilqr_TODO_10.py new file mode 100644 index 0000000000000000000000000000000000000000..97423181e2f17ec2e986788e79b4fd978a7c6838 --- /dev/null +++ b/solutions/ex07/ilqr_TODO_10.py @@ -0,0 +1 @@ + Delta, mu = max(1.0, Delta) * Delta_0, max(mu_min, mu * Delta) # Increase \ No newline at end of file diff --git a/solutions/ex07/ilqr_TODO_11.py b/solutions/ex07/ilqr_TODO_11.py new file mode 100644 index 0000000000000000000000000000000000000000..dafc65dda3f1d6f36c21e3a0a612e97e4606bafb --- /dev/null +++ b/solutions/ex07/ilqr_TODO_11.py @@ -0,0 +1,4 @@ + R = c_uu + H = c_ux + q, qN = c_x[:-1], c_x[-1] + r = c_u \ No newline at end of file diff --git a/solutions/ex07/ilqr_TODO_12.py b/solutions/ex07/ilqr_TODO_12.py new file mode 100644 index 0000000000000000000000000000000000000000..b5823c6a0680218628af756b10761f3211c04501 --- /dev/null +++ b/solutions/ex07/ilqr_TODO_12.py @@ -0,0 +1,4 @@ + # fs = [(v[1],v[2]) for v in [model.f(x, u, k, compute_jacobian=True) for k, (x, u) in enumerate(zip(x_bar[:-1], u_bar))]] + fs = [model.f_jacobian(x, u, k) for k, (x, u) in enumerate(zip(x_bar[:-1], u_bar))] + + A, B = zip(*fs) \ No newline at end of file diff --git a/solutions/ex07/ilqr_TODO_13.py b/solutions/ex07/ilqr_TODO_13.py new file mode 100644 index 0000000000000000000000000000000000000000..41ceba56deb0495616a22805f7a38bb9733ec181 --- /dev/null +++ b/solutions/ex07/ilqr_TODO_13.py @@ -0,0 +1,2 @@ + gs = [model.cost.c(x, u, i, compute_gradients=True) for i, (x, u) in enumerate(zip(x_bar[:-1], u_bar))] + c, c_x, c_u, c_xx, c_ux, c_uu = zip(*gs) \ No newline at end of file diff --git a/solutions/ex07/ilqr_TODO_14.py 
b/solutions/ex07/ilqr_TODO_14.py new file mode 100644 index 0000000000000000000000000000000000000000..8c925083e59871befab170fb5ec809e6311e5452 --- /dev/null +++ b/solutions/ex07/ilqr_TODO_14.py @@ -0,0 +1,3 @@ + c = c + (cN,) + c_x = c_x + (c_xN,) + c_xx = c_xx + (c_xxN,) \ No newline at end of file diff --git a/solutions/ex07/ilqr_TODO_15.py b/solutions/ex07/ilqr_TODO_15.py new file mode 100644 index 0000000000000000000000000000000000000000..73a0fa4ac59c0fedc7f4d7e87645ab4372a80399 --- /dev/null +++ b/solutions/ex07/ilqr_TODO_15.py @@ -0,0 +1 @@ + u_star[i] = u_bar[i] + alpha * l[i] + L[i] @ (x[i] - x_bar[i]) \ No newline at end of file diff --git a/solutions/ex07/ilqr_TODO_16.py b/solutions/ex07/ilqr_TODO_16.py new file mode 100644 index 0000000000000000000000000000000000000000..20904f4a15cb7b9db30da28ccb88bb21a1bb040d --- /dev/null +++ b/solutions/ex07/ilqr_TODO_16.py @@ -0,0 +1 @@ + x[i + 1] = model.f(x[i], u_star[i], i) \ No newline at end of file diff --git a/solutions/ex07/ilqr_TODO_2.py b/solutions/ex07/ilqr_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..a5ff3ca1047f1886bf4e94a5705d33b3eb88f1fc --- /dev/null +++ b/solutions/ex07/ilqr_TODO_2.py @@ -0,0 +1,2 @@ + A, B, c, c_x, c_u, c_xx, c_ux, c_uu = get_derivatives(model, x_bar, u_bar) + J = sum(c) \ No newline at end of file diff --git a/solutions/ex07/ilqr_TODO_3.py b/solutions/ex07/ilqr_TODO_3.py new file mode 100644 index 0000000000000000000000000000000000000000..417755d3e4956fbbf5bc0bb596377781412cb47b --- /dev/null +++ b/solutions/ex07/ilqr_TODO_3.py @@ -0,0 +1 @@ + L, l = backward_pass(A, B, c_x, c_u, c_xx, c_ux, c_uu, mu) \ No newline at end of file diff --git a/solutions/ex07/ilqr_TODO_4.py b/solutions/ex07/ilqr_TODO_4.py new file mode 100644 index 0000000000000000000000000000000000000000..6db866f5b27f7921e409d179462e51c8b8ea1420 --- /dev/null +++ b/solutions/ex07/ilqr_TODO_4.py @@ -0,0 +1 @@ + x_bar, u_bar = forward_pass(model, x_bar, u_bar, L=L, l=l, alpha=alpha) \ 
No newline at end of file diff --git a/solutions/ex07/ilqr_TODO_5.py b/solutions/ex07/ilqr_TODO_5.py new file mode 100644 index 0000000000000000000000000000000000000000..4ead664be5795faaebda22e5321ae4673eefdebf --- /dev/null +++ b/solutions/ex07/ilqr_TODO_5.py @@ -0,0 +1,2 @@ + l, L = [np.zeros((m,))] * N, [np.zeros((m, n))] * N + x_bar, u_bar = forward_pass(model, x_bar, u_bar, L=L, l=l) \ No newline at end of file diff --git a/solutions/ex07/ilqr_TODO_6.py b/solutions/ex07/ilqr_TODO_6.py new file mode 100644 index 0000000000000000000000000000000000000000..2e7b84c3c79530d3e24b420e11f4d15140e6fcd9 --- /dev/null +++ b/solutions/ex07/ilqr_TODO_6.py @@ -0,0 +1,2 @@ + A, B, c, c_x, c_u, c_xx, c_ux, c_uu = get_derivatives(model, x_bar, u_bar) + J_prime = sum(c) \ No newline at end of file diff --git a/solutions/ex07/ilqr_TODO_7.py b/solutions/ex07/ilqr_TODO_7.py new file mode 100644 index 0000000000000000000000000000000000000000..f23d1c9bd15a76d13938844705782aa675cc9e7e --- /dev/null +++ b/solutions/ex07/ilqr_TODO_7.py @@ -0,0 +1 @@ + L, l = backward_pass(A, B, c_x, c_u, c_xx, c_ux, c_uu, mu) \ No newline at end of file diff --git a/solutions/ex07/ilqr_TODO_8.py b/solutions/ex07/ilqr_TODO_8.py new file mode 100644 index 0000000000000000000000000000000000000000..123b9bb988a13d8f828a0af3b37c6a9f213495ff --- /dev/null +++ b/solutions/ex07/ilqr_TODO_8.py @@ -0,0 +1 @@ + J_new = cost_of_trajectory(model, x_hat, u_hat) \ No newline at end of file diff --git a/solutions/ex07/ilqr_TODO_9.py b/solutions/ex07/ilqr_TODO_9.py new file mode 100644 index 0000000000000000000000000000000000000000..1fe4de670d25185a976a1fb36ae4eb04e539a456 --- /dev/null +++ b/solutions/ex07/ilqr_TODO_9.py @@ -0,0 +1 @@ + Delta, mu = min(1.0, Delta) / Delta_0, max(0, mu*Delta) \ No newline at end of file diff --git a/solutions/ex07/ilqr_agent_TODO_1.py b/solutions/ex07/ilqr_agent_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..5632d146890b0bcadefd5b04fccf7393338debcf --- 
/dev/null +++ b/solutions/ex07/ilqr_agent_TODO_1.py @@ -0,0 +1 @@ + u = self.ubar[k] + self.L[k]@ (x-self.xbar[k]) + self.l[k] \ No newline at end of file diff --git a/solutions/ex07/ilqr_pendulum_TODO_1.py b/solutions/ex07/ilqr_pendulum_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..dee07f700ec7c23c71a96ddb29e0e4e1eb9ba7e5 --- /dev/null +++ b/solutions/ex07/ilqr_pendulum_TODO_1.py @@ -0,0 +1 @@ + xs, us, J_hist, L, l = ilqr(model, N, x0, n_iter=n_iter, use_linesearch=use_linesearch) \ No newline at end of file diff --git a/solutions/ex07/linearization_agent_TODO_1.py b/solutions/ex07/linearization_agent_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..9a9c162e64d62675433a1f5174da17654e528e42 --- /dev/null +++ b/solutions/ex07/linearization_agent_TODO_1.py @@ -0,0 +1,4 @@ + xp = model.f(xbar, ubar, k=0) + A, B = model.f_jacobian(xbar, ubar, k=0) + + d = xp - A @ xbar - B @ ubar \ No newline at end of file diff --git a/solutions/ex07/linearization_agent_TODO_2.py b/solutions/ex07/linearization_agent_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..f84fac4dd7bf0800e2734cee2d913762b410355e --- /dev/null +++ b/solutions/ex07/linearization_agent_TODO_2.py @@ -0,0 +1 @@ + (self.L, self.l), (V, v, vc) = LQR(A=[A]*N, B=[B]*N, d=[d]*N, Q=[Q]*N, q=[q]*N, R=[self.model.cost.R]*N) \ No newline at end of file diff --git a/solutions/ex07/linearization_agent_TODO_3.py b/solutions/ex07/linearization_agent_TODO_3.py new file mode 100644 index 0000000000000000000000000000000000000000..797bd934a4c3c08c245bbdbe956a9008510367a8 --- /dev/null +++ b/solutions/ex07/linearization_agent_TODO_3.py @@ -0,0 +1 @@ + u = self.L[0] @ x + self.l[0] \ No newline at end of file diff --git a/solutions/ex08/bandits_TODO_1.py b/solutions/ex08/bandits_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..f971a9c7fb74dc21be27bd92f6e8c5aec3af804b --- /dev/null +++ 
b/solutions/ex08/bandits_TODO_1.py @@ -0,0 +1,2 @@ + reward = self.q_star[a] + np.random.randn() + regret = self.q_star[self.optimal_action] - self.q_star[a] \ No newline at end of file diff --git a/solutions/ex08/gradient_agent_TODO_1.py b/solutions/ex08/gradient_agent_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..2c166f9787f05f6acd9b977fe814558616388c3d --- /dev/null +++ b/solutions/ex08/gradient_agent_TODO_1.py @@ -0,0 +1,9 @@ + pi_a = self.Pa() + for b in range(self.k): + if b == a: + self.H[b] += self.alpha * (r - self.R_bar) * (1 - pi_a[b]) + else: + self.H[b] -= self.alpha * (r - self.R_bar) * pi_a[b] + + if self.baseline: + self.R_bar = self.R_bar + (self.alpha if self.alpha is not None else 1/(self.t+1)) * (r - self.R_bar) \ No newline at end of file diff --git a/solutions/ex08/grand_bandit_race_TODO_1.py b/solutions/ex08/grand_bandit_race_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..06e8845cc54405e1d5cac07b51eb414b9170e1ef --- /dev/null +++ b/solutions/ex08/grand_bandit_race_TODO_1.py @@ -0,0 +1 @@ + bandit1 = StationaryBandit(k=10) \ No newline at end of file diff --git a/solutions/ex08/grand_bandit_race_TODO_2.py b/solutions/ex08/grand_bandit_race_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..c9ffb8db36b8c1313eab01a4668a45f30d3cc243 --- /dev/null +++ b/solutions/ex08/grand_bandit_race_TODO_2.py @@ -0,0 +1,5 @@ + agents = [BasicAgent(bandit1, epsilon=epsilon)] + agents += [MovingAverageAgent(bandit1, epsilon=epsilon, alpha=alpha)] + agents += [GradientAgent(bandit1, alpha=alpha,use_baseline=False) ] + agents += [GradientAgent(bandit1, alpha=alpha,use_baseline=True) ] + agents += [UCBAgent(bandit1, c=2)] \ No newline at end of file diff --git a/solutions/ex08/grand_bandit_race_TODO_3.py b/solutions/ex08/grand_bandit_race_TODO_3.py new file mode 100644 index 0000000000000000000000000000000000000000..807579c172456281f047d47aa499d87869460af6 --- /dev/null +++ 
b/solutions/ex08/grand_bandit_race_TODO_3.py @@ -0,0 +1 @@ + eval_and_plot(bandit1, agents, max_episodes=2000, labels=labels) \ No newline at end of file diff --git a/solutions/ex08/grand_bandit_race_TODO_4.py b/solutions/ex08/grand_bandit_race_TODO_4.py new file mode 100644 index 0000000000000000000000000000000000000000..3a9cb82fb4ef4d9adbd1f7d726d53189b2827710 --- /dev/null +++ b/solutions/ex08/grand_bandit_race_TODO_4.py @@ -0,0 +1 @@ + bandit2 = StationaryBandit(k=10, q_star_mean=4) \ No newline at end of file diff --git a/solutions/ex08/grand_bandit_race_TODO_5.py b/solutions/ex08/grand_bandit_race_TODO_5.py new file mode 100644 index 0000000000000000000000000000000000000000..1a6cfc7bd723679fac58db343018033760fcd8f9 --- /dev/null +++ b/solutions/ex08/grand_bandit_race_TODO_5.py @@ -0,0 +1 @@ + eval_and_plot(bandit2, agents, max_episodes=2000, labels=labels) \ No newline at end of file diff --git a/solutions/ex08/grand_bandit_race_TODO_6.py b/solutions/ex08/grand_bandit_race_TODO_6.py new file mode 100644 index 0000000000000000000000000000000000000000..20c9ba027fd3dc0a70ea72bf97622c69031199f3 --- /dev/null +++ b/solutions/ex08/grand_bandit_race_TODO_6.py @@ -0,0 +1 @@ + bandit3 = NonstationaryBandit(k=10) \ No newline at end of file diff --git a/solutions/ex08/grand_bandit_race_TODO_7.py b/solutions/ex08/grand_bandit_race_TODO_7.py new file mode 100644 index 0000000000000000000000000000000000000000..a2a5676b54581b39119cddc327bdf79459f97f57 --- /dev/null +++ b/solutions/ex08/grand_bandit_race_TODO_7.py @@ -0,0 +1 @@ + eval_and_plot(bandit3, agents, max_episodes=2000, steps=10000, labels=labels) \ No newline at end of file diff --git a/solutions/ex08/grand_bandit_race_TODO_8.py b/solutions/ex08/grand_bandit_race_TODO_8.py new file mode 100644 index 0000000000000000000000000000000000000000..b34bc62119f16531be2e9792bf8b442f365132d2 --- /dev/null +++ b/solutions/ex08/grand_bandit_race_TODO_8.py @@ -0,0 +1 @@ + eval_and_plot(bandit1, agents2, steps=10000, 
labels=labels) \ No newline at end of file diff --git a/solutions/ex08/nonstationary_TODO_1.py b/solutions/ex08/nonstationary_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..e53da8bcceffb8f6b6c54f0cdfe39eaef44f9959 --- /dev/null +++ b/solutions/ex08/nonstationary_TODO_1.py @@ -0,0 +1,2 @@ + self.q_star += self.reward_change_std * np.random.randn(self.k) + self.optimal_action = np.argmax(self.q_star) \ No newline at end of file diff --git a/solutions/ex08/nonstationary_TODO_2.py b/solutions/ex08/nonstationary_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..26dc95b4931d7fab277c37fff8b5894f3315c332 --- /dev/null +++ b/solutions/ex08/nonstationary_TODO_2.py @@ -0,0 +1,2 @@ + self.alpha=alpha + super().__init__(env, epsilon=epsilon) \ No newline at end of file diff --git a/solutions/ex08/nonstationary_TODO_3.py b/solutions/ex08/nonstationary_TODO_3.py new file mode 100644 index 0000000000000000000000000000000000000000..d45b3a0f69beac0af8bfd8249534906f13bdd167 --- /dev/null +++ b/solutions/ex08/nonstationary_TODO_3.py @@ -0,0 +1 @@ + self.Q[a] = self.Q[a] + self.alpha * (r-self.Q[a]) \ No newline at end of file diff --git a/solutions/ex08/nonstationary_TODO_4.py b/solutions/ex08/nonstationary_TODO_4.py new file mode 100644 index 0000000000000000000000000000000000000000..e4ffd80476a720982fcd5655d5b58f00ee8e6d42 --- /dev/null +++ b/solutions/ex08/nonstationary_TODO_4.py @@ -0,0 +1,4 @@ + bandit = NonstationaryBandit(k=10) + + agents = [BasicAgent(bandit, epsilon=epsilon)] + agents += [MovingAverageAgent(bandit, epsilon=epsilon, alpha=alpha) for alpha in alphas] \ No newline at end of file diff --git a/solutions/ex08/nonstationary_TODO_5.py b/solutions/ex08/nonstationary_TODO_5.py new file mode 100644 index 0000000000000000000000000000000000000000..9742313984f8906af30c34cac26785b1b2ec8791 --- /dev/null +++ b/solutions/ex08/nonstationary_TODO_5.py @@ -0,0 +1 @@ + labels += [f"Mov.avg. 
agent, epsilon={epsilon}, alpha={alpha}" for alpha in alphas] \ No newline at end of file diff --git a/solutions/ex08/simple_agents_TODO_1.py b/solutions/ex08/simple_agents_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..5416e07d0958fe82276fa186510f0402eafc7ece --- /dev/null +++ b/solutions/ex08/simple_agents_TODO_1.py @@ -0,0 +1,2 @@ + self.Q = np.zeros((self.k,)) + self.N = np.zeros((self.k,)) \ No newline at end of file diff --git a/solutions/ex08/simple_agents_TODO_2.py b/solutions/ex08/simple_agents_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..d91b3938f2b78d14de98a867793d8b33f61cae55 --- /dev/null +++ b/solutions/ex08/simple_agents_TODO_2.py @@ -0,0 +1 @@ + return np.random.randint(self.k) if np.random.rand() < self.epsilon else np.argmax(self.Q) \ No newline at end of file diff --git a/solutions/ex08/simple_agents_TODO_3.py b/solutions/ex08/simple_agents_TODO_3.py new file mode 100644 index 0000000000000000000000000000000000000000..df218f01d8927b72f1920bc2841a8f2ae0b0e616 --- /dev/null +++ b/solutions/ex08/simple_agents_TODO_3.py @@ -0,0 +1,2 @@ + self.N[a] = self.N[a] + 1 + self.Q[a] = self.Q[a] + 1/self.N[a] * (r-self.Q[a]) \ No newline at end of file diff --git a/solutions/ex08/ucb_agent_TODO_1.py b/solutions/ex08/ucb_agent_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..4812f63b0e4fc5378d676a4d2e0459c3d2b07ca8 --- /dev/null +++ b/solutions/ex08/ucb_agent_TODO_1.py @@ -0,0 +1,2 @@ + self.N[a] += 1 + self.Q[a] += 1/self.N[a] * (r - self.Q[a]) \ No newline at end of file diff --git a/solutions/ex08/ucb_agent_TODO_2.py b/solutions/ex08/ucb_agent_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..437563cd1c647bce69dc8778e93c5bf9854f0a5c --- /dev/null +++ b/solutions/ex08/ucb_agent_TODO_2.py @@ -0,0 +1,3 @@ + k = self.env.action_space.n + self.Q = np.zeros((k,)) + self.N = np.zeros((k,)) \ No newline at end of file diff --git 
a/solutions/ex08/ucb_agent_TODO_3.py b/solutions/ex08/ucb_agent_TODO_3.py new file mode 100644 index 0000000000000000000000000000000000000000..59255040547725524492f30aa7b91b3267f04c27 --- /dev/null +++ b/solutions/ex08/ucb_agent_TODO_3.py @@ -0,0 +1 @@ + return np.argmax( self.Q + self.c * np.sqrt( np.log(k+1)/(self.N+1e-8) ) ) \ No newline at end of file diff --git a/solutions/ex09/gambler_TODO_1.py b/solutions/ex09/gambler_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..5edd9179f3c7d5cecdf4ed62874f892d3b277a49 --- /dev/null +++ b/solutions/ex09/gambler_TODO_1.py @@ -0,0 +1 @@ + return state in [0, self.goal] \ No newline at end of file diff --git a/solutions/ex09/gambler_TODO_2.py b/solutions/ex09/gambler_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..63c4cf777199a52f2eea1c333f5eb98a692b6e61 --- /dev/null +++ b/solutions/ex09/gambler_TODO_2.py @@ -0,0 +1 @@ + return list( range(1, min(s, self.goal - s) + 1)) \ No newline at end of file diff --git a/solutions/ex09/gambler_TODO_3.py b/solutions/ex09/gambler_TODO_3.py new file mode 100644 index 0000000000000000000000000000000000000000..b4e0a660fca2209ea59af40b1b18bc7aad4e9c39 --- /dev/null +++ b/solutions/ex09/gambler_TODO_3.py @@ -0,0 +1,4 @@ + r = 1 if s + a == 100 else 0 + WIN = (s+a, r) + LOSS = (s-a, 0) + outcome_dict = {WIN: self.p_heads, LOSS: 1-self.p_heads } if WIN != LOSS else {WIN: 1.} \ No newline at end of file diff --git a/solutions/ex09/jacks_car_rental_TODO_1.py b/solutions/ex09/jacks_car_rental_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..14da723df9fff8944cca36e6f08033a1d9ba4cb7 --- /dev/null +++ b/solutions/ex09/jacks_car_rental_TODO_1.py @@ -0,0 +1,3 @@ + max_from_1 = min([self.max_move,c1]) + max_to_1 = min([self.max_move,c2]) + a = [s for s in range(-max_to_1, max_from_1 + 1 )] \ No newline at end of file diff --git a/solutions/ex09/jacks_car_rental_TODO_2.py 
b/solutions/ex09/jacks_car_rental_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..cf19c18a8ff14200b83dc572dade4894b8bd8c29 --- /dev/null +++ b/solutions/ex09/jacks_car_rental_TODO_2.py @@ -0,0 +1 @@ + s = (s[0]-a, s[1]+a) \ No newline at end of file diff --git a/solutions/ex09/jacks_car_rental_TODO_3.py b/solutions/ex09/jacks_car_rental_TODO_3.py new file mode 100644 index 0000000000000000000000000000000000000000..097e1969082e3be96b8a849084f5a8e35a3461e6 --- /dev/null +++ b/solutions/ex09/jacks_car_rental_TODO_3.py @@ -0,0 +1 @@ + d[((c1, c2), reward_1 + reward_2 + abs(a)*self.move_cost) ] += pc1 * pc2 \ No newline at end of file diff --git a/solutions/ex09/mdp_warmup_TODO_1.py b/solutions/ex09/mdp_warmup_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..b8ee7db7c1bc9e038ea07feff49720d151785490 --- /dev/null +++ b/solutions/ex09/mdp_warmup_TODO_1.py @@ -0,0 +1 @@ + q_dict = {a: sum([p*(r+ (gamma*v[sp] if not mdp.is_terminal(sp) else 0)) for (sp,r), p in mdp.Psr(s,a).items()]) for a in mdp.A(s)} \ No newline at end of file diff --git a/solutions/ex09/mdp_warmup_TODO_2.py b/solutions/ex09/mdp_warmup_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..f605ec389d9213a9f951ad32bf2d96ed93f83424 --- /dev/null +++ b/solutions/ex09/mdp_warmup_TODO_2.py @@ -0,0 +1 @@ + raise NotImplementedError("Insert your solution and remove this error.") \ No newline at end of file diff --git a/solutions/ex09/mdp_warmup_TODO_3.py b/solutions/ex09/mdp_warmup_TODO_3.py new file mode 100644 index 0000000000000000000000000000000000000000..c8f9a461ff9e0126baa93f2f1abbf35d365a52d6 --- /dev/null +++ b/solutions/ex09/mdp_warmup_TODO_3.py @@ -0,0 +1 @@ + expected_reward = sum( [r * p for (sp, r), p in mdp.Psr(s, a).items() ] ) \ No newline at end of file diff --git a/solutions/ex09/mdp_warmup_TODO_4.py b/solutions/ex09/mdp_warmup_TODO_4.py new file mode 100644 index 
0000000000000000000000000000000000000000..bb8d2810f8b9e635f333b72539f47e18035d818e --- /dev/null +++ b/solutions/ex09/mdp_warmup_TODO_4.py @@ -0,0 +1 @@ + V_s = sum( [Q[s,a] * p for a, p in policy.items()] ) \ No newline at end of file diff --git a/solutions/ex09/policy_evaluation_TODO_1.py b/solutions/ex09/policy_evaluation_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..290d5ab43f92e519b08e9adbd01916b4f70cd81b --- /dev/null +++ b/solutions/ex09/policy_evaluation_TODO_1.py @@ -0,0 +1,2 @@ + q = value_function2q_function(mdp, s, gamma, v) + v_, v[s] = v[s], sum( [q[a] * pi_a for a,pi_a in pi[s].items()] ) \ No newline at end of file diff --git a/solutions/ex09/policy_iteration_TODO_1.py b/solutions/ex09/policy_iteration_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..00c8a957fb3b6a2650aec38a6da1f8fe541ea5ee --- /dev/null +++ b/solutions/ex09/policy_iteration_TODO_1.py @@ -0,0 +1,6 @@ + for s in [mdp.nonterminal_states[i] for i in np.random.permutation(len(mdp.nonterminal_states))]: + old_a = pi[s] # The best action we would take under the current policy + Qs = value_function2q_function(mdp, s, gamma, V) + pi[s] = max(Qs, key=Qs.get) + if old_a != pi[s]: + policy_stable = False \ No newline at end of file diff --git a/solutions/ex09/value_iteration_TODO_1.py b/solutions/ex09/value_iteration_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..d07abe42531f4a01ecb6e6ef69b30abdaef4c0b1 --- /dev/null +++ b/solutions/ex09/value_iteration_TODO_1.py @@ -0,0 +1,2 @@ + v, V[s] = V[s], max(value_function2q_function(mdp, s, gamma, V).values()) if len(mdp.A(s)) > 0 else 0 + Delta = max(Delta, np.abs(v - V[s])) \ No newline at end of file diff --git a/solutions/ex09/value_iteration_TODO_2.py b/solutions/ex09/value_iteration_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..89339fefc3a2ea8871c75d63d26586b2e07f0f1b --- /dev/null +++ 
b/solutions/ex09/value_iteration_TODO_2.py @@ -0,0 +1,2 @@ + Q = {a: v-(1e-8*a if isinstance(a, int) else 0) for a,v in value_function2q_function(mdp, s, gamma, V).items()} + pi[s] = max(Q, key=Q.get) \ No newline at end of file diff --git a/solutions/ex09/value_iteration_agent_TODO_1.py b/solutions/ex09/value_iteration_agent_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..49090726ea7151cfdb717088aa6b371b78baee41 --- /dev/null +++ b/solutions/ex09/value_iteration_agent_TODO_1.py @@ -0,0 +1 @@ + self.policy, self.v = value_iteration(mdp, gamma=gamma, **kwargs) \ No newline at end of file diff --git a/solutions/ex09/value_iteration_agent_TODO_2.py b/solutions/ex09/value_iteration_agent_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..5a41f1466ec31a8bbec8921330340cf1693f5de2 --- /dev/null +++ b/solutions/ex09/value_iteration_agent_TODO_2.py @@ -0,0 +1 @@ + action = self.policy[s] \ No newline at end of file diff --git a/solutions/ex10/mc_agent_TODO_1.py b/solutions/ex10/mc_agent_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..ae75aadcb9b72b66929a305601827d0ffe5c351b --- /dev/null +++ b/solutions/ex10/mc_agent_TODO_1.py @@ -0,0 +1,2 @@ + G = gamma * G + episode[t][2] + sa_t = episode[t][:2] \ No newline at end of file diff --git a/solutions/ex10/mc_agent_TODO_2.py b/solutions/ex10/mc_agent_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..0c62b909b68e8d986512f7a887cee22389c69203 --- /dev/null +++ b/solutions/ex10/mc_agent_TODO_2.py @@ -0,0 +1 @@ + returns.append( sa_t + (G,) ) \ No newline at end of file diff --git a/solutions/ex10/mc_agent_TODO_3.py b/solutions/ex10/mc_agent_TODO_3.py new file mode 100644 index 0000000000000000000000000000000000000000..dd67432e760d24ed7c8838b56ace568982d756bb --- /dev/null +++ b/solutions/ex10/mc_agent_TODO_3.py @@ -0,0 +1 @@ + return self.pi_eps(s, info) \ No newline at end of file diff --git 
a/solutions/ex10/mc_agent_TODO_4.py b/solutions/ex10/mc_agent_TODO_4.py new file mode 100644 index 0000000000000000000000000000000000000000..a910769dd5d99550a32fc31a3761aebf3a679d4c --- /dev/null +++ b/solutions/ex10/mc_agent_TODO_4.py @@ -0,0 +1,12 @@ + self.episode.append((s, a, r)) + if done: + returns = get_MC_return_SA(self.episode, self.gamma, self.first_visit) + for s, a, G in returns: + # s,a = sa + if self.alpha is None: + self.returns_sum[s, a] += G + self.returns_count[s, a] += 1 + self.Q[s, a] = self.returns_sum[s, a] / self.returns_count[s, a] + else: + self.Q[s, a] += self.alpha * (G - self.Q[s, a]) + self.episode = [] \ No newline at end of file diff --git a/solutions/ex10/mc_agent_blackjack_TODO_1.py b/solutions/ex10/mc_agent_blackjack_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..7d71b63bbe2107ab467f6ceeaf4e77dd46aee1cc --- /dev/null +++ b/solutions/ex10/mc_agent_blackjack_TODO_1.py @@ -0,0 +1 @@ + train(env, agent, expn, num_episodes=episodes, return_trajectory=False) \ No newline at end of file diff --git a/solutions/ex10/mc_evaluate_TODO_1.py b/solutions/ex10/mc_evaluate_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..a4192abc18fc36355ac9e6361da9325d2df65cfb --- /dev/null +++ b/solutions/ex10/mc_evaluate_TODO_1.py @@ -0,0 +1,2 @@ + G = gamma * G + episode[t][2] + s_t = episode[t][0] \ No newline at end of file diff --git a/solutions/ex10/mc_evaluate_TODO_2.py b/solutions/ex10/mc_evaluate_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..d653590006dd94c813935fbb5dc94a6b7012ffa6 --- /dev/null +++ b/solutions/ex10/mc_evaluate_TODO_2.py @@ -0,0 +1 @@ + returns.append((s_t, G)) \ No newline at end of file diff --git a/solutions/ex10/mc_evaluate_TODO_3.py b/solutions/ex10/mc_evaluate_TODO_3.py new file mode 100644 index 0000000000000000000000000000000000000000..9f00d39a4727224001cd994508cfea57c463cc2a --- /dev/null +++ b/solutions/ex10/mc_evaluate_TODO_3.py @@ 
-0,0 +1 @@ + self.v[s] = self.v[s] + self.alpha * (G - self.v[s]) \ No newline at end of file diff --git a/solutions/ex10/mc_evaluate_TODO_4.py b/solutions/ex10/mc_evaluate_TODO_4.py new file mode 100644 index 0000000000000000000000000000000000000000..587cd32680705be5b8039bfc14b5dbf42312e690 --- /dev/null +++ b/solutions/ex10/mc_evaluate_TODO_4.py @@ -0,0 +1,3 @@ + self.returns_sum_S[s] += G + self.returns_count_N[s] += 1.0 + self.v[s] = self.returns_sum_S[s] / self.returns_count_N[s] \ No newline at end of file diff --git a/solutions/ex10/mc_evaluate_TODO_5.py b/solutions/ex10/mc_evaluate_TODO_5.py new file mode 100644 index 0000000000000000000000000000000000000000..851276319c82264a4f758e6d2f108ee28b25e841 --- /dev/null +++ b/solutions/ex10/mc_evaluate_TODO_5.py @@ -0,0 +1 @@ + agent_every = MCEvaluationAgent(env, gamma=gamma, first_visit=False) \ No newline at end of file diff --git a/solutions/ex10/mc_evaluate_TODO_6.py b/solutions/ex10/mc_evaluate_TODO_6.py new file mode 100644 index 0000000000000000000000000000000000000000..67af451908b3dca96d9ceb318f1098c1e879cbad --- /dev/null +++ b/solutions/ex10/mc_evaluate_TODO_6.py @@ -0,0 +1 @@ + train(env, agent_every, num_episodes=episodes, verbose=False) \ No newline at end of file diff --git a/solutions/ex10/mc_evaluate_blackjack_TODO_1.py b/solutions/ex10/mc_evaluate_blackjack_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..ddcf6a0bb4c8617c85843d28abbdb017afebc130 --- /dev/null +++ b/solutions/ex10/mc_evaluate_blackjack_TODO_1.py @@ -0,0 +1 @@ + return 0 if s[0] >= 20 else 1 \ No newline at end of file diff --git a/solutions/ex10/mc_evaluate_blackjack_TODO_2.py b/solutions/ex10/mc_evaluate_blackjack_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..6918a6d557e57f850439242ab12979dd735959eb --- /dev/null +++ b/solutions/ex10/mc_evaluate_blackjack_TODO_2.py @@ -0,0 +1,2 @@ + agent = MCEvaluationAgent(env, policy=policy20, gamma=1) + train(env, agent, 
experiment_name=experiment, num_episodes=episodes) \ No newline at end of file diff --git a/solutions/ex10/question_td0_TODO_1.py b/solutions/ex10/question_td0_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..05e6e865fb5cc20f929632637b5be7fb6fb16024 --- /dev/null +++ b/solutions/ex10/question_td0_TODO_1.py @@ -0,0 +1,5 @@ + deltas = [] + for t, (s, r) in enumerate(zip(states[:-1], rewards)): + sp = states[t + 1] + delta = (r + gamma * v[sp]) - v[s] + deltas.append(delta) \ No newline at end of file diff --git a/solutions/ex10/question_td0_TODO_2.py b/solutions/ex10/question_td0_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..a32f983c69a6e90c69f19883548dd822722ef1ca --- /dev/null +++ b/solutions/ex10/question_td0_TODO_2.py @@ -0,0 +1,6 @@ + for t in range(len(rewards)): + s = states[t] + sp = states[t + 1] + r = rewards[t] + delta = r + gamma * v[sp] - v[s] + v[s] = v[s] + alpha * delta \ No newline at end of file diff --git a/solutions/ex10/question_td0_TODO_3.py b/solutions/ex10/question_td0_TODO_3.py new file mode 100644 index 0000000000000000000000000000000000000000..bceb6387836f6f75e597c150ad53acfbb18c0258 --- /dev/null +++ b/solutions/ex10/question_td0_TODO_3.py @@ -0,0 +1,4 @@ + deltas = a_compute_deltas(v, states, rewards, gamma) + for t in range(len(rewards)): + s = states[t] + v[s] = v[s] + alpha * deltas[t] \ No newline at end of file diff --git a/solutions/ex10/random_walk_example_TODO_1.py b/solutions/ex10/random_walk_example_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..e0f1b9919862220c6891bf28b3dff5bd0bd6ce84 --- /dev/null +++ b/solutions/ex10/random_walk_example_TODO_1.py @@ -0,0 +1 @@ + sp = s+(2*a-1) \ No newline at end of file diff --git a/solutions/ex10/td0_evaluate_TODO_1.py b/solutions/ex10/td0_evaluate_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..6953d59eb1b823953d481c6ad65c53898375b547 --- /dev/null +++ 
b/solutions/ex10/td0_evaluate_TODO_1.py @@ -0,0 +1,3 @@ + if isinstance(s, np.ndarray): + print("Bad type.") + self.v[s] += self.alpha * (r + self.gamma * (self.v[sp] if not done else 0) - self.v[s]) \ No newline at end of file diff --git a/solutions/ex11/nstep_sarsa_agent_TODO_1.py b/solutions/ex11/nstep_sarsa_agent_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..388556a4170a7877d0811962cb7c553de5e772c5 --- /dev/null +++ b/solutions/ex11/nstep_sarsa_agent_TODO_1.py @@ -0,0 +1,4 @@ + G = sum([self.gamma**(i-tau-1)*self.R[i%(n+1)] for i in range(tau+1, min(tau+n, T)+1)]) + S_tau_n, A_tau_n = self.S[(tau+n)%(n+1)], self.A[(tau+n)%(n+1)] + if tau+n < T: + G += self.gamma**n * self._q(S_tau_n, A_tau_n) \ No newline at end of file diff --git a/solutions/ex11/q_agent_TODO_1.py b/solutions/ex11/q_agent_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..15b029e16b421af014e30645ddf5bef6c15e2811 --- /dev/null +++ b/solutions/ex11/q_agent_TODO_1.py @@ -0,0 +1 @@ + action = self.pi_eps(s, info=info) \ No newline at end of file diff --git a/solutions/ex11/q_agent_TODO_2.py b/solutions/ex11/q_agent_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..b1109163b3163665c3627e5b1d617f4885084ef5 --- /dev/null +++ b/solutions/ex11/q_agent_TODO_2.py @@ -0,0 +1,3 @@ + if not done: + a_star = self.Q.get_optimal_action(sp, info_sp) + self.Q[s,a] += self.alpha * (r + self.gamma * (0 if done else self.Q[sp,a_star]) - self.Q[s,a]) \ No newline at end of file diff --git a/solutions/ex11/sarsa_agent_TODO_1.py b/solutions/ex11/sarsa_agent_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..5cabe83149ef61349da6e638cbe235ad8a87db09 --- /dev/null +++ b/solutions/ex11/sarsa_agent_TODO_1.py @@ -0,0 +1 @@ + return self.pi_eps(s, info) \ No newline at end of file diff --git a/solutions/ex11/sarsa_agent_TODO_2.py b/solutions/ex11/sarsa_agent_TODO_2.py new file mode 100644 index 
0000000000000000000000000000000000000000..3a8a80cead115cdff3822df88d52e9c6607d1cc1 --- /dev/null +++ b/solutions/ex11/sarsa_agent_TODO_2.py @@ -0,0 +1 @@ + return self.a \ No newline at end of file diff --git a/solutions/ex11/sarsa_agent_TODO_3.py b/solutions/ex11/sarsa_agent_TODO_3.py new file mode 100644 index 0000000000000000000000000000000000000000..5c6eb6088708097da5ca156e80c56cebd03667d5 --- /dev/null +++ b/solutions/ex11/sarsa_agent_TODO_3.py @@ -0,0 +1 @@ + self.a = self.pi_eps(sp, info_sp) if not done else -1 \ No newline at end of file diff --git a/solutions/ex11/sarsa_agent_TODO_4.py b/solutions/ex11/sarsa_agent_TODO_4.py new file mode 100644 index 0000000000000000000000000000000000000000..cecfe59aa108c55ef37449f0c180c8c70e023b97 --- /dev/null +++ b/solutions/ex11/sarsa_agent_TODO_4.py @@ -0,0 +1,2 @@ + delta = r + (self.gamma * self.Q[sp,self.a] if not done else 0) - self.Q[s,a] + self.Q[s,a] += self.alpha * delta \ No newline at end of file diff --git a/solutions/ex11/semi_grad_q_TODO_1.py b/solutions/ex11/semi_grad_q_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..1da3194c7198d9471a6bf6cb60566c7e940dffd2 --- /dev/null +++ b/solutions/ex11/semi_grad_q_TODO_1.py @@ -0,0 +1,4 @@ + if not done: + a_star = self.Q.get_optimal_action(sp, info_sp) + td_delta = r + (0 if done else self.gamma * self.Q(sp, a_star)) - self.Q(s, a) + self.Q.w += self.alpha * td_delta * self.Q.x(s, a) \ No newline at end of file diff --git a/solutions/ex11/semi_grad_sarsa_TODO_1.py b/solutions/ex11/semi_grad_sarsa_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..4cc4b80940abd706d44bc514ad07a6898f50b1b5 --- /dev/null +++ b/solutions/ex11/semi_grad_sarsa_TODO_1.py @@ -0,0 +1 @@ + action = self.a if k > 0 else super().pi(s, k, info) \ No newline at end of file diff --git a/solutions/ex11/semi_grad_sarsa_TODO_2.py b/solutions/ex11/semi_grad_sarsa_TODO_2.py new file mode 100644 index 
0000000000000000000000000000000000000000..0690b1e7194e11f607195a573ddb90eb24f76e2c --- /dev/null +++ b/solutions/ex11/semi_grad_sarsa_TODO_2.py @@ -0,0 +1,4 @@ + a_prime = super().pi(sp, k=0, info=info_sp) + delta = r + (0 if done else self.gamma * self.Q(sp, a_prime)) - self.Q(s, a) + self.Q.w += self.alpha * delta * self.Q.x(s,a) + self.a = a_prime \ No newline at end of file diff --git a/solutions/ex12/minigrid_wrappers_TODO_1.py b/solutions/ex12/minigrid_wrappers_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..8f7190396df4e0b97ee6bad8087da89ad4c0d48b --- /dev/null +++ b/solutions/ex12/minigrid_wrappers_TODO_1.py @@ -0,0 +1 @@ + box.high[:, :, i] = nbounds[i] \ No newline at end of file diff --git a/solutions/ex12/minigrid_wrappers_TODO_2.py b/solutions/ex12/minigrid_wrappers_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..1607b9a79357bbf3191133fd7cc928d5b38c173b --- /dev/null +++ b/solutions/ex12/minigrid_wrappers_TODO_2.py @@ -0,0 +1,2 @@ + box.high = box.high[:,:,dims] + box.low = box.low[:,:,dims] \ No newline at end of file diff --git a/solutions/ex12/minigrid_wrappers_TODO_3.py b/solutions/ex12/minigrid_wrappers_TODO_3.py new file mode 100644 index 0000000000000000000000000000000000000000..d967cd1e02c7e71cd86fa0a3a878e8e0e0570620 --- /dev/null +++ b/solutions/ex12/minigrid_wrappers_TODO_3.py @@ -0,0 +1 @@ + x = obs['image'][:, :, self.dims] \ No newline at end of file diff --git a/solutions/ex12/minigrid_wrappers_TODO_4.py b/solutions/ex12/minigrid_wrappers_TODO_4.py new file mode 100644 index 0000000000000000000000000000000000000000..bd9020da77bbe889da37788db22dda9c9b54397b --- /dev/null +++ b/solutions/ex12/minigrid_wrappers_TODO_4.py @@ -0,0 +1 @@ + return tuple( obs['image'].flat ) \ No newline at end of file diff --git a/solutions/ex12/mountain_car_TODO_1.py b/solutions/ex12/mountain_car_TODO_1.py new file mode 100644 index 
0000000000000000000000000000000000000000..baeb97c73560e295b43eeb03d120294e5ef51262 --- /dev/null +++ b/solutions/ex12/mountain_car_TODO_1.py @@ -0,0 +1,16 @@ + for i, alpha in enumerate(alphas): + n = n_steps[i] + agent = LinearSemiGradSarsaN(env, gamma=1, alpha=alpha / num_of_tilings, epsilon=0, n=n) + experiment = f"experiments/mountaincar_10-2_{agent}_{episodes}" + train(env, agent, experiment_name=experiment, num_episodes=episodes, max_runs=max_runs) + experiments.append(experiment) + + agent = LinearSemiGradSarsaLambda(env, gamma=1, alpha=alphas[1]/num_of_tilings, epsilon=0, lamb=0.9) + experiment = f"experiments/mountaincar_10-2_{agent}_{episodes}" + train(env, agent, experiment_name=experiment, num_episodes=episodes, max_runs=max_runs) + experiments.append(experiment) + + agent = LinearSemiGradQAgent(env, gamma=1, alpha=alphas[1] / num_of_tilings, epsilon=0) + experiment = f"experiments/mountaincar_10-2_{agent}_{episodes}" + train(env, agent, experiment_name=experiment, num_episodes=episodes, max_runs=max_runs) + experiments.append(experiment) \ No newline at end of file diff --git a/solutions/ex12/sarsa_lambda_agent_TODO_1.py b/solutions/ex12/sarsa_lambda_agent_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..5723174eefa11ed0b20f1ef766b2f8eb334bd999 --- /dev/null +++ b/solutions/ex12/sarsa_lambda_agent_TODO_1.py @@ -0,0 +1 @@ + a_prime = self.pi_eps(sp, info_sp) if not done else -1 \ No newline at end of file diff --git a/solutions/ex12/sarsa_lambda_agent_TODO_2.py b/solutions/ex12/sarsa_lambda_agent_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..13d29a2bd7ea80e075b6211ef83e2b065a7445b9 --- /dev/null +++ b/solutions/ex12/sarsa_lambda_agent_TODO_2.py @@ -0,0 +1 @@ + delta = r + self.gamma * (self.Q[sp,a_prime] if not done else 0) - self.Q[s,a] \ No newline at end of file diff --git a/solutions/ex12/sarsa_lambda_agent_TODO_3.py b/solutions/ex12/sarsa_lambda_agent_TODO_3.py new file mode 100644 
index 0000000000000000000000000000000000000000..5c85d462776cb6228079fc83452f7aef14fb165f --- /dev/null +++ b/solutions/ex12/sarsa_lambda_agent_TODO_3.py @@ -0,0 +1 @@ + self.e[(s,a)] += 1 \ No newline at end of file diff --git a/solutions/ex12/sarsa_lambda_agent_TODO_4.py b/solutions/ex12/sarsa_lambda_agent_TODO_4.py new file mode 100644 index 0000000000000000000000000000000000000000..e36f86331c3278f032c2857538b54180ec902b6e --- /dev/null +++ b/solutions/ex12/sarsa_lambda_agent_TODO_4.py @@ -0,0 +1,2 @@ + self.Q[s,a] += self.alpha * delta * ee + self.e[(s,a)] = self.gamma * self.lamb * ee \ No newline at end of file diff --git a/solutions/ex12/semi_grad_nstep_sarsa_TODO_1.py b/solutions/ex12/semi_grad_nstep_sarsa_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..290a33b00a5a0e236bf0975bf4cde9ca7fa5ae63 --- /dev/null +++ b/solutions/ex12/semi_grad_nstep_sarsa_TODO_1.py @@ -0,0 +1 @@ + return self.Q(s, a) \ No newline at end of file diff --git a/solutions/ex12/semi_grad_nstep_sarsa_TODO_2.py b/solutions/ex12/semi_grad_nstep_sarsa_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..89368e6624fe3eb464ea3da1b26e8bcdc1728828 --- /dev/null +++ b/solutions/ex12/semi_grad_nstep_sarsa_TODO_2.py @@ -0,0 +1 @@ + self.Q.w += self.alpha * delta * self.Q.x(s,a) # Update q(s,a)/weights given change in q-values: delta = [G-\hat{q}(..)] \ No newline at end of file diff --git a/solutions/ex12/semi_grad_sarsa_lambda_TODO_1.py b/solutions/ex12/semi_grad_sarsa_lambda_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..1fa61eb5c6da3a9948a225ba80d5ec5b9bd0ed67 --- /dev/null +++ b/solutions/ex12/semi_grad_sarsa_lambda_TODO_1.py @@ -0,0 +1,5 @@ + Q = self.Q.w @ self.x + Q_prime = self.Q.w @ x_prime if not done else None + delta = r + (self.gamma * Q_prime if not done else 0) - Q + self.z = self.gamma * self.lamb * self.z + (1-self.alpha * self.gamma * self.lamb *self.z @ self.x) * self.x + self.Q.w += 
self.alpha * (delta + Q - self.Q_old) * self.z - self.alpha * (Q-self.Q_old) * self.x \ No newline at end of file diff --git a/solutions/ex13/deepq_agent_TODO_1.py b/solutions/ex13/deepq_agent_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..824ede3c76517a24ed8ad2c1f9fcc2e4e3533032 --- /dev/null +++ b/solutions/ex13/deepq_agent_TODO_1.py @@ -0,0 +1,3 @@ + y = r[:,0] + self.gamma * np.max(self.Q(sp), axis=1) * (1-done) + target = self.Q(s) + target[range(len(a)), a] = y \ No newline at end of file diff --git a/solutions/ex13/double_deepq_agent_TODO_1.py b/solutions/ex13/double_deepq_agent_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..be47b430303aa44b57c4cb1dc57c456fb8800755 --- /dev/null +++ b/solutions/ex13/double_deepq_agent_TODO_1.py @@ -0,0 +1 @@ + self.target.update_Phi(self.Q, tau=self.tau) \ No newline at end of file diff --git a/solutions/ex13/double_deepq_agent_TODO_2.py b/solutions/ex13/double_deepq_agent_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..3edd701d8fb798e973f6536e5d329b1e7146c701 --- /dev/null +++ b/solutions/ex13/double_deepq_agent_TODO_2.py @@ -0,0 +1,5 @@ + sp[done, :] = 0 + astar = np.argmax(self.Q(sp), axis=1) * (1-np.asarray(done)) + y = r[:,0] + self.gamma * self.target(sp)[range(len(sp)), astar] * (1 - done) + target = self.Q(s) + target[range(len(a)), a] = y \ No newline at end of file diff --git a/solutions/ex13/dyna_q_TODO_1.py b/solutions/ex13/dyna_q_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..c7bd2b634d109ce7df7f938e9bd2aaede2a426e8 --- /dev/null +++ b/solutions/ex13/dyna_q_TODO_1.py @@ -0,0 +1 @@ + self.Q[s,a] += self.alpha * (r + (self.gamma * self.Q[sp, self.Q.get_optimal_action(sp, info_sp)] if not done else 0) - self.Q[s,a]) \ No newline at end of file diff --git a/solutions/ex13/dyna_q_TODO_2.py b/solutions/ex13/dyna_q_TODO_2.py new file mode 100644 index 
0000000000000000000000000000000000000000..46704a9b39b54c32d5b19dd1a63c100d8999418b --- /dev/null +++ b/solutions/ex13/dyna_q_TODO_2.py @@ -0,0 +1,2 @@ + s_, a_, r_, sp_,done_ = self.Model[np.random.randint(len(self.Model))] + self.q_update(s_,a_,r_,sp_,done_, info_s, info_sp) \ No newline at end of file diff --git a/solutions/ex13/dyna_q_TODO_3.py b/solutions/ex13/dyna_q_TODO_3.py new file mode 100644 index 0000000000000000000000000000000000000000..0495ae68405d295cb8fba062f4772b1b22e4f9c4 --- /dev/null +++ b/solutions/ex13/dyna_q_TODO_3.py @@ -0,0 +1 @@ + experiments = dyna_experiment(env, env_name='cliff',num_episodes=200,epsilon=epsilon, alpha=alpha, gamma=gamma, runs=4) \ No newline at end of file diff --git a/solutions/ex13/keras_networks_TODO_1.py b/solutions/ex13/keras_networks_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..3f6fcaa6d661bec8d69e2c5ac4f4f4eb27eb33c3 --- /dev/null +++ b/solutions/ex13/keras_networks_TODO_1.py @@ -0,0 +1,6 @@ + adv_dense = layers.Dense(hidden_size, activation='relu', kernel_initializer=init())(dense2) + adv_out = layers.Dense(num_actions, kernel_initializer=init())(adv_dense) + v_dense = layers.Dense(hidden_size, activation='relu', kernel_initializer=init())(dense2) + v_out = layers.Dense(1, kernel_initializer=init())(v_dense) + norm_adv = layers.Lambda(lambda x: x - tf.reduce_mean(x))(adv_out) + combine = layers.add([v_out, norm_adv]) \ No newline at end of file diff --git a/solutions/ex13/maximization_bias_environment_TODO_1.py b/solutions/ex13/maximization_bias_environment_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..3f2bdf72de8a6adf0315eb8d48acf88138a9864e --- /dev/null +++ b/solutions/ex13/maximization_bias_environment_TODO_1.py @@ -0,0 +1 @@ + return {(t, 0): 1} \ No newline at end of file diff --git a/solutions/ex13/maximization_bias_environment_TODO_2.py b/solutions/ex13/maximization_bias_environment_TODO_2.py new file mode 100644 index 
0000000000000000000000000000000000000000..ea4a34ee74e2c8300bf02cb098830cf0750ad0c5 --- /dev/null +++ b/solutions/ex13/maximization_bias_environment_TODO_2.py @@ -0,0 +1 @@ + return {(self.state_B, 0): 1} \ No newline at end of file diff --git a/solutions/ex13/tabular_double_q_TODO_1.py b/solutions/ex13/tabular_double_q_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..1320700514b4189dc8946e50617d8b7b9b3dc8ec --- /dev/null +++ b/solutions/ex13/tabular_double_q_TODO_1.py @@ -0,0 +1 @@ + return Agent.pi(self, s, k, info) if np.random.rand() < self.epsilon else a1[np.argmax(Q + np.random.rand(len(Q)) * 1e-8)] \ No newline at end of file diff --git a/solutions/ex13/tabular_double_q_TODO_2.py b/solutions/ex13/tabular_double_q_TODO_2.py new file mode 100644 index 0000000000000000000000000000000000000000..b9f397fcaa3473ba60895f1fa09dd3250c2bcc56 --- /dev/null +++ b/solutions/ex13/tabular_double_q_TODO_2.py @@ -0,0 +1,4 @@ + def train_(Q1,Q2, s, a, r, sp, done=False): + Q1[s,a] += self.alpha * (r + (self.gamma * Q2[sp,Q1.get_optimal_action(sp, info_sp)] if not done else 0) - Q1[s,a] ) + + train_(self.Q1, self.Q2, s, a, r, sp,done) if np.random.rand() < 0.5 else train_(self.Q2, self.Q1, s, a, r, sp,done) \ No newline at end of file diff --git a/solutions/ex13/torch_networks_TODO_1.py b/solutions/ex13/torch_networks_TODO_1.py new file mode 100644 index 0000000000000000000000000000000000000000..1ab89a8ad75f9c71bc6c82fd1c5f37e436812a12 --- /dev/null +++ b/solutions/ex13/torch_networks_TODO_1.py @@ -0,0 +1,4 @@ + s = Variable(torch.FloatTensor(s)) + x = self.feature(s) + advantage = self.advantage(x) + value = self.value(x) \ No newline at end of file