diff --git a/irlc/exam/exam2023spring/solution/exam2023spring_solutions.pdf b/irlc/exam/exam2023spring/solution/exam2023spring_solutions.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..bde9cb39f7147957f4b9637895de9b27de578613
Binary files /dev/null and b/irlc/exam/exam2023spring/solution/exam2023spring_solutions.pdf differ
diff --git a/irlc/exam/exam2023spring/solution/question_bandit.py b/irlc/exam/exam2023spring/solution/question_bandit.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6a852a9aefaca073ea302e38eab9c530c3001de
--- /dev/null
+++ b/irlc/exam/exam2023spring/solution/question_bandit.py
@@ -0,0 +1,35 @@
+import numpy as np
+
+def a_select_next_action_epsilon0(k : int, actions : list, rewards : list) -> int:
+    a = b_select_next_action(k, actions, rewards, epsilon=0)
+    return a
+
+def b_select_next_action(k : int, actions : list, rewards : list, epsilon : float) -> int:
+    N = {a: 0 for a in range(k)}
+    S = {a: 0 for a in range(k)}
+    for (a, r) in zip(actions, rewards):
+        S[a] += r
+        N[a] += 1
+    Q = {a: S[a] / N[a] if N[a] > 0 else 0 for a in range(k)}
+    if np.random.rand() < epsilon:
+        a = np.random.randint(k)
+    else:
+        a = max(Q, key=Q.get)
+    return a
+
+def c_nonstationary_Qs(k : int, actions : list, rewards : list, alpha : float) -> dict:
+    Q = {a: 0 for a in range(k)}
+    for (a, r) in zip(actions, rewards):
+        Q[a] = Q[a] + alpha * (r - Q[a])
+    return Q
+
+if __name__ == "__main__":
+    actions = [1, 0, 2, 1, 2, 4, 5, 4, 3, 2, 1, 1]
+    rewards = [1, 1, 1, 0, 1, 3, 2, 0, 4, 1, 1, 2]
+    k = 10
+
+    a_t = a_select_next_action_epsilon0(k, actions, rewards)
+    print(f"a) The next action is supposed to be 3, you computed {a_t}")
+    print(f"b) The action you computed was", b_select_next_action(k, actions, rewards, epsilon=0.3))
+    Q = c_nonstationary_Qs(k, actions, rewards, alpha=0.1)
+    print(f"c) The Q-value associated with arm a=2 is supposed to be Q(2) = 0.271, you got", Q[2])
\ No newline at end of file
diff --git a/irlc/exam/exam2023spring/solution/question_inventory.py b/irlc/exam/exam2023spring/solution/question_inventory.py
new file mode 100644
index 0000000000000000000000000000000000000000..82bea9e53187300aa68e205039dc98c19f915983
--- /dev/null
+++ b/irlc/exam/exam2023spring/solution/question_inventory.py
@@ -0,0 +1,50 @@
+from irlc.exam.exam2023spring.inventory import InventoryDPModel
+from irlc.exam.exam2023spring.dp import DP_stochastic
+import numpy as np
+
+class InventoryDPModelB(InventoryDPModel):
+
+    def __init__(self, N=3, c=0., prob_empty=False):
+        self.c = c
+        self.prob_empty = prob_empty
+        super().__init__(N=N)
+
+    def g(self, x, u, w, k): # Cost function g_k(x,u,w)
+        if self.prob_empty:
+            return 0
+        return u * self.c + np.abs(x + u - w)
+
+    def f(self, x, u, w, k): # Dynamics f_k(x,u,w)
+        return max(0, min(max(self.S(k)), x + u - w))
+
+    def Pw(self, x, u, k): # Distribution over random disturbances
+        pw = {0: .1, 1: .3, 2: .6}
+        return pw
+
+    def gN(self, x):
+        if self.prob_empty:
+            return -1 if x == 1 else 0
+        else:
+            return 0
+
+def a_get_policy(N: int, c: float, x0 : int) -> int:
+    model = InventoryDPModelB(N=N, c=c, prob_empty=False)
+    J, pi = DP_stochastic(model)
+    u = pi[0][x0]
+    return u
+
+def b_prob_one(N : int, x0 : int) -> float:
+    model = InventoryDPModelB(N=N, prob_empty=True)
+    J, pi = DP_stochastic(model)
+    pr_empty = -J[0][x0] # gN(x) = -1 when x == 1, so -J[0][x0] is the probability of ending with one item.
+    return pr_empty
+
+
+if __name__ == "__main__":
+    model = InventoryDPModel()
+    pi = [{s: 0 for s in model.S(k)} for k in range(model.N)]
+    x0 = 0
+    c = 0.5
+    N = 3
+    print(f"a) The policy choice for {c=} is {a_get_policy(N, c,x0)} should be 1")
+    print(f"b) The probability of ending up with a single element in the inventory is {b_prob_one(N, x0)} and should be 0.492")
\ No newline at end of file
diff --git a/irlc/exam/exam2023spring/solution/question_lqr.py b/irlc/exam/exam2023spring/solution/question_lqr.py
new file mode 100644
index 0000000000000000000000000000000000000000..b30e822421dfbebaaff36ba13a71dc5d2290c5d6
--- /dev/null
+++ b/irlc/exam/exam2023spring/solution/question_lqr.py
@@ -0,0 +1,50 @@
+from irlc.ex04.model_pendulum import PendulumModel
+from irlc.ex04.discrete_control_model import DiscreteControlModel
+from irlc.exam.exam2023spring.dlqr import LQR
+import numpy as np
+
+def getAB(a : float):
+    return np.asarray([[1,a], [0, 1]]), np.asarray([0, 1])[:,np.newaxis], np.asarray([1, 0])
+
+def a_LQR_solve(a : float, x0 : np.ndarray) -> float:
+    A,B,d = getAB(a)
+    Q = np.eye(2)
+    R = np.eye(1)
+    N = 100
+    (L, l), _ = LQR(A=[A]*N, B=[B]*N, d=[d] * N, Q=[Q]*N, R=[R]*N)
+    u = float( L[0] @ x0 + l[0])
+    return u
+
+def b_linearize(theta : float):
+    model = PendulumModel()
+    dmodel = DiscreteControlModel(model=model, dt=0.5)
+    xbar = np.asarray([theta, 0])
+    ubar = np.asarray([0])
+    xp = dmodel.f(xbar, ubar, k=0)
+    A, B = dmodel.f_jacobian(xbar, ubar, k=0)
+    d = xp - A @ xbar - B @ ubar
+    return A, B, d
+
+
+def c_get_optimal_linear_policy(x0 : np.ndarray) -> float:
+    x0 = np.asarray(x0)
+    # xstar = np.asarray([np.pi/2, 0])
+    Q = np.eye(2)
+    R = np.eye(1)
+    # q = -Q @ xstar
+    # q0 = 0.5 * q@Q @q
+    A, B, d = b_linearize(theta=0)
+    N = 100
+    (L, l), _ = LQR([A] * N, [B]*N, [d]*N, Q=[Q]*N, R=[R]*N)
+    u = float(L[0] @ x0 + l[0])
+    return u
+
+if __name__ == "__main__":
+    theta = np.pi/2 # An example: linearize around theta = pi/2.
+    a = 1
+    x0 = np.asarray([1, 0])
+    print(f"a) LQR action should be approximately -1.666, you got: {a_LQR_solve(a, x0)=}")
+    A, B, d = b_linearize(theta) # Get the three matrices.
+    print(f"b) Entry d[1] should be approx. 4.91, you got: {d[1]=}")
+    theta = 0.1 # Try a small initial angle.
+    print(f"c) Optimal policy for linearized problem should be approximately -1.07, you got: {c_get_optimal_linear_policy(x0=np.asarray([theta, 0]))=}")
\ No newline at end of file
diff --git a/irlc/exam/exam2024spring/solution/exam2024spring_solutions.pdf b/irlc/exam/exam2024spring/solution/exam2024spring_solutions.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..29c56afdae36cda9da3469d0b92df2a089c118c5
Binary files /dev/null and b/irlc/exam/exam2024spring/solution/exam2024spring_solutions.pdf differ
diff --git a/irlc/exam/exam2024spring/solution/question_bill_mdp.py b/irlc/exam/exam2024spring/solution/question_bill_mdp.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5347525a74180e10108064f4c5e19545b35b12a
--- /dev/null
+++ b/irlc/exam/exam2024spring/solution/question_bill_mdp.py
@@ -0,0 +1,57 @@
+from irlc.exam.exam2024spring.mdp import MDP
+from irlc.exam.exam2024spring.policy_evaluation import policy_evaluation
+from irlc.exam.exam2024spring.value_iteration import value_iteration
+
+class BigSpender(MDP):
+    def __init__(self, r_airbnb=0.01):
+        self.p_win = 0.45
+        self.r_airbnb = r_airbnb
+        super().__init__(initial_state=1) # s0 = 1 means we have an apartment.
+
+    def is_terminal(self, state):
+        return False
+
+    def A(self, s):
+        if s == 0: # if there is no apartment, there is nothing we can do
+            return [0]
+        if s == 1: # If we have an apartment, we can airbnb, a=0, or gamble, a=1.
+            return [0, 1]
+
+    def Psr(self, s, a):
+        if s == 0:
+            return {(0, 0): 1} # No apartment means p(s=0, r=0 | s,a) = 1.
+        if s == 1 and a == 1: # with apartment and gambling
+            return {(0, 0): 1-self.p_win, # p(s=0, r=0 | s,a=1) = 1-p_win
+                    (1, 2): self.p_win} # p(s=1, r=2 | s,a=1) = p_win
+        if s == 1 and a == 0: # with apartment and no gambling, p(s=1, r=r_airbnb | s,a) = 1.
+            return {(1, self.r_airbnb): 1}
+
+def a_always_airbnb(r_airbnb : float, gamma : float) -> float:
+    mdp = BigSpender(r_airbnb=r_airbnb)
+    pi = {0: {0: 1},
+          1: {0: 1, 1: 0}}
+    J = policy_evaluation(pi=pi, mdp=mdp, gamma=gamma)
+    r1 = mdp.r_airbnb * 1/(1-gamma) # n.b. this solution, which simply computes the return explicitly, is also legal.
+    r2 = J[1]
+    assert abs(r1 - r2) < 1e-3
+    v = r1
+    return v
+
+def b_random_decisions(r_airbnb : float, gamma : float) -> float:
+    mdp = BigSpender(r_airbnb=r_airbnb)
+    pi = {0: {0: 1}, 1: {0: 0.5, 1: 0.5}}
+    J = policy_evaluation(pi=pi, mdp=mdp, gamma=gamma)
+    v = J[1]
+    return v
+
+def c_is_it_better_to_gamble(r_airbnb : float, gamma : float) -> bool:
+    mdp = BigSpender(r_airbnb=r_airbnb)
+    pi, V = value_iteration(mdp, gamma)
+    better_to_gamble = pi[1] == 1
+    return better_to_gamble
+
+if __name__ == "__main__":
+    print("a) The expected return is approximately 1, your result:", a_always_airbnb(r_airbnb=0.01, gamma=0.99))
+    print("b) The expected return is approximately 1.612, your result:", b_random_decisions(r_airbnb=0.01, gamma=0.99))
+    print("c1) In this case, you should return False as it is better to AirBnB, your result:", c_is_it_better_to_gamble(r_airbnb=0.02, gamma=0.99))
+    print("c2) In this case, you should return True as it is better to gamble, your result:", c_is_it_better_to_gamble(r_airbnb=0.01, gamma=0.99))
\ No newline at end of file
diff --git a/irlc/exam/exam2024spring/solution/question_control.py b/irlc/exam/exam2024spring/solution/question_control.py
new file mode 100644
index 0000000000000000000000000000000000000000..08654bcb92c1261b6772f1dca20a089efc7471e6
--- /dev/null
+++ b/irlc/exam/exam2024spring/solution/question_control.py
@@ -0,0 +1,30 @@
+import numpy as np
+import sympy as sym
+from irlc.ex03.control_model import ControlModel
+from irlc.ex03.control_cost import SymbolicQRCost
+
+class Simulation(ControlModel):
+    def sym_f(self, x, u, t=None):
+        return [-sym.exp( u[0] -x[0]**2 )]
+
+    def get_cost(self): # The cost is only required to specify dimensions of x and u.
+        return SymbolicQRCost(Q=np.eye(1), R=np.eye(1))
+
+def a_xdot(x : float, a : float) -> float:
+    m = Simulation()
+    u = a * x**2 # This approach validates our implementation of the system. A manual implementation is just as good.
+    xd_ = -np.exp( u - x**2 )
+    xdot = m.f((x,), (u,), 0)[0]
+    assert xd_ == xdot
+    return xdot
+
+def b_rk4_simulate(u0 : float, tF : float):
+    x = 0
+    m = Simulation()
+    xs, us, ts, J_ = m.simulate((x,), u_fun=(u0,), t0=0, tF=tF)
+    xF = xs[-1][0]
+    return xF
+
+if __name__ == "__main__":
+    print(f"a): dx/dt should be -1, you got {a_xdot(x=2, a=1)=}")
+    print(f"b): Final position x(tF) should be approximately -2.09, you got {b_rk4_simulate(u0=2, tF=3)=}")
\ No newline at end of file
diff --git a/irlc/exam/exam2024spring/solution/question_inventory.py b/irlc/exam/exam2024spring/solution/question_inventory.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f42b3e1d420c08a4d034b71eb4a38e9c7a30135
--- /dev/null
+++ b/irlc/exam/exam2024spring/solution/question_inventory.py
@@ -0,0 +1,55 @@
+import math
+from irlc.exam.exam2024spring.inventory import InventoryDPModel
+from irlc.exam.exam2024spring.dp import DP_stochastic
+
+class InventoryDPModelGowns(InventoryDPModel):
+    action_sale = "sale"
+    def __init__(self, N=3, m=3, allow_sale=False):
+        self.m = m
+        self.allow_sale = allow_sale
+        super().__init__(N=N)
+
+    def A(self, x, k): # Action space A_k(x)
+        space = list(range(self.m))
+        if self.allow_sale:
+            space = space + [self.action_sale]
+            return space
+        else:
+            return space
+
+    def g(self, x, u, w, k): # Cost function g_k(x,u,w)
+        if u == self.action_sale:
+            return 3/4 * (self.m - w)
+        else:
+            return InventoryDPModel.g(self, x, u, w, k)
+
+    def f(self, x, u, w, k): # Dynamics f_k(x,u,w)
+        if u == self.action_sale:
+            return 0
+        else:
+            return InventoryDPModel.f(self, x, u, w, k) # max(0, min(self.m, x + u - w))
+
+    def Pw(self, x, u, k): # Distribution over random disturbances
+        pw = {w: 1/self.m for w in range(self.m)}
+        assert math.fabs(sum(pw.values()) - 1) < 1e-6
+        return pw
+
+def a_get_cost(N: int, m: int, x0 : int) -> float:
+    model = InventoryDPModelGowns(N=N, m=m, allow_sale=False)
+    J, pi = DP_stochastic(model)
+    expected_cost = J[0][x0]
+    return expected_cost
+
+def b_sale(N : int, m : int, x0 : int) -> float:
+    model = InventoryDPModelGowns(N=N, m=m, allow_sale=True)
+    J, pi = DP_stochastic(model)
+    expected_cost = J[0][x0]
+    return expected_cost
+
+
+if __name__ == "__main__":
+    x0 = 0
+    N = 6
+    m = 4
+    print(f"a) The expected cost should be 13.75, and you got {a_get_cost(N, m=m, x0=x0)=}")
+    print(f"b) Expected cost when the sales-option is available should be approximately 11.25, and you got {b_sale(N, m=m, x0=x0)=}")
\ No newline at end of file
diff --git a/irlc/exam/midterm2023a/solution/midterm2023a_solutions.pdf b/irlc/exam/midterm2023a/solution/midterm2023a_solutions.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..a18b0c9d9552211a509b3bca6b93d0b7ddbcaac1
Binary files /dev/null and b/irlc/exam/midterm2023a/solution/midterm2023a_solutions.pdf differ
diff --git a/irlc/exam/midterm2023a/solution/question_dp.py b/irlc/exam/midterm2023a/solution/question_dp.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4f58a77f54668fcb43a3aee2dfcf2a26b50a7f2
--- /dev/null
+++ b/irlc/exam/midterm2023a/solution/question_dp.py
@@ -0,0 +1,35 @@
+from irlc.exam.midterm2023a.inventory import InventoryDPModel
+
+def a_expected_items_next_day(x : int, u : int) -> float:
+    model = InventoryDPModel()
+    expected_number_of_items = None
+    k = 0
+    expected_number_of_items = sum([p * model.f(x, u, w, k=0) for w, p in model.Pw(x, u, k).items()])
+    return expected_number_of_items
+
+
+def b_evaluate_policy(pi : list, x0 : int) -> float:
+    model = InventoryDPModel()
+    N = model.N
+    J = [{} for _ in range(N + 1)]
+    J[N] = {x: model.gN(x) for x in model.S(model.N)}
+    for k in range(N - 1, -1, -1):
+        for x in model.S(k):
+            Qu = {u: sum(pw * (model.g(x, u, w, k) + J[k + 1][model.f(x, u, w, k)]) for w, pw in model.Pw(x, u, k).items()) for u
+                  in model.A(x, k)}
+
+            umin = pi[k][x] # min(Qu, key=Qu.get)
+            J[k][x] = Qu[umin] # Compute the expected cost function
+    J_pi_x0 = J[0][x0]
+    return J_pi_x0
+
+if __name__ == "__main__":
+    model = InventoryDPModel()
+    # Create a policy that always buys an item if the inventory is empty.
+    pi = [{s: 1 if s == 0 else 0 for s in model.S(k)} for k in range(model.N)]
+    x = 0
+    u = 1
+    x0 = 1
+    a_expected_items_next_day(x=0, u=1)
+    print(f"Given inventory is {x=} and we buy {u=}, the expected items on day k=1 is {a_expected_items_next_day(x, u)} and should be 0.1")
+    print(f"Evaluation of policy is {b_evaluate_policy(pi, x0)} and should be 2.7")
\ No newline at end of file
diff --git a/irlc/exam/midterm2023a/solution/question_pid.py b/irlc/exam/midterm2023a/solution/question_pid.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d27813c6d8fa6c61718d875affd443884600ef0
--- /dev/null
+++ b/irlc/exam/midterm2023a/solution/question_pid.py
@@ -0,0 +1,52 @@
+def pid(xs : list, xstar : float, Kp=0., Ki=0., Kd=0., stable=False):
+    us = []
+    e_prev = 0
+    es = []
+    I = 0
+    Delta = 1
+    for k, x in enumerate(xs):
+        e = xstar - x
+        es.append(e)
+
+        I = I + Delta * e
+
+        if k > 2 and stable:
+            d1 = (es[-1] - es[-2])/Delta
+            d2 = (es[-2] - es[-3]) / Delta
+
+            dterm = (d1+d2)/2
+        else:
+            dterm = (e-e_prev)/ Delta
+
+        u = Kp * e + Ki * I + Kd * dterm
+        e_prev = e
+        us.append(u)
+    return us[-1]
+
+def a_pid_Kp(xs : list, xstar : float, Kp : float) -> float:
+    u = pid(xs, xstar, Kp=Kp)
+    return u
+
+def b_pid_full(xs : list, xstar : float, Kp : float, Ki : float, Kd : float) -> float:
+    u = pid(xs, xstar, Kp=Kp, Ki=Ki, Kd=Kd)
+    return u
+
+def c_pid_stable(xs : list, xstar : float, Kp : float, Ki : float, Kd : float) -> float:
+    u = pid(xs, xstar, Kp=Kp, Ki=Ki, Kd=Kd, stable=True)
+    return u
+
+
+if __name__ == "__main__":
+    xs = [10, 8, 7, 5, 3, 1, 0, -2, -1, 0, 2] # Sequence of inputs x_k
+    Kp = 0.5
+    Ki = 0.05
+    Kd = 0.25
+    xstar = -1
+    u_a = a_pid_Kp(xs, xstar=0, Kp=Kp)
+    print(f"Testing part a. Got {u_a}, expected -1.")
+
+    u_b = b_pid_full(xs, xstar=-1, Kp=Kp, Ki=Ki, Kd=Kd)
+    print(f"Testing part b. Got {u_b}, expected -4.2")
+
+    u_c = c_pid_stable(xs, xstar=-1, Kp=Kp, Ki=Ki, Kd=Kd)
+    print(f"Testing part c. Got {u_c}, expected -4.075")
\ No newline at end of file
diff --git a/irlc/exam/midterm2023b/solution/midterm2023b_solutions.pdf b/irlc/exam/midterm2023b/solution/midterm2023b_solutions.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..904c65e5f6a47f280aea6c17029001b25a3867d4
Binary files /dev/null and b/irlc/exam/midterm2023b/solution/midterm2023b_solutions.pdf differ
diff --git a/irlc/exam/midterm2023b/solution/question_mdp.py b/irlc/exam/midterm2023b/solution/question_mdp.py
new file mode 100644
index 0000000000000000000000000000000000000000..b830f0021e6ac05255b1f8771ec2b038a013f612
--- /dev/null
+++ b/irlc/exam/midterm2023b/solution/question_mdp.py
@@ -0,0 +1,86 @@
+# from irlc.exam.midterm2023b.inventory import InventoryDPModel
+# from irlc.exam.midterm2023b.dp import DP_stochastic
+# from irlc.exam
+# import irlc
+import numpy as np
+from irlc.exam.midterm2023b.mdp import MDP
+
+class SmallGambler(MDP):
+    """
+    Implements a variant of the gambler problem. Please refer to the problem text for a description. You can consider this
+    implementation of the environment to be authoritative, and I do not recommend changing it.
+    """
+    def __init__(self):
+        goal = 40
+        super().__init__(initial_state=goal // 2)
+        self.goal = 40
+        self.p_heads = .4 # Chance of winning.
+
+    def is_terminal(self, state):
+        """ Environment has been modified to never terminate. """
+        return False
+
+    def A(self, s):
+        """ Action is the amount you choose to gamble.
+        You can gamble from 0 up to the amount of money you have (state),
+
+        If you are either in s = 0 or s = self.goal, you cannot gamble anything (A(s) = {0}). """
+        return range(0, min(s, self.goal - s) + 1)
+
+    def Psr(self, s, a):
+        """ Implement transition probabilities here.
+        The reward is 1 if s < self.goal and s + a == self.goal, and otherwise -a/100. Remember the format should
+        return a dictionary with entries:
+        > { (sp, r) : probability }
+        """
+        r = 1 if s + a == self.goal and s < self.goal else -a/100
+        if a == 0:
+            d = {(s + a, r): 1}
+        else:
+            d = {(s + a, r): self.p_heads, (s - a, 0): 1 - self.p_heads}
+        assert sum(d.values()) == 1 # Sanity check: the probabilities must sum to 1.
+        return d
+
+
+def a_get_reward(s : int, a : int) -> float:
+    mdp = SmallGambler()
+    avg_reward = 0
+    for (sp, r), p in mdp.Psr(s, a).items():
+        avg_reward += r * p
+    return avg_reward
+
+def b_get_best_immediate_action(s : int) -> int:
+    mdp = SmallGambler()
+    if s not in mdp.nonterminal_states:
+        return 0
+    d = {a: a_get_reward(s, a) for a in mdp.A(s)}
+    astar = max(d, key=d.get)
+    vs = [v for v in d.values() if np.abs(v - d[astar]) < 1e-6]
+    if len( vs )>1:
+        print(vs)
+        assert False
+    return astar
+
+def c_get_best_action_twosteps(s : int) -> int:
+    mdp = SmallGambler()
+    d = {}
+    for a in mdp.A(s):
+        d[a] = 0
+        for (sp, r), p in mdp.Psr(s,a).items():
+            d[a] += p * (r + a_get_reward(sp, b_get_best_immediate_action(sp)))
+
+    astar = max(d, key=d.get)
+    vs = [v for v in d.values() if np.abs(v-d[astar]) < 1e-6]
+    if len( vs )>1:
+        print(vs)
+        assert False
+    return astar
+
+if __name__ == "__main__":
+    mdp = SmallGambler()
+    s = 16
+    a = 26
+
+    print(f"When {s=} and {a=} the average reward is -0.104; your value is {a_get_reward(s,a)=}")
+    print(f"When {s=} the best immediate action is 0, your value is {b_get_best_immediate_action(s)=}")
+    print(f"When {s=} the best action over two steps is 4, your value is {c_get_best_action_twosteps(s)=}")
\ No newline at end of file
diff --git a/irlc/exam/midterm2023b/solution/question_td0.py b/irlc/exam/midterm2023b/solution/question_td0.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7d7ed9eacc22eb2a848a1488db671242b79d410
--- /dev/null
+++ b/irlc/exam/midterm2023b/solution/question_td0.py
@@ -0,0 +1,44 @@
+def a_compute_deltas(v: dict, states: list, rewards: list, gamma: float) -> list:
+    deltas = [] # !b;nolines
+    for t, (s, r) in enumerate(zip(states[:-1], rewards)):
+        sp = states[t + 1]
+        delta = (r + gamma * v[sp]) - v[s]
+        deltas.append(delta) # !b
+    return deltas
+
+
+def b_perform_td0(v: dict, states: list, rewards: list, gamma: float, alpha: float) -> dict:
+    for t in range(len(rewards)): # !b;nolines
+        s = states[t]
+        sp = states[t + 1]
+        r = rewards[t]
+        delta = r + gamma * v[sp] - v[s]
+        v[s] = v[s] + alpha * delta # !b
+    return v
+
+
+def c_perform_td0_batched(v: dict, states: list, rewards: list, gamma: float, alpha: float) -> dict:
+    deltas = a_compute_deltas(v, states, rewards, gamma) # !b;nolines
+    for t in range(len(rewards)):
+        s = states[t]
+        v[s] = v[s] + alpha * deltas[t] # !b
+    return v
+
+
+if __name__ == "__main__":
+    states = [1, 0, 2, -1, 2, 4, 5, 4, 3, 2, 1, -1]
+    rewards = [1, 0.5, -1, 0, 1, 2, 2, 0, 0, -1, 0.5]
+    # In the notation of the problem: T = len(rewards).
+    v = {s: 0 for s in states} # Initialize the value function v.
+    gamma = 0.9
+    alpha = 0.2
+
+    deltas = a_compute_deltas(v, states, rewards, gamma)
+    print(f"The first value of delta should be 1, your value is {deltas[0]=}")
+
+    v = b_perform_td0(v, states, rewards, gamma, alpha)
+    print(f"The value function v(s=1) should be 0.25352, your value is {v[1]=}")
+
+    v_batched = {s: 0 for s in states} # Initialize the value function anew
+    v_batched = c_perform_td0_batched(v_batched, states, rewards, gamma, alpha)
+    print(f"The batched value function in v(s=1) should be 0.3, your value is {v_batched[1]=}")
\ No newline at end of file