Skip to content
Snippets Groups Projects
Commit c16ac062 authored by tuhe
Browse files

Solutions for week 10+11

parent 72620f61
Branches
No related tags found
No related merge requests found
Showing
with 53 additions and 0 deletions
G = gamma * G + episode[t][2]
sa_t = episode[t][:2]
\ No newline at end of file
returns.append(sa_t + (G,) )
\ No newline at end of file
return self.pi_eps(s, info)
\ No newline at end of file
self.episode.append((s, a, r))
if done:
returns = get_MC_return_SA(self.episode, self.gamma, self.first_visit)
for s, a, G in returns:
# s,a = sa
if self.alpha is None:
self.returns_sum_S[s, a] += G
self.returns_count_N[s, a] += 1
self.Q[s, a] = self.returns_sum_S[s, a] / self.returns_count_N[s, a]
else:
self.Q[s, a] += self.alpha * (G - self.Q[s, a])
self.episode = []
\ No newline at end of file
train(env, agent, expn, num_episodes=episodes, return_trajectory=False)
\ No newline at end of file
G = gamma * G + episode[t][2]
s_t = episode[t][0]
\ No newline at end of file
returns.append((s_t, G))
\ No newline at end of file
self.v[s] = self.v[s] + self.alpha * (G - self.v[s])
\ No newline at end of file
self.returns_sum_S[s] += G
self.returns_count_N[s] += 1.0
self.v[s] = self.returns_sum_S[s] / self.returns_count_N[s]
\ No newline at end of file
agent_every = MCEvaluationAgent(env, gamma=gamma, first_visit=False)
\ No newline at end of file
train(env, agent_every, num_episodes=episodes, verbose=False)
\ No newline at end of file
return 0 if s[0] >= 20 else 1
\ No newline at end of file
agent = MCEvaluationAgent(env, policy=policy20, gamma=1)
train(env, agent, experiment_name=experiment, num_episodes=episodes)
\ No newline at end of file
deltas = []
for t, (s, r) in enumerate(zip(states[:-1], rewards)):
sp = states[t + 1]
delta = (r + gamma * v[sp]) - v[s]
deltas.append(delta)
\ No newline at end of file
for t in range(len(rewards)):
s = states[t]
sp = states[t + 1]
r = rewards[t]
delta = r + gamma * v[sp] - v[s]
v[s] = v[s] + alpha * delta
\ No newline at end of file
deltas = a_compute_deltas(v, states, rewards, gamma)
for t in range(len(rewards)):
s = states[t]
v[s] = v[s] + alpha * deltas[t]
\ No newline at end of file
sp = s+(2*a-1)
\ No newline at end of file
if isinstance(s, np.ndarray):
print("Bad type.")
self.v[s] += self.alpha * (r + self.gamma * (self.v[sp] if not done else 0) - self.v[s])
\ No newline at end of file
G = sum([self.gamma**(i-tau-1)*self.R[i%(n+1)] for i in range(tau+1, min(tau+n, T)+1)])
S_tau_n, A_tau_n = self.S[(tau+n)%(n+1)], self.A[(tau+n)%(n+1)]
if tau+n < T:
G += self.gamma**n * self._q(S_tau_n, A_tau_n)
\ No newline at end of file
action = self.pi_eps(s, info=info)
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment