Commit 2996dfb7 authored by Florian Gawrilowicz

Prob. 3 solved - Cartpole & InvertedPendulum!

parent 23adef71
@@ -5,13 +5,15 @@ Adapted for CS294-112 Fall 2018 by Michael Chang and Soroush Nasiriany
"""
import numpy as np
import tensorflow as tf
import roboschool
import gym
from hw2 import logz
import os
import time
import inspect
from multiprocessing import Process


# ============================================================================================#
# Utilities
# ============================================================================================#
@@ -19,7 +21,8 @@ from multiprocessing import Process
# ========================================================================================#
# ----------PROBLEM 2----------
# ========================================================================================#
def build_mlp(input_placeholder, output_size, scope='', n_layers=2, size=32, activation=tf.tanh,
              output_activation=None):
    """
        Builds a feedforward neural network
@@ -38,12 +41,27 @@ def build_mlp(input_placeholder, output_size, scope, n_layers, size, activation=
        Hint: use tf.layers.dense
    """
    # YOUR CODE HERE
    # raise NotImplementedError
    with tf.variable_scope(scope):
        x = input_placeholder
        for i in range(n_layers):
            x = tf.layers.dense(
                inputs=x,
                units=size,
                activation=activation
            )
        output_placeholder = tf.layers.dense(
            inputs=x,
            units=output_size,
            activation=output_activation
        )
    return output_placeholder
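# A quick shape check for build_mlp (illustrative sketch only; the placeholder name
# and sizes below are made up for this example):
# demo_ob = tf.placeholder(shape=[None, 4], dtype=tf.float32, name="demo_ob")
# demo_out = build_mlp(demo_ob, output_size=2, scope="demo", n_layers=2, size=32)
# assert demo_out.shape.as_list() == [None, 2]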
def pathlength(path):
    return len(path["reward"])


def setup_logger(logdir, locals_):
    # Configure output directory for logging
    logz.configure_output_dir(logdir)
@@ -52,6 +70,7 @@ def setup_logger(logdir, locals_):
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)


# ============================================================================================#
# Policy Gradient
# ============================================================================================#
@@ -95,17 +114,16 @@ class Agent(object):
            sy_ac_na: placeholder for actions
            sy_adv_n: placeholder for advantages
        """
        # raise NotImplementedError
        sy_ob_no = tf.placeholder(shape=[None, self.ob_dim], name="ob", dtype=tf.float32)
        if self.discrete:
            sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32)
        else:
            sy_ac_na = tf.placeholder(shape=[None, self.ac_dim], name="ac", dtype=tf.float32)
        # YOUR CODE HERE
        sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)
        return sy_ob_no, sy_ac_na, sy_adv_n

    # ========================================================================================#
    # ----------PROBLEM 2----------
    # ========================================================================================#
@@ -134,15 +152,18 @@ class Agent(object):
            Pass in self.n_layers for the 'n_layers' argument, and
            pass in self.size for the 'size' argument.
        """
        # raise NotImplementedError
        if self.discrete:
            # YOUR_CODE_HERE
            sy_logits_na = build_mlp(
                sy_ob_no, self.ac_dim, scope='RL', n_layers=self.n_layers, size=self.size)
            return sy_logits_na
        else:
            # YOUR_CODE_HERE
            sy_mean = build_mlp(
                sy_ob_no, self.ac_dim, scope='RL', n_layers=self.n_layers, size=self.size)
            # logstd should just be a trainable variable, not a network output.
            sy_logstd = tf.get_variable("logstd", shape=self.ac_dim, dtype=tf.float32)
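            # Because sy_logstd is a free trainable vector (one entry per action
            # dimension) rather than a network output, the Gaussian policy's
            # exploration noise is state-independent.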
            return (sy_mean, sy_logstd)

    # ========================================================================================#
@@ -172,15 +193,15 @@ class Agent(object):
            This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
        """
        # raise NotImplementedError
        if self.discrete:
            sy_logits_na = policy_parameters
            # YOUR_CODE_HERE
            sy_sampled_ac = tf.squeeze(tf.multinomial(sy_logits_na, 1), axis=1)
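            # tf.multinomial draws one action index per row of logits, returning shape
            # [batch_size, 1]; the squeeze drops the trailing sample dimension.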
        else:
            sy_mean, sy_logstd = policy_parameters
            # YOUR_CODE_HERE
            sy_sampled_ac = tf.random_normal(shape=tf.shape(sy_mean), mean=sy_mean, stddev=tf.exp(sy_logstd))
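            # Equivalent reparameterized form from the hint above (sketch, not used here):
            # sy_z = tf.random_normal(shape=tf.shape(sy_mean))
            # sy_sampled_ac = sy_mean + tf.exp(sy_logstd) * sy_z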
        return sy_sampled_ac

    # ========================================================================================#
@@ -209,15 +230,21 @@ class Agent(object):
            For the discrete case, use the log probability under a categorical distribution.
            For the continuous case, use the log probability under a multivariate gaussian.
        """
        # raise NotImplementedError
        if self.discrete:
            sy_logits_na = policy_parameters
            # YOUR_CODE_HERE
            # a = tf.nn.softmax(sy_logits_na, axis=1)
            # sy_logprob_n = tf.log(a[:, sy_ac_na])
            # sy_ac_na_oh = tf.one_hot(sy_ac_na, self.ac_dim)
            sy_logprob_n = -tf.nn.sparse_softmax_cross_entropy_with_logits(labels=sy_ac_na, logits=sy_logits_na)
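            # sparse_softmax_cross_entropy_with_logits returns -log pi(a_t | s_t) per step,
            # so negating it yields the log-probability of the action actually taken.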
        else:
            sy_mean, sy_logstd = policy_parameters
            # YOUR_CODE_HERE
            # import tensorflow_probability as tfp
            mvn = tf.contrib.distributions.MultivariateNormalDiag(loc=sy_mean, scale_diag=tf.exp(sy_logstd))
            sy_logprob_n = tf.log(mvn.prob(sy_ac_na))
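            # A numerically safer equivalent (sketch): the distribution object already
            # exposes a log-density, so the line above could also be written as
            # sy_logprob_n = mvn.log_prob(sy_ac_na)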
        return sy_logprob_n

    def build_computation_graph(self):
@@ -258,7 +285,7 @@ class Agent(object):
        # ----------PROBLEM 2----------
        # Loss Function and Training Operation
        # ========================================================================================#
        loss = -tf.reduce_mean(tf.multiply(self.sy_logprob_n, self.sy_adv_n))  # YOUR CODE HERE
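        # The loss is the negated mean of log pi(a_t | s_t) weighted by the advantage, so
        # minimizing it with the optimizer below performs gradient ascent on the
        # policy-gradient surrogate objective.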
        self.update_op = tf.train.AdamOptimizer(self.learning_rate).minimize(loss)

        # ========================================================================================#
@@ -306,8 +333,8 @@ class Agent(object):
            # ====================================================================================#
            # ----------PROBLEM 3----------
            # ====================================================================================#
            # raise NotImplementedError
            ac = self.sess.run(self.sy_sampled_ac, {self.sy_ob_no: [ob]})  # YOUR CODE HERE
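            # The observation is wrapped in a list so it feeds as a batch of size 1; the
            # returned ac therefore carries a leading batch dimension, hence ac[0] below.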
            ac = ac[0]
            acs.append(ac)
            ob, rew, done, _ = env.step(ac)
@@ -391,9 +418,27 @@ class Agent(object):
        """
        # YOUR_CODE_HERE
        if self.reward_to_go:
            # raise NotImplementedError
            q_n = []
            for re in re_n:
                sor = []
                for t in range(len(re)):
                    tot = 0
                    for t_p, r in enumerate(re[t:]):
                        tot += self.gamma ** t_p * r
                    sor.append(tot)
                q_n.append(sor)
            q_n = np.hstack(q_n)
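            # An equivalent O(T) formulation of the same reward-to-go sums
            # (sketch, plain numpy only):
            # disc = self.gamma ** np.arange(len(re))
            # rtg = np.cumsum((np.asarray(re) * disc)[::-1])[::-1] / disc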
        else:
            # raise NotImplementedError
            q_n = []
            for re in re_n:
                sor = 0
                for t_p, r in enumerate(re):
                    sor += self.gamma ** t_p * r
                q_n.append(np.array([sor] * len(re)))
            q_n = np.hstack(q_n)
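            # Equivalently (sketch): the full-trajectory return is the t = 0 entry of the
            # reward-to-go vector, repeated once per time step of the path.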
        print(q_n)
        return q_n

    def compute_advantage(self, ob_no, q_n):
@@ -460,8 +505,11 @@ class Agent(object):
        if self.normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1.
            # raise NotImplementedError
            adv_n -= np.mean(adv_n)
            std = np.std(adv_n)
            if np.isfinite(1./std):
                adv_n /= std  # YOUR_CODE_HERE
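            # A common alternative guard against a zero standard deviation (sketch):
            # adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)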
        return q_n, adv_n

    def update_parameters(self, ob_no, ac_na, q_n, adv_n):
@@ -512,7 +560,8 @@ class Agent(object):
        # and after an update, and then log them below.
        # YOUR_CODE_HERE
        # raise NotImplementedError
        self.sess.run(self.update_op, {self.sy_ob_no: ob_no, self.sy_ac_na: ac_na, self.sy_adv_n: adv_n})
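        # The comment above also suggests logging the loss before and after the update.
        # A sketch of that (assumes the loss tensor were kept around, e.g. as self.loss,
        # which this commit does not do):
        # feed = {self.sy_ob_no: ob_no, self.sy_ac_na: ac_na, self.sy_adv_n: adv_n}
        # loss_before = self.sess.run(self.loss, feed)
        # self.sess.run(self.update_op, feed)
        # loss_after = self.sess.run(self.loss, feed)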
def train_PG(
@@ -531,7 +580,6 @@ def train_PG(
        seed,
        n_layers,
        size):
    start = time.time()

    # ========================================================================================#
@@ -683,6 +731,7 @@ def main():
                n_layers=args.n_layers,
                size=args.size
            )

        # # Awkward hacky process runs, because Tensorflow does not like
        # # repeatedly calling train_PG in the same thread.
        p = Process(target=train_func, args=tuple())
@@ -695,5 +744,6 @@ def main():
    for p in processes:
        p.join()


if __name__ == "__main__":
    main()