Commit 2996dfb7 authored by Florian Gawrilowicz

Prob. 3 solved - Cartpole & InvertedPendulum!

parent 23adef71
@@ -5,13 +5,15 @@ Adapted for CS294-112 Fall 2018 by Michael Chang and Soroush Nasiriany
"""
import numpy as np
import tensorflow as tf
import roboschool
import gym
from hw2 import logz
import os
import time
import inspect
from multiprocessing import Process


# ============================================================================================#
# Utilities
# ============================================================================================#
@@ -19,7 +21,8 @@ from multiprocessing import Process
# ========================================================================================#
# ----------PROBLEM 2----------
# ========================================================================================#
def build_mlp(input_placeholder, output_size, scope='', n_layers=2, size=32, activation=tf.tanh,
              output_activation=None):
    """
        Builds a feedforward neural network
@@ -38,12 +41,27 @@ def build_mlp(input_placeholder, output_size, scope, n_layers, size, activation=
        Hint: use tf.layers.dense
    """
    # YOUR CODE HERE
    # raise NotImplementedError
    with tf.variable_scope(scope):
        x = input_placeholder
        for i in range(n_layers):
            x = tf.layers.dense(
                inputs=x,
                units=size,
                activation=activation
            )
        output_placeholder = tf.layers.dense(
            inputs=x,
            units=output_size,
            activation=output_activation
        )
    return output_placeholder
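# A quick shape check for build_mlp (illustrative sketch only; the placeholder name
# and sizes below are made up for this example):
# demo_ob = tf.placeholder(shape=[None, 4], dtype=tf.float32, name="demo_ob")
# demo_out = build_mlp(demo_ob, output_size=2, scope="demo", n_layers=2, size=32)
# assert demo_out.shape.as_list() == [None, 2]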
def pathlength(path):
    return len(path["reward"])


def setup_logger(logdir, locals_):
    # Configure output directory for logging
    logz.configure_output_dir(logdir)
@@ -52,6 +70,7 @@ def setup_logger(logdir, locals_):
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)


# ============================================================================================#
# Policy Gradient
# ============================================================================================#
@@ -95,17 +114,16 @@ class Agent(object):
            sy_ac_na: placeholder for actions
            sy_adv_n: placeholder for advantages
        """
        # raise NotImplementedError
        sy_ob_no = tf.placeholder(shape=[None, self.ob_dim], name="ob", dtype=tf.float32)
        if self.discrete:
            sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32)
        else:
            sy_ac_na = tf.placeholder(shape=[None, self.ac_dim], name="ac", dtype=tf.float32)
        # YOUR CODE HERE
        sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)
        return sy_ob_no, sy_ac_na, sy_adv_n

    # ========================================================================================#
    # ----------PROBLEM 2----------
    # ========================================================================================#
@@ -134,15 +152,18 @@ class Agent(object):
            Pass in self.n_layers for the 'n_layers' argument, and
            pass in self.size for the 'size' argument.
        """
        # raise NotImplementedError
        if self.discrete:
            # YOUR_CODE_HERE
            sy_logits_na = build_mlp(
                sy_ob_no, self.ac_dim, scope='RL', n_layers=self.n_layers, size=self.size)
            return sy_logits_na
        else:
            # YOUR_CODE_HERE
            sy_mean = build_mlp(
                sy_ob_no, self.ac_dim, scope='RL', n_layers=self.n_layers, size=self.size)
            # logstd should just be a trainable variable, not a network output.
            sy_logstd = tf.get_variable("logstd", shape=self.ac_dim, dtype=tf.float32)
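            # Because sy_logstd is a free trainable vector (one entry per action
            # dimension) rather than a network output, the Gaussian policy's
            # exploration noise is state-independent.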
            return (sy_mean, sy_logstd)

    # ========================================================================================#
@@ -172,15 +193,15 @@ class Agent(object):
            This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
        """
        # raise NotImplementedError
        if self.discrete:
            sy_logits_na = policy_parameters
            # YOUR_CODE_HERE
            sy_sampled_ac = tf.squeeze(tf.multinomial(sy_logits_na, 1), axis=1)
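            # tf.multinomial draws one action index per row of logits, returning shape
            # [batch_size, 1]; the squeeze drops the trailing sample dimension.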
        else:
            sy_mean, sy_logstd = policy_parameters
            # YOUR_CODE_HERE
            sy_sampled_ac = tf.random_normal(shape=tf.shape(sy_mean), mean=sy_mean, stddev=tf.exp(sy_logstd))
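            # Equivalent reparameterized form from the hint above (sketch, not used here):
            # sy_z = tf.random_normal(shape=tf.shape(sy_mean))
            # sy_sampled_ac = sy_mean + tf.exp(sy_logstd) * sy_z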
        return sy_sampled_ac

    # ========================================================================================#
@@ -209,15 +230,21 @@ class Agent(object):
            For the discrete case, use the log probability under a categorical distribution.
            For the continuous case, use the log probability under a multivariate gaussian.
        """
        # raise NotImplementedError
        if self.discrete:
            sy_logits_na = policy_parameters
            # YOUR_CODE_HERE
            # a = tf.nn.softmax(sy_logits_na, axis=1)
            # sy_logprob_n = tf.log(a[:, sy_ac_na])
            # sy_ac_na_oh = tf.one_hot(sy_ac_na, self.ac_dim)
            sy_logprob_n = -tf.nn.sparse_softmax_cross_entropy_with_logits(labels=sy_ac_na, logits=sy_logits_na)
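            # sparse_softmax_cross_entropy_with_logits returns -log pi(a_t | s_t) per step,
            # so negating it yields the log-probability of the action actually taken.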
        else:
            sy_mean, sy_logstd = policy_parameters
            # YOUR_CODE_HERE
            # import tensorflow_probability as tfp
            mvn = tf.contrib.distributions.MultivariateNormalDiag(loc=sy_mean, scale_diag=tf.exp(sy_logstd))
            sy_logprob_n = tf.log(mvn.prob(sy_ac_na))
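            # A numerically safer equivalent (sketch): the distribution object already
            # exposes a log-density, so the line above could also be written as
            # sy_logprob_n = mvn.log_prob(sy_ac_na)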
        return sy_logprob_n

    def build_computation_graph(self):
@@ -258,7 +285,7 @@ class Agent(object):
        # ----------PROBLEM 2----------
        # Loss Function and Training Operation
        # ========================================================================================#
        loss = -tf.reduce_mean(tf.multiply(self.sy_logprob_n, self.sy_adv_n))  # YOUR CODE HERE
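        # The loss is the negated mean of log pi(a_t | s_t) weighted by the advantage, so
        # minimizing it with the optimizer below performs gradient ascent on the
        # policy-gradient surrogate objective.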
        self.update_op = tf.train.AdamOptimizer(self.learning_rate).minimize(loss)

        # ========================================================================================#
@@ -306,8 +333,8 @@ class Agent(object):
            # ====================================================================================#
            # ----------PROBLEM 3----------
            # ====================================================================================#
            # raise NotImplementedError
            ac = self.sess.run(self.sy_sampled_ac, {self.sy_ob_no: [ob]})  # YOUR CODE HERE
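            # The observation is wrapped in a list so it feeds as a batch of size 1; the
            # returned ac therefore carries a leading batch dimension, hence ac[0] below.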
            ac = ac[0]
            acs.append(ac)
            ob, rew, done, _ = env.step(ac)
@@ -391,9 +418,27 @@ class Agent(object):
        """
        # YOUR_CODE_HERE
        if self.reward_to_go:
            # raise NotImplementedError
            q_n = []
            for re in re_n:
                sor = []
                for t in range(len(re)):
                    tot = 0
                    for t_p, r in enumerate(re[t:]):
                        tot += self.gamma ** t_p * r
                    sor.append(tot)
                q_n.append(sor)
            q_n = np.hstack(q_n)
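            # An equivalent O(T) formulation of the same reward-to-go sums
            # (sketch, plain numpy only):
            # disc = self.gamma ** np.arange(len(re))
            # rtg = np.cumsum((np.asarray(re) * disc)[::-1])[::-1] / disc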
        else:
            # raise NotImplementedError
            q_n = []
            for re in re_n:
                sor = 0
                for t_p, r in enumerate(re):
                    sor += self.gamma ** t_p * r
                q_n.append(np.array([sor] * len(re)))
            q_n = np.hstack(q_n)
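            # Equivalently (sketch): the full-trajectory return is the t = 0 entry of the
            # reward-to-go vector, repeated once per time step of the path.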
        print(q_n)
        return q_n

    def compute_advantage(self, ob_no, q_n):
@@ -460,8 +505,11 @@ class Agent(object):
        if self.normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1.
            # raise NotImplementedError
            adv_n -= np.mean(adv_n)
            std = np.std(adv_n)
            if np.isfinite(1./std):
                adv_n /= std  # YOUR_CODE_HERE
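            # A common alternative guard against a zero standard deviation (sketch):
            # adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)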
        return q_n, adv_n

    def update_parameters(self, ob_no, ac_na, q_n, adv_n):
@@ -512,7 +560,8 @@ class Agent(object):
        # and after an update, and then log them below.
        # YOUR_CODE_HERE
        # raise NotImplementedError
        self.sess.run(self.update_op, {self.sy_ob_no: ob_no, self.sy_ac_na: ac_na, self.sy_adv_n: adv_n})
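        # The comment above also suggests logging the loss before and after the update.
        # A sketch of that (assumes the loss tensor were kept around, e.g. as self.loss,
        # which this commit does not do):
        # feed = {self.sy_ob_no: ob_no, self.sy_ac_na: ac_na, self.sy_adv_n: adv_n}
        # loss_before = self.sess.run(self.loss, feed)
        # self.sess.run(self.update_op, feed)
        # loss_after = self.sess.run(self.loss, feed)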
def train_PG(
@@ -531,7 +580,6 @@ def train_PG(
        seed,
        n_layers,
        size):
    start = time.time()

    # ========================================================================================#
@@ -683,6 +731,7 @@ def main():
                n_layers=args.n_layers,
                size=args.size
            )

        # # Awkward hacky process runs, because Tensorflow does not like
        # # repeatedly calling train_PG in the same thread.
        p = Process(target=train_func, args=tuple())
@@ -695,5 +744,6 @@ def main():
    for p in processes:
        p.join()


if __name__ == "__main__":
    main()