Commit 06481f82 authored by Florian Gawrilowicz

adding NN baseline and optional normalization

parent ab00315e
@@ -13,10 +13,11 @@ import time
 import inspect
 from multiprocessing import Process
 
 # ============================================================================================#
 # Utilities
 # ============================================================================================#
+import utils
 
 # ========================================================================================#
 # ----------PROBLEM 2----------
@@ -94,6 +95,8 @@ class Agent(object):
         self.nn_baseline = estimate_return_args['nn_baseline']
         self.normalize_advantages = estimate_return_args['normalize_advantages']
+        self.hint_bl = estimate_return_args['hint_bl']
 
     def init_tf_sess(self):
         tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
         self.sess = tf.Session(config=tf_config)
@@ -296,16 +299,12 @@ class Agent(object):
         # neural network baseline. These will be used to fit the neural network baseline.
         # ========================================================================================#
         if self.nn_baseline:
-            raise NotImplementedError
-            self.baseline_prediction = tf.squeeze(build_mlp(
-                                    self.sy_ob_no,
-                                    1,
-                                    "nn_baseline",
-                                    n_layers=self.n_layers,
-                                    size=self.size))
+            # raise NotImplementedError
+            self.baseline_prediction = tf.squeeze(
+                build_mlp(self.sy_ob_no, 1, "nn_baseline", n_layers=self.n_layers, size=self.size))
             # YOUR_CODE_HERE
-            self.sy_target_n = None
-            baseline_loss = None
+            self.sy_target_n = tf.placeholder(shape=[None], name="target", dtype=tf.float32)
+            baseline_loss = tf.nn.l2_loss(self.sy_target_n - self.baseline_prediction)
             self.baseline_update_op = tf.train.AdamOptimizer(self.learning_rate).minimize(baseline_loss)
 
     def sample_trajectories(self, itr, env):
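For orientation, the pieces this hunk wires together (an observation placeholder, a small MLP producing one scalar value per state, an L2 regression loss against a target placeholder, and an Adam update op) fit together roughly as in the sketch below. It assumes TensorFlow 1.x; build_mlp here is only a stand-in for the homework helper of the same name, and ob_dim / learning_rate are placeholder values, not taken from this commit.

import tensorflow as tf

def build_mlp(input_placeholder, output_size, scope, n_layers=2, size=64,
              activation=tf.tanh, output_activation=None):
    # n_layers hidden layers followed by a linear output layer.
    with tf.variable_scope(scope):
        out = input_placeholder
        for _ in range(n_layers):
            out = tf.layers.dense(out, size, activation=activation)
        return tf.layers.dense(out, output_size, activation=output_activation)

ob_dim, learning_rate = 4, 5e-3  # placeholder hyperparameters
sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32)

# State-value baseline: one scalar prediction per observation (squeeze drops the size-1 axis).
baseline_prediction = tf.squeeze(build_mlp(sy_ob_no, 1, "nn_baseline"))

# Regression targets (normalized rewards-to-go) and an L2 fitting objective.
sy_target_n = tf.placeholder(shape=[None], name="target", dtype=tf.float32)
baseline_loss = tf.nn.l2_loss(sy_target_n - baseline_prediction)
baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize(baseline_loss)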
@@ -470,9 +469,11 @@ class Agent(object):
             # Hint #bl1: rescale the output from the nn_baseline to match the statistics
             # (mean and std) of the current batch of Q-values. (Goes with Hint
             # #bl2 in Agent.update_parameters.
-            raise NotImplementedError
-            b_n = None  # YOUR CODE HERE
-            adv_n = q_n - b_n
+            # raise NotImplementedError
+            b_n = self.sess.run(
+                self.baseline_prediction, feed_dict={self.sy_ob_no: ob_no}
+            )  # YOUR CODE HERE
+            adv_n = q_n - (utils.normalize(b_n, mean=np.mean(q_n), std=np.std(q_n)) if self.hint_bl else b_n)
         else:
             adv_n = q_n.copy()
         return adv_n
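The utils module imported at the top of the file is added by this commit but not shown here, so its normalize helper is an assumption. A plausible definition consistent with the call sites in this diff (standardize, then rescale to a requested mean and std) is:

import numpy as np

def normalize(x, mean=0.0, std=1.0, eps=1e-8):
    # Shift/scale x to zero mean and unit std, then rescale to the requested statistics.
    x = np.asarray(x, dtype=np.float64)
    return (x - x.mean()) / (x.std() + eps) * std + mean

With the defaults this yields zero-mean, unit-std values (used for the advantages and the baseline targets); passing the batch Q statistics, as in the line above, maps the baseline predictions back onto the scale of q_n per Hint #bl1.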
@@ -506,10 +507,7 @@ class Agent(object):
             # On the next line, implement a trick which is known empirically to reduce variance
             # in policy gradient methods: normalize adv_n to have mean zero and std=1.
             # raise NotImplementedError
-            adv_n -= np.mean(adv_n)
-            std = np.std(adv_n)
-            if np.isfinite(1./std):
-                adv_n /= std  # YOUR_CODE_HERE
+            adv_n = utils.normalize(adv_n)  # YOUR_CODE_HERE
         return q_n, adv_n
 
     def update_parameters(self, ob_no, ac_na, q_n, adv_n):
@@ -545,8 +543,9 @@ class Agent(object):
             # Agent.compute_advantage.)
             # YOUR_CODE_HERE
-            raise NotImplementedError
-            target_n = None
+            # raise NotImplementedError
+            target_n = utils.normalize(q_n) if self.hint_bl else q_n
+            self.sess.run(self.baseline_update_op, feed_dict={self.sy_ob_no: ob_no, self.sy_target_n: target_n})
 
         # ====================================================================================#
         # ----------PROBLEM 3----------
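Hints #bl1 and #bl2 are two halves of one convention: the baseline is fit to normalized targets here, so its raw predictions live on a zero-mean, unit-std scale and must be rescaled with the current batch's Q statistics back in compute_advantage. A toy NumPy round trip with made-up numbers illustrates the pairing:

import numpy as np

q_n = np.array([10.0, 20.0, 30.0])           # raw rewards-to-go for one batch
target_n = (q_n - q_n.mean()) / q_n.std()    # Hint #bl2: what the baseline is trained to predict
b_n_raw = target_n                           # pretend the baseline fit its targets perfectly
b_n = b_n_raw * q_n.std() + q_n.mean()       # Hint #bl1: rescale back to Q-value scale
adv_n = q_n - b_n
print(adv_n)                                 # ~[0. 0. 0.]: a perfect baseline cancels q_n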
@@ -579,7 +578,8 @@ def train_PG(
         nn_baseline,
         seed,
         n_layers,
-        size):
+        size,
+        hint_bl):
 
     start = time.time()
     # ========================================================================================#
@@ -632,6 +632,7 @@ def train_PG(
         'reward_to_go': reward_to_go,
         'nn_baseline': nn_baseline,
         'normalize_advantages': normalize_advantages,
+        'hint_bl': hint_bl
     }
 
     agent = Agent(computation_graph_args, sample_trajectory_args, estimate_return_args)
@@ -691,6 +692,7 @@ def main():
     parser.add_argument('--learning_rate', '-lr', type=float, default=5e-3)
     parser.add_argument('--reward_to_go', '-rtg', action='store_true')
     parser.add_argument('--dont_normalize_advantages', '-dna', action='store_true')
+    parser.add_argument('--hint_bl', '-hbl', action='store_true')
     parser.add_argument('--nn_baseline', '-bl', action='store_true')
     parser.add_argument('--seed', type=int, default=1)
     parser.add_argument('--n_experiments', '-e', type=int, default=1)
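The new flag only has an effect when the neural-network baseline is enabled. A hypothetical invocation (the script name, environment, and any other positional arguments are placeholders, not part of this commit):

python train_pg.py <env_name> -rtg --nn_baseline --hint_bl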
@@ -729,7 +731,8 @@ def main():
                 nn_baseline=args.nn_baseline,
                 seed=seed,
                 n_layers=args.n_layers,
-                size=args.size
+                size=args.size,
+                hint_bl=args.hint_bl
                 )
 
         # # Awkward hacky process runs, because Tensorflow does not like