Commit 21d35b4c authored by Florian Gawrilowicz

hw4 q1

parent a2be6607
@@ -22,6 +22,7 @@ class ModelBasedPolicy(object):
self._num_random_action_selection = num_random_action_selection
self._nn_layers = nn_layers
self._learning_rate = 1e-3
self._reuse = False
self._sess, self._state_ph, self._action_ph, self._next_state_ph, \
self._next_state_pred, self._loss, self._optimizer, self._best_action = self._setup_graph()
@@ -41,7 +42,10 @@ class ModelBasedPolicy(object):
"""
### PROBLEM 1
### YOUR CODE HERE
raise NotImplementedError
# raise NotImplementedError
# Placeholders for the current state, the action taken, and the observed next state.
state_ph = tf.placeholder(shape=[None, self._state_dim], name="ob", dtype=tf.float32)
action_ph = tf.placeholder(shape=[None, self._action_dim], name="ac", dtype=tf.float32)
next_state_ph = tf.placeholder(shape=[None, self._state_dim], name="next_ob", dtype=tf.float32)
return state_ph, action_ph, next_state_ph
@@ -65,8 +69,18 @@ class ModelBasedPolicy(object):
"""
### PROBLEM 1
### YOUR CODE HERE
raise NotImplementedError
# raise NotImplementedError
# Normalize inputs using the statistics of the initial dataset.
s = utils.normalize(state, self._init_dataset.state_mean, self._init_dataset.state_std)
a = utils.normalize(action, self._init_dataset.action_mean, self._init_dataset.action_std)
input_layer = tf.concat([s, a], axis=1)
# The MLP predicts the (normalized) change in state rather than the next state itself.
delta_pred_norm = utils.build_mlp(
    input_layer, self._state_dim, 'dynamics_func', n_layers=self._nn_layers, reuse=reuse)
delta_pred = utils.unnormalize(
    delta_pred_norm, self._init_dataset.delta_state_mean, self._init_dataset.delta_state_std)
next_state_pred = state + delta_pred
return next_state_pred
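Note: utils.normalize and utils.unnormalize are not part of this diff. For reference, such helpers are typically plain affine transforms over the dataset statistics, roughly along these lines (a hedged sketch, not the repo's actual utils.py; the eps guard is an assumption):

def normalize(x, mean, std, eps=1e-8):
    # Shift/scale to roughly zero mean and unit variance; eps guards zero-variance dimensions.
    # Works on NumPy arrays and, via broadcasting, on TF tensors such as the placeholders above.
    return (x - mean) / (std + eps)

def unnormalize(x, mean, std):
    # Approximate inverse of normalize (eps is ignored, which is fine when std >> eps).
    return x * std + mean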
def _setup_training(self, state_ph, next_state_ph, next_state_pred):
@@ -89,7 +103,12 @@ class ModelBasedPolicy(object):
"""
### PROBLEM 1
### YOUR CODE HERE
raise NotImplementedError
# raise NotImplementedError
# Regress the observed state change against the model's predicted change.
delta = next_state_ph - state_ph
delta_pred = next_state_pred - state_ph
loss = tf.losses.mean_squared_error(labels=delta, predictions=delta_pred)
optimizer = tf.train.AdamOptimizer(self._learning_rate).minimize(loss)
return loss, optimizer
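Side note: this commit regresses raw state deltas. A common variant is to regress normalized deltas so that every state dimension contributes comparably to the MSE; a minimal sketch of that alternative, reusing the dataset statistics already referenced in _dynamics_func (a sketch, not the author's code):

# Alternative (hedged): train on normalized deltas instead of raw deltas.
delta_norm = utils.normalize(next_state_ph - state_ph,
                             self._init_dataset.delta_state_mean, self._init_dataset.delta_state_std)
delta_pred_norm = utils.normalize(next_state_pred - state_ph,
                                  self._init_dataset.delta_state_mean, self._init_dataset.delta_state_std)
loss = tf.losses.mean_squared_error(labels=delta_norm, predictions=delta_pred_norm)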
@@ -136,7 +155,14 @@ class ModelBasedPolicy(object):
### PROBLEM 1
### YOUR CODE HERE
raise NotImplementedError
# raise NotImplementedError
tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
tf_config.gpu_options.allow_growth = True  # may be needed when running on a GPU
sess = tf.Session(config=tf_config)
state_ph, action_ph, next_state_ph = self._setup_placeholders()
next_state_pred = self._dynamics_func(state_ph, action_ph, self._reuse)
loss, optimizer = self._setup_training(state_ph, next_state_ph, next_state_pred)
### PROBLEM 2
### YOUR CODE HERE
best_action = None
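Problem 2 is still stubbed out here (best_action = None). The standard approach for this assignment is random-shooting MPC: sample num_random_action_selection candidate action sequences, roll each through the learned dynamics for the planning horizon, score them with the environment's cost function, and take the first action of the cheapest sequence. A hedged sketch follows; self._horizon, self._action_space_low, self._action_space_high, and self._cost_fn are assumed attributes that do not appear in this diff:

# Hedged sketch of random-shooting action selection (Problem 2).
# Assumes self._horizon, self._action_space_low/high (float bounds) and a batched
# cost function self._cost_fn(states, actions, next_states); none are shown in this diff.
N = self._num_random_action_selection
sim_states = tf.tile(state_ph, [N, 1])          # replicate the single current state N times
costs = tf.zeros([N])
first_actions = None
for t in range(self._horizon):
    actions = tf.random_uniform([N, self._action_dim],
                                minval=self._action_space_low,
                                maxval=self._action_space_high)
    if t == 0:
        first_actions = actions                 # remember each sequence's first action
    next_states = self._dynamics_func(sim_states, actions, reuse=True)
    costs += self._cost_fn(sim_states, actions, next_states)
    sim_states = next_states
best_action = first_actions[tf.argmin(costs)]   # first action of the lowest-cost sequence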
@@ -155,7 +181,10 @@ class ModelBasedPolicy(object):
"""
### PROBLEM 1
### YOUR CODE HERE
raise NotImplementedError
# raise NotImplementedError
loss, _ = self._sess.run(
[self._loss, self._optimizer],
feed_dict={self._state_ph: states, self._action_ph: actions, self._next_state_ph: next_states})
return loss
@@ -174,7 +203,10 @@ class ModelBasedPolicy(object):
### PROBLEM 1
### YOUR CODE HERE
raise NotImplementedError
# raise NotImplementedError
next_state_pred = self._sess.run(
self._next_state_pred,
feed_dict={self._state_ph: [state], self._action_ph: [action]})[0]
assert np.shape(next_state_pred) == (self._state_dim,)
return next_state_pred
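A quick way to sanity-check predict is a one-step comparison against a logged transition (variable names here are hypothetical):

# Hypothetical usage: one-step prediction error on a single logged transition.
pred = policy.predict(states[0], actions[0])
one_step_mse = np.mean((pred - next_states[0]) ** 2)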
@@ -85,7 +85,11 @@ class ModelBasedRL(object):
losses = []
### PROBLEM 1
### YOUR CODE HERE
raise NotImplementedError
# raise NotImplementedError
for _ in range(self._training_epochs):
    for states, actions, next_states, _, _ in dataset.random_iterator(self._training_batch_size):
        losses.append(self._policy.train_step(states, actions, next_states))
logger.record_tabular('TrainingLossStart', losses[0])
logger.record_tabular('TrainingLossFinal', losses[-1])
@@ -117,7 +121,8 @@ class ModelBasedRL(object):
logger.info('Training policy....')
### PROBLEM 1
### YOUR CODE HERE
raise NotImplementedError
# raise NotImplementedError
self._train_policy(self._random_dataset)
logger.info('Evaluating predictions...')
for r_num, (states, actions, _, _, _) in enumerate(self._random_dataset.rollout_iterator()):
@@ -125,7 +130,11 @@ class ModelBasedRL(object):
### PROBLEM 1
### YOUR CODE HERE
raise NotImplementedError
# raise NotImplementedError
# Open-loop rollout: feed the model's own predictions back in, starting from the true initial state.
s_pred = states[0]
for a in actions:
    s_pred = self._policy.predict(s_pred, a)
    pred_states.append(s_pred)
states = np.asarray(states)
pred_states = np.asarray(pred_states)