flgw / AdvRL19 · Commit 2996dfb7
Authored Mar 27, 2019 by Florian Gawrilowicz

    Prob. 3 solved - Cartpole & InvertedPendulum!

Parent: 23adef71
1 changed file: hw2/train_pg_f18.py (+152 additions, −102 deletions)
@@ -5,13 +5,15 @@ Adapted for CS294-112 Fall 2018 by Michael Chang and Soroush Nasiriany
 """
 import numpy as np
 import tensorflow as tf
+import roboschool
 import gym
-import logz
+from hw2 import logz
 import os
 import time
 import inspect
 from multiprocessing import Process
 
+
 # ============================================================================================#
 # Utilities
 # ============================================================================================#
@@ -19,7 +21,8 @@ from multiprocessing import Process
 # ========================================================================================#
 # ----------PROBLEM 2----------
 # ========================================================================================#
-def build_mlp(input_placeholder, output_size, scope, n_layers, size, activation=tf.tanh, output_activation=None):
+def build_mlp(input_placeholder, output_size, scope='', n_layers=2, size=32,
+              activation=tf.tanh, output_activation=None):
     """
     Builds a feedforward neural network
@@ -38,12 +41,27 @@ def build_mlp(input_placeholder, output_size, scope, n_layers, size, activation=
         Hint: use tf.layers.dense
     """
     # YOUR CODE HERE
-    raise NotImplementedError
+    # raise NotImplementedError
+    with tf.variable_scope(scope):
+        x = input_placeholder
+        for i in range(n_layers):
+            x = tf.layers.dense(inputs=x, units=size, activation=activation)
+        output_placeholder = tf.layers.dense(inputs=x, units=output_size,
+                                             activation=output_activation)
     return output_placeholder
 
 
 def pathlength(path):
     return len(path["reward"])
 
 
 def setup_logger(logdir, locals_):
     # Configure output directory for logging
     logz.configure_output_dir(logdir)
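For a quick sanity check, here is a minimal sketch of how the completed build_mlp could be exercised. This is a hedged example, not part of the commit: it assumes TensorFlow 1.x (where tf.placeholder and tf.Session exist) and that build_mlp above is in scope; the 4-dim observations and 2 outputs merely mirror CartPole.

import numpy as np
import tensorflow as tf

obs_ph = tf.placeholder(shape=[None, 4], dtype=tf.float32)  # batch of observations
logits = build_mlp(obs_ph, output_size=2, scope='demo', n_layers=2, size=32)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    out = sess.run(logits, {obs_ph: np.zeros((3, 4), dtype=np.float32)})
    print(out.shape)  # (3, 2): one output vector per observation in the batch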
@@ -52,6 +70,7 @@ def setup_logger(logdir, locals_):
     params = {k: locals_[k] if k in locals_ else None for k in args}
     logz.save_params(params)
 
+
 # ============================================================================================#
 # Policy Gradient
 # ============================================================================================#
@@ -95,17 +114,16 @@ class Agent(object):
             sy_ac_na: placeholder for actions
             sy_adv_n: placeholder for advantages
         """
-        raise NotImplementedError
+        # raise NotImplementedError
         sy_ob_no = tf.placeholder(shape=[None, self.ob_dim], name="ob", dtype=tf.float32)
         if self.discrete:
             sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32)
         else:
             sy_ac_na = tf.placeholder(shape=[None, self.ac_dim], name="ac", dtype=tf.float32)
         # YOUR CODE HERE
-        sy_adv_n = None
+        sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)
         return sy_ob_no, sy_ac_na, sy_adv_n
 
     # ========================================================================================#
     # ----------PROBLEM 2----------
     # ========================================================================================#
@@ -134,15 +152,18 @@ class Agent(object):
                 Pass in self.n_layers for the 'n_layers' argument, and
                 pass in self.size for the 'size' argument.
         """
-        raise NotImplementedError
+        # raise NotImplementedError
         if self.discrete:
             # YOUR_CODE_HERE
-            sy_logits_na = None
+            sy_logits_na = build_mlp(
+                sy_ob_no, self.ac_dim, scope='RL', n_layers=self.n_layers, size=self.size)
             return sy_logits_na
         else:
             # YOUR_CODE_HERE
-            sy_mean = None
-            sy_logstd = None
+            sy_mean = build_mlp(
+                sy_ob_no, self.ac_dim, scope='RL', n_layers=self.n_layers, size=self.size)
+            # logstd should just be a trainable variable, not a network output.
+            sy_logstd = tf.get_variable("logstd", shape=self.ac_dim, dtype=tf.float32)
             return (sy_mean, sy_logstd)
 
     # ========================================================================================#
@@ -172,15 +193,15 @@ class Agent(object):
             This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
         """
-        raise NotImplementedError
+        # raise NotImplementedError
         if self.discrete:
             sy_logits_na = policy_parameters
             # YOUR_CODE_HERE
-            sy_sampled_ac = None
+            sy_sampled_ac = tf.squeeze(tf.multinomial(sy_logits_na, 1), axis=1)
         else:
             sy_mean, sy_logstd = policy_parameters
             # YOUR_CODE_HERE
-            sy_sampled_ac = None
+            sy_sampled_ac = tf.random_normal(shape=tf.shape(sy_mean), mean=sy_mean, stddev=tf.exp(sy_logstd))
         return sy_sampled_ac
 
     # ========================================================================================#
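The continuous branch samples with tf.random_normal directly; the reparameterization the docstring hint describes (draw z ~ N(0, I), then shift and scale) produces the same distribution. A sketch under the same TF 1.x assumptions, reusing sy_mean and sy_logstd from above:

# Equivalent reparameterized sampling (a sketch, not the committed code):
# z ~ N(0, I), then a = mean + exp(logstd) * z, elementwise per action dim.
z = tf.random_normal(shape=tf.shape(sy_mean))
sy_sampled_ac = sy_mean + tf.exp(sy_logstd) * z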
@@ -209,15 +230,21 @@ class Agent(object):
             For the discrete case, use the log probability under a categorical distribution.
             For the continuous case, use the log probability under a multivariate gaussian.
         """
-        raise NotImplementedError
+        # raise NotImplementedError
         if self.discrete:
             sy_logits_na = policy_parameters
             # YOUR_CODE_HERE
-            sy_logprob_n = None
+            # a = tf.nn.softmax(sy_logits_na, axis=1)
+            # sy_logprob_n = tf.log(a[:, sy_ac_na])
+            # sy_ac_na_oh = tf.one_hot(sy_ac_na, self.ac_dim)
+            sy_logprob_n = -tf.nn.sparse_softmax_cross_entropy_with_logits(
+                labels=sy_ac_na, logits=sy_logits_na)
         else:
             sy_mean, sy_logstd = policy_parameters
             # YOUR_CODE_HERE
-            sy_logprob_n = None
+            # import tensorflow_probability as tfp
+            mvn = tf.contrib.distributions.MultivariateNormalDiag(loc=sy_mean, scale_diag=tf.exp(sy_logstd))
+            sy_logprob_n = tf.log(mvn.prob(sy_ac_na))
         return sy_logprob_n
 
     def build_computation_graph(self):
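Two notes on this hunk. The discrete branch works because tf.nn.sparse_softmax_cross_entropy_with_logits returns -log softmax(logits)[label] per row, so negating it yields exactly the categorical log-probability. In the continuous branch, tf.log(mvn.prob(...)) can underflow to log(0) for high-dimensional actions; the same distribution object exposes log_prob, which computes the quantity in log space. A hedged alternative using the same tf.contrib.distributions API:

# Numerically safer variant of the continuous branch (a sketch):
mvn = tf.contrib.distributions.MultivariateNormalDiag(
    loc=sy_mean, scale_diag=tf.exp(sy_logstd))
sy_logprob_n = mvn.log_prob(sy_ac_na)  # same value as tf.log(mvn.prob(...)), without underflow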
@@ -258,7 +285,7 @@ class Agent(object):
         # ----------PROBLEM 2----------
         # Loss Function and Training Operation
         # ========================================================================================#
-        loss = None  # YOUR CODE HERE
+        loss = -tf.reduce_mean(tf.multiply(self.sy_logprob_n, self.sy_adv_n))  # YOUR CODE HERE
         self.update_op = tf.train.AdamOptimizer(self.learning_rate).minimize(loss)
 
         # ========================================================================================#
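This loss is the standard score-function (REINFORCE) surrogate. Minimizing

    loss(θ) = −(1/N) · Σ_{i,t} log π_θ(a_it | s_it) · A_it

ascends the policy-gradient estimate ∇_θ J(θ) ≈ (1/N) · Σ_{i,t} ∇_θ log π_θ(a_it | s_it) · A_it, which is why a single Adam step on this scalar performs a policy-gradient update.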
@@ -306,8 +333,8 @@ class Agent(object):
             # ====================================================================================#
             # ----------PROBLEM 3----------
             # ====================================================================================#
-            raise NotImplementedError
-            ac = None  # YOUR CODE HERE
+            # raise NotImplementedError
+            ac = self.sess.run(self.sy_sampled_ac, {self.sy_ob_no: [ob]})  # YOUR CODE HERE
             ac = ac[0]
             acs.append(ac)
             ob, rew, done, _ = env.step(ac)
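Feeding {self.sy_ob_no: [ob]} wraps the single observation in a batch of one, matching the [None, ob_dim] placeholder, and ac[0] then unwraps the sampled batch. The NumPy equivalence, for reference (illustrative shapes only):

# [ob] and ob[None] both add a leading batch axis to a single observation:
ob = np.zeros(4)
assert np.array([ob]).shape == ob[None].shape == (1, 4)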
@@ -391,9 +418,27 @@ class Agent(object):
         """
         # YOUR_CODE_HERE
         if self.reward_to_go:
-            raise NotImplementedError
+            # raise NotImplementedError
+            q_n = []
+            for re in re_n:
+                sor = []
+                for t in range(len(re)):
+                    tot = 0
+                    for t_p, r in enumerate(re[t:]):
+                        tot += self.gamma ** t_p * r
+                    sor.append(tot)
+                q_n.append(sor)
+            q_n = np.hstack(q_n)
         else:
-            raise NotImplementedError
+            # raise NotImplementedError
+            q_n = []
+            for re in re_n:
+                sor = 0
+                for t_p, r in enumerate(re):
+                    sor += self.gamma ** t_p * r
+                q_n.append(np.array([sor] * len(re)))
+            q_n = np.hstack(q_n)
+        print(q_n)
         return q_n
 
     def compute_advantage(self, ob_no, q_n):
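The committed reward-to-go loops recompute each suffix sum from scratch, which is O(T²) per trajectory (and the print(q_n) looks like leftover debugging output). A single backwards pass gives identical values in O(T). A sketch using only NumPy, already imported as np; discounted_rtg is an illustrative helper name, not part of the commit:

# O(T) reward-to-go via a backwards pass (a sketch):
def discounted_rtg(re, gamma):
    rtg = np.zeros(len(re))
    running = 0.0
    for t in reversed(range(len(re))):
        running = re[t] + gamma * running  # r_t + gamma * (reward-to-go from t+1)
        rtg[t] = running
    return rtg

# q_n = np.hstack([discounted_rtg(np.asarray(re), self.gamma) for re in re_n])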
@@ -460,8 +505,11 @@ class Agent(object):
         if self.normalize_advantages:
             # On the next line, implement a trick which is known empirically to reduce variance
             # in policy gradient methods: normalize adv_n to have mean zero and std=1.
-            raise NotImplementedError
-            adv_n = None  # YOUR_CODE_HERE
+            # raise NotImplementedError
+            adv_n -= np.mean(adv_n)
+            std = np.std(adv_n)
+            if np.isfinite(1. / std):
+                adv_n /= std  # YOUR_CODE_HERE
         return q_n, adv_n
 
     def update_parameters(self, ob_no, ac_na, q_n, adv_n):
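The np.isfinite(1. / std) guard skips the division when the advantages have zero spread. A common alternative, for comparison only, folds the guard into a small epsilon in the denominator:

# Equivalent normalization with an epsilon guard (a common variant, not the committed code):
adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)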
@@ -512,7 +560,8 @@ class Agent(object):
         # and after an update, and then log them below.
         # YOUR_CODE_HERE
-        raise NotImplementedError
+        # raise NotImplementedError
+        self.sess.run(self.update_op, {self.sy_ob_no: ob_no, self.sy_ac_na: ac_na,
+                                       self.sy_adv_n: adv_n})
 
 
 def train_PG(
@@ -531,7 +580,6 @@ def train_PG(
         seed,
         n_layers,
         size):
-
     start = time.time()
 
     # ========================================================================================#
@@ -683,6 +731,7 @@ def main():
                 n_layers=args.n_layers,
                 size=args.size
             )
+
         # # Awkward hacky process runs, because Tensorflow does not like
         # # repeatedly calling train_PG in the same thread.
         p = Process(target=train_func, args=tuple())
@@ -695,5 +744,6 @@ def main():
     for p in processes:
         p.join()
 
+
 if __name__ == "__main__":
     main()