Commit 49151bb2 authored by Florian Gawrilowicz

rescue

parent 21d35b4c
@@ -3,13 +3,21 @@ import numpy as np
import tensorflow as tf
import pickle
import os
import tf_util
import gym
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('envname', type=str)
parser.add_argument('-r', '--render', action='store_true')
parser.add_argument("--max_timesteps", type=int, default=1000)
parser.add_argument("--epochs", type=int, default=10000)
parser.add_argument("--batch_size", type=int, default=256)
parser.add_argument("--lr", type=float, default=0.001)
parser.add_argument('--num_rollouts', type=int, default=20, help='Number of clone roll outs')
args = parser.parse_args()

with open(os.path.join('expert_data', args.envname + '.pkl'), 'rb') as f:
    expert_data = pickle.load(f)

x = tf.placeholder(tf.float32, shape=[None, expert_data['observations'].shape[1]])

@@ -21,54 +29,36 @@ y_pred = model(hidden(x))
loss = tf.losses.mean_squared_error(labels=y_true, predictions=y_pred)
optimizer = tf.train.AdamOptimizer(args.lr)
train = optimizer.minimize(loss)

batch_size = args.batch_size

with tf.Session() as sess:
    tf_util.initialize()
    # Behavioral cloning: fit the policy network to the recorded expert (observation, action) pairs.
    for e in range(args.epochs):
        for i in range(0, expert_data['observations'].shape[0], batch_size):
            _, loss_value = sess.run(
                (train, loss), feed_dict={x: expert_data['observations'][i:i + batch_size, :],
                                          y_true: expert_data['actions'][i:i + batch_size, :]})
        if e % 100 == 0:
            print(loss_value)

    # Roll out the cloned policy.
    if args.render:
        for i in range(args.num_rollouts):
            env = gym.make(args.envname)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = sess.run(y_pred, feed_dict={x: obs[np.newaxis, :]})
                action = np.squeeze(action)
                action[~np.isfinite(action)] = 0
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                env.render()
                if steps % 100 == 0:
                    print("%i/%i" % (steps, args.max_timesteps))
                if steps >= args.max_timesteps:
                    break
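The hunk header above references y_pred = model(hidden(x)), but the definitions of hidden, model, and y_true sit outside the shown lines. A minimal sketch of what they could look like, assuming a single-hidden-layer MLP built with tf.layers (the layer width and names are assumptions, not taken from the commit):

# Hypothetical reconstruction of the policy network, not part of the diff shown above.
obs_dim = expert_data['observations'].shape[1]
act_dim = expert_data['actions'].shape[1]
y_true = tf.placeholder(tf.float32, shape=[None, act_dim])   # expert actions
hidden = tf.layers.Dense(64, activation=tf.nn.relu)          # assumed hidden width of 64
model = tf.layers.Dense(act_dim)                              # linear read-out to action space
y_pred = model(hidden(x))                                     # as referenced in the hunk headers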
import roboschool
import numpy as np
import tensorflow as tf
import pickle
import os
import tf_util
import gym
import roboschool_agents
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('envname', type=str)
parser.add_argument('-r', '--render', action='store_true')
parser.add_argument("--max_timesteps", type=int, default=1000)
parser.add_argument("--epochs", type=int, default=10000)
parser.add_argument("--cloneing_epochs", type=int, default=1000)
parser.add_argument("--batch_size", type=int, default=256)
parser.add_argument("--lr", type=float, default=0.001)
parser.add_argument('--num_rollouts', type=int, default=20, help='Number of clone roll outs')
args = parser.parse_args()
envname = args.envname

with open(os.path.join('expert_data', envname + '.pkl'), 'rb') as f:
    expert_data = pickle.load(f)

@@ -25,34 +33,37 @@ y_pred = model(hidden(x))
loss = tf.losses.mean_squared_error(labels=y_true, predictions=y_pred)
optimizer = tf.train.AdamOptimizer(args.lr)
train = optimizer.minimize(loss)

epochs = args.epochs
cloneing_epochs = args.cloneing_epochs
batch_size = args.batch_size
do_render = args.render

with tf.Session() as sess:
    tf_util.initialize()
    env = gym.make(envname)
    max_steps = args.max_timesteps
    pi = roboschool_agents.load_policy(envname, env)

    for e in range(epochs):
        print(f'epochs {e}')
        # Fit
        for c in range(cloneing_epochs):
            idcs = np.random.permutation(X.shape[0])
            for i in range(0, X.shape[0], batch_size):
                _, loss_value = sess.run(
                    (train, loss),
                    feed_dict={x: X[idcs[i:i + batch_size], :], y_true: expert_actions[idcs[i:i + batch_size], :]})
            if c % 100 == 0:
                print(loss_value)

        # Ask expert
        observations = []
        actions = []
        for d in range(args.num_rollouts):
            obs = env.reset()
            done = False
            totalr = 0.
@@ -63,7 +74,9 @@ with tf.Session() as sess:
                observations.append(obs)
                actions.append(expert_act)
                action = np.squeeze(action)
                action[~np.isfinite(action)] = 0
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if e == epochs - 1:
@@ -76,5 +89,3 @@ with tf.Session() as sess:
        X = np.concatenate((X, np.vstack(observations)), axis=0)
        expert_actions = np.concatenate((expert_actions, np.vstack(actions)), axis=0)
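The DAgger-style loop above reads X and expert_actions, whose initialization is outside the shown hunks. A plausible, assumed way to seed the aggregated dataset is with the pre-recorded expert data; this is a guess for illustration, not code from the commit:

# Assumed seeding of the DAgger aggregation buffers from the recorded expert data.
X = expert_data['observations']           # all observations labelled by the expert so far
expert_actions = expert_data['actions']   # matching expert actions, one row per observation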
#!/bin/bash
set -eux
for e in RoboschoolHopper-v1 RoboschoolAnt-v1 RoboschoolHalfCheetah-v1 RoboschoolHumanoid-v1 RoboschoolWalker2d-v1 \
         RoboschoolAtlasForwardWalk-v1 RoboschoolPong-v1 RoboschoolInvertedPendulum-v1 RoboschoolInvertedPendulumSwingup-v1 \
         RoboschoolInvertedDoublePendulum-v1 RoboschoolReacher-v1
do
    python run_expert.py $e --render --num_rollouts=1
done
import pickle, tensorflow as tf, tf_util, numpy as np

def load_policy(filename):
    with open(filename, 'rb') as f:
@@ -39,8 +35,7 @@ def load_policy(filename):
    obsnorm_meansq = policy_params['obsnorm']['Standardizer']['meansq_1_D']
    obsnorm_stdev = np.sqrt(np.maximum(0, obsnorm_meansq - np.square(obsnorm_mean)))
    print('obs', obsnorm_mean.shape, obsnorm_stdev.shape)
    normedobs_bo = (obs_bo - obsnorm_mean) / (obsnorm_stdev + 1e-6)  # 1e-6 constant from Standardizer class in nn.py:409 in openai/imitation
    curr_activations_bd = normedobs_bo
...
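The normalization above recovers a per-dimension standard deviation from the standardizer's running mean and mean-of-squares. A small numpy check of that identity (purely illustrative, not part of the repository):

import numpy as np

obs = np.random.randn(1000, 4) * 3.0 + 1.0          # fake batch of observations
mean = obs.mean(axis=0, keepdims=True)               # E[x]
meansq = (obs ** 2).mean(axis=0, keepdims=True)      # E[x^2]
stdev = np.sqrt(np.maximum(0, meansq - mean ** 2))   # sqrt(E[x^2] - E[x]^2) == per-dimension std
normed = (obs - mean) / (stdev + 1e-6)               # same 1e-6 guard as in load_policy above
print(np.allclose(stdev, obs.std(axis=0)))           # True up to floating-point error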
gym
roboschool
tensorflow
numpy
seaborn
from agent_zoo import RoboschoolAnt_v1_2017jul
from agent_zoo import RoboschoolAtlasForwardWalk_v1_2017jul
from agent_zoo import RoboschoolHumanoid_v1_2017jul
from agent_zoo import RoboschoolHopper_v1_2017jul
from agent_zoo import RoboschoolHalfCheetah_v1_2017jul
from agent_zoo import RoboschoolReacher_v0_2017may
from agent_zoo import RoboschoolWalker2d_v1_2017jul
from agent_zoo import RoboschoolPong_v0_2017may2
from agent_zoo import RoboschoolInvertedPendulum_v0_2017may
from agent_zoo import RoboschoolInvertedPendulumSwingup_v0_2017may
from agent_zoo import RoboschoolInvertedDoublePendulum_v0_2017may

def load_policy(env_name, env):
    pi = None
    model_name = "mymodel1"
    if 'Ant' in env_name:
        pi = RoboschoolAnt_v1_2017jul.ZooPolicyTensorflow(
            model_name, env.observation_space, env.action_space)
    if 'Atlas' in env_name:
        pi = RoboschoolAtlasForwardWalk_v1_2017jul.ZooPolicyTensorflow(
            model_name, env.observation_space, env.action_space)
    if 'Humanoid' in env_name:
        pi = RoboschoolHumanoid_v1_2017jul.ZooPolicyTensorflow(
            model_name, env.observation_space, env.action_space)
    if 'Hopper' in env_name:
        pi = RoboschoolHopper_v1_2017jul.ZooPolicyTensorflow(
            model_name, env.observation_space, env.action_space)
    if 'HalfCheetah' in env_name:
        pi = RoboschoolHalfCheetah_v1_2017jul.ZooPolicyTensorflow(
            model_name, env.observation_space, env.action_space)
    if 'Walker2d' in env_name:
        pi = RoboschoolWalker2d_v1_2017jul.ZooPolicyTensorflow(
            model_name, env.observation_space, env.action_space)
    if 'RoboschoolPong' in env_name:
        pi = RoboschoolPong_v0_2017may2.SmallReactivePolicy(
            env.observation_space, env.action_space)
    if 'RoboschoolInvertedPendulum-' in env_name:
        pi = RoboschoolInvertedPendulum_v0_2017may.SmallReactivePolicy(
            env.observation_space, env.action_space)
    if 'RoboschoolInvertedPendulumSwingup' in env_name:
        pi = RoboschoolInvertedPendulumSwingup_v0_2017may.SmallReactivePolicy(
            env.observation_space, env.action_space)
    if 'RoboschoolInvertedDoublePendulum' in env_name:
        pi = RoboschoolInvertedDoublePendulum_v0_2017may.SmallReactivePolicy(
            env.observation_space, env.action_space)
    if 'RoboschoolReacher' in env_name:
        pi = RoboschoolReacher_v0_2017may.SmallReactivePolicy(
            env.observation_space, env.action_space)
    return pi
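A short usage example for load_policy, assuming roboschool and the agent_zoo checkpoints are importable on the path (the environment name here is arbitrary):

import gym
import roboschool              # registers the Roboschool environments
import roboschool_agents

env = gym.make('RoboschoolHopper-v1')
pi = roboschool_agents.load_policy('RoboschoolHopper-v1', env)
obs = env.reset()
action = pi.act(obs, env)      # zoo policies are queried with (observation, env), as in run_expert.py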
#!/bin/bash
set -eux
for e in RoboschoolHopper-v1 RoboschoolAnt-v1 RoboschoolHalfCheetah-v1 RoboschoolHumanoid-v1 RoboschoolWalker2d-v1 \
         RoboschoolAtlasForwardWalk-v1 RoboschoolPong-v1 RoboschoolInvertedPendulum-v1 RoboschoolInvertedPendulumSwingup-v1 \
         RoboschoolInvertedDoublePendulum-v1 RoboschoolReacher-v1
do
    python run_expert.py $e --num_rollouts=30
done
@@ -13,22 +13,18 @@ import os
import pickle
import tensorflow as tf
import numpy as np
import tf_util
import gym
import roboschool_agents


def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('envname', type=str)
    parser.add_argument('-r', '--render', action='store_true')
    parser.add_argument("--max_timesteps", type=int, default=1000)
    parser.add_argument('--num_rollouts', type=int, default=20, help='Number of expert roll outs')
    args = parser.parse_args()

    env = gym.make(args.envname)
@@ -36,14 +32,8 @@ def main():
    with tf.Session():
        tf_util.initialize()

        print('loading and building expert policy for: {}'.format(args.envname))
        pi = roboschool_agents.load_policy(args.envname, env)

        returns = []
        observations = []
@@ -55,10 +45,7 @@ def main():
            totalr = 0.
            steps = 0
            while not done:
                action = pi.act(obs, env)
                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = env.step(action)
@@ -66,13 +53,12 @@
                steps += 1
                if args.render:
                    env.render()
                if steps % 100 == 0: print("%i/%i" % (steps, args.max_timesteps))
                if steps >= args.max_timesteps:
                    break
            if args.render:
                break
            returns.append(totalr)

        print('returns', returns)
        print('mean return', np.mean(returns))
...
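The hunks above do not show how the expert rollouts are written to disk, but the cloning scripts expect expert_data/<envname>.pkl containing 'observations' and 'actions' arrays. A sketch of a save step that would produce that format (assumed for illustration, not code from this commit):

# Assumed save step at the end of main(), after the rollout loop.
expert_data = {'observations': np.array(observations),
               'actions': np.array(actions)}
os.makedirs('expert_data', exist_ok=True)
with open(os.path.join('expert_data', args.envname + '.pkl'), 'wb') as f:
    pickle.dump(expert_data, f)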