DAgger is an extension of behavior_clone.py

04c078a2 · Florian Gawrilowicz · 82a3e357 · 04c078a2
Commit 04c078a2 authored Mar 27, 2019 by Florian Gawrilowicz
--- a/hw1/dagger.py
+++ b/hw1/dagger.py
+import roboschool
+import numpy as np
+import tensorflow as tf
+import pickle
+import os
+from hw1 import tf_util
+import gym
+
+from hw1 import roboschool_agents
+
+envname = 'RoboschoolAtlasForwardWalk-v1'
+envname = 'RoboschoolHumanoid-v1'
+
+with open(os.path.join('expert_data', envname + '.pkl'), 'rb') as f:
+    expert_data = pickle.load(f)
+X = expert_data['observations']
+expert_actions = expert_data['actions']
+
+x = tf.placeholder(tf.float32, shape=[None, expert_data['observations'].shape[1]])
+y_true = tf.placeholder(tf.float32, shape=[None, expert_data['actions'].shape[1]])
+
+hidden = tf.layers.Dense(units=128, activation=tf.nn.relu)
+model = tf.layers.Dense(units=expert_data['actions'].shape[1], use_bias=False)
+y_pred = model(hidden(x))
+
+loss = tf.losses.mean_squared_error(labels=y_true, predictions=y_pred)
+
+optimizer = tf.train.AdamOptimizer(0.001)
+train = optimizer.minimize(loss)
+
+epochs = 30
+training_epochs = 5
+batch_size = 256
+do_render = False
+with tf.Session() as sess:
+    tf_util.initialize()
+
+    env = gym.make(envname)
+    max_steps = 1000  # env.spec.timestep_limit
+    pi = roboschool_agents.load_policy(envname, env)
+
+    for e in range(epochs):
+        print(f'epochs {e}')
+        for _ in range(training_epochs):
+            idcs = np.random.permutation(X.shape[0])
+            for i in range(0, X.shape[0], batch_size):
+                _, loss_value = sess.run(
+                    (train, loss),
+                    feed_dict={x: X[idcs[i:i + batch_size], :], y_true: expert_actions[idcs[i:i + batch_size], :]})
+            print(loss_value)
+
+        # Play
+        observations = []
+        actions = []
+        for d in range(1):
+            obs = env.reset()
+            done = False
+            totalr = 0.
+            steps = 0
+            while not done:
+                action = sess.run(y_pred, feed_dict={x: obs[np.newaxis, :]})
+                expert_act = pi.act(obs, env)
+                observations.append(obs)
+                actions.append(expert_act)
+
+                obs, r, done, _ = env.step(np.squeeze(action))
+                totalr += r
+                steps += 1
+                if e == epochs - 1:
+                    env.render()
+                elif steps >= max_steps:
+                    break
+                if steps % 100 == 0:
+                    print("%i/%i: %f" % (steps, max_steps, totalr))
+                    totalr = 0.
+        X = np.concatenate((X, np.vstack(observations)), axis=0)
+        expert_actions = np.concatenate((expert_actions, np.vstack(actions)), axis=0)
+
+# print(sess.run(y_pred))
+# np.mean((expert_data['actions'] - np.mean(expert_data['actions'], axis=0)) ** 2)