# CS294-112 HW 5c: Meta-Learning
Dependencies:
* Python **3.5**
* Numpy version 1.14.5
* TensorFlow version 1.10.0
* MuJoCo version **1.50** and mujoco-py **1.50.1.56**
* OpenAI Gym version **0.10.5**
* seaborn
* Box2D==2.3.2
See the [HW5c PDF](http://rail.eecs.berkeley.edu/deeprlcourse/static/homeworks/hw5c.pdf) for further instructions.
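A quick way to confirm that the pinned versions are the ones actually importable in your environment is the minimal sketch below (it assumes the packages are already installed, and omits mujoco-py, whose availability depends on a local MuJoCo install):

```python
# Minimal version sanity check (a sketch; expected values are the pins listed above).
import numpy
import tensorflow
import gym

print(numpy.__version__)       # expect 1.14.5
print(tensorflow.__version__)  # expect 1.10.0
print(gym.__version__)         # expect 0.10.5
```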
import json
"""
Some simple logging functionality, inspired by rllab's logging.
Assumes that each diagnostic gets logged each iteration
Call logz.configure_output_dir() to start logging to a
tab-separated-values file (some_folder_name/log.txt)
To load the learning curves, you can do, for example
A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True)
A['EpRewMean']
"""
import os.path as osp, shutil, time, atexit, os, subprocess
import pickle
import tensorflow as tf
color2num = dict(
    gray=30,
    red=31,
    green=32,
    yellow=33,
    blue=34,
    magenta=35,
    cyan=36,
    white=37,
    crimson=38
)

def colorize(string, color, bold=False, highlight=False):
    attr = []
    num = color2num[color]
    if highlight: num += 10
    attr.append(str(num))
    if bold: attr.append('1')
    return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string)

class G:
    output_dir = None
    output_file = None
    first_row = True
    log_headers = []
    log_current_row = {}

def configure_output_dir(d=None):
    """
    Set output directory to d, or to /tmp/somerandomnumber if d is None
    """
    G.output_dir = d or "/tmp/experiments/%i"%int(time.time())
    assert not osp.exists(G.output_dir), "Log dir %s already exists! Delete it first or use a different dir"%G.output_dir
    os.makedirs(G.output_dir)
    G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w')
    atexit.register(G.output_file.close)
    print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True))

def log_tabular(key, val):
    """
    Log a value of some diagnostic
    Call this once for each diagnostic quantity, each iteration
    """
    if G.first_row:
        G.log_headers.append(key)
    else:
        assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key
    assert key not in G.log_current_row, "You already set %s this iteration. Maybe you forgot to call dump_tabular()"%key
    G.log_current_row[key] = val

def save_params(params):
    with open(osp.join(G.output_dir, "params.json"), 'w') as out:
        out.write(json.dumps(params, separators=(',\n','\t:\t'), sort_keys=True))

def pickle_tf_vars():
    """
    Saves tensorflow variables
    Requires them to be initialized first, also a default session must exist
    """
    _dict = {v.name : v.eval() for v in tf.global_variables()}
    with open(osp.join(G.output_dir, "vars.pkl"), 'wb') as f:
        pickle.dump(_dict, f)

def dump_tabular():
    """
    Write all of the diagnostics from the current iteration
    """
    vals = []
    key_lens = [len(key) for key in G.log_headers]
    max_key_len = max(15, max(key_lens))
    keystr = '%' + '%d' % max_key_len
    fmt = "| " + keystr + "s | %15s |"
    n_slashes = 22 + max_key_len
    print("-" * n_slashes)
    for key in G.log_headers:
        val = G.log_current_row.get(key, "")
        if hasattr(val, "__float__"): valstr = "%8.3g"%val
        else: valstr = val
        print(fmt%(key, valstr))
        vals.append(val)
    print("-" * n_slashes)
    if G.output_file is not None:
        if G.first_row:
            G.output_file.write("\t".join(G.log_headers))
            G.output_file.write("\n")
        G.output_file.write("\t".join(map(str, vals)))
        G.output_file.write("\n")
        G.output_file.flush()
    G.log_current_row.clear()
    G.first_row = False
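For reference, here is a minimal usage sketch of the logging module above. It assumes the file is saved as `logz.py` (the conventional name, not stated here) and that the chosen output directory does not already exist:

```python
import numpy as np
import logz  # the logging module above; the filename logz.py is an assumption

logz.configure_output_dir('/tmp/experiments/demo')
logz.save_params({'exp_name': 'demo', 'seed': 0})
for itr in range(3):
    logz.log_tabular('Iteration', itr)
    logz.log_tabular('AverageReturn', float(np.random.randn()))
    logz.dump_tabular()  # prints a table and appends one row to log.txt
```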
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import json
import os
"""
Using the plotter:
Call it from the command line and supply it with the logdirs of your experiments.
Suppose you ran an experiment named 'test' with 10 random seeds. The runner
code stored the results in the following directory structure:
data
  L test_EnvName_DateTime
    L 0
      L log.txt
      L params.json
    L 1
      L log.txt
      L params.json
    .
    .
    .
    L 9
      L log.txt
      L params.json
To plot learning curves from the experiment, averaged over all random
seeds, call
python plot.py data/test_EnvName_DateTime --value AverageReturn
and voila. To see a different statistic, change what you pass to the
--value flag. You can also enter /multiple/ values, and it will plot
all of them in order.
Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried
a different set of hyperparameters from 'test1', and now you would like
to compare them -- see their learning curves side-by-side. Just call
python plot.py data/test1 data/test2
and it will plot them both! They will be given titles in the legend according
to their exp_name parameters. If you want to use custom legend titles, use
the --legend flag and then provide a title for each logdir.
"""
def plot_data(data, value="AverageReturn"):
    if isinstance(data, list):
        data = pd.concat(data, ignore_index=True)
    sns.set(style="darkgrid", font_scale=1.5)
    sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition")
    plt.legend(loc='best').draggable()
    plt.show()
def get_datasets(fpath, condition=None):
    unit = 0
    datasets = []
    for root, _, files in os.walk(fpath):
        if 'log.txt' in files:
            # read the experiment name from params.json and close the file promptly
            with open(os.path.join(root, 'params.json')) as param_file:
                params = json.load(param_file)
            exp_name = params['exp_name']
            log_path = os.path.join(root, 'log.txt')
            experiment_data = pd.read_table(log_path)
            experiment_data.insert(
                len(experiment_data.columns),
                'Unit',
                unit
            )
            experiment_data.insert(
                len(experiment_data.columns),
                'Condition',
                condition or exp_name
            )
            datasets.append(experiment_data)
            unit += 1
    return datasets
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('logdir', nargs='*')
    parser.add_argument('--legend', nargs='*')
    parser.add_argument('--value', default='AverageReturn', nargs='*')
    args = parser.parse_args()

    use_legend = False
    if args.legend is not None:
        assert len(args.legend) == len(args.logdir), \
            "Must give a legend title for each set of experiments."
        use_legend = True

    data = []
    if use_legend:
        for logdir, legend_title in zip(args.logdir, args.legend):
            data += get_datasets(logdir, legend_title)
    else:
        for logdir in args.logdir:
            data += get_datasets(logdir)

    if isinstance(args.value, list):
        values = args.value
    else:
        values = [args.value]
    for value in values:
        plot_data(data, value=value)

if __name__ == "__main__":
    main()
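The same plotting can also be driven programmatically. The sketch below is the in-code equivalent of `python plot.py data/test1 data/test2 --legend run1 run2`; the directory names and legend titles are placeholders:

```python
# Assumes plot_data and get_datasets from the plotting module above are in scope.
data = []
for logdir, legend_title in zip(['data/test1', 'data/test2'], ['run1', 'run2']):
    data += get_datasets(logdir, legend_title)
plot_data(data, value='AverageReturn')
```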
import numpy as np
from gym import spaces
from gym import Env
class PointEnv(Env):
    """
    point mass on a 2-D plane
    the task is defined by a goal position, sampled by reset_task()
    """
    def __init__(self, num_tasks=1):
        self.reset_task()
        self.reset()
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(2,))
        self.action_space = spaces.Box(low=-0.1, high=0.1, shape=(2,))

    def reset_task(self, is_evaluation=False):
        '''
        sample a new task randomly
        Problem 3: make training and evaluation goals disjoint sets
        if `is_evaluation` is true, sample from the evaluation set,
        otherwise sample from the training set
        '''
        #====================================================================================#
        # ----------PROBLEM 3----------
        #====================================================================================#
        # YOUR CODE HERE
        x = np.random.uniform(-10, 10)
        y = np.random.uniform(-10, 10)
        self._goal = np.array([x, y])

    def get_all_task_idx(self):
        return [0]

    def reset(self):
        self._state = np.array([0, 0], dtype=np.float32)
        return self._get_obs()

    def _get_obs(self):
        return np.copy(self._state)

    def reward_function(self, x, y):
        return - (x ** 2 + y ** 2) ** 0.5

    def step(self, action):
        x, y = self._state
        # compute reward, add penalty for large actions instead of clipping them
        x -= self._goal[0]
        y -= self._goal[1]
        # check if task is complete
        done = abs(x) < .01 and abs(y) < .01
        reward = self.reward_function(x, y)
        # move to next state
        self._state = self._state + action
        ob = self._get_obs()
        return ob, reward, done, dict()

    def viewer_setup(self):
        print('no viewer')
        pass

    def render(self):
        print('current state:', self._state)
    def seed(self, seed):
        np.random.seed(seed)
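As a quick sanity check of the interface above, a random rollout in this environment could look like the following sketch (it assumes `PointEnv` is importable from this file; the 100-step horizon is arbitrary):

```python
env = PointEnv()
env.seed(0)
ob = env.reset()
total_reward = 0.0
for _ in range(100):
    ac = env.action_space.sample()      # random action in [-0.1, 0.1]^2
    ob, reward, done, _ = env.step(ac)
    total_reward += reward
    if done:
        break
print('return:', total_reward, 'final state:', ob)
```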
import numpy as np
from gym import spaces
from gym import Env
class ObservedPointEnv(Env):
    """
    point mass on a 2-D plane
    up to four tasks: move to one of the corners (-10, -10), (-10, 10), (10, -10), (10, 10)
    Problem 1: augment the observation with a one-hot vector encoding the task ID
     - change the dimension of the observation space
     - augment the observation with a one-hot vector that encodes the task ID
    """
    #====================================================================================#
    # ----------PROBLEM 1----------
    #====================================================================================#
    # YOUR CODE SOMEWHERE HERE
    def __init__(self, num_tasks=1):
        self.tasks = [0, 1, 2, 3][:num_tasks]
        self.reset_task()
        self.reset()
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(2,))
        self.action_space = spaces.Box(low=-0.1, high=0.1, shape=(2,))

    def reset_task(self, is_evaluation=False):
        # sample one of the available tasks and set the corresponding goal
        idx = np.random.choice(len(self.tasks))
        self._task = self.tasks[idx]
        goals = [[-1, -1], [-1, 1], [1, -1], [1, 1]]
        self._goal = np.array(goals[idx]) * 10

    def reset(self):
        self._state = np.array([0, 0], dtype=np.float32)
        return self._get_obs()

    def _get_obs(self):
        return np.copy(self._state)

    def step(self, action):
        x, y = self._state
        # compute reward, add penalty for large actions instead of clipping them
        x -= self._goal[0]
        y -= self._goal[1]
        reward = - (x ** 2 + y ** 2) ** 0.5
        # check if task is complete
        done = abs(x) < 0.01 and abs(y) < 0.01
        # move to next state
        self._state = self._state + action
        ob = self._get_obs()
        return ob, reward, done, dict()

    def viewer_setup(self):
        print('no viewer')
        pass

    def render(self):
        print('current state:', self._state)
    def seed(self, seed):
        np.random.seed(seed)
import numpy as np
class ReplayBuffer(object):
    '''
    minimalistic replay buffer

    a sample consists of
     - observation
     - action
     - reward
     - terminal
     - hidden state for recurrent policy

    it is memory inefficient to store windowed observations this way
    so do not run on tasks with large observations (e.g. from vision)
    '''

    def __init__(self, max_size, ob_dim, ac_dim, hidden_dim, task_dim):
        self.max_size = max_size
        self.ob_dim = ob_dim
        self.ac_dim = ac_dim
        self.hidden_dim = hidden_dim
        self.task_dim = task_dim
        self.flush()

    def flush(self):
        '''
        set buffer to empty
        '''
        self._observations = np.zeros((self.max_size, *self.ob_dim))
        self._actions = np.zeros((self.max_size, *self.ac_dim))
        self._rewards = np.zeros((self.max_size, 1))
        self._terminals = np.zeros((self.max_size, 1))
        self._hiddens = np.zeros((self.max_size, self.hidden_dim))
        self._tasks = np.zeros((self.max_size, self.task_dim))
        self._top = 0
        self._size = 0

    def _advance(self):
        '''
        move pointer to top of buffer
        if end of buffer is reached, overwrite oldest data
        '''
        self._top = (self._top + 1) % self.max_size
        if self._size < self.max_size:
            self._size += 1

    def add_sample(self, ob, ac, re, te, hi, task):
        '''
        add sample to buffer
        '''
        self._observations[self._top] = ob
        self._actions[self._top] = ac
        self._rewards[self._top] = re
        self._terminals[self._top] = te
        self._hiddens[self._top] = hi
        self._tasks[self._top] = task
        self._advance()

    def get_samples(self, indices):
        '''
        return buffer data indexed by `indices`
        '''
        return dict(
            observations=self._observations[indices],
            actions=self._actions[indices],
            rewards=self._rewards[indices],
            terminals=self._terminals[indices],
            hiddens=self._hiddens[indices],
            tasks=self._tasks[indices],
        )

    def random_batch(self, batch_size):
        '''
        return random sample of `batch_size` transitions
        '''
        indices = np.random.randint(0, self._size, batch_size)
        return self.get_samples(indices)

    def all_batch(self):
        '''
        return all data in the buffer
        '''
        indices = list(range(self._size))
        return self.get_samples(indices)

    def num_steps_can_sample(self):
        return self._size


class PPOReplayBuffer(object):
    '''
    replay buffer for PPO algorithm
    store fixed log probs, advantages, and returns for use in multiple updates

    n.b. samples must be added as a batch, and we assume that the
    batch is the same size as that of the simple buffer
    '''

    def __init__(self, simple_buffer):
        self.simple_buffer = simple_buffer
        self.max_size = self.simple_buffer.max_size
        self.flush()

    def flush(self):
        self.simple_buffer.flush()
        self._log_probs = np.zeros((self.max_size, 1))
        self._advantages = np.zeros((self.max_size, 1))
        self._returns = np.zeros((self.max_size, 1))

    def add_samples(self, lp, adv, ret):
        self._log_probs = lp
        self._advantages = adv
        self._returns = ret

    def get_samples(self, indices):
        return dict(
            log_probs=self._log_probs[indices],
            advantages=self._advantages[indices],
            returns=self._returns[indices],
        )

    def random_batch(self, batch_size):
        indices = np.random.randint(0, self.simple_buffer._size, batch_size)
        simple = self.simple_buffer.get_samples(indices)
        ppo = self.get_samples(indices)
        return {**simple, **ppo}
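To make the intended data flow concrete, here is a usage sketch of the two buffers above. All dimensions, sizes, and placeholder values are hypothetical, chosen only for illustration:

```python
import numpy as np

# Hypothetical dimensions: 2-D observations and actions, a 32-unit recurrent
# hidden state, and a 1-D task descriptor.
buffer = ReplayBuffer(max_size=1000, ob_dim=(2,), ac_dim=(2,),
                      hidden_dim=32, task_dim=1)
ppo_buffer = PPOReplayBuffer(buffer)

# store one transition in the simple buffer
buffer.add_sample(ob=np.zeros(2), ac=np.zeros(2), re=0.0, te=False,
                  hi=np.zeros(32), task=[0])

# attach the fixed PPO quantities for the whole batch, then sample minibatches
ppo_buffer.add_samples(lp=np.zeros((1000, 1)),
                       adv=np.zeros((1000, 1)),
                       ret=np.zeros((1000, 1)))
batch = ppo_buffer.random_batch(batch_size=1)
print(sorted(batch.keys()))  # observations, actions, ..., log_probs, advantages, returns
```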
mujoco-py==1.50.1.56
gym==0.10.5
tensorflow==1.10.0
numpy==1.14.5
scipy==1.1.0
tensorflow-probability==0.3.0
seaborn
Box2D==2.3.2