# CS294-112 HW 5c: Meta-Learning
Dependencies:
* Python **3.5**
* Numpy version 1.14.5
* TensorFlow version 1.10.0
* MuJoCo version **1.50** and mujoco-py **1.50.1.56**
* OpenAI Gym version **0.10.5**
* seaborn
* Box2D==2.3.2
See the [HW5c PDF](http://rail.eecs.berkeley.edu/deeprlcourse/static/homeworks/hw5c.pdf) for further instructions.
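A quick way to confirm that the pinned versions are the ones actually importable in your environment is the minimal sketch below (it assumes the packages are already installed, and omits mujoco-py, whose availability depends on a local MuJoCo install):

```python
# Minimal version sanity check (a sketch; expected values are the pins listed above).
import numpy
import tensorflow
import gym

print(numpy.__version__)       # expect 1.14.5
print(tensorflow.__version__)  # expect 1.10.0
print(gym.__version__)         # expect 0.10.5
```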
import json
"""
Some simple logging functionality, inspired by rllab's logging.
Assumes that each diagnostic gets logged each iteration
Call logz.configure_output_dir() to start logging to a
tab-separated-values file (some_folder_name/log.txt)
To load the learning curves, you can do, for example
A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True)
A['EpRewMean']
"""
import os.path as osp, shutil, time, atexit, os, subprocess
import pickle
import tensorflow as tf
color2num = dict(
    gray=30,
    red=31,
    green=32,
    yellow=33,
    blue=34,
    magenta=35,
    cyan=36,
    white=37,
    crimson=38
)

def colorize(string, color, bold=False, highlight=False):
    attr = []
    num = color2num[color]
    if highlight: num += 10
    attr.append(str(num))
    if bold: attr.append('1')
    return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string)

class G:
    output_dir = None
    output_file = None
    first_row = True
    log_headers = []
    log_current_row = {}

def configure_output_dir(d=None):
    """
    Set output directory to d, or to /tmp/somerandomnumber if d is None
    """
    G.output_dir = d or "/tmp/experiments/%i"%int(time.time())
    assert not osp.exists(G.output_dir), "Log dir %s already exists! Delete it first or use a different dir"%G.output_dir
    os.makedirs(G.output_dir)
    G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w')
    atexit.register(G.output_file.close)
    print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True))

def log_tabular(key, val):
    """
    Log a value of some diagnostic
    Call this once for each diagnostic quantity, each iteration
    """
    if G.first_row:
        G.log_headers.append(key)
    else:
        assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key
    assert key not in G.log_current_row, "You already set %s this iteration. Maybe you forgot to call dump_tabular()"%key
    G.log_current_row[key] = val

def save_params(params):
    with open(osp.join(G.output_dir, "params.json"), 'w') as out:
        out.write(json.dumps(params, separators=(',\n','\t:\t'), sort_keys=True))

def pickle_tf_vars():
    """
    Saves tensorflow variables
    Requires them to be initialized first, also a default session must exist
    """
    _dict = {v.name : v.eval() for v in tf.global_variables()}
    with open(osp.join(G.output_dir, "vars.pkl"), 'wb') as f:
        pickle.dump(_dict, f)

def dump_tabular():
    """
    Write all of the diagnostics from the current iteration
    """
    vals = []
    key_lens = [len(key) for key in G.log_headers]
    max_key_len = max(15, max(key_lens))
    keystr = '%' + '%d' % max_key_len
    fmt = "| " + keystr + "s | %15s |"
    n_slashes = 22 + max_key_len
    print("-" * n_slashes)
    for key in G.log_headers:
        val = G.log_current_row.get(key, "")
        if hasattr(val, "__float__"): valstr = "%8.3g"%val
        else: valstr = val
        print(fmt%(key, valstr))
        vals.append(val)
    print("-" * n_slashes)
    if G.output_file is not None:
        if G.first_row:
            G.output_file.write("\t".join(G.log_headers))
            G.output_file.write("\n")
        G.output_file.write("\t".join(map(str, vals)))
        G.output_file.write("\n")
        G.output_file.flush()
    G.log_current_row.clear()
    G.first_row = False
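For reference, here is a minimal usage sketch of the logging module above. It assumes the file is saved as `logz.py` (the conventional name, not stated here) and that the chosen output directory does not already exist:

```python
import numpy as np
import logz  # the logging module above; the filename logz.py is an assumption

logz.configure_output_dir('/tmp/experiments/demo')
logz.save_params({'exp_name': 'demo', 'seed': 0})
for itr in range(3):
    logz.log_tabular('Iteration', itr)
    logz.log_tabular('AverageReturn', float(np.random.randn()))
    logz.dump_tabular()  # prints a table and appends one row to log.txt
```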
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import json
import os
"""
Using the plotter:
Call it from the command line and supply it with the logdirs of your experiments.
Suppose you ran an experiment named 'test' with 10 random seeds. The runner
code stored the results in the following directory structure:
data
  L test_EnvName_DateTime
    L 0
      L log.txt
      L params.json
    L 1
      L log.txt
      L params.json
    .
    .
    .
    L 9
      L log.txt
      L params.json
To plot learning curves from the experiment, averaged over all random
seeds, call
python plot.py data/test_EnvName_DateTime --value AverageReturn
and voila. To see a different statistic, change what you pass to the
--value flag. You can also enter /multiple/ values, and it will plot
all of them in order.
Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried
a different set of hyperparameters from 'test1', and now you would like
to compare them -- see their learning curves side-by-side. Just call
python plot.py data/test1 data/test2
and it will plot them both! They will be given titles in the legend according
to their exp_name parameters. If you want to use custom legend titles, use
the --legend flag and then provide a title for each logdir.
"""
def plot_data(data, value="AverageReturn"):
    if isinstance(data, list):
        data = pd.concat(data, ignore_index=True)
    sns.set(style="darkgrid", font_scale=1.5)
    sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition")
    plt.legend(loc='best').draggable()
    plt.show()
def get_datasets(fpath, condition=None):
    unit = 0
    datasets = []
    for root, _, files in os.walk(fpath):
        if 'log.txt' in files:
            # read the experiment name from params.json and close the file promptly
            with open(os.path.join(root, 'params.json')) as param_file:
                params = json.load(param_file)
            exp_name = params['exp_name']
            log_path = os.path.join(root, 'log.txt')
            experiment_data = pd.read_table(log_path)
            experiment_data.insert(
                len(experiment_data.columns),
                'Unit',
                unit
            )
            experiment_data.insert(
                len(experiment_data.columns),
                'Condition',
                condition or exp_name
            )
            datasets.append(experiment_data)
            unit += 1
    return datasets
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('logdir', nargs='*')
    parser.add_argument('--legend', nargs='*')
    parser.add_argument('--value', default='AverageReturn', nargs='*')
    args = parser.parse_args()

    use_legend = False
    if args.legend is not None:
        assert len(args.legend) == len(args.logdir), \
            "Must give a legend title for each set of experiments."
        use_legend = True

    data = []
    if use_legend:
        for logdir, legend_title in zip(args.logdir, args.legend):
            data += get_datasets(logdir, legend_title)
    else:
        for logdir in args.logdir:
            data += get_datasets(logdir)

    if isinstance(args.value, list):
        values = args.value
    else:
        values = [args.value]
    for value in values:
        plot_data(data, value=value)

if __name__ == "__main__":
    main()
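The same plotting can also be driven programmatically. The sketch below is the in-code equivalent of `python plot.py data/test1 data/test2 --legend run1 run2`; the directory names and legend titles are placeholders:

```python
# Assumes plot_data and get_datasets from the plotting module above are in scope.
data = []
for logdir, legend_title in zip(['data/test1', 'data/test2'], ['run1', 'run2']):
    data += get_datasets(logdir, legend_title)
plot_data(data, value='AverageReturn')
```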
import numpy as np
from gym import spaces
from gym import Env
class PointEnv(Env):
    """
    point mass on a 2-D plane
    the task is defined by a goal position, sampled by reset_task()
    """
    def __init__(self, num_tasks=1):
        self.reset_task()
        self.reset()
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(2,))
        self.action_space = spaces.Box(low=-0.1, high=0.1, shape=(2,))

    def reset_task(self, is_evaluation=False):
        '''
        sample a new task randomly
        Problem 3: make training and evaluation goals disjoint sets
        if `is_evaluation` is true, sample from the evaluation set,
        otherwise sample from the training set
        '''
        #====================================================================================#
        # ----------PROBLEM 3----------
        #====================================================================================#
        # YOUR CODE HERE
        x = np.random.uniform(-10, 10)
        y = np.random.uniform(-10, 10)
        self._goal = np.array([x, y])

    def get_all_task_idx(self):
        return [0]

    def reset(self):
        self._state = np.array([0, 0], dtype=np.float32)
        return self._get_obs()

    def _get_obs(self):
        return np.copy(self._state)

    def reward_function(self, x, y):
        return - (x ** 2 + y ** 2) ** 0.5

    def step(self, action):
        x, y = self._state
        # compute reward, add penalty for large actions instead of clipping them
        x -= self._goal[0]
        y -= self._goal[1]
        # check if task is complete
        done = abs(x) < .01 and abs(y) < .01
        reward = self.reward_function(x, y)
        # move to next state
        self._state = self._state + action
        ob = self._get_obs()
        return ob, reward, done, dict()

    def viewer_setup(self):
        print('no viewer')
        pass

    def render(self):
        print('current state:', self._state)
    def seed(self, seed):
        np.random.seed(seed)
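As a quick sanity check of the interface above, a random rollout in this environment could look like the following sketch (it assumes `PointEnv` is importable from this file; the 100-step horizon is arbitrary):

```python
env = PointEnv()
env.seed(0)
ob = env.reset()
total_reward = 0.0
for _ in range(100):
    ac = env.action_space.sample()      # random action in [-0.1, 0.1]^2
    ob, reward, done, _ = env.step(ac)
    total_reward += reward
    if done:
        break
print('return:', total_reward, 'final state:', ob)
```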
import numpy as np
from gym import spaces
from gym import Env
class ObservedPointEnv(Env):
    """
    point mass on a 2-D plane
    up to four tasks: move to one of the corners (-10, -10), (-10, 10), (10, -10), (10, 10)
    Problem 1: augment the observation with a one-hot vector encoding the task ID
     - change the dimension of the observation space
     - augment the observation with a one-hot vector that encodes the task ID
    """
    #====================================================================================#
    # ----------PROBLEM 1----------
    #====================================================================================#
    # YOUR CODE SOMEWHERE HERE
    def __init__(self, num_tasks=1):
        self.tasks = [0, 1, 2, 3][:num_tasks]
        self.reset_task()
        self.reset()
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(2,))
        self.action_space = spaces.Box(low=-0.1, high=0.1, shape=(2,))

    def reset_task(self, is_evaluation=False):
        # sample one of the available tasks and set the corresponding goal
        idx = np.random.choice(len(self.tasks))
        self._task = self.tasks[idx]
        goals = [[-1, -1], [-1, 1], [1, -1], [1, 1]]
        self._goal = np.array(goals[idx]) * 10

    def reset(self):
        self._state = np.array([0, 0], dtype=np.float32)
        return self._get_obs()

    def _get_obs(self):
        return np.copy(self._state)

    def step(self, action):
        x, y = self._state
        # compute reward, add penalty for large actions instead of clipping them
        x -= self._goal[0]
        y -= self._goal[1]
        reward = - (x ** 2 + y ** 2) ** 0.5
        # check if task is complete
        done = abs(x) < 0.01 and abs(y) < 0.01
        # move to next state
        self._state = self._state + action
        ob = self._get_obs()
        return ob, reward, done, dict()

    def viewer_setup(self):
        print('no viewer')
        pass

    def render(self):
        print('current state:', self._state)
    def seed(self, seed):
        np.random.seed(seed)
import numpy as np
class ReplayBuffer(object):
    '''
    minimalistic replay buffer

    a sample consists of
     - observation
     - action
     - reward
     - terminal
     - hidden state for recurrent policy

    it is memory inefficient to store windowed observations this way
    so do not run on tasks with large observations (e.g. from vision)
    '''

    def __init__(self, max_size, ob_dim, ac_dim, hidden_dim, task_dim):
        self.max_size = max_size
        self.ob_dim = ob_dim
        self.ac_dim = ac_dim
        self.hidden_dim = hidden_dim
        self.task_dim = task_dim
        self.flush()

    def flush(self):
        '''
        set buffer to empty
        '''
        self._observations = np.zeros((self.max_size, *self.ob_dim))
        self._actions = np.zeros((self.max_size, *self.ac_dim))
        self._rewards = np.zeros((self.max_size, 1))
        self._terminals = np.zeros((self.max_size, 1))
        self._hiddens = np.zeros((self.max_size, self.hidden_dim))
        self._tasks = np.zeros((self.max_size, self.task_dim))
        self._top = 0
        self._size = 0

    def _advance(self):
        '''
        move pointer to top of buffer
        if end of buffer is reached, overwrite oldest data
        '''
        self._top = (self._top + 1) % self.max_size
        if self._size < self.max_size:
            self._size += 1

    def add_sample(self, ob, ac, re, te, hi, task):
        '''
        add sample to buffer
        '''
        self._observations[self._top] = ob
        self._actions[self._top] = ac
        self._rewards[self._top] = re
        self._terminals[self._top] = te
        self._hiddens[self._top] = hi
        self._tasks[self._top] = task
        self._advance()

    def get_samples(self, indices):
        '''
        return buffer data indexed by `indices`
        '''
        return dict(
            observations=self._observations[indices],
            actions=self._actions[indices],
            rewards=self._rewards[indices],
            terminals=self._terminals[indices],
            hiddens=self._hiddens[indices],
            tasks=self._tasks[indices],
        )

    def random_batch(self, batch_size):
        '''
        return random sample of `batch_size` transitions
        '''
        indices = np.random.randint(0, self._size, batch_size)
        return self.get_samples(indices)

    def all_batch(self):
        '''
        return all data in the buffer
        '''
        indices = list(range(self._size))
        return self.get_samples(indices)

    def num_steps_can_sample(self):
        return self._size


class PPOReplayBuffer(object):
    '''
    replay buffer for PPO algorithm
    store fixed log probs, advantages, and returns for use in multiple updates

    n.b. samples must be added as a batch, and we assume that the
    batch is the same size as that of the simple buffer
    '''

    def __init__(self, simple_buffer):
        self.simple_buffer = simple_buffer
        self.max_size = self.simple_buffer.max_size
        self.flush()

    def flush(self):
        self.simple_buffer.flush()
        self._log_probs = np.zeros((self.max_size, 1))
        self._advantages = np.zeros((self.max_size, 1))
        self._returns = np.zeros((self.max_size, 1))

    def add_samples(self, lp, adv, ret):
        self._log_probs = lp
        self._advantages = adv
        self._returns = ret

    def get_samples(self, indices):
        return dict(
            log_probs=self._log_probs[indices],
            advantages=self._advantages[indices],
            returns=self._returns[indices],
        )

    def random_batch(self, batch_size):
        indices = np.random.randint(0, self.simple_buffer._size, batch_size)
        simple = self.simple_buffer.get_samples(indices)
        ppo = self.get_samples(indices)
        return {**simple, **ppo}
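To make the intended data flow concrete, here is a usage sketch of the two buffers above. All dimensions, sizes, and placeholder values are hypothetical, chosen only for illustration:

```python
import numpy as np

# Hypothetical dimensions: 2-D observations and actions, a 32-unit recurrent
# hidden state, and a 1-D task descriptor.
buffer = ReplayBuffer(max_size=1000, ob_dim=(2,), ac_dim=(2,),
                      hidden_dim=32, task_dim=1)
ppo_buffer = PPOReplayBuffer(buffer)

# store one transition in the simple buffer
buffer.add_sample(ob=np.zeros(2), ac=np.zeros(2), re=0.0, te=False,
                  hi=np.zeros(32), task=[0])

# attach the fixed PPO quantities for the whole batch, then sample minibatches
ppo_buffer.add_samples(lp=np.zeros((1000, 1)),
                       adv=np.zeros((1000, 1)),
                       ret=np.zeros((1000, 1)))
batch = ppo_buffer.random_batch(batch_size=1)
print(sorted(batch.keys()))  # observations, actions, ..., log_probs, advantages, returns
```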
mujoco-py==1.50.1.56
gym==0.10.5
tensorflow==1.10.0
numpy==1.14.5
scipy==1.1.0
tensorflow-probability==0.3.0
seaborn
Box2D==2.3.2