Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
bb85463
smooth noise sampling and started gSDE
lukasmolnar Feb 28, 2024
243cf0f
address comments
lukasmolnar Mar 1, 2024
d838dc6
start moving things to SmoothActor
lukasmolnar Mar 1, 2024
aea5d88
moved everything to SmoothActor (it runs)
lukasmolnar Mar 1, 2024
3603205
possibly resample in PPO update
lukasmolnar Mar 1, 2024
bb0a273
learn_features=True and correct sample dim
lukasmolnar Mar 4, 2024
40025c4
adjust sample freq and update plotting
lukasmolnar Mar 4, 2024
41e9dea
Merge branch 'dev' into lm/smooth-exploration
lukasmolnar Mar 11, 2024
c301746
update log_std_init=0.0 and refactor
lukasmolnar Mar 15, 2024
a372a52
log joint data for training and play
lukasmolnar Apr 3, 2024
599b2ca
update logging and plotting
lukasmolnar Apr 6, 2024
4994550
plot FT script
lukasmolnar Apr 19, 2024
0205d99
added sweep config
lukasmolnar Apr 19, 2024
a68d7ce
Merge remote-tracking branch 'origin/dev' into lm/smooth-exploration
lukasmolnar Apr 19, 2024
64b9b14
update on policy and old policy runners, get nans for log_probs
lukasmolnar Apr 24, 2024
91656e9
run 200 iterations before starting training, to burn in normalization
sheim Apr 26, 2024
c0a6d61
Merge pull request #15 from mit-biomimetics/sh/smooth-exploration
lukasmolnar Apr 26, 2024
f74e7fe
update dummy input in export_network
lukasmolnar May 3, 2024
18045e9
good choice of params: sample 16, rollout 32, LR x1.1, des_kl 0.02
lukasmolnar May 3, 2024
adfb078
export latent network and update configs
lukasmolnar May 6, 2024
8ff82ae
export latent net with norm and log get_std
lukasmolnar May 6, 2024
007ea93
export actor std to txt file
lukasmolnar May 6, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -81,3 +81,7 @@ ipython_config.py
venv/
env.bak/
venv.bak/

# Smooth exploration
gym/smooth_exploration/data*
gym/smooth_exploration/figures*
1 change: 1 addition & 0 deletions gym/envs/base/legged_robot_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,7 @@ class actor:
# can be elu, relu, selu, crelu, lrelu, tanh, sigmoid
activation = "elu"
normalize_obs = True
smooth_exploration = False

obs = [
"observation_a",
Expand Down
23 changes: 14 additions & 9 deletions gym/envs/mini_cheetah/mini_cheetah_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,15 +130,17 @@ class actor:
hidden_dims = [256, 256, 128]
# * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid
activation = "elu"
smooth_exploration = True
exploration_sample_freq = 16

obs = [
"base_lin_vel",
# "base_lin_vel",
"base_ang_vel",
"projected_gravity",
"commands",
"dof_pos_obs",
"dof_vel",
"dof_pos_target",
# "dof_pos_target",
]
actions = ["dof_pos_target"]
add_noise = True
Expand Down Expand Up @@ -194,20 +196,23 @@ class algorithm(LeggedRobotRunnerCfg.algorithm):
value_loss_coef = 1.0
use_clipped_value_loss = True
clip_param = 0.2
entropy_coef = 0.02
num_learning_epochs = 4
entropy_coef = 0.01
num_learning_epochs = 6
# * mini batch size = num_envs*nsteps / nminibatches
num_mini_batches = 8
learning_rate = 1.0e-5
schedule = "adaptive" # can be adaptive or fixed
num_mini_batches = 4
discount_horizon = 1.0 # [s]
# GAE_bootstrap_horizon = 2.0 # [s]
desired_kl = 0.01
desired_kl = 0.02
max_grad_norm = 1.0
# * Learning rate
learning_rate = 0.002
schedule = "adaptive" # can be adaptive or fixed
lr_range = [2e-4, 1e-2]
lr_ratio = 1.3

class runner(LeggedRobotRunnerCfg.runner):
run_name = ""
experiment_name = "mini_cheetah"
max_iterations = 500
max_iterations = 800
algorithm_class_name = "PPO2"
num_steps_per_env = 32
1 change: 1 addition & 0 deletions gym/envs/mini_cheetah/mini_cheetah_osc_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ class policy:
critic_hidden_dims = [256, 256, 128]
# * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid
activation = "elu"
smooth_exploration = False

obs = [
"base_ang_vel",
Expand Down
14 changes: 10 additions & 4 deletions gym/envs/mini_cheetah/mini_cheetah_ref_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,9 @@ class actor:
hidden_dims = [256, 256, 128]
# * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid
activation = "elu"
smooth_exploration = True
exploration_sample_freq = 16

normalize_obs = True
obs = [
"base_ang_vel",
Expand Down Expand Up @@ -148,17 +151,20 @@ class algorithm(MiniCheetahRunnerCfg.algorithm):
num_mini_batches = 4
storage_size = 2**17 # new
mini_batch_size = 2**15 # new
learning_rate = 5.0e-5
schedule = "adaptive" # can be adaptive, fixed
discount_horizon = 1.0 # [s]
lam = 0.95
GAE_bootstrap_horizon = 2.0 # [s]
desired_kl = 0.01
desired_kl = 0.02
max_grad_norm = 1.0
# * Learning rate
learning_rate = 0.002
schedule = "adaptive" # can be adaptive or fixed
lr_range = [3e-4, 1e-2]
lr_ratio = 1.3

class runner(MiniCheetahRunnerCfg.runner):
run_name = ""
experiment_name = "mini_cheetah_ref"
max_iterations = 500 # number of policy updates
max_iterations = 800 # number of policy updates
algorithm_class_name = "PPO2"
num_steps_per_env = 32
1 change: 1 addition & 0 deletions gym/envs/mit_humanoid/mit_humanoid_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,7 @@ class actor:
critic_hidden_dims = [512, 256, 128]
# * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid
activation = "elu"
smooth_exploration = False

obs = [
"base_height",
Expand Down
56 changes: 56 additions & 0 deletions gym/smooth_exploration/plot_ft.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
"""Compare FFT spectra of dof_pos_target between a smooth-exploration run
and a baseline run, averaged over non-terminated training iterations."""
import os

import matplotlib.pyplot as plt
import numpy as np

# NOTE(review): SAMPLE_FREQ/STEPS mirror the run settings encoded in the
# names below but are not used in the computation — kept for documentation.
SAMPLE_FREQ = 8
STEPS = 500

smooth_name = "ref_sample_8_len_500"
baseline_name = "ref_baseline_len_500"

smooth_data_dir = "./data_train/" + smooth_name
baseline_data_dir = "./data_train/" + baseline_name
fig_dir = "./figures_train/"

# exist_ok avoids the check-then-create race of the exists()/makedirs() pair
os.makedirs(fig_dir, exist_ok=True)

# load data — [0] presumably selects the first logged environment; arrays are
# (iterations, steps, dims) — TODO confirm against the training logger
smooth_pos_target = np.load(smooth_data_dir + "/dof_pos_target.npy")[0]
baseline_pos_target = np.load(baseline_data_dir + "/dof_pos_target.npy")[0]
smooth_terminated = np.load(smooth_data_dir + "/terminated.npy")[0]
baseline_terminated = np.load(baseline_data_dir + "/terminated.npy")[0]

# compute FFT averages over every 10th iteration, for the first 3 joints
smooth_ffts = [[], [], []]
baseline_ffts = [[], [], []]
for it in range(0, smooth_pos_target.shape[0], 10):
    # only use iterations where no step terminated (full-length rollouts)
    if not np.any(smooth_terminated[it, :, 0]):
        for idx in range(3):
            fft = np.fft.fft(smooth_pos_target[it, :, idx])
            # keep only the first half of the spectrum (real input is
            # conjugate-symmetric, so the second half is redundant)
            smooth_ffts[idx].append(fft[: len(fft) // 2])

    if not np.any(baseline_terminated[it, :, 0]):
        for idx in range(3):
            fft = np.fft.fft(baseline_pos_target[it, :, idx])
            baseline_ffts[idx].append(fft[: len(fft) // 2])

print(f"Total smooth FFTS: {len(smooth_ffts[0])}")
print(f"Total baseline FFTS: {len(baseline_ffts[0])}")

# NOTE(review): the mean is taken over complex spectra before abs(), so
# iteration-to-iteration phase differences can cancel amplitude — if a pure
# amplitude average is intended, take np.abs() before the mean. Confirm.
smooth_fft_means = [np.array(smooth_ffts[idx]).mean(axis=0) for idx in range(3)]
baseline_fft_means = [np.array(baseline_ffts[idx]).mean(axis=0) for idx in range(3)]

# plot the averaged amplitude spectra, one subplot per joint index
fig, axs = plt.subplots(3, 1, figsize=(10, 10))
for idx in range(3):
    axs[idx].plot(np.abs(smooth_fft_means[idx]))
    axs[idx].plot(np.abs(baseline_fft_means[idx]))
    axs[idx].set_title(f"FT Amplitude idx {idx}")
    axs[idx].set_xlabel("Frequency")
    axs[idx].set_ylabel("Amplitude")
    axs[idx].legend(["smooth", "baseline"])

fig.tight_layout()
fig.savefig(fig_dir + "/" + smooth_name + ".png")
38 changes: 38 additions & 0 deletions gym/smooth_exploration/plot_play.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
"""Plot logged joint data (positions, targets, velocities) from a play run."""
import os

import matplotlib.pyplot as plt
import numpy as np

name = "mini_cheetah_ref"
data_dir = "./data_play/" + name
fig_dir = "./figures_play/" + name

# BUGFIX: unlike the sibling plot scripts, fig_dir was never created here,
# so savefig failed on a fresh checkout
os.makedirs(fig_dir, exist_ok=True)

# load data — [0] presumably selects the first logged environment; arrays are
# (steps, dims) after indexing — TODO confirm against the play logger
dof_pos_obs = np.load(data_dir + "/dof_pos_obs.npy")[0]
dof_pos_target = np.load(data_dir + "/dof_pos_target.npy")[0]
dof_vel = np.load(data_dir + "/dof_vel.npy")[0]

# plot the first n_steps of the first 3 joint indices, one subplot per signal
n_steps = 200
fig, axs = plt.subplots(3, figsize=(10, 10))
plt.suptitle(name)

signals = [
    ("dof_pos_obs", dof_pos_obs),
    ("dof_pos_target", dof_pos_target),
    ("dof_vel", dof_vel),
]
for ax, (title, data) in zip(axs, signals):
    for i in range(3):
        ax.plot(data[:n_steps, i])
    # titles/legends only need to be set once per axis, not per curve
    ax.set_title(title)
    ax.legend(["idx 0", "idx 1", "idx 2"])
    ax.set_xlabel("time steps")

plt.tight_layout()
plt.savefig(fig_dir + "/" + name + ".png")
plt.show()
93 changes: 93 additions & 0 deletions gym/smooth_exploration/plot_train.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
"""Plot logged joint data per training iteration, optionally with Fourier
transforms of dof_pos_target and markers at noise-resampling steps."""
import os

import matplotlib.pyplot as plt
import numpy as np

FOURIER = True   # also save an FFT figure for full-length iterations
SMOOTH = True    # draw vertical lines where exploration noise is resampled
SAMPLE_FREQ = 16  # steps between noise resamples (must match the run config)
STEPS = 1000      # rollout length (must match the run config)

name = "ref_sample_16_len_1000"
data_dir = "./data_train/" + name
fig_dir = "./figures_train/" + name

# exist_ok avoids the check-then-create race of the exists()/makedirs() pair
os.makedirs(fig_dir, exist_ok=True)

# load data — [0] presumably selects the first logged environment; arrays are
# (iterations, steps, dims) — TODO confirm against the training logger
dof_pos_obs = np.load(data_dir + "/dof_pos_obs.npy")[0]
dof_pos_target = np.load(data_dir + "/dof_pos_target.npy")[0]
dof_vel = np.load(data_dir + "/dof_vel.npy")[0]
torques = np.load(data_dir + "/torques.npy")[0]
terminated = np.load(data_dir + "/terminated.npy")[0]


def plot_fourier(data, it):
    """Save amplitude/phase Fourier plots of the first 3 joints of `data`
    (shape (steps, dims)) for training iteration `it`."""
    fig_ft, axs_ft = plt.subplots(2, figsize=(10, 10))
    for i in range(3):
        ft = np.fft.fft(data[:, i])
        # real input is conjugate-symmetric: keep only the first half
        ft_half = ft[: len(ft) // 2]
        axs_ft[0].plot(np.abs(ft_half))
        axs_ft[1].plot(np.angle(ft_half))

    axs_ft[0].set_title("FT Amplitude")
    axs_ft[0].set_xlabel("Frequency")
    axs_ft[0].set_ylabel("Amplitude")
    axs_ft[0].legend(["idx 0", "idx 1", "idx 2"])
    axs_ft[1].set_title("FT Phase")
    axs_ft[1].set_xlabel("Frequency")
    axs_ft[1].set_ylabel("Phase")
    axs_ft[1].legend(["idx 0", "idx 1", "idx 2"])

    fig_ft.savefig(fig_dir + "/dof_pos_target_FT_it_" + str(it) + ".png")
    # BUGFIX: close the figure — this runs once per plotted iteration and
    # matplotlib otherwise accumulates open figures (memory + warnings)
    plt.close(fig_ft)


# plot data for every 10th iteration
for it in range(0, dof_pos_obs.shape[0], 10):
    # truncate at the first terminated step, if the iteration terminated
    terminate_idx = np.where(terminated[it, :, 0] == 1)[0]
    if terminate_idx.size > 0:
        n_steps = terminate_idx[0]
    else:
        n_steps = dof_pos_obs.shape[1]
    print(n_steps)

    # generate one figure with the four logged signals stacked vertically
    fig, axs = plt.subplots(4, figsize=(10, 10))
    plt.suptitle(name + " iteration " + str(it))

    axs[0].set_title("dof_pos_obs")
    for i in range(3):
        axs[0].plot(dof_pos_obs[it, :n_steps, i])

    axs[1].set_title("dof_pos_target")
    for i in range(3):
        axs[1].plot(dof_pos_target[it, :n_steps, i])
    # only FFT full-length rollouts so spectra are comparable across iterations
    if FOURIER and n_steps == STEPS:
        plot_fourier(dof_pos_target[it, :n_steps, :], it)

    axs[2].set_title("dof_vel")
    for i in range(3):
        axs[2].plot(dof_vel[it, :n_steps, i])

    axs[3].set_title("torques")
    for i in range(3):
        axs[3].plot(torques[it, :n_steps, i])

    # format plots
    for idx in range(4):
        axs[idx].legend(["idx 0", "idx 1", "idx 2"])
        axs[idx].set_xlabel("time steps")
        axs[idx].set_xlim([0, n_steps])

    if SMOOTH:
        # mark the steps where exploration noise is resampled
        for x in range(0, dof_pos_obs.shape[1], SAMPLE_FREQ):
            for ax in axs:
                ax.axvline(x, color="r", linestyle="--")

    fig.tight_layout()
    fig.savefig(fig_dir + "/" + name + "_it_" + str(it) + ".png")
    # BUGFIX: close per-iteration figures to avoid unbounded open figures
    plt.close(fig)
8 changes: 6 additions & 2 deletions learning/algorithms/ppo.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
import torch.nn as nn
import torch.optim as optim

from learning.modules import ActorCritic
from learning.modules import ActorCritic, SmoothActor
from learning.storage import RolloutStorage


Expand Down Expand Up @@ -162,7 +162,11 @@ def update(self):
old_mu_batch,
old_sigma_batch,
) in generator:
self.actor_critic.act(obs_batch)
# TODO[lm]: Look into resampling noise here, gSDE paper seems to do it.
if isinstance(self.actor_critic.actor, SmoothActor):
batch_size = obs_batch.shape[0]
self.actor_critic.actor.sample_weights(batch_size)
self.actor_critic.actor.update_distribution(obs_batch)
actions_log_prob_batch = self.actor_critic.get_actions_log_prob(
actions_batch
)
Expand Down
22 changes: 18 additions & 4 deletions learning/algorithms/ppo2.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
create_uniform_generator,
compute_generalized_advantages,
)
from learning.modules import SmoothActor


class PPO2:
Expand All @@ -26,13 +27,17 @@ def __init__(
desired_kl=0.01,
loss_fn="MSE",
device="cpu",
lr_range=[1e-4, 1e-2],
lr_ratio=1.3,
**kwargs,
):
self.device = device

self.desired_kl = desired_kl
self.schedule = schedule
self.learning_rate = learning_rate
self.lr_range = lr_range
self.lr_ratio = lr_ratio

# * PPO components
self.actor = actor.to(self.device)
Expand Down Expand Up @@ -99,7 +104,7 @@ def update_actor(self, data):
self.mean_surrogate_loss = 0
counter = 0

self.actor.act(data["actor_obs"])
self.actor.update_distribution(data["actor_obs"])
data["old_sigma_batch"] = self.actor.action_std.detach()
data["old_mu_batch"] = self.actor.action_mean.detach()
data["old_actions_log_prob_batch"] = self.actor.get_actions_log_prob(
Expand All @@ -111,8 +116,12 @@ def update_actor(self, data):
batch_size = total_data // self.num_mini_batches
generator = create_uniform_generator(data, batch_size, self.num_learning_epochs)
for batch in generator:
# * Re-sample noise for smooth actor
if isinstance(self.actor, SmoothActor):
self.actor.sample_weights(batch_size)

# ! refactor how this is done
self.actor.act(batch["actor_obs"])
self.actor.update_distribution(batch["actor_obs"])
actions_log_prob_batch = self.actor.get_actions_log_prob(batch["actions"])
mu_batch = self.actor.action_mean
sigma_batch = self.actor.action_std
Expand All @@ -132,11 +141,16 @@ def update_actor(self, data):
axis=-1,
)
kl_mean = torch.mean(kl)
lr_min, lr_max = self.lr_range

if kl_mean > self.desired_kl * 2.0:
self.learning_rate = max(1e-5, self.learning_rate / 1.5)
self.learning_rate = max(
lr_min, self.learning_rate / self.lr_ratio
)
elif kl_mean < self.desired_kl / 2.0 and kl_mean > 0.0:
self.learning_rate = min(1e-2, self.learning_rate * 1.5)
self.learning_rate = min(
lr_max, self.learning_rate * self.lr_ratio
)

for param_group in self.optimizer.param_groups:
# ! check this
Expand Down
Loading