Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
bb85463
smooth noise sampling and started gSDE
lukasmolnar Feb 28, 2024
243cf0f
address comments
lukasmolnar Mar 1, 2024
d838dc6
start moving things to SmoothActor
lukasmolnar Mar 1, 2024
aea5d88
moved everything to SmoothActor (it runs)
lukasmolnar Mar 1, 2024
3603205
possibly resample in PPO update
lukasmolnar Mar 1, 2024
bb0a273
learn_features=True and correct sample dim
lukasmolnar Mar 4, 2024
40025c4
adjust sample freq and update plotting
lukasmolnar Mar 4, 2024
41e9dea
Merge branch 'dev' into lm/smooth-exploration
lukasmolnar Mar 11, 2024
c301746
update log_std_init=0.0 and refactor
lukasmolnar Mar 15, 2024
a372a52
log joint data for training and play
lukasmolnar Apr 3, 2024
599b2ca
update logging and plotting
lukasmolnar Apr 6, 2024
4994550
plot FT script
lukasmolnar Apr 19, 2024
0205d99
added sweep config
lukasmolnar Apr 19, 2024
a68d7ce
Merge remote-tracking branch 'origin/dev' into lm/smooth-exploration
lukasmolnar Apr 19, 2024
64b9b14
update on policy and old policy runners, get nans for log_probs
lukasmolnar Apr 24, 2024
91656e9
run 200 iterations before starting training, to burn in normalization
sheim Apr 26, 2024
c0a6d61
Merge pull request #15 from mit-biomimetics/sh/smooth-exploration
lukasmolnar Apr 26, 2024
f74e7fe
update dummy input in export_network
lukasmolnar May 3, 2024
18045e9
good choice of params: sample 16, rollout 32, LR x1.1, des_kl 0.02
lukasmolnar May 3, 2024
adfb078
export latent network and update configs
lukasmolnar May 6, 2024
8ff82ae
export latent net with norm and log get_std
lukasmolnar May 6, 2024
007ea93
export actor std to txt file
lukasmolnar May 6, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -81,3 +81,7 @@ ipython_config.py
venv/
env.bak/
venv.bak/

# Smooth exploration
gym/smooth_exploration/data*
gym/smooth_exploration/figures*
1 change: 1 addition & 0 deletions gym/envs/base/legged_robot_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,7 @@ class actor:
# can be elu, relu, selu, crelu, lrelu, tanh, sigmoid
activation = "elu"
normalize_obs = True
smooth_exploration = False

obs = [
"observation_a",
Expand Down
23 changes: 14 additions & 9 deletions gym/envs/mini_cheetah/mini_cheetah_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,15 +130,17 @@ class actor:
hidden_dims = [256, 256, 128]
# * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid
activation = "elu"
smooth_exploration = True
exploration_sample_freq = 16

obs = [
"base_lin_vel",
# "base_lin_vel",
"base_ang_vel",
"projected_gravity",
"commands",
"dof_pos_obs",
"dof_vel",
"dof_pos_target",
# "dof_pos_target",
]
actions = ["dof_pos_target"]
add_noise = True
Expand Down Expand Up @@ -194,20 +196,23 @@ class algorithm(LeggedRobotRunnerCfg.algorithm):
value_loss_coef = 1.0
use_clipped_value_loss = True
clip_param = 0.2
entropy_coef = 0.02
num_learning_epochs = 4
entropy_coef = 0.01
num_learning_epochs = 6
# * mini batch size = num_envs*nsteps / nminibatches
num_mini_batches = 8
learning_rate = 1.0e-5
schedule = "adaptive" # can be adaptive or fixed
num_mini_batches = 4
discount_horizon = 1.0 # [s]
# GAE_bootstrap_horizon = 2.0 # [s]
desired_kl = 0.01
desired_kl = 0.02
max_grad_norm = 1.0
# * Learning rate
learning_rate = 0.002
schedule = "adaptive" # can be adaptive or fixed
lr_range = [2e-4, 1e-2]
lr_ratio = 1.3

class runner(LeggedRobotRunnerCfg.runner):
run_name = ""
experiment_name = "mini_cheetah"
max_iterations = 500
max_iterations = 800
algorithm_class_name = "PPO2"
num_steps_per_env = 32
1 change: 1 addition & 0 deletions gym/envs/mini_cheetah/mini_cheetah_osc_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ class policy:
critic_hidden_dims = [256, 256, 128]
# * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid
activation = "elu"
smooth_exploration = False

obs = [
"base_ang_vel",
Expand Down
14 changes: 10 additions & 4 deletions gym/envs/mini_cheetah/mini_cheetah_ref_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,9 @@ class actor:
hidden_dims = [256, 256, 128]
# * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid
activation = "elu"
smooth_exploration = True
exploration_sample_freq = 16

normalize_obs = True
obs = [
"base_ang_vel",
Expand Down Expand Up @@ -148,17 +151,20 @@ class algorithm(MiniCheetahRunnerCfg.algorithm):
num_mini_batches = 4
storage_size = 2**17 # new
mini_batch_size = 2**15 # new
learning_rate = 5.0e-5
schedule = "adaptive" # can be adaptive, fixed
discount_horizon = 1.0 # [s]
lam = 0.95
GAE_bootstrap_horizon = 2.0 # [s]
desired_kl = 0.01
desired_kl = 0.02
max_grad_norm = 1.0
# * Learning rate
learning_rate = 0.002
schedule = "adaptive" # can be adaptive or fixed
lr_range = [3e-4, 1e-2]
lr_ratio = 1.3

class runner(MiniCheetahRunnerCfg.runner):
run_name = ""
experiment_name = "mini_cheetah_ref"
max_iterations = 500 # number of policy updates
max_iterations = 800 # number of policy updates
algorithm_class_name = "PPO2"
num_steps_per_env = 32
1 change: 1 addition & 0 deletions gym/envs/mit_humanoid/mit_humanoid_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,7 @@ class actor:
critic_hidden_dims = [512, 256, 128]
# * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid
activation = "elu"
smooth_exploration = False

obs = [
"base_height",
Expand Down
56 changes: 56 additions & 0 deletions gym/smooth_exploration/plot_ft.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
"""Compare FFT spectra of dof_pos_target between a smooth-exploration run
and a baseline run, averaged over non-terminated training iterations."""
import os

import matplotlib.pyplot as plt
import numpy as np

# NOTE(review): SAMPLE_FREQ/STEPS mirror the run settings encoded in the
# names below but are not used in the computation — kept for documentation.
SAMPLE_FREQ = 8
STEPS = 500

smooth_name = "ref_sample_8_len_500"
baseline_name = "ref_baseline_len_500"

smooth_data_dir = "./data_train/" + smooth_name
baseline_data_dir = "./data_train/" + baseline_name
fig_dir = "./figures_train/"

# exist_ok avoids the check-then-create race of the exists()/makedirs() pair
os.makedirs(fig_dir, exist_ok=True)

# load data — [0] presumably selects the first logged environment; arrays are
# (iterations, steps, dims) — TODO confirm against the training logger
smooth_pos_target = np.load(smooth_data_dir + "/dof_pos_target.npy")[0]
baseline_pos_target = np.load(baseline_data_dir + "/dof_pos_target.npy")[0]
smooth_terminated = np.load(smooth_data_dir + "/terminated.npy")[0]
baseline_terminated = np.load(baseline_data_dir + "/terminated.npy")[0]

# compute FFT averages over every 10th iteration, for the first 3 joints
smooth_ffts = [[], [], []]
baseline_ffts = [[], [], []]
for it in range(0, smooth_pos_target.shape[0], 10):
    # only use iterations where no step terminated (full-length rollouts)
    if not np.any(smooth_terminated[it, :, 0]):
        for idx in range(3):
            fft = np.fft.fft(smooth_pos_target[it, :, idx])
            # keep only the first half of the spectrum (real input is
            # conjugate-symmetric, so the second half is redundant)
            smooth_ffts[idx].append(fft[: len(fft) // 2])

    if not np.any(baseline_terminated[it, :, 0]):
        for idx in range(3):
            fft = np.fft.fft(baseline_pos_target[it, :, idx])
            baseline_ffts[idx].append(fft[: len(fft) // 2])

print(f"Total smooth FFTS: {len(smooth_ffts[0])}")
print(f"Total baseline FFTS: {len(baseline_ffts[0])}")

# NOTE(review): the mean is taken over complex spectra before abs(), so
# iteration-to-iteration phase differences can cancel amplitude — if a pure
# amplitude average is intended, take np.abs() before the mean. Confirm.
smooth_fft_means = [np.array(smooth_ffts[idx]).mean(axis=0) for idx in range(3)]
baseline_fft_means = [np.array(baseline_ffts[idx]).mean(axis=0) for idx in range(3)]

# plot the averaged amplitude spectra, one subplot per joint index
fig, axs = plt.subplots(3, 1, figsize=(10, 10))
for idx in range(3):
    axs[idx].plot(np.abs(smooth_fft_means[idx]))
    axs[idx].plot(np.abs(baseline_fft_means[idx]))
    axs[idx].set_title(f"FT Amplitude idx {idx}")
    axs[idx].set_xlabel("Frequency")
    axs[idx].set_ylabel("Amplitude")
    axs[idx].legend(["smooth", "baseline"])

fig.tight_layout()
fig.savefig(fig_dir + "/" + smooth_name + ".png")
38 changes: 38 additions & 0 deletions gym/smooth_exploration/plot_play.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
"""Plot logged joint data (positions, targets, velocities) from a play run."""
import os

import matplotlib.pyplot as plt
import numpy as np

name = "mini_cheetah_ref"
data_dir = "./data_play/" + name
fig_dir = "./figures_play/" + name

# BUGFIX: unlike the sibling plot scripts, fig_dir was never created here,
# so savefig failed on a fresh checkout
os.makedirs(fig_dir, exist_ok=True)

# load data — [0] presumably selects the first logged environment; arrays are
# (steps, dims) after indexing — TODO confirm against the play logger
dof_pos_obs = np.load(data_dir + "/dof_pos_obs.npy")[0]
dof_pos_target = np.load(data_dir + "/dof_pos_target.npy")[0]
dof_vel = np.load(data_dir + "/dof_vel.npy")[0]

# plot the first n_steps of the first 3 joint indices, one subplot per signal
n_steps = 200
fig, axs = plt.subplots(3, figsize=(10, 10))
plt.suptitle(name)

signals = [
    ("dof_pos_obs", dof_pos_obs),
    ("dof_pos_target", dof_pos_target),
    ("dof_vel", dof_vel),
]
for ax, (title, data) in zip(axs, signals):
    for i in range(3):
        ax.plot(data[:n_steps, i])
    # titles/legends only need to be set once per axis, not per curve
    ax.set_title(title)
    ax.legend(["idx 0", "idx 1", "idx 2"])
    ax.set_xlabel("time steps")

plt.tight_layout()
plt.savefig(fig_dir + "/" + name + ".png")
plt.show()
93 changes: 93 additions & 0 deletions gym/smooth_exploration/plot_train.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
"""Plot logged joint data per training iteration, optionally with Fourier
transforms of dof_pos_target and markers at noise-resampling steps."""
import os

import matplotlib.pyplot as plt
import numpy as np

FOURIER = True   # also save an FFT figure for full-length iterations
SMOOTH = True    # draw vertical lines where exploration noise is resampled
SAMPLE_FREQ = 16  # steps between noise resamples (must match the run config)
STEPS = 1000      # rollout length (must match the run config)

name = "ref_sample_16_len_1000"
data_dir = "./data_train/" + name
fig_dir = "./figures_train/" + name

# exist_ok avoids the check-then-create race of the exists()/makedirs() pair
os.makedirs(fig_dir, exist_ok=True)

# load data — [0] presumably selects the first logged environment; arrays are
# (iterations, steps, dims) — TODO confirm against the training logger
dof_pos_obs = np.load(data_dir + "/dof_pos_obs.npy")[0]
dof_pos_target = np.load(data_dir + "/dof_pos_target.npy")[0]
dof_vel = np.load(data_dir + "/dof_vel.npy")[0]
torques = np.load(data_dir + "/torques.npy")[0]
terminated = np.load(data_dir + "/terminated.npy")[0]


def plot_fourier(data, it):
    """Save amplitude/phase Fourier plots of the first 3 joints of `data`
    (shape (steps, dims)) for training iteration `it`."""
    fig_ft, axs_ft = plt.subplots(2, figsize=(10, 10))
    for i in range(3):
        ft = np.fft.fft(data[:, i])
        # real input is conjugate-symmetric: keep only the first half
        ft_half = ft[: len(ft) // 2]
        axs_ft[0].plot(np.abs(ft_half))
        axs_ft[1].plot(np.angle(ft_half))

    axs_ft[0].set_title("FT Amplitude")
    axs_ft[0].set_xlabel("Frequency")
    axs_ft[0].set_ylabel("Amplitude")
    axs_ft[0].legend(["idx 0", "idx 1", "idx 2"])
    axs_ft[1].set_title("FT Phase")
    axs_ft[1].set_xlabel("Frequency")
    axs_ft[1].set_ylabel("Phase")
    axs_ft[1].legend(["idx 0", "idx 1", "idx 2"])

    fig_ft.savefig(fig_dir + "/dof_pos_target_FT_it_" + str(it) + ".png")
    # BUGFIX: close the figure — this runs once per plotted iteration and
    # matplotlib otherwise accumulates open figures (memory + warnings)
    plt.close(fig_ft)


# plot data for every 10th iteration
for it in range(0, dof_pos_obs.shape[0], 10):
    # truncate at the first terminated step, if the iteration terminated
    terminate_idx = np.where(terminated[it, :, 0] == 1)[0]
    if terminate_idx.size > 0:
        n_steps = terminate_idx[0]
    else:
        n_steps = dof_pos_obs.shape[1]
    print(n_steps)

    # generate one figure with the four logged signals stacked vertically
    fig, axs = plt.subplots(4, figsize=(10, 10))
    plt.suptitle(name + " iteration " + str(it))

    axs[0].set_title("dof_pos_obs")
    for i in range(3):
        axs[0].plot(dof_pos_obs[it, :n_steps, i])

    axs[1].set_title("dof_pos_target")
    for i in range(3):
        axs[1].plot(dof_pos_target[it, :n_steps, i])
    # only FFT full-length rollouts so spectra are comparable across iterations
    if FOURIER and n_steps == STEPS:
        plot_fourier(dof_pos_target[it, :n_steps, :], it)

    axs[2].set_title("dof_vel")
    for i in range(3):
        axs[2].plot(dof_vel[it, :n_steps, i])

    axs[3].set_title("torques")
    for i in range(3):
        axs[3].plot(torques[it, :n_steps, i])

    # format plots
    for idx in range(4):
        axs[idx].legend(["idx 0", "idx 1", "idx 2"])
        axs[idx].set_xlabel("time steps")
        axs[idx].set_xlim([0, n_steps])

    if SMOOTH:
        # mark the steps where exploration noise is resampled
        for x in range(0, dof_pos_obs.shape[1], SAMPLE_FREQ):
            for ax in axs:
                ax.axvline(x, color="r", linestyle="--")

    fig.tight_layout()
    fig.savefig(fig_dir + "/" + name + "_it_" + str(it) + ".png")
    # BUGFIX: close per-iteration figures to avoid unbounded open figures
    plt.close(fig)
8 changes: 6 additions & 2 deletions learning/algorithms/ppo.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
import torch.nn as nn
import torch.optim as optim

from learning.modules import ActorCritic
from learning.modules import ActorCritic, SmoothActor
from learning.storage import RolloutStorage


Expand Down Expand Up @@ -162,7 +162,11 @@ def update(self):
old_mu_batch,
old_sigma_batch,
) in generator:
self.actor_critic.act(obs_batch)
# TODO[lm]: Look into resampling noise here, gSDE paper seems to do it.
if isinstance(self.actor_critic.actor, SmoothActor):
batch_size = obs_batch.shape[0]
self.actor_critic.actor.sample_weights(batch_size)
self.actor_critic.actor.update_distribution(obs_batch)
actions_log_prob_batch = self.actor_critic.get_actions_log_prob(
actions_batch
)
Expand Down
22 changes: 18 additions & 4 deletions learning/algorithms/ppo2.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
create_uniform_generator,
compute_generalized_advantages,
)
from learning.modules import SmoothActor


class PPO2:
Expand All @@ -26,13 +27,17 @@ def __init__(
desired_kl=0.01,
loss_fn="MSE",
device="cpu",
lr_range=[1e-4, 1e-2],
lr_ratio=1.3,
**kwargs,
):
self.device = device

self.desired_kl = desired_kl
self.schedule = schedule
self.learning_rate = learning_rate
self.lr_range = lr_range
self.lr_ratio = lr_ratio

# * PPO components
self.actor = actor.to(self.device)
Expand Down Expand Up @@ -99,7 +104,7 @@ def update_actor(self, data):
self.mean_surrogate_loss = 0
counter = 0

self.actor.act(data["actor_obs"])
self.actor.update_distribution(data["actor_obs"])
data["old_sigma_batch"] = self.actor.action_std.detach()
data["old_mu_batch"] = self.actor.action_mean.detach()
data["old_actions_log_prob_batch"] = self.actor.get_actions_log_prob(
Expand All @@ -111,8 +116,12 @@ def update_actor(self, data):
batch_size = total_data // self.num_mini_batches
generator = create_uniform_generator(data, batch_size, self.num_learning_epochs)
for batch in generator:
# * Re-sample noise for smooth actor
if isinstance(self.actor, SmoothActor):
self.actor.sample_weights(batch_size)

# ! refactor how this is done
self.actor.act(batch["actor_obs"])
self.actor.update_distribution(batch["actor_obs"])
actions_log_prob_batch = self.actor.get_actions_log_prob(batch["actions"])
mu_batch = self.actor.action_mean
sigma_batch = self.actor.action_std
Expand All @@ -132,11 +141,16 @@ def update_actor(self, data):
axis=-1,
)
kl_mean = torch.mean(kl)
lr_min, lr_max = self.lr_range

if kl_mean > self.desired_kl * 2.0:
self.learning_rate = max(1e-5, self.learning_rate / 1.5)
self.learning_rate = max(
lr_min, self.learning_rate / self.lr_ratio
)
elif kl_mean < self.desired_kl / 2.0 and kl_mean > 0.0:
self.learning_rate = min(1e-2, self.learning_rate * 1.5)
self.learning_rate = min(
lr_max, self.learning_rate * self.lr_ratio
)

for param_group in self.optimizer.param_groups:
# ! check this
Expand Down
Loading