From bb85463c8c8cbd981673262a396f615ee38ba3fa Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Wed, 28 Feb 2024 13:30:12 -0500 Subject: [PATCH 01/19] smooth noise sampling and started gSDE --- .gitignore | 2 + .../mini_cheetah/mini_cheetah_ref_config.py | 2 +- learning/algorithms/ppo.py | 2 +- learning/modules/actor.py | 19 ++ learning/modules/actor_critic.py | 15 ++ learning/modules/utils/__init__.py | 1 + learning/modules/utils/gSDE.py | 191 ++++++++++++++++++ learning/modules/utils/neural_net.py | 9 +- learning/runners/on_policy_runner.py | 4 + plots/plot.py | 19 ++ 10 files changed, 260 insertions(+), 4 deletions(-) create mode 100644 learning/modules/utils/gSDE.py create mode 100644 plots/plot.py diff --git a/.gitignore b/.gitignore index 0b68375a..d2761927 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,8 @@ gym/wandb *.npz user/wandb_config.json *trajectories/ +plots/*.csv +plots/*.png # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 41bb634f..4db376ce 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -67,7 +67,7 @@ class scaling(MiniCheetahCfg.scaling): class MiniCheetahRefRunnerCfg(MiniCheetahRunnerCfg): seed = -1 - runner_class_name = "MyRunner" + runner_class_name = "OnPolicyRunner" class policy(MiniCheetahRunnerCfg.policy): actor_hidden_dims = [256, 256, 128] diff --git a/learning/algorithms/ppo.py b/learning/algorithms/ppo.py index 9c7454bd..3ca5a5b4 100644 --- a/learning/algorithms/ppo.py +++ b/learning/algorithms/ppo.py @@ -152,7 +152,7 @@ def update(self): old_mu_batch, old_sigma_batch, ) in generator: - self.actor_critic.act(obs_batch) + self.actor_critic.actor.update_distribution(obs_batch) actions_log_prob_batch = self.actor_critic.get_actions_log_prob( actions_batch ) diff --git a/learning/modules/actor.py b/learning/modules/actor.py index f9c82396..e588ea5e 100644 --- a/learning/modules/actor.py +++ b/learning/modules/actor.py @@ -37,6 +37,9 @@ def __init__( self.distribution = None # disable args validation for speedup Normal.set_default_validate_args = False + # smooth exploration + self.use_smooth_expl = True + self.episode_noise = None @property def action_mean(self): @@ -58,8 +61,24 @@ def update_distribution(self, observations): def act(self, observations): self.update_distribution(observations) + if self.use_smooth_expl: + return self.act_smooth() return self.distribution.sample() + def act_smooth(self): + mean = self.distribution.mean + if self.episode_noise is None: + sample = self.distribution.sample() + self.episode_noise = sample - self.distribution.mean + else: + sample = mean + self.episode_noise + # write to csv + with open( + "/home/lmolnar/workspace/QGym/plots/distribution_smooth.csv", "a" + ) as f: + f.write(str(mean[0][2].item()) + ", " + str(sample[0][2].item()) + "\n") + return sample + def get_actions_log_prob(self, actions): return self.distribution.log_prob(actions).sum(dim=-1) diff --git a/learning/modules/actor_critic.py b/learning/modules/actor_critic.py index ed837f5e..91365fcc 100644 --- a/learning/modules/actor_critic.py +++ b/learning/modules/actor_critic.py @@ -2,6 +2,7 @@ from .actor import Actor from .critic import Critic +from .utils import StateDependentNoiseDistribution class ActorCritic(nn.Module): @@ -15,6 +16,7 @@ def __init__( activation="elu", init_noise_std=1.0, normalize_obs=True, + use_sde=True, **kwargs, ): if kwargs: @@ -40,6 
+42,19 @@ def __init__( print(f"Actor MLP: {self.actor.NN}") print(f"Critic MLP: {self.critic.NN}") + # TODO[lm]: Decide how to handle the state dependent noise distribution in + # this class. In stable-baselines3 there is a class MlpExtractor which does + # what the Actor and Critic classes do here, just with the latent representation + # of the networks. Either I make a new class in a similar way and store the + # action distribution here, or I make the changes in Actor and Critic and change + # the distribution there. + if use_sde: + self.action_dist = StateDependentNoiseDistribution( + num_actions, + num_actor_obs, + num_critic_obs, + ) + @property def action_mean(self): return self.actor.action_mean diff --git a/learning/modules/utils/__init__.py b/learning/modules/utils/__init__.py index 1422b82d..3bc82600 100644 --- a/learning/modules/utils/__init__.py +++ b/learning/modules/utils/__init__.py @@ -1,2 +1,3 @@ from .neural_net import create_MLP, export_network from .normalize import RunningMeanStd +from .gSDE import StateDependentNoiseDistribution diff --git a/learning/modules/utils/gSDE.py b/learning/modules/utils/gSDE.py new file mode 100644 index 00000000..2920b796 --- /dev/null +++ b/learning/modules/utils/gSDE.py @@ -0,0 +1,191 @@ +from typing import Tuple, Optional + +import torch +import torch.nn as nn +from torch.distributions import Normal + + +# The follow implementation was used in the gSDE paper on smooth exploration. See code: +# https://github.com/DLR-RM/stable-baselines3/blob/56f20e40a2206bbb16501a0f600e29ce1b112ef1/stable_baselines3/common/distributions.py#L421C7-L421C38 +# TODO[lm]: Need to update some of the naming conventions to fit our codebase. +class StateDependentNoiseDistribution: + """ + Distribution class for using generalized State Dependent Exploration (gSDE). + It is used to create the noise exploration matrix and compute the log + probability of an action with that noise. + """ + + latent_sde_dim: Optional[int] + weights_dist: Normal + _latent_sde: torch.Tensor + exploration_mat: torch.Tensor + exploration_matrices: torch.Tensor + + def __init__( + self, + num_actions: int, + num_actor_obs: int, + num_critic_obs: int, + full_std: bool = True, + use_expln: bool = False, + learn_features: bool = False, + epsilon: float = 1e-6, + ): + self.num_actions = num_actions + self.num_actor_obs = num_actor_obs + self.num_critic_obs = num_critic_obs + self.full_std = full_std + self.use_expln = use_expln + self.learn_features = learn_features + self.epsilon = epsilon + self.mean_actions = None + self.log_std = None + + def get_std(self, log_std: torch.Tensor) -> torch.Tensor: + """ + Get the standard deviation from the learned parameter + (log of it by default). This ensures that the std is positive. 
+ + :param log_std: + :return: + """ + if self.use_expln: + # From gSDE paper, it allows to keep variance + # above zero and prevent it from growing too fast + below_threshold = torch.exp(log_std) * (log_std <= 0) + # Avoid NaN: zeros values that are below zero + safe_log_std = log_std * (log_std > 0) + self.epsilon + above_threshold = (torch.log1p(safe_log_std) + 1.0) * (log_std > 0) + std = below_threshold + above_threshold + else: + # Use normal exponential + std = torch.exp(log_std) + + if self.full_std: + return std + assert self.latent_sde_dim is not None + # Reduce the number of parameters: + return torch.ones(self.latent_sde_dim, self.action_dim).to(log_std.device) * std + + def sample_weights(self, log_std: torch.Tensor, batch_size: int = 1) -> None: + """ + Sample weights for the noise exploration matrix, + using a centered Gaussian distribution. + + :param log_std: + :param batch_size: + """ + std = self.get_std(log_std) + self.weights_dist = Normal(torch.zeros_like(std), std) + # Reparametrization trick to pass gradients + self.exploration_mat = self.weights_dist.rsample() + # Pre-compute matrices in case of parallel exploration + self.exploration_matrices = self.weights_dist.rsample((batch_size,)) + + def proba_distribution_net( + self, + latent_dim: int, + log_std_init: float = -2.0, + latent_sde_dim: Optional[int] = None, + ) -> Tuple[nn.Module, nn.Parameter]: + """ + Create the layers and parameter that represent the distribution: + one output will be the deterministic action, the other parameter will be the + standard deviation of the distribution that control the weights of the noise + matrix. + + :param latent_dim: Dimension of the last layer of the policy (before the + action layer) + :param log_std_init: Initial value for the log standard deviation + :param latent_sde_dim: Dimension of the last layer of the features extractor + for gSDE. By default, it is shared with the policy network. 
+ :return: + """ + # Network for the deterministic action, it represents the mean of the + # distribution + mean_actions_net = nn.Linear(latent_dim, self.action_dim) + # When we learn features for the noise, the feature dimension + # can be different between the policy and the noise network + self.latent_sde_dim = latent_dim if latent_sde_dim is None else latent_sde_dim + # Reduce the number of parameters if needed + log_std = ( + torch.ones(self.latent_sde_dim, self.action_dim) + if self.full_std + else torch.ones(self.latent_sde_dim, 1) + ) + # Transform it to a parameter so it can be optimized + log_std = nn.Parameter(log_std * log_std_init, requires_grad=True) + # Sample an exploration matrix + self.sample_weights(log_std) + return mean_actions_net, log_std + + def proba_distribution( + self, + mean_actions: torch.Tensor, + log_std: torch.Tensor, + latent_sde: torch.Tensor, + ) -> "StateDependentNoiseDistribution": + """ + Create the distribution given its parameters (mean, std) + + :param mean_actions: + :param log_std: + :param latent_sde: + :return: + """ + # Stop gradient if we don't want to influence the features + self._latent_sde = latent_sde if self.learn_features else latent_sde.detach() + variance = torch.mm(self._latent_sde**2, self.get_std(log_std) ** 2) + self.distribution = Normal(mean_actions, torch.sqrt(variance + self.epsilon)) + return self + + def log_prob(self, actions: torch.Tensor): + # TODO[lm]: verify that summed correctly (stable-baselines3 is different) + return self.distribution.log_prob(actions).sum(-1) + + def entropy(self): + # TODO[lm]: verify that summed correctly (stable-baselines3 is different) + return self.distribution.entropy().sum(-1) + + def sample(self) -> torch.Tensor: + noise = self.get_noise(self._latent_sde) + actions = self.distribution.mean + noise + return actions + + def mode(self) -> torch.Tensor: + return self.distribution.mean + + def get_noise(self, latent_sde: torch.Tensor) -> torch.Tensor: + latent_sde = latent_sde if self.learn_features else latent_sde.detach() + # Default case: only one exploration matrix + if len(latent_sde) == 1 or len(latent_sde) != len(self.exploration_matrices): + return torch.mm(latent_sde, self.exploration_mat) + # Use batch matrix multiplication for efficient computation + # (batch_size, n_features) -> (batch_size, 1, n_features) + latent_sde = latent_sde.unsqueeze(dim=1) + # (batch_size, 1, n_actions) + noise = torch.bmm(latent_sde, self.exploration_matrices) + return noise.squeeze(dim=1) + + def actions_from_params( + self, + mean_actions: torch.Tensor, + log_std: torch.Tensor, + latent_sde: torch.Tensor, + deterministic: bool = False, + ) -> torch.Tensor: + # Update the proba distribution + self.proba_distribution(mean_actions, log_std, latent_sde) + if deterministic: + return self.mode() + return self.sample() + + def log_prob_from_params( + self, + mean_actions: torch.Tensor, + log_std: torch.Tensor, + latent_sde: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + actions = self.actions_from_params(mean_actions, log_std, latent_sde) + log_prob = self.log_prob(actions) + return actions, log_prob diff --git a/learning/modules/utils/neural_net.py b/learning/modules/utils/neural_net.py index 5fd6ab3b..fa25d77b 100644 --- a/learning/modules/utils/neural_net.py +++ b/learning/modules/utils/neural_net.py @@ -3,7 +3,9 @@ import copy -def create_MLP(num_inputs, num_outputs, hidden_dims, activation, dropouts=None): +def create_MLP( + num_inputs, num_outputs, hidden_dims, activation, dropouts=None, 
latent=False +): activation = get_activation(activation) if dropouts is None: @@ -15,7 +17,10 @@ def create_MLP(num_inputs, num_outputs, hidden_dims, activation, dropouts=None): else: add_layer(layers, num_inputs, hidden_dims[0], activation, dropouts[0]) for i in range(len(hidden_dims)): - if i == len(hidden_dims) - 1: + # TODO[lm]: Could also create a separate function that gives the latent + # reprentation used for smooth exploration (but if it doesn't mess up + # anything, this is simpler) + if i == len(hidden_dims) - 1 and not latent: add_layer(layers, hidden_dims[i], num_outputs) else: add_layer( diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index e2fb524e..9bf7c9f7 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -37,6 +37,10 @@ def learn(self): # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): + # Reset noise for smooth exploration + if self.alg.actor_critic.actor.use_smooth_expl and i == 0: + self.alg.actor_critic.actor.episode_noise = None + actions = self.alg.act(actor_obs, critic_obs) self.set_actions( self.policy_cfg["actions"], diff --git a/plots/plot.py b/plots/plot.py new file mode 100644 index 00000000..2a33eafa --- /dev/null +++ b/plots/plot.py @@ -0,0 +1,19 @@ +import pandas as pd +import matplotlib.pyplot as plt + +# Read the CSV file +name = "distribution_smooth" +data = pd.read_csv(name + ".csv") + +# Plot the data +n = 200 +plt.plot(data.iloc[:n, 0]) +plt.plot(data.iloc[:n, 1]) +plt.xlabel("timestep") +plt.ylabel("action") +plt.title("Smoothing every rollout") +plt.legend(["mean", "sample"]) +# plt.show() + +# Save the plot to a file +plt.savefig(name + ".png") From 243cf0f787b268d4b6f75e35b6524e3de911108b Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 1 Mar 2024 10:41:25 -0500 Subject: [PATCH 02/19] adress comments --- gym/envs/base/legged_robot_config.py | 1 + gym/envs/mini_cheetah/mini_cheetah_config.py | 1 + .../mini_cheetah/mini_cheetah_osc_config.py | 1 + .../mini_cheetah/mini_cheetah_ref_config.py | 1 + gym/envs/mit_humanoid/mit_humanoid_config.py | 1 + learning/modules/actor.py | 44 +++++++++++-------- learning/modules/actor_critic.py | 32 +++++++++----- learning/modules/utils/gSDE.py | 29 +++--------- learning/runners/on_policy_runner.py | 2 +- 9 files changed, 58 insertions(+), 54 deletions(-) diff --git a/gym/envs/base/legged_robot_config.py b/gym/envs/base/legged_robot_config.py index 0de83e0e..144f8323 100644 --- a/gym/envs/base/legged_robot_config.py +++ b/gym/envs/base/legged_robot_config.py @@ -240,6 +240,7 @@ class policy: # can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" normalize_obs = True + smooth_exploration = False actor_obs = [ "observation_a", diff --git a/gym/envs/mini_cheetah/mini_cheetah_config.py b/gym/envs/mini_cheetah/mini_cheetah_config.py index de015b21..ef9896e0 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_config.py @@ -130,6 +130,7 @@ class policy(LeggedRobotRunnerCfg.policy): critic_hidden_dims = [128, 64] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + smooth_exploration = False actor_obs = [ "base_lin_vel", diff --git a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py index a08d5179..45191a9f 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py @@ -166,6 +166,7 @@ class 
policy(MiniCheetahRunnerCfg.policy): critic_hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + smooth_exploration = False actor_obs = [ "base_ang_vel", diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 4db376ce..2f894bde 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -74,6 +74,7 @@ class policy(MiniCheetahRunnerCfg.policy): critic_hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + smooth_exploration = True actor_obs = [ "base_ang_vel", diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index 3439e58c..25db6163 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -187,6 +187,7 @@ class policy(LeggedRobotRunnerCfg.policy): critic_hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + smooth_exploration = False actor_obs = [ "base_height", diff --git a/learning/modules/actor.py b/learning/modules/actor.py index e588ea5e..857d7326 100644 --- a/learning/modules/actor.py +++ b/learning/modules/actor.py @@ -5,6 +5,8 @@ from .utils import export_network from .utils import RunningMeanStd +from gym import LEGGED_GYM_ROOT_DIR + class Actor(nn.Module): def __init__( @@ -37,9 +39,6 @@ def __init__( self.distribution = None # disable args validation for speedup Normal.set_default_validate_args = False - # smooth exploration - self.use_smooth_expl = True - self.episode_noise = None @property def action_mean(self): @@ -61,24 +60,8 @@ def update_distribution(self, observations): def act(self, observations): self.update_distribution(observations) - if self.use_smooth_expl: - return self.act_smooth() return self.distribution.sample() - def act_smooth(self): - mean = self.distribution.mean - if self.episode_noise is None: - sample = self.distribution.sample() - self.episode_noise = sample - self.distribution.mean - else: - sample = mean + self.episode_noise - # write to csv - with open( - "/home/lmolnar/workspace/QGym/plots/distribution_smooth.csv", "a" - ) as f: - f.write(str(mean[0][2].item()) + ", " + str(sample[0][2].item()) + "\n") - return sample - def get_actions_log_prob(self, actions): return self.distribution.log_prob(actions).sum(dim=-1) @@ -97,3 +80,26 @@ def normalize(self, observation): def export(self, path): export_network(self, "policy", path, self.num_obs) + + +class SmoothActor(Actor): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # Noise for smooth exploration + self.episode_noise = None + self.debug = False + + def act(self, observations): + self.update_distribution(observations) + mean = self.distribution.mean + if self.episode_noise is None: + sample = self.distribution.sample() + self.episode_noise = sample - self.distribution.mean + else: + sample = mean + self.episode_noise + + if self.debug: + # write to csv (used for plotting) + with open(f"{LEGGED_GYM_ROOT_DIR}/plots/distribution_smooth.csv", "a") as f: + f.write(str(mean[0][2].item()) + ", " + str(sample[0][2].item()) + "\n") + return sample diff --git a/learning/modules/actor_critic.py b/learning/modules/actor_critic.py index 91365fcc..669a9f6a 100644 --- a/learning/modules/actor_critic.py +++ b/learning/modules/actor_critic.py @@ -1,6 +1,6 @@ import torch.nn as nn -from .actor import 
Actor +from .actor import Actor, SmoothActor from .critic import Critic from .utils import StateDependentNoiseDistribution @@ -16,7 +16,7 @@ def __init__( activation="elu", init_noise_std=1.0, normalize_obs=True, - use_sde=True, + smooth_exploration=False, **kwargs, ): if kwargs: @@ -26,14 +26,24 @@ def __init__( ) super(ActorCritic, self).__init__() - self.actor = Actor( - num_actor_obs, - num_actions, - actor_hidden_dims, - activation, - init_noise_std, - normalize_obs, - ) + if smooth_exploration: + self.actor = SmoothActor( + num_actor_obs, + num_actions, + actor_hidden_dims, + activation, + init_noise_std, + normalize_obs, + ) + else: + self.actor = Actor( + num_actor_obs, + num_actions, + actor_hidden_dims, + activation, + init_noise_std, + normalize_obs, + ) self.critic = Critic( num_critic_obs, critic_hidden_dims, activation, normalize_obs @@ -48,7 +58,7 @@ def __init__( # of the networks. Either I make a new class in a similar way and store the # action distribution here, or I make the changes in Actor and Critic and change # the distribution there. - if use_sde: + if smooth_exploration: self.action_dist = StateDependentNoiseDistribution( num_actions, num_actor_obs, diff --git a/learning/modules/utils/gSDE.py b/learning/modules/utils/gSDE.py index 2920b796..6e04b0e9 100644 --- a/learning/modules/utils/gSDE.py +++ b/learning/modules/utils/gSDE.py @@ -5,7 +5,8 @@ from torch.distributions import Normal -# The follow implementation was used in the gSDE paper on smooth exploration. See code: +# The following implementation was used in the gSDE paper on smooth exploration. +# See code: # https://github.com/DLR-RM/stable-baselines3/blob/56f20e40a2206bbb16501a0f600e29ce1b112ef1/stable_baselines3/common/distributions.py#L421C7-L421C38 # TODO[lm]: Need to update some of the naming conventions to fit our codebase. class StateDependentNoiseDistribution: @@ -45,9 +46,6 @@ def get_std(self, log_std: torch.Tensor) -> torch.Tensor: """ Get the standard deviation from the learned parameter (log of it by default). This ensures that the std is positive. - - :param log_std: - :return: """ if self.use_expln: # From gSDE paper, it allows to keep variance @@ -71,9 +69,6 @@ def sample_weights(self, log_std: torch.Tensor, batch_size: int = 1) -> None: """ Sample weights for the noise exploration matrix, using a centered Gaussian distribution. - - :param log_std: - :param batch_size: """ std = self.get_std(log_std) self.weights_dist = Normal(torch.zeros_like(std), std) @@ -82,7 +77,7 @@ def sample_weights(self, log_std: torch.Tensor, batch_size: int = 1) -> None: # Pre-compute matrices in case of parallel exploration self.exploration_matrices = self.weights_dist.rsample((batch_size,)) - def proba_distribution_net( + def get_distribution_net( self, latent_dim: int, log_std_init: float = -2.0, @@ -93,13 +88,6 @@ def proba_distribution_net( one output will be the deterministic action, the other parameter will be the standard deviation of the distribution that control the weights of the noise matrix. - - :param latent_dim: Dimension of the last layer of the policy (before the - action layer) - :param log_std_init: Initial value for the log standard deviation - :param latent_sde_dim: Dimension of the last layer of the features extractor - for gSDE. By default, it is shared with the policy network. 
- :return: """ # Network for the deterministic action, it represents the mean of the # distribution @@ -119,7 +107,7 @@ def proba_distribution_net( self.sample_weights(log_std) return mean_actions_net, log_std - def proba_distribution( + def get_distribution( self, mean_actions: torch.Tensor, log_std: torch.Tensor, @@ -127,11 +115,6 @@ def proba_distribution( ) -> "StateDependentNoiseDistribution": """ Create the distribution given its parameters (mean, std) - - :param mean_actions: - :param log_std: - :param latent_sde: - :return: """ # Stop gradient if we don't want to influence the features self._latent_sde = latent_sde if self.learn_features else latent_sde.detach() @@ -174,8 +157,8 @@ def actions_from_params( latent_sde: torch.Tensor, deterministic: bool = False, ) -> torch.Tensor: - # Update the proba distribution - self.proba_distribution(mean_actions, log_std, latent_sde) + # Update the distribution + self.get_distribution(mean_actions, log_std, latent_sde) if deterministic: return self.mode() return self.sample() diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 9bf7c9f7..1cc3e0ef 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -38,7 +38,7 @@ def learn(self): with torch.inference_mode(): for i in range(self.num_steps_per_env): # Reset noise for smooth exploration - if self.alg.actor_critic.actor.use_smooth_expl and i == 0: + if self.policy_cfg["smooth_exploration"] and i == 0: self.alg.actor_critic.actor.episode_noise = None actions = self.alg.act(actor_obs, critic_obs) From d838dc6d140b648f84300efd4f92c9f714745e1c Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 1 Mar 2024 11:34:49 -0500 Subject: [PATCH 03/19] start moving things to SmoothActor --- learning/modules/actor.py | 22 +++++++++++++++++++++- learning/modules/actor_critic.py | 14 -------------- learning/modules/critic.py | 2 +- learning/modules/utils/gSDE.py | 2 -- learning/modules/utils/neural_net.py | 5 +++-- 5 files changed, 25 insertions(+), 20 deletions(-) diff --git a/learning/modules/actor.py b/learning/modules/actor.py index 857d7326..c8ffe255 100644 --- a/learning/modules/actor.py +++ b/learning/modules/actor.py @@ -4,6 +4,7 @@ from .utils import create_MLP from .utils import export_network from .utils import RunningMeanStd +from .utils import StateDependentNoiseDistribution from gym import LEGGED_GYM_ROOT_DIR @@ -32,7 +33,11 @@ def __init__( self.num_obs = num_obs self.num_actions = num_actions - self.NN = create_MLP(num_obs, num_actions, hidden_dims, activation) + self.hidden_dims = hidden_dims + self.activation = activation + self.NN = create_MLP( + num_obs, num_actions, hidden_dims, activation, latent=False + ) # Action noise self.std = nn.Parameter(init_noise_std * torch.ones(num_actions)) @@ -85,11 +90,26 @@ def export(self, path): class SmoothActor(Actor): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + # Create latent NN + self.NN = create_MLP( + self.num_obs, + self.num_actions, + self.hidden_dims, + self.activation, + latent=True, + ) + # State dependent action distribution + self.distribution = StateDependentNoiseDistribution( + self.num_actions, + self.num_obs, + ) # Noise for smooth exploration self.episode_noise = None + # Debug mode for plotting self.debug = False def act(self, observations): + # TODO[lm]: update distribution for gSDE self.update_distribution(observations) mean = self.distribution.mean if self.episode_noise is None: diff --git 
a/learning/modules/actor_critic.py b/learning/modules/actor_critic.py index 669a9f6a..770fdc71 100644 --- a/learning/modules/actor_critic.py +++ b/learning/modules/actor_critic.py @@ -2,7 +2,6 @@ from .actor import Actor, SmoothActor from .critic import Critic -from .utils import StateDependentNoiseDistribution class ActorCritic(nn.Module): @@ -52,19 +51,6 @@ def __init__( print(f"Actor MLP: {self.actor.NN}") print(f"Critic MLP: {self.critic.NN}") - # TODO[lm]: Decide how to handle the state dependent noise distribution in - # this class. In stable-baselines3 there is a class MlpExtractor which does - # what the Actor and Critic classes do here, just with the latent representation - # of the networks. Either I make a new class in a similar way and store the - # action distribution here, or I make the changes in Actor and Critic and change - # the distribution there. - if smooth_exploration: - self.action_dist = StateDependentNoiseDistribution( - num_actions, - num_actor_obs, - num_critic_obs, - ) - @property def action_mean(self): return self.actor.action_mean diff --git a/learning/modules/critic.py b/learning/modules/critic.py index 47e0decf..430eca33 100644 --- a/learning/modules/critic.py +++ b/learning/modules/critic.py @@ -20,7 +20,7 @@ def __init__( ) super().__init__() - self.NN = create_MLP(num_obs, 1, hidden_dims, activation) + self.NN = create_MLP(num_obs, 1, hidden_dims, activation, latent=False) self._normalize_obs = normalize_obs if self._normalize_obs: self.obs_rms = RunningMeanStd(num_obs) diff --git a/learning/modules/utils/gSDE.py b/learning/modules/utils/gSDE.py index 6e04b0e9..cd100053 100644 --- a/learning/modules/utils/gSDE.py +++ b/learning/modules/utils/gSDE.py @@ -26,7 +26,6 @@ def __init__( self, num_actions: int, num_actor_obs: int, - num_critic_obs: int, full_std: bool = True, use_expln: bool = False, learn_features: bool = False, @@ -34,7 +33,6 @@ def __init__( ): self.num_actions = num_actions self.num_actor_obs = num_actor_obs - self.num_critic_obs = num_critic_obs self.full_std = full_std self.use_expln = use_expln self.learn_features = learn_features diff --git a/learning/modules/utils/neural_net.py b/learning/modules/utils/neural_net.py index fa25d77b..057e724f 100644 --- a/learning/modules/utils/neural_net.py +++ b/learning/modules/utils/neural_net.py @@ -20,8 +20,9 @@ def create_MLP( # TODO[lm]: Could also create a separate function that gives the latent # reprentation used for smooth exploration (but if it doesn't mess up # anything, this is simpler) - if i == len(hidden_dims) - 1 and not latent: - add_layer(layers, hidden_dims[i], num_outputs) + if i == len(hidden_dims) - 1: + if not latent: + add_layer(layers, hidden_dims[i], num_outputs) else: add_layer( layers, From aea5d88fb736087108cf37e66aa0ea8093638514 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 1 Mar 2024 16:25:05 -0500 Subject: [PATCH 04/19] moved everything to SmoothActor (it runs) --- learning/modules/actor.py | 41 ------- learning/modules/actor_critic.py | 3 +- learning/modules/smooth_actor.py | 130 ++++++++++++++++++++ learning/modules/utils/__init__.py | 1 - learning/modules/utils/gSDE.py | 172 --------------------------- learning/runners/on_policy_runner.py | 7 +- plots/plot.py | 10 +- 7 files changed, 140 insertions(+), 224 deletions(-) create mode 100644 learning/modules/smooth_actor.py delete mode 100644 learning/modules/utils/gSDE.py diff --git a/learning/modules/actor.py b/learning/modules/actor.py index c8ffe255..17cf0566 100644 --- a/learning/modules/actor.py +++ 
b/learning/modules/actor.py @@ -4,9 +4,6 @@ from .utils import create_MLP from .utils import export_network from .utils import RunningMeanStd -from .utils import StateDependentNoiseDistribution - -from gym import LEGGED_GYM_ROOT_DIR class Actor(nn.Module): @@ -85,41 +82,3 @@ def normalize(self, observation): def export(self, path): export_network(self, "policy", path, self.num_obs) - - -class SmoothActor(Actor): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - # Create latent NN - self.NN = create_MLP( - self.num_obs, - self.num_actions, - self.hidden_dims, - self.activation, - latent=True, - ) - # State dependent action distribution - self.distribution = StateDependentNoiseDistribution( - self.num_actions, - self.num_obs, - ) - # Noise for smooth exploration - self.episode_noise = None - # Debug mode for plotting - self.debug = False - - def act(self, observations): - # TODO[lm]: update distribution for gSDE - self.update_distribution(observations) - mean = self.distribution.mean - if self.episode_noise is None: - sample = self.distribution.sample() - self.episode_noise = sample - self.distribution.mean - else: - sample = mean + self.episode_noise - - if self.debug: - # write to csv (used for plotting) - with open(f"{LEGGED_GYM_ROOT_DIR}/plots/distribution_smooth.csv", "a") as f: - f.write(str(mean[0][2].item()) + ", " + str(sample[0][2].item()) + "\n") - return sample diff --git a/learning/modules/actor_critic.py b/learning/modules/actor_critic.py index 770fdc71..29f6cdf7 100644 --- a/learning/modules/actor_critic.py +++ b/learning/modules/actor_critic.py @@ -1,6 +1,7 @@ import torch.nn as nn -from .actor import Actor, SmoothActor +from .actor import Actor +from .smooth_actor import SmoothActor from .critic import Critic diff --git a/learning/modules/smooth_actor.py b/learning/modules/smooth_actor.py new file mode 100644 index 00000000..6c6a42c7 --- /dev/null +++ b/learning/modules/smooth_actor.py @@ -0,0 +1,130 @@ +import torch +import torch.nn as nn +from torch.distributions import Normal + +from .actor import Actor +from .utils import create_MLP + +from gym import LEGGED_GYM_ROOT_DIR + + +# The following implementation is based on the gSDE paper. 
See code: +# https://github.com/DLR-RM/stable-baselines3/blob/56f20e40a2206bbb16501a0f600e29ce1b112ef1/stable_baselines3/common/distributions.py#L421C7-L421C38 +class SmoothActor(Actor): + weights_dist: Normal + _latent_sde: torch.Tensor + exploration_mat: torch.Tensor + exploration_matrices: torch.Tensor + + def __init__( + self, + *args, + full_std: bool = True, + use_exp_ln: bool = False, + learn_features: bool = False, + epsilon: float = 1e-6, + log_std_init: float = -2.0, + **kwargs, + ): + super().__init__(*args, **kwargs) + self.full_std = full_std + self.use_expln = use_exp_ln + self.learn_features = learn_features + self.epsilon = epsilon + self.log_std_init = log_std_init + + # Create latent NN and last layer + self.latent_net = create_MLP( + self.num_obs, + self.num_actions, + self.hidden_dims, + self.activation, + latent=True, + ) + self.latent_dim = self.hidden_dims[-1] + self.mean_actions_net = nn.Linear(self.latent_dim, self.num_actions) + # Reduce the number of parameters if needed + if self.full_std: + log_std = torch.ones(self.latent_dim, self.num_actions) + else: + log_std = torch.ones(self.latent_dim, 1) + self.log_std = nn.Parameter(log_std * self.log_std_init, requires_grad=True) + # Sample an exploration matrix (this sets the exploration matrices) + self.sample_weights() + self.distribution = None + + # Debug mode for plotting + self.debug = False + + def sample_weights(self, batch_size=1): + # Sample weights for the noise exploration matrix + std = self.get_std() + self.weights_dist = Normal(torch.zeros_like(std), std) + # Reparametrization trick to pass gradients + self.exploration_mat = self.weights_dist.rsample() + # Pre-compute matrices in case of parallel exploration + self.exploration_matrices = self.weights_dist.rsample((batch_size,)) + + def get_std(self): + # TODO[lm]: Check if this is ok, and can use action_std in ActorCritic normally + if self.use_expln: + # From gSDE paper, it allows to keep variance + # above zero and prevent it from growing too fast + below_threshold = torch.exp(self.log_std) * (self.log_std <= 0) + # Avoid NaN: zeros values that are below zero + safe_log_std = self.log_std * (self.log_std > 0) + self.epsilon + above_threshold = (torch.log1p(safe_log_std) + 1.0) * (self.log_std > 0) + std = below_threshold + above_threshold + else: + # Use normal exponential + std = torch.exp(self.log_std) + + if self.full_std: + return std + assert self.latent_dim is not None + # Reduce the number of parameters: + return ( + torch.ones(self.latent_dim, self.num_actions).to(self.log_std.device) * std + ) + + def update_distribution(self, observations): + if self._normalize_obs: + observations = self.normalize(observations) + # Get latent features and compute distribution + self._latent_sde = self.latent_net(observations) + if not self.learn_features: + self._latent_sde = self._latent_sde.detach() + variance = torch.mm(self._latent_sde**2, self.get_std() ** 2) + mean_actions = self.mean_actions_net(self._latent_sde) + self.distribution = Normal(mean_actions, torch.sqrt(variance + self.epsilon)) + + def act(self, observations): + self.update_distribution(observations) + mean = self.distribution.mean + sample = mean + self.get_noise() + if self.debug: + # write to csv (used for plotting) + with open(f"{LEGGED_GYM_ROOT_DIR}/plots/distribution_smooth.csv", "a") as f: + f.write(str(mean[0][2].item()) + ", " + str(sample[0][2].item()) + "\n") + return sample + + def act_inference(self, observations): + if self._normalize_obs: + observations = 
self.normalize(observations) + latent_sde = self.latent_net(observations) + mean_actions = self.mean_actions_net(latent_sde) + return mean_actions + + def get_noise(self): + latent_sde = self._latent_sde + if not self.learn_features: + latent_sde = latent_sde.detach() + # Default case: only one exploration matrix + if len(latent_sde) == 1 or len(latent_sde) != len(self.exploration_matrices): + return torch.mm(latent_sde, self.exploration_mat.to(latent_sde.device)) + # Use batch matrix multiplication for efficient computation + # (batch_size, n_features) -> (batch_size, 1, n_features) + latent_sde = latent_sde.unsqueeze(dim=1) + # (batch_size, 1, n_actions) + noise = torch.bmm(latent_sde, self.exploration_matrices) + return noise.squeeze(dim=1) diff --git a/learning/modules/utils/__init__.py b/learning/modules/utils/__init__.py index 3bc82600..1422b82d 100644 --- a/learning/modules/utils/__init__.py +++ b/learning/modules/utils/__init__.py @@ -1,3 +1,2 @@ from .neural_net import create_MLP, export_network from .normalize import RunningMeanStd -from .gSDE import StateDependentNoiseDistribution diff --git a/learning/modules/utils/gSDE.py b/learning/modules/utils/gSDE.py deleted file mode 100644 index cd100053..00000000 --- a/learning/modules/utils/gSDE.py +++ /dev/null @@ -1,172 +0,0 @@ -from typing import Tuple, Optional - -import torch -import torch.nn as nn -from torch.distributions import Normal - - -# The following implementation was used in the gSDE paper on smooth exploration. -# See code: -# https://github.com/DLR-RM/stable-baselines3/blob/56f20e40a2206bbb16501a0f600e29ce1b112ef1/stable_baselines3/common/distributions.py#L421C7-L421C38 -# TODO[lm]: Need to update some of the naming conventions to fit our codebase. -class StateDependentNoiseDistribution: - """ - Distribution class for using generalized State Dependent Exploration (gSDE). - It is used to create the noise exploration matrix and compute the log - probability of an action with that noise. - """ - - latent_sde_dim: Optional[int] - weights_dist: Normal - _latent_sde: torch.Tensor - exploration_mat: torch.Tensor - exploration_matrices: torch.Tensor - - def __init__( - self, - num_actions: int, - num_actor_obs: int, - full_std: bool = True, - use_expln: bool = False, - learn_features: bool = False, - epsilon: float = 1e-6, - ): - self.num_actions = num_actions - self.num_actor_obs = num_actor_obs - self.full_std = full_std - self.use_expln = use_expln - self.learn_features = learn_features - self.epsilon = epsilon - self.mean_actions = None - self.log_std = None - - def get_std(self, log_std: torch.Tensor) -> torch.Tensor: - """ - Get the standard deviation from the learned parameter - (log of it by default). This ensures that the std is positive. 
- """ - if self.use_expln: - # From gSDE paper, it allows to keep variance - # above zero and prevent it from growing too fast - below_threshold = torch.exp(log_std) * (log_std <= 0) - # Avoid NaN: zeros values that are below zero - safe_log_std = log_std * (log_std > 0) + self.epsilon - above_threshold = (torch.log1p(safe_log_std) + 1.0) * (log_std > 0) - std = below_threshold + above_threshold - else: - # Use normal exponential - std = torch.exp(log_std) - - if self.full_std: - return std - assert self.latent_sde_dim is not None - # Reduce the number of parameters: - return torch.ones(self.latent_sde_dim, self.action_dim).to(log_std.device) * std - - def sample_weights(self, log_std: torch.Tensor, batch_size: int = 1) -> None: - """ - Sample weights for the noise exploration matrix, - using a centered Gaussian distribution. - """ - std = self.get_std(log_std) - self.weights_dist = Normal(torch.zeros_like(std), std) - # Reparametrization trick to pass gradients - self.exploration_mat = self.weights_dist.rsample() - # Pre-compute matrices in case of parallel exploration - self.exploration_matrices = self.weights_dist.rsample((batch_size,)) - - def get_distribution_net( - self, - latent_dim: int, - log_std_init: float = -2.0, - latent_sde_dim: Optional[int] = None, - ) -> Tuple[nn.Module, nn.Parameter]: - """ - Create the layers and parameter that represent the distribution: - one output will be the deterministic action, the other parameter will be the - standard deviation of the distribution that control the weights of the noise - matrix. - """ - # Network for the deterministic action, it represents the mean of the - # distribution - mean_actions_net = nn.Linear(latent_dim, self.action_dim) - # When we learn features for the noise, the feature dimension - # can be different between the policy and the noise network - self.latent_sde_dim = latent_dim if latent_sde_dim is None else latent_sde_dim - # Reduce the number of parameters if needed - log_std = ( - torch.ones(self.latent_sde_dim, self.action_dim) - if self.full_std - else torch.ones(self.latent_sde_dim, 1) - ) - # Transform it to a parameter so it can be optimized - log_std = nn.Parameter(log_std * log_std_init, requires_grad=True) - # Sample an exploration matrix - self.sample_weights(log_std) - return mean_actions_net, log_std - - def get_distribution( - self, - mean_actions: torch.Tensor, - log_std: torch.Tensor, - latent_sde: torch.Tensor, - ) -> "StateDependentNoiseDistribution": - """ - Create the distribution given its parameters (mean, std) - """ - # Stop gradient if we don't want to influence the features - self._latent_sde = latent_sde if self.learn_features else latent_sde.detach() - variance = torch.mm(self._latent_sde**2, self.get_std(log_std) ** 2) - self.distribution = Normal(mean_actions, torch.sqrt(variance + self.epsilon)) - return self - - def log_prob(self, actions: torch.Tensor): - # TODO[lm]: verify that summed correctly (stable-baselines3 is different) - return self.distribution.log_prob(actions).sum(-1) - - def entropy(self): - # TODO[lm]: verify that summed correctly (stable-baselines3 is different) - return self.distribution.entropy().sum(-1) - - def sample(self) -> torch.Tensor: - noise = self.get_noise(self._latent_sde) - actions = self.distribution.mean + noise - return actions - - def mode(self) -> torch.Tensor: - return self.distribution.mean - - def get_noise(self, latent_sde: torch.Tensor) -> torch.Tensor: - latent_sde = latent_sde if self.learn_features else latent_sde.detach() - # Default case: only 
one exploration matrix - if len(latent_sde) == 1 or len(latent_sde) != len(self.exploration_matrices): - return torch.mm(latent_sde, self.exploration_mat) - # Use batch matrix multiplication for efficient computation - # (batch_size, n_features) -> (batch_size, 1, n_features) - latent_sde = latent_sde.unsqueeze(dim=1) - # (batch_size, 1, n_actions) - noise = torch.bmm(latent_sde, self.exploration_matrices) - return noise.squeeze(dim=1) - - def actions_from_params( - self, - mean_actions: torch.Tensor, - log_std: torch.Tensor, - latent_sde: torch.Tensor, - deterministic: bool = False, - ) -> torch.Tensor: - # Update the distribution - self.get_distribution(mean_actions, log_std, latent_sde) - if deterministic: - return self.mode() - return self.sample() - - def log_prob_from_params( - self, - mean_actions: torch.Tensor, - log_std: torch.Tensor, - latent_sde: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: - actions = self.actions_from_params(mean_actions, log_std, latent_sde) - log_prob = self.log_prob(actions) - return actions, log_prob diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 1cc3e0ef..4f68e164 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -34,13 +34,12 @@ def learn(self): for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") logger.tic("collection") + # * Re-sample noise matrix for smooth exploration + if self.policy_cfg["smooth_exploration"]: + self.alg.actor_critic.actor.sample_weights() # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): - # Reset noise for smooth exploration - if self.policy_cfg["smooth_exploration"] and i == 0: - self.alg.actor_critic.actor.episode_noise = None - actions = self.alg.act(actor_obs, critic_obs) self.set_actions( self.policy_cfg["actions"], diff --git a/plots/plot.py b/plots/plot.py index 2a33eafa..adda11ce 100644 --- a/plots/plot.py +++ b/plots/plot.py @@ -5,13 +5,13 @@ name = "distribution_smooth" data = pd.read_csv(name + ".csv") -# Plot the data -n = 200 -plt.plot(data.iloc[:n, 0]) -plt.plot(data.iloc[:n, 1]) +# Plot the data (last n steps) +n = 500 +plt.plot(data.iloc[-n:, 0]) +plt.plot(data.iloc[-n:, 1]) plt.xlabel("timestep") plt.ylabel("action") -plt.title("Smoothing every rollout") +plt.title("gSDE") plt.legend(["mean", "sample"]) # plt.show() From 3603205cf8ab988e04af4e0b69412768b04b4607 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 1 Mar 2024 18:21:56 -0500 Subject: [PATCH 05/19] possibly resample in PPO update --- learning/algorithms/ppo.py | 6 +++++- learning/modules/__init__.py | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/learning/algorithms/ppo.py b/learning/algorithms/ppo.py index 3ca5a5b4..ad31f846 100644 --- a/learning/algorithms/ppo.py +++ b/learning/algorithms/ppo.py @@ -34,7 +34,7 @@ import torch.nn as nn import torch.optim as optim -from learning.modules import ActorCritic +from learning.modules import ActorCritic, SmoothActor from learning.storage import RolloutStorage @@ -152,6 +152,10 @@ def update(self): old_mu_batch, old_sigma_batch, ) in generator: + # TODO[lm]: Look into resampling noise here, gSDE paper seems to do it. 
+ if isinstance(self.actor_critic.actor, SmoothActor): + batch_size = obs_batch.shape[0] + self.actor_critic.actor.sample_weights(batch_size) self.actor_critic.actor.update_distribution(obs_batch) actions_log_prob_batch = self.actor_critic.get_actions_log_prob( actions_batch diff --git a/learning/modules/__init__.py b/learning/modules/__init__.py index 75cca5e2..86c96759 100644 --- a/learning/modules/__init__.py +++ b/learning/modules/__init__.py @@ -33,3 +33,4 @@ from .actor_critic import ActorCritic from .state_estimator import StateEstimatorNN from .actor import Actor +from .smooth_actor import SmoothActor From bb0a273982a5b90f3b97bbbfca4c8f7851fa5b63 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Mon, 4 Mar 2024 11:04:36 -0500 Subject: [PATCH 06/19] learn_features=True and correct sample dim --- learning/modules/smooth_actor.py | 4 ++-- learning/runners/on_policy_runner.py | 2 +- plots/plot.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/learning/modules/smooth_actor.py b/learning/modules/smooth_actor.py index 6c6a42c7..200f03f2 100644 --- a/learning/modules/smooth_actor.py +++ b/learning/modules/smooth_actor.py @@ -21,7 +21,7 @@ def __init__( *args, full_std: bool = True, use_exp_ln: bool = False, - learn_features: bool = False, + learn_features: bool = True, epsilon: float = 1e-6, log_std_init: float = -2.0, **kwargs, @@ -49,7 +49,7 @@ def __init__( else: log_std = torch.ones(self.latent_dim, 1) self.log_std = nn.Parameter(log_std * self.log_std_init, requires_grad=True) - # Sample an exploration matrix (this sets the exploration matrices) + # Sample an exploration matrix self.sample_weights() self.distribution = None diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 4f68e164..e183b816 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -36,7 +36,7 @@ def learn(self): logger.tic("collection") # * Re-sample noise matrix for smooth exploration if self.policy_cfg["smooth_exploration"]: - self.alg.actor_critic.actor.sample_weights() + self.alg.actor_critic.actor.sample_weights(batch_size=self.env.num_envs) # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): diff --git a/plots/plot.py b/plots/plot.py index adda11ce..5de8ba35 100644 --- a/plots/plot.py +++ b/plots/plot.py @@ -10,7 +10,7 @@ plt.plot(data.iloc[-n:, 0]) plt.plot(data.iloc[-n:, 1]) plt.xlabel("timestep") -plt.ylabel("action") +plt.ylabel("action (NN output)") plt.title("gSDE") plt.legend(["mean", "sample"]) # plt.show() From 40025c43de53627d9ca615c0aa40d9b88476755d Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Mon, 4 Mar 2024 12:01:29 -0500 Subject: [PATCH 07/19] adjust sample freq and update plotting --- gym/envs/mini_cheetah/mini_cheetah_ref_config.py | 1 + learning/modules/actor.py | 16 +++++++++++++++- learning/modules/smooth_actor.py | 7 +++---- learning/runners/on_policy_runner.py | 10 +++++++--- plots/plot.py | 4 ++-- 5 files changed, 28 insertions(+), 10 deletions(-) diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 2f894bde..fda3f345 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -75,6 +75,7 @@ class policy(MiniCheetahRunnerCfg.policy): # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" smooth_exploration = True + exploration_sample_freq = 8 actor_obs = [ "base_ang_vel", diff --git a/learning/modules/actor.py 
b/learning/modules/actor.py index 17cf0566..25af26e6 100644 --- a/learning/modules/actor.py +++ b/learning/modules/actor.py @@ -5,6 +5,8 @@ from .utils import export_network from .utils import RunningMeanStd +from gym import LEGGED_GYM_ROOT_DIR + class Actor(nn.Module): def __init__( @@ -42,6 +44,9 @@ def __init__( # disable args validation for speedup Normal.set_default_validate_args = False + # Debug mode for plotting + self.debug = False + @property def action_mean(self): return self.distribution.mean @@ -62,7 +67,12 @@ def update_distribution(self, observations): def act(self, observations): self.update_distribution(observations) - return self.distribution.sample() + sample = self.distribution.sample() + if self.debug: + mean = self.distribution.mean + path = f"{LEGGED_GYM_ROOT_DIR}/plots/distribution_baseline.csv" + self.log_actions(mean[0][2], sample[0][2], path) + return sample def get_actions_log_prob(self, actions): return self.distribution.log_prob(actions).sum(dim=-1) @@ -82,3 +92,7 @@ def normalize(self, observation): def export(self, path): export_network(self, "policy", path, self.num_obs) + + def log_actions(self, mean, sample, path): + with open(path, "a") as f: + f.write(str(mean.item()) + ", " + str(sample.item()) + "\n") diff --git a/learning/modules/smooth_actor.py b/learning/modules/smooth_actor.py index 200f03f2..bc262798 100644 --- a/learning/modules/smooth_actor.py +++ b/learning/modules/smooth_actor.py @@ -54,7 +54,7 @@ def __init__( self.distribution = None # Debug mode for plotting - self.debug = False + self.debug = True def sample_weights(self, batch_size=1): # Sample weights for the noise exploration matrix @@ -103,9 +103,8 @@ def act(self, observations): mean = self.distribution.mean sample = mean + self.get_noise() if self.debug: - # write to csv (used for plotting) - with open(f"{LEGGED_GYM_ROOT_DIR}/plots/distribution_smooth.csv", "a") as f: - f.write(str(mean[0][2].item()) + ", " + str(sample[0][2].item()) + "\n") + path = f"{LEGGED_GYM_ROOT_DIR}/plots/distribution_smooth.csv" + self.log_actions(mean[0][2], sample[0][2], path) return sample def act_inference(self, observations): diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index e183b816..3b7ce33e 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -34,12 +34,16 @@ def learn(self): for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") logger.tic("collection") - # * Re-sample noise matrix for smooth exploration - if self.policy_cfg["smooth_exploration"]: - self.alg.actor_critic.actor.sample_weights(batch_size=self.env.num_envs) # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): + # * Re-sample noise matrix for smooth exploration + sample_freq = self.policy_cfg["exploration_sample_freq"] + if self.policy_cfg["smooth_exploration"] and i % sample_freq == 0: + self.alg.actor_critic.actor.sample_weights( + batch_size=self.env.num_envs + ) + actions = self.alg.act(actor_obs, critic_obs) self.set_actions( self.policy_cfg["actions"], diff --git a/plots/plot.py b/plots/plot.py index 5de8ba35..5018cb4d 100644 --- a/plots/plot.py +++ b/plots/plot.py @@ -6,12 +6,12 @@ data = pd.read_csv(name + ".csv") # Plot the data (last n steps) -n = 500 +n = 200 plt.plot(data.iloc[-n:, 0]) plt.plot(data.iloc[-n:, 1]) plt.xlabel("timestep") plt.ylabel("action (NN output)") -plt.title("gSDE") +plt.title("gSDE (sample_freq=8)") plt.legend(["mean", "sample"]) # plt.show() From 
c3017461e5ba7629e02dc75a5448fc4bf81e57c4 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 15 Mar 2024 14:17:22 -0400 Subject: [PATCH 08/19] update log_std_init=0.0 and refactor --- learning/modules/smooth_actor.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/learning/modules/smooth_actor.py b/learning/modules/smooth_actor.py index bc262798..91754aeb 100644 --- a/learning/modules/smooth_actor.py +++ b/learning/modules/smooth_actor.py @@ -23,12 +23,12 @@ def __init__( use_exp_ln: bool = False, learn_features: bool = True, epsilon: float = 1e-6, - log_std_init: float = -2.0, + log_std_init: float = 0.0, **kwargs, ): super().__init__(*args, **kwargs) self.full_std = full_std - self.use_expln = use_exp_ln + self.use_exp_ln = use_exp_ln self.learn_features = learn_features self.epsilon = epsilon self.log_std_init = log_std_init @@ -54,7 +54,7 @@ def __init__( self.distribution = None # Debug mode for plotting - self.debug = True + self.debug = False def sample_weights(self, batch_size=1): # Sample weights for the noise exploration matrix @@ -67,7 +67,7 @@ def sample_weights(self, batch_size=1): def get_std(self): # TODO[lm]: Check if this is ok, and can use action_std in ActorCritic normally - if self.use_expln: + if self.use_exp_ln: # From gSDE paper, it allows to keep variance # above zero and prevent it from growing too fast below_threshold = torch.exp(self.log_std) * (self.log_std <= 0) @@ -83,9 +83,7 @@ def get_std(self): return std assert self.latent_dim is not None # Reduce the number of parameters: - return ( - torch.ones(self.latent_dim, self.num_actions).to(self.log_std.device) * std - ) + return torch.ones(self.latent_dim, 1).to(self.log_std.device) * std def update_distribution(self, observations): if self._normalize_obs: From a372a5274224c7b0bd18d6232452bb00cbec8778 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Wed, 3 Apr 2024 14:01:52 -0400 Subject: [PATCH 09/19] log joint data for training and play --- .gitignore | 6 +- gym/smooth_exploration/plot_play.py | 38 +++++++++ gym/smooth_exploration/plot_train.py | 55 +++++++++++++ learning/runners/on_policy_runner.py | 13 ++- plots/plot.py | 19 ----- scripts/log_play.py | 117 +++++++++++++++++++++++++++ scripts/log_train.py | 92 +++++++++++++++++++++ 7 files changed, 318 insertions(+), 22 deletions(-) create mode 100644 gym/smooth_exploration/plot_play.py create mode 100644 gym/smooth_exploration/plot_train.py delete mode 100644 plots/plot.py create mode 100644 scripts/log_play.py create mode 100644 scripts/log_train.py diff --git a/.gitignore b/.gitignore index d2761927..20093cee 100644 --- a/.gitignore +++ b/.gitignore @@ -16,8 +16,6 @@ gym/wandb *.npz user/wandb_config.json *trajectories/ -plots/*.csv -plots/*.png # Byte-compiled / optimized / DLL files __pycache__/ @@ -83,3 +81,7 @@ ipython_config.py venv/ env.bak/ venv.bak/ + +# Smooth exploration +gym/smooth_exploration/data +gym/smooth_exploration/figures diff --git a/gym/smooth_exploration/plot_play.py b/gym/smooth_exploration/plot_play.py new file mode 100644 index 00000000..eedee17f --- /dev/null +++ b/gym/smooth_exploration/plot_play.py @@ -0,0 +1,38 @@ +import numpy as np +import matplotlib.pyplot as plt + +name = "mini_cheetah_ref" +data_dir = "./data/play/" + name +fig_dir = "./figures/play" + +# load data +dof_pos_obs = np.load(data_dir + "/dof_pos_obs.npy")[0] +dof_pos_target = np.load(data_dir + "/dof_pos_target.npy")[0] +dof_vel = np.load(data_dir + "/dof_vel.npy")[0] + +# plot data +n_steps = 200 +fig, axs = plt.subplots(3, 
figsize=(10, 10)) +plt.suptitle(name) + +for i in range(3): + axs[0].plot(dof_pos_obs[:n_steps, i]) +axs[0].set_title("dof_pos_obs") +axs[0].legend(["idx 0", "idx 1", "idx 2"]) +axs[0].set_xlabel("time steps") + +for i in range(3): + axs[1].plot(dof_pos_target[:n_steps, i]) +axs[1].set_title("dof_pos_target") +axs[1].legend(["idx 0", "idx 1", "idx 2"]) +axs[1].set_xlabel("time steps") + +for i in range(3): + axs[2].plot(dof_vel[:n_steps, i]) +axs[2].set_title("dof_vel") +axs[2].legend(["idx 0", "idx 1", "idx 2"]) +axs[2].set_xlabel("time steps") + +plt.tight_layout() +plt.savefig(fig_dir + "/" + name + ".png") +plt.show() diff --git a/gym/smooth_exploration/plot_train.py b/gym/smooth_exploration/plot_train.py new file mode 100644 index 00000000..8a45729f --- /dev/null +++ b/gym/smooth_exploration/plot_train.py @@ -0,0 +1,55 @@ +import numpy as np +import matplotlib.pyplot as plt + +SMOOTH = True +name = "ref_sample_8" +data_dir = "./data/train/" + name +fig_dir = "./figures/train" + +# load data +dof_pos_obs = np.load(data_dir + "/dof_pos_obs.npy")[0] +dof_pos_target = np.load(data_dir + "/dof_pos_target.npy")[0] +dof_vel = np.load(data_dir + "/dof_vel.npy")[0] +torques = np.load(data_dir + "/torques.npy")[0] + +iterations = range(0, dof_pos_obs.shape[0], 10) + +# plot data for each iteration +for it in iterations: + fig, axs = plt.subplots(4, figsize=(10, 10)) + plt.suptitle(name + " iteration " + str(it)) + + for i in range(3): + axs[0].plot(dof_pos_obs[it, :, i]) + axs[0].set_title("dof_pos_obs") + axs[0].legend(["idx 0", "idx 1", "idx 2"]) + axs[0].set_xlabel("time steps") + + for i in range(3): + axs[1].plot(dof_pos_target[it, :, i]) + axs[1].set_title("dof_pos_target") + axs[1].legend(["idx 0", "idx 1", "idx 2"]) + axs[1].set_xlabel("time steps") + + for i in range(3): + axs[2].plot(dof_vel[it, :, i]) + axs[2].set_title("dof_vel") + axs[2].legend(["idx 0", "idx 1", "idx 2"]) + axs[2].set_xlabel("time steps") + + for i in range(3): + axs[3].plot(torques[it, :, i]) + axs[3].set_title("torques") + axs[3].legend(["idx 0", "idx 1", "idx 2"]) + axs[3].set_xlabel("time steps") + + if SMOOTH: + # plot vertical lines where noise is resampled + for x in range(0, dof_pos_obs.shape[1], 8): + axs[0].axvline(x, color="r", linestyle="--") + axs[1].axvline(x, color="r", linestyle="--") + axs[2].axvline(x, color="r", linestyle="--") + axs[3].axvline(x, color="r", linestyle="--") + + plt.tight_layout() + plt.savefig(fig_dir + "/" + name + "_it_" + str(it) + ".png") diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 9c167cd7..86541f9b 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -18,7 +18,7 @@ def __init__(self, env, train_cfg, device="cpu"): self.device, ) - def learn(self): + def learn(self, states_to_log_dict=None): self.set_up_logger() rewards_dict = {} @@ -53,6 +53,17 @@ def learn(self): self.env.step() + # * Log states + # This continuously overwrites the rollout data, so when the state + # dict is written to a file it contains the last rollout for each + # iteration. 
+ it_idx = self.it - 1 + if states_to_log_dict is not None: + for state in states_to_log_dict: + states_to_log_dict[state][0, it_idx, i, :] = getattr( + self.env, state + )[0, :] + actor_obs = self.get_noisy_obs( self.policy_cfg["actor_obs"], self.policy_cfg["noise"] ) diff --git a/plots/plot.py b/plots/plot.py deleted file mode 100644 index 5018cb4d..00000000 --- a/plots/plot.py +++ /dev/null @@ -1,19 +0,0 @@ -import pandas as pd -import matplotlib.pyplot as plt - -# Read the CSV file -name = "distribution_smooth" -data = pd.read_csv(name + ".csv") - -# Plot the data (last n steps) -n = 200 -plt.plot(data.iloc[-n:, 0]) -plt.plot(data.iloc[-n:, 1]) -plt.xlabel("timestep") -plt.ylabel("action (NN output)") -plt.title("gSDE (sample_freq=8)") -plt.legend(["mean", "sample"]) -# plt.show() - -# Save the plot to a file -plt.savefig(name + ".png") diff --git a/scripts/log_play.py b/scripts/log_play.py new file mode 100644 index 00000000..adc1b598 --- /dev/null +++ b/scripts/log_play.py @@ -0,0 +1,117 @@ +from gym.envs import __init__ # noqa: F401 +from gym.utils import get_args, task_registry +from gym.utils import KeyboardInterface +from gym.utils import VisualizationRecorder +from gym import LEGGED_GYM_ROOT_DIR + +# torch needs to be imported after isaacgym imports in local source +import torch +import os +import numpy as np + +TEST_TOTAL_TIMESTEPS = 1000 + + +def create_logging_dict(runner, test_total_timesteps): + states_to_log = [ + "dof_pos_target", + "dof_pos_obs", + "dof_vel", + "torques", + "commands", + # 'base_lin_vel', + # 'base_ang_vel', + # 'oscillators', + # 'grf', + # 'base_height' + ] + + states_to_log_dict = {} + + for state in states_to_log: + array_dim = runner.get_obs_size( + [ + state, + ] + ) + states_to_log_dict[state] = torch.zeros( + (1, test_total_timesteps, array_dim), device=runner.env.device + ) + return states_to_log_dict + + +def setup(args): + env_cfg, train_cfg = task_registry.create_cfgs(args) + env_cfg.env.num_envs = min(env_cfg.env.num_envs, 16) + if hasattr(env_cfg, "push_robots"): + env_cfg.push_robots.toggle = False + if hasattr(env_cfg, "commands"): + env_cfg.commands.resampling_time = 9999 + env_cfg.env.episode_length_s = 9999 + env_cfg.env.num_projectiles = 20 + task_registry.make_gym_and_sim() + env = task_registry.make_env(args.task, env_cfg) + env.cfg.init_state.reset_mode = "reset_to_basic" + train_cfg.runner.resume = True + train_cfg.logging.enable_local_saving = False + runner = task_registry.make_alg_runner(env, train_cfg) + + # * switch to evaluation mode (dropout for example) + runner.switch_to_eval() + return env, runner, train_cfg + + +def play(env, runner, train_cfg): + protocol_name = train_cfg.runner.experiment_name + + # * set up logging + log_file_path = os.path.join( + LEGGED_GYM_ROOT_DIR, "gym", "logs", "data", "play", protocol_name + ".npz" + ) + if not os.path.exists(os.path.dirname(log_file_path)): + os.makedirs(os.path.dirname(log_file_path)) + + states_to_log_dict = create_logging_dict(runner, TEST_TOTAL_TIMESTEPS) + + # * set up recording + if env.cfg.viewer.record: + recorder = VisualizationRecorder( + env, train_cfg.runner.experiment_name, train_cfg.runner.load_run + ) + + # * set up interface: GamepadInterface(env) or KeyboardInterface(env) + COMMANDS_INTERFACE = hasattr(env, "commands") + if COMMANDS_INTERFACE: + # interface = GamepadInterface(env) + interface = KeyboardInterface(env) + + for t in range(TEST_TOTAL_TIMESTEPS): + if COMMANDS_INTERFACE: + interface.update(env) + if env.cfg.viewer.record: + recorder.update(t) + 
runner.set_actions( + runner.policy_cfg["actions"], + runner.get_inference_actions(), + runner.policy_cfg["disable_actions"], + ) + env.step() + env.check_exit() + + # * log + for state in states_to_log_dict: + states_to_log_dict[state][:, t, :] = getattr(env, state)[0, :] + + # * save data + # first convert tensors to cpu + log_dict_cpu = {k: v.cpu() for k, v in states_to_log_dict.items()} + np.savez_compressed(log_file_path, **log_dict_cpu) + print("saved to ", log_file_path) + return states_to_log_dict + + +if __name__ == "__main__": + args = get_args() + with torch.no_grad(): + env, runner, train_cfg = setup(args) + play(env, runner, train_cfg) diff --git a/scripts/log_train.py b/scripts/log_train.py new file mode 100644 index 00000000..918d0b8f --- /dev/null +++ b/scripts/log_train.py @@ -0,0 +1,92 @@ +from gym.envs import __init__ # noqa: F401 +from gym.utils import get_args, task_registry, randomize_episode_counters +from gym.utils.logging_and_saving import wandb_singleton +from gym.utils.logging_and_saving import local_code_save_helper +from gym import LEGGED_GYM_ROOT_DIR + +# torch needs to be imported after isaacgym imports in local source +import torch +import os +import numpy as np + +TRAIN_ITERATIONS = 100 +ROLLOUT_TIMESTEPS = 32 + + +def create_logging_dict(runner): + states_to_log = [ + "dof_pos_target", + "dof_pos_obs", + "dof_vel", + "torques", + "commands", + # 'base_lin_vel', + # 'base_ang_vel', + # 'oscillators', + # 'grf', + # 'base_height' + ] + + states_to_log_dict = {} + + for state in states_to_log: + array_dim = runner.get_obs_size( + [ + state, + ] + ) + states_to_log_dict[state] = torch.zeros( + (1, TRAIN_ITERATIONS, ROLLOUT_TIMESTEPS, array_dim), + device=runner.env.device, + ) + return states_to_log_dict + + +def setup(): + args = get_args() + wandb_helper = wandb_singleton.WandbSingleton() + + env_cfg, train_cfg = task_registry.create_cfgs(args) + task_registry.make_gym_and_sim() + wandb_helper.setup_wandb(env_cfg=env_cfg, train_cfg=train_cfg, args=args) + env = task_registry.make_env(name=args.task, env_cfg=env_cfg) + randomize_episode_counters(env) + + train_cfg.runner.max_iterations = TRAIN_ITERATIONS + policy_runner = task_registry.make_alg_runner(env, train_cfg) + + local_code_save_helper.save_local_files_to_logs(train_cfg.log_dir) + + return train_cfg, policy_runner + + +def train(train_cfg, policy_runner): + protocol_name = train_cfg.runner.experiment_name + + # * set up logging + log_file_path = os.path.join( + LEGGED_GYM_ROOT_DIR, "gym", "logs", "data", "train", protocol_name + ".npz" + ) + if not os.path.exists(os.path.dirname(log_file_path)): + os.makedirs(os.path.dirname(log_file_path)) + + states_to_log_dict = create_logging_dict(policy_runner) + + # * train + # wandb_helper = wandb_singleton.WandbSingleton() + + policy_runner.learn(states_to_log_dict) + + # wandb_helper.close_wandb() + + # * save data + # first convert tensors to cpu + log_dict_cpu = {k: v.cpu() for k, v in states_to_log_dict.items()} + np.savez_compressed(log_file_path, **log_dict_cpu) + print("saved to ", log_file_path) + return states_to_log_dict + + +if __name__ == "__main__": + train_cfg, policy_runner = setup() + train(train_cfg=train_cfg, policy_runner=policy_runner) From 599b2cab114cbcbae72ead09ed682b4fa1b92ff7 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Sat, 6 Apr 2024 18:05:29 -0400 Subject: [PATCH 10/19] update logging and plotting --- .gitignore | 4 +- gym/smooth_exploration/plot_play.py | 4 +- gym/smooth_exploration/plot_train.py | 86 
++++++++++++++++++++-------- learning/runners/on_policy_runner.py | 57 ++++++++++++++---- scripts/log_play.py | 6 +- scripts/log_train.py | 13 ++++- 6 files changed, 128 insertions(+), 42 deletions(-) diff --git a/.gitignore b/.gitignore index 20093cee..c78e6696 100644 --- a/.gitignore +++ b/.gitignore @@ -83,5 +83,5 @@ env.bak/ venv.bak/ # Smooth exploration -gym/smooth_exploration/data -gym/smooth_exploration/figures +gym/smooth_exploration/data* +gym/smooth_exploration/figures* diff --git a/gym/smooth_exploration/plot_play.py b/gym/smooth_exploration/plot_play.py index eedee17f..5768f91a 100644 --- a/gym/smooth_exploration/plot_play.py +++ b/gym/smooth_exploration/plot_play.py @@ -2,8 +2,8 @@ import matplotlib.pyplot as plt name = "mini_cheetah_ref" -data_dir = "./data/play/" + name -fig_dir = "./figures/play" +data_dir = "./data_play/" + name +fig_dir = "./figures_play/" + name # load data dof_pos_obs = np.load(data_dir + "/dof_pos_obs.npy")[0] diff --git a/gym/smooth_exploration/plot_train.py b/gym/smooth_exploration/plot_train.py index 8a45729f..58c50396 100644 --- a/gym/smooth_exploration/plot_train.py +++ b/gym/smooth_exploration/plot_train.py @@ -1,55 +1,93 @@ import numpy as np import matplotlib.pyplot as plt +import os +FOURIER = True SMOOTH = True -name = "ref_sample_8" -data_dir = "./data/train/" + name -fig_dir = "./figures/train" +SAMPLE_FREQ = 16 +STEPS = 1000 + +name = "ref_sample_16_len_1000" +data_dir = "./data_train/" + name +fig_dir = "./figures_train/" + name + +if not os.path.exists(fig_dir): + os.makedirs(fig_dir) # load data dof_pos_obs = np.load(data_dir + "/dof_pos_obs.npy")[0] dof_pos_target = np.load(data_dir + "/dof_pos_target.npy")[0] dof_vel = np.load(data_dir + "/dof_vel.npy")[0] torques = np.load(data_dir + "/torques.npy")[0] +terminated = np.load(data_dir + "/terminated.npy")[0] + + +# plot fourier trainsform +def plot_fourier(data, it): + fig_ft, axs_ft = plt.subplots(2, figsize=(10, 10)) + for i in range(3): + ft = np.fft.fft(data[:, i]) + ft_half = ft[: len(ft) // 2] + axs_ft[0].plot(np.abs(ft_half)) + axs_ft[1].plot(np.angle(ft_half)) + + axs_ft[0].set_title("FT Amplitude") + axs_ft[0].set_xlabel("Frequency") + axs_ft[0].set_ylabel("Amplitude") + axs_ft[0].legend(["idx 0", "idx 1", "idx 2"]) + axs_ft[1].set_title("FT Phase") + axs_ft[1].set_xlabel("Frequency") + axs_ft[1].set_ylabel("Phase") + axs_ft[1].legend(["idx 0", "idx 1", "idx 2"]) + + fig_ft.savefig(fig_dir + "/dof_pos_target_FT_it_" + str(it) + ".png") -iterations = range(0, dof_pos_obs.shape[0], 10) # plot data for each iteration -for it in iterations: +for it in range(0, dof_pos_obs.shape[0], 10): + # check if iteration terminated + terminate_idx = np.where(terminated[it, :, 0] == 1)[0] + if terminate_idx.size > 0: + n_steps = terminate_idx[0] + else: + n_steps = dof_pos_obs.shape[1] + print(n_steps) + + # generate figure fig, axs = plt.subplots(4, figsize=(10, 10)) plt.suptitle(name + " iteration " + str(it)) + axs[0].set_title("dof_pos_obs") for i in range(3): - axs[0].plot(dof_pos_obs[it, :, i]) - axs[0].set_title("dof_pos_obs") - axs[0].legend(["idx 0", "idx 1", "idx 2"]) - axs[0].set_xlabel("time steps") + axs[0].plot(dof_pos_obs[it, :n_steps, i]) + axs[1].set_title("dof_pos_target") for i in range(3): - axs[1].plot(dof_pos_target[it, :, i]) - axs[1].set_title("dof_pos_target") - axs[1].legend(["idx 0", "idx 1", "idx 2"]) - axs[1].set_xlabel("time steps") + axs[1].plot(dof_pos_target[it, :n_steps, i]) + if FOURIER and n_steps == STEPS: + plot_fourier(dof_pos_target[it, :n_steps, :], 
it) + axs[2].set_title("dof_vel") for i in range(3): - axs[2].plot(dof_vel[it, :, i]) - axs[2].set_title("dof_vel") - axs[2].legend(["idx 0", "idx 1", "idx 2"]) - axs[2].set_xlabel("time steps") + axs[2].plot(dof_vel[it, :n_steps, i]) + axs[3].set_title("torques") for i in range(3): - axs[3].plot(torques[it, :, i]) - axs[3].set_title("torques") - axs[3].legend(["idx 0", "idx 1", "idx 2"]) - axs[3].set_xlabel("time steps") + axs[3].plot(torques[it, :n_steps, i]) + + # format plots + for idx in range(4): + axs[idx].legend(["idx 0", "idx 1", "idx 2"]) + axs[idx].set_xlabel("time steps") + axs[idx].set_xlim([0, n_steps]) if SMOOTH: # plot vertical lines where noise is resampled - for x in range(0, dof_pos_obs.shape[1], 8): + for x in range(0, dof_pos_obs.shape[1], SAMPLE_FREQ): axs[0].axvline(x, color="r", linestyle="--") axs[1].axvline(x, color="r", linestyle="--") axs[2].axvline(x, color="r", linestyle="--") axs[3].axvline(x, color="r", linestyle="--") - plt.tight_layout() - plt.savefig(fig_dir + "/" + name + "_it_" + str(it) + ".png") + fig.tight_layout() + fig.savefig(fig_dir + "/" + name + "_it_" + str(it) + ".png") diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 86541f9b..0729b994 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -34,6 +34,13 @@ def learn(self, states_to_log_dict=None): for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") logger.tic("collection") + + # * Simulate environment and log states + if states_to_log_dict is not None: + it_idx = self.it - 1 + if it_idx % 10 == 0: + self.sim_and_log_states(states_to_log_dict, it_idx) + # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): @@ -53,17 +60,6 @@ def learn(self, states_to_log_dict=None): self.env.step() - # * Log states - # This continuously overwrites the rollout data, so when the state - # dict is written to a file it contains the last rollout for each - # iteration. - it_idx = self.it - 1 - if states_to_log_dict is not None: - for state in states_to_log_dict: - states_to_log_dict[state][0, it_idx, i, :] = getattr( - self.env, state - )[0, :] - actor_obs = self.get_noisy_obs( self.policy_cfg["actor_obs"], self.policy_cfg["noise"] ) @@ -171,3 +167,42 @@ def init_storage(self): critic_obs_shape=[num_critic_obs], action_shape=[num_actions], ) + + def sim_and_log_states(self, states_to_log_dict, it_idx): + # Simulate environment for as many steps as expected in the dict. + # Log states to the dict, as well as whether the env terminated. 
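        # Expected layout (as allocated in scripts/log_train.py): every entry has
        # shape (1, n_iterations, n_steps, dim), with dim = 1 for "terminated",
        # so shape[2] below is the number of rollout steps to simulate.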
+ steps = states_to_log_dict["terminated"].shape[2] + actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) + critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + + with torch.inference_mode(): + for i in range(steps): + sample_freq = self.policy_cfg["exploration_sample_freq"] + if self.policy_cfg["smooth_exploration"] and i % sample_freq == 0: + self.alg.actor_critic.actor.sample_weights( + batch_size=self.env.num_envs + ) + + actions = self.alg.act(actor_obs, critic_obs) + self.set_actions( + self.policy_cfg["actions"], + actions, + self.policy_cfg["disable_actions"], + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.policy_cfg["actor_obs"], self.policy_cfg["noise"] + ) + critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + + # Log states (just for the first env) + terminated = self.get_terminated()[0] + for state in states_to_log_dict: + if state == "terminated": + states_to_log_dict[state][0, it_idx, i, :] = terminated + else: + states_to_log_dict[state][0, it_idx, i, :] = getattr( + self.env, state + )[0, :] diff --git a/scripts/log_play.py b/scripts/log_play.py index adc1b598..f7fd68e8 100644 --- a/scripts/log_play.py +++ b/scripts/log_play.py @@ -66,7 +66,11 @@ def play(env, runner, train_cfg): # * set up logging log_file_path = os.path.join( - LEGGED_GYM_ROOT_DIR, "gym", "logs", "data", "play", protocol_name + ".npz" + LEGGED_GYM_ROOT_DIR, + "gym", + "smooth_exploration", + "data_play", + protocol_name + ".npz", ) if not os.path.exists(os.path.dirname(log_file_path)): os.makedirs(os.path.dirname(log_file_path)) diff --git a/scripts/log_train.py b/scripts/log_train.py index 918d0b8f..37fca0da 100644 --- a/scripts/log_train.py +++ b/scripts/log_train.py @@ -10,7 +10,7 @@ import numpy as np TRAIN_ITERATIONS = 100 -ROLLOUT_TIMESTEPS = 32 +ROLLOUT_TIMESTEPS = 1000 def create_logging_dict(runner): @@ -39,6 +39,11 @@ def create_logging_dict(runner): (1, TRAIN_ITERATIONS, ROLLOUT_TIMESTEPS, array_dim), device=runner.env.device, ) + + # log whether rollout terminated + states_to_log_dict["terminated"] = torch.zeros( + (1, TRAIN_ITERATIONS, ROLLOUT_TIMESTEPS, 1), device=runner.env.device + ) return states_to_log_dict @@ -65,7 +70,11 @@ def train(train_cfg, policy_runner): # * set up logging log_file_path = os.path.join( - LEGGED_GYM_ROOT_DIR, "gym", "logs", "data", "train", protocol_name + ".npz" + LEGGED_GYM_ROOT_DIR, + "gym", + "smooth_exploration", + "data_train", + protocol_name + ".npz", ) if not os.path.exists(os.path.dirname(log_file_path)): os.makedirs(os.path.dirname(log_file_path)) From 4994550b747b289011c91eddcfd40d4d2f59acff Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 19 Apr 2024 16:20:08 -0400 Subject: [PATCH 11/19] plot FT script --- gym/smooth_exploration/plot_ft.py | 56 +++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 gym/smooth_exploration/plot_ft.py diff --git a/gym/smooth_exploration/plot_ft.py b/gym/smooth_exploration/plot_ft.py new file mode 100644 index 00000000..5ad86b63 --- /dev/null +++ b/gym/smooth_exploration/plot_ft.py @@ -0,0 +1,56 @@ +import numpy as np +import matplotlib.pyplot as plt +import os + +SAMPLE_FREQ = 8 +STEPS = 500 + +smooth_name = "ref_sample_8_len_500" +baseline_name = "ref_baseline_len_500" + +smooth_data_dir = "./data_train/" + smooth_name +baseline_data_dir = "./data_train/" + baseline_name +fig_dir = "./figures_train/" + +if not os.path.exists(fig_dir): + os.makedirs(fig_dir) + +# load data +smooth_pos_target = np.load(smooth_data_dir + "/dof_pos_target.npy")[0] 
+baseline_pos_target = np.load(baseline_data_dir + "/dof_pos_target.npy")[0] +smooth_terminated = np.load(smooth_data_dir + "/terminated.npy")[0] +baseline_terminated = np.load(baseline_data_dir + "/terminated.npy")[0] + +# compute FFT averages +smooth_ffts = [[], [], []] +baseline_ffts = [[], [], []] +for it in range(0, smooth_pos_target.shape[0], 10): + # only use data that didn't terminate + if not np.any(smooth_terminated[it, :, 0]): + for idx in range(3): + fft = np.fft.fft(smooth_pos_target[it, :, idx]) + smooth_ffts[idx].append(fft[: len(fft) // 2]) + + if not np.any(baseline_terminated[it, :, 0]): + for idx in range(3): + fft = np.fft.fft(baseline_pos_target[it, :, idx]) + baseline_ffts[idx].append(fft[: len(fft) // 2]) + +print(f"Total smooth FFTS: {len(smooth_ffts[0])}") +print(f"Total baseline FFTS: {len(baseline_ffts[0])}") + +smooth_fft_means = [np.array(smooth_ffts[idx]).mean(axis=0) for idx in range(3)] +baseline_fft_means = [np.array(baseline_ffts[idx]).mean(axis=0) for idx in range(3)] + +# plot FFTs +fig, axs = plt.subplots(3, 1, figsize=(10, 10)) +for idx in range(3): + axs[idx].plot(np.abs(smooth_fft_means[idx])) + axs[idx].plot(np.abs(baseline_fft_means[idx])) + axs[idx].set_title(f"FT Amplitude idx {idx}") + axs[idx].set_xlabel("Frequency") + axs[idx].set_ylabel("Amplitude") + axs[idx].legend(["smooth", "baseline"]) + +fig.tight_layout() +fig.savefig(fig_dir + "/" + smooth_name + ".png") From 0205d99b33fc338c6ed3644ef55638fcc6bb1b5e Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 19 Apr 2024 18:47:36 -0400 Subject: [PATCH 12/19] added sweep config --- .../sweep_config_smooth_exploration.json | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 scripts/sweep_configs/sweep_config_smooth_exploration.json diff --git a/scripts/sweep_configs/sweep_config_smooth_exploration.json b/scripts/sweep_configs/sweep_config_smooth_exploration.json new file mode 100644 index 00000000..952282fe --- /dev/null +++ b/scripts/sweep_configs/sweep_config_smooth_exploration.json @@ -0,0 +1,19 @@ +{ + "method": "grid", + "metric": { + "name": "rewards/total_rewards", + "goal": "maximize" + }, + "run_cap": 10, + "parameters": { + "train_cfg.policy.exploration_sample_freq":{ + "values": [4, 8, 16, 32] + }, + "train_cfg.runner.num_steps_per_env":{ + "values": [32, 64] + }, + "train_cfg.runner.max_iterations":{ + "values": [800] + } + } +} \ No newline at end of file From 64b9b1446f4d63ca150db859d889b6d643a8b296 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Wed, 24 Apr 2024 13:56:26 -0400 Subject: [PATCH 13/19] update on policy and old policy runners, get nans for log_probs --- learning/algorithms/ppo2.py | 9 +++++++-- learning/modules/smooth_actor.py | 25 ++++++++++++++----------- learning/runners/BaseRunner.py | 7 +++++-- learning/runners/old_policy_runner.py | 18 ++++++++++++++++-- learning/runners/on_policy_runner.py | 12 +++++++----- 5 files changed, 49 insertions(+), 22 deletions(-) diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 5866052c..b6b4fec0 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -6,6 +6,7 @@ create_uniform_generator, compute_generalized_advantages, ) +from learning.modules import SmoothActor class PPO2: @@ -99,7 +100,7 @@ def update_actor(self, data): self.mean_surrogate_loss = 0 counter = 0 - self.actor.act(data["actor_obs"]) + self.actor.update_distribution(data["actor_obs"]) data["old_sigma_batch"] = self.actor.action_std.detach() data["old_mu_batch"] = 
self.actor.action_mean.detach() data["old_actions_log_prob_batch"] = self.actor.get_actions_log_prob( @@ -111,8 +112,12 @@ def update_actor(self, data): batch_size = total_data // self.num_mini_batches generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) for batch in generator: + # * Re-sample noise for smooth actor + if isinstance(self.actor, SmoothActor): + self.actor.sample_weights(batch_size) + # ! refactor how this is done - self.actor.act(batch["actor_obs"]) + self.actor.update_distribution(batch["actor_obs"]) actions_log_prob_batch = self.actor.get_actions_log_prob(batch["actions"]) mu_batch = self.actor.action_mean sigma_batch = self.actor.action_std diff --git a/learning/modules/smooth_actor.py b/learning/modules/smooth_actor.py index 91754aeb..dae53c9e 100644 --- a/learning/modules/smooth_actor.py +++ b/learning/modules/smooth_actor.py @@ -13,14 +13,13 @@ class SmoothActor(Actor): weights_dist: Normal _latent_sde: torch.Tensor - exploration_mat: torch.Tensor exploration_matrices: torch.Tensor def __init__( self, *args, full_std: bool = True, - use_exp_ln: bool = False, + use_exp_ln: bool = True, learn_features: bool = True, epsilon: float = 1e-6, log_std_init: float = 0.0, @@ -60,8 +59,6 @@ def sample_weights(self, batch_size=1): # Sample weights for the noise exploration matrix std = self.get_std() self.weights_dist = Normal(torch.zeros_like(std), std) - # Reparametrization trick to pass gradients - self.exploration_mat = self.weights_dist.rsample() # Pre-compute matrices in case of parallel exploration self.exploration_matrices = self.weights_dist.rsample((batch_size,)) @@ -87,12 +84,20 @@ def get_std(self): def update_distribution(self, observations): if self._normalize_obs: - observations = self.normalize(observations) + with torch.no_grad(): + observations = self.obs_rms(observations) # Get latent features and compute distribution self._latent_sde = self.latent_net(observations) if not self.learn_features: self._latent_sde = self._latent_sde.detach() - variance = torch.mm(self._latent_sde**2, self.get_std() ** 2) + if self._latent_sde.dim() == 2: + variance = torch.mm(self._latent_sde**2, self.get_std() ** 2) + elif self._latent_sde.dim() == 3: + variance = torch.einsum( + "abc,cd->abd", self._latent_sde**2, self.get_std() ** 2 + ) + else: + raise ValueError("Invalid latent_sde dimension") mean_actions = self.mean_actions_net(self._latent_sde) self.distribution = Normal(mean_actions, torch.sqrt(variance + self.epsilon)) @@ -107,7 +112,8 @@ def act(self, observations): def act_inference(self, observations): if self._normalize_obs: - observations = self.normalize(observations) + with torch.no_grad(): + observations = self.obs_rms(observations) latent_sde = self.latent_net(observations) mean_actions = self.mean_actions_net(latent_sde) return mean_actions @@ -116,12 +122,9 @@ def get_noise(self): latent_sde = self._latent_sde if not self.learn_features: latent_sde = latent_sde.detach() - # Default case: only one exploration matrix - if len(latent_sde) == 1 or len(latent_sde) != len(self.exploration_matrices): - return torch.mm(latent_sde, self.exploration_mat.to(latent_sde.device)) # Use batch matrix multiplication for efficient computation # (batch_size, n_features) -> (batch_size, 1, n_features) latent_sde = latent_sde.unsqueeze(dim=1) # (batch_size, 1, n_actions) - noise = torch.bmm(latent_sde, self.exploration_matrices) + noise = torch.bmm(latent_sde, self.exploration_matrices.to(latent_sde.device)) return noise.squeeze(dim=1) diff --git 
a/learning/runners/BaseRunner.py b/learning/runners/BaseRunner.py index 9e5ab2e3..39ce336d 100644 --- a/learning/runners/BaseRunner.py +++ b/learning/runners/BaseRunner.py @@ -1,6 +1,6 @@ import torch from learning.algorithms import * # noqa: F403 -from learning.modules import Actor, Critic +from learning.modules import Actor, Critic, SmoothActor from learning.utils import remove_zero_weighted_rewards @@ -22,7 +22,10 @@ def _set_up_alg(self): num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) num_actions = self.get_action_size(self.actor_cfg["actions"]) num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) - actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + if self.actor_cfg["smooth_exploration"]: + actor = SmoothActor(num_actor_obs, num_actions, **self.actor_cfg) + else: + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) critic = Critic(num_critic_obs, **self.critic_cfg) alg_class = eval(self.cfg["algorithm_class_name"]) self.alg = alg_class(actor, critic, device=self.device, **self.alg_cfg) diff --git a/learning/runners/old_policy_runner.py b/learning/runners/old_policy_runner.py index 42f019a3..8b58dbe4 100644 --- a/learning/runners/old_policy_runner.py +++ b/learning/runners/old_policy_runner.py @@ -4,7 +4,7 @@ from learning.utils import Logger from .BaseRunner import BaseRunner from learning.algorithms import PPO # noqa: F401 -from learning.modules import ActorCritic, Actor, Critic +from learning.modules import ActorCritic, Actor, Critic, SmoothActor logger = Logger() @@ -24,7 +24,10 @@ def _set_up_alg(self): num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) num_actions = self.get_action_size(self.actor_cfg["actions"]) num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) - actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + if self.actor_cfg["smooth_exploration"]: + actor = SmoothActor(num_actor_obs, num_actions, **self.actor_cfg) + else: + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) critic = Critic(num_critic_obs, **self.critic_cfg) actor_critic = ActorCritic(actor, critic) alg_class = eval(self.cfg["algorithm_class_name"]) @@ -42,6 +45,10 @@ def learn(self): self.save() + # * Initialize smooth exploration matrices + if self.actor_cfg["smooth_exploration"]: + self.alg.actor_critic.actor.sample_weights(batch_size=self.env.num_envs) + logger.tic("runtime") for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") @@ -49,6 +56,13 @@ def learn(self): # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): + # * Re-sample noise matrix for smooth exploration + sample_freq = self.actor_cfg["exploration_sample_freq"] + if self.actor_cfg["smooth_exploration"] and i % sample_freq == 0: + self.alg.actor_critic.actor.sample_weights( + batch_size=self.env.num_envs + ) + actions = self.alg.act(actor_obs, critic_obs) self.set_actions( self.actor_cfg["actions"], diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 3d38b48a..79f086d7 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -32,6 +32,10 @@ def learn(self, states_to_log_dict=None): tot_iter = self.it + self.num_learning_iterations self.save() + # * Initialize smooth exploration matrices + if self.actor_cfg["smooth_exploration"]: + self.alg.actor.sample_weights(batch_size=self.env.num_envs) + # * start up storage transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) transition.update( @@ -65,11 +69,9 @@ def learn(self, 
states_to_log_dict=None): with torch.inference_mode(): for i in range(self.num_steps_per_env): # * Re-sample noise matrix for smooth exploration - sample_freq = self.policy_cfg["exploration_sample_freq"] - if self.policy_cfg["smooth_exploration"] and i % sample_freq == 0: - self.alg.actor_critic.actor.sample_weights( - batch_size=self.env.num_envs - ) + sample_freq = self.actor_cfg["exploration_sample_freq"] + if self.actor_cfg["smooth_exploration"] and i % sample_freq == 0: + self.alg.actor.sample_weights(batch_size=self.env.num_envs) actions = self.alg.act(actor_obs, critic_obs) self.set_actions( From 91656e9b81d4a1722100d100337c63dd31d0cd56 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Thu, 25 Apr 2024 20:08:52 -0400 Subject: [PATCH 14/19] run 200 iterations before starting training, to burn in normalization --- learning/runners/on_policy_runner.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 79f086d7..fc754e17 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -54,6 +54,20 @@ def learn(self, states_to_log_dict=None): device=self.device, ) + # burn in observation normalization. + with torch.inference_mode(): + for _ in range(200): + actions = self.alg.act(actor_obs, critic_obs) + self.set_actions( + self.actor_cfg["actions"], + actions, + self.actor_cfg["disable_actions"], + ) + self.env.step() + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + logger.tic("runtime") for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") From f74e7feaa103c7ace373c55b9e95db0d0a21f38f Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 3 May 2024 14:37:30 -0400 Subject: [PATCH 15/19] update dummy input in export_network --- learning/modules/utils/neural_net.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/learning/modules/utils/neural_net.py b/learning/modules/utils/neural_net.py index 057e724f..c28762be 100644 --- a/learning/modules/utils/neural_net.py +++ b/learning/modules/utils/neural_net.py @@ -80,7 +80,7 @@ def export_network(network, network_name, path, num_inputs): model = copy.deepcopy(network).to("cpu") # To trace model, must be evaluated once with arbitrary input model.eval() - dummy_input = torch.rand((1, num_inputs)) + dummy_input = torch.rand((num_inputs)) model_traced = torch.jit.trace(model, dummy_input) torch.jit.save(model_traced, path_TS) torch.onnx.export(model_traced, dummy_input, path_onnx) From 18045e9289dec5f6d8c3cc74cacaf6b873489fe3 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 3 May 2024 18:50:12 -0400 Subject: [PATCH 16/19] good choice of params: sample 16, rollout 32, LR x1.1, des_kl 0.02 --- gym/envs/mini_cheetah/mini_cheetah_ref_config.py | 8 ++++---- learning/algorithms/ppo2.py | 4 ++-- learning/runners/on_policy_runner.py | 4 +++- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 2e108f16..a6f12e18 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -74,7 +74,7 @@ class actor: # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" smooth_exploration = True - exploration_sample_freq = 8 + exploration_sample_freq = 16 normalize_obs = True obs = [ @@ -151,17 +151,17 @@ class algorithm(MiniCheetahRunnerCfg.algorithm): num_mini_batches = 4 
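        # worked example: storage_size 2**17 with mini_batch_size 2**15 gives
        # 2**17 / 2**15 = 4 minibatches, matching num_mini_batches = 4 above
        # (assumption: the new storage-based sizes are meant to keep the same
        # minibatch count)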
storage_size = 2**17 # new mini_batch_size = 2**15 # new - learning_rate = 5.0e-5 + learning_rate = 0.002 schedule = "adaptive" # can be adaptive, fixed discount_horizon = 1.0 # [s] lam = 0.95 GAE_bootstrap_horizon = 2.0 # [s] - desired_kl = 0.01 + desired_kl = 0.02 max_grad_norm = 1.0 class runner(MiniCheetahRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah_ref" - max_iterations = 500 # number of policy updates + max_iterations = 800 # number of policy updates algorithm_class_name = "PPO2" num_steps_per_env = 32 diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index b6b4fec0..58945bb3 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -139,9 +139,9 @@ def update_actor(self, data): kl_mean = torch.mean(kl) if kl_mean > self.desired_kl * 2.0: - self.learning_rate = max(1e-5, self.learning_rate / 1.5) + self.learning_rate = max(2e-4, self.learning_rate / 1.1) elif kl_mean < self.desired_kl / 2.0 and kl_mean > 0.0: - self.learning_rate = min(1e-2, self.learning_rate * 1.5) + self.learning_rate = min(1e-2, self.learning_rate * 1.1) for param_group in self.optimizer.param_groups: # ! check this diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index fc754e17..1e885e81 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -167,7 +167,9 @@ def set_up_logger(self): ) logger.register_rewards(["total_rewards"]) logger.register_category( - "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] + "algorithm", + self.alg, + ["mean_value_loss", "mean_surrogate_loss", "learning_rate"], ) logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) From adfb0788fb61883f20c15dbacc93ec9010b471ac Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Sun, 5 May 2024 22:48:28 -0400 Subject: [PATCH 17/19] export latent network and update configs --- gym/envs/mini_cheetah/mini_cheetah_config.py | 24 +++++++++++-------- .../mini_cheetah/mini_cheetah_ref_config.py | 7 ++++-- learning/algorithms/ppo2.py | 13 ++++++++-- learning/modules/utils/neural_net.py | 10 +++++++- .../sweep_config_smooth_exploration.json | 3 --- 5 files changed, 39 insertions(+), 18 deletions(-) diff --git a/gym/envs/mini_cheetah/mini_cheetah_config.py b/gym/envs/mini_cheetah/mini_cheetah_config.py index 104e5f64..6631704c 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_config.py @@ -130,16 +130,17 @@ class actor: hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - smooth_exploration = False + smooth_exploration = True + exploration_sample_freq = 16 obs = [ - "base_lin_vel", + # "base_lin_vel", "base_ang_vel", "projected_gravity", "commands", "dof_pos_obs", "dof_vel", - "dof_pos_target", + # "dof_pos_target", ] actions = ["dof_pos_target"] add_noise = True @@ -195,20 +196,23 @@ class algorithm(LeggedRobotRunnerCfg.algorithm): value_loss_coef = 1.0 use_clipped_value_loss = True clip_param = 0.2 - entropy_coef = 0.02 - num_learning_epochs = 4 + entropy_coef = 0.01 + num_learning_epochs = 6 # * mini batch size = num_envs*nsteps / nminibatches - num_mini_batches = 8 - learning_rate = 1.0e-5 - schedule = "adaptive" # can be adaptive or fixed + num_mini_batches = 4 discount_horizon = 1.0 # [s] # GAE_bootstrap_horizon = 2.0 # [s] - desired_kl = 0.01 + desired_kl = 0.02 max_grad_norm = 1.0 + # * Learning rate + learning_rate = 0.002 + schedule = "adaptive" # can be adaptive or fixed 
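        # Adaptive rule (as implemented in PPO2.update_actor in this patch): if the
        # mean KL exceeds 2 * desired_kl, the learning rate is divided by lr_ratio
        # (floored at lr_range[0]); if it falls below desired_kl / 2, it is
        # multiplied by lr_ratio (capped at lr_range[1]).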
+ lr_range = [2e-4, 1e-2] + lr_ratio = 1.3 class runner(LeggedRobotRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah" - max_iterations = 500 + max_iterations = 800 algorithm_class_name = "PPO2" num_steps_per_env = 32 diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index a6f12e18..4f50f7d2 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -151,13 +151,16 @@ class algorithm(MiniCheetahRunnerCfg.algorithm): num_mini_batches = 4 storage_size = 2**17 # new mini_batch_size = 2**15 # new - learning_rate = 0.002 - schedule = "adaptive" # can be adaptive, fixed discount_horizon = 1.0 # [s] lam = 0.95 GAE_bootstrap_horizon = 2.0 # [s] desired_kl = 0.02 max_grad_norm = 1.0 + # * Learning rate + learning_rate = 0.002 + schedule = "adaptive" # can be adaptive or fixed + lr_range = [3e-4, 1e-2] + lr_ratio = 1.3 class runner(MiniCheetahRunnerCfg.runner): run_name = "" diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 58945bb3..528ef882 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -27,6 +27,8 @@ def __init__( desired_kl=0.01, loss_fn="MSE", device="cpu", + lr_range=[1e-4, 1e-2], + lr_ratio=1.3, **kwargs, ): self.device = device @@ -34,6 +36,8 @@ def __init__( self.desired_kl = desired_kl self.schedule = schedule self.learning_rate = learning_rate + self.lr_range = lr_range + self.lr_ratio = lr_ratio # * PPO components self.actor = actor.to(self.device) @@ -137,11 +141,16 @@ def update_actor(self, data): axis=-1, ) kl_mean = torch.mean(kl) + lr_min, lr_max = self.lr_range if kl_mean > self.desired_kl * 2.0: - self.learning_rate = max(2e-4, self.learning_rate / 1.1) + self.learning_rate = max( + lr_min, self.learning_rate / self.lr_ratio + ) elif kl_mean < self.desired_kl / 2.0 and kl_mean > 0.0: - self.learning_rate = min(1e-2, self.learning_rate * 1.1) + self.learning_rate = min( + lr_max, self.learning_rate * self.lr_ratio + ) for param_group in self.optimizer.param_groups: # ! check this diff --git a/learning/modules/utils/neural_net.py b/learning/modules/utils/neural_net.py index c28762be..24ed3df7 100644 --- a/learning/modules/utils/neural_net.py +++ b/learning/modules/utils/neural_net.py @@ -62,7 +62,7 @@ def add_layer(layer_list, num_inputs, num_outputs, activation=None, dropout=0): layer_list.append(activation) -def export_network(network, network_name, path, num_inputs): +def export_network(network, network_name, path, num_inputs, latent=True): """ Thsi function traces and exports the given network module in .pt and .onnx file formats. 
These can be used for evaluation on other systems @@ -84,3 +84,11 @@ def export_network(network, network_name, path, num_inputs): model_traced = torch.jit.trace(model, dummy_input) torch.jit.save(model_traced, path_TS) torch.onnx.export(model_traced, dummy_input, path_onnx) + + if latent: + path_latent = os.path.join(path, network_name + "_latent.onnx") + model_latent = model.latent_net + model_latent.eval() + dummy_input = torch.rand((num_inputs)) + model_traced = torch.jit.trace(model_latent, dummy_input) + torch.onnx.export(model_traced, dummy_input, path_latent) diff --git a/scripts/sweep_configs/sweep_config_smooth_exploration.json b/scripts/sweep_configs/sweep_config_smooth_exploration.json index 952282fe..49fc4125 100644 --- a/scripts/sweep_configs/sweep_config_smooth_exploration.json +++ b/scripts/sweep_configs/sweep_config_smooth_exploration.json @@ -11,9 +11,6 @@ }, "train_cfg.runner.num_steps_per_env":{ "values": [32, 64] - }, - "train_cfg.runner.max_iterations":{ - "values": [800] } } } \ No newline at end of file From 8ff82ae26b1e73daf0f72dc76af7756aab12de83 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Mon, 6 May 2024 11:13:00 -0400 Subject: [PATCH 18/19] export latent net with norm and log get_std --- learning/modules/smooth_actor.py | 9 ++++----- learning/modules/utils/neural_net.py | 2 +- learning/runners/on_policy_runner.py | 4 +++- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/learning/modules/smooth_actor.py b/learning/modules/smooth_actor.py index dae53c9e..5cb1da89 100644 --- a/learning/modules/smooth_actor.py +++ b/learning/modules/smooth_actor.py @@ -57,11 +57,12 @@ def __init__( def sample_weights(self, batch_size=1): # Sample weights for the noise exploration matrix - std = self.get_std() + std = self.get_std self.weights_dist = Normal(torch.zeros_like(std), std) # Pre-compute matrices in case of parallel exploration self.exploration_matrices = self.weights_dist.rsample((batch_size,)) + @property def get_std(self): # TODO[lm]: Check if this is ok, and can use action_std in ActorCritic normally if self.use_exp_ln: @@ -91,11 +92,9 @@ def update_distribution(self, observations): if not self.learn_features: self._latent_sde = self._latent_sde.detach() if self._latent_sde.dim() == 2: - variance = torch.mm(self._latent_sde**2, self.get_std() ** 2) + variance = torch.mm(self._latent_sde**2, self.get_std**2) elif self._latent_sde.dim() == 3: - variance = torch.einsum( - "abc,cd->abd", self._latent_sde**2, self.get_std() ** 2 - ) + variance = torch.einsum("abc,cd->abd", self._latent_sde**2, self.get_std**2) else: raise ValueError("Invalid latent_sde dimension") mean_actions = self.mean_actions_net(self._latent_sde) diff --git a/learning/modules/utils/neural_net.py b/learning/modules/utils/neural_net.py index 24ed3df7..034c13d4 100644 --- a/learning/modules/utils/neural_net.py +++ b/learning/modules/utils/neural_net.py @@ -87,7 +87,7 @@ def export_network(network, network_name, path, num_inputs, latent=True): if latent: path_latent = os.path.join(path, network_name + "_latent.onnx") - model_latent = model.latent_net + model_latent = torch.nn.Sequential(model.obs_rms, model.latent_net) model_latent.eval() dummy_input = torch.rand((num_inputs)) model_traced = torch.jit.trace(model_latent, dummy_input) diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 1e885e81..dded2739 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -171,7 +171,9 @@ def set_up_logger(self): 
self.alg, ["mean_value_loss", "mean_surrogate_loss", "learning_rate"], ) - logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) + logger.register_category( + "actor", self.alg.actor, ["action_std", "entropy", "get_std"] + ) logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) From 007ea933c6d72077c8582e44162a121490cc529d Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Mon, 6 May 2024 14:22:30 -0400 Subject: [PATCH 19/19] export actor std to txt file --- learning/modules/utils/neural_net.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/learning/modules/utils/neural_net.py b/learning/modules/utils/neural_net.py index 034c13d4..6e6f41d6 100644 --- a/learning/modules/utils/neural_net.py +++ b/learning/modules/utils/neural_net.py @@ -1,6 +1,7 @@ import torch import os import copy +import numpy as np @@ -86,9 +87,16 @@ def export_network(network, network_name, path, num_inputs, latent=True): torch.onnx.export(model_traced, dummy_input, path_onnx) if latent: + # Export latent model path_latent = os.path.join(path, network_name + "_latent.onnx") model_latent = torch.nn.Sequential(model.obs_rms, model.latent_net) model_latent.eval() dummy_input = torch.rand((num_inputs)) model_traced = torch.jit.trace(model_latent, dummy_input) torch.onnx.export(model_traced, dummy_input, path_latent) + + # Save actor std of shape (num_actions, latent_dim) + # It is important that the shape is the same as the exploration matrix + path_std = os.path.join(path, network_name + "_std.txt") + std_transposed = model.get_std.detach().numpy().T + np.savetxt(path_std, std_transposed)
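A minimal, hypothetical deployment-side sketch (not part of the patches above) of how the exported artifacts could be combined to reproduce the gSDE-style exploration noise, mirroring SmoothActor.sample_weights and get_noise: noise = latent @ W with W ~ N(0, std). The file names, the use of onnxruntime, and the source of mean_action are assumptions.

import numpy as np
import onnxruntime as ort

# exported by export_network(..., latent=True)
latent_sess = ort.InferenceSession("policy_latent.onnx")
obs_name = latent_sess.get_inputs()[0].name

# std.txt is saved transposed as (num_actions, latent_dim); transpose it back to
# the (latent_dim, num_actions) convention of the exploration matrices.
std = np.loadtxt("policy_std.txt").T
rng = np.random.default_rng()


def sample_exploration_matrix():
    # W ~ N(0, std); resampled every exploration_sample_freq control steps
    return rng.normal(0.0, std)


def smooth_action(obs, mean_action, W):
    # latent features of the (normalized) observation, then state-dependent noise
    latent = latent_sess.run(None, {obs_name: obs.astype(np.float32)})[0]
    return mean_action + latent @ W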