diff --git a/.gitignore b/.gitignore index 0b68375a..c78e6696 100644 --- a/.gitignore +++ b/.gitignore @@ -81,3 +81,7 @@ ipython_config.py venv/ env.bak/ venv.bak/ + +# Smooth exploration +gym/smooth_exploration/data* +gym/smooth_exploration/figures* diff --git a/gym/envs/base/legged_robot_config.py b/gym/envs/base/legged_robot_config.py index 7e6cabc6..1f6462ce 100644 --- a/gym/envs/base/legged_robot_config.py +++ b/gym/envs/base/legged_robot_config.py @@ -239,6 +239,7 @@ class actor: # can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" normalize_obs = True + smooth_exploration = False obs = [ "observation_a", diff --git a/gym/envs/mini_cheetah/mini_cheetah_config.py b/gym/envs/mini_cheetah/mini_cheetah_config.py index 28c70123..6631704c 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_config.py @@ -130,15 +130,17 @@ class actor: hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + smooth_exploration = True + exploration_sample_freq = 16 obs = [ - "base_lin_vel", + # "base_lin_vel", "base_ang_vel", "projected_gravity", "commands", "dof_pos_obs", "dof_vel", - "dof_pos_target", + # "dof_pos_target", ] actions = ["dof_pos_target"] add_noise = True @@ -194,20 +196,23 @@ class algorithm(LeggedRobotRunnerCfg.algorithm): value_loss_coef = 1.0 use_clipped_value_loss = True clip_param = 0.2 - entropy_coef = 0.02 - num_learning_epochs = 4 + entropy_coef = 0.01 + num_learning_epochs = 6 # * mini batch size = num_envs*nsteps / nminibatches - num_mini_batches = 8 - learning_rate = 1.0e-5 - schedule = "adaptive" # can be adaptive or fixed + num_mini_batches = 4 discount_horizon = 1.0 # [s] # GAE_bootstrap_horizon = 2.0 # [s] - desired_kl = 0.01 + desired_kl = 0.02 max_grad_norm = 1.0 + # * Learning rate + learning_rate = 0.002 + schedule = "adaptive" # can be adaptive or fixed + lr_range = [2e-4, 1e-2] + lr_ratio = 1.3 class runner(LeggedRobotRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah" - max_iterations = 500 + max_iterations = 800 algorithm_class_name = "PPO2" num_steps_per_env = 32 diff --git a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py index a4acefa6..d5f767e8 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py @@ -166,6 +166,7 @@ class policy: critic_hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + smooth_exploration = False obs = [ "base_ang_vel", diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 3c92882d..4f50f7d2 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -73,6 +73,9 @@ class actor: hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + smooth_exploration = True + exploration_sample_freq = 16 + normalize_obs = True obs = [ "base_ang_vel", @@ -148,17 +151,20 @@ class algorithm(MiniCheetahRunnerCfg.algorithm): num_mini_batches = 4 storage_size = 2**17 # new mini_batch_size = 2**15 # new - learning_rate = 5.0e-5 - schedule = "adaptive" # can be adaptive, fixed discount_horizon = 1.0 # [s] lam = 0.95 GAE_bootstrap_horizon = 2.0 # [s] - desired_kl = 0.01 + desired_kl = 0.02 max_grad_norm = 1.0 + # * Learning rate + learning_rate = 0.002 + schedule = "adaptive" # can be adaptive or fixed 
+ lr_range = [3e-4, 1e-2] + lr_ratio = 1.3 class runner(MiniCheetahRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah_ref" - max_iterations = 500 # number of policy updates + max_iterations = 800 # number of policy updates algorithm_class_name = "PPO2" num_steps_per_env = 32 diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index 86a8d606..6f8eaa1d 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -187,6 +187,7 @@ class actor: critic_hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + smooth_exploration = False obs = [ "base_height", diff --git a/gym/smooth_exploration/plot_ft.py b/gym/smooth_exploration/plot_ft.py new file mode 100644 index 00000000..5ad86b63 --- /dev/null +++ b/gym/smooth_exploration/plot_ft.py @@ -0,0 +1,56 @@ +import numpy as np +import matplotlib.pyplot as plt +import os + +SAMPLE_FREQ = 8 +STEPS = 500 + +smooth_name = "ref_sample_8_len_500" +baseline_name = "ref_baseline_len_500" + +smooth_data_dir = "./data_train/" + smooth_name +baseline_data_dir = "./data_train/" + baseline_name +fig_dir = "./figures_train/" + +if not os.path.exists(fig_dir): + os.makedirs(fig_dir) + +# load data +smooth_pos_target = np.load(smooth_data_dir + "/dof_pos_target.npy")[0] +baseline_pos_target = np.load(baseline_data_dir + "/dof_pos_target.npy")[0] +smooth_terminated = np.load(smooth_data_dir + "/terminated.npy")[0] +baseline_terminated = np.load(baseline_data_dir + "/terminated.npy")[0] + +# compute FFT averages +smooth_ffts = [[], [], []] +baseline_ffts = [[], [], []] +for it in range(0, smooth_pos_target.shape[0], 10): + # only use data that didn't terminate + if not np.any(smooth_terminated[it, :, 0]): + for idx in range(3): + fft = np.fft.fft(smooth_pos_target[it, :, idx]) + smooth_ffts[idx].append(fft[: len(fft) // 2]) + + if not np.any(baseline_terminated[it, :, 0]): + for idx in range(3): + fft = np.fft.fft(baseline_pos_target[it, :, idx]) + baseline_ffts[idx].append(fft[: len(fft) // 2]) + +print(f"Total smooth FFTS: {len(smooth_ffts[0])}") +print(f"Total baseline FFTS: {len(baseline_ffts[0])}") + +smooth_fft_means = [np.array(smooth_ffts[idx]).mean(axis=0) for idx in range(3)] +baseline_fft_means = [np.array(baseline_ffts[idx]).mean(axis=0) for idx in range(3)] + +# plot FFTs +fig, axs = plt.subplots(3, 1, figsize=(10, 10)) +for idx in range(3): + axs[idx].plot(np.abs(smooth_fft_means[idx])) + axs[idx].plot(np.abs(baseline_fft_means[idx])) + axs[idx].set_title(f"FT Amplitude idx {idx}") + axs[idx].set_xlabel("Frequency") + axs[idx].set_ylabel("Amplitude") + axs[idx].legend(["smooth", "baseline"]) + +fig.tight_layout() +fig.savefig(fig_dir + "/" + smooth_name + ".png") diff --git a/gym/smooth_exploration/plot_play.py b/gym/smooth_exploration/plot_play.py new file mode 100644 index 00000000..5768f91a --- /dev/null +++ b/gym/smooth_exploration/plot_play.py @@ -0,0 +1,38 @@ +import numpy as np +import matplotlib.pyplot as plt + +name = "mini_cheetah_ref" +data_dir = "./data_play/" + name +fig_dir = "./figures_play/" + name + +# load data +dof_pos_obs = np.load(data_dir + "/dof_pos_obs.npy")[0] +dof_pos_target = np.load(data_dir + "/dof_pos_target.npy")[0] +dof_vel = np.load(data_dir + "/dof_vel.npy")[0] + +# plot data +n_steps = 200 +fig, axs = plt.subplots(3, figsize=(10, 10)) +plt.suptitle(name) + +for i in range(3): + axs[0].plot(dof_pos_obs[:n_steps, i]) +axs[0].set_title("dof_pos_obs") 
+axs[0].legend(["idx 0", "idx 1", "idx 2"]) +axs[0].set_xlabel("time steps") + +for i in range(3): + axs[1].plot(dof_pos_target[:n_steps, i]) +axs[1].set_title("dof_pos_target") +axs[1].legend(["idx 0", "idx 1", "idx 2"]) +axs[1].set_xlabel("time steps") + +for i in range(3): + axs[2].plot(dof_vel[:n_steps, i]) +axs[2].set_title("dof_vel") +axs[2].legend(["idx 0", "idx 1", "idx 2"]) +axs[2].set_xlabel("time steps") + +plt.tight_layout() +plt.savefig(fig_dir + "/" + name + ".png") +plt.show() diff --git a/gym/smooth_exploration/plot_train.py b/gym/smooth_exploration/plot_train.py new file mode 100644 index 00000000..58c50396 --- /dev/null +++ b/gym/smooth_exploration/plot_train.py @@ -0,0 +1,93 @@ +import numpy as np +import matplotlib.pyplot as plt +import os + +FOURIER = True +SMOOTH = True +SAMPLE_FREQ = 16 +STEPS = 1000 + +name = "ref_sample_16_len_1000" +data_dir = "./data_train/" + name +fig_dir = "./figures_train/" + name + +if not os.path.exists(fig_dir): + os.makedirs(fig_dir) + +# load data +dof_pos_obs = np.load(data_dir + "/dof_pos_obs.npy")[0] +dof_pos_target = np.load(data_dir + "/dof_pos_target.npy")[0] +dof_vel = np.load(data_dir + "/dof_vel.npy")[0] +torques = np.load(data_dir + "/torques.npy")[0] +terminated = np.load(data_dir + "/terminated.npy")[0] + + +# plot fourier trainsform +def plot_fourier(data, it): + fig_ft, axs_ft = plt.subplots(2, figsize=(10, 10)) + for i in range(3): + ft = np.fft.fft(data[:, i]) + ft_half = ft[: len(ft) // 2] + axs_ft[0].plot(np.abs(ft_half)) + axs_ft[1].plot(np.angle(ft_half)) + + axs_ft[0].set_title("FT Amplitude") + axs_ft[0].set_xlabel("Frequency") + axs_ft[0].set_ylabel("Amplitude") + axs_ft[0].legend(["idx 0", "idx 1", "idx 2"]) + axs_ft[1].set_title("FT Phase") + axs_ft[1].set_xlabel("Frequency") + axs_ft[1].set_ylabel("Phase") + axs_ft[1].legend(["idx 0", "idx 1", "idx 2"]) + + fig_ft.savefig(fig_dir + "/dof_pos_target_FT_it_" + str(it) + ".png") + + +# plot data for each iteration +for it in range(0, dof_pos_obs.shape[0], 10): + # check if iteration terminated + terminate_idx = np.where(terminated[it, :, 0] == 1)[0] + if terminate_idx.size > 0: + n_steps = terminate_idx[0] + else: + n_steps = dof_pos_obs.shape[1] + print(n_steps) + + # generate figure + fig, axs = plt.subplots(4, figsize=(10, 10)) + plt.suptitle(name + " iteration " + str(it)) + + axs[0].set_title("dof_pos_obs") + for i in range(3): + axs[0].plot(dof_pos_obs[it, :n_steps, i]) + + axs[1].set_title("dof_pos_target") + for i in range(3): + axs[1].plot(dof_pos_target[it, :n_steps, i]) + if FOURIER and n_steps == STEPS: + plot_fourier(dof_pos_target[it, :n_steps, :], it) + + axs[2].set_title("dof_vel") + for i in range(3): + axs[2].plot(dof_vel[it, :n_steps, i]) + + axs[3].set_title("torques") + for i in range(3): + axs[3].plot(torques[it, :n_steps, i]) + + # format plots + for idx in range(4): + axs[idx].legend(["idx 0", "idx 1", "idx 2"]) + axs[idx].set_xlabel("time steps") + axs[idx].set_xlim([0, n_steps]) + + if SMOOTH: + # plot vertical lines where noise is resampled + for x in range(0, dof_pos_obs.shape[1], SAMPLE_FREQ): + axs[0].axvline(x, color="r", linestyle="--") + axs[1].axvline(x, color="r", linestyle="--") + axs[2].axvline(x, color="r", linestyle="--") + axs[3].axvline(x, color="r", linestyle="--") + + fig.tight_layout() + fig.savefig(fig_dir + "/" + name + "_it_" + str(it) + ".png") diff --git a/learning/algorithms/ppo.py b/learning/algorithms/ppo.py index 92898ad5..ff09d788 100644 --- a/learning/algorithms/ppo.py +++ b/learning/algorithms/ppo.py @@ -34,7 
+34,7 @@ import torch.nn as nn import torch.optim as optim -from learning.modules import ActorCritic +from learning.modules import ActorCritic, SmoothActor from learning.storage import RolloutStorage @@ -162,7 +162,11 @@ def update(self): old_mu_batch, old_sigma_batch, ) in generator: - self.actor_critic.act(obs_batch) + # TODO[lm]: Look into resampling noise here, gSDE paper seems to do it. + if isinstance(self.actor_critic.actor, SmoothActor): + batch_size = obs_batch.shape[0] + self.actor_critic.actor.sample_weights(batch_size) + self.actor_critic.actor.update_distribution(obs_batch) actions_log_prob_batch = self.actor_critic.get_actions_log_prob( actions_batch ) diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 5866052c..528ef882 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -6,6 +6,7 @@ create_uniform_generator, compute_generalized_advantages, ) +from learning.modules import SmoothActor class PPO2: @@ -26,6 +27,8 @@ def __init__( desired_kl=0.01, loss_fn="MSE", device="cpu", + lr_range=[1e-4, 1e-2], + lr_ratio=1.3, **kwargs, ): self.device = device @@ -33,6 +36,8 @@ def __init__( self.desired_kl = desired_kl self.schedule = schedule self.learning_rate = learning_rate + self.lr_range = lr_range + self.lr_ratio = lr_ratio # * PPO components self.actor = actor.to(self.device) @@ -99,7 +104,7 @@ def update_actor(self, data): self.mean_surrogate_loss = 0 counter = 0 - self.actor.act(data["actor_obs"]) + self.actor.update_distribution(data["actor_obs"]) data["old_sigma_batch"] = self.actor.action_std.detach() data["old_mu_batch"] = self.actor.action_mean.detach() data["old_actions_log_prob_batch"] = self.actor.get_actions_log_prob( @@ -111,8 +116,12 @@ def update_actor(self, data): batch_size = total_data // self.num_mini_batches generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) for batch in generator: + # * Re-sample noise for smooth actor + if isinstance(self.actor, SmoothActor): + self.actor.sample_weights(batch_size) + # ! refactor how this is done - self.actor.act(batch["actor_obs"]) + self.actor.update_distribution(batch["actor_obs"]) actions_log_prob_batch = self.actor.get_actions_log_prob(batch["actions"]) mu_batch = self.actor.action_mean sigma_batch = self.actor.action_std @@ -132,11 +141,16 @@ def update_actor(self, data): axis=-1, ) kl_mean = torch.mean(kl) + lr_min, lr_max = self.lr_range if kl_mean > self.desired_kl * 2.0: - self.learning_rate = max(1e-5, self.learning_rate / 1.5) + self.learning_rate = max( + lr_min, self.learning_rate / self.lr_ratio + ) elif kl_mean < self.desired_kl / 2.0 and kl_mean > 0.0: - self.learning_rate = min(1e-2, self.learning_rate * 1.5) + self.learning_rate = min( + lr_max, self.learning_rate * self.lr_ratio + ) for param_group in self.optimizer.param_groups: # ! 
check this diff --git a/learning/modules/__init__.py b/learning/modules/__init__.py index 3caf5fec..410e2f25 100644 --- a/learning/modules/__init__.py +++ b/learning/modules/__init__.py @@ -34,3 +34,4 @@ from .state_estimator import StateEstimatorNN from .actor import Actor from .critic import Critic +from .smooth_actor import SmoothActor diff --git a/learning/modules/actor.py b/learning/modules/actor.py index fbaa6868..9fc37c3b 100644 --- a/learning/modules/actor.py +++ b/learning/modules/actor.py @@ -5,6 +5,8 @@ from .utils import export_network from .utils import RunningMeanStd +from gym import LEGGED_GYM_ROOT_DIR + class Actor(nn.Module): def __init__( @@ -25,7 +27,11 @@ def __init__( self.num_obs = num_obs self.num_actions = num_actions - self.NN = create_MLP(num_obs, num_actions, hidden_dims, activation) + self.hidden_dims = hidden_dims + self.activation = activation + self.NN = create_MLP( + num_obs, num_actions, hidden_dims, activation, latent=False + ) # Action noise self.std = nn.Parameter(init_noise_std * torch.ones(num_actions)) @@ -33,6 +39,9 @@ def __init__( # disable args validation for speedup Normal.set_default_validate_args = False + # Debug mode for plotting + self.debug = False + @property def action_mean(self): return self.distribution.mean @@ -51,7 +60,12 @@ def update_distribution(self, observations): def act(self, observations): self.update_distribution(observations) - return self.distribution.sample() + sample = self.distribution.sample() + if self.debug: + mean = self.distribution.mean + path = f"{LEGGED_GYM_ROOT_DIR}/plots/distribution_baseline.csv" + self.log_actions(mean[0][2], sample[0][2], path) + return sample def get_actions_log_prob(self, actions): return self.distribution.log_prob(actions).sum(dim=-1) @@ -67,3 +81,7 @@ def forward(self, observations): def export(self, path): export_network(self, "policy", path, self.num_obs) + + def log_actions(self, mean, sample, path): + with open(path, "a") as f: + f.write(str(mean.item()) + ", " + str(sample.item()) + "\n") diff --git a/learning/modules/critic.py b/learning/modules/critic.py index 732f8eda..3ee930dc 100644 --- a/learning/modules/critic.py +++ b/learning/modules/critic.py @@ -15,7 +15,7 @@ def __init__( ): super().__init__() - self.NN = create_MLP(num_obs, 1, hidden_dims, activation) + self.NN = create_MLP(num_obs, 1, hidden_dims, activation, latent=False) self._normalize_obs = normalize_obs if self._normalize_obs: self.obs_rms = RunningMeanStd(num_obs) diff --git a/learning/modules/smooth_actor.py b/learning/modules/smooth_actor.py new file mode 100644 index 00000000..5cb1da89 --- /dev/null +++ b/learning/modules/smooth_actor.py @@ -0,0 +1,129 @@ +import torch +import torch.nn as nn +from torch.distributions import Normal + +from .actor import Actor +from .utils import create_MLP + +from gym import LEGGED_GYM_ROOT_DIR + + +# The following implementation is based on the gSDE paper. 
See code: +# https://github.com/DLR-RM/stable-baselines3/blob/56f20e40a2206bbb16501a0f600e29ce1b112ef1/stable_baselines3/common/distributions.py#L421C7-L421C38 +class SmoothActor(Actor): + weights_dist: Normal + _latent_sde: torch.Tensor + exploration_matrices: torch.Tensor + + def __init__( + self, + *args, + full_std: bool = True, + use_exp_ln: bool = True, + learn_features: bool = True, + epsilon: float = 1e-6, + log_std_init: float = 0.0, + **kwargs, + ): + super().__init__(*args, **kwargs) + self.full_std = full_std + self.use_exp_ln = use_exp_ln + self.learn_features = learn_features + self.epsilon = epsilon + self.log_std_init = log_std_init + + # Create latent NN and last layer + self.latent_net = create_MLP( + self.num_obs, + self.num_actions, + self.hidden_dims, + self.activation, + latent=True, + ) + self.latent_dim = self.hidden_dims[-1] + self.mean_actions_net = nn.Linear(self.latent_dim, self.num_actions) + # Reduce the number of parameters if needed + if self.full_std: + log_std = torch.ones(self.latent_dim, self.num_actions) + else: + log_std = torch.ones(self.latent_dim, 1) + self.log_std = nn.Parameter(log_std * self.log_std_init, requires_grad=True) + # Sample an exploration matrix + self.sample_weights() + self.distribution = None + + # Debug mode for plotting + self.debug = False + + def sample_weights(self, batch_size=1): + # Sample weights for the noise exploration matrix + std = self.get_std + self.weights_dist = Normal(torch.zeros_like(std), std) + # Pre-compute matrices in case of parallel exploration + self.exploration_matrices = self.weights_dist.rsample((batch_size,)) + + @property + def get_std(self): + # TODO[lm]: Check if this is ok, and can use action_std in ActorCritic normally + if self.use_exp_ln: + # From gSDE paper, it allows to keep variance + # above zero and prevent it from growing too fast + below_threshold = torch.exp(self.log_std) * (self.log_std <= 0) + # Avoid NaN: zeros values that are below zero + safe_log_std = self.log_std * (self.log_std > 0) + self.epsilon + above_threshold = (torch.log1p(safe_log_std) + 1.0) * (self.log_std > 0) + std = below_threshold + above_threshold + else: + # Use normal exponential + std = torch.exp(self.log_std) + + if self.full_std: + return std + assert self.latent_dim is not None + # Reduce the number of parameters: + return torch.ones(self.latent_dim, 1).to(self.log_std.device) * std + + def update_distribution(self, observations): + if self._normalize_obs: + with torch.no_grad(): + observations = self.obs_rms(observations) + # Get latent features and compute distribution + self._latent_sde = self.latent_net(observations) + if not self.learn_features: + self._latent_sde = self._latent_sde.detach() + if self._latent_sde.dim() == 2: + variance = torch.mm(self._latent_sde**2, self.get_std**2) + elif self._latent_sde.dim() == 3: + variance = torch.einsum("abc,cd->abd", self._latent_sde**2, self.get_std**2) + else: + raise ValueError("Invalid latent_sde dimension") + mean_actions = self.mean_actions_net(self._latent_sde) + self.distribution = Normal(mean_actions, torch.sqrt(variance + self.epsilon)) + + def act(self, observations): + self.update_distribution(observations) + mean = self.distribution.mean + sample = mean + self.get_noise() + if self.debug: + path = f"{LEGGED_GYM_ROOT_DIR}/plots/distribution_smooth.csv" + self.log_actions(mean[0][2], sample[0][2], path) + return sample + + def act_inference(self, observations): + if self._normalize_obs: + with torch.no_grad(): + observations = 
self.obs_rms(observations) + latent_sde = self.latent_net(observations) + mean_actions = self.mean_actions_net(latent_sde) + return mean_actions + + def get_noise(self): + latent_sde = self._latent_sde + if not self.learn_features: + latent_sde = latent_sde.detach() + # Use batch matrix multiplication for efficient computation + # (batch_size, n_features) -> (batch_size, 1, n_features) + latent_sde = latent_sde.unsqueeze(dim=1) + # (batch_size, 1, n_actions) + noise = torch.bmm(latent_sde, self.exploration_matrices.to(latent_sde.device)) + return noise.squeeze(dim=1) diff --git a/learning/modules/utils/neural_net.py b/learning/modules/utils/neural_net.py index 5fd6ab3b..6e6f41d6 100644 --- a/learning/modules/utils/neural_net.py +++ b/learning/modules/utils/neural_net.py @@ -1,9 +1,12 @@ import torch import os import copy +import numpy as np -def create_MLP(num_inputs, num_outputs, hidden_dims, activation, dropouts=None): +def create_MLP( + num_inputs, num_outputs, hidden_dims, activation, dropouts=None, latent=False +): activation = get_activation(activation) if dropouts is None: @@ -15,8 +18,12 @@ def create_MLP(num_inputs, num_outputs, hidden_dims, activation, dropouts=None): else: add_layer(layers, num_inputs, hidden_dims[0], activation, dropouts[0]) for i in range(len(hidden_dims)): + # TODO[lm]: Could also create a separate function that gives the latent + # representation used for smooth exploration (but if it doesn't mess up + # anything, this is simpler) if i == len(hidden_dims) - 1: - add_layer(layers, hidden_dims[i], num_outputs) + if not latent: + add_layer(layers, hidden_dims[i], num_outputs) else: add_layer( layers, @@ -56,7 +63,7 @@ def add_layer(layer_list, num_inputs, num_outputs, activation=None, dropout=0): layer_list.append(activation) -def export_network(network, network_name, path, num_inputs): +def export_network(network, network_name, path, num_inputs, latent=True): """ This function traces and exports the given network module in .pt and .onnx file formats. 
These can be used for evaluation on other systems @@ -74,7 +81,22 @@ def export_network(network, network_name, path, num_inputs): model = copy.deepcopy(network).to("cpu") # To trace model, must be evaluated once with arbitrary input model.eval() - dummy_input = torch.rand((1, num_inputs)) + dummy_input = torch.rand((num_inputs)) model_traced = torch.jit.trace(model, dummy_input) torch.jit.save(model_traced, path_TS) torch.onnx.export(model_traced, dummy_input, path_onnx) + + if latent: + # Export latent model + path_latent = os.path.join(path, network_name + "_latent.onnx") + model_latent = torch.nn.Sequential(model.obs_rms, model.latent_net) + model_latent.eval() + dummy_input = torch.rand((num_inputs)) + model_traced = torch.jit.trace(model_latent, dummy_input) + torch.onnx.export(model_traced, dummy_input, path_latent) + + # Save actor std of shape (num_actions, latent_dim) + # It is important that the shape is the same as the exploration matrix + path_std = os.path.join(path, network_name + "_std.txt") + std_transposed = model.get_std.numpy().T + np.savetxt(path_std, std_transposed) diff --git a/learning/runners/BaseRunner.py b/learning/runners/BaseRunner.py index 9e5ab2e3..39ce336d 100644 --- a/learning/runners/BaseRunner.py +++ b/learning/runners/BaseRunner.py @@ -1,6 +1,6 @@ import torch from learning.algorithms import * # noqa: F403 -from learning.modules import Actor, Critic +from learning.modules import Actor, Critic, SmoothActor from learning.utils import remove_zero_weighted_rewards @@ -22,7 +22,10 @@ def _set_up_alg(self): num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) num_actions = self.get_action_size(self.actor_cfg["actions"]) num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) - actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + if self.actor_cfg["smooth_exploration"]: + actor = SmoothActor(num_actor_obs, num_actions, **self.actor_cfg) + else: + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) critic = Critic(num_critic_obs, **self.critic_cfg) alg_class = eval(self.cfg["algorithm_class_name"]) self.alg = alg_class(actor, critic, device=self.device, **self.alg_cfg) diff --git a/learning/runners/old_policy_runner.py b/learning/runners/old_policy_runner.py index 42f019a3..8b58dbe4 100644 --- a/learning/runners/old_policy_runner.py +++ b/learning/runners/old_policy_runner.py @@ -4,7 +4,7 @@ from learning.utils import Logger from .BaseRunner import BaseRunner from learning.algorithms import PPO # noqa: F401 -from learning.modules import ActorCritic, Actor, Critic +from learning.modules import ActorCritic, Actor, Critic, SmoothActor logger = Logger() @@ -24,7 +24,10 @@ def _set_up_alg(self): num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) num_actions = self.get_action_size(self.actor_cfg["actions"]) num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) - actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + if self.actor_cfg["smooth_exploration"]: + actor = SmoothActor(num_actor_obs, num_actions, **self.actor_cfg) + else: + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) critic = Critic(num_critic_obs, **self.critic_cfg) actor_critic = ActorCritic(actor, critic) alg_class = eval(self.cfg["algorithm_class_name"]) @@ -42,6 +45,10 @@ def learn(self): self.save() + # * Initialize smooth exploration matrices + if self.actor_cfg["smooth_exploration"]: + self.alg.actor_critic.actor.sample_weights(batch_size=self.env.num_envs) + logger.tic("runtime") for self.it in range(self.it + 1, tot_iter + 1): 
logger.tic("iteration") @@ -49,6 +56,13 @@ def learn(self): # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): + # * Re-sample noise matrix for smooth exploration + sample_freq = self.actor_cfg["exploration_sample_freq"] + if self.actor_cfg["smooth_exploration"] and i % sample_freq == 0: + self.alg.actor_critic.actor.sample_weights( + batch_size=self.env.num_envs + ) + actions = self.alg.act(actor_obs, critic_obs) self.set_actions( self.actor_cfg["actions"], diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 2cd6a9d8..dded2739 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -21,7 +21,7 @@ def __init__(self, env, train_cfg, device="cpu"): self.device, ) - def learn(self): + def learn(self, states_to_log_dict=None): self.set_up_logger() rewards_dict = {} @@ -32,6 +32,10 @@ def learn(self): tot_iter = self.it + self.num_learning_iterations self.save() + # * Initialize smooth exploration matrices + if self.actor_cfg["smooth_exploration"]: + self.alg.actor.sample_weights(batch_size=self.env.num_envs) + # * start up storage transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) transition.update( @@ -50,13 +54,39 @@ def learn(self): device=self.device, ) + # burn in observation normalization. + with torch.inference_mode(): + for _ in range(200): + actions = self.alg.act(actor_obs, critic_obs) + self.set_actions( + self.actor_cfg["actions"], + actions, + self.actor_cfg["disable_actions"], + ) + self.env.step() + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + logger.tic("runtime") for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") logger.tic("collection") + + # * Simulate environment and log states + if states_to_log_dict is not None: + it_idx = self.it - 1 + if it_idx % 10 == 0: + self.sim_and_log_states(states_to_log_dict, it_idx) + # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): + # * Re-sample noise matrix for smooth exploration + sample_freq = self.actor_cfg["exploration_sample_freq"] + if self.actor_cfg["smooth_exploration"] and i % sample_freq == 0: + self.alg.actor.sample_weights(batch_size=self.env.num_envs) + actions = self.alg.act(actor_obs, critic_obs) self.set_actions( self.actor_cfg["actions"], @@ -137,9 +167,13 @@ def set_up_logger(self): ) logger.register_rewards(["total_rewards"]) logger.register_category( - "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] + "algorithm", + self.alg, + ["mean_value_loss", "mean_surrogate_loss", "learning_rate"], + ) + logger.register_category( + "actor", self.alg.actor, ["action_std", "entropy", "get_std"] ) - logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) @@ -178,3 +212,42 @@ def get_inference_actions(self): def export(self, path): self.alg.actor.export(path) + + def sim_and_log_states(self, states_to_log_dict, it_idx): + # Simulate environment for as many steps as expected in the dict. + # Log states to the dict, as well as whether the env terminated. 
+ steps = states_to_log_dict["terminated"].shape[2] + actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) + critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + + with torch.inference_mode(): + for i in range(steps): + sample_freq = self.policy_cfg["exploration_sample_freq"] + if self.policy_cfg["smooth_exploration"] and i % sample_freq == 0: + self.alg.actor_critic.actor.sample_weights( + batch_size=self.env.num_envs + ) + + actions = self.alg.act(actor_obs, critic_obs) + self.set_actions( + self.policy_cfg["actions"], + actions, + self.policy_cfg["disable_actions"], + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.policy_cfg["actor_obs"], self.policy_cfg["noise"] + ) + critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + + # Log states (just for the first env) + terminated = self.get_terminated()[0] + for state in states_to_log_dict: + if state == "terminated": + states_to_log_dict[state][0, it_idx, i, :] = terminated + else: + states_to_log_dict[state][0, it_idx, i, :] = getattr( + self.env, state + )[0, :] diff --git a/scripts/log_play.py b/scripts/log_play.py new file mode 100644 index 00000000..f7fd68e8 --- /dev/null +++ b/scripts/log_play.py @@ -0,0 +1,121 @@ +from gym.envs import __init__ # noqa: F401 +from gym.utils import get_args, task_registry +from gym.utils import KeyboardInterface +from gym.utils import VisualizationRecorder +from gym import LEGGED_GYM_ROOT_DIR + +# torch needs to be imported after isaacgym imports in local source +import torch +import os +import numpy as np + +TEST_TOTAL_TIMESTEPS = 1000 + + +def create_logging_dict(runner, test_total_timesteps): + states_to_log = [ + "dof_pos_target", + "dof_pos_obs", + "dof_vel", + "torques", + "commands", + # 'base_lin_vel', + # 'base_ang_vel', + # 'oscillators', + # 'grf', + # 'base_height' + ] + + states_to_log_dict = {} + + for state in states_to_log: + array_dim = runner.get_obs_size( + [ + state, + ] + ) + states_to_log_dict[state] = torch.zeros( + (1, test_total_timesteps, array_dim), device=runner.env.device + ) + return states_to_log_dict + + +def setup(args): + env_cfg, train_cfg = task_registry.create_cfgs(args) + env_cfg.env.num_envs = min(env_cfg.env.num_envs, 16) + if hasattr(env_cfg, "push_robots"): + env_cfg.push_robots.toggle = False + if hasattr(env_cfg, "commands"): + env_cfg.commands.resampling_time = 9999 + env_cfg.env.episode_length_s = 9999 + env_cfg.env.num_projectiles = 20 + task_registry.make_gym_and_sim() + env = task_registry.make_env(args.task, env_cfg) + env.cfg.init_state.reset_mode = "reset_to_basic" + train_cfg.runner.resume = True + train_cfg.logging.enable_local_saving = False + runner = task_registry.make_alg_runner(env, train_cfg) + + # * switch to evaluation mode (dropout for example) + runner.switch_to_eval() + return env, runner, train_cfg + + +def play(env, runner, train_cfg): + protocol_name = train_cfg.runner.experiment_name + + # * set up logging + log_file_path = os.path.join( + LEGGED_GYM_ROOT_DIR, + "gym", + "smooth_exploration", + "data_play", + protocol_name + ".npz", + ) + if not os.path.exists(os.path.dirname(log_file_path)): + os.makedirs(os.path.dirname(log_file_path)) + + states_to_log_dict = create_logging_dict(runner, TEST_TOTAL_TIMESTEPS) + + # * set up recording + if env.cfg.viewer.record: + recorder = VisualizationRecorder( + env, train_cfg.runner.experiment_name, train_cfg.runner.load_run + ) + + # * set up interface: GamepadInterface(env) or KeyboardInterface(env) + COMMANDS_INTERFACE = hasattr(env, "commands") + if 
COMMANDS_INTERFACE: + # interface = GamepadInterface(env) + interface = KeyboardInterface(env) + + for t in range(TEST_TOTAL_TIMESTEPS): + if COMMANDS_INTERFACE: + interface.update(env) + if env.cfg.viewer.record: + recorder.update(t) + runner.set_actions( + runner.policy_cfg["actions"], + runner.get_inference_actions(), + runner.policy_cfg["disable_actions"], + ) + env.step() + env.check_exit() + + # * log + for state in states_to_log_dict: + states_to_log_dict[state][:, t, :] = getattr(env, state)[0, :] + + # * save data + # first convert tensors to cpu + log_dict_cpu = {k: v.cpu() for k, v in states_to_log_dict.items()} + np.savez_compressed(log_file_path, **log_dict_cpu) + print("saved to ", log_file_path) + return states_to_log_dict + + +if __name__ == "__main__": + args = get_args() + with torch.no_grad(): + env, runner, train_cfg = setup(args) + play(env, runner, train_cfg) diff --git a/scripts/log_train.py b/scripts/log_train.py new file mode 100644 index 00000000..37fca0da --- /dev/null +++ b/scripts/log_train.py @@ -0,0 +1,101 @@ +from gym.envs import __init__ # noqa: F401 +from gym.utils import get_args, task_registry, randomize_episode_counters +from gym.utils.logging_and_saving import wandb_singleton +from gym.utils.logging_and_saving import local_code_save_helper +from gym import LEGGED_GYM_ROOT_DIR + +# torch needs to be imported after isaacgym imports in local source +import torch +import os +import numpy as np + +TRAIN_ITERATIONS = 100 +ROLLOUT_TIMESTEPS = 1000 + + +def create_logging_dict(runner): + states_to_log = [ + "dof_pos_target", + "dof_pos_obs", + "dof_vel", + "torques", + "commands", + # 'base_lin_vel', + # 'base_ang_vel', + # 'oscillators', + # 'grf', + # 'base_height' + ] + + states_to_log_dict = {} + + for state in states_to_log: + array_dim = runner.get_obs_size( + [ + state, + ] + ) + states_to_log_dict[state] = torch.zeros( + (1, TRAIN_ITERATIONS, ROLLOUT_TIMESTEPS, array_dim), + device=runner.env.device, + ) + + # log whether rollout terminated + states_to_log_dict["terminated"] = torch.zeros( + (1, TRAIN_ITERATIONS, ROLLOUT_TIMESTEPS, 1), device=runner.env.device + ) + return states_to_log_dict + + +def setup(): + args = get_args() + wandb_helper = wandb_singleton.WandbSingleton() + + env_cfg, train_cfg = task_registry.create_cfgs(args) + task_registry.make_gym_and_sim() + wandb_helper.setup_wandb(env_cfg=env_cfg, train_cfg=train_cfg, args=args) + env = task_registry.make_env(name=args.task, env_cfg=env_cfg) + randomize_episode_counters(env) + + train_cfg.runner.max_iterations = TRAIN_ITERATIONS + policy_runner = task_registry.make_alg_runner(env, train_cfg) + + local_code_save_helper.save_local_files_to_logs(train_cfg.log_dir) + + return train_cfg, policy_runner + + +def train(train_cfg, policy_runner): + protocol_name = train_cfg.runner.experiment_name + + # * set up logging + log_file_path = os.path.join( + LEGGED_GYM_ROOT_DIR, + "gym", + "smooth_exploration", + "data_train", + protocol_name + ".npz", + ) + if not os.path.exists(os.path.dirname(log_file_path)): + os.makedirs(os.path.dirname(log_file_path)) + + states_to_log_dict = create_logging_dict(policy_runner) + + # * train + # wandb_helper = wandb_singleton.WandbSingleton() + + policy_runner.learn(states_to_log_dict) + + # wandb_helper.close_wandb() + + # * save data + # first convert tensors to cpu + log_dict_cpu = {k: v.cpu() for k, v in states_to_log_dict.items()} + np.savez_compressed(log_file_path, **log_dict_cpu) + print("saved to ", log_file_path) + return states_to_log_dict + + +if 
__name__ == "__main__": + train_cfg, policy_runner = setup() + train(train_cfg=train_cfg, policy_runner=policy_runner) diff --git a/scripts/sweep_configs/sweep_config_smooth_exploration.json b/scripts/sweep_configs/sweep_config_smooth_exploration.json new file mode 100644 index 00000000..49fc4125 --- /dev/null +++ b/scripts/sweep_configs/sweep_config_smooth_exploration.json @@ -0,0 +1,16 @@ +{ + "method": "grid", + "metric": { + "name": "rewards/total_rewards", + "goal": "maximize" + }, + "run_cap": 10, + "parameters": { + "train_cfg.policy.exploration_sample_freq":{ + "values": [4, 8, 16, 32] + }, + "train_cfg.runner.num_steps_per_env":{ + "values": [32, 64] + } + } +} \ No newline at end of file