From bb85463c8c8cbd981673262a396f615ee38ba3fa Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Wed, 28 Feb 2024 13:30:12 -0500 Subject: [PATCH 01/19] smooth noise sampling and started gSDE --- .gitignore | 2 + .../mini_cheetah/mini_cheetah_ref_config.py | 2 +- learning/algorithms/ppo.py | 2 +- learning/modules/actor.py | 19 ++ learning/modules/actor_critic.py | 15 ++ learning/modules/utils/__init__.py | 1 + learning/modules/utils/gSDE.py | 191 ++++++++++++++++++ learning/modules/utils/neural_net.py | 9 +- learning/runners/on_policy_runner.py | 4 + plots/plot.py | 19 ++ 10 files changed, 260 insertions(+), 4 deletions(-) create mode 100644 learning/modules/utils/gSDE.py create mode 100644 plots/plot.py diff --git a/.gitignore b/.gitignore index 0b68375a..d2761927 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,8 @@ gym/wandb *.npz user/wandb_config.json *trajectories/ +plots/*.csv +plots/*.png # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 41bb634f..4db376ce 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -67,7 +67,7 @@ class scaling(MiniCheetahCfg.scaling): class MiniCheetahRefRunnerCfg(MiniCheetahRunnerCfg): seed = -1 - runner_class_name = "MyRunner" + runner_class_name = "OnPolicyRunner" class policy(MiniCheetahRunnerCfg.policy): actor_hidden_dims = [256, 256, 128] diff --git a/learning/algorithms/ppo.py b/learning/algorithms/ppo.py index 9c7454bd..3ca5a5b4 100644 --- a/learning/algorithms/ppo.py +++ b/learning/algorithms/ppo.py @@ -152,7 +152,7 @@ def update(self): old_mu_batch, old_sigma_batch, ) in generator: - self.actor_critic.act(obs_batch) + self.actor_critic.actor.update_distribution(obs_batch) actions_log_prob_batch = self.actor_critic.get_actions_log_prob( actions_batch ) diff --git a/learning/modules/actor.py b/learning/modules/actor.py index f9c82396..e588ea5e 100644 --- a/learning/modules/actor.py +++ b/learning/modules/actor.py @@ -37,6 +37,9 @@ def __init__( self.distribution = None # disable args validation for speedup Normal.set_default_validate_args = False + # smooth exploration + self.use_smooth_expl = True + self.episode_noise = None @property def action_mean(self): @@ -58,8 +61,24 @@ def update_distribution(self, observations): def act(self, observations): self.update_distribution(observations) + if self.use_smooth_expl: + return self.act_smooth() return self.distribution.sample() + def act_smooth(self): + mean = self.distribution.mean + if self.episode_noise is None: + sample = self.distribution.sample() + self.episode_noise = sample - self.distribution.mean + else: + sample = mean + self.episode_noise + # write to csv + with open( + "/home/lmolnar/workspace/QGym/plots/distribution_smooth.csv", "a" + ) as f: + f.write(str(mean[0][2].item()) + ", " + str(sample[0][2].item()) + "\n") + return sample + def get_actions_log_prob(self, actions): return self.distribution.log_prob(actions).sum(dim=-1) diff --git a/learning/modules/actor_critic.py b/learning/modules/actor_critic.py index ed837f5e..91365fcc 100644 --- a/learning/modules/actor_critic.py +++ b/learning/modules/actor_critic.py @@ -2,6 +2,7 @@ from .actor import Actor from .critic import Critic +from .utils import StateDependentNoiseDistribution class ActorCritic(nn.Module): @@ -15,6 +16,7 @@ def __init__( activation="elu", init_noise_std=1.0, normalize_obs=True, + use_sde=True, **kwargs, ): if kwargs: @@ -40,6 
+42,19 @@ def __init__( print(f"Actor MLP: {self.actor.NN}") print(f"Critic MLP: {self.critic.NN}") + # TODO[lm]: Decide how to handle the state dependent noise distribution in + # this class. In stable-baselines3 there is a class MlpExtractor which does + # what the Actor and Critic classes do here, just with the latent representation + # of the networks. Either I make a new class in a similar way and store the + # action distribution here, or I make the changes in Actor and Critic and change + # the distribution there. + if use_sde: + self.action_dist = StateDependentNoiseDistribution( + num_actions, + num_actor_obs, + num_critic_obs, + ) + @property def action_mean(self): return self.actor.action_mean diff --git a/learning/modules/utils/__init__.py b/learning/modules/utils/__init__.py index 1422b82d..3bc82600 100644 --- a/learning/modules/utils/__init__.py +++ b/learning/modules/utils/__init__.py @@ -1,2 +1,3 @@ from .neural_net import create_MLP, export_network from .normalize import RunningMeanStd +from .gSDE import StateDependentNoiseDistribution diff --git a/learning/modules/utils/gSDE.py b/learning/modules/utils/gSDE.py new file mode 100644 index 00000000..2920b796 --- /dev/null +++ b/learning/modules/utils/gSDE.py @@ -0,0 +1,191 @@ +from typing import Tuple, Optional + +import torch +import torch.nn as nn +from torch.distributions import Normal + + +# The follow implementation was used in the gSDE paper on smooth exploration. See code: +# https://github.com/DLR-RM/stable-baselines3/blob/56f20e40a2206bbb16501a0f600e29ce1b112ef1/stable_baselines3/common/distributions.py#L421C7-L421C38 +# TODO[lm]: Need to update some of the naming conventions to fit our codebase. +class StateDependentNoiseDistribution: + """ + Distribution class for using generalized State Dependent Exploration (gSDE). + It is used to create the noise exploration matrix and compute the log + probability of an action with that noise. + """ + + latent_sde_dim: Optional[int] + weights_dist: Normal + _latent_sde: torch.Tensor + exploration_mat: torch.Tensor + exploration_matrices: torch.Tensor + + def __init__( + self, + num_actions: int, + num_actor_obs: int, + num_critic_obs: int, + full_std: bool = True, + use_expln: bool = False, + learn_features: bool = False, + epsilon: float = 1e-6, + ): + self.num_actions = num_actions + self.num_actor_obs = num_actor_obs + self.num_critic_obs = num_critic_obs + self.full_std = full_std + self.use_expln = use_expln + self.learn_features = learn_features + self.epsilon = epsilon + self.mean_actions = None + self.log_std = None + + def get_std(self, log_std: torch.Tensor) -> torch.Tensor: + """ + Get the standard deviation from the learned parameter + (log of it by default). This ensures that the std is positive. 
+ + :param log_std: + :return: + """ + if self.use_expln: + # From gSDE paper, it allows to keep variance + # above zero and prevent it from growing too fast + below_threshold = torch.exp(log_std) * (log_std <= 0) + # Avoid NaN: zeros values that are below zero + safe_log_std = log_std * (log_std > 0) + self.epsilon + above_threshold = (torch.log1p(safe_log_std) + 1.0) * (log_std > 0) + std = below_threshold + above_threshold + else: + # Use normal exponential + std = torch.exp(log_std) + + if self.full_std: + return std + assert self.latent_sde_dim is not None + # Reduce the number of parameters: + return torch.ones(self.latent_sde_dim, self.action_dim).to(log_std.device) * std + + def sample_weights(self, log_std: torch.Tensor, batch_size: int = 1) -> None: + """ + Sample weights for the noise exploration matrix, + using a centered Gaussian distribution. + + :param log_std: + :param batch_size: + """ + std = self.get_std(log_std) + self.weights_dist = Normal(torch.zeros_like(std), std) + # Reparametrization trick to pass gradients + self.exploration_mat = self.weights_dist.rsample() + # Pre-compute matrices in case of parallel exploration + self.exploration_matrices = self.weights_dist.rsample((batch_size,)) + + def proba_distribution_net( + self, + latent_dim: int, + log_std_init: float = -2.0, + latent_sde_dim: Optional[int] = None, + ) -> Tuple[nn.Module, nn.Parameter]: + """ + Create the layers and parameter that represent the distribution: + one output will be the deterministic action, the other parameter will be the + standard deviation of the distribution that control the weights of the noise + matrix. + + :param latent_dim: Dimension of the last layer of the policy (before the + action layer) + :param log_std_init: Initial value for the log standard deviation + :param latent_sde_dim: Dimension of the last layer of the features extractor + for gSDE. By default, it is shared with the policy network. 
+ :return: + """ + # Network for the deterministic action, it represents the mean of the + # distribution + mean_actions_net = nn.Linear(latent_dim, self.action_dim) + # When we learn features for the noise, the feature dimension + # can be different between the policy and the noise network + self.latent_sde_dim = latent_dim if latent_sde_dim is None else latent_sde_dim + # Reduce the number of parameters if needed + log_std = ( + torch.ones(self.latent_sde_dim, self.action_dim) + if self.full_std + else torch.ones(self.latent_sde_dim, 1) + ) + # Transform it to a parameter so it can be optimized + log_std = nn.Parameter(log_std * log_std_init, requires_grad=True) + # Sample an exploration matrix + self.sample_weights(log_std) + return mean_actions_net, log_std + + def proba_distribution( + self, + mean_actions: torch.Tensor, + log_std: torch.Tensor, + latent_sde: torch.Tensor, + ) -> "StateDependentNoiseDistribution": + """ + Create the distribution given its parameters (mean, std) + + :param mean_actions: + :param log_std: + :param latent_sde: + :return: + """ + # Stop gradient if we don't want to influence the features + self._latent_sde = latent_sde if self.learn_features else latent_sde.detach() + variance = torch.mm(self._latent_sde**2, self.get_std(log_std) ** 2) + self.distribution = Normal(mean_actions, torch.sqrt(variance + self.epsilon)) + return self + + def log_prob(self, actions: torch.Tensor): + # TODO[lm]: verify that summed correctly (stable-baselines3 is different) + return self.distribution.log_prob(actions).sum(-1) + + def entropy(self): + # TODO[lm]: verify that summed correctly (stable-baselines3 is different) + return self.distribution.entropy().sum(-1) + + def sample(self) -> torch.Tensor: + noise = self.get_noise(self._latent_sde) + actions = self.distribution.mean + noise + return actions + + def mode(self) -> torch.Tensor: + return self.distribution.mean + + def get_noise(self, latent_sde: torch.Tensor) -> torch.Tensor: + latent_sde = latent_sde if self.learn_features else latent_sde.detach() + # Default case: only one exploration matrix + if len(latent_sde) == 1 or len(latent_sde) != len(self.exploration_matrices): + return torch.mm(latent_sde, self.exploration_mat) + # Use batch matrix multiplication for efficient computation + # (batch_size, n_features) -> (batch_size, 1, n_features) + latent_sde = latent_sde.unsqueeze(dim=1) + # (batch_size, 1, n_actions) + noise = torch.bmm(latent_sde, self.exploration_matrices) + return noise.squeeze(dim=1) + + def actions_from_params( + self, + mean_actions: torch.Tensor, + log_std: torch.Tensor, + latent_sde: torch.Tensor, + deterministic: bool = False, + ) -> torch.Tensor: + # Update the proba distribution + self.proba_distribution(mean_actions, log_std, latent_sde) + if deterministic: + return self.mode() + return self.sample() + + def log_prob_from_params( + self, + mean_actions: torch.Tensor, + log_std: torch.Tensor, + latent_sde: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + actions = self.actions_from_params(mean_actions, log_std, latent_sde) + log_prob = self.log_prob(actions) + return actions, log_prob diff --git a/learning/modules/utils/neural_net.py b/learning/modules/utils/neural_net.py index 5fd6ab3b..fa25d77b 100644 --- a/learning/modules/utils/neural_net.py +++ b/learning/modules/utils/neural_net.py @@ -3,7 +3,9 @@ import copy -def create_MLP(num_inputs, num_outputs, hidden_dims, activation, dropouts=None): +def create_MLP( + num_inputs, num_outputs, hidden_dims, activation, dropouts=None, 
latent=False +): activation = get_activation(activation) if dropouts is None: @@ -15,7 +17,10 @@ def create_MLP(num_inputs, num_outputs, hidden_dims, activation, dropouts=None): else: add_layer(layers, num_inputs, hidden_dims[0], activation, dropouts[0]) for i in range(len(hidden_dims)): - if i == len(hidden_dims) - 1: + # TODO[lm]: Could also create a separate function that gives the latent + # reprentation used for smooth exploration (but if it doesn't mess up + # anything, this is simpler) + if i == len(hidden_dims) - 1 and not latent: add_layer(layers, hidden_dims[i], num_outputs) else: add_layer( diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index e2fb524e..9bf7c9f7 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -37,6 +37,10 @@ def learn(self): # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): + # Reset noise for smooth exploration + if self.alg.actor_critic.actor.use_smooth_expl and i == 0: + self.alg.actor_critic.actor.episode_noise = None + actions = self.alg.act(actor_obs, critic_obs) self.set_actions( self.policy_cfg["actions"], diff --git a/plots/plot.py b/plots/plot.py new file mode 100644 index 00000000..2a33eafa --- /dev/null +++ b/plots/plot.py @@ -0,0 +1,19 @@ +import pandas as pd +import matplotlib.pyplot as plt + +# Read the CSV file +name = "distribution_smooth" +data = pd.read_csv(name + ".csv") + +# Plot the data +n = 200 +plt.plot(data.iloc[:n, 0]) +plt.plot(data.iloc[:n, 1]) +plt.xlabel("timestep") +plt.ylabel("action") +plt.title("Smoothing every rollout") +plt.legend(["mean", "sample"]) +# plt.show() + +# Save the plot to a file +plt.savefig(name + ".png") From 243cf0f787b268d4b6f75e35b6524e3de911108b Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 1 Mar 2024 10:41:25 -0500 Subject: [PATCH 02/19] adress comments --- gym/envs/base/legged_robot_config.py | 1 + gym/envs/mini_cheetah/mini_cheetah_config.py | 1 + .../mini_cheetah/mini_cheetah_osc_config.py | 1 + .../mini_cheetah/mini_cheetah_ref_config.py | 1 + gym/envs/mit_humanoid/mit_humanoid_config.py | 1 + learning/modules/actor.py | 44 +++++++++++-------- learning/modules/actor_critic.py | 32 +++++++++----- learning/modules/utils/gSDE.py | 29 +++--------- learning/runners/on_policy_runner.py | 2 +- 9 files changed, 58 insertions(+), 54 deletions(-) diff --git a/gym/envs/base/legged_robot_config.py b/gym/envs/base/legged_robot_config.py index 0de83e0e..144f8323 100644 --- a/gym/envs/base/legged_robot_config.py +++ b/gym/envs/base/legged_robot_config.py @@ -240,6 +240,7 @@ class policy: # can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" normalize_obs = True + smooth_exploration = False actor_obs = [ "observation_a", diff --git a/gym/envs/mini_cheetah/mini_cheetah_config.py b/gym/envs/mini_cheetah/mini_cheetah_config.py index de015b21..ef9896e0 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_config.py @@ -130,6 +130,7 @@ class policy(LeggedRobotRunnerCfg.policy): critic_hidden_dims = [128, 64] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + smooth_exploration = False actor_obs = [ "base_lin_vel", diff --git a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py index a08d5179..45191a9f 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py @@ -166,6 +166,7 @@ class 
policy(MiniCheetahRunnerCfg.policy): critic_hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + smooth_exploration = False actor_obs = [ "base_ang_vel", diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 4db376ce..2f894bde 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -74,6 +74,7 @@ class policy(MiniCheetahRunnerCfg.policy): critic_hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + smooth_exploration = True actor_obs = [ "base_ang_vel", diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index 3439e58c..25db6163 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -187,6 +187,7 @@ class policy(LeggedRobotRunnerCfg.policy): critic_hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + smooth_exploration = False actor_obs = [ "base_height", diff --git a/learning/modules/actor.py b/learning/modules/actor.py index e588ea5e..857d7326 100644 --- a/learning/modules/actor.py +++ b/learning/modules/actor.py @@ -5,6 +5,8 @@ from .utils import export_network from .utils import RunningMeanStd +from gym import LEGGED_GYM_ROOT_DIR + class Actor(nn.Module): def __init__( @@ -37,9 +39,6 @@ def __init__( self.distribution = None # disable args validation for speedup Normal.set_default_validate_args = False - # smooth exploration - self.use_smooth_expl = True - self.episode_noise = None @property def action_mean(self): @@ -61,24 +60,8 @@ def update_distribution(self, observations): def act(self, observations): self.update_distribution(observations) - if self.use_smooth_expl: - return self.act_smooth() return self.distribution.sample() - def act_smooth(self): - mean = self.distribution.mean - if self.episode_noise is None: - sample = self.distribution.sample() - self.episode_noise = sample - self.distribution.mean - else: - sample = mean + self.episode_noise - # write to csv - with open( - "/home/lmolnar/workspace/QGym/plots/distribution_smooth.csv", "a" - ) as f: - f.write(str(mean[0][2].item()) + ", " + str(sample[0][2].item()) + "\n") - return sample - def get_actions_log_prob(self, actions): return self.distribution.log_prob(actions).sum(dim=-1) @@ -97,3 +80,26 @@ def normalize(self, observation): def export(self, path): export_network(self, "policy", path, self.num_obs) + + +class SmoothActor(Actor): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # Noise for smooth exploration + self.episode_noise = None + self.debug = False + + def act(self, observations): + self.update_distribution(observations) + mean = self.distribution.mean + if self.episode_noise is None: + sample = self.distribution.sample() + self.episode_noise = sample - self.distribution.mean + else: + sample = mean + self.episode_noise + + if self.debug: + # write to csv (used for plotting) + with open(f"{LEGGED_GYM_ROOT_DIR}/plots/distribution_smooth.csv", "a") as f: + f.write(str(mean[0][2].item()) + ", " + str(sample[0][2].item()) + "\n") + return sample diff --git a/learning/modules/actor_critic.py b/learning/modules/actor_critic.py index 91365fcc..669a9f6a 100644 --- a/learning/modules/actor_critic.py +++ b/learning/modules/actor_critic.py @@ -1,6 +1,6 @@ import torch.nn as nn -from .actor import 
Actor +from .actor import Actor, SmoothActor from .critic import Critic from .utils import StateDependentNoiseDistribution @@ -16,7 +16,7 @@ def __init__( activation="elu", init_noise_std=1.0, normalize_obs=True, - use_sde=True, + smooth_exploration=False, **kwargs, ): if kwargs: @@ -26,14 +26,24 @@ def __init__( ) super(ActorCritic, self).__init__() - self.actor = Actor( - num_actor_obs, - num_actions, - actor_hidden_dims, - activation, - init_noise_std, - normalize_obs, - ) + if smooth_exploration: + self.actor = SmoothActor( + num_actor_obs, + num_actions, + actor_hidden_dims, + activation, + init_noise_std, + normalize_obs, + ) + else: + self.actor = Actor( + num_actor_obs, + num_actions, + actor_hidden_dims, + activation, + init_noise_std, + normalize_obs, + ) self.critic = Critic( num_critic_obs, critic_hidden_dims, activation, normalize_obs @@ -48,7 +58,7 @@ def __init__( # of the networks. Either I make a new class in a similar way and store the # action distribution here, or I make the changes in Actor and Critic and change # the distribution there. - if use_sde: + if smooth_exploration: self.action_dist = StateDependentNoiseDistribution( num_actions, num_actor_obs, diff --git a/learning/modules/utils/gSDE.py b/learning/modules/utils/gSDE.py index 2920b796..6e04b0e9 100644 --- a/learning/modules/utils/gSDE.py +++ b/learning/modules/utils/gSDE.py @@ -5,7 +5,8 @@ from torch.distributions import Normal -# The follow implementation was used in the gSDE paper on smooth exploration. See code: +# The following implementation was used in the gSDE paper on smooth exploration. +# See code: # https://github.com/DLR-RM/stable-baselines3/blob/56f20e40a2206bbb16501a0f600e29ce1b112ef1/stable_baselines3/common/distributions.py#L421C7-L421C38 # TODO[lm]: Need to update some of the naming conventions to fit our codebase. class StateDependentNoiseDistribution: @@ -45,9 +46,6 @@ def get_std(self, log_std: torch.Tensor) -> torch.Tensor: """ Get the standard deviation from the learned parameter (log of it by default). This ensures that the std is positive. - - :param log_std: - :return: """ if self.use_expln: # From gSDE paper, it allows to keep variance @@ -71,9 +69,6 @@ def sample_weights(self, log_std: torch.Tensor, batch_size: int = 1) -> None: """ Sample weights for the noise exploration matrix, using a centered Gaussian distribution. - - :param log_std: - :param batch_size: """ std = self.get_std(log_std) self.weights_dist = Normal(torch.zeros_like(std), std) @@ -82,7 +77,7 @@ def sample_weights(self, log_std: torch.Tensor, batch_size: int = 1) -> None: # Pre-compute matrices in case of parallel exploration self.exploration_matrices = self.weights_dist.rsample((batch_size,)) - def proba_distribution_net( + def get_distribution_net( self, latent_dim: int, log_std_init: float = -2.0, @@ -93,13 +88,6 @@ def proba_distribution_net( one output will be the deterministic action, the other parameter will be the standard deviation of the distribution that control the weights of the noise matrix. - - :param latent_dim: Dimension of the last layer of the policy (before the - action layer) - :param log_std_init: Initial value for the log standard deviation - :param latent_sde_dim: Dimension of the last layer of the features extractor - for gSDE. By default, it is shared with the policy network. 
- :return: """ # Network for the deterministic action, it represents the mean of the # distribution @@ -119,7 +107,7 @@ def proba_distribution_net( self.sample_weights(log_std) return mean_actions_net, log_std - def proba_distribution( + def get_distribution( self, mean_actions: torch.Tensor, log_std: torch.Tensor, @@ -127,11 +115,6 @@ def proba_distribution( ) -> "StateDependentNoiseDistribution": """ Create the distribution given its parameters (mean, std) - - :param mean_actions: - :param log_std: - :param latent_sde: - :return: """ # Stop gradient if we don't want to influence the features self._latent_sde = latent_sde if self.learn_features else latent_sde.detach() @@ -174,8 +157,8 @@ def actions_from_params( latent_sde: torch.Tensor, deterministic: bool = False, ) -> torch.Tensor: - # Update the proba distribution - self.proba_distribution(mean_actions, log_std, latent_sde) + # Update the distribution + self.get_distribution(mean_actions, log_std, latent_sde) if deterministic: return self.mode() return self.sample() diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 9bf7c9f7..1cc3e0ef 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -38,7 +38,7 @@ def learn(self): with torch.inference_mode(): for i in range(self.num_steps_per_env): # Reset noise for smooth exploration - if self.alg.actor_critic.actor.use_smooth_expl and i == 0: + if self.policy_cfg["smooth_exploration"] and i == 0: self.alg.actor_critic.actor.episode_noise = None actions = self.alg.act(actor_obs, critic_obs) From d838dc6d140b648f84300efd4f92c9f714745e1c Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 1 Mar 2024 11:34:49 -0500 Subject: [PATCH 03/19] start moving things to SmoothActor --- learning/modules/actor.py | 22 +++++++++++++++++++++- learning/modules/actor_critic.py | 14 -------------- learning/modules/critic.py | 2 +- learning/modules/utils/gSDE.py | 2 -- learning/modules/utils/neural_net.py | 5 +++-- 5 files changed, 25 insertions(+), 20 deletions(-) diff --git a/learning/modules/actor.py b/learning/modules/actor.py index 857d7326..c8ffe255 100644 --- a/learning/modules/actor.py +++ b/learning/modules/actor.py @@ -4,6 +4,7 @@ from .utils import create_MLP from .utils import export_network from .utils import RunningMeanStd +from .utils import StateDependentNoiseDistribution from gym import LEGGED_GYM_ROOT_DIR @@ -32,7 +33,11 @@ def __init__( self.num_obs = num_obs self.num_actions = num_actions - self.NN = create_MLP(num_obs, num_actions, hidden_dims, activation) + self.hidden_dims = hidden_dims + self.activation = activation + self.NN = create_MLP( + num_obs, num_actions, hidden_dims, activation, latent=False + ) # Action noise self.std = nn.Parameter(init_noise_std * torch.ones(num_actions)) @@ -85,11 +90,26 @@ def export(self, path): class SmoothActor(Actor): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + # Create latent NN + self.NN = create_MLP( + self.num_obs, + self.num_actions, + self.hidden_dims, + self.activation, + latent=True, + ) + # State dependent action distribution + self.distribution = StateDependentNoiseDistribution( + self.num_actions, + self.num_obs, + ) # Noise for smooth exploration self.episode_noise = None + # Debug mode for plotting self.debug = False def act(self, observations): + # TODO[lm]: update distribution for gSDE self.update_distribution(observations) mean = self.distribution.mean if self.episode_noise is None: diff --git 
a/learning/modules/actor_critic.py b/learning/modules/actor_critic.py index 669a9f6a..770fdc71 100644 --- a/learning/modules/actor_critic.py +++ b/learning/modules/actor_critic.py @@ -2,7 +2,6 @@ from .actor import Actor, SmoothActor from .critic import Critic -from .utils import StateDependentNoiseDistribution class ActorCritic(nn.Module): @@ -52,19 +51,6 @@ def __init__( print(f"Actor MLP: {self.actor.NN}") print(f"Critic MLP: {self.critic.NN}") - # TODO[lm]: Decide how to handle the state dependent noise distribution in - # this class. In stable-baselines3 there is a class MlpExtractor which does - # what the Actor and Critic classes do here, just with the latent representation - # of the networks. Either I make a new class in a similar way and store the - # action distribution here, or I make the changes in Actor and Critic and change - # the distribution there. - if smooth_exploration: - self.action_dist = StateDependentNoiseDistribution( - num_actions, - num_actor_obs, - num_critic_obs, - ) - @property def action_mean(self): return self.actor.action_mean diff --git a/learning/modules/critic.py b/learning/modules/critic.py index 47e0decf..430eca33 100644 --- a/learning/modules/critic.py +++ b/learning/modules/critic.py @@ -20,7 +20,7 @@ def __init__( ) super().__init__() - self.NN = create_MLP(num_obs, 1, hidden_dims, activation) + self.NN = create_MLP(num_obs, 1, hidden_dims, activation, latent=False) self._normalize_obs = normalize_obs if self._normalize_obs: self.obs_rms = RunningMeanStd(num_obs) diff --git a/learning/modules/utils/gSDE.py b/learning/modules/utils/gSDE.py index 6e04b0e9..cd100053 100644 --- a/learning/modules/utils/gSDE.py +++ b/learning/modules/utils/gSDE.py @@ -26,7 +26,6 @@ def __init__( self, num_actions: int, num_actor_obs: int, - num_critic_obs: int, full_std: bool = True, use_expln: bool = False, learn_features: bool = False, @@ -34,7 +33,6 @@ def __init__( ): self.num_actions = num_actions self.num_actor_obs = num_actor_obs - self.num_critic_obs = num_critic_obs self.full_std = full_std self.use_expln = use_expln self.learn_features = learn_features diff --git a/learning/modules/utils/neural_net.py b/learning/modules/utils/neural_net.py index fa25d77b..057e724f 100644 --- a/learning/modules/utils/neural_net.py +++ b/learning/modules/utils/neural_net.py @@ -20,8 +20,9 @@ def create_MLP( # TODO[lm]: Could also create a separate function that gives the latent # reprentation used for smooth exploration (but if it doesn't mess up # anything, this is simpler) - if i == len(hidden_dims) - 1 and not latent: - add_layer(layers, hidden_dims[i], num_outputs) + if i == len(hidden_dims) - 1: + if not latent: + add_layer(layers, hidden_dims[i], num_outputs) else: add_layer( layers, From aea5d88fb736087108cf37e66aa0ea8093638514 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 1 Mar 2024 16:25:05 -0500 Subject: [PATCH 04/19] moved everything to SmoothActor (it runs) --- learning/modules/actor.py | 41 ------- learning/modules/actor_critic.py | 3 +- learning/modules/smooth_actor.py | 130 ++++++++++++++++++++ learning/modules/utils/__init__.py | 1 - learning/modules/utils/gSDE.py | 172 --------------------------- learning/runners/on_policy_runner.py | 7 +- plots/plot.py | 10 +- 7 files changed, 140 insertions(+), 224 deletions(-) create mode 100644 learning/modules/smooth_actor.py delete mode 100644 learning/modules/utils/gSDE.py diff --git a/learning/modules/actor.py b/learning/modules/actor.py index c8ffe255..17cf0566 100644 --- a/learning/modules/actor.py +++ 
b/learning/modules/actor.py @@ -4,9 +4,6 @@ from .utils import create_MLP from .utils import export_network from .utils import RunningMeanStd -from .utils import StateDependentNoiseDistribution - -from gym import LEGGED_GYM_ROOT_DIR class Actor(nn.Module): @@ -85,41 +82,3 @@ def normalize(self, observation): def export(self, path): export_network(self, "policy", path, self.num_obs) - - -class SmoothActor(Actor): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - # Create latent NN - self.NN = create_MLP( - self.num_obs, - self.num_actions, - self.hidden_dims, - self.activation, - latent=True, - ) - # State dependent action distribution - self.distribution = StateDependentNoiseDistribution( - self.num_actions, - self.num_obs, - ) - # Noise for smooth exploration - self.episode_noise = None - # Debug mode for plotting - self.debug = False - - def act(self, observations): - # TODO[lm]: update distribution for gSDE - self.update_distribution(observations) - mean = self.distribution.mean - if self.episode_noise is None: - sample = self.distribution.sample() - self.episode_noise = sample - self.distribution.mean - else: - sample = mean + self.episode_noise - - if self.debug: - # write to csv (used for plotting) - with open(f"{LEGGED_GYM_ROOT_DIR}/plots/distribution_smooth.csv", "a") as f: - f.write(str(mean[0][2].item()) + ", " + str(sample[0][2].item()) + "\n") - return sample diff --git a/learning/modules/actor_critic.py b/learning/modules/actor_critic.py index 770fdc71..29f6cdf7 100644 --- a/learning/modules/actor_critic.py +++ b/learning/modules/actor_critic.py @@ -1,6 +1,7 @@ import torch.nn as nn -from .actor import Actor, SmoothActor +from .actor import Actor +from .smooth_actor import SmoothActor from .critic import Critic diff --git a/learning/modules/smooth_actor.py b/learning/modules/smooth_actor.py new file mode 100644 index 00000000..6c6a42c7 --- /dev/null +++ b/learning/modules/smooth_actor.py @@ -0,0 +1,130 @@ +import torch +import torch.nn as nn +from torch.distributions import Normal + +from .actor import Actor +from .utils import create_MLP + +from gym import LEGGED_GYM_ROOT_DIR + + +# The following implementation is based on the gSDE paper. 
See code: +# https://github.com/DLR-RM/stable-baselines3/blob/56f20e40a2206bbb16501a0f600e29ce1b112ef1/stable_baselines3/common/distributions.py#L421C7-L421C38 +class SmoothActor(Actor): + weights_dist: Normal + _latent_sde: torch.Tensor + exploration_mat: torch.Tensor + exploration_matrices: torch.Tensor + + def __init__( + self, + *args, + full_std: bool = True, + use_exp_ln: bool = False, + learn_features: bool = False, + epsilon: float = 1e-6, + log_std_init: float = -2.0, + **kwargs, + ): + super().__init__(*args, **kwargs) + self.full_std = full_std + self.use_expln = use_exp_ln + self.learn_features = learn_features + self.epsilon = epsilon + self.log_std_init = log_std_init + + # Create latent NN and last layer + self.latent_net = create_MLP( + self.num_obs, + self.num_actions, + self.hidden_dims, + self.activation, + latent=True, + ) + self.latent_dim = self.hidden_dims[-1] + self.mean_actions_net = nn.Linear(self.latent_dim, self.num_actions) + # Reduce the number of parameters if needed + if self.full_std: + log_std = torch.ones(self.latent_dim, self.num_actions) + else: + log_std = torch.ones(self.latent_dim, 1) + self.log_std = nn.Parameter(log_std * self.log_std_init, requires_grad=True) + # Sample an exploration matrix (this sets the exploration matrices) + self.sample_weights() + self.distribution = None + + # Debug mode for plotting + self.debug = False + + def sample_weights(self, batch_size=1): + # Sample weights for the noise exploration matrix + std = self.get_std() + self.weights_dist = Normal(torch.zeros_like(std), std) + # Reparametrization trick to pass gradients + self.exploration_mat = self.weights_dist.rsample() + # Pre-compute matrices in case of parallel exploration + self.exploration_matrices = self.weights_dist.rsample((batch_size,)) + + def get_std(self): + # TODO[lm]: Check if this is ok, and can use action_std in ActorCritic normally + if self.use_expln: + # From gSDE paper, it allows to keep variance + # above zero and prevent it from growing too fast + below_threshold = torch.exp(self.log_std) * (self.log_std <= 0) + # Avoid NaN: zeros values that are below zero + safe_log_std = self.log_std * (self.log_std > 0) + self.epsilon + above_threshold = (torch.log1p(safe_log_std) + 1.0) * (self.log_std > 0) + std = below_threshold + above_threshold + else: + # Use normal exponential + std = torch.exp(self.log_std) + + if self.full_std: + return std + assert self.latent_dim is not None + # Reduce the number of parameters: + return ( + torch.ones(self.latent_dim, self.num_actions).to(self.log_std.device) * std + ) + + def update_distribution(self, observations): + if self._normalize_obs: + observations = self.normalize(observations) + # Get latent features and compute distribution + self._latent_sde = self.latent_net(observations) + if not self.learn_features: + self._latent_sde = self._latent_sde.detach() + variance = torch.mm(self._latent_sde**2, self.get_std() ** 2) + mean_actions = self.mean_actions_net(self._latent_sde) + self.distribution = Normal(mean_actions, torch.sqrt(variance + self.epsilon)) + + def act(self, observations): + self.update_distribution(observations) + mean = self.distribution.mean + sample = mean + self.get_noise() + if self.debug: + # write to csv (used for plotting) + with open(f"{LEGGED_GYM_ROOT_DIR}/plots/distribution_smooth.csv", "a") as f: + f.write(str(mean[0][2].item()) + ", " + str(sample[0][2].item()) + "\n") + return sample + + def act_inference(self, observations): + if self._normalize_obs: + observations = 
self.normalize(observations) + latent_sde = self.latent_net(observations) + mean_actions = self.mean_actions_net(latent_sde) + return mean_actions + + def get_noise(self): + latent_sde = self._latent_sde + if not self.learn_features: + latent_sde = latent_sde.detach() + # Default case: only one exploration matrix + if len(latent_sde) == 1 or len(latent_sde) != len(self.exploration_matrices): + return torch.mm(latent_sde, self.exploration_mat.to(latent_sde.device)) + # Use batch matrix multiplication for efficient computation + # (batch_size, n_features) -> (batch_size, 1, n_features) + latent_sde = latent_sde.unsqueeze(dim=1) + # (batch_size, 1, n_actions) + noise = torch.bmm(latent_sde, self.exploration_matrices) + return noise.squeeze(dim=1) diff --git a/learning/modules/utils/__init__.py b/learning/modules/utils/__init__.py index 3bc82600..1422b82d 100644 --- a/learning/modules/utils/__init__.py +++ b/learning/modules/utils/__init__.py @@ -1,3 +1,2 @@ from .neural_net import create_MLP, export_network from .normalize import RunningMeanStd -from .gSDE import StateDependentNoiseDistribution diff --git a/learning/modules/utils/gSDE.py b/learning/modules/utils/gSDE.py deleted file mode 100644 index cd100053..00000000 --- a/learning/modules/utils/gSDE.py +++ /dev/null @@ -1,172 +0,0 @@ -from typing import Tuple, Optional - -import torch -import torch.nn as nn -from torch.distributions import Normal - - -# The following implementation was used in the gSDE paper on smooth exploration. -# See code: -# https://github.com/DLR-RM/stable-baselines3/blob/56f20e40a2206bbb16501a0f600e29ce1b112ef1/stable_baselines3/common/distributions.py#L421C7-L421C38 -# TODO[lm]: Need to update some of the naming conventions to fit our codebase. -class StateDependentNoiseDistribution: - """ - Distribution class for using generalized State Dependent Exploration (gSDE). - It is used to create the noise exploration matrix and compute the log - probability of an action with that noise. - """ - - latent_sde_dim: Optional[int] - weights_dist: Normal - _latent_sde: torch.Tensor - exploration_mat: torch.Tensor - exploration_matrices: torch.Tensor - - def __init__( - self, - num_actions: int, - num_actor_obs: int, - full_std: bool = True, - use_expln: bool = False, - learn_features: bool = False, - epsilon: float = 1e-6, - ): - self.num_actions = num_actions - self.num_actor_obs = num_actor_obs - self.full_std = full_std - self.use_expln = use_expln - self.learn_features = learn_features - self.epsilon = epsilon - self.mean_actions = None - self.log_std = None - - def get_std(self, log_std: torch.Tensor) -> torch.Tensor: - """ - Get the standard deviation from the learned parameter - (log of it by default). This ensures that the std is positive. 
- """ - if self.use_expln: - # From gSDE paper, it allows to keep variance - # above zero and prevent it from growing too fast - below_threshold = torch.exp(log_std) * (log_std <= 0) - # Avoid NaN: zeros values that are below zero - safe_log_std = log_std * (log_std > 0) + self.epsilon - above_threshold = (torch.log1p(safe_log_std) + 1.0) * (log_std > 0) - std = below_threshold + above_threshold - else: - # Use normal exponential - std = torch.exp(log_std) - - if self.full_std: - return std - assert self.latent_sde_dim is not None - # Reduce the number of parameters: - return torch.ones(self.latent_sde_dim, self.action_dim).to(log_std.device) * std - - def sample_weights(self, log_std: torch.Tensor, batch_size: int = 1) -> None: - """ - Sample weights for the noise exploration matrix, - using a centered Gaussian distribution. - """ - std = self.get_std(log_std) - self.weights_dist = Normal(torch.zeros_like(std), std) - # Reparametrization trick to pass gradients - self.exploration_mat = self.weights_dist.rsample() - # Pre-compute matrices in case of parallel exploration - self.exploration_matrices = self.weights_dist.rsample((batch_size,)) - - def get_distribution_net( - self, - latent_dim: int, - log_std_init: float = -2.0, - latent_sde_dim: Optional[int] = None, - ) -> Tuple[nn.Module, nn.Parameter]: - """ - Create the layers and parameter that represent the distribution: - one output will be the deterministic action, the other parameter will be the - standard deviation of the distribution that control the weights of the noise - matrix. - """ - # Network for the deterministic action, it represents the mean of the - # distribution - mean_actions_net = nn.Linear(latent_dim, self.action_dim) - # When we learn features for the noise, the feature dimension - # can be different between the policy and the noise network - self.latent_sde_dim = latent_dim if latent_sde_dim is None else latent_sde_dim - # Reduce the number of parameters if needed - log_std = ( - torch.ones(self.latent_sde_dim, self.action_dim) - if self.full_std - else torch.ones(self.latent_sde_dim, 1) - ) - # Transform it to a parameter so it can be optimized - log_std = nn.Parameter(log_std * log_std_init, requires_grad=True) - # Sample an exploration matrix - self.sample_weights(log_std) - return mean_actions_net, log_std - - def get_distribution( - self, - mean_actions: torch.Tensor, - log_std: torch.Tensor, - latent_sde: torch.Tensor, - ) -> "StateDependentNoiseDistribution": - """ - Create the distribution given its parameters (mean, std) - """ - # Stop gradient if we don't want to influence the features - self._latent_sde = latent_sde if self.learn_features else latent_sde.detach() - variance = torch.mm(self._latent_sde**2, self.get_std(log_std) ** 2) - self.distribution = Normal(mean_actions, torch.sqrt(variance + self.epsilon)) - return self - - def log_prob(self, actions: torch.Tensor): - # TODO[lm]: verify that summed correctly (stable-baselines3 is different) - return self.distribution.log_prob(actions).sum(-1) - - def entropy(self): - # TODO[lm]: verify that summed correctly (stable-baselines3 is different) - return self.distribution.entropy().sum(-1) - - def sample(self) -> torch.Tensor: - noise = self.get_noise(self._latent_sde) - actions = self.distribution.mean + noise - return actions - - def mode(self) -> torch.Tensor: - return self.distribution.mean - - def get_noise(self, latent_sde: torch.Tensor) -> torch.Tensor: - latent_sde = latent_sde if self.learn_features else latent_sde.detach() - # Default case: only 
one exploration matrix - if len(latent_sde) == 1 or len(latent_sde) != len(self.exploration_matrices): - return torch.mm(latent_sde, self.exploration_mat) - # Use batch matrix multiplication for efficient computation - # (batch_size, n_features) -> (batch_size, 1, n_features) - latent_sde = latent_sde.unsqueeze(dim=1) - # (batch_size, 1, n_actions) - noise = torch.bmm(latent_sde, self.exploration_matrices) - return noise.squeeze(dim=1) - - def actions_from_params( - self, - mean_actions: torch.Tensor, - log_std: torch.Tensor, - latent_sde: torch.Tensor, - deterministic: bool = False, - ) -> torch.Tensor: - # Update the distribution - self.get_distribution(mean_actions, log_std, latent_sde) - if deterministic: - return self.mode() - return self.sample() - - def log_prob_from_params( - self, - mean_actions: torch.Tensor, - log_std: torch.Tensor, - latent_sde: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: - actions = self.actions_from_params(mean_actions, log_std, latent_sde) - log_prob = self.log_prob(actions) - return actions, log_prob diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 1cc3e0ef..4f68e164 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -34,13 +34,12 @@ def learn(self): for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") logger.tic("collection") + # * Re-sample noise matrix for smooth exploration + if self.policy_cfg["smooth_exploration"]: + self.alg.actor_critic.actor.sample_weights() # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): - # Reset noise for smooth exploration - if self.policy_cfg["smooth_exploration"] and i == 0: - self.alg.actor_critic.actor.episode_noise = None - actions = self.alg.act(actor_obs, critic_obs) self.set_actions( self.policy_cfg["actions"], diff --git a/plots/plot.py b/plots/plot.py index 2a33eafa..adda11ce 100644 --- a/plots/plot.py +++ b/plots/plot.py @@ -5,13 +5,13 @@ name = "distribution_smooth" data = pd.read_csv(name + ".csv") -# Plot the data -n = 200 -plt.plot(data.iloc[:n, 0]) -plt.plot(data.iloc[:n, 1]) +# Plot the data (last n steps) +n = 500 +plt.plot(data.iloc[-n:, 0]) +plt.plot(data.iloc[-n:, 1]) plt.xlabel("timestep") plt.ylabel("action") -plt.title("Smoothing every rollout") +plt.title("gSDE") plt.legend(["mean", "sample"]) # plt.show() From 3603205cf8ab988e04af4e0b69412768b04b4607 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 1 Mar 2024 18:21:56 -0500 Subject: [PATCH 05/19] possibly resample in PPO update --- learning/algorithms/ppo.py | 6 +++++- learning/modules/__init__.py | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/learning/algorithms/ppo.py b/learning/algorithms/ppo.py index 3ca5a5b4..ad31f846 100644 --- a/learning/algorithms/ppo.py +++ b/learning/algorithms/ppo.py @@ -34,7 +34,7 @@ import torch.nn as nn import torch.optim as optim -from learning.modules import ActorCritic +from learning.modules import ActorCritic, SmoothActor from learning.storage import RolloutStorage @@ -152,6 +152,10 @@ def update(self): old_mu_batch, old_sigma_batch, ) in generator: + # TODO[lm]: Look into resampling noise here, gSDE paper seems to do it. 
+ if isinstance(self.actor_critic.actor, SmoothActor): + batch_size = obs_batch.shape[0] + self.actor_critic.actor.sample_weights(batch_size) self.actor_critic.actor.update_distribution(obs_batch) actions_log_prob_batch = self.actor_critic.get_actions_log_prob( actions_batch diff --git a/learning/modules/__init__.py b/learning/modules/__init__.py index 75cca5e2..86c96759 100644 --- a/learning/modules/__init__.py +++ b/learning/modules/__init__.py @@ -33,3 +33,4 @@ from .actor_critic import ActorCritic from .state_estimator import StateEstimatorNN from .actor import Actor +from .smooth_actor import SmoothActor From bb0a273982a5b90f3b97bbbfca4c8f7851fa5b63 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Mon, 4 Mar 2024 11:04:36 -0500 Subject: [PATCH 06/19] learn_features=True and correct sample dim --- learning/modules/smooth_actor.py | 4 ++-- learning/runners/on_policy_runner.py | 2 +- plots/plot.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/learning/modules/smooth_actor.py b/learning/modules/smooth_actor.py index 6c6a42c7..200f03f2 100644 --- a/learning/modules/smooth_actor.py +++ b/learning/modules/smooth_actor.py @@ -21,7 +21,7 @@ def __init__( *args, full_std: bool = True, use_exp_ln: bool = False, - learn_features: bool = False, + learn_features: bool = True, epsilon: float = 1e-6, log_std_init: float = -2.0, **kwargs, @@ -49,7 +49,7 @@ def __init__( else: log_std = torch.ones(self.latent_dim, 1) self.log_std = nn.Parameter(log_std * self.log_std_init, requires_grad=True) - # Sample an exploration matrix (this sets the exploration matrices) + # Sample an exploration matrix self.sample_weights() self.distribution = None diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 4f68e164..e183b816 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -36,7 +36,7 @@ def learn(self): logger.tic("collection") # * Re-sample noise matrix for smooth exploration if self.policy_cfg["smooth_exploration"]: - self.alg.actor_critic.actor.sample_weights() + self.alg.actor_critic.actor.sample_weights(batch_size=self.env.num_envs) # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): diff --git a/plots/plot.py b/plots/plot.py index adda11ce..5de8ba35 100644 --- a/plots/plot.py +++ b/plots/plot.py @@ -10,7 +10,7 @@ plt.plot(data.iloc[-n:, 0]) plt.plot(data.iloc[-n:, 1]) plt.xlabel("timestep") -plt.ylabel("action") +plt.ylabel("action (NN output)") plt.title("gSDE") plt.legend(["mean", "sample"]) # plt.show() From 40025c43de53627d9ca615c0aa40d9b88476755d Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Mon, 4 Mar 2024 12:01:29 -0500 Subject: [PATCH 07/19] adjust sample freq and update plotting --- gym/envs/mini_cheetah/mini_cheetah_ref_config.py | 1 + learning/modules/actor.py | 16 +++++++++++++++- learning/modules/smooth_actor.py | 7 +++---- learning/runners/on_policy_runner.py | 10 +++++++--- plots/plot.py | 4 ++-- 5 files changed, 28 insertions(+), 10 deletions(-) diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 2f894bde..fda3f345 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -75,6 +75,7 @@ class policy(MiniCheetahRunnerCfg.policy): # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" smooth_exploration = True + exploration_sample_freq = 8 actor_obs = [ "base_ang_vel", diff --git a/learning/modules/actor.py 
b/learning/modules/actor.py index 17cf0566..25af26e6 100644 --- a/learning/modules/actor.py +++ b/learning/modules/actor.py @@ -5,6 +5,8 @@ from .utils import export_network from .utils import RunningMeanStd +from gym import LEGGED_GYM_ROOT_DIR + class Actor(nn.Module): def __init__( @@ -42,6 +44,9 @@ def __init__( # disable args validation for speedup Normal.set_default_validate_args = False + # Debug mode for plotting + self.debug = False + @property def action_mean(self): return self.distribution.mean @@ -62,7 +67,12 @@ def update_distribution(self, observations): def act(self, observations): self.update_distribution(observations) - return self.distribution.sample() + sample = self.distribution.sample() + if self.debug: + mean = self.distribution.mean + path = f"{LEGGED_GYM_ROOT_DIR}/plots/distribution_baseline.csv" + self.log_actions(mean[0][2], sample[0][2], path) + return sample def get_actions_log_prob(self, actions): return self.distribution.log_prob(actions).sum(dim=-1) @@ -82,3 +92,7 @@ def normalize(self, observation): def export(self, path): export_network(self, "policy", path, self.num_obs) + + def log_actions(self, mean, sample, path): + with open(path, "a") as f: + f.write(str(mean.item()) + ", " + str(sample.item()) + "\n") diff --git a/learning/modules/smooth_actor.py b/learning/modules/smooth_actor.py index 200f03f2..bc262798 100644 --- a/learning/modules/smooth_actor.py +++ b/learning/modules/smooth_actor.py @@ -54,7 +54,7 @@ def __init__( self.distribution = None # Debug mode for plotting - self.debug = False + self.debug = True def sample_weights(self, batch_size=1): # Sample weights for the noise exploration matrix @@ -103,9 +103,8 @@ def act(self, observations): mean = self.distribution.mean sample = mean + self.get_noise() if self.debug: - # write to csv (used for plotting) - with open(f"{LEGGED_GYM_ROOT_DIR}/plots/distribution_smooth.csv", "a") as f: - f.write(str(mean[0][2].item()) + ", " + str(sample[0][2].item()) + "\n") + path = f"{LEGGED_GYM_ROOT_DIR}/plots/distribution_smooth.csv" + self.log_actions(mean[0][2], sample[0][2], path) return sample def act_inference(self, observations): diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index e183b816..3b7ce33e 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -34,12 +34,16 @@ def learn(self): for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") logger.tic("collection") - # * Re-sample noise matrix for smooth exploration - if self.policy_cfg["smooth_exploration"]: - self.alg.actor_critic.actor.sample_weights(batch_size=self.env.num_envs) # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): + # * Re-sample noise matrix for smooth exploration + sample_freq = self.policy_cfg["exploration_sample_freq"] + if self.policy_cfg["smooth_exploration"] and i % sample_freq == 0: + self.alg.actor_critic.actor.sample_weights( + batch_size=self.env.num_envs + ) + actions = self.alg.act(actor_obs, critic_obs) self.set_actions( self.policy_cfg["actions"], diff --git a/plots/plot.py b/plots/plot.py index 5de8ba35..5018cb4d 100644 --- a/plots/plot.py +++ b/plots/plot.py @@ -6,12 +6,12 @@ data = pd.read_csv(name + ".csv") # Plot the data (last n steps) -n = 500 +n = 200 plt.plot(data.iloc[-n:, 0]) plt.plot(data.iloc[-n:, 1]) plt.xlabel("timestep") plt.ylabel("action (NN output)") -plt.title("gSDE") +plt.title("gSDE (sample_freq=8)") plt.legend(["mean", "sample"]) # plt.show() From 
c3017461e5ba7629e02dc75a5448fc4bf81e57c4 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 15 Mar 2024 14:17:22 -0400 Subject: [PATCH 08/19] update log_std_init=0.0 and refactor --- learning/modules/smooth_actor.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/learning/modules/smooth_actor.py b/learning/modules/smooth_actor.py index bc262798..91754aeb 100644 --- a/learning/modules/smooth_actor.py +++ b/learning/modules/smooth_actor.py @@ -23,12 +23,12 @@ def __init__( use_exp_ln: bool = False, learn_features: bool = True, epsilon: float = 1e-6, - log_std_init: float = -2.0, + log_std_init: float = 0.0, **kwargs, ): super().__init__(*args, **kwargs) self.full_std = full_std - self.use_expln = use_exp_ln + self.use_exp_ln = use_exp_ln self.learn_features = learn_features self.epsilon = epsilon self.log_std_init = log_std_init @@ -54,7 +54,7 @@ def __init__( self.distribution = None # Debug mode for plotting - self.debug = True + self.debug = False def sample_weights(self, batch_size=1): # Sample weights for the noise exploration matrix @@ -67,7 +67,7 @@ def sample_weights(self, batch_size=1): def get_std(self): # TODO[lm]: Check if this is ok, and can use action_std in ActorCritic normally - if self.use_expln: + if self.use_exp_ln: # From gSDE paper, it allows to keep variance # above zero and prevent it from growing too fast below_threshold = torch.exp(self.log_std) * (self.log_std <= 0) @@ -83,9 +83,7 @@ def get_std(self): return std assert self.latent_dim is not None # Reduce the number of parameters: - return ( - torch.ones(self.latent_dim, self.num_actions).to(self.log_std.device) * std - ) + return torch.ones(self.latent_dim, 1).to(self.log_std.device) * std def update_distribution(self, observations): if self._normalize_obs: From a372a5274224c7b0bd18d6232452bb00cbec8778 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Wed, 3 Apr 2024 14:01:52 -0400 Subject: [PATCH 09/19] log joint data for training and play --- .gitignore | 6 +- gym/smooth_exploration/plot_play.py | 38 +++++++++ gym/smooth_exploration/plot_train.py | 55 +++++++++++++ learning/runners/on_policy_runner.py | 13 ++- plots/plot.py | 19 ----- scripts/log_play.py | 117 +++++++++++++++++++++++++++ scripts/log_train.py | 92 +++++++++++++++++++++ 7 files changed, 318 insertions(+), 22 deletions(-) create mode 100644 gym/smooth_exploration/plot_play.py create mode 100644 gym/smooth_exploration/plot_train.py delete mode 100644 plots/plot.py create mode 100644 scripts/log_play.py create mode 100644 scripts/log_train.py diff --git a/.gitignore b/.gitignore index d2761927..20093cee 100644 --- a/.gitignore +++ b/.gitignore @@ -16,8 +16,6 @@ gym/wandb *.npz user/wandb_config.json *trajectories/ -plots/*.csv -plots/*.png # Byte-compiled / optimized / DLL files __pycache__/ @@ -83,3 +81,7 @@ ipython_config.py venv/ env.bak/ venv.bak/ + +# Smooth exploration +gym/smooth_exploration/data +gym/smooth_exploration/figures diff --git a/gym/smooth_exploration/plot_play.py b/gym/smooth_exploration/plot_play.py new file mode 100644 index 00000000..eedee17f --- /dev/null +++ b/gym/smooth_exploration/plot_play.py @@ -0,0 +1,38 @@ +import numpy as np +import matplotlib.pyplot as plt + +name = "mini_cheetah_ref" +data_dir = "./data/play/" + name +fig_dir = "./figures/play" + +# load data +dof_pos_obs = np.load(data_dir + "/dof_pos_obs.npy")[0] +dof_pos_target = np.load(data_dir + "/dof_pos_target.npy")[0] +dof_vel = np.load(data_dir + "/dof_vel.npy")[0] + +# plot data +n_steps = 200 +fig, axs = plt.subplots(3, 
figsize=(10, 10)) +plt.suptitle(name) + +for i in range(3): + axs[0].plot(dof_pos_obs[:n_steps, i]) +axs[0].set_title("dof_pos_obs") +axs[0].legend(["idx 0", "idx 1", "idx 2"]) +axs[0].set_xlabel("time steps") + +for i in range(3): + axs[1].plot(dof_pos_target[:n_steps, i]) +axs[1].set_title("dof_pos_target") +axs[1].legend(["idx 0", "idx 1", "idx 2"]) +axs[1].set_xlabel("time steps") + +for i in range(3): + axs[2].plot(dof_vel[:n_steps, i]) +axs[2].set_title("dof_vel") +axs[2].legend(["idx 0", "idx 1", "idx 2"]) +axs[2].set_xlabel("time steps") + +plt.tight_layout() +plt.savefig(fig_dir + "/" + name + ".png") +plt.show() diff --git a/gym/smooth_exploration/plot_train.py b/gym/smooth_exploration/plot_train.py new file mode 100644 index 00000000..8a45729f --- /dev/null +++ b/gym/smooth_exploration/plot_train.py @@ -0,0 +1,55 @@ +import numpy as np +import matplotlib.pyplot as plt + +SMOOTH = True +name = "ref_sample_8" +data_dir = "./data/train/" + name +fig_dir = "./figures/train" + +# load data +dof_pos_obs = np.load(data_dir + "/dof_pos_obs.npy")[0] +dof_pos_target = np.load(data_dir + "/dof_pos_target.npy")[0] +dof_vel = np.load(data_dir + "/dof_vel.npy")[0] +torques = np.load(data_dir + "/torques.npy")[0] + +iterations = range(0, dof_pos_obs.shape[0], 10) + +# plot data for each iteration +for it in iterations: + fig, axs = plt.subplots(4, figsize=(10, 10)) + plt.suptitle(name + " iteration " + str(it)) + + for i in range(3): + axs[0].plot(dof_pos_obs[it, :, i]) + axs[0].set_title("dof_pos_obs") + axs[0].legend(["idx 0", "idx 1", "idx 2"]) + axs[0].set_xlabel("time steps") + + for i in range(3): + axs[1].plot(dof_pos_target[it, :, i]) + axs[1].set_title("dof_pos_target") + axs[1].legend(["idx 0", "idx 1", "idx 2"]) + axs[1].set_xlabel("time steps") + + for i in range(3): + axs[2].plot(dof_vel[it, :, i]) + axs[2].set_title("dof_vel") + axs[2].legend(["idx 0", "idx 1", "idx 2"]) + axs[2].set_xlabel("time steps") + + for i in range(3): + axs[3].plot(torques[it, :, i]) + axs[3].set_title("torques") + axs[3].legend(["idx 0", "idx 1", "idx 2"]) + axs[3].set_xlabel("time steps") + + if SMOOTH: + # plot vertical lines where noise is resampled + for x in range(0, dof_pos_obs.shape[1], 8): + axs[0].axvline(x, color="r", linestyle="--") + axs[1].axvline(x, color="r", linestyle="--") + axs[2].axvline(x, color="r", linestyle="--") + axs[3].axvline(x, color="r", linestyle="--") + + plt.tight_layout() + plt.savefig(fig_dir + "/" + name + "_it_" + str(it) + ".png") diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 9c167cd7..86541f9b 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -18,7 +18,7 @@ def __init__(self, env, train_cfg, device="cpu"): self.device, ) - def learn(self): + def learn(self, states_to_log_dict=None): self.set_up_logger() rewards_dict = {} @@ -53,6 +53,17 @@ def learn(self): self.env.step() + # * Log states + # This continuously overwrites the rollout data, so when the state + # dict is written to a file it contains the last rollout for each + # iteration. 
+ it_idx = self.it - 1 + if states_to_log_dict is not None: + for state in states_to_log_dict: + states_to_log_dict[state][0, it_idx, i, :] = getattr( + self.env, state + )[0, :] + actor_obs = self.get_noisy_obs( self.policy_cfg["actor_obs"], self.policy_cfg["noise"] ) diff --git a/plots/plot.py b/plots/plot.py deleted file mode 100644 index 5018cb4d..00000000 --- a/plots/plot.py +++ /dev/null @@ -1,19 +0,0 @@ -import pandas as pd -import matplotlib.pyplot as plt - -# Read the CSV file -name = "distribution_smooth" -data = pd.read_csv(name + ".csv") - -# Plot the data (last n steps) -n = 200 -plt.plot(data.iloc[-n:, 0]) -plt.plot(data.iloc[-n:, 1]) -plt.xlabel("timestep") -plt.ylabel("action (NN output)") -plt.title("gSDE (sample_freq=8)") -plt.legend(["mean", "sample"]) -# plt.show() - -# Save the plot to a file -plt.savefig(name + ".png") diff --git a/scripts/log_play.py b/scripts/log_play.py new file mode 100644 index 00000000..adc1b598 --- /dev/null +++ b/scripts/log_play.py @@ -0,0 +1,117 @@ +from gym.envs import __init__ # noqa: F401 +from gym.utils import get_args, task_registry +from gym.utils import KeyboardInterface +from gym.utils import VisualizationRecorder +from gym import LEGGED_GYM_ROOT_DIR + +# torch needs to be imported after isaacgym imports in local source +import torch +import os +import numpy as np + +TEST_TOTAL_TIMESTEPS = 1000 + + +def create_logging_dict(runner, test_total_timesteps): + states_to_log = [ + "dof_pos_target", + "dof_pos_obs", + "dof_vel", + "torques", + "commands", + # 'base_lin_vel', + # 'base_ang_vel', + # 'oscillators', + # 'grf', + # 'base_height' + ] + + states_to_log_dict = {} + + for state in states_to_log: + array_dim = runner.get_obs_size( + [ + state, + ] + ) + states_to_log_dict[state] = torch.zeros( + (1, test_total_timesteps, array_dim), device=runner.env.device + ) + return states_to_log_dict + + +def setup(args): + env_cfg, train_cfg = task_registry.create_cfgs(args) + env_cfg.env.num_envs = min(env_cfg.env.num_envs, 16) + if hasattr(env_cfg, "push_robots"): + env_cfg.push_robots.toggle = False + if hasattr(env_cfg, "commands"): + env_cfg.commands.resampling_time = 9999 + env_cfg.env.episode_length_s = 9999 + env_cfg.env.num_projectiles = 20 + task_registry.make_gym_and_sim() + env = task_registry.make_env(args.task, env_cfg) + env.cfg.init_state.reset_mode = "reset_to_basic" + train_cfg.runner.resume = True + train_cfg.logging.enable_local_saving = False + runner = task_registry.make_alg_runner(env, train_cfg) + + # * switch to evaluation mode (dropout for example) + runner.switch_to_eval() + return env, runner, train_cfg + + +def play(env, runner, train_cfg): + protocol_name = train_cfg.runner.experiment_name + + # * set up logging + log_file_path = os.path.join( + LEGGED_GYM_ROOT_DIR, "gym", "logs", "data", "play", protocol_name + ".npz" + ) + if not os.path.exists(os.path.dirname(log_file_path)): + os.makedirs(os.path.dirname(log_file_path)) + + states_to_log_dict = create_logging_dict(runner, TEST_TOTAL_TIMESTEPS) + + # * set up recording + if env.cfg.viewer.record: + recorder = VisualizationRecorder( + env, train_cfg.runner.experiment_name, train_cfg.runner.load_run + ) + + # * set up interface: GamepadInterface(env) or KeyboardInterface(env) + COMMANDS_INTERFACE = hasattr(env, "commands") + if COMMANDS_INTERFACE: + # interface = GamepadInterface(env) + interface = KeyboardInterface(env) + + for t in range(TEST_TOTAL_TIMESTEPS): + if COMMANDS_INTERFACE: + interface.update(env) + if env.cfg.viewer.record: + recorder.update(t) + 
runner.set_actions( + runner.policy_cfg["actions"], + runner.get_inference_actions(), + runner.policy_cfg["disable_actions"], + ) + env.step() + env.check_exit() + + # * log + for state in states_to_log_dict: + states_to_log_dict[state][:, t, :] = getattr(env, state)[0, :] + + # * save data + # first convert tensors to cpu + log_dict_cpu = {k: v.cpu() for k, v in states_to_log_dict.items()} + np.savez_compressed(log_file_path, **log_dict_cpu) + print("saved to ", log_file_path) + return states_to_log_dict + + +if __name__ == "__main__": + args = get_args() + with torch.no_grad(): + env, runner, train_cfg = setup(args) + play(env, runner, train_cfg) diff --git a/scripts/log_train.py b/scripts/log_train.py new file mode 100644 index 00000000..918d0b8f --- /dev/null +++ b/scripts/log_train.py @@ -0,0 +1,92 @@ +from gym.envs import __init__ # noqa: F401 +from gym.utils import get_args, task_registry, randomize_episode_counters +from gym.utils.logging_and_saving import wandb_singleton +from gym.utils.logging_and_saving import local_code_save_helper +from gym import LEGGED_GYM_ROOT_DIR + +# torch needs to be imported after isaacgym imports in local source +import torch +import os +import numpy as np + +TRAIN_ITERATIONS = 100 +ROLLOUT_TIMESTEPS = 32 + + +def create_logging_dict(runner): + states_to_log = [ + "dof_pos_target", + "dof_pos_obs", + "dof_vel", + "torques", + "commands", + # 'base_lin_vel', + # 'base_ang_vel', + # 'oscillators', + # 'grf', + # 'base_height' + ] + + states_to_log_dict = {} + + for state in states_to_log: + array_dim = runner.get_obs_size( + [ + state, + ] + ) + states_to_log_dict[state] = torch.zeros( + (1, TRAIN_ITERATIONS, ROLLOUT_TIMESTEPS, array_dim), + device=runner.env.device, + ) + return states_to_log_dict + + +def setup(): + args = get_args() + wandb_helper = wandb_singleton.WandbSingleton() + + env_cfg, train_cfg = task_registry.create_cfgs(args) + task_registry.make_gym_and_sim() + wandb_helper.setup_wandb(env_cfg=env_cfg, train_cfg=train_cfg, args=args) + env = task_registry.make_env(name=args.task, env_cfg=env_cfg) + randomize_episode_counters(env) + + train_cfg.runner.max_iterations = TRAIN_ITERATIONS + policy_runner = task_registry.make_alg_runner(env, train_cfg) + + local_code_save_helper.save_local_files_to_logs(train_cfg.log_dir) + + return train_cfg, policy_runner + + +def train(train_cfg, policy_runner): + protocol_name = train_cfg.runner.experiment_name + + # * set up logging + log_file_path = os.path.join( + LEGGED_GYM_ROOT_DIR, "gym", "logs", "data", "train", protocol_name + ".npz" + ) + if not os.path.exists(os.path.dirname(log_file_path)): + os.makedirs(os.path.dirname(log_file_path)) + + states_to_log_dict = create_logging_dict(policy_runner) + + # * train + # wandb_helper = wandb_singleton.WandbSingleton() + + policy_runner.learn(states_to_log_dict) + + # wandb_helper.close_wandb() + + # * save data + # first convert tensors to cpu + log_dict_cpu = {k: v.cpu() for k, v in states_to_log_dict.items()} + np.savez_compressed(log_file_path, **log_dict_cpu) + print("saved to ", log_file_path) + return states_to_log_dict + + +if __name__ == "__main__": + train_cfg, policy_runner = setup() + train(train_cfg=train_cfg, policy_runner=policy_runner) From 599b2cab114cbcbae72ead09ed682b4fa1b92ff7 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Sat, 6 Apr 2024 18:05:29 -0400 Subject: [PATCH 10/19] update logging and plotting --- .gitignore | 4 +- gym/smooth_exploration/plot_play.py | 4 +- gym/smooth_exploration/plot_train.py | 86 
++++++++++++++++++++-------- learning/runners/on_policy_runner.py | 57 ++++++++++++++---- scripts/log_play.py | 6 +- scripts/log_train.py | 13 ++++- 6 files changed, 128 insertions(+), 42 deletions(-) diff --git a/.gitignore b/.gitignore index 20093cee..c78e6696 100644 --- a/.gitignore +++ b/.gitignore @@ -83,5 +83,5 @@ env.bak/ venv.bak/ # Smooth exploration -gym/smooth_exploration/data -gym/smooth_exploration/figures +gym/smooth_exploration/data* +gym/smooth_exploration/figures* diff --git a/gym/smooth_exploration/plot_play.py b/gym/smooth_exploration/plot_play.py index eedee17f..5768f91a 100644 --- a/gym/smooth_exploration/plot_play.py +++ b/gym/smooth_exploration/plot_play.py @@ -2,8 +2,8 @@ import matplotlib.pyplot as plt name = "mini_cheetah_ref" -data_dir = "./data/play/" + name -fig_dir = "./figures/play" +data_dir = "./data_play/" + name +fig_dir = "./figures_play/" + name # load data dof_pos_obs = np.load(data_dir + "/dof_pos_obs.npy")[0] diff --git a/gym/smooth_exploration/plot_train.py b/gym/smooth_exploration/plot_train.py index 8a45729f..58c50396 100644 --- a/gym/smooth_exploration/plot_train.py +++ b/gym/smooth_exploration/plot_train.py @@ -1,55 +1,93 @@ import numpy as np import matplotlib.pyplot as plt +import os +FOURIER = True SMOOTH = True -name = "ref_sample_8" -data_dir = "./data/train/" + name -fig_dir = "./figures/train" +SAMPLE_FREQ = 16 +STEPS = 1000 + +name = "ref_sample_16_len_1000" +data_dir = "./data_train/" + name +fig_dir = "./figures_train/" + name + +if not os.path.exists(fig_dir): + os.makedirs(fig_dir) # load data dof_pos_obs = np.load(data_dir + "/dof_pos_obs.npy")[0] dof_pos_target = np.load(data_dir + "/dof_pos_target.npy")[0] dof_vel = np.load(data_dir + "/dof_vel.npy")[0] torques = np.load(data_dir + "/torques.npy")[0] +terminated = np.load(data_dir + "/terminated.npy")[0] + + +# plot fourier trainsform +def plot_fourier(data, it): + fig_ft, axs_ft = plt.subplots(2, figsize=(10, 10)) + for i in range(3): + ft = np.fft.fft(data[:, i]) + ft_half = ft[: len(ft) // 2] + axs_ft[0].plot(np.abs(ft_half)) + axs_ft[1].plot(np.angle(ft_half)) + + axs_ft[0].set_title("FT Amplitude") + axs_ft[0].set_xlabel("Frequency") + axs_ft[0].set_ylabel("Amplitude") + axs_ft[0].legend(["idx 0", "idx 1", "idx 2"]) + axs_ft[1].set_title("FT Phase") + axs_ft[1].set_xlabel("Frequency") + axs_ft[1].set_ylabel("Phase") + axs_ft[1].legend(["idx 0", "idx 1", "idx 2"]) + + fig_ft.savefig(fig_dir + "/dof_pos_target_FT_it_" + str(it) + ".png") -iterations = range(0, dof_pos_obs.shape[0], 10) # plot data for each iteration -for it in iterations: +for it in range(0, dof_pos_obs.shape[0], 10): + # check if iteration terminated + terminate_idx = np.where(terminated[it, :, 0] == 1)[0] + if terminate_idx.size > 0: + n_steps = terminate_idx[0] + else: + n_steps = dof_pos_obs.shape[1] + print(n_steps) + + # generate figure fig, axs = plt.subplots(4, figsize=(10, 10)) plt.suptitle(name + " iteration " + str(it)) + axs[0].set_title("dof_pos_obs") for i in range(3): - axs[0].plot(dof_pos_obs[it, :, i]) - axs[0].set_title("dof_pos_obs") - axs[0].legend(["idx 0", "idx 1", "idx 2"]) - axs[0].set_xlabel("time steps") + axs[0].plot(dof_pos_obs[it, :n_steps, i]) + axs[1].set_title("dof_pos_target") for i in range(3): - axs[1].plot(dof_pos_target[it, :, i]) - axs[1].set_title("dof_pos_target") - axs[1].legend(["idx 0", "idx 1", "idx 2"]) - axs[1].set_xlabel("time steps") + axs[1].plot(dof_pos_target[it, :n_steps, i]) + if FOURIER and n_steps == STEPS: + plot_fourier(dof_pos_target[it, :n_steps, :], 
it) + axs[2].set_title("dof_vel") for i in range(3): - axs[2].plot(dof_vel[it, :, i]) - axs[2].set_title("dof_vel") - axs[2].legend(["idx 0", "idx 1", "idx 2"]) - axs[2].set_xlabel("time steps") + axs[2].plot(dof_vel[it, :n_steps, i]) + axs[3].set_title("torques") for i in range(3): - axs[3].plot(torques[it, :, i]) - axs[3].set_title("torques") - axs[3].legend(["idx 0", "idx 1", "idx 2"]) - axs[3].set_xlabel("time steps") + axs[3].plot(torques[it, :n_steps, i]) + + # format plots + for idx in range(4): + axs[idx].legend(["idx 0", "idx 1", "idx 2"]) + axs[idx].set_xlabel("time steps") + axs[idx].set_xlim([0, n_steps]) if SMOOTH: # plot vertical lines where noise is resampled - for x in range(0, dof_pos_obs.shape[1], 8): + for x in range(0, dof_pos_obs.shape[1], SAMPLE_FREQ): axs[0].axvline(x, color="r", linestyle="--") axs[1].axvline(x, color="r", linestyle="--") axs[2].axvline(x, color="r", linestyle="--") axs[3].axvline(x, color="r", linestyle="--") - plt.tight_layout() - plt.savefig(fig_dir + "/" + name + "_it_" + str(it) + ".png") + fig.tight_layout() + fig.savefig(fig_dir + "/" + name + "_it_" + str(it) + ".png") diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 86541f9b..0729b994 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -34,6 +34,13 @@ def learn(self, states_to_log_dict=None): for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") logger.tic("collection") + + # * Simulate environment and log states + if states_to_log_dict is not None: + it_idx = self.it - 1 + if it_idx % 10 == 0: + self.sim_and_log_states(states_to_log_dict, it_idx) + # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): @@ -53,17 +60,6 @@ def learn(self, states_to_log_dict=None): self.env.step() - # * Log states - # This continuously overwrites the rollout data, so when the state - # dict is written to a file it contains the last rollout for each - # iteration. - it_idx = self.it - 1 - if states_to_log_dict is not None: - for state in states_to_log_dict: - states_to_log_dict[state][0, it_idx, i, :] = getattr( - self.env, state - )[0, :] - actor_obs = self.get_noisy_obs( self.policy_cfg["actor_obs"], self.policy_cfg["noise"] ) @@ -171,3 +167,42 @@ def init_storage(self): critic_obs_shape=[num_critic_obs], action_shape=[num_actions], ) + + def sim_and_log_states(self, states_to_log_dict, it_idx): + # Simulate environment for as many steps as expected in the dict. + # Log states to the dict, as well as whether the env terminated. 
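        # Expected layout (as allocated in scripts/log_train.py): every entry has
        # shape (1, n_iterations, n_steps, dim), with dim = 1 for "terminated",
        # so shape[2] below is the number of rollout steps to simulate.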
+ steps = states_to_log_dict["terminated"].shape[2] + actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) + critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + + with torch.inference_mode(): + for i in range(steps): + sample_freq = self.policy_cfg["exploration_sample_freq"] + if self.policy_cfg["smooth_exploration"] and i % sample_freq == 0: + self.alg.actor_critic.actor.sample_weights( + batch_size=self.env.num_envs + ) + + actions = self.alg.act(actor_obs, critic_obs) + self.set_actions( + self.policy_cfg["actions"], + actions, + self.policy_cfg["disable_actions"], + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.policy_cfg["actor_obs"], self.policy_cfg["noise"] + ) + critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + + # Log states (just for the first env) + terminated = self.get_terminated()[0] + for state in states_to_log_dict: + if state == "terminated": + states_to_log_dict[state][0, it_idx, i, :] = terminated + else: + states_to_log_dict[state][0, it_idx, i, :] = getattr( + self.env, state + )[0, :] diff --git a/scripts/log_play.py b/scripts/log_play.py index adc1b598..f7fd68e8 100644 --- a/scripts/log_play.py +++ b/scripts/log_play.py @@ -66,7 +66,11 @@ def play(env, runner, train_cfg): # * set up logging log_file_path = os.path.join( - LEGGED_GYM_ROOT_DIR, "gym", "logs", "data", "play", protocol_name + ".npz" + LEGGED_GYM_ROOT_DIR, + "gym", + "smooth_exploration", + "data_play", + protocol_name + ".npz", ) if not os.path.exists(os.path.dirname(log_file_path)): os.makedirs(os.path.dirname(log_file_path)) diff --git a/scripts/log_train.py b/scripts/log_train.py index 918d0b8f..37fca0da 100644 --- a/scripts/log_train.py +++ b/scripts/log_train.py @@ -10,7 +10,7 @@ import numpy as np TRAIN_ITERATIONS = 100 -ROLLOUT_TIMESTEPS = 32 +ROLLOUT_TIMESTEPS = 1000 def create_logging_dict(runner): @@ -39,6 +39,11 @@ def create_logging_dict(runner): (1, TRAIN_ITERATIONS, ROLLOUT_TIMESTEPS, array_dim), device=runner.env.device, ) + + # log whether rollout terminated + states_to_log_dict["terminated"] = torch.zeros( + (1, TRAIN_ITERATIONS, ROLLOUT_TIMESTEPS, 1), device=runner.env.device + ) return states_to_log_dict @@ -65,7 +70,11 @@ def train(train_cfg, policy_runner): # * set up logging log_file_path = os.path.join( - LEGGED_GYM_ROOT_DIR, "gym", "logs", "data", "train", protocol_name + ".npz" + LEGGED_GYM_ROOT_DIR, + "gym", + "smooth_exploration", + "data_train", + protocol_name + ".npz", ) if not os.path.exists(os.path.dirname(log_file_path)): os.makedirs(os.path.dirname(log_file_path)) From 4994550b747b289011c91eddcfd40d4d2f59acff Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 19 Apr 2024 16:20:08 -0400 Subject: [PATCH 11/19] plot FT script --- gym/smooth_exploration/plot_ft.py | 56 +++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 gym/smooth_exploration/plot_ft.py diff --git a/gym/smooth_exploration/plot_ft.py b/gym/smooth_exploration/plot_ft.py new file mode 100644 index 00000000..5ad86b63 --- /dev/null +++ b/gym/smooth_exploration/plot_ft.py @@ -0,0 +1,56 @@ +import numpy as np +import matplotlib.pyplot as plt +import os + +SAMPLE_FREQ = 8 +STEPS = 500 + +smooth_name = "ref_sample_8_len_500" +baseline_name = "ref_baseline_len_500" + +smooth_data_dir = "./data_train/" + smooth_name +baseline_data_dir = "./data_train/" + baseline_name +fig_dir = "./figures_train/" + +if not os.path.exists(fig_dir): + os.makedirs(fig_dir) + +# load data +smooth_pos_target = np.load(smooth_data_dir + "/dof_pos_target.npy")[0] 
+baseline_pos_target = np.load(baseline_data_dir + "/dof_pos_target.npy")[0] +smooth_terminated = np.load(smooth_data_dir + "/terminated.npy")[0] +baseline_terminated = np.load(baseline_data_dir + "/terminated.npy")[0] + +# compute FFT averages +smooth_ffts = [[], [], []] +baseline_ffts = [[], [], []] +for it in range(0, smooth_pos_target.shape[0], 10): + # only use data that didn't terminate + if not np.any(smooth_terminated[it, :, 0]): + for idx in range(3): + fft = np.fft.fft(smooth_pos_target[it, :, idx]) + smooth_ffts[idx].append(fft[: len(fft) // 2]) + + if not np.any(baseline_terminated[it, :, 0]): + for idx in range(3): + fft = np.fft.fft(baseline_pos_target[it, :, idx]) + baseline_ffts[idx].append(fft[: len(fft) // 2]) + +print(f"Total smooth FFTS: {len(smooth_ffts[0])}") +print(f"Total baseline FFTS: {len(baseline_ffts[0])}") + +smooth_fft_means = [np.array(smooth_ffts[idx]).mean(axis=0) for idx in range(3)] +baseline_fft_means = [np.array(baseline_ffts[idx]).mean(axis=0) for idx in range(3)] + +# plot FFTs +fig, axs = plt.subplots(3, 1, figsize=(10, 10)) +for idx in range(3): + axs[idx].plot(np.abs(smooth_fft_means[idx])) + axs[idx].plot(np.abs(baseline_fft_means[idx])) + axs[idx].set_title(f"FT Amplitude idx {idx}") + axs[idx].set_xlabel("Frequency") + axs[idx].set_ylabel("Amplitude") + axs[idx].legend(["smooth", "baseline"]) + +fig.tight_layout() +fig.savefig(fig_dir + "/" + smooth_name + ".png") From 0205d99b33fc338c6ed3644ef55638fcc6bb1b5e Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 19 Apr 2024 18:47:36 -0400 Subject: [PATCH 12/19] added sweep config --- .../sweep_config_smooth_exploration.json | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 scripts/sweep_configs/sweep_config_smooth_exploration.json diff --git a/scripts/sweep_configs/sweep_config_smooth_exploration.json b/scripts/sweep_configs/sweep_config_smooth_exploration.json new file mode 100644 index 00000000..952282fe --- /dev/null +++ b/scripts/sweep_configs/sweep_config_smooth_exploration.json @@ -0,0 +1,19 @@ +{ + "method": "grid", + "metric": { + "name": "rewards/total_rewards", + "goal": "maximize" + }, + "run_cap": 10, + "parameters": { + "train_cfg.policy.exploration_sample_freq":{ + "values": [4, 8, 16, 32] + }, + "train_cfg.runner.num_steps_per_env":{ + "values": [32, 64] + }, + "train_cfg.runner.max_iterations":{ + "values": [800] + } + } +} \ No newline at end of file From 64b9b1446f4d63ca150db859d889b6d643a8b296 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Wed, 24 Apr 2024 13:56:26 -0400 Subject: [PATCH 13/19] update on policy and old policy runners, get nans for log_probs --- learning/algorithms/ppo2.py | 9 +++++++-- learning/modules/smooth_actor.py | 25 ++++++++++++++----------- learning/runners/BaseRunner.py | 7 +++++-- learning/runners/old_policy_runner.py | 18 ++++++++++++++++-- learning/runners/on_policy_runner.py | 12 +++++++----- 5 files changed, 49 insertions(+), 22 deletions(-) diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 5866052c..b6b4fec0 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -6,6 +6,7 @@ create_uniform_generator, compute_generalized_advantages, ) +from learning.modules import SmoothActor class PPO2: @@ -99,7 +100,7 @@ def update_actor(self, data): self.mean_surrogate_loss = 0 counter = 0 - self.actor.act(data["actor_obs"]) + self.actor.update_distribution(data["actor_obs"]) data["old_sigma_batch"] = self.actor.action_std.detach() data["old_mu_batch"] = 
self.actor.action_mean.detach() data["old_actions_log_prob_batch"] = self.actor.get_actions_log_prob( @@ -111,8 +112,12 @@ def update_actor(self, data): batch_size = total_data // self.num_mini_batches generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) for batch in generator: + # * Re-sample noise for smooth actor + if isinstance(self.actor, SmoothActor): + self.actor.sample_weights(batch_size) + # ! refactor how this is done - self.actor.act(batch["actor_obs"]) + self.actor.update_distribution(batch["actor_obs"]) actions_log_prob_batch = self.actor.get_actions_log_prob(batch["actions"]) mu_batch = self.actor.action_mean sigma_batch = self.actor.action_std diff --git a/learning/modules/smooth_actor.py b/learning/modules/smooth_actor.py index 91754aeb..dae53c9e 100644 --- a/learning/modules/smooth_actor.py +++ b/learning/modules/smooth_actor.py @@ -13,14 +13,13 @@ class SmoothActor(Actor): weights_dist: Normal _latent_sde: torch.Tensor - exploration_mat: torch.Tensor exploration_matrices: torch.Tensor def __init__( self, *args, full_std: bool = True, - use_exp_ln: bool = False, + use_exp_ln: bool = True, learn_features: bool = True, epsilon: float = 1e-6, log_std_init: float = 0.0, @@ -60,8 +59,6 @@ def sample_weights(self, batch_size=1): # Sample weights for the noise exploration matrix std = self.get_std() self.weights_dist = Normal(torch.zeros_like(std), std) - # Reparametrization trick to pass gradients - self.exploration_mat = self.weights_dist.rsample() # Pre-compute matrices in case of parallel exploration self.exploration_matrices = self.weights_dist.rsample((batch_size,)) @@ -87,12 +84,20 @@ def get_std(self): def update_distribution(self, observations): if self._normalize_obs: - observations = self.normalize(observations) + with torch.no_grad(): + observations = self.obs_rms(observations) # Get latent features and compute distribution self._latent_sde = self.latent_net(observations) if not self.learn_features: self._latent_sde = self._latent_sde.detach() - variance = torch.mm(self._latent_sde**2, self.get_std() ** 2) + if self._latent_sde.dim() == 2: + variance = torch.mm(self._latent_sde**2, self.get_std() ** 2) + elif self._latent_sde.dim() == 3: + variance = torch.einsum( + "abc,cd->abd", self._latent_sde**2, self.get_std() ** 2 + ) + else: + raise ValueError("Invalid latent_sde dimension") mean_actions = self.mean_actions_net(self._latent_sde) self.distribution = Normal(mean_actions, torch.sqrt(variance + self.epsilon)) @@ -107,7 +112,8 @@ def act(self, observations): def act_inference(self, observations): if self._normalize_obs: - observations = self.normalize(observations) + with torch.no_grad(): + observations = self.obs_rms(observations) latent_sde = self.latent_net(observations) mean_actions = self.mean_actions_net(latent_sde) return mean_actions @@ -116,12 +122,9 @@ def get_noise(self): latent_sde = self._latent_sde if not self.learn_features: latent_sde = latent_sde.detach() - # Default case: only one exploration matrix - if len(latent_sde) == 1 or len(latent_sde) != len(self.exploration_matrices): - return torch.mm(latent_sde, self.exploration_mat.to(latent_sde.device)) # Use batch matrix multiplication for efficient computation # (batch_size, n_features) -> (batch_size, 1, n_features) latent_sde = latent_sde.unsqueeze(dim=1) # (batch_size, 1, n_actions) - noise = torch.bmm(latent_sde, self.exploration_matrices) + noise = torch.bmm(latent_sde, self.exploration_matrices.to(latent_sde.device)) return noise.squeeze(dim=1) diff --git 
a/learning/runners/BaseRunner.py b/learning/runners/BaseRunner.py index 9e5ab2e3..39ce336d 100644 --- a/learning/runners/BaseRunner.py +++ b/learning/runners/BaseRunner.py @@ -1,6 +1,6 @@ import torch from learning.algorithms import * # noqa: F403 -from learning.modules import Actor, Critic +from learning.modules import Actor, Critic, SmoothActor from learning.utils import remove_zero_weighted_rewards @@ -22,7 +22,10 @@ def _set_up_alg(self): num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) num_actions = self.get_action_size(self.actor_cfg["actions"]) num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) - actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + if self.actor_cfg["smooth_exploration"]: + actor = SmoothActor(num_actor_obs, num_actions, **self.actor_cfg) + else: + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) critic = Critic(num_critic_obs, **self.critic_cfg) alg_class = eval(self.cfg["algorithm_class_name"]) self.alg = alg_class(actor, critic, device=self.device, **self.alg_cfg) diff --git a/learning/runners/old_policy_runner.py b/learning/runners/old_policy_runner.py index 42f019a3..8b58dbe4 100644 --- a/learning/runners/old_policy_runner.py +++ b/learning/runners/old_policy_runner.py @@ -4,7 +4,7 @@ from learning.utils import Logger from .BaseRunner import BaseRunner from learning.algorithms import PPO # noqa: F401 -from learning.modules import ActorCritic, Actor, Critic +from learning.modules import ActorCritic, Actor, Critic, SmoothActor logger = Logger() @@ -24,7 +24,10 @@ def _set_up_alg(self): num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) num_actions = self.get_action_size(self.actor_cfg["actions"]) num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) - actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + if self.actor_cfg["smooth_exploration"]: + actor = SmoothActor(num_actor_obs, num_actions, **self.actor_cfg) + else: + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) critic = Critic(num_critic_obs, **self.critic_cfg) actor_critic = ActorCritic(actor, critic) alg_class = eval(self.cfg["algorithm_class_name"]) @@ -42,6 +45,10 @@ def learn(self): self.save() + # * Initialize smooth exploration matrices + if self.actor_cfg["smooth_exploration"]: + self.alg.actor_critic.actor.sample_weights(batch_size=self.env.num_envs) + logger.tic("runtime") for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") @@ -49,6 +56,13 @@ def learn(self): # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): + # * Re-sample noise matrix for smooth exploration + sample_freq = self.actor_cfg["exploration_sample_freq"] + if self.actor_cfg["smooth_exploration"] and i % sample_freq == 0: + self.alg.actor_critic.actor.sample_weights( + batch_size=self.env.num_envs + ) + actions = self.alg.act(actor_obs, critic_obs) self.set_actions( self.actor_cfg["actions"], diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 3d38b48a..79f086d7 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -32,6 +32,10 @@ def learn(self, states_to_log_dict=None): tot_iter = self.it + self.num_learning_iterations self.save() + # * Initialize smooth exploration matrices + if self.actor_cfg["smooth_exploration"]: + self.alg.actor.sample_weights(batch_size=self.env.num_envs) + # * start up storage transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) transition.update( @@ -65,11 +69,9 @@ def learn(self, 
states_to_log_dict=None): with torch.inference_mode(): for i in range(self.num_steps_per_env): # * Re-sample noise matrix for smooth exploration - sample_freq = self.policy_cfg["exploration_sample_freq"] - if self.policy_cfg["smooth_exploration"] and i % sample_freq == 0: - self.alg.actor_critic.actor.sample_weights( - batch_size=self.env.num_envs - ) + sample_freq = self.actor_cfg["exploration_sample_freq"] + if self.actor_cfg["smooth_exploration"] and i % sample_freq == 0: + self.alg.actor.sample_weights(batch_size=self.env.num_envs) actions = self.alg.act(actor_obs, critic_obs) self.set_actions( From 91656e9b81d4a1722100d100337c63dd31d0cd56 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Thu, 25 Apr 2024 20:08:52 -0400 Subject: [PATCH 14/19] run 200 iterations before starting training, to burn in normalization --- learning/runners/on_policy_runner.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 79f086d7..fc754e17 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -54,6 +54,20 @@ def learn(self, states_to_log_dict=None): device=self.device, ) + # burn in observation normalization. + with torch.inference_mode(): + for _ in range(200): + actions = self.alg.act(actor_obs, critic_obs) + self.set_actions( + self.actor_cfg["actions"], + actions, + self.actor_cfg["disable_actions"], + ) + self.env.step() + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + logger.tic("runtime") for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") From f74e7feaa103c7ace373c55b9e95db0d0a21f38f Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 3 May 2024 14:37:30 -0400 Subject: [PATCH 15/19] update dummy input in export_network --- learning/modules/utils/neural_net.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/learning/modules/utils/neural_net.py b/learning/modules/utils/neural_net.py index 057e724f..c28762be 100644 --- a/learning/modules/utils/neural_net.py +++ b/learning/modules/utils/neural_net.py @@ -80,7 +80,7 @@ def export_network(network, network_name, path, num_inputs): model = copy.deepcopy(network).to("cpu") # To trace model, must be evaluated once with arbitrary input model.eval() - dummy_input = torch.rand((1, num_inputs)) + dummy_input = torch.rand((num_inputs)) model_traced = torch.jit.trace(model, dummy_input) torch.jit.save(model_traced, path_TS) torch.onnx.export(model_traced, dummy_input, path_onnx) From 18045e9289dec5f6d8c3cc74cacaf6b873489fe3 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Fri, 3 May 2024 18:50:12 -0400 Subject: [PATCH 16/19] good choice of params: sample 16, rollout 32, LR x1.1, des_kl 0.02 --- gym/envs/mini_cheetah/mini_cheetah_ref_config.py | 8 ++++---- learning/algorithms/ppo2.py | 4 ++-- learning/runners/on_policy_runner.py | 4 +++- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 2e108f16..a6f12e18 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -74,7 +74,7 @@ class actor: # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" smooth_exploration = True - exploration_sample_freq = 8 + exploration_sample_freq = 16 normalize_obs = True obs = [ @@ -151,17 +151,17 @@ class algorithm(MiniCheetahRunnerCfg.algorithm): num_mini_batches = 4 
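        # worked example: storage_size 2**17 with mini_batch_size 2**15 gives
        # 2**17 / 2**15 = 4 minibatches, matching num_mini_batches = 4 above
        # (assumption: the new storage-based sizes are meant to keep the same
        # minibatch count)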
storage_size = 2**17 # new mini_batch_size = 2**15 # new - learning_rate = 5.0e-5 + learning_rate = 0.002 schedule = "adaptive" # can be adaptive, fixed discount_horizon = 1.0 # [s] lam = 0.95 GAE_bootstrap_horizon = 2.0 # [s] - desired_kl = 0.01 + desired_kl = 0.02 max_grad_norm = 1.0 class runner(MiniCheetahRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah_ref" - max_iterations = 500 # number of policy updates + max_iterations = 800 # number of policy updates algorithm_class_name = "PPO2" num_steps_per_env = 32 diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index b6b4fec0..58945bb3 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -139,9 +139,9 @@ def update_actor(self, data): kl_mean = torch.mean(kl) if kl_mean > self.desired_kl * 2.0: - self.learning_rate = max(1e-5, self.learning_rate / 1.5) + self.learning_rate = max(2e-4, self.learning_rate / 1.1) elif kl_mean < self.desired_kl / 2.0 and kl_mean > 0.0: - self.learning_rate = min(1e-2, self.learning_rate * 1.5) + self.learning_rate = min(1e-2, self.learning_rate * 1.1) for param_group in self.optimizer.param_groups: # ! check this diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index fc754e17..1e885e81 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -167,7 +167,9 @@ def set_up_logger(self): ) logger.register_rewards(["total_rewards"]) logger.register_category( - "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] + "algorithm", + self.alg, + ["mean_value_loss", "mean_surrogate_loss", "learning_rate"], ) logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) From adfb0788fb61883f20c15dbacc93ec9010b471ac Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Sun, 5 May 2024 22:48:28 -0400 Subject: [PATCH 17/19] export latent network and update configs --- gym/envs/mini_cheetah/mini_cheetah_config.py | 24 +++++++++++-------- .../mini_cheetah/mini_cheetah_ref_config.py | 7 ++++-- learning/algorithms/ppo2.py | 13 ++++++++-- learning/modules/utils/neural_net.py | 10 +++++++- .../sweep_config_smooth_exploration.json | 3 --- 5 files changed, 39 insertions(+), 18 deletions(-) diff --git a/gym/envs/mini_cheetah/mini_cheetah_config.py b/gym/envs/mini_cheetah/mini_cheetah_config.py index 104e5f64..6631704c 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_config.py @@ -130,16 +130,17 @@ class actor: hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" - smooth_exploration = False + smooth_exploration = True + exploration_sample_freq = 16 obs = [ - "base_lin_vel", + # "base_lin_vel", "base_ang_vel", "projected_gravity", "commands", "dof_pos_obs", "dof_vel", - "dof_pos_target", + # "dof_pos_target", ] actions = ["dof_pos_target"] add_noise = True @@ -195,20 +196,23 @@ class algorithm(LeggedRobotRunnerCfg.algorithm): value_loss_coef = 1.0 use_clipped_value_loss = True clip_param = 0.2 - entropy_coef = 0.02 - num_learning_epochs = 4 + entropy_coef = 0.01 + num_learning_epochs = 6 # * mini batch size = num_envs*nsteps / nminibatches - num_mini_batches = 8 - learning_rate = 1.0e-5 - schedule = "adaptive" # can be adaptive or fixed + num_mini_batches = 4 discount_horizon = 1.0 # [s] # GAE_bootstrap_horizon = 2.0 # [s] - desired_kl = 0.01 + desired_kl = 0.02 max_grad_norm = 1.0 + # * Learning rate + learning_rate = 0.002 + schedule = "adaptive" # can be adaptive or fixed 
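        # Adaptive rule (as implemented in PPO2.update_actor in this patch): if the
        # mean KL exceeds 2 * desired_kl, the learning rate is divided by lr_ratio
        # (floored at lr_range[0]); if it falls below desired_kl / 2, it is
        # multiplied by lr_ratio (capped at lr_range[1]).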
+ lr_range = [2e-4, 1e-2] + lr_ratio = 1.3 class runner(LeggedRobotRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah" - max_iterations = 500 + max_iterations = 800 algorithm_class_name = "PPO2" num_steps_per_env = 32 diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index a6f12e18..4f50f7d2 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -151,13 +151,16 @@ class algorithm(MiniCheetahRunnerCfg.algorithm): num_mini_batches = 4 storage_size = 2**17 # new mini_batch_size = 2**15 # new - learning_rate = 0.002 - schedule = "adaptive" # can be adaptive, fixed discount_horizon = 1.0 # [s] lam = 0.95 GAE_bootstrap_horizon = 2.0 # [s] desired_kl = 0.02 max_grad_norm = 1.0 + # * Learning rate + learning_rate = 0.002 + schedule = "adaptive" # can be adaptive or fixed + lr_range = [3e-4, 1e-2] + lr_ratio = 1.3 class runner(MiniCheetahRunnerCfg.runner): run_name = "" diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 58945bb3..528ef882 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -27,6 +27,8 @@ def __init__( desired_kl=0.01, loss_fn="MSE", device="cpu", + lr_range=[1e-4, 1e-2], + lr_ratio=1.3, **kwargs, ): self.device = device @@ -34,6 +36,8 @@ def __init__( self.desired_kl = desired_kl self.schedule = schedule self.learning_rate = learning_rate + self.lr_range = lr_range + self.lr_ratio = lr_ratio # * PPO components self.actor = actor.to(self.device) @@ -137,11 +141,16 @@ def update_actor(self, data): axis=-1, ) kl_mean = torch.mean(kl) + lr_min, lr_max = self.lr_range if kl_mean > self.desired_kl * 2.0: - self.learning_rate = max(2e-4, self.learning_rate / 1.1) + self.learning_rate = max( + lr_min, self.learning_rate / self.lr_ratio + ) elif kl_mean < self.desired_kl / 2.0 and kl_mean > 0.0: - self.learning_rate = min(1e-2, self.learning_rate * 1.1) + self.learning_rate = min( + lr_max, self.learning_rate * self.lr_ratio + ) for param_group in self.optimizer.param_groups: # ! check this diff --git a/learning/modules/utils/neural_net.py b/learning/modules/utils/neural_net.py index c28762be..24ed3df7 100644 --- a/learning/modules/utils/neural_net.py +++ b/learning/modules/utils/neural_net.py @@ -62,7 +62,7 @@ def add_layer(layer_list, num_inputs, num_outputs, activation=None, dropout=0): layer_list.append(activation) -def export_network(network, network_name, path, num_inputs): +def export_network(network, network_name, path, num_inputs, latent=True): """ Thsi function traces and exports the given network module in .pt and .onnx file formats. 
These can be used for evaluation on other systems @@ -84,3 +84,11 @@ def export_network(network, network_name, path, num_inputs): model_traced = torch.jit.trace(model, dummy_input) torch.jit.save(model_traced, path_TS) torch.onnx.export(model_traced, dummy_input, path_onnx) + + if latent: + path_latent = os.path.join(path, network_name + "_latent.onnx") + model_latent = model.latent_net + model_latent.eval() + dummy_input = torch.rand((num_inputs)) + model_traced = torch.jit.trace(model_latent, dummy_input) + torch.onnx.export(model_traced, dummy_input, path_latent) diff --git a/scripts/sweep_configs/sweep_config_smooth_exploration.json b/scripts/sweep_configs/sweep_config_smooth_exploration.json index 952282fe..49fc4125 100644 --- a/scripts/sweep_configs/sweep_config_smooth_exploration.json +++ b/scripts/sweep_configs/sweep_config_smooth_exploration.json @@ -11,9 +11,6 @@ }, "train_cfg.runner.num_steps_per_env":{ "values": [32, 64] - }, - "train_cfg.runner.max_iterations":{ - "values": [800] } } } \ No newline at end of file From 8ff82ae26b1e73daf0f72dc76af7756aab12de83 Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Mon, 6 May 2024 11:13:00 -0400 Subject: [PATCH 18/19] export latent net with norm and log get_std --- learning/modules/smooth_actor.py | 9 ++++----- learning/modules/utils/neural_net.py | 2 +- learning/runners/on_policy_runner.py | 4 +++- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/learning/modules/smooth_actor.py b/learning/modules/smooth_actor.py index dae53c9e..5cb1da89 100644 --- a/learning/modules/smooth_actor.py +++ b/learning/modules/smooth_actor.py @@ -57,11 +57,12 @@ def __init__( def sample_weights(self, batch_size=1): # Sample weights for the noise exploration matrix - std = self.get_std() + std = self.get_std self.weights_dist = Normal(torch.zeros_like(std), std) # Pre-compute matrices in case of parallel exploration self.exploration_matrices = self.weights_dist.rsample((batch_size,)) + @property def get_std(self): # TODO[lm]: Check if this is ok, and can use action_std in ActorCritic normally if self.use_exp_ln: @@ -91,11 +92,9 @@ def update_distribution(self, observations): if not self.learn_features: self._latent_sde = self._latent_sde.detach() if self._latent_sde.dim() == 2: - variance = torch.mm(self._latent_sde**2, self.get_std() ** 2) + variance = torch.mm(self._latent_sde**2, self.get_std**2) elif self._latent_sde.dim() == 3: - variance = torch.einsum( - "abc,cd->abd", self._latent_sde**2, self.get_std() ** 2 - ) + variance = torch.einsum("abc,cd->abd", self._latent_sde**2, self.get_std**2) else: raise ValueError("Invalid latent_sde dimension") mean_actions = self.mean_actions_net(self._latent_sde) diff --git a/learning/modules/utils/neural_net.py b/learning/modules/utils/neural_net.py index 24ed3df7..034c13d4 100644 --- a/learning/modules/utils/neural_net.py +++ b/learning/modules/utils/neural_net.py @@ -87,7 +87,7 @@ def export_network(network, network_name, path, num_inputs, latent=True): if latent: path_latent = os.path.join(path, network_name + "_latent.onnx") - model_latent = model.latent_net + model_latent = torch.nn.Sequential(model.obs_rms, model.latent_net) model_latent.eval() dummy_input = torch.rand((num_inputs)) model_traced = torch.jit.trace(model_latent, dummy_input) diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 1e885e81..dded2739 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -171,7 +171,9 @@ def set_up_logger(self): 
self.alg, ["mean_value_loss", "mean_surrogate_loss", "learning_rate"], ) - logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) + logger.register_category( + "actor", self.alg.actor, ["action_std", "entropy", "get_std"] + ) logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) From 007ea933c6d72077c8582e44162a121490cc529d Mon Sep 17 00:00:00 2001 From: Lukas Molnar Date: Mon, 6 May 2024 14:22:30 -0400 Subject: [PATCH 19/19] export actor std to txt file --- learning/modules/utils/neural_net.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/learning/modules/utils/neural_net.py b/learning/modules/utils/neural_net.py index 034c13d4..6e6f41d6 100644 --- a/learning/modules/utils/neural_net.py +++ b/learning/modules/utils/neural_net.py @@ -1,6 +1,7 @@ import torch import os import copy +import numpy as np @@ -86,9 +87,16 @@ def export_network(network, network_name, path, num_inputs, latent=True): torch.onnx.export(model_traced, dummy_input, path_onnx) if latent: + # Export latent model path_latent = os.path.join(path, network_name + "_latent.onnx") model_latent = torch.nn.Sequential(model.obs_rms, model.latent_net) model_latent.eval() dummy_input = torch.rand((num_inputs)) model_traced = torch.jit.trace(model_latent, dummy_input) torch.onnx.export(model_traced, dummy_input, path_latent) + + # Save actor std of shape (num_actions, latent_dim) + # It is important that the shape is the same as the exploration matrix + path_std = os.path.join(path, network_name + "_std.txt") + std_transposed = model.get_std.detach().numpy().T + np.savetxt(path_std, std_transposed)
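A minimal, hypothetical deployment-side sketch (not part of the patches above) of how the exported artifacts could be combined to reproduce the gSDE-style exploration noise, mirroring SmoothActor.sample_weights and get_noise: noise = latent @ W with W ~ N(0, std). The file names, the use of onnxruntime, and the source of mean_action are assumptions.

import numpy as np
import onnxruntime as ort

# exported by export_network(..., latent=True)
latent_sess = ort.InferenceSession("policy_latent.onnx")
obs_name = latent_sess.get_inputs()[0].name

# std.txt is saved transposed as (num_actions, latent_dim); transpose it back to
# the (latent_dim, num_actions) convention of the exploration matrices.
std = np.loadtxt("policy_std.txt").T
rng = np.random.default_rng()


def sample_exploration_matrix():
    # W ~ N(0, std); resampled every exploration_sample_freq control steps
    return rng.normal(0.0, std)


def smooth_action(obs, mean_action, W):
    # latent features of the (normalized) observation, then state-dependent noise
    latent = latent_sess.run(None, {obs_name: obs.astype(np.float32)})[0]
    return mean_action + latent @ W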