diff --git a/.gitignore b/.gitignore index 0b68375a..c78e6696 100644 --- a/.gitignore +++ b/.gitignore @@ -81,3 +81,7 @@ ipython_config.py venv/ env.bak/ venv.bak/ + +# Smooth exploration +gym/smooth_exploration/data* +gym/smooth_exploration/figures* diff --git a/gym/envs/base/legged_robot_config.py b/gym/envs/base/legged_robot_config.py index 7e6cabc6..1f6462ce 100644 --- a/gym/envs/base/legged_robot_config.py +++ b/gym/envs/base/legged_robot_config.py @@ -239,6 +239,7 @@ class actor: # can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" normalize_obs = True + smooth_exploration = False obs = [ "observation_a", diff --git a/gym/envs/mini_cheetah/mini_cheetah_config.py b/gym/envs/mini_cheetah/mini_cheetah_config.py index 28c70123..6631704c 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_config.py @@ -130,15 +130,17 @@ class actor: hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + smooth_exploration = True + exploration_sample_freq = 16 obs = [ - "base_lin_vel", + # "base_lin_vel", "base_ang_vel", "projected_gravity", "commands", "dof_pos_obs", "dof_vel", - "dof_pos_target", + # "dof_pos_target", ] actions = ["dof_pos_target"] add_noise = True @@ -194,20 +196,23 @@ class algorithm(LeggedRobotRunnerCfg.algorithm): value_loss_coef = 1.0 use_clipped_value_loss = True clip_param = 0.2 - entropy_coef = 0.02 - num_learning_epochs = 4 + entropy_coef = 0.01 + num_learning_epochs = 6 # * mini batch size = num_envs*nsteps / nminibatches - num_mini_batches = 8 - learning_rate = 1.0e-5 - schedule = "adaptive" # can be adaptive or fixed + num_mini_batches = 4 discount_horizon = 1.0 # [s] # GAE_bootstrap_horizon = 2.0 # [s] - desired_kl = 0.01 + desired_kl = 0.02 max_grad_norm = 1.0 + # * Learning rate + learning_rate = 0.002 + schedule = "adaptive" # can be adaptive or fixed + lr_range = [2e-4, 1e-2] + lr_ratio = 1.3 class runner(LeggedRobotRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah" - max_iterations = 500 + max_iterations = 800 algorithm_class_name = "PPO2" num_steps_per_env = 32 diff --git a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py index a4acefa6..d5f767e8 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py @@ -166,6 +166,7 @@ class policy: critic_hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + smooth_exploration = False obs = [ "base_ang_vel", diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 3c92882d..4f50f7d2 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -73,6 +73,9 @@ class actor: hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + smooth_exploration = True + exploration_sample_freq = 16 + normalize_obs = True obs = [ "base_ang_vel", @@ -148,17 +151,20 @@ class algorithm(MiniCheetahRunnerCfg.algorithm): num_mini_batches = 4 storage_size = 2**17 # new mini_batch_size = 2**15 # new - learning_rate = 5.0e-5 - schedule = "adaptive" # can be adaptive, fixed discount_horizon = 1.0 # [s] lam = 0.95 GAE_bootstrap_horizon = 2.0 # [s] - desired_kl = 0.01 + desired_kl = 0.02 max_grad_norm = 1.0 + # * Learning rate + learning_rate = 0.002 + schedule = "adaptive" # can be adaptive or fixed 
+ lr_range = [3e-4, 1e-2] + lr_ratio = 1.3 class runner(MiniCheetahRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah_ref" - max_iterations = 500 # number of policy updates + max_iterations = 800 # number of policy updates algorithm_class_name = "PPO2" num_steps_per_env = 32 diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index 86a8d606..6f8eaa1d 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -187,6 +187,7 @@ class actor: critic_hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + smooth_exploration = False obs = [ "base_height", diff --git a/gym/smooth_exploration/plot_ft.py b/gym/smooth_exploration/plot_ft.py new file mode 100644 index 00000000..5ad86b63 --- /dev/null +++ b/gym/smooth_exploration/plot_ft.py @@ -0,0 +1,56 @@ +import numpy as np +import matplotlib.pyplot as plt +import os + +SAMPLE_FREQ = 8 +STEPS = 500 + +smooth_name = "ref_sample_8_len_500" +baseline_name = "ref_baseline_len_500" + +smooth_data_dir = "./data_train/" + smooth_name +baseline_data_dir = "./data_train/" + baseline_name +fig_dir = "./figures_train/" + +if not os.path.exists(fig_dir): + os.makedirs(fig_dir) + +# load data +smooth_pos_target = np.load(smooth_data_dir + "/dof_pos_target.npy")[0] +baseline_pos_target = np.load(baseline_data_dir + "/dof_pos_target.npy")[0] +smooth_terminated = np.load(smooth_data_dir + "/terminated.npy")[0] +baseline_terminated = np.load(baseline_data_dir + "/terminated.npy")[0] + +# compute FFT averages +smooth_ffts = [[], [], []] +baseline_ffts = [[], [], []] +for it in range(0, smooth_pos_target.shape[0], 10): + # only use data that didn't terminate + if not np.any(smooth_terminated[it, :, 0]): + for idx in range(3): + fft = np.fft.fft(smooth_pos_target[it, :, idx]) + smooth_ffts[idx].append(fft[: len(fft) // 2]) + + if not np.any(baseline_terminated[it, :, 0]): + for idx in range(3): + fft = np.fft.fft(baseline_pos_target[it, :, idx]) + baseline_ffts[idx].append(fft[: len(fft) // 2]) + +print(f"Total smooth FFTS: {len(smooth_ffts[0])}") +print(f"Total baseline FFTS: {len(baseline_ffts[0])}") + +smooth_fft_means = [np.array(smooth_ffts[idx]).mean(axis=0) for idx in range(3)] +baseline_fft_means = [np.array(baseline_ffts[idx]).mean(axis=0) for idx in range(3)] + +# plot FFTs +fig, axs = plt.subplots(3, 1, figsize=(10, 10)) +for idx in range(3): + axs[idx].plot(np.abs(smooth_fft_means[idx])) + axs[idx].plot(np.abs(baseline_fft_means[idx])) + axs[idx].set_title(f"FT Amplitude idx {idx}") + axs[idx].set_xlabel("Frequency") + axs[idx].set_ylabel("Amplitude") + axs[idx].legend(["smooth", "baseline"]) + +fig.tight_layout() +fig.savefig(fig_dir + "/" + smooth_name + ".png") diff --git a/gym/smooth_exploration/plot_play.py b/gym/smooth_exploration/plot_play.py new file mode 100644 index 00000000..5768f91a --- /dev/null +++ b/gym/smooth_exploration/plot_play.py @@ -0,0 +1,38 @@ +import numpy as np +import matplotlib.pyplot as plt + +name = "mini_cheetah_ref" +data_dir = "./data_play/" + name +fig_dir = "./figures_play/" + name + +# load data +dof_pos_obs = np.load(data_dir + "/dof_pos_obs.npy")[0] +dof_pos_target = np.load(data_dir + "/dof_pos_target.npy")[0] +dof_vel = np.load(data_dir + "/dof_vel.npy")[0] + +# plot data +n_steps = 200 +fig, axs = plt.subplots(3, figsize=(10, 10)) +plt.suptitle(name) + +for i in range(3): + axs[0].plot(dof_pos_obs[:n_steps, i]) +axs[0].set_title("dof_pos_obs") 
+axs[0].legend(["idx 0", "idx 1", "idx 2"]) +axs[0].set_xlabel("time steps") + +for i in range(3): + axs[1].plot(dof_pos_target[:n_steps, i]) +axs[1].set_title("dof_pos_target") +axs[1].legend(["idx 0", "idx 1", "idx 2"]) +axs[1].set_xlabel("time steps") + +for i in range(3): + axs[2].plot(dof_vel[:n_steps, i]) +axs[2].set_title("dof_vel") +axs[2].legend(["idx 0", "idx 1", "idx 2"]) +axs[2].set_xlabel("time steps") + +plt.tight_layout() +plt.savefig(fig_dir + "/" + name + ".png") +plt.show() diff --git a/gym/smooth_exploration/plot_train.py b/gym/smooth_exploration/plot_train.py new file mode 100644 index 00000000..58c50396 --- /dev/null +++ b/gym/smooth_exploration/plot_train.py @@ -0,0 +1,93 @@ +import numpy as np +import matplotlib.pyplot as plt +import os + +FOURIER = True +SMOOTH = True +SAMPLE_FREQ = 16 +STEPS = 1000 + +name = "ref_sample_16_len_1000" +data_dir = "./data_train/" + name +fig_dir = "./figures_train/" + name + +if not os.path.exists(fig_dir): + os.makedirs(fig_dir) + +# load data +dof_pos_obs = np.load(data_dir + "/dof_pos_obs.npy")[0] +dof_pos_target = np.load(data_dir + "/dof_pos_target.npy")[0] +dof_vel = np.load(data_dir + "/dof_vel.npy")[0] +torques = np.load(data_dir + "/torques.npy")[0] +terminated = np.load(data_dir + "/terminated.npy")[0] + + +# plot fourier trainsform +def plot_fourier(data, it): + fig_ft, axs_ft = plt.subplots(2, figsize=(10, 10)) + for i in range(3): + ft = np.fft.fft(data[:, i]) + ft_half = ft[: len(ft) // 2] + axs_ft[0].plot(np.abs(ft_half)) + axs_ft[1].plot(np.angle(ft_half)) + + axs_ft[0].set_title("FT Amplitude") + axs_ft[0].set_xlabel("Frequency") + axs_ft[0].set_ylabel("Amplitude") + axs_ft[0].legend(["idx 0", "idx 1", "idx 2"]) + axs_ft[1].set_title("FT Phase") + axs_ft[1].set_xlabel("Frequency") + axs_ft[1].set_ylabel("Phase") + axs_ft[1].legend(["idx 0", "idx 1", "idx 2"]) + + fig_ft.savefig(fig_dir + "/dof_pos_target_FT_it_" + str(it) + ".png") + + +# plot data for each iteration +for it in range(0, dof_pos_obs.shape[0], 10): + # check if iteration terminated + terminate_idx = np.where(terminated[it, :, 0] == 1)[0] + if terminate_idx.size > 0: + n_steps = terminate_idx[0] + else: + n_steps = dof_pos_obs.shape[1] + print(n_steps) + + # generate figure + fig, axs = plt.subplots(4, figsize=(10, 10)) + plt.suptitle(name + " iteration " + str(it)) + + axs[0].set_title("dof_pos_obs") + for i in range(3): + axs[0].plot(dof_pos_obs[it, :n_steps, i]) + + axs[1].set_title("dof_pos_target") + for i in range(3): + axs[1].plot(dof_pos_target[it, :n_steps, i]) + if FOURIER and n_steps == STEPS: + plot_fourier(dof_pos_target[it, :n_steps, :], it) + + axs[2].set_title("dof_vel") + for i in range(3): + axs[2].plot(dof_vel[it, :n_steps, i]) + + axs[3].set_title("torques") + for i in range(3): + axs[3].plot(torques[it, :n_steps, i]) + + # format plots + for idx in range(4): + axs[idx].legend(["idx 0", "idx 1", "idx 2"]) + axs[idx].set_xlabel("time steps") + axs[idx].set_xlim([0, n_steps]) + + if SMOOTH: + # plot vertical lines where noise is resampled + for x in range(0, dof_pos_obs.shape[1], SAMPLE_FREQ): + axs[0].axvline(x, color="r", linestyle="--") + axs[1].axvline(x, color="r", linestyle="--") + axs[2].axvline(x, color="r", linestyle="--") + axs[3].axvline(x, color="r", linestyle="--") + + fig.tight_layout() + fig.savefig(fig_dir + "/" + name + "_it_" + str(it) + ".png") diff --git a/learning/algorithms/ppo.py b/learning/algorithms/ppo.py index 92898ad5..ff09d788 100644 --- a/learning/algorithms/ppo.py +++ b/learning/algorithms/ppo.py @@ -34,7 
+34,7 @@ import torch.nn as nn import torch.optim as optim -from learning.modules import ActorCritic +from learning.modules import ActorCritic, SmoothActor from learning.storage import RolloutStorage @@ -162,7 +162,11 @@ def update(self): old_mu_batch, old_sigma_batch, ) in generator: - self.actor_critic.act(obs_batch) + # TODO[lm]: Look into resampling noise here, gSDE paper seems to do it. + if isinstance(self.actor_critic.actor, SmoothActor): + batch_size = obs_batch.shape[0] + self.actor_critic.actor.sample_weights(batch_size) + self.actor_critic.actor.update_distribution(obs_batch) actions_log_prob_batch = self.actor_critic.get_actions_log_prob( actions_batch ) diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 5866052c..528ef882 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -6,6 +6,7 @@ create_uniform_generator, compute_generalized_advantages, ) +from learning.modules import SmoothActor class PPO2: @@ -26,6 +27,8 @@ def __init__( desired_kl=0.01, loss_fn="MSE", device="cpu", + lr_range=[1e-4, 1e-2], + lr_ratio=1.3, **kwargs, ): self.device = device @@ -33,6 +36,8 @@ def __init__( self.desired_kl = desired_kl self.schedule = schedule self.learning_rate = learning_rate + self.lr_range = lr_range + self.lr_ratio = lr_ratio # * PPO components self.actor = actor.to(self.device) @@ -99,7 +104,7 @@ def update_actor(self, data): self.mean_surrogate_loss = 0 counter = 0 - self.actor.act(data["actor_obs"]) + self.actor.update_distribution(data["actor_obs"]) data["old_sigma_batch"] = self.actor.action_std.detach() data["old_mu_batch"] = self.actor.action_mean.detach() data["old_actions_log_prob_batch"] = self.actor.get_actions_log_prob( @@ -111,8 +116,12 @@ def update_actor(self, data): batch_size = total_data // self.num_mini_batches generator = create_uniform_generator(data, batch_size, self.num_learning_epochs) for batch in generator: + # * Re-sample noise for smooth actor + if isinstance(self.actor, SmoothActor): + self.actor.sample_weights(batch_size) + # ! refactor how this is done - self.actor.act(batch["actor_obs"]) + self.actor.update_distribution(batch["actor_obs"]) actions_log_prob_batch = self.actor.get_actions_log_prob(batch["actions"]) mu_batch = self.actor.action_mean sigma_batch = self.actor.action_std @@ -132,11 +141,16 @@ def update_actor(self, data): axis=-1, ) kl_mean = torch.mean(kl) + lr_min, lr_max = self.lr_range if kl_mean > self.desired_kl * 2.0: - self.learning_rate = max(1e-5, self.learning_rate / 1.5) + self.learning_rate = max( + lr_min, self.learning_rate / self.lr_ratio + ) elif kl_mean < self.desired_kl / 2.0 and kl_mean > 0.0: - self.learning_rate = min(1e-2, self.learning_rate * 1.5) + self.learning_rate = min( + lr_max, self.learning_rate * self.lr_ratio + ) for param_group in self.optimizer.param_groups: # ! 
check this diff --git a/learning/modules/__init__.py b/learning/modules/__init__.py index 3caf5fec..410e2f25 100644 --- a/learning/modules/__init__.py +++ b/learning/modules/__init__.py @@ -34,3 +34,4 @@ from .state_estimator import StateEstimatorNN from .actor import Actor from .critic import Critic +from .smooth_actor import SmoothActor diff --git a/learning/modules/actor.py b/learning/modules/actor.py index fbaa6868..9fc37c3b 100644 --- a/learning/modules/actor.py +++ b/learning/modules/actor.py @@ -5,6 +5,8 @@ from .utils import export_network from .utils import RunningMeanStd +from gym import LEGGED_GYM_ROOT_DIR + class Actor(nn.Module): def __init__( @@ -25,7 +27,11 @@ def __init__( self.num_obs = num_obs self.num_actions = num_actions - self.NN = create_MLP(num_obs, num_actions, hidden_dims, activation) + self.hidden_dims = hidden_dims + self.activation = activation + self.NN = create_MLP( + num_obs, num_actions, hidden_dims, activation, latent=False + ) # Action noise self.std = nn.Parameter(init_noise_std * torch.ones(num_actions)) @@ -33,6 +39,9 @@ def __init__( # disable args validation for speedup Normal.set_default_validate_args = False + # Debug mode for plotting + self.debug = False + @property def action_mean(self): return self.distribution.mean @@ -51,7 +60,12 @@ def update_distribution(self, observations): def act(self, observations): self.update_distribution(observations) - return self.distribution.sample() + sample = self.distribution.sample() + if self.debug: + mean = self.distribution.mean + path = f"{LEGGED_GYM_ROOT_DIR}/plots/distribution_baseline.csv" + self.log_actions(mean[0][2], sample[0][2], path) + return sample def get_actions_log_prob(self, actions): return self.distribution.log_prob(actions).sum(dim=-1) @@ -67,3 +81,7 @@ def forward(self, observations): def export(self, path): export_network(self, "policy", path, self.num_obs) + + def log_actions(self, mean, sample, path): + with open(path, "a") as f: + f.write(str(mean.item()) + ", " + str(sample.item()) + "\n") diff --git a/learning/modules/critic.py b/learning/modules/critic.py index 732f8eda..3ee930dc 100644 --- a/learning/modules/critic.py +++ b/learning/modules/critic.py @@ -15,7 +15,7 @@ def __init__( ): super().__init__() - self.NN = create_MLP(num_obs, 1, hidden_dims, activation) + self.NN = create_MLP(num_obs, 1, hidden_dims, activation, latent=False) self._normalize_obs = normalize_obs if self._normalize_obs: self.obs_rms = RunningMeanStd(num_obs) diff --git a/learning/modules/smooth_actor.py b/learning/modules/smooth_actor.py new file mode 100644 index 00000000..5cb1da89 --- /dev/null +++ b/learning/modules/smooth_actor.py @@ -0,0 +1,129 @@ +import torch +import torch.nn as nn +from torch.distributions import Normal + +from .actor import Actor +from .utils import create_MLP + +from gym import LEGGED_GYM_ROOT_DIR + + +# The following implementation is based on the gSDE paper. 
See code: +# https://github.com/DLR-RM/stable-baselines3/blob/56f20e40a2206bbb16501a0f600e29ce1b112ef1/stable_baselines3/common/distributions.py#L421C7-L421C38 +class SmoothActor(Actor): + weights_dist: Normal + _latent_sde: torch.Tensor + exploration_matrices: torch.Tensor + + def __init__( + self, + *args, + full_std: bool = True, + use_exp_ln: bool = True, + learn_features: bool = True, + epsilon: float = 1e-6, + log_std_init: float = 0.0, + **kwargs, + ): + super().__init__(*args, **kwargs) + self.full_std = full_std + self.use_exp_ln = use_exp_ln + self.learn_features = learn_features + self.epsilon = epsilon + self.log_std_init = log_std_init + + # Create latent NN and last layer + self.latent_net = create_MLP( + self.num_obs, + self.num_actions, + self.hidden_dims, + self.activation, + latent=True, + ) + self.latent_dim = self.hidden_dims[-1] + self.mean_actions_net = nn.Linear(self.latent_dim, self.num_actions) + # Reduce the number of parameters if needed + if self.full_std: + log_std = torch.ones(self.latent_dim, self.num_actions) + else: + log_std = torch.ones(self.latent_dim, 1) + self.log_std = nn.Parameter(log_std * self.log_std_init, requires_grad=True) + # Sample an exploration matrix + self.sample_weights() + self.distribution = None + + # Debug mode for plotting + self.debug = False + + def sample_weights(self, batch_size=1): + # Sample weights for the noise exploration matrix + std = self.get_std + self.weights_dist = Normal(torch.zeros_like(std), std) + # Pre-compute matrices in case of parallel exploration + self.exploration_matrices = self.weights_dist.rsample((batch_size,)) + + @property + def get_std(self): + # TODO[lm]: Check if this is ok, and can use action_std in ActorCritic normally + if self.use_exp_ln: + # From gSDE paper, it allows to keep variance + # above zero and prevent it from growing too fast + below_threshold = torch.exp(self.log_std) * (self.log_std <= 0) + # Avoid NaN: zeros values that are below zero + safe_log_std = self.log_std * (self.log_std > 0) + self.epsilon + above_threshold = (torch.log1p(safe_log_std) + 1.0) * (self.log_std > 0) + std = below_threshold + above_threshold + else: + # Use normal exponential + std = torch.exp(self.log_std) + + if self.full_std: + return std + assert self.latent_dim is not None + # Reduce the number of parameters: + return torch.ones(self.latent_dim, 1).to(self.log_std.device) * std + + def update_distribution(self, observations): + if self._normalize_obs: + with torch.no_grad(): + observations = self.obs_rms(observations) + # Get latent features and compute distribution + self._latent_sde = self.latent_net(observations) + if not self.learn_features: + self._latent_sde = self._latent_sde.detach() + if self._latent_sde.dim() == 2: + variance = torch.mm(self._latent_sde**2, self.get_std**2) + elif self._latent_sde.dim() == 3: + variance = torch.einsum("abc,cd->abd", self._latent_sde**2, self.get_std**2) + else: + raise ValueError("Invalid latent_sde dimension") + mean_actions = self.mean_actions_net(self._latent_sde) + self.distribution = Normal(mean_actions, torch.sqrt(variance + self.epsilon)) + + def act(self, observations): + self.update_distribution(observations) + mean = self.distribution.mean + sample = mean + self.get_noise() + if self.debug: + path = f"{LEGGED_GYM_ROOT_DIR}/plots/distribution_smooth.csv" + self.log_actions(mean[0][2], sample[0][2], path) + return sample + + def act_inference(self, observations): + if self._normalize_obs: + with torch.no_grad(): + observations = 
self.obs_rms(observations) + latent_sde = self.latent_net(observations) + mean_actions = self.mean_actions_net(latent_sde) + return mean_actions + + def get_noise(self): + latent_sde = self._latent_sde + if not self.learn_features: + latent_sde = latent_sde.detach() + # Use batch matrix multiplication for efficient computation + # (batch_size, n_features) -> (batch_size, 1, n_features) + latent_sde = latent_sde.unsqueeze(dim=1) + # (batch_size, 1, n_actions) + noise = torch.bmm(latent_sde, self.exploration_matrices.to(latent_sde.device)) + return noise.squeeze(dim=1) diff --git a/learning/modules/utils/neural_net.py b/learning/modules/utils/neural_net.py index 5fd6ab3b..6e6f41d6 100644 --- a/learning/modules/utils/neural_net.py +++ b/learning/modules/utils/neural_net.py @@ -1,9 +1,12 @@ import torch import os import copy +import numpy as np -def create_MLP(num_inputs, num_outputs, hidden_dims, activation, dropouts=None): +def create_MLP( + num_inputs, num_outputs, hidden_dims, activation, dropouts=None, latent=False +): activation = get_activation(activation) if dropouts is None: @@ -15,8 +18,12 @@ def create_MLP(num_inputs, num_outputs, hidden_dims, activation, dropouts=None): else: add_layer(layers, num_inputs, hidden_dims[0], activation, dropouts[0]) for i in range(len(hidden_dims)): + # TODO[lm]: Could also create a separate function that gives the latent + # representation used for smooth exploration (but if it doesn't mess up + # anything, this is simpler) if i == len(hidden_dims) - 1: - add_layer(layers, hidden_dims[i], num_outputs) + if not latent: + add_layer(layers, hidden_dims[i], num_outputs) else: add_layer( layers, @@ -56,7 +63,7 @@ def add_layer(layer_list, num_inputs, num_outputs, activation=None, dropout=0): layer_list.append(activation) -def export_network(network, network_name, path, num_inputs): +def export_network(network, network_name, path, num_inputs, latent=True): """ This function traces and exports the given network module in .pt and .onnx file formats. 
These can be used for evaluation on other systems @@ -74,7 +81,22 @@ def export_network(network, network_name, path, num_inputs): model = copy.deepcopy(network).to("cpu") # To trace model, must be evaluated once with arbitrary input model.eval() - dummy_input = torch.rand((1, num_inputs)) + dummy_input = torch.rand((num_inputs)) model_traced = torch.jit.trace(model, dummy_input) torch.jit.save(model_traced, path_TS) torch.onnx.export(model_traced, dummy_input, path_onnx) + + if latent: + # Export latent model + path_latent = os.path.join(path, network_name + "_latent.onnx") + model_latent = torch.nn.Sequential(model.obs_rms, model.latent_net) + model_latent.eval() + dummy_input = torch.rand((num_inputs)) + model_traced = torch.jit.trace(model_latent, dummy_input) + torch.onnx.export(model_traced, dummy_input, path_latent) + + # Save actor std of shape (num_actions, latent_dim) + # It is important that the shape is the same as the exploration matrix + path_std = os.path.join(path, network_name + "_std.txt") + std_transposed = model.get_std.numpy().T + np.savetxt(path_std, std_transposed) diff --git a/learning/runners/BaseRunner.py b/learning/runners/BaseRunner.py index 9e5ab2e3..39ce336d 100644 --- a/learning/runners/BaseRunner.py +++ b/learning/runners/BaseRunner.py @@ -1,6 +1,6 @@ import torch from learning.algorithms import * # noqa: F403 -from learning.modules import Actor, Critic +from learning.modules import Actor, Critic, SmoothActor from learning.utils import remove_zero_weighted_rewards @@ -22,7 +22,10 @@ def _set_up_alg(self): num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) num_actions = self.get_action_size(self.actor_cfg["actions"]) num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) - actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + if self.actor_cfg["smooth_exploration"]: + actor = SmoothActor(num_actor_obs, num_actions, **self.actor_cfg) + else: + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) critic = Critic(num_critic_obs, **self.critic_cfg) alg_class = eval(self.cfg["algorithm_class_name"]) self.alg = alg_class(actor, critic, device=self.device, **self.alg_cfg) diff --git a/learning/runners/old_policy_runner.py b/learning/runners/old_policy_runner.py index 42f019a3..8b58dbe4 100644 --- a/learning/runners/old_policy_runner.py +++ b/learning/runners/old_policy_runner.py @@ -4,7 +4,7 @@ from learning.utils import Logger from .BaseRunner import BaseRunner from learning.algorithms import PPO # noqa: F401 -from learning.modules import ActorCritic, Actor, Critic +from learning.modules import ActorCritic, Actor, Critic, SmoothActor logger = Logger() @@ -24,7 +24,10 @@ def _set_up_alg(self): num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) num_actions = self.get_action_size(self.actor_cfg["actions"]) num_critic_obs = self.get_obs_size(self.critic_cfg["obs"]) - actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) + if self.actor_cfg["smooth_exploration"]: + actor = SmoothActor(num_actor_obs, num_actions, **self.actor_cfg) + else: + actor = Actor(num_actor_obs, num_actions, **self.actor_cfg) critic = Critic(num_critic_obs, **self.critic_cfg) actor_critic = ActorCritic(actor, critic) alg_class = eval(self.cfg["algorithm_class_name"]) @@ -42,6 +45,10 @@ def learn(self): self.save() + # * Initialize smooth exploration matrices + if self.actor_cfg["smooth_exploration"]: + self.alg.actor_critic.actor.sample_weights(batch_size=self.env.num_envs) + logger.tic("runtime") for self.it in range(self.it + 1, tot_iter + 1): 
logger.tic("iteration") @@ -49,6 +56,13 @@ def learn(self): # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): + # * Re-sample noise matrix for smooth exploration + sample_freq = self.actor_cfg["exploration_sample_freq"] + if self.actor_cfg["smooth_exploration"] and i % sample_freq == 0: + self.alg.actor_critic.actor.sample_weights( + batch_size=self.env.num_envs + ) + actions = self.alg.act(actor_obs, critic_obs) self.set_actions( self.actor_cfg["actions"], diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 2cd6a9d8..dded2739 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -21,7 +21,7 @@ def __init__(self, env, train_cfg, device="cpu"): self.device, ) - def learn(self): + def learn(self, states_to_log_dict=None): self.set_up_logger() rewards_dict = {} @@ -32,6 +32,10 @@ def learn(self): tot_iter = self.it + self.num_learning_iterations self.save() + # * Initialize smooth exploration matrices + if self.actor_cfg["smooth_exploration"]: + self.alg.actor.sample_weights(batch_size=self.env.num_envs) + # * start up storage transition = TensorDict({}, batch_size=self.env.num_envs, device=self.device) transition.update( @@ -50,13 +54,39 @@ def learn(self): device=self.device, ) + # burn in observation normalization. + with torch.inference_mode(): + for _ in range(200): + actions = self.alg.act(actor_obs, critic_obs) + self.set_actions( + self.actor_cfg["actions"], + actions, + self.actor_cfg["disable_actions"], + ) + self.env.step() + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + logger.tic("runtime") for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") logger.tic("collection") + + # * Simulate environment and log states + if states_to_log_dict is not None: + it_idx = self.it - 1 + if it_idx % 10 == 0: + self.sim_and_log_states(states_to_log_dict, it_idx) + # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): + # * Re-sample noise matrix for smooth exploration + sample_freq = self.actor_cfg["exploration_sample_freq"] + if self.actor_cfg["smooth_exploration"] and i % sample_freq == 0: + self.alg.actor.sample_weights(batch_size=self.env.num_envs) + actions = self.alg.act(actor_obs, critic_obs) self.set_actions( self.actor_cfg["actions"], @@ -137,9 +167,13 @@ def set_up_logger(self): ) logger.register_rewards(["total_rewards"]) logger.register_category( - "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] + "algorithm", + self.alg, + ["mean_value_loss", "mean_surrogate_loss", "learning_rate"], + ) + logger.register_category( + "actor", self.alg.actor, ["action_std", "entropy", "get_std"] ) - logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) @@ -178,3 +212,42 @@ def get_inference_actions(self): def export(self, path): self.alg.actor.export(path) + + def sim_and_log_states(self, states_to_log_dict, it_idx): + # Simulate environment for as many steps as expected in the dict. + # Log states to the dict, as well as whether the env terminated. 
+ steps = states_to_log_dict["terminated"].shape[2] + actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) + critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + + with torch.inference_mode(): + for i in range(steps): + sample_freq = self.policy_cfg["exploration_sample_freq"] + if self.policy_cfg["smooth_exploration"] and i % sample_freq == 0: + self.alg.actor_critic.actor.sample_weights( + batch_size=self.env.num_envs + ) + + actions = self.alg.act(actor_obs, critic_obs) + self.set_actions( + self.policy_cfg["actions"], + actions, + self.policy_cfg["disable_actions"], + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.policy_cfg["actor_obs"], self.policy_cfg["noise"] + ) + critic_obs = self.get_obs(self.policy_cfg["critic_obs"]) + + # Log states (just for the first env) + terminated = self.get_terminated()[0] + for state in states_to_log_dict: + if state == "terminated": + states_to_log_dict[state][0, it_idx, i, :] = terminated + else: + states_to_log_dict[state][0, it_idx, i, :] = getattr( + self.env, state + )[0, :] diff --git a/scripts/log_play.py b/scripts/log_play.py new file mode 100644 index 00000000..f7fd68e8 --- /dev/null +++ b/scripts/log_play.py @@ -0,0 +1,121 @@ +from gym.envs import __init__ # noqa: F401 +from gym.utils import get_args, task_registry +from gym.utils import KeyboardInterface +from gym.utils import VisualizationRecorder +from gym import LEGGED_GYM_ROOT_DIR + +# torch needs to be imported after isaacgym imports in local source +import torch +import os +import numpy as np + +TEST_TOTAL_TIMESTEPS = 1000 + + +def create_logging_dict(runner, test_total_timesteps): + states_to_log = [ + "dof_pos_target", + "dof_pos_obs", + "dof_vel", + "torques", + "commands", + # 'base_lin_vel', + # 'base_ang_vel', + # 'oscillators', + # 'grf', + # 'base_height' + ] + + states_to_log_dict = {} + + for state in states_to_log: + array_dim = runner.get_obs_size( + [ + state, + ] + ) + states_to_log_dict[state] = torch.zeros( + (1, test_total_timesteps, array_dim), device=runner.env.device + ) + return states_to_log_dict + + +def setup(args): + env_cfg, train_cfg = task_registry.create_cfgs(args) + env_cfg.env.num_envs = min(env_cfg.env.num_envs, 16) + if hasattr(env_cfg, "push_robots"): + env_cfg.push_robots.toggle = False + if hasattr(env_cfg, "commands"): + env_cfg.commands.resampling_time = 9999 + env_cfg.env.episode_length_s = 9999 + env_cfg.env.num_projectiles = 20 + task_registry.make_gym_and_sim() + env = task_registry.make_env(args.task, env_cfg) + env.cfg.init_state.reset_mode = "reset_to_basic" + train_cfg.runner.resume = True + train_cfg.logging.enable_local_saving = False + runner = task_registry.make_alg_runner(env, train_cfg) + + # * switch to evaluation mode (dropout for example) + runner.switch_to_eval() + return env, runner, train_cfg + + +def play(env, runner, train_cfg): + protocol_name = train_cfg.runner.experiment_name + + # * set up logging + log_file_path = os.path.join( + LEGGED_GYM_ROOT_DIR, + "gym", + "smooth_exploration", + "data_play", + protocol_name + ".npz", + ) + if not os.path.exists(os.path.dirname(log_file_path)): + os.makedirs(os.path.dirname(log_file_path)) + + states_to_log_dict = create_logging_dict(runner, TEST_TOTAL_TIMESTEPS) + + # * set up recording + if env.cfg.viewer.record: + recorder = VisualizationRecorder( + env, train_cfg.runner.experiment_name, train_cfg.runner.load_run + ) + + # * set up interface: GamepadInterface(env) or KeyboardInterface(env) + COMMANDS_INTERFACE = hasattr(env, "commands") + if 
COMMANDS_INTERFACE: + # interface = GamepadInterface(env) + interface = KeyboardInterface(env) + + for t in range(TEST_TOTAL_TIMESTEPS): + if COMMANDS_INTERFACE: + interface.update(env) + if env.cfg.viewer.record: + recorder.update(t) + runner.set_actions( + runner.policy_cfg["actions"], + runner.get_inference_actions(), + runner.policy_cfg["disable_actions"], + ) + env.step() + env.check_exit() + + # * log + for state in states_to_log_dict: + states_to_log_dict[state][:, t, :] = getattr(env, state)[0, :] + + # * save data + # first convert tensors to cpu + log_dict_cpu = {k: v.cpu() for k, v in states_to_log_dict.items()} + np.savez_compressed(log_file_path, **log_dict_cpu) + print("saved to ", log_file_path) + return states_to_log_dict + + +if __name__ == "__main__": + args = get_args() + with torch.no_grad(): + env, runner, train_cfg = setup(args) + play(env, runner, train_cfg) diff --git a/scripts/log_train.py b/scripts/log_train.py new file mode 100644 index 00000000..37fca0da --- /dev/null +++ b/scripts/log_train.py @@ -0,0 +1,101 @@ +from gym.envs import __init__ # noqa: F401 +from gym.utils import get_args, task_registry, randomize_episode_counters +from gym.utils.logging_and_saving import wandb_singleton +from gym.utils.logging_and_saving import local_code_save_helper +from gym import LEGGED_GYM_ROOT_DIR + +# torch needs to be imported after isaacgym imports in local source +import torch +import os +import numpy as np + +TRAIN_ITERATIONS = 100 +ROLLOUT_TIMESTEPS = 1000 + + +def create_logging_dict(runner): + states_to_log = [ + "dof_pos_target", + "dof_pos_obs", + "dof_vel", + "torques", + "commands", + # 'base_lin_vel', + # 'base_ang_vel', + # 'oscillators', + # 'grf', + # 'base_height' + ] + + states_to_log_dict = {} + + for state in states_to_log: + array_dim = runner.get_obs_size( + [ + state, + ] + ) + states_to_log_dict[state] = torch.zeros( + (1, TRAIN_ITERATIONS, ROLLOUT_TIMESTEPS, array_dim), + device=runner.env.device, + ) + + # log whether rollout terminated + states_to_log_dict["terminated"] = torch.zeros( + (1, TRAIN_ITERATIONS, ROLLOUT_TIMESTEPS, 1), device=runner.env.device + ) + return states_to_log_dict + + +def setup(): + args = get_args() + wandb_helper = wandb_singleton.WandbSingleton() + + env_cfg, train_cfg = task_registry.create_cfgs(args) + task_registry.make_gym_and_sim() + wandb_helper.setup_wandb(env_cfg=env_cfg, train_cfg=train_cfg, args=args) + env = task_registry.make_env(name=args.task, env_cfg=env_cfg) + randomize_episode_counters(env) + + train_cfg.runner.max_iterations = TRAIN_ITERATIONS + policy_runner = task_registry.make_alg_runner(env, train_cfg) + + local_code_save_helper.save_local_files_to_logs(train_cfg.log_dir) + + return train_cfg, policy_runner + + +def train(train_cfg, policy_runner): + protocol_name = train_cfg.runner.experiment_name + + # * set up logging + log_file_path = os.path.join( + LEGGED_GYM_ROOT_DIR, + "gym", + "smooth_exploration", + "data_train", + protocol_name + ".npz", + ) + if not os.path.exists(os.path.dirname(log_file_path)): + os.makedirs(os.path.dirname(log_file_path)) + + states_to_log_dict = create_logging_dict(policy_runner) + + # * train + # wandb_helper = wandb_singleton.WandbSingleton() + + policy_runner.learn(states_to_log_dict) + + # wandb_helper.close_wandb() + + # * save data + # first convert tensors to cpu + log_dict_cpu = {k: v.cpu() for k, v in states_to_log_dict.items()} + np.savez_compressed(log_file_path, **log_dict_cpu) + print("saved to ", log_file_path) + return states_to_log_dict + + +if 
__name__ == "__main__": + train_cfg, policy_runner = setup() + train(train_cfg=train_cfg, policy_runner=policy_runner) diff --git a/scripts/sweep_configs/sweep_config_smooth_exploration.json b/scripts/sweep_configs/sweep_config_smooth_exploration.json new file mode 100644 index 00000000..49fc4125 --- /dev/null +++ b/scripts/sweep_configs/sweep_config_smooth_exploration.json @@ -0,0 +1,16 @@ +{ + "method": "grid", + "metric": { + "name": "rewards/total_rewards", + "goal": "maximize" + }, + "run_cap": 10, + "parameters": { + "train_cfg.policy.exploration_sample_freq":{ + "values": [4, 8, 16, 32] + }, + "train_cfg.runner.num_steps_per_env":{ + "values": [32, 64] + } + } +} \ No newline at end of file