From 6b9a2c9324fa9daf56631149e326e5025ff073bc Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Sat, 16 Mar 2024 12:04:58 -0400 Subject: [PATCH 01/26] add coupling to humanoid torques --- gym/envs/mit_humanoid/mit_humanoid.py | 30 +++++++++----------- gym/envs/mit_humanoid/mit_humanoid_config.py | 2 +- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/gym/envs/mit_humanoid/mit_humanoid.py b/gym/envs/mit_humanoid/mit_humanoid.py index 74e35d5a..10169afc 100644 --- a/gym/envs/mit_humanoid/mit_humanoid.py +++ b/gym/envs/mit_humanoid/mit_humanoid.py @@ -12,23 +12,19 @@ def _init_buffers(self): super()._init_buffers() def _compute_torques(self): - self.desired_pos_target = self.dof_pos_target + self.default_dof_pos - q = self.dof_pos.clone() - qd = self.dof_vel.clone() - q_des = self.desired_pos_target.clone() - qd_des = self.dof_vel_target.clone() - tau_ff = self.tau_ff.clone() - kp = self.p_gains.clone() - kd = self.d_gains.clone() - - if self.cfg.asset.apply_humanoid_jacobian: - torques = _apply_coupling(q, qd, q_des, qd_des, kp, kd, tau_ff) - else: - torques = kp * (q_des - q) + kd * (qd_des - qd) + tau_ff - - torques = torch.clip(torques, -self.torque_limits, self.torque_limits) - - return torques.view(self.torques.shape) + return torch.clip( + _apply_coupling( + self.dof_pos, + self.dof_vel, + self.dof_pos_target + self.default_dof_pos, + self.dof_vel_target, + self.p_gains, + self.d_gains, + self.tau_ff, + ), + -self.torque_limits, + self.torque_limits, + ) # view shape def _reward_lin_vel_z(self): # Penalize z axis base linear velocity w. squared exp diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index 3439e58c..cdc4334f 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -248,7 +248,7 @@ class algorithm(LeggedRobotRunnerCfg.algorithm): class runner(LeggedRobotRunnerCfg.runner): policy_class_name = "ActorCritic" - algorithm_class_name = "PPO" + algorithm_class_name = "PPO2" num_steps_per_env = 24 max_iterations = 1000 run_name = "Standing" From e52208f5527c99849938726841ebba87a4dd01f6 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Sat, 16 Mar 2024 12:14:47 -0400 Subject: [PATCH 02/26] add exp avg filtering --- gym/envs/mit_humanoid/mit_humanoid.py | 25 ++++++++++---------- gym/envs/mit_humanoid/mit_humanoid_config.py | 2 ++ 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/gym/envs/mit_humanoid/mit_humanoid.py b/gym/envs/mit_humanoid/mit_humanoid.py index 10169afc..67b21868 100644 --- a/gym/envs/mit_humanoid/mit_humanoid.py +++ b/gym/envs/mit_humanoid/mit_humanoid.py @@ -2,6 +2,7 @@ from gym.envs.base.legged_robot import LeggedRobot from .jacobian import _apply_coupling +from gym.utils import exp_avg_filter class MIT_Humanoid(LeggedRobot): @@ -12,19 +13,17 @@ def _init_buffers(self): super()._init_buffers() def _compute_torques(self): - return torch.clip( - _apply_coupling( - self.dof_pos, - self.dof_vel, - self.dof_pos_target + self.default_dof_pos, - self.dof_vel_target, - self.p_gains, - self.d_gains, - self.tau_ff, - ), - -self.torque_limits, - self.torque_limits, - ) # view shape + torques = _apply_coupling( + self.dof_pos, + self.dof_vel, + self.dof_pos_target + self.default_dof_pos, + self.dof_vel_target, + self.p_gains, + self.d_gains, + self.tau_ff, + ) + torques = torques.clip(-self.torque_limits, self.torque_limits) + return exp_avg_filter(torques, self.torques, self.cfg.control.filter_gain) def _reward_lin_vel_z(self): # Penalize z 
axis base linear velocity w. squared exp diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index cdc4334f..d442216d 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -111,6 +111,8 @@ class control(LeggedRobotCfg.control): ctrl_frequency = 100 desired_sim_frequency = 800 + filter_gain = 0.1586 # 1: no filtering, 0: wall + class commands: resampling_time = 10.0 # time before command are changed[s] From 54b826db9475061b01139dc58b6d0e0d930dedb4 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Sat, 16 Mar 2024 19:00:50 -0400 Subject: [PATCH 03/26] add oscillators and phase-based rewards, some tuning --- gym/envs/base/legged_robot.py | 45 +- gym/envs/base/legged_robot_config.py | 2 +- .../mit_humanoid/humanoid_running_config.py | 2 +- gym/envs/mit_humanoid/mit_humanoid.py | 91 ++ gym/envs/mit_humanoid/mit_humanoid_config.py | 157 ++- .../robots/mit_humanoid/friction_model_L.pt | Bin 0 -> 13519 bytes .../robots/mit_humanoid/friction_model_R.pt | Bin 0 -> 13519 bytes .../mit_humanoid/urdf/humanoid_F_sf.urdf | 16 +- .../urdf/humanoid_F_sf_learnt.urdf | 996 ++++++++++++++++++ 9 files changed, 1242 insertions(+), 67 deletions(-) create mode 100644 resources/robots/mit_humanoid/friction_model_L.pt create mode 100644 resources/robots/mit_humanoid/friction_model_R.pt create mode 100644 resources/robots/mit_humanoid/urdf/humanoid_F_sf_learnt.urdf diff --git a/gym/envs/base/legged_robot.py b/gym/envs/base/legged_robot.py index e02507d5..1bc4623e 100644 --- a/gym/envs/base/legged_robot.py +++ b/gym/envs/base/legged_robot.py @@ -532,6 +532,29 @@ def _init_buffers(self): self.num_envs, 1, dtype=torch.float, device=self.device ) + # # * get the body_name to body_index dict + # body_dict = self.gym.get_actor_rigid_body_dict( + # self.envs[0], self.actor_handles[0] + # ) + # # * extract a list of body_names where the index is the id number + # body_names = [ + # body_tuple[0] + # for body_tuple in sorted( + # body_dict.items(), key=lambda body_tuple: body_tuple[1] + # ) + # ] + # # * construct a list of id numbers corresponding to end_effectors + # self.end_effector_ids = [] + # for end_effector_name in self.cfg.asset.foot_collisionbox_names: + # self.end_effector_ids.extend( + # [ + # body_names.index(body_name) + # for body_name in body_names + # if end_effector_name in body_name + # ] + # ) + # # ---------------------------------------- + if self.cfg.terrain.measure_heights: self.height_points = self._init_height_points() self.measured_heights = 0 @@ -994,11 +1017,11 @@ def _reward_lin_vel_z(self): def _reward_ang_vel_xy(self): """Penalize xy axes base angular velocity""" - return -torch.sum(torch.square(self.base_ang_vel[:, :2]), dim=1) + return -torch.mean(torch.square(self.base_ang_vel[:, :2]), dim=1) def _reward_orientation(self): """Penalize non flat base orientation""" - return -torch.sum(torch.square(self.projected_gravity[:, :2]), dim=1) + return -torch.mean(torch.square(self.projected_gravity[:, :2]), dim=1) def _reward_base_height(self): """Penalize base height away from target""" @@ -1009,11 +1032,11 @@ def _reward_base_height(self): def _reward_torques(self): """Penalize torques""" - return -torch.sum(torch.square(self.torques), dim=1) + return -torch.mean(torch.square(self.torques), dim=1) def _reward_dof_vel(self): """Penalize dof velocities""" - return -torch.sum(torch.square(self.dof_vel), dim=1) + return -torch.mean(torch.square(self.dof_vel), dim=1) def 
_reward_action_rate(self): """Penalize changes in actions""" @@ -1025,7 +1048,7 @@ def _reward_action_rate(self): ) / dt2 ) - return -torch.sum(error, dim=1) + return -torch.mean(error, dim=1) def _reward_action_rate2(self): """Penalize changes in actions""" @@ -1039,7 +1062,7 @@ def _reward_action_rate2(self): ) / dt2 ) - return -torch.sum(error, dim=1) + return -torch.mean(error, dim=1) def _reward_collision(self): """Penalize collisions on selected bodies""" @@ -1064,26 +1087,26 @@ def _reward_dof_pos_limits(self): # * lower limit out_of_limits = -(self.dof_pos - self.dof_pos_limits[:, 0]).clip(max=0.0) out_of_limits += (self.dof_pos - self.dof_pos_limits[:, 1]).clip(min=0.0) - return -torch.sum(out_of_limits, dim=1) + return -torch.mean(out_of_limits, dim=1) def _reward_dof_vel_limits(self): """Penalize dof velocities too close to the limit""" # * clip to max error = 1 rad/s per joint to avoid huge penalties limit = self.cfg.reward_settings.soft_dof_vel_limit error = self.dof_vel.abs() - self.dof_vel_limits * limit - return -torch.sum(error.clip(min=0.0, max=1.0), dim=1) + return -torch.mean(error.clip(min=0.0, max=1.0), dim=1) def _reward_torque_limits(self): """penalize torques too close to the limit""" limit = self.cfg.reward_settings.soft_torque_limit error = self.torques.abs() - self.torque_limits * limit - return -torch.sum(error.clip(min=0.0, max=1.0), dim=1) + return -torch.mean(error.clip(min=0.0, max=1.0), dim=1) def _reward_tracking_lin_vel(self): """Tracking of linear velocity commands (xy axes)""" error = torch.square(self.commands[:, :2] - self.base_lin_vel[:, :2]) error = torch.exp(-error / self.cfg.reward_settings.tracking_sigma) - return torch.sum(error, dim=1) + return torch.mean(error, dim=1) def _reward_tracking_ang_vel(self): """Tracking of angular velocity commands (yaw)""" @@ -1092,7 +1115,7 @@ def _reward_tracking_ang_vel(self): def _reward_feet_contact_forces(self): """penalize high contact forces""" - return -torch.sum( + return -torch.mean( ( torch.norm(self.contact_forces[:, self.feet_indices, :], dim=-1) - self.cfg.reward_settings.max_contact_force diff --git a/gym/envs/base/legged_robot_config.py b/gym/envs/base/legged_robot_config.py index 2955f5a5..221a6afc 100644 --- a/gym/envs/base/legged_robot_config.py +++ b/gym/envs/base/legged_robot_config.py @@ -163,7 +163,7 @@ class asset: file = "" # * name of the feet bodies, # * used to index body state and contact force tensors - foot_name = "None" + foot_name = "foot" penalize_contacts_on = [] terminate_after_contacts_on = [] end_effector_names = [] diff --git a/gym/envs/mit_humanoid/humanoid_running_config.py b/gym/envs/mit_humanoid/humanoid_running_config.py index 86a113f4..fd21c9b4 100644 --- a/gym/envs/mit_humanoid/humanoid_running_config.py +++ b/gym/envs/mit_humanoid/humanoid_running_config.py @@ -142,7 +142,7 @@ class asset(LeggedRobotCfg.asset): # +'humanoid_fixed_arms_full.urdf') file = ( "{LEGGED_GYM_ROOT_DIR}/resources/robots/" - + "mit_humanoid/urdf/humanoid_F_sf.urdf" + + "mit_humanoid/urdf/humanoid_F_sf_learnt.urdf" ) keypoints = ["base"] end_effectors = ["left_foot", "right_foot"] diff --git a/gym/envs/mit_humanoid/mit_humanoid.py b/gym/envs/mit_humanoid/mit_humanoid.py index 67b21868..17167286 100644 --- a/gym/envs/mit_humanoid/mit_humanoid.py +++ b/gym/envs/mit_humanoid/mit_humanoid.py @@ -11,7 +11,29 @@ def __init__(self, gym, sim, cfg, sim_params, sim_device, headless): def _init_buffers(self): super()._init_buffers() + self.oscillators = torch.zeros(self.num_envs, 2, device=self.device) + 
self.oscillator_obs = torch.zeros(self.num_envs, 4, device=self.device) + self.oscillator_freq = torch.zeros(self.num_envs, 2, device=self.device) + def _reset_system(self, env_ids): + if len(env_ids) == 0: + return + super()._reset_system(env_ids) + # reset oscillators, with a pi phase shift between left and right + self.oscillators[env_ids, 0] = ( + torch.rand(len(env_ids), device=self.device) * 2 * torch.pi + ) + self.oscillators[env_ids, 1] = self.oscillators[env_ids, 0] + torch.pi + self.oscillators = torch.remainder(self.oscillators, 2 * torch.pi) + # reset oscillator velocities to base freq + self.oscillator_freq[env_ids] = self.cfg.oscillator.base_frequency + # recompute oscillator observations + self.oscillator_obs = torch.cat( + (self.oscillators.cos(), self.oscillators.sin()), dim=1 + ) + return + + # compute_torques accounting for coupling, and filtering torques def _compute_torques(self): torques = _apply_coupling( self.dof_pos, @@ -25,6 +47,44 @@ def _compute_torques(self): torques = torques.clip(-self.torque_limits, self.torque_limits) return exp_avg_filter(torques, self.torques, self.cfg.control.filter_gain) + # oscillator integration + + def _post_decimation_step(self): + super()._post_decimation_step() + self._step_oscillators() + + def _step_oscillators(self, dt=None): + if dt is None: + dt = self.dt + self.oscillators += (self.oscillator_freq * 2 * torch.pi) * dt + self.oscillators = torch.remainder(self.oscillators, 2 * torch.pi) + self.oscillator_obs = torch.cat( + (torch.cos(self.oscillators), torch.sin(self.oscillators)), dim=1 + ) + + # --- rewards --- + + def _switch(self, mode=None): + c_vel = torch.linalg.norm(self.commands, dim=1) + switch = torch.exp( + -torch.square( + torch.max( + torch.zeros_like(c_vel), + c_vel - self.cfg.reward_settings.switch_scale, + ) + ) + / self.cfg.reward_settings.switch_scale + ) + if mode is None or mode == "stand": + return switch + elif mode == "move": + return 1 - switch + + def _reward_lin_vel_xy(self): + return torch.exp( + -torch.linalg.norm(self.commands[:, :2] - self.base_lin_vel[:, :2], dim=1) + ) + def _reward_lin_vel_z(self): # Penalize z axis base linear velocity w. 
squared exp return self._sqrdexp(self.base_lin_vel[:, 2] / self.scales["base_lin_vel"]) @@ -52,6 +112,7 @@ def _reward_tracking_lin_vel(self): def _reward_dof_vel(self): # Penalize dof velocities return torch.mean(self._sqrdexp(self.dof_vel / self.scales["dof_vel"]), dim=1) + # * self._switch("stand") def _reward_dof_near_home(self): return torch.mean( @@ -60,3 +121,33 @@ def _reward_dof_near_home(self): ), dim=1, ) + + def _reward_stand_still(self): + """Penalize motion at zero commands""" + # * normalize angles so we care about being within 5 deg + rew_pos = torch.mean( + self._sqrdexp((self.dof_pos - self.default_dof_pos) / torch.pi * 36), dim=1 + ) + rew_vel = torch.mean(self._sqrdexp(self.dof_vel), dim=1) + rew_base_vel = torch.mean(torch.square(self.base_lin_vel), dim=1) + rew_base_vel += torch.mean(torch.square(self.base_ang_vel), dim=1) + return rew_vel + rew_pos - rew_base_vel # * self._switch("stand") + + def _reward_stance(self): + phase = torch.maximum( + torch.zeros_like(self.oscillators), -self.oscillators.sin() + ) # positive during swing, negative during stance + return (phase * self._compute_grf()).mean(dim=1) + + def _reward_swing(self): + phase = torch.maximum( + torch.zeros_like(self.oscillators), self.oscillators.sin() + ) # positive during swing, negative during stance + return -(phase * self._compute_grf()).mean(dim=1) + + def _compute_grf(self, grf_norm=True): + grf = torch.norm(self.contact_forces[:, self.feet_indices, :], dim=-1) + if grf_norm: + return torch.clamp_max(grf / self.cfg.asset.total_mass, 1.0) + else: + return grf diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index d442216d..826480e7 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -3,13 +3,15 @@ LeggedRobotRunnerCfg, ) +BASE_HEIGHT_REF = 0.80 + class MITHumanoidCfg(LeggedRobotCfg): class env(LeggedRobotCfg.env): num_envs = 4096 num_observations = 49 + 3 * 18 # 121 num_actuators = 18 - episode_length_s = 100 # episode length in seconds + episode_length_s = 5 # episode length in seconds num_privileged_obs = num_observations class terrain(LeggedRobotCfg.terrain): @@ -75,13 +77,13 @@ class init_state(LeggedRobotCfg.init_state): ] # yaw root_vel_range = [ - [-0.1, 0.1], # x - [-0.1, 0.1], # y - [-0.1, 0.1], # z - [-0.1, 0.1], # roll - [-0.1, 0.1], # pitch - [-0.1, 0.1], - ] # yaw + [-0.5, 1.5], # x + [-0.55, 0.55], # y + [-0.35, 0.1], # z + [-0.35, 0.35], # roll + [-0.35, 0.35], # pitch + [-0.35, 0.35], # yaw + ] class control(LeggedRobotCfg.control): # * PD Drive parameters: @@ -97,22 +99,25 @@ class control(LeggedRobotCfg.control): "elbow": 40.0, } # [N*m/rad] damping = { - "hip_yaw": 5.0, - "hip_abad": 5.0, - "hip_pitch": 5.0, - "knee": 5.0, - "ankle": 5.0, - "shoulder_pitch": 5.0, - "shoulder_abad": 5.0, - "shoulder_yaw": 5.0, - "elbow": 5.0, + "hip_yaw": 2.0, + "hip_abad": 2.0, + "hip_pitch": 2.0, + "knee": 2.0, + "ankle": 2.0, + "shoulder_pitch": 2.0, + "shoulder_abad": 2.0, + "shoulder_yaw": 2.0, + "elbow": 1.0, } # [N*m*s/rad] ctrl_frequency = 100 - desired_sim_frequency = 800 + desired_sim_frequency = 1000 filter_gain = 0.1586 # 1: no filtering, 0: wall + class oscillator: + base_frequency = 1.5 # [Hz] + class commands: resampling_time = 10.0 # time before command are changed[s] @@ -138,6 +143,7 @@ class asset(LeggedRobotCfg.asset): "{LEGGED_GYM_ROOT_DIR}/resources/robots/" + "mit_humanoid/urdf/humanoid_R_sf.urdf" ) + # foot_collisionbox_names = ["foot"] foot_name = "foot" 
penalize_contacts_on = ["base", "arm"] terminate_after_contacts_on = ["base"] @@ -148,35 +154,88 @@ class asset(LeggedRobotCfg.asset): # * see GymDofDriveModeFlags # * (0 is none, 1 is pos tgt, 2 is vel tgt, 3 effort) default_dof_drive_mode = 3 + fix_base_link = False disable_gravity = False disable_motors = False apply_humanoid_jacobian = False + total_mass = 25.0 class reward_settings(LeggedRobotCfg.reward_settings): soft_dof_pos_limit = 0.9 soft_dof_vel_limit = 0.9 soft_torque_limit = 0.9 max_contact_force = 1500.0 - - base_height_target = 0.65 + base_height_target = BASE_HEIGHT_REF tracking_sigma = 0.5 + # a smooth switch based on |cmd| (commanded velocity). + switch_scale = 0.5 + switch_threshold = 0.2 + class scaling(LeggedRobotCfg.scaling): # * dimensionless time: sqrt(L/g) or sqrt(I/[mgL]), with I=I0+mL^2 - virtual_leg_length = 0.65 - dimensionless_time = (virtual_leg_length / 9.81) ** 0.5 - base_height = virtual_leg_length - base_lin_vel = virtual_leg_length / dimensionless_time - base_ang_vel = 3.14 / dimensionless_time - dof_vel = 20 # ought to be roughly max expected speed. - height_measurements = virtual_leg_length - - # todo check order of joints, create per-joint scaling - dof_pos = 3.14 + # virtual_leg_length = 0.65 + # dimensionless_time = (virtual_leg_length / 9.81) ** 0.5 + # base_height = virtual_leg_length + # base_lin_vel = virtual_leg_length / dimensionless_time + # base_ang_vel = 3.14 / dimensionless_time + # dof_vel = 20 # ought to be roughly max expected speed. + # height_measurements = virtual_leg_length + + # # todo check order of joints, create per-joint scaling + # dof_pos = 3.14 + # dof_pos_obs = dof_pos + # # * Action scales + # dof_pos_target = dof_pos + # tau_ff = 0.1 + base_ang_vel = 2.5 + base_lin_vel = 1.5 + commands = 1 + base_height = BASE_HEIGHT_REF + dof_pos = [ + 0.1, + 0.2, + 0.4, + 0.4, + 0.4, + 0.1, + 0.2, + 0.4, + 0.4, + 0.4, + 0.5, + 0.5, + 0.5, + 0.5, + 0.5, + 0.5, + 0.5, + 0.5, + ] dof_pos_obs = dof_pos - # * Action scales + # # * Action scales dof_pos_target = dof_pos - tau_ff = 0.1 + dof_vel = [ + 0.5, + 1.0, + 4.0, + 4.0, + 2.0, + 0.5, + 1.0, + 4.0, + 4.0, + 2.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + ] + dof_pos_history = 3 * dof_pos_obs class MITHumanoidRunnerCfg(LeggedRobotRunnerCfg): @@ -184,6 +243,7 @@ class MITHumanoidRunnerCfg(LeggedRobotRunnerCfg): runner_class_name = "OnPolicyRunner" class policy(LeggedRobotRunnerCfg.policy): + disable_actions = False init_noise_std = 1.0 actor_hidden_dims = [512, 256, 128] critic_hidden_dims = [512, 256, 128] @@ -199,26 +259,27 @@ class policy(LeggedRobotRunnerCfg.policy): "dof_pos_obs", "dof_vel", "dof_pos_history", + "oscillator_obs", ] critic_obs = actor_obs actions = ["dof_pos_target"] class noise: - base_height = 0.05 - dof_pos_obs = 0.0 - dof_vel = 0.0 - base_lin_vel = 0.1 - base_ang_vel = 0.2 - projected_gravity = 0.05 - height_measurements = 0.1 + base_height = 0.0 # 0.05 + dof_pos_obs = 0.0 # 0.0 + dof_vel = 0.0 # 0.0 + base_lin_vel = 0.0 # 0.1 + base_ang_vel = 0.0 # 0.2 + projected_gravity = 0.0 # 0.05 + height_measurements = 0.0 # 0.1 class reward: class weights: - tracking_ang_vel = 0.5 - tracking_lin_vel = 0.5 - orientation = 1.5 - torques = 5.0e-6 + tracking_ang_vel = 1.5 + tracking_lin_vel = 3.0 + orientation = 1.0 + torques = 5.0e-4 min_base_height = 1.5 action_rate = 0.01 action_rate2 = 0.001 @@ -226,8 +287,10 @@ class weights: ang_vel_xy = 0.0 dof_vel = 0.0 stand_still = 0.0 - dof_pos_limits = 0.0 - dof_near_home = 0.5 + dof_pos_limits = 0.25 + dof_near_home 
= 0.1 + stance = 1.0 + swing = 1.0 class termination_weight: termination = 15 @@ -245,13 +308,15 @@ class algorithm(LeggedRobotRunnerCfg.algorithm): schedule = "adaptive" # could be adaptive, fixed gamma = 0.999 lam = 0.95 + discount_horizon = 0.5 # [s] + GAE_bootstrap_horizon = 0.2 # [s] desired_kl = 0.01 max_grad_norm = 1.0 class runner(LeggedRobotRunnerCfg.runner): policy_class_name = "ActorCritic" algorithm_class_name = "PPO2" - num_steps_per_env = 24 + num_steps_per_env = 32 max_iterations = 1000 run_name = "Standing" experiment_name = "Humanoid" diff --git a/resources/robots/mit_humanoid/friction_model_L.pt b/resources/robots/mit_humanoid/friction_model_L.pt new file mode 100644 index 0000000000000000000000000000000000000000..d1d680d5aa88e8f496d957fa7db1640c98ee00e1 GIT binary patch literal 13519 zcmbVz2{={X+qQX56j4GlCQ;_$>}MU4%A86nl1Mah(4KQDOb8#{Mg!LYGF$_(v>Uup)G6kbl&|<&j|# z{wqyoc%lXdK9c>Q$o|NHMGJXizaTPBOTz-9rvKU~!xQ%r?vEMIlkkz~uLD;E1=x<~ zNxIz=j^as8a}jbG323K7x5_G9L00-;<>Jf;ko^z z&VSj$sDOd2=ehrrIUWW+{YfUo^Nit5`{!^g7e++(XFSi#ML5z&q<`1a3B2hsycr?? zY#2!S$gtq3$k0X0C-7#*@VtL*|0mFtH!Ftc^Y4MCyxB3lIll#(@#eRv$VglbVCp@HK0Kel~&A=N+HzR7qe z2c^GS{;%6WfcrZ#rcOYHt$c}MzOLNGTak5LPy{x*xwvfGXD}B{gw(4OFzQnlO`Ck1 z+CG*im-%w^{M0HkUvDrOvGhCHbx_!TW9l@4ynDPLuyi+hwCp0xpJ-#>IPU`u{4rgy zxceljeQHA6lw$3zq`#pzPel+I@RRUcVo9Hh9DTI%6#bZU7=`*DFizX3V z2t`-VcU1k!XrgtI%dKcQMpUoO5Lh033v$QwxCf^^K`};~oDn8mUieT5Y;@#yTJdOr z|73a~?FA{=Jy|fSG#P!5J*A&t$_b>NuO@-bRM1>xDo8aMOfPt7lefNHZeP$@!Gj03 zU=sh7c5PLn))k^eNFrMxHf$?-U$9JIvT?NF5`QE8V&o^-7R-{;O*_D1_hI@aA)cn( zlj0s#tYbnYy~&Ki7I^vmI-@&kE!{k74?Yt)Neo}TBz?P5$-P;p=≷s5+#I3~H?* zy@lnp#73LjDK179Or5}$oT5jrUEg8C0-xY6R$o*|lASErB zF=;RDYC1++Hq2&HChin0PTed>Dd3Ymkp+xT?uC_&)!z0_20 z5%GDMFGzYaX0tfA zTcA^uK?=qTATkZ<+t-ey!z)!l^$N%t(K&S9qFxdk8ckpOjuFI$XOb)3=A`J&A>tGt zM~i>x3oeeVfQk7sWcR{4GO>F*{Z^{)60X4`!<*LX_&GVKIm@usA;>ku8G=1jNSb+XM)_FRLv zu`QA--H7(VL1a?=2zV)|fo<9PEld2z32uDv#Y4K+$>h7@g6t)&f@=320-rp8I{0-h zd8BxQ^i4Kysq!wzO`}_=qwh-+RiMUBy1PkWYA8cDd+(;X%ro-*oUmZ6&>I?T^&0#i zdC`yaW@6`$yVOo~4k*LPCxRlvP2{h4HY>I)({^GIA;EqM{6K_9<5iSMqb2!zMSG9&sXlbo9K zg4eEZXqWN_0poWQ)i`SeuzZ&wE9VC>2x}!(3X0tDAP~6Lj1#0z5(`-6@m>#WN?@H24 zbOm8;>fGu}*ZD5!Dj1VHl3T1ko4%{{g>IR(C{Iq2o`70f(cmZWATgxjLLu21s>=kA zeMB2eWdtu(`pBl!L%0jG5^2e}*@7*#uknMG23K&>kh^gH0_M{o1%aYAQm4&dh)|{v z5sTBM@0LrEK98BiBwK|XmkXrwBe+z(kW1XEx{1&Cy|nYqBD$KY3H0Mr1#!>p1fN!0 z3Z@4b(e*bissFcUgk$mq%g)6M9y?z^(bH+9Y`m_$x$j8siL#AkZN(C5`DrKjXt^tQ zett3c+splsY~IMa`k&w)zkeAV#be3iy^eGZSw-kADf{XJvjif8O$6JQXjADW!|5rp zSM>Z%XEJ|$1$WE$(>VP>2#Ye&(Du}xyVkIW+kNLU#?Qzi{HzIF;fOYF-w6qOBfHD| ztsWKJV>zDy{l}BBMT1);GUUj|F&~KkI3vOzmq%t@RA~9C*Fmnh=unY6GUQ2j9T^cM z+tRx-UvTh9I``EIZ!+Eb9jR&@O#O2C^!d*svhL^?n6YvSzPodd8rh#FZ5&BKp_C#y z7@A6N6%P`epKV2oR?5=@CzObkXC1BidIaBm?j^eU1_C>1BpIU@QQ689WU(rTo92=z zaJnSSo#OwRjd?emyLrDoH~Dotjg2U#9w8cB-69#H3(?#Y2WC+2T3=vR0f4aOY}WVoI+y?51|l}qh|$9tR6 z{;3?X%&@`D&tEh9rG%i@LV=u7t%T^w`xwWokNBT|>VnN>CYzzS6&+;rjs(D7W0lpm_{m{pSzdeAt+KW^^S4oxB1DNkd^ptQcK*tBu`U zdW@uXsxf7!^}wQa4jCc;5*saYn0aTubL`5O)5WhXX~E5G*g0h{mN*!Lq+43l=$%t&rk615slSgsqWzdX zv7?JVbx(m^ym*wjU%>pV+s9lPX$$JptD&UqF(Y-7CAf10xAn+Jl)Sr~o{||04LiCZ zqDD{S^=Um{yu%9qO*9csB&Cn1ylFlfuz>0OcutjGIIklbzjnHS{?$&|HO|saQ zT8v(@M(l{usr*xpDn$Iu2x>909xa3R;lvw0{Q1)LaQwtxkQ_f7_iagp&x`J1>Ru`E zkafVXVqRd>kPg8Ow$K;Wh~oC8*kJVp*E^=MPmQkg-4>5wQa)yZg49D8d14oN=b27j zYNO~Jz71FT(+s9&aS;`5?E&>c_VD>-F1Ms}grH2Tjj8qvBcqO1lctpYnB`JI9fO;R zbmj=bg*i*;lLHBYmrvcvq5VxTeWDVr5Bo;L<}gH6mHQAZxArFRi^oVlEC+mTL7FifrdOPPfv*3#)_vJYQRwGUezIHDm2e#V6@?-np<%ueY z^=NEPn0f#!y~3!Sv>N)ZPk`oAKd8lpR+N8QPmPBP$dMzr=(NXoXi?@hmiPWFHlKB7 
[GIT binary patch payload omitted: resources/robots/mit_humanoid/friction_model_L.pt and friction_model_R.pt, Bin 0 -> 13519 bytes each; base-85 data, not human-readable]
zF?^i5h21quhhA6tjLL6XFr6J>`gNcPVzy+l*HE!^x^lLeJP6vvXfq zW1XlRFeSE#FKzJo`s+CT(sEQ-@)NtJdb0YX8gTw}8xsEN7_-_}p}+sN9@Ylcg0!k4 ze%&yd7Szhncq=9PF8Ug?y_Ey0DL>ItTt z_VS|nb^f_C@H9iKDf_u!R4g0XgOPW5;OJT~sT)~-eV&W@6Xkubl9f#UNdDfjwCN z6GK!#K~jYQ=gGP%{QP+fN`^}V@^(T-?RXk8%9P#Wzr3z%(n+W=3SnMG=D?#ncFeto zP^Kz00-qTSB^zFxQ84XUHCDC_V@7?=WbeGb%eTy}!69OG@M^F-d%M3cf5fpFV3lRUe7-9Mt?R$w znqDdVzF`OCYEtOAWe)+Fw$$b7c9^9ohwqOE^7BgOQziL#U|DHFJH2kRC;V39O4bL8 zR14X+7UnQ^_$PQ3!4QYo&)B-@1dg3^23@{8^V^C8U~_>i$yfLR0g+Si?8gjxL(~yn zH(Js;+UnH1auWE)>JY!W@pOOAQ_k~`D)fGdDZQYa!v?N;jUSKow=HCDaenY8(i2)= zQ1PTD4V=}0gVmF$hKCJGeT(L+oq7by6GyYT*~#?7vGZVBrv_@XIsDU`9x)yFOmOum z2jcbZD>jm)G%jj4X#2%M^BiqHqg_E(j?F|*aV;j~j4X`O4ky-1lC-q$HeM6i2OW<* z>4^&)!S?P*YOpDQIcT;WF8STTq|2scdV~|(qFw+OO_T9{;6oTEWkhukC=dltTRPpY zgmT&qQNB@@UjMomj~N)?^rh9-UtWoVplKz$t*aT+Z;vOnp_1fW-!+)DQHpprTeIh4 zp2Me50j6&aWY+5sC8j-t$X)IOX8j-~a4KqJ3Y@C?{h+hxG37IVLF)!Q;JO3#4~P@J z5JkGaND1Dw?Scut`}ryoR~TXc`uau3G?}im1+Yp5u=(>Gc%`$Zznyv-&K0ghqlkmh ze7c$lt#&5&rp2Qni5J4S6uTZvxf*{gMy8Em@hELYyiX z0!?FVAxC}>BmZL>-sPWX$88@&r?`E=vhLl`Whn`^6 zOtCu-LyJ=3QrJCkd%hJs&0WyQDI8YL>4M7Ix%5U&KC?u2ESNRb^HSaSmQMvq|L zNDV`Y7z1de>3Hc!0ChaL9xRnqK;L^dQ)c8sCM-&U4cn4oI9_9(x36Y{Q}*H!ZX$o3 zK!;=MW=f-JIy5Eh!Q7`iu&TV3U9xi_%D&w~ZuDfc`g|2K%3X$dc4{#v!x~^1lLMFf zH0VJKhPp2`=6o@*M=9S6%=gtz=yT2%PMS*cI|ljFv}9Fy*eOl!`&2QnBRA2;<$8=s zyD%!(lwsG6^>niN0_Yku4)Vt^pqmQiZ5iQnVEDvste!$$AbC{E!{E4aP8^U&vZsM!oxnIlUR5QhO ziIBN3gz?*U6bJN6U3hHtzwkdM4Cp`T`i6fZe3ngjbus5k=W+hkcXv5mfs!l^UCck7wPLJmq& zPIHobrq@qYScOqnsv$RjIrL5($G`t040ydV%-&!{&Ucp^h>x8(j$fa#S{F>1CFR;s z*>CD>5pt zlX*1=hHhg}ATg9g-*HB@TXG=8_h*|lY@mIk4>Q|C7K!IKw*B)+e(jE(*e)@h&zrZF zX-F7KtNnV=wkC+`mz47dy*G!bp1U?3jfbG^nKxd|eh5bSlISv{jPr5x3nrJ%z$*#m z?9^p+zJ?3Dc{!Ao{0WXiu_G|Fc(Td%LdMCojBk+TBM$ZII6(Z*5mOHkNy&Nh9+|Nmz;f=%CwEa~a<+G!4l!Gpw+Sbfm z_<@XVr4OV0qmiSdI}7BF4Pmc}DYKT#-|^pHkOfDrL!8Xna?GLhHqPm^x$G04D@>5r zR7Sx_6KAZFV+Qg!^l2n-tw_Ir*I)hv|ITV4e^ofkP-@KIdBEnix+zJix(Ow?#{vk ztN)%8^81MK0h9I5J{EYky+h$Nz=3`WyRu=E(l>|CaxT{a0?fI}aLU_^+T*qW#zOuSN2&^_+ha nP)JDL?AL*Q+`xPwp(Pf-wg)EuxI2pv%=;0N?EekK|BC%T!Zc6A literal 0 HcmV?d00001 diff --git a/resources/robots/mit_humanoid/urdf/humanoid_F_sf.urdf b/resources/robots/mit_humanoid/urdf/humanoid_F_sf.urdf index 51a84eee..f7f55186 100644 --- a/resources/robots/mit_humanoid/urdf/humanoid_F_sf.urdf +++ b/resources/robots/mit_humanoid/urdf/humanoid_F_sf.urdf @@ -562,7 +562,7 @@ Simple Foot: foot approximated as single box-contact --> + type="fixed"> @@ -607,7 +607,7 @@ Simple Foot: foot approximated as single box-contact --> + type="fixed"> @@ -660,7 +660,7 @@ Simple Foot: foot approximated as single box-contact --> + type="fixed"> @@ -711,7 +711,7 @@ Simple Foot: foot approximated as single box-contact --> + type="fixed"> @@ -790,7 +790,7 @@ Simple Foot: foot approximated as single box-contact --> + type="fixed"> @@ -835,7 +835,7 @@ Simple Foot: foot approximated as single box-contact --> + type="fixed"> @@ -888,7 +888,7 @@ Simple Foot: foot approximated as single box-contact --> + type="fixed"> @@ -939,7 +939,7 @@ Simple Foot: foot approximated as single box-contact --> + type="fixed"> diff --git a/resources/robots/mit_humanoid/urdf/humanoid_F_sf_learnt.urdf b/resources/robots/mit_humanoid/urdf/humanoid_F_sf_learnt.urdf new file mode 100644 index 00000000..5e76f34a --- /dev/null +++ b/resources/robots/mit_humanoid/urdf/humanoid_F_sf_learnt.urdf @@ -0,0 +1,996 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From c9411cdf0c07b55d7095b07233abbf41f5eeb5c0 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 18 Mar 2024 14:04:09 -0400 Subject: [PATCH 04/26] some tuning --- gym/envs/mit_humanoid/mit_humanoid.py | 17 ++++- gym/envs/mit_humanoid/mit_humanoid_config.py | 72 ++++++++++---------- 2 files changed, 49 insertions(+), 40 deletions(-) diff --git a/gym/envs/mit_humanoid/mit_humanoid.py b/gym/envs/mit_humanoid/mit_humanoid.py index 17167286..684f4cb3 100644 --- a/gym/envs/mit_humanoid/mit_humanoid.py +++ b/gym/envs/mit_humanoid/mit_humanoid.py @@ -111,8 +111,9 @@ def _reward_tracking_lin_vel(self): def _reward_dof_vel(self): # Penalize dof velocities - return torch.mean(self._sqrdexp(self.dof_vel / self.scales["dof_vel"]), dim=1) - # * self._switch("stand") + return torch.mean( + self._sqrdexp(self.dof_vel / self.scales["dof_vel"]), dim=1 + ) * self._switch("stand") def _reward_dof_near_home(self): return torch.mean( @@ -131,7 +132,7 @@ def _reward_stand_still(self): rew_vel = torch.mean(self._sqrdexp(self.dof_vel), dim=1) rew_base_vel = torch.mean(torch.square(self.base_lin_vel), dim=1) rew_base_vel += torch.mean(torch.square(self.base_ang_vel), dim=1) - return rew_vel + rew_pos - rew_base_vel # * self._switch("stand") + return rew_vel + rew_pos - rew_base_vel * self._switch("stand") def _reward_stance(self): phase = torch.maximum( @@ -151,3 +152,13 @@ def _compute_grf(self, grf_norm=True): return torch.clamp_max(grf / self.cfg.asset.total_mass, 1.0) else: return grf + + def _reward_walk_freq(self): + # Penalize deviation from base frequency + return torch.mean( + self._sqrdexp( + (self.oscillator_freq - self.cfg.oscillator.base_frequency) + / self.cfg.oscillator.base_frequency + ), + dim=1, + ) * self._switch("move") diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index 826480e7..7a8680e8 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -70,15 +70,15 @@ class init_state(LeggedRobotCfg.init_state): root_pos_range = [ [0.0, 0.0], # x [0.0, 0.0], # y - [0.7, 0.72], # z + [0.7, 0.8], # z [-0.1, 0.1], # roll [-0.1, 0.1], # pitch [-0.1, 0.1], ] # yaw root_vel_range = [ - [-0.5, 1.5], # x - [-0.55, 0.55], # y + [-0.5, 2.5], # x + [-0.5, 0.5], # y [-0.35, 0.1], # z [-0.35, 0.35], # roll [-0.35, 0.35], # pitch @@ -122,18 +122,18 @@ class commands: resampling_time = 10.0 # time before command are changed[s] class ranges: - lin_vel_x = [-1.0, 1.0] # min max [m/s] + lin_vel_x = [-1.0, 4.0] # min max [m/s] lin_vel_y = 1.0 # max [m/s] yaw_vel = 1 # max [rad/s] class 
push_robots: toggle = False - interval_s = 15 - max_push_vel_xy = 0.05 - push_box_dims = [0.1, 0.2, 0.3] # x,y,z [m] + interval_s = 2 + max_push_vel_xy = 0.6 + push_box_dims = [0.1, 0.1, 0.3] # x,y,z [m] class domain_rand: - randomize_friction = False + randomize_friction = True friction_range = [0.5, 1.25] randomize_base_mass = True added_mass_range = [-1.0, 1.0] @@ -157,7 +157,6 @@ class asset(LeggedRobotCfg.asset): fix_base_link = False disable_gravity = False disable_motors = False - apply_humanoid_jacobian = False total_mass = 25.0 class reward_settings(LeggedRobotCfg.reward_settings): @@ -195,22 +194,22 @@ class scaling(LeggedRobotCfg.scaling): dof_pos = [ 0.1, 0.2, - 0.4, - 0.4, - 0.4, + 0.8, + 0.8, + 0.8, 0.1, 0.2, - 0.4, - 0.4, - 0.4, - 0.5, - 0.5, - 0.5, - 0.5, - 0.5, - 0.5, - 0.5, - 0.5, + 0.8, + 0.8, + 0.8, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, ] dof_pos_obs = dof_pos # # * Action scales @@ -248,7 +247,7 @@ class policy(LeggedRobotRunnerCfg.policy): actor_hidden_dims = [512, 256, 128] critic_hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid - activation = "elu" + activation = "tanh" actor_obs = [ "base_height", @@ -266,13 +265,13 @@ class policy(LeggedRobotRunnerCfg.policy): actions = ["dof_pos_target"] class noise: - base_height = 0.0 # 0.05 - dof_pos_obs = 0.0 # 0.0 - dof_vel = 0.0 # 0.0 - base_lin_vel = 0.0 # 0.1 - base_ang_vel = 0.0 # 0.2 - projected_gravity = 0.0 # 0.05 - height_measurements = 0.0 # 0.1 + base_height = 0.05 + dof_pos_obs = 0.0 + dof_vel = 0.0 + base_lin_vel = 0.1 + base_ang_vel = 0.2 + projected_gravity = 0.05 + height_measurements = 0.1 class reward: class weights: @@ -285,12 +284,13 @@ class weights: action_rate2 = 0.001 lin_vel_z = 0.0 ang_vel_xy = 0.0 - dof_vel = 0.0 - stand_still = 0.0 + dof_vel = 0.25 + stand_still = 0.25 dof_pos_limits = 0.25 - dof_near_home = 0.1 + dof_near_home = 0.25 stance = 1.0 swing = 1.0 + walk_freq = 0.5 class termination_weight: termination = 15 @@ -300,7 +300,7 @@ class algorithm(LeggedRobotRunnerCfg.algorithm): value_loss_coef = 1.0 use_clipped_value_loss = True clip_param = 0.2 - entropy_coef = 0.001 + entropy_coef = 0.002 num_learning_epochs = 5 # * mini batch size = num_envs*nsteps / nminibatches num_mini_batches = 4 @@ -308,8 +308,6 @@ class algorithm(LeggedRobotRunnerCfg.algorithm): schedule = "adaptive" # could be adaptive, fixed gamma = 0.999 lam = 0.95 - discount_horizon = 0.5 # [s] - GAE_bootstrap_horizon = 0.2 # [s] desired_kl = 0.01 max_grad_norm = 1.0 From 90ff50cc34ba6a41f442a3c732e3c53dd1c600d5 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 18 Mar 2024 19:57:46 -0400 Subject: [PATCH 05/26] implement history with (approximated) arbitrary sampling frequency and history length. Implemented for dof_pos_target, dof_pos, dof_vel. 
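A minimal, self-contained sketch of the sampled-history idea described above, assuming a counter-and-threshold scheme that refreshes the buffer once every ctrl_frequency / history_frequency control steps; the names, toy sizes, and the standalone update function below are illustrative stand-ins, not the actual buffers added by this patch.

import torch

num_envs, num_dof = 4, 3
history_length = 3        # number of stored samples (cfg.env.history_length)
ctrl_frequency = 100      # control loop rate [Hz]
history_frequency = 10    # sampling rate of the history [Hz]
threshold = int(ctrl_frequency / history_frequency)  # control steps between samples

dof_pos_history = torch.zeros(num_envs, num_dof * history_length)
counter = torch.zeros(num_envs, dtype=torch.long)

def update_history(dof_pos):
    """Push a new sample for every env whose counter reached the threshold."""
    global counter
    counter += 1
    ids = (counter == threshold).nonzero(as_tuple=False).flatten()
    if ids.numel() > 0:
        # torch.Tensor.roll is out-of-place, so assign the rolled result back
        dof_pos_history[ids] = dof_pos_history[ids].roll(num_dof, dims=1)
        dof_pos_history[ids, :num_dof] = dof_pos[ids]
        counter[ids] = 0

for step in range(3 * threshold):
    update_history(torch.full((num_envs, num_dof), float(step)))
print(dof_pos_history[0])  # newest sample first, oldest last

Rolling the flat (num_dof * history_length) buffer by num_dof and overwriting the first num_dof entries keeps the newest sample at the front, so the whole history can be consumed as a single flat observation vector, which matches how the buffers in this patch are tiled and exposed.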
--- gym/envs/mit_humanoid/mit_humanoid.py | 49 ++++++++++++++++++++ gym/envs/mit_humanoid/mit_humanoid_config.py | 19 ++------ 2 files changed, 52 insertions(+), 16 deletions(-) diff --git a/gym/envs/mit_humanoid/mit_humanoid.py b/gym/envs/mit_humanoid/mit_humanoid.py index 684f4cb3..a7fb56cc 100644 --- a/gym/envs/mit_humanoid/mit_humanoid.py +++ b/gym/envs/mit_humanoid/mit_humanoid.py @@ -15,6 +15,31 @@ def _init_buffers(self): self.oscillator_obs = torch.zeros(self.num_envs, 4, device=self.device) self.oscillator_freq = torch.zeros(self.num_envs, 2, device=self.device) + self.dof_pos_target_history = torch.zeros( + (self.num_envs, self.num_dof, 12), device=self.device + ) + self._init_history_buffers() + + def _init_history_buffers(self): + self.dof_pos_target_history = torch.zeros( + (self.num_envs, self.num_dof * self.cfg.env.history_length), + device=self.device, + ) + self.dof_pos_history = torch.zeros( + self.num_envs, + self.num_dof * self.cfg.env.history_length, + device=self.device, + ) + self.dof_vel_history = torch.zeros( + self.num_envs, + self.num_dof * self.cfg.env.history_length, + device=self.device, + ) + self.history_counter = torch.zeros(self.num_envs, dtype=int, device=self.device) + self.history_threshold = int( + self.cfg.control.ctrl_frequency / self.cfg.env.history_frequency + ) + def _reset_system(self, env_ids): if len(env_ids) == 0: return @@ -31,8 +56,15 @@ def _reset_system(self, env_ids): self.oscillator_obs = torch.cat( (self.oscillators.cos(), self.oscillators.sin()), dim=1 ) + self._reset_history_buffers(env_ids) return + def _reset_history_buffers(self, ids): + n = self.cfg.env.history_length + self.dof_pos_target_history[ids] = self.dof_pos_target[ids].tile(n) + self.dof_pos_history[ids] = self.dof_pos_target[ids].tile(n) + self.dof_vel_history[ids] = self.dof_pos_target[ids].tile(n) + # compute_torques accounting for coupling, and filtering torques def _compute_torques(self): torques = _apply_coupling( @@ -52,6 +84,7 @@ def _compute_torques(self): def _post_decimation_step(self): super()._post_decimation_step() self._step_oscillators() + self._update_history_buffers() def _step_oscillators(self, dt=None): if dt is None: @@ -62,6 +95,22 @@ def _step_oscillators(self, dt=None): (torch.cos(self.oscillators), torch.sin(self.oscillators)), dim=1 ) + def _update_history_buffers(self): + self.history_counter += 1 + + ids = torch.nonzero( + self.history_counter == self.history_threshold, as_tuple=False + ).flatten() + + self.dof_pos_target_history[ids].roll(self.num_dof, dims=1) # check + self.dof_pos_target_history[ids, : self.num_dof] = self.dof_pos_target[ids] + self.dof_pos_history[ids].roll(self.num_dof, dims=1) # check + self.dof_pos_history[ids, : self.num_dof] = self.dof_pos_target[ids] + self.dof_vel_history[ids].roll(self.num_dof, dims=1) # check + self.dof_vel_history[ids, : self.num_dof] = self.dof_pos_target[ids] + + self.history_counter[ids] = 0 + # --- rewards --- def _switch(self, mode=None): diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index 7a8680e8..04433381 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -12,7 +12,9 @@ class env(LeggedRobotCfg.env): num_observations = 49 + 3 * 18 # 121 num_actuators = 18 episode_length_s = 5 # episode length in seconds - num_privileged_obs = num_observations + + history_length = 3 # n samples + history_frequency = 10 # [Hz] class terrain(LeggedRobotCfg.terrain): pass @@ -172,21 +174,6 
@@ class reward_settings(LeggedRobotCfg.reward_settings): switch_threshold = 0.2 class scaling(LeggedRobotCfg.scaling): - # * dimensionless time: sqrt(L/g) or sqrt(I/[mgL]), with I=I0+mL^2 - # virtual_leg_length = 0.65 - # dimensionless_time = (virtual_leg_length / 9.81) ** 0.5 - # base_height = virtual_leg_length - # base_lin_vel = virtual_leg_length / dimensionless_time - # base_ang_vel = 3.14 / dimensionless_time - # dof_vel = 20 # ought to be roughly max expected speed. - # height_measurements = virtual_leg_length - - # # todo check order of joints, create per-joint scaling - # dof_pos = 3.14 - # dof_pos_obs = dof_pos - # # * Action scales - # dof_pos_target = dof_pos - # tau_ff = 0.1 base_ang_vel = 2.5 base_lin_vel = 1.5 commands = 1 From 5020fd023f657116d0ebc5d254bcb98f20c2d215 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 18 Mar 2024 20:20:22 -0400 Subject: [PATCH 06/26] make pd gains specific for each robot in preparation for domain randomization of gains --- gym/envs/base/legged_robot.py | 33 ++++++-------------- gym/envs/mit_humanoid/mit_humanoid_config.py | 2 +- 2 files changed, 10 insertions(+), 25 deletions(-) diff --git a/gym/envs/base/legged_robot.py b/gym/envs/base/legged_robot.py index 1bc4623e..7fa2f0dd 100644 --- a/gym/envs/base/legged_robot.py +++ b/gym/envs/base/legged_robot.py @@ -481,41 +481,26 @@ def _init_buffers(self): get_axis_params(-1.0, self.up_axis_idx), device=self.device ).repeat((self.num_envs, 1)) self.torques = torch.zeros( - self.num_envs, - self.num_actuators, - dtype=torch.float, - device=self.device, + self.num_envs, self.num_actuators, dtype=torch.float, device=self.device ) self.p_gains = torch.zeros( - self.num_actuators, dtype=torch.float, device=self.device + self.num_envs, self.num_actuators, dtype=torch.float, device=self.device ) self.d_gains = torch.zeros( - self.num_actuators, dtype=torch.float, device=self.device + self.num_envs, self.num_actuators, dtype=torch.float, device=self.device ) self.dof_pos_target = torch.zeros( - self.num_envs, - self.num_actuators, - dtype=torch.float, - device=self.device, + self.num_envs, self.num_actuators, dtype=torch.float, device=self.device ) self.dof_vel_target = torch.zeros( - self.num_envs, - self.num_actuators, - dtype=torch.float, - device=self.device, + self.num_envs, self.num_actuators, dtype=torch.float, device=self.device ) self.tau_ff = torch.zeros( - self.num_envs, - self.num_actuators, - dtype=torch.float, - device=self.device, + self.num_envs, self.num_actuators, dtype=torch.float, device=self.device ) self.dof_pos_history = torch.zeros( - self.num_envs, - self.num_actuators * 3, - dtype=torch.float, - device=self.device, + self.num_envs, self.num_actuators * 3, dtype=torch.float, device=self.device ) self.commands = torch.zeros( self.num_envs, 3, dtype=torch.float, device=self.device @@ -582,8 +567,8 @@ def _init_buffers(self): found = False for dof_name in self.cfg.control.stiffness.keys(): if dof_name in name: - self.p_gains[i] = self.cfg.control.stiffness[dof_name] - self.d_gains[i] = self.cfg.control.damping[dof_name] + self.p_gains[:, i] = self.cfg.control.stiffness[dof_name] + self.d_gains[:, i] = self.cfg.control.damping[dof_name] found = True if not found: self.p_gains[i] = 0.0 diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index 04433381..1e079807 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -277,7 +277,7 @@ class weights: dof_near_home = 0.25 stance 
= 1.0 swing = 1.0 - walk_freq = 0.5 + walk_freq = 0.0 class termination_weight: termination = 15 From 82968b164fd94cbbfe95efb6d303877ed4ba6b2b Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Tue, 19 Mar 2024 16:58:32 -0400 Subject: [PATCH 07/26] refactor sampled history to be more explicit and not overlap with `dof_pos_history` (used for smoothness reward). Some tuning. Change stance/swing reward to use the smoothed square wave. --- gym/envs/base/legged_robot.py | 12 +-- gym/envs/mit_humanoid/mit_humanoid.py | 95 ++++++++++++-------- gym/envs/mit_humanoid/mit_humanoid_config.py | 89 +++++++++--------- scripts/play.py | 2 +- 4 files changed, 109 insertions(+), 89 deletions(-) diff --git a/gym/envs/base/legged_robot.py b/gym/envs/base/legged_robot.py index 7fa2f0dd..d635fea7 100644 --- a/gym/envs/base/legged_robot.py +++ b/gym/envs/base/legged_robot.py @@ -110,10 +110,10 @@ def _post_decimation_step(self): self.base_height = self.root_states[:, 2:3] - n = self.num_actuators - self.dof_pos_history[:, 2 * n :] = self.dof_pos_history[:, n : 2 * n] - self.dof_pos_history[:, n : 2 * n] = self.dof_pos_history[:, :n] - self.dof_pos_history[:, :n] = self.dof_pos_target + self.dof_pos_history.roll(self.num_actuators) + self.dof_pos_history[:, : self.num_actuators] = ( + self.dof_pos - self.default_dof_pos + ) self.dof_pos_obs = self.dof_pos - self.default_dof_pos env_ids = ( @@ -133,7 +133,9 @@ def _reset_idx(self, env_ids): self._reset_system(env_ids) self._resample_commands(env_ids) # * reset buffers - self.dof_pos_history[env_ids] = 0.0 + self.dof_pos_history[env_ids] = ( + self.dof_pos[env_ids] - self.default_dof_pos + ).tile(3) self.episode_length_buf[env_ids] = 0 def _initialize_sim(self): diff --git a/gym/envs/mit_humanoid/mit_humanoid.py b/gym/envs/mit_humanoid/mit_humanoid.py index a7fb56cc..5db4ab7a 100644 --- a/gym/envs/mit_humanoid/mit_humanoid.py +++ b/gym/envs/mit_humanoid/mit_humanoid.py @@ -14,30 +14,28 @@ def _init_buffers(self): self.oscillators = torch.zeros(self.num_envs, 2, device=self.device) self.oscillator_obs = torch.zeros(self.num_envs, 4, device=self.device) self.oscillator_freq = torch.zeros(self.num_envs, 2, device=self.device) + self._init_sampled_history_buffers() - self.dof_pos_target_history = torch.zeros( - (self.num_envs, self.num_dof, 12), device=self.device - ) - self._init_history_buffers() - - def _init_history_buffers(self): - self.dof_pos_target_history = torch.zeros( - (self.num_envs, self.num_dof * self.cfg.env.history_length), + def _init_sampled_history_buffers(self): + self.sampled_history_dof_pos_target = torch.zeros( + (self.num_envs, self.num_dof * self.cfg.env.sampled_history_length), device=self.device, ) - self.dof_pos_history = torch.zeros( + self.sampled_history_dof_pos = torch.zeros( self.num_envs, - self.num_dof * self.cfg.env.history_length, + self.num_dof * self.cfg.env.sampled_history_length, device=self.device, ) - self.dof_vel_history = torch.zeros( + self.sampled_history_dof_vel = torch.zeros( self.num_envs, - self.num_dof * self.cfg.env.history_length, + self.num_dof * self.cfg.env.sampled_history_length, device=self.device, ) - self.history_counter = torch.zeros(self.num_envs, dtype=int, device=self.device) - self.history_threshold = int( - self.cfg.control.ctrl_frequency / self.cfg.env.history_frequency + self.sampled_history_counter = torch.zeros( + self.num_envs, dtype=int, device=self.device + ) + self.sampled_history_threshold = int( + self.cfg.control.ctrl_frequency / self.cfg.env.sampled_history_frequency ) def _reset_system(self, 
env_ids): @@ -56,14 +54,14 @@ def _reset_system(self, env_ids): self.oscillator_obs = torch.cat( (self.oscillators.cos(), self.oscillators.sin()), dim=1 ) - self._reset_history_buffers(env_ids) + self._reset_sampled_history_buffers(env_ids) return - def _reset_history_buffers(self, ids): - n = self.cfg.env.history_length - self.dof_pos_target_history[ids] = self.dof_pos_target[ids].tile(n) - self.dof_pos_history[ids] = self.dof_pos_target[ids].tile(n) - self.dof_vel_history[ids] = self.dof_pos_target[ids].tile(n) + def _reset_sampled_history_buffers(self, ids): + n = self.cfg.env.sampled_history_length + self.sampled_history_dof_pos_target[ids] = self.dof_pos_target[ids].tile(n) + self.sampled_history_dof_pos[ids] = self.dof_pos_target[ids].tile(n) + self.sampled_history_dof_vel[ids] = self.dof_pos_target[ids].tile(n) # compute_torques accounting for coupling, and filtering torques def _compute_torques(self): @@ -84,7 +82,7 @@ def _compute_torques(self): def _post_decimation_step(self): super()._post_decimation_step() self._step_oscillators() - self._update_history_buffers() + self._update_sampled_history_buffers() def _step_oscillators(self, dt=None): if dt is None: @@ -95,21 +93,24 @@ def _step_oscillators(self, dt=None): (torch.cos(self.oscillators), torch.sin(self.oscillators)), dim=1 ) - def _update_history_buffers(self): - self.history_counter += 1 + def _update_sampled_history_buffers(self): + self.sampled_history_counter += 1 ids = torch.nonzero( - self.history_counter == self.history_threshold, as_tuple=False + self.sampled_history_counter == self.sampled_history_threshold, + as_tuple=False, ).flatten() - self.dof_pos_target_history[ids].roll(self.num_dof, dims=1) # check - self.dof_pos_target_history[ids, : self.num_dof] = self.dof_pos_target[ids] - self.dof_pos_history[ids].roll(self.num_dof, dims=1) # check - self.dof_pos_history[ids, : self.num_dof] = self.dof_pos_target[ids] - self.dof_vel_history[ids].roll(self.num_dof, dims=1) # check - self.dof_vel_history[ids, : self.num_dof] = self.dof_pos_target[ids] + self.sampled_history_dof_pos_target[ids].roll(self.num_dof, dims=1) # check + self.sampled_history_dof_pos_target[ids, : self.num_dof] = self.dof_pos_target[ + ids + ] + self.sampled_history_dof_pos[ids].roll(self.num_dof, dims=1) # check + self.sampled_history_dof_pos[ids, : self.num_dof] = self.dof_pos_target[ids] + self.sampled_history_dof_vel[ids].roll(self.num_dof, dims=1) # check + self.sampled_history_dof_vel[ids, : self.num_dof] = self.dof_pos_target[ids] - self.history_counter[ids] = 0 + self.sampled_history_counter[ids] = 0 # --- rewards --- @@ -167,7 +168,7 @@ def _reward_dof_vel(self): def _reward_dof_near_home(self): return torch.mean( self._sqrdexp( - (self.dof_pos - self.default_dof_pos) / self.scales["dof_pos_obs"] + (self.dof_pos - self.default_dof_pos) / self.scales["dof_pos"] ), dim=1, ) @@ -184,15 +185,17 @@ def _reward_stand_still(self): return rew_vel + rew_pos - rew_base_vel * self._switch("stand") def _reward_stance(self): - phase = torch.maximum( - torch.zeros_like(self.oscillators), -self.oscillators.sin() - ) # positive during swing, negative during stance + # phase = torch.maximum( + # torch.zeros_like(self.oscillators), -self.oscillators.sin() + # ) # positive during swing, negative during stance + phase = self.smooth_sqr_wave(self.oscillators) return (phase * self._compute_grf()).mean(dim=1) def _reward_swing(self): - phase = torch.maximum( - torch.zeros_like(self.oscillators), self.oscillators.sin() - ) # positive during swing, negative 
during stance + # phase = torch.maximum( + # torch.zeros_like(self.oscillators), self.oscillators.sin() + # ) # positive during swing, negative during stance + phase = self.smooth_sqr_wave(self.oscillators + torch.pi) return -(phase * self._compute_grf()).mean(dim=1) def _compute_grf(self, grf_norm=True): @@ -202,6 +205,9 @@ def _compute_grf(self, grf_norm=True): else: return grf + def smooth_sqr_wave(self, phase, sigma=0.2): # sigma=0 is step function + return phase.sin() / (2 * torch.sqrt(phase.sin() ** 2.0 + sigma**2.0)) + 0.5 + def _reward_walk_freq(self): # Penalize deviation from base frequency return torch.mean( @@ -211,3 +217,14 @@ def _reward_walk_freq(self): ), dim=1, ) * self._switch("move") + + def _reward_hips_forward(self): + # reward hip motors for pointing forward + hip_yaw_abad = torch.stack((self.dof_pos[:, 0:2], self.dof_pos[:, 5:7]), dim=1) + hip_yaw_abad -= torch.stack( + (self.default_dof_pos[:, 0:2], self.default_dof_pos[:, 5:7]), dim=1 + ) + hip_yaw_abad /= torch.stack( + (self.scales["dof_pos"][0:2], self.scales["dof_pos"][5:7]), dim=1 + ) + return self._sqrdexp(hip_yaw_abad).sum(dim=1).mean(dim=1) diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index 1e079807..89359a8c 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -9,12 +9,11 @@ class MITHumanoidCfg(LeggedRobotCfg): class env(LeggedRobotCfg.env): num_envs = 4096 - num_observations = 49 + 3 * 18 # 121 num_actuators = 18 episode_length_s = 5 # episode length in seconds - history_length = 3 # n samples - history_frequency = 10 # [Hz] + sampled_history_length = 3 # n samples + sampled_history_frequency = 10 # [Hz] class terrain(LeggedRobotCfg.terrain): pass @@ -29,17 +28,17 @@ class init_state(LeggedRobotCfg.init_state): default_joint_angles = { "hip_yaw": 0.0, "hip_abad": 0.0, - "hip_pitch": -0.4, - "knee": 0.9, - "ankle": -0.45, + "hip_pitch": -0.667751, + "knee": 1.4087, + "ankle": -0.708876, "shoulder_pitch": 0.0, "shoulder_abad": 0.0, "shoulder_yaw": 0.0, - "elbow": 0.0, + "elbow": -1.25, } # * default COM for basic initialization - pos = [0.0, 0.0, 0.66] # x,y,z [m] + pos = [0.0, 0.0, 0.72] # x,y,z [m] rot = [0.0, 0.0, 0.0, 1.0] # x,y,z,w [quat] lin_vel = [0.0, 0.0, 0.0] # x,y,z [m/s] ang_vel = [0.0, 0.0, 0.0] # x,y,z [rad/s] @@ -47,22 +46,22 @@ class init_state(LeggedRobotCfg.init_state): # * initialization for random range setup dof_pos_range = { - "hip_yaw": [0.0, 0.0], - "hip_abad": [0.0, 0.0], - "hip_pitch": [-0.29, -0.25], - "knee": [0.67, 0.71], - "ankle": [-0.43, -0.39], + "hip_yaw": [-0.1, 0.1], + "hip_abad": [-0.1, 0.1], + "hip_pitch": [-0.2, 0.2], + "knee": [-0.2, 0.2], + "ankle": [-0.15, 0.15], "shoulder_pitch": [0.0, 0.0], "shoulder_abad": [0.0, 0.0], "shoulder_yaw": [0.0, 0.0], - "elbow": [0.0, 0.0], + "elbow": [0, 0], } dof_vel_range = { - "hip_yaw": [-0.0, 0.1], - "hip_abad": [-0.0, 0.1], - "hip_pitch": [-0.1, -0.1], - "knee": [-0.05, 0.05], - "ankle": [-0.05, 0.05], + "hip_yaw": [-0.1, 0.1], + "hip_abad": [-0.1, 0.1], + "hip_pitch": [-0.1, 0.1], + "knee": [-0.1, 0.1], + "ankle": [-0.1, 0.1], "shoulder_pitch": [0.0, 0.0], "shoulder_abad": [0.0, 0.0], "shoulder_yaw": [0.0, 0.0], @@ -72,15 +71,15 @@ class init_state(LeggedRobotCfg.init_state): root_pos_range = [ [0.0, 0.0], # x [0.0, 0.0], # y - [0.7, 0.8], # z + [0.72, 0.72], # z [-0.1, 0.1], # roll [-0.1, 0.1], # pitch [-0.1, 0.1], ] # yaw root_vel_range = [ - [-0.5, 2.5], # x - [-0.5, 0.5], # y + [-0.75, 2.75], # x + [-0.55, 
0.55], # y [-0.35, 0.1], # z [-0.35, 0.35], # roll [-0.35, 0.35], # pitch @@ -98,7 +97,7 @@ class control(LeggedRobotCfg.control): "shoulder_pitch": 40.0, "shoulder_abad": 40.0, "shoulder_yaw": 40.0, - "elbow": 40.0, + "elbow": 50.0, } # [N*m/rad] damping = { "hip_yaw": 2.0, @@ -118,19 +117,19 @@ class control(LeggedRobotCfg.control): filter_gain = 0.1586 # 1: no filtering, 0: wall class oscillator: - base_frequency = 1.5 # [Hz] + base_frequency = 2.0 # [Hz] class commands: resampling_time = 10.0 # time before command are changed[s] class ranges: - lin_vel_x = [-1.0, 4.0] # min max [m/s] - lin_vel_y = 1.0 # max [m/s] - yaw_vel = 1 # max [rad/s] + lin_vel_x = [-1.0, 4.0] # min max [m/s] [-0.75, 0.75] + lin_vel_y = 0.35 # max [m/s] + yaw_vel = 0.5 # max [rad/s] class push_robots: - toggle = False - interval_s = 2 + toggle = True + interval_s = 1 max_push_vel_xy = 0.6 push_box_dims = [0.1, 0.1, 0.3] # x,y,z [m] @@ -147,11 +146,11 @@ class asset(LeggedRobotCfg.asset): ) # foot_collisionbox_names = ["foot"] foot_name = "foot" - penalize_contacts_on = ["base", "arm"] + penalize_contacts_on = ["arm"] terminate_after_contacts_on = ["base"] - end_effector_names = ["hand", "foot"] + end_effector_names = ["hand", "foot"] # ?? flip_visual_attachments = False - self_collisions = 1 # 1 to disagble, 0 to enable...bitwise filter + self_collisions = 0 # 1 to disagble, 0 to enable...bitwise filter collapse_fixed_joints = False # * see GymDofDriveModeFlags # * (0 is none, 1 is pos tgt, 2 is vel tgt, 3 effort) @@ -198,7 +197,6 @@ class scaling(LeggedRobotCfg.scaling): 0.1, 0.1, ] - dof_pos_obs = dof_pos # # * Action scales dof_pos_target = dof_pos dof_vel = [ @@ -221,7 +219,7 @@ class scaling(LeggedRobotCfg.scaling): 1.0, 1.0, ] - dof_pos_history = 3 * dof_pos_obs + dof_pos_history = 3 * dof_pos class MITHumanoidRunnerCfg(LeggedRobotRunnerCfg): @@ -231,8 +229,8 @@ class MITHumanoidRunnerCfg(LeggedRobotRunnerCfg): class policy(LeggedRobotRunnerCfg.policy): disable_actions = False init_noise_std = 1.0 - actor_hidden_dims = [512, 256, 128] - critic_hidden_dims = [512, 256, 128] + actor_hidden_dims = [256, 256, 128] + critic_hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "tanh" @@ -245,6 +243,9 @@ class policy(LeggedRobotRunnerCfg.policy): "dof_pos_obs", "dof_vel", "dof_pos_history", + "sampled_history_dof_pos", + "sampled_history_dof_vel", + "sampled_history_dof_pos_target", "oscillator_obs", ] critic_obs = actor_obs @@ -252,18 +253,17 @@ class policy(LeggedRobotRunnerCfg.policy): actions = ["dof_pos_target"] class noise: - base_height = 0.05 - dof_pos_obs = 0.0 - dof_vel = 0.0 - base_lin_vel = 0.1 - base_ang_vel = 0.2 - projected_gravity = 0.05 - height_measurements = 0.1 + dof_pos = 0.005 + dof_vel = 0.05 + base_ang_vel = 0.025 + base_lin_vel = 0.025 + projected_gravity = 0.01 + feet_contact_state = 0.025 class reward: class weights: - tracking_ang_vel = 1.5 tracking_lin_vel = 3.0 + tracking_ang_vel = 1.5 orientation = 1.0 torques = 5.0e-4 min_base_height = 1.5 @@ -277,6 +277,7 @@ class weights: dof_near_home = 0.25 stance = 1.0 swing = 1.0 + hips_forward = 0.5 walk_freq = 0.0 class termination_weight: diff --git a/scripts/play.py b/scripts/play.py index 29a837ea..eb32855e 100644 --- a/scripts/play.py +++ b/scripts/play.py @@ -18,7 +18,7 @@ def setup(args): env_cfg.env.num_projectiles = 20 task_registry.make_gym_and_sim() env = task_registry.make_env(args.task, env_cfg) - env.cfg.init_state.reset_mode = "reset_to_basic" + # env.cfg.init_state.reset_mode = 
"reset_to_basic" train_cfg.runner.resume = True train_cfg.logging.enable_local_saving = False runner = task_registry.make_alg_runner(env, train_cfg) From 3ae04a080730dd24269d36c222ecc091e936d311 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Sat, 23 Mar 2024 12:29:28 -0400 Subject: [PATCH 08/26] properly roll histories, retune action_rate for mc --- gym/envs/base/legged_robot.py | 2 +- .../mini_cheetah/mini_cheetah_ref_config.py | 4 +-- gym/envs/mit_humanoid/mit_humanoid.py | 12 +++++-- gym/envs/mit_humanoid/mit_humanoid_config.py | 36 +++++++++---------- 4 files changed, 30 insertions(+), 24 deletions(-) diff --git a/gym/envs/base/legged_robot.py b/gym/envs/base/legged_robot.py index d635fea7..68178650 100644 --- a/gym/envs/base/legged_robot.py +++ b/gym/envs/base/legged_robot.py @@ -110,7 +110,7 @@ def _post_decimation_step(self): self.base_height = self.root_states[:, 2:3] - self.dof_pos_history.roll(self.num_actuators) + self.dof_pos_history = self.dof_pos_history.roll(self.num_actuators) self.dof_pos_history[:, : self.num_actuators] = ( self.dof_pos - self.default_dof_pos ) diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 7911206a..1a4e0aed 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -113,8 +113,8 @@ class weights: dof_vel = 0.0 min_base_height = 1.5 collision = 0.0 - action_rate = 0.01 - action_rate2 = 0.001 + action_rate = 1.0 + action_rate2 = 1.0 stand_still = 0.0 dof_pos_limits = 0.0 feet_contact_forces = 0.0 diff --git a/gym/envs/mit_humanoid/mit_humanoid.py b/gym/envs/mit_humanoid/mit_humanoid.py index 5db4ab7a..489b463d 100644 --- a/gym/envs/mit_humanoid/mit_humanoid.py +++ b/gym/envs/mit_humanoid/mit_humanoid.py @@ -101,13 +101,19 @@ def _update_sampled_history_buffers(self): as_tuple=False, ).flatten() - self.sampled_history_dof_pos_target[ids].roll(self.num_dof, dims=1) # check + self.sampled_history_dof_pos_target[ids] = torch.roll( + self.sampled_history_dof_pos_target[ids], self.num_dof, dims=1 + ) # check self.sampled_history_dof_pos_target[ids, : self.num_dof] = self.dof_pos_target[ ids ] - self.sampled_history_dof_pos[ids].roll(self.num_dof, dims=1) # check + self.sampled_history_dof_pos[ids] = torch.roll( + self.sampled_history_dof_pos[ids], self.num_dof, dims=1 + ) # check self.sampled_history_dof_pos[ids, : self.num_dof] = self.dof_pos_target[ids] - self.sampled_history_dof_vel[ids].roll(self.num_dof, dims=1) # check + self.sampled_history_dof_vel[ids] = torch.roll( + self.sampled_history_dof_vel[ids], self.num_dof, dims=1 + ) # check self.sampled_history_dof_vel[ids, : self.num_dof] = self.dof_pos_target[ids] self.sampled_history_counter[ids] = 0 diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index 89359a8c..54813f7b 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -38,7 +38,7 @@ class init_state(LeggedRobotCfg.init_state): } # * default COM for basic initialization - pos = [0.0, 0.0, 0.72] # x,y,z [m] + pos = [0.0, 0.0, 0.6] # x,y,z [m] rot = [0.0, 0.0, 0.0, 1.0] # x,y,z,w [quat] lin_vel = [0.0, 0.0, 0.0] # x,y,z [m/s] ang_vel = [0.0, 0.0, 0.0] # x,y,z [rad/s] @@ -46,11 +46,11 @@ class init_state(LeggedRobotCfg.init_state): # * initialization for random range setup dof_pos_range = { - "hip_yaw": [-0.1, 0.1], - "hip_abad": [-0.1, 0.1], - "hip_pitch": [-0.2, 0.2], - "knee": [-0.2, 0.2], - "ankle": 
[-0.15, 0.15], + "hip_yaw": [-0.0, 0.0], + "hip_abad": [-0.0, 0.0], + "hip_pitch": [-0.667751, -0.667751], + "knee": [1.4087, 1.4087], + "ankle": [-0.708876, -0.708876], "shoulder_pitch": [0.0, 0.0], "shoulder_abad": [0.0, 0.0], "shoulder_yaw": [0.0, 0.0], @@ -71,7 +71,7 @@ class init_state(LeggedRobotCfg.init_state): root_pos_range = [ [0.0, 0.0], # x [0.0, 0.0], # y - [0.72, 0.72], # z + [0.64, 0.7], # z [-0.1, 0.1], # roll [-0.1, 0.1], # pitch [-0.1, 0.1], @@ -250,7 +250,7 @@ class policy(LeggedRobotRunnerCfg.policy): ] critic_obs = actor_obs - actions = ["dof_pos_target"] + actions = ["dof_pos_target", "oscillator_freq"] class noise: dof_pos = 0.005 @@ -262,23 +262,23 @@ class noise: class reward: class weights: - tracking_lin_vel = 3.0 - tracking_ang_vel = 1.5 - orientation = 1.0 + # tracking_lin_vel = 3.0 + # tracking_ang_vel = 1.5 + # orientation = 1.0 torques = 5.0e-4 - min_base_height = 1.5 + # min_base_height = 1.5 action_rate = 0.01 action_rate2 = 0.001 lin_vel_z = 0.0 ang_vel_xy = 0.0 - dof_vel = 0.25 - stand_still = 0.25 + # dof_vel = 0.25 + # stand_still = 0.25 dof_pos_limits = 0.25 - dof_near_home = 0.25 + # dof_near_home = 0.25 stance = 1.0 swing = 1.0 hips_forward = 0.5 - walk_freq = 0.0 + walk_freq = 2.5 class termination_weight: termination = 15 @@ -288,11 +288,11 @@ class algorithm(LeggedRobotRunnerCfg.algorithm): value_loss_coef = 1.0 use_clipped_value_loss = True clip_param = 0.2 - entropy_coef = 0.002 + entropy_coef = 0.01 num_learning_epochs = 5 # * mini batch size = num_envs*nsteps / nminibatches num_mini_batches = 4 - learning_rate = 5.0e-4 + learning_rate = 5.0e-5 schedule = "adaptive" # could be adaptive, fixed gamma = 0.999 lam = 0.95 From ee05de48a620bcf598002a4b76779d9d92be3612 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Sun, 24 Mar 2024 18:30:55 -0400 Subject: [PATCH 09/26] enable arms in learnt urdf diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index 54813f7..368f2c3 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -142,7 +142,7 @@ class MITHumanoidCfg(LeggedRobotCfg): class asset(LeggedRobotCfg.asset): file = ( "{LEGGED_GYM_ROOT_DIR}/resources/robots/" - + "mit_humanoid/urdf/humanoid_R_sf.urdf" + + "mit_humanoid/urdf/humanoid_F_sf_learnt.urdf" ) # foot_collisionbox_names = ["foot"] foot_name = "foot" @@ -289,12 +289,12 @@ class MITHumanoidRunnerCfg(LeggedRobotRunnerCfg): use_clipped_value_loss = True clip_param = 0.2 entropy_coef = 0.01 - num_learning_epochs = 5 + num_learning_epochs = 4 # * mini batch size = num_envs*nsteps / nminibatches num_mini_batches = 4 - learning_rate = 5.0e-5 + learning_rate = 1.0e-6 schedule = "adaptive" # could be adaptive, fixed - gamma = 0.999 + gamma = 0.99 lam = 0.95 desired_kl = 0.01 max_grad_norm = 1.0 diff --git a/resources/robots/mit_humanoid/urdf/humanoid_F_sf_learnt.urdf b/resources/robots/mit_humanoid/urdf/humanoid_F_sf_learnt.urdf index 5e76f34..98a0773 100644 --- a/resources/robots/mit_humanoid/urdf/humanoid_F_sf_learnt.urdf +++ b/resources/robots/mit_humanoid/urdf/humanoid_F_sf_learnt.urdf @@ -570,7 +570,7 @@ Simple Foot: foot approximated as single box-contact --> + type="revolute"> @@ -615,7 +615,7 @@ Simple Foot: foot approximated as single box-contact --> + type="revolute"> @@ -668,7 +668,7 @@ Simple Foot: foot approximated as single box-contact --> + type="revolute"> @@ -719,7 +719,7 @@ Simple Foot: foot approximated as single box-contact --> + type="revolute"> @@ -798,7 +798,7 @@ 
Simple Foot: foot approximated as single box-contact --> + type="revolute"> @@ -843,7 +843,7 @@ Simple Foot: foot approximated as single box-contact --> + type="revolute"> @@ -896,7 +896,7 @@ Simple Foot: foot approximated as single box-contact --> + type="revolute"> @@ -947,7 +947,7 @@ Simple Foot: foot approximated as single box-contact --> + type="revolute"> --- gym/envs/mit_humanoid/mit_humanoid_config.py | 8 ++++---- .../mit_humanoid/urdf/humanoid_F_sf_learnt.urdf | 16 ++++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index 54813f7b..368f2c3b 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -142,7 +142,7 @@ class domain_rand: class asset(LeggedRobotCfg.asset): file = ( "{LEGGED_GYM_ROOT_DIR}/resources/robots/" - + "mit_humanoid/urdf/humanoid_R_sf.urdf" + + "mit_humanoid/urdf/humanoid_F_sf_learnt.urdf" ) # foot_collisionbox_names = ["foot"] foot_name = "foot" @@ -289,12 +289,12 @@ class algorithm(LeggedRobotRunnerCfg.algorithm): use_clipped_value_loss = True clip_param = 0.2 entropy_coef = 0.01 - num_learning_epochs = 5 + num_learning_epochs = 4 # * mini batch size = num_envs*nsteps / nminibatches num_mini_batches = 4 - learning_rate = 5.0e-5 + learning_rate = 1.0e-6 schedule = "adaptive" # could be adaptive, fixed - gamma = 0.999 + gamma = 0.99 lam = 0.95 desired_kl = 0.01 max_grad_norm = 1.0 diff --git a/resources/robots/mit_humanoid/urdf/humanoid_F_sf_learnt.urdf b/resources/robots/mit_humanoid/urdf/humanoid_F_sf_learnt.urdf index 5e76f34a..98a0773a 100644 --- a/resources/robots/mit_humanoid/urdf/humanoid_F_sf_learnt.urdf +++ b/resources/robots/mit_humanoid/urdf/humanoid_F_sf_learnt.urdf @@ -570,7 +570,7 @@ Simple Foot: foot approximated as single box-contact --> + type="revolute"> @@ -615,7 +615,7 @@ Simple Foot: foot approximated as single box-contact --> + type="revolute"> @@ -668,7 +668,7 @@ Simple Foot: foot approximated as single box-contact --> + type="revolute"> @@ -719,7 +719,7 @@ Simple Foot: foot approximated as single box-contact --> + type="revolute"> @@ -798,7 +798,7 @@ Simple Foot: foot approximated as single box-contact --> + type="revolute"> @@ -843,7 +843,7 @@ Simple Foot: foot approximated as single box-contact --> + type="revolute"> @@ -896,7 +896,7 @@ Simple Foot: foot approximated as single box-contact --> + type="revolute"> @@ -947,7 +947,7 @@ Simple Foot: foot approximated as single box-contact --> + type="revolute"> From 0aab1005ea71ad3b39712ad4a5dcbd81845d3c8c Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Sun, 24 Mar 2024 19:15:55 -0400 Subject: [PATCH 10/26] hotfix old runner, and set as default for humanoid. 
New runner is numerically unstable for humanoid --- gym/envs/mit_humanoid/mit_humanoid_config.py | 4 ++-- learning/runners/old_policy_runner.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index 368f2c3b..a3e113b6 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -224,7 +224,7 @@ class scaling(LeggedRobotCfg.scaling): class MITHumanoidRunnerCfg(LeggedRobotRunnerCfg): seed = -1 - runner_class_name = "OnPolicyRunner" + runner_class_name = "OldPolicyRunner" class policy(LeggedRobotRunnerCfg.policy): disable_actions = False @@ -301,7 +301,7 @@ class algorithm(LeggedRobotRunnerCfg.algorithm): class runner(LeggedRobotRunnerCfg.runner): policy_class_name = "ActorCritic" - algorithm_class_name = "PPO2" + algorithm_class_name = "PPO" num_steps_per_env = 32 max_iterations = 1000 run_name = "Standing" diff --git a/learning/runners/old_policy_runner.py b/learning/runners/old_policy_runner.py index 9a738b93..f42b26cf 100644 --- a/learning/runners/old_policy_runner.py +++ b/learning/runners/old_policy_runner.py @@ -70,7 +70,7 @@ def learn(self): logger.tic("learning") self.alg.update() logger.toc("learning") - logger.log_category() + logger.log_all_categories() logger.finish_iteration() logger.toc("iteration") From d86162e57f3d2c64ed28f1a4a172bc3b7a90efbd Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Fri, 29 Mar 2024 13:04:45 -0400 Subject: [PATCH 11/26] WIP: tuning, critic loss higher than expected --- gym/envs/mit_humanoid/mit_humanoid.py | 12 ++--- gym/envs/mit_humanoid/mit_humanoid_config.py | 48 ++++++++++---------- learning/algorithms/ppo2.py | 13 ++---- learning/runners/on_policy_runner.py | 4 ++ learning/utils/dict_utils.py | 23 ++++------ 5 files changed, 48 insertions(+), 52 deletions(-) diff --git a/gym/envs/mit_humanoid/mit_humanoid.py b/gym/envs/mit_humanoid/mit_humanoid.py index 489b463d..118ac170 100644 --- a/gym/envs/mit_humanoid/mit_humanoid.py +++ b/gym/envs/mit_humanoid/mit_humanoid.py @@ -172,12 +172,9 @@ def _reward_dof_vel(self): ) * self._switch("stand") def _reward_dof_near_home(self): - return torch.mean( - self._sqrdexp( - (self.dof_pos - self.default_dof_pos) / self.scales["dof_pos"] - ), - dim=1, - ) + return self._sqrdexp( + (self.dof_pos - self.default_dof_pos) / self.scales["dof_pos"] + ).mean(dim=1) def _reward_stand_still(self): """Penalize motion at zero commands""" @@ -233,4 +230,5 @@ def _reward_hips_forward(self): hip_yaw_abad /= torch.stack( (self.scales["dof_pos"][0:2], self.scales["dof_pos"][5:7]), dim=1 ) - return self._sqrdexp(hip_yaw_abad).sum(dim=1).mean(dim=1) + return (hip_yaw_abad).pow(2).mean(dim=1) + # return self._sqrdexp(hip_yaw_abad).sum(dim=1).mean(dim=1) diff --git a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index a3e113b6..033409e0 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -117,20 +117,20 @@ class control(LeggedRobotCfg.control): filter_gain = 0.1586 # 1: no filtering, 0: wall class oscillator: - base_frequency = 2.0 # [Hz] + base_frequency = 3.0 # [Hz] class commands: resampling_time = 10.0 # time before command are changed[s] class ranges: - lin_vel_x = [-1.0, 4.0] # min max [m/s] [-0.75, 0.75] - lin_vel_y = 0.35 # max [m/s] - yaw_vel = 0.5 # max [rad/s] + lin_vel_x = [-0.0, 0.0] # min max [m/s] [-0.75, 0.75] + lin_vel_y = 0.0 # max [m/s] + yaw_vel = 
0.0 # max [rad/s] class push_robots: toggle = True interval_s = 1 - max_push_vel_xy = 0.6 + max_push_vel_xy = 0.5 push_box_dims = [0.1, 0.1, 0.3] # x,y,z [m] class domain_rand: @@ -155,18 +155,18 @@ class asset(LeggedRobotCfg.asset): # * see GymDofDriveModeFlags # * (0 is none, 1 is pos tgt, 2 is vel tgt, 3 effort) default_dof_drive_mode = 3 - fix_base_link = False + fix_base_link = True disable_gravity = False disable_motors = False total_mass = 25.0 class reward_settings(LeggedRobotCfg.reward_settings): - soft_dof_pos_limit = 0.9 - soft_dof_vel_limit = 0.9 - soft_torque_limit = 0.9 + soft_dof_pos_limit = 0.8 + soft_dof_vel_limit = 0.8 + soft_torque_limit = 0.8 max_contact_force = 1500.0 base_height_target = BASE_HEIGHT_REF - tracking_sigma = 0.5 + tracking_sigma = 0.25 # a smooth switch based on |cmd| (commanded velocity). switch_scale = 0.5 @@ -229,28 +229,28 @@ class MITHumanoidRunnerCfg(LeggedRobotRunnerCfg): class policy(LeggedRobotRunnerCfg.policy): disable_actions = False init_noise_std = 1.0 - actor_hidden_dims = [256, 256, 128] - critic_hidden_dims = [256, 256, 128] + actor_hidden_dims = [512, 512, 128] + critic_hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid - activation = "tanh" + activation = "elu" actor_obs = [ "base_height", "base_lin_vel", "base_ang_vel", "projected_gravity", - "commands", + # "commands", "dof_pos_obs", "dof_vel", "dof_pos_history", - "sampled_history_dof_pos", - "sampled_history_dof_vel", - "sampled_history_dof_pos_target", + # "sampled_history_dof_pos", + # "sampled_history_dof_vel", + # "sampled_history_dof_pos_target", "oscillator_obs", ] critic_obs = actor_obs - actions = ["dof_pos_target", "oscillator_freq"] + actions = ["dof_pos_target"] class noise: dof_pos = 0.005 @@ -262,23 +262,23 @@ class noise: class reward: class weights: - # tracking_lin_vel = 3.0 + # tracking_lin_vel = 0.0 # tracking_ang_vel = 1.5 # orientation = 1.0 torques = 5.0e-4 # min_base_height = 1.5 - action_rate = 0.01 - action_rate2 = 0.001 + action_rate = 1e-3 + action_rate2 = 1e-3 lin_vel_z = 0.0 ang_vel_xy = 0.0 # dof_vel = 0.25 # stand_still = 0.25 dof_pos_limits = 0.25 - # dof_near_home = 0.25 + dof_near_home = 0.25 stance = 1.0 swing = 1.0 - hips_forward = 0.5 - walk_freq = 2.5 + hips_forward = 0.0 + walk_freq = 0.0 # 2.5 class termination_weight: termination = 15 diff --git a/learning/algorithms/ppo2.py b/learning/algorithms/ppo2.py index 69d6dcd2..efdce668 100644 --- a/learning/algorithms/ppo2.py +++ b/learning/algorithms/ppo2.py @@ -65,15 +65,12 @@ def train_mode(self): def act(self, obs, critic_obs): return self.actor.act(obs).detach() - def update(self, data, last_obs=None): - if last_obs is None: - last_values = None - else: - with torch.no_grad(): - last_values = self.critic.evaluate(last_obs).detach() - compute_generalized_advantages( - data, self.gamma, self.lam, self.critic, last_values + def update(self, data): + data["values"] = self.critic.evaluate(data["critic_obs"]) + data["advantages"] = compute_generalized_advantages( + data, self.gamma, self.lam, self.critic ) + data["returns"] = data["advantages"] + data["values"] self.update_critic(data) self.update_actor(data) diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 91e2ac26..6350eaa5 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -38,8 +38,10 @@ def learn(self): transition.update( { "actor_obs": actor_obs, + "next_actor_obs": actor_obs, "actions": self.alg.act(actor_obs, 
critic_obs), "critic_obs": critic_obs, + "next_critic_obs": critic_obs, "rewards": self.get_rewards({"termination": 0.0})["termination"], "dones": self.get_timed_out(), } @@ -89,6 +91,8 @@ def learn(self): transition.update( { + "next_actor_obs": actor_obs, + "next_critic_obs": critic_obs, "rewards": total_rewards, "timed_out": timed_out, "dones": dones, diff --git a/learning/utils/dict_utils.py b/learning/utils/dict_utils.py index 2d19e072..4f4ff420 100644 --- a/learning/utils/dict_utils.py +++ b/learning/utils/dict_utils.py @@ -23,16 +23,14 @@ def compute_MC_returns(data: TensorDict, gamma, critic=None): @torch.no_grad -def compute_generalized_advantages(data, gamma, lam, critic, last_values=None): - data.update({"values": critic.evaluate(data["critic_obs"])}) - - data.update({"advantages": torch.zeros_like(data["values"])}) - +def compute_generalized_advantages(data, gamma, lam, critic): + last_values = critic.evaluate(data["next_critic_obs"][-1]) + advantages = torch.zeros_like(data["values"]) if last_values is not None: # todo check this # since we don't have observations for the last step, need last value plugged in not_done = ~data["dones"][-1] - data["advantages"][-1] = ( + advantages[-1] = ( data["rewards"][-1] + gamma * data["values"][-1] * data["timed_out"][-1] + gamma * last_values * not_done @@ -47,15 +45,14 @@ def compute_generalized_advantages(data, gamma, lam, critic, last_values=None): + gamma * data["values"][k + 1] * not_done - data["values"][k] ) - data["advantages"][k] = ( - td_error + gamma * lam * not_done * data["advantages"][k + 1] - ) + advantages[k] = td_error + gamma * lam * not_done * advantages[k + 1] - data["returns"] = data["advantages"] + data["values"] + return normalize(advantages) - data["advantages"] = (data["advantages"] - data["advantages"].mean()) / ( - data["advantages"].std() + 1e-8 - ) + +@torch.no_grad +def normalize(input, eps=1e-8): + return (input - input.mean()) / (input.std() + eps) # todo change num_epochs to num_batches From 098989a175231ca70a814230c1adf9ca5aabbb9b Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Tue, 6 Aug 2024 13:19:37 -0400 Subject: [PATCH 12/26] fix hips_forward dimensionality --- gym/envs/mit_humanoid/mit_humanoid.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gym/envs/mit_humanoid/mit_humanoid.py b/gym/envs/mit_humanoid/mit_humanoid.py index 118ac170..265f0ff9 100644 --- a/gym/envs/mit_humanoid/mit_humanoid.py +++ b/gym/envs/mit_humanoid/mit_humanoid.py @@ -223,12 +223,12 @@ def _reward_walk_freq(self): def _reward_hips_forward(self): # reward hip motors for pointing forward - hip_yaw_abad = torch.stack((self.dof_pos[:, 0:2], self.dof_pos[:, 5:7]), dim=1) - hip_yaw_abad -= torch.stack( + hip_yaw_abad = torch.cat((self.dof_pos[:, 0:2], self.dof_pos[:, 5:7]), dim=1) + hip_yaw_abad -= torch.cat( (self.default_dof_pos[:, 0:2], self.default_dof_pos[:, 5:7]), dim=1 ) - hip_yaw_abad /= torch.stack( - (self.scales["dof_pos"][0:2], self.scales["dof_pos"][5:7]), dim=1 + hip_yaw_abad /= torch.cat( + (self.scales["dof_pos"][0:2], self.scales["dof_pos"][5:7]) ) return (hip_yaw_abad).pow(2).mean(dim=1) # return self._sqrdexp(hip_yaw_abad).sum(dim=1).mean(dim=1) From 0611bcd52ba326d8a23b929a679382a72f14b0f9 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Tue, 6 Aug 2024 13:43:10 -0400 Subject: [PATCH 13/26] lander env --- gym/envs/__init__.py | 6 +- gym/envs/mit_humanoid/lander.py | 195 +++++++++++++++ gym/envs/mit_humanoid/lander_config.py | 331 +++++++++++++++++++++++++ 3 files changed, 531 
insertions(+), 1 deletion(-) create mode 100644 gym/envs/mit_humanoid/lander.py create mode 100644 gym/envs/mit_humanoid/lander_config.py diff --git a/gym/envs/__init__.py b/gym/envs/__init__.py index f9d8b788..26710fce 100644 --- a/gym/envs/__init__.py +++ b/gym/envs/__init__.py @@ -19,7 +19,8 @@ "Anymal": ".anymal_c.anymal", "A1": ".a1.a1", "HumanoidRunning": ".mit_humanoid.humanoid_running", - "Pendulum": ".pendulum.pendulum" + "Pendulum": ".pendulum.pendulum", + "Lander": ".mit_humanoid.lander", } config_dict = { @@ -34,6 +35,7 @@ "HumanoidRunningCfg": ".mit_humanoid.humanoid_running_config", "PendulumCfg": ".pendulum.pendulum_config", "PendulumSACCfg": ".pendulum.pendulum_SAC_config", + "LanderCfg": ".mit_humanoid.lander_config", } runner_config_dict = { @@ -48,6 +50,7 @@ "HumanoidRunningRunnerCfg": ".mit_humanoid.humanoid_running_config", "PendulumRunnerCfg": ".pendulum.pendulum_config", "PendulumSACRunnerCfg": ".pendulum.pendulum_SAC_config", + "LanderRunnerCfg": ".mit_humanoid.lander_config", } task_dict = { @@ -77,6 +80,7 @@ "flat_anymal_c": ["Anymal", "AnymalCFlatCfg", "AnymalCFlatRunnerCfg"], "pendulum": ["Pendulum", "PendulumCfg", "PendulumRunnerCfg"], "sac_pendulum": ["Pendulum", "PendulumSACCfg", "PendulumSACRunnerCfg"], + "lander": ["Lander", "LanderCfg", "LanderRunnerCfg"], } for class_name, class_location in class_dict.items(): diff --git a/gym/envs/mit_humanoid/lander.py b/gym/envs/mit_humanoid/lander.py new file mode 100644 index 00000000..5856efb8 --- /dev/null +++ b/gym/envs/mit_humanoid/lander.py @@ -0,0 +1,195 @@ +import torch + +from gym.envs.base.legged_robot import LeggedRobot + +# from gym.envs.mit_humanoid.mit_humanoid import MIT_Humanoid +from .jacobian import _apply_coupling + + +class Lander(LeggedRobot): + def __init__(self, gym, sim, cfg, sim_params, sim_device, headless): + super().__init__(gym, sim, cfg, sim_params, sim_device, headless) + + def _init_buffers(self): + super()._init_buffers() + self._init_sampled_history_buffers() + + def _init_sampled_history_buffers(self): + self.sampled_history_dof_pos_target = torch.zeros( + (self.num_envs, self.num_dof * self.cfg.env.sampled_history_length), + device=self.device, + ) + self.sampled_history_dof_pos = torch.zeros( + self.num_envs, + self.num_dof * self.cfg.env.sampled_history_length, + device=self.device, + ) + self.sampled_history_dof_vel = torch.zeros( + self.num_envs, + self.num_dof * self.cfg.env.sampled_history_length, + device=self.device, + ) + self.sampled_history_counter = torch.zeros( + self.num_envs, dtype=int, device=self.device + ) + self.sampled_history_threshold = int( + self.cfg.control.ctrl_frequency / self.cfg.env.sampled_history_frequency + ) + + def _reset_system(self, env_ids): + if len(env_ids) == 0: + return + super()._reset_system(env_ids) + + self._reset_sampled_history_buffers(env_ids) + return + + def _reset_sampled_history_buffers(self, ids): + n = self.cfg.env.sampled_history_length + self.sampled_history_dof_pos_target[ids] = self.dof_pos_target[ids].tile(n) + self.sampled_history_dof_pos[ids] = self.dof_pos_target[ids].tile(n) + self.sampled_history_dof_vel[ids] = self.dof_pos_target[ids].tile(n) + + # compute_torques accounting for coupling, and filtering torques + def _compute_torques(self): + torques = _apply_coupling( + self.dof_pos, + self.dof_vel, + self.dof_pos_target + self.default_dof_pos, + self.dof_vel_target, + self.p_gains, + self.d_gains, + self.tau_ff, + ) + torques = torques.clip(-self.torque_limits, self.torque_limits) + return torques + + def 
_post_decimation_step(self): + super()._post_decimation_step() + self._update_sampled_history_buffers() + + def _update_sampled_history_buffers(self): + self.sampled_history_counter += 1 + + ids = torch.nonzero( + self.sampled_history_counter == self.sampled_history_threshold, + as_tuple=False, + ).flatten() + + self.sampled_history_dof_pos_target[ids] = torch.roll( + self.sampled_history_dof_pos_target[ids], self.num_dof, dims=1 + ) # check + self.sampled_history_dof_pos_target[ids, : self.num_dof] = self.dof_pos_target[ + ids + ] + self.sampled_history_dof_pos[ids] = torch.roll( + self.sampled_history_dof_pos[ids], self.num_dof, dims=1 + ) # check + self.sampled_history_dof_pos[ids, : self.num_dof] = self.dof_pos_target[ids] + self.sampled_history_dof_vel[ids] = torch.roll( + self.sampled_history_dof_vel[ids], self.num_dof, dims=1 + ) # check + self.sampled_history_dof_vel[ids, : self.num_dof] = self.dof_pos_target[ids] + + self.sampled_history_counter[ids] = 0 + + # --- rewards --- + + def _switch(self, mode=None): + c_vel = torch.linalg.norm(self.commands, dim=1) + switch = torch.exp( + -torch.square( + torch.max( + torch.zeros_like(c_vel), + c_vel - self.cfg.reward_settings.switch_scale, + ) + ) + / self.cfg.reward_settings.switch_scale + ) + if mode is None or mode == "stand": + return switch + elif mode == "move": + return 1 - switch + + def _reward_lin_vel_xy(self): + return torch.exp( + -torch.linalg.norm(self.commands[:, :2] - self.base_lin_vel[:, :2], dim=1) + ) + + def _reward_lin_vel_z(self): + # Penalize z axis base linear velocity w. squared exp + return self._sqrdexp(self.base_lin_vel[:, 2] / self.scales["base_lin_vel"]) + + def _reward_orientation(self): + # Penalize non flat base orientation + return torch.sum( + self._sqrdexp(torch.square(self.projected_gravity[:, :2])), dim=1 + ) + + def _reward_min_base_height(self): + """Squared exponential saturating at base_height target""" + error = self.base_height - self.cfg.reward_settings.base_height_target + error = torch.clamp(error, max=0, min=None).flatten() + return self._sqrdexp(error) + + def _reward_tracking_lin_vel(self): + """Tracking of linear velocity commands (xy axes)""" + # just use lin_vel? + error = self.commands[:, :2] - self.base_lin_vel[:, :2] + # * scale by (1+|cmd|): if cmd=0, no scaling. 
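        # * concretely, the factor below is 1/(1+|cmd|): at |cmd| = 2 m/s the error is
        # * scaled by 1/3, so a 0.3 m/s tracking error is penalized like a 0.1 m/s
        # * error at standstill.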
+ error *= 1.0 / (1.0 + torch.abs(self.commands[:, :2])) + return torch.mean(self._sqrdexp(error), dim=1) + + def _reward_dof_vel(self): + # Penalize dof velocities + return torch.mean( + self._sqrdexp(self.dof_vel / self.scales["dof_vel"]), dim=1 + ) * self._switch("stand") + + def _reward_dof_near_home(self): + return self._sqrdexp( + (self.dof_pos - self.default_dof_pos) / self.scales["dof_pos"] + ).mean(dim=1) + + def _reward_stand_still(self): + """Penalize motion at zero commands""" + # * normalize angles so we care about being within 5 deg + rew_pos = torch.mean( + self._sqrdexp((self.dof_pos - self.default_dof_pos) / torch.pi * 36), dim=1 + ) + rew_vel = torch.mean(self._sqrdexp(self.dof_vel), dim=1) + rew_base_vel = torch.mean(torch.square(self.base_lin_vel), dim=1) + rew_base_vel += torch.mean(torch.square(self.base_ang_vel), dim=1) + return rew_vel + rew_pos - rew_base_vel * self._switch("stand") + + def _compute_grf(self, grf_norm=True): + grf = torch.norm(self.contact_forces[:, self.feet_indices, :], dim=-1) + if grf_norm: + return torch.clamp_max(grf / self.cfg.asset.total_mass, 1.0) + else: + return grf + + def smooth_sqr_wave(self, phase, sigma=0.2): # sigma=0 is step function + return phase.sin() / (2 * torch.sqrt(phase.sin() ** 2.0 + sigma**2.0)) + 0.5 + + def _reward_walk_freq(self): + # Penalize deviation from base frequency + return torch.mean( + self._sqrdexp( + (self.oscillator_freq - self.cfg.oscillator.base_frequency) + / self.cfg.oscillator.base_frequency + ), + dim=1, + ) * self._switch("move") + + def _reward_hips_forward(self): + # reward hip motors for pointing forward + hip_yaw_abad = torch.stack((self.dof_pos[:, 0:2], self.dof_pos[:, 5:7]), dim=1) + hip_yaw_abad -= torch.stack( + (self.default_dof_pos[:, 0:2], self.default_dof_pos[:, 5:7]), dim=1 + ) + hip_yaw_abad /= torch.stack( + (self.scales["dof_pos"][0:2], self.scales["dof_pos"][5:7]), dim=1 + ) + return (hip_yaw_abad).pow(2).mean(dim=1) + # return self._sqrdexp(hip_yaw_abad).sum(dim=1).mean(dim=1) diff --git a/gym/envs/mit_humanoid/lander_config.py b/gym/envs/mit_humanoid/lander_config.py new file mode 100644 index 00000000..a4262f0d --- /dev/null +++ b/gym/envs/mit_humanoid/lander_config.py @@ -0,0 +1,331 @@ +from gym.envs.base.legged_robot_config import ( + LeggedRobotCfg, + LeggedRobotRunnerCfg, +) + +BASE_HEIGHT_REF = 0.80 + + +class LanderCfg(LeggedRobotCfg): + class env(LeggedRobotCfg.env): + num_envs = 4096 + num_actuators = 18 + episode_length_s = 5 # episode length in seconds + + sampled_history_length = 3 # n samples + sampled_history_frequency = 10 # [Hz] + + class terrain(LeggedRobotCfg.terrain): + pass + + class init_state(LeggedRobotCfg.init_state): + # * default setup chooses how the initial conditions are chosen. + # * "reset_to_basic" = a single position with added randomized noise. + # * "reset_to_range" = a range of joint positions and velocities. + # * "reset_to_traj" = feed in a trajectory to sample from. 
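        # * (minimal sketch of the "reset_to_range" behavior, assuming uniform
        # * sampling per joint; the exact base-class implementation may differ)
        # *     low, high = dof_pos_range[joint_name]
        # *     dof_pos[env_ids, j] = low + (high - low) * torch.rand(len(env_ids))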
+ reset_mode = "reset_to_range" + + default_joint_angles = { + "hip_yaw": 0.0, + "hip_abad": 0.0, + "hip_pitch": -0.667751, + "knee": 1.4087, + "ankle": -0.708876, + "shoulder_pitch": 0.0, + "shoulder_abad": 0.0, + "shoulder_yaw": 0.0, + "elbow": -1.25, + } + + # * default COM for basic initialization + pos = [0.0, 0.0, 0.6] # x,y,z [m] + rot = [0.0, 0.0, 0.0, 1.0] # x,y,z,w [quat] + lin_vel = [0.0, 0.0, 0.0] # x,y,z [m/s] + ang_vel = [0.0, 0.0, 0.0] # x,y,z [rad/s] + + # * initialization for random range setup + + dof_pos_range = { + "hip_yaw": [-0.0, 0.0], + "hip_abad": [-0.0, 0.0], + "hip_pitch": [-0.667751, -0.667751], + "knee": [1.4087, 1.4087], + "ankle": [-0.708876, -0.708876], + "shoulder_pitch": [0.0, 0.0], + "shoulder_abad": [0.0, 0.0], + "shoulder_yaw": [0.0, 0.0], + "elbow": [0, 0], + } + dof_vel_range = { + "hip_yaw": [-0.1, 0.1], + "hip_abad": [-0.1, 0.1], + "hip_pitch": [-0.1, 0.1], + "knee": [-0.1, 0.1], + "ankle": [-0.1, 0.1], + "shoulder_pitch": [0.0, 0.0], + "shoulder_abad": [0.0, 0.0], + "shoulder_yaw": [0.0, 0.0], + "elbow": [0.0, 0.0], + } + + root_pos_range = [ + [0.0, 0.0], # x + [0.0, 0.0], # y + [1.0, 1.5], # z + [-0.1, 0.1], # roll + [-0.1, 0.1], # pitch + [-0.1, 0.1], + ] # yaw + + root_vel_range = [ + [-0.75, 2.75], # x + [-0.55, 0.55], # y + [-2.5, 0.25], # z + [-0.35, 0.35], # roll + [-0.35, 0.35], # pitch + [-0.35, 0.35], # yaw + ] + + class control(LeggedRobotCfg.control): + # * PD Drive parameters: + stiffness = { + "hip_yaw": 30.0, + "hip_abad": 30.0, + "hip_pitch": 30.0, + "knee": 30.0, + "ankle": 30.0, + "shoulder_pitch": 40.0, + "shoulder_abad": 40.0, + "shoulder_yaw": 40.0, + "elbow": 50.0, + } # [N*m/rad] + damping = { + "hip_yaw": 2.0, + "hip_abad": 2.0, + "hip_pitch": 2.0, + "knee": 2.0, + "ankle": 2.0, + "shoulder_pitch": 2.0, + "shoulder_abad": 2.0, + "shoulder_yaw": 2.0, + "elbow": 1.0, + } # [N*m*s/rad] + + ctrl_frequency = 100 + desired_sim_frequency = 1000 + + # class oscillator: + # base_frequency = 3.0 # [Hz] + + class commands: + resampling_time = 10.0 # time before command are changed[s] + + class ranges: + lin_vel_x = [-0.0, 0.0] # min max [m/s] [-0.75, 0.75] + lin_vel_y = 0.0 # max [m/s] + yaw_vel = 0.0 # max [rad/s] + + class push_robots: + toggle = True + interval_s = 1 + max_push_vel_xy = 0.5 + push_box_dims = [0.1, 0.1, 0.3] # x,y,z [m] + + class domain_rand: + randomize_friction = True + friction_range = [0.5, 1.25] + randomize_base_mass = True + added_mass_range = [-1.0, 1.0] + + class asset(LeggedRobotCfg.asset): + file = ( + "{LEGGED_GYM_ROOT_DIR}/resources/robots/" + + "mit_humanoid/urdf/humanoid_F_sf_learnt.urdf" + ) + # foot_collisionbox_names = ["foot"] + foot_name = "foot" + penalize_contacts_on = ["arm"] + terminate_after_contacts_on = ["base"] + end_effector_names = ["hand", "foot"] # ?? + flip_visual_attachments = False + self_collisions = 0 # 1 to disagble, 0 to enable...bitwise filter + collapse_fixed_joints = False + # * see GymDofDriveModeFlags + # * (0 is none, 1 is pos tgt, 2 is vel tgt, 3 effort) + default_dof_drive_mode = 3 + fix_base_link = False + disable_gravity = False + disable_motors = False + total_mass = 25.0 + + class reward_settings(LeggedRobotCfg.reward_settings): + soft_dof_pos_limit = 0.8 + soft_dof_vel_limit = 0.8 + soft_torque_limit = 0.8 + max_contact_force = 1500.0 + base_height_target = BASE_HEIGHT_REF + tracking_sigma = 0.25 + + # a smooth switch based on |cmd| (commanded velocity). 
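        # As implemented in Lander._switch() (lander.py, same patch):
        #     switch = exp(-max(0, ||cmd|| - switch_scale)**2 / switch_scale)
        # so "stand"-gated rewards get full weight whenever ||cmd|| <= switch_scale,
        # and the "move" gate (1 - switch) takes over smoothly for larger commands.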
+ switch_scale = 0.5 + switch_threshold = 0.2 + + class scaling(LeggedRobotCfg.scaling): + base_ang_vel = 2.5 + base_lin_vel = 1.5 + commands = 1 + base_height = BASE_HEIGHT_REF + dof_pos = [ + 0.1, + 0.2, + 0.8, + 0.8, + 0.8, + 0.1, + 0.2, + 0.8, + 0.8, + 0.8, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + 0.1, + ] + # # * Action scales + dof_pos_target = dof_pos + dof_vel = [ + 0.5, + 1.0, + 4.0, + 4.0, + 2.0, + 0.5, + 1.0, + 4.0, + 4.0, + 2.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + ] + dof_pos_history = 3 * dof_pos + + +class LanderRunnerCfg(LeggedRobotRunnerCfg): + seed = -1 + runner_class_name = "OnPolicyRunner" + + class actor(LeggedRobotRunnerCfg.actor): + init_noise_std = 1.0 + hidden_dims = [512, 256, 128] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + smooth_exploration = False + + obs = [ + "base_height", + "base_lin_vel", + "base_ang_vel", + "projected_gravity", + # "commands", + "dof_pos_obs", + "dof_vel", + "dof_pos_history", + # "sampled_history_dof_pos", + # "sampled_history_dof_vel", + # "sampled_history_dof_pos_target", + # "oscillator_obs", + ] + normalize_obs = True + + actions = ["dof_pos_target"] + disable_actions = False + + class noise: + dof_pos = 0.005 + dof_vel = 0.05 + base_ang_vel = 0.025 + base_lin_vel = 0.025 + projected_gravity = 0.01 + feet_contact_state = 0.025 + + class critic(LeggedRobotRunnerCfg.critic): + hidden_dims = [512, 256, 128] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + activation = "elu" + + obs = [ + "base_height", + "base_lin_vel", + "base_ang_vel", + "projected_gravity", + "commands", + "dof_pos_obs", + "dof_vel", + "dof_pos_history", + ] + normalize_obs = True + + class reward: + class weights: + # tracking_lin_vel = 0.0 + # tracking_ang_vel = 1.5 + # orientation = 1.0 + torques = 5.0e-4 + # min_base_height = 1.5 + action_rate = 1e-3 + action_rate2 = 1e-3 + lin_vel_z = 0.0 + ang_vel_xy = 0.0 + # dof_vel = 0.25 + # stand_still = 0.25 + dof_pos_limits = 0.25 + dof_near_home = 0.25 + # stance = 1.0 + # swing = 1.0 + hips_forward = 0.0 + # walk_freq = 0.0 # 2.5 + + class termination_weight: + termination = 15 + + class algorithm(LeggedRobotRunnerCfg.algorithm): + # both + gamma = 0.99 + lam = 0.95 + # shared + batch_size = 2**15 + max_gradient_steps = 24 + # new + storage_size = 2**17 # new + batch_size = 2**15 # new + + clip_param = 0.2 + learning_rate = 1.0e-3 + max_grad_norm = 1.0 + # Critic + use_clipped_value_loss = True + # Actor + entropy_coef = 0.01 + schedule = "adaptive" # could be adaptive, fixed + desired_kl = 0.01 + lr_range = [1e-5, 1e-2] + lr_ratio = 1.5 + + class runner(LeggedRobotRunnerCfg.runner): + policy_class_name = "ActorCritic" + algorithm_class_name = "PPO2" + num_steps_per_env = 24 + max_iterations = 1000 + run_name = "Standing" + experiment_name = "Humanoid" + save_interval = 50 From fd308656c2aeaf81e02c79c1d19c22c976ebf02b Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Wed, 7 Aug 2024 14:17:00 -0400 Subject: [PATCH 14/26] unify humanoid and lander put jacobian into MIT_humanoid base environment, rather than import; importing breaks when running --original_config --- gym/envs/mit_humanoid/lander.py | 113 +++---------------------- gym/envs/mit_humanoid/lander_config.py | 36 ++++---- gym/envs/mit_humanoid/mit_humanoid.py | 99 ++++++++++------------ 3 files changed, 74 insertions(+), 174 deletions(-) diff --git a/gym/envs/mit_humanoid/lander.py b/gym/envs/mit_humanoid/lander.py index 5856efb8..972eb413 100644 --- a/gym/envs/mit_humanoid/lander.py 
+++ b/gym/envs/mit_humanoid/lander.py @@ -1,98 +1,13 @@ import torch -from gym.envs.base.legged_robot import LeggedRobot +# from gym.envs.base.legged_robot import LeggedRobot +from gym.envs.mit_humanoid.mit_humanoid import MIT_Humanoid -# from gym.envs.mit_humanoid.mit_humanoid import MIT_Humanoid -from .jacobian import _apply_coupling - -class Lander(LeggedRobot): +class Lander(MIT_Humanoid): def __init__(self, gym, sim, cfg, sim_params, sim_device, headless): super().__init__(gym, sim, cfg, sim_params, sim_device, headless) - def _init_buffers(self): - super()._init_buffers() - self._init_sampled_history_buffers() - - def _init_sampled_history_buffers(self): - self.sampled_history_dof_pos_target = torch.zeros( - (self.num_envs, self.num_dof * self.cfg.env.sampled_history_length), - device=self.device, - ) - self.sampled_history_dof_pos = torch.zeros( - self.num_envs, - self.num_dof * self.cfg.env.sampled_history_length, - device=self.device, - ) - self.sampled_history_dof_vel = torch.zeros( - self.num_envs, - self.num_dof * self.cfg.env.sampled_history_length, - device=self.device, - ) - self.sampled_history_counter = torch.zeros( - self.num_envs, dtype=int, device=self.device - ) - self.sampled_history_threshold = int( - self.cfg.control.ctrl_frequency / self.cfg.env.sampled_history_frequency - ) - - def _reset_system(self, env_ids): - if len(env_ids) == 0: - return - super()._reset_system(env_ids) - - self._reset_sampled_history_buffers(env_ids) - return - - def _reset_sampled_history_buffers(self, ids): - n = self.cfg.env.sampled_history_length - self.sampled_history_dof_pos_target[ids] = self.dof_pos_target[ids].tile(n) - self.sampled_history_dof_pos[ids] = self.dof_pos_target[ids].tile(n) - self.sampled_history_dof_vel[ids] = self.dof_pos_target[ids].tile(n) - - # compute_torques accounting for coupling, and filtering torques - def _compute_torques(self): - torques = _apply_coupling( - self.dof_pos, - self.dof_vel, - self.dof_pos_target + self.default_dof_pos, - self.dof_vel_target, - self.p_gains, - self.d_gains, - self.tau_ff, - ) - torques = torques.clip(-self.torque_limits, self.torque_limits) - return torques - - def _post_decimation_step(self): - super()._post_decimation_step() - self._update_sampled_history_buffers() - - def _update_sampled_history_buffers(self): - self.sampled_history_counter += 1 - - ids = torch.nonzero( - self.sampled_history_counter == self.sampled_history_threshold, - as_tuple=False, - ).flatten() - - self.sampled_history_dof_pos_target[ids] = torch.roll( - self.sampled_history_dof_pos_target[ids], self.num_dof, dims=1 - ) # check - self.sampled_history_dof_pos_target[ids, : self.num_dof] = self.dof_pos_target[ - ids - ] - self.sampled_history_dof_pos[ids] = torch.roll( - self.sampled_history_dof_pos[ids], self.num_dof, dims=1 - ) # check - self.sampled_history_dof_pos[ids, : self.num_dof] = self.dof_pos_target[ids] - self.sampled_history_dof_vel[ids] = torch.roll( - self.sampled_history_dof_vel[ids], self.num_dof, dims=1 - ) # check - self.sampled_history_dof_vel[ids, : self.num_dof] = self.dof_pos_target[ids] - - self.sampled_history_counter[ids] = 0 - # --- rewards --- def _switch(self, mode=None): @@ -172,24 +87,18 @@ def _compute_grf(self, grf_norm=True): def smooth_sqr_wave(self, phase, sigma=0.2): # sigma=0 is step function return phase.sin() / (2 * torch.sqrt(phase.sin() ** 2.0 + sigma**2.0)) + 0.5 - def _reward_walk_freq(self): - # Penalize deviation from base frequency - return torch.mean( - self._sqrdexp( - (self.oscillator_freq - 
self.cfg.oscillator.base_frequency) - / self.cfg.oscillator.base_frequency - ), - dim=1, - ) * self._switch("move") - def _reward_hips_forward(self): # reward hip motors for pointing forward - hip_yaw_abad = torch.stack((self.dof_pos[:, 0:2], self.dof_pos[:, 5:7]), dim=1) - hip_yaw_abad -= torch.stack( + hip_yaw_abad = torch.cat((self.dof_pos[:, 0:2], self.dof_pos[:, 5:7]), dim=1) + hip_yaw_abad -= torch.cat( (self.default_dof_pos[:, 0:2], self.default_dof_pos[:, 5:7]), dim=1 ) - hip_yaw_abad /= torch.stack( - (self.scales["dof_pos"][0:2], self.scales["dof_pos"][5:7]), dim=1 + hip_yaw_abad /= torch.cat( + (self.scales["dof_pos"][0:2], self.scales["dof_pos"][5:7]) ) return (hip_yaw_abad).pow(2).mean(dim=1) # return self._sqrdexp(hip_yaw_abad).sum(dim=1).mean(dim=1) + + def _reward_power(self): + power = self.torques * self.dof_vel + return power.pow(2).mean(dim=1) diff --git a/gym/envs/mit_humanoid/lander_config.py b/gym/envs/mit_humanoid/lander_config.py index a4262f0d..0de34b4c 100644 --- a/gym/envs/mit_humanoid/lander_config.py +++ b/gym/envs/mit_humanoid/lander_config.py @@ -10,7 +10,7 @@ class LanderCfg(LeggedRobotCfg): class env(LeggedRobotCfg.env): num_envs = 4096 num_actuators = 18 - episode_length_s = 5 # episode length in seconds + episode_length_s = 10 # episode length in seconds sampled_history_length = 3 # n samples sampled_history_frequency = 10 # [Hz] @@ -71,7 +71,7 @@ class init_state(LeggedRobotCfg.init_state): root_pos_range = [ [0.0, 0.0], # x [0.0, 0.0], # y - [1.0, 1.5], # z + [0.64, 1.5], # z [-0.1, 0.1], # roll [-0.1, 0.1], # pitch [-0.1, 0.1], @@ -144,7 +144,7 @@ class asset(LeggedRobotCfg.asset): ) # foot_collisionbox_names = ["foot"] foot_name = "foot" - penalize_contacts_on = ["arm"] + penalize_contacts_on = ["arm", "hand", "shoulder"] terminate_after_contacts_on = ["base"] end_effector_names = ["hand", "foot"] # ?? 
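        # Note (assumption, following the usual LeggedRobot conventions): contacts
        # on the penalize_contacts_on bodies feed the `collision` penalty, while
        # terminate_after_contacts_on selects the termination contact indices
        # checked in _check_terminations_and_timeouts (the episode terminates once
        # their contact-force norm exceeds 1 N).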
flip_visual_attachments = False @@ -228,7 +228,7 @@ class actor(LeggedRobotRunnerCfg.actor): init_noise_std = 1.0 hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid - activation = "elu" + activation = "tanh" smooth_exploration = False obs = [ @@ -236,16 +236,15 @@ class actor(LeggedRobotRunnerCfg.actor): "base_lin_vel", "base_ang_vel", "projected_gravity", - # "commands", + "commands", "dof_pos_obs", "dof_vel", "dof_pos_history", - # "sampled_history_dof_pos", - # "sampled_history_dof_vel", - # "sampled_history_dof_pos_target", - # "oscillator_obs", + "sampled_history_dof_pos", + "sampled_history_dof_vel", + "sampled_history_dof_pos_target", ] - normalize_obs = True + normalize_obs = False actions = ["dof_pos_target"] disable_actions = False @@ -261,10 +260,10 @@ class noise: class critic(LeggedRobotRunnerCfg.critic): hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid - activation = "elu" + activation = "tanh" obs = [ - "base_height", + # "base_height", "base_lin_vel", "base_ang_vel", "projected_gravity", @@ -272,21 +271,25 @@ class critic(LeggedRobotRunnerCfg.critic): "dof_pos_obs", "dof_vel", "dof_pos_history", + "sampled_history_dof_pos", + "sampled_history_dof_vel", + "sampled_history_dof_pos_target", ] - normalize_obs = True + normalize_obs = False class reward: class weights: # tracking_lin_vel = 0.0 # tracking_ang_vel = 1.5 # orientation = 1.0 - torques = 5.0e-4 - # min_base_height = 1.5 + torques = 5.0e-5 + power = 0 # 1.0e-2 + min_base_height = 1.5 action_rate = 1e-3 action_rate2 = 1e-3 lin_vel_z = 0.0 ang_vel_xy = 0.0 - # dof_vel = 0.25 + dof_vel = 0.5 # stand_still = 0.25 dof_pos_limits = 0.25 dof_near_home = 0.25 @@ -294,6 +297,7 @@ class weights: # swing = 1.0 hips_forward = 0.0 # walk_freq = 0.0 # 2.5 + collision = 1.0 class termination_weight: termination = 15 diff --git a/gym/envs/mit_humanoid/mit_humanoid.py b/gym/envs/mit_humanoid/mit_humanoid.py index 265f0ff9..a580fd57 100644 --- a/gym/envs/mit_humanoid/mit_humanoid.py +++ b/gym/envs/mit_humanoid/mit_humanoid.py @@ -1,8 +1,6 @@ import torch from gym.envs.base.legged_robot import LeggedRobot -from .jacobian import _apply_coupling -from gym.utils import exp_avg_filter class MIT_Humanoid(LeggedRobot): @@ -11,9 +9,6 @@ def __init__(self, gym, sim, cfg, sim_params, sim_device, headless): def _init_buffers(self): super()._init_buffers() - self.oscillators = torch.zeros(self.num_envs, 2, device=self.device) - self.oscillator_obs = torch.zeros(self.num_envs, 4, device=self.device) - self.oscillator_freq = torch.zeros(self.num_envs, 2, device=self.device) self._init_sampled_history_buffers() def _init_sampled_history_buffers(self): @@ -38,22 +33,51 @@ def _init_sampled_history_buffers(self): self.cfg.control.ctrl_frequency / self.cfg.env.sampled_history_frequency ) + def _apply_coupling(self, q, qd, q_des, qd_des, kp, kd, tau_ff): + # Create a Jacobian matrix and move it to the same device as input tensors + J = torch.eye(q.shape[-1]).to(q.device) + J[4, 3] = 1 + J[9, 8] = 1 + + # Perform transformations using Jacobian + q = torch.matmul(q, J.T) + qd = torch.matmul(qd, J.T) + q_des = torch.matmul(q_des, J.T) + qd_des = torch.matmul(qd_des, J.T) + + # Inverse of the transpose of Jacobian + J_inv_T = torch.inverse(J.T) + + # Compute feed-forward torques + tau_ff = torch.matmul(J_inv_T, tau_ff.T).T + + # Compute kp and kd terms + kp = torch.diagonal( + torch.matmul( + torch.matmul(J_inv_T, torch.diag_embed(kp, dim1=-2, dim2=-1)), J_inv_T.T + ), + dim1=-2, + 
dim2=-1, + ) + + kd = torch.diagonal( + torch.matmul( + torch.matmul(J_inv_T, torch.diag_embed(kd, dim1=-2, dim2=-1)), J_inv_T.T + ), + dim1=-2, + dim2=-1, + ) + + # Compute torques + torques = kp * (q_des - q) + kd * (qd_des - qd) + tau_ff + torques = torch.matmul(torques, J) + + return torques + def _reset_system(self, env_ids): if len(env_ids) == 0: return super()._reset_system(env_ids) - # reset oscillators, with a pi phase shift between left and right - self.oscillators[env_ids, 0] = ( - torch.rand(len(env_ids), device=self.device) * 2 * torch.pi - ) - self.oscillators[env_ids, 1] = self.oscillators[env_ids, 0] + torch.pi - self.oscillators = torch.remainder(self.oscillators, 2 * torch.pi) - # reset oscillator velocities to base freq - self.oscillator_freq[env_ids] = self.cfg.oscillator.base_frequency - # recompute oscillator observations - self.oscillator_obs = torch.cat( - (self.oscillators.cos(), self.oscillators.sin()), dim=1 - ) self._reset_sampled_history_buffers(env_ids) return @@ -65,7 +89,7 @@ def _reset_sampled_history_buffers(self, ids): # compute_torques accounting for coupling, and filtering torques def _compute_torques(self): - torques = _apply_coupling( + torques = self._apply_coupling( self.dof_pos, self.dof_vel, self.dof_pos_target + self.default_dof_pos, @@ -74,25 +98,12 @@ def _compute_torques(self): self.d_gains, self.tau_ff, ) - torques = torques.clip(-self.torque_limits, self.torque_limits) - return exp_avg_filter(torques, self.torques, self.cfg.control.filter_gain) - - # oscillator integration + return torques.clip(-self.torque_limits, self.torque_limits) def _post_decimation_step(self): super()._post_decimation_step() - self._step_oscillators() self._update_sampled_history_buffers() - def _step_oscillators(self, dt=None): - if dt is None: - dt = self.dt - self.oscillators += (self.oscillator_freq * 2 * torch.pi) * dt - self.oscillators = torch.remainder(self.oscillators, 2 * torch.pi) - self.oscillator_obs = torch.cat( - (torch.cos(self.oscillators), torch.sin(self.oscillators)), dim=1 - ) - def _update_sampled_history_buffers(self): self.sampled_history_counter += 1 @@ -187,20 +198,6 @@ def _reward_stand_still(self): rew_base_vel += torch.mean(torch.square(self.base_ang_vel), dim=1) return rew_vel + rew_pos - rew_base_vel * self._switch("stand") - def _reward_stance(self): - # phase = torch.maximum( - # torch.zeros_like(self.oscillators), -self.oscillators.sin() - # ) # positive during swing, negative during stance - phase = self.smooth_sqr_wave(self.oscillators) - return (phase * self._compute_grf()).mean(dim=1) - - def _reward_swing(self): - # phase = torch.maximum( - # torch.zeros_like(self.oscillators), self.oscillators.sin() - # ) # positive during swing, negative during stance - phase = self.smooth_sqr_wave(self.oscillators + torch.pi) - return -(phase * self._compute_grf()).mean(dim=1) - def _compute_grf(self, grf_norm=True): grf = torch.norm(self.contact_forces[:, self.feet_indices, :], dim=-1) if grf_norm: @@ -211,16 +208,6 @@ def _compute_grf(self, grf_norm=True): def smooth_sqr_wave(self, phase, sigma=0.2): # sigma=0 is step function return phase.sin() / (2 * torch.sqrt(phase.sin() ** 2.0 + sigma**2.0)) + 0.5 - def _reward_walk_freq(self): - # Penalize deviation from base frequency - return torch.mean( - self._sqrdexp( - (self.oscillator_freq - self.cfg.oscillator.base_frequency) - / self.cfg.oscillator.base_frequency - ), - dim=1, - ) * self._switch("move") - def _reward_hips_forward(self): # reward hip motors for pointing forward hip_yaw_abad = 
torch.cat((self.dof_pos[:, 0:2], self.dof_pos[:, 5:7]), dim=1) From be2e990c6ec3d2887e3f89e856e66d5284f5bdad Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Wed, 4 Sep 2024 09:58:49 -0400 Subject: [PATCH 15/26] wip --- gym/envs/mit_humanoid/lander.py | 16 +++++++++++++++ gym/envs/mit_humanoid/lander_config.py | 28 +++++++++----------------- 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/gym/envs/mit_humanoid/lander.py b/gym/envs/mit_humanoid/lander.py index 972eb413..f86d30f4 100644 --- a/gym/envs/mit_humanoid/lander.py +++ b/gym/envs/mit_humanoid/lander.py @@ -2,12 +2,28 @@ # from gym.envs.base.legged_robot import LeggedRobot from gym.envs.mit_humanoid.mit_humanoid import MIT_Humanoid +from isaacgym.torch_utils import torch_rand_float class Lander(MIT_Humanoid): def __init__(self, gym, sim, cfg, sim_params, sim_device, headless): super().__init__(gym, sim, cfg, sim_params, sim_device, headless) + def _resample_commands(self, env_ids): + """Randommly select commands of some environments + + Args: + env_ids (List[int]): Environments ids for which new commands are needed + """ + if len(env_ids) == 0: + return + super()._resample_commands(env_ids) + # * with 75% chance, reset to 0 + self.commands[env_ids, :] *= ( + torch_rand_float(0, 1, (len(env_ids), 1), device=self.device).squeeze(1) + < 0.25 + ).unsqueeze(1) + # --- rewards --- def _switch(self, mode=None): diff --git a/gym/envs/mit_humanoid/lander_config.py b/gym/envs/mit_humanoid/lander_config.py index 0de34b4c..8219d459 100644 --- a/gym/envs/mit_humanoid/lander_config.py +++ b/gym/envs/mit_humanoid/lander_config.py @@ -112,7 +112,7 @@ class control(LeggedRobotCfg.control): } # [N*m*s/rad] ctrl_frequency = 100 - desired_sim_frequency = 1000 + desired_sim_frequency = 500 # class oscillator: # base_frequency = 3.0 # [Hz] @@ -121,13 +121,13 @@ class commands: resampling_time = 10.0 # time before command are changed[s] class ranges: - lin_vel_x = [-0.0, 0.0] # min max [m/s] [-0.75, 0.75] - lin_vel_y = 0.0 # max [m/s] - yaw_vel = 0.0 # max [rad/s] + lin_vel_x = [-2.0, 2.0] # min max [m/s] [-0.75, 0.75] + lin_vel_y = 0.3 # max [m/s] + yaw_vel = 1.0 # max [rad/s] class push_robots: toggle = True - interval_s = 1 + interval_s = 2 max_push_vel_xy = 0.5 push_box_dims = [0.1, 0.1, 0.3] # x,y,z [m] @@ -146,13 +146,9 @@ class asset(LeggedRobotCfg.asset): foot_name = "foot" penalize_contacts_on = ["arm", "hand", "shoulder"] terminate_after_contacts_on = ["base"] - end_effector_names = ["hand", "foot"] # ?? 
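        # Note: total_mass (set a few lines below) is what _compute_grf() divides
        # the per-foot ground-reaction forces by before clamping to 1.0, so the
        # normalized GRF saturates once a foot carries more than total_mass newtons.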
flip_visual_attachments = False self_collisions = 0 # 1 to disagble, 0 to enable...bitwise filter collapse_fixed_joints = False - # * see GymDofDriveModeFlags - # * (0 is none, 1 is pos tgt, 2 is vel tgt, 3 effort) - default_dof_drive_mode = 3 fix_base_link = False disable_gravity = False disable_motors = False @@ -279,24 +275,18 @@ class critic(LeggedRobotRunnerCfg.critic): class reward: class weights: - # tracking_lin_vel = 0.0 - # tracking_ang_vel = 1.5 - # orientation = 1.0 torques = 5.0e-5 - power = 0 # 1.0e-2 + power = 1e-6 # 1.0e-2 min_base_height = 1.5 + lin_vel_xy = 1.0 action_rate = 1e-3 action_rate2 = 1e-3 lin_vel_z = 0.0 ang_vel_xy = 0.0 dof_vel = 0.5 - # stand_still = 0.25 dof_pos_limits = 0.25 - dof_near_home = 0.25 - # stance = 1.0 - # swing = 1.0 + dof_near_home = 0.75 hips_forward = 0.0 - # walk_freq = 0.0 # 2.5 collision = 1.0 class termination_weight: @@ -330,6 +320,6 @@ class runner(LeggedRobotRunnerCfg.runner): algorithm_class_name = "PPO2" num_steps_per_env = 24 max_iterations = 1000 - run_name = "Standing" + run_name = "lander" experiment_name = "Humanoid" save_interval = 50 From 27991100e4388ea3df415b75f4b022a599747b51 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Wed, 4 Sep 2024 15:42:46 -0400 Subject: [PATCH 16/26] Refactor for action frequency handled in runner only for lander: needs to overload _check_terminations_and_timeouts to not actually reset so that reset can be handled only by runner (without breaking backwards compatibility) Also keeps a list of rewards, for finer reward integration, and can also do traj stats --- gym/envs/mit_humanoid/lander.py | 7 ++ gym/envs/mit_humanoid/lander_config.py | 15 +-- gym/utils/helpers.py | 2 +- learning/runners/my_runner.py | 128 +++++++++++++++++++++---- 4 files changed, 127 insertions(+), 25 deletions(-) diff --git a/gym/envs/mit_humanoid/lander.py b/gym/envs/mit_humanoid/lander.py index f86d30f4..86d37321 100644 --- a/gym/envs/mit_humanoid/lander.py +++ b/gym/envs/mit_humanoid/lander.py @@ -24,6 +24,13 @@ def _resample_commands(self, env_ids): < 0.25 ).unsqueeze(1) + def _check_terminations_and_timeouts(self): + """Check if environments need to be reset""" + contact_forces = self.contact_forces[:, self.termination_contact_indices, :] + self.terminated |= torch.any(torch.norm(contact_forces, dim=-1) > 1.0, dim=1) + self.timed_out = self.episode_length_buf >= self.max_episode_length + # self.to_be_reset = self.timed_out | self.terminated + # --- rewards --- def _switch(self, mode=None): diff --git a/gym/envs/mit_humanoid/lander_config.py b/gym/envs/mit_humanoid/lander_config.py index 8219d459..48c2eb53 100644 --- a/gym/envs/mit_humanoid/lander_config.py +++ b/gym/envs/mit_humanoid/lander_config.py @@ -111,7 +111,7 @@ class control(LeggedRobotCfg.control): "elbow": 1.0, } # [N*m*s/rad] - ctrl_frequency = 100 + ctrl_frequency = 500 desired_sim_frequency = 500 # class oscillator: @@ -218,13 +218,15 @@ class scaling(LeggedRobotCfg.scaling): class LanderRunnerCfg(LeggedRobotRunnerCfg): seed = -1 - runner_class_name = "OnPolicyRunner" + runner_class_name = "MyRunner" class actor(LeggedRobotRunnerCfg.actor): + frequency = 100 init_noise_std = 1.0 hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid - activation = "tanh" + activation = ["elu", "elu", "tanh"] + layer_norm = True smooth_exploration = False obs = [ @@ -256,7 +258,8 @@ class noise: class critic(LeggedRobotRunnerCfg.critic): hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid - activation = "tanh" 
+ activation = "elu" + layer_norm = True obs = [ # "base_height", @@ -290,7 +293,7 @@ class weights: collision = 1.0 class termination_weight: - termination = 15 + termination = 1.0 class algorithm(LeggedRobotRunnerCfg.algorithm): # both @@ -319,7 +322,7 @@ class runner(LeggedRobotRunnerCfg.runner): policy_class_name = "ActorCritic" algorithm_class_name = "PPO2" num_steps_per_env = 24 - max_iterations = 1000 + max_iterations = 500 run_name = "lander" experiment_name = "Humanoid" save_interval = 50 diff --git a/gym/utils/helpers.py b/gym/utils/helpers.py index 90af8bce..8e74d93c 100644 --- a/gym/utils/helpers.py +++ b/gym/utils/helpers.py @@ -191,7 +191,7 @@ def get_args(custom_parameters=None): { "name": "--task", "type": str, - "default": "cartpole", + "default": "lander", "help": "Resume training or start testing from a checkpoint. " "Overrides config file if provided.", }, diff --git a/learning/runners/my_runner.py b/learning/runners/my_runner.py index d97dee8d..96eedc77 100644 --- a/learning/runners/my_runner.py +++ b/learning/runners/my_runner.py @@ -19,14 +19,18 @@ def __init__(self, env, train_cfg, device="cpu"): self.device, ) - def learn(self): + def learn(self, states_to_log_dict=None): self.set_up_logger() + n_policy_steps = int((1 / self.env.dt) / self.actor_cfg["frequency"]) + assert n_policy_steps > 0, "actor frequency should be less than ctrl_freq" + rewards_dict = {} + rewards_list = [dict() for x in range(n_policy_steps)] self.alg.switch_to_train() - actor_obs = self.get_obs(self.actor_cfg["actor_obs"]) - critic_obs = self.get_obs(self.critic_cfg["critic_obs"]) + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) tot_iter = self.it + self.num_learning_iterations self.save() @@ -35,8 +39,10 @@ def learn(self): transition.update( { "actor_obs": actor_obs, - "actions": self.alg.act(actor_obs, critic_obs), + "next_actor_obs": actor_obs, + "actions": self.alg.act(actor_obs), "critic_obs": critic_obs, + "next_critic_obs": critic_obs, "rewards": self.get_rewards({"termination": 0.0})["termination"], "dones": self.get_timed_out(), } @@ -52,10 +58,17 @@ def learn(self): for self.it in range(self.it + 1, tot_iter + 1): logger.tic("iteration") logger.tic("collection") + + # * Simulate environment and log states + if states_to_log_dict is not None: + it_idx = self.it - 1 + if it_idx % 10 == 0: + self.sim_and_log_states(states_to_log_dict, it_idx) + # * Rollout with torch.inference_mode(): for i in range(self.num_steps_per_env): - actions = self.alg.act(actor_obs, critic_obs) + actions = self.alg.act(actor_obs) self.set_actions( self.actor_cfg["actions"], actions, @@ -69,33 +82,48 @@ def learn(self): "critic_obs": critic_obs, } ) - - self.env.step() + for step in range(n_policy_steps): + self.env.step() + # put reward integration here + self.update_rewards(rewards_list[step], self.env.terminated) + # handle time-outs/terminations happening inside here with mask + else: + # catch and reset failed envs + to_be_reset = self.env.timed_out | self.env.terminated + env_ids = (to_be_reset).nonzero(as_tuple=False).flatten() + self.env._reset_idx(env_ids) actor_obs = self.get_noisy_obs( - self.actor_cfg["actor_obs"], self.actor_cfg["noise"] + self.actor_cfg["obs"], self.actor_cfg["noise"] ) - critic_obs = self.get_obs(self.critic_cfg["critic_obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + # * get time_outs - timed_out = self.get_timed_out() - terminated = self.get_terminated() - dones = timed_out | terminated - 
self.update_rewards(rewards_dict, terminated) - total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) + self.update_rewards(rewards_dict, self.env.terminated) + + # integrate rewards from list + total_rewards = torch.stack( + tuple( + torch.stack(tuple(rewards_dict.values())) + for rewards_dict in rewards_list + ) + ).sum(dim=(0, 1)) transition.update( { + "next_actor_obs": actor_obs, + "next_critic_obs": critic_obs, "rewards": total_rewards, - "timed_out": timed_out, - "dones": dones, + "timed_out": self.env.timed_out, + "dones": self.env.timed_out | self.env.terminated, } ) storage.add_transitions(transition) logger.log_rewards(rewards_dict) logger.log_rewards({"total_rewards": total_rewards}) - logger.finish_step(dones) + logger.finish_step(self.env.timed_out | self.env.terminated) logger.toc("collection") logger.tic("learning") @@ -113,6 +141,21 @@ def learn(self): self.save() self.save() + @torch.no_grad + def burn_in_normalization(self, n_iterations=100): + actor_obs = self.get_obs(self.actor_cfg["obs"]) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + for _ in range(n_iterations): + actions = self.alg.act(actor_obs) + self.set_actions(self.actor_cfg["actions"], actions) + self.env.step() + actor_obs = self.get_noisy_obs( + self.actor_cfg["obs"], self.actor_cfg["noise"] + ) + critic_obs = self.get_obs(self.critic_cfg["obs"]) + self.alg.critic.evaluate(critic_obs) + self.env.reset() + def set_up_logger(self): logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) logger.register_rewards( @@ -120,8 +163,57 @@ def set_up_logger(self): ) logger.register_rewards(["total_rewards"]) logger.register_category( - "algorithm", self.alg, ["mean_value_loss", "mean_surrogate_loss"] + "algorithm", + self.alg, + ["mean_value_loss", "mean_surrogate_loss", "learning_rate"], ) logger.register_category("actor", self.alg.actor, ["action_std", "entropy"]) logger.attach_torch_obj_to_wandb((self.alg.actor, self.alg.critic)) + + def update_rewards(self, rewards_dict, terminated): + # sum existing rewards with new rewards + + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["termination_weight"], mask=terminated + ) + ) + rewards_dict.update( + self.get_rewards( + self.critic_cfg["reward"]["weights"], + modifier=self.env.dt, + mask=~terminated, + ) + ) + + def sim_and_log_states(self, states_to_log_dict, it_idx): + # Simulate environment for as many steps as expected in the dict. + # Log states to the dict, as well as whether the env terminated. 
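+        # Assumed layout (not stated in this patch): each entry of
+        # states_to_log_dict is a tensor of shape
+        # (num_logged_envs, num_logged_iterations, steps, state_dim),
+        # and a "terminated" entry must exist, since its third
+        # dimension fixes how many sim steps are rolled out below.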
+ steps = states_to_log_dict["terminated"].shape[2] + actor_obs = self.get_obs(self.policy_cfg["actor_obs"]) + + with torch.inference_mode(): + for i in range(steps): + actions = self.alg.act(actor_obs) + self.set_actions( + self.policy_cfg["actions"], + actions, + self.policy_cfg["disable_actions"], + ) + + self.env.step() + + actor_obs = self.get_noisy_obs( + self.policy_cfg["actor_obs"], self.policy_cfg["noise"] + ) + + # Log states (just for the first env) + terminated = self.get_terminated()[0] + for state in states_to_log_dict: + if state == "terminated": + states_to_log_dict[state][0, it_idx, i, :] = terminated + else: + states_to_log_dict[state][0, it_idx, i, :] = getattr( + self.env, state + )[0, :] From f105c0eabc784fa5f93e487f29a78fe752d3af47 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 9 Sep 2024 09:31:47 -0400 Subject: [PATCH 17/26] update nn_params --- gym/envs/pendulum/pendulum_SAC_config.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index 0125cee4..836a6185 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -43,10 +43,7 @@ class actor(FixedRobotCfgPPO.actor): nn_params = {"latent": latent_nn, "mean": mean_nn, "std": std_nn} normalize_obs = False - obs = [ - "dof_pos_obs", - "dof_vel", - ] + obs = ["dof_pos_obs", "dof_vel"] actions = ["tau_ff"] disable_actions = False From c04b563f03a540a599fdee1522667e1eb4b7dda9 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 9 Sep 2024 14:06:18 -0400 Subject: [PATCH 18/26] apply stash w/ sim-freq reward sampling. Adjust some reward weights of action_rate (mini-cheetah) to work decently. --- gym/envs/base/task_skeleton.py | 8 +- .../mini_cheetah/mini_cheetah_osc_config.py | 7 +- gym/envs/mini_cheetah/mini_cheetah_ref.py | 9 ++ .../mini_cheetah/mini_cheetah_ref_config.py | 9 +- gym/envs/mit_humanoid/lander_config.py | 2 +- gym/utils/task_registry.py | 3 + learning/runners/my_runner.py | 57 +++++++---- learning/runners/off_policy_runner.py | 95 ++++++++++++------- learning/runners/on_policy_runner.py | 69 +++++++++----- 9 files changed, 178 insertions(+), 81 deletions(-) diff --git a/gym/envs/base/task_skeleton.py b/gym/envs/base/task_skeleton.py index 0974970a..4f931eab 100644 --- a/gym/envs/base/task_skeleton.py +++ b/gym/envs/base/task_skeleton.py @@ -5,10 +5,14 @@ class TaskSkeleton: - def __init__(self, num_envs=1, max_episode_length=1.0, device="cpu"): + def __init__( + self, num_envs=1, max_episode_length=1.0, device="cpu", auto_reset=True + ): self.num_envs = num_envs self.max_episode_length = max_episode_length self.device = device + self.auto_reset = auto_reset + return None def get_states(self, obs_list): @@ -70,7 +74,7 @@ def _check_terminations_and_timeouts(self): contact_forces = self.contact_forces[:, self.termination_contact_indices, :] self.terminated |= torch.any(torch.norm(contact_forces, dim=-1) > 1.0, dim=1) self.timed_out = self.episode_length_buf >= self.max_episode_length - self.to_be_reset = self.timed_out | self.terminated + # self.to_be_reset = self.timed_out | self.terminated def step(self, actions): raise NotImplementedError diff --git a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py index ec283033..34996404 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_osc_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_osc_config.py @@ -64,7 +64,7 @@ class control(MiniCheetahCfg.control): # * PD 
Drive parameters: stiffness = {"haa": 20.0, "hfe": 20.0, "kfe": 20.0} damping = {"haa": 0.5, "hfe": 0.5, "kfe": 0.5} - ctrl_frequency = 100 + ctrl_frequency = 500 desired_sim_frequency = 500 class osc: @@ -163,6 +163,7 @@ class MiniCheetahOscRunnerCfg(MiniCheetahRunnerCfg): runner_class_name = "OnPolicyRunner" class actor(MiniCheetahRunnerCfg.actor): + frequency = 100 hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" @@ -221,8 +222,8 @@ class weights: dof_vel = 0.0 min_base_height = 1.5 collision = 0 - action_rate = 0.1 # -0.01 - action_rate2 = 0.01 # -0.001 + action_rate = 0.001 # -0.01 + action_rate2 = 0.0001 # -0.001 stand_still = 0.0 dof_pos_limits = 0.0 feet_contact_forces = 0.0 diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref.py b/gym/envs/mini_cheetah/mini_cheetah_ref.py index 57e1f0e7..d3f8a89b 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref.py @@ -50,6 +50,15 @@ def _resample_commands(self, env_ids): ).squeeze(1) self.commands[env_ids, :3] *= (rand_ids < 0.9).unsqueeze(1) + def _check_terminations_and_timeouts(self): + """Check if environments need to be reset""" + contact_forces = self.contact_forces[:, self.termination_contact_indices, :] + self.terminated |= torch.any(torch.norm(contact_forces, dim=-1) > 1.0, dim=1) + self.timed_out = self.episode_length_buf >= self.max_episode_length + # self.to_be_reset = self.timed_out | self.terminated + + # --- + def _switch(self): c_vel = torch.linalg.norm(self.commands, dim=1) return torch.exp( diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 1803c765..b8a7a2fe 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -26,7 +26,7 @@ class control(MiniCheetahCfg.control): stiffness = {"haa": 20.0, "hfe": 20.0, "kfe": 20.0} damping = {"haa": 0.5, "hfe": 0.5, "kfe": 0.5} gait_freq = 3.0 - ctrl_frequency = 100 + ctrl_frequency = 500 desired_sim_frequency = 500 class commands(MiniCheetahCfg.commands): @@ -70,6 +70,7 @@ class MiniCheetahRefRunnerCfg(MiniCheetahRunnerCfg): runner_class_name = "OnPolicyRunner" class actor(MiniCheetahRunnerCfg.actor): + frequency = 100 hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" @@ -122,12 +123,12 @@ class weights: lin_vel_z = 0.0 ang_vel_xy = 0.01 orientation = 1.0 - torques = 5.0e-7 + torques = 5.0e-8 dof_vel = 0.0 min_base_height = 1.5 collision = 0.0 - action_rate = 0.1 - action_rate2 = 0.01 + action_rate = 1e-4 + action_rate2 = 1e-5 stand_still = 0.0 dof_pos_limits = 0.0 feet_contact_forces = 0.0 diff --git a/gym/envs/mit_humanoid/lander_config.py b/gym/envs/mit_humanoid/lander_config.py index 48c2eb53..86f3329f 100644 --- a/gym/envs/mit_humanoid/lander_config.py +++ b/gym/envs/mit_humanoid/lander_config.py @@ -282,7 +282,7 @@ class weights: power = 1e-6 # 1.0e-2 min_base_height = 1.5 lin_vel_xy = 1.0 - action_rate = 1e-3 + action_rate = 1e-2 action_rate2 = 1e-3 lin_vel_z = 0.0 ang_vel_xy = 0.0 diff --git a/gym/utils/task_registry.py b/gym/utils/task_registry.py index 6c2025ba..cc0787ca 100644 --- a/gym/utils/task_registry.py +++ b/gym/utils/task_registry.py @@ -178,6 +178,9 @@ def set_control_and_sim_dt(self, env_cfg, train_cfg): f" to {env_cfg.sim_dt}." 
) + if not hasattr(train_cfg.actor, "frequency"): + train_cfg.actor.frequency = env_cfg.control.ctrl_frequency + def set_discount_rates(self, train_cfg, dt): if hasattr(train_cfg.algorithm, "discount_horizon"): hrzn = train_cfg.algorithm.discount_horizon diff --git a/learning/runners/my_runner.py b/learning/runners/my_runner.py index 15192d2d..4c2f5d52 100644 --- a/learning/runners/my_runner.py +++ b/learning/runners/my_runner.py @@ -39,8 +39,7 @@ def learn(self, states_to_log_dict=None): n_policy_steps = int((1 / self.env.dt) / self.actor_cfg["frequency"]) assert n_policy_steps > 0, "actor frequency should be less than ctrl_freq" - rewards_dict = {} - rewards_list = [dict() for x in range(n_policy_steps)] + rewards_dict = self.initialize_rewards_dict(n_policy_steps) self.alg.switch_to_train() actor_obs = self.get_obs(self.actor_cfg["obs"]) @@ -104,31 +103,21 @@ def learn(self, states_to_log_dict=None): for step in range(n_policy_steps): self.env.step() # put reward integration here - self.update_rewards(rewards_list[step], self.env.terminated) - # handle time-outs/terminations happening inside here with mask + self.update_rewards_dict(rewards_dict, step) else: # catch and reset failed envs to_be_reset = self.env.timed_out | self.env.terminated env_ids = (to_be_reset).nonzero(as_tuple=False).flatten() self.env._reset_idx(env_ids) + total_rewards = torch.stack( + tuple(rewards_dict.sum(dim=0).values()) + ).sum(dim=(0)) actor_obs = self.get_noisy_obs( self.actor_cfg["obs"], self.actor_cfg["noise"] ) critic_obs = self.get_obs(self.critic_cfg["obs"]) - # * get time_outs - - self.update_rewards(rewards_dict, self.env.terminated) - - # integrate rewards from list - total_rewards = torch.stack( - tuple( - torch.stack(tuple(rewards_dict.values())) - for rewards_dict in rewards_list - ) - ).sum(dim=(0, 1)) - transition.update( { "next_actor_obs": actor_obs, @@ -140,7 +129,7 @@ def learn(self, states_to_log_dict=None): ) storage.add_transitions(transition) - logger.log_rewards(rewards_dict) + logger.log_rewards(rewards_dict.sum(dim=0)) logger.log_rewards({"total_rewards": total_rewards}) logger.finish_step(self.env.timed_out | self.env.terminated) logger.toc("collection") @@ -205,6 +194,40 @@ def update_rewards(self, rewards_dict, terminated): ) ) + def initialize_rewards_dict(self, n_steps): + # sum existing rewards with new rewards + rewards_dict = TensorDict( + {}, batch_size=(n_steps, self.env.num_envs), device=self.device + ) + for key in self.critic_cfg["reward"]["termination_weight"]: + rewards_dict.update( + {key: torch.zeros(n_steps, self.env.num_envs, device=self.device)} + ) + for key in self.critic_cfg["reward"]["weights"]: + rewards_dict.update( + {key: torch.zeros(n_steps, self.env.num_envs, device=self.device)} + ) + return rewards_dict + + def update_rewards_dict(self, rewards_dict, step): + # sum existing rewards with new rewards + rewards_dict[step].update( + self.get_rewards( + self.critic_cfg["reward"]["termination_weight"], + modifier=self.env.dt, + mask=self.env.terminated, + ), + inplace=True, + ) + rewards_dict[step].update( + self.get_rewards( + self.critic_cfg["reward"]["weights"], + modifier=self.env.dt, + mask=~self.env.terminated, + ), + inplace=True, + ) + def sim_and_log_states(self, states_to_log_dict, it_idx): # Simulate environment for as many steps as expected in the dict. # Log states to the dict, as well as whether the env terminated. 
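For orientation, a minimal self-contained sketch of the reward-integration pattern these runner diffs introduce: the policy acts once every n_policy_steps = ctrl_frequency / actor_frequency sim steps (500 Hz / 100 Hz in the configs above), per-step rewards go into per-term buffers of shape (n_policy_steps, num_envs) — the runners use a TensorDict for this — and the reward credited to the policy step is the sum over sim steps and terms. The toy env interactions and the flat -1.0 termination penalty below are placeholders, not code from the repo.

import torch

num_envs, dt = 4, 1.0 / 500          # sim/ctrl step at 500 Hz
actor_frequency = 100                # policy acts at 100 Hz
n_policy_steps = int((1 / dt) / actor_frequency)  # -> 5

reward_names = ["torques", "min_base_height"]     # example reward terms
rewards = {k: torch.zeros(n_policy_steps, num_envs) for k in reward_names}
termination = torch.zeros(n_policy_steps, num_envs)

for step in range(n_policy_steps):
    # stand-ins for env.step() and the env's per-term reward functions
    terminated = torch.rand(num_envs) < 0.01
    step_rewards = {k: torch.randn(num_envs) for k in reward_names}

    # running terms are masked on terminated envs and scaled by dt
    for k in reward_names:
        rewards[k][step] = ~terminated * step_rewards[k] * dt
    # a termination penalty is applied only on the step an env fails
    termination[step] = terminated * -1.0

# total reward for this policy step: sum over sim steps and reward terms
total_rewards = torch.stack([r.sum(dim=0) for r in rewards.values()]).sum(dim=0)
total_rewards += termination.sum(dim=0)
print(total_rewards.shape)  # torch.Size([4])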
diff --git a/learning/runners/off_policy_runner.py b/learning/runners/off_policy_runner.py index 7aed58e7..516c4a3e 100644 --- a/learning/runners/off_policy_runner.py +++ b/learning/runners/off_policy_runner.py @@ -48,7 +48,10 @@ def _set_up_alg(self): def learn(self): self.set_up_logger() - rewards_dict = {} + n_policy_steps = int((1 / self.env.dt) / self.actor_cfg["frequency"]) + assert n_policy_steps > 0, "actor frequency should be less than ctrl_freq" + + rewards_dict = self.initialize_rewards_dict(n_policy_steps) self.alg.switch_to_train() actor_obs = self.get_obs(self.actor_cfg["obs"]) @@ -94,28 +97,32 @@ def learn(self): } ) - self.env.step() + for step in range(n_policy_steps): + self.env.step() + # put reward integration here + self.update_rewards_dict(rewards_dict, step) + else: + # catch and reset failed envs + to_be_reset = self.env.timed_out | self.env.terminated + env_ids = (to_be_reset).nonzero(as_tuple=False).flatten() + self.env._reset_idx(env_ids) + + total_rewards = torch.stack( + tuple(rewards_dict.sum(dim=0).values()) + ).sum(dim=(0)) actor_obs = self.get_noisy_obs( self.actor_cfg["obs"], self.actor_cfg["noise"] ) critic_obs = self.get_obs(self.critic_cfg["obs"]) - # * get time_outs - timed_out = self.get_timed_out() - terminated = self.get_terminated() - dones = timed_out | terminated - - self.update_rewards(rewards_dict, terminated) - total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) - transition.update( { "next_actor_obs": actor_obs, "next_critic_obs": critic_obs, "rewards": total_rewards, - "timed_out": timed_out, - "dones": dones, + "timed_out": self.env.timed_out, + "dones": self.env.timed_out | self.env.terminated, } ) storage.add_transitions(transition) @@ -147,35 +154,39 @@ def learn(self): } ) - self.env.step() + for step in range(n_policy_steps): + self.env.step() + # put reward integration here + self.update_rewards_dict(rewards_dict, step) + else: + # catch and reset failed envs + to_be_reset = self.env.timed_out | self.env.terminated + env_ids = (to_be_reset).nonzero(as_tuple=False).flatten() + self.env._reset_idx(env_ids) + + total_rewards = torch.stack( + tuple(rewards_dict.sum(dim=0).values()) + ).sum(dim=(0)) actor_obs = self.get_noisy_obs( self.actor_cfg["obs"], self.actor_cfg["noise"] ) critic_obs = self.get_obs(self.critic_cfg["obs"]) - # * get time_outs - timed_out = self.get_timed_out() - terminated = self.get_terminated() - dones = timed_out | terminated - - self.update_rewards(rewards_dict, terminated) - total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) - transition.update( { "next_actor_obs": actor_obs, "next_critic_obs": critic_obs, "rewards": total_rewards, - "timed_out": timed_out, - "dones": dones, + "timed_out": self.env.timed_out, + "dones": self.env.timed_out | self.env.terminated, } ) storage.add_transitions(transition) - logger.log_rewards(rewards_dict) + logger.log_rewards(rewards_dict.sum(dim=0)) logger.log_rewards({"total_rewards": total_rewards}) - logger.finish_step(dones) + logger.finish_step(self.env.timed_out | self.env.terminated) logger.toc("collection") logger.tic("learning") @@ -192,20 +203,40 @@ def learn(self): self.save() self.save() - def update_rewards(self, rewards_dict, terminated): - rewards_dict.update( + def update_rewards_dict(self, rewards_dict, step): + # sum existing rewards with new rewards + rewards_dict[step].update( self.get_rewards( - self.critic_cfg["reward"]["termination_weight"], mask=terminated - ) + self.critic_cfg["reward"]["termination_weight"], + 
modifier=self.env.dt, + mask=self.env.terminated, + ), + inplace=True, ) - rewards_dict.update( + rewards_dict[step].update( self.get_rewards( self.critic_cfg["reward"]["weights"], modifier=self.env.dt, - mask=~terminated, - ) + mask=~self.env.terminated, + ), + inplace=True, ) + def initialize_rewards_dict(self, n_steps): + # sum existing rewards with new rewards + rewards_dict = TensorDict( + {}, batch_size=(n_steps, self.env.num_envs), device=self.device + ) + for key in self.critic_cfg["reward"]["termination_weight"]: + rewards_dict.update( + {key: torch.zeros(n_steps, self.env.num_envs, device=self.device)} + ) + for key in self.critic_cfg["reward"]["weights"]: + rewards_dict.update( + {key: torch.zeros(n_steps, self.env.num_envs, device=self.device)} + ) + return rewards_dict + def set_up_logger(self): logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) logger.register_rewards( diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 0a81680d..7167f03e 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -24,7 +24,10 @@ def __init__(self, env, train_cfg, device="cpu"): def learn(self, states_to_log_dict=None): self.set_up_logger() - rewards_dict = {} + n_policy_steps = int((1 / self.env.dt) / self.actor_cfg["frequency"]) + assert n_policy_steps > 0, "actor frequency should be less than ctrl_freq" + + rewards_dict = self.initialize_rewards_dict(n_policy_steps) self.alg.switch_to_train() actor_obs = self.get_obs(self.actor_cfg["obs"]) @@ -86,37 +89,39 @@ def learn(self, states_to_log_dict=None): "critic_obs": critic_obs, } ) + for step in range(n_policy_steps): + self.env.step() + # put reward integration here + self.update_rewards_dict(rewards_dict, step) + else: + # catch and reset failed envs + to_be_reset = self.env.timed_out | self.env.terminated + env_ids = (to_be_reset).nonzero(as_tuple=False).flatten() + self.env._reset_idx(env_ids) - self.env.step() + total_rewards = torch.stack( + tuple(rewards_dict.sum(dim=0).values()) + ).sum(dim=(0)) actor_obs = self.get_noisy_obs( self.actor_cfg["obs"], self.actor_cfg["noise"] ) critic_obs = self.get_obs(self.critic_cfg["obs"]) - # * get time_outs - timed_out = self.get_timed_out() - terminated = self.get_terminated() - dones = timed_out | terminated - - self.update_rewards(rewards_dict, terminated) - total_rewards = torch.stack(tuple(rewards_dict.values())).sum(dim=0) - transition.update( { "next_actor_obs": actor_obs, "next_critic_obs": critic_obs, "rewards": total_rewards, - "timed_out": timed_out, - "terminated": terminated, - "dones": dones, + "timed_out": self.env.timed_out, + "dones": self.env.timed_out | self.env.terminated, } ) storage.add_transitions(transition) - logger.log_rewards(rewards_dict) + logger.log_rewards(rewards_dict.sum(dim=0)) logger.log_rewards({"total_rewards": total_rewards}) - logger.finish_step(dones) + logger.finish_step(self.env.timed_out | self.env.terminated) logger.toc("collection") logger.tic("learning") @@ -149,19 +154,39 @@ def burn_in_normalization(self, n_iterations=100): self.alg.critic.evaluate(critic_obs) self.env.reset() - def update_rewards(self, rewards_dict, terminated): - rewards_dict.update( + def update_rewards_dict(self, rewards_dict, step): + # sum existing rewards with new rewards + rewards_dict[step].update( self.get_rewards( - self.critic_cfg["reward"]["termination_weight"], mask=terminated - ) + self.critic_cfg["reward"]["termination_weight"], + modifier=self.env.dt, + 
mask=self.env.terminated, + ), + inplace=True, ) - rewards_dict.update( + rewards_dict[step].update( self.get_rewards( self.critic_cfg["reward"]["weights"], modifier=self.env.dt, - mask=~terminated, - ) + mask=~self.env.terminated, + ), + inplace=True, + ) + + def initialize_rewards_dict(self, n_steps): + # sum existing rewards with new rewards + rewards_dict = TensorDict( + {}, batch_size=(n_steps, self.env.num_envs), device=self.device ) + for key in self.critic_cfg["reward"]["termination_weight"]: + rewards_dict.update( + {key: torch.zeros(n_steps, self.env.num_envs, device=self.device)} + ) + for key in self.critic_cfg["reward"]["weights"]: + rewards_dict.update( + {key: torch.zeros(n_steps, self.env.num_envs, device=self.device)} + ) + return rewards_dict def set_up_logger(self): logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) From e3956738f7ae709874a889b7868abd5fc38dd3f3 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 9 Sep 2024 14:23:38 -0400 Subject: [PATCH 19/26] refactor skeleton with a super_init fix, and pre-initialize reward buffer. Doesn't seem to make a difference in speed. --- gym/envs/base/base_task.py | 19 +++---------------- gym/envs/base/task_skeleton.py | 19 +++++++++++-------- 2 files changed, 14 insertions(+), 24 deletions(-) diff --git a/gym/envs/base/base_task.py b/gym/envs/base/base_task.py index 551143a3..033e0eeb 100644 --- a/gym/envs/base/base_task.py +++ b/gym/envs/base/base_task.py @@ -18,33 +18,20 @@ def __init__(self, gym, sim, cfg, sim_params, sim_device, headless): # * env device is GPU only if sim is on GPU and use_gpu_pipeline=True, # * otherwise returned tensors are copied to CPU by physX. if sim_device_type == "cuda" and sim_params.use_gpu_pipeline: - self.device = self.sim_device + device = self.sim_device else: - self.device = "cpu" + device = "cpu" # * graphics device for rendering, -1 for no rendering self.graphics_device_id = self.sim_device_id - self.num_envs = cfg.env.num_envs self.num_actuators = cfg.env.num_actuators # * optimization flags for pytorch JIT torch._C._jit_set_profiling_mode(False) torch._C._jit_set_profiling_executor(False) - # allocate buffers - self.to_be_reset = torch.ones( - self.num_envs, device=self.device, dtype=torch.bool - ) - self.terminated = torch.ones( - self.num_envs, device=self.device, dtype=torch.bool - ) - self.episode_length_buf = torch.zeros( - self.num_envs, device=self.device, dtype=torch.long - ) - self.timed_out = torch.zeros( - self.num_envs, device=self.device, dtype=torch.bool - ) + super().__init__(num_envs=cfg.env.num_envs, device=device) # todo: read from config self.enable_viewer_sync = True diff --git a/gym/envs/base/task_skeleton.py b/gym/envs/base/task_skeleton.py index 4f931eab..d3585e45 100644 --- a/gym/envs/base/task_skeleton.py +++ b/gym/envs/base/task_skeleton.py @@ -5,13 +5,15 @@ class TaskSkeleton: - def __init__( - self, num_envs=1, max_episode_length=1.0, device="cpu", auto_reset=True - ): + def __init__(self, num_envs=1, device="cpu"): self.num_envs = num_envs - self.max_episode_length = max_episode_length self.device = device - self.auto_reset = auto_reset + + self.to_be_reset = torch.ones(num_envs, device=device, dtype=torch.bool) + self.terminated = torch.ones(num_envs, device=device, dtype=torch.bool) + self.episode_length_buf = torch.zeros(num_envs, device=device, dtype=torch.long) + self.timed_out = torch.zeros(num_envs, device=device, dtype=torch.bool) + self.reward_buf = torch.zeros(num_envs, device=device, dtype=torch.float) return None @@ 
-61,10 +63,11 @@ def compute_reward(self, reward_weights): reward_weights: dict with keys matching reward names, and values matching weights """ - reward = torch.zeros(self.num_envs, device=self.device, dtype=torch.float) + # reward = torch.zeros(self.num_envs, device=self.device, dtype=torch.float) + self.reward_buf[:] = 0 for name, weight in reward_weights.items(): - reward += weight * self._eval_reward(name) - return reward + self.reward_buf += weight * self._eval_reward(name) + return self.reward_buf def _eval_reward(self, name): return eval("self._reward_" + name + "()") From ea5cdff304ef6edc78dbd66e4ecfcd9d7d4cccb1 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 9 Sep 2024 15:09:09 -0400 Subject: [PATCH 20/26] refactor: redo rewards computation, with a dict of reward functions in the runner. Seems to make a tiny difference in speed. More importantly, it removes the `eval` hack, which means we can profile individual reward functions. THIS BREAKS PBRS --- gym/envs/base/task_skeleton.py | 15 --------------- learning/runners/BaseRunner.py | 15 +++++++++++---- 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/gym/envs/base/task_skeleton.py b/gym/envs/base/task_skeleton.py index d3585e45..4dfcb280 100644 --- a/gym/envs/base/task_skeleton.py +++ b/gym/envs/base/task_skeleton.py @@ -13,7 +13,6 @@ def __init__(self, num_envs=1, device="cpu"): self.terminated = torch.ones(num_envs, device=device, dtype=torch.bool) self.episode_length_buf = torch.zeros(num_envs, device=device, dtype=torch.long) self.timed_out = torch.zeros(num_envs, device=device, dtype=torch.bool) - self.reward_buf = torch.zeros(num_envs, device=device, dtype=torch.float) return None @@ -58,20 +57,6 @@ def _reset_buffers(self): self.terminated[:] = False self.timed_out[:] = False - def compute_reward(self, reward_weights): - """Compute and return a torch tensor of rewards - reward_weights: dict with keys matching reward names, and values - matching weights - """ - # reward = torch.zeros(self.num_envs, device=self.device, dtype=torch.float) - self.reward_buf[:] = 0 - for name, weight in reward_weights.items(): - self.reward_buf += weight * self._eval_reward(name) - return self.reward_buf - - def _eval_reward(self, name): - return eval("self._reward_" + name + "()") - def _check_terminations_and_timeouts(self): """Check if environments need to be reset""" contact_forces = self.contact_forces[:, self.termination_contact_indices, :] diff --git a/learning/runners/BaseRunner.py b/learning/runners/BaseRunner.py index 01a1b825..1545293b 100644 --- a/learning/runners/BaseRunner.py +++ b/learning/runners/BaseRunner.py @@ -8,8 +8,8 @@ class BaseRunner: def __init__(self, env, train_cfg, device="cpu"): self.device = device self.env = env + self.setup_reward_functions() self.parse_train_cfg(train_cfg) - self.num_steps_per_env = self.cfg["num_steps_per_env"] self.save_interval = self.cfg["save_interval"] self.num_learning_iterations = self.cfg["max_iterations"] @@ -18,6 +18,13 @@ def __init__(self, env, train_cfg, device="cpu"): self.log_dir = train_cfg["log_dir"] self._set_up_alg() + def setup_reward_functions(self): + self.reward_functions = { + method.replace("_reward_", ""): getattr(self.env, method) + for method in dir(self.env) + if callable(getattr(self.env, method)) and method.startswith("_reward_") + } + def _set_up_alg(self): num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) num_actions = self.get_action_size(self.actor_cfg["actions"]) @@ -95,8 +102,8 @@ def get_rewards(self, reward_weights, modifier=1, 
mask=None): if mask is None: mask = 1.0 for name, weight in reward_weights.items(): - rewards_dict[name] = mask * self._get_reward({name: weight}, modifier) + rewards_dict[name] = mask * self._get_reward(name, weight * modifier) return rewards_dict - def _get_reward(self, name_weight, modifier=1): - return modifier * self.env.compute_reward(name_weight).to(self.device) + def _get_reward(self, name, weight): + return weight * self.reward_functions[name]().to(self.device) From 7296c6c48903a5458c2e3d4e2c50e72bf430d97a Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Mon, 9 Sep 2024 15:58:57 -0400 Subject: [PATCH 21/26] compute switch once per decimation (speedup ~10%) --- gym/envs/mini_cheetah/mini_cheetah_osc.py | 20 +++++++++++++------ gym/envs/mini_cheetah/mini_cheetah_ref.py | 19 ++++++++++-------- .../mini_cheetah/mini_cheetah_ref_config.py | 1 + 3 files changed, 26 insertions(+), 14 deletions(-) diff --git a/gym/envs/mini_cheetah/mini_cheetah_osc.py b/gym/envs/mini_cheetah/mini_cheetah_osc.py index acec0b8a..6d44e35d 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_osc.py +++ b/gym/envs/mini_cheetah/mini_cheetah_osc.py @@ -16,6 +16,9 @@ def __init__(self, gym, sim, cfg, sim_params, sim_device, headless): def _init_buffers(self): super()._init_buffers() + + self._switch = torch.zeros(self.num_envs, 1, device=self.device) + self.oscillators = torch.zeros(self.num_envs, 4, device=self.device) self.oscillator_obs = torch.zeros(self.num_envs, 8, device=self.device) @@ -158,6 +161,7 @@ def _post_decimation_step(self): """Update all states that are not handled in PhysX""" super()._post_decimation_step() self.grf = self._compute_grf() + self._update_cmd_switch() # self._step_oscillators() def _post_physx_step(self): @@ -266,12 +270,16 @@ def _compute_grf(self, grf_norm=True): else: return grf - def _switch(self): + def _update_cmd_switch(self): c_vel = torch.linalg.norm(self.commands, dim=1) - return torch.exp( + self._switch = torch.exp( -torch.square(torch.max(torch.zeros_like(c_vel), c_vel - 0.1)) / self.cfg.reward_settings.switch_scale ) + # return torch.exp( + # -torch.square(torch.max(torch.zeros_like(c_vel), c_vel - 0.1)) + # / self.cfg.reward_settings.switch_scale + # ) def _reward_cursorial(self): # penalize the abad joints being away from 0 @@ -357,10 +365,10 @@ def _reward_stance_velocity(self): return -torch.mean(rew, dim=1) def _reward_dof_vel(self): - return super()._reward_dof_vel() * self._switch() + return super()._reward_dof_vel() * self._switch def _reward_dof_near_home(self): - return super()._reward_dof_near_home() * self._switch() + return super()._reward_dof_near_home() * self._switch def _reward_stand_still(self): """Penalize motion at zero commands""" @@ -371,11 +379,11 @@ def _reward_stand_still(self): rew_vel = torch.mean(self._sqrdexp(self.dof_vel), dim=1) rew_base_vel = torch.mean(torch.square(self.base_lin_vel), dim=1) rew_base_vel += torch.mean(torch.square(self.base_ang_vel), dim=1) - return (rew_vel + rew_pos - rew_base_vel) * self._switch() + return (rew_vel + rew_pos - rew_base_vel) * self._switch def _reward_standing_torques(self): """Penalize torques at zero commands""" - return super()._reward_torques() * self._switch() + return super()._reward_torques() * self._switch # * gait similarity scores def angle_difference(self, theta1, theta2): diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref.py b/gym/envs/mini_cheetah/mini_cheetah_ref.py index d3f8a89b..0084a80c 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref.py +++ 
b/gym/envs/mini_cheetah/mini_cheetah_ref.py @@ -17,6 +17,7 @@ def __init__(self, gym, sim, cfg, sim_params, sim_device, headless): def _init_buffers(self): super()._init_buffers() + self._switch = torch.zeros(self.num_envs, 1, device=self.device) self.phase = torch.zeros( self.num_envs, 1, dtype=torch.float, device=self.device ) @@ -41,6 +42,7 @@ def _post_decimation_step(self): self.phase_obs = torch.cat( (torch.sin(self.phase), torch.cos(self.phase)), dim=1 ) + self._update_cmd_switch() def _resample_commands(self, env_ids): super()._resample_commands(env_ids) @@ -59,10 +61,11 @@ def _check_terminations_and_timeouts(self): # --- - def _switch(self): + def _update_cmd_switch(self): c_vel = torch.linalg.norm(self.commands, dim=1) - return torch.exp( - -torch.square(torch.max(torch.zeros_like(c_vel), c_vel - 0.1)) / 0.1 + self._switch = torch.exp( + -torch.square(torch.max(torch.zeros_like(c_vel), c_vel - 0.1)) + / self.cfg.reward_settings.switch_scale ) def _reward_swing_grf(self): @@ -73,7 +76,7 @@ def _reward_swing_grf(self): ) ph_off = torch.lt(self.phase, torch.pi) rew = in_contact * torch.cat((ph_off, ~ph_off, ~ph_off, ph_off), dim=1) - return -torch.sum(rew.float(), dim=1) * (1 - self._switch()) + return -torch.sum(rew.float(), dim=1) * (1 - self._switch) def _reward_stance_grf(self): """Reward non-zero grf during stance (pi to 2pi)""" @@ -84,7 +87,7 @@ def _reward_stance_grf(self): ph_off = torch.gt(self.phase, torch.pi) # should this be in swing? rew = in_contact * torch.cat((ph_off, ~ph_off, ~ph_off, ph_off), dim=1) - return torch.sum(rew.float(), dim=1) * (1 - self._switch()) + return torch.sum(rew.float(), dim=1) * (1 - self._switch) def _reward_reference_traj(self): """REWARDS EACH LEG INDIVIDUALLY BASED ON ITS POSITION IN THE CYCLE""" @@ -93,7 +96,7 @@ def _reward_reference_traj(self): error /= self.scales["dof_pos"] reward = (self._sqrdexp(error) - torch.abs(error) * 0.2).mean(dim=1) # * only when commanded velocity is higher - return reward * (1 - self._switch()) + return reward * (1 - self._switch) def _get_ref(self): leg_frame = torch.zeros_like(self.torques) @@ -121,10 +124,10 @@ def _reward_stand_still(self): rew_vel = torch.mean(self._sqrdexp(self.dof_vel), dim=1) rew_base_vel = torch.mean(torch.square(self.base_lin_vel), dim=1) rew_base_vel += torch.mean(torch.square(self.base_ang_vel), dim=1) - return (rew_vel + rew_pos - rew_base_vel) * self._switch() + return (rew_vel + rew_pos - rew_base_vel) * self._switch def _reward_tracking_lin_vel(self): """Tracking linear velocity commands (xy axes)""" # just use lin_vel? 
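         # note: self._switch is ~1 while the commanded velocity norm is
         # below ~0.1 and decays toward 0 for faster commands (see
         # _update_cmd_switch above), so (1 - self._switch) gates the
         # tracking reward off when the robot is commanded to stand;
         # this patch caches it once per decimation instead of
         # recomputing it inside every reward term.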
reward = super()._reward_tracking_lin_vel() - return reward * (1 - self._switch()) + return reward * (1 - self._switch) diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index b8a7a2fe..7a54bb1d 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -60,6 +60,7 @@ class reward_settings(MiniCheetahCfg.reward_settings): max_contact_force = 600.0 base_height_target = 0.3 tracking_sigma = 0.25 + switch_scale = 0.1 class scaling(MiniCheetahCfg.scaling): pass From 69218dd1a7ffcd1c8a898a160f1bb6f17a75e888 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Tue, 10 Sep 2024 17:55:53 -0400 Subject: [PATCH 22/26] fixed logging bug that scaled with wrong dt --- gym/envs/base/fixed_robot.py | 21 ++++-------- gym/envs/base/legged_robot.py | 20 ++++------- gym/envs/base/legged_robot_config.py | 3 -- .../mini_cheetah/mini_cheetah_ref_config.py | 33 ++++++++++++++----- gym/envs/pendulum/pendulum_SAC_config.py | 3 +- gym/envs/pendulum/pendulum_config.py | 1 + learning/runners/my_runner.py | 17 +++++----- learning/runners/off_policy_runner.py | 17 +++++----- learning/runners/on_policy_runner.py | 17 +++++----- 9 files changed, 64 insertions(+), 68 deletions(-) diff --git a/gym/envs/base/fixed_robot.py b/gym/envs/base/fixed_robot.py index 544eba52..f623e5d1 100644 --- a/gym/envs/base/fixed_robot.py +++ b/gym/envs/base/fixed_robot.py @@ -580,27 +580,18 @@ def _reward_dof_vel(self): def _reward_action_rate(self): """Penalize changes in actions""" nact = self.num_actuators - dt2 = (self.dt * self.cfg.control.decimation) ** 2 - error = ( - torch.square( - self.dof_pos_history[:, :nact] - - self.dof_pos_history[:, nact : 2 * nact] - ) - / dt2 + error = torch.square( + self.dof_pos_history[:, :nact] - self.dof_pos_history[:, 2 * nact :] ) return -torch.mean(error, dim=1) def _reward_action_rate2(self): """Penalize changes in actions""" nact = self.num_actuators - dt2 = (self.dt * self.cfg.control.decimation) ** 2 - error = ( - torch.square( - self.dof_pos_history[:, :nact] - - 2 * self.dof_pos_history[:, nact : 2 * nact] - + self.dof_pos_history[:, 2 * nact :] - ) - / dt2 + error = torch.square( + self.dof_pos_history[:, :nact] + - 2 * self.dof_pos_history[:, nact : 2 * nact] + + self.dof_pos_history[:, 2 * nact :] ) return -torch.mean(error, dim=1) diff --git a/gym/envs/base/legged_robot.py b/gym/envs/base/legged_robot.py index 4433d4fb..c80ebc85 100644 --- a/gym/envs/base/legged_robot.py +++ b/gym/envs/base/legged_robot.py @@ -1030,26 +1030,18 @@ def _reward_dof_vel(self): def _reward_action_rate(self): """Penalize changes in actions""" n = self.num_actuators - dt2 = (self.dt * self.cfg.control.decimation) ** 2 - error = ( - torch.square( - self.dof_pos_history[:, :n] - self.dof_pos_history[:, n : 2 * n] - ) - / dt2 + error = torch.square( + self.dof_pos_history[:, :n] - self.dof_pos_history[:, 2 * n :] ) return -torch.mean(error, dim=1) def _reward_action_rate2(self): """Penalize changes in actions""" n = self.num_actuators - dt2 = (self.dt * self.cfg.control.decimation) ** 2 - error = ( - torch.square( - self.dof_pos_history[:, :n] - - 2 * self.dof_pos_history[:, n : 2 * n] - + self.dof_pos_history[:, 2 * n :] - ) - / dt2 + error = torch.square( + self.dof_pos_history[:, :n] + - 2 * self.dof_pos_history[:, n : 2 * n] + + self.dof_pos_history[:, 2 * n :] ) return -torch.mean(error, dim=1) diff --git a/gym/envs/base/legged_robot_config.py b/gym/envs/base/legged_robot_config.py index 
4336a749..c38f3e33 100644 --- a/gym/envs/base/legged_robot_config.py +++ b/gym/envs/base/legged_robot_config.py @@ -301,9 +301,6 @@ class algorithm: batch_size = 2**15 max_gradient_steps = 24 # new - storage_size = 2**17 # new - batch_size = 2**15 # new - clip_param = 0.2 learning_rate = 1.0e-3 max_grad_norm = 1.0 diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index 7a54bb1d..d3a301d3 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -85,7 +85,7 @@ class actor(MiniCheetahRunnerCfg.actor): "dof_vel", "phase_obs", ] - normalize_obs = True + normalize_obs = False actions = ["dof_pos_target"] disable_actions = False @@ -115,7 +115,7 @@ class critic(MiniCheetahRunnerCfg.critic): "phase_obs", "dof_pos_target", ] - normalize_obs = True + normalize_obs = False class reward: class weights: @@ -128,13 +128,13 @@ class weights: dof_vel = 0.0 min_base_height = 1.5 collision = 0.0 - action_rate = 1e-4 - action_rate2 = 1e-5 + action_rate = 1e-5 + action_rate2 = 1e-6 stand_still = 0.0 dof_pos_limits = 0.0 feet_contact_forces = 0.0 dof_near_home = 0.0 - reference_traj = 1.5 + reference_traj = 0.0 swing_grf = 1.5 stance_grf = 1.5 @@ -142,11 +142,28 @@ class termination_weight: termination = 0.15 class algorithm(MiniCheetahRunnerCfg.algorithm): - pass + # both + gamma = 0.99 + lam = 0.95 + # shared + batch_size = 2 * 4096 # use all the data + max_gradient_steps = 50 + + clip_param = 0.2 + learning_rate = 1.0e-3 + max_grad_norm = 1.0 + # Critic + use_clipped_value_loss = True + # Actor + entropy_coef = 0.01 + schedule = "adaptive" # could be adaptive, fixed + desired_kl = 0.01 + lr_range = [2e-5, 1e-2] + lr_ratio = 1.5 class runner(MiniCheetahRunnerCfg.runner): run_name = "" experiment_name = "mini_cheetah_ref" - max_iterations = 1000 # number of policy updates + max_iterations = 500 # number of policy updates algorithm_class_name = "PPO2" - num_steps_per_env = 32 # deprecate + num_steps_per_env = 20 # deprecate diff --git a/gym/envs/pendulum/pendulum_SAC_config.py b/gym/envs/pendulum/pendulum_SAC_config.py index 836a6185..42bf0265 100644 --- a/gym/envs/pendulum/pendulum_SAC_config.py +++ b/gym/envs/pendulum/pendulum_SAC_config.py @@ -17,7 +17,7 @@ class init_state(PendulumCfg.init_state): dof_vel_range = {"theta": [-5, 5]} class control(PendulumCfg.control): - ctrl_frequency = 10 + ctrl_frequency = 100 desired_sim_frequency = 100 class asset(PendulumCfg.asset): @@ -37,6 +37,7 @@ class PendulumSACRunnerCfg(FixedRobotCfgPPO): runner_class_name = "OffPolicyRunner" class actor(FixedRobotCfgPPO.actor): + frequency = 10 latent_nn = {"hidden_dims": [128, 64], "activation": "elu", "layer_norm": True} mean_nn = {"hidden_dims": [32], "activation": "elu", "layer_norm": True} std_nn = {"hidden_dims": [32], "activation": "elu", "layer_norm": True} diff --git a/gym/envs/pendulum/pendulum_config.py b/gym/envs/pendulum/pendulum_config.py index bb33c1f1..910fe341 100644 --- a/gym/envs/pendulum/pendulum_config.py +++ b/gym/envs/pendulum/pendulum_config.py @@ -62,6 +62,7 @@ class PendulumRunnerCfg(FixedRobotCfgPPO): runner_class_name = "OnPolicyRunner" class actor(FixedRobotCfgPPO.actor): + frequency = 25 hidden_dims = [128, 64, 32] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "tanh" diff --git a/learning/runners/my_runner.py b/learning/runners/my_runner.py index 4c2f5d52..e0da1a1b 100644 --- a/learning/runners/my_runner.py +++ b/learning/runners/my_runner.py @@ -16,12 
+16,6 @@ class MyRunner(OnPolicyRunner): def __init__(self, env, train_cfg, device="cpu"): super().__init__(env, train_cfg, device) - logger.initialize( - self.env.num_envs, - self.env.dt, - self.cfg["max_iterations"], - self.device, - ) def _set_up_alg(self): num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) @@ -34,10 +28,9 @@ def _set_up_alg(self): self.alg = alg_class(actor, critic, device=self.device, **self.alg_cfg) def learn(self, states_to_log_dict=None): - self.set_up_logger() - n_policy_steps = int((1 / self.env.dt) / self.actor_cfg["frequency"]) assert n_policy_steps > 0, "actor frequency should be less than ctrl_freq" + self.set_up_logger(dt=self.env.dt * n_policy_steps) rewards_dict = self.initialize_rewards_dict(n_policy_steps) @@ -164,7 +157,13 @@ def burn_in_normalization(self, n_iterations=100): self.alg.critic.evaluate(critic_obs) self.env.reset() - def set_up_logger(self): + def set_up_logger(self, dt=None): + if dt is None: + dt = self.env.dt + logger.initialize( + self.env.num_envs, dt, self.cfg["max_iterations"], self.device + ) + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) logger.register_rewards( list(self.critic_cfg["reward"]["termination_weight"].keys()) diff --git a/learning/runners/off_policy_runner.py b/learning/runners/off_policy_runner.py index 516c4a3e..83249aef 100644 --- a/learning/runners/off_policy_runner.py +++ b/learning/runners/off_policy_runner.py @@ -16,12 +16,6 @@ class OffPolicyRunner(BaseRunner): def __init__(self, env, train_cfg, device="cpu"): super().__init__(env, train_cfg, device) - logger.initialize( - self.env.num_envs, - self.env.dt, - self.cfg["max_iterations"], - self.device, - ) def _set_up_alg(self): num_actor_obs = self.get_obs_size(self.actor_cfg["obs"]) @@ -46,10 +40,9 @@ def _set_up_alg(self): ) def learn(self): - self.set_up_logger() - n_policy_steps = int((1 / self.env.dt) / self.actor_cfg["frequency"]) assert n_policy_steps > 0, "actor frequency should be less than ctrl_freq" + self.set_up_logger(dt=self.env.dt * n_policy_steps) rewards_dict = self.initialize_rewards_dict(n_policy_steps) @@ -237,7 +230,13 @@ def initialize_rewards_dict(self, n_steps): ) return rewards_dict - def set_up_logger(self): + def set_up_logger(self, dt=None): + if dt is None: + dt = self.env.dt + logger.initialize( + self.env.num_envs, dt, self.cfg["max_iterations"], self.device + ) + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) logger.register_rewards( list(self.critic_cfg["reward"]["termination_weight"].keys()) diff --git a/learning/runners/on_policy_runner.py b/learning/runners/on_policy_runner.py index 7167f03e..4831b615 100644 --- a/learning/runners/on_policy_runner.py +++ b/learning/runners/on_policy_runner.py @@ -14,18 +14,11 @@ class OnPolicyRunner(BaseRunner): def __init__(self, env, train_cfg, device="cpu"): super().__init__(env, train_cfg, device) - logger.initialize( - self.env.num_envs, - self.env.dt, - self.cfg["max_iterations"], - self.device, - ) def learn(self, states_to_log_dict=None): - self.set_up_logger() - n_policy_steps = int((1 / self.env.dt) / self.actor_cfg["frequency"]) assert n_policy_steps > 0, "actor frequency should be less than ctrl_freq" + self.set_up_logger(dt=self.env.dt * n_policy_steps) rewards_dict = self.initialize_rewards_dict(n_policy_steps) @@ -188,7 +181,13 @@ def initialize_rewards_dict(self, n_steps): ) return rewards_dict - def set_up_logger(self): + def set_up_logger(self, dt=None): + if dt is None: + dt = self.env.dt + logger.initialize( + 
self.env.num_envs, dt, self.cfg["max_iterations"], self.device + ) + logger.register_rewards(list(self.critic_cfg["reward"]["weights"].keys())) logger.register_rewards( list(self.critic_cfg["reward"]["termination_weight"].keys()) From b1bd4f2b9b20c2c67877d61e8c4daee513d65b17 Mon Sep 17 00:00:00 2001 From: Steve Heim Date: Wed, 11 Sep 2024 14:37:52 -0400 Subject: [PATCH 23/26] hardcode Jacobian (halves time for _apply_coupling) --- .../mini_cheetah/mini_cheetah_ref_config.py | 4 ++ gym/envs/mit_humanoid/mit_humanoid.py | 28 ++++++------ gym/envs/mit_humanoid/mit_humanoid_config.py | 43 +++++++++---------- learning/modules/QRCritics.py | 2 +- 4 files changed, 39 insertions(+), 38 deletions(-) diff --git a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py index d3a301d3..37c52919 100644 --- a/gym/envs/mini_cheetah/mini_cheetah_ref_config.py +++ b/gym/envs/mini_cheetah/mini_cheetah_ref_config.py @@ -74,6 +74,7 @@ class actor(MiniCheetahRunnerCfg.actor): frequency = 100 hidden_dims = [256, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid + layer_norm = [True, True, False] activation = "elu" smooth_exploration = False exploration_sample_freq = 16 @@ -102,6 +103,9 @@ class noise: class critic(MiniCheetahRunnerCfg.critic): hidden_dims = [256, 256, 128] + layer_norm = [True, True, False] + dropouts = [0.1, 0.0, 0.0] + # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" obs = [ diff --git a/gym/envs/mit_humanoid/mit_humanoid.py b/gym/envs/mit_humanoid/mit_humanoid.py index a580fd57..a2e082e8 100644 --- a/gym/envs/mit_humanoid/mit_humanoid.py +++ b/gym/envs/mit_humanoid/mit_humanoid.py @@ -32,29 +32,28 @@ def _init_sampled_history_buffers(self): self.sampled_history_threshold = int( self.cfg.control.ctrl_frequency / self.cfg.env.sampled_history_frequency ) + self.J = torch.eye(self.num_dof).to(self.device) + self.J[4, 3] = 1 + self.J[9, 8] = 1 + self.J_inv_T = torch.inverse(self.J.T) def _apply_coupling(self, q, qd, q_des, qd_des, kp, kd, tau_ff): # Create a Jacobian matrix and move it to the same device as input tensors - J = torch.eye(q.shape[-1]).to(q.device) - J[4, 3] = 1 - J[9, 8] = 1 # Perform transformations using Jacobian - q = torch.matmul(q, J.T) - qd = torch.matmul(qd, J.T) - q_des = torch.matmul(q_des, J.T) - qd_des = torch.matmul(qd_des, J.T) - - # Inverse of the transpose of Jacobian - J_inv_T = torch.inverse(J.T) + q = torch.matmul(q, self.J.T) + qd = torch.matmul(qd, self.J.T) + q_des = torch.matmul(q_des, self.J.T) + qd_des = torch.matmul(qd_des, self.J.T) # Compute feed-forward torques - tau_ff = torch.matmul(J_inv_T, tau_ff.T).T + tau_ff = torch.matmul(self.J_inv_T, tau_ff.T).T # Compute kp and kd terms kp = torch.diagonal( torch.matmul( - torch.matmul(J_inv_T, torch.diag_embed(kp, dim1=-2, dim2=-1)), J_inv_T.T + torch.matmul(self.J_inv_T, torch.diag_embed(kp, dim1=-2, dim2=-1)), + self.J_inv_T.T, ), dim1=-2, dim2=-1, @@ -62,7 +61,8 @@ def _apply_coupling(self, q, qd, q_des, qd_des, kp, kd, tau_ff): kd = torch.diagonal( torch.matmul( - torch.matmul(J_inv_T, torch.diag_embed(kd, dim1=-2, dim2=-1)), J_inv_T.T + torch.matmul(self.J_inv_T, torch.diag_embed(kd, dim1=-2, dim2=-1)), + self.J_inv_T.T, ), dim1=-2, dim2=-1, @@ -70,7 +70,7 @@ def _apply_coupling(self, q, qd, q_des, qd_des, kp, kd, tau_ff): # Compute torques torques = kp * (q_des - q) + kd * (qd_des - qd) + tau_ff - torques = torch.matmul(torques, J) + torques = torch.matmul(torques, self.J) return torques diff --git 
a/gym/envs/mit_humanoid/mit_humanoid_config.py b/gym/envs/mit_humanoid/mit_humanoid_config.py index dc02a211..861f2826 100644 --- a/gym/envs/mit_humanoid/mit_humanoid_config.py +++ b/gym/envs/mit_humanoid/mit_humanoid_config.py @@ -112,7 +112,7 @@ class control(LeggedRobotCfg.control): } # [N*m*s/rad] ctrl_frequency = 100 - desired_sim_frequency = 1000 + desired_sim_frequency = 500 filter_gain = 0.1586 # 1: no filtering, 0: wall @@ -123,7 +123,7 @@ class commands: resampling_time = 10.0 # time before command are changed[s] class ranges: - lin_vel_x = [-0.0, 0.0] # min max [m/s] [-0.75, 0.75] + lin_vel_x = [-0.0, 4.0] # min max [m/s] [-0.75, 0.75] lin_vel_y = 0.0 # max [m/s] yaw_vel = 0.0 # max [rad/s] @@ -227,10 +227,12 @@ class MITHumanoidRunnerCfg(LeggedRobotRunnerCfg): runner_class_name = "OnPolicyRunner" class actor(LeggedRobotRunnerCfg.actor): + frequency = 100 init_noise_std = 1.0 hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + layer_norm = [True, True, False] smooth_exploration = False obs = [ @@ -238,16 +240,15 @@ class actor(LeggedRobotRunnerCfg.actor): "base_lin_vel", "base_ang_vel", "projected_gravity", - # "commands", + "commands", "dof_pos_obs", "dof_vel", - "dof_pos_history", - # "sampled_history_dof_pos", - # "sampled_history_dof_vel", - # "sampled_history_dof_pos_target", - "oscillator_obs", + # "dof_pos_history", + "sampled_history_dof_pos", + "sampled_history_dof_vel", + "sampled_history_dof_pos_target", ] - normalize_obs = True + normalize_obs = False actions = ["dof_pos_target"] disable_actions = False @@ -264,26 +265,28 @@ class critic(LeggedRobotRunnerCfg.critic): hidden_dims = [512, 256, 128] # * can be elu, relu, selu, crelu, lrelu, tanh, sigmoid activation = "elu" + layer_norm = [True, True, False] obs = [ "base_height", "base_lin_vel", "base_ang_vel", "projected_gravity", - "commands", "dof_pos_obs", "dof_vel", - "dof_pos_history", + "sampled_history_dof_pos", + "sampled_history_dof_vel", + "sampled_history_dof_pos_target", ] - normalize_obs = True + normalize_obs = False class reward: class weights: - # tracking_lin_vel = 0.0 - # tracking_ang_vel = 1.5 + tracking_lin_vel = 4.0 + tracking_ang_vel = 0.5 # orientation = 1.0 torques = 5.0e-4 - # min_base_height = 1.5 + min_base_height = 1.5 action_rate = 1e-3 action_rate2 = 1e-3 lin_vel_z = 0.0 @@ -292,8 +295,6 @@ class weights: # stand_still = 0.25 dof_pos_limits = 0.25 dof_near_home = 0.25 - stance = 1.0 - swing = 1.0 hips_forward = 0.0 walk_freq = 0.0 # 2.5 @@ -305,12 +306,8 @@ class algorithm(LeggedRobotRunnerCfg.algorithm): gamma = 0.99 lam = 0.95 # shared - batch_size = 2**15 - max_gradient_steps = 24 - # new - storage_size = 2**17 # new - batch_size = 2**15 # new - + batch_size = 4096 + max_gradient_steps = 48 clip_param = 0.2 learning_rate = 1.0e-3 max_grad_norm = 1.0 diff --git a/learning/modules/QRCritics.py b/learning/modules/QRCritics.py index d2ed9378..9dd51489 100644 --- a/learning/modules/QRCritics.py +++ b/learning/modules/QRCritics.py @@ -19,7 +19,7 @@ def init_weights(m): m.bias.data.fill_(0.01) -class Critic(nn.Module): +class Critic2(nn.Module): def __init__( self, num_obs, From 2a7d1a1c099ba47e67ccfd7c693a566a82d54b88 Mon Sep 17 00:00:00 2001 From: sheim Date: Wed, 10 Dec 2025 16:56:22 -0500 Subject: [PATCH 24/26] ruff --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 075e1dbf..14539b8c 100644 --- a/.pre-commit-config.yaml +++ 
From 2a7d1a1c099ba47e67ccfd7c693a566a82d54b88 Mon Sep 17 00:00:00 2001
From: sheim
Date: Wed, 10 Dec 2025 16:56:22 -0500
Subject: [PATCH 24/26] ruff

---
 .pre-commit-config.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 075e1dbf..14539b8c 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,11 +1,11 @@
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.1.3
+    rev: v0.14.2
    hooks:
      - id: ruff-format

  - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.1.3
+    rev: v0.14.2
    hooks:
      - id: ruff
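Note: bumping the hooks from v0.1.3 to v0.14.2 changes how ruff-format lays out long assert statements, and that is all the next commit's mechanical reformat applies across the logger and test files: the condition stays on one line and the message is parenthesized instead. Both layouts, shown schematically on a short runnable example, are equivalent at runtime:

    value, limit = 3, 5

    # Schematic of the old layout: the condition is wrapped in parentheses.
    assert (
        value < limit
    ), "value exceeded limit"

    # Schematic of the new layout: the message is wrapped instead.
    assert value < limit, (
        "value exceeded limit"
    )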
From 8b9fcb137d9342acfd167f6451336b86ed01f3ad Mon Sep 17 00:00:00 2001
From: sheim
Date: Wed, 10 Dec 2025 16:56:50 -0500
Subject: [PATCH 25/26] ruff

---
 learning/utils/logger/PerIterationLogs.py | 6 ++--
 learning/utils/logger/test_logger.py | 36 +++++++++----------
 .../test_runner_integration.py | 18 +++++-----
 .../test_generated_torch_files.py | 6 ++--
 4 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/learning/utils/logger/PerIterationLogs.py b/learning/utils/logger/PerIterationLogs.py
index 0f30b899..5d781efb 100644
--- a/learning/utils/logger/PerIterationLogs.py
+++ b/learning/utils/logger/PerIterationLogs.py
@@ -6,9 +6,9 @@ def __init__(self):

     def register_items(self, category, target, attribute_list):
         if category in self.targets.keys():
-            assert (
-                self.targets[category] == target
-            ), "Category already registered with different target"
+            assert self.targets[category] == target, (
+                "Category already registered with different target"
+            )
         else:
             self.targets[category] = target
             self.logs[category] = {}
diff --git a/learning/utils/logger/test_logger.py b/learning/utils/logger/test_logger.py
index eaef76f6..941a4f0b 100644
--- a/learning/utils/logger/test_logger.py
+++ b/learning/utils/logger/test_logger.py
@@ -10,9 +10,9 @@ def __init__(self):

 def all_rewards_registered(logger, reward_names):
     for key in reward_names:
-        assert (
-            key in logger.reward_logs.log_items.keys()
-        ), "key not registered in logger."
+        assert key in logger.reward_logs.log_items.keys(), (
+            "key not registered in logger."
+        )


 def only_rewards_registered(logger, reward_names):
@@ -27,13 +27,13 @@ def only_category_registered(logger, categories):

 def both_target_and_log_set_up(logger):
     for key in logger.iteration_logs.logs.keys():
-        assert (
-            key in logger.iteration_logs.targets.keys()
-        ), "target not registered in PerIteration."
+        assert key in logger.iteration_logs.targets.keys(), (
+            "target not registered in PerIteration."
+        )
     for key in logger.iteration_logs.targets.keys():
-        assert (
-            key in logger.iteration_logs.logs.keys()
-        ), "log not registered in PerIteration."
+        assert key in logger.iteration_logs.logs.keys(), (
+            "log not registered in PerIteration."
+        )


 def test_logger_setup():
@@ -71,16 +71,16 @@ def check_episode_count(logger, expected_count=1):

 def check_average_time(logger, expected_time):
     avg_time = logger.reward_logs.get_average_time()
-    assert (
-        abs(avg_time.item() - expected_time) < 1e-5
-    ), f"Average time {avg_time} is not close to {expected_time}"
+    assert abs(avg_time.item() - expected_time) < 1e-5, (
+        f"Average time {avg_time} is not close to {expected_time}"
+    )


 def check_average_reward(logger, reward_name, expected_average):
     avg_reward = logger.reward_logs.get_average_rewards()[reward_name]
-    assert (
-        abs(avg_reward.item() - expected_average) < 1e-5
-    ), f"Average reward {avg_reward} is not close to {expected_average}"
+    assert abs(avg_reward.item() - expected_average) < 1e-5, (
+        f"Average reward {avg_reward} is not close to {expected_average}"
+    )


 def test_logging_rewards():
@@ -173,8 +173,8 @@ def test_timer():
     ETA2 = logger.estimate_ETA(["first_step"], mode="total")
     expected_ETA2 = a * (1000 - 1)
-    assert (
-        abs(ETA2 - expected_ETA2) < 1e-5
-    ), f"ETA {ETA2} is not close to {expected_ETA2}"
+    assert abs(ETA2 - expected_ETA2) < 1e-5, (
+        f"ETA {ETA2} is not close to {expected_ETA2}"
+    )

     assert (a + b) >= 2 * trial_time, "Timer not working correctly."
diff --git a/tests/integration_tests/test_runner_integration.py b/tests/integration_tests/test_runner_integration.py
index 611108e5..d847f627 100644
--- a/tests/integration_tests/test_runner_integration.py
+++ b/tests/integration_tests/test_runner_integration.py
@@ -58,9 +58,9 @@ def test_default_integration_settings(self, args):
         with torch.no_grad():
             actions = runner.get_inference_actions()
             deployed_actions = runner.env.get_states(runner.actor_cfg["actions"])
-        assert (
-            torch.equal(actions, torch.zeros_like(actions)) is False
-        ), "Policy returning all zeros"
+        assert torch.equal(actions, torch.zeros_like(actions)) is False, (
+            "Policy returning all zeros"
+        )
         assert (
             torch.equal(deployed_actions, torch.zeros_like(deployed_actions)) is False
         ), "Actions not written to environment"
@@ -78,9 +78,9 @@ def test_default_integration_settings(self, args):
         model_7_path = os.path.join(runner.log_dir, "model_7.pt")
         model_8_path = os.path.join(runner.log_dir, "model_8.pt")

-        assert os.path.exists(
-            model_0_path
-        ), f"{model_0_path} (pre-iteration) was not saved"
+        assert os.path.exists(model_0_path), (
+            f"{model_0_path} (pre-iteration) was not saved"
+        )
         assert not os.path.exists(model_1_path), f"{model_1_path} was saved"
         assert not os.path.exists(model_2_path), f"{model_2_path} was saved"
         assert os.path.exists(model_3_path), f"{model_3_path} was not saved"
         assert not os.path.exists(model_4_path), f"{model_4_path} was saved"
         assert not os.path.exists(model_5_path), f"{model_5_path} was saved"
         assert os.path.exists(model_6_path), f"{model_6_path} was not saved"
         assert not os.path.exists(model_7_path), f"{model_7_path} was saved"
-        assert os.path.exists(
-            model_8_path
-        ), f"{model_5_path}(last iteration) was not saved"
+        assert os.path.exists(model_8_path), (
+            f"{model_5_path}(last iteration) was not saved"
+        )

         obs = torch.randn_like(runner.get_obs(runner.actor_cfg["obs"]))
         actions_first = runner.alg.actor.act_inference(obs).cpu().clone()
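Note: the save/not-saved assertions above (models 0, 3, 6, and the final 8 exist; 1, 2, 4, 5, and 7 do not) imply a checkpoint rule of "every few iterations, plus always the last one". A sketch of that rule, with the interval inferred from the test rather than read from any config (should_save is a hypothetical helper, not the runner's API):

    def should_save(iteration: int, save_interval: int, max_iterations: int) -> bool:
        # Checkpoint on the interval and always on the final iteration.
        return iteration % save_interval == 0 or iteration == max_iterations

    # Reproduces the pattern asserted in the integration test: with 8 iterations and
    # an interval of 3, models 0, 3, 6, and 8 are written and the rest are skipped.
    saved = [it for it in range(9) if should_save(it, save_interval=3, max_iterations=8)]
    assert saved == [0, 3, 6, 8]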
diff --git a/tests/regression_tests/test_generated_torch_files.py b/tests/regression_tests/test_generated_torch_files.py
index cf640ed3..55626034 100644
--- a/tests/regression_tests/test_generated_torch_files.py
+++ b/tests/regression_tests/test_generated_torch_files.py
@@ -27,6 +27,6 @@ def test_generated_tensor_matches_reference(tmp_path):
     _generate_candidate_tensor(candidate_path)
     reference = _load_tensor(REFERENCE_FILE)
     candidate = _load_tensor(candidate_path)
-    assert torch.equal(
-        reference, candidate
-    ), f"Tensors in {REFERENCE_FILE} and {candidate_path} differ."
+    assert torch.equal(reference, candidate), (
+        f"Tensors in {REFERENCE_FILE} and {candidate_path} differ."
+    )

From 18343284aed6a8370628680a1c167ab7747c0f4f Mon Sep 17 00:00:00 2001
From: sheim
Date: Wed, 10 Dec 2025 18:05:50 -0500
Subject: [PATCH 26/26] catch sample history bug

---
 gym/envs/mit_humanoid/mit_humanoid.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/gym/envs/mit_humanoid/mit_humanoid.py b/gym/envs/mit_humanoid/mit_humanoid.py
index a2e082e8..cb4db70f 100644
--- a/gym/envs/mit_humanoid/mit_humanoid.py
+++ b/gym/envs/mit_humanoid/mit_humanoid.py
@@ -84,8 +84,8 @@ def _reset_system(self, env_ids):
     def _reset_sampled_history_buffers(self, ids):
         n = self.cfg.env.sampled_history_length
         self.sampled_history_dof_pos_target[ids] = self.dof_pos_target[ids].tile(n)
-        self.sampled_history_dof_pos[ids] = self.dof_pos_target[ids].tile(n)
-        self.sampled_history_dof_vel[ids] = self.dof_pos_target[ids].tile(n)
+        self.sampled_history_dof_pos[ids] = self.dof_pos[ids].tile(n)
+        self.sampled_history_dof_vel[ids] = self.dof_vel[ids].tile(n)

     # compute_torques accounting for coupling, and filtering torques
     def _compute_torques(self):
@@ -121,11 +121,11 @@ def _update_sampled_history_buffers(self):
         self.sampled_history_dof_pos[ids] = torch.roll(
             self.sampled_history_dof_pos[ids], self.num_dof, dims=1
         )  # check
-        self.sampled_history_dof_pos[ids, : self.num_dof] = self.dof_pos_target[ids]
+        self.sampled_history_dof_pos[ids, : self.num_dof] = self.dof_pos[ids]
         self.sampled_history_dof_vel[ids] = torch.roll(
             self.sampled_history_dof_vel[ids], self.num_dof, dims=1
         )  # check
-        self.sampled_history_dof_vel[ids, : self.num_dof] = self.dof_pos_target[ids]
+        self.sampled_history_dof_vel[ids, : self.num_dof] = self.dof_vel[ids]

         self.sampled_history_counter[ids] = 0
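Note: PATCH 26 fixes a copy-paste bug, where both sampled-history buffers were reset and updated from dof_pos_target instead of the measured dof_pos and dof_vel. A minimal self-contained check of the roll-and-insert update the fix restores (buffer sizes are illustrative):

    import torch

    num_envs, num_dof, history_len = 2, 3, 4
    dof_pos = torch.arange(num_envs * num_dof, dtype=torch.float32).reshape(num_envs, num_dof)
    history = torch.zeros(num_envs, num_dof * history_len)

    # Shift the stored samples back by one slot, then write the measured state
    # (not the target, which was the bug) into the newest num_dof slots.
    history = torch.roll(history, num_dof, dims=1)
    history[:, :num_dof] = dof_pos

    assert torch.equal(history[:, :num_dof], dof_pos)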