Merged
30 commits
6b9a2c9
add coupling to humanoid torques
sheim Mar 16, 2024
e52208f
add exp avg filtering
sheim Mar 16, 2024
54b826d
add oscillators and phase-based rewards, some tuning
sheim Mar 16, 2024
c9411cd
some tuning
sheim Mar 18, 2024
90ff50c
implement history with (approximated) arbitrary sampling frequency an…
sheim Mar 18, 2024
5020fd0
make pd gains specific for each robot
sheim Mar 19, 2024
82968b1
refactor sampled history to be more explicit and not overlap with `do…
sheim Mar 19, 2024
3ae04a0
properly roll histories, retune action_rate for mc
sheim Mar 23, 2024
ee05de4
enable arms in learnt urdf
sheim Mar 24, 2024
0aab100
hotfix old runner, and set as default for humanoid. New runner is num…
sheim Mar 24, 2024
d86162e
WIP: tuning, critic loss higher than expected
sheim Mar 29, 2024
816a18b
Merge branch 'dev' into humanoid
sheim Aug 6, 2024
098989a
fix hips_forward dimensionality
sheim Aug 6, 2024
0611bcd
lander env
sheim Aug 6, 2024
fd30865
unify humanoid and lander
sheim Aug 7, 2024
be2e990
wip
sheim Sep 4, 2024
3cd76b3
Merge branch 'dev' into humanoid
sheim Sep 4, 2024
2799110
Refactor for action frequency handled in runner
sheim Sep 4, 2024
12e8327
Merge branch 'dev' into humanoid
sheim Sep 8, 2024
f105c0e
update nn_params
sheim Sep 9, 2024
c04b563
apply stash w/ sim-freq reward sampling.
sheim Sep 9, 2024
e395673
refactor skeleton with a super_init fix, and pre-initialize reward bu…
sheim Sep 9, 2024
ea5cdff
refactor: redo rewards computation, with a dict of reward functions i…
sheim Sep 9, 2024
7296c6c
compute switch once per decimation (speedup ~10%)
sheim Sep 9, 2024
69218dd
fixed logging bug that scaled with wrong dt
sheim Sep 10, 2024
b1bd4f2
hardcode Jacobian (halves time for _apply_coupling)
sheim Sep 11, 2024
2022c82
merge in humanoid branch, which had some refactoring and bugfixes, in…
sheim Dec 10, 2025
2a7d1a1
ruff
sheim Dec 10, 2025
8b9fcb1
ruff
sheim Dec 10, 2025
1834328
catch sample history bug
sheim Dec 10, 2025
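One of the commits above adds exponential-average filtering of the commanded joint targets; that code is not part of the hunks shown below, but such a filter is normally just an exponential moving average applied each control step. A minimal sketch, where the class name, the `alpha` coefficient, and the zero-initialised state are assumptions rather than code from this PR:

```python
import torch


class ExpAvgFilter:
    """Exponential moving average over per-env action targets (sketch only)."""

    def __init__(self, num_envs: int, num_actuators: int, alpha: float = 0.2):
        self.alpha = alpha  # made-up smoothing coefficient; 1.0 disables filtering
        self.state = torch.zeros(num_envs, num_actuators)

    def __call__(self, raw_target: torch.Tensor) -> torch.Tensor:
        # Blend the newest commanded target with the running average.
        self.state = self.alpha * raw_target + (1.0 - self.alpha) * self.state
        return self.state
```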
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -1,11 +1,11 @@
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.1.3
rev: v0.14.2
hooks:
- id: ruff-format

- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.1.3
rev: v0.14.2
hooks:
- id: ruff

4 changes: 4 additions & 0 deletions gym/envs/__init__.py
@@ -20,6 +20,7 @@
"A1": ".a1.a1",
"HumanoidRunning": ".mit_humanoid.humanoid_running",
"Pendulum": ".pendulum.pendulum",
"Lander": ".mit_humanoid.lander",
}

config_dict = {
@@ -34,6 +35,7 @@
"HumanoidRunningCfg": ".mit_humanoid.humanoid_running_config",
"PendulumCfg": ".pendulum.pendulum_config",
"PendulumSACCfg": ".pendulum.pendulum_SAC_config",
"LanderCfg": ".mit_humanoid.lander_config",
"PendulumPSDCfg": ".pendulum.pendulum_PSD_config",
}

@@ -49,6 +51,7 @@
"HumanoidRunningRunnerCfg": ".mit_humanoid.humanoid_running_config",
"PendulumRunnerCfg": ".pendulum.pendulum_config",
"PendulumSACRunnerCfg": ".pendulum.pendulum_SAC_config",
"LanderRunnerCfg": ".mit_humanoid.lander_config",
"PendulumPSDRunnerCfg": ".pendulum.pendulum_PSD_config",
}

@@ -79,6 +82,7 @@
"flat_anymal_c": ["Anymal", "AnymalCFlatCfg", "AnymalCFlatRunnerCfg"],
"pendulum": ["Pendulum", "PendulumCfg", "PendulumRunnerCfg"],
"sac_pendulum": ["Pendulum", "PendulumSACCfg", "PendulumSACRunnerCfg"],
"lander": ["Lander", "LanderCfg", "LanderRunnerCfg"],
"psd_pendulum": ["Pendulum", "PendulumPSDCfg", "PendulumPSDRunnerCfg"],
}

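The three dictionaries in this file map class names to module paths relative to the `gym.envs` package, and the new `Lander` entries follow the existing pattern. A lazy lookup over such a registry usually resolves to an `importlib` call like the sketch below; `load_task_class` is a hypothetical helper for illustration, not code from this repository:

```python
import importlib

# Name -> ".module.path" registry in the same shape as task_dict above.
task_dict = {
    "Lander": ".mit_humanoid.lander",
    "Pendulum": ".pendulum.pendulum",
}


def load_task_class(name: str, package: str = "gym.envs"):
    # Import the submodule relative to the envs package, then fetch the class
    # that shares its name with the registry key.
    module = importlib.import_module(task_dict[name], package=package)
    return getattr(module, name)
```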
19 changes: 3 additions & 16 deletions gym/envs/base/base_task.py
@@ -18,33 +18,20 @@ def __init__(self, gym, sim, cfg, sim_params, sim_device, headless):
# * env device is GPU only if sim is on GPU and use_gpu_pipeline=True,
# * otherwise returned tensors are copied to CPU by physX.
if sim_device_type == "cuda" and sim_params.use_gpu_pipeline:
self.device = self.sim_device
device = self.sim_device
else:
self.device = "cpu"
device = "cpu"

# * graphics device for rendering, -1 for no rendering
self.graphics_device_id = self.sim_device_id

self.num_envs = cfg.env.num_envs
self.num_actuators = cfg.env.num_actuators

# * optimization flags for pytorch JIT
torch._C._jit_set_profiling_mode(False)
torch._C._jit_set_profiling_executor(False)

# allocate buffers
self.to_be_reset = torch.ones(
self.num_envs, device=self.device, dtype=torch.bool
)
self.terminated = torch.ones(
self.num_envs, device=self.device, dtype=torch.bool
)
self.episode_length_buf = torch.zeros(
self.num_envs, device=self.device, dtype=torch.long
)
self.timed_out = torch.zeros(
self.num_envs, device=self.device, dtype=torch.bool
)
super().__init__(num_envs=cfg.env.num_envs, device=device)

# todo: read from config
self.enable_viewer_sync = True
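The constructor above now resolves only the device and hands buffer allocation to the parent class via `super().__init__`. The resulting split looks roughly like the sketch below; the `BaseTask` signature is simplified here, and the buffer list mirrors the `TaskSkeleton` diff further down:

```python
import torch


class TaskSkeleton:
    # Parent owns the shared per-env buffers (see task_skeleton.py below).
    def __init__(self, num_envs=1, device="cpu"):
        self.num_envs = num_envs
        self.device = device
        self.to_be_reset = torch.ones(num_envs, device=device, dtype=torch.bool)
        self.terminated = torch.ones(num_envs, device=device, dtype=torch.bool)
        self.episode_length_buf = torch.zeros(num_envs, device=device, dtype=torch.long)
        self.timed_out = torch.zeros(num_envs, device=device, dtype=torch.bool)


class BaseTask(TaskSkeleton):
    # Simplified signature: the real class also takes gym, sim, cfg, etc.
    def __init__(self, num_envs, use_gpu_pipeline, sim_device="cuda:0"):
        device = sim_device if use_gpu_pipeline else "cpu"
        super().__init__(num_envs=num_envs, device=device)
```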
21 changes: 6 additions & 15 deletions gym/envs/base/fixed_robot.py
@@ -580,27 +580,18 @@ def _reward_dof_vel(self):
def _reward_action_rate(self):
"""Penalize changes in actions"""
nact = self.num_actuators
dt2 = (self.dt * self.cfg.control.decimation) ** 2
error = (
torch.square(
self.dof_pos_history[:, :nact]
- self.dof_pos_history[:, nact : 2 * nact]
)
/ dt2
error = torch.square(
self.dof_pos_history[:, :nact] - self.dof_pos_history[:, 2 * nact :]
)
return -torch.mean(error, dim=1)

def _reward_action_rate2(self):
"""Penalize changes in actions"""
nact = self.num_actuators
dt2 = (self.dt * self.cfg.control.decimation) ** 2
error = (
torch.square(
self.dof_pos_history[:, :nact]
- 2 * self.dof_pos_history[:, nact : 2 * nact]
+ self.dof_pos_history[:, 2 * nact :]
)
/ dt2
error = torch.square(
self.dof_pos_history[:, :nact]
- 2 * self.dof_pos_history[:, nact : 2 * nact]
+ self.dof_pos_history[:, 2 * nact :]
)
return -torch.mean(error, dim=1)

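With the `dt**2` normalization dropped, both penalties above reduce to squared differences over the three stored target frames, so their scale now depends only on the action units. A standalone sketch of the same arithmetic, assuming the history buffer is laid out as [newest | previous | oldest] along dim 1:

```python
import torch

num_envs, nact = 4, 12  # made-up sizes
dof_pos_history = torch.randn(num_envs, 3 * nact)

# First penalty: squared difference between the newest and oldest stored
# targets (mirrors the new _reward_action_rate).
rate = torch.square(dof_pos_history[:, :nact] - dof_pos_history[:, 2 * nact :])
reward_action_rate = -torch.mean(rate, dim=1)

# Second penalty: discrete second difference of the target trajectory
# (mirrors _reward_action_rate2).
rate2 = torch.square(
    dof_pos_history[:, :nact]
    - 2 * dof_pos_history[:, nact : 2 * nact]
    + dof_pos_history[:, 2 * nact :]
)
reward_action_rate2 = -torch.mean(rate2, dim=1)
```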
97 changes: 50 additions & 47 deletions gym/envs/base/legged_robot.py
@@ -110,12 +110,12 @@ def _post_decimation_step(self):

self.base_height = self.root_states[:, 2:3]

n = self.num_actuators
self.dof_pos_history[:, 2 * n :] = self.dof_pos_history[:, n : 2 * n]
self.dof_pos_history[:, n : 2 * n] = self.dof_pos_history[:, :n]
self.dof_pos_history[:, :n] = self.dof_pos_target
self.dof_pos_obs = self.dof_pos - self.default_dof_pos

self.dof_pos_history = self.dof_pos_history.roll(self.num_actuators)
# self.dof_pos_history[:, : self.num_actuators] = self.dof_pos_obs
self.dof_pos_history[:, : self.num_actuators] = self.dof_pos_target

env_ids = (
self.episode_length_buf % int(self.cfg.commands.resampling_time / self.dt)
== 0
@@ -133,7 +133,10 @@ def _reset_idx(self, env_ids):
self._reset_system(env_ids)
self._resample_commands(env_ids)
# * reset buffers
self.dof_pos_history[env_ids] = 0.0
self.dof_pos_obs[env_ids] = self.dof_pos[env_ids] - self.default_dof_pos
# self.dof_pos_history[env_ids] = self.dof_pos_obs[env_ids].tile(3)
self.dof_pos_target[env_ids] = self.default_dof_pos
self.dof_pos_history[env_ids] = self.dof_pos_target[env_ids].tile(3)
self.episode_length_buf[env_ids] = 0

def _initialize_sim(self):
@@ -481,41 +484,26 @@ def _init_buffers(self):
get_axis_params(-1.0, self.up_axis_idx), device=self.device
).repeat((self.num_envs, 1))
self.torques = torch.zeros(
self.num_envs,
self.num_actuators,
dtype=torch.float,
device=self.device,
self.num_envs, self.num_actuators, dtype=torch.float, device=self.device
)
self.p_gains = torch.zeros(
self.num_actuators, dtype=torch.float, device=self.device
self.num_envs, self.num_actuators, dtype=torch.float, device=self.device
)
self.d_gains = torch.zeros(
self.num_actuators, dtype=torch.float, device=self.device
self.num_envs, self.num_actuators, dtype=torch.float, device=self.device
)
self.dof_pos_target = torch.zeros(
self.num_envs,
self.num_actuators,
dtype=torch.float,
device=self.device,
self.num_envs, self.num_actuators, dtype=torch.float, device=self.device
)
self.dof_vel_target = torch.zeros(
self.num_envs,
self.num_actuators,
dtype=torch.float,
device=self.device,
self.num_envs, self.num_actuators, dtype=torch.float, device=self.device
)
self.tau_ff = torch.zeros(
self.num_envs,
self.num_actuators,
dtype=torch.float,
device=self.device,
self.num_envs, self.num_actuators, dtype=torch.float, device=self.device
)

self.dof_pos_history = torch.zeros(
self.num_envs,
self.num_actuators * 3,
dtype=torch.float,
device=self.device,
self.num_envs, self.num_actuators * 3, dtype=torch.float, device=self.device
)
self.commands = torch.zeros(
self.num_envs, 3, dtype=torch.float, device=self.device
@@ -532,6 +520,29 @@
self.num_envs, 1, dtype=torch.float, device=self.device
)

# # * get the body_name to body_index dict
# body_dict = self.gym.get_actor_rigid_body_dict(
# self.envs[0], self.actor_handles[0]
# )
# # * extract a list of body_names where the index is the id number
# body_names = [
# body_tuple[0]
# for body_tuple in sorted(
# body_dict.items(), key=lambda body_tuple: body_tuple[1]
# )
# ]
# # * construct a list of id numbers corresponding to end_effectors
# self.end_effector_ids = []
# for end_effector_name in self.cfg.asset.foot_collisionbox_names:
# self.end_effector_ids.extend(
# [
# body_names.index(body_name)
# for body_name in body_names
# if end_effector_name in body_name
# ]
# )
# # ----------------------------------------

if self.cfg.terrain.measure_heights:
self.height_points = self._init_height_points()
self.measured_heights = 0
@@ -559,8 +570,8 @@ def _init_buffers(self):
found = False
for dof_name in self.cfg.control.stiffness.keys():
if dof_name in name:
self.p_gains[i] = self.cfg.control.stiffness[dof_name]
self.d_gains[i] = self.cfg.control.damping[dof_name]
self.p_gains[:, i] = self.cfg.control.stiffness[dof_name]
self.d_gains[:, i] = self.cfg.control.damping[dof_name]
found = True
if not found:
self.p_gains[i] = 0.0
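The gain buffers are now allocated as (num_envs, num_actuators) instead of a single (num_actuators,) vector, which allows per-environment stiffness and damping values, in line with the "make pd gains specific for each robot" commit. The torque computation itself is not part of this diff; a generic PD sketch with 2-D gain tensors and made-up numbers:

```python
import torch

num_envs, n = 4, 12
p_gains = torch.full((num_envs, n), 30.0)  # can now differ per environment
d_gains = torch.full((num_envs, n), 1.5)
dof_pos_target = torch.zeros(num_envs, n)
dof_pos = torch.randn(num_envs, n)
dof_vel = torch.randn(num_envs, n)

# Standard PD law; elementwise multiplication behaves the same whether the
# gains are a shared 1-D vector or a per-env 2-D tensor.
torques = p_gains * (dof_pos_target - dof_pos) - d_gains * dof_vel
```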
@@ -995,11 +1006,11 @@ def _reward_lin_vel_z(self):

def _reward_ang_vel_xy(self):
"""Penalize xy axes base angular velocity"""
return -torch.sum(torch.square(self.base_ang_vel[:, :2]), dim=1)
return -torch.mean(torch.square(self.base_ang_vel[:, :2]), dim=1)

def _reward_orientation(self):
"""Penalize non flat base orientation"""
return -torch.sum(torch.square(self.projected_gravity[:, :2]), dim=1)
return -torch.mean(torch.square(self.projected_gravity[:, :2]), dim=1)

def _reward_base_height(self):
"""Penalize base height away from target"""
@@ -1019,26 +1030,18 @@ def _reward_dof_vel(self):
def _reward_action_rate(self):
"""Penalize changes in actions"""
n = self.num_actuators
dt2 = (self.dt * self.cfg.control.decimation) ** 2
error = (
torch.square(
self.dof_pos_history[:, :n] - self.dof_pos_history[:, n : 2 * n]
)
/ dt2
error = torch.square(
self.dof_pos_history[:, :n] - self.dof_pos_history[:, 2 * n :]
)
return -torch.mean(error, dim=1)

def _reward_action_rate2(self):
"""Penalize changes in actions"""
n = self.num_actuators
dt2 = (self.dt * self.cfg.control.decimation) ** 2
error = (
torch.square(
self.dof_pos_history[:, :n]
- 2 * self.dof_pos_history[:, n : 2 * n]
+ self.dof_pos_history[:, 2 * n :]
)
/ dt2
error = torch.square(
self.dof_pos_history[:, :n]
- 2 * self.dof_pos_history[:, n : 2 * n]
+ self.dof_pos_history[:, 2 * n :]
)
return -torch.mean(error, dim=1)

@@ -1084,7 +1087,7 @@ def _reward_tracking_lin_vel(self):
"""Tracking of linear velocity commands (xy axes)"""
error = torch.square(self.commands[:, :2] - self.base_lin_vel[:, :2])
error = torch.exp(-error / self.cfg.reward_settings.tracking_sigma)
return torch.sum(error, dim=1)
return torch.mean(error, dim=1)

def _reward_tracking_ang_vel(self):
"""Tracking of angular velocity commands (yaw)"""
Expand All @@ -1093,7 +1096,7 @@ def _reward_tracking_ang_vel(self):

def _reward_feet_contact_forces(self):
"""penalize high contact forces"""
return -torch.sum(
return -torch.mean(
(
torch.norm(self.contact_forces[:, self.feet_indices, :], dim=-1)
- self.cfg.reward_settings.max_contact_force
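For reference, the roll-and-overwrite history update from `_post_decimation_step` in standalone form. The sketch passes an explicit `dims=1`; the PR calls `.roll()` without `dims`, which rolls the flattened tensor, but because the newest block is overwritten immediately afterwards the stored frames end up the same:

```python
import torch

num_envs, n = 4, 18  # sizes are assumptions of this sketch
dof_pos_history = torch.zeros(num_envs, 3 * n)
dof_pos_target = torch.randn(num_envs, n)

# Shift every stored frame back by one slot along the actuator dimension,
# then overwrite the newest slot with the latest commanded target.
dof_pos_history = dof_pos_history.roll(n, dims=1)
dof_pos_history[:, :n] = dof_pos_target
```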
5 changes: 1 addition & 4 deletions gym/envs/base/legged_robot_config.py
@@ -163,7 +163,7 @@ class asset:
file = ""
# * name of the feet bodies,
# * used to index body state and contact force tensors
foot_name = "None"
foot_name = "foot"
penalize_contacts_on = []
terminate_after_contacts_on = []
end_effector_names = []
@@ -301,9 +301,6 @@ class algorithm:
batch_size = 2**15
max_gradient_steps = 24
# new
storage_size = 2**17 # new
batch_size = 2**15 # new

clip_param = 0.2
learning_rate = 1.0e-3
max_grad_norm = 1.0
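The removed `storage_size` and `batch_size` lines duplicated attributes defined just above in the same class body; in Python a repeated class-level assignment silently replaces the earlier one, so only one copy can take effect anyway. A two-line illustration with made-up values:

```python
class Algo:
    batch_size = 2**15
    batch_size = 2**12  # silently replaces the earlier value; no error or warning


print(Algo.batch_size)  # 4096
```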
24 changes: 8 additions & 16 deletions gym/envs/base/task_skeleton.py
@@ -5,10 +5,15 @@


class TaskSkeleton:
def __init__(self, num_envs=1, max_episode_length=1.0, device="cpu"):
def __init__(self, num_envs=1, device="cpu"):
self.num_envs = num_envs
self.max_episode_length = max_episode_length
self.device = device

self.to_be_reset = torch.ones(num_envs, device=device, dtype=torch.bool)
self.terminated = torch.ones(num_envs, device=device, dtype=torch.bool)
self.episode_length_buf = torch.zeros(num_envs, device=device, dtype=torch.long)
self.timed_out = torch.zeros(num_envs, device=device, dtype=torch.bool)

return None

def get_states(self, obs_list):
@@ -52,25 +57,12 @@ def _reset_buffers(self):
self.terminated[:] = False
self.timed_out[:] = False

def compute_reward(self, reward_weights):
"""Compute and return a torch tensor of rewards
reward_weights: dict with keys matching reward names, and values
matching weights
"""
reward = torch.zeros(self.num_envs, device=self.device, dtype=torch.float)
for name, weight in reward_weights.items():
reward += weight * self._eval_reward(name)
return reward

def _eval_reward(self, name):
return eval("self._reward_" + name + "()")

def _check_terminations_and_timeouts(self):
"""Check if environments need to be reset"""
contact_forces = self.contact_forces[:, self.termination_contact_indices, :]
self.terminated |= torch.any(torch.norm(contact_forces, dim=-1) > 1.0, dim=1)
self.timed_out = self.episode_length_buf >= self.max_episode_length
self.to_be_reset = self.timed_out | self.terminated
# self.to_be_reset = self.timed_out | self.terminated

def step(self, actions):
raise NotImplementedError
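The `compute_reward` removed here resolved each term with `eval` on a constructed method name. The commit messages describe moving to a dict of reward functions handled elsewhere; the replacement is not shown in this diff, but the usual `getattr`-based pattern looks like this sketch:

```python
import torch


class RewardMixin:
    """Sketch of a dict-of-reward-functions setup replacing the eval call."""

    def _prepare_reward_functions(self, reward_weights):
        # Resolve each nonzero-weighted term to its bound method once,
        # instead of eval-ing "self._reward_<name>()" every step.
        self.reward_weights = {k: v for k, v in reward_weights.items() if v != 0.0}
        self.reward_functions = {
            name: getattr(self, "_reward_" + name) for name in self.reward_weights
        }

    def compute_reward(self):
        total = torch.zeros(self.num_envs, device=self.device, dtype=torch.float)
        for name, fn in self.reward_functions.items():
            total += self.reward_weights[name] * fn()
        return total
```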