Merged
30 commits
6b9a2c9
add coupling to humanoid torques
sheim Mar 16, 2024
e52208f
add exp avg filtering
sheim Mar 16, 2024
54b826d
add oscillators and phase-based rewards, some tuning
sheim Mar 16, 2024
c9411cd
some tuning
sheim Mar 18, 2024
90ff50c
implement history with (approximated) arbitrary sampling frequency an…
sheim Mar 18, 2024
5020fd0
make pd gains specific for each robot
sheim Mar 19, 2024
82968b1
refactor sampled history to be more explicit and not overlap with `do…
sheim Mar 19, 2024
3ae04a0
properly roll histories, retune action_rate for mc
sheim Mar 23, 2024
ee05de4
enable arms in learnt urdf
sheim Mar 24, 2024
0aab100
hotfix old runner, and set as default for humanoid. New runner is num…
sheim Mar 24, 2024
d86162e
WIP: tuning, critic loss higher than expected
sheim Mar 29, 2024
816a18b
Merge branch 'dev' into humanoid
sheim Aug 6, 2024
098989a
fix hips_forward dimensionality
sheim Aug 6, 2024
0611bcd
lander env
sheim Aug 6, 2024
fd30865
unify humanoid and lander
sheim Aug 7, 2024
be2e990
wip
sheim Sep 4, 2024
3cd76b3
Merge branch 'dev' into humanoid
sheim Sep 4, 2024
2799110
Refactor for action frequency handled in runner
sheim Sep 4, 2024
12e8327
Merge branch 'dev' into humanoid
sheim Sep 8, 2024
f105c0e
update nn_params
sheim Sep 9, 2024
c04b563
apply stash w/ sim-freq reward sampling.
sheim Sep 9, 2024
e395673
refactor skeleton with a super_init fix, and pre-initialize reward bu…
sheim Sep 9, 2024
ea5cdff
refactor: redo rewards computation, with a dict of reward functions i…
sheim Sep 9, 2024
7296c6c
compute switch once per decimation (speedup ~10%)
sheim Sep 9, 2024
69218dd
fixed logging bug that scaled with wrong dt
sheim Sep 10, 2024
b1bd4f2
hardcode Jacobian (halves time for _apply_coupling)
sheim Sep 11, 2024
2022c82
merge in humanoid branch, which had some refactoring and bugfixes, in…
sheim Dec 10, 2025
2a7d1a1
ruff
sheim Dec 10, 2025
8b9fcb1
ruff
sheim Dec 10, 2025
1834328
catch sample history bug
sheim Dec 10, 2025
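One of the commits above adds exponential-average filtering of the commanded joint targets; that code is not part of the hunks shown below, but such a filter is normally just an exponential moving average applied each control step. A minimal sketch, where the class name, the `alpha` coefficient, and the zero-initialised state are assumptions rather than code from this PR:

```python
import torch


class ExpAvgFilter:
    """Exponential moving average over per-env action targets (sketch only)."""

    def __init__(self, num_envs: int, num_actuators: int, alpha: float = 0.2):
        self.alpha = alpha  # made-up smoothing coefficient; 1.0 disables filtering
        self.state = torch.zeros(num_envs, num_actuators)

    def __call__(self, raw_target: torch.Tensor) -> torch.Tensor:
        # Blend the newest commanded target with the running average.
        self.state = self.alpha * raw_target + (1.0 - self.alpha) * self.state
        return self.state
```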
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -1,11 +1,11 @@
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.1.3
rev: v0.14.2
hooks:
- id: ruff-format

- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.1.3
rev: v0.14.2
hooks:
- id: ruff

4 changes: 4 additions & 0 deletions gym/envs/__init__.py
@@ -20,6 +20,7 @@
"A1": ".a1.a1",
"HumanoidRunning": ".mit_humanoid.humanoid_running",
"Pendulum": ".pendulum.pendulum",
"Lander": ".mit_humanoid.lander",
}

config_dict = {
@@ -34,6 +35,7 @@
"HumanoidRunningCfg": ".mit_humanoid.humanoid_running_config",
"PendulumCfg": ".pendulum.pendulum_config",
"PendulumSACCfg": ".pendulum.pendulum_SAC_config",
"LanderCfg": ".mit_humanoid.lander_config",
"PendulumPSDCfg": ".pendulum.pendulum_PSD_config",
}

@@ -49,6 +51,7 @@
"HumanoidRunningRunnerCfg": ".mit_humanoid.humanoid_running_config",
"PendulumRunnerCfg": ".pendulum.pendulum_config",
"PendulumSACRunnerCfg": ".pendulum.pendulum_SAC_config",
"LanderRunnerCfg": ".mit_humanoid.lander_config",
"PendulumPSDRunnerCfg": ".pendulum.pendulum_PSD_config",
}

@@ -79,6 +82,7 @@
"flat_anymal_c": ["Anymal", "AnymalCFlatCfg", "AnymalCFlatRunnerCfg"],
"pendulum": ["Pendulum", "PendulumCfg", "PendulumRunnerCfg"],
"sac_pendulum": ["Pendulum", "PendulumSACCfg", "PendulumSACRunnerCfg"],
"lander": ["Lander", "LanderCfg", "LanderRunnerCfg"],
"psd_pendulum": ["Pendulum", "PendulumPSDCfg", "PendulumPSDRunnerCfg"],
}

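The three dictionaries in this file map class names to module paths relative to the `gym.envs` package, and the new `Lander` entries follow the existing pattern. A lazy lookup over such a registry usually resolves to an `importlib` call like the sketch below; `load_task_class` is a hypothetical helper for illustration, not code from this repository:

```python
import importlib

# Name -> ".module.path" registry in the same shape as task_dict above.
task_dict = {
    "Lander": ".mit_humanoid.lander",
    "Pendulum": ".pendulum.pendulum",
}


def load_task_class(name: str, package: str = "gym.envs"):
    # Import the submodule relative to the envs package, then fetch the class
    # that shares its name with the registry key.
    module = importlib.import_module(task_dict[name], package=package)
    return getattr(module, name)
```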
19 changes: 3 additions & 16 deletions gym/envs/base/base_task.py
@@ -18,33 +18,20 @@ def __init__(self, gym, sim, cfg, sim_params, sim_device, headless):
# * env device is GPU only if sim is on GPU and use_gpu_pipeline=True,
# * otherwise returned tensors are copied to CPU by physX.
if sim_device_type == "cuda" and sim_params.use_gpu_pipeline:
self.device = self.sim_device
device = self.sim_device
else:
self.device = "cpu"
device = "cpu"

# * graphics device for rendering, -1 for no rendering
self.graphics_device_id = self.sim_device_id

self.num_envs = cfg.env.num_envs
self.num_actuators = cfg.env.num_actuators

# * optimization flags for pytorch JIT
torch._C._jit_set_profiling_mode(False)
torch._C._jit_set_profiling_executor(False)

# allocate buffers
self.to_be_reset = torch.ones(
self.num_envs, device=self.device, dtype=torch.bool
)
self.terminated = torch.ones(
self.num_envs, device=self.device, dtype=torch.bool
)
self.episode_length_buf = torch.zeros(
self.num_envs, device=self.device, dtype=torch.long
)
self.timed_out = torch.zeros(
self.num_envs, device=self.device, dtype=torch.bool
)
super().__init__(num_envs=cfg.env.num_envs, device=device)

# todo: read from config
self.enable_viewer_sync = True
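The constructor above now resolves only the device and hands buffer allocation to the parent class via `super().__init__`. The resulting split looks roughly like the sketch below; the `BaseTask` signature is simplified here, and the buffer list mirrors the `TaskSkeleton` diff further down:

```python
import torch


class TaskSkeleton:
    # Parent owns the shared per-env buffers (see task_skeleton.py below).
    def __init__(self, num_envs=1, device="cpu"):
        self.num_envs = num_envs
        self.device = device
        self.to_be_reset = torch.ones(num_envs, device=device, dtype=torch.bool)
        self.terminated = torch.ones(num_envs, device=device, dtype=torch.bool)
        self.episode_length_buf = torch.zeros(num_envs, device=device, dtype=torch.long)
        self.timed_out = torch.zeros(num_envs, device=device, dtype=torch.bool)


class BaseTask(TaskSkeleton):
    # Simplified signature: the real class also takes gym, sim, cfg, etc.
    def __init__(self, num_envs, use_gpu_pipeline, sim_device="cuda:0"):
        device = sim_device if use_gpu_pipeline else "cpu"
        super().__init__(num_envs=num_envs, device=device)
```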
21 changes: 6 additions & 15 deletions gym/envs/base/fixed_robot.py
@@ -580,27 +580,18 @@ def _reward_dof_vel(self):
def _reward_action_rate(self):
"""Penalize changes in actions"""
nact = self.num_actuators
dt2 = (self.dt * self.cfg.control.decimation) ** 2
error = (
torch.square(
self.dof_pos_history[:, :nact]
- self.dof_pos_history[:, nact : 2 * nact]
)
/ dt2
error = torch.square(
self.dof_pos_history[:, :nact] - self.dof_pos_history[:, 2 * nact :]
)
return -torch.mean(error, dim=1)

def _reward_action_rate2(self):
"""Penalize changes in actions"""
nact = self.num_actuators
dt2 = (self.dt * self.cfg.control.decimation) ** 2
error = (
torch.square(
self.dof_pos_history[:, :nact]
- 2 * self.dof_pos_history[:, nact : 2 * nact]
+ self.dof_pos_history[:, 2 * nact :]
)
/ dt2
error = torch.square(
self.dof_pos_history[:, :nact]
- 2 * self.dof_pos_history[:, nact : 2 * nact]
+ self.dof_pos_history[:, 2 * nact :]
)
return -torch.mean(error, dim=1)

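With the `dt**2` normalization dropped, both penalties above reduce to squared differences over the three stored target frames, so their scale now depends only on the action units. A standalone sketch of the same arithmetic, assuming the history buffer is laid out as [newest | previous | oldest] along dim 1:

```python
import torch

num_envs, nact = 4, 12  # made-up sizes
dof_pos_history = torch.randn(num_envs, 3 * nact)

# First penalty: squared difference between the newest and oldest stored
# targets (mirrors the new _reward_action_rate).
rate = torch.square(dof_pos_history[:, :nact] - dof_pos_history[:, 2 * nact :])
reward_action_rate = -torch.mean(rate, dim=1)

# Second penalty: discrete second difference of the target trajectory
# (mirrors _reward_action_rate2).
rate2 = torch.square(
    dof_pos_history[:, :nact]
    - 2 * dof_pos_history[:, nact : 2 * nact]
    + dof_pos_history[:, 2 * nact :]
)
reward_action_rate2 = -torch.mean(rate2, dim=1)
```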
97 changes: 50 additions & 47 deletions gym/envs/base/legged_robot.py
@@ -110,12 +110,12 @@ def _post_decimation_step(self):

self.base_height = self.root_states[:, 2:3]

n = self.num_actuators
self.dof_pos_history[:, 2 * n :] = self.dof_pos_history[:, n : 2 * n]
self.dof_pos_history[:, n : 2 * n] = self.dof_pos_history[:, :n]
self.dof_pos_history[:, :n] = self.dof_pos_target
self.dof_pos_obs = self.dof_pos - self.default_dof_pos

self.dof_pos_history = self.dof_pos_history.roll(self.num_actuators)
# self.dof_pos_history[:, : self.num_actuators] = self.dof_pos_obs
self.dof_pos_history[:, : self.num_actuators] = self.dof_pos_target

env_ids = (
self.episode_length_buf % int(self.cfg.commands.resampling_time / self.dt)
== 0
@@ -133,7 +133,10 @@ def _reset_idx(self, env_ids):
self._reset_system(env_ids)
self._resample_commands(env_ids)
# * reset buffers
self.dof_pos_history[env_ids] = 0.0
self.dof_pos_obs[env_ids] = self.dof_pos[env_ids] - self.default_dof_pos
# self.dof_pos_history[env_ids] = self.dof_pos_obs[env_ids].tile(3)
self.dof_pos_target[env_ids] = self.default_dof_pos
self.dof_pos_history[env_ids] = self.dof_pos_target[env_ids].tile(3)
self.episode_length_buf[env_ids] = 0

def _initialize_sim(self):
@@ -481,41 +484,26 @@ def _init_buffers(self):
get_axis_params(-1.0, self.up_axis_idx), device=self.device
).repeat((self.num_envs, 1))
self.torques = torch.zeros(
self.num_envs,
self.num_actuators,
dtype=torch.float,
device=self.device,
self.num_envs, self.num_actuators, dtype=torch.float, device=self.device
)
self.p_gains = torch.zeros(
self.num_actuators, dtype=torch.float, device=self.device
self.num_envs, self.num_actuators, dtype=torch.float, device=self.device
)
self.d_gains = torch.zeros(
self.num_actuators, dtype=torch.float, device=self.device
self.num_envs, self.num_actuators, dtype=torch.float, device=self.device
)
self.dof_pos_target = torch.zeros(
self.num_envs,
self.num_actuators,
dtype=torch.float,
device=self.device,
self.num_envs, self.num_actuators, dtype=torch.float, device=self.device
)
self.dof_vel_target = torch.zeros(
self.num_envs,
self.num_actuators,
dtype=torch.float,
device=self.device,
self.num_envs, self.num_actuators, dtype=torch.float, device=self.device
)
self.tau_ff = torch.zeros(
self.num_envs,
self.num_actuators,
dtype=torch.float,
device=self.device,
self.num_envs, self.num_actuators, dtype=torch.float, device=self.device
)

self.dof_pos_history = torch.zeros(
self.num_envs,
self.num_actuators * 3,
dtype=torch.float,
device=self.device,
self.num_envs, self.num_actuators * 3, dtype=torch.float, device=self.device
)
self.commands = torch.zeros(
self.num_envs, 3, dtype=torch.float, device=self.device
@@ -532,6 +520,29 @@
self.num_envs, 1, dtype=torch.float, device=self.device
)

# # * get the body_name to body_index dict
# body_dict = self.gym.get_actor_rigid_body_dict(
# self.envs[0], self.actor_handles[0]
# )
# # * extract a list of body_names where the index is the id number
# body_names = [
# body_tuple[0]
# for body_tuple in sorted(
# body_dict.items(), key=lambda body_tuple: body_tuple[1]
# )
# ]
# # * construct a list of id numbers corresponding to end_effectors
# self.end_effector_ids = []
# for end_effector_name in self.cfg.asset.foot_collisionbox_names:
# self.end_effector_ids.extend(
# [
# body_names.index(body_name)
# for body_name in body_names
# if end_effector_name in body_name
# ]
# )
# # ----------------------------------------

if self.cfg.terrain.measure_heights:
self.height_points = self._init_height_points()
self.measured_heights = 0
@@ -559,8 +570,8 @@ def _init_buffers(self):
found = False
for dof_name in self.cfg.control.stiffness.keys():
if dof_name in name:
self.p_gains[i] = self.cfg.control.stiffness[dof_name]
self.d_gains[i] = self.cfg.control.damping[dof_name]
self.p_gains[:, i] = self.cfg.control.stiffness[dof_name]
self.d_gains[:, i] = self.cfg.control.damping[dof_name]
found = True
if not found:
self.p_gains[i] = 0.0
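The gain buffers are now allocated as (num_envs, num_actuators) instead of a single (num_actuators,) vector, which allows per-environment stiffness and damping values, in line with the "make pd gains specific for each robot" commit. The torque computation itself is not part of this diff; a generic PD sketch with 2-D gain tensors and made-up numbers:

```python
import torch

num_envs, n = 4, 12
p_gains = torch.full((num_envs, n), 30.0)  # can now differ per environment
d_gains = torch.full((num_envs, n), 1.5)
dof_pos_target = torch.zeros(num_envs, n)
dof_pos = torch.randn(num_envs, n)
dof_vel = torch.randn(num_envs, n)

# Standard PD law; elementwise multiplication behaves the same whether the
# gains are a shared 1-D vector or a per-env 2-D tensor.
torques = p_gains * (dof_pos_target - dof_pos) - d_gains * dof_vel
```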
@@ -995,11 +1006,11 @@ def _reward_lin_vel_z(self):

def _reward_ang_vel_xy(self):
"""Penalize xy axes base angular velocity"""
return -torch.sum(torch.square(self.base_ang_vel[:, :2]), dim=1)
return -torch.mean(torch.square(self.base_ang_vel[:, :2]), dim=1)

def _reward_orientation(self):
"""Penalize non flat base orientation"""
return -torch.sum(torch.square(self.projected_gravity[:, :2]), dim=1)
return -torch.mean(torch.square(self.projected_gravity[:, :2]), dim=1)

def _reward_base_height(self):
"""Penalize base height away from target"""
@@ -1019,26 +1030,18 @@ def _reward_dof_vel(self):
def _reward_action_rate(self):
"""Penalize changes in actions"""
n = self.num_actuators
dt2 = (self.dt * self.cfg.control.decimation) ** 2
error = (
torch.square(
self.dof_pos_history[:, :n] - self.dof_pos_history[:, n : 2 * n]
)
/ dt2
error = torch.square(
self.dof_pos_history[:, :n] - self.dof_pos_history[:, 2 * n :]
)
return -torch.mean(error, dim=1)

def _reward_action_rate2(self):
"""Penalize changes in actions"""
n = self.num_actuators
dt2 = (self.dt * self.cfg.control.decimation) ** 2
error = (
torch.square(
self.dof_pos_history[:, :n]
- 2 * self.dof_pos_history[:, n : 2 * n]
+ self.dof_pos_history[:, 2 * n :]
)
/ dt2
error = torch.square(
self.dof_pos_history[:, :n]
- 2 * self.dof_pos_history[:, n : 2 * n]
+ self.dof_pos_history[:, 2 * n :]
)
return -torch.mean(error, dim=1)

@@ -1084,7 +1087,7 @@ def _reward_tracking_lin_vel(self):
"""Tracking of linear velocity commands (xy axes)"""
error = torch.square(self.commands[:, :2] - self.base_lin_vel[:, :2])
error = torch.exp(-error / self.cfg.reward_settings.tracking_sigma)
return torch.sum(error, dim=1)
return torch.mean(error, dim=1)

def _reward_tracking_ang_vel(self):
"""Tracking of angular velocity commands (yaw)"""
Expand All @@ -1093,7 +1096,7 @@ def _reward_tracking_ang_vel(self):

def _reward_feet_contact_forces(self):
"""penalize high contact forces"""
return -torch.sum(
return -torch.mean(
(
torch.norm(self.contact_forces[:, self.feet_indices, :], dim=-1)
- self.cfg.reward_settings.max_contact_force
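For reference, the roll-and-overwrite history update from `_post_decimation_step` in standalone form. The sketch passes an explicit `dims=1`; the PR calls `.roll()` without `dims`, which rolls the flattened tensor, but because the newest block is overwritten immediately afterwards the stored frames end up the same:

```python
import torch

num_envs, n = 4, 18  # sizes are assumptions of this sketch
dof_pos_history = torch.zeros(num_envs, 3 * n)
dof_pos_target = torch.randn(num_envs, n)

# Shift every stored frame back by one slot along the actuator dimension,
# then overwrite the newest slot with the latest commanded target.
dof_pos_history = dof_pos_history.roll(n, dims=1)
dof_pos_history[:, :n] = dof_pos_target
```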
5 changes: 1 addition & 4 deletions gym/envs/base/legged_robot_config.py
@@ -163,7 +163,7 @@ class asset:
file = ""
# * name of the feet bodies,
# * used to index body state and contact force tensors
foot_name = "None"
foot_name = "foot"
penalize_contacts_on = []
terminate_after_contacts_on = []
end_effector_names = []
@@ -301,9 +301,6 @@ class algorithm:
batch_size = 2**15
max_gradient_steps = 24
# new
storage_size = 2**17 # new
batch_size = 2**15 # new

clip_param = 0.2
learning_rate = 1.0e-3
max_grad_norm = 1.0
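The removed `storage_size` and `batch_size` lines duplicated attributes defined just above in the same class body; in Python a repeated class-level assignment silently replaces the earlier one, so only one copy can take effect anyway. A two-line illustration with made-up values:

```python
class Algo:
    batch_size = 2**15
    batch_size = 2**12  # silently replaces the earlier value; no error or warning


print(Algo.batch_size)  # 4096
```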
24 changes: 8 additions & 16 deletions gym/envs/base/task_skeleton.py
@@ -5,10 +5,15 @@


class TaskSkeleton:
def __init__(self, num_envs=1, max_episode_length=1.0, device="cpu"):
def __init__(self, num_envs=1, device="cpu"):
self.num_envs = num_envs
self.max_episode_length = max_episode_length
self.device = device

self.to_be_reset = torch.ones(num_envs, device=device, dtype=torch.bool)
self.terminated = torch.ones(num_envs, device=device, dtype=torch.bool)
self.episode_length_buf = torch.zeros(num_envs, device=device, dtype=torch.long)
self.timed_out = torch.zeros(num_envs, device=device, dtype=torch.bool)

return None

def get_states(self, obs_list):
@@ -52,25 +57,12 @@ def _reset_buffers(self):
self.terminated[:] = False
self.timed_out[:] = False

def compute_reward(self, reward_weights):
"""Compute and return a torch tensor of rewards
reward_weights: dict with keys matching reward names, and values
matching weights
"""
reward = torch.zeros(self.num_envs, device=self.device, dtype=torch.float)
for name, weight in reward_weights.items():
reward += weight * self._eval_reward(name)
return reward

def _eval_reward(self, name):
return eval("self._reward_" + name + "()")

def _check_terminations_and_timeouts(self):
"""Check if environments need to be reset"""
contact_forces = self.contact_forces[:, self.termination_contact_indices, :]
self.terminated |= torch.any(torch.norm(contact_forces, dim=-1) > 1.0, dim=1)
self.timed_out = self.episode_length_buf >= self.max_episode_length
self.to_be_reset = self.timed_out | self.terminated
# self.to_be_reset = self.timed_out | self.terminated

def step(self, actions):
raise NotImplementedError
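The `compute_reward` removed here resolved each term with `eval` on a constructed method name. The commit messages describe moving to a dict of reward functions handled elsewhere; the replacement is not shown in this diff, but the usual `getattr`-based pattern looks like this sketch:

```python
import torch


class RewardMixin:
    """Sketch of a dict-of-reward-functions setup replacing the eval call."""

    def _prepare_reward_functions(self, reward_weights):
        # Resolve each nonzero-weighted term to its bound method once,
        # instead of eval-ing "self._reward_<name>()" every step.
        self.reward_weights = {k: v for k, v in reward_weights.items() if v != 0.0}
        self.reward_functions = {
            name: getattr(self, "_reward_" + name) for name in self.reward_weights
        }

    def compute_reward(self):
        total = torch.zeros(self.num_envs, device=self.device, dtype=torch.float)
        for name, fn in self.reward_functions.items():
            total += self.reward_weights[name] * fn()
        return total
```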