diff --git a/src/mjlab/envs/manager_based_rl_env.py b/src/mjlab/envs/manager_based_rl_env.py
index 11563c97c..7f7eea09a 100644
--- a/src/mjlab/envs/manager_based_rl_env.py
+++ b/src/mjlab/envs/manager_based_rl_env.py
@@ -85,6 +85,13 @@ class ManagerBasedRlEnvCfg:
   receives a truncated done signal to bootstrap the value of continuing beyond
   the limit.
   """
+  scale_rewards_by_dt: bool = True
+  """Whether to multiply rewards by the environment step duration (dt).
+
+  When True (default), reward values are scaled by step_dt to normalize cumulative
+  episodic rewards across different simulation frequencies. Set to False for
+  algorithms that expect unscaled reward signals (e.g., HER, static reward scaling).
+  """
 
 
 class ManagerBasedRlEnv:
@@ -236,7 +243,9 @@ def load_managers(self) -> None:
     self.termination_manager = TerminationManager(self.cfg.terminations, self)
     print_info(f"[INFO] {self.termination_manager}")
 
-    self.reward_manager = RewardManager(self.cfg.rewards, self)
+    self.reward_manager = RewardManager(
+      self.cfg.rewards, self, scale_by_dt=self.cfg.scale_rewards_by_dt
+    )
     print_info(f"[INFO] {self.reward_manager}")
     if self.cfg.curriculum is not None:
       self.curriculum_manager = CurriculumManager(self.cfg.curriculum, self)
diff --git a/src/mjlab/managers/reward_manager.py b/src/mjlab/managers/reward_manager.py
index 10adfcd97..51cf8dfd0 100644
--- a/src/mjlab/managers/reward_manager.py
+++ b/src/mjlab/managers/reward_manager.py
@@ -16,12 +16,39 @@
 
 
 class RewardManager(ManagerBase):
+  """Manages reward computation by aggregating weighted reward terms.
+
+  Reward Scaling Behavior:
+    By default, rewards are scaled by the environment step duration (dt). This
+    normalizes cumulative episodic rewards across different simulation frequencies.
+    The scaling can be disabled via the ``scale_by_dt`` parameter.
+
+    When ``scale_by_dt=True`` (default):
+      - ``reward_buf`` (returned by ``compute()``) = raw_value * weight * dt
+      - ``_episode_sums`` (cumulative rewards) are scaled by dt
+      - ``Episode_Reward/*`` logged metrics are scaled by dt
+
+    When ``scale_by_dt=False``:
+      - ``reward_buf`` = raw_value * weight (no dt scaling)
+
+    Regardless of the scaling setting:
+      - ``_step_reward`` (via ``get_active_iterable_terms()``) always contains
+        the unscaled reward rate (raw_value * weight)
+  """
+
   _env: ManagerBasedRlEnv
 
-  def __init__(self, cfg: dict[str, RewardTermCfg], env: ManagerBasedRlEnv):
+  def __init__(
+    self,
+    cfg: dict[str, RewardTermCfg],
+    env: ManagerBasedRlEnv,
+    *,
+    scale_by_dt: bool = True,
+  ):
     self._term_names: list[str] = list()
     self._term_cfgs: list[RewardTermCfg] = list()
     self._class_term_cfgs: list[RewardTermCfg] = list()
+    self._scale_by_dt = scale_by_dt
     self.cfg = deepcopy(cfg)
     super().__init__(env=env)
 
@@ -76,18 +103,19 @@ def reset(
 
   def compute(self, dt: float) -> torch.Tensor:
     self._reward_buf[:] = 0.0
+    scale = dt if self._scale_by_dt else 1.0
     for term_idx, (name, term_cfg) in enumerate(
       zip(self._term_names, self._term_cfgs, strict=False)
     ):
       if term_cfg.weight == 0.0:
         self._step_reward[:, term_idx] = 0.0
         continue
-      value = term_cfg.func(self._env, **term_cfg.params) * term_cfg.weight * dt
+      value = term_cfg.func(self._env, **term_cfg.params) * term_cfg.weight * scale
       # NaN/Inf can occur from corrupted physics state; zero them to avoid policy crash.
       value = torch.nan_to_num(value, nan=0.0, posinf=0.0, neginf=0.0)
       self._reward_buf += value
       self._episode_sums[name] += value
-      self._step_reward[:, term_idx] = value / dt
+      self._step_reward[:, term_idx] = value / scale
     return self._reward_buf
 
   def get_active_iterable_terms(self, env_idx):
diff --git a/tests/test_rewards.py b/tests/test_rewards.py
index e6394373f..b98ca5c02 100644
--- a/tests/test_rewards.py
+++ b/tests/test_rewards.py
@@ -271,3 +271,63 @@ def neginf_reward(env):
   assert not torch.isinf(rewards).any()
   assert rewards[2] == 0.0
+
+
+def test_reward_scaling_enabled(mock_env):
+  """Test that rewards are scaled by dt when scale_by_dt=True (default)."""
+
+  def constant_reward(env):
+    return torch.ones(env.num_envs, device=env.device)
+
+  cfg = {"term": RewardTermCfg(func=constant_reward, weight=2.0, params={})}
+  manager = RewardManager(cfg, mock_env, scale_by_dt=True)
+
+  dt = 0.02
+  rewards = manager.compute(dt=dt)
+
+  # With scaling: reward = raw_value (1.0) * weight (2.0) * dt (0.02) = 0.04
+  expected = 1.0 * 2.0 * dt
+  assert torch.allclose(rewards, torch.full((4,), expected))
+
+  # _step_reward should be unscaled (raw_value * weight)
+  step_reward = manager._step_reward[:, 0]
+  expected_step = 1.0 * 2.0
+  assert torch.allclose(step_reward, torch.full((4,), expected_step))
+
+
+def test_reward_scaling_disabled(mock_env):
+  """Test that rewards are not scaled by dt when scale_by_dt=False."""
+
+  def constant_reward(env):
+    return torch.ones(env.num_envs, device=env.device)
+
+  cfg = {"term": RewardTermCfg(func=constant_reward, weight=2.0, params={})}
+  manager = RewardManager(cfg, mock_env, scale_by_dt=False)
+
+  dt = 0.02
+  rewards = manager.compute(dt=dt)
+
+  # Without scaling: reward = raw_value (1.0) * weight (2.0) = 2.0
+  expected = 1.0 * 2.0
+  assert torch.allclose(rewards, torch.full((4,), expected))
+
+  # _step_reward should still be unscaled (same as reward when not scaling)
+  step_reward = manager._step_reward[:, 0]
+  assert torch.allclose(step_reward, torch.full((4,), expected))
+
+
+def test_reward_scaling_default_is_enabled(mock_env):
+  """Test that scale_by_dt defaults to True for backward compatibility."""
+
+  def constant_reward(env):
+    return torch.ones(env.num_envs, device=env.device)
+
+  cfg = {"term": RewardTermCfg(func=constant_reward, weight=1.0, params={})}
+  # Don't pass scale_by_dt - should default to True
+  manager = RewardManager(cfg, mock_env)
+
+  dt = 0.01
+  rewards = manager.compute(dt=dt)
+
+  # Default (scaling enabled): reward = 1.0 * 1.0 * 0.01 = 0.01
+  assert torch.allclose(rewards, torch.full((4,), 0.01))
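Note (outside the patch): a minimal standalone sketch of the scaling semantics introduced above, written in plain PyTorch rather than against mjlab so it runs on its own. The variable names are illustrative; only the raw_value * weight * dt relationship and the "_step_reward stores the unscaled rate" behavior come from the RewardManager.compute() changes in this diff.

  import torch

  num_envs, weight, dt = 4, 2.0, 0.02
  raw = torch.ones(num_envs)    # raw term value, one entry per env

  with_dt = raw * weight * dt   # scale_by_dt=True: contribution to reward_buf (0.04 per env)
  without_dt = raw * weight     # scale_by_dt=False: contribution to reward_buf (2.0 per env)
  step_rate = with_dt / dt      # _step_reward always stores the unscaled rate raw * weight

  assert torch.allclose(with_dt, torch.full((num_envs,), 0.04))
  assert torch.allclose(step_rate, without_dt)

Keeping _step_reward as the unscaled rate in both modes means the per-term values surfaced by get_active_iterable_terms() stay directly comparable whether or not dt scaling is enabled.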