Add reward scaling option to PPO
rystrauss committed Dec 30, 2023
1 parent 7b3517b commit da17cf6
Showing 1 changed file with 4 additions and 2 deletions.
6 changes: 4 additions & 2 deletions dopamax/agents/anakin/ppo.py
@@ -47,6 +47,8 @@
     "entropy_coef": 0.01,
     # The coefficient for the value loss.
     "value_coef": 0.5,
+    # Multiplier applied to rewards when computing losses.
+    "reward_scaling": 1.0,
     # The type of network to use.
     "network": "mlp",
     # The configuration dictionary for the network.
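
The default of 1.0 leaves rewards untouched, so existing configurations behave exactly as before. The snippet below is an illustrative sketch (not code from the repository) of why the multiplier is a clean knob: rlax's discounted returns are linear in the rewards, so scaling rewards by a constant scales the return targets by the same constant. The example arrays and gamma value are assumptions chosen only for the demonstration.

    import jax.numpy as jnp
    import rlax

    # Illustrative rollout data; values chosen for the example only.
    rewards = jnp.array([1.0, 0.0, 2.0])
    discounts = jnp.array([1.0, 1.0, 0.0])  # 0.0 marks episode termination
    gamma = 0.99                            # assumed discount factor

    base = rlax.discounted_returns(rewards, discounts * gamma, v_t=0.0)
    scaled = rlax.discounted_returns(rewards * 0.1, discounts * gamma, v_t=0.0)

    # Returns are linear in rewards, so a reward multiplier of 0.1
    # multiplies every return target by 0.1 as well.
    assert jnp.allclose(scaled, 0.1 * base)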
@@ -194,13 +196,13 @@ def train_step(self, train_state: AnakinTrainState) -> Tuple[AnakinTrainState, M
         values = jnp.concatenate([rollout_data[SampleBatch.VALUE], jnp.expand_dims(final_value, axis=0)], axis=0)

         rollout_data[SampleBatch.RETURN] = rlax.discounted_returns(
-            rollout_data[SampleBatch.REWARD],
+            rollout_data[SampleBatch.REWARD] * self.config.reward_scaling,
             rollout_data[SampleBatch.DISCOUNT] * self.config.gamma,
             v_t=0.0,
         )

         rollout_data[SampleBatch.ADVANTAGE] = rlax.truncated_generalized_advantage_estimation(
-            rollout_data[SampleBatch.REWARD],
+            rollout_data[SampleBatch.REWARD] * self.config.reward_scaling,
             rollout_data[SampleBatch.DISCOUNT] * self.config.gamma,
             self.config.lambda_,
             values,
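
Because the same scaled rewards feed both the return computation and the GAE advantages, the value-loss targets and the policy-gradient advantages shrink or grow together when rewards are rescaled. Below is a minimal, self-contained sketch of the two calls as they appear after this change; the array contents, gamma, lambda_, and the reward_scaling value are assumptions for illustration, not values from the repository.

    import jax.numpy as jnp
    import rlax

    reward_scaling = 0.1         # hypothetical config value; the new default is 1.0
    gamma, lambda_ = 0.99, 0.95  # assumed PPO hyperparameters

    rewards = jnp.array([1.0, 0.0, 2.0, 1.0])      # per-step rewards from the rollout
    discounts = jnp.array([1.0, 1.0, 1.0, 0.0])    # 0.0 marks episode termination
    values = jnp.array([0.5, 0.4, 0.6, 0.3, 0.0])  # T + 1 value estimates, incl. bootstrap

    returns = rlax.discounted_returns(
        rewards * reward_scaling,
        discounts * gamma,
        v_t=0.0,
    )
    advantages = rlax.truncated_generalized_advantage_estimation(
        rewards * reward_scaling,
        discounts * gamma,
        lambda_,
        values,
    )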
