diff --git a/bsuite/environments/deep_sea.py b/bsuite/environments/deep_sea.py index 5d7ff7d7..634f787d 100644 --- a/bsuite/environments/deep_sea.py +++ b/bsuite/environments/deep_sea.py @@ -118,11 +118,12 @@ def _step(self, action: int) -> dm_env.TimeStep: action_right = action == self._action_mapping[self._row, self._column] # Reward calculation - if self._column == self._size - 1 and action_right: + if self._column == self._size - 1 and action_right \ + and (self._rng.rand() > 1 / self._size or self._deterministic): reward += 1. self._denoised_return += 1. if not self._deterministic: # Noisy rewards on the 'end' of chain. - if self._row == self._size - 1 and self._column in [0, self._size - 1]: + if self._row == self._size - 1 and 0 <= self._column <= self._size - 1: reward += self._rng.randn() # Transition dynamics