google-deepmind · cgao3 · Oct 22, 2020
diff --git a/bsuite/environments/deep_sea.py b/bsuite/environments/deep_sea.py
@@ -118,11 +118,12 @@ def _step(self, action: int) -> dm_env.TimeStep:
     action_right = action == self._action_mapping[self._row, self._column]
 
     # Reward calculation
-    if self._column == self._size - 1 and action_right:
+    if self._column == self._size - 1 and action_right \
+            and (self._rng.rand() > 1 / self._size or self._deterministic):
       reward += 1.
       self._denoised_return += 1.
     if not self._deterministic:  # Noisy rewards on the 'end' of chain.
-      if self._row == self._size - 1 and self._column in [0, self._size - 1]:
+      if self._row == self._size - 1 and 0 <= self._column <= self._size - 1:
         reward += self._rng.randn()
 
     # Transition dynamics