Merged
8 changes: 5 additions & 3 deletions assume/common/base.py
@@ -893,7 +893,7 @@ class LearningStrategy(BaseStrategy):
convention when designing your create_observation method and the observation space.

Attributes:
obs_dim (int): The observation dimension.
foresight (int): Number of forward- and backward-looking steps in the observations.
act_dim (int): The action dimension.
unique_obs_dim (int): The unique observation dimension.
num_timeseries_obs_dim (int): The number of observation timeseries dimension.
@@ -907,7 +907,7 @@ class LearningStrategy(BaseStrategy):
def __init__(
self,
learning_role,
obs_dim: int,
foresight: int,
act_dim: int,
unique_obs_dim: int,
num_timeseries_obs_dim: int = 3,
@@ -923,7 +923,7 @@ def __init__(
self.learning_role = learning_role
self.learning_config = learning_role.learning_config

self.obs_dim = obs_dim
self.foresight = foresight
self.act_dim = act_dim

# this defines the number of unique observations, which are not the same for all units
@@ -934,6 +934,8 @@ def __init__(
# them into suitable format for recurrent neural networks
self.num_timeseries_obs_dim = num_timeseries_obs_dim

self.obs_dim = num_timeseries_obs_dim * foresight + unique_obs_dim


class MinMaxStrategy(BaseStrategy):
pass
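As a quick sanity check on the new derivation, the observation dimension now follows directly from obs_dim = num_timeseries_obs_dim * foresight + unique_obs_dim. A minimal sketch using the defaults of the strategies touched in this PR (num_timeseries_obs_dim defaults to 3):

def derived_obs_dim(num_timeseries_obs_dim: int, foresight: int, unique_obs_dim: int) -> int:
    # mirrors the computation added to LearningStrategy.__init__
    return num_timeseries_obs_dim * foresight + unique_obs_dim

assert derived_obs_dim(3, 12, 2) == 38  # EnergyLearningStrategy
assert derived_obs_dim(3, 24, 2) == 74  # EnergyLearningSingleBidStrategy / StorageEnergyLearningStrategy
assert derived_obs_dim(3, 24, 3) == 75  # RenewableEnergyLearningSingleBidStrategy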
16 changes: 13 additions & 3 deletions assume/reinforcement_learning/algorithms/matd3.py
@@ -270,23 +270,25 @@ def check_strategy_dimensions(self) -> None:
Also check if the unique observation dimensions are the same. If not, raise a ValueError.
This is important for the TD3 algorithm, as it uses a centralized critic that requires consistent dimensions across all agents.
"""
foresight_list = []
obs_dim_list = []
act_dim_list = []
unique_obs_dim_list = []
num_timeseries_obs_dim_list = []

for strategy in self.learning_role.rl_strats.values():
foresight_list.append(strategy.foresight)
obs_dim_list.append(strategy.obs_dim)
act_dim_list.append(strategy.act_dim)
unique_obs_dim_list.append(strategy.unique_obs_dim)
num_timeseries_obs_dim_list.append(strategy.num_timeseries_obs_dim)

if len(set(obs_dim_list)) > 1:
if len(set(foresight_list)) > 1:
raise ValueError(
f"All observation dimensions must be the same for all RL agents. The defined learning strategies have the following observation dimensions: {obs_dim_list}"
f"All foresight values must be the same for all RL agents. The defined learning strategies have the following foresight values: {foresight_list}"
)
else:
self.obs_dim = obs_dim_list[0]
self.foresight = foresight_list[0]

if len(set(act_dim_list)) > 1:
raise ValueError(
@@ -309,6 +311,14 @@ def check_strategy_dimensions(self) -> None:
else:
self.num_timeseries_obs_dim = num_timeseries_obs_dim_list[0]

# Check obs_dim last, as the other dimension checks should fail first!
if len(set(obs_dim_list)) > 1:
raise ValueError(
f"All observation dimensions must be the same for all RL agents. The defined learning strategies have the following observation dimensions: {obs_dim_list}"
)
else:
self.obs_dim = obs_dim_list[0]

def create_actors(self) -> None:
"""
Create actor networks for reinforcement learning for each unit strategy.
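The extended check follows the same pattern for every dimension: collect the attribute from all strategies and require a single unique value. A condensed, hypothetical sketch of that pattern (not the actual method):

def require_consistent(values: list, name: str):
    # the centralized critic needs identical dimensions across all agents
    if len(set(values)) > 1:
        raise ValueError(f"All {name} values must be the same for all RL agents: {values}")
    return values[0]

foresight = require_consistent([12, 12, 12], "foresight")   # -> 12
# require_consistent([12, 24, 12], "foresight") would raise a ValueError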
69 changes: 30 additions & 39 deletions assume/strategies/learning_strategies.py
@@ -117,6 +117,8 @@ def load_actor_params(self, load_path):

def prepare_observations(self, unit, market_id):
# scaling factors for the observations
# Note: These scaling factors could be interpreted as information leakage. However, as we are in a simulation environment and not a pure forecasting setting,
# we assume that the agent already has access to this information.
upper_scaling_factor_price = max(unit.forecaster.price[market_id])
lower_scaling_factor_price = min(unit.forecaster.price[market_id])
residual_load = unit.forecaster.residual_load.get(
@@ -185,6 +187,8 @@ def create_observation(
)

# --- 2. Historical actual prices (backward-looking) ---
# Note: We scale with the max_bid_price here, in contrast to the scaling of the forecast, where we use the max price of the forecast period.
# This is not consistent, but it has worked well so far. Future work could look into this in more detail.
scaled_price_history = (
unit.outputs["energy_accepted_price"].window(
start, self.foresight, direction="backward"
@@ -308,11 +312,11 @@ class EnergyLearningStrategy(TorchLearningStrategy, MinMaxStrategy):
on an Energy-Only Market.

The agent submits two price bids: one for the inflexible component (P_min) and another for
the flexible component (P_max - P_min) of its capacity. This strategy utilizes a set of 50
the flexible component (P_max - P_min) of its capacity. This strategy utilizes a set of 38
observations to generate actions, which are then transformed into market bids. The observation
space comprises two unique values: the marginal cost and the current capacity of the unit.

The observation space for this strategy consists of 50 elements, drawn from both the forecaster
The observation space for this strategy consists of 38 elements, drawn from both the forecaster
and the unit's internal state. Observations include the following components:

- **Forecasted Residual Load**: Forecasted load over the foresight period, scaled by the maximum
@@ -344,7 +348,7 @@ class EnergyLearningStrategy(TorchLearningStrategy, MinMaxStrategy):
Attributes
----------
foresight : int
Number of time steps for which the agent forecasts market conditions. Defaults to 24.
Number of time steps for which the agent forecasts market conditions. Defaults to 12.
max_bid_price : float
Maximum allowable bid price. Defaults to 100.
max_demand : float
@@ -375,24 +379,19 @@ class EnergyLearningStrategy(TorchLearningStrategy, MinMaxStrategy):
"""

def __init__(self, *args, **kwargs):
obs_dim = kwargs.pop("obs_dim", 38)
# 'foresight' represents the number of forward- and backward-looking time steps
# we consider when constructing the observations.
foresight = kwargs.pop("foresight", 12)
act_dim = kwargs.pop("act_dim", 2)
unique_obs_dim = kwargs.pop("unique_obs_dim", 2)
super().__init__(
obs_dim=obs_dim,
foresight=foresight,
act_dim=act_dim,
unique_obs_dim=unique_obs_dim,
*args,
**kwargs,
)

# 'foresight' represents the number of time steps into the future that we will consider
# when constructing the observations. This value is fixed for each strategy, as the
# neural network architecture is predefined, and the size of the observations must remain consistent.
# If you wish to modify the foresight length, remember to also update the 'obs_dim' parameter above,
# as the observation dimension depends on the foresight value.
self.foresight = 12

# define allowed order types
self.order_types = kwargs.get("order_types", ["SB"])

@@ -682,8 +681,8 @@ def calculate_reward(

# scaling factor to normalize the reward to the range [-1,1]
scaling = 1 / (self.max_bid_price * unit.max_power)
reward = scaling * (profit - regret_scale * opportunity_cost)
regret = regret_scale * opportunity_cost
reward = scaling * (profit - regret)

# Store results in unit outputs
# Note: these are not learning-specific results but stored for all units for analysis
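For intuition, the scaling keeps the reward roughly within [-1, 1]; a small illustration with mostly hypothetical numbers (only max_bid_price uses the documented default of 100):

max_bid_price = 100.0       # documented strategy default
max_power = 1000.0          # hypothetical unit size in MW
scaling = 1 / (max_bid_price * max_power)   # 1e-5

profit = 80_000.0           # hypothetical profit for the dispatched period
opportunity_cost = 20_000.0 # hypothetical foregone profit
regret_scale = 0.2          # hypothetical tuning factor

regret = regret_scale * opportunity_cost    # 4_000.0
reward = scaling * (profit - regret)        # 0.76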
@@ -722,20 +721,18 @@ class EnergyLearningSingleBidStrategy(EnergyLearningStrategy, MinMaxStrategy):
"""

def __init__(self, *args, **kwargs):
obs_dim = kwargs.pop("obs_dim", 74)
# we select 24 to be in line with the storage strategies
foresight = kwargs.pop("foresight", 24)
act_dim = kwargs.pop("act_dim", 1)
unique_obs_dim = kwargs.pop("unique_obs_dim", 2)
super().__init__(
obs_dim=obs_dim,
foresight=foresight,
act_dim=act_dim,
unique_obs_dim=unique_obs_dim,
*args,
**kwargs,
)

# we select 24 to be in line with the storage strategies
self.foresight = 24

def calculate_bids(
self,
unit: SupportsMinMax,
@@ -807,7 +804,7 @@ class StorageEnergyLearningStrategy(TorchLearningStrategy, MinMaxChargeStrategy)
Reinforcement Learning Strategy for a storage unit that enables the agent to learn
optimal bidding strategies on an Energy-Only Market.

The observation space for this strategy consists of 50 elements. Key components include:
The observation space for this strategy consists of 74 elements. Key components include:

- **State of Charge**: Represents the current level of energy in the storage unit,
influencing the bid direction and capacity.
@@ -868,24 +865,19 @@ class StorageEnergyLearningStrategy(TorchLearningStrategy, MinMaxChargeStrategy)
"""

def __init__(self, *args, **kwargs):
obs_dim = kwargs.pop("obs_dim", 74)
# 'foresight' represents the number of forward- and backward-looking time steps
# we consider when constructing the observations.
foresight = kwargs.pop("foresight", 24)
act_dim = kwargs.pop("act_dim", 1)
unique_obs_dim = kwargs.pop("unique_obs_dim", 2)
super().__init__(
obs_dim=obs_dim,
foresight=foresight,
act_dim=act_dim,
unique_obs_dim=unique_obs_dim,
*args,
**kwargs,
)

# 'foresight' represents the number of time steps into the future that we will consider
# when constructing the observations. This value is fixed for each strategy, as the
# neural network architecture is predefined, and the size of the observations must remain consistent.
# If you wish to modify the foresight length, remember to also update the 'obs_dim' parameter above,
# as the observation dimension depends on the foresight value.
self.foresight = 24

# define allowed order types
self.order_types = kwargs.get("order_types", ["SB"])

@@ -1168,24 +1160,19 @@ class RenewableEnergyLearningSingleBidStrategy(EnergyLearningSingleBidStrategy):
"""

def __init__(self, *args, **kwargs):
obs_dim = kwargs.pop("obs_dim", 75)
# 'foresight' represents the number of forward- and backward-looking time steps
# we consider when constructing the observations.
foresight = kwargs.pop("foresight", 24)
act_dim = kwargs.pop("act_dim", 1)
unique_obs_dim = kwargs.pop("unique_obs_dim", 3)
super().__init__(
obs_dim=obs_dim,
foresight=foresight,
act_dim=act_dim,
unique_obs_dim=unique_obs_dim,
*args,
**kwargs,
)

# 'foresight' represents the number of time steps into the future that we will consider
# when constructing the observations. This value is fixed for each strategy, as the
# neural network architecture is predefined, and the size of the observations must remain consistent.
# If you wish to modify the foresight length, remember to also update the 'obs_dim' parameter above,
# as the observation dimension depends on the foresight value.
self.foresight = 24

# define allowed order types
self.order_types = kwargs.get("order_types", ["SB"])

@@ -1308,12 +1295,16 @@ def calculate_reward(

profit = income - operational_cost

# Stabilizing learning: Limit positive profit to 10% of its absolute value.
# Stabilizing learning: Optionally limit positive profit to a fraction (e.g. 50%) of its absolute value.
# This reduces variance in rewards and prevents overfitting to extreme profit-seeking behavior.
# However, this does NOT prevent the agent from exploiting market inefficiencies if they exist.
# RL by nature identifies and exploits system weaknesses if they lead to higher profit.
# This is not a price cap but rather a stabilizing factor to avoid reward spikes affecting learning stability.
profit = min(profit, 0.5 * abs(profit))
# IMPORTANT: This is a clear case of reward tuning to stabilize learning - use with caution!
# profit_scale = 0.5

profit_scale = 1
profit = min(profit, profit_scale * abs(profit))

# get potential maximum infeed according to availability from order volume
# Note: this will only work as the correct reference point when the volume is not defined by an action
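To make the effect of the optional clipping explicit, a minimal sketch of the new profit_scale logic (standalone, with made-up profit values):

def clip_profit(profit: float, profit_scale: float) -> float:
    # with profit_scale = 1 this is a no-op; with 0.5 only positive profits are reduced
    return min(profit, profit_scale * abs(profit))

assert clip_profit(1000.0, 1.0) == 1000.0    # default in this PR: no clipping
assert clip_profit(1000.0, 0.5) == 500.0     # optional reward tuning
assert clip_profit(-1000.0, 0.5) == -1000.0  # losses are never reduced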
4 changes: 2 additions & 2 deletions docs/source/learning.rst
@@ -140,8 +140,8 @@ The Actor
We will explain the way learning works in ASSUME starting from the interface to the simulation, namely the bidding strategy of the power plants.
The bidding strategy, per definition in ASSUME, defines the way we formulate bids based on the technical restrictions of the unit.
In a learning setting, this is done by the actor network, which maps the observation to an action. The observation is thereby managed and collected by the unit's operator, as
summarized in the following picture. As you can see in the current working version, the observation space contains a residual load forecast for the next 24 hours and a price
forecast for 24 hours, as well as the current capacity of the power plant and its marginal costs.
summarized in the following picture. As you can see in the current working version, the observation space contains a residual load forecast and a price
forecast (for example, for the next 24 hours), as well as the current capacity of the power plant and its marginal costs.
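
As an illustration, such an observation can be thought of as a concatenation of the scaled time series windows and the unit-specific values. A simplified sketch, not the actual implementation (the default strategies also include a backward-looking price history as a third time series):

import numpy as np

foresight = 24  # forecast / history horizon in time steps

residual_load_forecast = np.zeros(foresight)  # scaled residual load forecast
price_forecast = np.zeros(foresight)          # scaled price forecast
price_history = np.zeros(foresight)           # scaled past accepted prices

observation = np.concatenate(
    [residual_load_forecast, price_forecast, price_history, [0.8, 0.45]]  # capacity, marginal cost (illustrative)
)
assert observation.shape == (3 * foresight + 2,)  # 74, matching the storage strategy default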

.. image:: img/ActorTask.jpg
:align: center