Merged
8 changes: 5 additions & 3 deletions assume/common/base.py
@@ -893,7 +893,7 @@ class LearningStrategy(BaseStrategy):
convention when designing your create_observation method and the observation space.

Attributes:
obs_dim (int): The observation dimension.
foresight (int): Number of forward- and backward-looking steps in the observations.
act_dim (int): The action dimension.
unique_obs_dim (int): The unique observation dimension.
num_timeseries_obs_dim (int): The number of observation timeseries dimension.
@@ -907,7 +907,7 @@ class LearningStrategy(BaseStrategy):
def __init__(
self,
learning_role,
obs_dim: int,
foresight: int,
act_dim: int,
unique_obs_dim: int,
num_timeseries_obs_dim: int = 3,
@@ -923,7 +923,7 @@ def __init__(
self.learning_role = learning_role
self.learning_config = learning_role.learning_config

self.obs_dim = obs_dim
self.foresight = foresight
self.act_dim = act_dim

# this defines the number of unique observations, which are not the same for all units
@@ -934,6 +934,8 @@ def __init__(
# them into suitable format for recurrent neural networks
self.num_timeseries_obs_dim = num_timeseries_obs_dim

self.obs_dim = num_timeseries_obs_dim * foresight + unique_obs_dim


class MinMaxStrategy(BaseStrategy):
pass
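As a quick sanity check on the new derivation, the observation dimension now follows directly from obs_dim = num_timeseries_obs_dim * foresight + unique_obs_dim. A minimal sketch using the defaults of the strategies touched in this PR (num_timeseries_obs_dim defaults to 3):

def derived_obs_dim(num_timeseries_obs_dim: int, foresight: int, unique_obs_dim: int) -> int:
    # mirrors the computation added to LearningStrategy.__init__
    return num_timeseries_obs_dim * foresight + unique_obs_dim

assert derived_obs_dim(3, 12, 2) == 38  # EnergyLearningStrategy
assert derived_obs_dim(3, 24, 2) == 74  # EnergyLearningSingleBidStrategy / StorageEnergyLearningStrategy
assert derived_obs_dim(3, 24, 3) == 75  # RenewableEnergyLearningSingleBidStrategy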
16 changes: 13 additions & 3 deletions assume/reinforcement_learning/algorithms/matd3.py
@@ -270,23 +270,25 @@ def check_strategy_dimensions(self) -> None:
Also check if the unique observation dimensions are the same. If not, raise a ValueError.
This is important for the TD3 algorithm, as it uses a centralized critic that requires consistent dimensions across all agents.
"""
foresight_list = []
obs_dim_list = []
act_dim_list = []
unique_obs_dim_list = []
num_timeseries_obs_dim_list = []

for strategy in self.learning_role.rl_strats.values():
foresight_list.append(strategy.foresight)
obs_dim_list.append(strategy.obs_dim)
act_dim_list.append(strategy.act_dim)
unique_obs_dim_list.append(strategy.unique_obs_dim)
num_timeseries_obs_dim_list.append(strategy.num_timeseries_obs_dim)

if len(set(obs_dim_list)) > 1:
if len(set(foresight_list)) > 1:
raise ValueError(
f"All observation dimensions must be the same for all RL agents. The defined learning strategies have the following observation dimensions: {obs_dim_list}"
f"All foresight values must be the same for all RL agents. The defined learning strategies have the following foresight values: {foresight_list}"
)
else:
self.obs_dim = obs_dim_list[0]
self.foresight = foresight_list[0]

if len(set(act_dim_list)) > 1:
raise ValueError(
@@ -309,6 +311,14 @@ def check_strategy_dimensions(self) -> None:
else:
self.num_timeseries_obs_dim = num_timeseries_obs_dim_list[0]

# Check obs_dim last, as the other dimension checks should fail first!
if len(set(obs_dim_list)) > 1:
raise ValueError(
f"All observation dimensions must be the same for all RL agents. The defined learning strategies have the following observation dimensions: {obs_dim_list}"
)
else:
self.obs_dim = obs_dim_list[0]

def create_actors(self) -> None:
"""
Create actor networks for reinforcement learning for each unit strategy.
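The extended check follows the same pattern for every dimension: collect the attribute from all strategies and require a single unique value. A condensed, hypothetical sketch of that pattern (not the actual method):

def require_consistent(values: list, name: str):
    # the centralized critic needs identical dimensions across all agents
    if len(set(values)) > 1:
        raise ValueError(f"All {name} values must be the same for all RL agents: {values}")
    return values[0]

foresight = require_consistent([12, 12, 12], "foresight")   # -> 12
# require_consistent([12, 24, 12], "foresight") would raise a ValueError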
69 changes: 30 additions & 39 deletions assume/strategies/learning_strategies.py
@@ -117,6 +117,8 @@ def load_actor_params(self, load_path):

def prepare_observations(self, unit, market_id):
# scaling factors for the observations
# Note: These scaling factors could be interpreted as information leakage. However, as we are in a simulation environment and not a pure forecasting setting,
# we assume that the agent already has access to this information.
upper_scaling_factor_price = max(unit.forecaster.price[market_id])
lower_scaling_factor_price = min(unit.forecaster.price[market_id])
residual_load = unit.forecaster.residual_load.get(
@@ -185,6 +187,8 @@ def create_observation(
)

# --- 2. Historical actual prices (backward-looking) ---
# Note: We scale with the max_bid_price here, in contrast to the scaling of the forecast, where we use the max price of the forecast period.
# This is not consistent, but it has worked well so far. Future work could look into this in more detail.
scaled_price_history = (
unit.outputs["energy_accepted_price"].window(
start, self.foresight, direction="backward"
@@ -308,11 +312,11 @@ class EnergyLearningStrategy(TorchLearningStrategy, MinMaxStrategy):
on an Energy-Only Market.

The agent submits two price bids: one for the inflexible component (P_min) and another for
the flexible component (P_max - P_min) of its capacity. This strategy utilizes a set of 50
the flexible component (P_max - P_min) of its capacity. This strategy utilizes a set of 38
observations to generate actions, which are then transformed into market bids. The observation
space comprises two unique values: the marginal cost and the current capacity of the unit.

The observation space for this strategy consists of 50 elements, drawn from both the forecaster
The observation space for this strategy consists of 38 elements, drawn from both the forecaster
and the unit's internal state. Observations include the following components:

- **Forecasted Residual Load**: Forecasted load over the foresight period, scaled by the maximum
@@ -344,7 +348,7 @@ class EnergyLearningStrategy(TorchLearningStrategy, MinMaxStrategy):
Attributes
----------
foresight : int
Number of time steps for which the agent forecasts market conditions. Defaults to 24.
Number of time steps for which the agent forecasts market conditions. Defaults to 12.
max_bid_price : float
Maximum allowable bid price. Defaults to 100.
max_demand : float
@@ -375,24 +379,19 @@ class EnergyLearningStrategy(TorchLearningStrategy, MinMaxStrategy):
"""

def __init__(self, *args, **kwargs):
obs_dim = kwargs.pop("obs_dim", 38)
# 'foresight' represents the number of forward- and backward-looking time steps
# we consider when constructing the observations.
foresight = kwargs.pop("foresight", 12)
act_dim = kwargs.pop("act_dim", 2)
unique_obs_dim = kwargs.pop("unique_obs_dim", 2)
super().__init__(
obs_dim=obs_dim,
foresight=foresight,
act_dim=act_dim,
unique_obs_dim=unique_obs_dim,
*args,
**kwargs,
)

# 'foresight' represents the number of time steps into the future that we will consider
# when constructing the observations. This value is fixed for each strategy, as the
# neural network architecture is predefined, and the size of the observations must remain consistent.
# If you wish to modify the foresight length, remember to also update the 'obs_dim' parameter above,
# as the observation dimension depends on the foresight value.
self.foresight = 12

# define allowed order types
self.order_types = kwargs.get("order_types", ["SB"])

@@ -682,8 +681,8 @@ def calculate_reward(

# scaling factor to normalize the reward to the range [-1,1]
scaling = 1 / (self.max_bid_price * unit.max_power)
reward = scaling * (profit - regret_scale * opportunity_cost)
regret = regret_scale * opportunity_cost
reward = scaling * (profit - regret)

# Store results in unit outputs
# Note: these are not learning-specific results but stored for all units for analysis
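For intuition, the scaling keeps the reward roughly within [-1, 1]; a small illustration with mostly hypothetical numbers (only max_bid_price uses the documented default of 100):

max_bid_price = 100.0       # documented strategy default
max_power = 1000.0          # hypothetical unit size in MW
scaling = 1 / (max_bid_price * max_power)   # 1e-5

profit = 80_000.0           # hypothetical profit for the dispatched period
opportunity_cost = 20_000.0 # hypothetical foregone profit
regret_scale = 0.2          # hypothetical tuning factor

regret = regret_scale * opportunity_cost    # 4_000.0
reward = scaling * (profit - regret)        # 0.76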
@@ -722,20 +721,18 @@ class EnergyLearningSingleBidStrategy(EnergyLearningStrategy, MinMaxStrategy):
"""

def __init__(self, *args, **kwargs):
obs_dim = kwargs.pop("obs_dim", 74)
# we select 24 to be in line with the storage strategies
foresight = kwargs.pop("foresight", 24)
act_dim = kwargs.pop("act_dim", 1)
unique_obs_dim = kwargs.pop("unique_obs_dim", 2)
super().__init__(
obs_dim=obs_dim,
foresight=foresight,
act_dim=act_dim,
unique_obs_dim=unique_obs_dim,
*args,
**kwargs,
)

# we select 24 to be in line with the storage strategies
self.foresight = 24

def calculate_bids(
self,
unit: SupportsMinMax,
@@ -807,7 +804,7 @@ class StorageEnergyLearningStrategy(TorchLearningStrategy, MinMaxChargeStrategy)
Reinforcement Learning Strategy for a storage unit that enables the agent to learn
optimal bidding strategies on an Energy-Only Market.

The observation space for this strategy consists of 50 elements. Key components include:
The observation space for this strategy consists of 74 elements. Key components include:

- **State of Charge**: Represents the current level of energy in the storage unit,
influencing the bid direction and capacity.
@@ -868,24 +865,19 @@ class StorageEnergyLearningStrategy(TorchLearningStrategy, MinMaxChargeStrategy)
"""

def __init__(self, *args, **kwargs):
obs_dim = kwargs.pop("obs_dim", 74)
# 'foresight' represents the number of forward- and backward-looking time steps
# we consider when constructing the observations.
foresight = kwargs.pop("foresight", 24)
act_dim = kwargs.pop("act_dim", 1)
unique_obs_dim = kwargs.pop("unique_obs_dim", 2)
super().__init__(
obs_dim=obs_dim,
foresight=foresight,
act_dim=act_dim,
unique_obs_dim=unique_obs_dim,
*args,
**kwargs,
)

# 'foresight' represents the number of time steps into the future that we will consider
# when constructing the observations. This value is fixed for each strategy, as the
# neural network architecture is predefined, and the size of the observations must remain consistent.
# If you wish to modify the foresight length, remember to also update the 'obs_dim' parameter above,
# as the observation dimension depends on the foresight value.
self.foresight = 24

# define allowed order types
self.order_types = kwargs.get("order_types", ["SB"])

@@ -1168,24 +1160,19 @@ class RenewableEnergyLearningSingleBidStrategy(EnergyLearningSingleBidStrategy):
"""

def __init__(self, *args, **kwargs):
obs_dim = kwargs.pop("obs_dim", 75)
# 'foresight' represents the number of forward- and backward-looking time steps
# we consider when constructing the observations.
foresight = kwargs.pop("foresight", 24)
act_dim = kwargs.pop("act_dim", 1)
unique_obs_dim = kwargs.pop("unique_obs_dim", 3)
super().__init__(
obs_dim=obs_dim,
foresight=foresight,
act_dim=act_dim,
unique_obs_dim=unique_obs_dim,
*args,
**kwargs,
)

# 'foresight' represents the number of time steps into the future that we will consider
# when constructing the observations. This value is fixed for each strategy, as the
# neural network architecture is predefined, and the size of the observations must remain consistent.
# If you wish to modify the foresight length, remember to also update the 'obs_dim' parameter above,
# as the observation dimension depends on the foresight value.
self.foresight = 24

# define allowed order types
self.order_types = kwargs.get("order_types", ["SB"])

@@ -1308,12 +1295,16 @@ def calculate_reward(

profit = income - operational_cost

# Stabilizing learning: Limit positive profit to 10% of its absolute value.
# Stabilizing learning: Optionally limit positive profit to a fraction (e.g. 50%) of its absolute value.
# This reduces variance in rewards and prevents overfitting to extreme profit-seeking behavior.
# However, this does NOT prevent the agent from exploiting market inefficiencies if they exist.
# RL by nature identifies and exploits system weaknesses if they lead to higher profit.
# This is not a price cap but rather a stabilizing factor to avoid reward spikes affecting learning stability.
profit = min(profit, 0.5 * abs(profit))
# IMPORTANT: This is a clear case of reward tuning to stabilize learning - use with caution!
# profit_scale = 0.5

profit_scale = 1
profit = min(profit, profit_scale * abs(profit))

# get potential maximum infeed according to availability from order volume
# Note: this will only work as the correct reference point when the volume is not defined by an action
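To make the effect of the optional clipping explicit, a minimal sketch of the new profit_scale logic (standalone, with made-up profit values):

def clip_profit(profit: float, profit_scale: float) -> float:
    # with profit_scale = 1 this is a no-op; with 0.5 only positive profits are reduced
    return min(profit, profit_scale * abs(profit))

assert clip_profit(1000.0, 1.0) == 1000.0    # default in this PR: no clipping
assert clip_profit(1000.0, 0.5) == 500.0     # optional reward tuning
assert clip_profit(-1000.0, 0.5) == -1000.0  # losses are never reduced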
4 changes: 2 additions & 2 deletions docs/source/learning.rst
@@ -140,8 +140,8 @@ The Actor
We will explain the way learning works in ASSUME starting from the interface to the simulation, namely the bidding strategy of the power plants.
The bidding strategy, per definition in ASSUME, defines the way we formulate bids based on the technical restrictions of the unit.
In a learning setting, this is done by the actor network, which maps the observation to an action. The observation is thereby managed and collected by the unit's operator, as
summarized in the following picture. As you can see in the current working version, the observation space contains a residual load forecast for the next 24 hours and a price
forecast for 24 hours, as well as the current capacity of the power plant and its marginal costs.
summarized in the following picture. As you can see in the current working version, the observation space contains a residual load forecast and a price
forecast (for example, for the next 24 hours), as well as the current capacity of the power plant and its marginal costs.
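
As an illustration, such an observation can be thought of as a concatenation of the scaled time series windows and the unit-specific values. A simplified sketch, not the actual implementation (the default strategies also include a backward-looking price history as a third time series):

import numpy as np

foresight = 24  # forecast / history horizon in time steps

residual_load_forecast = np.zeros(foresight)  # scaled residual load forecast
price_forecast = np.zeros(foresight)          # scaled price forecast
price_history = np.zeros(foresight)           # scaled past accepted prices

observation = np.concatenate(
    [residual_load_forecast, price_forecast, price_history, [0.8, 0.45]]  # capacity, marginal cost (illustrative)
)
assert observation.shape == (3 * foresight + 2,)  # 74, matching the storage strategy default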

.. image:: img/ActorTask.jpg
:align: center