diff --git a/assume/common/base.py b/assume/common/base.py
index 73553835d..0b41b7d28 100644
--- a/assume/common/base.py
+++ b/assume/common/base.py
@@ -893,7 +893,7 @@ class LearningStrategy(BaseStrategy):
     convention when designing your create_observation method and the observation space.

     Attributes:
-        obs_dim (int): The observation dimension.
+        foresight (int): Number of forward- and backward-looking steps in the observations.
         act_dim (int): The action dimension.
         unique_obs_dim (int): The unique observation dimension.
         num_timeseries_obs_dim (int): The number of observation timeseries dimension.
@@ -907,7 +907,7 @@
     def __init__(
         self,
         learning_role,
-        obs_dim: int,
+        foresight: int,
         act_dim: int,
         unique_obs_dim: int,
         num_timeseries_obs_dim: int = 3,
@@ -923,7 +923,7 @@ def __init__(
         self.learning_role = learning_role
         self.learning_config = learning_role.learning_config

-        self.obs_dim = obs_dim
+        self.foresight = foresight
         self.act_dim = act_dim

         # this defines the number of unique observations, which are not the same for all units
@@ -934,6 +934,8 @@ def __init__(
         # them into suitable format for recurrent neural networks
         self.num_timeseries_obs_dim = num_timeseries_obs_dim

+        self.obs_dim = num_timeseries_obs_dim * foresight + unique_obs_dim
+

 class MinMaxStrategy(BaseStrategy):
     pass
diff --git a/assume/reinforcement_learning/algorithms/matd3.py b/assume/reinforcement_learning/algorithms/matd3.py
index 5bd4edef5..12d2a9a38 100644
--- a/assume/reinforcement_learning/algorithms/matd3.py
+++ b/assume/reinforcement_learning/algorithms/matd3.py
@@ -270,23 +270,25 @@ def check_strategy_dimensions(self) -> None:
         Also check if the unique observation dimensions are the same. If not, raise a ValueError.
         This is important for the TD3 algorithm, as it uses a centralized critic that requires consistent dimensions across all agents.
         """
+        foresight_list = []
         obs_dim_list = []
         act_dim_list = []
         unique_obs_dim_list = []
         num_timeseries_obs_dim_list = []

         for strategy in self.learning_role.rl_strats.values():
+            foresight_list.append(strategy.foresight)
             obs_dim_list.append(strategy.obs_dim)
             act_dim_list.append(strategy.act_dim)
             unique_obs_dim_list.append(strategy.unique_obs_dim)
             num_timeseries_obs_dim_list.append(strategy.num_timeseries_obs_dim)

-        if len(set(obs_dim_list)) > 1:
+        if len(set(foresight_list)) > 1:
             raise ValueError(
-                f"All observation dimensions must be the same for all RL agents. The defined learning strategies have the following observation dimensions: {obs_dim_list}"
+                f"All foresight values must be the same for all RL agents. The defined learning strategies have the following foresight values: {foresight_list}"
             )
         else:
-            self.obs_dim = obs_dim_list[0]
+            self.foresight = foresight_list[0]

         if len(set(act_dim_list)) > 1:
             raise ValueError(
@@ -309,6 +311,14 @@ def check_strategy_dimensions(self) -> None:
         else:
             self.num_timeseries_obs_dim = num_timeseries_obs_dim_list[0]

+        # Check obs_dim last, as the other dimension checks should fail first!
+        if len(set(obs_dim_list)) > 1:
+            raise ValueError(
+                f"All observation dimensions must be the same for all RL agents. The defined learning strategies have the following observation dimensions: {obs_dim_list}"
+            )
+        else:
+            self.obs_dim = obs_dim_list[0]
+
     def create_actors(self) -> None:
         """
         Create actor networks for reinforcement learning for each unit strategy.
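As a quick cross-check of the relation introduced above, the derived sizes reproduce the observation-space sizes quoted in the strategy docstrings further down, assuming the default `num_timeseries_obs_dim = 3`. This is a standalone sketch, not part of the patch; `derived_obs_dim` is a hypothetical helper:

    def derived_obs_dim(foresight: int, unique_obs_dim: int, num_timeseries_obs_dim: int = 3) -> int:
        # mirrors LearningStrategy.__init__: obs_dim = num_timeseries_obs_dim * foresight + unique_obs_dim
        return num_timeseries_obs_dim * foresight + unique_obs_dim

    assert derived_obs_dim(foresight=12, unique_obs_dim=2) == 38  # EnergyLearningStrategy
    assert derived_obs_dim(foresight=24, unique_obs_dim=2) == 74  # storage and single-bid strategies
    assert derived_obs_dim(foresight=24, unique_obs_dim=3) == 75  # RenewableEnergyLearningSingleBidStrategy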
diff --git a/assume/strategies/learning_strategies.py b/assume/strategies/learning_strategies.py
index 11464a7a4..b6a5e10db 100644
--- a/assume/strategies/learning_strategies.py
+++ b/assume/strategies/learning_strategies.py
@@ -117,6 +117,8 @@ def load_actor_params(self, load_path):

     def prepare_observations(self, unit, market_id):
         # scaling factors for the observations
+        # Note: These scaling factors could be interpreted as information leakage. However, since we are in a simulation environment
+        # and not a purely forecasting setting, we assume that the agent already has access to this information.
        upper_scaling_factor_price = max(unit.forecaster.price[market_id])
        lower_scaling_factor_price = min(unit.forecaster.price[market_id])
        residual_load = unit.forecaster.residual_load.get(
@@ -185,6 +187,8 @@ def create_observation(
         )

         # --- 2. Historical actual prices (backward-looking) ---
+        # Note: We scale with max_bid_price here, whereas the forecast is scaled with the maximum price of the forecast period.
+        # This is not consistent, but it has worked well so far. Future work could look into this in more detail.
         scaled_price_history = (
             unit.outputs["energy_accepted_price"].window(
                 start, self.foresight, direction="backward"
             )
@@ -308,11 +312,11 @@ class EnergyLearningStrategy(TorchLearningStrategy, MinMaxStrategy):
     on an Energy-Only Market.

     The agent submits two price bids: one for the inflexible component (P_min) and another for
-    the flexible component (P_max - P_min) of its capacity. This strategy utilizes a set of 50
+    the flexible component (P_max - P_min) of its capacity. This strategy utilizes a set of 38
     observations to generate actions, which are then transformed into market bids. The observation
     space comprises two unique values: the marginal cost and the current capacity of the unit.

-    The observation space for this strategy consists of 50 elements, drawn from both the forecaster
+    The observation space for this strategy consists of 38 elements, drawn from both the forecaster
     and the unit's internal state. Observations include the following components:

     - **Forecasted Residual Load**: Forecasted load over the foresight period, scaled by the maximum
@@ -344,7 +348,7 @@ class EnergyLearningStrategy(TorchLearningStrategy, MinMaxStrategy):
     Attributes
     ----------
     foresight : int
-        Number of time steps for which the agent forecasts market conditions. Defaults to 24.
+        Number of time steps for which the agent forecasts market conditions. Defaults to 12.
     max_bid_price : float
         Maximum allowable bid price. Defaults to 100.
     max_demand : float
@@ -375,24 +379,19 @@ class EnergyLearningStrategy(TorchLearningStrategy, MinMaxStrategy):
     """

     def __init__(self, *args, **kwargs):
-        obs_dim = kwargs.pop("obs_dim", 38)
+        # 'foresight' represents the number of time steps into the future that we will consider
+        # when constructing the observations.
+        foresight = kwargs.pop("foresight", 12)
         act_dim = kwargs.pop("act_dim", 2)
         unique_obs_dim = kwargs.pop("unique_obs_dim", 2)

         super().__init__(
-            obs_dim=obs_dim,
+            foresight=foresight,
             act_dim=act_dim,
             unique_obs_dim=unique_obs_dim,
             *args,
             **kwargs,
         )

-        # 'foresight' represents the number of time steps into the future that we will consider
-        # when constructing the observations. This value is fixed for each strategy, as the
-        # neural network architecture is predefined, and the size of the observations must remain consistent.
-        # If you wish to modify the foresight length, remember to also update the 'obs_dim' parameter above,
-        # as the observation dimension depends on the foresight value.
-        self.foresight = 12
-
         # define allowed order types
         self.order_types = kwargs.get("order_types", ["SB"])
@@ -682,8 +681,8 @@ def calculate_reward(
         # scaling factor to normalize the reward to the range [-1,1]
         scaling = 1 / (self.max_bid_price * unit.max_power)

-        reward = scaling * (profit - regret_scale * opportunity_cost)
         regret = regret_scale * opportunity_cost
+        reward = scaling * (profit - regret)

         # Store results in unit outputs
         # Note: these are not learning-specific results but stored for all units for analysis
@@ -722,20 +721,18 @@ class EnergyLearningSingleBidStrategy(EnergyLearningStrategy, MinMaxStrategy):
     """

     def __init__(self, *args, **kwargs):
-        obs_dim = kwargs.pop("obs_dim", 74)
+        # we select 24 to be in line with the storage strategies
+        foresight = kwargs.pop("foresight", 24)
         act_dim = kwargs.pop("act_dim", 1)
         unique_obs_dim = kwargs.pop("unique_obs_dim", 2)

         super().__init__(
-            obs_dim=obs_dim,
+            foresight=foresight,
             act_dim=act_dim,
             unique_obs_dim=unique_obs_dim,
             *args,
             **kwargs,
         )

-        # we select 24 to be in line with the storage strategies
-        self.foresight = 24
-
     def calculate_bids(
         self,
         unit: SupportsMinMax,
@@ -807,7 +804,7 @@ class StorageEnergyLearningStrategy(TorchLearningStrategy, MinMaxChargeStrategy)
     Reinforcement Learning Strategy for a storage unit that enables the agent to learn optimal bidding strategies
     on an Energy-Only Market.

-    The observation space for this strategy consists of 50 elements. Key components include:
+    The observation space for this strategy consists of 74 elements. Key components include:

     - **State of Charge**: Represents the current level of energy in the storage unit, influencing
       the bid direction and capacity.
@@ -868,24 +865,19 @@ class StorageEnergyLearningStrategy(TorchLearningStrategy, MinMaxChargeStrategy)
     """

     def __init__(self, *args, **kwargs):
-        obs_dim = kwargs.pop("obs_dim", 74)
+        # 'foresight' represents the number of time steps into the future that we will consider
+        # when constructing the observations.
+        foresight = kwargs.pop("foresight", 24)
         act_dim = kwargs.pop("act_dim", 1)
         unique_obs_dim = kwargs.pop("unique_obs_dim", 2)

         super().__init__(
-            obs_dim=obs_dim,
+            foresight=foresight,
             act_dim=act_dim,
             unique_obs_dim=unique_obs_dim,
             *args,
             **kwargs,
         )

-        # 'foresight' represents the number of time steps into the future that we will consider
-        # when constructing the observations. This value is fixed for each strategy, as the
-        # neural network architecture is predefined, and the size of the observations must remain consistent.
-        # If you wish to modify the foresight length, remember to also update the 'obs_dim' parameter above,
-        # as the observation dimension depends on the foresight value.
-        self.foresight = 24
-
         # define allowed order types
         self.order_types = kwargs.get("order_types", ["SB"])
@@ -1168,24 +1160,19 @@ class RenewableEnergyLearningSingleBidStrategy(EnergyLearningSingleBidStrategy):
     """

     def __init__(self, *args, **kwargs):
-        obs_dim = kwargs.pop("obs_dim", 75)
+        # 'foresight' represents the number of time steps into the future that we will consider
+        # when constructing the observations.
+        foresight = kwargs.pop("foresight", 24)
         act_dim = kwargs.pop("act_dim", 1)
         unique_obs_dim = kwargs.pop("unique_obs_dim", 3)

         super().__init__(
-            obs_dim=obs_dim,
+            foresight=foresight,
             act_dim=act_dim,
             unique_obs_dim=unique_obs_dim,
             *args,
             **kwargs,
         )

-        # 'foresight' represents the number of time steps into the future that we will consider
-        # when constructing the observations. This value is fixed for each strategy, as the
-        # neural network architecture is predefined, and the size of the observations must remain consistent.
-        # If you wish to modify the foresight length, remember to also update the 'obs_dim' parameter above,
-        # as the observation dimension depends on the foresight value.
-        self.foresight = 24
-
         # define allowed order types
         self.order_types = kwargs.get("order_types", ["SB"])
@@ -1308,12 +1295,16 @@ def calculate_reward(
         profit = income - operational_cost

-        # Stabilizing learning: Limit positive profit to 10% of its absolute value.
+        # Stabilizing learning (optional): limit positive profit to a fraction (profit_scale) of its absolute value.
         # This reduces variance in rewards and prevents overfitting to extreme profit-seeking behavior.
         # However, this does NOT prevent the agent from exploiting market inefficiencies if they exist.
         # RL by nature identifies and exploits system weaknesses if they lead to higher profit.
         # This is not a price cap but rather a stabilizing factor to avoid reward spikes affecting learning stability.
-        profit = min(profit, 0.5 * abs(profit))
+        # IMPORTANT: This is a clear case of reward tuning to stabilize learning - use with caution!
+        # profit_scale = 0.5
+
+        profit_scale = 1
+        profit = min(profit, profit_scale * abs(profit))

         # get potential maximum infeed according to availability from order volume
         # Note: this will only work as the correct reference point when the volume is not defined by an action
diff --git a/docs/source/learning.rst b/docs/source/learning.rst
index 089c068e0..21cfbdd47 100644
--- a/docs/source/learning.rst
+++ b/docs/source/learning.rst
@@ -140,8 +140,8 @@ The Actor
 We will explain the way learning works in ASSUME starting from the interface to the simulation, namely the bidding strategy of the power plants. The bidding strategy, per definition in ASSUME, defines the way we formulate bids based on the technical restrictions of the unit. In a learning setting, this is done by the actor network which maps the observation to an action. The observation thereby is managed and collected by the units operator as
-summarized in the following picture. As you can see in the current working version, the observation space contains a residual load forecast for the next 24 hours and a price
-forecast for 24 hours, as well as the current capacity of the power plant and its marginal costs.
+summarized in the following picture. As you can see in the current working version, the observation space contains a residual load forecast and a price
+forecast over the chosen foresight horizon (for example the next 24 hours), as well as the current capacity of the power plant and its marginal costs.

 .. image:: img/ActorTask.jpg
     :align: center
diff --git a/docs/source/learning_algorithm.rst b/docs/source/learning_algorithm.rst
index 2121d2345..640f2663e 100644
--- a/docs/source/learning_algorithm.rst
+++ b/docs/source/learning_algorithm.rst
@@ -31,29 +31,29 @@ The following table shows the options that can be adjusted and gives a short exp
 ======================================== ==========================================================================================================
 learning_mode Should we use learning mode at all? If False, the learning bidding strategy is loaded from trained_policies_load_path and no training occurs. Default is False.
 evaluation_mode This setting is modified internally. Whether to run in evaluation mode. If True, the agent uses the learned policy without exploration noise and no training updates occur. Default is False.
-continue_learning Whether to use pre-learned strategies and then continue learning. If True, loads existing policies from trained_policies_load_path and continues training. Default is False.
+continue_learning Whether to use pre-learned strategies and then continue learning. If True, loads existing policies from trained_policies_load_path and continues training. Note: Set this to True when you have a pretrained model and want incremental learning under new data or scenarios. Leave it False for clean experiments. Default is False.
 trained_policies_save_path The directory path - relative to the scenario's inputs_path - where newly trained RL policies (actor and critic networks) will be saved. Only needed when learning_mode is True. Value is set in setup_world(). Defaults otherwise to None.
 trained_policies_load_path The directory path - relative to the scenario's inputs_path - from which pre-trained policies should be loaded. Needed when continue_learning is True or using pre-trained strategies. Default is None.
-min_bid_price The minimum bid price which limits the action of the actor to this price. Used to constrain the actor's output to a realistic price range. Default is -100.0.
-max_bid_price The maximum bid price which limits the action of the actor to this price. Used to constrain the actor's output to a realistic price range. Default is 100.0.
+min_bid_price The minimum bid price which limits the action of the actor to this price. Used to constrain the actor's output to a price range. Note: Best practice is to keep this parameter as unrestrictive as possible. When agent bid convergence is guaranteed to occur above zero, increasing the minimum bid value can reduce training times. Default is -100.0.
+max_bid_price The maximum bid price which limits the action of the actor to this price. Used to constrain the actor's output to a price range. Note: Align this with realistic market constraints. Too low a value limits the strategy space; too high a value leads to noisy learning. Default is 100.0.
 device The device to use for PyTorch computations. Options include "cpu", "cuda", or specific CUDA devices like "cuda:0". Default is "cpu".
-episodes_collecting_initial_experience The number of episodes at the start during which random actions are chosen instead of using the actor network. This helps populate the replay buffer with diverse experiences. Default is 5.
+episodes_collecting_initial_experience The number of episodes at the start during which random actions are chosen instead of using the actor network. This helps populate the replay buffer with diverse experiences. Note: Increase (5–20) for larger environments. Too low a value causes early high variance and instability; too high a value wastes time. Default is 5.
 exploration_noise_std The standard deviation of Gaussian noise added to actions during exploration in the environment. Higher values encourage more exploration. Default is 0.2.
 training_episodes The number of training episodes, where one episode is the entire simulation horizon specified in the general config. Default is 100.
-validation_episodes_interval The interval (in episodes) at which validation episodes are run to evaluate the current policy's performance without training updates. Default is 5.
-train_freq Defines the frequency in time steps at which the actor and critic networks are updated. Accepts time strings like "24h" for 24 hours or "1d" for 1 day. Default is "24h".
+validation_episodes_interval The interval (in episodes) at which validation episodes are run to evaluate the current policy's performance without training updates. Note: With long simulation horizons, choosing a higher value reduces training time. Default is 5.
+train_freq Defines the frequency in time steps at which the actor and critic networks are updated. Accepts time strings like "24h" for 24 hours or "1d" for 1 day. Note: Shorter intervals mean frequent updates and faster but less stable learning; longer intervals are slower but more reliable. Use intervals > "72h" for units that require time coupling, such as storages. Default is "24h".
 batch_size The batch size of experiences sampled from the replay buffer for each training update. Larger batches provide more stable gradients but require more memory. In environments with many learning agents we advise small batch sizes. Default is 128.
-gradient_steps The number of gradient descent steps performed during each training update. More steps can lead to better learning but increase computation time. Default is 100.
-learning_rate The learning rate (step size) for the optimizer, which controls how much the policy and value networks are updated during training. Default is 0.001.
+gradient_steps The number of gradient descent steps performed during each training update. More steps can lead to better learning but increase computation time. Note: For environments with many agents, one should not use many gradient steps, as the policies of the other agents are updated as well, outdating the current best strategy. Default is 100.
+learning_rate The learning rate (step size) for the optimizer, which controls how much the policy and value networks are updated during training. Note: Start around 1e-3. Decrease (e.g. 3e-4, 1e-4) if training oscillates or diverges. Default is 0.001.
 learning_rate_schedule Which learning rate decay schedule to use. Currently only "linear" decay is available, which linearly decreases the learning rate over time. Default is None (constant learning rate).
-early_stopping_steps The number of validation steps over which the moving average reward is calculated for early stopping. If the reward doesn't change by early_stopping_threshold over this many steps, training stops. If None, defaults to training_episodes / validation_episodes_interval + 1.
-early_stopping_threshold The minimum improvement in moving average reward required to avoid early stopping. If the reward improvement is less than this threshold over early_stopping_steps, training is terminated early. Default is 0.05.
+early_stopping_steps The number of validation steps over which the moving average reward is calculated for early stopping. If the reward doesn't change by early_stopping_threshold over this many steps, training stops. Note: This prevents wasting compute on runs that have plateaued. Higher values are safer for noisy environments to avoid premature stopping; lower values react faster in stable settings. If None, defaults to training_episodes / validation_episodes_interval + 1.
+early_stopping_threshold The minimum improvement in moving average reward required to avoid early stopping. If the reward improvement is less than this threshold over early_stopping_steps, training is terminated early. Note: If training stops too early, reduce the threshold. In noisy environments, combine a lower threshold with higher early_stopping_steps. Default is 0.05.
 algorithm Specifies which reinforcement learning algorithm to use. Currently, only "matd3" (Multi-Agent Twin Delayed Deep Deterministic Policy Gradient) is implemented. Default is "matd3".
 replay_buffer_size The maximum number of transitions stored in the replay buffer for experience replay. Larger buffers allow for more diverse training samples. Default is 500000.
-gamma The discount factor for future rewards, ranging from 0 to 1. Higher values give more weight to long-term rewards in decision-making. Default is 0.99.
+gamma The discount factor for future rewards, ranging from 0 to 1. Higher values give more weight to long-term rewards in decision-making and should be chosen for units with time coupling, such as storages. Default is 0.99.
 actor_architecture The architecture of the neural networks used for the actors. Options include "mlp" (Multi-Layer Perceptron) and "lstm" (Long Short-Term Memory). Default is "mlp".
 policy_delay The frequency (in gradient steps) at which the actor policy is updated. TD3 updates the critic more frequently than the actor to stabilize training. Default is 2.
-noise_sigma The standard deviation of the Ornstein-Uhlenbeck or Gaussian noise distribution used to generate exploration noise added to actions. Default is 0.1.
+noise_sigma The standard deviation of the Ornstein-Uhlenbeck or Gaussian noise distribution used to generate exploration noise added to actions. Note: In multi-agent environments, higher noise levels are necessary to encourage sufficient exploration. Default is 0.1.
 noise_scale The scale factor multiplied by the noise drawn from the distribution. Larger values increase exploration. Default is 1.
 noise_dt The time step parameter for the Ornstein-Uhlenbeck process, which determines how quickly the noise decays over time. Used for noise scheduling. Default is 1.
 action_noise_schedule Which action noise decay schedule to use. Currently only "linear" decay is available, which linearly decreases exploration noise over training. Default is "linear".
diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
index 48e8958eb..7128cde42 100644
--- a/docs/source/release_notes.rst
+++ b/docs/source/release_notes.rst
@@ -24,13 +24,11 @@ Upcoming Release
 - **Application of new naming convention for bidding strategies**: [unit]_[market]_[method]_[comment] for bidding strategy keys (in snake_case) and [Unit][Market][Method][Comment]Strategy for bidding strategy classes (in PascalCase for classes)
 - **Restructured learning_role tasks**: Major learning changes that make learning application more generalizable across the framework.
-
   - **Simplified learning data flow:** Removed the special ``learning_unit_operator`` that previously aggregated unit data and forwarded it to the learning role. Eliminates the single-sender dependency and avoids double bookkeeping across units and operators.
   - **Direct write access:** All learning-capable entities (units, unit operators, market agents) now write learning data directly to the learning role.
   - **Centralized logic:** Learning-related functionality is now almost always contained within the learning role, improving maintainability.
-  .. note::
-     Distributed learning across multiple machines is no longer supported, but this feature was not in active use.
-
+  - **Automatic calculation of obs_dim:** The observation dimension is now calculated automatically from the foresight, num_timeseries_obs_dim and unique_obs_dim defined in the learning strategy. This avoids inconsistencies between the defined observation space and the actual observation dimension used in the actor network. However, it assumes the rationale that 'self.obs_dim = num_timeseries_obs_dim * foresight + unique_obs_dim'; if this is not the case, the calculation of obs_dim needs to be adjusted in the learning strategy.
+  - **Note:** Distributed learning across multiple machines is no longer supported, but this feature was not in active use.
 - **Restructured learning configuration**: All learning-related configuration parameters are now contained within a single `learning_config` dictionary in the `config.yaml` file. This change simplifies configuration management and avoids ambiguous setting of defaults.

 .. note::
@@ -39,6 +37,7 @@ Upcoming Release
 - **Learning_role in all cases involving DRL**: The `learning_role` is now available in all simulations involving DRL, also if pre-trained strategies are loaded and no policy updates are performed. This change ensures consistent handling of learning configurations and simplifies the codebase by removing special cases.
 - **Final DRL simulation with last policies**: After training, the final simulation now uses the last trained policies instead of the best policies. This change provides a more accurate representation of the learned behavior, as the last policies reflect the most recent training state. Additionally, multi-agent simulations do not always converge to the maximum reward. E.g. competing agents may underbid each other to gain market share, leading to lower overall rewards while reaching a stable state nevertheless.
+
 **New Features:**
 - **Unit Operator Portfolio Strategy**: A new bidding strategy type that enables portfolio optimization, where the default is called `UnitsOperatorEnergyNaiveDirectStrategy`. This strategy simply passes through bidding decisions of individual units within a portfolio, which was the default behavior beforehand as well. Further we added 'UnitsOperatorEnergyHeuristicCournotStrategy' which allows to model bidding behavior of a portfolio of units in a day-ahead market. The strategy calculates the optimal bid price and quantity for each unit in the portfolio, taking into account markup and the production costs of the units. This enables users to simulate and analyze the impact of strategic portfolio bidding on market outcomes and unit profitability.
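The release note above states that the base class derives `obs_dim` as `num_timeseries_obs_dim * foresight + unique_obs_dim`, and that strategies whose observation layout deviates from this relation must adjust `obs_dim` themselves. A minimal sketch of such an override, assuming a hypothetical strategy class and an illustrative observation layout (neither is part of the patch):

    from assume.common.base import LearningStrategy


    class CustomObsDimStrategy(LearningStrategy):
        # Hypothetical strategy whose observation vector deviates from the default relation.

        def __init__(self, *args, **kwargs):
            foresight = kwargs.pop("foresight", 24)
            act_dim = kwargs.pop("act_dim", 1)
            unique_obs_dim = kwargs.pop("unique_obs_dim", 2)

            super().__init__(
                foresight=foresight,
                act_dim=act_dim,
                unique_obs_dim=unique_obs_dim,
                *args,
                **kwargs,
            )

            # The base class sets obs_dim = num_timeseries_obs_dim * foresight + unique_obs_dim.
            # If create_observation were to use, for example, only half the foresight window for
            # the price history, the derived value must be overwritten to match the actual vector.
            self.obs_dim = 2 * foresight + foresight // 2 + unique_obs_dim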
diff --git a/examples/notebooks/04b_reinforcement_learning_example.ipynb b/examples/notebooks/04b_reinforcement_learning_example.ipynb
index 73ccdaa0d..ef634eccf 100644
--- a/examples/notebooks/04b_reinforcement_learning_example.ipynb
+++ b/examples/notebooks/04b_reinforcement_learning_example.ipynb
@@ -484,20 +484,18 @@
     "        \"\"\"\n",
     "\n",
     "    def __init__(self, *args, **kwargs):\n",
-    "        obs_dim = kwargs.pop(\"obs_dim\", 38)  # Forecasts + history + individual values\n",
+    "        # Forecast horizon (in timesteps) used for market and residual load forecasts\n",
+    "        foresight = kwargs.pop(\"foresight\", 12)\n",
     "        act_dim = kwargs.pop(\"act_dim\", 1)  # One action: bid price\n",
     "        unique_obs_dim = kwargs.pop(\"unique_obs_dim\", 2)  # Number of individual obs\n",
     "\n",
     "        super().__init__(\n",
-    "            obs_dim=obs_dim,\n",
+    "            foresight=foresight,\n",
     "            act_dim=act_dim,\n",
     "            unique_obs_dim=unique_obs_dim,\n",
     "            *args,\n",
     "            **kwargs,\n",
-    "        )\n",
-    "\n",
-    "        # Forecast horizon (in timesteps) used for market and residual load forecasts\n",
-    "        self.foresight = 12"
+    "        )"
   ]
  },
 {
@@ -505,6 +503,8 @@
   "id": "f4bc511c",
   "metadata": {},
   "source": [
+    "With your defined foresight range, the global observations are assembled in the `create_observation` function of the base class. Based on the chosen foresight, the observation space dimension is calculated automatically as `self.obs_dim = num_timeseries_obs_dim * foresight + unique_obs_dim`, as defined in the base class. If you want to change that rationale, it needs to be overridden in the learning strategy itself.\n",
+    "\n",
     "### 3.4 Exercise 1: Define Individual Observations\n",
     "\n",
     "Now you will implement the following method:"
   ]
  },
 {
@@ -1461,7 +1461,13 @@
   "execution_count": null,
   "id": "cb4251a9",
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": []
+    }
+   ],
   "source": [
    "log = logging.getLogger(__name__)\n",
    "csv_path = \"outputs\"\n",
@@ -1841,7 +1847,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": ".venv",
+   "display_name": "assume-framework",
    "language": "python",
    "name": "python3"
   },
@@ -1855,7 +1861,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.13.9"
+   "version": "3.11.9"
   }
  },
  "nbformat": 4,
diff --git a/examples/notebooks/04c_reinforcement_learning_storage_example.ipynb b/examples/notebooks/04c_reinforcement_learning_storage_example.ipynb
index bcdcd4a15..097a3ac40 100644
--- a/examples/notebooks/04c_reinforcement_learning_storage_example.ipynb
+++ b/examples/notebooks/04c_reinforcement_learning_storage_example.ipynb
@@ -479,20 +479,20 @@
     "        \"\"\"\n",
     "\n",
     "    def __init__(self, *args, **kwargs):\n",
-    "        obs_dim = kwargs.pop(\"obs_dim\", 74)  # Forecasts + history + individual values\n",
+    "        # Forecast horizon (in timesteps) used for market and residual load forecasts\n",
+    "        foresight = None  # Your choice here\n",
     "        act_dim = kwargs.pop(\"act_dim\", 1)  # One action: bid price\n",
     "        unique_obs_dim = kwargs.pop(\"unique_obs_dim\", 2)  # Number of individual obs\n",
     "\n",
+    "        # all further calculations are handled in the parent classes,\n",
+    "        # e.g. the general observation calculation based on the foresight\n",
     "        super().__init__(\n",
-    "            obs_dim=obs_dim,\n",
+    "            foresight=foresight,\n",
    "            act_dim=act_dim,\n",
    "            unique_obs_dim=unique_obs_dim,\n",
    "            *args,\n",
    "            **kwargs,\n",
-    "        )\n",
-    "\n",
-    "        # Forecast horizon (in timesteps) used for market and residual load forecasts\n",
-    "        self.foresight = None  # Your implementation here"
+    "        )"
   ]
  },
 {
@@ -517,20 +517,18 @@
     "        \"\"\"\n",
     "\n",
     "    def __init__(self, *args, **kwargs):\n",
-    "        obs_dim = kwargs.pop(\"obs_dim\", 74)  # Forecasts + history + individual values\n",
+    "        # Forecast horizon (in timesteps) used for market and residual load forecasts\n",
+    "        foresight = 24  # Your implementation here\n",
     "        act_dim = kwargs.pop(\"act_dim\", 1)  # One action: bid price\n",
     "        unique_obs_dim = kwargs.pop(\"unique_obs_dim\", 2)  # Number of individual obs\n",
     "\n",
     "        super().__init__(\n",
-    "            obs_dim=obs_dim,\n",
+    "            foresight=foresight,\n",
     "            act_dim=act_dim,\n",
     "            unique_obs_dim=unique_obs_dim,\n",
     "            *args,\n",
     "            **kwargs,\n",
-    "        )\n",
-    "\n",
-    "        # Forecast horizon (in timesteps) used for market and residual load forecasts\n",
-    "        self.foresight = 24  # Your implementation here"
+    "        )"
   ]
  },
 {
@@ -540,7 +538,7 @@
   "source": [
    "For storages, we recommend a foresight of **24 hours**, which aligns with standard industry practice and allows for daily charge/discharge cycles. Note that longer foresight increases the size of the observation space, as **each forecasted time series (e.g., price, residual load)** is extended accordingly. If you're designing seasonal storage agents (e.g., hydrogen or pumped hydro), you may consider even longer horizons—but beware the combinatorial explosion of the input space.\n",
    "\n",
-   "With this foresight range the global observations are defined in the function `create_observation`of the base class. We focus here on the individual observations in the next chapter."
+   "With this foresight range, the global observations are assembled in the `create_observation` function of the base class. Based on the chosen foresight, the observation space dimension is calculated automatically as `self.obs_dim = num_timeseries_obs_dim * foresight + unique_obs_dim`, as defined in the base class. If you want to change that rationale, it needs to be overridden in the learning strategy itself. We focus here on the individual observations in the next chapter."
   ]
  },
 {
@@ -1968,7 +1966,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": ".venv",
+   "display_name": "assume-framework",
    "language": "python",
    "name": "python3"
   },
@@ -1982,7 +1980,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.13.9"
+   "version": "3.11.9"
   }
  },
  "nbformat": 4,
diff --git a/examples/notebooks/09_example_Sim_and_xRL.ipynb b/examples/notebooks/09_example_Sim_and_xRL.ipynb
index 97b0ea452..0fe7a4a72 100644
--- a/examples/notebooks/09_example_Sim_and_xRL.ipynb
+++ b/examples/notebooks/09_example_Sim_and_xRL.ipynb
@@ -409,7 +409,6 @@
     "    \"end_date\": \"2019-01-01 23:00\",\n",
     "    \"time_step\": \"1h\",\n",
     "    \"save_frequency_hours\": 4,\n",
-    "    \"learning_mode\": \"True\",\n",
     "    \"markets_config\": {\n",
     "        \"zonal\": {\n",
     "            \"operator\": \"EOM_operator\",\n",
diff --git a/tests/test_learning_role.py b/tests/test_learning_role.py
index 1f938cef0..1b2a8df83 100644
--- a/tests/test_learning_role.py
+++ b/tests/test_learning_role.py
@@ -26,7 +26,7 @@
 @pytest.mark.require_learning
 def test_learning_init():
     config = {
-        "obs_dim": 3,
+        "foresight": 1,
         "act_dim": 2,
         "unique_obs_dim": 0,
         "learning_config": LearningConfig(
diff --git a/tests/test_matd3.py b/tests/test_matd3.py
index 5520dc483..0471f84eb 100644
--- a/tests/test_matd3.py
+++ b/tests/test_matd3.py
@@ -30,11 +30,15 @@
 @pytest.fixture
 def base_learning_config() -> dict:
+    foresight = 2
+    unique_obs_dim = 2
+    num_timeseries_obs_dim = 4
     return {
-        "obs_dim": 10,
+        "foresight": foresight,
         "act_dim": 3,
-        "unique_obs_dim": 2,
-        "num_timeseries_obs_dim": 4,
+        "unique_obs_dim": unique_obs_dim,
+        "num_timeseries_obs_dim": num_timeseries_obs_dim,
+        "obs_dim": foresight * num_timeseries_obs_dim + unique_obs_dim,
         "learning_config": LearningConfig(
             train_freq="1h",
             algorithm="matd3",
@@ -409,7 +413,7 @@ def test_td3_load_corrupted_or_incomplete_critic(tmp_path, base_learning_config)
 @pytest.mark.parametrize(
     "mod_field, mod_value, expected_error",
     [
-        ("obs_dim", 99, "All observation dimensions must be the same"),
+        ("foresight", 99, "All foresight values must be the same"),
         ("act_dim", 99, "All action dimensions must be the same"),
         ("unique_obs_dim", 99, "All unique_obs_dim values must be the same"),
         (