diff --git a/assume/common/base.py b/assume/common/base.py index 1d7abcd08..464a5ef4d 100644 --- a/assume/common/base.py +++ b/assume/common/base.py @@ -2,9 +2,10 @@ # # SPDX-License-Identifier: AGPL-3.0-or-later +import logging from collections import defaultdict +from dataclasses import dataclass from datetime import datetime, timedelta -from typing import TypedDict import numpy as np @@ -12,6 +13,8 @@ from assume.common.forecaster import UnitForecaster from assume.common.market_objects import MarketConfig, Orderbook, Product +logger = logging.getLogger(__name__) + class BaseStrategy: pass @@ -744,6 +747,142 @@ def remove_empty_bids(self, bids: list) -> list: return cleaned_bids +@dataclass +class LearningConfig: + """ + A class for the learning configuration. + + Attributes: + learning_mode (bool): Should we use learning mode at all? If False, the learning bidding strategy is + loaded from trained_policies_load_path and no training occurs. Default is False. + evaluation_mode (bool): This setting is modified internally. Whether to run in evaluation mode. If True, the agent uses the learned policy + without exploration noise and no training updates occur. Default is False. + continue_learning (bool): Whether to use pre-learned strategies and then continue learning. + If True, loads existing policies from trained_policies_load_path and continues training. Default is False. + trained_policies_save_path (str | None): The directory path - relative to the scenario's inputs_path - where newly trained RL policies (actor and + critic networks) will be saved. Only needed when learning_mode is True. Value is set in setup_world(). Defaults to None. + trained_policies_load_path (str | None): The directory path - relative to the scenario's inputs_path - from which pre-trained policies should be + loaded. Needed when continue_learning is True or using pre-trained strategies. Default is None. + + min_bid_price (float | None): The minimum bid price which limits the action of the actor to this price. + Used to constrain the actor's output to a realistic price range. Default is -100.0. + max_bid_price (float | None): The maximum bid price which limits the action of the actor to this price. + Used to constrain the actor's output to a realistic price range. Default is 100.0. + + device (str): The device to use for PyTorch computations. Options include "cpu", "cuda", or specific + CUDA devices like "cuda:0". Default is "cpu". + episodes_collecting_initial_experience (int): The number of episodes at the start during which random + actions are chosen instead of using the actor network. This helps populate the replay buffer with + diverse experiences. Default is 5. + exploration_noise_std (float): The standard deviation of Gaussian noise added to actions during + exploration in the environment. Higher values encourage more exploration. Default is 0.2. + training_episodes (int): The number of training episodes, where one episode is the entire simulation + horizon specified in the general config. Default is 100. + validation_episodes_interval (int): The interval (in episodes) at which validation episodes are run + to evaluate the current policy's performance without training updates. Default is 5. + train_freq (str): Defines the frequency in time steps at which the actor and critic networks are updated. + Accepts time strings like "24h" for 24 hours or "1d" for 1 day. Default is "24h". + batch_size (int): The batch size of experiences sampled from the replay buffer for each training update. 
+ Larger batches provide more stable gradients but require more memory. In environments with many leanring agents we advise small batch sizes. + Default is 128. + gradient_steps (int): The number of gradient descent steps performed during each training update. + More steps can lead to better learning but increase computation time. Default is 100. + learning_rate (float): The learning rate (step size) for the optimizer, which controls how much the + policy and value networks are updated during training. Default is 0.001. + learning_rate_schedule (str | None): Which learning rate decay schedule to use. Currently only "linear" + decay is available, which linearly decreases the learning rate over time. Default is None (constant learning rate). + early_stopping_steps (int | None): The number of validation steps over which the moving average reward + is calculated for early stopping. If the reward doesn't change by early_stopping_threshold over + this many steps, training stops. If None, defaults to training_episodes / validation_episodes_interval + 1. + early_stopping_threshold (float): The minimum improvement in moving average reward required to avoid + early stopping. If the reward improvement is less than this threshold over early_stopping_steps, + training is terminated early. Default is 0.05. + + algorithm (str): Specifies which reinforcement learning algorithm to use. Currently, only "matd3" + (Multi-Agent Twin Delayed Deep Deterministic Policy Gradient) is implemented. Default is "matd3". + replay_buffer_size (int): The maximum number of transitions stored in the replay buffer for experience replay. + Larger buffers allow for more diverse training samples. Default is 500000. + gamma (float): The discount factor for future rewards, ranging from 0 to 1. Higher values give more + weight to long-term rewards in decision-making. Default is 0.99. + actor_architecture (str): The architecture of the neural networks used for the actors. Options include + "mlp" (Multi-Layer Perceptron) and "lstm" (Long Short-Term Memory). Default is "mlp". + policy_delay (int): The frequency (in gradient steps) at which the actor policy is updated. + TD3 updates the critic more frequently than the actor to stabilize training. Default is 2. + noise_sigma (float): The standard deviation of the Ornstein-Uhlenbeck or Gaussian noise distribution + used to generate exploration noise added to actions. Default is 0.1. + noise_scale (int): The scale factor multiplied by the noise drawn from the distribution. + Larger values increase exploration. Default is 1. + noise_dt (int): The time step parameter for the Ornstein-Uhlenbeck process, which determines how + quickly the noise decays over time. Used for noise scheduling. Default is 1. + action_noise_schedule (str | None): Which action noise decay schedule to use. Currently only "linear" + decay is available, which linearly decreases exploration noise over training. Default is "linear". + tau (float): The soft update coefficient for updating target networks. Controls how slowly target + networks track the main networks. Smaller values mean slower updates. Default is 0.005. + target_policy_noise (float): The standard deviation of noise added to target policy actions during + critic updates. This smoothing helps prevent overfitting to narrow policy peaks. Default is 0.2. + target_noise_clip (float): The maximum absolute value for clipping the target policy noise. + Prevents the noise from being too large. Default is 0.5. 
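Illustrative sketch (not itself part of the patch): the attribute list above documents every tunable field and its default, so a short example instantiation may help when reading the field declarations and the __post_init__ that follow. The chosen values (50 training episodes, validation interval of 5) are arbitrary; the import path is the module this diff touches.

from assume.common.base import LearningConfig

# sketch only: every omitted field falls back to the documented defaults
# (batch_size=128, gamma=0.99, algorithm="matd3", train_freq="24h", ...)
cfg = LearningConfig(
    learning_mode=True,
    training_episodes=50,
    validation_episodes_interval=5,
)

# early_stopping_steps is left at None, so __post_init__ (further down) derives it as
# int(training_episodes / validation_episodes_interval + 1) = int(50 / 5 + 1) = 11
assert cfg.early_stopping_steps == 11
assert cfg.batch_size == 128

One detail worth flagging while reading the defaults: the docstring lists replay_buffer_size as 500000, whereas the field default below is 50000 (the pre-patch loader used 5e5); presumably one of the two should be aligned.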
+ + """ + + learning_mode: bool = False + evaluation_mode: bool = False + continue_learning: bool = False + trained_policies_save_path: str | None = None + trained_policies_load_path: str | None = None + + min_bid_price: float | None = -100.0 + max_bid_price: float | None = 100.0 + + device: str = "cpu" + episodes_collecting_initial_experience: int = 5 + exploration_noise_std: float = 0.2 + training_episodes: int = 100 + validation_episodes_interval: int = 5 + train_freq: str = "24h" + batch_size: int = 128 + gradient_steps: int = 100 + learning_rate: float = 0.001 + learning_rate_schedule: str | None = None + early_stopping_steps: int | None = None + early_stopping_threshold: float = 0.05 + + algorithm: str = "matd3" + replay_buffer_size: int = 50000 + gamma: float = 0.99 + actor_architecture: str = "mlp" + policy_delay: int = 2 + noise_sigma: float = 0.1 + noise_scale: int = 1 + noise_dt: int = 1 + action_noise_schedule: str | None = None + tau: float = 0.005 + target_policy_noise: float = 0.2 + target_noise_clip: float = 0.5 + + def __post_init__(self): + """Calculate defaults that depend on other fields and validate inputs.""" + if self.early_stopping_steps is None: + self.early_stopping_steps = int( + self.training_episodes / self.validation_episodes_interval + 1 + ) + + # if we do not have initial experience collected we will get an error as no samples are available on the + # buffer from which we can draw experience to adapt the strategy, hence we set it to minimum one episode + if self.episodes_collecting_initial_experience < 1: + logger.warning( + f"episodes_collecting_initial_experience need to be at least 1 to sample from buffer, got {self.episodes_collecting_initial_experience}. setting to 1" + ) + + self.episodes_collecting_initial_experience = 1 + + # check that gradient_steps is positive + if self.gradient_steps <= 0: + raise ValueError( + f"gradient_steps need to be positive, got {self.gradient_steps}" + ) + + class LearningStrategy(BaseStrategy): """ A strategy which provides learning functionality, has a method to calculate the reward. @@ -758,6 +897,7 @@ class LearningStrategy(BaseStrategy): act_dim (int): The action dimension. unique_obs_dim (int): The unique observation dimension. num_timeseries_obs_dim (int): The number of observation timeseries dimension. + learning_role (Learning): The learning role orchestrating the learning. Args: *args (list): The arguments. @@ -766,11 +906,11 @@ class LearningStrategy(BaseStrategy): def __init__( self, + learning_role, obs_dim: int, act_dim: int, - unique_obs_dim: int = 0, + unique_obs_dim: int, num_timeseries_obs_dim: int = 3, - learning_role=None, *args, **kwargs, ): @@ -779,6 +919,10 @@ def __init__( """ super().__init__(*args, **kwargs) + # access to the learning_role that orchestrates learning + self.learning_role = learning_role + self.learning_config = learning_role.learning_config + self.obs_dim = obs_dim self.act_dim = act_dim @@ -790,9 +934,6 @@ def __init__( # them into suitable format for recurrent neural networks self.num_timeseries_obs_dim = num_timeseries_obs_dim - # access to the learning_role that orchestrates learning - self.learning_role = learning_role - class MinMaxStrategy(BaseStrategy): pass @@ -804,33 +945,3 @@ class MinMaxChargeStrategy(BaseStrategy): class ExchangeStrategy(BaseStrategy): pass - - -class LearningConfig(TypedDict): - """ - A class for the learning configuration. 
- """ - - continue_learning: bool - min_bid_price: float - max_bid_price: float - learning_mode: bool - algorithm: str - actor_architecture: str - learning_rate: float - learning_rate_schedule: str - training_episodes: int - episodes_collecting_initial_experience: int - train_freq: str - gradient_steps: int - batch_size: int - gamma: float - device: str - noise_sigma: float - noise_scale: int - noise_dt: int - action_noise_schedule: str - trained_policies_save_path: str - trained_policies_load_path: str - early_stopping_steps: int - early_stopping_threshold: float diff --git a/assume/common/outputs.py b/assume/common/outputs.py index ab37b7a66..2d109fab5 100644 --- a/assume/common/outputs.py +++ b/assume/common/outputs.py @@ -191,6 +191,8 @@ def setup(self): ) def on_ready(self): + super().on_ready() + if self.db_uri: self.db = create_engine(self.db_uri) if self.db is not None: diff --git a/assume/reinforcement_learning/algorithms/base_algorithm.py b/assume/reinforcement_learning/algorithms/base_algorithm.py index b40204760..44c0f492f 100644 --- a/assume/reinforcement_learning/algorithms/base_algorithm.py +++ b/assume/reinforcement_learning/algorithms/base_algorithm.py @@ -17,52 +17,25 @@ class RLAlgorithm: Args: learning_role (Learning Role object): Learning object - learning_rate (float): learning rate for adam optimizer - batch_size (int): Minibatch size for each gradient update - tau (float): the soft update coefficient ("Polyak update", between 0 and 1) - gamma (float): the discount factor - gradient_steps (int): how many gradient steps to do after each rollout - policy_delay (int): Policy and target networks will only be updated once every policy_delay steps per training steps. The Q values will be updated policy_delay more often (update every training step) - target_policy_noise (float): Standard deviation of Gaussian noise added to target policy (smoothing noise) - target_noise_clip (float): Limit for absolute value of target policy smoothing noise - actor_architecture (str): type of Actor neural network """ def __init__( self, # init learning_role as object of Learning class learning_role, - learning_rate=1e-4, - batch_size=1024, - tau=0.005, - gamma=0.99, - gradient_steps=100, - policy_delay=2, - target_policy_noise=0.2, - target_noise_clip=0.5, - actor_architecture="mlp", ): super().__init__() self.learning_role = learning_role - self.learning_rate = learning_rate - self.batch_size = batch_size - self.gamma = gamma - self.tau = tau + self.learning_config = learning_role.learning_config - self.gradient_steps = gradient_steps - - self.policy_delay = policy_delay - self.target_noise_clip = target_noise_clip - self.target_policy_noise = target_policy_noise - - if actor_architecture in actor_architecture_aliases.keys(): + if self.learning_config.actor_architecture in actor_architecture_aliases.keys(): self.actor_architecture_class = actor_architecture_aliases[ - actor_architecture + self.learning_config.actor_architecture ] else: raise ValueError( - f"Policy '{actor_architecture}' unknown. Supported architectures are {list(actor_architecture_aliases.keys())}" + f"Policy '{self.learning_config.actor_architecture}' unknown. 
Supported architectures are {list(actor_architecture_aliases.keys())}" ) self.device = self.learning_role.device diff --git a/assume/reinforcement_learning/algorithms/matd3.py b/assume/reinforcement_learning/algorithms/matd3.py index f207ae1bc..5bd4edef5 100644 --- a/assume/reinforcement_learning/algorithms/matd3.py +++ b/assume/reinforcement_learning/algorithms/matd3.py @@ -32,31 +32,9 @@ class TD3(RLAlgorithm): Original paper: https://arxiv.org/pdf/1802.09477.pdf """ - def __init__( - self, - learning_role, - learning_rate=1e-4, - batch_size=1024, - tau=0.005, - gamma=0.99, - gradient_steps=100, - policy_delay=2, - target_policy_noise=0.2, - target_noise_clip=0.5, - actor_architecture="mlp", - ): - super().__init__( - learning_role, - learning_rate, - batch_size, - tau, - gamma, - gradient_steps, - policy_delay, - target_policy_noise, - target_noise_clip, - actor_architecture, - ) + def __init__(self, learning_role): + super().__init__(learning_role) + self.n_updates = 0 self.grad_clip_norm = 1.0 @@ -487,7 +465,7 @@ def update_policy(self): } for u_id in self.learning_role.rl_strats.keys() } - for _ in range(self.gradient_steps) + for _ in range(self.learning_config.gradient_steps) ] # update noise decay and learning rate @@ -510,10 +488,12 @@ def update_policy(self): ) strategy.action_noise.update_noise_decay(updated_noise_decay) - for step in range(self.gradient_steps): + for step in range(self.learning_config.gradient_steps): self.n_updates += 1 - transitions = self.learning_role.buffer.sample(self.batch_size) + transitions = self.learning_role.buffer.sample( + self.learning_config.batch_size + ) states, actions, next_states, rewards = ( transitions.observations, transitions.actions, @@ -523,8 +503,13 @@ def update_policy(self): with th.no_grad(): # Select action according to policy and add clipped noise - noise = th.randn_like(actions) * self.target_policy_noise - noise = noise.clamp(-self.target_noise_clip, self.target_noise_clip) + noise = ( + th.randn_like(actions) * self.learning_config.target_policy_noise + ) + noise = noise.clamp( + -self.learning_config.target_noise_clip, + self.learning_config.target_noise_clip, + ) # Select next actions for all agents next_actions = th.stack( @@ -538,15 +523,15 @@ def update_policy(self): next_actions = next_actions.transpose(0, 1).contiguous() next_actions = next_actions.view(-1, n_rl_agents * self.act_dim) - all_actions = actions.view(self.batch_size, -1) + all_actions = actions.view(self.learning_config.batch_size, -1) # Precompute unique observation parts for all agents unique_obs_from_others = states[ :, :, self.obs_dim - self.unique_obs_dim : - ].reshape(self.batch_size, n_rl_agents, -1) + ].reshape(self.learning_config.batch_size, n_rl_agents, -1) next_unique_obs_from_others = next_states[ :, :, self.obs_dim - self.unique_obs_dim : - ].reshape(self.batch_size, n_rl_agents, -1) + ].reshape(self.learning_config.batch_size, n_rl_agents, -1) ##################################################################### # CRITIC UPDATE: Accumulate losses for all agents, then backprop once @@ -580,15 +565,19 @@ def update_policy(self): # Construct final state representations all_states = th.cat( ( - states[:, i, :].reshape(self.batch_size, -1), - other_unique_obs.reshape(self.batch_size, -1), + states[:, i, :].reshape(self.learning_config.batch_size, -1), + other_unique_obs.reshape(self.learning_config.batch_size, -1), ), dim=1, ) all_next_states = th.cat( ( - next_states[:, i, :].reshape(self.batch_size, -1), - 
other_next_unique_obs.reshape(self.batch_size, -1), + next_states[:, i, :].reshape( + self.learning_config.batch_size, -1 + ), + other_next_unique_obs.reshape( + self.learning_config.batch_size, -1 + ), ), dim=1, ) @@ -600,7 +589,8 @@ def update_policy(self): ) next_q_values, _ = th.min(next_q_values, dim=1, keepdim=True) target_Q_values = ( - rewards[:, i].unsqueeze(1) + self.gamma * next_q_values + rewards[:, i].unsqueeze(1) + + self.learning_config.gamma * next_q_values ) # Get current Q-values estimates for each critic network @@ -643,7 +633,7 @@ def update_policy(self): ###################################################################### # ACTOR UPDATE (DELAYED): Accumulate losses for all agents in one pass ###################################################################### - if self.n_updates % self.policy_delay == 0: + if self.n_updates % self.learning_config.policy_delay == 0: # Zero-grad for all actors first for strategy in strategies: strategy.actor.optimizer.zero_grad(set_to_none=True) @@ -669,8 +659,10 @@ def update_policy(self): ) all_states_i = th.cat( ( - state_i.reshape(self.batch_size, -1), - other_unique_obs.reshape(self.batch_size, -1), + state_i.reshape(self.learning_config.batch_size, -1), + other_unique_obs.reshape( + self.learning_config.batch_size, -1 + ), ), dim=1, ) @@ -680,7 +672,9 @@ def update_policy(self): all_actions_clone[:, i, :] = action_i # Flatten again for the critic - all_actions_clone = all_actions_clone.view(self.batch_size, -1) + all_actions_clone = all_actions_clone.view( + self.learning_config.batch_size, -1 + ) # Calculate actor loss (negative Q1 of the updated action) actor_loss = -critic.q1_forward( @@ -736,7 +730,13 @@ def update_policy(self): all_target_actor_params.extend(strategy.actor_target.parameters()) # Perform batch-wise Polyak update (NO LOOPS) - polyak_update(all_critic_params, all_target_critic_params, self.tau) - polyak_update(all_actor_params, all_target_actor_params, self.tau) + polyak_update( + all_critic_params, + all_target_critic_params, + self.learning_config.tau, + ) + polyak_update( + all_actor_params, all_target_actor_params, self.learning_config.tau + ) self.learning_role.write_rl_grad_params_to_output(learning_rate, unit_params) diff --git a/assume/reinforcement_learning/learning_role.py b/assume/reinforcement_learning/learning_role.py index 8b526d403..a0160b8cc 100644 --- a/assume/reinforcement_learning/learning_role.py +++ b/assume/reinforcement_learning/learning_role.py @@ -37,6 +37,8 @@ class Learning(Role): Args: learning_config (LearningConfig): The configuration for the learning process. + start (datetime.datetime): The start datetime for the simulation. + end (datetime.datetime): The end datetime for the simulation. 
""" @@ -46,47 +48,27 @@ def __init__( start: datetime, end: datetime, ): + super().__init__() + # how many learning roles do exist and how are they named self.buffer: ReplayBuffer = None self.episodes_done = 0 self.rl_strats: dict[int, LearningStrategy] = {} - self.rl_algorithm = learning_config.get("algorithm", "matd3") - self.actor_architecture = learning_config.get("actor_architecture", "mlp") + self.learning_config = learning_config self.critics = {} self.target_critics = {} - # define whether we train model or evaluate it - self.training_episodes = learning_config["training_episodes"] - self.learning_mode = learning_config["learning_mode"] - self.evaluation_mode = learning_config["evaluation_mode"] - self.continue_learning = learning_config["continue_learning"] - self.trained_policies_save_path = learning_config["trained_policies_save_path"] - self.trained_policies_load_path = learning_config.get( - "trained_policies_load_path", self.trained_policies_save_path - ) - - # if early_stopping_steps are not provided then set default to no early stopping (early_stopping_steps need to be greater than validation_episodes) - self.early_stopping_steps = learning_config.get( - "early_stopping_steps", - int( - self.training_episodes - / learning_config.get("validation_episodes_interval", 5) - + 1 - ), - ) - self.early_stopping_threshold = learning_config.get( - "early_stopping_threshold", 0.05 - ) - - cuda_device = ( - learning_config["device"] - if "cuda" in learning_config.get("device", "cpu") + self.device = th.device( + self.learning_config.device + if ( + self.learning_config + and "cuda" in self.learning_config.device + and th.cuda.is_available() + ) else "cpu" ) - self.device = th.device(cuda_device if th.cuda.is_available() else "cpu") - # future: add option to choose between float16 and float32 - # float_type = learning_config.get("float_type", "float32") + # float_type = learning_config.float_type self.float_type = th.float th.backends.cuda.matmul.allow_tf32 = True @@ -98,67 +80,47 @@ def __init__( self.end_datetime = end self.datetime = None + if self.learning_config.learning_mode: + # configure additional learning parameters if we are in learning or evaluation mode + if self.learning_config.learning_rate_schedule == "linear": + self.calc_lr_from_progress = linear_schedule_func( + self.learning_config.learning_rate + ) + else: + self.calc_lr_from_progress = ( + lambda x: self.learning_config.learning_rate + ) - self.learning_rate = learning_config.get("learning_rate", 1e-4) - self.learning_rate_schedule = learning_config.get( - "learning_rate_schedule", None - ) - if self.learning_rate_schedule == "linear": - self.calc_lr_from_progress = linear_schedule_func(self.learning_rate) - else: - self.calc_lr_from_progress = lambda x: self.learning_rate - - noise_dt = learning_config.get("noise_dt", 1) - self.action_noise_schedule = learning_config.get("action_noise_schedule", None) - if self.action_noise_schedule == "linear": - self.calc_noise_from_progress = linear_schedule_func(noise_dt) - else: - self.calc_noise_from_progress = lambda x: noise_dt - - # if we do not have initial experience collected we will get an error as no samples are available on the - # buffer from which we can draw experience to adapt the strategy, hence we set it to minimum one episode - - self.episodes_collecting_initial_experience = max( - learning_config.get("episodes_collecting_initial_experience", 5), 1 - ) - - self.train_freq = learning_config.get("train_freq", "24h") - self.gradient_steps = 
learning_config.get("gradient_steps", 100) - - # check that gradient_steps is positive - if self.gradient_steps <= 0: - raise ValueError( - f"gradient_steps need to be positive, got {self.gradient_steps}" - ) - - self.batch_size = learning_config.get("batch_size", 128) - self.gamma = learning_config.get("gamma", 0.99) - - self.eval_episodes_done = 0 + if self.learning_config.action_noise_schedule == "linear": + self.calc_noise_from_progress = linear_schedule_func( + self.learning_config.noise_dt + ) + else: + self.calc_noise_from_progress = lambda x: self.learning_config.noise_dt - # function that initializes learning, needs to be an extra function so that it can be called after buffer is given to Role - self.create_learning_algorithm(self.rl_algorithm) + self.eval_episodes_done = 0 - # store evaluation values - self.max_eval = defaultdict(lambda: -1e9) - self.rl_eval = defaultdict(list) - # list of avg_changes - self.avg_rewards = [] + # function that initializes learning, needs to be an extra function so that it can be called after buffer is given to Role + self.create_learning_algorithm(self.learning_config.algorithm) - self.tensor_board_logger = None - self.update_steps = None + # store evaluation values + self.max_eval = defaultdict(lambda: -1e9) + self.rl_eval = defaultdict(list) + # list of avg_changes + self.avg_rewards = [] - self.sync_train_freq_with_simulation_horizon() + self.tensor_board_logger = None + self.update_steps = None - # init dictionaries for all learning instances in this role - # Note: we use atomic-swaps later to ensure no overwrites while we write the data into the buffer - # this works since we do not use multi-threading, otherwise threading.locks would be needed here. - self.all_obs = defaultdict(lambda: defaultdict(list)) - self.all_actions = defaultdict(lambda: defaultdict(list)) - self.all_noises = defaultdict(lambda: defaultdict(list)) - self.all_rewards = defaultdict(lambda: defaultdict(list)) - self.all_regrets = defaultdict(lambda: defaultdict(list)) - self.all_profits = defaultdict(lambda: defaultdict(list)) + # init dictionaries for all learning instances in this role + # Note: we use atomic-swaps later to ensure no overwrites while we write the data into the buffer + # this works since we do not use multi-threading, otherwise threading.locks would be needed here. 
+ self.all_obs = defaultdict(lambda: defaultdict(list)) + self.all_actions = defaultdict(lambda: defaultdict(list)) + self.all_noises = defaultdict(lambda: defaultdict(list)) + self.all_rewards = defaultdict(lambda: defaultdict(list)) + self.all_regrets = defaultdict(lambda: defaultdict(list)) + self.all_profits = defaultdict(lambda: defaultdict(list)) def on_ready(self): """ @@ -173,13 +135,13 @@ def on_ready(self): super().on_ready() shifted_start = self.start_datetime + pd.Timedelta( - self.train_freq + self.learning_config.train_freq ) # shift start by hours in time frequency recurrency_task = create_rrule( start=shifted_start, end=self.end_datetime, - freq=self.train_freq, + freq=self.learning_config.train_freq, ) self.context.schedule_recurrent_task( @@ -197,10 +159,10 @@ def sync_train_freq_with_simulation_horizon(self) -> str | None: # ensure train_freq evenly divides simulation length (may adjust self.train_freq) - if not self.learning_mode: + if not self.learning_config.learning_mode: return None - train_freq_str = str(self.train_freq) + train_freq_str = str(self.learning_config.train_freq) try: train_freq = pd.Timedelta(train_freq_str) except Exception: @@ -217,16 +179,16 @@ def sync_train_freq_with_simulation_horizon(self) -> str | None: (total_length / n_intervals).total_seconds() / 3600 ) new_train_freq_str = f"{new_train_freq_hours}h" - self.train_freq = new_train_freq_str + self.learning_config.train_freq = new_train_freq_str logger.warning( f"Simulation length ({total_length}) is not divisible by train_freq ({train_freq_str}). " f"Adjusting train_freq to {new_train_freq_str}." ) - return self.train_freq + return self.learning_config.train_freq - def determine_validation_interval(self, learning_config: LearningConfig) -> int: + def determine_validation_interval(self) -> int: """ Compute and validate validation_interval. @@ -235,17 +197,18 @@ def determine_validation_interval(self, learning_config: LearningConfig) -> int: Raises: ValueError if training_episodes is too small. """ - default_interval = learning_config.get("validation_episodes_interval", 5) - training_episodes = self.training_episodes + default_interval = self.learning_config.validation_episodes_interval + training_episodes = self.learning_config.training_episodes validation_interval = min(training_episodes, default_interval) min_required_episodes = ( - self.episodes_collecting_initial_experience + validation_interval + self.learning_config.episodes_collecting_initial_experience + + validation_interval ) - if training_episodes < min_required_episodes: + if self.learning_config.training_episodes < min_required_episodes: raise ValueError( - f"Training episodes ({training_episodes}) must be greater than the sum of initial experience episodes ({self.episodes_collecting_initial_experience}) and evaluation interval ({validation_interval})." + f"Training episodes ({training_episodes}) must be greater than the sum of initial experience episodes ({self.learning_config.episodes_collecting_initial_experience}) and evaluation interval ({validation_interval})." 
) return validation_interval @@ -298,7 +261,7 @@ async def store_to_buffer_and_update(self) -> None: self.write_rl_params_to_output(cache) # if we are training also update the policy and write data into buffer - if not self.evaluation_mode: + if not self.learning_config.evaluation_mode: # Process cache in background await self._store_to_buffer_and_update_sync(cache, self.device) else: @@ -331,7 +294,10 @@ async def _store_to_buffer_and_update_sync(self, cache, device) -> None: reward=transform_buffer_data(cache["rewards"], device), ) - if self.episodes_done >= self.episodes_collecting_initial_experience: + if ( + self.episodes_done + >= self.learning_config.episodes_collecting_initial_experience + ): self.rl_algorithm.update_policy() def add_observation_to_cache(self, unit_id, start, observation) -> None: @@ -397,11 +363,14 @@ def load_inter_episodic_data(self, inter_episodic_data): self.initialize_policy(inter_episodic_data["actors_and_critics"]) # Disable initial exploration if initial experience collection is complete - if self.episodes_done >= self.episodes_collecting_initial_experience: + if ( + self.episodes_done + >= self.learning_config.episodes_collecting_initial_experience + ): self.turn_off_initial_exploration() # In continue_learning mode, disable it only for loaded strategies - elif self.continue_learning: + elif self.learning_config.continue_learning: self.turn_off_initial_exploration(loaded_only=True) def get_inter_episodic_data(self): @@ -448,16 +417,23 @@ def get_progress_remaining(self) -> float: elapsed_duration = self.context.current_timestamp - self.start learning_episodes = ( - self.training_episodes - self.episodes_collecting_initial_experience + self.learning_config.training_episodes + - self.learning_config.episodes_collecting_initial_experience ) - if self.episodes_done < self.episodes_collecting_initial_experience: + if ( + self.episodes_done + < self.learning_config.episodes_collecting_initial_experience + ): progress_remaining = 1 else: progress_remaining = ( 1 - ( - (self.episodes_done - self.episodes_collecting_initial_experience) + ( + self.episodes_done + - self.learning_config.episodes_collecting_initial_experience + ) / learning_episodes ) - ((1 / learning_episodes) * (elapsed_duration / total_duration)) @@ -476,14 +452,7 @@ def create_learning_algorithm(self, algorithm: RLAlgorithm): algorithm (RLAlgorithm): The name of the reinforcement learning algorithm. """ if algorithm == "matd3": - self.rl_algorithm = TD3( - learning_role=self, - learning_rate=self.learning_rate, - gradient_steps=self.gradient_steps, - batch_size=self.batch_size, - gamma=self.gamma, - actor_architecture=self.actor_architecture, - ) + self.rl_algorithm = TD3(learning_role=self) else: logger.error(f"Learning algorithm {algorithm} not implemented!") @@ -498,14 +467,17 @@ def initialize_policy(self, actors_and_critics: dict = None) -> None: self.rl_algorithm.initialize_policy(actors_and_critics) - if self.continue_learning is True and actors_and_critics is None: - directory = self.trained_policies_load_path - if Path(directory).is_dir(): + if ( + self.learning_config.continue_learning is True + and actors_and_critics is None + ): + directory = self.learning_config.trained_policies_load_path + if directory and Path(directory).is_dir(): logger.info(f"Loading pretrained policies from {directory}!") self.rl_algorithm.load_params(directory) else: raise FileNotFoundError( - f"Directory {directory} does not exist! Cannot load pretrained policies!" + f"Directory {directory} does not exist! 
Cannot load pretrained policies from trained_policies_load_path!" ) def compare_and_save_policies(self, metrics: dict) -> bool: @@ -545,7 +517,7 @@ def compare_and_save_policies(self, metrics: dict) -> bool: if metric == list(metrics.keys())[0]: # store the best for our current metric in its folder self.rl_algorithm.save_params( - directory=f"{self.trained_policies_save_path}/{metric}_eval_policies" + directory=f"{self.learning_config.trained_policies_save_path}/{metric}_eval_policies" ) logger.info( @@ -557,14 +529,20 @@ def compare_and_save_policies(self, metrics: dict) -> bool: ) # if we do not see any improvement in the last x evaluation runs we stop the training - if len(self.rl_eval[metric]) >= self.early_stopping_steps: + if len(self.rl_eval[metric]) >= self.learning_config.early_stopping_steps: self.avg_rewards.append( - sum(self.rl_eval[metric][-self.early_stopping_steps :]) - / self.early_stopping_steps + sum( + self.rl_eval[metric][ + -self.learning_config.early_stopping_steps : + ] + ) + / self.learning_config.early_stopping_steps ) - if len(self.avg_rewards) >= self.early_stopping_steps: - recent_rewards = self.avg_rewards[-self.early_stopping_steps :] + if len(self.avg_rewards) >= self.learning_config.early_stopping_steps: + recent_rewards = self.avg_rewards[ + -self.learning_config.early_stopping_steps : + ] min_reward = min(recent_rewards) max_reward = max(recent_rewards) @@ -573,21 +551,22 @@ def compare_and_save_policies(self, metrics: dict) -> bool: abs(min_reward), 1e-8 ) # Use small value to avoid zero-division - avg_change = (max_reward - min_reward) / denominator + avg_change = abs((max_reward - min_reward) / denominator) - if avg_change < self.early_stopping_threshold: + if avg_change < self.learning_config.early_stopping_threshold: logger.info( - f"Stopping training as no improvement above {self.early_stopping_threshold*100}% in last {self.early_stopping_steps} evaluations for {metric}" + f"Stopping training as no improvement above {self.learning_config.early_stopping_threshold*100}% in last {self.learning_config.early_stopping_steps} evaluations for {metric}" ) if ( - self.learning_rate_schedule or self.action_noise_schedule + self.learning_config.learning_rate_schedule + or self.learning_config.action_noise_schedule ) is not None: logger.info( - f"Learning rate schedule ({self.learning_rate_schedule}) or action noise schedule ({self.action_noise_schedule}) were scheduled to decay, further learning improvement can be possible. End value of schedule may not have been reached." + f"Learning rate schedule ({self.learning_config.learning_rate_schedule}) or action noise schedule ({self.learning_config.action_noise_schedule}) were scheduled to decay, further learning improvement can be possible. End value of schedule may not have been reached." ) self.rl_algorithm.save_params( - directory=f"{self.trained_policies_save_path}/last_policies" + directory=f"{self.learning_config.trained_policies_save_path}/last_policies" ) return True @@ -610,20 +589,21 @@ def init_logging( Args: simulation_id (str): The unique identifier for the simulation. + episode (int): The current training episode number. + eval_episode (int): The current evaluation episode number. db_uri (str): URI for connecting to the database. output_agent_addr (str): The address of the output agent. train_start (str): The start time of simulation. - freq (str): The frequency of simulation. 
""" self.tensor_board_logger = TensorBoardLogger( simulation_id=simulation_id, db_uri=db_uri, - learning_mode=self.learning_mode, - evaluation_mode=self.evaluation_mode, + learning_mode=self.learning_config.learning_mode, + evaluation_mode=self.learning_config.evaluation_mode, episode=episode, eval_episode=eval_episode, - episodes_collecting_initial_experience=self.episodes_collecting_initial_experience, + episodes_collecting_initial_experience=self.learning_config.episodes_collecting_initial_experience, ) # Parameters required for sending data to the output role @@ -698,19 +678,23 @@ def write_rl_grad_params_to_output( """ # gradient steps performed in previous training episodes gradient_steps_done = ( - max(self.episodes_done - self.episodes_collecting_initial_experience, 0) + max( + self.episodes_done + - self.learning_config.episodes_collecting_initial_experience, + 0, + ) * int( (timestamp2datetime(self.end) - timestamp2datetime(self.start)) - / pd.Timedelta(self.train_freq) + / pd.Timedelta(self.learning_config.train_freq) ) - * self.gradient_steps + * self.learning_config.gradient_steps ) output_list = [ { "step": gradient_steps_done + self.update_steps - * self.gradient_steps # gradient steps performed in current training episode + * self.learning_config.gradient_steps # gradient steps performed in current training episode + gradient_step, "unit": u_id, "actor_loss": params["actor_loss"], @@ -721,7 +705,7 @@ def write_rl_grad_params_to_output( "critic_max_grad_norm": params["critic_max_grad_norm"], "learning_rate": learning_rate, } - for gradient_step in range(self.gradient_steps) + for gradient_step in range(self.learning_config.gradient_steps) for u_id, params in unit_params_list[gradient_step].items() ] diff --git a/assume/reinforcement_learning/neural_network_architecture.py b/assume/reinforcement_learning/neural_network_architecture.py index dc1e1e591..44969a7be 100644 --- a/assume/reinforcement_learning/neural_network_architecture.py +++ b/assume/reinforcement_learning/neural_network_architecture.py @@ -22,7 +22,7 @@ def __init__( obs_dim: int, act_dim: int, float_type, - unique_obs_dim: int = 0, + unique_obs_dim: int, ): super().__init__() @@ -197,7 +197,7 @@ def __init__( obs_dim: int, act_dim: int, float_type, - unique_obs_dim: int = 0, + unique_obs_dim: int, num_timeseries_obs_dim: int = 3, *args, **kwargs, diff --git a/assume/scenario/loader_csv.py b/assume/scenario/loader_csv.py index c5000c1cb..810e964b1 100644 --- a/assume/scenario/loader_csv.py +++ b/assume/scenario/loader_csv.py @@ -16,7 +16,6 @@ import yaml from tqdm import tqdm -from assume.common.base import LearningConfig from assume.common.exceptions import AssumeException from assume.common.forecast_initialisation import ForecastInitialisation from assume.common.forecaster import ( @@ -756,26 +755,34 @@ def setup_world( "Disable CSV export to save data at regular intervals (export_csv_path = '')." 
) - learning_config: LearningConfig = config.get("learning_config", {}) bidding_params = config.get("bidding_strategy_params", {}) - learning_config["learning_mode"] = config.get("learning_mode", False) - learning_config["evaluation_mode"] = evaluation_mode - - if terminate_learning: - learning_config["learning_mode"] = False - learning_config["evaluation_mode"] = False - - if not learning_config.get("trained_policies_save_path"): - learning_config["trained_policies_save_path"] = ( - f"learned_strategies/{simulation_id}" - ) + # handle initial learning parameters before leanring_role exists + learning_dict = config.get("learning_config", {}) + # those settings need to be overridden before passing to the LearningConfig + if learning_dict: + # make sure that continue_learning implies learning_mode + if learning_dict.get("continue_learning"): + learning_dict["learning_mode"] = True + # determined by learning loop in run_learning() + learning_dict["evaluation_mode"] = evaluation_mode + + if terminate_learning: + learning_dict["learning_mode"] = False + learning_dict["evaluation_mode"] = False + + # default path for saving trained policies is set here because + # a) depends on the simulation_id + # b) it is set relative to inputs_path in replace_paths() below + if not learning_dict.get("trained_policies_save_path"): + learning_dict["trained_policies_save_path"] = ( + f"learned_strategies/{simulation_id}" + ) - if not learning_config.get("trained_policies_load_path"): - learning_config["trained_policies_load_path"] = ( - f"learned_strategies/{simulation_id}/avg_reward_eval_policies" - ) + # learning mode always needed for reading units below + learning_mode = learning_dict.get("learning_mode", False) + # all paths should be relative to the inputs_path config = replace_paths(config, scenario_data["path"]) world.reset() @@ -785,7 +792,7 @@ def setup_world( end=end, save_frequency_hours=save_frequency_hours, simulation_id=simulation_id, - learning_config=learning_config, + learning_dict=learning_dict, episode=episode, eval_episode=eval_episode, bidding_params=bidding_params, @@ -823,7 +830,7 @@ def setup_world( unit_type="power_plant", forecaster=unit_forecasts, world_bidding_strategies=world.bidding_strategies, - learning_mode=learning_config["learning_mode"], + learning_mode=learning_mode, ) storage_units = read_units( @@ -831,7 +838,7 @@ def setup_world( unit_type="storage", forecaster=unit_forecasts, world_bidding_strategies=world.bidding_strategies, - learning_mode=learning_config["learning_mode"], + learning_mode=learning_mode, ) demand_units = read_units( @@ -839,7 +846,7 @@ def setup_world( unit_type="demand", forecaster=unit_forecasts, world_bidding_strategies=world.bidding_strategies, - learning_mode=learning_config["learning_mode"], + learning_mode=learning_mode, ) exchange_units = read_units( @@ -856,7 +863,7 @@ def setup_world( unit_type=unit_type, forecaster=unit_forecasts, world_bidding_strategies=world.bidding_strategies, - learning_mode=learning_config["learning_mode"], + learning_mode=learning_mode, ) for op, op_units in dsm_units.items(): units[op].extend(op_units) @@ -1047,8 +1054,8 @@ def run_learning( world.learning_role.rl_algorithm.initialize_policy() # check if we already stored policies for this simulation - save_path = world.learning_config["trained_policies_save_path"] - continue_learning = world.learning_config.get("continue_learning", False) + save_path = world.learning_role.learning_config.trained_policies_save_path + continue_learning = 
world.learning_role.learning_config.continue_learning confirm_learning_save_path(save_path, continue_learning) # also remove tensorboard logs @@ -1060,7 +1067,7 @@ def run_learning( # Information that needs to be stored across episodes, aka one simulation run inter_episodic_data = { "buffer": ReplayBuffer( - buffer_size=int(world.learning_config.get("replay_buffer_size", 5e5)), + buffer_size=world.learning_role.learning_config.replay_buffer_size, obs_dim=world.learning_role.rl_algorithm.obs_dim, act_dim=world.learning_role.rl_algorithm.act_dim, n_rl_units=len(world.learning_role.rl_strats), @@ -1077,10 +1084,9 @@ def run_learning( world.learning_role.load_inter_episodic_data(inter_episodic_data) - validation_interval = world.learning_role.determine_validation_interval( - world.learning_config - ) + validation_interval = world.learning_role.determine_validation_interval() + # sync train frequency with simulation horizon once at the beginning of training and overwrite scenario data world.scenario_data["config"]["learning_config"]["train_freq"] = ( world.learning_role.sync_train_freq_with_simulation_horizon() ) @@ -1088,7 +1094,7 @@ def run_learning( eval_episode = 1 for episode in tqdm( - range(1, world.learning_role.training_episodes + 1), + range(1, world.learning_role.learning_config.training_episodes + 1), desc="Training Episodes", ): # ----------------------------------------- @@ -1113,7 +1119,7 @@ def run_learning( if ( episode % validation_interval == 0 and episode - >= world.learning_role.episodes_collecting_initial_experience + >= world.learning_role.learning_config.episodes_collecting_initial_experience + validation_interval ): world.reset() @@ -1161,11 +1167,11 @@ def run_learning( # save the policies after each episode in case the simulation is stopped or crashes if ( episode - >= world.learning_role.episodes_collecting_initial_experience + >= world.learning_role.learning_config.episodes_collecting_initial_experience + validation_interval ): world.learning_role.rl_algorithm.save_params( - directory=f"{world.learning_role.trained_policies_save_path}/last_policies" + directory=f"{world.learning_role.learning_config.trained_policies_save_path}/last_policies" ) # container shutdown implicitly with new initialisation @@ -1175,11 +1181,9 @@ def run_learning( world.reset() - # Set 'trained_policies_load_path' to None in order to load the most recent policies, - # especially if previous strategies were loaded from an external source. - # This is useful when continuing from a previous learning session. 
+ # latest policies for final simulation run world.scenario_data["config"]["learning_config"]["trained_policies_load_path"] = ( - f"{world.learning_role.trained_policies_save_path}/avg_reward_eval_policies" + f"{world.learning_role.learning_config.trained_policies_save_path}/last_policies" ) # load scenario for evaluation diff --git a/assume/strategies/learning_strategies.py b/assume/strategies/learning_strategies.py index 5a05a9336..11464a7a4 100644 --- a/assume/strategies/learning_strategies.py +++ b/assume/strategies/learning_strategies.py @@ -17,7 +17,6 @@ SupportsMinMax, SupportsMinMaxCharge, ) -from assume.common.exceptions import AssumeException from assume.common.fast_pandas import FastSeries from assume.common.market_objects import MarketConfig, Orderbook, Product from assume.common.utils import min_max_scale @@ -38,17 +37,18 @@ def __init__(self, *args, **kwargs): self.unit_id = kwargs["unit_id"] # defines bounds of actions space - self.min_bid_price = kwargs.get("min_bid_price", -100) - self.max_bid_price = kwargs.get("max_bid_price", 100) + self.min_bid_price = self.learning_config.min_bid_price + self.max_bid_price = self.learning_config.max_bid_price # tells us whether we are training the agents or just executing per-learning strategies - self.learning_mode = kwargs.get("learning_mode", False) - self.evaluation_mode = kwargs.get("evaluation_mode", False) + self.learning_mode = self.learning_config.learning_mode + self.evaluation_mode = self.learning_config.evaluation_mode # based on learning config - self.algorithm = kwargs.get("algorithm", "matd3") - self.actor_architecture = kwargs.get("actor_architecture", "mlp") + self.algorithm = self.learning_config.algorithm + self.actor_architecture = self.learning_config.actor_architecture + # check if actor architecture is available if self.actor_architecture in actor_architecture_aliases.keys(): self.actor_architecture_class = actor_architecture_aliases[ self.actor_architecture @@ -59,42 +59,39 @@ def __init__(self, *args, **kwargs): ) # sets the device of the actor network - device = kwargs.get("device", "cpu") - self.device = th.device(device if th.cuda.is_available() else "cpu") - if self.learning_mode and not self.learning_role: - raise AssumeException("Learning Role must be set in LearningMode") - - # always use CPU in evaluation mode for performance reasons - if not self.learning_mode: - self.device = th.device("cpu") + self.device = self.learning_role.device # future: add option to choose between float16 and float32 # float_type = kwargs.get("float_type", "float32") self.float_type = th.float # define standard deviation for the initial exploration noise - self.exploration_noise_std = kwargs.get("exploration_noise_std", 0.2) + self.exploration_noise_std = self.learning_config.exploration_noise_std if self.learning_mode or self.evaluation_mode: - self.collect_initial_experience_mode = bool( - kwargs.get("episodes_collecting_initial_experience", True) - ) + # learning role overwrites this if loaded from file or after initial experience episodes + self.collect_initial_experience_mode = True self.action_noise = NormalActionNoise( mu=0.0, - sigma=kwargs.get("noise_sigma", 0.1), + sigma=self.learning_config.noise_sigma, action_dimension=self.act_dim, - scale=kwargs.get("noise_scale", 1.0), - dt=kwargs.get("noise_dt", 1.0), + scale=self.learning_config.noise_scale, + dt=self.learning_config.noise_dt, ) self.learning_role.register_strategy(self) - elif Path(kwargs["trained_policies_load_path"]).is_dir(): - 
self.load_actor_params(load_path=kwargs["trained_policies_load_path"]) + # actor policies are only loaded here from file if learning mode is off (otherwise handled by learning_role) + # i.e., when loading pre-trained strategies without training ("learning_mode: false" and "trained_policies_load_path" specified in config) + # or final simulation run after training (terminate_learning == true) + elif Path(self.learning_config.trained_policies_load_path).is_dir(): + self.load_actor_params( + load_path=self.learning_config.trained_policies_load_path + ) else: raise FileNotFoundError( - f"No policies were provided for DRL unit {self.unit_id}!. Please provide a valid path to the trained policies." + f"No policies were provided for DRL unit {self.unit_id}!. Please provide a valid path to the trained policies. Expected them under filepath '{self.learning_config.trained_policies_load_path}'." ) def load_actor_params(self, load_path): @@ -725,7 +722,13 @@ class EnergyLearningSingleBidStrategy(EnergyLearningStrategy, MinMaxStrategy): """ def __init__(self, *args, **kwargs): + obs_dim = kwargs.pop("obs_dim", 74) + act_dim = kwargs.pop("act_dim", 1) + unique_obs_dim = kwargs.pop("unique_obs_dim", 2) super().__init__( + obs_dim=obs_dim, + act_dim=act_dim, + unique_obs_dim=unique_obs_dim, *args, **kwargs, ) diff --git a/assume/world.py b/assume/world.py index 24b4d1f37..3b987d428 100644 --- a/assume/world.py +++ b/assume/world.py @@ -186,7 +186,7 @@ def setup( simulation_id: str, save_frequency_hours, bidding_params: dict = {}, - learning_config: LearningConfig = {}, + learning_dict: dict = {}, episode: int = 1, eval_episode: int = 1, manager_address=None, @@ -202,7 +202,7 @@ def setup( simulation_id (str): The unique identifier for the simulation. save_frequency_hours (int): The frequency (in hours) at which to save simulation data. bidding_params (dict, optional): Parameters for bidding. Defaults to an empty dictionary. - learning_config (LearningConfig, optional): Configuration for the learning process. Defaults to an empty configuration. + learning_config (dict | None, optional): Configuration for the learning process. Defaults to None. manager_address: The address of the manager. **kwargs: Additional keyword arguments. 
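For context on the hunk that follows: World.setup() now receives the raw dict parsed from the scenario's learning_config YAML section and wraps it into the new dataclass exactly once. A minimal sketch of that hand-off, assuming keys taken from the example configs later in this diff (the path value is illustrative, since the loader normally derives it from the simulation_id):

from assume.common.base import LearningConfig

# as parsed by the loader from the "learning_config" section of config.yaml
learning_dict = {
    "learning_mode": True,
    "continue_learning": False,
    "max_bid_price": 100,
    "trained_policies_save_path": "learned_strategies/example_02a",  # illustrative value
}

# unlike the old TypedDict (never checked at runtime), an unknown or misspelled
# key now raises a TypeError here, so configuration errors surface immediately
learning_config = LearningConfig(**learning_dict)

When the YAML section is absent, the loader passes an empty dict and self.learning_config stays None, which is also what the new LearningStrategy guard in _prepare_bidding_strategies checks against.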
@@ -221,10 +221,15 @@ def setup( self.simulation_id = simulation_id self.start = start self.end = end - self.learning_config = learning_config + + if not learning_dict: + self.learning_config: LearningConfig = None + else: + self.learning_config = LearningConfig(**learning_dict) + # initiate learning if the learning mode is on and hence we want to learn new strategies - self.learning_mode = self.learning_config.get("learning_mode", False) - self.evaluation_mode = self.learning_config.get("evaluation_mode", False) + self.learning_mode = learning_dict.get("learning_mode", False) + self.evaluation_mode = learning_dict.get("evaluation_mode", False) # initialize a config dictionary for the scenario data if not already present if not self.scenario_data.get("config"): @@ -233,13 +238,14 @@ def setup( # make a descriptor for the tqdm progress bar # use simulation_id of not in learning mode; use Episode ID if in learning mode # and use Evaluation Episode ID if in evaluation mode - self.simulation_desc = ( - simulation_id - if not self.learning_mode - else f"Training Episode {episode}" - if not self.evaluation_mode - else f"Evaluation Episode {eval_episode}" - ) + self.simulation_desc = simulation_id + + # update simulation description when learning + if self.learning_config: + if self.learning_config.evaluation_mode: + self.simulation_desc = f"Evaluation Episode {eval_episode}" + elif self.learning_mode: + self.simulation_desc = f"Training Episode {episode}" self.bidding_params = bidding_params @@ -281,7 +287,11 @@ def setup( # self.clock_agent.stopped.add_done_callback(stop) self.container.register(self.clock_agent, suggested_aid="clock_agent") else: - self.setup_learning(episode=episode, eval_episode=eval_episode) + if self.learning_config: + self.setup_learning( + episode=episode, + eval_episode=eval_episode, + ) self.setup_output_agent( save_frequency_hours=save_frequency_hours, @@ -300,16 +310,15 @@ def setup_learning(self, episode: int, eval_episode: int) -> None: the RL agent and adds the learning role to it for further processing. 
""" - self.bidding_params.update(self.learning_config) - - if self.learning_mode or self.evaluation_mode: - # if so, we initiate the rl learning role with parameters - from assume.reinforcement_learning.learning_role import Learning + from assume.reinforcement_learning.learning_role import Learning - self.learning_role = Learning( - self.learning_config, start=self.start, end=self.end - ) + # create LearningConfig object + self.learning_role = Learning( + learning_config=self.learning_config, start=self.start, end=self.end + ) + if self.learning_config.learning_mode or self.learning_config.evaluation_mode: + # if so, we initiate the rl learning role with parameters rl_agent = agent_composed_of( self.learning_role, register_in=self.container, @@ -326,9 +335,6 @@ def setup_learning(self, episode: int, eval_episode: int) -> None: train_start=self.start, ) - else: - self.learning_role = None - def setup_output_agent( self, save_frequency_hours: int, @@ -537,11 +543,13 @@ def _prepare_bidding_strategies(self, unit_params, unit_id): if strategy not in strategy_instances: # check if created cache has learning_strategy - if ( - issubclass(self.bidding_strategies[strategy], LearningStrategy) - and self.learning_mode - ): + if issubclass(self.bidding_strategies[strategy], LearningStrategy): # add learning role to the strategy to have access to store training data etc + if self.learning_config is None: + raise ValueError( + f"Learning strategy '{strategy}' requires a configured 'learning_config', but none was set. " + "Specify learning_config in config.yaml." + ) strategy_instances[strategy] = self.bidding_strategies[strategy]( unit_id=unit_id, learning_role=self.learning_role, diff --git a/assume_cli/cli.py b/assume_cli/cli.py index ed0f290ee..f0bd9cbe6 100644 --- a/assume_cli/cli.py +++ b/assume_cli/cli.py @@ -159,7 +159,7 @@ def cli(args=None): logging.info(f"loaded {args.scenario} - {args.case_study}") - if world.learning_config.get("learning_mode", False): + if world.learning_mode: run_learning(world) world.run() diff --git a/docker_configs/dashboard-definitions/ASSUME_Learning.json b/docker_configs/dashboard-definitions/ASSUME_Learning.json index 0a28d52d3..b819591c5 100644 --- a/docker_configs/dashboard-definitions/ASSUME_Learning.json +++ b/docker_configs/dashboard-definitions/ASSUME_Learning.json @@ -25,7 +25,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": 5, + "id": 3, "links": [], "panels": [ { @@ -50,7 +50,7 @@ "content": "# Welcome to Learning Dashboard by ASSUME\n", "mode": "markdown" }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.2.1", "title": "", "type": "text" }, @@ -76,7 +76,7 @@ "content": "## How to Use the Dashboard \n\nThis interactive tool provides insights into the learning process of our reinforcement learning (RL) agent. While the TensorBoard integration (compare ReadMe) provides an overview of the general learning performance of all agents, this dashboard gives you the option to inspect **each agent individually or in comparison**. To effectively navigate the dashboard, keep the following key points in mind: \n\n### 1. Dashboard Structure \n\nThe dashboard is divided into two main rows and two columns: \n\n**COLUMNS** \n- **Training Episode:** Depicts the results of the selected training episodes at the top of the dashboard. Note that training episodes have special properties such as noise added for exploration purposes. \n- **Evaluation Episode:** Depicts the results of the selected evaluation episodes. 
\n\n**ROWS** \n- **Per Simulation:** Summarizes results across all simulated episodes, distinguishing between \"training\" and \"evaluation\" episodes. \n- **Per Episode:** Focuses on specific units and episodes, allowing for detailed analysis. \n\n### 2. Selecting Metrics and Simulations \n\nAt the top of the dashboard, you can filter data using the following options: \n\n- **Simulation ID:** Choose the simulation run you want to analyze. \n- **Unit & Episode:** Select specific units and episodes for detailed insights. You can also select all units, but be cautious—visualizing a large number of agents may cause performance issues. \n\nThe selected metrics will be displayed in corresponding visualization areas. \n\n### 3. Interacting with Plots \n\nThe plots are fully interactive: \n\n- **Zoom** in and out to explore trends. \n- **Click** on data points for detailed values. \n- **Hover** over the upper-left corner of each plot for additional insights. \n\n### How to Interpret the Data \n\nTo make the most of the dashboard, follow these key steps: \n\n1. **Understand Learning Trends:** \n - Observe how **reward** and **profitability** metrics evolve over time. \n - Look for an upward trend in evaluation runs (where exploration is off). Unlike fluctuating training rewards, evaluation rewards should show a steady increase. \n\n2. **Analyze Policy & Action Patterns:** \n - Examine the agent's learned policy and action distribution. \n - Identify patterns and assess whether they align with\n", "mode": "markdown" }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.2.1", "title": "", "type": "text" }, @@ -102,7 +102,7 @@ "content": "## Training Episode ", "mode": "markdown" }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.2.1", "title": "", "type": "text" }, @@ -127,7 +127,7 @@ "content": "## Evaluation Episode", "mode": "markdown" }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.2.1", "title": "", "type": "text" }, @@ -182,7 +182,7 @@ "steps": [ { "color": "green", - "value": null + "value": 0 } ] } @@ -233,12 +233,13 @@ ], "seriesMapping": "auto", "tooltip": { + "hideZeros": false, "maxHeight": 600, "mode": "single", "sort": "none" } }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.2.1", "targets": [ { "datasource": { @@ -380,7 +381,7 @@ "steps": [ { "color": "green", - "value": null + "value": 0 } ] } @@ -431,12 +432,13 @@ ], "seriesMapping": "auto", "tooltip": { + "hideZeros": false, "maxHeight": 600, "mode": "single", "sort": "none" } }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.2.1", "targets": [ { "datasource": { @@ -591,7 +593,7 @@ "steps": [ { "color": "green", - "value": null + "value": 0 } ] } @@ -642,12 +644,13 @@ ], "seriesMapping": "auto", "tooltip": { + "hideZeros": false, "maxHeight": 600, "mode": "single", "sort": "none" } }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.2.1", "targets": [ { "datasource": { @@ -789,7 +792,7 @@ "steps": [ { "color": "green", - "value": null + "value": 0 } ] } @@ -840,12 +843,13 @@ ], "seriesMapping": "auto", "tooltip": { + "hideZeros": false, "maxHeight": 600, "mode": "single", "sort": "none" } }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.2.1", "targets": [ { "datasource": { @@ -987,7 +991,7 @@ "steps": [ { "color": "green", - "value": null + "value": 0 } ] } @@ -1038,12 +1042,13 @@ ], "seriesMapping": "auto", "tooltip": { + "hideZeros": false, "maxHeight": 600, "mode": "single", "sort": "none" } }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.2.1", "targets": [ { "datasource": { @@ -1185,7 
+1190,7 @@ "steps": [ { "color": "green", - "value": null + "value": 0 } ] } @@ -1236,12 +1241,13 @@ ], "seriesMapping": "auto", "tooltip": { + "hideZeros": false, "maxHeight": 600, "mode": "single", "sort": "none" } }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.2.1", "targets": [ { "datasource": { @@ -1392,6 +1398,7 @@ "type": "linear" }, "showPoints": "auto", + "showValues": false, "spanNulls": false, "stacking": { "group": "A", @@ -1407,7 +1414,7 @@ "steps": [ { "color": "green", - "value": null + "value": 0 } ] } @@ -1447,12 +1454,13 @@ "showLegend": true }, "tooltip": { + "hideZeros": false, "maxHeight": 600, "mode": "single", "sort": "none" } }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.2.1", "targets": [ { "datasource": { @@ -1587,6 +1595,7 @@ "type": "linear" }, "showPoints": "auto", + "showValues": false, "spanNulls": false, "stacking": { "group": "A", @@ -1602,7 +1611,7 @@ "steps": [ { "color": "green", - "value": null + "value": 0 } ] } @@ -1642,12 +1651,13 @@ "showLegend": true }, "tooltip": { + "hideZeros": false, "maxHeight": 600, "mode": "single", "sort": "none" } }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.2.1", "targets": [ { "datasource": { @@ -1782,6 +1792,7 @@ "type": "linear" }, "showPoints": "auto", + "showValues": false, "spanNulls": false, "stacking": { "group": "A", @@ -1797,7 +1808,7 @@ "steps": [ { "color": "green", - "value": null + "value": 0 } ] } @@ -1837,12 +1848,13 @@ "showLegend": true }, "tooltip": { + "hideZeros": false, "maxHeight": 600, "mode": "single", "sort": "none" } }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.2.1", "targets": [ { "datasource": { @@ -1977,6 +1989,7 @@ "type": "linear" }, "showPoints": "auto", + "showValues": false, "spanNulls": false, "stacking": { "group": "A", @@ -1992,7 +2005,7 @@ "steps": [ { "color": "green", - "value": null + "value": 0 } ] } @@ -2032,12 +2045,13 @@ "showLegend": true }, "tooltip": { + "hideZeros": false, "maxHeight": 600, "mode": "single", "sort": "none" } }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.2.1", "targets": [ { "datasource": { @@ -2172,6 +2186,7 @@ "type": "linear" }, "showPoints": "auto", + "showValues": false, "spanNulls": false, "stacking": { "group": "A", @@ -2189,7 +2204,7 @@ "steps": [ { "color": "green", - "value": null + "value": 0 } ] } @@ -2215,12 +2230,13 @@ "showLegend": true }, "tooltip": { + "hideZeros": false, "maxHeight": 600, "mode": "single", "sort": "none" } }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.2.1", "targets": [ { "datasource": { @@ -2339,12 +2355,9 @@ "title": "Training ACTIONS during Episode ${episode}", "transformations": [ { - "id": "partitionByValues", + "id": "prepareTimeSeries", "options": { - "fields": [ - "unit" - ], - "keepFields": false + "format": "multi" } } ], @@ -2384,6 +2397,7 @@ "type": "linear" }, "showPoints": "auto", + "showValues": false, "spanNulls": false, "stacking": { "group": "A", @@ -2401,7 +2415,7 @@ "steps": [ { "color": "green", - "value": null + "value": 0 } ] } @@ -2427,12 +2441,13 @@ "showLegend": true }, "tooltip": { + "hideZeros": false, "maxHeight": 600, "mode": "single", "sort": "none" } }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.2.1", "targets": [ { "datasource": { @@ -2551,12 +2566,9 @@ "title": "Evaluation ACTIONS during Eval Episode ${eval_episode}", "transformations": [ { - "id": "partitionByValues", + "id": "prepareTimeSeries", "options": { - "fields": [ - "unit" - ], - "keepFields": false + "format": "multi" } } ], @@ -2565,7 +2577,7 @@ ], "preload": 
false, "refresh": "", - "schemaVersion": 40, + "schemaVersion": 42, "tags": [], "templating": { "list": [ @@ -2673,6 +2685,5 @@ "timezone": "", "title": "ASSUME: Training progress", "uid": "JKQzx0q4k", - "version": 20, - "weekStart": "" + "version": 13 } diff --git a/examples/examples.py b/examples/examples.py index 951e3e8a7..f334a4ceb 100644 --- a/examples/examples.py +++ b/examples/examples.py @@ -155,7 +155,7 @@ # unit_type="custom_unit", # ) - if world.learning_config.get("learning_mode", False): + if world.learning_mode: # run learning if learning mode is enabled run_learning(world) diff --git a/examples/inputs/example_02a/config.yaml b/examples/inputs/example_02a/config.yaml index 292804bdb..a1a6cc9c1 100644 --- a/examples/inputs/example_02a/config.yaml +++ b/examples/inputs/example_02a/config.yaml @@ -6,10 +6,10 @@ base: start_date: 2019-03-01 00:00 end_date: 2019-03-31 00:00 time_step: 1h - learning_mode: true save_frequency_hours: null learning_config: + learning_mode: true continue_learning: false trained_policies_save_path: null trained_policies_load_path: null @@ -51,10 +51,10 @@ base_lstm: start_date: 2019-03-01 00:00 end_date: 2019-03-31 00:00 time_step: 1h - learning_mode: true save_frequency_hours: null learning_config: + learning_mode: true continue_learning: false trained_policies_save_path: null max_bid_price: 100 @@ -97,10 +97,10 @@ tiny: start_date: 2019-01-01 00:00 end_date: 2019-01-05 00:00 time_step: 1h - learning_mode: true save_frequency_hours: null learning_config: + learning_mode: true continue_learning: false trained_policies_save_path: null max_bid_price: 100 diff --git a/examples/inputs/example_02b/config.yaml b/examples/inputs/example_02b/config.yaml index b70e4b992..6815fae79 100644 --- a/examples/inputs/example_02b/config.yaml +++ b/examples/inputs/example_02b/config.yaml @@ -7,9 +7,9 @@ base: end_date: 2019-04-01 00:00 time_step: 1h save_frequency_hours: null - learning_mode: True learning_config: + learning_mode: True continue_learning: false trained_policies_save_path: null trained_policies_load_path: null @@ -51,9 +51,9 @@ base_lstm: end_date: 2019-04-01 00:00 time_step: 1h save_frequency_hours: null - learning_mode: True learning_config: + learning_mode: True continue_learning: False trained_policies_save_path: null max_bid_price: 100 diff --git a/examples/inputs/example_02c/config.yaml b/examples/inputs/example_02c/config.yaml index 43c9656bf..8df83cd60 100644 --- a/examples/inputs/example_02c/config.yaml +++ b/examples/inputs/example_02c/config.yaml @@ -7,9 +7,9 @@ base: end_date: 2019-04-01 00:00 time_step: 1h save_frequency_hours: null - learning_mode: true learning_config: + learning_mode: true continue_learning: false trained_policies_save_path: null trained_policies_load_path: null diff --git a/examples/inputs/example_02d/config.yaml b/examples/inputs/example_02d/config.yaml index 43c9656bf..8df83cd60 100644 --- a/examples/inputs/example_02d/config.yaml +++ b/examples/inputs/example_02d/config.yaml @@ -7,9 +7,9 @@ base: end_date: 2019-04-01 00:00 time_step: 1h save_frequency_hours: null - learning_mode: true learning_config: + learning_mode: true continue_learning: false trained_policies_save_path: null trained_policies_load_path: null diff --git a/examples/inputs/example_02e/config.yaml b/examples/inputs/example_02e/config.yaml index 6b537f5f5..dbf0c1587 100644 --- a/examples/inputs/example_02e/config.yaml +++ b/examples/inputs/example_02e/config.yaml @@ -7,9 +7,9 @@ base: end_date: 2019-04-30 00:00 time_step: 1h save_frequency_hours: null - 
learning_mode: true learning_config: + learning_mode: true continue_learning: false trained_policies_save_path: null trained_policies_load_path: null @@ -53,9 +53,9 @@ tiny: end_date: 2019-01-05 00:00 time_step: 1h save_frequency_hours: null - learning_mode: True learning_config: + learning_mode: True continue_learning: False trained_policies_save_path: null max_bid_price: 50 diff --git a/examples/inputs/example_03a/config.yaml b/examples/inputs/example_03a/config.yaml index 2cda0010e..ad9dcf5bf 100644 --- a/examples/inputs/example_03a/config.yaml +++ b/examples/inputs/example_03a/config.yaml @@ -6,10 +6,10 @@ base_case_2019: start_date: 2019-03-01 00:00 end_date: 2019-03-31 00:00 time_step: 1h - learning_mode: true save_frequency_hours: null learning_config: + learning_mode: true continue_learning: false trained_policies_save_path: null max_bid_price: 100 diff --git a/examples/inputs/example_03b/config.yaml b/examples/inputs/example_03b/config.yaml index 176397bd3..3a597ba71 100644 --- a/examples/inputs/example_03b/config.yaml +++ b/examples/inputs/example_03b/config.yaml @@ -6,10 +6,10 @@ base_case_2021: start_date: 2021-03-01 00:00 end_date: 2021-03-31 00:00 time_step: 1h - learning_mode: True save_frequency_hours: Null learning_config: + learning_mode: True continue_learning: False trained_policies_save_path: null max_bid_price: 100 diff --git a/examples/inputs/example_03c/config.yaml b/examples/inputs/example_03c/config.yaml index ac65b7030..4f58f1fab 100644 --- a/examples/inputs/example_03c/config.yaml +++ b/examples/inputs/example_03c/config.yaml @@ -6,10 +6,10 @@ base_case_2019_with_storage: start_date: 2019-03-01 00:00 end_date: 2019-05-30 00:00 time_step: 1h - learning_mode: true save_frequency_hours: null learning_config: + learning_mode: true continue_learning: false trained_policies_save_path: null max_bid_price: 100 diff --git a/examples/notebooks/04a_reinforcement_learning_algorithm_example.ipynb b/examples/notebooks/04a_reinforcement_learning_algorithm_example.ipynb index dbeb6c179..13618b4e7 100644 --- a/examples/notebooks/04a_reinforcement_learning_algorithm_example.ipynb +++ b/examples/notebooks/04a_reinforcement_learning_algorithm_example.ipynb @@ -330,9 +330,9 @@ " world.learning_role.rl_algorithm.initialize_policy()\n", "\n", " # check if we already stored policies for this simulation\n", - " save_path = world.learning_config[\"trained_policies_save_path\"]\n", + " save_path = world.learning_config.trained_policies_save_path\n", "\n", - " if Path(save_path).is_dir() and not world.learning_config[\"continue_learning\"]:\n", + " if Path(save_path).is_dir() and not world.learning_config.continue_learning:\n", " # we are in learning mode and about to train new policies, which might overwrite existing ones\n", " accept = input(\n", " f\"{save_path=} exists - should we overwrite current learned strategies? 
(y/N) \"\n", @@ -357,7 +357,7 @@ " # Information that needs to be stored across episodes, aka one simulation run\n", " inter_episodic_data = {\n", " \"buffer\": ReplayBuffer(\n", - " buffer_size=int(world.learning_config.get(\"replay_buffer_size\", 5e5)),\n", + " buffer_size=world.learning_config.replay_buffer_size,\n", " obs_dim=world.learning_role.rl_algorithm.obs_dim,\n", " act_dim=world.learning_role.rl_algorithm.act_dim,\n", " n_rl_units=len(world.learning_role.rl_strats),\n", @@ -378,7 +378,7 @@ "\n", " validation_interval = min(\n", " world.learning_role.training_episodes,\n", - " world.learning_config.get(\"validation_episodes_interval\", 5),\n", + " world.learning_config.validation_episodes_interval,\n", " )\n", "\n", " eval_episode = 1\n", @@ -1105,7 +1105,7 @@ " # run learning if learning mode is enabled\n", " # needed as we simulate the modelling horizon multiple times to train reinforcement learning run_learning(world)\n", "\n", - " if world.learning_config.get(\"learning_mode\", False):\n", + " if world.learning_config.learning_mode:\n", " run_learning(world)\n", "\n", " # after the learning is done we make a normal run of the simulation, which equals a test run\n", diff --git a/examples/notebooks/04b_reinforcement_learning_example.ipynb b/examples/notebooks/04b_reinforcement_learning_example.ipynb index 4b9b20cd5..aed2b3b3c 100644 --- a/examples/notebooks/04b_reinforcement_learning_example.ipynb +++ b/examples/notebooks/04b_reinforcement_learning_example.ipynb @@ -1501,7 +1501,7 @@ " )\n", "\n", " # 4. Run the training phase\n", - " if world.learning_config.get(\"learning_mode\", False):\n", + " if world.learning_config.learning_mode:\n", " run_learning(world)\n", "\n", " # 5. Execute final evaluation run (no exploration)\n", diff --git a/examples/notebooks/04c_reinforcement_learning_storage_example.ipynb b/examples/notebooks/04c_reinforcement_learning_storage_example.ipynb index fab4b397e..3412a636a 100644 --- a/examples/notebooks/04c_reinforcement_learning_storage_example.ipynb +++ b/examples/notebooks/04c_reinforcement_learning_storage_example.ipynb @@ -1559,7 +1559,7 @@ " )\n", "\n", " # 4. Run the training phase\n", - " if world.learning_config.get(\"learning_mode\", False):\n", + " if world.learning_config.learning_mode:\n", " run_learning(world)\n", "\n", " # 5. Execute final evaluation run (no exploration)\n", diff --git a/examples/notebooks/09_example_Sim_and_xRL.ipynb b/examples/notebooks/09_example_Sim_and_xRL.ipynb index 0c6316765..6a026608a 100644 --- a/examples/notebooks/09_example_Sim_and_xRL.ipynb +++ b/examples/notebooks/09_example_Sim_and_xRL.ipynb @@ -523,10 +523,10 @@ " world.learning_role.rl_algorithm.initialize_policy()\n", "\n", " # check if we already stored policies for this simulation\n", - " save_path = world.learning_config[\"trained_policies_save_path\"]\n", + " save_path = world.learning_config.trained_policies_save_path\n", "\n", " if Path(save_path).is_dir():\n", - " if world.learning_config.get(\"continue_learning\", False):\n", + " if world.learning_config.continue_learning:\n", " logger.warning(\n", " f\"Save path '{save_path}' exists.\\n\"\n", " \"You are in continue learning mode. 
New strategies may overwrite previous ones.\\n\"\n", @@ -566,7 +566,7 @@ " # Information that needs to be stored across episodes, aka one simulation run\n", " inter_episodic_data = {\n", " \"buffer\": ReplayBuffer(\n", - " buffer_size=int(world.learning_config.get(\"replay_buffer_size\", 5e5)),\n", + " buffer_size=world.learning_config.replay_buffer_size,\n", " obs_dim=world.learning_role.rl_algorithm.obs_dim,\n", " act_dim=world.learning_role.rl_algorithm.act_dim,\n", " n_rl_units=len(world.learning_role.rl_strats),\n", @@ -587,7 +587,7 @@ "\n", " validation_interval = min(\n", " world.learning_role.training_episodes,\n", - " world.learning_config.get(\"validation_episodes_interval\", 5),\n", + " world.learning_config.validation_episodes_interval,\n", " )\n", "\n", " # Ensure training episodes exceed the sum of initial experience and one evaluation interval\n", @@ -798,7 +798,7 @@ ")\n", "\n", "# If learning mode is enabled, run the reinforcement learning loop\n", - "if world.learning_config.get(\"learning_mode\", False):\n", + "if world.learning_config.learning_mode:\n", " run_learning(world)\n", "\n", "# Run the simulation\n", diff --git a/tests/test_drl_storage_strategy.py b/tests/test_drl_storage_strategy.py index 989636809..6bf97b1c6 100644 --- a/tests/test_drl_storage_strategy.py +++ b/tests/test_drl_storage_strategy.py @@ -28,31 +28,30 @@ def storage_unit() -> Storage: """ Fixture to create a Storage unit instance with example parameters. """ - # Define the learning configuration for the StorageEnergyLearningStrategy - learning_config: LearningConfig = { + # Define the learning configuration for the StorageRLStrategy + config = { "obs_dim": 50, "act_dim": 2, - "algorithm": "matd3", - "learning_mode": True, - "training_episodes": 3, "unit_id": "test_storage", - "max_bid_price": 100, "max_demand": 1000, - "evaluation_mode": False, - "continue_learning": False, - "trained_policies_save_path": "not required", + "learning_config": LearningConfig( + algorithm="matd3", + learning_mode=True, + training_episodes=3, + max_bid_price=100, + ), } index = pd.date_range("2023-06-30 22:00:00", periods=48, freq="h") ff = UnitForecaster(index, market_prices={"test_market": 50}) - learning_role = Learning(learning_config, index[0], index[-1]) + learning_role = Learning(config["learning_config"], index[0], index[-1]) return Storage( id="test_storage", unit_operator="test_operator", technology="storage", bidding_strategies={ "test_market": StorageEnergyLearningStrategy( - learning_role=learning_role, **learning_config + learning_role=learning_role, **config ) }, max_power_charge=-500, # Negative for charging @@ -107,7 +106,7 @@ def test_storage_rl_strategy_sell_bid(mock_market_config, storage_unit): # Mock the calculate_marginal_cost method to return a fixed marginal cost with patch.object(Storage, "calculate_marginal_cost", return_value=10.0): # Calculate bids using the strategy - bids = strategy.calculate_bids( + bids = strategy.calculate_bids( # TODO storage_unit, mc, product_tuples=product_tuples ) diff --git a/tests/test_learning_role.py b/tests/test_learning_role.py index 57888d310..1f938cef0 100644 --- a/tests/test_learning_role.py +++ b/tests/test_learning_role.py @@ -7,9 +7,9 @@ import pytest try: + from assume.common.base import LearningConfig from assume.reinforcement_learning.learning_role import ( Learning, - LearningConfig, LearningStrategy, ) from assume.reinforcement_learning.neural_network_architecture import ( @@ -25,28 +25,31 @@ @pytest.mark.require_learning def test_learning_init(): 
- learning_config: LearningConfig = { + config = { "obs_dim": 3, "act_dim": 2, - "train_freq": "1h", "unique_obs_dim": 0, - "algorithm": "matd3", - "actor_architecture": "mlp", - "learning_mode": False, - "evaluation_mode": False, - "training_episodes": 3, - "episodes_collecting_initial_experience": 1, - "continue_learning": False, - "trained_policies_save_path": None, - "early_stopping_steps": 10, - "early_stopping_threshold": 0.05, + "learning_config": LearningConfig( + train_freq="1h", + algorithm="matd3", + actor_architecture="mlp", + learning_mode=True, + evaluation_mode=False, + training_episodes=3, + episodes_collecting_initial_experience=1, + continue_learning=False, + trained_policies_save_path=None, + early_stopping_steps=10, + early_stopping_threshold=0.05, + ), } + # test init - learn = Learning(learning_config, start, end) + learn = Learning(config["learning_config"], start=start, end=end) assert len(learn.rl_strats) == 0 # we need to add learning strategies first - learn.rl_strats["test_id"] = LearningStrategy(**learning_config) + learn.rl_strats["test_id"] = LearningStrategy(**config, learning_role=learn) # test creating actors learn.initialize_policy() diff --git a/tests/test_matd3.py b/tests/test_matd3.py index 2725db70f..5520dc483 100644 --- a/tests/test_matd3.py +++ b/tests/test_matd3.py @@ -4,11 +4,13 @@ import json import os -from copy import deepcopy +from copy import copy, deepcopy from datetime import datetime import pytest +from assume.common.base import LearningConfig + try: import torch as th @@ -27,49 +29,52 @@ @pytest.fixture -def base_learning_config(): +def base_learning_config() -> dict: return { "obs_dim": 10, "act_dim": 3, - "train_freq": "1h", "unique_obs_dim": 2, - "algorithm": "matd3", - "actor_architecture": "mlp", - "learning_mode": True, - "evaluation_mode": False, - "training_episodes": 1, - "episodes_collecting_initial_experience": 0, - "continue_learning": False, - "trained_policies_save_path": None, - "early_stopping_steps": 10, - "early_stopping_threshold": 0.05, - "learning_rate": 1e-4, - "batch_size": 100, - "tau": 0.005, - "gamma": 0.99, - "gradient_steps": 1, - "policy_delay": 2, - "target_policy_noise": 0.2, - "target_noise_clip": 0.5, + "num_timeseries_obs_dim": 4, + "learning_config": LearningConfig( + train_freq="1h", + algorithm="matd3", + actor_architecture="mlp", + learning_mode=True, + evaluation_mode=False, + training_episodes=1, + episodes_collecting_initial_experience=0, + continue_learning=False, + trained_policies_save_path=None, + early_stopping_steps=10, + early_stopping_threshold=0.05, + learning_rate=1e-4, + batch_size=100, + tau=0.005, + gamma=0.99, + gradient_steps=1, + policy_delay=2, + target_policy_noise=0.2, + target_noise_clip=0.5, + ), } @pytest.fixture(scope="function") def learning_role_n(base_learning_config): - config = base_learning_config.copy() - learn = Learning(config, start, end) - learn.rl_strats["agent_0"] = LearningStrategy(**config) - learn.rl_strats["agent_1"] = LearningStrategy(**config) + config = copy(base_learning_config) + learn = Learning(config["learning_config"], start, end) + learn.rl_strats["agent_0"] = LearningStrategy(**config, learning_role=learn) + learn.rl_strats["agent_1"] = LearningStrategy(**config, learning_role=learn) return learn @pytest.fixture(scope="function") def learning_role_n_plus_m(base_learning_config): - config = base_learning_config.copy() - learn = Learning(config, start, end) - learn.rl_strats["agent_0"] = LearningStrategy(**config) - learn.rl_strats["agent_1"] = 
LearningStrategy(**config) - learn.rl_strats["agent_2"] = LearningStrategy(**config) + config = copy(base_learning_config) + learn = Learning(config["learning_config"], start, end) + learn.rl_strats["agent_0"] = LearningStrategy(**config, learning_role=learn) + learn.rl_strats["agent_1"] = LearningStrategy(**config, learning_role=learn) + learn.rl_strats["agent_2"] = LearningStrategy(**config, learning_role=learn) return learn @@ -214,10 +219,14 @@ def test_td3_save_params(learning_role_n, tmp_path): def test_td3_load_matching_n(base_learning_config, saved_n_agent_model): save_dir, original_states = saved_n_agent_model - config_n_new = base_learning_config.copy() - learn_n_new = Learning(config_n_new, start, end) - learn_n_new.rl_strats["agent_0"] = LearningStrategy(**config_n_new) - learn_n_new.rl_strats["agent_1"] = LearningStrategy(**config_n_new) + config_n_new = copy(base_learning_config) + learn_n_new = Learning(config_n_new["learning_config"], start, end) + learn_n_new.rl_strats["agent_0"] = LearningStrategy( + **config_n_new, learning_role=learn_n_new + ) + learn_n_new.rl_strats["agent_1"] = LearningStrategy( + **config_n_new, learning_role=learn_n_new + ) learn_n_new.initialize_policy() learn_n_new.rl_algorithm.load_params(directory=save_dir) @@ -372,9 +381,9 @@ def test_td3_load_transfer_n_minus_m( @pytest.mark.require_learning def test_td3_load_corrupted_or_incomplete_critic(tmp_path, base_learning_config): - config = base_learning_config.copy() - learning = Learning(config, start, end) - learning.rl_strats["agent_0"] = LearningStrategy(**config) + config = copy(base_learning_config) + learning = Learning(config["learning_config"], start, end) + learning.rl_strats["agent_0"] = LearningStrategy(**config, learning_role=learning) learning.initialize_policy() original_state = deepcopy(learning.rl_strats["agent_0"].critics.state_dict()) @@ -417,17 +426,17 @@ def test_initialize_policy_dimension_mismatch( """ Test that mismatches in observation/action/unique/timeseries dims raise ValueErrors. """ - config = base_learning_config.copy() + config = copy(base_learning_config) config["num_timeseries_obs_dim"] = 1 # Ensure field exists for valid check - learn = Learning(config, start, end) + learn = Learning(config["learning_config"], start, end) # Create one agent with default config - strat_0 = LearningStrategy(**config) + strat_0 = LearningStrategy(**config, learning_role=learn) # Create second agent with mismatching value - config_mismatch = config.copy() + config_mismatch = copy(config) config_mismatch[mod_field] = mod_value - strat_1 = LearningStrategy(**config_mismatch) + strat_1 = LearningStrategy(**config_mismatch, learning_role=learn) learn.rl_strats["agent_0"] = strat_0 learn.rl_strats["agent_1"] = strat_1 @@ -442,13 +451,13 @@ def test_initialize_policy_all_dimensions_match(base_learning_config): """ Test that initialize_policy succeeds with all matching dimensions. 
""" - config = base_learning_config.copy() + config = copy(base_learning_config) config["num_timeseries_obs_dim"] = 1 # Ensure the optional field is populated - learn = Learning(config, start, end) - learn.rl_strats["agent_0"] = LearningStrategy(**config) - learn.rl_strats["agent_1"] = LearningStrategy(**config) - learn.rl_strats["agent_2"] = LearningStrategy(**config) + learn = Learning(config["learning_config"], start, end) + learn.rl_strats["agent_0"] = LearningStrategy(**config, learning_role=learn) + learn.rl_strats["agent_1"] = LearningStrategy(**config, learning_role=learn) + learn.rl_strats["agent_2"] = LearningStrategy(**config, learning_role=learn) try: learn.rl_algorithm.initialize_policy() # Should not raise diff --git a/tests/test_rl_strategies.py b/tests/test_rl_strategies.py index 9ed0c0a9b..84a6a051b 100644 --- a/tests/test_rl_strategies.py +++ b/tests/test_rl_strategies.py @@ -36,16 +36,15 @@ def power_plant() -> PowerPlant: fuel_prices={"lignite": 10, "co2": 10}, residual_load={"EOM": 0}, ) - learning_config: LearningConfig = { - "algorithm": "matd3", - "learning_mode": True, - "evaluation_mode": False, - "continue_learning": False, - "trained_policies_save_path": "not required", - "training_episodes": 3, + config = { "unit_id": "test_pp", + "learning_config": LearningConfig( + algorithm="matd3", + learning_mode=True, + training_episodes=3, + ), } - learning_role = Learning(learning_config, start, end) + learning_role = Learning(config["learning_config"], start, end) return PowerPlant( id="test_pp", @@ -57,9 +56,7 @@ def power_plant() -> PowerPlant: efficiency=0.5, additional_cost=10, bidding_strategies={ - "EOM": EnergyLearningStrategy( - learning_role=learning_role, **learning_config - ) + "EOM": EnergyLearningStrategy(learning_role=learning_role, **config) }, fuel_type="lignite", emission_factor=0.5, @@ -69,11 +66,11 @@ def power_plant() -> PowerPlant: @pytest.mark.require_learning @pytest.mark.parametrize( - "strategy_class, obs_dim, act_dim, actor_architecture, expected_bid_count, expected_volumes", + "strategy_class, obs_dim, act_dim, unique_obs_dim, actor_architecture, expected_bid_count, expected_volumes", [ - (EnergyLearningStrategy, 38, 2, "mlp", 2, [200, 800]), - (EnergyLearningStrategy, 38, 2, "lstm", 2, [200, 800]), - (EnergyLearningSingleBidStrategy, 74, 1, "mlp", 1, [1000]), + (EnergyLearningStrategy, 38, 2, 2, "mlp", 2, [200, 800]), + (EnergyLearningStrategy, 38, 2, 2, "lstm", 2, [200, 800]), + (EnergyLearningSingleBidStrategy, 74, 1, 2, "mlp", 1, [1000]), ], ) def test_learning_strategies_parametrized( @@ -82,6 +79,7 @@ def test_learning_strategies_parametrized( strategy_class, obs_dim, act_dim, + unique_obs_dim, actor_architecture, expected_bid_count, expected_volumes, @@ -92,24 +90,21 @@ def test_learning_strategies_parametrized( product_tuples = [ (start, start + pd.Timedelta(hours=1), None) for start in product_index ] - - # Build learning config dynamically - learning_config: LearningConfig = { - "algorithm": "matd3", - "learning_mode": True, - "training_episodes": 3, + # Build LearningConfig dynamically + config = { "unit_id": power_plant.id, - "evaluation_mode": False, - "continue_learning": False, - "trained_policies_save_path": "not required", + "learning_config": LearningConfig( + algorithm="matd3", + actor_architecture=actor_architecture, + learning_mode=True, + training_episodes=3, + ), } - if actor_architecture != "mlp": - learning_config["actor_architecture"] = actor_architecture - learning_role = Learning(learning_config, start, end) + 
learning_role = Learning(config["learning_config"], start, end) # Override the strategy power_plant.bidding_strategies[mc.market_id] = strategy_class( - learning_role=learning_role, **learning_config + learning_role=learning_role, **config ) strategy = power_plant.bidding_strategies[mc.market_id]
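Usage note: the patch replaces the plain learning_config dict with the LearningConfig dataclass, so settings are read as attributes (e.g. world.learning_config.learning_mode) instead of dict.get() lookups, and the learning_mode flag moves from the top level of a scenario config into its learning_config block. The sketch below is a minimal illustration of that pattern, modelled on the updated test fixtures above; it assumes the assume package and its learning dependencies are installed, and the dates and field values are examples only, not recommended settings.

from datetime import datetime

from assume.common.base import LearningConfig
from assume.reinforcement_learning.learning_role import Learning

# Build the configuration object once instead of passing a raw dict around.
learning_config = LearningConfig(
    learning_mode=True,
    continue_learning=False,
    trained_policies_save_path=None,
    training_episodes=3,
    max_bid_price=100,
)

# The learning role now receives the LearningConfig directly,
# as in the updated test fixtures.
learning_role = Learning(
    learning_config,
    start=datetime(2019, 3, 1),
    end=datetime(2019, 3, 31),
)

# Settings are attributes rather than dict entries, so the former
# learning_config.get("learning_mode", False) lookups become plain attribute access.
if learning_config.learning_mode and not learning_config.continue_learning:
    print(f"training for {learning_config.training_episodes} episodes")

In config.yaml the same settings now live under the learning_config key, as the example input changes above show.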