183 changes: 147 additions & 36 deletions assume/common/base.py
@@ -2,16 +2,19 @@
#
# SPDX-License-Identifier: AGPL-3.0-or-later

import logging
from collections import defaultdict
from dataclasses import dataclass
from datetime import datetime, timedelta
from typing import TypedDict

import numpy as np

from assume.common.fast_pandas import FastIndex, FastSeries
from assume.common.forecaster import UnitForecaster
from assume.common.market_objects import MarketConfig, Orderbook, Product

logger = logging.getLogger(__name__)


class BaseStrategy:
pass
@@ -744,6 +747,142 @@ def remove_empty_bids(self, bids: list) -> list:
return cleaned_bids


@dataclass
class LearningConfig:
"""
A class for the learning configuration.

Attributes:
learning_mode (bool): Whether to use learning mode at all. If False, the learning bidding strategy is
loaded from trained_policies_load_path and no training occurs. Default is False.
evaluation_mode (bool): Whether to run in evaluation mode. If True, the agent uses the learned policy
without exploration noise and no training updates occur. This setting is modified internally. Default is False.
continue_learning (bool): Whether to use pre-learned strategies and then continue learning.
If True, loads existing policies from trained_policies_load_path and continues training. Default is False.
trained_policies_save_path (str | None): The directory path - relative to the scenario's inputs_path - where newly trained RL policies (actor and
critic networks) will be saved. Only needed when learning_mode is True. Value is set in setup_world(). Defaults to None.
trained_policies_load_path (str | None): The directory path - relative to the scenario's inputs_path - from which pre-trained policies should be
loaded. Needed when continue_learning is True or using pre-trained strategies. Default is None.

min_bid_price (float | None): The minimum bid price which limits the action of the actor to this price.
Used to constrain the actor's output to a realistic price range. Default is -100.0.
max_bid_price (float | None): The maximum bid price which limits the action of the actor to this price.
Used to constrain the actor's output to a realistic price range. Default is 100.0.

device (str): The device to use for PyTorch computations. Options include "cpu", "cuda", or specific
CUDA devices like "cuda:0". Default is "cpu".
episodes_collecting_initial_experience (int): The number of episodes at the start during which random
actions are chosen instead of using the actor network. This helps populate the replay buffer with
diverse experiences. Default is 5.
exploration_noise_std (float): The standard deviation of Gaussian noise added to actions during
exploration in the environment. Higher values encourage more exploration. Default is 0.2.
training_episodes (int): The number of training episodes, where one episode is the entire simulation
horizon specified in the general config. Default is 100.
validation_episodes_interval (int): The interval (in episodes) at which validation episodes are run
to evaluate the current policy's performance without training updates. Default is 5.
train_freq (str): Defines the frequency in time steps at which the actor and critic networks are updated.
Accepts time strings like "24h" for 24 hours or "1d" for 1 day. Default is "24h".
batch_size (int): The batch size of experiences sampled from the replay buffer for each training update.
Larger batches provide more stable gradients but require more memory. In environments with many learning agents, we advise small batch sizes.
Default is 128.
gradient_steps (int): The number of gradient descent steps performed during each training update.
More steps can lead to better learning but increase computation time. Default is 100.
learning_rate (float): The learning rate (step size) for the optimizer, which controls how much the
policy and value networks are updated during training. Default is 0.001.
learning_rate_schedule (str | None): Which learning rate decay schedule to use. Currently only "linear"
decay is available, which linearly decreases the learning rate over time. Default is None (constant learning rate).
early_stopping_steps (int | None): The number of validation steps over which the moving average reward
is calculated for early stopping. If the reward doesn't change by early_stopping_threshold over
this many steps, training stops. If None, defaults to training_episodes / validation_episodes_interval + 1.
early_stopping_threshold (float): The minimum improvement in moving average reward required to avoid
early stopping. If the reward improvement is less than this threshold over early_stopping_steps,
training is terminated early. Default is 0.05.

algorithm (str): Specifies which reinforcement learning algorithm to use. Currently, only "matd3"
(Multi-Agent Twin Delayed Deep Deterministic Policy Gradient) is implemented. Default is "matd3".
replay_buffer_size (int): The maximum number of transitions stored in the replay buffer for experience replay.
Larger buffers allow for more diverse training samples. Default is 500000.
gamma (float): The discount factor for future rewards, ranging from 0 to 1. Higher values give more
weight to long-term rewards in decision-making. Default is 0.99.
actor_architecture (str): The architecture of the neural networks used for the actors. Options include
"mlp" (Multi-Layer Perceptron) and "lstm" (Long Short-Term Memory). Default is "mlp".
policy_delay (int): The frequency (in gradient steps) at which the actor policy is updated.
TD3 updates the critic more frequently than the actor to stabilize training. Default is 2.
noise_sigma (float): The standard deviation of the Ornstein-Uhlenbeck or Gaussian noise distribution
used to generate exploration noise added to actions. Default is 0.1.
noise_scale (int): The scale factor multiplied by the noise drawn from the distribution.
Larger values increase exploration. Default is 1.
noise_dt (int): The time step parameter for the Ornstein-Uhlenbeck process, which determines how
quickly the noise decays over time. Used for noise scheduling. Default is 1.
action_noise_schedule (str | None): Which action noise decay schedule to use. Currently only "linear"
decay is available, which linearly decreases exploration noise over training. Default is None (no decay).
tau (float): The soft update coefficient for updating target networks. Controls how slowly target
networks track the main networks. Smaller values mean slower updates. Default is 0.005.
target_policy_noise (float): The standard deviation of noise added to target policy actions during
critic updates. This smoothing helps prevent overfitting to narrow policy peaks. Default is 0.2.
target_noise_clip (float): The maximum absolute value for clipping the target policy noise.
Prevents the noise from being too large. Default is 0.5.

"""

learning_mode: bool = False
evaluation_mode: bool = False
continue_learning: bool = False
trained_policies_save_path: str | None = None
trained_policies_load_path: str | None = None

min_bid_price: float | None = -100.0
max_bid_price: float | None = 100.0

device: str = "cpu"
episodes_collecting_initial_experience: int = 5
exploration_noise_std: float = 0.2
training_episodes: int = 100
validation_episodes_interval: int = 5
train_freq: str = "24h"
batch_size: int = 128
gradient_steps: int = 100
learning_rate: float = 0.001
learning_rate_schedule: str | None = None
early_stopping_steps: int | None = None
early_stopping_threshold: float = 0.05

algorithm: str = "matd3"
replay_buffer_size: int = 500000
gamma: float = 0.99
actor_architecture: str = "mlp"
policy_delay: int = 2
noise_sigma: float = 0.1
noise_scale: int = 1
noise_dt: int = 1
action_noise_schedule: str | None = None
tau: float = 0.005
target_policy_noise: float = 0.2
target_noise_clip: float = 0.5

def __post_init__(self):
"""Calculate defaults that depend on other fields and validate inputs."""
if self.early_stopping_steps is None:
self.early_stopping_steps = int(
self.training_episodes / self.validation_episodes_interval + 1
)

# without collected initial experience the replay buffer has no samples
# to draw from when adapting the strategy, hence we enforce at least one episode
if self.episodes_collecting_initial_experience < 1:
logger.warning(
f"episodes_collecting_initial_experience must be at least 1 to sample from the buffer, got {self.episodes_collecting_initial_experience}. Setting it to 1."
)
self.episodes_collecting_initial_experience = 1

# check that gradient_steps is positive
if self.gradient_steps <= 0:
raise ValueError(
f"gradient_steps need to be positive, got {self.gradient_steps}"
)

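A minimal usage sketch of the new dataclass, assuming only the fields shown above (the specific values are illustrative, not recommendations):

```python
from assume.common.base import LearningConfig

# illustrative values only; every other field keeps the default documented above
config = LearningConfig(
    learning_mode=True,
    training_episodes=50,
    validation_episodes_interval=5,
    train_freq="24h",
    batch_size=64,
)

# __post_init__ derives early_stopping_steps when it is not set explicitly:
# int(50 / 5 + 1) == 11
assert config.early_stopping_steps == 11
```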

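The docstring references a "linear" decay schedule for both the learning rate and the action noise. The schedule implementation is not part of this diff; a common shape for such a schedule, as a sketch (function name and signature are assumptions):

```python
def linear_schedule(initial_value: float, progress_remaining: float) -> float:
    """Linearly anneal initial_value toward 0.

    progress_remaining runs from 1.0 (start of training) to 0.0 (end).
    """
    return initial_value * progress_remaining
```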
class LearningStrategy(BaseStrategy):
"""
A strategy that provides learning functionality and a method to calculate the reward.
@@ -758,6 +897,7 @@ class LearningStrategy(BaseStrategy):
act_dim (int): The action dimension.
unique_obs_dim (int): The unique observation dimension.
num_timeseries_obs_dim (int): The number of observation timeseries dimension.
learning_role (Learning): The learning role that orchestrates the learning process.

Args:
*args (list): The arguments.
@@ -766,11 +906,11 @@

def __init__(
self,
learning_role,
obs_dim: int,
act_dim: int,
unique_obs_dim: int = 0,
unique_obs_dim: int,
num_timeseries_obs_dim: int = 3,
learning_role=None,
*args,
**kwargs,
):
@@ -779,6 +919,10 @@ def __init__(
"""
super().__init__(*args, **kwargs)

# access to the learning_role that orchestrates learning
self.learning_role = learning_role
self.learning_config = learning_role.learning_config

self.obs_dim = obs_dim
self.act_dim = act_dim

@@ -790,9 +934,6 @@ def __init__(
# them into suitable format for recurrent neural networks
self.num_timeseries_obs_dim = num_timeseries_obs_dim

# access to the learning_role that orchestrates learning
self.learning_role = learning_role

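With learning_role now the first required argument, a subclass would be wired as in this sketch (the subclass, its dimensions, and the assertion are illustrative, not part of this diff):

```python
class MyRLStrategy(LearningStrategy):
    """Illustrative subclass; not part of this diff."""

    def __init__(self, learning_role, **kwargs):
        # learning_role is now required and positional; unique_obs_dim
        # no longer defaults to 0 and must be passed explicitly
        super().__init__(learning_role, obs_dim=50, act_dim=2, unique_obs_dim=2, **kwargs)
        # the orchestrating role and its config are exposed on the instance
        assert self.learning_config is learning_role.learning_config
```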

class MinMaxStrategy(BaseStrategy):
pass
@@ -804,33 +945,3 @@

class ExchangeStrategy(BaseStrategy):
pass


class LearningConfig(TypedDict):
"""
A class for the learning configuration.
"""

continue_learning: bool
min_bid_price: float
max_bid_price: float
learning_mode: bool
algorithm: str
actor_architecture: str
learning_rate: float
learning_rate_schedule: str
training_episodes: int
episodes_collecting_initial_experience: int
train_freq: str
gradient_steps: int
batch_size: int
gamma: float
device: str
noise_sigma: float
noise_scale: int
noise_dt: int
action_noise_schedule: str
trained_policies_save_path: str
trained_policies_load_path: str
early_stopping_steps: int
early_stopping_threshold: float
2 changes: 2 additions & 0 deletions assume/common/outputs.py
@@ -191,6 +191,8 @@ def setup(self):
)

def on_ready(self):
super().on_ready()

if self.db_uri:
self.db = create_engine(self.db_uri)
if self.db is not None:
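The added super().on_ready() call preserves cooperative method resolution, so base-role setup still runs when the output role overrides the hook; a minimal sketch of the pattern (class names are illustrative):

```python
class BaseRole:
    def on_ready(self):
        print("base setup")  # work every role depends on

class OutputRole(BaseRole):
    def on_ready(self):
        super().on_ready()  # without this line, the base setup is silently skipped
        print("output setup")

OutputRole().on_ready()  # prints "base setup", then "output setup"
```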
35 changes: 4 additions & 31 deletions assume/reinforcement_learning/algorithms/base_algorithm.py
@@ -17,52 +17,25 @@ class RLAlgorithm:

Args:
learning_role (Learning Role object): Learning object
learning_rate (float): learning rate for adam optimizer
batch_size (int): Minibatch size for each gradient update
tau (float): the soft update coefficient ("Polyak update", between 0 and 1)
gamma (float): the discount factor
gradient_steps (int): how many gradient steps to do after each rollout
policy_delay (int): Policy and target networks will only be updated once every policy_delay steps per training steps. The Q values will be updated policy_delay more often (update every training step)
target_policy_noise (float): Standard deviation of Gaussian noise added to target policy (smoothing noise)
target_noise_clip (float): Limit for absolute value of target policy smoothing noise
actor_architecture (str): type of Actor neural network
"""

def __init__(
self,
# init learning_role as object of Learning class
learning_role,
learning_rate=1e-4,
batch_size=1024,
tau=0.005,
gamma=0.99,
gradient_steps=100,
policy_delay=2,
target_policy_noise=0.2,
target_noise_clip=0.5,
actor_architecture="mlp",
):
super().__init__()

self.learning_role = learning_role
self.learning_rate = learning_rate
self.batch_size = batch_size
self.gamma = gamma
self.tau = tau
self.learning_config = learning_role.learning_config

self.gradient_steps = gradient_steps

self.policy_delay = policy_delay
self.target_noise_clip = target_noise_clip
self.target_policy_noise = target_policy_noise

if actor_architecture in actor_architecture_aliases.keys():
if self.learning_config.actor_architecture in actor_architecture_aliases.keys():
self.actor_architecture_class = actor_architecture_aliases[
actor_architecture
self.learning_config.actor_architecture
]
else:
raise ValueError(
f"Policy '{actor_architecture}' unknown. Supported architectures are {list(actor_architecture_aliases.keys())}"
f"Policy '{self.learning_config.actor_architecture}' unknown. Supported architectures are {list(actor_architecture_aliases.keys())}"
)

self.device = self.learning_role.device
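With hyperparameters now read from the shared learning_config instead of individual constructor arguments, a concrete algorithm subclass might access them as in this sketch (the subclass and the attribute assignments are illustrative):

```python
class MyAlgorithm(RLAlgorithm):
    """Illustrative subclass; not part of this diff."""

    def __init__(self, learning_role):
        super().__init__(learning_role)
        cfg = self.learning_config
        # values formerly passed as individual constructor arguments
        self.gamma = cfg.gamma
        self.tau = cfg.tau
        self.batch_size = cfg.batch_size
        self.gradient_steps = cfg.gradient_steps
        self.policy_delay = cfg.policy_delay
```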