diff --git a/platform-sdk/consensus-otter-tests/src/testFixtures/java/module-info.java b/platform-sdk/consensus-otter-tests/src/testFixtures/java/module-info.java
index 187e5e3d52d3..aadca7e3a38c 100644
--- a/platform-sdk/consensus-otter-tests/src/testFixtures/java/module-info.java
+++ b/platform-sdk/consensus-otter-tests/src/testFixtures/java/module-info.java
@@ -51,6 +51,7 @@
     exports org.hiero.otter.fixtures.result;
     exports org.hiero.otter.fixtures.container.proto;
     exports org.hiero.otter.fixtures.app;
+    exports org.hiero.otter.fixtures.chaosbot;
     exports org.hiero.otter.fixtures.logging.internal;
     exports org.hiero.otter.fixtures.internal.helpers to
             org.hiero.consensus.otter.docker.app;
diff --git a/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/Network.java b/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/Network.java
index fb02195383a4..05fbeedda510 100644
--- a/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/Network.java
+++ b/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/Network.java
@@ -256,6 +256,13 @@ default Partition createNetworkPartition(@NonNull final Node node0, @NonNull fin
      */
     void setLatencyForAllConnections(@NonNull Node node, @NonNull LatencyRange latencyRange);
 
+    /**
+     * Restores the default latency for all connections from this node. The default is determined by the topology.
+     *
+     * @param node the node for which to remove custom latencies
+     * @see #setLatencyForAllConnections(Node, LatencyRange)
+     */
+    void restoreLatencyForAllConnections(@NonNull Node node);
+
     /**
      * Sets the bandwidth limit for all connections from and to this node.
      *
@@ -264,6 +271,13 @@ default Partition createNetworkPartition(@NonNull final Node node0, @NonNull fin
      */
     void setBandwidthForAllConnections(@NonNull Node node, @NonNull BandwidthLimit bandwidthLimit);
 
+    /**
+     * Restores the default bandwidth limit for all connections from this node. The default is determined by the topology.
+     *
+     * @param node the node for which to remove bandwidth limits
+     * @see #setBandwidthForAllConnections(Node, BandwidthLimit)
+     */
+    void restoreBandwidthLimitsForAllConnections(@NonNull Node node);
+
     /**
      * Restore the network connectivity to its original/default state. Removes all partitions, cliques, and custom
      * connection settings. The defaults are defined by the {@link Topology} of the network.
diff --git a/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/TestEnvironment.java b/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/TestEnvironment.java
index 16c487271551..ca5ea08f07bc 100644
--- a/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/TestEnvironment.java
+++ b/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/TestEnvironment.java
@@ -3,6 +3,7 @@
 
 import edu.umd.cs.findbugs.annotations.NonNull;
 import java.util.Set;
+import org.hiero.otter.fixtures.chaosbot.ChaosBot;
 
 /**
  * Interface representing the test environment of an Otter test.
@@ -44,6 +45,24 @@ public interface TestEnvironment {
     @NonNull
     TransactionGenerator transactionGenerator();
 
+    /**
+     * Create a chaos bot that can introduce randomized faults into the test environment.
+     *
+     * @return the chaos bot
+     * @see #createChaosBot(long)
+     */
+    @NonNull
+    ChaosBot createChaosBot();
+
+    /**
+     * Create a chaos bot that can introduce pseudo-randomized faults into the test environment.
+     * The created failures will be reproducible using the same seed.
+     *
+     * @param seed the seed for randomness
+     * @return the chaos bot
+     * @see #createChaosBot()
+     */
+    @NonNull
+    ChaosBot createChaosBot(long seed);
+
     /**
      * Destroys the test environment. Once this method is called, the test environment and all its
      * components are no longer usable.
This method is idempotent, meaning that it is safe to call
diff --git a/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/chaosbot/ChaosBot.java b/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/chaosbot/ChaosBot.java
new file mode 100644
index 000000000000..d20609f53b88
--- /dev/null
+++ b/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/chaosbot/ChaosBot.java
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: Apache-2.0
+package org.hiero.otter.fixtures.chaosbot;
+
+import edu.umd.cs.findbugs.annotations.NonNull;
+import java.time.Duration;
+
+/**
+ * A chaos bot that introduces randomized faults into the network.
+ */
+public interface ChaosBot {
+
+    /**
+     * Run chaos experiments for the specified duration.
+     *
+     * @param duration the duration to run chaos experiments
+     */
+    void runChaos(@NonNull Duration duration);
+}
diff --git a/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/chaosbot/internal/ChaosBotImpl.java b/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/chaosbot/internal/ChaosBotImpl.java
new file mode 100644
index 000000000000..e465caf4c8fe
--- /dev/null
+++ b/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/chaosbot/internal/ChaosBotImpl.java
@@ -0,0 +1,148 @@
+// SPDX-License-Identifier: Apache-2.0
+package org.hiero.otter.fixtures.chaosbot.internal;
+
+import static java.util.Objects.requireNonNull;
+import static org.hiero.otter.fixtures.chaosbot.internal.RandomUtil.randomGaussianDuration;
+
+import com.swirlds.common.test.fixtures.Randotron;
+import edu.umd.cs.findbugs.annotations.NonNull;
+import java.time.Duration;
+import java.time.Instant;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.PriorityQueue;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.hiero.otter.fixtures.Network;
+import org.hiero.otter.fixtures.Node;
+import org.hiero.otter.fixtures.TestEnvironment;
+import org.hiero.otter.fixtures.TimeManager;
+import org.hiero.otter.fixtures.chaosbot.ChaosBot;
+import org.hiero.otter.fixtures.result.SingleNodeConsensusResult;
+
+/**
+ * Implementation of a chaos bot that creates random failures in the test environment.
+ *
+ * <p>Experiments are created by an {@link ExperimentFactory} at random intervals and are ended once their end time
+ * has been reached. The per-type counters are kept in an instance field, so they accumulate across several calls to
+ * {@link #runChaos(Duration)} on the same instance.
+ */
+public class ChaosBotImpl implements ChaosBot {
+
+    private static final Logger log = LogManager.getLogger();
+
+    // These values will become configurable in the future.
+    private static final Duration CHAOS_INTERVAL = Duration.ofMinutes(3L);
+    private static final Duration CHAOS_DEVIATION = Duration.ofMinutes(2L);
+
+    private final TestEnvironment env;
+    private final Randotron randotron;
+    private final ExperimentFactory factory;
+
+    /** Number of experiments started so far, keyed by the concrete experiment class. */
+    private final Map<Class<? extends Experiment>, Integer> statistics = new HashMap<>();
+
+    /**
+     * Create a new chaos bot.
+     *
+     * @param env the test environment
+     */
+    public ChaosBotImpl(@NonNull final TestEnvironment env) {
+        this(env, Randotron.create());
+    }
+
+    /**
+     * Create a new chaos bot with a specific random seed.
+     *
+     * @param env the test environment
+     * @param seed the random seed
+     */
+    public ChaosBotImpl(@NonNull final TestEnvironment env, final long seed) {
+        this(env, Randotron.create(seed));
+    }
+
+    private ChaosBotImpl(@NonNull final TestEnvironment env, @NonNull final Randotron randotron) {
+        this.env = requireNonNull(env);
+        this.randotron = requireNonNull(randotron);
+        this.factory = new ExperimentFactory(env, randotron);
+    }
+
+    /**
+     * {@inheritDoc}
+     *
+     * <p>After the chaos phase every experiment that is still running is ended, connectivity is restored, dead nodes
+     * are restarted, and the method waits until all nodes are active and each node has advanced at least one round.
+     */
+    @Override
+    public void runChaos(@NonNull final Duration duration) {
+        final Network network = env.network();
+        final TimeManager timeManager = env.timeManager();
+        final Instant endTime = timeManager.now().plus(duration);
+
+        // Running experiments, ordered by the moment they have to be ended.
+        final PriorityQueue<Experiment> runningExperiments =
+                new PriorityQueue<>(Comparator.comparing(Experiment::endTime));
+        Instant nextStart = calculateNextStart(randotron, timeManager.now());
+
+        while (timeManager.now().isBefore(endTime)) {
+            final Instant nextBreak = findEarliestInstant(endTime, nextStart, nextExperimentEnd(runningExperiments));
+            // Starting or ending an experiment may itself take time, so the next break can already lie in the
+            // past. Only wait for a strictly positive duration.
+            final Duration waitTime = Duration.between(timeManager.now(), nextBreak);
+            if (!waitTime.isNegative() && !waitTime.isZero()) {
+                timeManager.waitFor(waitTime);
+            }
+
+            // End every experiment whose end time is now or in the past.
+            while (nextExperimentEnd(runningExperiments)
+                    .isBefore(timeManager.now().plusNanos(1L))) {
+                final Experiment finishedExperiment = runningExperiments.poll();
+                assert finishedExperiment != null; // nextExperimentEnd would have returned Instant.MAX if empty
+                finishedExperiment.end();
+            }
+
+            if (nextStart.isBefore(timeManager.now().plusNanos(1L))) {
+                // The factory returns null when the selected experiment type cannot be applied right now.
+                final Experiment experiment = factory.createExperiment();
+                if (experiment != null) {
+                    statistics.merge(experiment.getClass(), 1, Integer::sum);
+                    runningExperiments.add(experiment);
+                }
+                nextStart = calculateNextStart(randotron, timeManager.now());
+            }
+        }
+
+        log.info("Chaos bot finished. Statistics of experiments run:");
+        for (final Map.Entry<Class<? extends Experiment>, Integer> entry : statistics.entrySet()) {
+            log.info(" {}: {}", entry.getKey().getSimpleName(), entry.getValue());
+        }
+
+        // End any remaining experiments. Calling end() lets each experiment revert its own change and release its
+        // bookkeeping (e.g. the static sets of affected nodes) before the global reset below.
+        Experiment remainingExperiment;
+        while ((remainingExperiment = runningExperiments.poll()) != null) {
+            remainingExperiment.end();
+        }
+        network.restoreConnectivity();
+        for (final Node node : network.nodes()) {
+            if (!node.isAlive()) {
+                node.start();
+            }
+        }
+
+        // Wait until all nodes are active again.
+        timeManager.waitForCondition(
+                network::allNodesAreActive,
+                Duration.ofMinutes(5L),
+                "Not all nodes became active again after chaos bot finished");
+
+        // Check that all nodes make progress
+        for (final Node node : network.nodes()) {
+            final SingleNodeConsensusResult consensusResult = node.newConsensusResult();
+            final long currentRound = consensusResult.lastRoundNum();
+            timeManager.waitForCondition(
+                    () -> consensusResult.lastRoundNum() > currentRound,
+                    Duration.ofSeconds(30L),
+                    "Node " + node.selfId() + " did not make progress after chaos bot finished");
+        }
+    }
+
+    /**
+     * Returns the end time of the experiment that finishes first, or {@link Instant#MAX} if none is running.
+     */
+    @NonNull
+    private static Instant nextExperimentEnd(@NonNull final PriorityQueue<Experiment> runningExperiments) {
+        return runningExperiments.isEmpty()
+                ? Instant.MAX
+                : runningExperiments.peek().endTime();
+    }
+
+    /**
+     * Returns the earliest of the three given instants.
+     */
+    @NonNull
+    private static Instant findEarliestInstant(
+            @NonNull final Instant i1, @NonNull final Instant i2, @NonNull final Instant i3) {
+        return i1.isBefore(i2) ? (i1.isBefore(i3) ? i1 : i3) : (i2.isBefore(i3) ? i2 : i3);
+    }
+
+    /**
+     * Draws the start time of the next experiment: {@code now} plus a Gaussian-distributed interval.
+     */
+    @NonNull
+    private Instant calculateNextStart(@NonNull final Randotron randotron, @NonNull final Instant now) {
+        return now.plus(randomGaussianDuration(randotron, CHAOS_INTERVAL, CHAOS_DEVIATION));
+    }
+}
diff --git a/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/chaosbot/internal/Experiment.java b/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/chaosbot/internal/Experiment.java
new file mode 100644
index 000000000000..f3280e96fbd0
--- /dev/null
+++ b/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/chaosbot/internal/Experiment.java
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: Apache-2.0
+package org.hiero.otter.fixtures.chaosbot.internal;
+
+import static java.util.Objects.requireNonNull;
+
+import edu.umd.cs.findbugs.annotations.NonNull;
+import java.time.Instant;
+import org.hiero.otter.fixtures.Network;
+
+/**
+ * An experiment that modifies the network or individual nodes in some way for a limited time.
+ */
+public abstract class Experiment {
+
+    protected final Network network;
+    protected final Instant endTime;
+
+    /**
+     * Create a new experiment.
+     *
+     * @param network the network of the test environment
+     * @param endTime the moment this experiment should end
+     */
+    protected Experiment(@NonNull final Network network, @NonNull final Instant endTime) {
+        this.network = requireNonNull(network);
+        this.endTime = requireNonNull(endTime);
+    }
+
+    /**
+     * The moment this experiment should end.
+     *
+     * @return the end time of the experiment
+     */
+    @NonNull
+    public Instant endTime() {
+        return endTime;
+    }
+
+    /**
+     * End the experiment, reverting any changes.
+     */
+    public abstract void end();
+}
diff --git a/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/chaosbot/internal/ExperimentFactory.java b/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/chaosbot/internal/ExperimentFactory.java
new file mode 100644
index 000000000000..e415214755f1
--- /dev/null
+++ b/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/chaosbot/internal/ExperimentFactory.java
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: Apache-2.0
+package org.hiero.otter.fixtures.chaosbot.internal;
+
+import static java.util.Objects.requireNonNull;
+
+import com.swirlds.common.test.fixtures.Randotron;
+import edu.umd.cs.findbugs.annotations.NonNull;
+import edu.umd.cs.findbugs.annotations.Nullable;
+import org.hiero.otter.fixtures.TestEnvironment;
+
+/**
+ * Factory for creating random experiments.
+ *
+ * <p>The experiment type is chosen with the {@link Randotron} passed to the constructor, and the same generator is
+ * handed on to the {@code create} method of the chosen experiment.
+ */
+public class ExperimentFactory {
+
+    private final TestEnvironment env;
+    private final Randotron randotron;
+
+    /**
+     * Create a new experiment factory.
+     *
+     * @param env the test environment
+     * @param randotron the random number generator
+     */
+    public ExperimentFactory(@NonNull final TestEnvironment env, @NonNull final Randotron randotron) {
+        this.env = requireNonNull(env);
+        this.randotron = requireNonNull(randotron);
+    }
+
+    /**
+     * Create a new random experiment.
+     *
+     * <p>One value is drawn from the generator to select one of the five experiment types. The result of the
+     * selected type's {@code create} method is returned unchanged, including {@code null}.
+     *
+     * @return the created experiment, or {@code null} if no suitable experiment could be created
+     */
+    @Nullable
+    public Experiment createExperiment() {
+        // For now, we assume all experiments are equally likely. Will become configurable in the future.
+        final int experimentType = randotron.nextInt(5);
+        return switch (experimentType) {
+            case 0 -> HighLatencyNodeExperiment.create(env, randotron);
+            case 1 -> LowBandwidthNodeExperiment.create(env, randotron);
+            case 2 -> NetworkPartitionExperiment.create(env, randotron);
+            case 3 -> NodeFailureExperiment.create(env, randotron);
+            case 4 -> NodeIsolationExperiment.create(env, randotron);
+            // NOTE(review): assumes Randotron.nextInt(5) only returns 0..4, as java.util.Random does - confirm.
+            default -> throw new IllegalStateException("Unreachable code reached in ExperimentFactory");
+        };
+    }
+}
diff --git a/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/chaosbot/internal/HighLatencyNodeExperiment.java b/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/chaosbot/internal/HighLatencyNodeExperiment.java
new file mode 100644
index 000000000000..2d7e17507d26
--- /dev/null
+++ b/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/chaosbot/internal/HighLatencyNodeExperiment.java
@@ -0,0 +1,89 @@
+// SPDX-License-Identifier: Apache-2.0
+package org.hiero.otter.fixtures.chaosbot.internal;
+
+import static org.hiero.otter.fixtures.chaosbot.internal.RandomUtil.randomGaussianDuration;
+
+import com.swirlds.common.test.fixtures.Randotron;
+import edu.umd.cs.findbugs.annotations.NonNull;
+import edu.umd.cs.findbugs.annotations.Nullable;
+import java.time.Duration;
+import java.time.Instant;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.assertj.core.data.Percentage;
+import org.hiero.otter.fixtures.Network;
+import org.hiero.otter.fixtures.Node;
+import org.hiero.otter.fixtures.TestEnvironment;
+import org.hiero.otter.fixtures.TimeManager;
+import org.hiero.otter.fixtures.network.utils.LatencyRange;
+
+/**
+ * An experiment that introduces high latency to all connections of a target node for a duration.
+ */
+public class HighLatencyNodeExperiment extends Experiment {
+
+    private static final Logger log = LogManager.getLogger();
+
+    // These values will become configurable in the future.
+    private static final Duration MEAN_DURATION = Duration.ofMinutes(2L);
+    private static final Duration DURATION_DEVIATION = Duration.ofSeconds(30L);
+    private static final Duration MEAN_LATENCY = Duration.ofMillis(2_000L);
+    private static final Duration LATENCY_DEVIATION = Duration.ofMillis(500L);
+    private static final Percentage JITTER = Percentage.withPercentage(10);
+
+    // Nodes that currently have a high-latency experiment applied. The set is static, i.e. shared by all
+    // experiments in the JVM; a node is added in create() and only removed again in end().
+    // NOTE(review): an experiment that is never ended leaves its node excluded from later experiments.
+    private static final Set<Node> affectedNodes = new HashSet<>();
+
+    private final Node targetNode;
+
+    private HighLatencyNodeExperiment(
+            @NonNull final Network network, @NonNull final Instant endTime, @NonNull final Node targetNode) {
+        super(network, endTime);
+        this.targetNode = targetNode;
+    }
+
+    /**
+     * Creates and starts a high-latency node experiment.
+     *
+     * <p>The target is picked at random among the nodes that do not already have a high-latency experiment
+     * applied. Latency and duration are drawn from Gaussian distributions.
+     *
+     * @param env the test environment
+     * @param randotron the random number generator
+     * @return the created experiment, or {@code null} if no suitable node was found
+     */
+    @Nullable
+    public static HighLatencyNodeExperiment create(
+            @NonNull final TestEnvironment env, @NonNull final Randotron randotron) {
+        final TimeManager timeManager = env.timeManager();
+        final Network network = env.network();
+        final List<Node> candidates = network.nodes().stream()
+                .filter(node -> !affectedNodes.contains(node))
+                .toList();
+        if (candidates.isEmpty()) {
+            log.info("No available nodes to apply high latency experiment.");
+            return null;
+        }
+        final Node targetNode = candidates.get(randotron.nextInt(candidates.size()));
+        final LatencyRange latencyRange =
+                LatencyRange.of(randomGaussianDuration(randotron, MEAN_LATENCY, LATENCY_DEVIATION), JITTER);
+        network.setLatencyForAllConnections(targetNode, latencyRange);
+        affectedNodes.add(targetNode);
+        final Duration duration = randomGaussianDuration(randotron, MEAN_DURATION, DURATION_DEVIATION);
+        log.info(
+                "Started high latency node experiment for node {} with latency range {} and duration {}.",
+                targetNode.selfId(),
+                latencyRange,
+                duration);
+        return new HighLatencyNodeExperiment(network, timeManager.now().plus(duration), targetNode);
+    }
+
+    /**
+     * {@inheritDoc}
+     *
+     * <p>Restores the default latency of the target node and makes the node eligible for new high-latency
+     * experiments again.
+     */
+    @Override
+    public void end() {
+        network.restoreLatencyForAllConnections(targetNode);
+        affectedNodes.remove(targetNode);
+        log.info("Ended high latency experiment for node {}.", targetNode.selfId());
+    }
+}
diff --git a/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/chaosbot/internal/LowBandwidthNodeExperiment.java b/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/chaosbot/internal/LowBandwidthNodeExperiment.java
new file mode 100644
index 000000000000..99c96d43b24f
--- /dev/null
+++ b/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/chaosbot/internal/LowBandwidthNodeExperiment.java
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: Apache-2.0
+package org.hiero.otter.fixtures.chaosbot.internal;
+
+import static org.hiero.otter.fixtures.chaosbot.internal.RandomUtil.randomGaussianBandwidthLimit;
+import static org.hiero.otter.fixtures.chaosbot.internal.RandomUtil.randomGaussianDuration;
+
+import com.swirlds.common.test.fixtures.Randotron;
+import edu.umd.cs.findbugs.annotations.NonNull;
+import edu.umd.cs.findbugs.annotations.Nullable;
+import java.time.Duration;
+import java.time.Instant;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.hiero.otter.fixtures.Network;
+import org.hiero.otter.fixtures.Node;
+import org.hiero.otter.fixtures.TestEnvironment;
+import org.hiero.otter.fixtures.TimeManager;
+import org.hiero.otter.fixtures.network.utils.BandwidthLimit;
+
+/**
+ * An experiment that introduces low bandwidth to all connections of a target node for a duration.
+ */
+public class LowBandwidthNodeExperiment extends Experiment {
+
+    private static final Logger log = LogManager.getLogger();
+
+    // These values will become configurable in the future.
+    private static final Duration MEAN_DURATION = Duration.ofMinutes(2L);
+    private static final Duration DURATION_DEVIATION = Duration.ofSeconds(30L);
+    private static final BandwidthLimit MEAN_BANDWIDTH_LIMIT = BandwidthLimit.ofKilobytesPerSecond(5);
+    private static final BandwidthLimit BANDWIDTH_DEVIATION = BandwidthLimit.ofKilobytesPerSecond(2);
+
+    // Nodes that currently have a low-bandwidth experiment applied. The set is static, i.e. shared by all
+    // experiments in the JVM; a node is added in create() and only removed again in end().
+    // NOTE(review): an experiment that is never ended leaves its node excluded from later experiments.
+    private static final Set<Node> affectedNodes = new HashSet<>();
+
+    private final Node targetNode;
+
+    private LowBandwidthNodeExperiment(
+            @NonNull final Network network, @NonNull final Instant endTime, @NonNull final Node targetNode) {
+        super(network, endTime);
+        this.targetNode = targetNode;
+    }
+
+    /**
+     * Creates and starts a low-bandwidth node experiment.
+     *
+     * <p>The target is picked at random among the nodes that do not already have a low-bandwidth experiment
+     * applied. Bandwidth limit and duration are drawn from Gaussian distributions.
+     *
+     * @param env the test environment
+     * @param randotron the random number generator
+     * @return the created experiment, or {@code null} if no suitable node was found
+     */
+    @Nullable
+    public static LowBandwidthNodeExperiment create(
+            @NonNull final TestEnvironment env, @NonNull final Randotron randotron) {
+        final TimeManager timeManager = env.timeManager();
+        final Network network = env.network();
+        final List<Node> candidates = network.nodes().stream()
+                .filter(node -> !affectedNodes.contains(node))
+                .toList();
+        if (candidates.isEmpty()) {
+            log.info("No available nodes to apply low bandwidth experiment.");
+            return null;
+        }
+        final Node targetNode = candidates.get(randotron.nextInt(candidates.size()));
+        final BandwidthLimit bandwidthLimit =
+                randomGaussianBandwidthLimit(randotron, MEAN_BANDWIDTH_LIMIT, BANDWIDTH_DEVIATION);
+        network.setBandwidthForAllConnections(targetNode, bandwidthLimit);
+        affectedNodes.add(targetNode);
+        final Duration duration = randomGaussianDuration(randotron, MEAN_DURATION, DURATION_DEVIATION);
+        log.info(
+                "Starting low bandwidth node experiment for node {} with bandwidth limit {} and duration {}.",
+                targetNode.selfId(),
+                bandwidthLimit,
+                duration);
+        return new LowBandwidthNodeExperiment(network, timeManager.now().plus(duration), targetNode);
+    }
+
+    /**
+     * {@inheritDoc}
+     *
+     * <p>Restores the default bandwidth limits of the target node and makes the node eligible for new
+     * low-bandwidth experiments again.
+     */
+    @Override
+    public void end() {
+        network.restoreBandwidthLimitsForAllConnections(targetNode);
+        affectedNodes.remove(targetNode);
+        log.info("Ended low bandwidth experiment for node {}.", targetNode.selfId());
+    }
+}
diff --git a/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/chaosbot/internal/NetworkPartitionExperiment.java b/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/chaosbot/internal/NetworkPartitionExperiment.java
new file mode 100644
index 000000000000..e13a1a700722
--- /dev/null
+++ b/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/chaosbot/internal/NetworkPartitionExperiment.java
@@ -0,0 +1,96 @@
+// SPDX-License-Identifier: Apache-2.0
+package org.hiero.otter.fixtures.chaosbot.internal;
+
+import static org.hiero.otter.fixtures.chaosbot.internal.RandomUtil.randomGaussianDuration;
+
+import com.swirlds.common.test.fixtures.Randotron;
+import edu.umd.cs.findbugs.annotations.NonNull;
+import edu.umd.cs.findbugs.annotations.Nullable;
+import java.time.Duration;
+import java.time.Instant;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Set;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.hiero.otter.fixtures.Network;
+import org.hiero.otter.fixtures.Node;
+import org.hiero.otter.fixtures.TestEnvironment;
+import org.hiero.otter.fixtures.TimeManager;
+import org.hiero.otter.fixtures.network.Partition;
+
+/**
+ * An experiment that creates a network partition for a duration.
+ */
+public class NetworkPartitionExperiment extends Experiment {
+
+    private static final Logger log = LogManager.getLogger();
+
+    // These values will become configurable in the future.
+    private static final int MAX_PARTITIONS = 3;
+    private static final double MIN_PARTITION_FRACTION = 0.2;
+    private static final double MAX_PARTITION_FRACTION = 0.8;
+    private static final Duration MEAN_DURATION = Duration.ofMinutes(2L);
+    private static final Duration DURATION_DEVIATION = Duration.ofSeconds(30L);
+
+    private final Partition partition;
+
+    private NetworkPartitionExperiment(
+            @NonNull final Network network, @NonNull final Instant endTime, @NonNull final Partition partition) {
+        super(network, endTime);
+        this.partition = partition;
+    }
+
+    /**
+     * Creates and starts a network partition experiment.
+     *
+     * <p>The new partition is carved out of all nodes if no partition exists yet, otherwise out of the largest
+     * existing partition. Its members are distinct nodes chosen at random, and at least one node of the source
+     * group is always left outside the new partition.
+     *
+     * @param env the test environment
+     * @param randotron the random number generator
+     * @return the created experiment, or {@code null} if there are already too many partitions or too few nodes
+     * to split
+     */
+    @Nullable
+    public static NetworkPartitionExperiment create(
+            @NonNull final TestEnvironment env, @NonNull final Randotron randotron) {
+        final TimeManager timeManager = env.timeManager();
+        final Network network = env.network();
+        if (network.networkPartitions().size() >= MAX_PARTITIONS) {
+            log.info(
+                    "Network has already the maximum number ({}) of partitions. Skipping network partition experiment.",
+                    MAX_PARTITIONS);
+            return null;
+        }
+        final List<Node> nodes;
+        if (network.networkPartitions().isEmpty()) {
+            nodes = network.nodes();
+        } else {
+            nodes = new ArrayList<>(network.networkPartitions().stream()
+                    .map(Partition::nodes)
+                    .max(Comparator.comparing(Set::size))
+                    .orElseThrow());
+        }
+        if (nodes.size() < 2) {
+            log.info("Not enough nodes to split off a partition. Skipping network partition experiment.");
+            return null;
+        }
+        final double partitionFraction =
+                MIN_PARTITION_FRACTION + (randotron.nextDouble() * (MAX_PARTITION_FRACTION - MIN_PARTITION_FRACTION));
+        // Never take every node: a partition that contains the whole group would not separate anything.
+        final int partitionSize = Math.min(nodes.size() - 1, (int) Math.ceil(nodes.size() * partitionFraction));
+        // Draw without replacement so that the partition really has partitionSize distinct members.
+        final List<Node> remainingNodes = new ArrayList<>(nodes);
+        final List<Node> partitionNodes = new ArrayList<>(partitionSize);
+        for (int i = 0; i < partitionSize; i++) {
+            partitionNodes.add(remainingNodes.remove(randotron.nextInt(remainingNodes.size())));
+        }
+        final Partition partition = network.createNetworkPartition(partitionNodes);
+        final Duration duration = randomGaussianDuration(randotron, MEAN_DURATION, DURATION_DEVIATION);
+        log.info(
+                "Starting network partition experiment with partition size {} and duration {}.",
+                partitionSize,
+                duration);
+        return new NetworkPartitionExperiment(network, timeManager.now().plus(duration), partition);
+    }
+
+    /**
+     * {@inheritDoc}
+     *
+     * <p>Removes the partition that was created for this experiment.
+     */
+    @Override
+    public void end() {
+        network.removeNetworkPartition(partition);
+        log.info("Ended network partition experiment.");
+    }
+}
diff --git a/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/chaosbot/internal/NodeFailureExperiment.java b/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/chaosbot/internal/NodeFailureExperiment.java
new file mode 100644
index 000000000000..363a89376484
--- /dev/null
+++ b/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/chaosbot/internal/NodeFailureExperiment.java
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: Apache-2.0
+package org.hiero.otter.fixtures.chaosbot.internal;
+
+import static org.hiero.otter.fixtures.chaosbot.internal.RandomUtil.randomGaussianDuration;
+
+import com.swirlds.common.test.fixtures.Randotron;
+import
edu.umd.cs.findbugs.annotations.NonNull;
+import edu.umd.cs.findbugs.annotations.Nullable;
+import java.time.Duration;
+import java.time.Instant;
+import java.util.List;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.hiero.otter.fixtures.Network;
+import org.hiero.otter.fixtures.Node;
+import org.hiero.otter.fixtures.TestEnvironment;
+import org.hiero.otter.fixtures.TimeManager;
+
+/**
+ * An experiment that kills a target node for a duration.
+ */
+public class NodeFailureExperiment extends Experiment {
+
+    private static final Logger log = LogManager.getLogger();
+
+    // These values will become configurable in the future.
+    private static final Duration MEAN_DURATION = Duration.ofMinutes(2L);
+    private static final Duration DURATION_DEVIATION = Duration.ofSeconds(30L);
+
+    private final Node targetNode;
+
+    private NodeFailureExperiment(
+            @NonNull final Network network, @NonNull final Instant endTime, @NonNull final Node targetNode) {
+        super(network, endTime);
+        this.targetNode = targetNode;
+    }
+
+    /**
+     * Creates and starts a node failure experiment.
+     *
+     * <p>The target is picked at random among the nodes that are currently alive and is killed immediately. The
+     * duration of the outage is drawn from a Gaussian distribution.
+     *
+     * @param env the test environment
+     * @param randotron the random number generator
+     * @return the created experiment, or {@code null} if no suitable node was found
+     */
+    @Nullable
+    public static NodeFailureExperiment create(@NonNull final TestEnvironment env, @NonNull final Randotron randotron) {
+        final TimeManager timeManager = env.timeManager();
+        final Network network = env.network();
+        final List<Node> candidates =
+                network.nodes().stream().filter(Node::isAlive).toList();
+        if (candidates.isEmpty()) {
+            log.info("No available nodes to apply node failure experiment.");
+            return null;
+        }
+        final Node targetNode = candidates.get(randotron.nextInt(candidates.size()));
+        targetNode.killImmediately();
+        final Duration duration = randomGaussianDuration(randotron, MEAN_DURATION, DURATION_DEVIATION);
+        log.info("Starting node failure experiment for node {} with duration {}.", targetNode.selfId(), duration);
+        return new NodeFailureExperiment(network, timeManager.now().plus(duration), targetNode);
+    }
+
+    /**
+     * {@inheritDoc}
+     *
+     * <p>Starts the node that was killed when the experiment was created.
+     */
+    @Override
+    public void end() {
+        targetNode.start();
+        log.info("Ended node failure experiment for node {}.", targetNode.selfId());
+    }
+}
diff --git a/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/chaosbot/internal/NodeIsolationExperiment.java b/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/chaosbot/internal/NodeIsolationExperiment.java
new file mode 100644
index 000000000000..39168137dcfe
--- /dev/null
+++ b/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/chaosbot/internal/NodeIsolationExperiment.java
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: Apache-2.0
+package org.hiero.otter.fixtures.chaosbot.internal;
+
+import static org.hiero.otter.fixtures.chaosbot.internal.RandomUtil.randomGaussianDuration;
+
+import com.swirlds.common.test.fixtures.Randotron;
+import edu.umd.cs.findbugs.annotations.NonNull;
+import
edu.umd.cs.findbugs.annotations.Nullable;
+import java.time.Duration;
+import java.time.Instant;
+import java.util.List;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.hiero.otter.fixtures.Network;
+import org.hiero.otter.fixtures.Node;
+import org.hiero.otter.fixtures.TestEnvironment;
+import org.hiero.otter.fixtures.TimeManager;
+
+/**
+ * An experiment that isolates a target node from the network for a duration.
+ */
+public class NodeIsolationExperiment extends Experiment {
+
+    private static final Logger log = LogManager.getLogger();
+
+    // These values will become configurable in the future.
+    private static final Duration MEAN_DURATION = Duration.ofMinutes(2L);
+    private static final Duration DURATION_DEVIATION = Duration.ofSeconds(30L);
+
+    private final Node targetNode;
+
+    private NodeIsolationExperiment(
+            @NonNull final Network network, @NonNull final Instant endTime, @NonNull final Node targetNode) {
+        super(network, endTime);
+        this.targetNode = targetNode;
+    }
+
+    /**
+     * Creates and starts a node isolation experiment.
+     *
+     * <p>The target is picked at random among the nodes that are not isolated at the moment. The duration of the
+     * isolation is drawn from a Gaussian distribution.
+     *
+     * @param env the test environment
+     * @param randotron the random number generator
+     * @return the created experiment, or {@code null} if no suitable node was found
+     */
+    @Nullable
+    public static NodeIsolationExperiment create(
+            @NonNull final TestEnvironment env, @NonNull final Randotron randotron) {
+        final TimeManager timeManager = env.timeManager();
+        final Network network = env.network();
+        final List<Node> candidates = network.nodes().stream()
+                .filter(node -> !network.isIsolated(node))
+                .toList();
+        if (candidates.isEmpty()) {
+            log.info("No available nodes to apply node isolation experiment.");
+            return null;
+        }
+        final Node targetNode = candidates.get(randotron.nextInt(candidates.size()));
+        network.isolate(targetNode);
+        final Duration duration = randomGaussianDuration(randotron, MEAN_DURATION, DURATION_DEVIATION);
+        log.info("Starting node isolation experiment for node {} with duration {}.", targetNode.selfId(), duration);
+        return new NodeIsolationExperiment(network, timeManager.now().plus(duration), targetNode);
+    }
+
+    /**
+     * {@inheritDoc}
+     *
+     * <p>Lets the isolated node rejoin the network.
+     */
+    @Override
+    public void end() {
+        network.rejoin(targetNode);
+        log.info("Ended node isolation experiment for node {}.", targetNode.selfId());
+    }
+}
diff --git a/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/chaosbot/internal/RandomUtil.java b/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/chaosbot/internal/RandomUtil.java
new file mode 100644
index 000000000000..140b9497b7e1
--- /dev/null
+++ b/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/chaosbot/internal/RandomUtil.java
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: Apache-2.0
+package org.hiero.otter.fixtures.chaosbot.internal;
+
+import com.swirlds.common.test.fixtures.Randotron;
+import edu.umd.cs.findbugs.annotations.NonNull;
+import java.time.Duration;
+import org.hiero.otter.fixtures.network.utils.BandwidthLimit;
+
+/**
+ * Utility
methods for generating random values.
+ */
+public class RandomUtil {
+
+    private RandomUtil() {}
+
+    /**
+     * Generate a random duration based on a Gaussian distribution.
+     *
+     * <p>The calculation is done in milliseconds, so sub-second means and standard deviations (for example a
+     * latency deviation of 500 ms) are honoured. The result never drops below one second when {@code mean} is at
+     * least one second, and never below one millisecond otherwise.
+     *
+     * @param randotron the random number generator
+     * @param mean the mean duration
+     * @param stdDev the standard deviation of the duration
+     * @return a random duration
+     */
+    public static Duration randomGaussianDuration(
+            @NonNull final Randotron randotron, @NonNull final Duration mean, @NonNull final Duration stdDev) {
+        final long meanMillis = mean.toMillis();
+        // Keep the one-second lower bound for means of a second or more; shorter means only have to stay positive.
+        final long floorMillis = meanMillis >= 1_000L ? 1_000L : 1L;
+        // Duration.getSeconds() would truncate a sub-second stdDev to 0 and silently disable the jitter.
+        final long jitterMillis = (long) (randotron.nextGaussian() * stdDev.toMillis());
+        return Duration.ofMillis(Math.max(floorMillis, meanMillis + jitterMillis));
+    }
+
+    /**
+     * Generate a random bandwidth limit based on a Gaussian distribution.
+     *
+     * <p>The result is clamped to at least one kilobyte per second.
+     *
+     * @param randotron the random number generator
+     * @param mean the mean bandwidth limit
+     * @param stdDev the standard deviation of the bandwidth limit
+     * @return a random bandwidth limit
+     */
+    public static BandwidthLimit randomGaussianBandwidthLimit(
+            @NonNull final Randotron randotron,
+            @NonNull final BandwidthLimit mean,
+            @NonNull final BandwidthLimit stdDev) {
+        final int jitterKbps = (int) (randotron.nextGaussian() * stdDev.toKilobytesPerSecond());
+        return BandwidthLimit.ofKilobytesPerSecond(Math.max(1, mean.toKilobytesPerSecond() + jitterKbps));
+    }
+}
diff --git a/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/container/ContainerTestEnvironment.java b/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/container/ContainerTestEnvironment.java
index b1b94f2fdee4..b613bc445897 100644
--- a/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/container/ContainerTestEnvironment.java
+++ b/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/container/ContainerTestEnvironment.java
@@ -18,6 +18,8 @@
 import org.hiero.otter.fixtures.TestEnvironment;
 import org.hiero.otter.fixtures.TimeManager;
import org.hiero.otter.fixtures.TransactionGenerator; +import org.hiero.otter.fixtures.chaosbot.ChaosBot; +import org.hiero.otter.fixtures.chaosbot.internal.ChaosBotImpl; import org.hiero.otter.fixtures.internal.RegularTimeManager; /** @@ -115,6 +117,24 @@ public TransactionGenerator transactionGenerator() { return transactionGenerator; } + /** + * {@inheritDoc} + */ + @Override + @NonNull + public ChaosBot createChaosBot() { + return new ChaosBotImpl(this); + } + + /** + * {@inheritDoc} + */ + @Override + @NonNull + public ChaosBot createChaosBot(final long seed) { + return new ChaosBotImpl(this, seed); + } + /** * {@inheritDoc} */ diff --git a/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/internal/AbstractNetwork.java b/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/internal/AbstractNetwork.java index 49f5f5571676..2cbfaf4dd118 100644 --- a/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/internal/AbstractNetwork.java +++ b/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/internal/AbstractNetwork.java @@ -476,12 +476,27 @@ public boolean isIsolated(@NonNull final Node node) { * {@inheritDoc} */ @Override - public void setLatencyForAllConnections(@NonNull final Node sender, @NonNull final LatencyRange latencyRange) { - log.info("Setting latency for all connections from node {} to range {}", sender.selfId(), latencyRange); - for (final Node receiver : nodes()) { - if (!receiver.equals(sender)) { - setLatencyRange(sender, receiver, latencyRange); - setLatencyRange(receiver, sender, latencyRange); + public void setLatencyForAllConnections(@NonNull final Node node, @NonNull final LatencyRange latencyRange) { + log.info("Setting latency for all connections from node {} to range {}", node.selfId(), latencyRange); + for (final Node otherNode : nodes()) { + if (!node.equals(otherNode)) { + setLatencyRange(node, otherNode, latencyRange); 
+ setLatencyRange(otherNode, node, latencyRange); + } + } + updateConnections(); + } + + /** + * {@inheritDoc} + */ + @Override + public void restoreLatencyForAllConnections(@NonNull final Node node) { + log.info("Restoring latency for all connections from node {}", node); + for (final Node otherNode : nodes()) { + if (!node.equals(otherNode)) { + latencyOverrides.remove(new ConnectionKey(node.selfId(), otherNode.selfId())); + latencyOverrides.remove(new ConnectionKey(otherNode.selfId(), node.selfId())); } } updateConnections(); @@ -502,13 +517,24 @@ private void setLatencyRange( * {@inheritDoc} */ @Override - public void setBandwidthForAllConnections( - @NonNull final Node sender, @NonNull final BandwidthLimit bandwidthLimit) { - log.info("Setting bandwidth for all connections from node {} to {}", sender.selfId(), bandwidthLimit); - for (final Node receiver : nodes()) { - if (!receiver.equals(sender)) { - bandwidthOverrides.put(new ConnectionKey(sender.selfId(), receiver.selfId()), bandwidthLimit); - bandwidthOverrides.put(new ConnectionKey(receiver.selfId(), sender.selfId()), bandwidthLimit); + public void setBandwidthForAllConnections(@NonNull final Node node, @NonNull final BandwidthLimit bandwidthLimit) { + log.info("Setting bandwidth for all connections from node {} to {}", node.selfId(), bandwidthLimit); + for (final Node otherNode : nodes()) { + if (!node.equals(otherNode)) { + bandwidthOverrides.put(new ConnectionKey(node.selfId(), otherNode.selfId()), bandwidthLimit); + bandwidthOverrides.put(new ConnectionKey(otherNode.selfId(), node.selfId()), bandwidthLimit); + } + } + updateConnections(); + } + + @Override + public void restoreBandwidthLimitsForAllConnections(@NonNull final Node node) { + log.info("Restoring bandwidth for all connections from node {}", node); + for (final Node otherNode : nodes()) { + if (!node.equals(otherNode)) { + bandwidthOverrides.remove(new ConnectionKey(node.selfId(), otherNode.selfId())); + bandwidthOverrides.remove(new 
ConnectionKey(otherNode.selfId(), node.selfId())); } } updateConnections(); diff --git a/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/network/utils/BandwidthLimit.java b/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/network/utils/BandwidthLimit.java index 764a9caae8b1..d010ef6f57d5 100644 --- a/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/network/utils/BandwidthLimit.java +++ b/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/network/utils/BandwidthLimit.java @@ -86,4 +86,9 @@ public int toMegabytesPerSecond() { public boolean isUnlimited() { return kilobytesPerSecond == UNLIMITED_KILOBYTES_PER_SECOND; } + + @Override + public String toString() { + return "BandwidthLimit{" + "KB/s=" + kilobytesPerSecond + '}'; + } } diff --git a/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/turtle/TurtleTestEnvironment.java b/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/turtle/TurtleTestEnvironment.java index 9ac48ef684c1..f2ea8efa3c92 100644 --- a/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/turtle/TurtleTestEnvironment.java +++ b/platform-sdk/consensus-otter-tests/src/testFixtures/java/org/hiero/otter/fixtures/turtle/TurtleTestEnvironment.java @@ -26,6 +26,7 @@ import org.hiero.otter.fixtures.TestEnvironment; import org.hiero.otter.fixtures.TimeManager; import org.hiero.otter.fixtures.TransactionGenerator; +import org.hiero.otter.fixtures.chaosbot.ChaosBot; import org.hiero.otter.fixtures.logging.internal.InMemorySubscriptionManager; import org.hiero.otter.fixtures.turtle.logging.TurtleLogClock; import org.hiero.otter.fixtures.turtle.logging.TurtleLogging; @@ -155,6 +156,24 @@ public TransactionGenerator transactionGenerator() { return transactionGenerator; } + /** + * {@inheritDoc} + */ + @Override + @NonNull + public ChaosBot 
createChaosBot() { + throw new UnsupportedOperationException("ChaosBot is not supported in TurtleTestEnvironment"); + } + + /** + * {@inheritDoc} + */ + @Override + @NonNull + public ChaosBot createChaosBot(final long seed) { + throw new UnsupportedOperationException("ChaosBot is not supported in TurtleTestEnvironment"); + } + /** * {@inheritDoc} */ diff --git a/platform-sdk/consensus-otter-tests/src/testOtter/java/org/hiero/otter/test/ChaosTest.java b/platform-sdk/consensus-otter-tests/src/testOtter/java/org/hiero/otter/test/ChaosTest.java new file mode 100644 index 000000000000..237a3b34070b --- /dev/null +++ b/platform-sdk/consensus-otter-tests/src/testOtter/java/org/hiero/otter/test/ChaosTest.java @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: Apache-2.0 +package org.hiero.otter.test; + +import edu.umd.cs.findbugs.annotations.NonNull; +import java.time.Duration; +import org.hiero.otter.fixtures.Capability; +import org.hiero.otter.fixtures.Network; +import org.hiero.otter.fixtures.OtterTest; +import org.hiero.otter.fixtures.TestEnvironment; +import org.junit.jupiter.api.Disabled; + +/** + * A test that runs chaos experiments on a network of nodes. + */ +public class ChaosTest { + + @OtterTest(requires = Capability.RECONNECT) + @Disabled("This test should only be run manually to verify stability under chaos conditions.") + void chaosTest(@NonNull final TestEnvironment env) { + final Network network = env.network(); + network.addNodes(4); + network.start(); + + env.createChaosBot().runChaos(Duration.ofMinutes(60L)); + } +}