Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
exports org.hiero.otter.fixtures.result;
exports org.hiero.otter.fixtures.container.proto;
exports org.hiero.otter.fixtures.app;
exports org.hiero.otter.fixtures.chaosbot;
exports org.hiero.otter.fixtures.logging.internal;
exports org.hiero.otter.fixtures.internal.helpers to
org.hiero.consensus.otter.docker.app;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,13 @@ default Partition createNetworkPartition(@NonNull final Node node0, @NonNull fin
*/
void setLatencyForAllConnections(@NonNull Node node, @NonNull LatencyRange latencyRange);

/**
 * Restores the default latency for all connections from this node. The default is determined by the topology.
 *
 * <p>NOTE(review): the corresponding bandwidth setter is documented as affecting connections "from and to" the
 * node; confirm whether this method also covers both directions.
 *
 * @param node the node for which to remove custom latencies; must not be {@code null}
 */
void restoreLatencyForAllConnections(@NonNull Node node);

/**
* Sets the bandwidth limit for all connections from and to this node.
*
Expand All @@ -264,6 +271,13 @@ default Partition createNetworkPartition(@NonNull final Node node0, @NonNull fin
*/
void setBandwidthForAllConnections(@NonNull Node node, @NonNull BandwidthLimit bandwidthLimit);

/**
 * Restores the default bandwidth limit for all connections from this node. The default is determined by the
 * topology.
 *
 * <p>NOTE(review): the corresponding setter is documented as affecting connections "from and to" the node;
 * confirm whether this method also covers both directions.
 *
 * @param node the node for which to remove bandwidth limits; must not be {@code null}
 */
void restoreBandwidthLimitsForAllConnections(@NonNull Node node);

/**
* Restore the network connectivity to its original/default state. Removes all partitions, cliques, and custom
* connection settings. The defaults are defined by the {@link Topology} of the network.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import edu.umd.cs.findbugs.annotations.NonNull;
import java.util.Set;
import org.hiero.otter.fixtures.chaosbot.ChaosBot;

/**
* Interface representing the test environment of an Otter test.
Expand Down Expand Up @@ -44,6 +45,24 @@ public interface TestEnvironment {
@NonNull
TransactionGenerator transactionGenerator();

/**
 * Create a chaos bot that can introduce randomized faults into the test environment.
 *
 * <p>Use {@link #createChaosBot(long)} instead when the injected faults need to be reproducible.
 *
 * @return the chaos bot, never {@code null}
 */
@NonNull
ChaosBot createChaosBot();

/**
 * Create a chaos bot that can introduce pseudo-randomized faults into the test environment.
 * The created failures will be reproducible using the same seed.
 *
 * @param seed the seed for randomness; passing the same seed again reproduces the same failures
 * @return the chaos bot, never {@code null}
 */
@NonNull
ChaosBot createChaosBot(long seed);

/**
* Destroys the test environment. Once this method is called, the test environment and all its
* components are no longer usable. This method is idempotent, meaning that it is safe to call
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
// SPDX-License-Identifier: Apache-2.0
package org.hiero.otter.fixtures.chaosbot;

import edu.umd.cs.findbugs.annotations.NonNull;
import java.time.Duration;

/**
 * A chaos bot that introduces randomized faults into the network.
 *
 * <p>Instances are obtained from the test environment ({@code TestEnvironment.createChaosBot()} or its seeded
 * overload).
 */
public interface ChaosBot {

/**
 * Run chaos experiments for the specified duration.
 *
 * <p>NOTE(review): the implementation shipped with this interface ({@code ChaosBotImpl}) runs synchronously and
 * returns only after the chaos phase is over and the network has been checked; confirm whether that is meant to
 * be part of the contract.
 *
 * @param duration the duration to run chaos experiments; must not be {@code null}
 */
void runChaos(@NonNull Duration duration);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
// SPDX-License-Identifier: Apache-2.0
package org.hiero.otter.fixtures.chaosbot.internal;

import static java.util.Objects.requireNonNull;
import static org.hiero.otter.fixtures.chaosbot.internal.RandomUtil.randomGaussianDuration;

import com.swirlds.common.test.fixtures.Randotron;
import edu.umd.cs.findbugs.annotations.NonNull;
import java.time.Duration;
import java.time.Instant;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
import java.util.PriorityQueue;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.hiero.otter.fixtures.Network;
import org.hiero.otter.fixtures.Node;
import org.hiero.otter.fixtures.TestEnvironment;
import org.hiero.otter.fixtures.TimeManager;
import org.hiero.otter.fixtures.chaosbot.ChaosBot;
import org.hiero.otter.fixtures.result.SingleNodeConsensusResult;

/**
* Implementation of a chaos bot that creates random failures in the test environment.
*/
public class ChaosBotImpl implements ChaosBot {

private static final Logger log = LogManager.getLogger();

// These values will become configurable in the future.
private static final Duration CHAOS_INTERVAL = Duration.ofMinutes(3L);
private static final Duration CHAOS_DEVIATION = Duration.ofMinutes(2L);

private final TestEnvironment env;
private final Randotron randotron;
private final ExperimentFactory factory;
private final Map<Class<?>, Integer> statistics = new HashMap<>();

/**
* Create a new chaos bot.
*
* @param env the test environment
*/
public ChaosBotImpl(@NonNull final TestEnvironment env) {
this(env, Randotron.create());
}

/**
* Create a new chaos bot with a specific random seed.
*
* @param env the test environment
* @param seed the random seed
*/
public ChaosBotImpl(@NonNull final TestEnvironment env, final long seed) {
this(env, Randotron.create(seed));
}

private ChaosBotImpl(@NonNull final TestEnvironment env, @NonNull final Randotron randotron) {
this.env = requireNonNull(env);
this.randotron = requireNonNull(randotron);
this.factory = new ExperimentFactory(env, randotron);
}

/**
* {@inheritDoc}
*/
@Override
public void runChaos(@NonNull final Duration duration) {
final Network network = env.network();
final TimeManager timeManager = env.timeManager();
final Instant endTime = timeManager.now().plus(duration);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: I suggest calling this chaosEndTime so as to distinguish it more from the endTime of each experiment.


final PriorityQueue<Experiment> runningExperiments =
new PriorityQueue<>(Comparator.comparing(Experiment::endTime));
Instant nextStart = calculateNextStart(randotron, timeManager.now());

while (timeManager.now().isBefore(endTime)) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: Some simple comments in this method would help readers understand more quickly

final Instant nextBreak = findEarliestInstant(endTime, nextStart, nextExperimentEnd(runningExperiments));
timeManager.waitFor(Duration.between(timeManager.now(), nextBreak));

while (nextExperimentEnd(runningExperiments)
.isBefore(timeManager.now().plusNanos(1L))) {
final Experiment finishedExperiment = runningExperiments.poll();
assert finishedExperiment != null; // nextExperimentEnd would have returned Instant.MAX if empty
finishedExperiment.end();
}

if (nextStart.isBefore(timeManager.now().plusNanos(1L))) {
final Experiment experiment = factory.createExperiment();
if (experiment != null) {
statistics.merge(experiment.getClass(), 1, Integer::sum);
}
if (experiment != null) {
runningExperiments.add(experiment);
}
nextStart = calculateNextStart(randotron, timeManager.now());
}
}

log.info("Chaos bot finished. Statistics of experiments run:");
for (final Map.Entry<Class<?>, Integer> entry : statistics.entrySet()) {
log.info(" {}: {}", entry.getKey().getSimpleName(), entry.getValue());
}

// End any remaining experiments.
network.restoreConnectivity();
for (final Node node : network.nodes()) {
if (!node.isAlive()) {
node.start();
}
}

// Wait until all nodes are active again.
timeManager.waitForCondition(
network::allNodesAreActive,
Duration.ofMinutes(5L),
"Not all nodes became active again after chaos bot finished");

// Check that all nodes make progress
for (final Node node : network.nodes()) {
final SingleNodeConsensusResult consensusResult = node.newConsensusResult();
final long currentRound = consensusResult.lastRoundNum();
timeManager.waitForCondition(
() -> consensusResult.lastRoundNum() > currentRound,
Duration.ofSeconds(30L),
"Node " + node.selfId() + " did not make progress after chaos bot finished");
}
}

@NonNull
private static Instant nextExperimentEnd(@NonNull final PriorityQueue<Experiment> runningExperiments) {
return runningExperiments.isEmpty()
? Instant.MAX
: runningExperiments.peek().endTime();
}

@NonNull
private static Instant findEarliestInstant(
@NonNull final Instant i1, @NonNull final Instant i2, @NonNull final Instant i3) {
return i1.isBefore(i2) ? (i1.isBefore(i3) ? i1 : i3) : (i2.isBefore(i3) ? i2 : i3);
}

@NonNull
private Instant calculateNextStart(@NonNull final Randotron randotron, @NonNull final Instant now) {
return now.plus(randomGaussianDuration(randotron, CHAOS_INTERVAL, CHAOS_DEVIATION));
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
// SPDX-License-Identifier: Apache-2.0
package org.hiero.otter.fixtures.chaosbot.internal;

import static java.util.Objects.requireNonNull;

import edu.umd.cs.findbugs.annotations.NonNull;
import java.time.Instant;
import org.hiero.otter.fixtures.Network;

/**
 * Base class of all chaos experiments. An experiment modifies the network or individual nodes for a limited
 * time; {@link #end()} reverts the modification.
 */
public abstract class Experiment {

    /** The network of the test environment this experiment operates on. */
    protected final Network network;

    /** The moment this experiment should end. */
    protected final Instant endTime;

    /**
     * Create a new experiment.
     *
     * @param network the network of the test environment
     * @param endTime the moment this experiment should end
     * @throws NullPointerException if any argument is {@code null}
     */
    protected Experiment(@NonNull final Network network, @NonNull final Instant endTime) {
        requireNonNull(network);
        requireNonNull(endTime);
        this.network = network;
        this.endTime = endTime;
    }

    /**
     * End the experiment, reverting any changes.
     */
    public abstract void end();

    /**
     * The moment this experiment should end.
     *
     * @return the end time of the experiment
     */
    @NonNull
    public Instant endTime() {
        return endTime;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
// SPDX-License-Identifier: Apache-2.0
package org.hiero.otter.fixtures.chaosbot.internal;

import static java.util.Objects.requireNonNull;

import com.swirlds.common.test.fixtures.Randotron;
import edu.umd.cs.findbugs.annotations.NonNull;
import edu.umd.cs.findbugs.annotations.Nullable;
import org.hiero.otter.fixtures.TestEnvironment;

/**
 * Factory that picks one of the known experiment types at random and creates an experiment of that type.
 */
public class ExperimentFactory {

    /** Number of experiment types {@link #createExperiment()} chooses from. */
    private static final int EXPERIMENT_TYPE_COUNT = 5;

    private final TestEnvironment env;
    private final Randotron randotron;

    /**
     * Create a new experiment factory.
     *
     * @param env the test environment
     * @param randotron the random number generator
     * @throws NullPointerException if any argument is {@code null}
     */
    public ExperimentFactory(@NonNull final TestEnvironment env, @NonNull final Randotron randotron) {
        requireNonNull(env);
        requireNonNull(randotron);
        this.env = env;
        this.randotron = randotron;
    }

    /**
     * Create a new random experiment.
     *
     * @return the created experiment, or {@code null} if no suitable experiment could be created
     */
    @Nullable
    public Experiment createExperiment() {
        // For now, we assume all experiments are equally likely. Will become configurable in the future.
        return switch (randotron.nextInt(EXPERIMENT_TYPE_COUNT)) {
            case 0 -> HighLatencyNodeExperiment.create(env, randotron);
            case 1 -> LowBandwidthNodeExperiment.create(env, randotron);
            case 2 -> NetworkPartitionExperiment.create(env, randotron);
            case 3 -> NodeFailureExperiment.create(env, randotron);
            case 4 -> NodeIsolationExperiment.create(env, randotron);
            default -> throw new IllegalStateException("Unreachable code reached in ExperimentFactory");
        };
    }
}
Loading
Loading