From 6ec8191b3408488c25bfb406523410a0ed395770 Mon Sep 17 00:00:00 2001 From: pmahindrakar-oss Date: Mon, 27 Apr 2026 13:53:49 -0700 Subject: [PATCH 1/7] Added stress test Signed-off-by: pmahindrakar-oss --- examples/stress/sleep_fanout.py | 102 +- examples/stress/sleep_fanout_harness.py | 115 ++- .../stress/sleep_fanout_harness_wrapper.sh | 716 ++++++++++++++ examples/stress/sleep_fanout_swarm_wrapper.sh | 870 ++++++++++++++++++ examples/stress/sleep_fanout_wrapper.sh | 610 ++++++++++++ .../controllers/remote/_controller.py | 39 + .../_internal/controllers/remote/_core.py | 67 +- 7 files changed, 2501 insertions(+), 18 deletions(-) create mode 100755 examples/stress/sleep_fanout_harness_wrapper.sh create mode 100755 examples/stress/sleep_fanout_swarm_wrapper.sh create mode 100755 examples/stress/sleep_fanout_wrapper.sh diff --git a/examples/stress/sleep_fanout.py b/examples/stress/sleep_fanout.py index c1b637561..30b640e10 100644 --- a/examples/stress/sleep_fanout.py +++ b/examples/stress/sleep_fanout.py @@ -1,25 +1,104 @@ import asyncio +import os from datetime import timedelta import flyte import flyte.report from flyte.extras import Sleep +_STRESS_IMAGE_REGISTRY = os.getenv("FLYTE_STRESS_IMAGE_REGISTRY") +_STRESS_IMAGE_NAME = os.getenv("FLYTE_STRESS_IMAGE_NAME") +_STRESS_IMAGE_PLATFORMS = tuple( + p.strip() for p in os.getenv("FLYTE_STRESS_IMAGE_PLATFORMS", "linux/amd64").split(",") if p.strip() +) +_STRESS_RUNTIME_ENV = { + k: v + for k, v in { + "FLYTE_STRESS_IMAGE_REGISTRY": _STRESS_IMAGE_REGISTRY, + "FLYTE_STRESS_IMAGE_NAME": _STRESS_IMAGE_NAME, + "FLYTE_STRESS_IMAGE_PLATFORMS": ",".join(_STRESS_IMAGE_PLATFORMS), + }.items() + if v +} + +# Let remote runs redirect image builds to a writable registry without +# touching the task definitions. For dogfood, this can point at the shared ECR +# repo used for ad hoc SDK test images. Default to amd64-only so the first +# build is faster and matches the dogfood cluster architecture. 
+stress_image = flyte.Image.from_debian_base( + python_version=(3, 12), + registry=_STRESS_IMAGE_REGISTRY, + name=_STRESS_IMAGE_NAME, + platform=_STRESS_IMAGE_PLATFORMS, +) + + +def _fanout_resources() -> flyte.Resources: + # Default for the distributed harness shape. Override these env vars when + # testing a single huge parent that needs much more headroom. + cpu_request = int(os.getenv("FLYTE_STRESS_FANOUT_CPU_REQUEST", "1")) + cpu_limit = int(os.getenv("FLYTE_STRESS_FANOUT_CPU_LIMIT", "2")) + memory_request = os.getenv("FLYTE_STRESS_FANOUT_MEMORY_REQUEST", "2Gi") + memory_limit = os.getenv("FLYTE_STRESS_FANOUT_MEMORY_LIMIT", "4Gi") + return flyte.Resources(cpu=(cpu_request, cpu_limit), memory=(memory_request, memory_limit)) + + +def _controller_tuning_env() -> dict[str, str]: + env: dict[str, str] = {} + for key in ( + "_F_MAX_QPS", + "_F_CTRL_WORKERS", + "_F_P_CNC", + "_U_USE_ACTIONS", + "_F_TRACE_SUBMIT", + "_F_TRACE_SUBMIT_LIMIT", + ): + value = os.getenv(key) + if value is not None: + env[key] = value + return env + + +def _nested_run_env() -> dict[str, str]: + return { + **_STRESS_RUNTIME_ENV, + **_controller_tuning_env(), + } + + +def _controller_tuning_summary() -> str: + env = _controller_tuning_env() + return ( + "controller_env " + f"_F_MAX_QPS={env.get('_F_MAX_QPS', '')} " + f"_F_CTRL_WORKERS={env.get('_F_CTRL_WORKERS', '')} " + f"_F_P_CNC={env.get('_F_P_CNC', '')} " + f"_U_USE_ACTIONS={env.get('_U_USE_ACTIONS', '')} " + f"_F_TRACE_SUBMIT={env.get('_F_TRACE_SUBMIT', '')} " + f"_F_TRACE_SUBMIT_LIMIT={env.get('_F_TRACE_SUBMIT_LIMIT', '')}" + ) + # Leaves run in leaseworker via the core-sleep plugin: no task pods are created, # so we can fan out wide without paying pod-startup cost. 
sleep_env = flyte.TaskEnvironment( name="sleep_fanout_leaf", + image=stress_image, + env_vars=_STRESS_RUNTIME_ENV, plugin_config=Sleep(), ) fanout_env = flyte.TaskEnvironment( name="sleep_fanout", - resources=flyte.Resources(cpu="50m", memory="200Mi"), + image=stress_image, + env_vars=_STRESS_RUNTIME_ENV, + resources=_fanout_resources(), depends_on=[sleep_env], ) swarm_env = flyte.TaskEnvironment( name="sleep_fanout_swarm", + image=stress_image, + env_vars=_STRESS_RUNTIME_ENV, resources=flyte.Resources(cpu=1, memory="500Mi"), depends_on=[fanout_env], ) @@ -41,8 +120,15 @@ async def sleep_fanout( All leaves run in leaseworker via the core-sleep plugin, so no task pods are created. """ + print( + f"fanout_inputs n_children={n_children} " + f"sleep_duration={sleep_duration} " + f"sleep_seconds={sleep_duration.total_seconds()}", + flush=True, + ) + print(_controller_tuning_summary(), flush=True) await asyncio.gather(*(sleep_leaf(duration=sleep_duration) for _ in range(n_children))) - print(f"Done. Total leaves: {n_children}") + print(f"Done. Total leaves: {n_children}", flush=True) return n_children @@ -60,13 +146,19 @@ async def submit_runs( from aiolimiter import AsyncLimiter limiter = AsyncLimiter(max_rps, 1) + child_run_env = _nested_run_env() - async def submit_one() -> str: + async def submit_one(idx: int) -> str: async with limiter: - run = await flyte.run.aio(sleep_fanout, n_children=n_children, sleep_duration=sleep_duration) + run = await flyte.run.aio( + sleep_fanout.override(env_vars=child_run_env), + n_children=n_children, + sleep_duration=sleep_duration, + ) + print(f"submitted_run idx={idx} url={run.url}", flush=True) return run.url - urls = await asyncio.gather(*(submit_one() for _ in range(n_runs))) + urls = await asyncio.gather(*(submit_one(i) for i in range(n_runs))) print(f"Swarm worker done. 
Submitted {len(urls)} runs at <= {max_rps} rps.") return list(urls) diff --git a/examples/stress/sleep_fanout_harness.py b/examples/stress/sleep_fanout_harness.py index 3e500cbb8..2c4fd1e04 100644 --- a/examples/stress/sleep_fanout_harness.py +++ b/examples/stress/sleep_fanout_harness.py @@ -1,5 +1,5 @@ """ -Submit N copies of the sleep_fanout `main` task as fast as possible. +Submit N copies of the `sleep_fanout` task through the `flyte run` CLI. Each run spawns n_children core-sleep leaves in leaseworker (no task pods). Submissions are launched with a bounded semaphore to cap in-flight TCP @@ -9,34 +9,120 @@ import argparse import asyncio +import os +import pathlib +import shutil +import sys import time from datetime import timedelta -from sleep_fanout import sleep_fanout as sleep_fanout_main +import re -import flyte +RUN_URL_RE = re.compile(r"URL:\s+(\S+/runs/[^/?\s]+)") +RUN_NAME_RE = re.compile(r"Created Run:\s+([^\s]+)") +RUNS_FILE = os.getenv("FLYTE_HARNESS_RUNS_FILE") +REPO_ROOT = pathlib.Path(__file__).resolve().parents[2] +LOCAL_SDK_SRC = REPO_ROOT / "src" +FLYTE_BIN = os.getenv("FLYTE_HARNESS_FLYTE_BIN") or shutil.which("flyte") or "flyte" +FORCE_LOCAL_SDK = os.getenv("FLYTE_HARNESS_FORCE_LOCAL_SDK", "").lower() in {"1", "true", "yes", "on"} + + +def _subprocess_env() -> dict[str, str]: + env = os.environ.copy() + if FORCE_LOCAL_SDK: + existing = env.get("PYTHONPATH", "") + local_src = str(LOCAL_SDK_SRC) + env["PYTHONPATH"] = f"{local_src}:{existing}" if existing else local_src + return env async def submit_one(sem: asyncio.Semaphore, idx: int, n_children: int, sleep_duration: timedelta) -> str | None: async with sem: + os.environ.setdefault("_U_USE_ACTIONS", "1") + config = os.getenv("FLYTE_HARNESS_CONFIG", os.path.expanduser("~/.flyte/config-dogfood.yaml")) + image_builder = os.getenv("FLYTE_HARNESS_IMAGE_BUILDER", "remote") + project = os.getenv("FLYTE_HARNESS_PROJECT", "") + domain = os.getenv("FLYTE_HARNESS_DOMAIN", "") + run_env_keys = tuple( + k + 
for k in ( + "_F_MAX_QPS", + "_F_CTRL_WORKERS", + "_F_P_CNC", + "_U_USE_ACTIONS", + "_F_TRACE_SUBMIT", + "_F_TRACE_SUBMIT_LIMIT", + ) + if os.getenv(k) + ) + + cmd = [FLYTE_BIN, "-c", config, "--image-builder", image_builder, "run"] + if project: + cmd.extend(["-p", project]) + if domain: + cmd.extend(["-d", domain]) + for key in run_env_keys: + cmd.extend(["--env", f"{key}={os.environ[key]}"]) + cmd.extend( + [ + "examples/stress/sleep_fanout.py", + "sleep_fanout", + "--n_children", + str(n_children), + "--sleep_duration", + f"PT{int(sleep_duration.total_seconds())}S", + ] + ) + try: - run = await flyte.with_runcontext("remote").run.aio( - sleep_fanout_main, - n_children=n_children, - sleep_duration=sleep_duration, + proc = await asyncio.create_subprocess_exec( + *cmd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.STDOUT, + env=_subprocess_env(), ) - return run.url + + output_lines: list[str] = [] + assert proc.stdout is not None + while True: + line = await proc.stdout.readline() + if not line: + break + text = line.decode("utf-8", errors="replace").rstrip() + output_lines.append(text) + + rc = await proc.wait() + output = "\n".join(output_lines) + if rc != 0: + print(f"[{idx}] submit failed rc={rc} output={output!r}", flush=True) + return None + + url_match = RUN_URL_RE.search(output) + if url_match: + return url_match.group(1) + + name_match = RUN_NAME_RE.search(output) + if name_match: + return name_match.group(1) + + if "/runs/" in output: + print(f"[{idx}] submit failed: partial run URL parse failure output={output!r}", flush=True) + else: + print(f"[{idx}] submit failed: could not parse run id from output={output!r}", flush=True) + return None + return None except Exception as e: cause = getattr(e, "__cause__", None) print(f"[{idx}] submit failed: {type(e).__name__}: {e!r} cause={cause!r}", flush=True) return None -async def submit_many(total: int, concurrency: int, n_children: int, sleep_duration: timedelta) -> None: +async def 
submit_many(total: int, concurrency: int, n_children: int, sleep_duration: timedelta) -> int: sem = asyncio.Semaphore(concurrency) start = time.monotonic() submitted = 0 failed = 0 + runs_file_lock = asyncio.Lock() async def wrapped(i: int): nonlocal submitted, failed @@ -45,7 +131,11 @@ async def wrapped(i: int): failed += 1 else: submitted += 1 - print(f"[{i}] {name}", flush=True) + if RUNS_FILE: + async with runs_file_lock: + with open(RUNS_FILE, "a", encoding="utf-8") as f: + f.write(f"{name}\n") + print(f"submitted_run idx={i} url={name}", flush=True) done = submitted + failed if done % 100 == 0: elapsed = time.monotonic() - start @@ -57,6 +147,7 @@ async def wrapped(i: int): elapsed = time.monotonic() - start rps = submitted / elapsed if elapsed > 0 else 0 print(f"\nDone. submitted={submitted} failed={failed} elapsed={elapsed:.2f}s rps={rps:.2f}") + return 1 if failed else 0 # python stress/sleep_fanout_harness.py --total 25000 --concurrency 500 --n_children 10 --sleep_seconds 10 @@ -68,8 +159,7 @@ def main() -> None: parser.add_argument("--sleep_seconds", type=int, default=10) args = parser.parse_args() - flyte.init_from_config() - asyncio.run( + rc = asyncio.run( submit_many( total=args.total, concurrency=args.concurrency, @@ -77,6 +167,7 @@ def main() -> None: sleep_duration=timedelta(seconds=args.sleep_seconds), ) ) + raise SystemExit(rc) if __name__ == "__main__": diff --git a/examples/stress/sleep_fanout_harness_wrapper.sh b/examples/stress/sleep_fanout_harness_wrapper.sh new file mode 100755 index 000000000..8a8404358 --- /dev/null +++ b/examples/stress/sleep_fanout_harness_wrapper.sh @@ -0,0 +1,716 @@ +#!/usr/bin/env bash + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." 
&& pwd)" + +CONFIG="${HOME}/.flyte/config-dogfood.yaml" +PROJECT="" +DOMAIN="" +IMAGE_REGISTRY="${FLYTE_STRESS_IMAGE_REGISTRY:-376129846803.dkr.ecr.us-east-2.amazonaws.com/union}" +IMAGE_NAME="${FLYTE_STRESS_IMAGE_NAME:-dogfood}" +IMAGE_PLATFORMS="${FLYTE_STRESS_IMAGE_PLATFORMS:-linux/amd64}" +IMAGE_BUILDER="${FLYTE_STRESS_IMAGE_BUILDER:-remote}" +FANOUT_CPU_REQUEST="${FLYTE_STRESS_FANOUT_CPU_REQUEST:-1}" +FANOUT_CPU_LIMIT="${FLYTE_STRESS_FANOUT_CPU_LIMIT:-2}" +FANOUT_MEMORY_REQUEST="${FLYTE_STRESS_FANOUT_MEMORY_REQUEST:-2Gi}" +FANOUT_MEMORY_LIMIT="${FLYTE_STRESS_FANOUT_MEMORY_LIMIT:-4Gi}" +TOTAL_RUNS=20 +SUBMIT_CONCURRENCY=100 +N_CHILDREN=5000 +SLEEP_DURATION=800 +POLL_INTERVAL=2 +ABORT_REASON="wrapper interrupted" +RUN_ENV_KVS=() + +EXPECTED_TOTAL_CHILDREN=0 +CHILD_RUNS=() + +LAUNCH_PID="" +LAUNCH_LOG="" +LAUNCH_RC_FILE="" +RUNS_FILE="" +LAUNCH_DONE=0 +LAUNCH_RC=0 +STOPPING=0 +ABORT_SENT=0 + +SCRIPT_START_EPOCH="$(date +%s)" +FIRST_DISCOVERED_AT="" +FIRST_RUNNING_AT="" +ALL_VISIBLE_AT="" +TERMINAL_AT="" + +PEAK_SEEN=0 +PEAK_RUNNING=0 +PEAK_ACTIVE=0 +PEAK_CREATE_RPS=0 +PEAK_PARENT_LIVE=0 +PEAK_PARENT_RUNNING=0 +LAST_LAUNCH_STAGE="" +SDK_WHEEL_PATH="" +SDK_WHEEL_MTIME="" +SDK_SRC_NEWER=0 + +usage() { + cat <<'EOF' +Usage: + examples/stress/sleep_fanout_harness_wrapper.sh [options] + +Options: + --config PATH Flyte config path. Default: ~/.flyte/config-dogfood.yaml + --project NAME Override project for get/abort. + --domain NAME Override domain for get/abort. + --image-registry VALUE Registry prefix for the task image. + --image-name VALUE Repository name for the task image. + --image-builder VALUE Flyte image builder to use for lookups. Default: remote + --run-env KEY=VALUE Export an env var into the local submit harness and propagate it to remote runs. + --total-runs INT Number of top-level sleep_fanout runs to submit. Default: 20 + --submit-concurrency INT Local submission concurrency. Default: 100 + --n-children INT Leaves per sleep_fanout run. 
Default: 5000 + --sleep-duration VALUE Sleep duration in seconds per leaf. Default: 800 + --poll-interval SEC Poll interval in seconds. Default: 2 + --abort-reason TEXT Reason passed to 'flyte abort run'. Default: wrapper interrupted + --help Show this message. +EOF +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --config) + CONFIG="$2" + shift 2 + ;; + --project) + PROJECT="$2" + shift 2 + ;; + --domain) + DOMAIN="$2" + shift 2 + ;; + --image-registry) + IMAGE_REGISTRY="$2" + shift 2 + ;; + --image-name) + IMAGE_NAME="$2" + shift 2 + ;; + --image-builder) + IMAGE_BUILDER="$2" + shift 2 + ;; + --run-env) + RUN_ENV_KVS+=("$2") + shift 2 + ;; + --total-runs) + TOTAL_RUNS="$2" + shift 2 + ;; + --submit-concurrency) + SUBMIT_CONCURRENCY="$2" + shift 2 + ;; + --n-children) + N_CHILDREN="$2" + shift 2 + ;; + --sleep-duration) + SLEEP_DURATION="$2" + shift 2 + ;; + --poll-interval) + POLL_INTERVAL="$2" + shift 2 + ;; + --abort-reason) + ABORT_REASON="$2" + shift 2 + ;; + --help) + usage + exit 0 + ;; + *) + echo "Unknown option: $1" >&2 + usage >&2 + exit 2 + ;; + esac +done + +EXPECTED_TOTAL_CHILDREN=$((TOTAL_RUNS * N_CHILDREN)) + +if ! command -v flyte >/dev/null 2>&1; then + echo "flyte is required but was not found in PATH." >&2 + exit 1 +fi + +if ! command -v jq >/dev/null 2>&1; then + echo "jq is required but was not found in PATH." >&2 + exit 1 +fi + +if ! command -v python >/dev/null 2>&1; then + echo "python is required but was not found in PATH." 
>&2 + exit 1 +fi + +CONFIG="${CONFIG/#\~/${HOME}}" +LAUNCH_LOG="$(mktemp "${TMPDIR:-/tmp}/sleep-fanout-harness.XXXXXX.log")" +LAUNCH_RC_FILE="$(mktemp "${TMPDIR:-/tmp}/sleep-fanout-harness.XXXXXX.rc")" +RUNS_FILE="$(mktemp "${TMPDIR:-/tmp}/sleep-fanout-runs.XXXXXX.txt")" + +cleanup() { + if [[ -n "${LAUNCH_LOG}" && -f "${LAUNCH_LOG}" ]]; then + rm -f "${LAUNCH_LOG}" + fi + if [[ -n "${LAUNCH_RC_FILE}" && -f "${LAUNCH_RC_FILE}" ]]; then + rm -f "${LAUNCH_RC_FILE}" + fi + if [[ -n "${RUNS_FILE}" && -f "${RUNS_FILE}" ]]; then + rm -f "${RUNS_FILE}" + fi +} +trap cleanup EXIT + +project_args=() +domain_args=() + +if [[ -n "${PROJECT}" ]]; then + project_args=(-p "${PROJECT}") +fi + +if [[ -n "${DOMAIN}" ]]; then + domain_args=(-d "${DOMAIN}") +fi + +flyte_cmd_json() { + COLUMNS=500 _U_USE_ACTIONS="${_U_USE_ACTIONS:-1}" flyte -c "${CONFIG}" --image-builder "${IMAGE_BUILDER}" -of json-raw "$@" \ + | perl -pe 's/\e\[[0-9;]*[A-Za-z]//g' +} + +is_terminal_phase() { + case "$1" in + ACTION_PHASE_SUCCEEDED|ACTION_PHASE_FAILED|ACTION_PHASE_ABORTED|ACTION_PHASE_TIMED_OUT) + return 0 + ;; + *) + return 1 + ;; + esac +} + +format_duration() { + local value="$1" + if [[ -z "${value}" ]]; then + echo "n/a" + return + fi + printf '%02dh:%02dm:%02ds' "$((value / 3600))" "$(((value % 3600) / 60))" "$((value % 60))" +} + +elapsed_from_start() { + local epoch="$1" + if [[ -z "${epoch}" ]]; then + echo "" + return + fi + echo "$((epoch - SCRIPT_START_EPOCH))" +} + +print_row() { + printf '%-8s %-12s %-8s %-8s %-18s %-14s %-8s %-10s %-8s %-10s %-8s %-8s\n' \ + "$(date +%H:%M:%S)" "$1" "$2" "$3" "$4" "$5" "$6" "$7" "$8" "$9" "${10}" "${11}" +} + +sanitize_run_name() { + local value="$1" + value="$(printf '%s' "${value}" | tr -d '\r')" + value="$(printf '%s' "${value}" | sed 's/[[:space:]]*$//')" + value="$(printf '%s' "${value}" | grep -Eo '[ur][[:alnum:]]{5,}' | head -n 1 || true)" + printf '%s' "${value}" +} + +detect_sdk_wheel_status() { + local wheel_path="" + local wheel_mtime="" + 
local newest_src="" + + wheel_path="$( + find "${REPO_ROOT}/dist" -maxdepth 1 -type f -name 'flyte-*.whl' -print 2>/dev/null \ + | sort \ + | tail -n 1 || true + )" + SDK_WHEEL_PATH="${wheel_path}" + SDK_WHEEL_MTIME="" + SDK_SRC_NEWER=0 + + if [[ -z "${wheel_path}" ]]; then + return + fi + + wheel_mtime="$(stat -f '%m' "${wheel_path}" 2>/dev/null || true)" + SDK_WHEEL_MTIME="${wheel_mtime}" + newest_src="$( + find "${REPO_ROOT}/src/flyte" -type f -newer "${wheel_path}" -print 2>/dev/null \ + | head -n 1 || true + )" + if [[ -n "${newest_src}" ]]; then + SDK_SRC_NEWER=1 + fi +} + +child_run_known() { + local target="$1" + local existing="" + for existing in "${CHILD_RUNS[@]}"; do + if [[ "${existing}" == "${target}" ]]; then + return 0 + fi + done + return 1 +} + +discover_child_runs() { + local child_run="" + + if [[ -s "${RUNS_FILE}" ]]; then + while IFS= read -r child_run; do + child_run="$(sanitize_run_name "${child_run}")" + [[ -z "${child_run}" ]] && continue + if ! child_run_known "${child_run}"; then + CHILD_RUNS+=("${child_run}") + fi + done < "${RUNS_FILE}" + fi + + if [[ -s "${LAUNCH_LOG}" ]]; then + while IFS= read -r child_run; do + child_run="$(sanitize_run_name "${child_run}")" + [[ -z "${child_run}" ]] && continue + if ! 
child_run_known "${child_run}"; then + CHILD_RUNS+=("${child_run}") + fi + done < <( + perl -ne ' + s/\e\[[0-9;]*[A-Za-z]//g; + s/\r/\n/g; + if (/submitted_run idx=\d+ url=.*\/runs\/([^\/?\s]+)/) { + print "$1\n"; + } elsif (/submitted_run idx=\d+ url=([ur][[:alnum:]]{5,})/) { + print "$1\n"; + } + ' "${LAUNCH_LOG}" + ) + fi +} + +fetch_actions_json_for_run() { + local run_name="$1" + flyte_cmd_json get action "${project_args[@]}" "${domain_args[@]}" "${run_name}" +} + +aggregate_child_runs_tsv() { + local discovered=0 + local roots_terminal=0 + local parent_live=0 + local parent_running=0 + local seen=0 + local queued=0 + local waiting=0 + local initializing=0 + local running=0 + local succeeded=0 + local failed=0 + local aborted=0 + local timed_out=0 + local run_name="" + local json="" + local root_phase="" + local c_seen=0 + local c_queued=0 + local c_waiting=0 + local c_initializing=0 + local c_running=0 + local c_succeeded=0 + local c_failed=0 + local c_aborted=0 + local c_timed_out=0 + local not_created=0 + local active=0 + + discovered="${#CHILD_RUNS[@]}" + for run_name in "${CHILD_RUNS[@]}"; do + if ! 
json="$(fetch_actions_json_for_run "${run_name}" 2>/dev/null)"; then + continue + fi + + IFS=$'\t' read -r root_phase c_seen c_queued c_waiting c_initializing c_running c_succeeded c_failed c_aborted c_timed_out \ + <<<"$(jq -r ' + [ .[] ] as $all + | ($all | map(select(.id.name == "a0")) | .[0]) as $root + | [ $all[] | select(.id.name != "a0") ] as $kids + | [ + ($root.status.phase // "MISSING"), + ($kids | length), + ($kids | map(select(.status.phase == "ACTION_PHASE_QUEUED")) | length), + ($kids | map(select(.status.phase == "ACTION_PHASE_WAITING_FOR_RESOURCES")) | length), + ($kids | map(select(.status.phase == "ACTION_PHASE_INITIALIZING")) | length), + ($kids | map(select(.status.phase == "ACTION_PHASE_RUNNING")) | length), + ($kids | map(select(.status.phase == "ACTION_PHASE_SUCCEEDED")) | length), + ($kids | map(select(.status.phase == "ACTION_PHASE_FAILED")) | length), + ($kids | map(select(.status.phase == "ACTION_PHASE_ABORTED")) | length), + ($kids | map(select(.status.phase == "ACTION_PHASE_TIMED_OUT")) | length) + ] + | @tsv + ' <<<"${json}")" + + if is_terminal_phase "${root_phase}"; then + roots_terminal=$((roots_terminal + 1)) + elif [[ "${root_phase}" != "MISSING" ]]; then + parent_live=$((parent_live + 1)) + if [[ "${root_phase}" == "ACTION_PHASE_RUNNING" ]]; then + parent_running=$((parent_running + 1)) + fi + fi + seen=$((seen + c_seen)) + queued=$((queued + c_queued)) + waiting=$((waiting + c_waiting)) + initializing=$((initializing + c_initializing)) + running=$((running + c_running)) + succeeded=$((succeeded + c_succeeded)) + failed=$((failed + c_failed)) + aborted=$((aborted + c_aborted)) + timed_out=$((timed_out + c_timed_out)) + done + + if (( EXPECTED_TOTAL_CHILDREN > seen )); then + not_created=$((EXPECTED_TOTAL_CHILDREN - seen)) + fi + active=$((queued + waiting + initializing + running)) + + printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' \ + "${discovered}/${TOTAL_RUNS}" \ + "${parent_live}" \ + 
"${parent_running}" \ + "${seen}/${EXPECTED_TOTAL_CHILDREN}" \ + "${not_created}" \ + "${queued}" \ + "${waiting}" \ + "${initializing}" \ + "${running}" \ + "${active}" \ + "${succeeded}" \ + "${failed}" \ + "${aborted}" \ + "${timed_out}" \ + "${roots_terminal}" +} + +launch_stage_from_log() { + if [[ ! -s "${LAUNCH_LOG}" ]]; then + return 1 + fi + + local stage="" + stage="$( + perl -pe 's/\e\[[0-9;]*[A-Za-z]//g; s/\r/\n/g' "${LAUNCH_LOG}" \ + | sed '/^[[:space:]]*$/d' \ + | grep -E '^(submitted=|Done\.|submitted_run|Error:|ERROR|Failed|failed)' \ + | tail -n 1 || true + )" + + if [[ -z "${stage}" ]]; then + stage="$(perl -pe 's/\e\[[0-9;]*[A-Za-z]//g; s/\r/\n/g' "${LAUNCH_LOG}" | sed '/^[[:space:]]*$/d' | tail -n 1)" + fi + + [[ -n "${stage}" ]] || return 1 + printf '%s' "${stage}" +} + +abort_remote_runs() { + local run_name="" + if [[ "${ABORT_SENT}" -eq 1 ]]; then + return + fi + ABORT_SENT=1 + + for run_name in "${CHILD_RUNS[@]}"; do + COLUMNS=500 _U_USE_ACTIONS="${_U_USE_ACTIONS:-1}" flyte -c "${CONFIG}" --image-builder "${IMAGE_BUILDER}" \ + abort run "${project_args[@]}" "${domain_args[@]}" --reason "${ABORT_REASON}" "${run_name}" >/dev/null 2>&1 || true + done +} + +handle_signal() { + local sig="$1" + if [[ "${STOPPING}" -eq 1 ]]; then + echo + echo "Received ${sig} again, exiting immediately." + exit 130 + fi + STOPPING=1 + + echo + echo "Received ${sig}, stopping local submissions and aborting discovered runs." 
+ if [[ -n "${LAUNCH_PID}" ]] && kill -0 "${LAUNCH_PID}" 2>/dev/null; then + kill "${LAUNCH_PID}" 2>/dev/null || true + fi + discover_child_runs + abort_remote_runs +} + +trap 'handle_signal INT' INT +trap 'handle_signal TERM' TERM + +cd "${REPO_ROOT}" +detect_sdk_wheel_status + +echo "Launching local multi-run harness" +echo " config: ${CONFIG}" +echo " total_runs: ${TOTAL_RUNS}" +echo " submit_concurrency: ${SUBMIT_CONCURRENCY}" +echo " n_children_per_run: ${N_CHILDREN}" +echo " total_children_expected: ${EXPECTED_TOTAL_CHILDREN}" +echo " sleep_duration: ${SLEEP_DURATION}" +echo " poll_interval: ${POLL_INTERVAL}s" +echo " image target: ${IMAGE_REGISTRY}/${IMAGE_NAME}" +echo " image builder: ${IMAGE_BUILDER}" +echo " image platforms: ${IMAGE_PLATFORMS}" +if [[ "${FLYTE_HARNESS_FORCE_LOCAL_SDK:-0}" == "1" || "${FLYTE_HARNESS_FORCE_LOCAL_SDK:-}" == "true" ]]; then + echo " sdk source: ${REPO_ROOT}/src forced via $(command -v flyte)" +else + echo " sdk source: installed flyte via $(command -v flyte)" +fi +if [[ -n "${SDK_WHEEL_PATH}" ]]; then + echo " sdk wheel: ${SDK_WHEEL_PATH}" + if [[ "${SDK_SRC_NEWER}" -eq 1 ]]; then + echo " warning: src/flyte is newer than the dist wheel; remote image will not include recent SDK src changes until you rebuild the wheel" + fi +else + echo " sdk wheel: " +fi +echo " fanout parent resources: cpu ${FANOUT_CPU_REQUEST}/${FANOUT_CPU_LIMIT}, memory ${FANOUT_MEMORY_REQUEST}/${FANOUT_MEMORY_LIMIT}" +echo " use_actions: ${_U_USE_ACTIONS:-1}" +if [[ -n "${PROJECT}" || -n "${DOMAIN}" ]]; then + echo " project/domain override: ${PROJECT:-} / ${DOMAIN:-}" +fi +if [[ "${#RUN_ENV_KVS[@]}" -gt 0 ]]; then + echo " run env overrides: ${RUN_ENV_KVS[*]}" +fi +echo +printf '%-8s %-12s %-8s %-8s %-18s %-14s %-8s %-10s %-8s %-10s %-8s %-8s\n' \ + "time" "runs" "p_live" "p_run" "seen_children" "not_created" "d_seen" "create_rps" "rps/p" "eta_fill" "running" "active" + +( + export _U_USE_ACTIONS="${_U_USE_ACTIONS:-1}" + export 
FLYTE_STRESS_IMAGE_REGISTRY="${IMAGE_REGISTRY}" + export FLYTE_STRESS_IMAGE_NAME="${IMAGE_NAME}" + export FLYTE_STRESS_IMAGE_PLATFORMS="${IMAGE_PLATFORMS}" + export FLYTE_HARNESS_CONFIG="${CONFIG}" + export FLYTE_HARNESS_IMAGE_BUILDER="${IMAGE_BUILDER}" + export FLYTE_HARNESS_PROJECT="${PROJECT}" + export FLYTE_HARNESS_DOMAIN="${DOMAIN}" + export FLYTE_HARNESS_RUNS_FILE="${RUNS_FILE}" + local_kv="" + for local_kv in "${RUN_ENV_KVS[@]}"; do + export "${local_kv}" + done + rc=0 + python examples/stress/sleep_fanout_harness.py \ + --total "${TOTAL_RUNS}" \ + --concurrency "${SUBMIT_CONCURRENCY}" \ + --n_children "${N_CHILDREN}" \ + --sleep_seconds "${SLEEP_DURATION}" || rc=$? + printf '%s\n' "${rc}" > "${LAUNCH_RC_FILE}" + exit "${rc}" +) >"${LAUNCH_LOG}" 2>&1 & +LAUNCH_PID=$! + +FINAL_DISCOVERED=0 +FINAL_SEEN=0 +FINAL_SUCCEEDED=0 +FINAL_FAILED=0 +FINAL_ABORTED=0 +FINAL_TIMED_OUT=0 +FINAL_ROOTS_TERMINAL=0 +FINAL_PARENT_LIVE=0 +FINAL_PARENT_RUNNING=0 +LAST_SAMPLE_TS="" +LAST_SAMPLE_SEEN="" + +while true; do + discover_child_runs + + if [[ -z "${FIRST_DISCOVERED_AT}" && "${#CHILD_RUNS[@]}" -gt 0 ]]; then + FIRST_DISCOVERED_AT="$(date +%s)" + fi + + if [[ "${LAUNCH_DONE}" -eq 0 && -s "${LAUNCH_RC_FILE}" ]]; then + LAUNCH_RC="$(tr -d '\r\n[:space:]' < "${LAUNCH_RC_FILE}")" + if [[ -z "${LAUNCH_RC}" ]]; then + LAUNCH_RC=1 + fi + wait "${LAUNCH_PID}" 2>/dev/null || true + LAUNCH_DONE=1 + fi + + if [[ "${#CHILD_RUNS[@]}" -gt 0 ]]; then + IFS=$'\t' read -r runs parent_live parent_running seen_children not_created queued waiting initializing running active succeeded failed aborted timed_out roots_terminal \ + <<<"$(aggregate_child_runs_tsv)" + + FINAL_DISCOVERED="${runs%%/*}" + FINAL_PARENT_LIVE="${parent_live}" + FINAL_PARENT_RUNNING="${parent_running}" + FINAL_SEEN="${seen_children%%/*}" + FINAL_SUCCEEDED="${succeeded}" + FINAL_FAILED="${failed}" + FINAL_ABORTED="${aborted}" + FINAL_TIMED_OUT="${timed_out}" + FINAL_ROOTS_TERMINAL="${roots_terminal}" + + if (( FINAL_SEEN > 
PEAK_SEEN )); then + PEAK_SEEN="${FINAL_SEEN}" + fi + if (( running > PEAK_RUNNING )); then + PEAK_RUNNING="${running}" + fi + if (( active > PEAK_ACTIVE )); then + PEAK_ACTIVE="${active}" + fi + if (( parent_live > PEAK_PARENT_LIVE )); then + PEAK_PARENT_LIVE="${parent_live}" + fi + if (( parent_running > PEAK_PARENT_RUNNING )); then + PEAK_PARENT_RUNNING="${parent_running}" + fi + if [[ -z "${FIRST_RUNNING_AT}" && "${running}" -gt 0 ]]; then + FIRST_RUNNING_AT="$(date +%s)" + fi + if [[ -z "${ALL_VISIBLE_AT}" && "${FINAL_SEEN}" -ge "${EXPECTED_TOTAL_CHILDREN}" ]]; then + ALL_VISIBLE_AT="$(date +%s)" + fi + + sample_ts="$(date +%s)" + delta_seen="0" + create_rps="n/a" + create_rps_per_parent="n/a" + eta_fill="n/a" + if [[ -n "${LAST_SAMPLE_TS}" && -n "${LAST_SAMPLE_SEEN}" ]]; then + delta_t=$((sample_ts - LAST_SAMPLE_TS)) + if (( delta_t > 0 )); then + delta_seen=$((FINAL_SEEN - LAST_SAMPLE_SEEN)) + if (( delta_seen < 0 )); then + delta_seen=0 + fi + create_rps="$(python - <<'PY' "${delta_seen}" "${delta_t}" +import sys +delta_seen = int(sys.argv[1]) +delta_t = int(sys.argv[2]) +print(f"{delta_seen / delta_t:.1f}") +PY +)" + if (( parent_running > 0 )); then + create_rps_per_parent="$(python - <<'PY' "${delta_seen}" "${delta_t}" "${parent_running}" +import sys +delta_seen = int(sys.argv[1]) +delta_t = int(sys.argv[2]) +parent_running = int(sys.argv[3]) +print(f"{(delta_seen / delta_t) / parent_running:.1f}") +PY +)" + fi + create_rps_int="$(python - <<'PY' "${delta_seen}" "${delta_t}" +import sys +delta_seen = int(sys.argv[1]) +delta_t = int(sys.argv[2]) +print(int(delta_seen / delta_t)) +PY +)" + if (( create_rps_int > PEAK_CREATE_RPS )); then + PEAK_CREATE_RPS="${create_rps_int}" + fi + if (( FINAL_SEEN < EXPECTED_TOTAL_CHILDREN && delta_seen > 0 )); then + eta_fill="$(python - <<'PY' "${EXPECTED_TOTAL_CHILDREN}" "${FINAL_SEEN}" "${delta_seen}" "${delta_t}" +import math +import sys +expected = int(sys.argv[1]) +seen = int(sys.argv[2]) +delta_seen = 
int(sys.argv[3]) +delta_t = int(sys.argv[4]) +remaining = expected - seen +seconds = math.ceil(remaining / (delta_seen / delta_t)) +h, rem = divmod(seconds, 3600) +m, s = divmod(rem, 60) +print(f"{h:02d}:{m:02d}:{s:02d}") +PY +)" + elif (( FINAL_SEEN >= EXPECTED_TOTAL_CHILDREN )); then + eta_fill="00:00:00" + fi + fi + fi + LAST_SAMPLE_TS="${sample_ts}" + LAST_SAMPLE_SEEN="${FINAL_SEEN}" + + print_row \ + "${runs}" \ + "${parent_live}" \ + "${parent_running}" \ + "${seen_children}" \ + "${not_created}" \ + "${delta_seen}" \ + "${create_rps}" \ + "${create_rps_per_parent}" \ + "${eta_fill}" \ + "${running}" \ + "${active}" + + if [[ "${LAUNCH_DONE}" -eq 1 ]] && (( FINAL_ROOTS_TERMINAL == FINAL_DISCOVERED )) && (( active == 0 )); then + TERMINAL_AT="$(date +%s)" + break + fi + else + if stage="$(launch_stage_from_log)"; then + if [[ "${stage}" != "${LAST_LAUNCH_STAGE}" ]]; then + LAST_LAUNCH_STAGE="${stage}" + echo "launch: ${stage}" + fi + fi + print_row "0/${TOTAL_RUNS}" 0 0 "0/${EXPECTED_TOTAL_CHILDREN}" "${EXPECTED_TOTAL_CHILDREN}" 0 0 0 0 0 0 + fi + + if [[ "${LAUNCH_DONE}" -eq 1 && "${LAUNCH_RC}" -ne 0 && "${#CHILD_RUNS[@]}" -eq 0 ]]; then + echo + echo "Local submit harness failed before any runs were discovered." 
>&2 + cat "${LAUNCH_LOG}" >&2 + exit "${LAUNCH_RC}" + fi + + sleep "${POLL_INTERVAL}" +done + +echo +echo "Aggregate Summary" +echo " runs_discovered: ${FINAL_DISCOVERED}/${TOTAL_RUNS}" +echo " total_expected_children: ${EXPECTED_TOTAL_CHILDREN}" +echo " child_run_roots_terminal: ${FINAL_ROOTS_TERMINAL}/${FINAL_DISCOVERED}" +echo " peak_parent_live: ${PEAK_PARENT_LIVE}" +echo " peak_parent_running: ${PEAK_PARENT_RUNNING}" +echo " children_seen: ${FINAL_SEEN}/${EXPECTED_TOTAL_CHILDREN}" +echo " succeeded: ${FINAL_SUCCEEDED}" +echo " failed: ${FINAL_FAILED}" +echo " aborted: ${FINAL_ABORTED}" +echo " timed_out: ${FINAL_TIMED_OUT}" +echo " peak_seen: ${PEAK_SEEN}/${EXPECTED_TOTAL_CHILDREN}" +echo " peak_running: ${PEAK_RUNNING}" +echo " peak_active: ${PEAK_ACTIVE}" +echo " peak_create_rps: ${PEAK_CREATE_RPS}" +echo " first_run_discovered: $(format_duration "$(elapsed_from_start "${FIRST_DISCOVERED_AT}")")" +echo " aggregate_first_running: $(format_duration "$(elapsed_from_start "${FIRST_RUNNING_AT}")")" +echo " aggregate_all_visible: $(format_duration "$(elapsed_from_start "${ALL_VISIBLE_AT}")")" +echo " aggregate_terminal: $(format_duration "$(elapsed_from_start "${TERMINAL_AT}")")" +echo " total_elapsed: $(format_duration "$(( $(date +%s) - SCRIPT_START_EPOCH ))")" diff --git a/examples/stress/sleep_fanout_swarm_wrapper.sh b/examples/stress/sleep_fanout_swarm_wrapper.sh new file mode 100755 index 000000000..7d3cd937f --- /dev/null +++ b/examples/stress/sleep_fanout_swarm_wrapper.sh @@ -0,0 +1,870 @@ +#!/usr/bin/env bash + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." 
&& pwd)" + +CONFIG="${HOME}/.flyte/config-dogfood.yaml" +RUN_NAME="" +REQUESTED_RUN_NAME="" +SWARM_SIZE=2 +RUNS_PER_WORKER=1 +MAX_RPS=1 +N_CHILDREN=5000 +SLEEP_DURATION=800 +POLL_INTERVAL=2 +ABORT_REASON="wrapper interrupted" +PROJECT="" +DOMAIN="" +IMAGE_REGISTRY="${FLYTE_STRESS_IMAGE_REGISTRY:-376129846803.dkr.ecr.us-east-2.amazonaws.com/union}" +IMAGE_NAME="${FLYTE_STRESS_IMAGE_NAME:-dogfood}" +IMAGE_BUILDER="${FLYTE_STRESS_IMAGE_BUILDER:-remote}" +RUN_ENV_ARGS=() + +LAUNCH_PID="" +LAUNCH_LOG="" +LAUNCH_DONE=0 +LAUNCH_RC=0 +RUN_VISIBLE=0 +ABORT_SENT=0 +STOPPING=0 + +SCRIPT_START_EPOCH="$(date +%s)" +RUN_VISIBLE_AT="" +FIRST_RUNNING_AT="" +ROOT_TERMINAL_AT="" + +PEAK_SEEN=0 +PEAK_RUNNING=0 +PEAK_ACTIVE=0 +LAST_LAUNCH_STAGE="" + +EXPECTED_TOP_ACTIONS=0 +EXPECTED_CHILD_RUNS=0 +EXPECTED_TOTAL_CHILDREN=0 +CHILD_RUNS=() +CHILD_RUNS_DISCOVERED_AT="" +AGG_FIRST_RUNNING_AT="" +AGG_ALL_VISIBLE_AT="" +AGG_TERMINAL_AT="" +AGG_PEAK_SEEN=0 +AGG_PEAK_RUNNING=0 +AGG_PEAK_ACTIVE=0 +FINAL_ROOT_ACTIONS_JSON="" + +usage() { + cat <<'EOF' +Usage: + examples/stress/sleep_fanout_swarm_wrapper.sh [options] + +Options: + --config PATH Flyte config path. Default: ~/.flyte/config-dogfood.yaml + --project NAME Override project for launch/get/abort. + --domain NAME Override domain for launch/get/abort. + --run-name NAME Use a fixed run name. + --image-registry VALUE Registry prefix for the task image. Default: 376129846803.dkr.ecr.us-east-2.amazonaws.com/union + --image-name VALUE Repository name for the task image. Default: dogfood + --image-builder VALUE Flyte image builder to use. Default: remote + --run-env KEY=VALUE Pass through to 'flyte run --env'. Can be specified multiple times. + --swarm-size INT Number of submitter tasks. Default: 2 + --runs-per-worker INT Number of sleep_fanout runs each submitter launches. Default: 1 + --max-rps INT Max submissions per second per submitter. Default: 1 + --n-children INT Number of leaves per sleep_fanout run. 
Default: 5000 + --sleep-duration VALUE Sleep duration passed to each child run. Default: 800 + --poll-interval SEC Poll interval in seconds. Default: 2 + --abort-reason TEXT Reason passed to 'flyte abort run'. Default: wrapper interrupted + --help Show this message. + +Notes: + This wrapper first monitors the swarm root run (`main` + `primer` + submitters). + After the root finishes submitting, it discovers child run IDs from submitter + logs and then switches to aggregate leaf-action counts across those child runs. +EOF +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --config) + CONFIG="$2" + shift 2 + ;; + --project) + PROJECT="$2" + shift 2 + ;; + --domain) + DOMAIN="$2" + shift 2 + ;; + --run-name) + REQUESTED_RUN_NAME="$2" + shift 2 + ;; + --image-registry) + IMAGE_REGISTRY="$2" + shift 2 + ;; + --image-name) + IMAGE_NAME="$2" + shift 2 + ;; + --image-builder) + IMAGE_BUILDER="$2" + shift 2 + ;; + --run-env) + RUN_ENV_ARGS+=("--env" "$2") + shift 2 + ;; + --swarm-size) + SWARM_SIZE="$2" + shift 2 + ;; + --runs-per-worker) + RUNS_PER_WORKER="$2" + shift 2 + ;; + --max-rps) + MAX_RPS="$2" + shift 2 + ;; + --n-children) + N_CHILDREN="$2" + shift 2 + ;; + --sleep-duration) + SLEEP_DURATION="$2" + shift 2 + ;; + --poll-interval) + POLL_INTERVAL="$2" + shift 2 + ;; + --abort-reason) + ABORT_REASON="$2" + shift 2 + ;; + --help) + usage + exit 0 + ;; + *) + echo "Unknown option: $1" >&2 + usage >&2 + exit 2 + ;; + esac +done + +RUN_NAME="${REQUESTED_RUN_NAME}" +EXPECTED_TOP_ACTIONS=$((SWARM_SIZE + 2)) +EXPECTED_CHILD_RUNS=$((SWARM_SIZE * RUNS_PER_WORKER)) +EXPECTED_TOTAL_CHILDREN=$((EXPECTED_CHILD_RUNS * N_CHILDREN)) + +if ! command -v flyte >/dev/null 2>&1; then + echo "flyte is required but was not found in PATH." >&2 + exit 1 +fi + +if ! command -v jq >/dev/null 2>&1; then + echo "jq is required but was not found in PATH." 
>&2 + exit 1 +fi + +CONFIG="${CONFIG/#\~/${HOME}}" +LAUNCH_LOG="$(mktemp "${TMPDIR:-/tmp}/sleep-fanout-swarm-launch.XXXXXX.log")" + +cleanup() { + if [[ -n "${LAUNCH_LOG}" && -f "${LAUNCH_LOG}" ]]; then + rm -f "${LAUNCH_LOG}" + fi +} +trap cleanup EXIT + +flyte_cmd() { + _U_USE_ACTIONS="${_U_USE_ACTIONS:-1}" flyte -c "${CONFIG}" --image-builder "${IMAGE_BUILDER}" "$@" +} + +flyte_cmd_json() { + COLUMNS=500 _U_USE_ACTIONS="${_U_USE_ACTIONS:-1}" flyte -c "${CONFIG}" --image-builder "${IMAGE_BUILDER}" -of json-raw "$@" \ + | perl -pe 's/\e\[[0-9;]*[A-Za-z]//g' +} + +project_args=() +domain_args=() +run_args=() + +if [[ -n "${PROJECT}" ]]; then + project_args=(-p "${PROJECT}") + run_args+=(-p "${PROJECT}") +fi + +if [[ -n "${DOMAIN}" ]]; then + domain_args=(-d "${DOMAIN}") + run_args+=(-d "${DOMAIN}") +fi + +abort_remote_run() { + if [[ "${ABORT_SENT}" -eq 1 || "${RUN_VISIBLE}" -eq 0 || -z "${RUN_NAME}" ]]; then + return + fi + + ABORT_SENT=1 + echo + echo "Requesting abort for swarm root run ${RUN_NAME}..." + echo "Already-submitted child runs are separate runs and may continue." + if ! flyte_cmd abort run "${project_args[@]}" "${domain_args[@]}" --reason "${ABORT_REASON}" "${RUN_NAME}"; then + echo "Abort request failed for run ${RUN_NAME}." >&2 + fi +} + +handle_signal() { + local sig="$1" + if [[ "${STOPPING}" -eq 1 ]]; then + echo + echo "Received ${sig} again, exiting immediately." + exit 130 + fi + STOPPING=1 + + echo + echo "Received ${sig}, requesting abort for swarm root run ${RUN_NAME:-}." + echo "Continuing to monitor until the current section reaches a terminal phase. Press Ctrl-C again to exit immediately." 
+ + if [[ -n "${LAUNCH_PID}" ]] && kill -0 "${LAUNCH_PID}" 2>/dev/null; then + kill "${LAUNCH_PID}" 2>/dev/null || true + fi + + abort_remote_run +} + +trap 'handle_signal INT' INT +trap 'handle_signal TERM' TERM + +format_duration() { + local value="$1" + if [[ -z "${value}" ]]; then + echo "n/a" + return + fi + + local seconds="$value" + printf '%02dh:%02dm:%02ds' "$((seconds / 3600))" "$(((seconds % 3600) / 60))" "$((seconds % 60))" +} + +elapsed_from_start() { + local epoch="$1" + if [[ -z "${epoch}" ]]; then + echo "" + return + fi + echo "$((epoch - SCRIPT_START_EPOCH))" +} + +is_terminal_phase() { + case "$1" in + ACTION_PHASE_SUCCEEDED|ACTION_PHASE_FAILED|ACTION_PHASE_ABORTED|ACTION_PHASE_TIMED_OUT) + return 0 + ;; + *) + return 1 + ;; + esac +} + +fetch_actions_json() { + flyte_cmd_json get action "${project_args[@]}" "${domain_args[@]}" "${RUN_NAME}" +} + +fetch_actions_json_for_run() { + local run_name="$1" + flyte_cmd_json get action "${project_args[@]}" "${domain_args[@]}" "${run_name}" +} + +fetch_action_logs() { + local run_name="$1" + local action_name="$2" + COLUMNS=500 _U_USE_ACTIONS="${_U_USE_ACTIONS:-1}" flyte -c "${CONFIG}" --image-builder "${IMAGE_BUILDER}" \ + get logs "${project_args[@]}" "${domain_args[@]}" "${run_name}" "${action_name}" 2>/dev/null \ + | perl -pe 's/\e\[[0-9;]*[A-Za-z]//g' +} + +sanitize_run_name() { + local value="$1" + value="$(printf '%s' "${value}" | tr -d '\r')" + value="$(printf '%s' "${value}" | sed 's/[[:space:]]*$//')" + value="$(printf '%s' "${value}" | grep -Eo '[ur][[:alnum:]]{5,}' | head -n 1 || true)" + printf '%s' "${value}" +} + +resolve_run_name_from_log() { + if [[ -z "${LAUNCH_LOG}" || ! 
-s "${LAUNCH_LOG}" ]]; then + return 1 + fi + + local parsed="" + parsed="$( + perl -pe 's/\e\[[0-9;]*[A-Za-z]//g' "${LAUNCH_LOG}" \ + | sed -n 's/.*Created Run: //p' \ + | tail -n 1 + )" + + if [[ -z "${parsed}" ]]; then + parsed="$( + perl -pe 's/\e\[[0-9;]*[A-Za-z]//g' "${LAUNCH_LOG}" \ + | sed -n 's#.*URL: .*/runs/\([^/?[:space:]]*\).*#\1#p' \ + | tail -n 1 + )" + fi + + if [[ -z "${parsed}" ]]; then + return 1 + fi + + parsed="$(sanitize_run_name "${parsed}")" + if [[ -z "${parsed}" ]]; then + return 1 + fi + + RUN_NAME="${parsed}" + return 0 +} + +launch_stage_from_log() { + if [[ -z "${LAUNCH_LOG}" || ! -s "${LAUNCH_LOG}" ]]; then + return 1 + fi + + local lines="" + local stage="" + + lines="$( + perl -pe 's/\e\[[0-9;]*[A-Za-z]//g; s/\r/\n/g' "${LAUNCH_LOG}" \ + | sed '/^[[:space:]]*$/d' \ + | tail -n 100 + )" + + if [[ -z "${lines}" ]]; then + return 1 + fi + + stage="$( + printf '%s\n' "${lines}" \ + | grep -E '^(Building|Pushing|Image |Created Run:|URL:|Error:|ERROR|Failed|failed|Using |#)' \ + | tail -n 1 || true + )" + + if [[ -z "${stage}" ]]; then + stage="$(printf '%s\n' "${lines}" | tail -n 1)" + fi + + if [[ -n "${stage}" ]]; then + printf '%s' "${stage}" + return 0 + fi + + return 1 +} + +print_row() { + printf '%-8s %-28s %-12s %-10s %-8s %-8s %-8s %-8s %-8s %-8s %-8s\n' \ + "$(date '+%H:%M:%S')" \ + "$1" \ + "$2" \ + "$3" \ + "$4" \ + "$5" \ + "$6" \ + "$7" \ + "$8" \ + "$9" \ + "${10}" +} + +print_agg_row() { + printf '%-8s %-12s %-18s %-14s %-8s %-8s %-8s %-8s %-8s %-8s %-8s\n' \ + "$(date '+%H:%M:%S')" \ + "$1" \ + "$2" \ + "$3" \ + "$4" \ + "$5" \ + "$6" \ + "$7" \ + "$8" \ + "$9" \ + "${10}" +} + +snapshot_tsv() { + local actions_json="$1" + jq -r \ + --argjson expected "${EXPECTED_TOP_ACTIONS}" \ + ' + [ .[] ] as $all + | ($all | map(select(.id.name == "a0")) | .[0]) as $root + | [ $all[] | select(.id.name != "a0") ] as $kids + | { + root_phase: ($root.status.phase // "MISSING"), + seen: ($kids | length), + queued: ($kids | 
map(select(.status.phase == "ACTION_PHASE_QUEUED")) | length), + waiting: ($kids | map(select(.status.phase == "ACTION_PHASE_WAITING_FOR_RESOURCES")) | length), + initializing: ($kids | map(select(.status.phase == "ACTION_PHASE_INITIALIZING")) | length), + running: ($kids | map(select(.status.phase == "ACTION_PHASE_RUNNING")) | length), + succeeded: ($kids | map(select(.status.phase == "ACTION_PHASE_SUCCEEDED")) | length), + failed: ($kids | map(select(.status.phase == "ACTION_PHASE_FAILED")) | length), + aborted: ($kids | map(select(.status.phase == "ACTION_PHASE_ABORTED")) | length), + timed_out: ($kids | map(select(.status.phase == "ACTION_PHASE_TIMED_OUT")) | length) + } + | .not_seen = (if $expected > .seen then ($expected - .seen) else 0 end) + | .active = (.queued + .waiting + .initializing + .running) + | [ + .root_phase, + .seen, + .not_seen, + .queued, + .waiting, + .initializing, + .running, + .active, + .succeeded, + .failed, + .aborted, + .timed_out + ] + | @tsv + ' <<<"${actions_json}" +} + +child_run_known() { + local target="$1" + local existing="" + for existing in "${CHILD_RUNS[@]}"; do + if [[ "${existing}" == "${target}" ]]; then + return 0 + fi + done + return 1 +} + +discover_child_runs_from_root() { + local root_actions_json="$1" + local action_name="" + local child_run="" + + while IFS= read -r action_name; do + [[ -z "${action_name}" ]] && continue + while IFS= read -r child_run; do + [[ -z "${child_run}" ]] && continue + if ! 
child_run_known "${child_run}"; then + CHILD_RUNS+=("${child_run}") + fi + done < <( + fetch_action_logs "${RUN_NAME}" "${action_name}" \ + | perl -ne ' + if (/submitted_run idx=\d+ url=.*\/runs\/([^\/?\s]+)/) { + print "$1\n"; + } elsif (/submitted_run idx=\d+ url=([ur][[:alnum:]]{5,})/) { + print "$1\n"; + } + ' + ) + done < <(jq -r '.[] | select(.id.name != "a0") | .id.name' <<<"${root_actions_json}") +} + +aggregate_child_runs_tsv() { + local discovered=0 + local roots_terminal=0 + local seen=0 + local queued=0 + local waiting=0 + local initializing=0 + local running=0 + local succeeded=0 + local failed=0 + local aborted=0 + local timed_out=0 + local run_name="" + local json="" + local root_phase="" + local c_seen=0 + local c_queued=0 + local c_waiting=0 + local c_initializing=0 + local c_running=0 + local c_succeeded=0 + local c_failed=0 + local c_aborted=0 + local c_timed_out=0 + local not_created=0 + local active=0 + + discovered="${#CHILD_RUNS[@]}" + for run_name in "${CHILD_RUNS[@]}"; do + if ! 
json="$(fetch_actions_json_for_run "${run_name}" 2>/dev/null)"; then + continue + fi + IFS=$'\t' read -r root_phase c_seen c_queued c_waiting c_initializing c_running c_succeeded c_failed c_aborted c_timed_out \ + <<<"$(jq -r ' + [ .[] ] as $all + | ($all | map(select(.id.name == "a0")) | .[0]) as $root + | [ $all[] | select(.id.name != "a0") ] as $kids + | [ + ($root.status.phase // "MISSING"), + ($kids | length), + ($kids | map(select(.status.phase == "ACTION_PHASE_QUEUED")) | length), + ($kids | map(select(.status.phase == "ACTION_PHASE_WAITING_FOR_RESOURCES")) | length), + ($kids | map(select(.status.phase == "ACTION_PHASE_INITIALIZING")) | length), + ($kids | map(select(.status.phase == "ACTION_PHASE_RUNNING")) | length), + ($kids | map(select(.status.phase == "ACTION_PHASE_SUCCEEDED")) | length), + ($kids | map(select(.status.phase == "ACTION_PHASE_FAILED")) | length), + ($kids | map(select(.status.phase == "ACTION_PHASE_ABORTED")) | length), + ($kids | map(select(.status.phase == "ACTION_PHASE_TIMED_OUT")) | length) + ] + | @tsv + ' <<<"${json}")" + + if is_terminal_phase "${root_phase}"; then + roots_terminal=$((roots_terminal + 1)) + fi + seen=$((seen + c_seen)) + queued=$((queued + c_queued)) + waiting=$((waiting + c_waiting)) + initializing=$((initializing + c_initializing)) + running=$((running + c_running)) + succeeded=$((succeeded + c_succeeded)) + failed=$((failed + c_failed)) + aborted=$((aborted + c_aborted)) + timed_out=$((timed_out + c_timed_out)) + done + + if (( EXPECTED_TOTAL_CHILDREN > seen )); then + not_created=$((EXPECTED_TOTAL_CHILDREN - seen)) + fi + active=$((queued + waiting + initializing + running)) + + printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' \ + "${discovered}/${EXPECTED_CHILD_RUNS}" \ + "${seen}/${EXPECTED_TOTAL_CHILDREN}" \ + "${not_created}" \ + "${queued}" \ + "${waiting}" \ + "${initializing}" \ + "${running}" \ + "${active}" \ + "${succeeded}" \ + "${failed}" \ + "${aborted}" \ + "${timed_out}" \ + 
"${roots_terminal}" +} + +print_root_summary() { + local root_phase="$1" + local seen="$2" + local succeeded="$3" + local failed="$4" + local aborted="$5" + local timed_out="$6" + local total_elapsed="$(( $(date +%s) - SCRIPT_START_EPOCH ))" + + echo + echo "Root Summary" + echo " run_name: ${RUN_NAME:-}" + echo " root_phase: ${root_phase}" + echo " abort_requested: $(if [[ "${STOPPING}" -eq 1 ]]; then echo yes; else echo no; fi)" + echo " top_level_actions_seen: ${seen}/${EXPECTED_TOP_ACTIONS}" + echo " succeeded: ${succeeded}" + echo " failed: ${failed}" + echo " aborted: ${aborted}" + echo " timed_out: ${timed_out}" + echo " peak_seen: ${PEAK_SEEN}/${EXPECTED_TOP_ACTIONS}" + echo " peak_running: ${PEAK_RUNNING}" + echo " peak_active: ${PEAK_ACTIVE}" + echo " time_to_run_visible: $(format_duration "$(elapsed_from_start "${RUN_VISIBLE_AT}")")" + echo " time_to_first_running: $(format_duration "$(elapsed_from_start "${FIRST_RUNNING_AT}")")" + echo " time_to_root_terminal: $(format_duration "$(elapsed_from_start "${ROOT_TERMINAL_AT}")")" + echo " total_elapsed: $(format_duration "${total_elapsed}")" +} + +print_agg_summary() { + local discovered="$1" + local roots_terminal="$2" + local seen="$3" + local succeeded="$4" + local failed="$5" + local aborted="$6" + local timed_out="$7" + local total_elapsed="$(( $(date +%s) - SCRIPT_START_EPOCH ))" + + echo + echo "Aggregate Summary" + echo " child_runs_discovered: ${discovered}/${EXPECTED_CHILD_RUNS}" + echo " total_expected_children: ${EXPECTED_TOTAL_CHILDREN}" + echo " child_run_roots_terminal: ${roots_terminal}/${discovered}" + echo " children_seen: ${seen}/${EXPECTED_TOTAL_CHILDREN}" + echo " succeeded: ${succeeded}" + echo " failed: ${failed}" + echo " aborted: ${aborted}" + echo " timed_out: ${timed_out}" + echo " peak_seen: ${AGG_PEAK_SEEN}/${EXPECTED_TOTAL_CHILDREN}" + echo " peak_running: ${AGG_PEAK_RUNNING}" + echo " peak_active: ${AGG_PEAK_ACTIVE}" + echo " child_runs_discovered_at: $(format_duration 
"$(elapsed_from_start "${CHILD_RUNS_DISCOVERED_AT}")")" + echo " aggregate_first_running: $(format_duration "$(elapsed_from_start "${AGG_FIRST_RUNNING_AT}")")" + echo " aggregate_all_visible: $(format_duration "$(elapsed_from_start "${AGG_ALL_VISIBLE_AT}")")" + echo " aggregate_terminal: $(format_duration "$(elapsed_from_start "${AGG_TERMINAL_AT}")")" + echo " total_elapsed: $(format_duration "${total_elapsed}")" +} + +cd "${REPO_ROOT}" + +if [[ -n "${RUN_NAME}" ]]; then + echo "Launching swarm run ${RUN_NAME}" +else + echo "Launching swarm run with generated actions name" +fi +echo " config: ${CONFIG}" +echo " swarm_size: ${SWARM_SIZE}" +echo " runs_per_worker: ${RUNS_PER_WORKER}" +echo " max_rps: ${MAX_RPS}" +echo " n_children_per_run: ${N_CHILDREN}" +echo " sleep_duration: ${SLEEP_DURATION}" +echo " poll_interval: ${POLL_INTERVAL}s" +echo " image target: ${IMAGE_REGISTRY}/${IMAGE_NAME}" +echo " image builder: ${IMAGE_BUILDER}" +echo " image platforms: ${FLYTE_STRESS_IMAGE_PLATFORMS:-linux/amd64}" +if [[ -n "${PROJECT}" || -n "${DOMAIN}" ]]; then + echo " project/domain override: ${PROJECT:-} / ${DOMAIN:-}" +fi +if [[ "${#RUN_ENV_ARGS[@]}" -gt 0 ]]; then + echo " run env overrides: ${RUN_ENV_ARGS[*]}" +fi +echo " child_runs_expected: ${EXPECTED_CHILD_RUNS}" +echo " total_children_expected: ${EXPECTED_TOTAL_CHILDREN}" +echo +printf '%-8s %-28s %-12s %-10s %-8s %-8s %-8s %-8s %-8s %-8s %-8s\n' \ + "time" "root_phase" "seen_top" "not_seen" "queued" "waiting" "init" "running" "active" "ok" "aborted" + +( + export FLYTE_STRESS_IMAGE_REGISTRY="${IMAGE_REGISTRY}" + export FLYTE_STRESS_IMAGE_NAME="${IMAGE_NAME}" + if [[ -n "${REQUESTED_RUN_NAME}" ]]; then + flyte_cmd run "${run_args[@]}" "${RUN_ENV_ARGS[@]}" --name "${REQUESTED_RUN_NAME}" \ + examples/stress/sleep_fanout.py main \ + --swarm_size "${SWARM_SIZE}" \ + --runs_per_worker "${RUNS_PER_WORKER}" \ + --max_rps "${MAX_RPS}" \ + --n_children "${N_CHILDREN}" \ + --sleep_duration "${SLEEP_DURATION}" + else + flyte_cmd 
run "${run_args[@]}" "${RUN_ENV_ARGS[@]}" \ + examples/stress/sleep_fanout.py main \ + --swarm_size "${SWARM_SIZE}" \ + --runs_per_worker "${RUNS_PER_WORKER}" \ + --max_rps "${MAX_RPS}" \ + --n_children "${N_CHILDREN}" \ + --sleep_duration "${SLEEP_DURATION}" + fi +) >"${LAUNCH_LOG}" 2>&1 & +LAUNCH_PID=$! + +FINAL_ROOT_PHASE="UNKNOWN" +FINAL_SEEN=0 +FINAL_SUCCEEDED=0 +FINAL_FAILED=0 +FINAL_ABORTED=0 +FINAL_TIMED_OUT=0 + +while true; do + if [[ -z "${RUN_NAME}" ]] && resolve_run_name_from_log; then + echo + echo "Resolved run name: ${RUN_NAME}" + fi + + if [[ "${LAUNCH_DONE}" -eq 0 ]] && ! kill -0 "${LAUNCH_PID}" 2>/dev/null; then + if wait "${LAUNCH_PID}"; then + LAUNCH_RC=0 + else + LAUNCH_RC=$? + fi + LAUNCH_DONE=1 + + if [[ "${LAUNCH_RC}" -eq 0 ]]; then + echo + echo "Launch command completed for run ${RUN_NAME:-}." + elif [[ "${RUN_VISIBLE}" -eq 0 ]]; then + echo + echo "Launch command failed before the run became visible." >&2 + cat "${LAUNCH_LOG}" >&2 + exit "${LAUNCH_RC}" + else + echo + echo "Launch command exited with ${LAUNCH_RC}, but the run is already visible. Continuing to monitor." 
>&2 + cat "${LAUNCH_LOG}" >&2 + fi + fi + + if [[ -n "${RUN_NAME}" ]] && actions_json="$(fetch_actions_json 2>/dev/null)"; then + now_epoch="$(date +%s)" + RUN_VISIBLE=1 + FINAL_ROOT_ACTIONS_JSON="${actions_json}" + if [[ -z "${RUN_VISIBLE_AT}" ]]; then + RUN_VISIBLE_AT="${now_epoch}" + fi + + if [[ "${STOPPING}" -eq 1 && "${ABORT_SENT}" -eq 0 ]]; then + abort_remote_run + fi + + IFS=$'\t' read -r root_phase seen not_seen queued waiting initializing running active succeeded failed aborted timed_out \ + <<<"$(snapshot_tsv "${actions_json}")" + + FINAL_ROOT_PHASE="${root_phase}" + FINAL_SEEN="${seen}" + FINAL_SUCCEEDED="${succeeded}" + FINAL_FAILED="${failed}" + FINAL_ABORTED="${aborted}" + FINAL_TIMED_OUT="${timed_out}" + + if (( seen > PEAK_SEEN )); then + PEAK_SEEN="${seen}" + fi + if (( running > PEAK_RUNNING )); then + PEAK_RUNNING="${running}" + fi + if (( active > PEAK_ACTIVE )); then + PEAK_ACTIVE="${active}" + fi + if [[ -z "${FIRST_RUNNING_AT}" && "${running}" -gt 0 ]]; then + FIRST_RUNNING_AT="${now_epoch}" + fi + + print_row \ + "${root_phase}" \ + "${seen}/${EXPECTED_TOP_ACTIONS}" \ + "${not_seen}" \ + "${queued}" \ + "${waiting}" \ + "${initializing}" \ + "${running}" \ + "${active}" \ + "${succeeded}" \ + "${aborted}" + + if is_terminal_phase "${root_phase}"; then + ROOT_TERMINAL_AT="${now_epoch}" + break + fi + else + if stage="$(launch_stage_from_log)"; then + if [[ "${stage}" != "${LAST_LAUNCH_STAGE}" ]]; then + LAST_LAUNCH_STAGE="${stage}" + echo "launch: ${stage}" + fi + fi + print_row "RESOLVING_RUN" "0/${EXPECTED_TOP_ACTIONS}" "${EXPECTED_TOP_ACTIONS}" 0 0 0 0 0 0 0 + fi + + sleep "${POLL_INTERVAL}" +done + +if [[ "${LAUNCH_DONE}" -eq 0 ]]; then + if wait "${LAUNCH_PID}"; then + true + else + true + fi +fi + +print_root_summary \ + "${FINAL_ROOT_PHASE}" \ + "${FINAL_SEEN}" \ + "${FINAL_SUCCEEDED}" \ + "${FINAL_FAILED}" \ + "${FINAL_ABORTED}" \ + "${FINAL_TIMED_OUT}" + +if [[ "${FINAL_ROOT_PHASE}" == "ACTION_PHASE_SUCCEEDED" && -n 
"${FINAL_ROOT_ACTIONS_JSON}" ]]; then + discover_child_runs_from_root "${FINAL_ROOT_ACTIONS_JSON}" + + if [[ "${#CHILD_RUNS[@]}" -gt 0 ]]; then + CHILD_RUNS_DISCOVERED_AT="$(date +%s)" + + echo + echo "Discovered child runs: ${#CHILD_RUNS[@]}/${EXPECTED_CHILD_RUNS}" + printf '%s\n' "${CHILD_RUNS[@]}" + echo + printf '%-8s %-12s %-18s %-14s %-8s %-8s %-8s %-8s %-8s %-8s %-8s\n' \ + "time" "runs" "seen_children" "not_created" "queued" "waiting" "init" "running" "active" "ok" "aborted" + + AGG_FINAL_DISCOVERED=0 + AGG_FINAL_SEEN=0 + AGG_FINAL_SUCCEEDED=0 + AGG_FINAL_FAILED=0 + AGG_FINAL_ABORTED=0 + AGG_FINAL_TIMED_OUT=0 + AGG_FINAL_ROOTS_TERMINAL=0 + + while true; do + IFS=$'\t' read -r runs seen_children not_created queued waiting initializing running active succeeded failed aborted timed_out roots_terminal \ + <<<"$(aggregate_child_runs_tsv)" + + AGG_FINAL_DISCOVERED="${runs%%/*}" + AGG_FINAL_SEEN="${seen_children%%/*}" + AGG_FINAL_SUCCEEDED="${succeeded}" + AGG_FINAL_FAILED="${failed}" + AGG_FINAL_ABORTED="${aborted}" + AGG_FINAL_TIMED_OUT="${timed_out}" + AGG_FINAL_ROOTS_TERMINAL="${roots_terminal}" + + if (( AGG_FINAL_SEEN > AGG_PEAK_SEEN )); then + AGG_PEAK_SEEN="${AGG_FINAL_SEEN}" + fi + if (( running > AGG_PEAK_RUNNING )); then + AGG_PEAK_RUNNING="${running}" + fi + if (( active > AGG_PEAK_ACTIVE )); then + AGG_PEAK_ACTIVE="${active}" + fi + if [[ -z "${AGG_FIRST_RUNNING_AT}" && "${running}" -gt 0 ]]; then + AGG_FIRST_RUNNING_AT="$(date +%s)" + fi + if [[ -z "${AGG_ALL_VISIBLE_AT}" && "${AGG_FINAL_SEEN}" -ge "${EXPECTED_TOTAL_CHILDREN}" ]]; then + AGG_ALL_VISIBLE_AT="$(date +%s)" + fi + + print_agg_row \ + "${runs}" \ + "${seen_children}" \ + "${not_created}" \ + "${queued}" \ + "${waiting}" \ + "${initializing}" \ + "${running}" \ + "${active}" \ + "${succeeded}" \ + "${aborted}" + + if (( AGG_FINAL_DISCOVERED == EXPECTED_CHILD_RUNS )) && (( AGG_FINAL_ROOTS_TERMINAL == AGG_FINAL_DISCOVERED )) && (( active == 0 )); then + AGG_TERMINAL_AT="$(date +%s)" + break 
+ fi + + sleep "${POLL_INTERVAL}" + done + + print_agg_summary \ + "${AGG_FINAL_DISCOVERED}" \ + "${AGG_FINAL_ROOTS_TERMINAL}" \ + "${AGG_FINAL_SEEN}" \ + "${AGG_FINAL_SUCCEEDED}" \ + "${AGG_FINAL_FAILED}" \ + "${AGG_FINAL_ABORTED}" \ + "${AGG_FINAL_TIMED_OUT}" + else + echo + echo "No child runs discovered from submitter logs." + fi +fi diff --git a/examples/stress/sleep_fanout_wrapper.sh b/examples/stress/sleep_fanout_wrapper.sh new file mode 100755 index 000000000..8cf4188dd --- /dev/null +++ b/examples/stress/sleep_fanout_wrapper.sh @@ -0,0 +1,610 @@ +#!/usr/bin/env bash + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" + +CONFIG="${HOME}/.flyte/config-dogfood.yaml" +RUN_NAME="" +REQUESTED_RUN_NAME="" +N_CHILDREN=10 +SLEEP_DURATION=10 +POLL_INTERVAL=1 +ABORT_REASON="wrapper interrupted" +PROJECT="" +DOMAIN="" +IMAGE_REGISTRY="${FLYTE_STRESS_IMAGE_REGISTRY:-376129846803.dkr.ecr.us-east-2.amazonaws.com/union}" +IMAGE_NAME="${FLYTE_STRESS_IMAGE_NAME:-dogfood}" +IMAGE_BUILDER="${FLYTE_STRESS_IMAGE_BUILDER:-remote}" +RUN_ENV_ARGS=() + +LAUNCH_PID="" +LAUNCH_LOG="" +LAUNCH_DONE=0 +LAUNCH_RC=0 +RUN_VISIBLE=0 +ABORT_SENT=0 +STOPPING=0 +STOPPING_AT="" +ABORT_NOTE_SHOWN=0 +INTERRUPT_GRACE_SEC=15 + +SCRIPT_START_EPOCH="$(date +%s)" +RUN_VISIBLE_AT="" +ALL_CHILDREN_VISIBLE_AT="" +FIRST_RUNNING_AT="" +FIRST_SUCCESS_AT="" +ROOT_TERMINAL_AT="" + +PEAK_SEEN=0 +PEAK_RUNNING=0 +PEAK_ACTIVE=0 +LAST_LAUNCH_STAGE="" + +usage() { + cat <<'EOF' +Usage: + examples/stress/sleep_fanout_wrapper.sh [options] + +Options: + --config PATH Flyte config path. Default: ~/.flyte/config-dogfood.yaml + --project NAME Override project for launch/get/abort. + --domain NAME Override domain for launch/get/abort. + --run-name NAME Use a fixed run name. Avoid this on _U_USE_ACTIONS=1 if you want abort to work. + --image-registry VALUE Registry prefix for the task image. 
Default: 376129846803.dkr.ecr.us-east-2.amazonaws.com/union + --image-name VALUE Repository name for the task image. Default: dogfood + --image-builder VALUE Flyte image builder to use. Default: remote + --run-env KEY=VALUE Pass through to 'flyte run --env'. Can be specified multiple times. + --n-children INT Number of child actions. Default: 10 + --sleep-duration VALUE Sleep duration passed to the task. Default: 10 + --poll-interval SEC Poll interval in seconds. Default: 1 + --abort-reason TEXT Reason passed to 'flyte abort run'. Default: wrapper interrupted + --help Show this message. + +Example: + examples/stress/sleep_fanout_wrapper.sh \ + --config ~/.flyte/config-dogfood.yaml \ + --n-children 10 \ + --sleep-duration 10 +EOF +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --config) + CONFIG="$2" + shift 2 + ;; + --project) + PROJECT="$2" + shift 2 + ;; + --domain) + DOMAIN="$2" + shift 2 + ;; + --run-name) + REQUESTED_RUN_NAME="$2" + shift 2 + ;; + --image-registry) + IMAGE_REGISTRY="$2" + shift 2 + ;; + --image-name) + IMAGE_NAME="$2" + shift 2 + ;; + --image-builder) + IMAGE_BUILDER="$2" + shift 2 + ;; + --run-env) + RUN_ENV_ARGS+=("--env" "$2") + shift 2 + ;; + --n-children) + N_CHILDREN="$2" + shift 2 + ;; + --sleep-duration) + SLEEP_DURATION="$2" + shift 2 + ;; + --poll-interval) + POLL_INTERVAL="$2" + shift 2 + ;; + --abort-reason) + ABORT_REASON="$2" + shift 2 + ;; + --help) + usage + exit 0 + ;; + *) + echo "Unknown option: $1" >&2 + usage >&2 + exit 2 + ;; + esac +done + +RUN_NAME="${REQUESTED_RUN_NAME}" + +if ! command -v flyte >/dev/null 2>&1; then + echo "flyte is required but was not found in PATH." >&2 + exit 1 +fi + +if ! command -v jq >/dev/null 2>&1; then + echo "jq is required but was not found in PATH." 
>&2 + exit 1 +fi + +CONFIG="${CONFIG/#\~/${HOME}}" +LAUNCH_LOG="$(mktemp "${TMPDIR:-/tmp}/sleep-fanout-launch.XXXXXX.log")" + +cleanup() { + if [[ -n "${LAUNCH_LOG}" && -f "${LAUNCH_LOG}" ]]; then + rm -f "${LAUNCH_LOG}" + fi +} +trap cleanup EXIT + +flyte_cmd() { + _U_USE_ACTIONS="${_U_USE_ACTIONS:-1}" flyte -c "${CONFIG}" --image-builder "${IMAGE_BUILDER}" "$@" +} + +flyte_cmd_json() { + COLUMNS=500 _U_USE_ACTIONS="${_U_USE_ACTIONS:-1}" flyte -c "${CONFIG}" --image-builder "${IMAGE_BUILDER}" -of json-raw "$@" \ + | perl -pe 's/\e\[[0-9;]*[A-Za-z]//g' +} + +project_args=() +domain_args=() +run_args=() + +if [[ -n "${PROJECT}" ]]; then + project_args=(-p "${PROJECT}") + run_args+=(-p "${PROJECT}") +fi + +if [[ -n "${DOMAIN}" ]]; then + domain_args=(-d "${DOMAIN}") + run_args+=(-d "${DOMAIN}") +fi + +abort_remote_run() { + if [[ "${ABORT_SENT}" -eq 1 || "${RUN_VISIBLE}" -eq 0 || -z "${RUN_NAME}" ]]; then + return + fi + + ABORT_SENT=1 + echo + echo "Requesting abort for run ${RUN_NAME}..." + if ! flyte_cmd abort run "${project_args[@]}" "${domain_args[@]}" --reason "${ABORT_REASON}" "${RUN_NAME}"; then + echo "Abort request failed for run ${RUN_NAME}." >&2 + fi +} + +handle_signal() { + local sig="$1" + if [[ "${STOPPING}" -eq 1 ]]; then + echo + echo "Received ${sig} again, exiting immediately." + exit 130 + fi + STOPPING=1 + STOPPING_AT="$(date +%s)" + + echo + echo "Received ${sig}, requesting abort for run ${RUN_NAME:-}." + echo "Continuing to monitor until the run reaches a terminal phase. Press Ctrl-C again to exit immediately." 
+ + if [[ -n "${LAUNCH_PID}" ]] && kill -0 "${LAUNCH_PID}" 2>/dev/null; then + kill "${LAUNCH_PID}" 2>/dev/null || true + fi + + abort_remote_run +} + +trap 'handle_signal INT' INT +trap 'handle_signal TERM' TERM + +format_duration() { + local value="$1" + if [[ -z "${value}" ]]; then + echo "n/a" + return + fi + + local seconds="$value" + printf '%02dh:%02dm:%02ds' "$((seconds / 3600))" "$(((seconds % 3600) / 60))" "$((seconds % 60))" +} + +elapsed_from_start() { + local epoch="$1" + if [[ -z "${epoch}" ]]; then + echo "" + return + fi + echo "$((epoch - SCRIPT_START_EPOCH))" +} + +is_terminal_phase() { + case "$1" in + ACTION_PHASE_SUCCEEDED|ACTION_PHASE_FAILED|ACTION_PHASE_ABORTED|ACTION_PHASE_TIMED_OUT) + return 0 + ;; + *) + return 1 + ;; + esac +} + +fetch_actions_json() { + flyte_cmd_json get action "${project_args[@]}" "${domain_args[@]}" "${RUN_NAME}" +} + +sanitize_run_name() { + local value="$1" + value="$(printf '%s' "${value}" | tr -d '\r')" + value="$(printf '%s' "${value}" | sed 's/[[:space:]]*$//')" + value="$(printf '%s' "${value}" | grep -Eo '[ur][[:alnum:]]{5,}' | head -n 1 || true)" + printf '%s' "${value}" +} + +resolve_run_name_from_log() { + if [[ -z "${LAUNCH_LOG}" || ! -s "${LAUNCH_LOG}" ]]; then + return 1 + fi + + local parsed="" + parsed="$( + perl -pe 's/\e\[[0-9;]*[A-Za-z]//g' "${LAUNCH_LOG}" \ + | sed -n 's/.*Created Run: //p' \ + | tail -n 1 + )" + + if [[ -z "${parsed}" ]]; then + parsed="$( + perl -pe 's/\e\[[0-9;]*[A-Za-z]//g' "${LAUNCH_LOG}" \ + | sed -n 's#.*URL: .*/runs/\([^/?[:space:]]*\).*#\1#p' \ + | tail -n 1 + )" + fi + + if [[ -z "${parsed}" ]]; then + return 1 + fi + + parsed="$(sanitize_run_name "${parsed}")" + if [[ -z "${parsed}" ]]; then + return 1 + fi + + RUN_NAME="${parsed}" + return 0 +} + +launch_stage_from_log() { + if [[ -z "${LAUNCH_LOG}" || ! 
-s "${LAUNCH_LOG}" ]]; then + return 1 + fi + + local lines="" + local stage="" + + lines="$( + perl -pe 's/\e\[[0-9;]*[A-Za-z]//g; s/\r/\n/g' "${LAUNCH_LOG}" \ + | sed '/^[[:space:]]*$/d' \ + | tail -n 100 + )" + + if [[ -z "${lines}" ]]; then + return 1 + fi + + stage="$( + printf '%s\n' "${lines}" \ + | grep -E '^(Building|Pushing|Image |Created Run:|URL:|Error:|ERROR|Failed|failed|Using |#)' \ + | tail -n 1 || true + )" + + if [[ -z "${stage}" ]]; then + stage="$(printf '%s\n' "${lines}" | tail -n 1)" + fi + + if [[ -n "${stage}" ]]; then + printf '%s' "${stage}" + return 0 + fi + + return 1 +} + +print_row() { + printf '%-8s %-28s %-14s %-14s %-8s %-8s %-8s %-8s %-8s %-8s %-8s\n' \ + "$(date '+%H:%M:%S')" \ + "$1" \ + "$2" \ + "$3" \ + "$4" \ + "$5" \ + "$6" \ + "$7" \ + "$8" \ + "$9" \ + "${10}" +} + +snapshot_tsv() { + local actions_json="$1" + jq -r \ + --argjson expected "${N_CHILDREN}" \ + ' + [ .[] ] as $all + | ($all | map(select(.id.name == "a0")) | .[0]) as $root + | [ $all[] | select(.id.name != "a0") ] as $kids + | { + root_phase: ($root.status.phase // "MISSING"), + seen: ($kids | length), + queued: ($kids | map(select(.status.phase == "ACTION_PHASE_QUEUED")) | length), + waiting: ($kids | map(select(.status.phase == "ACTION_PHASE_WAITING_FOR_RESOURCES")) | length), + initializing: ($kids | map(select(.status.phase == "ACTION_PHASE_INITIALIZING")) | length), + running: ($kids | map(select(.status.phase == "ACTION_PHASE_RUNNING")) | length), + succeeded: ($kids | map(select(.status.phase == "ACTION_PHASE_SUCCEEDED")) | length), + failed: ($kids | map(select(.status.phase == "ACTION_PHASE_FAILED")) | length), + aborted: ($kids | map(select(.status.phase == "ACTION_PHASE_ABORTED")) | length), + timed_out: ($kids | map(select(.status.phase == "ACTION_PHASE_TIMED_OUT")) | length) + } + | .not_created = (if $expected > .seen then ($expected - .seen) else 0 end) + | .active = (.queued + .waiting + .initializing + .running) + | [ + .root_phase, + .seen, + 
.not_created, + .queued, + .waiting, + .initializing, + .running, + .active, + .succeeded, + .failed, + .aborted, + .timed_out + ] + | @tsv + ' <<<"${actions_json}" +} + +print_summary() { + local root_phase="$1" + local seen="$2" + local succeeded="$3" + local failed="$4" + local aborted="$5" + local timed_out="$6" + local total_elapsed="$(( $(date +%s) - SCRIPT_START_EPOCH ))" + + echo + echo "Summary" + echo " run_name: ${RUN_NAME:-}" + echo " root_phase: ${root_phase}" + echo " abort_requested: $(if [[ "${STOPPING}" -eq 1 ]]; then echo yes; else echo no; fi)" + echo " children_seen: ${seen}/${N_CHILDREN}" + echo " succeeded: ${succeeded}" + echo " failed: ${failed}" + echo " aborted: ${aborted}" + echo " timed_out: ${timed_out}" + echo " peak_seen: ${PEAK_SEEN}/${N_CHILDREN}" + echo " peak_running: ${PEAK_RUNNING}" + echo " peak_active: ${PEAK_ACTIVE}" + echo " time_to_run_visible: $(format_duration "$(elapsed_from_start "${RUN_VISIBLE_AT}")")" + echo " time_to_all_children_visible: $(format_duration "$(elapsed_from_start "${ALL_CHILDREN_VISIBLE_AT}")")" + echo " time_to_first_running: $(format_duration "$(elapsed_from_start "${FIRST_RUNNING_AT}")")" + echo " time_to_first_success: $(format_duration "$(elapsed_from_start "${FIRST_SUCCESS_AT}")")" + echo " time_to_root_terminal: $(format_duration "$(elapsed_from_start "${ROOT_TERMINAL_AT}")")" + echo " total_elapsed: $(format_duration "${total_elapsed}")" +} + +cd "${REPO_ROOT}" + +if [[ -n "${RUN_NAME}" ]]; then + echo "Launching run ${RUN_NAME}" +else + echo "Launching run with generated actions name" +fi +echo " config: ${CONFIG}" +echo " children: ${N_CHILDREN}" +echo " sleep_duration: ${SLEEP_DURATION}" +echo " poll_interval: ${POLL_INTERVAL}s" +echo " image target: ${IMAGE_REGISTRY}/${IMAGE_NAME}" +echo " image builder: ${IMAGE_BUILDER}" +echo " image platforms: ${FLYTE_STRESS_IMAGE_PLATFORMS:-linux/amd64}" +if [[ -n "${PROJECT}" || -n "${DOMAIN}" ]]; then + echo " project/domain override: ${PROJECT:-} / 
${DOMAIN:-}" +fi +if [[ "${#RUN_ENV_ARGS[@]}" -gt 0 ]]; then + echo " run env overrides: ${RUN_ENV_ARGS[*]}" +fi +if [[ -n "${REQUESTED_RUN_NAME}" && "${_U_USE_ACTIONS:-1}" == "1" ]]; then + echo " warning: custom run names can break abort routing on the actions path" +fi +echo +printf '%-8s %-28s %-14s %-14s %-8s %-8s %-8s %-8s %-8s %-8s %-8s\n' \ + "time" "root_phase" "seen" "not_created" "queued" "waiting" "init" "running" "active" "ok" "aborted" + +( + export FLYTE_STRESS_IMAGE_REGISTRY="${IMAGE_REGISTRY}" + export FLYTE_STRESS_IMAGE_NAME="${IMAGE_NAME}" + if [[ -n "${REQUESTED_RUN_NAME}" ]]; then + flyte_cmd run "${run_args[@]}" "${RUN_ENV_ARGS[@]}" --name "${REQUESTED_RUN_NAME}" \ + examples/stress/sleep_fanout.py sleep_fanout \ + --n_children "${N_CHILDREN}" \ + --sleep_duration "${SLEEP_DURATION}" + else + flyte_cmd run "${run_args[@]}" "${RUN_ENV_ARGS[@]}" \ + examples/stress/sleep_fanout.py sleep_fanout \ + --n_children "${N_CHILDREN}" \ + --sleep_duration "${SLEEP_DURATION}" + fi +) >"${LAUNCH_LOG}" 2>&1 & +LAUNCH_PID=$! + +FINAL_ROOT_PHASE="UNKNOWN" +FINAL_SEEN=0 +FINAL_SUCCEEDED=0 +FINAL_FAILED=0 +FINAL_ABORTED=0 +FINAL_TIMED_OUT=0 + +while true; do + if [[ -z "${RUN_NAME}" ]] && resolve_run_name_from_log; then + echo + echo "Resolved run name: ${RUN_NAME}" + fi + + if [[ "${LAUNCH_DONE}" -eq 0 ]] && ! kill -0 "${LAUNCH_PID}" 2>/dev/null; then + if wait "${LAUNCH_PID}"; then + LAUNCH_RC=0 + else + LAUNCH_RC=$? + fi + LAUNCH_DONE=1 + + if [[ "${LAUNCH_RC}" -eq 0 ]]; then + echo + echo "Launch command completed for run ${RUN_NAME:-}." + elif [[ "${RUN_VISIBLE}" -eq 0 ]]; then + echo + echo "Launch command failed before the run became visible." >&2 + cat "${LAUNCH_LOG}" >&2 + exit "${LAUNCH_RC}" + else + echo + echo "Launch command exited with ${LAUNCH_RC}, but the run is already visible. Continuing to monitor." 
>&2 + cat "${LAUNCH_LOG}" >&2 + fi + fi + + if [[ -n "${RUN_NAME}" ]] && actions_json="$(fetch_actions_json 2>/dev/null)"; then + now_epoch="$(date +%s)" + RUN_VISIBLE=1 + if [[ -z "${RUN_VISIBLE_AT}" ]]; then + RUN_VISIBLE_AT="${now_epoch}" + fi + + if [[ "${STOPPING}" -eq 1 && "${ABORT_SENT}" -eq 0 ]]; then + abort_remote_run + fi + + IFS=$'\t' read -r root_phase seen not_created queued waiting initializing running active succeeded failed aborted timed_out \ + <<<"$(snapshot_tsv "${actions_json}")" + + FINAL_ROOT_PHASE="${root_phase}" + FINAL_SEEN="${seen}" + FINAL_SUCCEEDED="${succeeded}" + FINAL_FAILED="${failed}" + FINAL_ABORTED="${aborted}" + FINAL_TIMED_OUT="${timed_out}" + + if (( seen > PEAK_SEEN )); then + PEAK_SEEN="${seen}" + fi + if (( running > PEAK_RUNNING )); then + PEAK_RUNNING="${running}" + fi + if (( active > PEAK_ACTIVE )); then + PEAK_ACTIVE="${active}" + fi + + if [[ -z "${ALL_CHILDREN_VISIBLE_AT}" && "${seen}" -eq "${N_CHILDREN}" ]]; then + ALL_CHILDREN_VISIBLE_AT="${now_epoch}" + fi + if [[ -z "${FIRST_RUNNING_AT}" && "${running}" -gt 0 ]]; then + FIRST_RUNNING_AT="${now_epoch}" + fi + if [[ -z "${FIRST_SUCCESS_AT}" && "${succeeded}" -gt 0 ]]; then + FIRST_SUCCESS_AT="${now_epoch}" + fi + + print_row \ + "${root_phase}" \ + "${seen}/${N_CHILDREN}" \ + "${not_created}" \ + "${queued}" \ + "${waiting}" \ + "${initializing}" \ + "${running}" \ + "${active}" \ + "${succeeded}" \ + "${aborted}" + + if [[ "${STOPPING}" -eq 1 && "${ABORT_NOTE_SHOWN}" -eq 0 && "${ABORT_SENT}" -eq 1 ]] && ! is_terminal_phase "${root_phase}"; then + echo "Abort requested. Waiting for root action to become terminal..." 
+ ABORT_NOTE_SHOWN=1 + fi + + if is_terminal_phase "${root_phase}"; then + ROOT_TERMINAL_AT="${now_epoch}" + break + fi + else + if [[ -z "${RUN_NAME}" ]]; then + launch_stage="$(launch_stage_from_log || true)" + if [[ -n "${launch_stage}" && "${launch_stage}" != "${LAST_LAUNCH_STAGE}" ]]; then + echo "launch: ${launch_stage}" + LAST_LAUNCH_STAGE="${launch_stage}" + fi + fi + + if [[ "${LAUNCH_DONE}" -eq 1 && "${LAUNCH_RC}" -ne 0 && "${RUN_VISIBLE}" -eq 0 ]]; then + echo "Run ${RUN_NAME:-} never became visible after launch failure." >&2 + exit "${LAUNCH_RC}" + fi + + if [[ -z "${RUN_NAME}" ]]; then + print_row "RESOLVING_RUN" "0/${N_CHILDREN}" "${N_CHILDREN}" 0 0 0 0 0 0 0 + else + print_row "NO_ACTION_DATA" "0/${N_CHILDREN}" "${N_CHILDREN}" 0 0 0 0 0 0 0 + fi + + if [[ "${STOPPING}" -eq 1 && "${LAUNCH_DONE}" -eq 1 && "${RUN_VISIBLE}" -eq 0 && -n "${STOPPING_AT}" ]]; then + if (( $(date +%s) - STOPPING_AT >= INTERRUPT_GRACE_SEC )); then + echo "Run ${RUN_NAME:-} did not become visible within ${INTERRUPT_GRACE_SEC}s after interrupt. Exiting." >&2 + exit 130 + fi + fi + fi + + sleep "${POLL_INTERVAL}" +done + +if [[ "${LAUNCH_DONE}" -eq 0 ]]; then + if wait "${LAUNCH_PID}"; then + LAUNCH_RC=0 + else + LAUNCH_RC=$? 
+ fi + LAUNCH_DONE=1 +fi + +print_summary \ + "${FINAL_ROOT_PHASE}" \ + "${FINAL_SEEN}" \ + "${FINAL_SUCCEEDED}" \ + "${FINAL_FAILED}" \ + "${FINAL_ABORTED}" \ + "${FINAL_TIMED_OUT}" + +if [[ "${STOPPING}" -eq 1 ]]; then + exit 130 +fi + +if [[ "${FINAL_ROOT_PHASE}" != "ACTION_PHASE_SUCCEEDED" ]]; then + exit 1 +fi diff --git a/src/flyte/_internal/controllers/remote/_controller.py b/src/flyte/_internal/controllers/remote/_controller.py index a37356924..fd35dce2c 100644 --- a/src/flyte/_internal/controllers/remote/_controller.py +++ b/src/flyte/_internal/controllers/remote/_controller.py @@ -4,6 +4,7 @@ import concurrent.futures import os import threading +import time from collections import defaultdict from collections.abc import Callable from contextlib import nullcontext @@ -153,6 +154,7 @@ async def _submit(self, _task_call_seq: int, _task: TaskTemplate, *args, **kwarg if tctx is None: raise flyte.errors.RuntimeSystemError("BadContext", "Task context not initialized") current_action_id = tctx.action + trace_enabled = self._should_trace_sequence(_task_call_seq) # In the case of a regular code bundle, we will just pass it down as it is to the downstream tasks # It is not allowed to change the code bundle (for regular code bundles) in the middle of a run. 
@@ -167,11 +169,14 @@ async def _submit(self, _task_call_seq: int, _task: TaskTemplate, *args, **kwarg ) _ctx = ctx.new_in_driver_literal_conversion(True) if ctx.is_task_context() else nullcontext() + sdk_inputs_start = time.monotonic() with _ctx: inputs = await convert.convert_from_native_to_inputs(_task.native_interface, *args, **kwargs) + sdk_inputs_ms = (time.monotonic() - sdk_inputs_start) * 1000 root_dir = Path(code_bundle.destination).absolute() if code_bundle else Path.cwd() # Don't set output path in sec context because node executor will set it + sdk_serialize_start = time.monotonic() new_serialization_context = SerializationContext( project=current_action_id.project, domain=current_action_id.domain, @@ -189,12 +194,17 @@ async def _submit(self, _task_call_seq: int, _task: TaskTemplate, *args, **kwarg sub_action_id, sub_action_output_path = convert.generate_sub_action_id_and_output_path( tctx, task_spec, inputs_hash, _task_call_seq ) + sdk_serialize_ms = (time.monotonic() - sdk_serialize_start) * 1000 logger.info(f"Sub action {sub_action_id} output path {sub_action_output_path}") serialized_inputs = inputs.proto_inputs.SerializeToString(deterministic=True) + serialized_input_bytes = len(serialized_inputs) inputs_uri = io.inputs_path(sub_action_output_path) + storage_put_start = time.monotonic() await upload_inputs_with_retry(serialized_inputs, inputs_uri, max_bytes=_task.max_inline_io_bytes) + storage_put_ms = (time.monotonic() - storage_put_start) * 1000 + sdk_cache_start = time.monotonic() md = task_spec.task_template.metadata ignored_input_vars = [] if len(md.cache_ignore_input_vars) > 0: @@ -210,6 +220,7 @@ async def _submit(self, _task_call_seq: int, _task: TaskTemplate, *args, **kwarg ignored_input_vars, inputs.proto_inputs, ) + sdk_cache_ms = (time.monotonic() - sdk_cache_start) * 1000 # Clear to free memory serialized_inputs = None # type: ignore @@ -233,13 +244,41 @@ async def _submit(self, _task_call_seq: int, _task: TaskTemplate, *args, **kwarg 
cache_key=cache_key, queue=_task.queue, ) + self._mark_action_for_trace(action.name) + if trace_enabled: + self._trace_log( + action.name, + "sdk_prepare", + kind="sdk_only", + seq=_task_call_seq, + task=_task.name, + sdk_inputs_ms=f"{sdk_inputs_ms:.1f}", + sdk_serialize_ms=f"{sdk_serialize_ms:.1f}", + sdk_cache_ms=f"{sdk_cache_ms:.1f}", + input_bytes=serialized_input_bytes, + ) + self._trace_log( + action.name, + "storage_put_inputs", + kind="storage_api", + elapsed_ms=f"{storage_put_ms:.1f}", + input_bytes=serialized_input_bytes, + ) try: logger.info( f"Submitting action Run:[{action.run_name}, Parent:[{action.parent_action_name}], " f"task:[{_task.name}], action:[{action.name}]" ) + submit_start = time.monotonic() n = await self.submit_action(action) + if trace_enabled: + self._trace_log( + action.name, + "submit_action_done", + kind="mixed", + elapsed_ms=f"{(time.monotonic() - submit_start) * 1000:.1f}", + ) logger.info(f"Action for task [{_task.name}] action id: {action.name}, completed!") except asyncio.CancelledError: # If the action is cancelled, we need to cancel the action on the server as well diff --git a/src/flyte/_internal/controllers/remote/_core.py b/src/flyte/_internal/controllers/remote/_core.py index b67bf13be..a5fcc9d19 100644 --- a/src/flyte/_internal/controllers/remote/_core.py +++ b/src/flyte/_internal/controllers/remote/_core.py @@ -4,6 +4,7 @@ import os import sys import threading +import time from asyncio import Event from typing import Awaitable, Coroutine, Optional @@ -65,7 +66,7 @@ def __init__( self._shared_queue: asyncio.Queue[Action] = asyncio.Queue(maxsize=10000) self._running = False self._resource_log_task = None - self._workers = workers + self._workers = int(os.getenv("_F_CTRL_WORKERS", str(workers))) self._max_retries = int(os.getenv("_F_MAX_RETRIES", max_system_retries)) self._resource_log_interval = resource_log_interval_sec self._min_backoff_on_err = min_backoff_on_err_sec @@ -77,6 +78,10 @@ def __init__( 
self._informer_start_wait_timeout = thread_wait_timeout_sec max_qps = int(os.getenv("_F_MAX_QPS", "100")) self._rate_limiter = AsyncLimiter(max_qps, 1.0) + self._trace_submit = os.getenv("_F_TRACE_SUBMIT", "").lower() in {"1", "true", "yes", "on"} + self._trace_submit_limit = int(os.getenv("_F_TRACE_SUBMIT_LIMIT", "10")) + self._trace_actions: set[str] = set() + self._trace_lock = threading.Lock() # Thread management self._thread = None @@ -86,6 +91,28 @@ def __init__( self._thread_com_lock = threading.Lock() self._start() + def _should_trace_sequence(self, seq: int) -> bool: + return self._trace_submit and seq <= self._trace_submit_limit + + def _mark_action_for_trace(self, action_name: str): + if not self._trace_submit: + return + with self._trace_lock: + if len(self._trace_actions) < self._trace_submit_limit: + self._trace_actions.add(action_name) + + def _trace_enabled_for(self, action_name: str) -> bool: + if not self._trace_submit: + return False + with self._trace_lock: + return action_name in self._trace_actions + + def _trace_log(self, action_name: str, phase: str, **fields): + if not self._trace_enabled_for(action_name): + return + payload = " ".join(f"{key}={value}" for key, value in fields.items()) + print(f"submit_trace action={action_name} phase={phase} {payload}".rstrip(), flush=True) + # ---------------- Public sync methods, we can add more sync methods if needed @log def submit_action_sync(self, action: Action) -> Action: @@ -277,6 +304,8 @@ async def _bg_finalize_informer( async def _bg_submit_action(self, action: Action) -> Action: """Submit a resource and await its completion, returning the final state""" logger.debug(f"{threading.current_thread().name} Submitting action {action.name}") + trace_enabled = self._trace_enabled_for(action.name) + informer_start = time.monotonic() informer = await self._informers.get_or_create( action.action_id.run, action.parent_action_name, @@ -286,11 +315,36 @@ async def _bg_submit_action(self, action: Action) -> 
Action: timeout=self._informer_start_wait_timeout, actions_service=self._actions_service, ) + if trace_enabled: + watch_api = "actions.watch_for_updates" if self._actions_service else "state.watch" + self._trace_log( + action.name, + "informer_ready", + kind="controlplane_api", + api=watch_api, + elapsed_ms=f"{(time.monotonic() - informer_start) * 1000:.1f}", + ) + queue_submit_start = time.monotonic() await informer.submit(action) + if trace_enabled: + self._trace_log( + action.name, + "queue_submit", + kind="sdk_only", + elapsed_ms=f"{(time.monotonic() - queue_submit_start) * 1000:.1f}", + ) logger.debug(f"{threading.current_thread().name} Waiting for completion of {action.name}") # Wait for completion + wait_start = time.monotonic() await informer.wait_for_action_completion(action.name) + if trace_enabled: + self._trace_log( + action.name, + "wait_for_completion", + kind="lifecycle_wait", + elapsed_ms=f"{(time.monotonic() - wait_start) * 1000:.1f}", + ) logger.info(f"{threading.current_thread().name} Action {action.name} completed") # Get final resource state and clean up @@ -346,7 +400,9 @@ async def _bg_launch(self, action: Action): Attempt to launch an action. """ if not action.is_started(): + limiter_wait_start = time.monotonic() async with self._rate_limiter: + limiter_wait_ms = (time.monotonic() - limiter_wait_start) * 1000 task: run_definition_pb2.TaskAction | None = None trace: run_definition_pb2.TraceAction | None = None if action.type == "task": @@ -375,6 +431,7 @@ async def _bg_launch(self, action: Action): trace = action.trace logger.debug(f"Attempting to launch action: {action.name}, actions? 
{bool(self._actions_service)}") + launch_start = time.monotonic() try: if self._actions_service: await self._actions_service.enqueue( @@ -406,6 +463,14 @@ async def _bg_launch(self, action: Action): timeout_ms=int(self._enqueue_timeout * 1000), ) logger.info(f"Successfully launched action: {action.name}") + self._trace_log( + action.name, + "enqueue_action", + kind="controlplane_api", + api="actions.enqueue" if self._actions_service else "queue.enqueue_action", + limiter_wait_ms=f"{limiter_wait_ms:.1f}", + elapsed_ms=f"{(time.monotonic() - launch_start) * 1000:.1f}", + ) except ConnectError as e: if e.code == Code.ALREADY_EXISTS: logger.info(f"Action {action.name} already exists, continuing to monitor.") From ff8d0599954e5fa2fcfa732b7535001db0454c01 Mon Sep 17 00:00:00 2001 From: pmahindrakar-oss Date: Mon, 27 Apr 2026 16:12:57 -0700 Subject: [PATCH 2/7] cleanup Signed-off-by: pmahindrakar-oss --- .../stress/sleep_fanout_harness_wrapper.sh | 5 - examples/stress/sleep_fanout_swarm_wrapper.sh | 870 ------------------ examples/stress/sleep_fanout_wrapper.sh | 610 ------------ 3 files changed, 1485 deletions(-) delete mode 100755 examples/stress/sleep_fanout_swarm_wrapper.sh delete mode 100755 examples/stress/sleep_fanout_wrapper.sh diff --git a/examples/stress/sleep_fanout_harness_wrapper.sh b/examples/stress/sleep_fanout_harness_wrapper.sh index 8a8404358..3511dea01 100755 --- a/examples/stress/sleep_fanout_harness_wrapper.sh +++ b/examples/stress/sleep_fanout_harness_wrapper.sh @@ -50,7 +50,6 @@ PEAK_PARENT_LIVE=0 PEAK_PARENT_RUNNING=0 LAST_LAUNCH_STAGE="" SDK_WHEEL_PATH="" -SDK_WHEEL_MTIME="" SDK_SRC_NEWER=0 usage() { @@ -237,7 +236,6 @@ sanitize_run_name() { detect_sdk_wheel_status() { local wheel_path="" - local wheel_mtime="" local newest_src="" wheel_path="$( @@ -246,15 +244,12 @@ detect_sdk_wheel_status() { | tail -n 1 || true )" SDK_WHEEL_PATH="${wheel_path}" - SDK_WHEEL_MTIME="" SDK_SRC_NEWER=0 if [[ -z "${wheel_path}" ]]; then return fi - 
wheel_mtime="$(stat -f '%m' "${wheel_path}" 2>/dev/null || true)" - SDK_WHEEL_MTIME="${wheel_mtime}" newest_src="$( find "${REPO_ROOT}/src/flyte" -type f -newer "${wheel_path}" -print 2>/dev/null \ | head -n 1 || true diff --git a/examples/stress/sleep_fanout_swarm_wrapper.sh b/examples/stress/sleep_fanout_swarm_wrapper.sh deleted file mode 100755 index 7d3cd937f..000000000 --- a/examples/stress/sleep_fanout_swarm_wrapper.sh +++ /dev/null @@ -1,870 +0,0 @@ -#!/usr/bin/env bash - -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" - -CONFIG="${HOME}/.flyte/config-dogfood.yaml" -RUN_NAME="" -REQUESTED_RUN_NAME="" -SWARM_SIZE=2 -RUNS_PER_WORKER=1 -MAX_RPS=1 -N_CHILDREN=5000 -SLEEP_DURATION=800 -POLL_INTERVAL=2 -ABORT_REASON="wrapper interrupted" -PROJECT="" -DOMAIN="" -IMAGE_REGISTRY="${FLYTE_STRESS_IMAGE_REGISTRY:-376129846803.dkr.ecr.us-east-2.amazonaws.com/union}" -IMAGE_NAME="${FLYTE_STRESS_IMAGE_NAME:-dogfood}" -IMAGE_BUILDER="${FLYTE_STRESS_IMAGE_BUILDER:-remote}" -RUN_ENV_ARGS=() - -LAUNCH_PID="" -LAUNCH_LOG="" -LAUNCH_DONE=0 -LAUNCH_RC=0 -RUN_VISIBLE=0 -ABORT_SENT=0 -STOPPING=0 - -SCRIPT_START_EPOCH="$(date +%s)" -RUN_VISIBLE_AT="" -FIRST_RUNNING_AT="" -ROOT_TERMINAL_AT="" - -PEAK_SEEN=0 -PEAK_RUNNING=0 -PEAK_ACTIVE=0 -LAST_LAUNCH_STAGE="" - -EXPECTED_TOP_ACTIONS=0 -EXPECTED_CHILD_RUNS=0 -EXPECTED_TOTAL_CHILDREN=0 -CHILD_RUNS=() -CHILD_RUNS_DISCOVERED_AT="" -AGG_FIRST_RUNNING_AT="" -AGG_ALL_VISIBLE_AT="" -AGG_TERMINAL_AT="" -AGG_PEAK_SEEN=0 -AGG_PEAK_RUNNING=0 -AGG_PEAK_ACTIVE=0 -FINAL_ROOT_ACTIONS_JSON="" - -usage() { - cat <<'EOF' -Usage: - examples/stress/sleep_fanout_swarm_wrapper.sh [options] - -Options: - --config PATH Flyte config path. Default: ~/.flyte/config-dogfood.yaml - --project NAME Override project for launch/get/abort. - --domain NAME Override domain for launch/get/abort. - --run-name NAME Use a fixed run name. 
- --image-registry VALUE Registry prefix for the task image. Default: 376129846803.dkr.ecr.us-east-2.amazonaws.com/union - --image-name VALUE Repository name for the task image. Default: dogfood - --image-builder VALUE Flyte image builder to use. Default: remote - --run-env KEY=VALUE Pass through to 'flyte run --env'. Can be specified multiple times. - --swarm-size INT Number of submitter tasks. Default: 2 - --runs-per-worker INT Number of sleep_fanout runs each submitter launches. Default: 1 - --max-rps INT Max submissions per second per submitter. Default: 1 - --n-children INT Number of leaves per sleep_fanout run. Default: 5000 - --sleep-duration VALUE Sleep duration passed to each child run. Default: 800 - --poll-interval SEC Poll interval in seconds. Default: 2 - --abort-reason TEXT Reason passed to 'flyte abort run'. Default: wrapper interrupted - --help Show this message. - -Notes: - This wrapper first monitors the swarm root run (`main` + `primer` + submitters). - After the root finishes submitting, it discovers child run IDs from submitter - logs and then switches to aggregate leaf-action counts across those child runs. 
-EOF -} - -while [[ $# -gt 0 ]]; do - case "$1" in - --config) - CONFIG="$2" - shift 2 - ;; - --project) - PROJECT="$2" - shift 2 - ;; - --domain) - DOMAIN="$2" - shift 2 - ;; - --run-name) - REQUESTED_RUN_NAME="$2" - shift 2 - ;; - --image-registry) - IMAGE_REGISTRY="$2" - shift 2 - ;; - --image-name) - IMAGE_NAME="$2" - shift 2 - ;; - --image-builder) - IMAGE_BUILDER="$2" - shift 2 - ;; - --run-env) - RUN_ENV_ARGS+=("--env" "$2") - shift 2 - ;; - --swarm-size) - SWARM_SIZE="$2" - shift 2 - ;; - --runs-per-worker) - RUNS_PER_WORKER="$2" - shift 2 - ;; - --max-rps) - MAX_RPS="$2" - shift 2 - ;; - --n-children) - N_CHILDREN="$2" - shift 2 - ;; - --sleep-duration) - SLEEP_DURATION="$2" - shift 2 - ;; - --poll-interval) - POLL_INTERVAL="$2" - shift 2 - ;; - --abort-reason) - ABORT_REASON="$2" - shift 2 - ;; - --help) - usage - exit 0 - ;; - *) - echo "Unknown option: $1" >&2 - usage >&2 - exit 2 - ;; - esac -done - -RUN_NAME="${REQUESTED_RUN_NAME}" -EXPECTED_TOP_ACTIONS=$((SWARM_SIZE + 2)) -EXPECTED_CHILD_RUNS=$((SWARM_SIZE * RUNS_PER_WORKER)) -EXPECTED_TOTAL_CHILDREN=$((EXPECTED_CHILD_RUNS * N_CHILDREN)) - -if ! command -v flyte >/dev/null 2>&1; then - echo "flyte is required but was not found in PATH." >&2 - exit 1 -fi - -if ! command -v jq >/dev/null 2>&1; then - echo "jq is required but was not found in PATH." 
>&2 - exit 1 -fi - -CONFIG="${CONFIG/#\~/${HOME}}" -LAUNCH_LOG="$(mktemp "${TMPDIR:-/tmp}/sleep-fanout-swarm-launch.XXXXXX.log")" - -cleanup() { - if [[ -n "${LAUNCH_LOG}" && -f "${LAUNCH_LOG}" ]]; then - rm -f "${LAUNCH_LOG}" - fi -} -trap cleanup EXIT - -flyte_cmd() { - _U_USE_ACTIONS="${_U_USE_ACTIONS:-1}" flyte -c "${CONFIG}" --image-builder "${IMAGE_BUILDER}" "$@" -} - -flyte_cmd_json() { - COLUMNS=500 _U_USE_ACTIONS="${_U_USE_ACTIONS:-1}" flyte -c "${CONFIG}" --image-builder "${IMAGE_BUILDER}" -of json-raw "$@" \ - | perl -pe 's/\e\[[0-9;]*[A-Za-z]//g' -} - -project_args=() -domain_args=() -run_args=() - -if [[ -n "${PROJECT}" ]]; then - project_args=(-p "${PROJECT}") - run_args+=(-p "${PROJECT}") -fi - -if [[ -n "${DOMAIN}" ]]; then - domain_args=(-d "${DOMAIN}") - run_args+=(-d "${DOMAIN}") -fi - -abort_remote_run() { - if [[ "${ABORT_SENT}" -eq 1 || "${RUN_VISIBLE}" -eq 0 || -z "${RUN_NAME}" ]]; then - return - fi - - ABORT_SENT=1 - echo - echo "Requesting abort for swarm root run ${RUN_NAME}..." - echo "Already-submitted child runs are separate runs and may continue." - if ! flyte_cmd abort run "${project_args[@]}" "${domain_args[@]}" --reason "${ABORT_REASON}" "${RUN_NAME}"; then - echo "Abort request failed for run ${RUN_NAME}." >&2 - fi -} - -handle_signal() { - local sig="$1" - if [[ "${STOPPING}" -eq 1 ]]; then - echo - echo "Received ${sig} again, exiting immediately." - exit 130 - fi - STOPPING=1 - - echo - echo "Received ${sig}, requesting abort for swarm root run ${RUN_NAME:-}." - echo "Continuing to monitor until the current section reaches a terminal phase. Press Ctrl-C again to exit immediately." 
- - if [[ -n "${LAUNCH_PID}" ]] && kill -0 "${LAUNCH_PID}" 2>/dev/null; then - kill "${LAUNCH_PID}" 2>/dev/null || true - fi - - abort_remote_run -} - -trap 'handle_signal INT' INT -trap 'handle_signal TERM' TERM - -format_duration() { - local value="$1" - if [[ -z "${value}" ]]; then - echo "n/a" - return - fi - - local seconds="$value" - printf '%02dh:%02dm:%02ds' "$((seconds / 3600))" "$(((seconds % 3600) / 60))" "$((seconds % 60))" -} - -elapsed_from_start() { - local epoch="$1" - if [[ -z "${epoch}" ]]; then - echo "" - return - fi - echo "$((epoch - SCRIPT_START_EPOCH))" -} - -is_terminal_phase() { - case "$1" in - ACTION_PHASE_SUCCEEDED|ACTION_PHASE_FAILED|ACTION_PHASE_ABORTED|ACTION_PHASE_TIMED_OUT) - return 0 - ;; - *) - return 1 - ;; - esac -} - -fetch_actions_json() { - flyte_cmd_json get action "${project_args[@]}" "${domain_args[@]}" "${RUN_NAME}" -} - -fetch_actions_json_for_run() { - local run_name="$1" - flyte_cmd_json get action "${project_args[@]}" "${domain_args[@]}" "${run_name}" -} - -fetch_action_logs() { - local run_name="$1" - local action_name="$2" - COLUMNS=500 _U_USE_ACTIONS="${_U_USE_ACTIONS:-1}" flyte -c "${CONFIG}" --image-builder "${IMAGE_BUILDER}" \ - get logs "${project_args[@]}" "${domain_args[@]}" "${run_name}" "${action_name}" 2>/dev/null \ - | perl -pe 's/\e\[[0-9;]*[A-Za-z]//g' -} - -sanitize_run_name() { - local value="$1" - value="$(printf '%s' "${value}" | tr -d '\r')" - value="$(printf '%s' "${value}" | sed 's/[[:space:]]*$//')" - value="$(printf '%s' "${value}" | grep -Eo '[ur][[:alnum:]]{5,}' | head -n 1 || true)" - printf '%s' "${value}" -} - -resolve_run_name_from_log() { - if [[ -z "${LAUNCH_LOG}" || ! 
-s "${LAUNCH_LOG}" ]]; then - return 1 - fi - - local parsed="" - parsed="$( - perl -pe 's/\e\[[0-9;]*[A-Za-z]//g' "${LAUNCH_LOG}" \ - | sed -n 's/.*Created Run: //p' \ - | tail -n 1 - )" - - if [[ -z "${parsed}" ]]; then - parsed="$( - perl -pe 's/\e\[[0-9;]*[A-Za-z]//g' "${LAUNCH_LOG}" \ - | sed -n 's#.*URL: .*/runs/\([^/?[:space:]]*\).*#\1#p' \ - | tail -n 1 - )" - fi - - if [[ -z "${parsed}" ]]; then - return 1 - fi - - parsed="$(sanitize_run_name "${parsed}")" - if [[ -z "${parsed}" ]]; then - return 1 - fi - - RUN_NAME="${parsed}" - return 0 -} - -launch_stage_from_log() { - if [[ -z "${LAUNCH_LOG}" || ! -s "${LAUNCH_LOG}" ]]; then - return 1 - fi - - local lines="" - local stage="" - - lines="$( - perl -pe 's/\e\[[0-9;]*[A-Za-z]//g; s/\r/\n/g' "${LAUNCH_LOG}" \ - | sed '/^[[:space:]]*$/d' \ - | tail -n 100 - )" - - if [[ -z "${lines}" ]]; then - return 1 - fi - - stage="$( - printf '%s\n' "${lines}" \ - | grep -E '^(Building|Pushing|Image |Created Run:|URL:|Error:|ERROR|Failed|failed|Using |#)' \ - | tail -n 1 || true - )" - - if [[ -z "${stage}" ]]; then - stage="$(printf '%s\n' "${lines}" | tail -n 1)" - fi - - if [[ -n "${stage}" ]]; then - printf '%s' "${stage}" - return 0 - fi - - return 1 -} - -print_row() { - printf '%-8s %-28s %-12s %-10s %-8s %-8s %-8s %-8s %-8s %-8s %-8s\n' \ - "$(date '+%H:%M:%S')" \ - "$1" \ - "$2" \ - "$3" \ - "$4" \ - "$5" \ - "$6" \ - "$7" \ - "$8" \ - "$9" \ - "${10}" -} - -print_agg_row() { - printf '%-8s %-12s %-18s %-14s %-8s %-8s %-8s %-8s %-8s %-8s %-8s\n' \ - "$(date '+%H:%M:%S')" \ - "$1" \ - "$2" \ - "$3" \ - "$4" \ - "$5" \ - "$6" \ - "$7" \ - "$8" \ - "$9" \ - "${10}" -} - -snapshot_tsv() { - local actions_json="$1" - jq -r \ - --argjson expected "${EXPECTED_TOP_ACTIONS}" \ - ' - [ .[] ] as $all - | ($all | map(select(.id.name == "a0")) | .[0]) as $root - | [ $all[] | select(.id.name != "a0") ] as $kids - | { - root_phase: ($root.status.phase // "MISSING"), - seen: ($kids | length), - queued: ($kids | 
map(select(.status.phase == "ACTION_PHASE_QUEUED")) | length), - waiting: ($kids | map(select(.status.phase == "ACTION_PHASE_WAITING_FOR_RESOURCES")) | length), - initializing: ($kids | map(select(.status.phase == "ACTION_PHASE_INITIALIZING")) | length), - running: ($kids | map(select(.status.phase == "ACTION_PHASE_RUNNING")) | length), - succeeded: ($kids | map(select(.status.phase == "ACTION_PHASE_SUCCEEDED")) | length), - failed: ($kids | map(select(.status.phase == "ACTION_PHASE_FAILED")) | length), - aborted: ($kids | map(select(.status.phase == "ACTION_PHASE_ABORTED")) | length), - timed_out: ($kids | map(select(.status.phase == "ACTION_PHASE_TIMED_OUT")) | length) - } - | .not_seen = (if $expected > .seen then ($expected - .seen) else 0 end) - | .active = (.queued + .waiting + .initializing + .running) - | [ - .root_phase, - .seen, - .not_seen, - .queued, - .waiting, - .initializing, - .running, - .active, - .succeeded, - .failed, - .aborted, - .timed_out - ] - | @tsv - ' <<<"${actions_json}" -} - -child_run_known() { - local target="$1" - local existing="" - for existing in "${CHILD_RUNS[@]}"; do - if [[ "${existing}" == "${target}" ]]; then - return 0 - fi - done - return 1 -} - -discover_child_runs_from_root() { - local root_actions_json="$1" - local action_name="" - local child_run="" - - while IFS= read -r action_name; do - [[ -z "${action_name}" ]] && continue - while IFS= read -r child_run; do - [[ -z "${child_run}" ]] && continue - if ! 
child_run_known "${child_run}"; then - CHILD_RUNS+=("${child_run}") - fi - done < <( - fetch_action_logs "${RUN_NAME}" "${action_name}" \ - | perl -ne ' - if (/submitted_run idx=\d+ url=.*\/runs\/([^\/?\s]+)/) { - print "$1\n"; - } elsif (/submitted_run idx=\d+ url=([ur][[:alnum:]]{5,})/) { - print "$1\n"; - } - ' - ) - done < <(jq -r '.[] | select(.id.name != "a0") | .id.name' <<<"${root_actions_json}") -} - -aggregate_child_runs_tsv() { - local discovered=0 - local roots_terminal=0 - local seen=0 - local queued=0 - local waiting=0 - local initializing=0 - local running=0 - local succeeded=0 - local failed=0 - local aborted=0 - local timed_out=0 - local run_name="" - local json="" - local root_phase="" - local c_seen=0 - local c_queued=0 - local c_waiting=0 - local c_initializing=0 - local c_running=0 - local c_succeeded=0 - local c_failed=0 - local c_aborted=0 - local c_timed_out=0 - local not_created=0 - local active=0 - - discovered="${#CHILD_RUNS[@]}" - for run_name in "${CHILD_RUNS[@]}"; do - if ! 
json="$(fetch_actions_json_for_run "${run_name}" 2>/dev/null)"; then - continue - fi - IFS=$'\t' read -r root_phase c_seen c_queued c_waiting c_initializing c_running c_succeeded c_failed c_aborted c_timed_out \ - <<<"$(jq -r ' - [ .[] ] as $all - | ($all | map(select(.id.name == "a0")) | .[0]) as $root - | [ $all[] | select(.id.name != "a0") ] as $kids - | [ - ($root.status.phase // "MISSING"), - ($kids | length), - ($kids | map(select(.status.phase == "ACTION_PHASE_QUEUED")) | length), - ($kids | map(select(.status.phase == "ACTION_PHASE_WAITING_FOR_RESOURCES")) | length), - ($kids | map(select(.status.phase == "ACTION_PHASE_INITIALIZING")) | length), - ($kids | map(select(.status.phase == "ACTION_PHASE_RUNNING")) | length), - ($kids | map(select(.status.phase == "ACTION_PHASE_SUCCEEDED")) | length), - ($kids | map(select(.status.phase == "ACTION_PHASE_FAILED")) | length), - ($kids | map(select(.status.phase == "ACTION_PHASE_ABORTED")) | length), - ($kids | map(select(.status.phase == "ACTION_PHASE_TIMED_OUT")) | length) - ] - | @tsv - ' <<<"${json}")" - - if is_terminal_phase "${root_phase}"; then - roots_terminal=$((roots_terminal + 1)) - fi - seen=$((seen + c_seen)) - queued=$((queued + c_queued)) - waiting=$((waiting + c_waiting)) - initializing=$((initializing + c_initializing)) - running=$((running + c_running)) - succeeded=$((succeeded + c_succeeded)) - failed=$((failed + c_failed)) - aborted=$((aborted + c_aborted)) - timed_out=$((timed_out + c_timed_out)) - done - - if (( EXPECTED_TOTAL_CHILDREN > seen )); then - not_created=$((EXPECTED_TOTAL_CHILDREN - seen)) - fi - active=$((queued + waiting + initializing + running)) - - printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' \ - "${discovered}/${EXPECTED_CHILD_RUNS}" \ - "${seen}/${EXPECTED_TOTAL_CHILDREN}" \ - "${not_created}" \ - "${queued}" \ - "${waiting}" \ - "${initializing}" \ - "${running}" \ - "${active}" \ - "${succeeded}" \ - "${failed}" \ - "${aborted}" \ - "${timed_out}" \ - 
"${roots_terminal}" -} - -print_root_summary() { - local root_phase="$1" - local seen="$2" - local succeeded="$3" - local failed="$4" - local aborted="$5" - local timed_out="$6" - local total_elapsed="$(( $(date +%s) - SCRIPT_START_EPOCH ))" - - echo - echo "Root Summary" - echo " run_name: ${RUN_NAME:-}" - echo " root_phase: ${root_phase}" - echo " abort_requested: $(if [[ "${STOPPING}" -eq 1 ]]; then echo yes; else echo no; fi)" - echo " top_level_actions_seen: ${seen}/${EXPECTED_TOP_ACTIONS}" - echo " succeeded: ${succeeded}" - echo " failed: ${failed}" - echo " aborted: ${aborted}" - echo " timed_out: ${timed_out}" - echo " peak_seen: ${PEAK_SEEN}/${EXPECTED_TOP_ACTIONS}" - echo " peak_running: ${PEAK_RUNNING}" - echo " peak_active: ${PEAK_ACTIVE}" - echo " time_to_run_visible: $(format_duration "$(elapsed_from_start "${RUN_VISIBLE_AT}")")" - echo " time_to_first_running: $(format_duration "$(elapsed_from_start "${FIRST_RUNNING_AT}")")" - echo " time_to_root_terminal: $(format_duration "$(elapsed_from_start "${ROOT_TERMINAL_AT}")")" - echo " total_elapsed: $(format_duration "${total_elapsed}")" -} - -print_agg_summary() { - local discovered="$1" - local roots_terminal="$2" - local seen="$3" - local succeeded="$4" - local failed="$5" - local aborted="$6" - local timed_out="$7" - local total_elapsed="$(( $(date +%s) - SCRIPT_START_EPOCH ))" - - echo - echo "Aggregate Summary" - echo " child_runs_discovered: ${discovered}/${EXPECTED_CHILD_RUNS}" - echo " total_expected_children: ${EXPECTED_TOTAL_CHILDREN}" - echo " child_run_roots_terminal: ${roots_terminal}/${discovered}" - echo " children_seen: ${seen}/${EXPECTED_TOTAL_CHILDREN}" - echo " succeeded: ${succeeded}" - echo " failed: ${failed}" - echo " aborted: ${aborted}" - echo " timed_out: ${timed_out}" - echo " peak_seen: ${AGG_PEAK_SEEN}/${EXPECTED_TOTAL_CHILDREN}" - echo " peak_running: ${AGG_PEAK_RUNNING}" - echo " peak_active: ${AGG_PEAK_ACTIVE}" - echo " child_runs_discovered_at: $(format_duration 
"$(elapsed_from_start "${CHILD_RUNS_DISCOVERED_AT}")")" - echo " aggregate_first_running: $(format_duration "$(elapsed_from_start "${AGG_FIRST_RUNNING_AT}")")" - echo " aggregate_all_visible: $(format_duration "$(elapsed_from_start "${AGG_ALL_VISIBLE_AT}")")" - echo " aggregate_terminal: $(format_duration "$(elapsed_from_start "${AGG_TERMINAL_AT}")")" - echo " total_elapsed: $(format_duration "${total_elapsed}")" -} - -cd "${REPO_ROOT}" - -if [[ -n "${RUN_NAME}" ]]; then - echo "Launching swarm run ${RUN_NAME}" -else - echo "Launching swarm run with generated actions name" -fi -echo " config: ${CONFIG}" -echo " swarm_size: ${SWARM_SIZE}" -echo " runs_per_worker: ${RUNS_PER_WORKER}" -echo " max_rps: ${MAX_RPS}" -echo " n_children_per_run: ${N_CHILDREN}" -echo " sleep_duration: ${SLEEP_DURATION}" -echo " poll_interval: ${POLL_INTERVAL}s" -echo " image target: ${IMAGE_REGISTRY}/${IMAGE_NAME}" -echo " image builder: ${IMAGE_BUILDER}" -echo " image platforms: ${FLYTE_STRESS_IMAGE_PLATFORMS:-linux/amd64}" -if [[ -n "${PROJECT}" || -n "${DOMAIN}" ]]; then - echo " project/domain override: ${PROJECT:-} / ${DOMAIN:-}" -fi -if [[ "${#RUN_ENV_ARGS[@]}" -gt 0 ]]; then - echo " run env overrides: ${RUN_ENV_ARGS[*]}" -fi -echo " child_runs_expected: ${EXPECTED_CHILD_RUNS}" -echo " total_children_expected: ${EXPECTED_TOTAL_CHILDREN}" -echo -printf '%-8s %-28s %-12s %-10s %-8s %-8s %-8s %-8s %-8s %-8s %-8s\n' \ - "time" "root_phase" "seen_top" "not_seen" "queued" "waiting" "init" "running" "active" "ok" "aborted" - -( - export FLYTE_STRESS_IMAGE_REGISTRY="${IMAGE_REGISTRY}" - export FLYTE_STRESS_IMAGE_NAME="${IMAGE_NAME}" - if [[ -n "${REQUESTED_RUN_NAME}" ]]; then - flyte_cmd run "${run_args[@]}" "${RUN_ENV_ARGS[@]}" --name "${REQUESTED_RUN_NAME}" \ - examples/stress/sleep_fanout.py main \ - --swarm_size "${SWARM_SIZE}" \ - --runs_per_worker "${RUNS_PER_WORKER}" \ - --max_rps "${MAX_RPS}" \ - --n_children "${N_CHILDREN}" \ - --sleep_duration "${SLEEP_DURATION}" - else - flyte_cmd 
run "${run_args[@]}" "${RUN_ENV_ARGS[@]}" \ - examples/stress/sleep_fanout.py main \ - --swarm_size "${SWARM_SIZE}" \ - --runs_per_worker "${RUNS_PER_WORKER}" \ - --max_rps "${MAX_RPS}" \ - --n_children "${N_CHILDREN}" \ - --sleep_duration "${SLEEP_DURATION}" - fi -) >"${LAUNCH_LOG}" 2>&1 & -LAUNCH_PID=$! - -FINAL_ROOT_PHASE="UNKNOWN" -FINAL_SEEN=0 -FINAL_SUCCEEDED=0 -FINAL_FAILED=0 -FINAL_ABORTED=0 -FINAL_TIMED_OUT=0 - -while true; do - if [[ -z "${RUN_NAME}" ]] && resolve_run_name_from_log; then - echo - echo "Resolved run name: ${RUN_NAME}" - fi - - if [[ "${LAUNCH_DONE}" -eq 0 ]] && ! kill -0 "${LAUNCH_PID}" 2>/dev/null; then - if wait "${LAUNCH_PID}"; then - LAUNCH_RC=0 - else - LAUNCH_RC=$? - fi - LAUNCH_DONE=1 - - if [[ "${LAUNCH_RC}" -eq 0 ]]; then - echo - echo "Launch command completed for run ${RUN_NAME:-}." - elif [[ "${RUN_VISIBLE}" -eq 0 ]]; then - echo - echo "Launch command failed before the run became visible." >&2 - cat "${LAUNCH_LOG}" >&2 - exit "${LAUNCH_RC}" - else - echo - echo "Launch command exited with ${LAUNCH_RC}, but the run is already visible. Continuing to monitor." 
>&2 - cat "${LAUNCH_LOG}" >&2 - fi - fi - - if [[ -n "${RUN_NAME}" ]] && actions_json="$(fetch_actions_json 2>/dev/null)"; then - now_epoch="$(date +%s)" - RUN_VISIBLE=1 - FINAL_ROOT_ACTIONS_JSON="${actions_json}" - if [[ -z "${RUN_VISIBLE_AT}" ]]; then - RUN_VISIBLE_AT="${now_epoch}" - fi - - if [[ "${STOPPING}" -eq 1 && "${ABORT_SENT}" -eq 0 ]]; then - abort_remote_run - fi - - IFS=$'\t' read -r root_phase seen not_seen queued waiting initializing running active succeeded failed aborted timed_out \ - <<<"$(snapshot_tsv "${actions_json}")" - - FINAL_ROOT_PHASE="${root_phase}" - FINAL_SEEN="${seen}" - FINAL_SUCCEEDED="${succeeded}" - FINAL_FAILED="${failed}" - FINAL_ABORTED="${aborted}" - FINAL_TIMED_OUT="${timed_out}" - - if (( seen > PEAK_SEEN )); then - PEAK_SEEN="${seen}" - fi - if (( running > PEAK_RUNNING )); then - PEAK_RUNNING="${running}" - fi - if (( active > PEAK_ACTIVE )); then - PEAK_ACTIVE="${active}" - fi - if [[ -z "${FIRST_RUNNING_AT}" && "${running}" -gt 0 ]]; then - FIRST_RUNNING_AT="${now_epoch}" - fi - - print_row \ - "${root_phase}" \ - "${seen}/${EXPECTED_TOP_ACTIONS}" \ - "${not_seen}" \ - "${queued}" \ - "${waiting}" \ - "${initializing}" \ - "${running}" \ - "${active}" \ - "${succeeded}" \ - "${aborted}" - - if is_terminal_phase "${root_phase}"; then - ROOT_TERMINAL_AT="${now_epoch}" - break - fi - else - if stage="$(launch_stage_from_log)"; then - if [[ "${stage}" != "${LAST_LAUNCH_STAGE}" ]]; then - LAST_LAUNCH_STAGE="${stage}" - echo "launch: ${stage}" - fi - fi - print_row "RESOLVING_RUN" "0/${EXPECTED_TOP_ACTIONS}" "${EXPECTED_TOP_ACTIONS}" 0 0 0 0 0 0 0 - fi - - sleep "${POLL_INTERVAL}" -done - -if [[ "${LAUNCH_DONE}" -eq 0 ]]; then - if wait "${LAUNCH_PID}"; then - true - else - true - fi -fi - -print_root_summary \ - "${FINAL_ROOT_PHASE}" \ - "${FINAL_SEEN}" \ - "${FINAL_SUCCEEDED}" \ - "${FINAL_FAILED}" \ - "${FINAL_ABORTED}" \ - "${FINAL_TIMED_OUT}" - -if [[ "${FINAL_ROOT_PHASE}" == "ACTION_PHASE_SUCCEEDED" && -n 
"${FINAL_ROOT_ACTIONS_JSON}" ]]; then - discover_child_runs_from_root "${FINAL_ROOT_ACTIONS_JSON}" - - if [[ "${#CHILD_RUNS[@]}" -gt 0 ]]; then - CHILD_RUNS_DISCOVERED_AT="$(date +%s)" - - echo - echo "Discovered child runs: ${#CHILD_RUNS[@]}/${EXPECTED_CHILD_RUNS}" - printf '%s\n' "${CHILD_RUNS[@]}" - echo - printf '%-8s %-12s %-18s %-14s %-8s %-8s %-8s %-8s %-8s %-8s %-8s\n' \ - "time" "runs" "seen_children" "not_created" "queued" "waiting" "init" "running" "active" "ok" "aborted" - - AGG_FINAL_DISCOVERED=0 - AGG_FINAL_SEEN=0 - AGG_FINAL_SUCCEEDED=0 - AGG_FINAL_FAILED=0 - AGG_FINAL_ABORTED=0 - AGG_FINAL_TIMED_OUT=0 - AGG_FINAL_ROOTS_TERMINAL=0 - - while true; do - IFS=$'\t' read -r runs seen_children not_created queued waiting initializing running active succeeded failed aborted timed_out roots_terminal \ - <<<"$(aggregate_child_runs_tsv)" - - AGG_FINAL_DISCOVERED="${runs%%/*}" - AGG_FINAL_SEEN="${seen_children%%/*}" - AGG_FINAL_SUCCEEDED="${succeeded}" - AGG_FINAL_FAILED="${failed}" - AGG_FINAL_ABORTED="${aborted}" - AGG_FINAL_TIMED_OUT="${timed_out}" - AGG_FINAL_ROOTS_TERMINAL="${roots_terminal}" - - if (( AGG_FINAL_SEEN > AGG_PEAK_SEEN )); then - AGG_PEAK_SEEN="${AGG_FINAL_SEEN}" - fi - if (( running > AGG_PEAK_RUNNING )); then - AGG_PEAK_RUNNING="${running}" - fi - if (( active > AGG_PEAK_ACTIVE )); then - AGG_PEAK_ACTIVE="${active}" - fi - if [[ -z "${AGG_FIRST_RUNNING_AT}" && "${running}" -gt 0 ]]; then - AGG_FIRST_RUNNING_AT="$(date +%s)" - fi - if [[ -z "${AGG_ALL_VISIBLE_AT}" && "${AGG_FINAL_SEEN}" -ge "${EXPECTED_TOTAL_CHILDREN}" ]]; then - AGG_ALL_VISIBLE_AT="$(date +%s)" - fi - - print_agg_row \ - "${runs}" \ - "${seen_children}" \ - "${not_created}" \ - "${queued}" \ - "${waiting}" \ - "${initializing}" \ - "${running}" \ - "${active}" \ - "${succeeded}" \ - "${aborted}" - - if (( AGG_FINAL_DISCOVERED == EXPECTED_CHILD_RUNS )) && (( AGG_FINAL_ROOTS_TERMINAL == AGG_FINAL_DISCOVERED )) && (( active == 0 )); then - AGG_TERMINAL_AT="$(date +%s)" - break 
- fi - - sleep "${POLL_INTERVAL}" - done - - print_agg_summary \ - "${AGG_FINAL_DISCOVERED}" \ - "${AGG_FINAL_ROOTS_TERMINAL}" \ - "${AGG_FINAL_SEEN}" \ - "${AGG_FINAL_SUCCEEDED}" \ - "${AGG_FINAL_FAILED}" \ - "${AGG_FINAL_ABORTED}" \ - "${AGG_FINAL_TIMED_OUT}" - else - echo - echo "No child runs discovered from submitter logs." - fi -fi diff --git a/examples/stress/sleep_fanout_wrapper.sh b/examples/stress/sleep_fanout_wrapper.sh deleted file mode 100755 index 8cf4188dd..000000000 --- a/examples/stress/sleep_fanout_wrapper.sh +++ /dev/null @@ -1,610 +0,0 @@ -#!/usr/bin/env bash - -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" - -CONFIG="${HOME}/.flyte/config-dogfood.yaml" -RUN_NAME="" -REQUESTED_RUN_NAME="" -N_CHILDREN=10 -SLEEP_DURATION=10 -POLL_INTERVAL=1 -ABORT_REASON="wrapper interrupted" -PROJECT="" -DOMAIN="" -IMAGE_REGISTRY="${FLYTE_STRESS_IMAGE_REGISTRY:-376129846803.dkr.ecr.us-east-2.amazonaws.com/union}" -IMAGE_NAME="${FLYTE_STRESS_IMAGE_NAME:-dogfood}" -IMAGE_BUILDER="${FLYTE_STRESS_IMAGE_BUILDER:-remote}" -RUN_ENV_ARGS=() - -LAUNCH_PID="" -LAUNCH_LOG="" -LAUNCH_DONE=0 -LAUNCH_RC=0 -RUN_VISIBLE=0 -ABORT_SENT=0 -STOPPING=0 -STOPPING_AT="" -ABORT_NOTE_SHOWN=0 -INTERRUPT_GRACE_SEC=15 - -SCRIPT_START_EPOCH="$(date +%s)" -RUN_VISIBLE_AT="" -ALL_CHILDREN_VISIBLE_AT="" -FIRST_RUNNING_AT="" -FIRST_SUCCESS_AT="" -ROOT_TERMINAL_AT="" - -PEAK_SEEN=0 -PEAK_RUNNING=0 -PEAK_ACTIVE=0 -LAST_LAUNCH_STAGE="" - -usage() { - cat <<'EOF' -Usage: - examples/stress/sleep_fanout_wrapper.sh [options] - -Options: - --config PATH Flyte config path. Default: ~/.flyte/config-dogfood.yaml - --project NAME Override project for launch/get/abort. - --domain NAME Override domain for launch/get/abort. - --run-name NAME Use a fixed run name. Avoid this on _U_USE_ACTIONS=1 if you want abort to work. - --image-registry VALUE Registry prefix for the task image. 
Default: 376129846803.dkr.ecr.us-east-2.amazonaws.com/union - --image-name VALUE Repository name for the task image. Default: dogfood - --image-builder VALUE Flyte image builder to use. Default: remote - --run-env KEY=VALUE Pass through to 'flyte run --env'. Can be specified multiple times. - --n-children INT Number of child actions. Default: 10 - --sleep-duration VALUE Sleep duration passed to the task. Default: 10 - --poll-interval SEC Poll interval in seconds. Default: 1 - --abort-reason TEXT Reason passed to 'flyte abort run'. Default: wrapper interrupted - --help Show this message. - -Example: - examples/stress/sleep_fanout_wrapper.sh \ - --config ~/.flyte/config-dogfood.yaml \ - --n-children 10 \ - --sleep-duration 10 -EOF -} - -while [[ $# -gt 0 ]]; do - case "$1" in - --config) - CONFIG="$2" - shift 2 - ;; - --project) - PROJECT="$2" - shift 2 - ;; - --domain) - DOMAIN="$2" - shift 2 - ;; - --run-name) - REQUESTED_RUN_NAME="$2" - shift 2 - ;; - --image-registry) - IMAGE_REGISTRY="$2" - shift 2 - ;; - --image-name) - IMAGE_NAME="$2" - shift 2 - ;; - --image-builder) - IMAGE_BUILDER="$2" - shift 2 - ;; - --run-env) - RUN_ENV_ARGS+=("--env" "$2") - shift 2 - ;; - --n-children) - N_CHILDREN="$2" - shift 2 - ;; - --sleep-duration) - SLEEP_DURATION="$2" - shift 2 - ;; - --poll-interval) - POLL_INTERVAL="$2" - shift 2 - ;; - --abort-reason) - ABORT_REASON="$2" - shift 2 - ;; - --help) - usage - exit 0 - ;; - *) - echo "Unknown option: $1" >&2 - usage >&2 - exit 2 - ;; - esac -done - -RUN_NAME="${REQUESTED_RUN_NAME}" - -if ! command -v flyte >/dev/null 2>&1; then - echo "flyte is required but was not found in PATH." >&2 - exit 1 -fi - -if ! command -v jq >/dev/null 2>&1; then - echo "jq is required but was not found in PATH." 
>&2 - exit 1 -fi - -CONFIG="${CONFIG/#\~/${HOME}}" -LAUNCH_LOG="$(mktemp "${TMPDIR:-/tmp}/sleep-fanout-launch.XXXXXX.log")" - -cleanup() { - if [[ -n "${LAUNCH_LOG}" && -f "${LAUNCH_LOG}" ]]; then - rm -f "${LAUNCH_LOG}" - fi -} -trap cleanup EXIT - -flyte_cmd() { - _U_USE_ACTIONS="${_U_USE_ACTIONS:-1}" flyte -c "${CONFIG}" --image-builder "${IMAGE_BUILDER}" "$@" -} - -flyte_cmd_json() { - COLUMNS=500 _U_USE_ACTIONS="${_U_USE_ACTIONS:-1}" flyte -c "${CONFIG}" --image-builder "${IMAGE_BUILDER}" -of json-raw "$@" \ - | perl -pe 's/\e\[[0-9;]*[A-Za-z]//g' -} - -project_args=() -domain_args=() -run_args=() - -if [[ -n "${PROJECT}" ]]; then - project_args=(-p "${PROJECT}") - run_args+=(-p "${PROJECT}") -fi - -if [[ -n "${DOMAIN}" ]]; then - domain_args=(-d "${DOMAIN}") - run_args+=(-d "${DOMAIN}") -fi - -abort_remote_run() { - if [[ "${ABORT_SENT}" -eq 1 || "${RUN_VISIBLE}" -eq 0 || -z "${RUN_NAME}" ]]; then - return - fi - - ABORT_SENT=1 - echo - echo "Requesting abort for run ${RUN_NAME}..." - if ! flyte_cmd abort run "${project_args[@]}" "${domain_args[@]}" --reason "${ABORT_REASON}" "${RUN_NAME}"; then - echo "Abort request failed for run ${RUN_NAME}." >&2 - fi -} - -handle_signal() { - local sig="$1" - if [[ "${STOPPING}" -eq 1 ]]; then - echo - echo "Received ${sig} again, exiting immediately." - exit 130 - fi - STOPPING=1 - STOPPING_AT="$(date +%s)" - - echo - echo "Received ${sig}, requesting abort for run ${RUN_NAME:-}." - echo "Continuing to monitor until the run reaches a terminal phase. Press Ctrl-C again to exit immediately." 
- - if [[ -n "${LAUNCH_PID}" ]] && kill -0 "${LAUNCH_PID}" 2>/dev/null; then - kill "${LAUNCH_PID}" 2>/dev/null || true - fi - - abort_remote_run -} - -trap 'handle_signal INT' INT -trap 'handle_signal TERM' TERM - -format_duration() { - local value="$1" - if [[ -z "${value}" ]]; then - echo "n/a" - return - fi - - local seconds="$value" - printf '%02dh:%02dm:%02ds' "$((seconds / 3600))" "$(((seconds % 3600) / 60))" "$((seconds % 60))" -} - -elapsed_from_start() { - local epoch="$1" - if [[ -z "${epoch}" ]]; then - echo "" - return - fi - echo "$((epoch - SCRIPT_START_EPOCH))" -} - -is_terminal_phase() { - case "$1" in - ACTION_PHASE_SUCCEEDED|ACTION_PHASE_FAILED|ACTION_PHASE_ABORTED|ACTION_PHASE_TIMED_OUT) - return 0 - ;; - *) - return 1 - ;; - esac -} - -fetch_actions_json() { - flyte_cmd_json get action "${project_args[@]}" "${domain_args[@]}" "${RUN_NAME}" -} - -sanitize_run_name() { - local value="$1" - value="$(printf '%s' "${value}" | tr -d '\r')" - value="$(printf '%s' "${value}" | sed 's/[[:space:]]*$//')" - value="$(printf '%s' "${value}" | grep -Eo '[ur][[:alnum:]]{5,}' | head -n 1 || true)" - printf '%s' "${value}" -} - -resolve_run_name_from_log() { - if [[ -z "${LAUNCH_LOG}" || ! -s "${LAUNCH_LOG}" ]]; then - return 1 - fi - - local parsed="" - parsed="$( - perl -pe 's/\e\[[0-9;]*[A-Za-z]//g' "${LAUNCH_LOG}" \ - | sed -n 's/.*Created Run: //p' \ - | tail -n 1 - )" - - if [[ -z "${parsed}" ]]; then - parsed="$( - perl -pe 's/\e\[[0-9;]*[A-Za-z]//g' "${LAUNCH_LOG}" \ - | sed -n 's#.*URL: .*/runs/\([^/?[:space:]]*\).*#\1#p' \ - | tail -n 1 - )" - fi - - if [[ -z "${parsed}" ]]; then - return 1 - fi - - parsed="$(sanitize_run_name "${parsed}")" - if [[ -z "${parsed}" ]]; then - return 1 - fi - - RUN_NAME="${parsed}" - return 0 -} - -launch_stage_from_log() { - if [[ -z "${LAUNCH_LOG}" || ! 
-s "${LAUNCH_LOG}" ]]; then - return 1 - fi - - local lines="" - local stage="" - - lines="$( - perl -pe 's/\e\[[0-9;]*[A-Za-z]//g; s/\r/\n/g' "${LAUNCH_LOG}" \ - | sed '/^[[:space:]]*$/d' \ - | tail -n 100 - )" - - if [[ -z "${lines}" ]]; then - return 1 - fi - - stage="$( - printf '%s\n' "${lines}" \ - | grep -E '^(Building|Pushing|Image |Created Run:|URL:|Error:|ERROR|Failed|failed|Using |#)' \ - | tail -n 1 || true - )" - - if [[ -z "${stage}" ]]; then - stage="$(printf '%s\n' "${lines}" | tail -n 1)" - fi - - if [[ -n "${stage}" ]]; then - printf '%s' "${stage}" - return 0 - fi - - return 1 -} - -print_row() { - printf '%-8s %-28s %-14s %-14s %-8s %-8s %-8s %-8s %-8s %-8s %-8s\n' \ - "$(date '+%H:%M:%S')" \ - "$1" \ - "$2" \ - "$3" \ - "$4" \ - "$5" \ - "$6" \ - "$7" \ - "$8" \ - "$9" \ - "${10}" -} - -snapshot_tsv() { - local actions_json="$1" - jq -r \ - --argjson expected "${N_CHILDREN}" \ - ' - [ .[] ] as $all - | ($all | map(select(.id.name == "a0")) | .[0]) as $root - | [ $all[] | select(.id.name != "a0") ] as $kids - | { - root_phase: ($root.status.phase // "MISSING"), - seen: ($kids | length), - queued: ($kids | map(select(.status.phase == "ACTION_PHASE_QUEUED")) | length), - waiting: ($kids | map(select(.status.phase == "ACTION_PHASE_WAITING_FOR_RESOURCES")) | length), - initializing: ($kids | map(select(.status.phase == "ACTION_PHASE_INITIALIZING")) | length), - running: ($kids | map(select(.status.phase == "ACTION_PHASE_RUNNING")) | length), - succeeded: ($kids | map(select(.status.phase == "ACTION_PHASE_SUCCEEDED")) | length), - failed: ($kids | map(select(.status.phase == "ACTION_PHASE_FAILED")) | length), - aborted: ($kids | map(select(.status.phase == "ACTION_PHASE_ABORTED")) | length), - timed_out: ($kids | map(select(.status.phase == "ACTION_PHASE_TIMED_OUT")) | length) - } - | .not_created = (if $expected > .seen then ($expected - .seen) else 0 end) - | .active = (.queued + .waiting + .initializing + .running) - | [ - .root_phase, - .seen, - 
.not_created, - .queued, - .waiting, - .initializing, - .running, - .active, - .succeeded, - .failed, - .aborted, - .timed_out - ] - | @tsv - ' <<<"${actions_json}" -} - -print_summary() { - local root_phase="$1" - local seen="$2" - local succeeded="$3" - local failed="$4" - local aborted="$5" - local timed_out="$6" - local total_elapsed="$(( $(date +%s) - SCRIPT_START_EPOCH ))" - - echo - echo "Summary" - echo " run_name: ${RUN_NAME:-}" - echo " root_phase: ${root_phase}" - echo " abort_requested: $(if [[ "${STOPPING}" -eq 1 ]]; then echo yes; else echo no; fi)" - echo " children_seen: ${seen}/${N_CHILDREN}" - echo " succeeded: ${succeeded}" - echo " failed: ${failed}" - echo " aborted: ${aborted}" - echo " timed_out: ${timed_out}" - echo " peak_seen: ${PEAK_SEEN}/${N_CHILDREN}" - echo " peak_running: ${PEAK_RUNNING}" - echo " peak_active: ${PEAK_ACTIVE}" - echo " time_to_run_visible: $(format_duration "$(elapsed_from_start "${RUN_VISIBLE_AT}")")" - echo " time_to_all_children_visible: $(format_duration "$(elapsed_from_start "${ALL_CHILDREN_VISIBLE_AT}")")" - echo " time_to_first_running: $(format_duration "$(elapsed_from_start "${FIRST_RUNNING_AT}")")" - echo " time_to_first_success: $(format_duration "$(elapsed_from_start "${FIRST_SUCCESS_AT}")")" - echo " time_to_root_terminal: $(format_duration "$(elapsed_from_start "${ROOT_TERMINAL_AT}")")" - echo " total_elapsed: $(format_duration "${total_elapsed}")" -} - -cd "${REPO_ROOT}" - -if [[ -n "${RUN_NAME}" ]]; then - echo "Launching run ${RUN_NAME}" -else - echo "Launching run with generated actions name" -fi -echo " config: ${CONFIG}" -echo " children: ${N_CHILDREN}" -echo " sleep_duration: ${SLEEP_DURATION}" -echo " poll_interval: ${POLL_INTERVAL}s" -echo " image target: ${IMAGE_REGISTRY}/${IMAGE_NAME}" -echo " image builder: ${IMAGE_BUILDER}" -echo " image platforms: ${FLYTE_STRESS_IMAGE_PLATFORMS:-linux/amd64}" -if [[ -n "${PROJECT}" || -n "${DOMAIN}" ]]; then - echo " project/domain override: ${PROJECT:-} / 
${DOMAIN:-}" -fi -if [[ "${#RUN_ENV_ARGS[@]}" -gt 0 ]]; then - echo " run env overrides: ${RUN_ENV_ARGS[*]}" -fi -if [[ -n "${REQUESTED_RUN_NAME}" && "${_U_USE_ACTIONS:-1}" == "1" ]]; then - echo " warning: custom run names can break abort routing on the actions path" -fi -echo -printf '%-8s %-28s %-14s %-14s %-8s %-8s %-8s %-8s %-8s %-8s %-8s\n' \ - "time" "root_phase" "seen" "not_created" "queued" "waiting" "init" "running" "active" "ok" "aborted" - -( - export FLYTE_STRESS_IMAGE_REGISTRY="${IMAGE_REGISTRY}" - export FLYTE_STRESS_IMAGE_NAME="${IMAGE_NAME}" - if [[ -n "${REQUESTED_RUN_NAME}" ]]; then - flyte_cmd run "${run_args[@]}" "${RUN_ENV_ARGS[@]}" --name "${REQUESTED_RUN_NAME}" \ - examples/stress/sleep_fanout.py sleep_fanout \ - --n_children "${N_CHILDREN}" \ - --sleep_duration "${SLEEP_DURATION}" - else - flyte_cmd run "${run_args[@]}" "${RUN_ENV_ARGS[@]}" \ - examples/stress/sleep_fanout.py sleep_fanout \ - --n_children "${N_CHILDREN}" \ - --sleep_duration "${SLEEP_DURATION}" - fi -) >"${LAUNCH_LOG}" 2>&1 & -LAUNCH_PID=$! - -FINAL_ROOT_PHASE="UNKNOWN" -FINAL_SEEN=0 -FINAL_SUCCEEDED=0 -FINAL_FAILED=0 -FINAL_ABORTED=0 -FINAL_TIMED_OUT=0 - -while true; do - if [[ -z "${RUN_NAME}" ]] && resolve_run_name_from_log; then - echo - echo "Resolved run name: ${RUN_NAME}" - fi - - if [[ "${LAUNCH_DONE}" -eq 0 ]] && ! kill -0 "${LAUNCH_PID}" 2>/dev/null; then - if wait "${LAUNCH_PID}"; then - LAUNCH_RC=0 - else - LAUNCH_RC=$? - fi - LAUNCH_DONE=1 - - if [[ "${LAUNCH_RC}" -eq 0 ]]; then - echo - echo "Launch command completed for run ${RUN_NAME:-}." - elif [[ "${RUN_VISIBLE}" -eq 0 ]]; then - echo - echo "Launch command failed before the run became visible." >&2 - cat "${LAUNCH_LOG}" >&2 - exit "${LAUNCH_RC}" - else - echo - echo "Launch command exited with ${LAUNCH_RC}, but the run is already visible. Continuing to monitor." 
>&2 - cat "${LAUNCH_LOG}" >&2 - fi - fi - - if [[ -n "${RUN_NAME}" ]] && actions_json="$(fetch_actions_json 2>/dev/null)"; then - now_epoch="$(date +%s)" - RUN_VISIBLE=1 - if [[ -z "${RUN_VISIBLE_AT}" ]]; then - RUN_VISIBLE_AT="${now_epoch}" - fi - - if [[ "${STOPPING}" -eq 1 && "${ABORT_SENT}" -eq 0 ]]; then - abort_remote_run - fi - - IFS=$'\t' read -r root_phase seen not_created queued waiting initializing running active succeeded failed aborted timed_out \ - <<<"$(snapshot_tsv "${actions_json}")" - - FINAL_ROOT_PHASE="${root_phase}" - FINAL_SEEN="${seen}" - FINAL_SUCCEEDED="${succeeded}" - FINAL_FAILED="${failed}" - FINAL_ABORTED="${aborted}" - FINAL_TIMED_OUT="${timed_out}" - - if (( seen > PEAK_SEEN )); then - PEAK_SEEN="${seen}" - fi - if (( running > PEAK_RUNNING )); then - PEAK_RUNNING="${running}" - fi - if (( active > PEAK_ACTIVE )); then - PEAK_ACTIVE="${active}" - fi - - if [[ -z "${ALL_CHILDREN_VISIBLE_AT}" && "${seen}" -eq "${N_CHILDREN}" ]]; then - ALL_CHILDREN_VISIBLE_AT="${now_epoch}" - fi - if [[ -z "${FIRST_RUNNING_AT}" && "${running}" -gt 0 ]]; then - FIRST_RUNNING_AT="${now_epoch}" - fi - if [[ -z "${FIRST_SUCCESS_AT}" && "${succeeded}" -gt 0 ]]; then - FIRST_SUCCESS_AT="${now_epoch}" - fi - - print_row \ - "${root_phase}" \ - "${seen}/${N_CHILDREN}" \ - "${not_created}" \ - "${queued}" \ - "${waiting}" \ - "${initializing}" \ - "${running}" \ - "${active}" \ - "${succeeded}" \ - "${aborted}" - - if [[ "${STOPPING}" -eq 1 && "${ABORT_NOTE_SHOWN}" -eq 0 && "${ABORT_SENT}" -eq 1 ]] && ! is_terminal_phase "${root_phase}"; then - echo "Abort requested. Waiting for root action to become terminal..." 
- ABORT_NOTE_SHOWN=1 - fi - - if is_terminal_phase "${root_phase}"; then - ROOT_TERMINAL_AT="${now_epoch}" - break - fi - else - if [[ -z "${RUN_NAME}" ]]; then - launch_stage="$(launch_stage_from_log || true)" - if [[ -n "${launch_stage}" && "${launch_stage}" != "${LAST_LAUNCH_STAGE}" ]]; then - echo "launch: ${launch_stage}" - LAST_LAUNCH_STAGE="${launch_stage}" - fi - fi - - if [[ "${LAUNCH_DONE}" -eq 1 && "${LAUNCH_RC}" -ne 0 && "${RUN_VISIBLE}" -eq 0 ]]; then - echo "Run ${RUN_NAME:-} never became visible after launch failure." >&2 - exit "${LAUNCH_RC}" - fi - - if [[ -z "${RUN_NAME}" ]]; then - print_row "RESOLVING_RUN" "0/${N_CHILDREN}" "${N_CHILDREN}" 0 0 0 0 0 0 0 - else - print_row "NO_ACTION_DATA" "0/${N_CHILDREN}" "${N_CHILDREN}" 0 0 0 0 0 0 0 - fi - - if [[ "${STOPPING}" -eq 1 && "${LAUNCH_DONE}" -eq 1 && "${RUN_VISIBLE}" -eq 0 && -n "${STOPPING_AT}" ]]; then - if (( $(date +%s) - STOPPING_AT >= INTERRUPT_GRACE_SEC )); then - echo "Run ${RUN_NAME:-} did not become visible within ${INTERRUPT_GRACE_SEC}s after interrupt. Exiting." >&2 - exit 130 - fi - fi - fi - - sleep "${POLL_INTERVAL}" -done - -if [[ "${LAUNCH_DONE}" -eq 0 ]]; then - if wait "${LAUNCH_PID}"; then - LAUNCH_RC=0 - else - LAUNCH_RC=$? 
- fi - LAUNCH_DONE=1 -fi - -print_summary \ - "${FINAL_ROOT_PHASE}" \ - "${FINAL_SEEN}" \ - "${FINAL_SUCCEEDED}" \ - "${FINAL_FAILED}" \ - "${FINAL_ABORTED}" \ - "${FINAL_TIMED_OUT}" - -if [[ "${STOPPING}" -eq 1 ]]; then - exit 130 -fi - -if [[ "${FINAL_ROOT_PHASE}" != "ACTION_PHASE_SUCCEEDED" ]]; then - exit 1 -fi From 0ec6b08b3ecb0ee6e82d88272bddd87634eaafcc Mon Sep 17 00:00:00 2001 From: pmahindrakar-oss Date: Mon, 27 Apr 2026 16:16:14 -0700 Subject: [PATCH 3/7] lint Signed-off-by: pmahindrakar-oss --- examples/stress/sleep_fanout_harness.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/stress/sleep_fanout_harness.py b/examples/stress/sleep_fanout_harness.py index 2c4fd1e04..c1d43052c 100644 --- a/examples/stress/sleep_fanout_harness.py +++ b/examples/stress/sleep_fanout_harness.py @@ -11,13 +11,11 @@ import asyncio import os import pathlib +import re import shutil -import sys import time from datetime import timedelta -import re - RUN_URL_RE = re.compile(r"URL:\s+(\S+/runs/[^/?\s]+)") RUN_NAME_RE = re.compile(r"Created Run:\s+([^\s]+)") RUNS_FILE = os.getenv("FLYTE_HARNESS_RUNS_FILE") From be96f27b63a0a4e56f02e4c07463c31be29ab88b Mon Sep 17 00:00:00 2001 From: pmahindrakar-oss Date: Mon, 27 Apr 2026 16:18:14 -0700 Subject: [PATCH 4/7] added readme Signed-off-by: pmahindrakar-oss --- examples/stress/README.md | 50 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 examples/stress/README.md diff --git a/examples/stress/README.md b/examples/stress/README.md new file mode 100644 index 000000000..b36edf5a8 --- /dev/null +++ b/examples/stress/README.md @@ -0,0 +1,50 @@ +# Stress Examples + +This directory contains ad hoc stress and failure-mode examples for Flyte and Union dogfood testing. 
+
+## Primary Entry Point
+
+Use [sleep_fanout_harness_wrapper.sh](sleep_fanout_harness_wrapper.sh) for multi-run `core-sleep` fanout tests:
+
+```bash
+examples/stress/sleep_fanout_harness_wrapper.sh \
+  --config ~/.flyte/config-dogfood.yaml \
+  --total-runs 10 \
+  --submit-concurrency 10 \
+  --n-children 1000 \
+  --sleep-duration 600 \
+  --poll-interval 1 \
+  --run-env _F_MAX_QPS=150 \
+  --run-env _F_CTRL_WORKERS=20 \
+  --run-env _F_P_CNC=1000
+```
+
+This wrapper:
+- submits many top-level `sleep_fanout` runs through `flyte run`
+- tracks aggregate child visibility and running counts
+- prints parent-run counts (`p_live`, `p_run`) and child creation rate (`create_rps`, `rps/p`)
+
+The underlying task definitions live in [sleep_fanout.py](sleep_fanout.py), and the local submit helper lives in [sleep_fanout_harness.py](sleep_fanout_harness.py).
+
+## Key Files
+
+- [sleep_fanout.py](sleep_fanout.py): `core-sleep` leaf task, parent fanout task, and swarm submit task definitions.
+- [sleep_fanout_harness.py](sleep_fanout_harness.py): local async submit harness used by the wrapper.
+- [runs_per_second.py](runs_per_second.py): launch-rate test helper.
+- [fanout_concurrency.py](fanout_concurrency.py): simple fanout/concurrency experiment.
+- [large_fanout.py](large_fanout.py): wide fanout example.
+- [duplicate_action_id.py](duplicate_action_id.py): action-id collision / dedupe behavior probe.
+- [crash_recovery_trace.py](crash_recovery_trace.py), [long_recovery.py](long_recovery.py), [fast_crasher.py](fast_crasher.py): controller and recovery failure scenarios.
+- [cpu_gremlin.py](cpu_gremlin.py), [network_gremlin.py](network_gremlin.py): fault-injection style workload examples.
+- [large_file_io.py](large_file_io.py), [large_dir_io.py](large_dir_io.py), [benchmark/large_io_comparison.py](benchmark/large_io_comparison.py): large I/O stress examples.
+- [scale_test_same_image.py](scale_test_same_image.py), [scale_test_varied_images.py](scale_test_varied_images.py), [image_builds.py](image_builds.py): image build and scale tests.
+
+## Notes
+
+- `sleep_fanout` leaves use the `core-sleep` plugin, so the children run in leaseworker instead of creating task pods.
+- Parent resource defaults for fanout are controlled in `sleep_fanout.py` via:
+  - `FLYTE_STRESS_FANOUT_CPU_REQUEST`
+  - `FLYTE_STRESS_FANOUT_CPU_LIMIT`
+  - `FLYTE_STRESS_FANOUT_MEMORY_REQUEST`
+  - `FLYTE_STRESS_FANOUT_MEMORY_LIMIT`
+- Remote image contents come from the built wheel in `dist/`, not directly from local `src/`. If the wrapper warns that `src/flyte` is newer than the wheel, rebuild the wheel before relying on SDK changes in remote runs.
From 5d9c3f8fc857fe5d1da574a0cffef88eb4eb58d4 Mon Sep 17 00:00:00 2001 From: pmahindrakar-oss Date: Mon, 27 Apr 2026 17:48:24 -0700 Subject: [PATCH 5/7] add token retry logic during refresh for timeout Signed-off-by: pmahindrakar-oss --- .../remote/_client/auth/_token_client.py | 36 ++++++++++- tests/flyte/remote/test_auth_token_client.py | 62 +++++++++++++++++++ 2 files changed, 97 insertions(+), 1 deletion(-) create mode 100644 tests/flyte/remote/test_auth_token_client.py diff --git a/src/flyte/remote/_client/auth/_token_client.py b/src/flyte/remote/_client/auth/_token_client.py index 214fdc5e0..7b459c91c 100644 --- a/src/flyte/remote/_client/auth/_token_client.py +++ b/src/flyte/remote/_client/auth/_token_client.py @@ -17,6 +17,10 @@ error_slow_down = "slow_down" error_auth_pending = "authorization_pending" +_TOKEN_REQUEST_MAX_ATTEMPTS = 3 +_TOKEN_REQUEST_INITIAL_BACKOFF_SECONDS = 0.5 +_TOKEN_REQUEST_MAX_BACKOFF_SECONDS = 2.0 + # Grant Types class GrantType(str, enum.Enum): @@ -82,6 +86,36 @@ def get_basic_authorization_header(client_id: str, client_secret: str) -> str: return "Basic {}".format(base64.b64encode(concatenated.encode(utf_8)).decode(utf_8)) +async def _post_token_request( + http_session: httpx.AsyncClient, + token_endpoint: str, + *, + data: dict[str, str], + headers: dict[str, str], +) -> httpx.Response: + """POST to the token endpoint with bounded retries for transient transport failures.""" + for attempt in range(1, _TOKEN_REQUEST_MAX_ATTEMPTS + 1): + try: + return await http_session.post(token_endpoint, data=data, headers=headers) + except httpx.TransportError: + if attempt >= _TOKEN_REQUEST_MAX_ATTEMPTS: + raise + backoff = min( + _TOKEN_REQUEST_INITIAL_BACKOFF_SECONDS * (2 ** (attempt - 1)), + _TOKEN_REQUEST_MAX_BACKOFF_SECONDS, + ) + logger.warning( + "Token endpoint request failed for %s, retrying in %.1fs (%d/%d)", + token_endpoint, + backoff, + attempt, + _TOKEN_REQUEST_MAX_ATTEMPTS, + exc_info=True, + ) + await asyncio.sleep(backoff) + 
raise RuntimeError("unreachable") + + async def get_token( token_endpoint: str, http_session: httpx.AsyncClient, @@ -149,7 +183,7 @@ async def get_token( if refresh_token: body["refresh_token"] = refresh_token - response = await http_session.post(token_endpoint, data=body, headers=headers) + response = await _post_token_request(http_session, token_endpoint, data=body, headers=headers) if not response.is_success: j = response.json() diff --git a/tests/flyte/remote/test_auth_token_client.py b/tests/flyte/remote/test_auth_token_client.py new file mode 100644 index 000000000..9ebe1e110 --- /dev/null +++ b/tests/flyte/remote/test_auth_token_client.py @@ -0,0 +1,62 @@ +from unittest.mock import AsyncMock, Mock + +import httpx +import pytest + +from flyte.remote._client.auth._token_client import get_token + + +def _success_response(access_token: str = "access-token", refresh_token: str | None = None, expires_in: int = 3600): + response = Mock(spec=httpx.Response) + response.is_success = True + payload = { + "access_token": access_token, + "expires_in": expires_in, + } + if refresh_token is not None: + payload["refresh_token"] = refresh_token + response.json.return_value = payload + return response + + +@pytest.mark.asyncio +async def test_get_token_retries_transient_transport_errors(monkeypatch): + session = Mock(spec=httpx.AsyncClient) + session.post = AsyncMock( + side_effect=[ + httpx.ConnectTimeout("connect timed out"), + _success_response(refresh_token="refresh-token"), + ] + ) + sleep = AsyncMock() + monkeypatch.setattr("flyte.remote._client.auth._token_client.asyncio.sleep", sleep) + + token, refresh_token, expires_in = await get_token( + token_endpoint="https://issuer.example.com/oauth/token", + http_session=session, + client_id="client-id", + ) + + assert token == "access-token" + assert refresh_token == "refresh-token" + assert expires_in == 3600 + assert session.post.await_count == 2 + sleep.assert_awaited_once_with(0.5) + + +@pytest.mark.asyncio +async def 
test_get_token_raises_after_retry_budget_exhausted(monkeypatch): + session = Mock(spec=httpx.AsyncClient) + session.post = AsyncMock(side_effect=httpx.ConnectTimeout("connect timed out")) + sleep = AsyncMock() + monkeypatch.setattr("flyte.remote._client.auth._token_client.asyncio.sleep", sleep) + + with pytest.raises(httpx.ConnectTimeout): + await get_token( + token_endpoint="https://issuer.example.com/oauth/token", + http_session=session, + client_id="client-id", + ) + + assert session.post.await_count == 3 + assert sleep.await_count == 2 From 9492705913e2b2a821372f99fdad68adb3818cc0 Mon Sep 17 00:00:00 2001 From: pmahindrakar-oss Date: Mon, 27 Apr 2026 22:20:13 -0700 Subject: [PATCH 6/7] fix fmt Signed-off-by: pmahindrakar-oss --- examples/stress/sleep_fanout_harness.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/stress/sleep_fanout_harness.py b/examples/stress/sleep_fanout_harness.py index c1d43052c..541408018 100644 --- a/examples/stress/sleep_fanout_harness.py +++ b/examples/stress/sleep_fanout_harness.py @@ -34,6 +34,11 @@ def _subprocess_env() -> dict[str, str]: return env +def _append_run_name(path: str, name: str) -> None: + with open(path, "a", encoding="utf-8") as f: + f.write(f"{name}\n") + + async def submit_one(sem: asyncio.Semaphore, idx: int, n_children: int, sleep_duration: timedelta) -> str | None: async with sem: os.environ.setdefault("_U_USE_ACTIONS", "1") @@ -131,8 +136,7 @@ async def wrapped(i: int): submitted += 1 if RUNS_FILE: async with runs_file_lock: - with open(RUNS_FILE, "a", encoding="utf-8") as f: - f.write(f"{name}\n") + await asyncio.to_thread(_append_run_name, RUNS_FILE, name) print(f"submitted_run idx={i} url={name}", flush=True) done = submitted + failed if done % 100 == 0: From f9d228d0ee6cdf49abd2f2bac273de1e8361783b Mon Sep 17 00:00:00 2001 From: pmahindrakar-oss Date: Mon, 27 Apr 2026 22:24:10 -0700 Subject: [PATCH 7/7] increase to 8sec Signed-off-by: pmahindrakar-oss --- 
src/flyte/remote/_client/auth/_token_client.py | 6 +++--- tests/flyte/remote/test_auth_token_client.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/flyte/remote/_client/auth/_token_client.py b/src/flyte/remote/_client/auth/_token_client.py index 7b459c91c..9ea6198a2 100644 --- a/src/flyte/remote/_client/auth/_token_client.py +++ b/src/flyte/remote/_client/auth/_token_client.py @@ -17,9 +17,9 @@ error_slow_down = "slow_down" error_auth_pending = "authorization_pending" -_TOKEN_REQUEST_MAX_ATTEMPTS = 3 -_TOKEN_REQUEST_INITIAL_BACKOFF_SECONDS = 0.5 -_TOKEN_REQUEST_MAX_BACKOFF_SECONDS = 2.0 +_TOKEN_REQUEST_MAX_ATTEMPTS = 5 +_TOKEN_REQUEST_INITIAL_BACKOFF_SECONDS = 1.0 +_TOKEN_REQUEST_MAX_BACKOFF_SECONDS = 8.0 # Grant Types diff --git a/tests/flyte/remote/test_auth_token_client.py b/tests/flyte/remote/test_auth_token_client.py index 9ebe1e110..8a66b29cc 100644 --- a/tests/flyte/remote/test_auth_token_client.py +++ b/tests/flyte/remote/test_auth_token_client.py @@ -41,7 +41,7 @@ async def test_get_token_retries_transient_transport_errors(monkeypatch): assert refresh_token == "refresh-token" assert expires_in == 3600 assert session.post.await_count == 2 - sleep.assert_awaited_once_with(0.5) + sleep.assert_awaited_once_with(1.0) @pytest.mark.asyncio @@ -58,5 +58,5 @@ async def test_get_token_raises_after_retry_budget_exhausted(monkeypatch): client_id="client-id", ) - assert session.post.await_count == 3 - assert sleep.await_count == 2 + assert session.post.await_count == 5 + assert sleep.await_count == 4