diff --git a/.gitignore b/.gitignore
index dff0a6cb48f..57ae23aec39 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,6 +18,7 @@
 tooling/ef_tests/state/runner_v2/success_report.txt
 tooling/reorgs/data
 tooling/sync/logs/
+tooling/sync/multisync_logs/
 
 # Repos checked out by make target
 /hive/
diff --git a/tooling/sync/Makefile b/tooling/sync/Makefile
index 39969dc6bcf..440ce117ec0 100644
--- a/tooling/sync/Makefile
+++ b/tooling/sync/Makefile
@@ -2,7 +2,9 @@
 flamegraph-branch flamegraph-inner flamegraph-mainnet flamegraph-sepolia flamegraph-holesky \
 flamegraph-hoodi start-lighthouse start-ethrex backup-db start-mainnet-metrics-docker \
 start-sepolia-metrics-docker start-holesky-metrics-docker start-hoodi-metrics-docker \
-start-metrics-docker tail-syncing-logs tail-metrics-logs copy_flamegraph import-with-metrics
+start-metrics-docker tail-syncing-logs tail-metrics-logs copy_flamegraph import-with-metrics \
+multisync-up multisync-down multisync-clean multisync-logs multisync-history multisync-list-logs \
+multisync-restart multisync-monitor multisync-run multisync-loop
 
 ETHREX_DIR ?= "../.."
 EVM ?= levm
@@ -220,3 +222,76 @@ server-sync:
 	sleep 0.2
 	tmux new-window -t sync:2 -n ethrex "cd ../../metrics && docker stop metrics-ethereum-metrics-exporter-1 || true && docker compose -f docker-compose-metrics.yaml -f docker-compose-metrics-l1.overrides.yaml up -d && cd .. && ulimit -n 1000000 && rm -rf ~/.local/share/ethrex && RUST_LOG=info,ethrex_p2p::sync=debug $(if $(DEBUG_ASSERT),RUSTFLAGS='-C debug-assertions=yes') $(if $(HEALING),SKIP_START_SNAP_SYNC=1) cargo run --release --bin ethrex --features rocksdb -- --http.addr 0.0.0.0 --metrics --metrics.port 3701 --network $(SERVER_SYNC_NETWORK) $(if $(MEMORY),--datadir memory) --authrpc.jwtsecret ~/secrets/jwt.hex $(if $(or $(FULL_SYNC),$(HEALING)),--syncmode full) 2>&1 | tee $(LOGS_FILE)"
+
+# ==============================================================================
+# Docker Compose Multi-Network Snapsync
+# ==============================================================================
+
+MULTISYNC_COMPOSE = docker compose -f docker-compose.multisync.yaml
+MULTISYNC_NETWORKS ?= hoodi,sepolia,mainnet
+comma := ,
+MULTISYNC_NETWORK_LIST := $(subst $(comma), ,$(MULTISYNC_NETWORKS))
+MULTISYNC_SERVICES := $(foreach n,$(MULTISYNC_NETWORK_LIST),setup-jwt-$(n) ethrex-$(n) consensus-$(n))
+
+multisync-up: ## Start all networks specified in MULTISYNC_NETWORKS via Docker Compose.
+	$(MULTISYNC_COMPOSE) up -d $(MULTISYNC_SERVICES)
+
+multisync-down: ## Stop and remove all snapsync containers.
+	$(MULTISYNC_COMPOSE) down
+
+multisync-clean: ## Stop, remove containers AND volumes (full reset).
+	$(MULTISYNC_COMPOSE) down -v
+
+multisync-logs: ## Tail logs from all networks.
+	$(MULTISYNC_COMPOSE) logs -f
+
+multisync-logs-%: ## Tail logs for a specific network (e.g., multisync-logs-hoodi).
+	$(MULTISYNC_COMPOSE) logs -f ethrex-$* consensus-$*
+
+multisync-logs-ethrex-%: ## Tail only ethrex logs for a network (e.g., multisync-logs-ethrex-hoodi).
+	$(MULTISYNC_COMPOSE) logs -f ethrex-$*
+
+multisync-logs-consensus-%: ## Tail only consensus logs for a network (e.g., multisync-logs-consensus-hoodi).
+	$(MULTISYNC_COMPOSE) logs -f consensus-$*
+
+multisync-restart: ## Restart the cycle (clean volumes + start fresh).
+	$(MULTISYNC_COMPOSE) down -v
+	$(MULTISYNC_COMPOSE) up -d $(MULTISYNC_SERVICES)
+
+multisync-monitor: ## Monitor all networks (one-shot, exits on completion).
+ python3 docker_monitor.py --networks $(MULTISYNC_NETWORKS) --exit-on-success + +multisync-run: ## Full run: start + monitor (one-shot, exits on completion). + $(MULTISYNC_COMPOSE) up -d $(MULTISYNC_SERVICES) + @echo "Waiting 10s for containers to start..." + @sleep 10 + python3 docker_monitor.py --networks $(MULTISYNC_NETWORKS) --exit-on-success + +multisync-loop: ## Continuous loop: sync all networks, restart on success, repeat forever. + $(MULTISYNC_COMPOSE) up -d $(MULTISYNC_SERVICES) + @echo "Waiting 10s for containers to start..." + @sleep 10 + python3 docker_monitor.py --networks $(MULTISYNC_NETWORKS) --compose-file docker-compose.multisync.yaml --compose-dir $(CURDIR) + +multisync-history: ## View the run history log. + @if [ -f multisync_logs/run_history.log ]; then \ + cat multisync_logs/run_history.log; \ + else \ + echo "No run history found. Run 'make multisync-loop' first."; \ + fi + +multisync-list-logs: ## List all saved run logs. + @if [ -d multisync_logs ]; then \ + echo "=== Saved Run Logs ===" && \ + ls -la multisync_logs/ && \ + echo "" && \ + for dir in multisync_logs/run_*/; do \ + if [ -d "$$dir" ]; then \ + echo "$$dir:"; \ + ls "$$dir"; \ + echo ""; \ + fi; \ + done; \ + else \ + echo "No logs directory found."; \ + fi diff --git a/tooling/sync/README.md b/tooling/sync/README.md index 364c271f6b4..e459e752fa9 100644 --- a/tooling/sync/README.md +++ b/tooling/sync/README.md @@ -61,3 +61,126 @@ It's advisable to only run flamegraphs on blocks that have already been synced, - `make copy-flamegraph` can be used to quickly copy the flamegraph generated by the flamegraph commands from the `ethrex` repo folder to the `tooling/sync/flamegraphs` folder so it isn't overwritten by future flamegraph runs. `GRAPHNAME` can be provided to give the file a custom name. - `make import-with-metrics` can be used to import blocks from an RLP file with metrics enabled, specially useful for a block processing profile. The path to the rlp file can be passed with the `RLP_FILE` environment variable, while the network can be provided with the `NETWORK` variable. + +## Multi-Network Parallel Snapsync + +This feature allows running multiple Ethrex nodes in parallel (hoodi, sepolia, mainnet) via Docker Compose, with automated monitoring, Slack notifications, and a history log of runs. + +### Overview + +The parallel snapsync system: +- Spawns multiple networks simultaneously via Docker Compose +- Monitors snapsync progress with a 4-hour timeout +- Verifies block processing for 22 minutes after sync completion +- Sends Slack notifications on success/failure +- Maintains a history log of all runs +- On success: restarts containers and begins a new sync cycle +- On failure: keeps containers running for debugging + +### Requirements + +- Docker and Docker Compose +- Python 3 with the `requests` library (`pip install requests`) +- (Optional) Slack webhook URLs for notifications + +### Quick Start + +```bash +# Start a continuous monitoring loop (recommended for servers) +make multisync-loop + +# Or run a single sync cycle +make multisync-run +``` + +### Docker Compose Setup + +The `docker-compose.multisync.yaml` file defines services for each network with isolated volumes. Each network uses Lighthouse as the consensus client with checkpoint sync. 
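+
+Each execution client's JSON-RPC endpoint is published on its own host port (see the mapping below), so any node can be queried directly from the host. As a quick manual check, mirroring the `eth_syncing` poll that `docker_monitor.py` performs, you can ask a node whether it is still snap-syncing (this example assumes the default hoodi mapping on port 8545):
+
+```bash
+curl -s -X POST http://localhost:8545 \
+  -H 'Content-Type: application/json' \
+  -d '{"jsonrpc":"2.0","method":"eth_syncing","params":[],"id":1}'
+# "result": false means the node reports the sync as complete
+```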
+ +Host port mapping: +- **hoodi**: `localhost:8545` +- **sepolia**: `localhost:8546` +- **mainnet**: `localhost:8547` +- **hoodi-2**: `localhost:8548` (for additional testing) + +### Environment Variables + +Create a `.env` file in `tooling/sync/` with: + +```bash +# Slack notifications (optional) +SLACK_WEBHOOK_URL_SUCCESS=https://hooks.slack.com/services/... +SLACK_WEBHOOK_URL_FAILED=https://hooks.slack.com/services/... +``` + +The `MULTISYNC_NETWORKS` variable controls which networks to sync (default: `hoodi,sepolia,mainnet`): + +```bash +# Sync only hoodi and sepolia +make multisync-loop MULTISYNC_NETWORKS=hoodi,sepolia +``` + +### Monitoring Behavior + +The `docker_monitor.py` script manages the sync lifecycle: + +1. **Waiting**: Node container starting up +2. **Syncing**: Snapsync in progress (4-hour timeout) +3. **Block Processing**: Sync complete, verifying block processing (22 minutes) +4. **Success**: Network synced and processing blocks +5. **Failed**: Timeout, stall, or error detected + +The monitor checks for: +- Sync timeout (default 4 hours) +- Block processing stall (10 minutes without new blocks) +- Node unresponsiveness + +### Logs and History + +Logs are saved to `tooling/sync/multisync_logs/`: + +``` +multisync_logs/ +├── run_history.log # Append-only history of all runs +└── run_YYYYMMDD_HHMMSS/ # Per-run folder + ├── summary.txt # Run summary + ├── ethrex-hoodi.log # Ethrex logs per network + ├── consensus-hoodi.log # Lighthouse logs per network + └── ... +``` + +### Commands + +**Starting and Stopping:** + +- `make multisync-up` starts all networks via Docker Compose. +- `make multisync-down` stops and removes containers (preserves volumes). +- `make multisync-clean` stops containers and removes volumes (full reset). +- `make multisync-restart` restarts the cycle (clean volumes + start fresh). + +**Monitoring:** + +- `make multisync-loop` runs continuous sync cycles (recommended for servers). On success, restarts and syncs again. On failure, stops for debugging. +- `make multisync-run` runs a single sync cycle and exits on completion. +- `make multisync-monitor` monitors already-running containers (one-shot). + +**Logs:** + +- `make multisync-logs` tails logs from all networks. +- `make multisync-logs-hoodi` tails logs for a specific network. +- `make multisync-logs-ethrex-hoodi` tails only ethrex logs for a network. +- `make multisync-logs-consensus-hoodi` tails only consensus logs for a network. +- `make multisync-history` views the run history log. +- `make multisync-list-logs` lists all saved run logs. + +### Slack Notifications + +When configured, notifications are sent: +- On **success**: All networks synced and processing blocks +- On **failure**: Any network failed (timeout, stall, or error) + +Notifications include: +- Run ID and count +- Host, branch, and commit info +- Per-network status with sync time and blocks processed +- Link to the commit on GitHub diff --git a/tooling/sync/docker-compose.multisync.yaml b/tooling/sync/docker-compose.multisync.yaml new file mode 100644 index 00000000000..6fd04fa259c --- /dev/null +++ b/tooling/sync/docker-compose.multisync.yaml @@ -0,0 +1,240 @@ +# Multi-network parallel snapsync +# +# For full documentation, see: tooling/sync/README.md (section "Multi-Network Parallel Snapsync") +# +# This file defines 4 networks: hoodi, sepolia, mainnet, and hoodi-2. +# By default, the Makefile starts only hoodi, sepolia, and mainnet. 
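+#
+# Every network follows the same three-service pattern; the Makefile and
+# docker_monitor.py rely on these service/container names:
+#   setup-jwt-<network>   one-shot job that writes the shared JWT secret
+#   ethrex-<network>      execution client running a snap sync
+#   consensus-<network>   Lighthouse beacon node driving it via the Engine API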
+#
+# Usage via Makefile (recommended):
+#   make multisync-loop                                    # Start default networks and monitor continuously
+#   make multisync-run                                     # Single sync cycle
+#   make multisync-loop MULTISYNC_NETWORKS=hoodi,sepolia   # Custom network subset
+#
+# Direct docker compose usage (starts ALL 4 networks):
+#   docker compose -f docker-compose.multisync.yaml up -d
+#   docker compose -f docker-compose.multisync.yaml logs -f
+#   docker compose -f docker-compose.multisync.yaml down -v
+#
+# To start a subset directly with docker compose:
+#   docker compose -f docker-compose.multisync.yaml up -d ethrex-hoodi consensus-hoodi setup-jwt-hoodi
+#
+# Each network runs in isolation with its own volumes.
+# Host ports are mapped for external RPC access:
+#   hoodi:    localhost:8545
+#   sepolia:  localhost:8546
+#   mainnet:  localhost:8547
+#   hoodi-2:  localhost:8548
+
+x-ethrex-common: &ethrex-common
+  image: "ghcr.io/lambdaclass/ethrex:main"
+  pull_policy: always
+  ulimits:
+    nofile: 1000000
+  restart: unless-stopped
+
+x-consensus-common: &consensus-common
+  image: sigp/lighthouse:v8.0.1
+  restart: unless-stopped
+
+services:
+  # =============================================================================
+  # HOODI
+  # =============================================================================
+  setup-jwt-hoodi:
+    image: alpine
+    volumes:
+      - secrets-hoodi:/secrets
+    command: sh -c 'apk add openssl && openssl rand -hex 32 | tr -d "\n" | tee /secrets/jwt.hex'
+
+  consensus-hoodi:
+    <<: *consensus-common
+    container_name: consensus-hoodi
+    volumes:
+      - secrets-hoodi:/secrets
+      - consensus-hoodi:/root/.lighthouse
+    command: >
+      lighthouse bn
+      --network hoodi
+      --http --http-address 0.0.0.0
+      --execution-endpoint http://ethrex-hoodi:8551
+      --execution-jwt /secrets/jwt.hex
+      --checkpoint-sync-url https://hoodi-checkpoint-sync.stakely.io/
+      --checkpoint-sync-url-timeout 600
+      --purge-db
+    depends_on:
+      setup-jwt-hoodi:
+        condition: service_completed_successfully
+
+  ethrex-hoodi:
+    <<: *ethrex-common
+    container_name: ethrex-hoodi
+    ports:
+      - "8545:8545" # RPC
+    volumes:
+      - secrets-hoodi:/secrets
+      - ethrex-hoodi:/data
+    command: >
+      --http.addr 0.0.0.0
+      --network hoodi
+      --authrpc.addr 0.0.0.0
+      --authrpc.jwtsecret /secrets/jwt.hex
+      --syncmode snap
+      --datadir /data
+    depends_on:
+      setup-jwt-hoodi:
+        condition: service_completed_successfully
+
+  # =============================================================================
+  # SEPOLIA
+  # =============================================================================
+  setup-jwt-sepolia:
+    image: alpine
+    volumes:
+      - secrets-sepolia:/secrets
+    command: sh -c 'apk add openssl && openssl rand -hex 32 | tr -d "\n" | tee /secrets/jwt.hex'
+
+  consensus-sepolia:
+    <<: *consensus-common
+    container_name: consensus-sepolia
+    volumes:
+      - secrets-sepolia:/secrets
+      - consensus-sepolia:/root/.lighthouse
+    command: >
+      lighthouse bn
+      --network sepolia
+      --http --http-address 0.0.0.0
+      --execution-endpoint http://ethrex-sepolia:8551
+      --execution-jwt /secrets/jwt.hex
+      --checkpoint-sync-url https://checkpoint-sync.sepolia.ethpandaops.io
+      --checkpoint-sync-url-timeout 600
+      --purge-db
+    depends_on:
+      setup-jwt-sepolia:
+        condition: service_completed_successfully
+
+  ethrex-sepolia:
+    <<: *ethrex-common
+    container_name: ethrex-sepolia
+    ports:
+      - "8546:8545" # RPC on different host port
+    volumes:
+      - secrets-sepolia:/secrets
+      - ethrex-sepolia:/data
+    command: >
+      --http.addr 0.0.0.0
+      --network sepolia
+      --authrpc.addr 0.0.0.0
+      --authrpc.jwtsecret /secrets/jwt.hex
+      --syncmode snap
+ --datadir /data + depends_on: + setup-jwt-sepolia: + condition: service_completed_successfully + + # ============================================================================= + # MAINNET + # ============================================================================= + setup-jwt-mainnet: + image: alpine + volumes: + - secrets-mainnet:/secrets + command: sh -c 'apk add openssl && openssl rand -hex 32 | tr -d "\n" | tee /secrets/jwt.hex' + + consensus-mainnet: + <<: *consensus-common + container_name: consensus-mainnet + volumes: + - secrets-mainnet:/secrets + - consensus-mainnet:/root/.lighthouse + command: > + lighthouse bn + --network mainnet + --http --http-address 0.0.0.0 + --execution-endpoint http://ethrex-mainnet:8551 + --execution-jwt /secrets/jwt.hex + --checkpoint-sync-url https://mainnet-checkpoint-sync.attestant.io + --checkpoint-sync-url-timeout 600 + --purge-db + depends_on: + setup-jwt-mainnet: + condition: service_completed_successfully + + ethrex-mainnet: + <<: *ethrex-common + container_name: ethrex-mainnet + ports: + - "8547:8545" # RPC on different host port + volumes: + - secrets-mainnet:/secrets + - ethrex-mainnet:/data + command: > + --http.addr 0.0.0.0 + --network mainnet + --authrpc.addr 0.0.0.0 + --authrpc.jwtsecret /secrets/jwt.hex + --syncmode snap + --datadir /data + depends_on: + setup-jwt-mainnet: + condition: service_completed_successfully + + # ============================================================================= + # HOODI-2 + # ============================================================================= + setup-jwt-hoodi-2: + image: alpine + volumes: + - secrets-hoodi-2:/secrets + command: sh -c 'apk add openssl && openssl rand -hex 32 | tr -d "\n" | tee /secrets/jwt.hex' + + consensus-hoodi-2: + <<: *consensus-common + container_name: consensus-hoodi-2 + volumes: + - secrets-hoodi-2:/secrets + - consensus-hoodi-2:/root/.lighthouse + command: > + lighthouse bn + --network hoodi + --http --http-address 0.0.0.0 + --execution-endpoint http://ethrex-hoodi-2:8551 + --execution-jwt /secrets/jwt.hex + --checkpoint-sync-url https://hoodi-checkpoint-sync.stakely.io/ + --checkpoint-sync-url-timeout 600 + --purge-db + depends_on: + setup-jwt-hoodi-2: + condition: service_completed_successfully + + ethrex-hoodi-2: + <<: *ethrex-common + container_name: ethrex-hoodi-2 + ports: + - "8548:8545" # RPC on different host port + volumes: + - secrets-hoodi-2:/secrets + - ethrex-hoodi-2:/data + command: > + --http.addr 0.0.0.0 + --network hoodi + --authrpc.addr 0.0.0.0 + --authrpc.jwtsecret /secrets/jwt.hex + --syncmode snap + --datadir /data + depends_on: + setup-jwt-hoodi-2: + condition: service_completed_successfully + +volumes: + secrets-hoodi: + secrets-sepolia: + secrets-mainnet: + consensus-hoodi: + consensus-sepolia: + consensus-mainnet: + ethrex-hoodi: + ethrex-sepolia: + ethrex-mainnet: + secrets-hoodi-2: + consensus-hoodi-2: + ethrex-hoodi-2: diff --git a/tooling/sync/docker_monitor.py b/tooling/sync/docker_monitor.py new file mode 100644 index 00000000000..b49bfb8ad2a --- /dev/null +++ b/tooling/sync/docker_monitor.py @@ -0,0 +1,440 @@ +#!/usr/bin/env python3 +"""Monitor Docker Compose snapsync instances for sync completion.""" + +import argparse +import os +import socket +import subprocess +import sys +import time +from dataclasses import dataclass +from datetime import datetime +from pathlib import Path +from typing import Any, Optional + +import requests + +# Load .env file if it exists +if os.path.exists('.env'): + with open('.env') as f: + for 
line in f: + line = line.strip() + if line and not line.startswith('#'): + key, _, value = line.partition('=') + os.environ[key.strip()] = value.strip() + +CHECK_INTERVAL = 10 +SYNC_TIMEOUT = 4 * 60 # 4 hours default sync timeout (in minutes) +BLOCK_PROCESSING_DURATION = 22 * 60 # Monitor block processing for 22 minutes +BLOCK_STALL_TIMEOUT = 10 * 60 # Fail if no new block for 10 minutes +STATUS_PRINT_INTERVAL = 30 + +# Network to port mapping (fixed in docker-compose.multisync.yaml) +NETWORK_PORTS = { + "hoodi": 8545, + "sepolia": 8546, + "mainnet": 8547, + "hoodi-2": 8548, +} + +# Logging configuration +LOGS_DIR = Path("./multisync_logs") +RUN_LOG_FILE = LOGS_DIR / "run_history.log" # Append-only text log + +STATUS_EMOJI = { + "waiting": "⏳", "syncing": "🔄", "synced": "✅", + "block_processing": "📦", "success": "🎉", "failed": "❌" +} + + +@dataclass +class Instance: + name: str + port: int + container: str = "" + status: str = "waiting" + start_time: float = 0 + sync_time: float = 0 + last_block: int = 0 + last_block_time: float = 0 # When we last saw a new block + block_check_start: float = 0 + initial_block: int = 0 # Block when entering block_processing + error: str = "" + + @property + def rpc_url(self) -> str: + return f"http://localhost:{self.port}" + + +def fmt_time(secs: float) -> str: + secs = int(abs(secs)) + h, m, s = secs // 3600, (secs % 3600) // 60, secs % 60 + return " ".join(f"{v}{u}" for v, u in [(h, "h"), (m, "m"), (s, "s")] if v or (not h and not m)) + + +def git_commit() -> str: + try: + return subprocess.check_output(["git", "rev-parse", "--short", "HEAD"], stderr=subprocess.DEVNULL).decode().strip() + except Exception: + return "unknown" + + +def git_branch() -> str: + try: + return subprocess.check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"], stderr=subprocess.DEVNULL).decode().strip() + except Exception: + return "unknown" + + +def container_start_time(name: str) -> Optional[float]: + try: + out = subprocess.check_output(["docker", "inspect", "-f", "{{.State.StartedAt}}", name], stderr=subprocess.DEVNULL).decode().strip() + if '.' 
in out:
+            base, frac = out.rsplit('.', 1)
+            out = f"{base}.{frac.rstrip('Z')[:6]}"
+        return datetime.fromisoformat(out.replace('Z', '+00:00')).timestamp()
+    except Exception:
+        return None
+
+
+def rpc_call(url: str, method: str) -> Optional[Any]:
+    try:
+        return requests.post(url, json={"jsonrpc": "2.0", "method": method, "params": [], "id": 1}, timeout=5).json().get("result")
+    except Exception:
+        return None
+
+
+def slack_notify(run_id: str, run_count: int, instances: list, hostname: str, branch: str, commit: str):
+    """Send a single summary Slack message for the run."""
+    all_success = all(i.status == "success" for i in instances)
+    url = os.environ.get("SLACK_WEBHOOK_URL_SUCCESS" if all_success else "SLACK_WEBHOOK_URL_FAILED")
+    if not url:
+        return
+    status_icon = "✅" if all_success else "❌"
+    header = f"{status_icon} Run #{run_count} (ID: {run_id})"
+    run_start = datetime.strptime(run_id, "%Y%m%d_%H%M%S")
+    elapsed_secs = (datetime.now() - run_start).total_seconds()
+    elapsed_str = fmt_time(elapsed_secs)
+    summary = f"*Host:* `{hostname}`\n*Branch:* `{branch}`\n*Commit:* <https://github.com/lambdaclass/ethrex/commit/{commit}|{commit}>\n*Elapsed:* `{elapsed_str}`\n*Logs:* `tooling/sync/multisync_logs/run_{run_id}`\n*Result:* {'SUCCESS' if all_success else 'FAILED'}"
+    blocks = [
+        {"type": "header", "text": {"type": "plain_text", "text": header}},
+        {"type": "section", "text": {"type": "mrkdwn", "text": summary}},
+        {"type": "divider"}
+    ]
+    for i in instances:
+        icon = "✅" if i.status == "success" else "❌"
+        line = f"{icon} *{i.name}*: `{i.status}`"
+        if i.sync_time:
+            line += f" (sync: {fmt_time(i.sync_time)})"
+        if i.initial_block:
+            line += f" post-sync block: {i.initial_block}"
+        if i.initial_block and i.last_block > i.initial_block:
+            blocks_processed = i.last_block - i.initial_block
+            line += f" (processed +{blocks_processed} blocks in {BLOCK_PROCESSING_DURATION//60}m)"
+        if i.error:
+            line += f"\n  Error: {i.error}"
+        blocks.append({"type": "section", "text": {"type": "mrkdwn", "text": line}})
+    try:
+        requests.post(url, json={"blocks": blocks}, timeout=10)
+    except Exception:
+        # Slack notification failures are non-critical; ignore them so they
+        # do not interfere with the main monitoring workflow.
+ pass + + +def ensure_logs_dir(): + """Ensure the logs directory exists.""" + LOGS_DIR.mkdir(parents=True, exist_ok=True) + + +def save_container_logs(container: str, run_id: str, suffix: str = ""): + """Save container logs to a file.""" + log_file = LOGS_DIR / f"run_{run_id}" / f"{container}{suffix}.log" + log_file.parent.mkdir(parents=True, exist_ok=True) + try: + logs = subprocess.check_output( + ["docker", "logs", container], + stderr=subprocess.STDOUT, + timeout=60 + ).decode(errors='replace') + log_file.write_text(logs) + print(f" 📄 Saved logs: {log_file}") + return True + except subprocess.CalledProcessError as e: + print(f" ⚠️ Failed to get logs for {container}: {e}") + return False + except subprocess.TimeoutExpired: + print(f" ⚠️ Timeout getting logs for {container}") + return False + except Exception as e: + print(f" ⚠️ Error saving logs for {container}: {e}") + return False + + +def save_all_logs(instances: list[Instance], run_id: str, compose_file: str): + """Save logs for all containers (ethrex + consensus).""" + print(f"\n📁 Saving logs for run {run_id}...") + + for inst in instances: + # Save ethrex logs + save_container_logs(inst.container, run_id) + # Save consensus logs (convention: consensus-{network}) + consensus_container = inst.container.replace("ethrex-", "consensus-") + save_container_logs(consensus_container, run_id) + + print(f"📁 Logs saved to {LOGS_DIR}/run_{run_id}/\n") + + +def log_run_result(run_id: str, run_count: int, instances: list[Instance], hostname: str, branch: str, commit: str): + """Append run result to the persistent log file.""" + ensure_logs_dir() + all_success = all(i.status == "success" for i in instances) + status_icon = "✅" if all_success else "❌" + run_start = datetime.strptime(run_id, "%Y%m%d_%H%M%S") + elapsed_secs = (datetime.now() - run_start).total_seconds() + elapsed_str = fmt_time(elapsed_secs) + # Build log entry as plain text + lines = [ + f"\n{'='*60}", + f"{status_icon} Run #{run_count} (ID: {run_id})", + f"{'='*60}", + f"Host: {hostname}", + f"Branch: {branch}", + f"Commit: {commit}", + f"Elapsed: {elapsed_str}", + f"Result: {'SUCCESS' if all_success else 'FAILED'}", + "", + ] + for inst in instances: + icon = "✅" if inst.status == "success" else "❌" + line = f" {icon} {inst.name}: {inst.status}" + if inst.sync_time: + line += f" (sync: {fmt_time(inst.sync_time)})" + if inst.initial_block: + line += f" post-sync block: {inst.initial_block}" + if inst.initial_block and inst.last_block > inst.initial_block: + blocks_processed = inst.last_block - inst.initial_block + line += f" (processed +{blocks_processed} blocks in {BLOCK_PROCESSING_DURATION//60}m)" + if inst.error: + line += f"\n Error: {inst.error}" + lines.append(line) + lines.append("") + # Append to log file + with open(RUN_LOG_FILE, "a") as f: + f.write("\n".join(lines) + "\n") + print(f"📝 Run logged to {RUN_LOG_FILE}") + # Also write summary to the run folder + summary_file = LOGS_DIR / f"run_{run_id}" / "summary.txt" + summary_file.parent.mkdir(parents=True, exist_ok=True) + summary_file.write_text("\n".join(lines)) + + +def generate_run_id() -> str: + """Generate a unique run ID based on timestamp.""" + return datetime.now().strftime("%Y%m%d_%H%M%S") + + +def restart_containers(compose_file: str, compose_dir: str): + """Stop and restart docker compose containers, clearing volumes.""" + print("\n🔄 Restarting containers...\n", flush=True) + try: + subprocess.run(["docker", "compose", "-f", compose_file, "down", "-v"], cwd=compose_dir, check=True) + time.sleep(5) + 
subprocess.run(["docker", "compose", "-f", compose_file, "up", "-d"], cwd=compose_dir, check=True) + print("✅ Containers restarted successfully\n", flush=True) + return True + except subprocess.CalledProcessError as e: + print(f"❌ Failed to restart containers: {e}\n", flush=True) + return False + + +def reset_instance(inst: Instance): + """Reset instance state for a new sync cycle.""" + inst.status = "waiting" + inst.start_time = 0 + inst.sync_time = 0 + inst.last_block = 0 + inst.last_block_time = 0 + inst.block_check_start = 0 + inst.initial_block = 0 + inst.error = "" + + +def print_status(instances: list[Instance]): + print("\033[2J\033[H", end="") + print(f"{'='*60}\nStatus at {time.strftime('%H:%M:%S')}\n{'='*60}") + + for i in instances: + elapsed = time.time() - i.start_time if i.start_time else 0 + extra = { + "waiting": " (waiting for node...)", + "syncing": f" ({fmt_time(elapsed)} elapsed)", + "synced": f" (synced in {fmt_time(i.sync_time)})", + "block_processing": f" (block {i.last_block}, +{i.last_block - i.initial_block} blocks, {fmt_time(BLOCK_PROCESSING_DURATION - (time.time() - i.block_check_start))} left)", + "success": f" ✓ synced in {fmt_time(i.sync_time)}, processed +{i.last_block - i.initial_block} blocks", + "failed": f" - {i.error}" + }.get(i.status, "") + print(f" {STATUS_EMOJI.get(i.status, '?')} {i.name} (:{i.port}): {i.status}{extra}") + + print(flush=True) + + +def update_instance(inst: Instance, timeout_min: int) -> bool: + if inst.status in ("success", "failed"): + return False + + now = time.time() + block = rpc_call(inst.rpc_url, "eth_blockNumber") + block = int(block, 16) if block else None + + if block is None: + if inst.status != "waiting": + inst.status, inst.error = "failed", "Node stopped responding" + return True + return False + + if inst.status == "waiting": + inst.status, inst.start_time = "syncing", inst.start_time or now + return True + + if inst.status == "syncing": + if (now - inst.start_time) > timeout_min * 60: + inst.status, inst.error = "failed", f"Sync timeout after {fmt_time(timeout_min * 60)}" + return True + if rpc_call(inst.rpc_url, "eth_syncing") is False: + inst.status, inst.sync_time = "synced", now - inst.start_time + inst.block_check_start, inst.last_block = now, block + inst.initial_block, inst.last_block_time = block, now + return True + + if inst.status == "synced": + inst.status = "block_processing" + inst.block_check_start, inst.last_block, inst.initial_block, inst.last_block_time = now, block, block, now + return True + + if inst.status == "block_processing": + # Check for stalled node (no new blocks for too long) + if (now - inst.last_block_time) > BLOCK_STALL_TIMEOUT: + inst.status, inst.error = "failed", f"Block processing stalled at {inst.last_block} for {fmt_time(BLOCK_STALL_TIMEOUT)}" + return True + # Update last block time if we see progress + if block and block > inst.last_block: + inst.last_block, inst.last_block_time = block, now + # Success after duration, but only if we made progress + if (now - inst.block_check_start) > BLOCK_PROCESSING_DURATION: + if inst.last_block > inst.initial_block: + inst.status = "success" + return True + else: + inst.status, inst.error = "failed", "No block progress during monitoring" + return True + + return False + + +def main(): + p = argparse.ArgumentParser(description="Monitor Docker snapsync instances") + p.add_argument("--networks", default="hoodi,sepolia,mainnet") + p.add_argument("--timeout", type=int, default=SYNC_TIMEOUT) + p.add_argument("--no-slack", action="store_true") 
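+    # Without --exit-on-success the monitor loops forever: after a successful run it
+    # tears the containers down (including volumes), restarts them, and begins a new cycle.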
+ p.add_argument("--exit-on-success", action="store_true") + p.add_argument("--compose-file", default="docker-compose.multisync.yaml", help="Docker compose file name") + p.add_argument("--compose-dir", default=".", help="Directory containing docker compose file") + args = p.parse_args() + + names = [n.strip() for n in args.networks.split(",")] + ports = [] + for n in names: + if n not in NETWORK_PORTS: + sys.exit(f"Error: unknown network '{n}', known networks: {list(NETWORK_PORTS.keys())}") + ports.append(NETWORK_PORTS[n]) + containers = [f"ethrex-{n}" for n in names] + + instances = [Instance(n, p, c) for n, p, c in zip(names, ports, containers)] + + # Detect state of already-running containers + for inst in instances: + if t := container_start_time(inst.container): + inst.start_time = t + # Check if already synced + syncing = rpc_call(inst.rpc_url, "eth_syncing") + if syncing is False: + # Already synced - go straight to block_processing + block = rpc_call(inst.rpc_url, "eth_blockNumber") + block = int(block, 16) if block else 0 + inst.status = "block_processing" + inst.sync_time = time.time() - t + inst.block_check_start = time.time() + inst.initial_block = block + inst.last_block = block + inst.last_block_time = time.time() + elif syncing is not None: + # Still syncing + inst.status = "syncing" + # else: node not responding yet, stay in "waiting" + + hostname = socket.gethostname() + branch = git_branch() + commit = git_commit() + run_count = 1 + run_id = generate_run_id() + + # Ensure logs directory exists + ensure_logs_dir() + print(f"📁 Logs will be saved to {LOGS_DIR.absolute()}") + print(f"📝 Run history: {RUN_LOG_FILE.absolute()}\n") + + try: + while True: + print(f"🔍 Run #{run_count} (ID: {run_id}): Monitoring {len(instances)} instances (timeout: {args.timeout}m)", flush=True) + last_print = 0 + while True: + changed = any(update_instance(i, args.timeout) for i in instances) + if changed or (time.time() - last_print) > STATUS_PRINT_INTERVAL: + print_status(instances) + last_print = time.time() + if all(i.status in ("success", "failed") for i in instances): + print_status(instances) + break + time.sleep(CHECK_INTERVAL) + # Log the run result and save container logs BEFORE any restart + save_all_logs(instances, run_id, args.compose_file) + log_run_result(run_id, run_count, instances, hostname, branch, commit) + # Send a single Slack summary notification for the run + if not args.no_slack: + slack_notify(run_id, run_count, instances, hostname, branch, commit) + # Check results + if all(i.status == "success" for i in instances): + print(f"🎉 Run #{run_count}: All instances synced successfully!") + if args.exit_on_success: + sys.exit(0) + # Restart for another run + if restart_containers(args.compose_file, args.compose_dir): + for inst in instances: + reset_instance(inst) + run_count += 1 + run_id = generate_run_id() # New run ID for the new cycle + time.sleep(30) # Wait for containers to start + else: + print("❌ Failed to restart containers", file=sys.stderr) + sys.exit(1) + else: + # On failure: containers are NOT stopped, you can inspect the DB + print("\n" + "="*60) + print("⚠️ FAILURE - Containers are still running for inspection") + print("="*60) + print("\nYou can:") + print(" - Inspect the database in the running containers") + print(" - Check logs: docker logs ") + print(f" - View saved logs: {LOGS_DIR}/run_{run_id}/") + print(f" - View run history: {RUN_LOG_FILE}") + print("\nTo restart manually: make multisync-restart") + sys.exit(1) + except KeyboardInterrupt: + print("\n⚠️ 
Interrupted") + print_status(instances) + sys.exit(130) + + +if __name__ == "__main__": + main()