Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
129 changes: 88 additions & 41 deletions lisa/microsoft/testsuites/cloud_hypervisor/ch_tests_tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -596,7 +596,12 @@ def _write_testcase_log(self, log_path: Path, testcase: str, trace: str) -> None
testcase_log_file = log_path.joinpath(f"{testcase}.log")
with open(testcase_log_file, "w") as f:
if hasattr(self, "_last_result") and self._last_result is not None:
f.write(self._last_result.stdout)
# FIX E: Write BOTH stdout and stderr to per-test logs
if self._last_result.stdout:
f.write(self._last_result.stdout)
if self._last_result.stderr:
f.write("\n=== STDERR ===\n")
f.write(self._last_result.stderr)
else:
f.write(f"Test failed before execution: {trace}")

Expand Down Expand Up @@ -804,7 +809,7 @@ def _run_with_enhanced_diagnostics(
# Tunables (pull from env if provided; else use sane defaults)
idle_secs = int(os.environ.get("CH_IDLE_SECS", "600"))
hang_kill_secs = int(os.environ.get("CH_HANG_KILL_SECS", "1800"))
check_interval = int(os.environ.get("CH_CHECK_INTERVAL", "30"))
check_interval = int(os.environ.get("CH_CHECK_INTERVAL", "10"))

# --- 1) Rich Rust diagnostics ---
enhanced_env_vars = self.env_vars.copy()
Expand Down Expand Up @@ -864,11 +869,13 @@ def _run_with_enhanced_diagnostics(
echo "[perf-stable] NUMA binding: $numa_prefix"
fi

# start tests, line-buffered if available, stream to log
# Capture BOTH stdout and stderr into the main log
# This makes watchdog log-growth detection reliable and captures all diagnostics
if command -v stdbuf >/dev/null; then
( stdbuf -oL -eL $numa_prefix scripts/dev_cli.sh {cmd_args} | tee "$log_file" ) &
( stdbuf -oL -eL $numa_prefix scripts/dev_cli.sh {cmd_args} 2>&1 \
| tee -a "$log_file" ) &
else
( $numa_prefix scripts/dev_cli.sh {cmd_args} | tee "$log_file" ) &
( $numa_prefix scripts/dev_cli.sh {cmd_args} 2>&1 | tee -a "$log_file" ) &
fi
pid=$!

Expand All @@ -895,39 +902,71 @@ def _run_with_enhanced_diagnostics(
echo "[watchdog] pstree / ps snapshot" | tee -a "$log_file"
pstree -ap 2>/dev/null | head -200 | tee -a "$log_file" || true
ps -eo pid,ppid,stat,etime,cmd | head -200 | tee -a "$log_file" || true
ps -eL -o pid,tid,ppid,stat,etime,comm,cmd | head -200 \\
| tee -a "$log_file" || true

# Find a good target: prefer the integration test binary; otherwise a child
# of the cargo/dev_cli process; otherwise fall back to the main pid.
tpid="$(pgrep -n -f 'target/.*/deps/integration-' || true)"
if [ -z "$tpid" ]; then
# newest child of $pid (often cargo test or the binary)
tpid="$(pgrep -P "$pid" | tail -n1 || true)"
fi
[ -z "$tpid" ] && tpid="$pid"

# Best-effort freeze to avoid the attach race
sudo kill -STOP "$tpid" 2>/dev/null || true

# Snapshot a core ASAP (prefer gcore; fall back to gdb generate-core-file)
core_out="core.$(basename "$tpid").$(date +%s)"
if command -v gcore >/dev/null 2>&1; then
sudo gcore -o "$core_out" "$tpid" >/dev/null 2>&1 || true
else
sudo gdb -batch -p "$tpid" \\
-ex "set pagination off" \\
-ex "generate-core-file $core_out" \\
-ex "detach" -ex "quit" >/dev/null 2>&1 || true
fi

# Then grab a concise live backtrace
sudo gdb -batch -p "$tpid" \\
-ex "set pagination off" \\
-ex "set print elements 0" \\
-ex "set backtrace limit 64" \\
-ex "thread apply all bt" \\
-ex "info threads" > "$live_bt_file" 2>&1 || true
ps -eL -o pid,tid,ppid,stat,etime,comm,cmd | head -200 \\
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please check the indentation.

| tee -a "$log_file" || true

# Find a good target: prefer the integration test binary; otherwise a child
# of the cargo/dev_cli process; otherwise fall back to the main pid.
tpid="$(pgrep -n -f 'target/.*/deps/integration-' || true)"
if [ -z "$tpid" ]; then
# newest child of $pid (often cargo test or the binary)
tpid="$(pgrep -P "$pid" | tail -n1 || true)"
fi
[ -z "$tpid" ] && tpid="$pid"

# Verify the target pid is still alive. If it raced away, fall back to $pid.
if ! kill -0 "$tpid" 2>/dev/null; then
echo "[watchdog] Selected pid $tpid is not alive;" \\
" falling back to pid $pid" | tee -a "$log_file"
tpid="$pid"
fi

# Best-effort freeze to avoid the attach race
sudo kill -STOP "$tpid" 2>/dev/null || true

# Use consistent core filename pattern that matches search pattern
core_out="core.integration-$(date +%s)"
echo "[watchdog] Generating core: $core_out" | tee -a "$log_file"
if command -v gcore >/dev/null 2>&1; then
sudo gcore -o "$core_out" "$tpid" 2>&1 | tee -a "$log_file" || true
else
sudo gdb -batch -p "$tpid" \\
-ex "set pagination off" \\
-ex "generate-core-file $core_out" \\
-ex "detach" -ex "quit" 2>&1 | tee -a "$log_file" || true
fi

# Write live backtrace to BOTH main log and side file
echo "[watchdog] Attaching gdb to pid $tpid for live backtrace" \\
| tee -a "$log_file"
{{
echo "[watchdog] gdb attach target pid=$tpid parent_pid=$pid";
# Keep this robust: avoid complex quoting/command-substitution.
echo "[watchdog] comm(target)=$(cat /proc/$tpid/comm 2>/dev/null ||" \\
" echo n/a)";
echo "[watchdog] comm(parent)=$(cat /proc/$pid/comm 2>/dev/null ||" \\
" echo n/a)";
}} 2>/dev/null | tee -a "$log_file" || true
sudo gdb -batch -p "$tpid" \\
-ex "set pagination off" \\
-ex "set print elements 0" \\
-ex "set backtrace limit 64" \\
-ex "thread apply all bt" \\
-ex "info threads" \\
2>&1 | tee -a "$log_file" \
> "$live_bt_file" || true
# If attach failed (e.g. tpid exited/raced), retry once against the main pid
if grep -q "No such process" "$live_bt_file" 2>/dev/null; then
echo "[watchdog] gdb attach on pid $tpid failed;" \\
" retrying against pid $pid" | tee -a "$log_file"
sudo gdb -batch -p "$pid" \\
-ex "set pagination off" \\
-ex "set print elements 0" \\
-ex "set backtrace limit 64" \\
-ex "thread apply all bt" \\
-ex "info threads" \\
2>&1 | tee -a "$log_file" > "$live_bt_file" || true
fi

# Let it run again
sudo kill -CONT "$tpid" 2>/dev/null || true
Expand Down Expand Up @@ -969,23 +1008,31 @@ def _run_with_enhanced_diagnostics(
# on failure, try to symbolize a core dump
if [ $ec -ne 0 ]; then
core=""
# Use consistent core filename pattern that matches search pattern
for dir in . .. /var/crash /cores /var/lib/systemd/coredump /tmp; do
c=$(ls -t "$dir"/core.integration-* 2>/dev/null | head -1)
c=$(ls -t "$dir"/core.integration-* "$dir"/core.* 2>/dev/null \\
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please check the indentation.

| head -1)
[ -n "$c" ] && core="$c" && break || true
done
# Loosen binary discovery - search more locations
bin=$(ls -t target/*/deps/integration-* 2>/dev/null | head -1 || true)
# If test runs under workspace path, widen further:
shopt -s globstar nullglob
[ -z "$bin" ] && bin=$(ls -t **/target/*/deps/integration-* 2>/dev/null \\
| head -1 || true)
# Fall back to cloud-hypervisor binary if integration test binary not found
[ -z "$bin" ] && bin="$(command -v cloud-hypervisor || true)"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please check the indentation.


if [ -n "$core" ] && [ -n "$bin" ]; then
echo "[diagnostics] Found core: $core, binary: $bin" | tee -a "$log_file"
echo "[diagnostics] Symbolizing core dump..." | tee -a "$log_file"
sudo gdb -batch -q "$bin" "$core" \\
-ex "set pagination off" \\
-ex "thread apply all bt full" \\
-ex "info threads" > "$core_bt_file" 2>&1 || true
-ex "info threads" 2>&1 | tee -a "$log_file" > "$core_bt_file" || true
else
echo "[diagnostics] No core/bin found for symbolization" | tee -a "$log_file"
echo "[diagnostics] No core/bin found for symbolization (core=$core, bin=$bin)" \\
| tee -a "$log_file"
fi
fi

Expand Down
Loading