Skip to content

Commit dd874ee

Browse files
dennys246 and claude committed
feat(admin): GET /v1/debug/vram endpoint + n_ctx probe lift + auth gate fix
Adds a /v1/debug/vram admin endpoint that exposes the leader's VRAM state as JSON (live nvidia-smi usage ratio plus the projected model footprint from project_vram_usage). The endpoint returns 503 when nvidia-smi is unavailable. Auth is via bearer or localhost, matching the existing debug endpoint pattern. Lifts _current_llama_server_n_ctx from doctor/checks.py to leader_proxy.py (its canonical location, alongside _query_nvidia_smi); the doctor now delegates to it through a thin import wrapper. Folded in from pre-merge review: fixes a pre-existing desync between _is_debug_path and _route_debug (the deps and install-status paths were missing from _is_debug_path and therefore bypassed the auth gate), and adds a DEBUG log for projection failures. A regression test ensures that every routed debug path stays listed in _is_debug_path. 11 new tests; 4951 passed in the full suite (the pre-existing test_valence_annotation failure is excluded -- parallel substrate work). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent e1d6f45 commit dd874ee

3 files changed

Lines changed: 386 additions & 50 deletions

File tree

src/maxim/doctor/checks.py

Lines changed: 6 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -535,58 +535,14 @@ def check_context_window(port: int = 8100) -> CheckResult:
535535
def _current_llama_server_n_ctx(port: int) -> int | None:
536536
"""Return the n_ctx the running llama-cpp-server is configured for, or None.
537537
538-
Reuses the detection strategy from :func:`check_context_window`: first
539-
``/v1/models`` metadata, then process command-line inspection. Kept as a
540-
separate helper so :func:`check_vram_pressure` doesn't re-run the network
541-
probe redundantly if the context window check already ran.
538+
Delegates to the canonical implementation in ``leader_proxy`` -- both
539+
this module and ``_handle_debug_vram`` share the same probe logic.
542540
"""
543-
import socket
544-
545-
try:
546-
req_bytes = (f"GET /v1/models HTTP/1.0\r\nHost: 127.0.0.1:{port}\r\n\r\n").encode()
547-
with socket.create_connection(("127.0.0.1", port), timeout=2.0) as s:
548-
s.sendall(req_bytes)
549-
raw = b""
550-
while True:
551-
chunk = s.recv(4096)
552-
if not chunk:
553-
break
554-
raw += chunk
555-
body = raw.split(b"\r\n\r\n", 1)[-1]
556-
import json as _json
557-
558-
data = _json.loads(body)
559-
for model in data.get("data", []):
560-
ctx = model.get("context_length") or model.get("n_ctx") or model.get("max_context_length")
561-
if ctx:
562-
return int(ctx)
563-
except Exception:
564-
pass
541+
from maxim.runtime.leader_proxy import (
542+
_current_llama_server_n_ctx as _probe_n_ctx,
543+
)
565544

566-
# Process args fallback — best-effort, cross-platform
567-
try:
568-
import platform as _platform
569-
570-
system = _platform.system().lower()
571-
if system == "linux":
572-
import glob
573-
574-
for cmdline_path in glob.glob("/proc/*/cmdline"):
575-
try:
576-
with open(cmdline_path, "rb") as f:
577-
args = f.read().split(b"\x00")
578-
args_str = [a.decode("utf-8", errors="ignore") for a in args]
579-
if any("llama" in a for a in args_str):
580-
for i, arg in enumerate(args_str):
581-
if arg in ("--ctx-size", "-c", "--n-ctx") and i + 1 < len(args_str):
582-
return int(args_str[i + 1])
583-
if arg.startswith("--ctx-size="):
584-
return int(arg.split("=", 1)[1])
585-
except Exception:
586-
continue
587-
except Exception:
588-
pass
589-
return None
545+
return _probe_n_ctx(port)
590546

591547

592548
def check_vram_pressure(port: int = 8100) -> CheckResult:

src/maxim/runtime/leader_proxy.py

Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,61 @@ def _query_nvidia_smi() -> dict[str, Any] | None:
137137
return None
138138

139139

140+
def _current_llama_server_n_ctx(port: int) -> int | None:
141+
"""Return the n_ctx the running llama-cpp-server is configured for, or None.
142+
143+
Probes ``/v1/models`` on the upstream llama-cpp-server for
144+
``context_length``, falling back to process command-line inspection on
145+
Linux. Shared by :func:`check_vram_pressure` in ``doctor/checks.py``
146+
and :func:`_handle_debug_vram` in this module.
147+
"""
148+
import socket
149+
150+
try:
151+
req_bytes = (f"GET /v1/models HTTP/1.0\r\nHost: 127.0.0.1:{port}\r\n\r\n").encode()
152+
with socket.create_connection(("127.0.0.1", port), timeout=2.0) as s:
153+
s.sendall(req_bytes)
154+
raw = b""
155+
while True:
156+
chunk = s.recv(4096)
157+
if not chunk:
158+
break
159+
raw += chunk
160+
body = raw.split(b"\r\n\r\n", 1)[-1]
161+
data = json.loads(body)
162+
for model in data.get("data", []):
163+
ctx = model.get("context_length") or model.get("n_ctx") or model.get("max_context_length")
164+
if ctx:
165+
return int(ctx)
166+
except Exception:
167+
pass
168+
169+
# Process args fallback — best-effort, cross-platform
170+
try:
171+
import platform as _platform
172+
173+
system = _platform.system().lower()
174+
if system == "linux":
175+
import glob
176+
177+
for cmdline_path in glob.glob("/proc/*/cmdline"):
178+
try:
179+
with open(cmdline_path, "rb") as f:
180+
args = f.read().split(b"\x00")
181+
args_str = [a.decode("utf-8", errors="ignore") for a in args]
182+
if any("llama" in a for a in args_str):
183+
for i, arg in enumerate(args_str):
184+
if arg in ("--ctx-size", "-c", "--n-ctx") and i + 1 < len(args_str):
185+
return int(args_str[i + 1])
186+
if arg.startswith("--ctx-size="):
187+
return int(arg.split("=", 1)[1])
188+
except Exception:
189+
continue
190+
except Exception:
191+
pass
192+
return None
193+
194+
140195
# ─── log ring buffer ─────────────────────────────────────────────────────
141196

142197
_MAX_LOG_LINES = 500
@@ -498,6 +553,103 @@ def _handle_debug_last_requests(self) -> None:
498553
return
499554
self._send_json(200, self.request_log.snapshot())
500555

556+
def _handle_debug_vram(self) -> None:
    """Serve ``GET /v1/debug/vram``: live GPU memory figures plus a projection.

    The response body has four top-level keys:

    * ``live`` -- numbers read from :func:`_query_nvidia_smi`, the
      used/total ratio, and whether that ratio exceeds the spillover and
      warning thresholds.
    * ``projection`` -- the fields of the object returned by
      :func:`project_vram_usage` (``runtime/lane_models.py``) for the
      active model at the running ``n_ctx``; ``None`` when no model is
      active, the running ``n_ctx`` cannot be determined, the helper
      returns ``None``, or anything in that path raises.
    * ``thresholds`` -- the two ratio constants imported from
      ``lane_models``.
    * ``timestamp`` -- ``time.time()`` at response time.

    Responds 503 when :func:`_query_nvidia_smi` returns ``None``.
    """
    smi = _query_nvidia_smi()
    if smi is None:
        unavailable = {
            "error": "nvidia-smi unavailable",
            "fix": "This endpoint requires an NVIDIA GPU with nvidia-smi installed.",
        }
        self._send_json(503, unavailable)
        return

    # Thresholds and the projection helper all come from lane_models so the
    # endpoint reports against the same numbers the rest of the runtime uses.
    from maxim.runtime.lane_models import (
        _SPILLOVER_RATIO,
        _SPILLOVER_WARN_RATIO,
        project_vram_usage,
    )

    # Missing / falsy readings collapse to 0.0 rather than raising.
    used_gb = float(smi.get("vram_used_gb") or 0.0)
    total_gb = float(smi.get("vram_total_gb") or 0.0)
    if total_gb > 0:
        ratio = round(used_gb / total_gb, 4)
    else:
        ratio = 0.0  # avoid dividing by zero when the total is unknown

    live = {
        "vram_used_gb": used_gb,
        "vram_total_gb": total_gb,
        "vram_utilization_pct": smi.get("utilization_pct"),
        "temperature_c": smi.get("temperature_c"),
        "ratio": ratio,
        "spillover": ratio > _SPILLOVER_RATIO,
        "warning": ratio > _SPILLOVER_WARN_RATIO,
    }

    def _project() -> dict[str, Any] | None:
        """Build the projection dict, or return None when it does not apply."""
        import maxim.runtime.llm_server as _srv

        profile_name = _srv._active_model
        if not profile_name:
            return None

        # The port is taken from upstream_url; any parsing problem falls
        # back to the default upstream port.
        try:
            from urllib.parse import urlparse

            port = urlparse(self.upstream_url).port or DEFAULT_UPSTREAM_PORT
        except Exception:
            port = DEFAULT_UPSTREAM_PORT

        n_ctx = _current_llama_server_n_ctx(port)
        if not (n_ctx and n_ctx > 0):
            return None

        from maxim.models.language.config import _BUILTIN_PROFILES

        meta = _BUILTIN_PROFILES.get(profile_name, {})
        estimate = project_vram_usage(profile_name, meta, n_ctx, total_gb)
        if estimate is None:
            return None

        return {
            "profile": estimate.profile,
            "n_ctx": estimate.n_ctx,
            "weights_gb": estimate.weights_gb,
            "kv_cache_gb": round(estimate.kv_cache_gb, 3),
            "headroom_gb": round(estimate.headroom_gb, 3),
            "projected_total_gb": round(estimate.projected_total_gb, 3),
            "physical_vram_gb": estimate.physical_vram_gb,
            "spillover_risk": estimate.spillover_risk,
            "recommended_n_ctx": estimate.recommended_n_ctx,
        }

    # The projection is optional: a failure is logged and reported as null
    # instead of failing the whole request.
    try:
        projection = _project()
    except Exception:
        logger.debug("vram endpoint: projection unavailable", exc_info=True)
        projection = None

    payload = {
        "live": live,
        "projection": projection,
        "thresholds": {
            "spillover_ratio": _SPILLOVER_RATIO,
            "warn_ratio": _SPILLOVER_WARN_RATIO,
        },
        "timestamp": time.time(),
    }
    self._send_json(200, payload)
652+
501653
def _is_debug_path(self, path: str) -> bool:
502654
stripped = path.rstrip("/").split("?")[0]
503655
return stripped in (
@@ -508,6 +660,9 @@ def _is_debug_path(self, path: str) -> bool:
508660
"/v1/debug/version",
509661
"/v1/debug/logs",
510662
"/v1/debug/last-requests",
663+
"/v1/debug/vram",
664+
"/v1/debug/deps",
665+
"/v1/debug/install-status",
511666
)
512667

513668
def _route_debug(self, path: str) -> None:
@@ -526,6 +681,8 @@ def _route_debug(self, path: str) -> None:
526681
self._handle_debug_logs()
527682
elif stripped == "/v1/debug/last-requests":
528683
self._handle_debug_last_requests()
684+
elif stripped == "/v1/debug/vram":
685+
self._handle_debug_vram()
529686
elif stripped == "/v1/debug/deps":
530687
self._handle_debug_deps()
531688
elif stripped == "/v1/debug/install-status":

0 commit comments

Comments
 (0)