@@ -137,6 +137,61 @@ def _query_nvidia_smi() -> dict[str, Any] | None:
137137 return None
138138
139139
140+ def _current_llama_server_n_ctx (port : int ) -> int | None :
141+ """Return the n_ctx the running llama-cpp-server is configured for, or None.
142+
143+ Probes ``/v1/models`` on the upstream llama-cpp-server for
144+ ``context_length``, falling back to process command-line inspection on
145+ Linux. Shared by :func:`check_vram_pressure` in ``doctor/checks.py``
146+ and :func:`_handle_debug_vram` in this module.
147+ """
148+ import socket
149+
150+ try :
151+ req_bytes = (f"GET /v1/models HTTP/1.0\r \n Host: 127.0.0.1:{ port } \r \n \r \n " ).encode ()
152+ with socket .create_connection (("127.0.0.1" , port ), timeout = 2.0 ) as s :
153+ s .sendall (req_bytes )
154+ raw = b""
155+ while True :
156+ chunk = s .recv (4096 )
157+ if not chunk :
158+ break
159+ raw += chunk
160+ body = raw .split (b"\r \n \r \n " , 1 )[- 1 ]
161+ data = json .loads (body )
162+ for model in data .get ("data" , []):
163+ ctx = model .get ("context_length" ) or model .get ("n_ctx" ) or model .get ("max_context_length" )
164+ if ctx :
165+ return int (ctx )
166+ except Exception :
167+ pass
168+
169+ # Process args fallback — best-effort, cross-platform
170+ try :
171+ import platform as _platform
172+
173+ system = _platform .system ().lower ()
174+ if system == "linux" :
175+ import glob
176+
177+ for cmdline_path in glob .glob ("/proc/*/cmdline" ):
178+ try :
179+ with open (cmdline_path , "rb" ) as f :
180+ args = f .read ().split (b"\x00 " )
181+ args_str = [a .decode ("utf-8" , errors = "ignore" ) for a in args ]
182+ if any ("llama" in a for a in args_str ):
183+ for i , arg in enumerate (args_str ):
184+ if arg in ("--ctx-size" , "-c" , "--n-ctx" ) and i + 1 < len (args_str ):
185+ return int (args_str [i + 1 ])
186+ if arg .startswith ("--ctx-size=" ):
187+ return int (arg .split ("=" , 1 )[1 ])
188+ except Exception :
189+ continue
190+ except Exception :
191+ pass
192+ return None
193+
194+
140195# ─── log ring buffer ─────────────────────────────────────────────────────
141196
# NOTE(review): presumably the maximum number of lines the log ring buffer
# retains — confirm against the buffer implementation that reads this constant.
_MAX_LOG_LINES = 500
@@ -498,6 +553,103 @@ def _handle_debug_last_requests(self) -> None:
498553 return
499554 self ._send_json (200 , self .request_log .snapshot ())
500555
def _handle_debug_vram(self) -> None:
    """Serve ``GET /v1/debug/vram``: live GPU memory figures plus a projection.

    Responds with 503 and an ``error``/``fix`` payload when
    ``_query_nvidia_smi()`` returns ``None``. Otherwise responds with 200
    and a JSON object containing:

    * ``live`` — used/total VRAM, utilization and temperature as reported
      by ``_query_nvidia_smi()``, the used/total ratio (rounded to four
      decimals, ``0.0`` when the total is not positive), and two booleans
      comparing that ratio against the spillover thresholds.
    * ``projection`` — selected fields of the object returned by
      ``project_vram_usage`` for the active model and the context size
      reported by ``_current_llama_server_n_ctx``. ``None`` when there is
      no active model, no positive context size, no projection, or when
      building it raised (the exception is logged at debug level).
    * ``thresholds`` — the two ratios imported from
      ``maxim.runtime.lane_models``.
    * ``timestamp`` — ``time.time()`` at response time.
    """
    smi = _query_nvidia_smi()
    if smi is None:
        unavailable = {
            "error": "nvidia-smi unavailable",
            "fix": "This endpoint requires an NVIDIA GPU with nvidia-smi installed.",
        }
        self._send_json(503, unavailable)
        return

    # Thresholds and the projection helper are imported from lane_models
    # rather than being redefined here.
    from maxim.runtime.lane_models import (
        _SPILLOVER_RATIO,
        _SPILLOVER_WARN_RATIO,
        project_vram_usage,
    )

    # Missing or falsy readings are treated as 0.0.
    used_gb = float(smi.get("vram_used_gb") or 0.0)
    total_gb = float(smi.get("vram_total_gb") or 0.0)
    if total_gb > 0:
        fill_ratio = round(used_gb / total_gb, 4)
    else:
        # Avoid dividing by zero when the total is unknown.
        fill_ratio = 0.0

    live_metrics = {
        "vram_used_gb": used_gb,
        "vram_total_gb": total_gb,
        "vram_utilization_pct": smi.get("utilization_pct"),
        "temperature_c": smi.get("temperature_c"),
        "ratio": fill_ratio,
        "spillover": fill_ratio > _SPILLOVER_RATIO,
        "warning": fill_ratio > _SPILLOVER_WARN_RATIO,
    }

    # The projection is optional: any failure below leaves it as None and
    # the endpoint still reports the live metrics.
    projected = None
    try:
        import maxim.runtime.llm_server as _srv

        profile_name = _srv._active_model
        if profile_name:
            # Take the port from upstream_url, falling back to the default
            # when the URL has no port or cannot be parsed.
            try:
                from urllib.parse import urlparse

                port = urlparse(self.upstream_url).port or DEFAULT_UPSTREAM_PORT
            except Exception:
                port = DEFAULT_UPSTREAM_PORT

            n_ctx = _current_llama_server_n_ctx(port)
            if n_ctx and n_ctx > 0:
                from maxim.models.language.config import _BUILTIN_PROFILES

                estimate = project_vram_usage(
                    profile_name,
                    _BUILTIN_PROFILES.get(profile_name, {}),
                    n_ctx,
                    total_gb,
                )
                if estimate is not None:
                    projected = {
                        "profile": estimate.profile,
                        "n_ctx": estimate.n_ctx,
                        "weights_gb": estimate.weights_gb,
                        "kv_cache_gb": round(estimate.kv_cache_gb, 3),
                        "headroom_gb": round(estimate.headroom_gb, 3),
                        "projected_total_gb": round(estimate.projected_total_gb, 3),
                        "physical_vram_gb": estimate.physical_vram_gb,
                        "spillover_risk": estimate.spillover_risk,
                        "recommended_n_ctx": estimate.recommended_n_ctx,
                    }
    except Exception:
        logger.debug("vram endpoint: projection unavailable", exc_info=True)

    thresholds = {
        "spillover_ratio": _SPILLOVER_RATIO,
        "warn_ratio": _SPILLOVER_WARN_RATIO,
    }
    self._send_json(
        200,
        {
            "live": live_metrics,
            "projection": projected,
            "thresholds": thresholds,
            "timestamp": time.time(),
        },
    )
652+
501653 def _is_debug_path (self , path : str ) -> bool :
502654 stripped = path .rstrip ("/" ).split ("?" )[0 ]
503655 return stripped in (
@@ -508,6 +660,9 @@ def _is_debug_path(self, path: str) -> bool:
508660 "/v1/debug/version" ,
509661 "/v1/debug/logs" ,
510662 "/v1/debug/last-requests" ,
663+ "/v1/debug/vram" ,
664+ "/v1/debug/deps" ,
665+ "/v1/debug/install-status" ,
511666 )
512667
513668 def _route_debug (self , path : str ) -> None :
@@ -526,6 +681,8 @@ def _route_debug(self, path: str) -> None:
526681 self ._handle_debug_logs ()
527682 elif stripped == "/v1/debug/last-requests" :
528683 self ._handle_debug_last_requests ()
684+ elif stripped == "/v1/debug/vram" :
685+ self ._handle_debug_vram ()
529686 elif stripped == "/v1/debug/deps" :
530687 self ._handle_debug_deps ()
531688 elif stripped == "/v1/debug/install-status" :
0 commit comments