diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 335750a..d580d0f 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -184,11 +184,18 @@ jobs: mv "$file" "${file%.deb}-ubuntu${OS_VERSION}.deb" done - - name: Build package (wheel + sdist) + - name: Build GCM package (wheel + sdist) run: | source ~/.cache/venv-ci/bin/activate python -m build + - name: Build Sentinel package (wheel + sdist) + run: | + source ~/.cache/venv-ci/bin/activate + cd gcm-sentinel + python -m build + cp dist/* ../dist/ + - name: Create GitHub Release and upload assets if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' && github.repository == 'facebookresearch/gcm' && env.RELEASE_EXISTS == 'false' }} uses: softprops/action-gh-release@v2 @@ -269,12 +276,18 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - - name: Package and push Helm chart + - name: Package and push GCM Helm chart run: | sed -i "s/^version:.*/version: ${VERSION}/" charts/gcm/Chart.yaml helm package charts/gcm helm push gcm-${VERSION}.tgz oci://ghcr.io/facebookresearch/charts + - name: Package and push Sentinel Helm chart + run: | + sed -i "s/^version:.*/version: ${VERSION}/" charts/gcm-sentinel/Chart.yaml + helm package charts/gcm-sentinel + helm push gcm-sentinel-${VERSION}.tgz oci://ghcr.io/facebookresearch/charts + docker-publish: needs: [check-release] runs-on: ubuntu-latest @@ -288,6 +301,7 @@ jobs: REGISTRY: ghcr.io GCM_IMAGE: ghcr.io/facebookresearch/gcm NPD_IMAGE: ghcr.io/facebookresearch/gcm-npd + SENTINEL_IMAGE: ghcr.io/facebookresearch/gcm-sentinel steps: - uses: actions/checkout@v4 @@ -331,3 +345,16 @@ jobs: docker build -f docker/Dockerfile.npd -t ${{ env.NPD_IMAGE }}:${{ env.VERSION }} -t ${{ env.NPD_IMAGE }}:latest . docker push ${{ env.NPD_IMAGE }}:${{ env.VERSION }} docker push ${{ env.NPD_IMAGE }}:latest + + - name: Build and push Sentinel image + uses: docker/build-push-action@v6 + with: + context: ./gcm-sentinel + file: gcm-sentinel/Dockerfile + platforms: linux/amd64 + push: true + tags: | + ${{ env.SENTINEL_IMAGE }}:${{ env.VERSION }} + ${{ env.SENTINEL_IMAGE }}:latest + cache-from: type=gha,scope=sentinel + cache-to: type=gha,mode=max,scope=sentinel diff --git a/charts/gcm-sentinel/Chart.yaml b/charts/gcm-sentinel/Chart.yaml new file mode 100644 index 0000000..0de074e --- /dev/null +++ b/charts/gcm-sentinel/Chart.yaml @@ -0,0 +1,6 @@ +apiVersion: v2 +name: gcm-sentinel +description: AI-powered GPU cluster sentinel agent +type: application +version: 0.0.0 +appVersion: "0.0.0" diff --git a/charts/gcm-sentinel/templates/_helpers.tpl b/charts/gcm-sentinel/templates/_helpers.tpl new file mode 100644 index 0000000..3eb5b85 --- /dev/null +++ b/charts/gcm-sentinel/templates/_helpers.tpl @@ -0,0 +1,9 @@ +{{/* +Validate sentinel.actionMode — fail fast on invalid values. +*/}} +{{- define "gcm-sentinel.validateActionMode" -}} +{{- $valid := list "recommend" "annotate" "execute" -}} +{{- if not (has .Values.sentinel.actionMode $valid) -}} +{{- fail (printf "Invalid sentinel.actionMode: %q. Must be one of: %s" .Values.sentinel.actionMode (join ", " $valid)) -}} +{{- end -}} +{{- end -}} diff --git a/charts/gcm-sentinel/templates/clusterrole.yaml b/charts/gcm-sentinel/templates/clusterrole.yaml new file mode 100644 index 0000000..f0e7c7b --- /dev/null +++ b/charts/gcm-sentinel/templates/clusterrole.yaml @@ -0,0 +1,44 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ .Release.Name }}-gcm-sentinel + labels: + app: gcm-sentinel + app.kubernetes.io/name: gcm-sentinel + app.kubernetes.io/instance: {{ .Release.Name }} +rules: + - apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", "watch"] + - apiGroups: [""] + resources: ["nodes/status"] + verbs: ["get"] + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list"] + - apiGroups: [""] + resources: ["pods/log"] + verbs: ["get"] + - apiGroups: ["batch"] + resources: ["jobs"] + verbs: ["get", "list"] + - apiGroups: ["apps"] + resources: ["statefulsets", "replicasets"] + verbs: ["get", "list"] + - apiGroups: [""] + resources: ["events"] + verbs: ["get", "list", "create", "patch", "update"] + + {{- if or (eq .Values.sentinel.actionMode "annotate") (eq .Values.sentinel.actionMode "execute") }} + # Annotate/execute: write annotations on nodes. + - apiGroups: [""] + resources: ["nodes"] + verbs: ["patch"] + {{- end }} + + {{- if eq .Values.sentinel.actionMode "execute" }} + # Execute: cordon (patch nodes) + drain (evict pods). + - apiGroups: [""] + resources: ["pods/eviction"] + verbs: ["create"] + {{- end }} diff --git a/charts/gcm-sentinel/templates/clusterrolebinding.yaml b/charts/gcm-sentinel/templates/clusterrolebinding.yaml new file mode 100644 index 0000000..c272428 --- /dev/null +++ b/charts/gcm-sentinel/templates/clusterrolebinding.yaml @@ -0,0 +1,16 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ .Release.Name }}-gcm-sentinel + labels: + app: gcm-sentinel + app.kubernetes.io/name: gcm-sentinel + app.kubernetes.io/instance: {{ .Release.Name }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ .Release.Name }}-gcm-sentinel +subjects: + - kind: ServiceAccount + name: {{ .Release.Name }}-gcm-sentinel + namespace: {{ .Release.Namespace }} diff --git a/charts/gcm-sentinel/templates/configmap.yaml b/charts/gcm-sentinel/templates/configmap.yaml new file mode 100644 index 0000000..19155fd --- /dev/null +++ b/charts/gcm-sentinel/templates/configmap.yaml @@ -0,0 +1,26 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ .Release.Name }}-gcm-sentinel-prompt + namespace: {{ .Release.Namespace }} + labels: + app: gcm-sentinel + app.kubernetes.io/name: gcm-sentinel + app.kubernetes.io/instance: {{ .Release.Name }} +data: + system-prompt.txt: | + You are an expert GPU cluster reliability engineer. A Kubernetes node has reported + a GPU-related problem. + + Investigate thoroughly using ALL available tools — explore every data source you + have access to before making your assessment. The tools available to you are your + eyes and hands into the cluster. Use them extensively. + + For XID error code reference, see: https://docs.nvidia.com/deploy/xid-errors/ + + ## Assessment + + Provide your final assessment as JSON: + ```json + {"severity": "critical|warning|info|ok", "summary": "...", "root_cause": "...", "recommended_action": "cordon|drain|taint|reboot|none", "confidence": 0.0-1.0} + ``` diff --git a/charts/gcm-sentinel/templates/deployment.yaml b/charts/gcm-sentinel/templates/deployment.yaml new file mode 100644 index 0000000..a66062b --- /dev/null +++ b/charts/gcm-sentinel/templates/deployment.yaml @@ -0,0 +1,89 @@ +{{- include "gcm-sentinel.validateActionMode" . -}} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ .Release.Name }}-gcm-sentinel + namespace: {{ .Release.Namespace }} + labels: + app: gcm-sentinel + app.kubernetes.io/name: gcm-sentinel + app.kubernetes.io/instance: {{ .Release.Name }} +spec: + replicas: {{ .Values.replicaCount }} + selector: + matchLabels: + app: gcm-sentinel + app.kubernetes.io/instance: {{ .Release.Name }} + template: + metadata: + labels: + app: gcm-sentinel + app.kubernetes.io/instance: {{ .Release.Name }} + annotations: + checksum/prompt: {{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum }} + spec: + serviceAccountName: {{ .Release.Name }}-gcm-sentinel + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + containers: + - name: gcm-sentinel + image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + args: + - serve + - --log-level + - "{{ .Values.logLevel }}" + env: + - name: GCM_SENTINEL_LLM_API + value: "{{ .Values.llm.api }}" + - name: GCM_SENTINEL_API_KEY + {{- if .Values.llm.existingSecret }} + valueFrom: + secretKeyRef: + name: {{ .Values.llm.existingSecret }} + key: {{ .Values.llm.secretKey | default "api-key" }} + {{- else }} + value: "{{ .Values.llm.apiKey }}" + {{- end }} + - name: GCM_SENTINEL_MODEL + value: "{{ .Values.llm.model }}" + - name: GCM_SENTINEL_PROMETHEUS_URL + value: "{{ .Values.prometheus.url }}" + - name: GCM_SENTINEL_ACTION_MODE + value: "{{ .Values.sentinel.actionMode }}" + - name: GCM_SENTINEL_COOLDOWN_SECONDS + value: "{{ .Values.sentinel.cooldownSeconds }}" + {{- if .Values.sentinel.nodeAllowlist }} + - name: GCM_SENTINEL_NODE_ALLOWLIST + value: "{{ .Values.sentinel.nodeAllowlist }}" + {{- end }} + - name: GCM_SENTINEL_MAX_ACTIONS_PER_HOUR + value: "{{ .Values.sentinel.maxActionsPerHour }}" + - name: GCM_SENTINEL_MAX_TOOL_ROUNDS + value: "{{ .Values.sentinel.maxToolRounds }}" + - name: GCM_SENTINEL_MAX_TOOL_OUTPUT + value: "{{ .Values.sentinel.maxToolOutput }}" + - name: GCM_SENTINEL_SYSTEM_PROMPT_PATH + value: "/etc/gcm-sentinel/system-prompt.txt" + {{- if .Values.sentinel.webhookUrl }} + - name: GCM_SENTINEL_WEBHOOK_URL + value: "{{ .Values.sentinel.webhookUrl }}" + {{- end }} + - name: GCM_SENTINEL_WATCH_CONDITIONS + value: '{{ join "," .Values.sentinel.watchConditions }}' + resources: + {{- toYaml .Values.resources | nindent 12 }} + volumeMounts: + - name: system-prompt + mountPath: /etc/gcm-sentinel + readOnly: true + volumes: + - name: system-prompt + configMap: + name: {{ .Release.Name }}-gcm-sentinel-prompt diff --git a/charts/gcm-sentinel/templates/serviceaccount.yaml b/charts/gcm-sentinel/templates/serviceaccount.yaml new file mode 100644 index 0000000..e813ae1 --- /dev/null +++ b/charts/gcm-sentinel/templates/serviceaccount.yaml @@ -0,0 +1,9 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ .Release.Name }}-gcm-sentinel + namespace: {{ .Release.Namespace }} + labels: + app: gcm-sentinel + app.kubernetes.io/name: gcm-sentinel + app.kubernetes.io/instance: {{ .Release.Name }} diff --git a/charts/gcm-sentinel/values.yaml b/charts/gcm-sentinel/values.yaml new file mode 100644 index 0000000..4a2857d --- /dev/null +++ b/charts/gcm-sentinel/values.yaml @@ -0,0 +1,124 @@ +replicaCount: 1 + +image: + repository: ghcr.io/facebookresearch/gcm-sentinel + tag: latest + pullPolicy: IfNotPresent + +# LLM configuration. +llm: + # Provider: "anthropic" (Claude) or "openai" (GPT, vLLM, Ollama, Azure OpenAI). + api: "anthropic" + # API key — provide ONE of: + # apiKey: "..." (plaintext, for testing only) + # existingSecret: "my-secret" + secretKey: "api-key" (recommended for production) + apiKey: "" + existingSecret: "" + secretKey: "api-key" + # Model name/ID. + model: "claude-sonnet-4-6-20250725" + +# Prometheus endpoint in the cluster. +prometheus: + url: "http://kube-prometheus-stack-prometheus.monitoring:9090" + +# ============================================================================ +# SAFETY CONFIGURATION +# +# By default, gcm-sentinel is OBSERVE-ONLY. It investigates GPU failures and +# reports findings but NEVER modifies cluster state (no cordon, no drain, +# no taint). This is enforced at three levels: +# +# 1. actionMode="recommend" — remediation tools are not even in the LLM's +# tool schema, so it cannot attempt to call them. +# 2. RBAC — the ClusterRole only gets node patch/eviction verbs when +# actionMode is "annotate" or "execute". In recommend mode, RBAC is +# strictly read-only. +# 3. Rate limits — even in execute mode, actions are gated by confidence +# threshold, cooldown, and per-hour rate limits. +# +# To enable automated remediation, you must EXPLICITLY set: +# --set sentinel.actionMode=execute +# +# We strongly recommend running in "recommend" mode for at least 2 weeks +# to validate the agent's assessments before enabling any mutations. +# ============================================================================ + +sentinel: + # ACTION MODE — controls whether the agent can modify cluster state. + # + # "recommend" (DEFAULT, SAFE) — observe-only. Investigate, log, emit + # K8s Events, send webhooks. Zero cluster mutations. + # RBAC is read-only. The agent cannot cordon/drain/taint. + # + # "annotate" — everything above + write gcm-sentinel/* annotations on + # the node. A separate controller or human acts on them. + # RBAC adds node patch. The agent still cannot cordon/drain. + # + # "execute" — everything above + The agent can cordon/drain/taint nodes. + # ONLY if max_actions_per_hour not exceeded. RBAC adds evictions. + # USE WITH CAUTION. + actionMode: "recommend" + + # Minimum seconds between investigations for the same node+condition. + # Prevents runaway investigations from NPD condition flapping. + cooldownSeconds: "3600" + + # If non-empty, only investigate nodes matching these patterns (fnmatch glob). + # Use to scope rollout to a subset of nodes. Comma-separated. + # Example: "gpu-node-01,gpu-node-02,gpu-node-0*" + nodeAllowlist: "" + + # Max remediation actions (cordon/drain/taint) per hour across ALL nodes. + # Circuit-breaker to prevent cascading drain. Only relevant in execute mode. + maxActionsPerHour: "3" + + # Maximum tool-use rounds per investigation. + maxToolRounds: "25" + + # Max characters per tool result sent to the LLM. Controls context cost. + # 0 = unlimited (LLM sees full output). + maxToolOutput: "8000" + + # Webhook URL to POST results to (e.g. Slack incoming webhook). + # Leave empty to disable. + webhookUrl: "" + + # Node conditions to watch. Defaults include both OSS GCM conditions + # (Gcm* prefix) and internal variants (NPD* prefix). Only conditions + # that exist on your cluster will trigger — extra names are harmless. + watchConditions: + # OSS GCM conditions (https://github.com/facebookresearch/gcm) + - GcmXidErrorsProblem + - GcmSmiEccProblem + - GcmSmiDisconnectedProblem + - GcmProcZombieProblem + - GcmDcgmiNvlinkStatusProblem + - GcmDcgmiDiagProblem + # Internal variants + - NPDXidErrorsProblem + - NPDSmiEccProblem + - NPDSmiDisconnectedProblem + - NPDProcZombieProblem + - NPDDcgmiNvlinkStatusProblem + - NPDDcgmiDiagProblem + - NPDDcgmRunningProblem + +# Resource limits. +resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + +# Log level. +logLevel: INFO + +# Node selector — run on infra/non-GPU nodes so the sentinel agent +# doesn't consume GPU resources. Set to match your cluster's infra nodes. +# Example for AWS: node.kubernetes.io/instance-type: m5.4xlarge +nodeSelector: {} + +tolerations: [] diff --git a/gcm-sentinel/Dockerfile b/gcm-sentinel/Dockerfile new file mode 100644 index 0000000..9047f86 --- /dev/null +++ b/gcm-sentinel/Dockerfile @@ -0,0 +1,21 @@ +FROM python:3.11-slim AS builder + +WORKDIR /build +COPY pyproject.toml README.md ./ +COPY src/ src/ + +RUN pip install --no-cache-dir --prefix=/install . + +FROM python:3.11-slim + +LABEL org.opencontainers.image.source=https://github.com/facebookresearch/gcm + +WORKDIR /app + +COPY --from=builder /install /usr/local +COPY . . + +RUN gcm-sentinel --help + +ENTRYPOINT ["gcm-sentinel"] +CMD ["serve"] diff --git a/gcm-sentinel/README.md b/gcm-sentinel/README.md new file mode 100644 index 0000000..16fffe9 --- /dev/null +++ b/gcm-sentinel/README.md @@ -0,0 +1,139 @@ +# gcm-sentinel + +AI-powered GPU cluster sentinel agent. Watches for GPU hardware failures on Kubernetes clusters and uses an LLM (Claude or GPT) to investigate — querying Prometheus metrics, Kubernetes state, pod logs, and training job status — then recommends remediation actions. + +## Quick Start + +### Try it (no cluster needed) + +```bash +pip install gcm-sentinel +export ANTHROPIC_API_KEY=your-key-here +python -m gcm_sentinel.demo.run +``` + +This runs 5 GPU failure scenarios (ECC errors, GPU off bus, NVLink, thermal, transient XID) against mock backends. Only an API key is needed. + +To use OpenAI instead: + +```bash +pip install gcm-sentinel[openai] +export OPENAI_API_KEY=your-key-here +python -m gcm_sentinel.demo.run +``` + +### Deploy to a cluster + +```bash +helm install gcm-sentinel ./charts/gcm-sentinel \ + --set llm.apiKey=your-key-here \ + --set prometheus.url=http://prometheus.monitoring:9090 +``` + +The agent starts in **observe-only mode** by default. It investigates GPU failures and sends results to K8s Events and webhooks, but makes **zero cluster mutations**. + +### One-shot investigation (CLI) + +```bash +pip install gcm-sentinel +export GCM_SENTINEL_API_KEY=your-key-here +gcm-sentinel investigate gpu-node-07 --condition GcmSmiEccProblem +``` + +## How It Works + +``` +GCM health check detects GPU failure + → NPD sets node condition = True (e.g. GcmXidErrorsProblem) + → gcm-sentinel watches for condition changes via K8s watch API + → LLM investigates using tools (up to 25 rounds): + query_prometheus — PromQL queries (kube-state-metrics, node-exporter, etc.) + query_dcgm_direct — GPU metrics directly from dcgm-exporter on the node + query_node_exporter_direct— host metrics directly from node-exporter + query_infiniband_direct — InfiniBand port metrics from node-exporter + get_node_info — conditions, taints, labels, pods + get_node_events — recent K8s Events + get_pod_logs — dcgm-exporter, NPD diagnostic output + get_gcm_health — NPD problem gauges and counters + get_workload_info — training job identity, sibling pods + get_workload_logs — NCCL/CUDA error logs from training pods + query_alertmanager — currently firing alerts + → Assessment: severity + root_cause + recommended_action + confidence + → Output: K8s Event + webhook (Slack/PagerDuty) + optional node annotation +``` + +## Safety + +The agent defaults to **observe-only**. It must be explicitly configured to modify cluster state. + +| Mode | Behavior | Helm flag | +|---|---|---| +| `recommend` (default) | Investigate + report. No mutations. | `--set sentinel.actionMode=recommend` | +| `annotate` | Above + write `gcm-sentinel/*` annotations on node. | `--set sentinel.actionMode=annotate` | +| `execute` | Above + can cordon/drain/taint. Gated by confidence + rate limit. | `--set sentinel.actionMode=execute` | + +Additional safety: +- **Cooldown**: 1 hour between investigations for the same node+condition +- **Node allowlist**: scope to specific nodes (`--set sentinel.nodeAllowlist="gpu-node-01"`) +- **Rate limit**: max 3 remediation actions per hour (circuit-breaker) +- **RBAC**: ClusterRole only gets `patch`/`eviction` verbs in annotate/execute modes +- **Tool removal**: remediation tools are absent from the LLM's schema unless `execute` mode + +See [Safety & Rollout Guide](https://facebookresearch.github.io/gcm/docs/GCM_Sentinel/safety) for the full rollout playbook. + +## Configuration + +All via environment variables (`GCM_SENTINEL_` prefix) or Helm values: + +| Variable | Default | Description | +|---|---|---| +| `API_KEY` | (required) | LLM API key | +| `LLM_API` | `anthropic` | `anthropic` or `openai` | +| `MODEL` | `claude-sonnet-4-6-20250725` | Model name | +| `PROMETHEUS_URL` | `http://kube-prometheus-stack-prometheus.monitoring:9090` | Prometheus URL | +| `ALERTMANAGER_URL` | (Alertmanager URL) | Leave empty to disable | +| `ACTION_MODE` | `recommend` | `recommend` / `annotate` / `execute` | +| `COOLDOWN_SECONDS` | `3600` | Cooldown per node+condition | +| `NODE_ALLOWLIST` | (all) | Comma-separated node patterns | +| `MAX_ACTIONS_PER_HOUR` | `3` | Circuit-breaker | +| `WATCH_CONDITIONS` | GCM defaults | Comma-separated conditions to watch | +| `MAX_TOOL_OUTPUT` | `8000` | Max chars per tool result sent to LLM. 0 = unlimited. | +| `WEBHOOK_URL` | (empty) | Slack/PagerDuty webhook | + +All variables use the `GCM_SENTINEL_` prefix (e.g. `GCM_SENTINEL_API_KEY`). + +## Watch Conditions + +Defaults watch for [GCM health check](https://facebookresearch.github.io/gcm/docs/GCM_Health_Checks/kubernetes_deployment) conditions (`Gcm*` prefix). If you use different NPD condition names: + +```bash +export GCM_SENTINEL_WATCH_CONDITIONS="MyGPUCheck,CustomXidCondition" +``` + +## Adding New Data Sources + +The agent queries data sources via a plugin system. Each data source is a Python class that exposes tools the LLM can call — Prometheus queries, K8s API calls, HTTP requests to any service. + +To give the agent access to a new system (IPMI sensors, cloud APIs, job schedulers, etc.), create a data source file and register it: + +```python +# src/gcm_sentinel/datasources/my_source.py +from gcm_sentinel.datasources import DataSource + +class MyDataSource(DataSource): + name = "my_source" + + def get_tools(self): + return [{"name": "query_my_source", "description": "...", "input_schema": {...}}] + + async def execute(self, tool_name, tool_input): + return await my_query(tool_input["node_name"]) +``` + +Then add `registry.register(MyDataSource(cfg))` in `datasources/__init__.py`. + +See the full [Adding a New Data Source](https://facebookresearch.github.io/gcm/docs/GCM_Sentinel/adding_new_datasource) guide for a complete walkthrough with a real example. + +## License + +See [LICENSE](../LICENSE). diff --git a/gcm-sentinel/demo/__init__.py b/gcm-sentinel/demo/__init__.py new file mode 100644 index 0000000..6d8d1e4 --- /dev/null +++ b/gcm-sentinel/demo/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +"""Demo mode for gcm-sentinel — runs scenarios against mock backends.""" diff --git a/gcm-sentinel/demo/run.py b/gcm-sentinel/demo/run.py new file mode 100644 index 0000000..0c2ace5 --- /dev/null +++ b/gcm-sentinel/demo/run.py @@ -0,0 +1,126 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +"""Demo: run investigation scenarios against mock data. + +Usage: python -m gcm_sentinel.demo.run [--scenario NAME] +Requires ANTHROPIC_API_KEY or OPENAI_API_KEY to be set. +""" +from __future__ import annotations + +import asyncio +import json +import logging +import os +import sys +from pathlib import Path + +from gcm_sentinel.config import SentinelConfig +from gcm_sentinel.datasources import DataSource, DataSourceRegistry +from gcm_sentinel.engine import run_investigation +from gcm_sentinel.models import NodeConditionEvent + +SCENARIOS_DIR = Path(__file__).resolve().parent.parent / "tests" / "scenarios" + + +class MockDataSource(DataSource): + """Returns canned data from a scenario JSON file for all tools.""" + name = "mock" + + def __init__(self, cfg, scenario): + super().__init__(cfg) + self._scenario = scenario + + def get_tools(self): + node_schema = {"type": "object", "properties": {"node_name": {"type": "string"}}, "required": ["node_name"]} + return [ + {"name": "get_node_info", "description": "Node info.", "input_schema": node_schema}, + {"name": "get_node_events", "description": "Node events.", "input_schema": node_schema}, + {"name": "get_gcm_health", "description": "GCM/NPD health.", "input_schema": node_schema}, + { + "name": "query_prometheus", + "description": "Execute a PromQL query. Use the metrics reference in the system prompt.", + "input_schema": { + "type": "object", + "properties": { + "query": {"type": "string"}, + "duration": {"type": "string", "default": "5m"}, + }, + "required": ["query"], + }, + }, + {"name": "query_dcgm_direct", "description": "GPU metrics from dcgm-exporter.", "input_schema": node_schema}, + ] + + async def execute(self, tool_name, tool_input): + if tool_name == "get_node_info": + return json.dumps(self._scenario.get("node_info", {}), indent=2) + elif tool_name == "get_gcm_health": + return json.dumps(self._scenario.get("gcm_health", {}), indent=2) + elif tool_name in ("query_prometheus", "query_dcgm_direct"): + query = tool_input.get("query", "") + for metric_name, response in self._scenario.get("prometheus_responses", {}).items(): + if metric_name in query: + return json.dumps(response.get("data", {}), indent=2) + return json.dumps({"resultType": "vector", "result": []}) + return "[]" + + def get_system_prompt_section(self): + return "" + + +async def run_scenario(scenario_path: Path, cfg: SentinelConfig) -> None: + with open(scenario_path) as f: + scenario = json.load(f) + + event = NodeConditionEvent( + node_name=scenario["node_name"], condition_type=scenario["condition_type"], + status="True", reason=scenario.get("reason", scenario["condition_type"]), + message=scenario.get("message", ""), + ) + + print(f"\n{'='*70}\nSCENARIO: {scenario['name']}\n{'='*70}") + print(f"Node: {event.node_name} Condition: {event.condition_type}") + print(f"Expected: severity={scenario.get('expected_severity')}, action={scenario.get('expected_action')}\n") + + registry = DataSourceRegistry(cfg) + registry.register(MockDataSource(cfg, scenario)) + + try: + result = await run_investigation(event, cfg, registry) + except Exception as exc: + print(f"ERROR: {exc}") + return + + print(f"{'='*70}\nRESULT: severity={result.severity.value} action={result.recommended_action.value} confidence={result.confidence:.0%}") + print(f"Summary: {result.summary}\nRoot cause: {result.root_cause}\nTool calls: {len(result.tool_calls)}\n{'='*70}\n") + + +def main() -> None: + logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s") + + api_key = os.environ.get("ANTHROPIC_API_KEY") or os.environ.get("OPENAI_API_KEY") or os.environ.get("GCM_SENTINEL_API_KEY") + if not api_key: + print("Set ANTHROPIC_API_KEY, OPENAI_API_KEY, or GCM_SENTINEL_API_KEY") + sys.exit(1) + + llm_api = os.environ.get("GCM_SENTINEL_LLM_API", "anthropic") + if not os.environ.get("GCM_SENTINEL_LLM_API") and os.environ.get("OPENAI_API_KEY") and not os.environ.get("ANTHROPIC_API_KEY"): + llm_api = "openai" + + cfg = SentinelConfig(api_key=api_key, llm_api=llm_api) + + scenario_filter = sys.argv[2] if len(sys.argv) > 2 and sys.argv[1] == "--scenario" else None + files = sorted(SCENARIOS_DIR.glob("*.json")) + if scenario_filter: + files = [f for f in files if scenario_filter in f.stem] + if not files: + print(f"No scenarios found in {SCENARIOS_DIR}") + sys.exit(1) + + for f in files: + asyncio.run(run_scenario(f, cfg)) + print("All scenarios complete.") + + +if __name__ == "__main__": + main() diff --git a/gcm-sentinel/pyproject.toml b/gcm-sentinel/pyproject.toml new file mode 100644 index 0000000..53ffe5a --- /dev/null +++ b/gcm-sentinel/pyproject.toml @@ -0,0 +1,34 @@ +[build-system] +requires = ["setuptools>=68.0", "setuptools-scm>=8.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "gcm-sentinel" +version = "0.0.0" +description = "AI-powered GPU cluster sentinel agent" +readme = "README.md" +requires-python = ">=3.11" +dependencies = [ + "anthropic>=0.39.0", + "httpx>=0.27.0", + "kubernetes>=31.0.0", + "pydantic>=2.7.0", + "pydantic-settings>=2.5.0", + "click>=8.0.0", +] + +[project.optional-dependencies] +openai = [ + "openai>=1.0.0", +] +dev = [ + "pytest>=8.0", + "pytest-asyncio>=0.24.0", + "openai>=1.0.0", +] + +[project.scripts] +gcm-sentinel = "gcm_sentinel.main:cli" + +[tool.setuptools.packages.find] +where = ["src"] diff --git a/gcm-sentinel/src/gcm_sentinel/__init__.py b/gcm-sentinel/src/gcm_sentinel/__init__.py new file mode 100644 index 0000000..84e3ae7 --- /dev/null +++ b/gcm-sentinel/src/gcm_sentinel/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +"""GCM Sentinel: AI-powered GPU cluster sentinel agent.""" diff --git a/gcm-sentinel/src/gcm_sentinel/backends/__init__.py b/gcm-sentinel/src/gcm_sentinel/backends/__init__.py new file mode 100644 index 0000000..fa4c9cd --- /dev/null +++ b/gcm-sentinel/src/gcm_sentinel/backends/__init__.py @@ -0,0 +1,47 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +from __future__ import annotations + +import dataclasses +from abc import ABC, abstractmethod +from typing import Any + + +@dataclasses.dataclass +class ToolUseRequest: + id: str + name: str + input: dict + + +@dataclasses.dataclass +class ChatResponse: + text_parts: list[str] + tool_requests: list[ToolUseRequest] + done: bool + raw_response: Any # SDK-specific response for building follow-up messages. + + +class LLMBackend(ABC): + @abstractmethod + def chat(self, messages: list[dict], system: str, tools: list[dict], max_tokens: int = 4096) -> ChatResponse: ... + + @abstractmethod + def append_assistant_response(self, messages: list[dict], response: ChatResponse) -> None: ... + + @abstractmethod + def append_tool_results(self, messages: list[dict], response: ChatResponse, results: list[tuple[ToolUseRequest, str]]) -> None: ... + + +def get_backend(api: str, api_key: str, model: str) -> LLMBackend: + if api == "anthropic": + from gcm_sentinel.backends.anthropic_backend import AnthropicBackend + return AnthropicBackend(api_key=api_key, model=model) + elif api == "openai": + try: + from gcm_sentinel.backends.openai_backend import OpenAIBackend + except ImportError: + raise ImportError("OpenAI backend requires the 'openai' package. Install with: pip install gcm-sentinel[openai]") + return OpenAIBackend(api_key=api_key, model=model) + else: + raise ValueError(f"Unknown LLM API: {api!r}. Must be 'anthropic' or 'openai'.") diff --git a/gcm-sentinel/src/gcm_sentinel/backends/anthropic_backend.py b/gcm-sentinel/src/gcm_sentinel/backends/anthropic_backend.py new file mode 100644 index 0000000..48a34fa --- /dev/null +++ b/gcm-sentinel/src/gcm_sentinel/backends/anthropic_backend.py @@ -0,0 +1,40 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +from __future__ import annotations + +import anthropic + +from gcm_sentinel.backends import ChatResponse, LLMBackend, ToolUseRequest + + +class AnthropicBackend(LLMBackend): + def __init__(self, api_key: str, model: str) -> None: + self._client = anthropic.Anthropic(api_key=api_key) + self._model = model + + def chat(self, messages, system, tools, max_tokens=4096): + resp = self._client.messages.create( + model=self._model, max_tokens=max_tokens, + system=system, tools=tools, messages=messages, + ) + text_parts, tool_requests = [], [] + for block in resp.content: + if block.type == "text": + text_parts.append(block.text) + elif block.type == "tool_use": + tool_requests.append(ToolUseRequest(id=block.id, name=block.name, input=block.input)) + + return ChatResponse( + text_parts=text_parts, tool_requests=tool_requests, + done=resp.stop_reason == "end_turn" or not tool_requests, + raw_response=resp, + ) + + def append_assistant_response(self, messages, response): + messages.append({"role": "assistant", "content": response.raw_response.content}) + + def append_tool_results(self, messages, response, results): + messages.append({"role": "user", "content": [ + {"type": "tool_result", "tool_use_id": req.id, "content": text} + for req, text in results + ]}) diff --git a/gcm-sentinel/src/gcm_sentinel/backends/openai_backend.py b/gcm-sentinel/src/gcm_sentinel/backends/openai_backend.py new file mode 100644 index 0000000..119e29f --- /dev/null +++ b/gcm-sentinel/src/gcm_sentinel/backends/openai_backend.py @@ -0,0 +1,70 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +"""OpenAI backend. Also works with any OpenAI-compatible API (vLLM, Ollama, +Azure OpenAI) — set OPENAI_BASE_URL env var to point at your endpoint.""" + +from __future__ import annotations + +import json + +from openai import OpenAI + +from gcm_sentinel.backends import ChatResponse, LLMBackend, ToolUseRequest + + +def _to_openai_tools(tools: list[dict]) -> list[dict]: + return [ + {"type": "function", "function": { + "name": t["name"], "description": t.get("description", ""), + "parameters": t.get("input_schema", {}), + }} + for t in tools + ] + + +class OpenAIBackend(LLMBackend): + def __init__(self, api_key: str, model: str) -> None: + self._client = OpenAI(api_key=api_key) + self._model = model + + def chat(self, messages, system, tools, max_tokens=4096): + full_messages = [{"role": "system", "content": system}] + messages + openai_tools = _to_openai_tools(tools) + + resp = self._client.chat.completions.create( + model=self._model, messages=full_messages, + tools=openai_tools or None, max_tokens=max_tokens, + ) + choice = resp.choices[0] + msg = choice.message + + text_parts = [msg.content] if msg.content else [] + tool_requests = [] + for tc in msg.tool_calls or []: + try: + args = json.loads(tc.function.arguments) + except (json.JSONDecodeError, TypeError): + args = {} + tool_requests.append(ToolUseRequest(id=tc.id, name=tc.function.name, input=args)) + + return ChatResponse( + text_parts=text_parts, tool_requests=tool_requests, + done=choice.finish_reason != "tool_calls" or not tool_requests, + raw_response=resp, + ) + + def append_assistant_response(self, messages, response): + choice = response.raw_response.choices[0] + msg = {"role": "assistant"} + if choice.message.content: + msg["content"] = choice.message.content + if choice.message.tool_calls: + msg["tool_calls"] = [ + {"id": tc.id, "type": "function", "function": {"name": tc.function.name, "arguments": tc.function.arguments}} + for tc in choice.message.tool_calls + ] + messages.append(msg) + + def append_tool_results(self, messages, response, results): + for req, text in results: + messages.append({"role": "tool", "tool_call_id": req.id, "content": text}) diff --git a/gcm-sentinel/src/gcm_sentinel/config.py b/gcm-sentinel/src/gcm_sentinel/config.py new file mode 100644 index 0000000..35d7309 --- /dev/null +++ b/gcm-sentinel/src/gcm_sentinel/config.py @@ -0,0 +1,80 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +from __future__ import annotations + +from pydantic import Field, field_validator +from pydantic_settings import BaseSettings + +DEFAULT_WATCH_CONDITIONS: list[str] = [ + # OSS GCM conditions (https://github.com/facebookresearch/gcm) + "GcmXidErrorsProblem", + "GcmSmiEccProblem", + "GcmSmiDisconnectedProblem", + "GcmProcZombieProblem", + "GcmDcgmiNvlinkStatusProblem", + "GcmDcgmiDiagProblem", + # Internal variants (some deployments use NPD prefix) + "NPDXidErrorsProblem", + "NPDSmiEccProblem", + "NPDSmiDisconnectedProblem", + "NPDProcZombieProblem", + "NPDDcgmiNvlinkStatusProblem", + "NPDDcgmiDiagProblem", + "NPDDcgmRunningProblem", +] + +ACTION_MODES = ("recommend", "annotate", "execute") + + +def _parse_csv(v: object) -> list[str]: + if isinstance(v, str): + return [c.strip() for c in v.split(",") if c.strip()] + return v # type: ignore[return-value] + + +class SentinelConfig(BaseSettings): + model_config = {"env_prefix": "GCM_SENTINEL_"} + + llm_api: str = Field(default="anthropic", description="'anthropic' or 'openai'") + api_key: str = Field(default="", description="API key for the LLM provider") + model: str = Field(default="claude-sonnet-4-6-20250725", description="Model name/ID") + + prometheus_url: str = Field( + default="http://kube-prometheus-stack-prometheus.monitoring:9090", + ) + alertmanager_url: str = Field( + default="http://kube-prometheus-stack-alertmanager.monitoring:9093", + description="Leave empty to disable", + ) + + action_mode: str = Field(default="recommend") + cooldown_seconds: int = Field(default=3600) + node_allowlist: list[str] = Field(default_factory=list) + max_actions_per_hour: int = Field(default=3) + + watch_conditions: list[str] = Field( + default_factory=lambda: list(DEFAULT_WATCH_CONDITIONS), + ) + max_tool_rounds: int = Field(default=25) + max_tool_output: int = Field(default=8000, description="Max chars per tool result sent to LLM. 0 = unlimited.") + system_prompt_path: str = Field(default="/etc/gcm-sentinel/system-prompt.txt") + log_level: str = Field(default="INFO") + webhook_url: str = Field(default="") + + _parse_watch_conditions = field_validator("watch_conditions", mode="before")(_parse_csv) + _parse_node_allowlist = field_validator("node_allowlist", mode="before")(_parse_csv) + + @field_validator("action_mode") + @classmethod + def _validate_action_mode(cls, v: str) -> str: + if v not in ACTION_MODES: + raise ValueError(f"action_mode must be one of {ACTION_MODES}, got '{v}'") + return v + + @property + def can_execute(self) -> bool: + return self.action_mode == "execute" + + @property + def can_annotate(self) -> bool: + return self.action_mode in ("annotate", "execute") diff --git a/gcm-sentinel/src/gcm_sentinel/datasources/__init__.py b/gcm-sentinel/src/gcm_sentinel/datasources/__init__.py new file mode 100644 index 0000000..51ffe0a --- /dev/null +++ b/gcm-sentinel/src/gcm_sentinel/datasources/__init__.py @@ -0,0 +1,93 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +"""Data source plugin system. + +Each data source exposes tools the LLM can call during investigation. +To add a new data source, subclass DataSource and register it in build_datasources(). +""" +from __future__ import annotations + +import logging +from abc import ABC, abstractmethod + +from gcm_sentinel.config import SentinelConfig + +logger = logging.getLogger(__name__) + + +class DataSource(ABC): + """Base class for data sources.""" + + name: str # Short identifier, set as class attribute. + + def __init__(self, cfg: SentinelConfig) -> None: + self.cfg = cfg + + def is_available(self) -> bool: + return True + + @abstractmethod + def get_tools(self) -> list[dict]: ... + + @abstractmethod + async def execute(self, tool_name: str, tool_input: dict) -> str: ... + + def get_system_prompt_section(self) -> str: + return "" + + +class DataSourceRegistry: + def __init__(self, cfg: SentinelConfig) -> None: + self.cfg = cfg + self._sources: list[DataSource] = [] + self._tool_to_source: dict[str, DataSource] = {} + self._all_tools: list[dict] = [] + + def register(self, ds: DataSource) -> None: + if not ds.is_available(): + logger.info("Data source '%s' not available, skipping", ds.name) + return + tools = ds.get_tools() + for tool in tools: + self._tool_to_source[tool["name"]] = ds + self._all_tools.extend(tools) + self._sources.append(ds) + logger.info("Registered data source '%s' (%d tools)", ds.name, len(tools)) + + def get_all_tools(self) -> list[dict]: + return self._all_tools + + def get_system_prompt_additions(self) -> str: + return "\n\n".join(s for ds in self._sources if (s := ds.get_system_prompt_section())) + + async def execute(self, tool_name: str, tool_input: dict) -> str: + ds = self._tool_to_source.get(tool_name) + if ds is None: + return f"Unknown tool: {tool_name}" + return await ds.execute(tool_name, tool_input) + + @property + def datasource_names(self) -> list[str]: + return [ds.name for ds in self._sources] + + +def build_datasources(cfg: SentinelConfig) -> DataSourceRegistry: + from gcm_sentinel.datasources.alertmanager import AlertmanagerDataSource + from gcm_sentinel.datasources.dcgm_direct import DCGMDirectDataSource + from gcm_sentinel.datasources.gcm_health import GCMHealthDataSource + from gcm_sentinel.datasources.kubernetes_core import KubernetesCoreDataSource + from gcm_sentinel.datasources.node_direct import NodeDirectDataSource + from gcm_sentinel.datasources.kubernetes_workloads import KubernetesWorkloadsDataSource + from gcm_sentinel.datasources.prometheus import PrometheusDataSource + from gcm_sentinel.datasources.remediation import RemediationDataSource + + registry = DataSourceRegistry(cfg) + registry.register(PrometheusDataSource(cfg)) + registry.register(DCGMDirectDataSource(cfg)) + registry.register(NodeDirectDataSource(cfg)) + registry.register(KubernetesCoreDataSource(cfg)) + registry.register(GCMHealthDataSource(cfg)) + registry.register(KubernetesWorkloadsDataSource(cfg)) + registry.register(AlertmanagerDataSource(cfg)) + registry.register(RemediationDataSource(cfg)) + return registry diff --git a/gcm-sentinel/src/gcm_sentinel/datasources/alertmanager.py b/gcm-sentinel/src/gcm_sentinel/datasources/alertmanager.py new file mode 100644 index 0000000..293c602 --- /dev/null +++ b/gcm-sentinel/src/gcm_sentinel/datasources/alertmanager.py @@ -0,0 +1,27 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +from __future__ import annotations + +from gcm_sentinel.datasources import DataSource +from gcm_sentinel.tools.prometheus import query_alertmanager + + +class AlertmanagerDataSource(DataSource): + name = "alertmanager" + + def is_available(self): + return bool(self.cfg.alertmanager_url) + + def get_tools(self): + return [{ + "name": "query_alertmanager", + "description": "Currently firing Alertmanager alerts, optionally filtered by node.", + "input_schema": { + "type": "object", + "properties": {"node_name": {"type": "string", "default": ""}}, + "required": [], + }, + }] + + async def execute(self, tool_name, tool_input): + return await query_alertmanager(self.cfg.alertmanager_url, tool_input.get("node_name", "")) diff --git a/gcm-sentinel/src/gcm_sentinel/datasources/dcgm_direct.py b/gcm-sentinel/src/gcm_sentinel/datasources/dcgm_direct.py new file mode 100644 index 0000000..9e1cb06 --- /dev/null +++ b/gcm-sentinel/src/gcm_sentinel/datasources/dcgm_direct.py @@ -0,0 +1,40 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +"""Query dcgm-exporter directly on GPU nodes. + +Falls back gracefully — if dcgm-exporter isn't running on a node, +the tool returns a clear message and the LLM uses other data sources. +""" +from __future__ import annotations + +from gcm_sentinel.datasources import DataSource +from gcm_sentinel.tools.dcgm import query_dcgm_direct + + +class DCGMDirectDataSource(DataSource): + name = "dcgm_direct" + + def get_tools(self): + return [{ + "name": "query_dcgm_direct", + "description": ( + "Query GPU metrics directly from the dcgm-exporter pod running on a node. " + "Use this when Prometheus doesn't have DCGM metrics, or to get the latest " + "real-time GPU data. Optionally filter by metric name substring." + ), + "input_schema": { + "type": "object", + "properties": { + "node_name": {"type": "string", "description": "Kubernetes node name"}, + "metric_filter": { + "type": "string", + "description": "Only return metrics containing this substring (e.g. 'ECC', 'TEMP', 'XID'). Empty for key metrics summary.", + "default": "", + }, + }, + "required": ["node_name"], + }, + }] + + async def execute(self, tool_name, tool_input): + return await query_dcgm_direct(tool_input["node_name"], tool_input.get("metric_filter", "")) diff --git a/gcm-sentinel/src/gcm_sentinel/datasources/gcm_health.py b/gcm-sentinel/src/gcm_sentinel/datasources/gcm_health.py new file mode 100644 index 0000000..954afe7 --- /dev/null +++ b/gcm-sentinel/src/gcm_sentinel/datasources/gcm_health.py @@ -0,0 +1,20 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +from __future__ import annotations + +from gcm_sentinel.datasources import DataSource +from gcm_sentinel.tools.gcm import get_gcm_health + + +class GCMHealthDataSource(DataSource): + name = "gcm_health" + + def get_tools(self): + return [{ + "name": "get_gcm_health", + "description": "NPD health: problem_gauge/counter metrics + Gcm/NPD node conditions.", + "input_schema": {"type": "object", "properties": {"node_name": {"type": "string"}}, "required": ["node_name"]}, + }] + + async def execute(self, tool_name, tool_input): + return await get_gcm_health(tool_input["node_name"], self.cfg.prometheus_url) diff --git a/gcm-sentinel/src/gcm_sentinel/datasources/kubernetes_core.py b/gcm-sentinel/src/gcm_sentinel/datasources/kubernetes_core.py new file mode 100644 index 0000000..66f15cf --- /dev/null +++ b/gcm-sentinel/src/gcm_sentinel/datasources/kubernetes_core.py @@ -0,0 +1,40 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +from __future__ import annotations + +from gcm_sentinel.datasources import DataSource +from gcm_sentinel.tools.kubernetes import get_node_events, get_node_info, get_pod_logs + +_NODE_SCHEMA = {"type": "object", "properties": {"node_name": {"type": "string"}}, "required": ["node_name"]} + + +class KubernetesCoreDataSource(DataSource): + name = "kubernetes_core" + + def get_tools(self): + return [ + {"name": "get_node_info", "description": "Node conditions, taints, labels, annotations, resources, pods.", "input_schema": _NODE_SCHEMA}, + {"name": "get_node_events", "description": "Recent K8s Events on the node (warnings, errors, investigation history).", "input_schema": _NODE_SCHEMA}, + { + "name": "get_pod_logs", + "description": "Logs from monitoring pods (dcgm, NPD, GCM). Use pod_name_pattern to filter.", + "input_schema": { + "type": "object", + "properties": { + "node_name": {"type": "string"}, + "pod_name_pattern": {"type": "string", "default": ""}, + "tail_lines": {"type": "integer", "default": 100}, + }, + "required": ["node_name"], + }, + }, + ] + + async def execute(self, tool_name, tool_input): + if tool_name == "get_node_info": + return await get_node_info(tool_input["node_name"]) + elif tool_name == "get_node_events": + return await get_node_events(tool_input["node_name"]) + elif tool_name == "get_pod_logs": + return await get_pod_logs(tool_input["node_name"], tool_input.get("pod_name_pattern", ""), tool_input.get("tail_lines", 100)) + return f"Unknown tool: {tool_name}" diff --git a/gcm-sentinel/src/gcm_sentinel/datasources/kubernetes_workloads.py b/gcm-sentinel/src/gcm_sentinel/datasources/kubernetes_workloads.py new file mode 100644 index 0000000..8ab898f --- /dev/null +++ b/gcm-sentinel/src/gcm_sentinel/datasources/kubernetes_workloads.py @@ -0,0 +1,35 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +from __future__ import annotations + +from gcm_sentinel.datasources import DataSource +from gcm_sentinel.tools.kubernetes import get_workload_info, get_workload_logs + + +class KubernetesWorkloadsDataSource(DataSource): + name = "kubernetes_workloads" + + def get_tools(self): + return [ + { + "name": "get_workload_info", + "description": "GPU workload identity: job owner, labels, GPU requests, container status, sibling pods.", + "input_schema": {"type": "object", "properties": {"node_name": {"type": "string"}}, "required": ["node_name"]}, + }, + { + "name": "get_workload_logs", + "description": "Error logs from GPU workload pods (NCCL, CUDA, XID, OOM patterns).", + "input_schema": { + "type": "object", + "properties": {"node_name": {"type": "string"}, "tail_lines": {"type": "integer", "default": 200}}, + "required": ["node_name"], + }, + }, + ] + + async def execute(self, tool_name, tool_input): + if tool_name == "get_workload_info": + return await get_workload_info(tool_input["node_name"]) + elif tool_name == "get_workload_logs": + return await get_workload_logs(tool_input["node_name"], tool_input.get("tail_lines", 200)) + return f"Unknown tool: {tool_name}" diff --git a/gcm-sentinel/src/gcm_sentinel/datasources/node_direct.py b/gcm-sentinel/src/gcm_sentinel/datasources/node_direct.py new file mode 100644 index 0000000..a9ffbf6 --- /dev/null +++ b/gcm-sentinel/src/gcm_sentinel/datasources/node_direct.py @@ -0,0 +1,49 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +"""Query node-exporter and InfiniBand metrics directly on nodes.""" +from __future__ import annotations + +from gcm_sentinel.datasources import DataSource +from gcm_sentinel.tools.node_exporter import query_infiniband_direct, query_node_exporter_direct + +_NODE_SCHEMA = {"type": "object", "properties": {"node_name": {"type": "string"}}, "required": ["node_name"]} + + +class NodeDirectDataSource(DataSource): + name = "node_direct" + + def get_tools(self): + return [ + { + "name": "query_node_exporter_direct", + "description": ( + "Query host metrics directly from node-exporter on a node. " + "Returns CPU, memory, temps, disk, and network stats. " + "Use metric_filter to narrow (e.g. 'hwmon_temp', 'load', 'infiniband')." + ), + "input_schema": { + "type": "object", + "properties": { + "node_name": {"type": "string"}, + "metric_filter": {"type": "string", "description": "Substring filter", "default": ""}, + }, + "required": ["node_name"], + }, + }, + { + "name": "query_infiniband_direct", + "description": ( + "Query InfiniBand port metrics directly from node-exporter. " + "Shows IB link errors, data counters, and port status. " + "Falls back to general network error metrics if no IB found." + ), + "input_schema": _NODE_SCHEMA, + }, + ] + + async def execute(self, tool_name, tool_input): + if tool_name == "query_node_exporter_direct": + return await query_node_exporter_direct(tool_input["node_name"], tool_input.get("metric_filter", "")) + elif tool_name == "query_infiniband_direct": + return await query_infiniband_direct(tool_input["node_name"]) + return f"Unknown tool: {tool_name}" diff --git a/gcm-sentinel/src/gcm_sentinel/datasources/prometheus.py b/gcm-sentinel/src/gcm_sentinel/datasources/prometheus.py new file mode 100644 index 0000000..8eda97a --- /dev/null +++ b/gcm-sentinel/src/gcm_sentinel/datasources/prometheus.py @@ -0,0 +1,32 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +from __future__ import annotations + +from gcm_sentinel.datasources import DataSource +from gcm_sentinel.tools.prometheus import query_prometheus + + +class PrometheusDataSource(DataSource): + name = "prometheus" + + def get_tools(self): + return [{ + "name": "query_prometheus", + "description": ( + "Execute a PromQL query against the cluster's Prometheus server. " + "Common metric prefixes: DCGM_FI_DEV_* (GPU), node_* (host), " + "kube_* (K8s state), problem_* (NPD). " + "Filter by node with {instance=~\"NODE.*\"}, by GPU with {gpu=\"IDX\"}." + ), + "input_schema": { + "type": "object", + "properties": { + "query": {"type": "string", "description": "PromQL expression"}, + "duration": {"type": "string", "description": "Range fallback (e.g. '5m')", "default": "5m"}, + }, + "required": ["query"], + }, + }] + + async def execute(self, tool_name, tool_input): + return await query_prometheus(self.cfg.prometheus_url, tool_input["query"], tool_input.get("duration", "5m")) diff --git a/gcm-sentinel/src/gcm_sentinel/datasources/remediation.py b/gcm-sentinel/src/gcm_sentinel/datasources/remediation.py new file mode 100644 index 0000000..8a73d29 --- /dev/null +++ b/gcm-sentinel/src/gcm_sentinel/datasources/remediation.py @@ -0,0 +1,107 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +"""Only registered when action_mode="execute". Tools are absent from +the LLM's schema in recommend/annotate modes. + +All safety checks enforced HERE, before the K8s API call: +- Rate limiting +- Node name validation (can only act on the node being investigated) +- Node name validation (can only act on the node being investigated) +""" +from __future__ import annotations + +import logging +import time +from collections import deque + +from gcm_sentinel.datasources import DataSource +from gcm_sentinel.tools.kubernetes import cordon_node, drain_node, taint_node + +logger = logging.getLogger(__name__) + + +class RemediationDataSource(DataSource): + name = "remediation" + + def __init__(self, cfg): + super().__init__(cfg) + self._action_timestamps: deque[float] = deque() + self._allowed_node: str | None = None # Set by the engine before each investigation. + + def is_available(self): + return self.cfg.can_execute + + def set_allowed_node(self, node_name: str) -> None: + """Restrict remediation to a specific node. Called by the engine.""" + self._allowed_node = node_name + + def _is_rate_limited(self) -> bool: + now = time.monotonic() + while self._action_timestamps and (now - self._action_timestamps[0]) > 3600: + self._action_timestamps.popleft() + return len(self._action_timestamps) >= self.cfg.max_actions_per_hour + + def get_tools(self): + return [ + { + "name": "cordon_node", + "description": "Mark node unschedulable. Only for confirmed hardware issues.", + "input_schema": {"type": "object", "properties": {"node_name": {"type": "string"}}, "required": ["node_name"]}, + }, + { + "name": "drain_node", + "description": "Evict all non-DaemonSet pods + cordon. Only for critical hardware failures.", + "input_schema": {"type": "object", "properties": {"node_name": {"type": "string"}}, "required": ["node_name"]}, + }, + { + "name": "taint_node", + "description": "Add a taint to a node.", + "input_schema": { + "type": "object", + "properties": { + "node_name": {"type": "string"}, + "key": {"type": "string"}, "value": {"type": "string"}, + "effect": {"type": "string", "enum": ["NoSchedule", "PreferNoSchedule", "NoExecute"]}, + }, + "required": ["node_name", "key", "value", "effect"], + }, + }, + ] + + async def execute(self, tool_name, tool_input): + target_node = tool_input.get("node_name", "") + + # Guard: only allow actions on the node being investigated. + if self._allowed_node and target_node != self._allowed_node: + msg = ( + f"BLOCKED: {tool_name} rejected — target node '{target_node}' does not match " + f"the node being investigated ('{self._allowed_node}'). " + f"You can only remediate the node you are investigating." + ) + logger.warning(msg) + return msg + + # Guard: rate limit. + if self._is_rate_limited(): + msg = ( + f"RATE LIMITED: {tool_name} blocked. " + f"{self.cfg.max_actions_per_hour} remediation actions per hour limit reached. " + f"Include this action in your recommended_action assessment instead." + ) + logger.warning(msg) + return msg + + # Execute. + if tool_name == "cordon_node": + result = await cordon_node(target_node) + elif tool_name == "drain_node": + result = await drain_node(target_node) + elif tool_name == "taint_node": + result = await taint_node(target_node, tool_input["key"], tool_input["value"], tool_input["effect"]) + else: + return f"Unknown tool: {tool_name}" + + self._action_timestamps.append(time.monotonic()) + logger.info("Remediation action %s executed on %s (%d/%d this hour)", + tool_name, target_node, len(self._action_timestamps), self.cfg.max_actions_per_hour) + return result diff --git a/gcm-sentinel/src/gcm_sentinel/engine.py b/gcm-sentinel/src/gcm_sentinel/engine.py new file mode 100644 index 0000000..83560f8 --- /dev/null +++ b/gcm-sentinel/src/gcm_sentinel/engine.py @@ -0,0 +1,175 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +"""Investigation engine — drives an LLM through a tool-use loop.""" + +from __future__ import annotations + +import json +import logging +import time +from pathlib import Path + +from gcm_sentinel.backends import get_backend +from gcm_sentinel.config import SentinelConfig +from gcm_sentinel.models import Action, NodeConditionEvent, Severity, ToolCall, SentinelResult +from gcm_sentinel.datasources import DataSourceRegistry, build_datasources + +logger = logging.getLogger(__name__) + +DEFAULT_SYSTEM_PROMPT = """\ +You are an expert GPU cluster reliability engineer. A Kubernetes node has reported +a GPU-related problem. + +Investigate thoroughly using ALL available tools — explore every data source you +have access to before making your assessment. The tools available to you are your +eyes and hands into the cluster. Use them extensively. + +For XID error code reference, see: https://docs.nvidia.com/deploy/xid-errors/ + +## Assessment + +Provide your final assessment as JSON: +```json +{"severity": "critical|warning|info|ok", "summary": "...", "root_cause": "...", "recommended_action": "cordon|drain|taint|reboot|none", "confidence": 0.0-1.0} +``` +""" + +_MODE_DESCRIPTIONS = { + "recommend": "Action mode: RECOMMEND (observe-only). Do NOT attempt cluster mutations.", + "annotate": "Action mode: ANNOTATE. Recommendations will be written as node annotations. Do NOT mutate directly.", + "execute": "Action mode: EXECUTE. You may cordon/drain/taint nodes for confirmed hardware failures.", +} + + +def _load_system_prompt(cfg: SentinelConfig, registry: DataSourceRegistry) -> str: + path = Path(cfg.system_prompt_path) + base = path.read_text() if path.is_file() else DEFAULT_SYSTEM_PROMPT + additions = registry.get_system_prompt_additions() + return f"{base}\n\n{additions}" if additions else base + + +def _parse_assessment(text: str) -> dict: + # Strip markdown fences. + cleaned = text + if "```json" in cleaned: + cleaned = cleaned.split("```json")[-1].split("```")[0] + elif "```" in cleaned: + parts = cleaned.split("```") + if len(parts) >= 3: + cleaned = parts[-2] + + # Find JSON object with assessment fields. + target_keys = {"severity", "recommended_action", "confidence"} + depth, start = 0, -1 + for i, ch in enumerate(cleaned): + if ch == "{": + if depth == 0: + start = i + depth += 1 + elif ch == "}": + depth -= 1 + if depth == 0 and start >= 0: + try: + parsed = json.loads(cleaned[start:i + 1]) + if isinstance(parsed, dict) and target_keys & parsed.keys(): + return parsed + except json.JSONDecodeError: + pass + start = -1 + return {} + + +async def run_investigation( + event: NodeConditionEvent, + cfg: SentinelConfig, + registry: DataSourceRegistry | None = None, +) -> SentinelResult: + if registry is None: + registry = build_datasources(cfg) + + # Restrict remediation tools to the node being investigated. + from gcm_sentinel.datasources.remediation import RemediationDataSource + for ds in registry._sources: + if isinstance(ds, RemediationDataSource): + ds.set_allowed_node(event.node_name) + + start_time = time.monotonic() + system_prompt = _load_system_prompt(cfg, registry) + tools = registry.get_all_tools() + backend = get_backend(cfg.llm_api, cfg.api_key, cfg.model) + tool_calls_log: list[ToolCall] = [] + text_parts: list[str] = [] + + logger.info("Investigating %s/%s via %s/%s (%d tools)", event.node_name, event.condition_type, cfg.llm_api, cfg.model, len(tools)) + + messages: list[dict] = [{"role": "user", "content": ( + f"Node condition change detected:\n\n" + f"- **Node**: {event.node_name}\n- **Condition**: {event.condition_type}\n" + f"- **Status**: {event.status}\n- **Message**: {event.message}\n" + f"- **Time**: {event.timestamp}\n\n{_MODE_DESCRIPTIONS[cfg.action_mode]}\n\n" + f"Investigate and provide your assessment." + )}] + + investigation_parts = [f"== Investigation for {event.node_name} =="] + + for round_num in range(1, cfg.max_tool_rounds + 1): + response = backend.chat(messages=messages, system=system_prompt, tools=tools) + text_parts = response.text_parts + investigation_parts.extend(text_parts) + + if response.done: + break + + backend.append_assistant_response(messages, response) + results = [] + for req in response.tool_requests: + investigation_parts.append(f"\n> {req.name}({json.dumps(req.input, default=str)[:200]})") + try: + result = await registry.execute(req.name, req.input) + except Exception as exc: + result = f"Tool error: {type(exc).__name__}: {exc}" + logger.exception("Tool %s failed", req.name) + + # Truncate if configured (controls LLM context cost). + llm_result = result + if cfg.max_tool_output and len(result) > cfg.max_tool_output: + llm_result = result[:cfg.max_tool_output] + ( + f"\n\n[OUTPUT TRUNCATED: showing {cfg.max_tool_output} of {len(result)} chars. " + f"Call the tool again with a more specific query to get the data you need.]" + ) + + tool_calls_log.append(ToolCall(tool_name=req.name, tool_input=req.input, result=result)) + investigation_parts.append(f"Result: {result}") + results.append((req, llm_result)) + + backend.append_tool_results(messages, response, results) + + final_text = "\n".join(text_parts) + assessment = _parse_assessment(final_text) + + logger.info("Investigation complete for %s in %.1fs (%d tool calls)", event.node_name, time.monotonic() - start_time, len(tool_calls_log)) + + try: + severity = Severity(assessment.get("severity", "warning")) + except ValueError: + severity = Severity.WARNING + try: + action = Action(assessment.get("recommended_action", "none")) + except ValueError: + action = Action.NONE + try: + confidence = max(0.0, min(1.0, float(assessment.get("confidence", 0.5)))) + except (ValueError, TypeError): + confidence = 0.5 + + return SentinelResult( + node_name=event.node_name, + condition=event.condition_type, + severity=severity, + summary=assessment.get("summary", final_text[:500]), + root_cause=assessment.get("root_cause", "Unknown"), + recommended_action=action, + confidence=confidence, + tool_calls=tool_calls_log, + investigation_log="\n".join(investigation_parts), + ) diff --git a/gcm-sentinel/src/gcm_sentinel/k8s.py b/gcm-sentinel/src/gcm_sentinel/k8s.py new file mode 100644 index 0000000..17e25aa --- /dev/null +++ b/gcm-sentinel/src/gcm_sentinel/k8s.py @@ -0,0 +1,13 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +"""Shared Kubernetes client initialization.""" + +from kubernetes import client, config + + +def get_core_api() -> client.CoreV1Api: + try: + config.load_incluster_config() + except config.ConfigException: + config.load_kube_config() + return client.CoreV1Api() diff --git a/gcm-sentinel/src/gcm_sentinel/main.py b/gcm-sentinel/src/gcm_sentinel/main.py new file mode 100644 index 0000000..cd62a2e --- /dev/null +++ b/gcm-sentinel/src/gcm_sentinel/main.py @@ -0,0 +1,100 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +"""Main entry point for the GCM sentinel agent. + +The agent is a pure watcher process — no listening ports, no HTTP server. +It watches K8s node conditions and triggers investigation when a GPU problem is +detected. Results go to logs, K8s Events, webhooks, and node annotations. +""" + +from __future__ import annotations + +import asyncio +import logging + +import click + +from gcm_sentinel.config import SentinelConfig +from gcm_sentinel.watcher import NodeConditionWatcher + +logger = logging.getLogger(__name__) + + +@click.group() +def cli(): + """GCM Sentinel — AI-powered GPU cluster sentinel agent.""" + pass + + +@cli.command() +@click.option("--log-level", default="INFO", help="Log level") +def serve(log_level: str): + """Start the sentinel agent (watches for GPU failures, triggers investigation).""" + logging.basicConfig( + level=getattr(logging, log_level.upper()), + format="%(asctime)s %(levelname)s %(name)s: %(message)s", + ) + cfg = SentinelConfig() + + # Print safety status prominently at startup. + mode_banner = { + "recommend": "OBSERVE-ONLY (no cluster mutations)", + "annotate": "ANNOTATE (writes node annotations, no cordon/drain)", + "execute": "EXECUTE (can cordon/drain/taint nodes!)", + } + logger.info("=" * 60) + logger.info("GCM Sentinel Agent") + logger.info("Action mode: %s — %s", cfg.action_mode, mode_banner[cfg.action_mode]) + if cfg.action_mode == "execute": + logger.warning( + "EXECUTE MODE ACTIVE — agent can modify cluster state. " + "Max actions/hour: %d", + cfg.max_actions_per_hour, + ) + if cfg.node_allowlist: + logger.info("Node allowlist: %s", cfg.node_allowlist) + else: + logger.info("Node allowlist: (all nodes)") + logger.info("Cooldown: %ds, Prometheus: %s", cfg.cooldown_seconds, cfg.prometheus_url) + logger.info("=" * 60) + + # Run the watcher loop — blocks forever, reconnects on errors. + watcher = NodeConditionWatcher(cfg) + asyncio.run(watcher.watch_loop()) + + +@cli.command() +@click.argument("node_name") +@click.option("--condition", default="GcmXidErrorsProblem", help="Condition to investigate") +@click.option("--message", default="XID errors detected on GPU", help="Condition message") +def investigate(node_name: str, condition: str, message: str): + """Run a one-shot investigation for a specific node.""" + from gcm_sentinel.models import NodeConditionEvent + from gcm_sentinel.engine import run_investigation + + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s %(name)s: %(message)s", + ) + cfg = SentinelConfig() + event = NodeConditionEvent( + node_name=node_name, + condition_type=condition, + status="True", + reason=condition, + message=message, + ) + result = asyncio.run(run_investigation(event, cfg)) + print(f"\n{'='*60}") + print(f"Node: {result.node_name}") + print(f"Severity: {result.severity.value}") + print(f"Confidence: {result.confidence:.0%}") + print(f"Action: {result.recommended_action.value}") + print(f"Summary: {result.summary}") + print(f"Root cause: {result.root_cause}") + print(f"Tool calls: {len(result.tool_calls)}") + print(f"{'='*60}") + + +if __name__ == "__main__": + cli() diff --git a/gcm-sentinel/src/gcm_sentinel/models.py b/gcm-sentinel/src/gcm_sentinel/models.py new file mode 100644 index 0000000..1ec034b --- /dev/null +++ b/gcm-sentinel/src/gcm_sentinel/models.py @@ -0,0 +1,51 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +from __future__ import annotations + +import enum +from datetime import datetime, timezone + +from pydantic import BaseModel, Field + + +class Severity(str, enum.Enum): + CRITICAL = "critical" + WARNING = "warning" + INFO = "info" + OK = "ok" + + +class Action(str, enum.Enum): + CORDON = "cordon" + DRAIN = "drain" + TAINT = "taint" + REBOOT = "reboot" + NONE = "none" + + +class NodeConditionEvent(BaseModel): + node_name: str + condition_type: str + status: str + reason: str = "" + message: str = "" + timestamp: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) + + +class ToolCall(BaseModel): + tool_name: str + tool_input: dict + result: str + + +class SentinelResult(BaseModel): + node_name: str + condition: str + severity: Severity + summary: str + root_cause: str + recommended_action: Action + confidence: float = Field(ge=0.0, le=1.0) + tool_calls: list[ToolCall] = Field(default_factory=list) + investigation_log: str = "" + timestamp: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) diff --git a/gcm-sentinel/src/gcm_sentinel/notify.py b/gcm-sentinel/src/gcm_sentinel/notify.py new file mode 100644 index 0000000..07c449b --- /dev/null +++ b/gcm-sentinel/src/gcm_sentinel/notify.py @@ -0,0 +1,75 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +from __future__ import annotations + +import logging +from datetime import datetime, timezone + +import httpx +from kubernetes import client + +from gcm_sentinel.k8s import get_core_api +from gcm_sentinel.models import SentinelResult + +logger = logging.getLogger(__name__) + + +async def annotate_node(result: SentinelResult) -> None: + try: + api = get_core_api() + api.patch_node(result.node_name, {"metadata": {"annotations": { + "gcm-sentinel/severity": result.severity.value, + "gcm-sentinel/action": result.recommended_action.value, + "gcm-sentinel/confidence": f"{result.confidence:.2f}", + "gcm-sentinel/summary": result.summary[:250], + "gcm-sentinel/root-cause": result.root_cause[:250], + "gcm-sentinel/condition": result.condition, + "gcm-sentinel/timestamp": datetime.now(timezone.utc).isoformat(), + }}}) + except Exception: + logger.exception("Failed to annotate node %s", result.node_name) + + +async def emit_k8s_event(result: SentinelResult) -> None: + try: + now = datetime.now(timezone.utc) + get_core_api().create_namespaced_event("default", client.CoreV1Event( + metadata=client.V1ObjectMeta(generate_name="gcm-sentinel-", namespace="default"), + involved_object=client.V1ObjectReference(kind="Node", name=result.node_name, api_version="v1"), + reason="GCMSentinel", + message=f"[{result.severity.value.upper()}] {result.summary} | Action: {result.recommended_action.value} | Confidence: {result.confidence:.0%}", + type="Warning" if result.severity.value in ("critical", "warning") else "Normal", + first_timestamp=now, last_timestamp=now, + source=client.V1EventSource(component="gcm-sentinel"), + reporting_component="gcm-sentinel", + )) + except Exception: + logger.exception("Failed to create K8s Event for %s", result.node_name) + + +async def emit_webhook(result: SentinelResult, webhook_url: str) -> None: + payload = { + "text": ( + f"[{result.severity.value.upper()}] GPU Sentinel: {result.node_name} / {result.condition}\n" + f"*Summary*: {result.summary}\n*Root cause*: {result.root_cause}\n" + f"*Action*: {result.recommended_action.value} (confidence {result.confidence:.0%})" + ), + "sentinel_result": { + "node_name": result.node_name, "condition": result.condition, + "severity": result.severity.value, "summary": result.summary, + "root_cause": result.root_cause, + "recommended_action": result.recommended_action.value, + "confidence": result.confidence, "timestamp": result.timestamp.isoformat(), + }, + } + try: + async with httpx.AsyncClient(timeout=10.0) as http: + (await http.post(webhook_url, json=payload)).raise_for_status() + except Exception: + logger.exception("Webhook delivery failed for %s", result.node_name) + + +async def notify(result: SentinelResult, webhook_url: str = "") -> None: + await emit_k8s_event(result) + if webhook_url: + await emit_webhook(result, webhook_url) diff --git a/gcm-sentinel/src/gcm_sentinel/tools/__init__.py b/gcm-sentinel/src/gcm_sentinel/tools/__init__.py new file mode 100644 index 0000000..ae1b0cf --- /dev/null +++ b/gcm-sentinel/src/gcm_sentinel/tools/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. diff --git a/gcm-sentinel/src/gcm_sentinel/tools/dcgm.py b/gcm-sentinel/src/gcm_sentinel/tools/dcgm.py new file mode 100644 index 0000000..7defa51 --- /dev/null +++ b/gcm-sentinel/src/gcm_sentinel/tools/dcgm.py @@ -0,0 +1,101 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +"""Query dcgm-exporter directly on a node when Prometheus doesn't have DCGM metrics.""" +from __future__ import annotations + +import httpx + +from gcm_sentinel.k8s import get_core_api + + +def _find_dcgm_pod_ip(node_name: str) -> str | None: + """Find the dcgm-exporter pod IP running on a specific node.""" + api = get_core_api() + pods = api.list_pod_for_all_namespaces( + field_selector=f"spec.nodeName={node_name}", + label_selector="app.kubernetes.io/name=dcgm-exporter", + ) + if not pods.items: + # Try broader match. + pods = api.list_pod_for_all_namespaces(field_selector=f"spec.nodeName={node_name}") + for pod in pods.items: + if "dcgm" in pod.metadata.name.lower(): + return pod.status.pod_ip + return None + return pods.items[0].status.pod_ip + + +def _parse_prometheus_text(text: str, metric_filter: str = "") -> dict[str, list[dict]]: + """Parse Prometheus exposition format into {metric_name: [{labels, value}]}.""" + metrics: dict[str, list[dict]] = {} + for line in text.strip().split("\n"): + if line.startswith("#") or not line: + continue + if metric_filter and metric_filter not in line: + continue + # "METRIC_NAME{label=val,...} VALUE" + if "{" in line: + name_part, rest = line.split("{", 1) + labels_str, value_str = rest.rsplit("}", 1) + labels = {} + for pair in labels_str.split(","): + if "=" in pair: + k, v = pair.split("=", 1) + labels[k.strip()] = v.strip().strip('"') + else: + parts = line.split() + if len(parts) < 2: + continue + name_part = parts[0] + value_str = parts[1] + labels = {} + + name = name_part.strip() + try: + value = float(value_str.strip()) + except ValueError: + continue + + metrics.setdefault(name, []).append({"labels": labels, "value": value}) + return metrics + + +async def query_dcgm_direct(node_name: str, metric_filter: str = "") -> str: + """Query dcgm-exporter directly on a node via its pod IP. + + Useful when Prometheus doesn't scrape dcgm-exporter (common on clusters + where DCGM metrics go through OTel instead). + """ + import json + + pod_ip = _find_dcgm_pod_ip(node_name) + if not pod_ip: + return f"No dcgm-exporter pod found on node {node_name}" + + async with httpx.AsyncClient(timeout=10.0) as client: + try: + resp = await client.get(f"http://{pod_ip}:9400/metrics") + resp.raise_for_status() + except Exception as exc: + return f"Failed to query dcgm-exporter at {pod_ip}:9400: {exc}" + + metrics = _parse_prometheus_text(resp.text, metric_filter) + + # If no filter, summarize key metrics only (full output is too large). + if not metric_filter: + key_metrics = [ + "DCGM_FI_DEV_GPU_TEMP", "DCGM_FI_DEV_MEMORY_TEMP", + "DCGM_FI_DEV_POWER_USAGE", "DCGM_FI_DEV_GPU_UTIL", + "DCGM_FI_DEV_SM_CLOCK", "DCGM_FI_DEV_FB_USED", + "DCGM_FI_DEV_ECC_SBE_VOL_TOTAL", "DCGM_FI_DEV_ECC_DBE_VOL_TOTAL", + "DCGM_FI_DEV_ECC_SBE_AGG_TOTAL", "DCGM_FI_DEV_ECC_DBE_AGG_TOTAL", + "DCGM_FI_DEV_RETIRED_SBE", "DCGM_FI_DEV_RETIRED_DBE", + "DCGM_FI_DEV_RETIRED_PENDING", + "DCGM_FI_DEV_XID_ERRORS", "DCGM_FI_DEV_PCIE_REPLAY_COUNTER", + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL", + "DCGM_FI_DEV_THERMAL_VIOLATION", "DCGM_FI_DEV_PSTATE", + ] + metrics = {k: v for k, v in metrics.items() if k in key_metrics} + + text = json.dumps(metrics, indent=2) + return text diff --git a/gcm-sentinel/src/gcm_sentinel/tools/gcm.py b/gcm-sentinel/src/gcm_sentinel/tools/gcm.py new file mode 100644 index 0000000..064bef4 --- /dev/null +++ b/gcm-sentinel/src/gcm_sentinel/tools/gcm.py @@ -0,0 +1,49 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +from __future__ import annotations + +import json + +import httpx + +from gcm_sentinel.k8s import get_core_api + + +async def get_gcm_health(node_name: str, prometheus_url: str) -> str: + """NPD health: problem_gauge/counter metrics + Gcm/NPD node conditions.""" + health: dict = {"node": node_name, "npd_metrics": {}, "conditions": []} + + npd_queries = { + "problem_gauge": f'problem_gauge{{instance=~"{node_name}.*"}}', + "problem_counter": f'problem_counter{{instance=~"{node_name}.*"}}', + } + + async with httpx.AsyncClient(timeout=15.0) as http: + for key, query in npd_queries.items(): + try: + resp = await http.get( + f"{prometheus_url}/api/v1/query", + params={"query": query}, + ) + resp.raise_for_status() + health["npd_metrics"][key] = resp.json().get("data", {}).get("result", []) + except Exception as exc: + health["npd_metrics"][key] = f"error: {exc}" + + try: + api = get_core_api() + node = api.read_node(node_name) + for cond in node.status.conditions or []: + if cond.type.startswith(("Gcm", "NPD")): + health["conditions"].append({ + "type": cond.type, + "status": cond.status, + "reason": cond.reason or "", + "message": cond.message or "", + "lastTransitionTime": str(cond.last_transition_time), + }) + except Exception as exc: + health["conditions_error"] = str(exc) + + text = json.dumps(health, indent=2, default=str) + return text diff --git a/gcm-sentinel/src/gcm_sentinel/tools/kubernetes.py b/gcm-sentinel/src/gcm_sentinel/tools/kubernetes.py new file mode 100644 index 0000000..f02102a --- /dev/null +++ b/gcm-sentinel/src/gcm_sentinel/tools/kubernetes.py @@ -0,0 +1,257 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +from __future__ import annotations + +import json +import logging + +from kubernetes import client + +from gcm_sentinel.k8s import get_core_api + +logger = logging.getLogger(__name__) + + + + +# --------------------------------------------------------------------------- +# Read-only tools +# --------------------------------------------------------------------------- + +async def get_node_info(node_name: str) -> str: + api = get_core_api() + node = api.read_node(node_name) + + info = { + "name": node_name, + "labels": dict(node.metadata.labels or {}), + "annotations": { + k: v for k, v in (node.metadata.annotations or {}).items() + if k.startswith("gcm-sentinel") or k.startswith("node.kubernetes.io/") + }, + "taints": [ + {"key": t.key, "value": t.value or "", "effect": t.effect} + for t in (node.spec.taints or []) + ], + "conditions": [ + { + "type": c.type, "status": c.status, + "reason": c.reason or "", "message": c.message or "", + "lastTransitionTime": str(c.last_transition_time), + } + for c in (node.status.conditions or []) + ], + "allocatable": {k: str(v) for k, v in (node.status.allocatable or {}).items()}, + "capacity": {k: str(v) for k, v in (node.status.capacity or {}).items()}, + "pods": [ + {"name": p.metadata.name, "namespace": p.metadata.namespace, "phase": p.status.phase} + for p in api.list_pod_for_all_namespaces( + field_selector=f"spec.nodeName={node_name}" + ).items[:50] + ], + } + return json.dumps(info, indent=2, default=str) + + +async def get_node_events(node_name: str) -> str: + api = get_core_api() + events = api.list_event_for_all_namespaces( + field_selector=f"involvedObject.name={node_name},involvedObject.kind=Node", + ) + result = [ + { + "type": e.type, "reason": e.reason or "", + "message": (e.message or "")[:300], + "source": e.source.component if e.source else "", + "count": e.count, + "firstTimestamp": str(e.first_timestamp), + "lastTimestamp": str(e.last_timestamp), + } + for e in events.items[-50:] + ] + return json.dumps(result, indent=2, default=str) + + +async def get_pod_logs(node_name: str, pod_name_pattern: str = "", tail_lines: int = 100) -> str: + api = get_core_api() + pods = api.list_pod_for_all_namespaces(field_selector=f"spec.nodeName={node_name}") + monitoring_patterns = ["dcgm", "npd", "node-problem", "gcm", "gpu"] + + results = {} + for pod in pods.items: + name, ns = pod.metadata.name, pod.metadata.namespace + if pod_name_pattern: + if pod_name_pattern.lower() not in name.lower(): + continue + elif not any(p in name.lower() for p in monitoring_patterns): + continue + + try: + container = pod.spec.containers[0].name if pod.spec.containers else None + if not container: + continue + logs = api.read_namespaced_pod_log( + name=name, namespace=ns, container=container, + tail_lines=tail_lines, timestamps=True, + ) + results[f"{ns}/{name}"] = logs[-3000:] + except Exception as exc: + results[f"{ns}/{name}"] = f"error: {exc}" + + if len(results) >= 5: + break + + return json.dumps(results, indent=2) if results else "No matching pods found." + + +async def get_workload_info(node_name: str) -> str: + api = get_core_api() + pods = api.list_pod_for_all_namespaces(field_selector=f"spec.nodeName={node_name}") + system_ns = {"kube-system", "monitoring", "kube-node-lease", "kube-public"} + + workloads = [] + for pod in pods.items: + ns = pod.metadata.namespace + if ns in system_ns: + continue + owner_refs = pod.metadata.owner_references or [] + if any(ref.kind == "DaemonSet" for ref in owner_refs): + continue + + gpu_req = "0" + for c in pod.spec.containers or []: + reqs = (c.resources.requests or {}) if c.resources else {} + if "nvidia.com/gpu" in reqs: + gpu_req = str(reqs["nvidia.com/gpu"]) + if gpu_req == "0": + continue + + container_statuses = [] + for cs in pod.status.container_statuses or []: + s = {"name": cs.name, "ready": cs.ready, "restartCount": cs.restart_count} + if cs.state: + if cs.state.running: + s["state"] = "running" + elif cs.state.waiting: + s["state"] = f"waiting: {cs.state.waiting.reason}" + elif cs.state.terminated: + s["state"] = f"terminated: {cs.state.terminated.reason} (exit {cs.state.terminated.exit_code})" + container_statuses.append(s) + + # Find sibling pods from the same Job owner. + siblings = [] + for ref in owner_refs: + if ref.kind == "Job": + try: + for sp in api.list_namespaced_pod(ns, label_selector=f"job-name={ref.name}").items: + if sp.metadata.name != pod.metadata.name: + siblings.append({"name": sp.metadata.name, "node": sp.spec.node_name, "phase": sp.status.phase}) + except Exception: + pass + break + + workloads.append({ + "pod": pod.metadata.name, "namespace": ns, "phase": pod.status.phase, + "gpu_requests": gpu_req, + "owners": [{"kind": r.kind, "name": r.name} for r in owner_refs], + "labels": dict(pod.metadata.labels or {}), + "container_statuses": container_statuses, + "sibling_pods": siblings[:20], + }) + + return json.dumps(workloads, indent=2, default=str) if workloads else "No GPU workloads found." + + +async def get_workload_logs(node_name: str, tail_lines: int = 200) -> str: + api = get_core_api() + pods = api.list_pod_for_all_namespaces(field_selector=f"spec.nodeName={node_name}") + system_ns = {"kube-system", "monitoring", "kube-node-lease", "kube-public"} + error_patterns = [ + "nccl", "cuda error", "cuda_error", "torch.distributed", "runtimeerror", + "traceback", "nvrm", "xid", "ecc", "fallen off", "sigterm", "sigkill", + "oom", "out of memory", "failed", "error", + ] + + results = {} + for pod in pods.items: + ns = pod.metadata.namespace + if ns in system_ns: + continue + if any(r.kind == "DaemonSet" for r in (pod.metadata.owner_references or [])): + continue + has_gpu = any( + "nvidia.com/gpu" in ((c.resources.requests or {}) if c.resources else {}) + for c in (pod.spec.containers or []) + ) + if not has_gpu: + continue + + try: + container = pod.spec.containers[0].name if pod.spec.containers else None + if not container: + continue + logs = api.read_namespaced_pod_log( + name=pod.metadata.name, namespace=ns, container=container, + tail_lines=tail_lines, timestamps=True, + ) + error_lines = [ln for ln in logs.split("\n") if any(p in ln.lower() for p in error_patterns)] + if error_lines: + results[f"{ns}/{pod.metadata.name}"] = {"error_lines": error_lines[-50:], "total": len(error_lines)} + else: + results[f"{ns}/{pod.metadata.name}"] = {"note": "no errors found", "tail": logs[-1000:]} + except Exception as exc: + results[f"{ns}/{pod.metadata.name}"] = f"error: {exc}" + + if len(results) >= 5: + break + + return json.dumps(results, indent=2, default=str) if results else "No GPU workload pods found." + + +# --------------------------------------------------------------------------- +# Remediation tools +# --------------------------------------------------------------------------- + +async def cordon_node(node_name: str) -> str: + api = get_core_api() + api.patch_node(node_name, {"spec": {"unschedulable": True}}) + logger.info("Cordoned node %s", node_name) + return f"Cordoned node {node_name}" + + +async def drain_node(node_name: str) -> str: + api = get_core_api() + api.patch_node(node_name, {"spec": {"unschedulable": True}}) + + pods = api.list_pod_for_all_namespaces(field_selector=f"spec.nodeName={node_name}") + evicted, skipped = 0, 0 + for pod in pods.items: + if any(r.kind == "DaemonSet" for r in (pod.metadata.owner_references or [])): + skipped += 1 + continue + try: + api.create_namespaced_pod_eviction( + pod.metadata.name, pod.metadata.namespace, + client.V1Eviction( + metadata=client.V1ObjectMeta(name=pod.metadata.name, namespace=pod.metadata.namespace), + delete_options=client.V1DeleteOptions(grace_period_seconds=30), + ), + ) + evicted += 1 + except client.ApiException as exc: + logger.warning("Failed to evict %s/%s: %s", pod.metadata.namespace, pod.metadata.name, exc.reason) + + logger.info("Drained node %s: evicted=%d, skipped_daemonset=%d", node_name, evicted, skipped) + return f"Drained node {node_name}: evicted {evicted} pods, skipped {skipped} DaemonSet pods" + + +async def taint_node(node_name: str, key: str, value: str, effect: str) -> str: + if effect not in ("NoSchedule", "PreferNoSchedule", "NoExecute"): + return f"Invalid effect: {effect}" + api = get_core_api() + node = api.read_node(node_name) + taints = list(node.spec.taints or []) + taints.append(client.V1Taint(key=key, value=value, effect=effect)) + api.patch_node(node_name, {"spec": {"taints": [t.to_dict() for t in taints]}}) + logger.info("Tainted node %s: %s=%s:%s", node_name, key, value, effect) + return f"Tainted node {node_name}: {key}={value}:{effect}" diff --git a/gcm-sentinel/src/gcm_sentinel/tools/node_exporter.py b/gcm-sentinel/src/gcm_sentinel/tools/node_exporter.py new file mode 100644 index 0000000..a69cdd6 --- /dev/null +++ b/gcm-sentinel/src/gcm_sentinel/tools/node_exporter.py @@ -0,0 +1,83 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +"""Query node-exporter directly on a node.""" +from __future__ import annotations + +import json + +import httpx + +from gcm_sentinel.k8s import get_core_api +from gcm_sentinel.tools.dcgm import _parse_prometheus_text + + +def _find_node_exporter_ip(node_name: str) -> str | None: + api = get_core_api() + pods = api.list_pod_for_all_namespaces( + field_selector=f"spec.nodeName={node_name}", + ) + for pod in pods.items: + name = pod.metadata.name.lower() + if "node-exporter" in name or "node_exporter" in name: + return pod.status.pod_ip + return None + + +async def query_node_exporter_direct(node_name: str, metric_filter: str = "") -> str: + """Query node-exporter directly on a node via its pod IP.""" + pod_ip = _find_node_exporter_ip(node_name) + if not pod_ip: + return f"No node-exporter pod found on node {node_name}" + + async with httpx.AsyncClient(timeout=10.0) as client: + try: + resp = await client.get(f"http://{pod_ip}:9100/metrics") + resp.raise_for_status() + except Exception as exc: + return f"Failed to query node-exporter at {pod_ip}:9100: {exc}" + + metrics = _parse_prometheus_text(resp.text, metric_filter) + + if not metric_filter: + # Without a filter, return key host health metrics only. + key_prefixes = [ + "node_hwmon_temp_celsius", "node_cpu_seconds_total", + "node_memory_MemAvailable_bytes", "node_memory_MemTotal_bytes", + "node_load1", "node_load5", "node_load15", + "node_disk_io_time_seconds_total", + "node_filesystem_avail_bytes", + "node_infiniband", # IB metrics if available. + "node_network_receive_errs_total", "node_network_transmit_errs_total", + ] + metrics = {k: v for k, v in metrics.items() if any(k.startswith(p) for p in key_prefixes)} + + text = json.dumps(metrics, indent=2) + return text + + +async def query_infiniband_direct(node_name: str) -> str: + """Query InfiniBand/network metrics from node-exporter on a node.""" + pod_ip = _find_node_exporter_ip(node_name) + if not pod_ip: + return f"No node-exporter pod found on node {node_name}" + + async with httpx.AsyncClient(timeout=10.0) as client: + try: + resp = await client.get(f"http://{pod_ip}:9100/metrics") + resp.raise_for_status() + except Exception as exc: + return f"Failed to query node-exporter at {pod_ip}:9100: {exc}" + + metrics = _parse_prometheus_text(resp.text) + ib_metrics = {k: v for k, v in metrics.items() if "infiniband" in k or "ib_" in k.lower()} + + if not ib_metrics: + # Fall back to general network error metrics. + ib_metrics = {k: v for k, v in metrics.items() + if k.startswith(("node_network_receive_errs", "node_network_transmit_errs", + "node_network_receive_drop", "node_network_transmit_drop"))} + if not ib_metrics: + return "No InfiniBand or network error metrics found on this node." + + text = json.dumps(ib_metrics, indent=2) + return text diff --git a/gcm-sentinel/src/gcm_sentinel/tools/prometheus.py b/gcm-sentinel/src/gcm_sentinel/tools/prometheus.py new file mode 100644 index 0000000..7087d77 --- /dev/null +++ b/gcm-sentinel/src/gcm_sentinel/tools/prometheus.py @@ -0,0 +1,84 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +from __future__ import annotations + +import json +import time + +import httpx + + +def _parse_duration(duration: str) -> float: + units = {"s": 1, "m": 60, "h": 3600, "d": 86400} + if duration and duration[-1].lower() in units: + try: + return float(duration[:-1]) * units[duration[-1].lower()] + except ValueError: + pass + try: + return float(duration) + except (ValueError, TypeError): + return 300.0 + + +async def query_prometheus(prometheus_url: str, query: str, duration: str = "5m") -> str: + async with httpx.AsyncClient(timeout=30.0) as client: + resp = await client.get( + f"{prometheus_url}/api/v1/query", + params={"query": query, "timeout": "15s"}, + ) + resp.raise_for_status() + data = resp.json() + + if data.get("status") != "success": + return f"Prometheus query failed: {data}" + + if not data.get("data", {}).get("result"): + duration_seconds = _parse_duration(duration) + now = time.time() + resp = await client.get( + f"{prometheus_url}/api/v1/query_range", + params={ + "query": query, + "start": str(now - duration_seconds), + "end": str(now), + "step": "60s", + "timeout": "15s", + }, + ) + resp.raise_for_status() + data = resp.json() + + return json.dumps(data.get("data", {}), indent=2) + + +async def query_alertmanager(alertmanager_url: str, node_name: str = "") -> str: + async with httpx.AsyncClient(timeout=15.0) as client: + try: + resp = await client.get(f"{alertmanager_url}/api/v2/alerts") + resp.raise_for_status() + alerts = resp.json() + except Exception as exc: + return f"Alertmanager query failed: {exc}" + + if node_name: + node_labels = ("instance", "node", "nodename", "kubernetes_node") + alerts = [ + a for a in alerts + if any(node_name in str(a.get("labels", {}).get(k, "")) for k in node_labels) + ] + + results = [ + { + "alertname": a.get("labels", {}).get("alertname", ""), + "severity": a.get("labels", {}).get("severity", ""), + "state": a.get("status", {}).get("state", ""), + "labels": a.get("labels", {}), + "startsAt": a.get("startsAt", ""), + "summary": a.get("annotations", {}).get("summary", ""), + "description": a.get("annotations", {}).get("description", "")[:300], + } + for a in alerts[:30] + ] + + return json.dumps(results, indent=2) if results else "No firing alerts found." diff --git a/gcm-sentinel/src/gcm_sentinel/watcher.py b/gcm-sentinel/src/gcm_sentinel/watcher.py new file mode 100644 index 0000000..083a1c1 --- /dev/null +++ b/gcm-sentinel/src/gcm_sentinel/watcher.py @@ -0,0 +1,113 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +"""Watches K8s node conditions and triggers investigation on GPU problems.""" + +from __future__ import annotations + +import asyncio +import fnmatch +import logging +import time + +from kubernetes import client, config, watch + +from gcm_sentinel.config import SentinelConfig +from gcm_sentinel.models import NodeConditionEvent +from gcm_sentinel.notify import annotate_node, notify +from gcm_sentinel.datasources import build_datasources +from gcm_sentinel.engine import run_investigation + +logger = logging.getLogger(__name__) + + +class NodeConditionWatcher: + def __init__(self, cfg: SentinelConfig) -> None: + self.cfg = cfg + self._registry = None + self._condition_states: dict[tuple[str, str], str] = {} + self._last_run_time: dict[tuple[str, str], float] = {} + self._active_tasks: set[asyncio.Task] = set() + + def _init_k8s(self) -> None: + try: + config.load_incluster_config() + except config.ConfigException: + config.load_kube_config() + + def _node_in_scope(self, node_name: str) -> bool: + if not self.cfg.node_allowlist: + return True + return any(fnmatch.fnmatch(node_name, p) for p in self.cfg.node_allowlist) + + def _in_cooldown(self, node_name: str, condition: str) -> bool: + last = self._last_run_time.get((node_name, condition)) + if last is None: + return False + remaining = self.cfg.cooldown_seconds - (time.monotonic() - last) + if remaining > 0: + logger.info("Cooldown: %s/%s (%.0fs left)", node_name, condition, remaining) + return True + return False + + def _detect_changes(self, node_name: str, conditions: list) -> list[NodeConditionEvent]: + events = [] + for cond in conditions: + if cond.type not in self.cfg.watch_conditions: + continue + key = (node_name, cond.type) + prev = self._condition_states.get(key) + if prev != cond.status: + self._condition_states[key] = cond.status + if cond.status == "True": + events.append(NodeConditionEvent( + node_name=node_name, condition_type=cond.type, + status=cond.status, reason=cond.reason or "", message=cond.message or "", + )) + return events + + async def _handle_event(self, event: NodeConditionEvent) -> None: + node, cond = event.node_name, event.condition_type + if not self._node_in_scope(node) or self._in_cooldown(node, cond): + return + + self._last_run_time[(node, cond)] = time.monotonic() + logger.info("Investigation triggered: %s on %s (%s)", cond, node, event.message) + + try: + if self._registry is None: + self._registry = build_datasources(self.cfg) + result = await run_investigation(event, self.cfg, self._registry) + + logger.info("Result for %s: severity=%s action=%s confidence=%.2f", + node, result.severity.value, result.recommended_action.value, result.confidence) + logger.info("Summary: %s", result.summary) + + if self.cfg.can_annotate: + await annotate_node(result) + await notify(result, webhook_url=self.cfg.webhook_url) + + except Exception: + logger.exception("Investigation failed for %s", node) + + async def watch_loop(self) -> None: + self._init_k8s() + api = client.CoreV1Api() + w = watch.Watch() + + logger.info("Watcher started (mode=%s, conditions=%d, allowlist=%s, cooldown=%ds)", + self.cfg.action_mode, len(self.cfg.watch_conditions), + self.cfg.node_allowlist or "(all)", self.cfg.cooldown_seconds) + + while True: + try: + for event in w.stream(api.list_node, timeout_seconds=0): + if event["type"] not in ("ADDED", "MODIFIED"): + continue + node = event["object"] + for change in self._detect_changes(node.metadata.name, node.status.conditions or []): + task = asyncio.create_task(self._handle_event(change)) + self._active_tasks.add(task) + task.add_done_callback(self._active_tasks.discard) + except Exception: + logger.exception("Watch stream error, reconnecting in 5s (%d active investigations)", len(self._active_tasks)) + await asyncio.sleep(5) diff --git a/gcm-sentinel/tests/scenarios/gpu_fallen_off_bus.json b/gcm-sentinel/tests/scenarios/gpu_fallen_off_bus.json new file mode 100644 index 0000000..954769f --- /dev/null +++ b/gcm-sentinel/tests/scenarios/gpu_fallen_off_bus.json @@ -0,0 +1,75 @@ +{ + "name": "GPU Fallen Off Bus", + "description": "GPU 5 is no longer visible to nvidia-smi, likely a PCIe or hardware failure.", + "node_name": "gpu-node-12", + "condition_type": "NPDSmiDisconnectedProblem", + "reason": "NPDSmiDisconnectedProblem", + "message": "nvidia-smi reports only 7/8 GPUs visible. GPU index 5 not responding.", + "prometheus_responses": { + "DCGM_FI_DEV_GPU_TEMP": { + "status": "success", + "data": { + "resultType": "vector", + "result": [ + {"metric": {"gpu": "0", "instance": "gpu-node-12:9400"}, "value": [1700000000, "62"]}, + {"metric": {"gpu": "1", "instance": "gpu-node-12:9400"}, "value": [1700000000, "64"]}, + {"metric": {"gpu": "2", "instance": "gpu-node-12:9400"}, "value": [1700000000, "63"]}, + {"metric": {"gpu": "3", "instance": "gpu-node-12:9400"}, "value": [1700000000, "61"]}, + {"metric": {"gpu": "4", "instance": "gpu-node-12:9400"}, "value": [1700000000, "65"]}, + {"metric": {"gpu": "6", "instance": "gpu-node-12:9400"}, "value": [1700000000, "64"]}, + {"metric": {"gpu": "7", "instance": "gpu-node-12:9400"}, "value": [1700000000, "63"]} + ] + } + }, + "DCGM_FI_DEV_POWER_USAGE": { + "status": "success", + "data": { + "resultType": "vector", + "result": [ + {"metric": {"gpu": "0", "instance": "gpu-node-12:9400"}, "value": [1700000000, "310"]}, + {"metric": {"gpu": "1", "instance": "gpu-node-12:9400"}, "value": [1700000000, "305"]} + ] + } + }, + "DCGM_FI_DEV_XID_ERRORS": { + "status": "success", + "data": { + "resultType": "vector", + "result": [ + {"metric": {"gpu": "4", "instance": "gpu-node-12:9400"}, "value": [1700000000, "79"]} + ] + } + } + }, + "node_info": { + "name": "gpu-node-12", + "labels": {"node.kubernetes.io/instance-type": "p5en.48xlarge", "kubernetes.io/hostname": "gpu-node-12"}, + "taints": [], + "conditions": [ + {"type": "Ready", "status": "True", "reason": "KubeletReady", "message": "kubelet is ready"}, + {"type": "NPDSmiDisconnectedProblem", "status": "True", "reason": "NPDSmiDisconnectedProblem", "message": "nvidia-smi reports only 7/8 GPUs visible. GPU index 5 not responding."} + ], + "allocatable": {"nvidia.com/gpu": "7", "cpu": "192", "memory": "2097152Mi"}, + "capacity": {"nvidia.com/gpu": "8", "cpu": "192", "memory": "2097152Mi"}, + "pods": [ + {"name": "training-job-worker-0", "namespace": "default", "phase": "Running"} + ], + "pod_count": 8 + }, + "gcm_health": { + "node": "gpu-node-12", + "npd_metrics": { + "problem_gauge": [ + {"metric": {"reason": "NPDSmiDisconnectedProblem", "instance": "gpu-node-12:20257"}, "value": [1700000000, "1"]} + ], + "problem_counter": [ + {"metric": {"reason": "NPDSmiDisconnectedProblem", "instance": "gpu-node-12:20257"}, "value": [1700000000, "1"]} + ] + }, + "conditions": [ + {"type": "NPDSmiDisconnectedProblem", "status": "True", "reason": "NPDSmiDisconnectedProblem", "message": "nvidia-smi reports only 7/8 GPUs visible"} + ] + }, + "expected_severity": "critical", + "expected_action": "drain" +} diff --git a/gcm-sentinel/tests/scenarios/hbm_degradation.json b/gcm-sentinel/tests/scenarios/hbm_degradation.json new file mode 100644 index 0000000..b98c70f --- /dev/null +++ b/gcm-sentinel/tests/scenarios/hbm_degradation.json @@ -0,0 +1,89 @@ +{ + "name": "HBM Memory Degradation", + "description": "GPU 3 on the node shows HBM (High Bandwidth Memory) ECC errors accumulating, indicating memory degradation.", + "node_name": "gpu-node-07", + "condition_type": "NPDSmiEccProblem", + "reason": "NPDSmiEccProblem", + "message": "ECC errors detected: gpu3 has 142 volatile uncorrectable ECC errors", + "prometheus_responses": { + "DCGM_FI_DEV_ECC_DBE_VOL_TOTAL": { + "status": "success", + "data": { + "resultType": "vector", + "result": [ + {"metric": {"gpu": "3", "instance": "gpu-node-07:9400", "UUID": "GPU-abc-003"}, "value": [1700000000, "142"]}, + {"metric": {"gpu": "0", "instance": "gpu-node-07:9400", "UUID": "GPU-abc-000"}, "value": [1700000000, "0"]}, + {"metric": {"gpu": "1", "instance": "gpu-node-07:9400", "UUID": "GPU-abc-001"}, "value": [1700000000, "0"]}, + {"metric": {"gpu": "2", "instance": "gpu-node-07:9400", "UUID": "GPU-abc-002"}, "value": [1700000000, "0"]}, + {"metric": {"gpu": "4", "instance": "gpu-node-07:9400", "UUID": "GPU-abc-004"}, "value": [1700000000, "0"]}, + {"metric": {"gpu": "5", "instance": "gpu-node-07:9400", "UUID": "GPU-abc-005"}, "value": [1700000000, "0"]}, + {"metric": {"gpu": "6", "instance": "gpu-node-07:9400", "UUID": "GPU-abc-006"}, "value": [1700000000, "0"]}, + {"metric": {"gpu": "7", "instance": "gpu-node-07:9400", "UUID": "GPU-abc-007"}, "value": [1700000000, "0"]} + ] + } + }, + "DCGM_FI_DEV_ECC_SBE_VOL_TOTAL": { + "status": "success", + "data": { + "resultType": "vector", + "result": [ + {"metric": {"gpu": "3", "instance": "gpu-node-07:9400"}, "value": [1700000000, "8947"]}, + {"metric": {"gpu": "0", "instance": "gpu-node-07:9400"}, "value": [1700000000, "12"]}, + {"metric": {"gpu": "1", "instance": "gpu-node-07:9400"}, "value": [1700000000, "5"]} + ] + } + }, + "DCGM_FI_DEV_GPU_TEMP": { + "status": "success", + "data": { + "resultType": "vector", + "result": [ + {"metric": {"gpu": "3", "instance": "gpu-node-07:9400"}, "value": [1700000000, "78"]}, + {"metric": {"gpu": "0", "instance": "gpu-node-07:9400"}, "value": [1700000000, "65"]} + ] + } + }, + "DCGM_FI_DEV_RETIRED_DBE": { + "status": "success", + "data": { + "resultType": "vector", + "result": [ + {"metric": {"gpu": "3", "instance": "gpu-node-07:9400"}, "value": [1700000000, "7"]} + ] + } + } + }, + "node_info": { + "name": "gpu-node-07", + "labels": {"node.kubernetes.io/instance-type": "p5en.48xlarge", "kubernetes.io/hostname": "gpu-node-07"}, + "taints": [], + "conditions": [ + {"type": "Ready", "status": "True", "reason": "KubeletReady", "message": "kubelet is ready"}, + {"type": "NPDSmiEccProblem", "status": "True", "reason": "NPDSmiEccProblem", "message": "ECC errors detected: gpu3 has 142 volatile uncorrectable ECC errors"}, + {"type": "NPDXidErrorsProblem", "status": "False", "reason": "NPDXidErrorsProblem", "message": "No XID errors"} + ], + "allocatable": {"nvidia.com/gpu": "8", "cpu": "192", "memory": "2097152Mi"}, + "capacity": {"nvidia.com/gpu": "8", "cpu": "192", "memory": "2097152Mi"}, + "pods": [ + {"name": "training-job-worker-3", "namespace": "default", "phase": "Running"}, + {"name": "dcgm-exporter-xyz", "namespace": "monitoring", "phase": "Running"} + ], + "pod_count": 12 + }, + "gcm_health": { + "node": "gpu-node-07", + "npd_metrics": { + "problem_gauge": [ + {"metric": {"reason": "NPDSmiEccProblem", "instance": "gpu-node-07:20257"}, "value": [1700000000, "1"]} + ], + "problem_counter": [ + {"metric": {"reason": "NPDSmiEccProblem", "instance": "gpu-node-07:20257"}, "value": [1700000000, "3"]} + ] + }, + "conditions": [ + {"type": "NPDSmiEccProblem", "status": "True", "reason": "NPDSmiEccProblem", "message": "ECC errors detected: gpu3 has 142 volatile uncorrectable ECC errors"} + ] + }, + "expected_severity": "critical", + "expected_action": "drain" +} diff --git a/gcm-sentinel/tests/scenarios/nvlink_errors.json b/gcm-sentinel/tests/scenarios/nvlink_errors.json new file mode 100644 index 0000000..96350f0 --- /dev/null +++ b/gcm-sentinel/tests/scenarios/nvlink_errors.json @@ -0,0 +1,74 @@ +{ + "name": "NVLink Errors", + "description": "NVLink CRC errors accumulating between GPU 2 and GPU 6, degrading inter-GPU bandwidth.", + "node_name": "gpu-node-21", + "condition_type": "NPDDcgmiNvlinkStatusProblem", + "reason": "NPDDcgmiNvlinkStatusProblem", + "message": "NVLink errors detected: link 2->6 has elevated CRC error rate", + "prometheus_responses": { + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL": { + "status": "success", + "data": { + "resultType": "vector", + "result": [ + {"metric": {"gpu": "2", "instance": "gpu-node-21:9400", "nvlink": "0"}, "value": [1700000000, "54231"]}, + {"metric": {"gpu": "6", "instance": "gpu-node-21:9400", "nvlink": "0"}, "value": [1700000000, "54198"]}, + {"metric": {"gpu": "0", "instance": "gpu-node-21:9400", "nvlink": "0"}, "value": [1700000000, "3"]}, + {"metric": {"gpu": "1", "instance": "gpu-node-21:9400", "nvlink": "0"}, "value": [1700000000, "0"]} + ] + } + }, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL": { + "status": "success", + "data": { + "resultType": "vector", + "result": [ + {"metric": {"gpu": "2", "instance": "gpu-node-21:9400"}, "value": [1700000000, "12000000"]}, + {"metric": {"gpu": "6", "instance": "gpu-node-21:9400"}, "value": [1700000000, "12500000"]}, + {"metric": {"gpu": "0", "instance": "gpu-node-21:9400"}, "value": [1700000000, "48000000"]} + ] + } + }, + "DCGM_FI_DEV_GPU_TEMP": { + "status": "success", + "data": { + "resultType": "vector", + "result": [ + {"metric": {"gpu": "2", "instance": "gpu-node-21:9400"}, "value": [1700000000, "71"]}, + {"metric": {"gpu": "6", "instance": "gpu-node-21:9400"}, "value": [1700000000, "73"]} + ] + } + } + }, + "node_info": { + "name": "gpu-node-21", + "labels": {"node.kubernetes.io/instance-type": "p5en.48xlarge", "kubernetes.io/hostname": "gpu-node-21"}, + "taints": [], + "conditions": [ + {"type": "Ready", "status": "True", "reason": "KubeletReady", "message": "kubelet is ready"}, + {"type": "NPDDcgmiNvlinkStatusProblem", "status": "True", "reason": "NPDDcgmiNvlinkStatusProblem", "message": "NVLink errors detected: link 2->6 has elevated CRC error rate"} + ], + "allocatable": {"nvidia.com/gpu": "8", "cpu": "192", "memory": "2097152Mi"}, + "capacity": {"nvidia.com/gpu": "8", "cpu": "192", "memory": "2097152Mi"}, + "pods": [ + {"name": "nccl-test-worker-1", "namespace": "default", "phase": "Running"} + ], + "pod_count": 10 + }, + "gcm_health": { + "node": "gpu-node-21", + "npd_metrics": { + "problem_gauge": [ + {"metric": {"reason": "NPDDcgmiNvlinkStatusProblem", "instance": "gpu-node-21:20257"}, "value": [1700000000, "1"]} + ], + "problem_counter": [ + {"metric": {"reason": "NPDDcgmiNvlinkStatusProblem", "instance": "gpu-node-21:20257"}, "value": [1700000000, "2"]} + ] + }, + "conditions": [ + {"type": "NPDDcgmiNvlinkStatusProblem", "status": "True", "reason": "NPDDcgmiNvlinkStatusProblem", "message": "NVLink errors detected"} + ] + }, + "expected_severity": "warning", + "expected_action": "cordon" +} diff --git a/gcm-sentinel/tests/scenarios/thermal_throttle.json b/gcm-sentinel/tests/scenarios/thermal_throttle.json new file mode 100644 index 0000000..b5b6c47 --- /dev/null +++ b/gcm-sentinel/tests/scenarios/thermal_throttle.json @@ -0,0 +1,105 @@ +{ + "name": "Thermal Throttle", + "description": "Multiple GPUs on the node are thermal throttling due to cooling failure.", + "node_name": "gpu-node-03", + "condition_type": "NPDDcgmiDiagProblem", + "reason": "NPDDcgmiDiagProblem", + "message": "dcgmi diag detected thermal violations on GPUs 0,1,2,3", + "prometheus_responses": { + "DCGM_FI_DEV_GPU_TEMP": { + "status": "success", + "data": { + "resultType": "vector", + "result": [ + {"metric": {"gpu": "0", "instance": "gpu-node-03:9400"}, "value": [1700000000, "92"]}, + {"metric": {"gpu": "1", "instance": "gpu-node-03:9400"}, "value": [1700000000, "91"]}, + {"metric": {"gpu": "2", "instance": "gpu-node-03:9400"}, "value": [1700000000, "93"]}, + {"metric": {"gpu": "3", "instance": "gpu-node-03:9400"}, "value": [1700000000, "90"]}, + {"metric": {"gpu": "4", "instance": "gpu-node-03:9400"}, "value": [1700000000, "72"]}, + {"metric": {"gpu": "5", "instance": "gpu-node-03:9400"}, "value": [1700000000, "70"]}, + {"metric": {"gpu": "6", "instance": "gpu-node-03:9400"}, "value": [1700000000, "71"]}, + {"metric": {"gpu": "7", "instance": "gpu-node-03:9400"}, "value": [1700000000, "69"]} + ] + } + }, + "DCGM_FI_DEV_THERMAL_VIOLATION": { + "status": "success", + "data": { + "resultType": "vector", + "result": [ + {"metric": {"gpu": "0", "instance": "gpu-node-03:9400"}, "value": [1700000000, "1523456789"]}, + {"metric": {"gpu": "1", "instance": "gpu-node-03:9400"}, "value": [1700000000, "1412345678"]}, + {"metric": {"gpu": "2", "instance": "gpu-node-03:9400"}, "value": [1700000000, "1634567890"]}, + {"metric": {"gpu": "3", "instance": "gpu-node-03:9400"}, "value": [1700000000, "1345678901"]} + ] + } + }, + "DCGM_FI_DEV_POWER_USAGE": { + "status": "success", + "data": { + "resultType": "vector", + "result": [ + {"metric": {"gpu": "0", "instance": "gpu-node-03:9400"}, "value": [1700000000, "250"]}, + {"metric": {"gpu": "1", "instance": "gpu-node-03:9400"}, "value": [1700000000, "245"]}, + {"metric": {"gpu": "2", "instance": "gpu-node-03:9400"}, "value": [1700000000, "255"]}, + {"metric": {"gpu": "3", "instance": "gpu-node-03:9400"}, "value": [1700000000, "248"]}, + {"metric": {"gpu": "4", "instance": "gpu-node-03:9400"}, "value": [1700000000, "320"]}, + {"metric": {"gpu": "5", "instance": "gpu-node-03:9400"}, "value": [1700000000, "315"]} + ] + } + }, + "DCGM_FI_DEV_SM_CLOCK": { + "status": "success", + "data": { + "resultType": "vector", + "result": [ + {"metric": {"gpu": "0", "instance": "gpu-node-03:9400"}, "value": [1700000000, "1200"]}, + {"metric": {"gpu": "1", "instance": "gpu-node-03:9400"}, "value": [1700000000, "1180"]}, + {"metric": {"gpu": "4", "instance": "gpu-node-03:9400"}, "value": [1700000000, "1980"]} + ] + } + }, + "node_hwmon_temp_celsius": { + "status": "success", + "data": { + "resultType": "vector", + "result": [ + {"metric": {"chip": "inlet", "instance": "gpu-node-03:9100"}, "value": [1700000000, "38"]}, + {"metric": {"chip": "outlet", "instance": "gpu-node-03:9100"}, "value": [1700000000, "52"]} + ] + } + } + }, + "node_info": { + "name": "gpu-node-03", + "labels": {"node.kubernetes.io/instance-type": "p5en.48xlarge", "kubernetes.io/hostname": "gpu-node-03"}, + "taints": [], + "conditions": [ + {"type": "Ready", "status": "True", "reason": "KubeletReady", "message": "kubelet is ready"}, + {"type": "NPDDcgmiDiagProblem", "status": "True", "reason": "NPDDcgmiDiagProblem", "message": "dcgmi diag detected thermal violations on GPUs 0,1,2,3"} + ], + "allocatable": {"nvidia.com/gpu": "8", "cpu": "192", "memory": "2097152Mi"}, + "capacity": {"nvidia.com/gpu": "8", "cpu": "192", "memory": "2097152Mi"}, + "pods": [ + {"name": "training-job-worker-5", "namespace": "default", "phase": "Running"}, + {"name": "dcgm-exporter-abc", "namespace": "monitoring", "phase": "Running"} + ], + "pod_count": 11 + }, + "gcm_health": { + "node": "gpu-node-03", + "npd_metrics": { + "problem_gauge": [ + {"metric": {"reason": "NPDDcgmiDiagProblem", "instance": "gpu-node-03:20257"}, "value": [1700000000, "1"]} + ], + "problem_counter": [ + {"metric": {"reason": "NPDDcgmiDiagProblem", "instance": "gpu-node-03:20257"}, "value": [1700000000, "5"]} + ] + }, + "conditions": [ + {"type": "NPDDcgmiDiagProblem", "status": "True", "reason": "NPDDcgmiDiagProblem", "message": "dcgmi diag detected thermal violations"} + ] + }, + "expected_severity": "critical", + "expected_action": "drain" +} diff --git a/gcm-sentinel/tests/scenarios/transient_xid.json b/gcm-sentinel/tests/scenarios/transient_xid.json new file mode 100644 index 0000000..387b3af --- /dev/null +++ b/gcm-sentinel/tests/scenarios/transient_xid.json @@ -0,0 +1,102 @@ +{ + "name": "Transient XID Error", + "description": "A single transient XID 63 error was reported on GPU 1, but the GPU appears healthy otherwise.", + "node_name": "gpu-node-29", + "condition_type": "NPDXidErrorsProblem", + "reason": "NPDXidErrorsProblem", + "message": "XID error 63 detected on GPU 1", + "prometheus_responses": { + "DCGM_FI_DEV_XID_ERRORS": { + "status": "success", + "data": { + "resultType": "vector", + "result": [ + {"metric": {"gpu": "1", "instance": "gpu-node-29:9400"}, "value": [1700000000, "63"]} + ] + } + }, + "DCGM_FI_DEV_ECC_DBE_VOL_TOTAL": { + "status": "success", + "data": { + "resultType": "vector", + "result": [ + {"metric": {"gpu": "0", "instance": "gpu-node-29:9400"}, "value": [1700000000, "0"]}, + {"metric": {"gpu": "1", "instance": "gpu-node-29:9400"}, "value": [1700000000, "0"]}, + {"metric": {"gpu": "2", "instance": "gpu-node-29:9400"}, "value": [1700000000, "0"]} + ] + } + }, + "DCGM_FI_DEV_GPU_TEMP": { + "status": "success", + "data": { + "resultType": "vector", + "result": [ + {"metric": {"gpu": "0", "instance": "gpu-node-29:9400"}, "value": [1700000000, "65"]}, + {"metric": {"gpu": "1", "instance": "gpu-node-29:9400"}, "value": [1700000000, "67"]}, + {"metric": {"gpu": "2", "instance": "gpu-node-29:9400"}, "value": [1700000000, "64"]} + ] + } + }, + "DCGM_FI_DEV_POWER_USAGE": { + "status": "success", + "data": { + "resultType": "vector", + "result": [ + {"metric": {"gpu": "1", "instance": "gpu-node-29:9400"}, "value": [1700000000, "310"]} + ] + } + }, + "DCGM_FI_DEV_SM_CLOCK": { + "status": "success", + "data": { + "resultType": "vector", + "result": [ + {"metric": {"gpu": "1", "instance": "gpu-node-29:9400"}, "value": [1700000000, "1980"]} + ] + } + }, + "DCGM_FI_DEV_ECC_SBE_VOL_TOTAL": { + "status": "success", + "data": { + "resultType": "vector", + "result": [ + {"metric": {"gpu": "1", "instance": "gpu-node-29:9400"}, "value": [1700000000, "2"]} + ] + } + } + }, + "node_info": { + "name": "gpu-node-29", + "labels": {"node.kubernetes.io/instance-type": "p5en.48xlarge", "kubernetes.io/hostname": "gpu-node-29"}, + "taints": [], + "conditions": [ + {"type": "Ready", "status": "True", "reason": "KubeletReady", "message": "kubelet is ready"}, + {"type": "NPDXidErrorsProblem", "status": "True", "reason": "NPDXidErrorsProblem", "message": "XID error 63 detected on GPU 1"}, + {"type": "NPDSmiEccProblem", "status": "False", "reason": "NPDSmiEccProblem", "message": "No ECC errors"}, + {"type": "NPDSmiDisconnectedProblem", "status": "False", "reason": "NPDSmiDisconnectedProblem", "message": "All GPUs visible"} + ], + "allocatable": {"nvidia.com/gpu": "8", "cpu": "192", "memory": "2097152Mi"}, + "capacity": {"nvidia.com/gpu": "8", "cpu": "192", "memory": "2097152Mi"}, + "pods": [ + {"name": "training-job-worker-2", "namespace": "default", "phase": "Running"}, + {"name": "dcgm-exporter-def", "namespace": "monitoring", "phase": "Running"} + ], + "pod_count": 9 + }, + "gcm_health": { + "node": "gpu-node-29", + "npd_metrics": { + "problem_gauge": [ + {"metric": {"reason": "NPDXidErrorsProblem", "instance": "gpu-node-29:20257"}, "value": [1700000000, "1"]} + ], + "problem_counter": [ + {"metric": {"reason": "NPDXidErrorsProblem", "instance": "gpu-node-29:20257"}, "value": [1700000000, "1"]} + ] + }, + "conditions": [ + {"type": "NPDXidErrorsProblem", "status": "True", "reason": "NPDXidErrorsProblem", "message": "XID error 63 detected on GPU 1"} + ] + }, + "expected_severity": "info", + "expected_action": "none" +} diff --git a/gcm-sentinel/tests/test_sentinel.py b/gcm-sentinel/tests/test_sentinel.py new file mode 100644 index 0000000..c8cfde1 --- /dev/null +++ b/gcm-sentinel/tests/test_sentinel.py @@ -0,0 +1,120 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +"""Tests for the critical paths: safety gating, config parsing, scenario validation.""" +from __future__ import annotations + +import json +import time +from pathlib import Path + +import pytest + +from gcm_sentinel.config import DEFAULT_WATCH_CONDITIONS, SentinelConfig +from gcm_sentinel.models import Action, Severity + + +class TestSafety: + """The most important tests: verify remediation tools are gated by action_mode.""" + + def test_recommend_mode_has_no_remediation_tools(self): + from gcm_sentinel.datasources import build_datasources + + registry = build_datasources(SentinelConfig(api_key="test", action_mode="recommend")) + tool_names = {t["name"] for t in registry.get_all_tools()} + assert "cordon_node" not in tool_names + assert "drain_node" not in tool_names + assert "taint_node" not in tool_names + # Read-only tools present. + assert "query_prometheus" in tool_names + assert "get_node_info" in tool_names + + def test_annotate_mode_has_no_remediation_tools(self): + from gcm_sentinel.datasources import build_datasources + + registry = build_datasources(SentinelConfig(api_key="test", action_mode="annotate")) + assert "cordon_node" not in {t["name"] for t in registry.get_all_tools()} + + def test_execute_mode_has_remediation_tools(self): + from gcm_sentinel.datasources import build_datasources + + registry = build_datasources(SentinelConfig(api_key="test", action_mode="execute")) + tool_names = {t["name"] for t in registry.get_all_tools()} + assert "cordon_node" in tool_names + assert "drain_node" in tool_names + assert "taint_node" in tool_names + + @pytest.mark.asyncio + async def test_unknown_tool_blocked_by_registry(self): + from gcm_sentinel.datasources import build_datasources + + registry = build_datasources(SentinelConfig(api_key="test", action_mode="recommend")) + result = await registry.execute("cordon_node", {"node_name": "x"}) + assert "Unknown tool" in result + + def test_invalid_action_mode_rejected(self): + with pytest.raises(ValueError, match="action_mode must be one of"): + SentinelConfig(api_key="test", action_mode="yolo") + + +class TestConfig: + def test_defaults(self): + cfg = SentinelConfig(api_key="test") + assert cfg.llm_api == "anthropic" + assert cfg.action_mode == "recommend" + assert cfg.can_execute is False + assert cfg.can_annotate is False + assert cfg.cooldown_seconds == 3600 + assert len(cfg.watch_conditions) == len(DEFAULT_WATCH_CONDITIONS) + + def test_csv_watch_conditions(self): + cfg = SentinelConfig(api_key="t", watch_conditions="GcmXidErrorsProblem,GcmSmiEccProblem") + assert cfg.watch_conditions == ["GcmXidErrorsProblem", "GcmSmiEccProblem"] + + def test_csv_node_allowlist(self): + cfg = SentinelConfig(api_key="t", node_allowlist="gpu-node-0*,gpu-node-12") + assert cfg.node_allowlist == ["gpu-node-0*", "gpu-node-12"] + + def test_action_mode_properties(self): + assert SentinelConfig(api_key="t", action_mode="recommend").can_execute is False + assert SentinelConfig(api_key="t", action_mode="annotate").can_annotate is True + assert SentinelConfig(api_key="t", action_mode="execute").can_execute is True + + +class TestWatcherSafety: + def test_node_allowlist_filtering(self): + from gcm_sentinel.watcher import NodeConditionWatcher + + watcher = NodeConditionWatcher(SentinelConfig(api_key="t", node_allowlist=["gpu-node-0*", "gpu-node-12"])) + assert watcher._node_in_scope("gpu-node-01") is True + assert watcher._node_in_scope("gpu-node-21") is False + + def test_cooldown(self): + from gcm_sentinel.watcher import NodeConditionWatcher + + watcher = NodeConditionWatcher(SentinelConfig(api_key="t", cooldown_seconds=60)) + assert watcher._in_cooldown("n1", "cond") is False + watcher._last_run_time[("n1", "cond")] = time.monotonic() + assert watcher._in_cooldown("n1", "cond") is True + assert watcher._in_cooldown("n1", "other") is False + + def test_rate_limiter(self): + from gcm_sentinel.datasources.remediation import RemediationDataSource + + ds = RemediationDataSource(SentinelConfig(api_key="t", action_mode="execute", max_actions_per_hour=2)) + assert ds._is_rate_limited() is False + ds._action_timestamps.append(time.monotonic()) + ds._action_timestamps.append(time.monotonic()) + assert ds._is_rate_limited() is True + + +class TestScenarios: + @pytest.mark.parametrize("name", [ + "hbm_degradation", "gpu_fallen_off_bus", "nvlink_errors", + "thermal_throttle", "transient_xid", + ]) + def test_scenario_valid(self, name): + path = Path(__file__).parent / "scenarios" / f"{name}.json" + scenario = json.loads(path.read_text()) + assert scenario["condition_type"] in DEFAULT_WATCH_CONDITIONS + assert scenario["expected_severity"] in [s.value for s in Severity] + assert scenario["expected_action"] in [a.value for a in Action] diff --git a/website/docs/GCM_Health_Checks/kubernetes_deployment.md b/website/docs/GCM_Health_Checks/kubernetes_deployment.md index e9e64eb..96e0337 100644 --- a/website/docs/GCM_Health_Checks/kubernetes_deployment.md +++ b/website/docs/GCM_Health_Checks/kubernetes_deployment.md @@ -96,6 +96,7 @@ kubectl get node \ Health check conditions are standard Kubernetes node conditions. Downstream tools can watch them to automatically remediate unhealthy nodes: +- **[GCM Sentinel](../GCM_Sentinel/getting_started.md)**: AI-powered investigation using Claude. Watches for GCM conditions, investigates by querying Prometheus metrics, node state, and workload logs, then recommends (or executes) remediation actions. See the [Getting Started](../GCM_Sentinel/getting_started.md) guide. - **[Draino](https://github.com/planetlabs/draino)**: watches node conditions and automatically cordons/drains unhealthy nodes - **Cluster autoscalers**: can detect unhealthy nodes and replace them - **Custom controllers**: can implement organization-specific remediation workflows diff --git a/website/docs/GCM_Sentinel/_category_.json b/website/docs/GCM_Sentinel/_category_.json new file mode 100644 index 0000000..d74b556 --- /dev/null +++ b/website/docs/GCM_Sentinel/_category_.json @@ -0,0 +1,8 @@ +{ + "label": "GCM Sentinel", + "position": 5, + "link": { + "type": "generated-index", + "description": "AI-powered GPU cluster sentinel agent (experimental)." + } +} diff --git a/website/docs/GCM_Sentinel/adding_new_datasource.md b/website/docs/GCM_Sentinel/adding_new_datasource.md new file mode 100644 index 0000000..d3a7912 --- /dev/null +++ b/website/docs/GCM_Sentinel/adding_new_datasource.md @@ -0,0 +1,212 @@ +--- +sidebar_position: 4 +--- + +# Adding a New Data Source + +The sentinel agent investigates GPU failures by querying **data sources** — Prometheus metrics, Kubernetes state, pod logs, etc. Each data source is a Python class that subclasses `DataSource` and exposes tools the LLM can call during investigation. + +The built-in data sources cover the common stack (Prometheus, K8s, DCGM, NPD, Alertmanager). You can add your own to give the agent access to any system you have — IPMI sensors, cloud APIs, job schedulers, internal dashboards, etc. + +Adding a data source is one Python file plus one line of registration. This guide walks through a complete example: adding IPMI/BMC sensor queries. + +## 1. Create the data source file + +Create a new file under `src/gcm_sentinel/datasources/`. The naming convention is `.py`. + +```python +# src/gcm_sentinel/datasources/ipmi.py +from __future__ import annotations + +import json +import httpx +from gcm_sentinel.datasources import DataSource + + +class IPMIDataSource(DataSource): + name = "ipmi" + + def is_available(self): + # Return False to auto-skip if prerequisites aren't met. + # The registry will log "Data source 'ipmi' not available, skipping". + return bool(self.cfg.ipmi_url) # Add ipmi_url to SentinelConfig first. + + def get_tools(self): + # Return Anthropic-format tool definitions. These are automatically + # converted for OpenAI if that backend is in use. + return [{ + "name": "get_ipmi_sensors", + "description": "Read IPMI sensor data (temperatures, fan speeds, PSU status) for a node via its BMC.", + "input_schema": { + "type": "object", + "properties": { + "node_name": { + "type": "string", + "description": "Name of the Kubernetes node.", + }, + }, + "required": ["node_name"], + }, + }] + + async def execute(self, tool_name, tool_input): + # Called when the LLM invokes your tool. + node_name = tool_input["node_name"] + bmc_host = f"{node_name}-bmc" # Your BMC naming convention. + + async with httpx.AsyncClient(timeout=10.0) as client: + resp = await client.get( + f"{self.cfg.ipmi_url}/sensors", + params={"host": bmc_host}, + ) + resp.raise_for_status() + data = resp.json() + + # Return a string — this becomes the tool result the LLM sees. + text = json.dumps(data, indent=2) + return text[:8000] # Truncate to avoid filling the context window. + + def get_system_prompt_section(self): + # Optional: add reference information to the system prompt. + # The LLM sees this alongside the investigation guidelines and + # metrics reference, so it knows when/how to use your tool. + return """\ +## IPMI sensor reference + +The `get_ipmi_sensors` tool returns BMC sensor readings: +- Inlet/outlet temperatures (celsius) +- Fan speeds (RPM) +- PSU voltage and status +- CPU temperatures + +Use this to check for cooling failures or power supply issues when +investigating thermal throttling or unexpected GPU shutdowns.""" +``` + +## 2. Add the config field (if needed) + +If your data source needs configuration (URLs, credentials, feature flags), add fields to `SentinelConfig` in `src/gcm_sentinel/config.py`: + +```python +class SentinelConfig(BaseSettings): + model_config = {"env_prefix": "GCM_SENTINEL_"} + + # ... existing fields ... + + ipmi_url: str = Field( + default="", + description="IPMI/BMC API URL. Leave empty to disable.", + ) +``` + +This automatically creates the env var `GCM_SENTINEL_IPMI_URL`. + +## 3. Register the data source + +Add one line to `build_datasources()` in `src/gcm_sentinel/datasources/__init__.py`: + +```python +def build_datasources(cfg: SentinelConfig) -> DataSourceRegistry: + from gcm_sentinel.datasources.alertmanager import AlertmanagerDataSource + from gcm_sentinel.datasources.gcm_health import GCMHealthDataSource + from gcm_sentinel.datasources.ipmi import IPMIDataSource # NEW + from gcm_sentinel.datasources.kubernetes_core import KubernetesCoreDataSource + from gcm_sentinel.datasources.kubernetes_workloads import KubernetesWorkloadsDataSource + from gcm_sentinel.datasources.prometheus import PrometheusDataSource + from gcm_sentinel.datasources.remediation import RemediationDataSource + + registry = DataSourceRegistry(cfg) + registry.register(PrometheusDataSource(cfg)) + registry.register(KubernetesCoreDataSource(cfg)) + registry.register(GCMHealthDataSource(cfg)) + registry.register(KubernetesWorkloadsDataSource(cfg)) + registry.register(AlertmanagerDataSource(cfg)) + registry.register(IPMIDataSource(cfg)) # NEW + registry.register(RemediationDataSource(cfg)) + return registry +``` + +That's it. The investigation engine will automatically: +- Include your tool in the LLM's tool list +- Route tool calls to your `execute()` method +- Include your system prompt section in the investigation context + +## 4. Add Helm values (if needed) + +If your data source has configurable URLs or settings, expose them in `charts/gcm-sentinel/values.yaml` and wire them through `charts/gcm-sentinel/templates/deployment.yaml`: + +```yaml +# values.yaml +ipmi: + url: "" # Set to enable IPMI sensor queries. +``` + +```yaml +# deployment.yaml (in the env section) +{{- if .Values.ipmi.url }} +- name: GCM_SENTINEL_IPMI_URL + value: "{{ .Values.ipmi.url }}" +{{- end }} +``` + +## 5. Test + +Add a test verifying your data source registers correctly and handles errors: + +```python +# tests/test_engine.py +def test_ipmi_skipped_when_url_empty(): + from gcm_sentinel.datasources import build_datasources + + cfg = SentinelConfig(api_key="test") # ipmi_url defaults to "" + registry = build_datasources(cfg) + assert "get_ipmi_sensors" not in {t["name"] for t in registry.get_all_tools()} + +def test_ipmi_registered_when_url_set(): + from gcm_sentinel.datasources import build_datasources + + cfg = SentinelConfig(api_key="test", ipmi_url="http://bmc-api:8080") + registry = build_datasources(cfg) + assert "get_ipmi_sensors" in {t["name"] for t in registry.get_all_tools()} +``` + +## Data source API reference + +Each data source implements the `DataSource` base class: + +| Method | Required | Description | +|---|---|---| +| `name` | Yes | Class attribute. Short identifier (e.g. `"ipmi"`). | +| `get_tools()` | Yes | Return tool definitions (Anthropic format). | +| `execute(tool_name, tool_input)` | Yes | Handle tool calls. Return a string. | +| `is_available()` | No | Return `False` to skip registration. Default: `True`. | +| `get_system_prompt_section()` | No | Return extra system prompt text. Default: `""`. | + +## Built-in data sources + +| Data source | File | Tools | `is_available` condition | +|---|---|---|---| +| `prometheus` | `prometheus.py` | `query_prometheus` | Always | +| `dcgm_direct` | `dcgm_direct.py` | `query_dcgm_direct` | Always | +| `node_direct` | `node_direct.py` | `query_node_exporter_direct`, `query_infiniband_direct` | Always | +| `kubernetes_core` | `kubernetes_core.py` | `get_node_info`, `get_node_events`, `get_pod_logs` | Always | +| `kubernetes_workloads` | `kubernetes_workloads.py` | `get_workload_info`, `get_workload_logs` | Always | +| `gcm_health` | `gcm_health.py` | `get_gcm_health` | Always | +| `alertmanager` | `alertmanager.py` | `query_alertmanager` | `alertmanager_url` is set | +| `remediation` | `remediation.py` | `cordon_node`, `drain_node`, `taint_node` | `action_mode == "execute"` | + +## Ideas for new data sources + +| Name | Data source | What it adds | +|---|---|---| +| `IPMIDataSource` | BMC/IPMI API | Baseboard temps, fan speeds, PSU health | +| `SlurmDataSource` | `squeue`/`sinfo` CLI | Job info on Slurm-native clusters | +| `CloudDataSource` | AWS EC2 / GCP / Azure | Scheduled maintenance, instance health | +| `FabricManagerDataSource` | NVIDIA FM API | NVSwitch topology and health | +| `EFADataSource` | EFA metrics | AWS Elastic Fabric Adapter errors | + +## Tips + +- **Handle errors gracefully**: If your data source is unreachable, return an error string instead of raising. The LLM will adapt. +- **System prompt section**: Keep it concise. The LLM sees this on every investigation — long sections waste tokens. +- **`is_available()`**: Use this to auto-skip when prerequisites aren't met (URL not configured, library not installed). The registry logs "not available, skipping" — no error. diff --git a/website/docs/GCM_Sentinel/architecture.md b/website/docs/GCM_Sentinel/architecture.md new file mode 100644 index 0000000..492453c --- /dev/null +++ b/website/docs/GCM_Sentinel/architecture.md @@ -0,0 +1,72 @@ +--- +sidebar_position: 2 +--- + +# Architecture + +## Data flow + +``` + ┌─────────────────────────────────────┐ + │ gcm-sentinel (Deployment) │ + │ │ + │ NodeConditionWatcher │ + │ ├─ watches K8s nodes via watch API │ +NPD sets condition ─┤ ├─ cooldown / allowlist gates │ + = True │ └─ triggers run_investigation() │ + │ │ │ + │ DataSourceRegistry │ + │ ├─ Prometheus (PromQL) │ + │ ├─ DCGMDirect (dcgm-exporter) │ + │ ├─ NodeDirect (node-exporter, IB) │ + │ ├─ KubernetesCore │ + │ ├─ KubernetesWorkloads │ + │ ├─ GCMHealth │ + │ ├─ Alertmanager │ + │ └─ Remediation (execute mode only) │ + │ └─ rate limit enforced here │ + │ │ │ + │ LLM Backend (Anthropic or OpenAI) │ + │ ├─ tool-use loop (up to 25 rounds) │ + │ └─ returns JSON assessment │ + │ │ │ + │ Output │ + │ ├─ K8s Event on node │ + │ ├─ Webhook (Slack/PagerDuty) │ + │ ├─ Node annotations (annotate mode) │ + │ └─ Structured logs │ + └─────────────────────────────────────┘ +``` + +## Data sources + +Each data source is a class that subclasses `DataSource` and contributes tools (for the LLM to call) and optional system prompt sections. The investigation engine doesn't know which data sources are registered — it just calls `registry.get_all_tools()` and `registry.execute(name, input)`. + +See [Adding a New Data Source](./adding_new_datasource.md) for a step-by-step guide on integrating your own systems. + +## LLM backends + +The `backends/` package abstracts away the differences between LLM APIs: + +| Backend | SDK | Tool-use protocol | +|---|---|---| +| `AnthropicBackend` | `anthropic` | `tool_use` blocks in content, `tool_result` in user messages | +| `OpenAIBackend` | `openai` | `tool_calls` on message, `role=tool` messages | + +Both backends implement the same `LLMBackend` interface. The investigation engine calls `backend.chat()`, `backend.append_assistant_response()`, and `backend.append_tool_results()` without knowing which API is behind it. + +## Condition name compatibility + +GCM Sentinel watches for both OSS and internal condition name formats: + +| OSS (Gcm prefix) | Internal (NPD prefix) | Health check | +|---|---|---| +| `GcmXidErrorsProblem` | `NPDXidErrorsProblem` | `check-syslogs xid` | +| `GcmSmiEccProblem` | `NPDSmiEccProblem` | `check-nvidia-smi ecc` | +| `GcmSmiDisconnectedProblem` | `NPDSmiDisconnectedProblem` | `check-nvidia-smi gpu_num` | +| `GcmProcZombieProblem` | `NPDProcZombieProblem` | `check-process zombie` | +| `GcmDcgmiNvlinkStatusProblem` | `NPDDcgmiNvlinkStatusProblem` | `check-dcgmi nvlink` | +| `GcmDcgmiDiagProblem` | `NPDDcgmiDiagProblem` | `check-dcgmi diag` | +| — | `NPDDcgmRunningProblem` | (internal only) | + +Both prefixes are watched by default. Extra names that don't exist on your cluster are harmless. diff --git a/website/docs/GCM_Sentinel/getting_started.md b/website/docs/GCM_Sentinel/getting_started.md new file mode 100644 index 0000000..9d93bbc --- /dev/null +++ b/website/docs/GCM_Sentinel/getting_started.md @@ -0,0 +1,116 @@ +--- +sidebar_position: 1 +--- + +# Getting Started + +:::caution Experimental +GCM Sentinel is experimental. APIs, configuration, and behavior may change between releases. +::: + +GCM Sentinel is an AI-powered sentinel agent that investigates GPU hardware failures detected by [GCM Health Checks](../GCM_Health_Checks/getting_started.md). It uses an LLM (Claude or GPT) to query Prometheus metrics, Kubernetes state, pod logs, and workload status — then recommends remediation actions. + +## Quick Start + +### Prerequisites + +- An LLM API key ([Anthropic](https://console.anthropic.com/) or [OpenAI](https://platform.openai.com/)) +- For cluster deployment: a Kubernetes GPU cluster with GPU metrics exposed to Prometheus (e.g. via [DCGM-exporter](https://github.com/NVIDIA/dcgm-exporter)) and a node health system that sets K8s node conditions (e.g. [GCM Health Checks](../GCM_Health_Checks/kubernetes_deployment.md)) + +### Try It (No Cluster Needed) + +Run 5 GPU failure scenarios against mock backends: + +```shell +pip install gcm-sentinel +export ANTHROPIC_API_KEY=your-key-here +python -m gcm_sentinel.demo.run +``` + +Or with OpenAI: + +```shell +pip install gcm-sentinel[openai] +export OPENAI_API_KEY=your-key-here +python -m gcm_sentinel.demo.run +``` + +### Deploy to Your Cluster + +```shell +helm install gcm-sentinel ./charts/gcm-sentinel \ + --set llm.apiKey=your-key-here \ + --set prometheus.url=http://prometheus.monitoring:9090 +``` + +Or from source: + +```shell +git clone https://github.com/facebookresearch/gcm.git +cd gcm +helm install gcm-sentinel ./charts/gcm-sentinel \ + --set llm.apiKey=your-key-here +``` + +The agent starts in **recommend mode** (observe-only) by default — zero cluster mutations. + +### One-Shot Investigation (CLI) + +```shell +pip install gcm-sentinel +export GCM_SENTINEL_API_KEY=your-key-here +gcm-sentinel investigate gpu-node-07 --condition GcmSmiEccProblem +``` + +## How It Works + +``` +GCM Health Check detects GPU failure + → NPD sets node condition (e.g. GcmXidErrorsProblem = True) + → gcm-sentinel watcher detects the change + → LLM investigates using tools: + query_prometheus — PromQL queries (whatever Prometheus scrapes) + query_dcgm_direct — GPU metrics directly from dcgm-exporter on the node + query_node_exporter_direct— host metrics directly from node-exporter + query_infiniband_direct — InfiniBand port metrics from node-exporter + get_node_info — K8s node conditions, taints, labels, pods + get_node_events — recent K8s Events + get_pod_logs — dcgm-exporter and NPD diagnostic logs + get_gcm_health — NPD problem gauges and counters + get_workload_info — training job identity, sibling workers + get_workload_logs — NCCL/CUDA error logs from training pods + query_alertmanager — currently firing alerts + → SentinelResult: severity + root_cause + recommended_action + confidence + → Output: K8s Event + webhook (Slack/PagerDuty) + optional node annotation +``` + +## Configuration + +All settings via environment variables (`GCM_SENTINEL_` prefix) or Helm values: + +| Variable | Default | Description | +|---|---|---| +| `API_KEY` | (required) | LLM API key | +| `LLM_API` | `anthropic` | `anthropic` or `openai` | +| `MODEL` | `claude-sonnet-4-6-20250725` | Model name | +| `ACTION_MODE` | `recommend` | `recommend` / `annotate` / `execute` | +| `PROMETHEUS_URL` | kube-prometheus-stack default | Prometheus URL | +| `WEBHOOK_URL` | (empty) | Slack/PagerDuty webhook | +| `WATCH_CONDITIONS` | GCM defaults | Comma-separated conditions | +| `MAX_TOOL_OUTPUT` | `8000` | Max chars per tool result sent to LLM. 0 = unlimited. | + +See the [Safety & Rollout Guide](./safety.md) for the full configuration reference. + +## Works Without GCM + +GCM Sentinel works with **any** system that sets Kubernetes node conditions on GPU failures. Configure your condition names: + +```shell +export GCM_SENTINEL_WATCH_CONDITIONS="MyGPUCheck,CustomXidCondition" +``` + +It also works without automated detection — use the CLI to trigger investigation manually. + +## Adding New Data Sources + +You can give the agent access to any system — IPMI sensors, cloud APIs, job schedulers, internal dashboards — by writing a data source (one Python file + one line of registration). See [Adding a New Data Source](./adding_new_datasource.md). diff --git a/website/docs/GCM_Sentinel/safety.md b/website/docs/GCM_Sentinel/safety.md new file mode 100644 index 0000000..6b01855 --- /dev/null +++ b/website/docs/GCM_Sentinel/safety.md @@ -0,0 +1,174 @@ +--- +sidebar_position: 3 +--- + +# Safety & Rollout Guide + +:::danger Critical +GCM Sentinel can investigate GPU failures on your production cluster. When configured in `execute` mode, it can **cordon and drain nodes**, which evicts running training jobs. Read this page carefully before deploying. +::: + +## Default: Observe-Only + +Out of the box, gcm-sentinel is **completely safe**: + +```shell +# This is observe-only — zero cluster mutations +helm install gcm-sentinel ./charts/gcm-sentinel \ + --set llm.apiKey=your-key-here +``` + +The default `actionMode=recommend` means: +- The agent **cannot** cordon, drain, or taint any node +- The remediation tools are **not even in the LLM's tool schema** — it cannot attempt to call them +- The RBAC ClusterRole is **read-only** — even if something went wrong, K8s won't allow mutations +- The agent only reads metrics, node info, events, and pod logs + +## Five Layers of Defense + +| Layer | What it does | How it works | +|---|---|---| +| **1. Action mode** | Controls whether the LLM even sees remediation tools | `recommend`/`annotate`: tools absent from schema — LLM cannot call them. `execute`: tools present. Enforced in Python before anything reaches the LLM. | +| **2. RBAC** | Kubernetes-level permission enforcement | `recommend`: no `patch`/`eviction` verbs. `annotate`: adds node `patch` only. `execute`: adds `pods/eviction`. Even if code has a bug, K8s rejects unauthorized calls. | +| **3. Rate limiter** | Prevents cascading drain | Max 3 remediation actions per hour. Enforced **before** the K8s API call — if the limit is hit, the tool returns "RATE LIMITED" to the LLM and no mutation happens. | +| **4. Cooldown** | Prevents runaway re-investigation | Same node+condition won't be re-investigated within 1 hour. Stops NPD flapping from burning API credits. | + +## Controlling via Helm + +Every safety parameter is a Helm value and a corresponding environment variable: + +:::tip API Key Security +For production, store the API key in a Kubernetes Secret instead of passing it as plaintext: +```shell +kubectl create secret generic gcm-sentinel-llm --from-literal=api-key=your-key-here +helm install gcm-sentinel ./charts/gcm-sentinel --set llm.existingSecret=gcm-sentinel-llm +``` +::: + +```shell +# Observe-only (DEFAULT — no flag needed) +helm install gcm-sentinel ./charts/gcm-sentinel \ + --set llm.apiKey=your-key-here + +# Annotate mode — writes K8s annotations, no cordon/drain +helm install gcm-sentinel ./charts/gcm-sentinel \ + --set llm.apiKey=your-key-here \ + --set sentinel.actionMode=annotate + +# Execute mode — can cordon/drain, with safety rails +helm install gcm-sentinel ./charts/gcm-sentinel \ + --set llm.apiKey=your-key-here \ + --set sentinel.actionMode=execute \ + --set sentinel.maxActionsPerHour=1 \ + --set sentinel.nodeAllowlist="gpu-node-01" +``` + +### Safety Parameters Reference + +| Helm Value | Env Var | Default | Description | +|---|---|---|---| +| `sentinel.actionMode` | `GCM_SENTINEL_ACTION_MODE` | `recommend` | `recommend` / `annotate` / `execute` | +| `sentinel.cooldownSeconds` | `GCM_SENTINEL_COOLDOWN_SECONDS` | `3600` | Seconds between investigations for same node+condition | +| `sentinel.nodeAllowlist` | `GCM_SENTINEL_NODE_ALLOWLIST` | `""` (all) | Comma-separated node patterns (fnmatch) | +| `sentinel.maxActionsPerHour` | `GCM_SENTINEL_MAX_ACTIONS_PER_HOUR` | `3` | Max remediations per hour (circuit-breaker) | + +### Helm Validation + +Invalid `actionMode` values are rejected at deploy time: + +``` +$ helm install gcm-sentinel ./charts/gcm-sentinel --set sentinel.actionMode=yolo +Error: ... Invalid sentinel.actionMode: "yolo". Must be one of: recommend, annotate, execute +``` + +## Recommended Rollout + +### Week 1-2: Recommend Mode (Observe) + +```shell +helm install gcm-sentinel ./charts/gcm-sentinel \ + --set llm.apiKey=your-key-here \ + --set sentinel.webhookUrl=https://hooks.slack.com/services/... +``` + +- Review investigation results in Slack/PagerDuty +- Compare the agent's assessments with your team's diagnosis +- Check K8s Events: `kubectl get events --field-selector reason=GCMSentinel` + +### Week 3-4: Annotate Mode (Validate) + +```shell +helm upgrade gcm-sentinel ./charts/gcm-sentinel \ + --set sentinel.actionMode=annotate +``` + +- Check node annotations: `kubectl get node -o jsonpath='{.metadata.annotations}' | jq 'with_entries(select(.key | startswith("gcm-sentinel")))'` +- Verify the recommended actions match what you'd do manually +- Optionally build a controller that reads `gcm-sentinel/action` annotations + +### Week 5+: Execute Mode (Targeted) + +```shell +helm upgrade gcm-sentinel ./charts/gcm-sentinel \ + --set sentinel.actionMode=execute \ + --set sentinel.maxActionsPerHour=1 \ + --set sentinel.nodeAllowlist="gpu-node-01,gpu-node-02" +``` + +- Start with 1-2 test nodes via `nodeAllowlist` +- Set `maxActionsPerHour=1` (conservative) +- Monitor K8s Events: `kubectl get events --field-selector reason=GCMSentinel` +- Expand `nodeAllowlist` gradually as you gain confidence + +### Full Auto (Mature) + +```shell +helm upgrade gcm-sentinel ./charts/gcm-sentinel \ + --set sentinel.actionMode=execute \ + --set sentinel.nodeAllowlist="" \ + --set sentinel.maxActionsPerHour=5 +``` + +## Kill Switch + +To immediately disable all remediation without redeploying: + +```shell +# Downgrade to observe-only +helm upgrade gcm-sentinel ./charts/gcm-sentinel \ + --set sentinel.actionMode=recommend + +# Or via env var (restart required, replace RELEASE_NAME with your Helm release name) +kubectl set env deployment/RELEASE_NAME-gcm-sentinel GCM_SENTINEL_ACTION_MODE=recommend +``` + +## Verifying Safety at Runtime + +The agent logs its action mode prominently at startup: + +``` +============================================================ +GCM Sentinel Agent +Action mode: recommend — OBSERVE-ONLY (no cluster mutations) +Node allowlist: (all nodes) +Cooldown: 3600s, Prometheus: http://prometheus.monitoring:9090 +============================================================ +``` + +In execute mode, you'll see a warning: + +``` +============================================================ +GCM Sentinel Agent +Action mode: execute — EXECUTE (can cordon/drain/taint nodes!) +WARNING: EXECUTE MODE ACTIVE — agent can modify cluster state. + Max actions/hour: 1 +Node allowlist: gpu-node-01, gpu-node-02 +============================================================ +``` + +Check the current mode by inspecting the pod logs: + +```shell +kubectl logs deployment/RELEASE_NAME-gcm-sentinel | head -10 +``` diff --git a/website/docs/getting_started.md b/website/docs/getting_started.md index 2b2357f..a5f360b 100644 --- a/website/docs/getting_started.md +++ b/website/docs/getting_started.md @@ -13,6 +13,7 @@ GCM is a monorepo with the following components: - [GCM Monitoring](./GCM_Monitoring/getting_started.md): Continuous data collection, mostly for the [Slurm](https://slurm.schedmd.com/documentation.html) workload scheduler, providing visibility into job performance and resource utilization. - [GCM Health Checks](./GCM_Health_Checks/getting_started.md): Verifies the proper functioning of hardware, software, network, storage, and services throughout the job lifecycle. - [GCM GPU Metrics](./GCM_GPU_Metrics/getting_started.md): Enhances OpenTelemetry data by correlating telemetry with Slurm metadata, enabling attribution of metrics (e.g., GPU utilization) to specific jobs and users. +- [GCM Sentinel](./GCM_Sentinel/getting_started.md): AI-powered GPU failure investigation using Claude. Investigates health check failures like an on-call engineer — querying metrics, node state, and workload logs — then recommends remediation actions. @@ -23,6 +24,7 @@ Each component has their own Getting Started and Contributing Guide: - [Monitoring](./GCM_Monitoring/getting_started.md) - [Health Checks](./GCM_Health_Checks/getting_started.md) - [Telemetry Processor / GPU Metrics](./GCM_GPU_Metrics/getting_started.md) +- [AI Sentinel](./GCM_Sentinel/getting_started.md) ### Contributing