diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 335750a..d580d0f 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -184,11 +184,18 @@ jobs:
             mv "$file" "${file%.deb}-ubuntu${OS_VERSION}.deb"
           done
 
-      - name: Build package (wheel + sdist)
+      - name: Build GCM package (wheel + sdist)
         run: |
           source ~/.cache/venv-ci/bin/activate
           python -m build
 
+      - name: Build Sentinel package (wheel + sdist)
+        run: |
+          source ~/.cache/venv-ci/bin/activate
+          cd gcm-sentinel
+          python -m build
+          cp dist/* ../dist/
+
       - name: Create GitHub Release and upload assets
         if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' && github.repository == 'facebookresearch/gcm' && env.RELEASE_EXISTS == 'false' }}
         uses: softprops/action-gh-release@v2
@@ -269,12 +276,18 @@ jobs:
           username: ${{ github.actor }}
           password: ${{ secrets.GITHUB_TOKEN }}
 
-      - name: Package and push Helm chart
+      - name: Package and push GCM Helm chart
         run: |
           sed -i "s/^version:.*/version: ${VERSION}/" charts/gcm/Chart.yaml
           helm package charts/gcm
           helm push gcm-${VERSION}.tgz oci://ghcr.io/facebookresearch/charts
 
+      - name: Package and push Sentinel Helm chart
+        run: |
+          sed -i "s/^version:.*/version: ${VERSION}/" charts/gcm-sentinel/Chart.yaml
+          helm package charts/gcm-sentinel
+          helm push gcm-sentinel-${VERSION}.tgz oci://ghcr.io/facebookresearch/charts
+
   docker-publish:
     needs: [check-release]
     runs-on: ubuntu-latest
@@ -288,6 +301,7 @@ jobs:
       REGISTRY: ghcr.io
       GCM_IMAGE: ghcr.io/facebookresearch/gcm
       NPD_IMAGE: ghcr.io/facebookresearch/gcm-npd
+      SENTINEL_IMAGE: ghcr.io/facebookresearch/gcm-sentinel
 
     steps:
       - uses: actions/checkout@v4
@@ -331,3 +345,16 @@ jobs:
           docker build -f docker/Dockerfile.npd -t ${{ env.NPD_IMAGE }}:${{ env.VERSION }} -t ${{ env.NPD_IMAGE }}:latest .
           docker push ${{ env.NPD_IMAGE }}:${{ env.VERSION }}
           docker push ${{ env.NPD_IMAGE }}:latest
+
+      - name: Build and push Sentinel image
+        uses: docker/build-push-action@v6
+        with:
+          context: ./gcm-sentinel
+          file: gcm-sentinel/Dockerfile
+          platforms: linux/amd64
+          push: true
+          tags: |
+            ${{ env.SENTINEL_IMAGE }}:${{ env.VERSION }}
+            ${{ env.SENTINEL_IMAGE }}:latest
+          cache-from: type=gha,scope=sentinel
+          cache-to: type=gha,mode=max,scope=sentinel
diff --git a/charts/gcm-sentinel/Chart.yaml b/charts/gcm-sentinel/Chart.yaml
new file mode 100644
index 0000000..0de074e
--- /dev/null
+++ b/charts/gcm-sentinel/Chart.yaml
@@ -0,0 +1,6 @@
+apiVersion: v2
+name: gcm-sentinel
+description: AI-powered GPU cluster sentinel agent
+type: application
+version: 0.0.0
+appVersion: "0.0.0"
diff --git a/charts/gcm-sentinel/templates/_helpers.tpl b/charts/gcm-sentinel/templates/_helpers.tpl
new file mode 100644
index 0000000..3eb5b85
--- /dev/null
+++ b/charts/gcm-sentinel/templates/_helpers.tpl
@@ -0,0 +1,9 @@
+{{/*
+Validate sentinel.actionMode — fail fast on invalid values.
+*/}}
+{{- define "gcm-sentinel.validateActionMode" -}}
+{{- $valid := list "recommend" "annotate" "execute" -}}
+{{- if not (has .Values.sentinel.actionMode $valid) -}}
+{{- fail (printf "Invalid sentinel.actionMode: %q. Must be one of: %s" .Values.sentinel.actionMode (join ", " $valid)) -}}
+{{- end -}}
+{{- end -}}
diff --git a/charts/gcm-sentinel/templates/clusterrole.yaml b/charts/gcm-sentinel/templates/clusterrole.yaml
new file mode 100644
index 0000000..f0e7c7b
--- /dev/null
+++ b/charts/gcm-sentinel/templates/clusterrole.yaml
@@ -0,0 +1,44 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: {{ .Release.Name }}-gcm-sentinel
+  labels:
+    app: gcm-sentinel
+    app.kubernetes.io/name: gcm-sentinel
+    app.kubernetes.io/instance: {{ .Release.Name }}
+rules:
+  - apiGroups: [""]
+    resources: ["nodes"]
+    verbs: ["get", "list", "watch"]
+  - apiGroups: [""]
+    resources: ["nodes/status"]
+    verbs: ["get"]
+  - apiGroups: [""]
+    resources: ["pods"]
+    verbs: ["get", "list"]
+  - apiGroups: [""]
+    resources: ["pods/log"]
+    verbs: ["get"]
+  - apiGroups: ["batch"]
+    resources: ["jobs"]
+    verbs: ["get", "list"]
+  - apiGroups: ["apps"]
+    resources: ["statefulsets", "replicasets"]
+    verbs: ["get", "list"]
+  - apiGroups: [""]
+    resources: ["events"]
+    verbs: ["get", "list", "create", "patch", "update"]
+
+  {{- if or (eq .Values.sentinel.actionMode "annotate") (eq .Values.sentinel.actionMode "execute") }}
+  # Annotate/execute: write annotations on nodes.
+  - apiGroups: [""]
+    resources: ["nodes"]
+    verbs: ["patch"]
+  {{- end }}
+
+  {{- if eq .Values.sentinel.actionMode "execute" }}
+  # Execute: cordon (patch nodes) + drain (evict pods).
+  - apiGroups: [""]
+    resources: ["pods/eviction"]
+    verbs: ["create"]
+  {{- end }}
diff --git a/charts/gcm-sentinel/templates/clusterrolebinding.yaml b/charts/gcm-sentinel/templates/clusterrolebinding.yaml
new file mode 100644
index 0000000..c272428
--- /dev/null
+++ b/charts/gcm-sentinel/templates/clusterrolebinding.yaml
@@ -0,0 +1,16 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: {{ .Release.Name }}-gcm-sentinel
+  labels:
+    app: gcm-sentinel
+    app.kubernetes.io/name: gcm-sentinel
+    app.kubernetes.io/instance: {{ .Release.Name }}
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: {{ .Release.Name }}-gcm-sentinel
+subjects:
+  - kind: ServiceAccount
+    name: {{ .Release.Name }}-gcm-sentinel
+    namespace: {{ .Release.Namespace }}
diff --git a/charts/gcm-sentinel/templates/configmap.yaml b/charts/gcm-sentinel/templates/configmap.yaml
new file mode 100644
index 0000000..19155fd
--- /dev/null
+++ b/charts/gcm-sentinel/templates/configmap.yaml
@@ -0,0 +1,26 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ .Release.Name }}-gcm-sentinel-prompt
+  namespace: {{ .Release.Namespace }}
+  labels:
+    app: gcm-sentinel
+    app.kubernetes.io/name: gcm-sentinel
+    app.kubernetes.io/instance: {{ .Release.Name }}
+data:
+  system-prompt.txt: |
+    You are an expert GPU cluster reliability engineer. A Kubernetes node has reported
+    a GPU-related problem.
+
+    Investigate thoroughly using ALL available tools — explore every data source you
+    have access to before making your assessment. The tools available to you are your
+    eyes and hands into the cluster. Use them extensively.
+
+    For XID error code reference, see: https://docs.nvidia.com/deploy/xid-errors/
+
+    ## Assessment
+
+    Provide your final assessment as JSON:
+    ```json
+    {"severity": "critical|warning|info|ok", "summary": "...", "root_cause": "...", "recommended_action": "cordon|drain|taint|reboot|none", "confidence": 0.0-1.0}
+    ```
diff --git a/charts/gcm-sentinel/templates/deployment.yaml b/charts/gcm-sentinel/templates/deployment.yaml
new file mode 100644
index 0000000..a66062b
--- /dev/null
+++ b/charts/gcm-sentinel/templates/deployment.yaml
@@ -0,0 +1,89 @@
+{{- include "gcm-sentinel.validateActionMode" . -}}
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ .Release.Name }}-gcm-sentinel
+  namespace: {{ .Release.Namespace }}
+  labels:
+    app: gcm-sentinel
+    app.kubernetes.io/name: gcm-sentinel
+    app.kubernetes.io/instance: {{ .Release.Name }}
+spec:
+  replicas: {{ .Values.replicaCount }}
+  selector:
+    matchLabels:
+      app: gcm-sentinel
+      app.kubernetes.io/instance: {{ .Release.Name }}
+  template:
+    metadata:
+      labels:
+        app: gcm-sentinel
+        app.kubernetes.io/instance: {{ .Release.Name }}
+      annotations:
+        checksum/prompt: {{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum }}
+    spec:
+      serviceAccountName: {{ .Release.Name }}-gcm-sentinel
+      {{- with .Values.nodeSelector }}
+      nodeSelector:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.tolerations }}
+      tolerations:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      containers:
+        - name: gcm-sentinel
+          image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
+          imagePullPolicy: {{ .Values.image.pullPolicy }}
+          args:
+            - serve
+            - --log-level
+            - "{{ .Values.logLevel }}"
+          env:
+            - name: GCM_SENTINEL_LLM_API
+              value: "{{ .Values.llm.api }}"
+            - name: GCM_SENTINEL_API_KEY
+              {{- if .Values.llm.existingSecret }}
+              valueFrom:
+                secretKeyRef:
+                  name: {{ .Values.llm.existingSecret }}
+                  key: {{ .Values.llm.secretKey | default "api-key" }}
+              {{- else }}
+              value: "{{ .Values.llm.apiKey }}"
+              {{- end }}
+            - name: GCM_SENTINEL_MODEL
+              value: "{{ .Values.llm.model }}"
+            - name: GCM_SENTINEL_PROMETHEUS_URL
+              value: "{{ .Values.prometheus.url }}"
+            - name: GCM_SENTINEL_ACTION_MODE
+              value: "{{ .Values.sentinel.actionMode }}"
+            - name: GCM_SENTINEL_COOLDOWN_SECONDS
+              value: "{{ .Values.sentinel.cooldownSeconds }}"
+            {{- if .Values.sentinel.nodeAllowlist }}
+            - name: GCM_SENTINEL_NODE_ALLOWLIST
+              value: "{{ .Values.sentinel.nodeAllowlist }}"
+            {{- end }}
+            - name: GCM_SENTINEL_MAX_ACTIONS_PER_HOUR
+              value: "{{ .Values.sentinel.maxActionsPerHour }}"
+            - name: GCM_SENTINEL_MAX_TOOL_ROUNDS
+              value: "{{ .Values.sentinel.maxToolRounds }}"
+            - name: GCM_SENTINEL_MAX_TOOL_OUTPUT
+              value: "{{ .Values.sentinel.maxToolOutput }}"
+            - name: GCM_SENTINEL_SYSTEM_PROMPT_PATH
+              value: "/etc/gcm-sentinel/system-prompt.txt"
+            {{- if .Values.sentinel.webhookUrl }}
+            - name: GCM_SENTINEL_WEBHOOK_URL
+              value: "{{ .Values.sentinel.webhookUrl }}"
+            {{- end }}
+            - name: GCM_SENTINEL_WATCH_CONDITIONS
+              value: '{{ join "," .Values.sentinel.watchConditions }}'
+          resources:
+            {{- toYaml .Values.resources | nindent 12 }}
+          volumeMounts:
+            - name: system-prompt
+              mountPath: /etc/gcm-sentinel
+              readOnly: true
+      volumes:
+        - name: system-prompt
+          configMap:
+            name: {{ .Release.Name }}-gcm-sentinel-prompt
diff --git a/charts/gcm-sentinel/templates/serviceaccount.yaml b/charts/gcm-sentinel/templates/serviceaccount.yaml
new file mode 100644
index 0000000..e813ae1
--- /dev/null
+++ b/charts/gcm-sentinel/templates/serviceaccount.yaml
@@ -0,0 +1,9 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: {{ .Release.Name }}-gcm-sentinel
+  namespace: {{ .Release.Namespace }}
+  labels:
+    app: gcm-sentinel
+    app.kubernetes.io/name: gcm-sentinel
+    app.kubernetes.io/instance: {{ .Release.Name }}
diff --git a/charts/gcm-sentinel/values.yaml b/charts/gcm-sentinel/values.yaml
new file mode 100644
index 0000000..4a2857d
--- /dev/null
+++ b/charts/gcm-sentinel/values.yaml
@@ -0,0 +1,124 @@
+replicaCount: 1
+
+image:
+  repository: ghcr.io/facebookresearch/gcm-sentinel
+  tag: latest
+  pullPolicy: IfNotPresent
+
+# LLM configuration.
+llm:
+  # Provider: "anthropic" (Claude) or "openai" (GPT, vLLM, Ollama, Azure OpenAI).
+  api: "anthropic"
+  # API key — provide ONE of:
+  #   apiKey: "..." (plaintext, for testing only)
+  #   existingSecret: "my-secret" + secretKey: "api-key" (recommended for production)
+  apiKey: ""
+  existingSecret: ""
+  secretKey: "api-key"
+  # Model name/ID.
+  model: "claude-sonnet-4-6-20250725"
+
+# Prometheus endpoint in the cluster.
+prometheus:
+  url: "http://kube-prometheus-stack-prometheus.monitoring:9090"
+
+# ============================================================================
+# SAFETY CONFIGURATION
+#
+# By default, gcm-sentinel is OBSERVE-ONLY. It investigates GPU failures and
+# reports findings but NEVER modifies cluster state (no cordon, no drain,
+# no taint). This is enforced at three levels:
+#
+#   1. actionMode="recommend" — remediation tools are not even in the LLM's
+#      tool schema, so it cannot attempt to call them.
+#   2. RBAC — the ClusterRole only gets node patch/eviction verbs when
+#      actionMode is "annotate" or "execute". In recommend mode, RBAC is
+#      strictly read-only.
+#   3. Rate limits — even in execute mode, actions are gated by confidence
+#      threshold, cooldown, and per-hour rate limits.
+#
+# To enable automated remediation, you must EXPLICITLY set:
+#   --set sentinel.actionMode=execute
+#
+# We strongly recommend running in "recommend" mode for at least 2 weeks
+# to validate the agent's assessments before enabling any mutations.
+# ============================================================================
+
+sentinel:
+  # ACTION MODE — controls whether the agent can modify cluster state.
+  #
+  #   "recommend" (DEFAULT, SAFE) — observe-only. Investigate, log, emit
+  #               K8s Events, send webhooks. Zero cluster mutations.
+  #               RBAC is read-only. The agent cannot cordon/drain/taint.
+  #
+  #   "annotate"  — everything above + write gcm-sentinel/* annotations on
+  #                 the node. A separate controller or human acts on them.
+  #                 RBAC adds node patch. The agent still cannot cordon/drain.
+  #
+  #   "execute"   — everything above + The agent can cordon/drain/taint nodes.
+  #                 ONLY if max_actions_per_hour not exceeded. RBAC adds evictions.
+  #                 USE WITH CAUTION.
+  actionMode: "recommend"
+
+  # Minimum seconds between investigations for the same node+condition.
+  # Prevents runaway investigations from NPD condition flapping.
+  cooldownSeconds: "3600"
+
+  # If non-empty, only investigate nodes matching these patterns (fnmatch glob).
+  # Use to scope rollout to a subset of nodes. Comma-separated.
+  # Example: "gpu-node-01,gpu-node-02,gpu-node-0*"
+  nodeAllowlist: ""
+
+  # Max remediation actions (cordon/drain/taint) per hour across ALL nodes.
+  # Circuit-breaker to prevent cascading drain. Only relevant in execute mode.
+  maxActionsPerHour: "3"
+
+  # Maximum tool-use rounds per investigation.
+  maxToolRounds: "25"
+
+  # Max characters per tool result sent to the LLM. Controls context cost.
+  # 0 = unlimited (LLM sees full output).
+  maxToolOutput: "8000"
+
+  # Webhook URL to POST results to (e.g. Slack incoming webhook).
+  # Leave empty to disable.
+  webhookUrl: ""
+
+  # Node conditions to watch. Defaults include both OSS GCM conditions
+  # (Gcm* prefix) and internal variants (NPD* prefix). Only conditions
+  # that exist on your cluster will trigger — extra names are harmless.
+  watchConditions:
+    # OSS GCM conditions (https://github.com/facebookresearch/gcm)
+    - GcmXidErrorsProblem
+    - GcmSmiEccProblem
+    - GcmSmiDisconnectedProblem
+    - GcmProcZombieProblem
+    - GcmDcgmiNvlinkStatusProblem
+    - GcmDcgmiDiagProblem
+    # Internal variants
+    - NPDXidErrorsProblem
+    - NPDSmiEccProblem
+    - NPDSmiDisconnectedProblem
+    - NPDProcZombieProblem
+    - NPDDcgmiNvlinkStatusProblem
+    - NPDDcgmiDiagProblem
+    - NPDDcgmRunningProblem
+
+# Resource limits.
+resources:
+  requests:
+    cpu: 100m
+    memory: 256Mi
+  limits:
+    cpu: 500m
+    memory: 512Mi
+
+# Log level.
+logLevel: INFO
+
+# Node selector — run on infra/non-GPU nodes so the sentinel agent
+# doesn't consume GPU resources. Set to match your cluster's infra nodes.
+# Example for AWS: node.kubernetes.io/instance-type: m5.4xlarge
+nodeSelector: {}
+
+tolerations: []
diff --git a/gcm-sentinel/Dockerfile b/gcm-sentinel/Dockerfile
new file mode 100644
index 0000000..9047f86
--- /dev/null
+++ b/gcm-sentinel/Dockerfile
@@ -0,0 +1,21 @@
+FROM python:3.11-slim AS builder
+
+WORKDIR /build
+COPY pyproject.toml README.md ./
+COPY src/ src/
+
+RUN pip install --no-cache-dir --prefix=/install .
+
+FROM python:3.11-slim
+
+LABEL org.opencontainers.image.source=https://github.com/facebookresearch/gcm
+
+WORKDIR /app
+
+COPY --from=builder /install /usr/local
+COPY . .
+
+RUN gcm-sentinel --help
+
+ENTRYPOINT ["gcm-sentinel"]
+CMD ["serve"]
diff --git a/gcm-sentinel/README.md b/gcm-sentinel/README.md
new file mode 100644
index 0000000..16fffe9
--- /dev/null
+++ b/gcm-sentinel/README.md
@@ -0,0 +1,139 @@
+# gcm-sentinel
+
+AI-powered GPU cluster sentinel agent. Watches for GPU hardware failures on Kubernetes clusters and uses an LLM (Claude or GPT) to investigate — querying Prometheus metrics, Kubernetes state, pod logs, and training job status — then recommends remediation actions.
+
+## Quick Start
+
+### Try it (no cluster needed)
+
+```bash
+pip install gcm-sentinel
+export ANTHROPIC_API_KEY=your-key-here
+python -m gcm_sentinel.demo.run
+```
+
+This runs 5 GPU failure scenarios (ECC errors, GPU off bus, NVLink, thermal, transient XID) against mock backends. Only an API key is needed.
+
+To use OpenAI instead:
+
+```bash
+pip install gcm-sentinel[openai]
+export OPENAI_API_KEY=your-key-here
+python -m gcm_sentinel.demo.run
+```
+
+### Deploy to a cluster
+
+```bash
+helm install gcm-sentinel ./charts/gcm-sentinel \
+  --set llm.apiKey=your-key-here \
+  --set prometheus.url=http://prometheus.monitoring:9090
+```
+
+The agent starts in **observe-only mode** by default. It investigates GPU failures and sends results to K8s Events and webhooks, but makes **zero cluster mutations**.
+
+### One-shot investigation (CLI)
+
+```bash
+pip install gcm-sentinel
+export GCM_SENTINEL_API_KEY=your-key-here
+gcm-sentinel investigate gpu-node-07 --condition GcmSmiEccProblem
+```
+
+## How It Works
+
+```
+GCM health check detects GPU failure
+  → NPD sets node condition = True (e.g. GcmXidErrorsProblem)
+  → gcm-sentinel watches for condition changes via K8s watch API
+  → LLM investigates using tools (up to 25 rounds):
+      query_prometheus          — PromQL queries (kube-state-metrics, node-exporter, etc.)
+      query_dcgm_direct         — GPU metrics directly from dcgm-exporter on the node
+      query_node_exporter_direct— host metrics directly from node-exporter
+      query_infiniband_direct   — InfiniBand port metrics from node-exporter
+      get_node_info             — conditions, taints, labels, pods
+      get_node_events           — recent K8s Events
+      get_pod_logs              — dcgm-exporter, NPD diagnostic output
+      get_gcm_health            — NPD problem gauges and counters
+      get_workload_info         — training job identity, sibling pods
+      get_workload_logs         — NCCL/CUDA error logs from training pods
+      query_alertmanager        — currently firing alerts
+  → Assessment: severity + root_cause + recommended_action + confidence
+  → Output: K8s Event + webhook (Slack/PagerDuty) + optional node annotation
+```
+
+## Safety
+
+The agent defaults to **observe-only**. It must be explicitly configured to modify cluster state.
+
+| Mode | Behavior | Helm flag |
+|---|---|---|
+| `recommend` (default) | Investigate + report. No mutations. | `--set sentinel.actionMode=recommend` |
+| `annotate` | Above + write `gcm-sentinel/*` annotations on node. | `--set sentinel.actionMode=annotate` |
+| `execute` | Above + can cordon/drain/taint. Gated by confidence + rate limit. | `--set sentinel.actionMode=execute` |
+
+Additional safety:
+- **Cooldown**: 1 hour between investigations for the same node+condition
+- **Node allowlist**: scope to specific nodes (`--set sentinel.nodeAllowlist="gpu-node-01"`)
+- **Rate limit**: max 3 remediation actions per hour (circuit-breaker)
+- **RBAC**: ClusterRole only gets `patch`/`eviction` verbs in annotate/execute modes
+- **Tool removal**: remediation tools are absent from the LLM's schema unless `execute` mode
+
+See [Safety & Rollout Guide](https://facebookresearch.github.io/gcm/docs/GCM_Sentinel/safety) for the full rollout playbook.
+
+## Configuration
+
+All via environment variables (`GCM_SENTINEL_` prefix) or Helm values:
+
+| Variable | Default | Description |
+|---|---|---|
+| `API_KEY` | (required) | LLM API key |
+| `LLM_API` | `anthropic` | `anthropic` or `openai` |
+| `MODEL` | `claude-sonnet-4-6-20250725` | Model name |
+| `PROMETHEUS_URL` | `http://kube-prometheus-stack-prometheus.monitoring:9090` | Prometheus URL |
+| `ALERTMANAGER_URL` | (Alertmanager URL) | Leave empty to disable |
+| `ACTION_MODE` | `recommend` | `recommend` / `annotate` / `execute` |
+| `COOLDOWN_SECONDS` | `3600` | Cooldown per node+condition |
+| `NODE_ALLOWLIST` | (all) | Comma-separated node patterns |
+| `MAX_ACTIONS_PER_HOUR` | `3` | Circuit-breaker |
+| `WATCH_CONDITIONS` | GCM defaults | Comma-separated conditions to watch |
+| `MAX_TOOL_OUTPUT` | `8000` | Max chars per tool result sent to LLM. 0 = unlimited. |
+| `WEBHOOK_URL` | (empty) | Slack/PagerDuty webhook |
+
+All variables use the `GCM_SENTINEL_` prefix (e.g. `GCM_SENTINEL_API_KEY`).
+
+## Watch Conditions
+
+Defaults watch for [GCM health check](https://facebookresearch.github.io/gcm/docs/GCM_Health_Checks/kubernetes_deployment) conditions (`Gcm*` prefix). If you use different NPD condition names:
+
+```bash
+export GCM_SENTINEL_WATCH_CONDITIONS="MyGPUCheck,CustomXidCondition"
+```
+
+## Adding New Data Sources
+
+The agent queries data sources via a plugin system. Each data source is a Python class that exposes tools the LLM can call — Prometheus queries, K8s API calls, HTTP requests to any service.
+
+To give the agent access to a new system (IPMI sensors, cloud APIs, job schedulers, etc.), create a data source file and register it:
+
+```python
+# src/gcm_sentinel/datasources/my_source.py
+from gcm_sentinel.datasources import DataSource
+
+class MyDataSource(DataSource):
+    name = "my_source"
+
+    def get_tools(self):
+        return [{"name": "query_my_source", "description": "...", "input_schema": {...}}]
+
+    async def execute(self, tool_name, tool_input):
+        return await my_query(tool_input["node_name"])
+```
+
+Then add `registry.register(MyDataSource(cfg))` in `datasources/__init__.py`.
+
+See the full [Adding a New Data Source](https://facebookresearch.github.io/gcm/docs/GCM_Sentinel/adding_new_datasource) guide for a complete walkthrough with a real example.
+
+## License
+
+See [LICENSE](../LICENSE).
diff --git a/gcm-sentinel/demo/__init__.py b/gcm-sentinel/demo/__init__.py
new file mode 100644
index 0000000..6d8d1e4
--- /dev/null
+++ b/gcm-sentinel/demo/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+"""Demo mode for gcm-sentinel — runs scenarios against mock backends."""
diff --git a/gcm-sentinel/demo/run.py b/gcm-sentinel/demo/run.py
new file mode 100644
index 0000000..0c2ace5
--- /dev/null
+++ b/gcm-sentinel/demo/run.py
@@ -0,0 +1,126 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+"""Demo: run investigation scenarios against mock data.
+
+Usage: python -m gcm_sentinel.demo.run [--scenario NAME]
+Requires ANTHROPIC_API_KEY or OPENAI_API_KEY to be set.
+"""
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import os
+import sys
+from pathlib import Path
+
+from gcm_sentinel.config import SentinelConfig
+from gcm_sentinel.datasources import DataSource, DataSourceRegistry
+from gcm_sentinel.engine import run_investigation
+from gcm_sentinel.models import NodeConditionEvent
+
+SCENARIOS_DIR = Path(__file__).resolve().parent.parent / "tests" / "scenarios"
+
+
+class MockDataSource(DataSource):
+    """Returns canned data from a scenario JSON file for all tools."""
+    name = "mock"
+
+    def __init__(self, cfg, scenario):
+        super().__init__(cfg)
+        self._scenario = scenario
+
+    def get_tools(self):
+        node_schema = {"type": "object", "properties": {"node_name": {"type": "string"}}, "required": ["node_name"]}
+        return [
+            {"name": "get_node_info", "description": "Node info.", "input_schema": node_schema},
+            {"name": "get_node_events", "description": "Node events.", "input_schema": node_schema},
+            {"name": "get_gcm_health", "description": "GCM/NPD health.", "input_schema": node_schema},
+            {
+                "name": "query_prometheus",
+                "description": "Execute a PromQL query. Use the metrics reference in the system prompt.",
+                "input_schema": {
+                    "type": "object",
+                    "properties": {
+                        "query": {"type": "string"},
+                        "duration": {"type": "string", "default": "5m"},
+                    },
+                    "required": ["query"],
+                },
+            },
+            {"name": "query_dcgm_direct", "description": "GPU metrics from dcgm-exporter.", "input_schema": node_schema},
+        ]
+
+    async def execute(self, tool_name, tool_input):
+        if tool_name == "get_node_info":
+            return json.dumps(self._scenario.get("node_info", {}), indent=2)
+        elif tool_name == "get_gcm_health":
+            return json.dumps(self._scenario.get("gcm_health", {}), indent=2)
+        elif tool_name in ("query_prometheus", "query_dcgm_direct"):
+            query = tool_input.get("query", "")
+            for metric_name, response in self._scenario.get("prometheus_responses", {}).items():
+                if metric_name in query:
+                    return json.dumps(response.get("data", {}), indent=2)
+            return json.dumps({"resultType": "vector", "result": []})
+        return "[]"
+
+    def get_system_prompt_section(self):
+        return ""
+
+
+async def run_scenario(scenario_path: Path, cfg: SentinelConfig) -> None:
+    with open(scenario_path) as f:
+        scenario = json.load(f)
+
+    event = NodeConditionEvent(
+        node_name=scenario["node_name"], condition_type=scenario["condition_type"],
+        status="True", reason=scenario.get("reason", scenario["condition_type"]),
+        message=scenario.get("message", ""),
+    )
+
+    print(f"\n{'='*70}\nSCENARIO: {scenario['name']}\n{'='*70}")
+    print(f"Node: {event.node_name}  Condition: {event.condition_type}")
+    print(f"Expected: severity={scenario.get('expected_severity')}, action={scenario.get('expected_action')}\n")
+
+    registry = DataSourceRegistry(cfg)
+    registry.register(MockDataSource(cfg, scenario))
+
+    try:
+        result = await run_investigation(event, cfg, registry)
+    except Exception as exc:
+        print(f"ERROR: {exc}")
+        return
+
+    print(f"{'='*70}\nRESULT: severity={result.severity.value} action={result.recommended_action.value} confidence={result.confidence:.0%}")
+    print(f"Summary: {result.summary}\nRoot cause: {result.root_cause}\nTool calls: {len(result.tool_calls)}\n{'='*70}\n")
+
+
+def main() -> None:
+    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
+
+    api_key = os.environ.get("ANTHROPIC_API_KEY") or os.environ.get("OPENAI_API_KEY") or os.environ.get("GCM_SENTINEL_API_KEY")
+    if not api_key:
+        print("Set ANTHROPIC_API_KEY, OPENAI_API_KEY, or GCM_SENTINEL_API_KEY")
+        sys.exit(1)
+
+    llm_api = os.environ.get("GCM_SENTINEL_LLM_API", "anthropic")
+    if not os.environ.get("GCM_SENTINEL_LLM_API") and os.environ.get("OPENAI_API_KEY") and not os.environ.get("ANTHROPIC_API_KEY"):
+        llm_api = "openai"
+
+    cfg = SentinelConfig(api_key=api_key, llm_api=llm_api)
+
+    scenario_filter = sys.argv[2] if len(sys.argv) > 2 and sys.argv[1] == "--scenario" else None
+    files = sorted(SCENARIOS_DIR.glob("*.json"))
+    if scenario_filter:
+        files = [f for f in files if scenario_filter in f.stem]
+    if not files:
+        print(f"No scenarios found in {SCENARIOS_DIR}")
+        sys.exit(1)
+
+    for f in files:
+        asyncio.run(run_scenario(f, cfg))
+    print("All scenarios complete.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/gcm-sentinel/pyproject.toml b/gcm-sentinel/pyproject.toml
new file mode 100644
index 0000000..53ffe5a
--- /dev/null
+++ b/gcm-sentinel/pyproject.toml
@@ -0,0 +1,34 @@
+[build-system]
+requires = ["setuptools>=68.0", "setuptools-scm>=8.0"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "gcm-sentinel"
+version = "0.0.0"
+description = "AI-powered GPU cluster sentinel agent"
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+    "anthropic>=0.39.0",
+    "httpx>=0.27.0",
+    "kubernetes>=31.0.0",
+    "pydantic>=2.7.0",
+    "pydantic-settings>=2.5.0",
+    "click>=8.0.0",
+]
+
+[project.optional-dependencies]
+openai = [
+    "openai>=1.0.0",
+]
+dev = [
+    "pytest>=8.0",
+    "pytest-asyncio>=0.24.0",
+    "openai>=1.0.0",
+]
+
+[project.scripts]
+gcm-sentinel = "gcm_sentinel.main:cli"
+
+[tool.setuptools.packages.find]
+where = ["src"]
diff --git a/gcm-sentinel/src/gcm_sentinel/__init__.py b/gcm-sentinel/src/gcm_sentinel/__init__.py
new file mode 100644
index 0000000..84e3ae7
--- /dev/null
+++ b/gcm-sentinel/src/gcm_sentinel/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+"""GCM Sentinel: AI-powered GPU cluster sentinel agent."""
diff --git a/gcm-sentinel/src/gcm_sentinel/backends/__init__.py b/gcm-sentinel/src/gcm_sentinel/backends/__init__.py
new file mode 100644
index 0000000..fa4c9cd
--- /dev/null
+++ b/gcm-sentinel/src/gcm_sentinel/backends/__init__.py
@@ -0,0 +1,47 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+from __future__ import annotations
+
+import dataclasses
+from abc import ABC, abstractmethod
+from typing import Any
+
+
+@dataclasses.dataclass
+class ToolUseRequest:
+    id: str
+    name: str
+    input: dict
+
+
+@dataclasses.dataclass
+class ChatResponse:
+    text_parts: list[str]
+    tool_requests: list[ToolUseRequest]
+    done: bool
+    raw_response: Any  # SDK-specific response for building follow-up messages.
+
+
+class LLMBackend(ABC):
+    @abstractmethod
+    def chat(self, messages: list[dict], system: str, tools: list[dict], max_tokens: int = 4096) -> ChatResponse: ...
+
+    @abstractmethod
+    def append_assistant_response(self, messages: list[dict], response: ChatResponse) -> None: ...
+
+    @abstractmethod
+    def append_tool_results(self, messages: list[dict], response: ChatResponse, results: list[tuple[ToolUseRequest, str]]) -> None: ...
+
+
+def get_backend(api: str, api_key: str, model: str) -> LLMBackend:
+    if api == "anthropic":
+        from gcm_sentinel.backends.anthropic_backend import AnthropicBackend
+        return AnthropicBackend(api_key=api_key, model=model)
+    elif api == "openai":
+        try:
+            from gcm_sentinel.backends.openai_backend import OpenAIBackend
+        except ImportError:
+            raise ImportError("OpenAI backend requires the 'openai' package. Install with: pip install gcm-sentinel[openai]")
+        return OpenAIBackend(api_key=api_key, model=model)
+    else:
+        raise ValueError(f"Unknown LLM API: {api!r}. Must be 'anthropic' or 'openai'.")
diff --git a/gcm-sentinel/src/gcm_sentinel/backends/anthropic_backend.py b/gcm-sentinel/src/gcm_sentinel/backends/anthropic_backend.py
new file mode 100644
index 0000000..48a34fa
--- /dev/null
+++ b/gcm-sentinel/src/gcm_sentinel/backends/anthropic_backend.py
@@ -0,0 +1,40 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+from __future__ import annotations
+
+import anthropic
+
+from gcm_sentinel.backends import ChatResponse, LLMBackend, ToolUseRequest
+
+
+class AnthropicBackend(LLMBackend):
+    def __init__(self, api_key: str, model: str) -> None:
+        self._client = anthropic.Anthropic(api_key=api_key)
+        self._model = model
+
+    def chat(self, messages, system, tools, max_tokens=4096):
+        resp = self._client.messages.create(
+            model=self._model, max_tokens=max_tokens,
+            system=system, tools=tools, messages=messages,
+        )
+        text_parts, tool_requests = [], []
+        for block in resp.content:
+            if block.type == "text":
+                text_parts.append(block.text)
+            elif block.type == "tool_use":
+                tool_requests.append(ToolUseRequest(id=block.id, name=block.name, input=block.input))
+
+        return ChatResponse(
+            text_parts=text_parts, tool_requests=tool_requests,
+            done=resp.stop_reason == "end_turn" or not tool_requests,
+            raw_response=resp,
+        )
+
+    def append_assistant_response(self, messages, response):
+        messages.append({"role": "assistant", "content": response.raw_response.content})
+
+    def append_tool_results(self, messages, response, results):
+        messages.append({"role": "user", "content": [
+            {"type": "tool_result", "tool_use_id": req.id, "content": text}
+            for req, text in results
+        ]})
diff --git a/gcm-sentinel/src/gcm_sentinel/backends/openai_backend.py b/gcm-sentinel/src/gcm_sentinel/backends/openai_backend.py
new file mode 100644
index 0000000..119e29f
--- /dev/null
+++ b/gcm-sentinel/src/gcm_sentinel/backends/openai_backend.py
@@ -0,0 +1,70 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+"""OpenAI backend. Also works with any OpenAI-compatible API (vLLM, Ollama,
+Azure OpenAI) — set OPENAI_BASE_URL env var to point at your endpoint."""
+
+from __future__ import annotations
+
+import json
+
+from openai import OpenAI
+
+from gcm_sentinel.backends import ChatResponse, LLMBackend, ToolUseRequest
+
+
+def _to_openai_tools(tools: list[dict]) -> list[dict]:
+    return [
+        {"type": "function", "function": {
+            "name": t["name"], "description": t.get("description", ""),
+            "parameters": t.get("input_schema", {}),
+        }}
+        for t in tools
+    ]
+
+
+class OpenAIBackend(LLMBackend):
+    def __init__(self, api_key: str, model: str) -> None:
+        self._client = OpenAI(api_key=api_key)
+        self._model = model
+
+    def chat(self, messages, system, tools, max_tokens=4096):
+        full_messages = [{"role": "system", "content": system}] + messages
+        openai_tools = _to_openai_tools(tools)
+
+        resp = self._client.chat.completions.create(
+            model=self._model, messages=full_messages,
+            tools=openai_tools or None, max_tokens=max_tokens,
+        )
+        choice = resp.choices[0]
+        msg = choice.message
+
+        text_parts = [msg.content] if msg.content else []
+        tool_requests = []
+        for tc in msg.tool_calls or []:
+            try:
+                args = json.loads(tc.function.arguments)
+            except (json.JSONDecodeError, TypeError):
+                args = {}
+            tool_requests.append(ToolUseRequest(id=tc.id, name=tc.function.name, input=args))
+
+        return ChatResponse(
+            text_parts=text_parts, tool_requests=tool_requests,
+            done=choice.finish_reason != "tool_calls" or not tool_requests,
+            raw_response=resp,
+        )
+
+    def append_assistant_response(self, messages, response):
+        choice = response.raw_response.choices[0]
+        msg = {"role": "assistant"}
+        if choice.message.content:
+            msg["content"] = choice.message.content
+        if choice.message.tool_calls:
+            msg["tool_calls"] = [
+                {"id": tc.id, "type": "function", "function": {"name": tc.function.name, "arguments": tc.function.arguments}}
+                for tc in choice.message.tool_calls
+            ]
+        messages.append(msg)
+
+    def append_tool_results(self, messages, response, results):
+        for req, text in results:
+            messages.append({"role": "tool", "tool_call_id": req.id, "content": text})
diff --git a/gcm-sentinel/src/gcm_sentinel/config.py b/gcm-sentinel/src/gcm_sentinel/config.py
new file mode 100644
index 0000000..35d7309
--- /dev/null
+++ b/gcm-sentinel/src/gcm_sentinel/config.py
@@ -0,0 +1,80 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+from __future__ import annotations
+
+from pydantic import Field, field_validator
+from pydantic_settings import BaseSettings
+
+DEFAULT_WATCH_CONDITIONS: list[str] = [
+    # OSS GCM conditions (https://github.com/facebookresearch/gcm)
+    "GcmXidErrorsProblem",
+    "GcmSmiEccProblem",
+    "GcmSmiDisconnectedProblem",
+    "GcmProcZombieProblem",
+    "GcmDcgmiNvlinkStatusProblem",
+    "GcmDcgmiDiagProblem",
+    # Internal variants (some deployments use NPD prefix)
+    "NPDXidErrorsProblem",
+    "NPDSmiEccProblem",
+    "NPDSmiDisconnectedProblem",
+    "NPDProcZombieProblem",
+    "NPDDcgmiNvlinkStatusProblem",
+    "NPDDcgmiDiagProblem",
+    "NPDDcgmRunningProblem",
+]
+
+ACTION_MODES = ("recommend", "annotate", "execute")
+
+
+def _parse_csv(v: object) -> list[str]:
+    if isinstance(v, str):
+        return [c.strip() for c in v.split(",") if c.strip()]
+    return v  # type: ignore[return-value]
+
+
+class SentinelConfig(BaseSettings):
+    model_config = {"env_prefix": "GCM_SENTINEL_"}
+
+    llm_api: str = Field(default="anthropic", description="'anthropic' or 'openai'")
+    api_key: str = Field(default="", description="API key for the LLM provider")
+    model: str = Field(default="claude-sonnet-4-6-20250725", description="Model name/ID")
+
+    prometheus_url: str = Field(
+        default="http://kube-prometheus-stack-prometheus.monitoring:9090",
+    )
+    alertmanager_url: str = Field(
+        default="http://kube-prometheus-stack-alertmanager.monitoring:9093",
+        description="Leave empty to disable",
+    )
+
+    action_mode: str = Field(default="recommend")
+    cooldown_seconds: int = Field(default=3600)
+    node_allowlist: list[str] = Field(default_factory=list)
+    max_actions_per_hour: int = Field(default=3)
+
+    watch_conditions: list[str] = Field(
+        default_factory=lambda: list(DEFAULT_WATCH_CONDITIONS),
+    )
+    max_tool_rounds: int = Field(default=25)
+    max_tool_output: int = Field(default=8000, description="Max chars per tool result sent to LLM. 0 = unlimited.")
+    system_prompt_path: str = Field(default="/etc/gcm-sentinel/system-prompt.txt")
+    log_level: str = Field(default="INFO")
+    webhook_url: str = Field(default="")
+
+    _parse_watch_conditions = field_validator("watch_conditions", mode="before")(_parse_csv)
+    _parse_node_allowlist = field_validator("node_allowlist", mode="before")(_parse_csv)
+
+    @field_validator("action_mode")
+    @classmethod
+    def _validate_action_mode(cls, v: str) -> str:
+        if v not in ACTION_MODES:
+            raise ValueError(f"action_mode must be one of {ACTION_MODES}, got '{v}'")
+        return v
+
+    @property
+    def can_execute(self) -> bool:
+        return self.action_mode == "execute"
+
+    @property
+    def can_annotate(self) -> bool:
+        return self.action_mode in ("annotate", "execute")
diff --git a/gcm-sentinel/src/gcm_sentinel/datasources/__init__.py b/gcm-sentinel/src/gcm_sentinel/datasources/__init__.py
new file mode 100644
index 0000000..51ffe0a
--- /dev/null
+++ b/gcm-sentinel/src/gcm_sentinel/datasources/__init__.py
@@ -0,0 +1,93 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+"""Data source plugin system.
+
+Each data source exposes tools the LLM can call during investigation.
+To add a new data source, subclass DataSource and register it in build_datasources().
+"""
+from __future__ import annotations
+
+import logging
+from abc import ABC, abstractmethod
+
+from gcm_sentinel.config import SentinelConfig
+
+logger = logging.getLogger(__name__)
+
+
+class DataSource(ABC):
+    """Base class for data sources."""
+
+    name: str  # Short identifier, set as class attribute.
+
+    def __init__(self, cfg: SentinelConfig) -> None:
+        self.cfg = cfg
+
+    def is_available(self) -> bool:
+        return True
+
+    @abstractmethod
+    def get_tools(self) -> list[dict]: ...
+
+    @abstractmethod
+    async def execute(self, tool_name: str, tool_input: dict) -> str: ...
+
+    def get_system_prompt_section(self) -> str:
+        return ""
+
+
+class DataSourceRegistry:
+    def __init__(self, cfg: SentinelConfig) -> None:
+        self.cfg = cfg
+        self._sources: list[DataSource] = []
+        self._tool_to_source: dict[str, DataSource] = {}
+        self._all_tools: list[dict] = []
+
+    def register(self, ds: DataSource) -> None:
+        if not ds.is_available():
+            logger.info("Data source '%s' not available, skipping", ds.name)
+            return
+        tools = ds.get_tools()
+        for tool in tools:
+            self._tool_to_source[tool["name"]] = ds
+        self._all_tools.extend(tools)
+        self._sources.append(ds)
+        logger.info("Registered data source '%s' (%d tools)", ds.name, len(tools))
+
+    def get_all_tools(self) -> list[dict]:
+        return self._all_tools
+
+    def get_system_prompt_additions(self) -> str:
+        return "\n\n".join(s for ds in self._sources if (s := ds.get_system_prompt_section()))
+
+    async def execute(self, tool_name: str, tool_input: dict) -> str:
+        ds = self._tool_to_source.get(tool_name)
+        if ds is None:
+            return f"Unknown tool: {tool_name}"
+        return await ds.execute(tool_name, tool_input)
+
+    @property
+    def datasource_names(self) -> list[str]:
+        return [ds.name for ds in self._sources]
+
+
+def build_datasources(cfg: SentinelConfig) -> DataSourceRegistry:
+    from gcm_sentinel.datasources.alertmanager import AlertmanagerDataSource
+    from gcm_sentinel.datasources.dcgm_direct import DCGMDirectDataSource
+    from gcm_sentinel.datasources.gcm_health import GCMHealthDataSource
+    from gcm_sentinel.datasources.kubernetes_core import KubernetesCoreDataSource
+    from gcm_sentinel.datasources.node_direct import NodeDirectDataSource
+    from gcm_sentinel.datasources.kubernetes_workloads import KubernetesWorkloadsDataSource
+    from gcm_sentinel.datasources.prometheus import PrometheusDataSource
+    from gcm_sentinel.datasources.remediation import RemediationDataSource
+
+    registry = DataSourceRegistry(cfg)
+    registry.register(PrometheusDataSource(cfg))
+    registry.register(DCGMDirectDataSource(cfg))
+    registry.register(NodeDirectDataSource(cfg))
+    registry.register(KubernetesCoreDataSource(cfg))
+    registry.register(GCMHealthDataSource(cfg))
+    registry.register(KubernetesWorkloadsDataSource(cfg))
+    registry.register(AlertmanagerDataSource(cfg))
+    registry.register(RemediationDataSource(cfg))
+    return registry
diff --git a/gcm-sentinel/src/gcm_sentinel/datasources/alertmanager.py b/gcm-sentinel/src/gcm_sentinel/datasources/alertmanager.py
new file mode 100644
index 0000000..293c602
--- /dev/null
+++ b/gcm-sentinel/src/gcm_sentinel/datasources/alertmanager.py
@@ -0,0 +1,27 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+from __future__ import annotations
+
+from gcm_sentinel.datasources import DataSource
+from gcm_sentinel.tools.prometheus import query_alertmanager
+
+
+class AlertmanagerDataSource(DataSource):
+    name = "alertmanager"
+
+    def is_available(self):
+        return bool(self.cfg.alertmanager_url)
+
+    def get_tools(self):
+        return [{
+            "name": "query_alertmanager",
+            "description": "Currently firing Alertmanager alerts, optionally filtered by node.",
+            "input_schema": {
+                "type": "object",
+                "properties": {"node_name": {"type": "string", "default": ""}},
+                "required": [],
+            },
+        }]
+
+    async def execute(self, tool_name, tool_input):
+        return await query_alertmanager(self.cfg.alertmanager_url, tool_input.get("node_name", ""))
diff --git a/gcm-sentinel/src/gcm_sentinel/datasources/dcgm_direct.py b/gcm-sentinel/src/gcm_sentinel/datasources/dcgm_direct.py
new file mode 100644
index 0000000..9e1cb06
--- /dev/null
+++ b/gcm-sentinel/src/gcm_sentinel/datasources/dcgm_direct.py
@@ -0,0 +1,40 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+"""Query dcgm-exporter directly on GPU nodes.
+
+Falls back gracefully — if dcgm-exporter isn't running on a node,
+the tool returns a clear message and the LLM uses other data sources.
+"""
+from __future__ import annotations
+
+from gcm_sentinel.datasources import DataSource
+from gcm_sentinel.tools.dcgm import query_dcgm_direct
+
+
+class DCGMDirectDataSource(DataSource):
+    name = "dcgm_direct"
+
+    def get_tools(self):
+        return [{
+            "name": "query_dcgm_direct",
+            "description": (
+                "Query GPU metrics directly from the dcgm-exporter pod running on a node. "
+                "Use this when Prometheus doesn't have DCGM metrics, or to get the latest "
+                "real-time GPU data. Optionally filter by metric name substring."
+            ),
+            "input_schema": {
+                "type": "object",
+                "properties": {
+                    "node_name": {"type": "string", "description": "Kubernetes node name"},
+                    "metric_filter": {
+                        "type": "string",
+                        "description": "Only return metrics containing this substring (e.g. 'ECC', 'TEMP', 'XID'). Empty for key metrics summary.",
+                        "default": "",
+                    },
+                },
+                "required": ["node_name"],
+            },
+        }]
+
+    async def execute(self, tool_name, tool_input):
+        return await query_dcgm_direct(tool_input["node_name"], tool_input.get("metric_filter", ""))
diff --git a/gcm-sentinel/src/gcm_sentinel/datasources/gcm_health.py b/gcm-sentinel/src/gcm_sentinel/datasources/gcm_health.py
new file mode 100644
index 0000000..954afe7
--- /dev/null
+++ b/gcm-sentinel/src/gcm_sentinel/datasources/gcm_health.py
@@ -0,0 +1,20 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+from __future__ import annotations
+
+from gcm_sentinel.datasources import DataSource
+from gcm_sentinel.tools.gcm import get_gcm_health
+
+
+class GCMHealthDataSource(DataSource):
+    name = "gcm_health"
+
+    def get_tools(self):
+        return [{
+            "name": "get_gcm_health",
+            "description": "NPD health: problem_gauge/counter metrics + Gcm/NPD node conditions.",
+            "input_schema": {"type": "object", "properties": {"node_name": {"type": "string"}}, "required": ["node_name"]},
+        }]
+
+    async def execute(self, tool_name, tool_input):
+        return await get_gcm_health(tool_input["node_name"], self.cfg.prometheus_url)
diff --git a/gcm-sentinel/src/gcm_sentinel/datasources/kubernetes_core.py b/gcm-sentinel/src/gcm_sentinel/datasources/kubernetes_core.py
new file mode 100644
index 0000000..66f15cf
--- /dev/null
+++ b/gcm-sentinel/src/gcm_sentinel/datasources/kubernetes_core.py
@@ -0,0 +1,40 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+from __future__ import annotations
+
+from gcm_sentinel.datasources import DataSource
+from gcm_sentinel.tools.kubernetes import get_node_events, get_node_info, get_pod_logs
+
+_NODE_SCHEMA = {"type": "object", "properties": {"node_name": {"type": "string"}}, "required": ["node_name"]}
+
+
+class KubernetesCoreDataSource(DataSource):
+    name = "kubernetes_core"
+
+    def get_tools(self):
+        return [
+            {"name": "get_node_info", "description": "Node conditions, taints, labels, annotations, resources, pods.", "input_schema": _NODE_SCHEMA},
+            {"name": "get_node_events", "description": "Recent K8s Events on the node (warnings, errors, investigation history).", "input_schema": _NODE_SCHEMA},
+            {
+                "name": "get_pod_logs",
+                "description": "Logs from monitoring pods (dcgm, NPD, GCM). Use pod_name_pattern to filter.",
+                "input_schema": {
+                    "type": "object",
+                    "properties": {
+                        "node_name": {"type": "string"},
+                        "pod_name_pattern": {"type": "string", "default": ""},
+                        "tail_lines": {"type": "integer", "default": 100},
+                    },
+                    "required": ["node_name"],
+                },
+            },
+        ]
+
+    async def execute(self, tool_name, tool_input):
+        if tool_name == "get_node_info":
+            return await get_node_info(tool_input["node_name"])
+        elif tool_name == "get_node_events":
+            return await get_node_events(tool_input["node_name"])
+        elif tool_name == "get_pod_logs":
+            return await get_pod_logs(tool_input["node_name"], tool_input.get("pod_name_pattern", ""), tool_input.get("tail_lines", 100))
+        return f"Unknown tool: {tool_name}"
diff --git a/gcm-sentinel/src/gcm_sentinel/datasources/kubernetes_workloads.py b/gcm-sentinel/src/gcm_sentinel/datasources/kubernetes_workloads.py
new file mode 100644
index 0000000..8ab898f
--- /dev/null
+++ b/gcm-sentinel/src/gcm_sentinel/datasources/kubernetes_workloads.py
@@ -0,0 +1,35 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+from __future__ import annotations
+
+from gcm_sentinel.datasources import DataSource
+from gcm_sentinel.tools.kubernetes import get_workload_info, get_workload_logs
+
+
+class KubernetesWorkloadsDataSource(DataSource):
+    name = "kubernetes_workloads"
+
+    def get_tools(self):
+        return [
+            {
+                "name": "get_workload_info",
+                "description": "GPU workload identity: job owner, labels, GPU requests, container status, sibling pods.",
+                "input_schema": {"type": "object", "properties": {"node_name": {"type": "string"}}, "required": ["node_name"]},
+            },
+            {
+                "name": "get_workload_logs",
+                "description": "Error logs from GPU workload pods (NCCL, CUDA, XID, OOM patterns).",
+                "input_schema": {
+                    "type": "object",
+                    "properties": {"node_name": {"type": "string"}, "tail_lines": {"type": "integer", "default": 200}},
+                    "required": ["node_name"],
+                },
+            },
+        ]
+
+    async def execute(self, tool_name, tool_input):
+        if tool_name == "get_workload_info":
+            return await get_workload_info(tool_input["node_name"])
+        elif tool_name == "get_workload_logs":
+            return await get_workload_logs(tool_input["node_name"], tool_input.get("tail_lines", 200))
+        return f"Unknown tool: {tool_name}"
diff --git a/gcm-sentinel/src/gcm_sentinel/datasources/node_direct.py b/gcm-sentinel/src/gcm_sentinel/datasources/node_direct.py
new file mode 100644
index 0000000..a9ffbf6
--- /dev/null
+++ b/gcm-sentinel/src/gcm_sentinel/datasources/node_direct.py
@@ -0,0 +1,49 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+"""Query node-exporter and InfiniBand metrics directly on nodes."""
+from __future__ import annotations
+
+from gcm_sentinel.datasources import DataSource
+from gcm_sentinel.tools.node_exporter import query_infiniband_direct, query_node_exporter_direct
+
+_NODE_SCHEMA = {"type": "object", "properties": {"node_name": {"type": "string"}}, "required": ["node_name"]}
+
+
+class NodeDirectDataSource(DataSource):
+    name = "node_direct"
+
+    def get_tools(self):
+        return [
+            {
+                "name": "query_node_exporter_direct",
+                "description": (
+                    "Query host metrics directly from node-exporter on a node. "
+                    "Returns CPU, memory, temps, disk, and network stats. "
+                    "Use metric_filter to narrow (e.g. 'hwmon_temp', 'load', 'infiniband')."
+                ),
+                "input_schema": {
+                    "type": "object",
+                    "properties": {
+                        "node_name": {"type": "string"},
+                        "metric_filter": {"type": "string", "description": "Substring filter", "default": ""},
+                    },
+                    "required": ["node_name"],
+                },
+            },
+            {
+                "name": "query_infiniband_direct",
+                "description": (
+                    "Query InfiniBand port metrics directly from node-exporter. "
+                    "Shows IB link errors, data counters, and port status. "
+                    "Falls back to general network error metrics if no IB found."
+                ),
+                "input_schema": _NODE_SCHEMA,
+            },
+        ]
+
+    async def execute(self, tool_name, tool_input):
+        if tool_name == "query_node_exporter_direct":
+            return await query_node_exporter_direct(tool_input["node_name"], tool_input.get("metric_filter", ""))
+        elif tool_name == "query_infiniband_direct":
+            return await query_infiniband_direct(tool_input["node_name"])
+        return f"Unknown tool: {tool_name}"
diff --git a/gcm-sentinel/src/gcm_sentinel/datasources/prometheus.py b/gcm-sentinel/src/gcm_sentinel/datasources/prometheus.py
new file mode 100644
index 0000000..8eda97a
--- /dev/null
+++ b/gcm-sentinel/src/gcm_sentinel/datasources/prometheus.py
@@ -0,0 +1,32 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+from __future__ import annotations
+
+from gcm_sentinel.datasources import DataSource
+from gcm_sentinel.tools.prometheus import query_prometheus
+
+
+class PrometheusDataSource(DataSource):
+    name = "prometheus"
+
+    def get_tools(self):
+        return [{
+            "name": "query_prometheus",
+            "description": (
+                "Execute a PromQL query against the cluster's Prometheus server. "
+                "Common metric prefixes: DCGM_FI_DEV_* (GPU), node_* (host), "
+                "kube_* (K8s state), problem_* (NPD). "
+                "Filter by node with {instance=~\"NODE.*\"}, by GPU with {gpu=\"IDX\"}."
+            ),
+            "input_schema": {
+                "type": "object",
+                "properties": {
+                    "query": {"type": "string", "description": "PromQL expression"},
+                    "duration": {"type": "string", "description": "Range fallback (e.g. '5m')", "default": "5m"},
+                },
+                "required": ["query"],
+            },
+        }]
+
+    async def execute(self, tool_name, tool_input):
+        return await query_prometheus(self.cfg.prometheus_url, tool_input["query"], tool_input.get("duration", "5m"))
diff --git a/gcm-sentinel/src/gcm_sentinel/datasources/remediation.py b/gcm-sentinel/src/gcm_sentinel/datasources/remediation.py
new file mode 100644
index 0000000..8a73d29
--- /dev/null
+++ b/gcm-sentinel/src/gcm_sentinel/datasources/remediation.py
@@ -0,0 +1,107 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+"""Only registered when action_mode="execute". Tools are absent from
+the LLM's schema in recommend/annotate modes.
+
+All safety checks enforced HERE, before the K8s API call:
+- Rate limiting
+- Node name validation (can only act on the node being investigated)
+- Node name validation (can only act on the node being investigated)
+"""
+from __future__ import annotations
+
+import logging
+import time
+from collections import deque
+
+from gcm_sentinel.datasources import DataSource
+from gcm_sentinel.tools.kubernetes import cordon_node, drain_node, taint_node
+
+logger = logging.getLogger(__name__)
+
+
+class RemediationDataSource(DataSource):
+    name = "remediation"
+
+    def __init__(self, cfg):
+        super().__init__(cfg)
+        self._action_timestamps: deque[float] = deque()
+        self._allowed_node: str | None = None  # Set by the engine before each investigation.
+
+    def is_available(self):
+        return self.cfg.can_execute
+
+    def set_allowed_node(self, node_name: str) -> None:
+        """Restrict remediation to a specific node. Called by the engine."""
+        self._allowed_node = node_name
+
+    def _is_rate_limited(self) -> bool:
+        now = time.monotonic()
+        while self._action_timestamps and (now - self._action_timestamps[0]) > 3600:
+            self._action_timestamps.popleft()
+        return len(self._action_timestamps) >= self.cfg.max_actions_per_hour
+
+    def get_tools(self):
+        return [
+            {
+                "name": "cordon_node",
+                "description": "Mark node unschedulable. Only for confirmed hardware issues.",
+                "input_schema": {"type": "object", "properties": {"node_name": {"type": "string"}}, "required": ["node_name"]},
+            },
+            {
+                "name": "drain_node",
+                "description": "Evict all non-DaemonSet pods + cordon. Only for critical hardware failures.",
+                "input_schema": {"type": "object", "properties": {"node_name": {"type": "string"}}, "required": ["node_name"]},
+            },
+            {
+                "name": "taint_node",
+                "description": "Add a taint to a node.",
+                "input_schema": {
+                    "type": "object",
+                    "properties": {
+                        "node_name": {"type": "string"},
+                        "key": {"type": "string"}, "value": {"type": "string"},
+                        "effect": {"type": "string", "enum": ["NoSchedule", "PreferNoSchedule", "NoExecute"]},
+                    },
+                    "required": ["node_name", "key", "value", "effect"],
+                },
+            },
+        ]
+
+    async def execute(self, tool_name, tool_input):
+        target_node = tool_input.get("node_name", "")
+
+        # Guard: only allow actions on the node being investigated.
+        if self._allowed_node and target_node != self._allowed_node:
+            msg = (
+                f"BLOCKED: {tool_name} rejected — target node '{target_node}' does not match "
+                f"the node being investigated ('{self._allowed_node}'). "
+                f"You can only remediate the node you are investigating."
+            )
+            logger.warning(msg)
+            return msg
+
+        # Guard: rate limit.
+        if self._is_rate_limited():
+            msg = (
+                f"RATE LIMITED: {tool_name} blocked. "
+                f"{self.cfg.max_actions_per_hour} remediation actions per hour limit reached. "
+                f"Include this action in your recommended_action assessment instead."
+            )
+            logger.warning(msg)
+            return msg
+
+        # Execute.
+        if tool_name == "cordon_node":
+            result = await cordon_node(target_node)
+        elif tool_name == "drain_node":
+            result = await drain_node(target_node)
+        elif tool_name == "taint_node":
+            result = await taint_node(target_node, tool_input["key"], tool_input["value"], tool_input["effect"])
+        else:
+            return f"Unknown tool: {tool_name}"
+
+        self._action_timestamps.append(time.monotonic())
+        logger.info("Remediation action %s executed on %s (%d/%d this hour)",
+                     tool_name, target_node, len(self._action_timestamps), self.cfg.max_actions_per_hour)
+        return result
diff --git a/gcm-sentinel/src/gcm_sentinel/engine.py b/gcm-sentinel/src/gcm_sentinel/engine.py
new file mode 100644
index 0000000..83560f8
--- /dev/null
+++ b/gcm-sentinel/src/gcm_sentinel/engine.py
@@ -0,0 +1,175 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+"""Investigation engine — drives an LLM through a tool-use loop."""
+
+from __future__ import annotations
+
+import json
+import logging
+import time
+from pathlib import Path
+
+from gcm_sentinel.backends import get_backend
+from gcm_sentinel.config import SentinelConfig
+from gcm_sentinel.models import Action, NodeConditionEvent, Severity, ToolCall, SentinelResult
+from gcm_sentinel.datasources import DataSourceRegistry, build_datasources
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_SYSTEM_PROMPT = """\
+You are an expert GPU cluster reliability engineer. A Kubernetes node has reported
+a GPU-related problem.
+
+Investigate thoroughly using ALL available tools — explore every data source you
+have access to before making your assessment. The tools available to you are your
+eyes and hands into the cluster. Use them extensively.
+
+For XID error code reference, see: https://docs.nvidia.com/deploy/xid-errors/
+
+## Assessment
+
+Provide your final assessment as JSON:
+```json
+{"severity": "critical|warning|info|ok", "summary": "...", "root_cause": "...", "recommended_action": "cordon|drain|taint|reboot|none", "confidence": 0.0-1.0}
+```
+"""
+
+_MODE_DESCRIPTIONS = {
+    "recommend": "Action mode: RECOMMEND (observe-only). Do NOT attempt cluster mutations.",
+    "annotate": "Action mode: ANNOTATE. Recommendations will be written as node annotations. Do NOT mutate directly.",
+    "execute": "Action mode: EXECUTE. You may cordon/drain/taint nodes for confirmed hardware failures.",
+}
+
+
+def _load_system_prompt(cfg: SentinelConfig, registry: DataSourceRegistry) -> str:
+    path = Path(cfg.system_prompt_path)
+    base = path.read_text() if path.is_file() else DEFAULT_SYSTEM_PROMPT
+    additions = registry.get_system_prompt_additions()
+    return f"{base}\n\n{additions}" if additions else base
+
+
+def _parse_assessment(text: str) -> dict:
+    # Strip markdown fences.
+    cleaned = text
+    if "```json" in cleaned:
+        cleaned = cleaned.split("```json")[-1].split("```")[0]
+    elif "```" in cleaned:
+        parts = cleaned.split("```")
+        if len(parts) >= 3:
+            cleaned = parts[-2]
+
+    # Find JSON object with assessment fields.
+    target_keys = {"severity", "recommended_action", "confidence"}
+    depth, start = 0, -1
+    for i, ch in enumerate(cleaned):
+        if ch == "{":
+            if depth == 0:
+                start = i
+            depth += 1
+        elif ch == "}":
+            depth -= 1
+            if depth == 0 and start >= 0:
+                try:
+                    parsed = json.loads(cleaned[start:i + 1])
+                    if isinstance(parsed, dict) and target_keys & parsed.keys():
+                        return parsed
+                except json.JSONDecodeError:
+                    pass
+                start = -1
+    return {}
+
+
+async def run_investigation(
+    event: NodeConditionEvent,
+    cfg: SentinelConfig,
+    registry: DataSourceRegistry | None = None,
+) -> SentinelResult:
+    if registry is None:
+        registry = build_datasources(cfg)
+
+    # Restrict remediation tools to the node being investigated.
+    from gcm_sentinel.datasources.remediation import RemediationDataSource
+    for ds in registry._sources:
+        if isinstance(ds, RemediationDataSource):
+            ds.set_allowed_node(event.node_name)
+
+    start_time = time.monotonic()
+    system_prompt = _load_system_prompt(cfg, registry)
+    tools = registry.get_all_tools()
+    backend = get_backend(cfg.llm_api, cfg.api_key, cfg.model)
+    tool_calls_log: list[ToolCall] = []
+    text_parts: list[str] = []
+
+    logger.info("Investigating %s/%s via %s/%s (%d tools)", event.node_name, event.condition_type, cfg.llm_api, cfg.model, len(tools))
+
+    messages: list[dict] = [{"role": "user", "content": (
+        f"Node condition change detected:\n\n"
+        f"- **Node**: {event.node_name}\n- **Condition**: {event.condition_type}\n"
+        f"- **Status**: {event.status}\n- **Message**: {event.message}\n"
+        f"- **Time**: {event.timestamp}\n\n{_MODE_DESCRIPTIONS[cfg.action_mode]}\n\n"
+        f"Investigate and provide your assessment."
+    )}]
+
+    investigation_parts = [f"== Investigation for {event.node_name} =="]
+
+    for round_num in range(1, cfg.max_tool_rounds + 1):
+        response = backend.chat(messages=messages, system=system_prompt, tools=tools)
+        text_parts = response.text_parts
+        investigation_parts.extend(text_parts)
+
+        if response.done:
+            break
+
+        backend.append_assistant_response(messages, response)
+        results = []
+        for req in response.tool_requests:
+            investigation_parts.append(f"\n> {req.name}({json.dumps(req.input, default=str)[:200]})")
+            try:
+                result = await registry.execute(req.name, req.input)
+            except Exception as exc:
+                result = f"Tool error: {type(exc).__name__}: {exc}"
+                logger.exception("Tool %s failed", req.name)
+
+            # Truncate if configured (controls LLM context cost).
+            llm_result = result
+            if cfg.max_tool_output and len(result) > cfg.max_tool_output:
+                llm_result = result[:cfg.max_tool_output] + (
+                    f"\n\n[OUTPUT TRUNCATED: showing {cfg.max_tool_output} of {len(result)} chars. "
+                    f"Call the tool again with a more specific query to get the data you need.]"
+                )
+
+            tool_calls_log.append(ToolCall(tool_name=req.name, tool_input=req.input, result=result))
+            investigation_parts.append(f"Result: {result}")
+            results.append((req, llm_result))
+
+        backend.append_tool_results(messages, response, results)
+
+    final_text = "\n".join(text_parts)
+    assessment = _parse_assessment(final_text)
+
+    logger.info("Investigation complete for %s in %.1fs (%d tool calls)", event.node_name, time.monotonic() - start_time, len(tool_calls_log))
+
+    try:
+        severity = Severity(assessment.get("severity", "warning"))
+    except ValueError:
+        severity = Severity.WARNING
+    try:
+        action = Action(assessment.get("recommended_action", "none"))
+    except ValueError:
+        action = Action.NONE
+    try:
+        confidence = max(0.0, min(1.0, float(assessment.get("confidence", 0.5))))
+    except (ValueError, TypeError):
+        confidence = 0.5
+
+    return SentinelResult(
+        node_name=event.node_name,
+        condition=event.condition_type,
+        severity=severity,
+        summary=assessment.get("summary", final_text[:500]),
+        root_cause=assessment.get("root_cause", "Unknown"),
+        recommended_action=action,
+        confidence=confidence,
+        tool_calls=tool_calls_log,
+        investigation_log="\n".join(investigation_parts),
+    )
diff --git a/gcm-sentinel/src/gcm_sentinel/k8s.py b/gcm-sentinel/src/gcm_sentinel/k8s.py
new file mode 100644
index 0000000..17e25aa
--- /dev/null
+++ b/gcm-sentinel/src/gcm_sentinel/k8s.py
@@ -0,0 +1,13 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+"""Shared Kubernetes client initialization."""
+
+from kubernetes import client, config
+
+
+def get_core_api() -> client.CoreV1Api:
+    try:
+        config.load_incluster_config()
+    except config.ConfigException:
+        config.load_kube_config()
+    return client.CoreV1Api()
diff --git a/gcm-sentinel/src/gcm_sentinel/main.py b/gcm-sentinel/src/gcm_sentinel/main.py
new file mode 100644
index 0000000..cd62a2e
--- /dev/null
+++ b/gcm-sentinel/src/gcm_sentinel/main.py
@@ -0,0 +1,100 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+"""Main entry point for the GCM sentinel agent.
+
+The agent is a pure watcher process — no listening ports, no HTTP server.
+It watches K8s node conditions and triggers investigation when a GPU problem is
+detected. Results go to logs, K8s Events, webhooks, and node annotations.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+
+import click
+
+from gcm_sentinel.config import SentinelConfig
+from gcm_sentinel.watcher import NodeConditionWatcher
+
+logger = logging.getLogger(__name__)
+
+
+@click.group()
+def cli():
+    """GCM Sentinel — AI-powered GPU cluster sentinel agent."""
+    pass
+
+
+@cli.command()
+@click.option("--log-level", default="INFO", help="Log level")
+def serve(log_level: str):
+    """Start the sentinel agent (watches for GPU failures, triggers investigation)."""
+    logging.basicConfig(
+        level=getattr(logging, log_level.upper()),
+        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
+    )
+    cfg = SentinelConfig()
+
+    # Print safety status prominently at startup.
+    mode_banner = {
+        "recommend": "OBSERVE-ONLY (no cluster mutations)",
+        "annotate": "ANNOTATE (writes node annotations, no cordon/drain)",
+        "execute": "EXECUTE (can cordon/drain/taint nodes!)",
+    }
+    logger.info("=" * 60)
+    logger.info("GCM Sentinel Agent")
+    logger.info("Action mode: %s — %s", cfg.action_mode, mode_banner[cfg.action_mode])
+    if cfg.action_mode == "execute":
+        logger.warning(
+            "EXECUTE MODE ACTIVE — agent can modify cluster state. "
+            "Max actions/hour: %d",
+            cfg.max_actions_per_hour,
+        )
+    if cfg.node_allowlist:
+        logger.info("Node allowlist: %s", cfg.node_allowlist)
+    else:
+        logger.info("Node allowlist: (all nodes)")
+    logger.info("Cooldown: %ds, Prometheus: %s", cfg.cooldown_seconds, cfg.prometheus_url)
+    logger.info("=" * 60)
+
+    # Run the watcher loop — blocks forever, reconnects on errors.
+    watcher = NodeConditionWatcher(cfg)
+    asyncio.run(watcher.watch_loop())
+
+
+@cli.command()
+@click.argument("node_name")
+@click.option("--condition", default="GcmXidErrorsProblem", help="Condition to investigate")
+@click.option("--message", default="XID errors detected on GPU", help="Condition message")
+def investigate(node_name: str, condition: str, message: str):
+    """Run a one-shot investigation for a specific node."""
+    from gcm_sentinel.models import NodeConditionEvent
+    from gcm_sentinel.engine import run_investigation
+
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
+    )
+    cfg = SentinelConfig()
+    event = NodeConditionEvent(
+        node_name=node_name,
+        condition_type=condition,
+        status="True",
+        reason=condition,
+        message=message,
+    )
+    result = asyncio.run(run_investigation(event, cfg))
+    print(f"\n{'='*60}")
+    print(f"Node:       {result.node_name}")
+    print(f"Severity:   {result.severity.value}")
+    print(f"Confidence: {result.confidence:.0%}")
+    print(f"Action:     {result.recommended_action.value}")
+    print(f"Summary:    {result.summary}")
+    print(f"Root cause: {result.root_cause}")
+    print(f"Tool calls: {len(result.tool_calls)}")
+    print(f"{'='*60}")
+
+
+if __name__ == "__main__":
+    cli()
diff --git a/gcm-sentinel/src/gcm_sentinel/models.py b/gcm-sentinel/src/gcm_sentinel/models.py
new file mode 100644
index 0000000..1ec034b
--- /dev/null
+++ b/gcm-sentinel/src/gcm_sentinel/models.py
@@ -0,0 +1,51 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+from __future__ import annotations
+
+import enum
+from datetime import datetime, timezone
+
+from pydantic import BaseModel, Field
+
+
+class Severity(str, enum.Enum):
+    CRITICAL = "critical"
+    WARNING = "warning"
+    INFO = "info"
+    OK = "ok"
+
+
+class Action(str, enum.Enum):
+    CORDON = "cordon"
+    DRAIN = "drain"
+    TAINT = "taint"
+    REBOOT = "reboot"
+    NONE = "none"
+
+
+class NodeConditionEvent(BaseModel):
+    node_name: str
+    condition_type: str
+    status: str
+    reason: str = ""
+    message: str = ""
+    timestamp: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
+
+
+class ToolCall(BaseModel):
+    tool_name: str
+    tool_input: dict
+    result: str
+
+
+class SentinelResult(BaseModel):
+    node_name: str
+    condition: str
+    severity: Severity
+    summary: str
+    root_cause: str
+    recommended_action: Action
+    confidence: float = Field(ge=0.0, le=1.0)
+    tool_calls: list[ToolCall] = Field(default_factory=list)
+    investigation_log: str = ""
+    timestamp: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
diff --git a/gcm-sentinel/src/gcm_sentinel/notify.py b/gcm-sentinel/src/gcm_sentinel/notify.py
new file mode 100644
index 0000000..07c449b
--- /dev/null
+++ b/gcm-sentinel/src/gcm_sentinel/notify.py
@@ -0,0 +1,75 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+from __future__ import annotations
+
+import logging
+from datetime import datetime, timezone
+
+import httpx
+from kubernetes import client
+
+from gcm_sentinel.k8s import get_core_api
+from gcm_sentinel.models import SentinelResult
+
+logger = logging.getLogger(__name__)
+
+
+async def annotate_node(result: SentinelResult) -> None:
+    try:
+        api = get_core_api()
+        api.patch_node(result.node_name, {"metadata": {"annotations": {
+            "gcm-sentinel/severity": result.severity.value,
+            "gcm-sentinel/action": result.recommended_action.value,
+            "gcm-sentinel/confidence": f"{result.confidence:.2f}",
+            "gcm-sentinel/summary": result.summary[:250],
+            "gcm-sentinel/root-cause": result.root_cause[:250],
+            "gcm-sentinel/condition": result.condition,
+            "gcm-sentinel/timestamp": datetime.now(timezone.utc).isoformat(),
+        }}})
+    except Exception:
+        logger.exception("Failed to annotate node %s", result.node_name)
+
+
+async def emit_k8s_event(result: SentinelResult) -> None:
+    try:
+        now = datetime.now(timezone.utc)
+        get_core_api().create_namespaced_event("default", client.CoreV1Event(
+            metadata=client.V1ObjectMeta(generate_name="gcm-sentinel-", namespace="default"),
+            involved_object=client.V1ObjectReference(kind="Node", name=result.node_name, api_version="v1"),
+            reason="GCMSentinel",
+            message=f"[{result.severity.value.upper()}] {result.summary} | Action: {result.recommended_action.value} | Confidence: {result.confidence:.0%}",
+            type="Warning" if result.severity.value in ("critical", "warning") else "Normal",
+            first_timestamp=now, last_timestamp=now,
+            source=client.V1EventSource(component="gcm-sentinel"),
+            reporting_component="gcm-sentinel",
+        ))
+    except Exception:
+        logger.exception("Failed to create K8s Event for %s", result.node_name)
+
+
+async def emit_webhook(result: SentinelResult, webhook_url: str) -> None:
+    payload = {
+        "text": (
+            f"[{result.severity.value.upper()}] GPU Sentinel: {result.node_name} / {result.condition}\n"
+            f"*Summary*: {result.summary}\n*Root cause*: {result.root_cause}\n"
+            f"*Action*: {result.recommended_action.value} (confidence {result.confidence:.0%})"
+        ),
+        "sentinel_result": {
+            "node_name": result.node_name, "condition": result.condition,
+            "severity": result.severity.value, "summary": result.summary,
+            "root_cause": result.root_cause,
+            "recommended_action": result.recommended_action.value,
+            "confidence": result.confidence, "timestamp": result.timestamp.isoformat(),
+        },
+    }
+    try:
+        async with httpx.AsyncClient(timeout=10.0) as http:
+            (await http.post(webhook_url, json=payload)).raise_for_status()
+    except Exception:
+        logger.exception("Webhook delivery failed for %s", result.node_name)
+
+
+async def notify(result: SentinelResult, webhook_url: str = "") -> None:
+    await emit_k8s_event(result)
+    if webhook_url:
+        await emit_webhook(result, webhook_url)
diff --git a/gcm-sentinel/src/gcm_sentinel/tools/__init__.py b/gcm-sentinel/src/gcm_sentinel/tools/__init__.py
new file mode 100644
index 0000000..ae1b0cf
--- /dev/null
+++ b/gcm-sentinel/src/gcm_sentinel/tools/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
diff --git a/gcm-sentinel/src/gcm_sentinel/tools/dcgm.py b/gcm-sentinel/src/gcm_sentinel/tools/dcgm.py
new file mode 100644
index 0000000..7defa51
--- /dev/null
+++ b/gcm-sentinel/src/gcm_sentinel/tools/dcgm.py
@@ -0,0 +1,101 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+"""Query dcgm-exporter directly on a node when Prometheus doesn't have DCGM metrics."""
+from __future__ import annotations
+
+import httpx
+
+from gcm_sentinel.k8s import get_core_api
+
+
+def _find_dcgm_pod_ip(node_name: str) -> str | None:
+    """Find the dcgm-exporter pod IP running on a specific node."""
+    api = get_core_api()
+    pods = api.list_pod_for_all_namespaces(
+        field_selector=f"spec.nodeName={node_name}",
+        label_selector="app.kubernetes.io/name=dcgm-exporter",
+    )
+    if not pods.items:
+        # Try broader match.
+        pods = api.list_pod_for_all_namespaces(field_selector=f"spec.nodeName={node_name}")
+        for pod in pods.items:
+            if "dcgm" in pod.metadata.name.lower():
+                return pod.status.pod_ip
+        return None
+    return pods.items[0].status.pod_ip
+
+
+def _parse_prometheus_text(text: str, metric_filter: str = "") -> dict[str, list[dict]]:
+    """Parse Prometheus exposition format into {metric_name: [{labels, value}]}."""
+    metrics: dict[str, list[dict]] = {}
+    for line in text.strip().split("\n"):
+        if line.startswith("#") or not line:
+            continue
+        if metric_filter and metric_filter not in line:
+            continue
+        # "METRIC_NAME{label=val,...} VALUE"
+        if "{" in line:
+            name_part, rest = line.split("{", 1)
+            labels_str, value_str = rest.rsplit("}", 1)
+            labels = {}
+            for pair in labels_str.split(","):
+                if "=" in pair:
+                    k, v = pair.split("=", 1)
+                    labels[k.strip()] = v.strip().strip('"')
+        else:
+            parts = line.split()
+            if len(parts) < 2:
+                continue
+            name_part = parts[0]
+            value_str = parts[1]
+            labels = {}
+
+        name = name_part.strip()
+        try:
+            value = float(value_str.strip())
+        except ValueError:
+            continue
+
+        metrics.setdefault(name, []).append({"labels": labels, "value": value})
+    return metrics
+
+
+async def query_dcgm_direct(node_name: str, metric_filter: str = "") -> str:
+    """Query dcgm-exporter directly on a node via its pod IP.
+
+    Useful when Prometheus doesn't scrape dcgm-exporter (common on clusters
+    where DCGM metrics go through OTel instead).
+    """
+    import json
+
+    pod_ip = _find_dcgm_pod_ip(node_name)
+    if not pod_ip:
+        return f"No dcgm-exporter pod found on node {node_name}"
+
+    async with httpx.AsyncClient(timeout=10.0) as client:
+        try:
+            resp = await client.get(f"http://{pod_ip}:9400/metrics")
+            resp.raise_for_status()
+        except Exception as exc:
+            return f"Failed to query dcgm-exporter at {pod_ip}:9400: {exc}"
+
+    metrics = _parse_prometheus_text(resp.text, metric_filter)
+
+    # If no filter, summarize key metrics only (full output is too large).
+    if not metric_filter:
+        key_metrics = [
+            "DCGM_FI_DEV_GPU_TEMP", "DCGM_FI_DEV_MEMORY_TEMP",
+            "DCGM_FI_DEV_POWER_USAGE", "DCGM_FI_DEV_GPU_UTIL",
+            "DCGM_FI_DEV_SM_CLOCK", "DCGM_FI_DEV_FB_USED",
+            "DCGM_FI_DEV_ECC_SBE_VOL_TOTAL", "DCGM_FI_DEV_ECC_DBE_VOL_TOTAL",
+            "DCGM_FI_DEV_ECC_SBE_AGG_TOTAL", "DCGM_FI_DEV_ECC_DBE_AGG_TOTAL",
+            "DCGM_FI_DEV_RETIRED_SBE", "DCGM_FI_DEV_RETIRED_DBE",
+            "DCGM_FI_DEV_RETIRED_PENDING",
+            "DCGM_FI_DEV_XID_ERRORS", "DCGM_FI_DEV_PCIE_REPLAY_COUNTER",
+            "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL",
+            "DCGM_FI_DEV_THERMAL_VIOLATION", "DCGM_FI_DEV_PSTATE",
+        ]
+        metrics = {k: v for k, v in metrics.items() if k in key_metrics}
+
+    text = json.dumps(metrics, indent=2)
+    return text
diff --git a/gcm-sentinel/src/gcm_sentinel/tools/gcm.py b/gcm-sentinel/src/gcm_sentinel/tools/gcm.py
new file mode 100644
index 0000000..064bef4
--- /dev/null
+++ b/gcm-sentinel/src/gcm_sentinel/tools/gcm.py
@@ -0,0 +1,49 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+from __future__ import annotations
+
+import json
+
+import httpx
+
+from gcm_sentinel.k8s import get_core_api
+
+
+async def get_gcm_health(node_name: str, prometheus_url: str) -> str:
+    """NPD health: problem_gauge/counter metrics + Gcm/NPD node conditions."""
+    health: dict = {"node": node_name, "npd_metrics": {}, "conditions": []}
+
+    npd_queries = {
+        "problem_gauge": f'problem_gauge{{instance=~"{node_name}.*"}}',
+        "problem_counter": f'problem_counter{{instance=~"{node_name}.*"}}',
+    }
+
+    async with httpx.AsyncClient(timeout=15.0) as http:
+        for key, query in npd_queries.items():
+            try:
+                resp = await http.get(
+                    f"{prometheus_url}/api/v1/query",
+                    params={"query": query},
+                )
+                resp.raise_for_status()
+                health["npd_metrics"][key] = resp.json().get("data", {}).get("result", [])
+            except Exception as exc:
+                health["npd_metrics"][key] = f"error: {exc}"
+
+    try:
+        api = get_core_api()
+        node = api.read_node(node_name)
+        for cond in node.status.conditions or []:
+            if cond.type.startswith(("Gcm", "NPD")):
+                health["conditions"].append({
+                    "type": cond.type,
+                    "status": cond.status,
+                    "reason": cond.reason or "",
+                    "message": cond.message or "",
+                    "lastTransitionTime": str(cond.last_transition_time),
+                })
+    except Exception as exc:
+        health["conditions_error"] = str(exc)
+
+    text = json.dumps(health, indent=2, default=str)
+    return text
diff --git a/gcm-sentinel/src/gcm_sentinel/tools/kubernetes.py b/gcm-sentinel/src/gcm_sentinel/tools/kubernetes.py
new file mode 100644
index 0000000..f02102a
--- /dev/null
+++ b/gcm-sentinel/src/gcm_sentinel/tools/kubernetes.py
@@ -0,0 +1,257 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+from __future__ import annotations
+
+import json
+import logging
+
+from kubernetes import client
+
+from gcm_sentinel.k8s import get_core_api
+
+logger = logging.getLogger(__name__)
+
+
+
+
+# ---------------------------------------------------------------------------
+# Read-only tools
+# ---------------------------------------------------------------------------
+
+async def get_node_info(node_name: str) -> str:
+    api = get_core_api()
+    node = api.read_node(node_name)
+
+    info = {
+        "name": node_name,
+        "labels": dict(node.metadata.labels or {}),
+        "annotations": {
+            k: v for k, v in (node.metadata.annotations or {}).items()
+            if k.startswith("gcm-sentinel") or k.startswith("node.kubernetes.io/")
+        },
+        "taints": [
+            {"key": t.key, "value": t.value or "", "effect": t.effect}
+            for t in (node.spec.taints or [])
+        ],
+        "conditions": [
+            {
+                "type": c.type, "status": c.status,
+                "reason": c.reason or "", "message": c.message or "",
+                "lastTransitionTime": str(c.last_transition_time),
+            }
+            for c in (node.status.conditions or [])
+        ],
+        "allocatable": {k: str(v) for k, v in (node.status.allocatable or {}).items()},
+        "capacity": {k: str(v) for k, v in (node.status.capacity or {}).items()},
+        "pods": [
+            {"name": p.metadata.name, "namespace": p.metadata.namespace, "phase": p.status.phase}
+            for p in api.list_pod_for_all_namespaces(
+                field_selector=f"spec.nodeName={node_name}"
+            ).items[:50]
+        ],
+    }
+    return json.dumps(info, indent=2, default=str)
+
+
+async def get_node_events(node_name: str) -> str:
+    api = get_core_api()
+    events = api.list_event_for_all_namespaces(
+        field_selector=f"involvedObject.name={node_name},involvedObject.kind=Node",
+    )
+    result = [
+        {
+            "type": e.type, "reason": e.reason or "",
+            "message": (e.message or "")[:300],
+            "source": e.source.component if e.source else "",
+            "count": e.count,
+            "firstTimestamp": str(e.first_timestamp),
+            "lastTimestamp": str(e.last_timestamp),
+        }
+        for e in events.items[-50:]
+    ]
+    return json.dumps(result, indent=2, default=str)
+
+
+async def get_pod_logs(node_name: str, pod_name_pattern: str = "", tail_lines: int = 100) -> str:
+    api = get_core_api()
+    pods = api.list_pod_for_all_namespaces(field_selector=f"spec.nodeName={node_name}")
+    monitoring_patterns = ["dcgm", "npd", "node-problem", "gcm", "gpu"]
+
+    results = {}
+    for pod in pods.items:
+        name, ns = pod.metadata.name, pod.metadata.namespace
+        if pod_name_pattern:
+            if pod_name_pattern.lower() not in name.lower():
+                continue
+        elif not any(p in name.lower() for p in monitoring_patterns):
+            continue
+
+        try:
+            container = pod.spec.containers[0].name if pod.spec.containers else None
+            if not container:
+                continue
+            logs = api.read_namespaced_pod_log(
+                name=name, namespace=ns, container=container,
+                tail_lines=tail_lines, timestamps=True,
+            )
+            results[f"{ns}/{name}"] = logs[-3000:]
+        except Exception as exc:
+            results[f"{ns}/{name}"] = f"error: {exc}"
+
+        if len(results) >= 5:
+            break
+
+    return json.dumps(results, indent=2) if results else "No matching pods found."
+
+
+async def get_workload_info(node_name: str) -> str:
+    api = get_core_api()
+    pods = api.list_pod_for_all_namespaces(field_selector=f"spec.nodeName={node_name}")
+    system_ns = {"kube-system", "monitoring", "kube-node-lease", "kube-public"}
+
+    workloads = []
+    for pod in pods.items:
+        ns = pod.metadata.namespace
+        if ns in system_ns:
+            continue
+        owner_refs = pod.metadata.owner_references or []
+        if any(ref.kind == "DaemonSet" for ref in owner_refs):
+            continue
+
+        gpu_req = "0"
+        for c in pod.spec.containers or []:
+            reqs = (c.resources.requests or {}) if c.resources else {}
+            if "nvidia.com/gpu" in reqs:
+                gpu_req = str(reqs["nvidia.com/gpu"])
+        if gpu_req == "0":
+            continue
+
+        container_statuses = []
+        for cs in pod.status.container_statuses or []:
+            s = {"name": cs.name, "ready": cs.ready, "restartCount": cs.restart_count}
+            if cs.state:
+                if cs.state.running:
+                    s["state"] = "running"
+                elif cs.state.waiting:
+                    s["state"] = f"waiting: {cs.state.waiting.reason}"
+                elif cs.state.terminated:
+                    s["state"] = f"terminated: {cs.state.terminated.reason} (exit {cs.state.terminated.exit_code})"
+            container_statuses.append(s)
+
+        # Find sibling pods from the same Job owner.
+        siblings = []
+        for ref in owner_refs:
+            if ref.kind == "Job":
+                try:
+                    for sp in api.list_namespaced_pod(ns, label_selector=f"job-name={ref.name}").items:
+                        if sp.metadata.name != pod.metadata.name:
+                            siblings.append({"name": sp.metadata.name, "node": sp.spec.node_name, "phase": sp.status.phase})
+                except Exception:
+                    pass
+                break
+
+        workloads.append({
+            "pod": pod.metadata.name, "namespace": ns, "phase": pod.status.phase,
+            "gpu_requests": gpu_req,
+            "owners": [{"kind": r.kind, "name": r.name} for r in owner_refs],
+            "labels": dict(pod.metadata.labels or {}),
+            "container_statuses": container_statuses,
+            "sibling_pods": siblings[:20],
+        })
+
+    return json.dumps(workloads, indent=2, default=str) if workloads else "No GPU workloads found."
+
+
+async def get_workload_logs(node_name: str, tail_lines: int = 200) -> str:
+    api = get_core_api()
+    pods = api.list_pod_for_all_namespaces(field_selector=f"spec.nodeName={node_name}")
+    system_ns = {"kube-system", "monitoring", "kube-node-lease", "kube-public"}
+    error_patterns = [
+        "nccl", "cuda error", "cuda_error", "torch.distributed", "runtimeerror",
+        "traceback", "nvrm", "xid", "ecc", "fallen off", "sigterm", "sigkill",
+        "oom", "out of memory", "failed", "error",
+    ]
+
+    results = {}
+    for pod in pods.items:
+        ns = pod.metadata.namespace
+        if ns in system_ns:
+            continue
+        if any(r.kind == "DaemonSet" for r in (pod.metadata.owner_references or [])):
+            continue
+        has_gpu = any(
+            "nvidia.com/gpu" in ((c.resources.requests or {}) if c.resources else {})
+            for c in (pod.spec.containers or [])
+        )
+        if not has_gpu:
+            continue
+
+        try:
+            container = pod.spec.containers[0].name if pod.spec.containers else None
+            if not container:
+                continue
+            logs = api.read_namespaced_pod_log(
+                name=pod.metadata.name, namespace=ns, container=container,
+                tail_lines=tail_lines, timestamps=True,
+            )
+            error_lines = [ln for ln in logs.split("\n") if any(p in ln.lower() for p in error_patterns)]
+            if error_lines:
+                results[f"{ns}/{pod.metadata.name}"] = {"error_lines": error_lines[-50:], "total": len(error_lines)}
+            else:
+                results[f"{ns}/{pod.metadata.name}"] = {"note": "no errors found", "tail": logs[-1000:]}
+        except Exception as exc:
+            results[f"{ns}/{pod.metadata.name}"] = f"error: {exc}"
+
+        if len(results) >= 5:
+            break
+
+    return json.dumps(results, indent=2, default=str) if results else "No GPU workload pods found."
+
+
+# ---------------------------------------------------------------------------
+# Remediation tools
+# ---------------------------------------------------------------------------
+
+async def cordon_node(node_name: str) -> str:
+    api = get_core_api()
+    api.patch_node(node_name, {"spec": {"unschedulable": True}})
+    logger.info("Cordoned node %s", node_name)
+    return f"Cordoned node {node_name}"
+
+
+async def drain_node(node_name: str) -> str:
+    api = get_core_api()
+    api.patch_node(node_name, {"spec": {"unschedulable": True}})
+
+    pods = api.list_pod_for_all_namespaces(field_selector=f"spec.nodeName={node_name}")
+    evicted, skipped = 0, 0
+    for pod in pods.items:
+        if any(r.kind == "DaemonSet" for r in (pod.metadata.owner_references or [])):
+            skipped += 1
+            continue
+        try:
+            api.create_namespaced_pod_eviction(
+                pod.metadata.name, pod.metadata.namespace,
+                client.V1Eviction(
+                    metadata=client.V1ObjectMeta(name=pod.metadata.name, namespace=pod.metadata.namespace),
+                    delete_options=client.V1DeleteOptions(grace_period_seconds=30),
+                ),
+            )
+            evicted += 1
+        except client.ApiException as exc:
+            logger.warning("Failed to evict %s/%s: %s", pod.metadata.namespace, pod.metadata.name, exc.reason)
+
+    logger.info("Drained node %s: evicted=%d, skipped_daemonset=%d", node_name, evicted, skipped)
+    return f"Drained node {node_name}: evicted {evicted} pods, skipped {skipped} DaemonSet pods"
+
+
+async def taint_node(node_name: str, key: str, value: str, effect: str) -> str:
+    if effect not in ("NoSchedule", "PreferNoSchedule", "NoExecute"):
+        return f"Invalid effect: {effect}"
+    api = get_core_api()
+    node = api.read_node(node_name)
+    taints = list(node.spec.taints or [])
+    taints.append(client.V1Taint(key=key, value=value, effect=effect))
+    api.patch_node(node_name, {"spec": {"taints": [t.to_dict() for t in taints]}})
+    logger.info("Tainted node %s: %s=%s:%s", node_name, key, value, effect)
+    return f"Tainted node {node_name}: {key}={value}:{effect}"
diff --git a/gcm-sentinel/src/gcm_sentinel/tools/node_exporter.py b/gcm-sentinel/src/gcm_sentinel/tools/node_exporter.py
new file mode 100644
index 0000000..a69cdd6
--- /dev/null
+++ b/gcm-sentinel/src/gcm_sentinel/tools/node_exporter.py
@@ -0,0 +1,83 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+"""Query node-exporter directly on a node."""
+from __future__ import annotations
+
+import json
+
+import httpx
+
+from gcm_sentinel.k8s import get_core_api
+from gcm_sentinel.tools.dcgm import _parse_prometheus_text
+
+
+def _find_node_exporter_ip(node_name: str) -> str | None:
+    api = get_core_api()
+    pods = api.list_pod_for_all_namespaces(
+        field_selector=f"spec.nodeName={node_name}",
+    )
+    for pod in pods.items:
+        name = pod.metadata.name.lower()
+        if "node-exporter" in name or "node_exporter" in name:
+            return pod.status.pod_ip
+    return None
+
+
+async def query_node_exporter_direct(node_name: str, metric_filter: str = "") -> str:
+    """Query node-exporter directly on a node via its pod IP."""
+    pod_ip = _find_node_exporter_ip(node_name)
+    if not pod_ip:
+        return f"No node-exporter pod found on node {node_name}"
+
+    async with httpx.AsyncClient(timeout=10.0) as client:
+        try:
+            resp = await client.get(f"http://{pod_ip}:9100/metrics")
+            resp.raise_for_status()
+        except Exception as exc:
+            return f"Failed to query node-exporter at {pod_ip}:9100: {exc}"
+
+    metrics = _parse_prometheus_text(resp.text, metric_filter)
+
+    if not metric_filter:
+        # Without a filter, return key host health metrics only.
+        key_prefixes = [
+            "node_hwmon_temp_celsius", "node_cpu_seconds_total",
+            "node_memory_MemAvailable_bytes", "node_memory_MemTotal_bytes",
+            "node_load1", "node_load5", "node_load15",
+            "node_disk_io_time_seconds_total",
+            "node_filesystem_avail_bytes",
+            "node_infiniband",  # IB metrics if available.
+            "node_network_receive_errs_total", "node_network_transmit_errs_total",
+        ]
+        metrics = {k: v for k, v in metrics.items() if any(k.startswith(p) for p in key_prefixes)}
+
+    text = json.dumps(metrics, indent=2)
+    return text
+
+
+async def query_infiniband_direct(node_name: str) -> str:
+    """Query InfiniBand/network metrics from node-exporter on a node."""
+    pod_ip = _find_node_exporter_ip(node_name)
+    if not pod_ip:
+        return f"No node-exporter pod found on node {node_name}"
+
+    async with httpx.AsyncClient(timeout=10.0) as client:
+        try:
+            resp = await client.get(f"http://{pod_ip}:9100/metrics")
+            resp.raise_for_status()
+        except Exception as exc:
+            return f"Failed to query node-exporter at {pod_ip}:9100: {exc}"
+
+    metrics = _parse_prometheus_text(resp.text)
+    ib_metrics = {k: v for k, v in metrics.items() if "infiniband" in k or "ib_" in k.lower()}
+
+    if not ib_metrics:
+        # Fall back to general network error metrics.
+        ib_metrics = {k: v for k, v in metrics.items()
+                      if k.startswith(("node_network_receive_errs", "node_network_transmit_errs",
+                                       "node_network_receive_drop", "node_network_transmit_drop"))}
+        if not ib_metrics:
+            return "No InfiniBand or network error metrics found on this node."
+
+    text = json.dumps(ib_metrics, indent=2)
+    return text
diff --git a/gcm-sentinel/src/gcm_sentinel/tools/prometheus.py b/gcm-sentinel/src/gcm_sentinel/tools/prometheus.py
new file mode 100644
index 0000000..7087d77
--- /dev/null
+++ b/gcm-sentinel/src/gcm_sentinel/tools/prometheus.py
@@ -0,0 +1,84 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+from __future__ import annotations
+
+import json
+import time
+
+import httpx
+
+
+def _parse_duration(duration: str) -> float:
+    units = {"s": 1, "m": 60, "h": 3600, "d": 86400}
+    if duration and duration[-1].lower() in units:
+        try:
+            return float(duration[:-1]) * units[duration[-1].lower()]
+        except ValueError:
+            pass
+    try:
+        return float(duration)
+    except (ValueError, TypeError):
+        return 300.0
+
+
+async def query_prometheus(prometheus_url: str, query: str, duration: str = "5m") -> str:
+    async with httpx.AsyncClient(timeout=30.0) as client:
+        resp = await client.get(
+            f"{prometheus_url}/api/v1/query",
+            params={"query": query, "timeout": "15s"},
+        )
+        resp.raise_for_status()
+        data = resp.json()
+
+        if data.get("status") != "success":
+            return f"Prometheus query failed: {data}"
+
+        if not data.get("data", {}).get("result"):
+            duration_seconds = _parse_duration(duration)
+            now = time.time()
+            resp = await client.get(
+                f"{prometheus_url}/api/v1/query_range",
+                params={
+                    "query": query,
+                    "start": str(now - duration_seconds),
+                    "end": str(now),
+                    "step": "60s",
+                    "timeout": "15s",
+                },
+            )
+            resp.raise_for_status()
+            data = resp.json()
+
+        return json.dumps(data.get("data", {}), indent=2)
+
+
+async def query_alertmanager(alertmanager_url: str, node_name: str = "") -> str:
+    async with httpx.AsyncClient(timeout=15.0) as client:
+        try:
+            resp = await client.get(f"{alertmanager_url}/api/v2/alerts")
+            resp.raise_for_status()
+            alerts = resp.json()
+        except Exception as exc:
+            return f"Alertmanager query failed: {exc}"
+
+    if node_name:
+        node_labels = ("instance", "node", "nodename", "kubernetes_node")
+        alerts = [
+            a for a in alerts
+            if any(node_name in str(a.get("labels", {}).get(k, "")) for k in node_labels)
+        ]
+
+    results = [
+        {
+            "alertname": a.get("labels", {}).get("alertname", ""),
+            "severity": a.get("labels", {}).get("severity", ""),
+            "state": a.get("status", {}).get("state", ""),
+            "labels": a.get("labels", {}),
+            "startsAt": a.get("startsAt", ""),
+            "summary": a.get("annotations", {}).get("summary", ""),
+            "description": a.get("annotations", {}).get("description", "")[:300],
+        }
+        for a in alerts[:30]
+    ]
+
+    return json.dumps(results, indent=2) if results else "No firing alerts found."
diff --git a/gcm-sentinel/src/gcm_sentinel/watcher.py b/gcm-sentinel/src/gcm_sentinel/watcher.py
new file mode 100644
index 0000000..083a1c1
--- /dev/null
+++ b/gcm-sentinel/src/gcm_sentinel/watcher.py
@@ -0,0 +1,113 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+"""Watches K8s node conditions and triggers investigation on GPU problems."""
+
+from __future__ import annotations
+
+import asyncio
+import fnmatch
+import logging
+import time
+
+from kubernetes import client, config, watch
+
+from gcm_sentinel.config import SentinelConfig
+from gcm_sentinel.models import NodeConditionEvent
+from gcm_sentinel.notify import annotate_node, notify
+from gcm_sentinel.datasources import build_datasources
+from gcm_sentinel.engine import run_investigation
+
+logger = logging.getLogger(__name__)
+
+
+class NodeConditionWatcher:
+    def __init__(self, cfg: SentinelConfig) -> None:
+        self.cfg = cfg
+        self._registry = None
+        self._condition_states: dict[tuple[str, str], str] = {}
+        self._last_run_time: dict[tuple[str, str], float] = {}
+        self._active_tasks: set[asyncio.Task] = set()
+
+    def _init_k8s(self) -> None:
+        try:
+            config.load_incluster_config()
+        except config.ConfigException:
+            config.load_kube_config()
+
+    def _node_in_scope(self, node_name: str) -> bool:
+        if not self.cfg.node_allowlist:
+            return True
+        return any(fnmatch.fnmatch(node_name, p) for p in self.cfg.node_allowlist)
+
+    def _in_cooldown(self, node_name: str, condition: str) -> bool:
+        last = self._last_run_time.get((node_name, condition))
+        if last is None:
+            return False
+        remaining = self.cfg.cooldown_seconds - (time.monotonic() - last)
+        if remaining > 0:
+            logger.info("Cooldown: %s/%s (%.0fs left)", node_name, condition, remaining)
+            return True
+        return False
+
+    def _detect_changes(self, node_name: str, conditions: list) -> list[NodeConditionEvent]:
+        events = []
+        for cond in conditions:
+            if cond.type not in self.cfg.watch_conditions:
+                continue
+            key = (node_name, cond.type)
+            prev = self._condition_states.get(key)
+            if prev != cond.status:
+                self._condition_states[key] = cond.status
+                if cond.status == "True":
+                    events.append(NodeConditionEvent(
+                        node_name=node_name, condition_type=cond.type,
+                        status=cond.status, reason=cond.reason or "", message=cond.message or "",
+                    ))
+        return events
+
+    async def _handle_event(self, event: NodeConditionEvent) -> None:
+        node, cond = event.node_name, event.condition_type
+        if not self._node_in_scope(node) or self._in_cooldown(node, cond):
+            return
+
+        self._last_run_time[(node, cond)] = time.monotonic()
+        logger.info("Investigation triggered: %s on %s (%s)", cond, node, event.message)
+
+        try:
+            if self._registry is None:
+                self._registry = build_datasources(self.cfg)
+            result = await run_investigation(event, self.cfg, self._registry)
+
+            logger.info("Result for %s: severity=%s action=%s confidence=%.2f",
+                        node, result.severity.value, result.recommended_action.value, result.confidence)
+            logger.info("Summary: %s", result.summary)
+
+            if self.cfg.can_annotate:
+                await annotate_node(result)
+            await notify(result, webhook_url=self.cfg.webhook_url)
+
+        except Exception:
+            logger.exception("Investigation failed for %s", node)
+
+    async def watch_loop(self) -> None:
+        self._init_k8s()
+        api = client.CoreV1Api()
+        w = watch.Watch()
+
+        logger.info("Watcher started (mode=%s, conditions=%d, allowlist=%s, cooldown=%ds)",
+                     self.cfg.action_mode, len(self.cfg.watch_conditions),
+                     self.cfg.node_allowlist or "(all)", self.cfg.cooldown_seconds)
+
+        while True:
+            try:
+                for event in w.stream(api.list_node, timeout_seconds=0):
+                    if event["type"] not in ("ADDED", "MODIFIED"):
+                        continue
+                    node = event["object"]
+                    for change in self._detect_changes(node.metadata.name, node.status.conditions or []):
+                        task = asyncio.create_task(self._handle_event(change))
+                        self._active_tasks.add(task)
+                        task.add_done_callback(self._active_tasks.discard)
+            except Exception:
+                logger.exception("Watch stream error, reconnecting in 5s (%d active investigations)", len(self._active_tasks))
+                await asyncio.sleep(5)
diff --git a/gcm-sentinel/tests/scenarios/gpu_fallen_off_bus.json b/gcm-sentinel/tests/scenarios/gpu_fallen_off_bus.json
new file mode 100644
index 0000000..954769f
--- /dev/null
+++ b/gcm-sentinel/tests/scenarios/gpu_fallen_off_bus.json
@@ -0,0 +1,75 @@
+{
+  "name": "GPU Fallen Off Bus",
+  "description": "GPU 5 is no longer visible to nvidia-smi, likely a PCIe or hardware failure.",
+  "node_name": "gpu-node-12",
+  "condition_type": "NPDSmiDisconnectedProblem",
+  "reason": "NPDSmiDisconnectedProblem",
+  "message": "nvidia-smi reports only 7/8 GPUs visible. GPU index 5 not responding.",
+  "prometheus_responses": {
+    "DCGM_FI_DEV_GPU_TEMP": {
+      "status": "success",
+      "data": {
+        "resultType": "vector",
+        "result": [
+          {"metric": {"gpu": "0", "instance": "gpu-node-12:9400"}, "value": [1700000000, "62"]},
+          {"metric": {"gpu": "1", "instance": "gpu-node-12:9400"}, "value": [1700000000, "64"]},
+          {"metric": {"gpu": "2", "instance": "gpu-node-12:9400"}, "value": [1700000000, "63"]},
+          {"metric": {"gpu": "3", "instance": "gpu-node-12:9400"}, "value": [1700000000, "61"]},
+          {"metric": {"gpu": "4", "instance": "gpu-node-12:9400"}, "value": [1700000000, "65"]},
+          {"metric": {"gpu": "6", "instance": "gpu-node-12:9400"}, "value": [1700000000, "64"]},
+          {"metric": {"gpu": "7", "instance": "gpu-node-12:9400"}, "value": [1700000000, "63"]}
+        ]
+      }
+    },
+    "DCGM_FI_DEV_POWER_USAGE": {
+      "status": "success",
+      "data": {
+        "resultType": "vector",
+        "result": [
+          {"metric": {"gpu": "0", "instance": "gpu-node-12:9400"}, "value": [1700000000, "310"]},
+          {"metric": {"gpu": "1", "instance": "gpu-node-12:9400"}, "value": [1700000000, "305"]}
+        ]
+      }
+    },
+    "DCGM_FI_DEV_XID_ERRORS": {
+      "status": "success",
+      "data": {
+        "resultType": "vector",
+        "result": [
+          {"metric": {"gpu": "4", "instance": "gpu-node-12:9400"}, "value": [1700000000, "79"]}
+        ]
+      }
+    }
+  },
+  "node_info": {
+    "name": "gpu-node-12",
+    "labels": {"node.kubernetes.io/instance-type": "p5en.48xlarge", "kubernetes.io/hostname": "gpu-node-12"},
+    "taints": [],
+    "conditions": [
+      {"type": "Ready", "status": "True", "reason": "KubeletReady", "message": "kubelet is ready"},
+      {"type": "NPDSmiDisconnectedProblem", "status": "True", "reason": "NPDSmiDisconnectedProblem", "message": "nvidia-smi reports only 7/8 GPUs visible. GPU index 5 not responding."}
+    ],
+    "allocatable": {"nvidia.com/gpu": "7", "cpu": "192", "memory": "2097152Mi"},
+    "capacity": {"nvidia.com/gpu": "8", "cpu": "192", "memory": "2097152Mi"},
+    "pods": [
+      {"name": "training-job-worker-0", "namespace": "default", "phase": "Running"}
+    ],
+    "pod_count": 8
+  },
+  "gcm_health": {
+    "node": "gpu-node-12",
+    "npd_metrics": {
+      "problem_gauge": [
+        {"metric": {"reason": "NPDSmiDisconnectedProblem", "instance": "gpu-node-12:20257"}, "value": [1700000000, "1"]}
+      ],
+      "problem_counter": [
+        {"metric": {"reason": "NPDSmiDisconnectedProblem", "instance": "gpu-node-12:20257"}, "value": [1700000000, "1"]}
+      ]
+    },
+    "conditions": [
+      {"type": "NPDSmiDisconnectedProblem", "status": "True", "reason": "NPDSmiDisconnectedProblem", "message": "nvidia-smi reports only 7/8 GPUs visible"}
+    ]
+  },
+  "expected_severity": "critical",
+  "expected_action": "drain"
+}
diff --git a/gcm-sentinel/tests/scenarios/hbm_degradation.json b/gcm-sentinel/tests/scenarios/hbm_degradation.json
new file mode 100644
index 0000000..b98c70f
--- /dev/null
+++ b/gcm-sentinel/tests/scenarios/hbm_degradation.json
@@ -0,0 +1,89 @@
+{
+  "name": "HBM Memory Degradation",
+  "description": "GPU 3 on the node shows HBM (High Bandwidth Memory) ECC errors accumulating, indicating memory degradation.",
+  "node_name": "gpu-node-07",
+  "condition_type": "NPDSmiEccProblem",
+  "reason": "NPDSmiEccProblem",
+  "message": "ECC errors detected: gpu3 has 142 volatile uncorrectable ECC errors",
+  "prometheus_responses": {
+    "DCGM_FI_DEV_ECC_DBE_VOL_TOTAL": {
+      "status": "success",
+      "data": {
+        "resultType": "vector",
+        "result": [
+          {"metric": {"gpu": "3", "instance": "gpu-node-07:9400", "UUID": "GPU-abc-003"}, "value": [1700000000, "142"]},
+          {"metric": {"gpu": "0", "instance": "gpu-node-07:9400", "UUID": "GPU-abc-000"}, "value": [1700000000, "0"]},
+          {"metric": {"gpu": "1", "instance": "gpu-node-07:9400", "UUID": "GPU-abc-001"}, "value": [1700000000, "0"]},
+          {"metric": {"gpu": "2", "instance": "gpu-node-07:9400", "UUID": "GPU-abc-002"}, "value": [1700000000, "0"]},
+          {"metric": {"gpu": "4", "instance": "gpu-node-07:9400", "UUID": "GPU-abc-004"}, "value": [1700000000, "0"]},
+          {"metric": {"gpu": "5", "instance": "gpu-node-07:9400", "UUID": "GPU-abc-005"}, "value": [1700000000, "0"]},
+          {"metric": {"gpu": "6", "instance": "gpu-node-07:9400", "UUID": "GPU-abc-006"}, "value": [1700000000, "0"]},
+          {"metric": {"gpu": "7", "instance": "gpu-node-07:9400", "UUID": "GPU-abc-007"}, "value": [1700000000, "0"]}
+        ]
+      }
+    },
+    "DCGM_FI_DEV_ECC_SBE_VOL_TOTAL": {
+      "status": "success",
+      "data": {
+        "resultType": "vector",
+        "result": [
+          {"metric": {"gpu": "3", "instance": "gpu-node-07:9400"}, "value": [1700000000, "8947"]},
+          {"metric": {"gpu": "0", "instance": "gpu-node-07:9400"}, "value": [1700000000, "12"]},
+          {"metric": {"gpu": "1", "instance": "gpu-node-07:9400"}, "value": [1700000000, "5"]}
+        ]
+      }
+    },
+    "DCGM_FI_DEV_GPU_TEMP": {
+      "status": "success",
+      "data": {
+        "resultType": "vector",
+        "result": [
+          {"metric": {"gpu": "3", "instance": "gpu-node-07:9400"}, "value": [1700000000, "78"]},
+          {"metric": {"gpu": "0", "instance": "gpu-node-07:9400"}, "value": [1700000000, "65"]}
+        ]
+      }
+    },
+    "DCGM_FI_DEV_RETIRED_DBE": {
+      "status": "success",
+      "data": {
+        "resultType": "vector",
+        "result": [
+          {"metric": {"gpu": "3", "instance": "gpu-node-07:9400"}, "value": [1700000000, "7"]}
+        ]
+      }
+    }
+  },
+  "node_info": {
+    "name": "gpu-node-07",
+    "labels": {"node.kubernetes.io/instance-type": "p5en.48xlarge", "kubernetes.io/hostname": "gpu-node-07"},
+    "taints": [],
+    "conditions": [
+      {"type": "Ready", "status": "True", "reason": "KubeletReady", "message": "kubelet is ready"},
+      {"type": "NPDSmiEccProblem", "status": "True", "reason": "NPDSmiEccProblem", "message": "ECC errors detected: gpu3 has 142 volatile uncorrectable ECC errors"},
+      {"type": "NPDXidErrorsProblem", "status": "False", "reason": "NPDXidErrorsProblem", "message": "No XID errors"}
+    ],
+    "allocatable": {"nvidia.com/gpu": "8", "cpu": "192", "memory": "2097152Mi"},
+    "capacity": {"nvidia.com/gpu": "8", "cpu": "192", "memory": "2097152Mi"},
+    "pods": [
+      {"name": "training-job-worker-3", "namespace": "default", "phase": "Running"},
+      {"name": "dcgm-exporter-xyz", "namespace": "monitoring", "phase": "Running"}
+    ],
+    "pod_count": 12
+  },
+  "gcm_health": {
+    "node": "gpu-node-07",
+    "npd_metrics": {
+      "problem_gauge": [
+        {"metric": {"reason": "NPDSmiEccProblem", "instance": "gpu-node-07:20257"}, "value": [1700000000, "1"]}
+      ],
+      "problem_counter": [
+        {"metric": {"reason": "NPDSmiEccProblem", "instance": "gpu-node-07:20257"}, "value": [1700000000, "3"]}
+      ]
+    },
+    "conditions": [
+      {"type": "NPDSmiEccProblem", "status": "True", "reason": "NPDSmiEccProblem", "message": "ECC errors detected: gpu3 has 142 volatile uncorrectable ECC errors"}
+    ]
+  },
+  "expected_severity": "critical",
+  "expected_action": "drain"
+}
diff --git a/gcm-sentinel/tests/scenarios/nvlink_errors.json b/gcm-sentinel/tests/scenarios/nvlink_errors.json
new file mode 100644
index 0000000..96350f0
--- /dev/null
+++ b/gcm-sentinel/tests/scenarios/nvlink_errors.json
@@ -0,0 +1,74 @@
+{
+  "name": "NVLink Errors",
+  "description": "NVLink CRC errors accumulating between GPU 2 and GPU 6, degrading inter-GPU bandwidth.",
+  "node_name": "gpu-node-21",
+  "condition_type": "NPDDcgmiNvlinkStatusProblem",
+  "reason": "NPDDcgmiNvlinkStatusProblem",
+  "message": "NVLink errors detected: link 2->6 has elevated CRC error rate",
+  "prometheus_responses": {
+    "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL": {
+      "status": "success",
+      "data": {
+        "resultType": "vector",
+        "result": [
+          {"metric": {"gpu": "2", "instance": "gpu-node-21:9400", "nvlink": "0"}, "value": [1700000000, "54231"]},
+          {"metric": {"gpu": "6", "instance": "gpu-node-21:9400", "nvlink": "0"}, "value": [1700000000, "54198"]},
+          {"metric": {"gpu": "0", "instance": "gpu-node-21:9400", "nvlink": "0"}, "value": [1700000000, "3"]},
+          {"metric": {"gpu": "1", "instance": "gpu-node-21:9400", "nvlink": "0"}, "value": [1700000000, "0"]}
+        ]
+      }
+    },
+    "DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL": {
+      "status": "success",
+      "data": {
+        "resultType": "vector",
+        "result": [
+          {"metric": {"gpu": "2", "instance": "gpu-node-21:9400"}, "value": [1700000000, "12000000"]},
+          {"metric": {"gpu": "6", "instance": "gpu-node-21:9400"}, "value": [1700000000, "12500000"]},
+          {"metric": {"gpu": "0", "instance": "gpu-node-21:9400"}, "value": [1700000000, "48000000"]}
+        ]
+      }
+    },
+    "DCGM_FI_DEV_GPU_TEMP": {
+      "status": "success",
+      "data": {
+        "resultType": "vector",
+        "result": [
+          {"metric": {"gpu": "2", "instance": "gpu-node-21:9400"}, "value": [1700000000, "71"]},
+          {"metric": {"gpu": "6", "instance": "gpu-node-21:9400"}, "value": [1700000000, "73"]}
+        ]
+      }
+    }
+  },
+  "node_info": {
+    "name": "gpu-node-21",
+    "labels": {"node.kubernetes.io/instance-type": "p5en.48xlarge", "kubernetes.io/hostname": "gpu-node-21"},
+    "taints": [],
+    "conditions": [
+      {"type": "Ready", "status": "True", "reason": "KubeletReady", "message": "kubelet is ready"},
+      {"type": "NPDDcgmiNvlinkStatusProblem", "status": "True", "reason": "NPDDcgmiNvlinkStatusProblem", "message": "NVLink errors detected: link 2->6 has elevated CRC error rate"}
+    ],
+    "allocatable": {"nvidia.com/gpu": "8", "cpu": "192", "memory": "2097152Mi"},
+    "capacity": {"nvidia.com/gpu": "8", "cpu": "192", "memory": "2097152Mi"},
+    "pods": [
+      {"name": "nccl-test-worker-1", "namespace": "default", "phase": "Running"}
+    ],
+    "pod_count": 10
+  },
+  "gcm_health": {
+    "node": "gpu-node-21",
+    "npd_metrics": {
+      "problem_gauge": [
+        {"metric": {"reason": "NPDDcgmiNvlinkStatusProblem", "instance": "gpu-node-21:20257"}, "value": [1700000000, "1"]}
+      ],
+      "problem_counter": [
+        {"metric": {"reason": "NPDDcgmiNvlinkStatusProblem", "instance": "gpu-node-21:20257"}, "value": [1700000000, "2"]}
+      ]
+    },
+    "conditions": [
+      {"type": "NPDDcgmiNvlinkStatusProblem", "status": "True", "reason": "NPDDcgmiNvlinkStatusProblem", "message": "NVLink errors detected"}
+    ]
+  },
+  "expected_severity": "warning",
+  "expected_action": "cordon"
+}
diff --git a/gcm-sentinel/tests/scenarios/thermal_throttle.json b/gcm-sentinel/tests/scenarios/thermal_throttle.json
new file mode 100644
index 0000000..b5b6c47
--- /dev/null
+++ b/gcm-sentinel/tests/scenarios/thermal_throttle.json
@@ -0,0 +1,105 @@
+{
+  "name": "Thermal Throttle",
+  "description": "Multiple GPUs on the node are thermal throttling due to cooling failure.",
+  "node_name": "gpu-node-03",
+  "condition_type": "NPDDcgmiDiagProblem",
+  "reason": "NPDDcgmiDiagProblem",
+  "message": "dcgmi diag detected thermal violations on GPUs 0,1,2,3",
+  "prometheus_responses": {
+    "DCGM_FI_DEV_GPU_TEMP": {
+      "status": "success",
+      "data": {
+        "resultType": "vector",
+        "result": [
+          {"metric": {"gpu": "0", "instance": "gpu-node-03:9400"}, "value": [1700000000, "92"]},
+          {"metric": {"gpu": "1", "instance": "gpu-node-03:9400"}, "value": [1700000000, "91"]},
+          {"metric": {"gpu": "2", "instance": "gpu-node-03:9400"}, "value": [1700000000, "93"]},
+          {"metric": {"gpu": "3", "instance": "gpu-node-03:9400"}, "value": [1700000000, "90"]},
+          {"metric": {"gpu": "4", "instance": "gpu-node-03:9400"}, "value": [1700000000, "72"]},
+          {"metric": {"gpu": "5", "instance": "gpu-node-03:9400"}, "value": [1700000000, "70"]},
+          {"metric": {"gpu": "6", "instance": "gpu-node-03:9400"}, "value": [1700000000, "71"]},
+          {"metric": {"gpu": "7", "instance": "gpu-node-03:9400"}, "value": [1700000000, "69"]}
+        ]
+      }
+    },
+    "DCGM_FI_DEV_THERMAL_VIOLATION": {
+      "status": "success",
+      "data": {
+        "resultType": "vector",
+        "result": [
+          {"metric": {"gpu": "0", "instance": "gpu-node-03:9400"}, "value": [1700000000, "1523456789"]},
+          {"metric": {"gpu": "1", "instance": "gpu-node-03:9400"}, "value": [1700000000, "1412345678"]},
+          {"metric": {"gpu": "2", "instance": "gpu-node-03:9400"}, "value": [1700000000, "1634567890"]},
+          {"metric": {"gpu": "3", "instance": "gpu-node-03:9400"}, "value": [1700000000, "1345678901"]}
+        ]
+      }
+    },
+    "DCGM_FI_DEV_POWER_USAGE": {
+      "status": "success",
+      "data": {
+        "resultType": "vector",
+        "result": [
+          {"metric": {"gpu": "0", "instance": "gpu-node-03:9400"}, "value": [1700000000, "250"]},
+          {"metric": {"gpu": "1", "instance": "gpu-node-03:9400"}, "value": [1700000000, "245"]},
+          {"metric": {"gpu": "2", "instance": "gpu-node-03:9400"}, "value": [1700000000, "255"]},
+          {"metric": {"gpu": "3", "instance": "gpu-node-03:9400"}, "value": [1700000000, "248"]},
+          {"metric": {"gpu": "4", "instance": "gpu-node-03:9400"}, "value": [1700000000, "320"]},
+          {"metric": {"gpu": "5", "instance": "gpu-node-03:9400"}, "value": [1700000000, "315"]}
+        ]
+      }
+    },
+    "DCGM_FI_DEV_SM_CLOCK": {
+      "status": "success",
+      "data": {
+        "resultType": "vector",
+        "result": [
+          {"metric": {"gpu": "0", "instance": "gpu-node-03:9400"}, "value": [1700000000, "1200"]},
+          {"metric": {"gpu": "1", "instance": "gpu-node-03:9400"}, "value": [1700000000, "1180"]},
+          {"metric": {"gpu": "4", "instance": "gpu-node-03:9400"}, "value": [1700000000, "1980"]}
+        ]
+      }
+    },
+    "node_hwmon_temp_celsius": {
+      "status": "success",
+      "data": {
+        "resultType": "vector",
+        "result": [
+          {"metric": {"chip": "inlet", "instance": "gpu-node-03:9100"}, "value": [1700000000, "38"]},
+          {"metric": {"chip": "outlet", "instance": "gpu-node-03:9100"}, "value": [1700000000, "52"]}
+        ]
+      }
+    }
+  },
+  "node_info": {
+    "name": "gpu-node-03",
+    "labels": {"node.kubernetes.io/instance-type": "p5en.48xlarge", "kubernetes.io/hostname": "gpu-node-03"},
+    "taints": [],
+    "conditions": [
+      {"type": "Ready", "status": "True", "reason": "KubeletReady", "message": "kubelet is ready"},
+      {"type": "NPDDcgmiDiagProblem", "status": "True", "reason": "NPDDcgmiDiagProblem", "message": "dcgmi diag detected thermal violations on GPUs 0,1,2,3"}
+    ],
+    "allocatable": {"nvidia.com/gpu": "8", "cpu": "192", "memory": "2097152Mi"},
+    "capacity": {"nvidia.com/gpu": "8", "cpu": "192", "memory": "2097152Mi"},
+    "pods": [
+      {"name": "training-job-worker-5", "namespace": "default", "phase": "Running"},
+      {"name": "dcgm-exporter-abc", "namespace": "monitoring", "phase": "Running"}
+    ],
+    "pod_count": 11
+  },
+  "gcm_health": {
+    "node": "gpu-node-03",
+    "npd_metrics": {
+      "problem_gauge": [
+        {"metric": {"reason": "NPDDcgmiDiagProblem", "instance": "gpu-node-03:20257"}, "value": [1700000000, "1"]}
+      ],
+      "problem_counter": [
+        {"metric": {"reason": "NPDDcgmiDiagProblem", "instance": "gpu-node-03:20257"}, "value": [1700000000, "5"]}
+      ]
+    },
+    "conditions": [
+      {"type": "NPDDcgmiDiagProblem", "status": "True", "reason": "NPDDcgmiDiagProblem", "message": "dcgmi diag detected thermal violations"}
+    ]
+  },
+  "expected_severity": "critical",
+  "expected_action": "drain"
+}
diff --git a/gcm-sentinel/tests/scenarios/transient_xid.json b/gcm-sentinel/tests/scenarios/transient_xid.json
new file mode 100644
index 0000000..387b3af
--- /dev/null
+++ b/gcm-sentinel/tests/scenarios/transient_xid.json
@@ -0,0 +1,102 @@
+{
+  "name": "Transient XID Error",
+  "description": "A single transient XID 63 error was reported on GPU 1, but the GPU appears healthy otherwise.",
+  "node_name": "gpu-node-29",
+  "condition_type": "NPDXidErrorsProblem",
+  "reason": "NPDXidErrorsProblem",
+  "message": "XID error 63 detected on GPU 1",
+  "prometheus_responses": {
+    "DCGM_FI_DEV_XID_ERRORS": {
+      "status": "success",
+      "data": {
+        "resultType": "vector",
+        "result": [
+          {"metric": {"gpu": "1", "instance": "gpu-node-29:9400"}, "value": [1700000000, "63"]}
+        ]
+      }
+    },
+    "DCGM_FI_DEV_ECC_DBE_VOL_TOTAL": {
+      "status": "success",
+      "data": {
+        "resultType": "vector",
+        "result": [
+          {"metric": {"gpu": "0", "instance": "gpu-node-29:9400"}, "value": [1700000000, "0"]},
+          {"metric": {"gpu": "1", "instance": "gpu-node-29:9400"}, "value": [1700000000, "0"]},
+          {"metric": {"gpu": "2", "instance": "gpu-node-29:9400"}, "value": [1700000000, "0"]}
+        ]
+      }
+    },
+    "DCGM_FI_DEV_GPU_TEMP": {
+      "status": "success",
+      "data": {
+        "resultType": "vector",
+        "result": [
+          {"metric": {"gpu": "0", "instance": "gpu-node-29:9400"}, "value": [1700000000, "65"]},
+          {"metric": {"gpu": "1", "instance": "gpu-node-29:9400"}, "value": [1700000000, "67"]},
+          {"metric": {"gpu": "2", "instance": "gpu-node-29:9400"}, "value": [1700000000, "64"]}
+        ]
+      }
+    },
+    "DCGM_FI_DEV_POWER_USAGE": {
+      "status": "success",
+      "data": {
+        "resultType": "vector",
+        "result": [
+          {"metric": {"gpu": "1", "instance": "gpu-node-29:9400"}, "value": [1700000000, "310"]}
+        ]
+      }
+    },
+    "DCGM_FI_DEV_SM_CLOCK": {
+      "status": "success",
+      "data": {
+        "resultType": "vector",
+        "result": [
+          {"metric": {"gpu": "1", "instance": "gpu-node-29:9400"}, "value": [1700000000, "1980"]}
+        ]
+      }
+    },
+    "DCGM_FI_DEV_ECC_SBE_VOL_TOTAL": {
+      "status": "success",
+      "data": {
+        "resultType": "vector",
+        "result": [
+          {"metric": {"gpu": "1", "instance": "gpu-node-29:9400"}, "value": [1700000000, "2"]}
+        ]
+      }
+    }
+  },
+  "node_info": {
+    "name": "gpu-node-29",
+    "labels": {"node.kubernetes.io/instance-type": "p5en.48xlarge", "kubernetes.io/hostname": "gpu-node-29"},
+    "taints": [],
+    "conditions": [
+      {"type": "Ready", "status": "True", "reason": "KubeletReady", "message": "kubelet is ready"},
+      {"type": "NPDXidErrorsProblem", "status": "True", "reason": "NPDXidErrorsProblem", "message": "XID error 63 detected on GPU 1"},
+      {"type": "NPDSmiEccProblem", "status": "False", "reason": "NPDSmiEccProblem", "message": "No ECC errors"},
+      {"type": "NPDSmiDisconnectedProblem", "status": "False", "reason": "NPDSmiDisconnectedProblem", "message": "All GPUs visible"}
+    ],
+    "allocatable": {"nvidia.com/gpu": "8", "cpu": "192", "memory": "2097152Mi"},
+    "capacity": {"nvidia.com/gpu": "8", "cpu": "192", "memory": "2097152Mi"},
+    "pods": [
+      {"name": "training-job-worker-2", "namespace": "default", "phase": "Running"},
+      {"name": "dcgm-exporter-def", "namespace": "monitoring", "phase": "Running"}
+    ],
+    "pod_count": 9
+  },
+  "gcm_health": {
+    "node": "gpu-node-29",
+    "npd_metrics": {
+      "problem_gauge": [
+        {"metric": {"reason": "NPDXidErrorsProblem", "instance": "gpu-node-29:20257"}, "value": [1700000000, "1"]}
+      ],
+      "problem_counter": [
+        {"metric": {"reason": "NPDXidErrorsProblem", "instance": "gpu-node-29:20257"}, "value": [1700000000, "1"]}
+      ]
+    },
+    "conditions": [
+      {"type": "NPDXidErrorsProblem", "status": "True", "reason": "NPDXidErrorsProblem", "message": "XID error 63 detected on GPU 1"}
+    ]
+  },
+  "expected_severity": "info",
+  "expected_action": "none"
+}
diff --git a/gcm-sentinel/tests/test_sentinel.py b/gcm-sentinel/tests/test_sentinel.py
new file mode 100644
index 0000000..c8cfde1
--- /dev/null
+++ b/gcm-sentinel/tests/test_sentinel.py
@@ -0,0 +1,120 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+"""Tests for the critical paths: safety gating, config parsing, scenario validation."""
+from __future__ import annotations
+
+import json
+import time
+from pathlib import Path
+
+import pytest
+
+from gcm_sentinel.config import DEFAULT_WATCH_CONDITIONS, SentinelConfig
+from gcm_sentinel.models import Action, Severity
+
+
+class TestSafety:
+    """The most important tests: verify remediation tools are gated by action_mode."""
+
+    def test_recommend_mode_has_no_remediation_tools(self):
+        from gcm_sentinel.datasources import build_datasources
+
+        registry = build_datasources(SentinelConfig(api_key="test", action_mode="recommend"))
+        tool_names = {t["name"] for t in registry.get_all_tools()}
+        assert "cordon_node" not in tool_names
+        assert "drain_node" not in tool_names
+        assert "taint_node" not in tool_names
+        # Read-only tools present.
+        assert "query_prometheus" in tool_names
+        assert "get_node_info" in tool_names
+
+    def test_annotate_mode_has_no_remediation_tools(self):
+        from gcm_sentinel.datasources import build_datasources
+
+        registry = build_datasources(SentinelConfig(api_key="test", action_mode="annotate"))
+        assert "cordon_node" not in {t["name"] for t in registry.get_all_tools()}
+
+    def test_execute_mode_has_remediation_tools(self):
+        from gcm_sentinel.datasources import build_datasources
+
+        registry = build_datasources(SentinelConfig(api_key="test", action_mode="execute"))
+        tool_names = {t["name"] for t in registry.get_all_tools()}
+        assert "cordon_node" in tool_names
+        assert "drain_node" in tool_names
+        assert "taint_node" in tool_names
+
+    @pytest.mark.asyncio
+    async def test_unknown_tool_blocked_by_registry(self):
+        from gcm_sentinel.datasources import build_datasources
+
+        registry = build_datasources(SentinelConfig(api_key="test", action_mode="recommend"))
+        result = await registry.execute("cordon_node", {"node_name": "x"})
+        assert "Unknown tool" in result
+
+    def test_invalid_action_mode_rejected(self):
+        with pytest.raises(ValueError, match="action_mode must be one of"):
+            SentinelConfig(api_key="test", action_mode="yolo")
+
+
+class TestConfig:
+    def test_defaults(self):
+        cfg = SentinelConfig(api_key="test")
+        assert cfg.llm_api == "anthropic"
+        assert cfg.action_mode == "recommend"
+        assert cfg.can_execute is False
+        assert cfg.can_annotate is False
+        assert cfg.cooldown_seconds == 3600
+        assert len(cfg.watch_conditions) == len(DEFAULT_WATCH_CONDITIONS)
+
+    def test_csv_watch_conditions(self):
+        cfg = SentinelConfig(api_key="t", watch_conditions="GcmXidErrorsProblem,GcmSmiEccProblem")
+        assert cfg.watch_conditions == ["GcmXidErrorsProblem", "GcmSmiEccProblem"]
+
+    def test_csv_node_allowlist(self):
+        cfg = SentinelConfig(api_key="t", node_allowlist="gpu-node-0*,gpu-node-12")
+        assert cfg.node_allowlist == ["gpu-node-0*", "gpu-node-12"]
+
+    def test_action_mode_properties(self):
+        assert SentinelConfig(api_key="t", action_mode="recommend").can_execute is False
+        assert SentinelConfig(api_key="t", action_mode="annotate").can_annotate is True
+        assert SentinelConfig(api_key="t", action_mode="execute").can_execute is True
+
+
+class TestWatcherSafety:
+    def test_node_allowlist_filtering(self):
+        from gcm_sentinel.watcher import NodeConditionWatcher
+
+        watcher = NodeConditionWatcher(SentinelConfig(api_key="t", node_allowlist=["gpu-node-0*", "gpu-node-12"]))
+        assert watcher._node_in_scope("gpu-node-01") is True
+        assert watcher._node_in_scope("gpu-node-21") is False
+
+    def test_cooldown(self):
+        from gcm_sentinel.watcher import NodeConditionWatcher
+
+        watcher = NodeConditionWatcher(SentinelConfig(api_key="t", cooldown_seconds=60))
+        assert watcher._in_cooldown("n1", "cond") is False
+        watcher._last_run_time[("n1", "cond")] = time.monotonic()
+        assert watcher._in_cooldown("n1", "cond") is True
+        assert watcher._in_cooldown("n1", "other") is False
+
+    def test_rate_limiter(self):
+        from gcm_sentinel.datasources.remediation import RemediationDataSource
+
+        ds = RemediationDataSource(SentinelConfig(api_key="t", action_mode="execute", max_actions_per_hour=2))
+        assert ds._is_rate_limited() is False
+        ds._action_timestamps.append(time.monotonic())
+        ds._action_timestamps.append(time.monotonic())
+        assert ds._is_rate_limited() is True
+
+
+class TestScenarios:
+    @pytest.mark.parametrize("name", [
+        "hbm_degradation", "gpu_fallen_off_bus", "nvlink_errors",
+        "thermal_throttle", "transient_xid",
+    ])
+    def test_scenario_valid(self, name):
+        path = Path(__file__).parent / "scenarios" / f"{name}.json"
+        scenario = json.loads(path.read_text())
+        assert scenario["condition_type"] in DEFAULT_WATCH_CONDITIONS
+        assert scenario["expected_severity"] in [s.value for s in Severity]
+        assert scenario["expected_action"] in [a.value for a in Action]
diff --git a/website/docs/GCM_Health_Checks/kubernetes_deployment.md b/website/docs/GCM_Health_Checks/kubernetes_deployment.md
index e9e64eb..96e0337 100644
--- a/website/docs/GCM_Health_Checks/kubernetes_deployment.md
+++ b/website/docs/GCM_Health_Checks/kubernetes_deployment.md
@@ -96,6 +96,7 @@ kubectl get node <node-name> \
 
 Health check conditions are standard Kubernetes node conditions. Downstream tools can watch them to automatically remediate unhealthy nodes:
 
+- **[GCM Sentinel](../GCM_Sentinel/getting_started.md)**: AI-powered investigation using Claude. Watches for GCM conditions, investigates by querying Prometheus metrics, node state, and workload logs, then recommends (or executes) remediation actions. See the [Getting Started](../GCM_Sentinel/getting_started.md) guide.
 - **[Draino](https://github.com/planetlabs/draino)**: watches node conditions and automatically cordons/drains unhealthy nodes
 - **Cluster autoscalers**: can detect unhealthy nodes and replace them
 - **Custom controllers**: can implement organization-specific remediation workflows
diff --git a/website/docs/GCM_Sentinel/_category_.json b/website/docs/GCM_Sentinel/_category_.json
new file mode 100644
index 0000000..d74b556
--- /dev/null
+++ b/website/docs/GCM_Sentinel/_category_.json
@@ -0,0 +1,8 @@
+{
+  "label": "GCM Sentinel",
+  "position": 5,
+  "link": {
+    "type": "generated-index",
+    "description": "AI-powered GPU cluster sentinel agent (experimental)."
+  }
+}
diff --git a/website/docs/GCM_Sentinel/adding_new_datasource.md b/website/docs/GCM_Sentinel/adding_new_datasource.md
new file mode 100644
index 0000000..d3a7912
--- /dev/null
+++ b/website/docs/GCM_Sentinel/adding_new_datasource.md
@@ -0,0 +1,212 @@
+---
+sidebar_position: 4
+---
+
+# Adding a New Data Source
+
+The sentinel agent investigates GPU failures by querying **data sources** — Prometheus metrics, Kubernetes state, pod logs, etc. Each data source is a Python class that subclasses `DataSource` and exposes tools the LLM can call during investigation.
+
+The built-in data sources cover the common stack (Prometheus, K8s, DCGM, NPD, Alertmanager). You can add your own to give the agent access to any system you have — IPMI sensors, cloud APIs, job schedulers, internal dashboards, etc.
+
+Adding a data source is one Python file plus one line of registration. This guide walks through a complete example: adding IPMI/BMC sensor queries.
+
+## 1. Create the data source file
+
+Create a new file under `src/gcm_sentinel/datasources/`. The naming convention is `<source_name>.py`.
+
+```python
+# src/gcm_sentinel/datasources/ipmi.py
+from __future__ import annotations
+
+import json
+import httpx
+from gcm_sentinel.datasources import DataSource
+
+
+class IPMIDataSource(DataSource):
+    name = "ipmi"
+
+    def is_available(self):
+        # Return False to auto-skip if prerequisites aren't met.
+        # The registry will log "Data source 'ipmi' not available, skipping".
+        return bool(self.cfg.ipmi_url)  # Add ipmi_url to SentinelConfig first.
+
+    def get_tools(self):
+        # Return Anthropic-format tool definitions. These are automatically
+        # converted for OpenAI if that backend is in use.
+        return [{
+            "name": "get_ipmi_sensors",
+            "description": "Read IPMI sensor data (temperatures, fan speeds, PSU status) for a node via its BMC.",
+            "input_schema": {
+                "type": "object",
+                "properties": {
+                    "node_name": {
+                        "type": "string",
+                        "description": "Name of the Kubernetes node.",
+                    },
+                },
+                "required": ["node_name"],
+            },
+        }]
+
+    async def execute(self, tool_name, tool_input):
+        # Called when the LLM invokes your tool.
+        node_name = tool_input["node_name"]
+        bmc_host = f"{node_name}-bmc"  # Your BMC naming convention.
+
+        async with httpx.AsyncClient(timeout=10.0) as client:
+            resp = await client.get(
+                f"{self.cfg.ipmi_url}/sensors",
+                params={"host": bmc_host},
+            )
+            resp.raise_for_status()
+            data = resp.json()
+
+        # Return a string — this becomes the tool result the LLM sees.
+        text = json.dumps(data, indent=2)
+        return text[:8000]  # Truncate to avoid filling the context window.
+
+    def get_system_prompt_section(self):
+        # Optional: add reference information to the system prompt.
+        # The LLM sees this alongside the investigation guidelines and
+        # metrics reference, so it knows when/how to use your tool.
+        return """\
+## IPMI sensor reference
+
+The `get_ipmi_sensors` tool returns BMC sensor readings:
+- Inlet/outlet temperatures (celsius)
+- Fan speeds (RPM)
+- PSU voltage and status
+- CPU temperatures
+
+Use this to check for cooling failures or power supply issues when
+investigating thermal throttling or unexpected GPU shutdowns."""
+```
+
+## 2. Add the config field (if needed)
+
+If your data source needs configuration (URLs, credentials, feature flags), add fields to `SentinelConfig` in `src/gcm_sentinel/config.py`:
+
+```python
+class SentinelConfig(BaseSettings):
+    model_config = {"env_prefix": "GCM_SENTINEL_"}
+
+    # ... existing fields ...
+
+    ipmi_url: str = Field(
+        default="",
+        description="IPMI/BMC API URL. Leave empty to disable.",
+    )
+```
+
+This automatically creates the env var `GCM_SENTINEL_IPMI_URL`.
+
+## 3. Register the data source
+
+Add one line to `build_datasources()` in `src/gcm_sentinel/datasources/__init__.py`:
+
+```python
+def build_datasources(cfg: SentinelConfig) -> DataSourceRegistry:
+    from gcm_sentinel.datasources.alertmanager import AlertmanagerDataSource
+    from gcm_sentinel.datasources.gcm_health import GCMHealthDataSource
+    from gcm_sentinel.datasources.ipmi import IPMIDataSource  # NEW
+    from gcm_sentinel.datasources.kubernetes_core import KubernetesCoreDataSource
+    from gcm_sentinel.datasources.kubernetes_workloads import KubernetesWorkloadsDataSource
+    from gcm_sentinel.datasources.prometheus import PrometheusDataSource
+    from gcm_sentinel.datasources.remediation import RemediationDataSource
+
+    registry = DataSourceRegistry(cfg)
+    registry.register(PrometheusDataSource(cfg))
+    registry.register(KubernetesCoreDataSource(cfg))
+    registry.register(GCMHealthDataSource(cfg))
+    registry.register(KubernetesWorkloadsDataSource(cfg))
+    registry.register(AlertmanagerDataSource(cfg))
+    registry.register(IPMIDataSource(cfg))  # NEW
+    registry.register(RemediationDataSource(cfg))
+    return registry
+```
+
+That's it. The investigation engine will automatically:
+- Include your tool in the LLM's tool list
+- Route tool calls to your `execute()` method
+- Include your system prompt section in the investigation context
+
+## 4. Add Helm values (if needed)
+
+If your data source has configurable URLs or settings, expose them in `charts/gcm-sentinel/values.yaml` and wire them through `charts/gcm-sentinel/templates/deployment.yaml`:
+
+```yaml
+# values.yaml
+ipmi:
+  url: ""  # Set to enable IPMI sensor queries.
+```
+
+```yaml
+# deployment.yaml (in the env section)
+{{- if .Values.ipmi.url }}
+- name: GCM_SENTINEL_IPMI_URL
+  value: "{{ .Values.ipmi.url }}"
+{{- end }}
+```
+
+## 5. Test
+
+Add a test verifying your data source registers correctly and handles errors:
+
+```python
+# tests/test_engine.py
+def test_ipmi_skipped_when_url_empty():
+    from gcm_sentinel.datasources import build_datasources
+
+    cfg = SentinelConfig(api_key="test")  # ipmi_url defaults to ""
+    registry = build_datasources(cfg)
+    assert "get_ipmi_sensors" not in {t["name"] for t in registry.get_all_tools()}
+
+def test_ipmi_registered_when_url_set():
+    from gcm_sentinel.datasources import build_datasources
+
+    cfg = SentinelConfig(api_key="test", ipmi_url="http://bmc-api:8080")
+    registry = build_datasources(cfg)
+    assert "get_ipmi_sensors" in {t["name"] for t in registry.get_all_tools()}
+```
+
+## Data source API reference
+
+Each data source implements the `DataSource` base class:
+
+| Method | Required | Description |
+|---|---|---|
+| `name` | Yes | Class attribute. Short identifier (e.g. `"ipmi"`). |
+| `get_tools()` | Yes | Return tool definitions (Anthropic format). |
+| `execute(tool_name, tool_input)` | Yes | Handle tool calls. Return a string. |
+| `is_available()` | No | Return `False` to skip registration. Default: `True`. |
+| `get_system_prompt_section()` | No | Return extra system prompt text. Default: `""`. |
+
+## Built-in data sources
+
+| Data source | File | Tools | `is_available` condition |
+|---|---|---|---|
+| `prometheus` | `prometheus.py` | `query_prometheus` | Always |
+| `dcgm_direct` | `dcgm_direct.py` | `query_dcgm_direct` | Always |
+| `node_direct` | `node_direct.py` | `query_node_exporter_direct`, `query_infiniband_direct` | Always |
+| `kubernetes_core` | `kubernetes_core.py` | `get_node_info`, `get_node_events`, `get_pod_logs` | Always |
+| `kubernetes_workloads` | `kubernetes_workloads.py` | `get_workload_info`, `get_workload_logs` | Always |
+| `gcm_health` | `gcm_health.py` | `get_gcm_health` | Always |
+| `alertmanager` | `alertmanager.py` | `query_alertmanager` | `alertmanager_url` is set |
+| `remediation` | `remediation.py` | `cordon_node`, `drain_node`, `taint_node` | `action_mode == "execute"` |
+
+## Ideas for new data sources
+
+| Name | Data source | What it adds |
+|---|---|---|
+| `IPMIDataSource` | BMC/IPMI API | Baseboard temps, fan speeds, PSU health |
+| `SlurmDataSource` | `squeue`/`sinfo` CLI | Job info on Slurm-native clusters |
+| `CloudDataSource` | AWS EC2 / GCP / Azure | Scheduled maintenance, instance health |
+| `FabricManagerDataSource` | NVIDIA FM API | NVSwitch topology and health |
+| `EFADataSource` | EFA metrics | AWS Elastic Fabric Adapter errors |
+
+## Tips
+
+- **Handle errors gracefully**: If your data source is unreachable, return an error string instead of raising. The LLM will adapt.
+- **System prompt section**: Keep it concise. The LLM sees this on every investigation — long sections waste tokens.
+- **`is_available()`**: Use this to auto-skip when prerequisites aren't met (URL not configured, library not installed). The registry logs "not available, skipping" — no error.
diff --git a/website/docs/GCM_Sentinel/architecture.md b/website/docs/GCM_Sentinel/architecture.md
new file mode 100644
index 0000000..492453c
--- /dev/null
+++ b/website/docs/GCM_Sentinel/architecture.md
@@ -0,0 +1,72 @@
+---
+sidebar_position: 2
+---
+
+# Architecture
+
+## Data flow
+
+```
+                    ┌─────────────────────────────────────┐
+                    │        gcm-sentinel (Deployment)       │
+                    │                                     │
+                    │  NodeConditionWatcher                │
+                    │  ├─ watches K8s nodes via watch API  │
+NPD sets condition ─┤  ├─ cooldown / allowlist gates       │
+  = True            │  └─ triggers run_investigation()     │
+                    │          │                           │
+                    │  DataSourceRegistry                  │
+                    │  ├─ Prometheus (PromQL)               │
+                    │  ├─ DCGMDirect (dcgm-exporter)       │
+                    │  ├─ NodeDirect (node-exporter, IB)   │
+                    │  ├─ KubernetesCore                    │
+                    │  ├─ KubernetesWorkloads               │
+                    │  ├─ GCMHealth                         │
+                    │  ├─ Alertmanager                      │
+                    │  └─ Remediation (execute mode only)   │
+                    │      └─ rate limit enforced here      │
+                    │          │                           │
+                    │  LLM Backend (Anthropic or OpenAI)   │
+                    │  ├─ tool-use loop (up to 25 rounds)  │
+                    │  └─ returns JSON assessment           │
+                    │          │                           │
+                    │  Output                              │
+                    │  ├─ K8s Event on node                │
+                    │  ├─ Webhook (Slack/PagerDuty)        │
+                    │  ├─ Node annotations (annotate mode) │
+                    │  └─ Structured logs                  │
+                    └─────────────────────────────────────┘
+```
+
+## Data sources
+
+Each data source is a class that subclasses `DataSource` and contributes tools (for the LLM to call) and optional system prompt sections. The investigation engine doesn't know which data sources are registered — it just calls `registry.get_all_tools()` and `registry.execute(name, input)`.
+
+See [Adding a New Data Source](./adding_new_datasource.md) for a step-by-step guide on integrating your own systems.
+
+## LLM backends
+
+The `backends/` package abstracts away the differences between LLM APIs:
+
+| Backend | SDK | Tool-use protocol |
+|---|---|---|
+| `AnthropicBackend` | `anthropic` | `tool_use` blocks in content, `tool_result` in user messages |
+| `OpenAIBackend` | `openai` | `tool_calls` on message, `role=tool` messages |
+
+Both backends implement the same `LLMBackend` interface. The investigation engine calls `backend.chat()`, `backend.append_assistant_response()`, and `backend.append_tool_results()` without knowing which API is behind it.
+
+## Condition name compatibility
+
+GCM Sentinel watches for both OSS and internal condition name formats:
+
+| OSS (Gcm prefix) | Internal (NPD prefix) | Health check |
+|---|---|---|
+| `GcmXidErrorsProblem` | `NPDXidErrorsProblem` | `check-syslogs xid` |
+| `GcmSmiEccProblem` | `NPDSmiEccProblem` | `check-nvidia-smi ecc` |
+| `GcmSmiDisconnectedProblem` | `NPDSmiDisconnectedProblem` | `check-nvidia-smi gpu_num` |
+| `GcmProcZombieProblem` | `NPDProcZombieProblem` | `check-process zombie` |
+| `GcmDcgmiNvlinkStatusProblem` | `NPDDcgmiNvlinkStatusProblem` | `check-dcgmi nvlink` |
+| `GcmDcgmiDiagProblem` | `NPDDcgmiDiagProblem` | `check-dcgmi diag` |
+| — | `NPDDcgmRunningProblem` | (internal only) |
+
+Both prefixes are watched by default. Extra names that don't exist on your cluster are harmless.
diff --git a/website/docs/GCM_Sentinel/getting_started.md b/website/docs/GCM_Sentinel/getting_started.md
new file mode 100644
index 0000000..9d93bbc
--- /dev/null
+++ b/website/docs/GCM_Sentinel/getting_started.md
@@ -0,0 +1,116 @@
+---
+sidebar_position: 1
+---
+
+# Getting Started
+
+:::caution Experimental
+GCM Sentinel is experimental. APIs, configuration, and behavior may change between releases.
+:::
+
+GCM Sentinel is an AI-powered sentinel agent that investigates GPU hardware failures detected by [GCM Health Checks](../GCM_Health_Checks/getting_started.md). It uses an LLM (Claude or GPT) to query Prometheus metrics, Kubernetes state, pod logs, and workload status — then recommends remediation actions.
+
+## Quick Start
+
+### Prerequisites
+
+- An LLM API key ([Anthropic](https://console.anthropic.com/) or [OpenAI](https://platform.openai.com/))
+- For cluster deployment: a Kubernetes GPU cluster with GPU metrics exposed to Prometheus (e.g. via [DCGM-exporter](https://github.com/NVIDIA/dcgm-exporter)) and a node health system that sets K8s node conditions (e.g. [GCM Health Checks](../GCM_Health_Checks/kubernetes_deployment.md))
+
+### Try It (No Cluster Needed)
+
+Run 5 GPU failure scenarios against mock backends:
+
+```shell
+pip install gcm-sentinel
+export ANTHROPIC_API_KEY=your-key-here
+python -m gcm_sentinel.demo.run
+```
+
+Or with OpenAI:
+
+```shell
+pip install gcm-sentinel[openai]
+export OPENAI_API_KEY=your-key-here
+python -m gcm_sentinel.demo.run
+```
+
+### Deploy to Your Cluster
+
+```shell
+helm install gcm-sentinel ./charts/gcm-sentinel \
+  --set llm.apiKey=your-key-here \
+  --set prometheus.url=http://prometheus.monitoring:9090
+```
+
+Or from source:
+
+```shell
+git clone https://github.com/facebookresearch/gcm.git
+cd gcm
+helm install gcm-sentinel ./charts/gcm-sentinel \
+  --set llm.apiKey=your-key-here
+```
+
+The agent starts in **recommend mode** (observe-only) by default — zero cluster mutations.
+
+### One-Shot Investigation (CLI)
+
+```shell
+pip install gcm-sentinel
+export GCM_SENTINEL_API_KEY=your-key-here
+gcm-sentinel investigate gpu-node-07 --condition GcmSmiEccProblem
+```
+
+## How It Works
+
+```
+GCM Health Check detects GPU failure
+  → NPD sets node condition (e.g. GcmXidErrorsProblem = True)
+  → gcm-sentinel watcher detects the change
+  → LLM investigates using tools:
+      query_prometheus          — PromQL queries (whatever Prometheus scrapes)
+      query_dcgm_direct         — GPU metrics directly from dcgm-exporter on the node
+      query_node_exporter_direct— host metrics directly from node-exporter
+      query_infiniband_direct   — InfiniBand port metrics from node-exporter
+      get_node_info             — K8s node conditions, taints, labels, pods
+      get_node_events           — recent K8s Events
+      get_pod_logs              — dcgm-exporter and NPD diagnostic logs
+      get_gcm_health            — NPD problem gauges and counters
+      get_workload_info         — training job identity, sibling workers
+      get_workload_logs         — NCCL/CUDA error logs from training pods
+      query_alertmanager        — currently firing alerts
+  → SentinelResult: severity + root_cause + recommended_action + confidence
+  → Output: K8s Event + webhook (Slack/PagerDuty) + optional node annotation
+```
+
+## Configuration
+
+All settings via environment variables (`GCM_SENTINEL_` prefix) or Helm values:
+
+| Variable | Default | Description |
+|---|---|---|
+| `API_KEY` | (required) | LLM API key |
+| `LLM_API` | `anthropic` | `anthropic` or `openai` |
+| `MODEL` | `claude-sonnet-4-6-20250725` | Model name |
+| `ACTION_MODE` | `recommend` | `recommend` / `annotate` / `execute` |
+| `PROMETHEUS_URL` | kube-prometheus-stack default | Prometheus URL |
+| `WEBHOOK_URL` | (empty) | Slack/PagerDuty webhook |
+| `WATCH_CONDITIONS` | GCM defaults | Comma-separated conditions |
+| `MAX_TOOL_OUTPUT` | `8000` | Max chars per tool result sent to LLM. 0 = unlimited. |
+
+See the [Safety & Rollout Guide](./safety.md) for the full configuration reference.
+
+## Works Without GCM
+
+GCM Sentinel works with **any** system that sets Kubernetes node conditions on GPU failures. Configure your condition names:
+
+```shell
+export GCM_SENTINEL_WATCH_CONDITIONS="MyGPUCheck,CustomXidCondition"
+```
+
+It also works without automated detection — use the CLI to trigger investigation manually.
+
+## Adding New Data Sources
+
+You can give the agent access to any system — IPMI sensors, cloud APIs, job schedulers, internal dashboards — by writing a data source (one Python file + one line of registration). See [Adding a New Data Source](./adding_new_datasource.md).
diff --git a/website/docs/GCM_Sentinel/safety.md b/website/docs/GCM_Sentinel/safety.md
new file mode 100644
index 0000000..6b01855
--- /dev/null
+++ b/website/docs/GCM_Sentinel/safety.md
@@ -0,0 +1,174 @@
+---
+sidebar_position: 3
+---
+
+# Safety & Rollout Guide
+
+:::danger Critical
+GCM Sentinel can investigate GPU failures on your production cluster. When configured in `execute` mode, it can **cordon and drain nodes**, which evicts running training jobs. Read this page carefully before deploying.
+:::
+
+## Default: Observe-Only
+
+Out of the box, gcm-sentinel is **completely safe**:
+
+```shell
+# This is observe-only — zero cluster mutations
+helm install gcm-sentinel ./charts/gcm-sentinel \
+  --set llm.apiKey=your-key-here
+```
+
+The default `actionMode=recommend` means:
+- The agent **cannot** cordon, drain, or taint any node
+- The remediation tools are **not even in the LLM's tool schema** — it cannot attempt to call them
+- The RBAC ClusterRole is **read-only** — even if something went wrong, K8s won't allow mutations
+- The agent only reads metrics, node info, events, and pod logs
+
+## Five Layers of Defense
+
+| Layer | What it does | How it works |
+|---|---|---|
+| **1. Action mode** | Controls whether the LLM even sees remediation tools | `recommend`/`annotate`: tools absent from schema — LLM cannot call them. `execute`: tools present. Enforced in Python before anything reaches the LLM. |
+| **2. RBAC** | Kubernetes-level permission enforcement | `recommend`: no `patch`/`eviction` verbs. `annotate`: adds node `patch` only. `execute`: adds `pods/eviction`. Even if code has a bug, K8s rejects unauthorized calls. |
+| **3. Rate limiter** | Prevents cascading drain | Max 3 remediation actions per hour. Enforced **before** the K8s API call — if the limit is hit, the tool returns "RATE LIMITED" to the LLM and no mutation happens. |
+| **4. Cooldown** | Prevents runaway re-investigation | Same node+condition won't be re-investigated within 1 hour. Stops NPD flapping from burning API credits. |
+
+## Controlling via Helm
+
+Every safety parameter is a Helm value and a corresponding environment variable:
+
+:::tip API Key Security
+For production, store the API key in a Kubernetes Secret instead of passing it as plaintext:
+```shell
+kubectl create secret generic gcm-sentinel-llm --from-literal=api-key=your-key-here
+helm install gcm-sentinel ./charts/gcm-sentinel --set llm.existingSecret=gcm-sentinel-llm
+```
+:::
+
+```shell
+# Observe-only (DEFAULT — no flag needed)
+helm install gcm-sentinel ./charts/gcm-sentinel \
+  --set llm.apiKey=your-key-here
+
+# Annotate mode — writes K8s annotations, no cordon/drain
+helm install gcm-sentinel ./charts/gcm-sentinel \
+  --set llm.apiKey=your-key-here \
+  --set sentinel.actionMode=annotate
+
+# Execute mode — can cordon/drain, with safety rails
+helm install gcm-sentinel ./charts/gcm-sentinel \
+  --set llm.apiKey=your-key-here \
+  --set sentinel.actionMode=execute \
+  --set sentinel.maxActionsPerHour=1 \
+  --set sentinel.nodeAllowlist="gpu-node-01"
+```
+
+### Safety Parameters Reference
+
+| Helm Value | Env Var | Default | Description |
+|---|---|---|---|
+| `sentinel.actionMode` | `GCM_SENTINEL_ACTION_MODE` | `recommend` | `recommend` / `annotate` / `execute` |
+| `sentinel.cooldownSeconds` | `GCM_SENTINEL_COOLDOWN_SECONDS` | `3600` | Seconds between investigations for same node+condition |
+| `sentinel.nodeAllowlist` | `GCM_SENTINEL_NODE_ALLOWLIST` | `""` (all) | Comma-separated node patterns (fnmatch) |
+| `sentinel.maxActionsPerHour` | `GCM_SENTINEL_MAX_ACTIONS_PER_HOUR` | `3` | Max remediations per hour (circuit-breaker) |
+
+### Helm Validation
+
+Invalid `actionMode` values are rejected at deploy time:
+
+```
+$ helm install gcm-sentinel ./charts/gcm-sentinel --set sentinel.actionMode=yolo
+Error: ... Invalid sentinel.actionMode: "yolo". Must be one of: recommend, annotate, execute
+```
+
+## Recommended Rollout
+
+### Week 1-2: Recommend Mode (Observe)
+
+```shell
+helm install gcm-sentinel ./charts/gcm-sentinel \
+  --set llm.apiKey=your-key-here \
+  --set sentinel.webhookUrl=https://hooks.slack.com/services/...
+```
+
+- Review investigation results in Slack/PagerDuty
+- Compare the agent's assessments with your team's diagnosis
+- Check K8s Events: `kubectl get events --field-selector reason=GCMSentinel`
+
+### Week 3-4: Annotate Mode (Validate)
+
+```shell
+helm upgrade gcm-sentinel ./charts/gcm-sentinel \
+  --set sentinel.actionMode=annotate
+```
+
+- Check node annotations: `kubectl get node <name> -o jsonpath='{.metadata.annotations}' | jq 'with_entries(select(.key | startswith("gcm-sentinel")))'`
+- Verify the recommended actions match what you'd do manually
+- Optionally build a controller that reads `gcm-sentinel/action` annotations
+
+### Week 5+: Execute Mode (Targeted)
+
+```shell
+helm upgrade gcm-sentinel ./charts/gcm-sentinel \
+  --set sentinel.actionMode=execute \
+  --set sentinel.maxActionsPerHour=1 \
+  --set sentinel.nodeAllowlist="gpu-node-01,gpu-node-02"
+```
+
+- Start with 1-2 test nodes via `nodeAllowlist`
+- Set `maxActionsPerHour=1` (conservative)
+- Monitor K8s Events: `kubectl get events --field-selector reason=GCMSentinel`
+- Expand `nodeAllowlist` gradually as you gain confidence
+
+### Full Auto (Mature)
+
+```shell
+helm upgrade gcm-sentinel ./charts/gcm-sentinel \
+  --set sentinel.actionMode=execute \
+  --set sentinel.nodeAllowlist="" \
+  --set sentinel.maxActionsPerHour=5
+```
+
+## Kill Switch
+
+To immediately disable all remediation without redeploying:
+
+```shell
+# Downgrade to observe-only
+helm upgrade gcm-sentinel ./charts/gcm-sentinel \
+  --set sentinel.actionMode=recommend
+
+# Or via env var (restart required, replace RELEASE_NAME with your Helm release name)
+kubectl set env deployment/RELEASE_NAME-gcm-sentinel GCM_SENTINEL_ACTION_MODE=recommend
+```
+
+## Verifying Safety at Runtime
+
+The agent logs its action mode prominently at startup:
+
+```
+============================================================
+GCM Sentinel Agent
+Action mode: recommend — OBSERVE-ONLY (no cluster mutations)
+Node allowlist: (all nodes)
+Cooldown: 3600s, Prometheus: http://prometheus.monitoring:9090
+============================================================
+```
+
+In execute mode, you'll see a warning:
+
+```
+============================================================
+GCM Sentinel Agent
+Action mode: execute — EXECUTE (can cordon/drain/taint nodes!)
+WARNING: EXECUTE MODE ACTIVE — agent can modify cluster state.
+  Max actions/hour: 1
+Node allowlist: gpu-node-01, gpu-node-02
+============================================================
+```
+
+Check the current mode by inspecting the pod logs:
+
+```shell
+kubectl logs deployment/RELEASE_NAME-gcm-sentinel | head -10
+```
diff --git a/website/docs/getting_started.md b/website/docs/getting_started.md
index 2b2357f..a5f360b 100644
--- a/website/docs/getting_started.md
+++ b/website/docs/getting_started.md
@@ -13,6 +13,7 @@ GCM is a monorepo with the following components:
 - [GCM Monitoring](./GCM_Monitoring/getting_started.md): Continuous data collection, mostly for the [Slurm](https://slurm.schedmd.com/documentation.html) workload scheduler, providing visibility into job performance and resource utilization.
 - [GCM Health Checks](./GCM_Health_Checks/getting_started.md): Verifies the proper functioning of hardware, software, network, storage, and services throughout the job lifecycle.
 - [GCM GPU Metrics](./GCM_GPU_Metrics/getting_started.md): Enhances OpenTelemetry data by correlating telemetry with Slurm metadata, enabling attribution of metrics (e.g., GPU utilization) to specific jobs and users.
+- [GCM Sentinel](./GCM_Sentinel/getting_started.md): AI-powered GPU failure investigation using Claude. Investigates health check failures like an on-call engineer — querying metrics, node state, and workload logs — then recommends remediation actions.
 
 <img src="/gcm/img/gcm_high_level.png" style={{ maxHeight: '400px', display: 'block', margin: '0 auto' }} />
 
@@ -23,6 +24,7 @@ Each component has their own Getting Started and Contributing Guide:
 - [Monitoring](./GCM_Monitoring/getting_started.md)
 - [Health Checks](./GCM_Health_Checks/getting_started.md)
 - [Telemetry Processor / GPU Metrics](./GCM_GPU_Metrics/getting_started.md)
+- [AI Sentinel](./GCM_Sentinel/getting_started.md)
 
 ### Contributing