facebookresearch · luccabb · Mar 15, 2026 · Mar 15, 2026 · Mar 15, 2026 · Mar 15, 2026
@@ -184,11 +184,18 @@
             mv "$file" "${file%.deb}-ubuntu${OS_VERSION}.deb"
           done
 
-      - name: Build package (wheel + sdist)
+      - name: Build GCM package (wheel + sdist)
         run: |
           source ~/.cache/venv-ci/bin/activate
           python -m build
 
+      # Build the gcm-sentinel wheel and sdist from inside its own project
+      # directory, then copy the results into the repository-level dist/ folder.
+      - name: Build Sentinel package (wheel + sdist)
+        working-directory: gcm-sentinel
+        run: |
+          source ~/.cache/venv-ci/bin/activate
+          python -m build
+          cp dist/* ../dist/
+
       - name: Create GitHub Release and upload assets
         if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' && github.repository == 'facebookresearch/gcm' && env.RELEASE_EXISTS == 'false' }}
         uses: softprops/action-gh-release@v2
@@ -269,12 +276,18 @@
           username: ${{ github.actor }}
           password: ${{ secrets.GITHUB_TOKEN }}
 
-      - name: Package and push Helm chart
+      - name: Package and push GCM Helm chart
         run: |
           sed -i "s/^version:.*/version: ${VERSION}/" charts/gcm/Chart.yaml
           helm package charts/gcm
           helm push gcm-${VERSION}.tgz oci://ghcr.io/facebookresearch/charts
 
+      # Package the Sentinel chart at the release version and push it to the
+      # OCI chart registry on GHCR.
+      - name: Package and push Sentinel Helm chart
+        run: |
+          sed -i "s/^version:.*/version: ${VERSION}/" charts/gcm-sentinel/Chart.yaml
+          # The sed above only matches lines that start with "version:", so
+          # appVersion is set explicitly here instead of shipping the value
+          # committed in Chart.yaml.
+          helm package charts/gcm-sentinel --app-version "${VERSION}"
+          helm push "gcm-sentinel-${VERSION}.tgz" oci://ghcr.io/facebookresearch/charts
+
   docker-publish:
     needs: [check-release]
     runs-on: ubuntu-latest
@@ -288,6 +301,7 @@
       REGISTRY: ghcr.io
       GCM_IMAGE: ghcr.io/facebookresearch/gcm
       NPD_IMAGE: ghcr.io/facebookresearch/gcm-npd
+      SENTINEL_IMAGE: ghcr.io/facebookresearch/gcm-sentinel
 
     steps:
       - uses: actions/checkout@v4
@@ -331,3 +345,16 @@
           docker build -f docker/Dockerfile.npd -t ${{ env.NPD_IMAGE }}:${{ env.VERSION }} -t ${{ env.NPD_IMAGE }}:latest .
           docker push ${{ env.NPD_IMAGE }}:${{ env.VERSION }}
           docker push ${{ env.NPD_IMAGE }}:latest
+
+      # Build the image from gcm-sentinel/Dockerfile (linux/amd64 only) and push
+      # it under both the release version tag and "latest". Build cache is read
+      # from and written to the GitHub Actions cache under the "sentinel" scope.
+      - name: Build and push Sentinel image
+        uses: docker/build-push-action@v6
+        with:
+          file: gcm-sentinel/Dockerfile
+          context: ./gcm-sentinel
+          platforms: linux/amd64
+          cache-from: type=gha,scope=sentinel
+          cache-to: type=gha,mode=max,scope=sentinel
+          tags: |
+            ${{ env.SENTINEL_IMAGE }}:${{ env.VERSION }}
+            ${{ env.SENTINEL_IMAGE }}:latest
+          push: true
@@ -0,0 +1,6 @@
+# Helm chart metadata for gcm-sentinel.
+apiVersion: v2
+name: gcm-sentinel
+description: AI-powered GPU cluster sentinel agent
+type: application
+# Placeholder in source. The release workflow's "Package and push Sentinel Helm
+# chart" step rewrites the line starting with "version:" in
+# charts/gcm-sentinel/Chart.yaml (presumably this file) before packaging.
+version: 0.0.0
+# Version of the packaged application; also a placeholder in source.
+appVersion: "0.0.0"
@@ -0,0 +1,9 @@
+{{/*
+Abort template rendering unless sentinel.actionMode is one of the supported
+modes (recommend, annotate, execute). Renders nothing on success.
+*/}}
+{{- define "gcm-sentinel.validateActionMode" -}}
+{{- $mode := .Values.sentinel.actionMode -}}
+{{- $allowed := list "recommend" "annotate" "execute" -}}
+{{- if has $mode $allowed | not -}}
+{{- fail (printf "Invalid sentinel.actionMode: %q. Must be one of: %s" $mode (join ", " $allowed)) -}}
+{{- end -}}
+{{- end -}}
@@ -0,0 +1,44 @@
+# Cluster-wide permissions for gcm-sentinel. The first group of rules is always
+# granted; the two conditional sections at the bottom add write access
+# depending on .Values.sentinel.actionMode.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: {{ .Release.Name }}-gcm-sentinel
+  labels:
+    app: gcm-sentinel
+    app.kubernetes.io/name: gcm-sentinel
+    app.kubernetes.io/instance: {{ .Release.Name }}
+rules:
+  # Read access to nodes and their status.
+  - apiGroups: [""]
+    resources: ["nodes"]
+    verbs: ["get", "list", "watch"]
+  - apiGroups: [""]
+    resources: ["nodes/status"]
+    verbs: ["get"]
+  # Read access to pods and their logs.
+  - apiGroups: [""]
+    resources: ["pods"]
+    verbs: ["get", "list"]
+  - apiGroups: [""]
+    resources: ["pods/log"]
+    verbs: ["get"]
+  # Read access to Jobs, StatefulSets and ReplicaSets.
+  - apiGroups: ["batch"]
+    resources: ["jobs"]
+    verbs: ["get", "list"]
+  - apiGroups: ["apps"]
+    resources: ["statefulsets", "replicasets"]
+    verbs: ["get", "list"]
+  # Events are the only resource the unconditional rules allow writing to.
+  - apiGroups: [""]
+    resources: ["events"]
+    verbs: ["get", "list", "create", "patch", "update"]
+
+  {{- if or (eq .Values.sentinel.actionMode "annotate") (eq .Values.sentinel.actionMode "execute") }}
+  # Annotate/execute: write annotations on nodes.
+  - apiGroups: [""]
+    resources: ["nodes"]
+    verbs: ["patch"]
+  {{- end }}
+
+  {{- if eq .Values.sentinel.actionMode "execute" }}
+  # Execute: cordon (patch nodes) + drain (evict pods).
+  # The node patch verb comes from the annotate/execute section above; this
+  # section only adds pod eviction.
+  - apiGroups: [""]
+    resources: ["pods/eviction"]
+    verbs: ["create"]
+  {{- end }}
@@ -0,0 +1,16 @@
+# Binds the release's gcm-sentinel ClusterRole to the ServiceAccount of the
+# same name in the release namespace.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: {{ .Release.Name }}-gcm-sentinel
+  labels:
+    app.kubernetes.io/name: gcm-sentinel
+    app.kubernetes.io/instance: {{ .Release.Name }}
+    app: gcm-sentinel
+subjects:
+  - namespace: {{ .Release.Namespace }}
+    name: {{ .Release.Name }}-gcm-sentinel
+    kind: ServiceAccount
+roleRef:
+  kind: ClusterRole
+  name: {{ .Release.Name }}-gcm-sentinel
+  apiGroup: rbac.authorization.k8s.io
@@ -0,0 +1,26 @@
+# Holds the LLM system prompt for gcm-sentinel. The Deployment mounts this
+# ConfigMap read-only at /etc/gcm-sentinel and passes the file path to the
+# container through GCM_SENTINEL_SYSTEM_PROMPT_PATH.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ .Release.Name }}-gcm-sentinel-prompt
+  namespace: {{ .Release.Namespace }}
+  labels:
+    app: gcm-sentinel
+    app.kubernetes.io/name: gcm-sentinel
+    app.kubernetes.io/instance: {{ .Release.Name }}
+data:
+  # Literal block scalar: everything below is the content of the prompt file,
+  # so any edit here changes the prompt itself.
+  system-prompt.txt: |
+    You are an expert GPU cluster reliability engineer. A Kubernetes node has reported
+    a GPU-related problem.
+
+    Investigate thoroughly using ALL available tools — explore every data source you
+    have access to before making your assessment. The tools available to you are your
+    eyes and hands into the cluster. Use them extensively.
+
+    For XID error code reference, see: https://docs.nvidia.com/deploy/xid-errors/
+
+    ## Assessment
+
+    Provide your final assessment as JSON:
+    ```json
+    {"severity": "critical|warning|info|ok", "summary": "...", "root_cause": "...", "recommended_action": "cordon|drain|taint|reboot|none", "confidence": 0.0-1.0}
+    ```
@@ -0,0 +1,89 @@
+{{- include "gcm-sentinel.validateActionMode" . -}}
+# Deployment for the gcm-sentinel agent. Rendering fails early (via the include
+# above) when sentinel.actionMode is not a supported value. All settings are
+# passed to the container as GCM_SENTINEL_* environment variables.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ .Release.Name }}-gcm-sentinel
+  namespace: {{ .Release.Namespace }}
+  labels:
+    app: gcm-sentinel
+    app.kubernetes.io/name: gcm-sentinel
+    app.kubernetes.io/instance: {{ .Release.Name }}
+spec:
+  replicas: {{ .Values.replicaCount }}
+  selector:
+    matchLabels:
+      app: gcm-sentinel
+      app.kubernetes.io/instance: {{ .Release.Name }}
+  template:
+    metadata:
+      labels:
+        app: gcm-sentinel
+        app.kubernetes.io/instance: {{ .Release.Name }}
+      annotations:
+        # Hash of the rendered configmap.yaml template, so the pod template
+        # changes whenever the system prompt changes.
+        checksum/prompt: {{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum }}
+    spec:
+      serviceAccountName: {{ .Release.Name }}-gcm-sentinel
+      {{- with .Values.nodeSelector }}
+      nodeSelector:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.tolerations }}
+      tolerations:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      containers:
+        - name: gcm-sentinel
+          image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
+          imagePullPolicy: {{ .Values.image.pullPolicy }}
+          # User-supplied values below go through `quote` rather than being
+          # wrapped in literal quotes, so values containing `"` or `\` are
+          # escaped and still render as valid YAML strings.
+          args:
+            - serve
+            - --log-level
+            - {{ .Values.logLevel | quote }}
+          env:
+            - name: GCM_SENTINEL_LLM_API
+              value: {{ .Values.llm.api | quote }}
+            # The API key comes from an existing Secret when llm.existingSecret
+            # is set; otherwise llm.apiKey is written into the pod spec as-is.
+            - name: GCM_SENTINEL_API_KEY
+              {{- if .Values.llm.existingSecret }}
+              valueFrom:
+                secretKeyRef:
+                  name: {{ .Values.llm.existingSecret | quote }}
+                  key: {{ .Values.llm.secretKey | default "api-key" | quote }}
+              {{- else }}
+              value: {{ .Values.llm.apiKey | quote }}
+              {{- end }}
+            - name: GCM_SENTINEL_MODEL
+              value: {{ .Values.llm.model | quote }}
+            - name: GCM_SENTINEL_PROMETHEUS_URL
+              value: {{ .Values.prometheus.url | quote }}
+            - name: GCM_SENTINEL_ACTION_MODE
+              value: {{ .Values.sentinel.actionMode | quote }}
+            - name: GCM_SENTINEL_COOLDOWN_SECONDS
+              value: {{ .Values.sentinel.cooldownSeconds | quote }}
+            {{- if .Values.sentinel.nodeAllowlist }}
+            - name: GCM_SENTINEL_NODE_ALLOWLIST
+              value: {{ .Values.sentinel.nodeAllowlist | quote }}
+            {{- end }}
+            - name: GCM_SENTINEL_MAX_ACTIONS_PER_HOUR
+              value: {{ .Values.sentinel.maxActionsPerHour | quote }}
+            - name: GCM_SENTINEL_MAX_TOOL_ROUNDS
+              value: {{ .Values.sentinel.maxToolRounds | quote }}
+            - name: GCM_SENTINEL_MAX_TOOL_OUTPUT
+              value: {{ .Values.sentinel.maxToolOutput | quote }}
+            # Must stay in sync with the system-prompt volume mount below.
+            - name: GCM_SENTINEL_SYSTEM_PROMPT_PATH
+              value: "/etc/gcm-sentinel/system-prompt.txt"
+            {{- if .Values.sentinel.webhookUrl }}
+            - name: GCM_SENTINEL_WEBHOOK_URL
+              value: {{ .Values.sentinel.webhookUrl | quote }}
+            {{- end }}
+            # The watchConditions list is flattened to a comma-separated string.
+            - name: GCM_SENTINEL_WATCH_CONDITIONS
+              value: {{ join "," .Values.sentinel.watchConditions | quote }}
+          resources:
+            {{- toYaml .Values.resources | nindent 12 }}
+          volumeMounts:
+            - name: system-prompt
+              mountPath: /etc/gcm-sentinel
+              readOnly: true
+      volumes:
+        - name: system-prompt
+          configMap:
+            name: {{ .Release.Name }}-gcm-sentinel-prompt
@@ -0,0 +1,9 @@
+# Identity the gcm-sentinel pods run as; the Deployment references it through
+# serviceAccountName.
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  namespace: {{ .Release.Namespace }}
+  name: {{ .Release.Name }}-gcm-sentinel
+  labels:
+    app.kubernetes.io/name: gcm-sentinel
+    app.kubernetes.io/instance: {{ .Release.Name }}
+    app: gcm-sentinel
@@ -0,0 +1,124 @@
+replicaCount: 1
+
+# Container image for the sentinel Deployment.
+image:
+  repository: ghcr.io/facebookresearch/gcm-sentinel
+  # "latest" is a moving tag. For reproducible rollouts, pin a release version,
+  # e.g. --set image.tag=<version>.
+  tag: latest
+  # Always, because with a moving tag IfNotPresent lets a node keep running a
+  # stale cached image after a newer "latest" has been pushed. Switch to
+  # IfNotPresent once image.tag is pinned to an immutable version.
+  pullPolicy: Always
+
+# LLM configuration. Each value is passed to the container as a
+# GCM_SENTINEL_* environment variable by the Deployment template.
+llm:
+  # Provider: "anthropic" (Claude) or "openai" (GPT, vLLM, Ollama, Azure OpenAI).
+  api: "anthropic"
+  # API key — provide ONE of:
+  #   apiKey: "..." (plaintext, for testing only)
+  #   existingSecret: "my-secret" + secretKey: "api-key" (recommended for production)
+  # When existingSecret is non-empty it takes precedence and apiKey is ignored.
+  # Otherwise apiKey is written into the pod spec as a plain env value, so it is
+  # visible to anyone who can read the Deployment.
+  apiKey: ""
+  existingSecret: ""
+  # Key inside existingSecret that holds the API key.
+  secretKey: "api-key"
+  # Model name/ID.
+  model: "claude-sonnet-4-6-20250725"
+
+# Prometheus endpoint in the cluster.
+prometheus:
+  url: "http://kube-prometheus-stack-prometheus.monitoring:9090"
+
+# ============================================================================
+# SAFETY CONFIGURATION
+#
+# By default, gcm-sentinel is OBSERVE-ONLY. It investigates GPU failures and
+# reports findings but NEVER modifies cluster state (no cordon, no drain,
+# no taint). This is enforced at three levels:
+#
+#   1. actionMode="recommend" — remediation tools are not even in the LLM's
+#      tool schema, so it cannot attempt to call them.
+#   2. RBAC — the ClusterRole gets the node patch verb only when actionMode
+#      is "annotate" or "execute", and pod eviction only when it is
+#      "execute". In recommend mode, RBAC is read-only apart from creating
+#      and updating Events.
+#   3. Rate limits — even in execute mode, actions are gated by confidence
+#      threshold, cooldown, and per-hour rate limits.
+#
+# To enable automated remediation, you must EXPLICITLY set:
+#   --set sentinel.actionMode=execute
+#
+# We strongly recommend running in "recommend" mode for at least 2 weeks
+# to validate the agent's assessments before enabling any mutations.
+# ============================================================================
+
+sentinel:
+  # ACTION MODE — controls whether the agent can modify cluster state.
+  #
+  #   "recommend" (DEFAULT, SAFE) — observe-only. Investigate, log, emit
+  #               K8s Events, send webhooks. No node or pod mutations.
+  #               RBAC is read-only except for Events. The agent cannot
+  #               cordon/drain/taint.
+  #
+  #   "annotate"  — everything above + write gcm-sentinel/* annotations on
+  #                 the node. A separate controller or human acts on them.
+  #                 RBAC adds node patch. The agent still cannot cordon/drain.
+  #
+  #   "execute"   — everything above + the agent can cordon/drain/taint nodes,
+  #                 but only while maxActionsPerHour is not exceeded. RBAC adds
+  #                 pod evictions. USE WITH CAUTION.
+  actionMode: "recommend"
+
+  # Minimum seconds between investigations for the same node+condition.
+  # Prevents runaway investigations from NPD condition flapping.
+  cooldownSeconds: "3600"
+
+  # If non-empty, only investigate nodes matching these patterns (fnmatch glob).
+  # Use to scope rollout to a subset of nodes. Comma-separated.
+  # Example: "gpu-node-01,gpu-node-02,gpu-node-0*"
+  nodeAllowlist: ""
+
+  # Max remediation actions (cordon/drain/taint) per hour across ALL nodes.
+  # Circuit-breaker to prevent cascading drain. Only relevant in execute mode.
+  maxActionsPerHour: "3"
+
+  # Maximum tool-use rounds per investigation.
+  maxToolRounds: "25"
+
+  # Max characters per tool result sent to the LLM. Controls context cost.
+  # 0 = unlimited (LLM sees full output).
+  maxToolOutput: "8000"
+
+  # Webhook URL to POST results to (e.g. Slack incoming webhook).
+  # Leave empty to disable.
+  webhookUrl: ""
+
+  # Node conditions to watch. Defaults include both OSS GCM conditions
+  # (Gcm* prefix) and internal variants (NPD* prefix). Only conditions
+  # that exist on your cluster will trigger — extra names are harmless.
+  watchConditions:
+    # OSS GCM conditions (https://github.com/facebookresearch/gcm)
+    - GcmXidErrorsProblem
+    - GcmSmiEccProblem
+    - GcmSmiDisconnectedProblem
+    - GcmProcZombieProblem
+    - GcmDcgmiNvlinkStatusProblem
+    - GcmDcgmiDiagProblem
+    # Internal variants
+    - NPDXidErrorsProblem
+    - NPDSmiEccProblem
+    - NPDSmiDisconnectedProblem
+    - NPDProcZombieProblem
+    - NPDDcgmiNvlinkStatusProblem
+    - NPDDcgmiDiagProblem
+    - NPDDcgmRunningProblem
+
+# Resource limits.
+resources:
+  requests:
+    cpu: 100m
+    memory: 256Mi
+  limits:
+    cpu: 500m
+    memory: 512Mi
+
+# Log level.
+logLevel: INFO
+
+# Node selector — run on infra/non-GPU nodes so the sentinel agent
+# doesn't consume GPU resources. Set to match your cluster's infra nodes.
+# Example for AWS: node.kubernetes.io/instance-type: m5.4xlarge
+nodeSelector: {}
+
+tolerations: []