Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 29 additions & 2 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -184,11 +184,18 @@
mv "$file" "${file%.deb}-ubuntu${OS_VERSION}.deb"
done

- name: Build package (wheel + sdist)
- name: Build GCM package (wheel + sdist)
run: |
source ~/.cache/venv-ci/bin/activate
python -m build

# Build the gcm-sentinel wheel and sdist from inside its own project directory,
# then copy the resulting artifacts into the repository-level dist/ directory.
# NOTE(review): presumably so they ship with the GCM package artifacts — confirm
# against the release upload step.
- name: Build Sentinel package (wheel + sdist)
run: |
source ~/.cache/venv-ci/bin/activate
cd gcm-sentinel
python -m build
cp dist/* ../dist/

- name: Create GitHub Release and upload assets
if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' && github.repository == 'facebookresearch/gcm' && env.RELEASE_EXISTS == 'false' }}
uses: softprops/action-gh-release@v2
Expand Down Expand Up @@ -269,12 +276,18 @@
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Package and push Helm chart
- name: Package and push GCM Helm chart
run: |
sed -i "s/^version:.*/version: ${VERSION}/" charts/gcm/Chart.yaml
helm package charts/gcm
helm push gcm-${VERSION}.tgz oci://ghcr.io/facebookresearch/charts

# Package the gcm-sentinel chart at the release version and push it to GHCR as
# an OCI artifact.
- name: Package and push Sentinel Helm chart
  run: |
    sed -i "s/^version:.*/version: ${VERSION}/" charts/gcm-sentinel/Chart.yaml
    # The sed above only matches the line-initial "version:" key, so the
    # chart's appVersion would stay at its "0.0.0" placeholder. Stamp it at
    # package time so the published chart reports the release it belongs to.
    helm package charts/gcm-sentinel --app-version "${VERSION}"
    helm push gcm-sentinel-${VERSION}.tgz oci://ghcr.io/facebookresearch/charts

docker-publish:
needs: [check-release]
runs-on: ubuntu-latest
Expand All @@ -288,6 +301,7 @@
REGISTRY: ghcr.io
GCM_IMAGE: ghcr.io/facebookresearch/gcm
NPD_IMAGE: ghcr.io/facebookresearch/gcm-npd
SENTINEL_IMAGE: ghcr.io/facebookresearch/gcm-sentinel

steps:
- uses: actions/checkout@v4
Expand Down Expand Up @@ -331,3 +345,16 @@
docker build -f docker/Dockerfile.npd -t ${{ env.NPD_IMAGE }}:${{ env.VERSION }} -t ${{ env.NPD_IMAGE }}:latest .
docker push ${{ env.NPD_IMAGE }}:${{ env.VERSION }}
docker push ${{ env.NPD_IMAGE }}:latest

- name: Build and push Sentinel image
uses: docker/build-push-action@v6

Check warning

Code scanning / CodeQL

Unpinned tag for a non-immutable Action in workflow Medium

Unpinned 3rd party Action 'Release GCM Monitoring and Health Checks on version bump' step
Uses Step
uses 'docker/build-push-action' with ref 'v6', not a pinned commit hash
with:
context: ./gcm-sentinel
file: gcm-sentinel/Dockerfile
platforms: linux/amd64
push: true
tags: |
${{ env.SENTINEL_IMAGE }}:${{ env.VERSION }}
${{ env.SENTINEL_IMAGE }}:latest
cache-from: type=gha,scope=sentinel
cache-to: type=gha,mode=max,scope=sentinel
6 changes: 6 additions & 0 deletions charts/gcm-sentinel/Chart.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Helm chart metadata for the gcm-sentinel agent.
apiVersion: v2
type: application
name: gcm-sentinel
description: AI-powered GPU cluster sentinel agent
# Placeholder values: the release workflow rewrites the "version:" line with the
# release version before packaging the chart.
version: 0.0.0
appVersion: "0.0.0"
9 changes: 9 additions & 0 deletions charts/gcm-sentinel/templates/_helpers.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{{/*
gcm-sentinel.validateActionMode

Aborts rendering with a descriptive error unless .Values.sentinel.actionMode is
one of the supported modes (recommend, annotate, execute). Emits no output.
*/}}
{{- define "gcm-sentinel.validateActionMode" -}}
{{- $allowedModes := list "recommend" "annotate" "execute" -}}
{{- $mode := .Values.sentinel.actionMode -}}
{{- if not (has $mode $allowedModes) -}}
{{- fail (printf "Invalid sentinel.actionMode: %q. Must be one of: %s" $mode (join ", " $allowedModes)) -}}
{{- end -}}
{{- end -}}
44 changes: 44 additions & 0 deletions charts/gcm-sentinel/templates/clusterrole.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Cluster-wide permissions for the gcm-sentinel agent. The baseline rules apply
# in every action mode; the conditional rules at the bottom are only rendered
# for the "annotate" and "execute" modes.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: {{ .Release.Name }}-gcm-sentinel
  labels:
    app: gcm-sentinel
    app.kubernetes.io/name: gcm-sentinel
    app.kubernetes.io/instance: {{ .Release.Name }}
rules:
  - apiGroups:
      - ""
    resources:
      - nodes
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - ""
    resources:
      - nodes/status
    verbs:
      - get
  - apiGroups:
      - ""
    resources:
      - pods
    verbs:
      - get
      - list
  - apiGroups:
      - ""
    resources:
      - pods/log
    verbs:
      - get
  - apiGroups:
      - batch
    resources:
      - jobs
    verbs:
      - get
      - list
  - apiGroups:
      - apps
    resources:
      - statefulsets
      - replicasets
    verbs:
      - get
      - list
  - apiGroups:
      - ""
    resources:
      - events
    verbs:
      - get
      - list
      - create
      - patch
      - update

  {{- if or (eq .Values.sentinel.actionMode "annotate") (eq .Values.sentinel.actionMode "execute") }}
  # Annotate/execute modes: allow patching nodes (used to write annotations).
  - apiGroups:
      - ""
    resources:
      - nodes
    verbs:
      - patch
  {{- end }}

  {{- if eq .Values.sentinel.actionMode "execute" }}
  # Execute mode only: allow creating pod evictions (drain).
  - apiGroups:
      - ""
    resources:
      - pods/eviction
    verbs:
      - create
  {{- end }}
16 changes: 16 additions & 0 deletions charts/gcm-sentinel/templates/clusterrolebinding.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Grants the release's gcm-sentinel ClusterRole to the gcm-sentinel
# ServiceAccount in the release namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: {{ .Release.Name }}-gcm-sentinel
  labels:
    app.kubernetes.io/name: gcm-sentinel
    app.kubernetes.io/instance: {{ .Release.Name }}
    app: gcm-sentinel
subjects:
  - kind: ServiceAccount
    namespace: {{ .Release.Namespace }}
    name: {{ .Release.Name }}-gcm-sentinel
roleRef:
  kind: ClusterRole
  apiGroup: rbac.authorization.k8s.io
  name: {{ .Release.Name }}-gcm-sentinel
26 changes: 26 additions & 0 deletions charts/gcm-sentinel/templates/configmap.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# System prompt for the sentinel agent. The Deployment mounts this ConfigMap at
# /etc/gcm-sentinel and points GCM_SENTINEL_SYSTEM_PROMPT_PATH at the
# system-prompt.txt key below.
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ .Release.Name }}-gcm-sentinel-prompt
  namespace: {{ .Release.Namespace }}
  labels:
    app.kubernetes.io/name: gcm-sentinel
    app.kubernetes.io/instance: {{ .Release.Name }}
    app: gcm-sentinel
data:
  system-prompt.txt: |
    You are an expert GPU cluster reliability engineer. A Kubernetes node has reported
    a GPU-related problem.

    Investigate thoroughly using ALL available tools — explore every data source you
    have access to before making your assessment. The tools available to you are your
    eyes and hands into the cluster. Use them extensively.

    For XID error code reference, see: https://docs.nvidia.com/deploy/xid-errors/

    ## Assessment

    Provide your final assessment as JSON:
    ```json
    {"severity": "critical|warning|info|ok", "summary": "...", "root_cause": "...", "recommended_action": "cordon|drain|taint|reboot|none", "confidence": 0.0-1.0}
    ```
89 changes: 89 additions & 0 deletions charts/gcm-sentinel/templates/deployment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
{{- /*
Deployment for the gcm-sentinel agent.

Rendering fails up front (via gcm-sentinel.validateActionMode) when
sentinel.actionMode is not a supported mode. All user-supplied scalars are
emitted through `quote` so that values containing double quotes or backslashes
are escaped instead of producing broken YAML.
*/ -}}
{{- include "gcm-sentinel.validateActionMode" . -}}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ .Release.Name }}-gcm-sentinel
  namespace: {{ .Release.Namespace }}
  labels:
    app: gcm-sentinel
    app.kubernetes.io/name: gcm-sentinel
    app.kubernetes.io/instance: {{ .Release.Name }}
spec:
  replicas: {{ .Values.replicaCount }}
  selector:
    matchLabels:
      app: gcm-sentinel
      app.kubernetes.io/instance: {{ .Release.Name }}
  template:
    metadata:
      labels:
        app: gcm-sentinel
        app.kubernetes.io/instance: {{ .Release.Name }}
      annotations:
        # Hash of the rendered prompt ConfigMap: editing the prompt changes the
        # pod template and therefore rolls the pods.
        checksum/prompt: {{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum }}
    spec:
      serviceAccountName: {{ .Release.Name }}-gcm-sentinel
      {{- with .Values.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      containers:
        - name: gcm-sentinel
          image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
          imagePullPolicy: {{ .Values.image.pullPolicy }}
          args:
            - serve
            - --log-level
            - {{ .Values.logLevel | quote }}
          env:
            - name: GCM_SENTINEL_LLM_API
              value: {{ .Values.llm.api | quote }}
            - name: GCM_SENTINEL_API_KEY
              {{- if .Values.llm.existingSecret }}
              # Preferred: read the key from a pre-existing Secret.
              valueFrom:
                secretKeyRef:
                  name: {{ .Values.llm.existingSecret | quote }}
                  key: {{ .Values.llm.secretKey | default "api-key" | quote }}
              {{- else }}
              # Plaintext fallback: the key is visible in the Deployment spec.
              value: {{ .Values.llm.apiKey | quote }}
              {{- end }}
            - name: GCM_SENTINEL_MODEL
              value: {{ .Values.llm.model | quote }}
            - name: GCM_SENTINEL_PROMETHEUS_URL
              value: {{ .Values.prometheus.url | quote }}
            - name: GCM_SENTINEL_ACTION_MODE
              value: {{ .Values.sentinel.actionMode | quote }}
            - name: GCM_SENTINEL_COOLDOWN_SECONDS
              value: {{ .Values.sentinel.cooldownSeconds | quote }}
            {{- if .Values.sentinel.nodeAllowlist }}
            - name: GCM_SENTINEL_NODE_ALLOWLIST
              value: {{ .Values.sentinel.nodeAllowlist | quote }}
            {{- end }}
            - name: GCM_SENTINEL_MAX_ACTIONS_PER_HOUR
              value: {{ .Values.sentinel.maxActionsPerHour | quote }}
            - name: GCM_SENTINEL_MAX_TOOL_ROUNDS
              value: {{ .Values.sentinel.maxToolRounds | quote }}
            - name: GCM_SENTINEL_MAX_TOOL_OUTPUT
              value: {{ .Values.sentinel.maxToolOutput | quote }}
            - name: GCM_SENTINEL_SYSTEM_PROMPT_PATH
              value: "/etc/gcm-sentinel/system-prompt.txt"
            {{- if .Values.sentinel.webhookUrl }}
            - name: GCM_SENTINEL_WEBHOOK_URL
              value: {{ .Values.sentinel.webhookUrl | quote }}
            {{- end }}
            # The list of watched conditions is passed as one comma-separated string.
            - name: GCM_SENTINEL_WATCH_CONDITIONS
              value: {{ join "," .Values.sentinel.watchConditions | quote }}
          resources:
            {{- toYaml .Values.resources | nindent 12 }}
          volumeMounts:
            # Provides /etc/gcm-sentinel/system-prompt.txt from the prompt ConfigMap.
            - name: system-prompt
              mountPath: /etc/gcm-sentinel
              readOnly: true
      volumes:
        - name: system-prompt
          configMap:
            name: {{ .Release.Name }}-gcm-sentinel-prompt
9 changes: 9 additions & 0 deletions charts/gcm-sentinel/templates/serviceaccount.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Identity the gcm-sentinel Deployment runs as; the ClusterRoleBinding attaches
# the agent's cluster permissions to this account.
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: {{ .Release.Namespace }}
  name: {{ .Release.Name }}-gcm-sentinel
  labels:
    app.kubernetes.io/name: gcm-sentinel
    app.kubernetes.io/instance: {{ .Release.Name }}
    app: gcm-sentinel
124 changes: 124 additions & 0 deletions charts/gcm-sentinel/values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
# Number of Deployment replicas.
replicaCount: 1

image:
  repository: ghcr.io/facebookresearch/gcm-sentinel
  # "latest" is a mutable tag that every release re-pushes. Pin a released
  # version tag here for reproducible deployments.
  tag: latest
  # Always re-pull: with a mutable tag, IfNotPresent would keep running whatever
  # image a node cached first, even after a newer "latest" has been published.
  # Switch back to IfNotPresent once `tag` is pinned to an immutable version.
  pullPolicy: Always

# LLM configuration.
llm:
# Provider: "anthropic" (Claude) or "openai" (GPT, vLLM, Ollama, Azure OpenAI).
# Exposed to the agent as GCM_SENTINEL_LLM_API.
api: "anthropic"
# API key — provide ONE of:
# apiKey: "..." (plaintext, for testing only)
# existingSecret: "my-secret" + secretKey: "api-key" (recommended for production)
# When existingSecret is non-empty it wins and apiKey is ignored; otherwise
# apiKey is written as a plaintext env value into the Deployment spec.
apiKey: ""
existingSecret: ""
# Key inside existingSecret that holds the API key; the template falls back to
# "api-key" when this is empty.
secretKey: "api-key"
# Model name/ID. Exposed to the agent as GCM_SENTINEL_MODEL.
model: "claude-sonnet-4-6-20250725"

# Prometheus endpoint in the cluster.
# Exposed to the agent as GCM_SENTINEL_PROMETHEUS_URL.
prometheus:
url: "http://kube-prometheus-stack-prometheus.monitoring:9090"

# ============================================================================
# SAFETY CONFIGURATION
#
# By default, gcm-sentinel is OBSERVE-ONLY. It investigates GPU failures and
# reports findings but NEVER modifies cluster state (no cordon, no drain,
# no taint). This is enforced at three levels:
#
# 1. actionMode="recommend" — remediation tools are not even in the LLM's
# tool schema, so it cannot attempt to call them.
# 2. RBAC — the ClusterRole only gets the node "patch" verb when actionMode
# is "annotate" or "execute", and pod eviction only when it is "execute".
# In recommend mode the only write access is creating/updating Events.
# 3. Rate limits — even in execute mode, actions are gated by confidence
# threshold, cooldown, and per-hour rate limits.
#
# To enable automated remediation, you must EXPLICITLY set:
# --set sentinel.actionMode=execute
#
# We strongly recommend running in "recommend" mode for at least 2 weeks
# to validate the agent's assessments before enabling any mutations.
# ============================================================================

sentinel:
# ACTION MODE — controls whether the agent can modify cluster state.
# Any other value makes chart rendering fail.
#
# "recommend" (DEFAULT, SAFE) — observe-only. Investigate, log, emit
# K8s Events, send webhooks. No node or pod mutations.
# RBAC is read-only apart from writing Events. The agent cannot
# cordon/drain/taint.
#
# "annotate" — everything above + write gcm-sentinel/* annotations on
# the node. A separate controller or human acts on them.
# RBAC adds node patch. The agent still cannot cordon/drain.
#
# "execute" — everything above + the agent can cordon/drain/taint nodes,
# but only while maxActionsPerHour has not been exceeded.
# RBAC adds pod evictions. USE WITH CAUTION.
actionMode: "recommend"

# Minimum seconds between investigations for the same node+condition.
# Prevents runaway investigations from NPD condition flapping.
cooldownSeconds: "3600"

# If non-empty, only investigate nodes matching these patterns (fnmatch glob).
# Use to scope rollout to a subset of nodes. Comma-separated.
# Example: "gpu-node-01,gpu-node-02,gpu-node-0*"
nodeAllowlist: ""

# Max remediation actions (cordon/drain/taint) per hour across ALL nodes.
# Circuit-breaker to prevent cascading drain. Only relevant in execute mode.
maxActionsPerHour: "3"

# Maximum tool-use rounds per investigation.
maxToolRounds: "25"

# Max characters per tool result sent to the LLM. Controls context cost.
# 0 = unlimited (LLM sees full output).
maxToolOutput: "8000"

# Webhook URL to POST results to (e.g. Slack incoming webhook).
# Leave empty to disable.
webhookUrl: ""

# Node conditions to watch. Defaults include both OSS GCM conditions
# (Gcm* prefix) and internal variants (NPD* prefix). Only conditions
# that exist on your cluster will trigger — extra names are harmless.
# The list is passed to the agent as a single comma-separated string.
watchConditions:
# OSS GCM conditions (https://github.com/facebookresearch/gcm)
- GcmXidErrorsProblem
- GcmSmiEccProblem
- GcmSmiDisconnectedProblem
- GcmProcZombieProblem
- GcmDcgmiNvlinkStatusProblem
- GcmDcgmiDiagProblem
# Internal variants
- NPDXidErrorsProblem
- NPDSmiEccProblem
- NPDSmiDisconnectedProblem
- NPDProcZombieProblem
- NPDDcgmiNvlinkStatusProblem
- NPDDcgmiDiagProblem
- NPDDcgmRunningProblem

# Resource requests and limits.
# Rendered as-is into the gcm-sentinel container's `resources` field.
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 500m
memory: 512Mi

# Log level.
# Passed to the container as `serve --log-level <value>`.
logLevel: INFO

# Node selector — run on infra/non-GPU nodes so the sentinel agent
# doesn't consume GPU resources. Set to match your cluster's infra nodes.
# Example for AWS: node.kubernetes.io/instance-type: m5.4xlarge
# Omitted from the Deployment when empty.
nodeSelector: {}

# Tolerations for the sentinel pod. Omitted from the Deployment when empty.
tolerations: []
Loading
Loading