From f78bcef638362bc7a02797fbbb11117087a84401 Mon Sep 17 00:00:00 2001
From: dttung2905
Date: Sat, 25 Apr 2026 17:59:57 +0100
Subject: [PATCH] Add PDB for kai operator

Signed-off-by: dttung2905
---
 .../kai-scheduler/templates/_helpers.tpl | 28 ++++++++++++++++++++++++++++
 .../operator-poddisruptionbudget.yaml    | 18 ++++++++++++++++++
 deployments/kai-scheduler/values.yaml    |  7 +++++++
 3 files changed, 53 insertions(+)
 create mode 100644 deployments/kai-scheduler/templates/_helpers.tpl
 create mode 100644 deployments/kai-scheduler/templates/services/operator-poddisruptionbudget.yaml

diff --git a/deployments/kai-scheduler/templates/_helpers.tpl b/deployments/kai-scheduler/templates/_helpers.tpl
new file mode 100644
index 000000000..d16e05c3f
--- /dev/null
+++ b/deployments/kai-scheduler/templates/_helpers.tpl
@@ -0,0 +1,28 @@
+# Copyright 2025 NVIDIA CORPORATION
+# SPDX-License-Identifier: Apache-2.0
+
+{{/*
+Operator PodDisruptionBudget: merge values.operator.podDisruptionBudget with safe defaults.
+Uses hasKey (not default) so enabled: false and maxUnavailable: 0 are respected (Sprig default() treats them as empty).
+
+maxUnavailable may be an integer or a percentage string such as "50%". Strings are passed
+through unchanged because Sprig int() would turn "50%" into 0, which forbids every eviction.
+
+Returns a small YAML object with keys: enabled, maxUnavailable
+*/}}
+{{- define "kai-scheduler.operator.podDisruptionBudgetConfig" -}}
+{{- $pdb := .Values.operator.podDisruptionBudget | default dict }}
+{{- $pdbEnabled := true }}
+{{- if hasKey $pdb "enabled" }}
+{{- $pdbEnabled = $pdb.enabled }}
+{{- end }}
+{{- $maxUnavailable := 1 }}
+{{- if hasKey $pdb "maxUnavailable" }}
+{{- if kindIs "string" $pdb.maxUnavailable }}
+{{- $maxUnavailable = $pdb.maxUnavailable }}
+{{- else }}
+{{- $maxUnavailable = int $pdb.maxUnavailable }}
+{{- end }}
+{{- end }}
+{{- dict "enabled" $pdbEnabled "maxUnavailable" $maxUnavailable | toYaml }}
+{{- end }}
diff --git a/deployments/kai-scheduler/templates/services/operator-poddisruptionbudget.yaml b/deployments/kai-scheduler/templates/services/operator-poddisruptionbudget.yaml
new file mode 100644
index 000000000..fe4cdc6e7
--- /dev/null
+++ b/deployments/kai-scheduler/templates/services/operator-poddisruptionbudget.yaml
@@ -0,0 +1,18 @@
+# Copyright 2025 NVIDIA CORPORATION
+# SPDX-License-Identifier: Apache-2.0
+
+{{- $cfg := fromYaml (include "kai-scheduler.operator.podDisruptionBudgetConfig" .) }}
+{{- if and $cfg.enabled (gt (int .Values.operator.replicaCount) 1) }}
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  name: kai-operator-pdb
+  namespace: {{ .Release.Namespace }}
+  labels:
+    app: kai-operator
+spec:
+  maxUnavailable: {{ $cfg.maxUnavailable }}
+  selector:
+    matchLabels:
+      app: kai-operator
+{{- end }}
diff --git a/deployments/kai-scheduler/values.yaml b/deployments/kai-scheduler/values.yaml
index 0c94709a4..172ae4c78 100644
--- a/deployments/kai-scheduler/values.yaml
+++ b/deployments/kai-scheduler/values.yaml
@@ -45,6 +45,13 @@ operator:
   probeBindAddress: ":8081"
   qps: 50
   burst: 300
+  # PodDisruptionBudget limits voluntary evictions (e.g. node drain) so that not all operator pods are
+  # removed at once. It is rendered only when enabled is true and operator.replicaCount > 1, because a
+  # PDB on a single replica can block node drains.
+  # maxUnavailable accepts an integer or a percentage string such as "50%".
+  podDisruptionBudget:
+    enabled: true
+    maxUnavailable: 1

 podgrouper:
   enabled: true