1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -41,6 +41,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
- Suppressed noisy `Reconciler error` logs and `PodGrouperWarning` events on transient PodGroup update conflicts. The podgrouper now treats `IsConflict` errors as expected and silently requeues the reconcile instead of surfacing the apiserver's "object has been modified" message.
- Fixed kai-operator not reconciling on Prometheus and ServiceMonitor changes. The Config controller now watches owned `Prometheus` and `ServiceMonitor` resources, so deletions and drift trigger reconciliation. CRD presence is checked at startup against the API server (the scheme-only check used previously could not detect missing CRDs), and the watch is registered only when the CRDs are installed. [#877](https://github.com/kai-scheduler/KAI-Scheduler/issues/877)
- Added `before-hook-creation` to the `crd-upgrader` Helm hook delete policy so failed hook Jobs no longer block subsequent `helm upgrade --install` retries. Aligns with the policy already used by the chart's other hook resources. [#1404](https://github.com/kai-scheduler/KAI-Scheduler/issues/1404)
- Fixed `podgroupcontroller` logging spurious errors on every reconcile of completed/failed pods: it tried to fetch DRA `ResourceClaim` objects that the DRA driver had already deleted. Terminal pods now skip the ResourceClaim lookup entirely, mirroring the scheduler-side fix in [#1456](https://github.com/kai-scheduler/KAI-Scheduler/pull/1456). [#1529](https://github.com/kai-scheduler/KAI-Scheduler/issues/1529)

## [v0.14.0] - 2026-03-30

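The conflict-handling change in the first changelog entry above is not part of this diff. As a rough sketch of the pattern it describes — the helper name and signature below are hypothetical, only `apierrors.IsConflict`, `client.Client`, and `ctrl.Result` are real apimachinery/controller-runtime APIs — a reconciler can treat update conflicts as expected and requeue instead of returning the error:

```go
package podgrouper

import (
	"context"

	apierrors "k8s.io/apimachinery/pkg/api/errors"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// updateWithConflictRequeue is a hypothetical helper illustrating the pattern:
// try the update, and on a transient conflict requeue silently instead of
// returning the error (which controller-runtime would log as "Reconciler error").
func updateWithConflictRequeue(ctx context.Context, c client.Client, obj client.Object) (ctrl.Result, error) {
	if err := c.Update(ctx, obj); err != nil {
		if apierrors.IsConflict(err) {
			// "the object has been modified" is expected when several writers
			// race on the same PodGroup; retry on a later reconcile instead of
			// surfacing the error.
			return ctrl.Result{Requeue: true}, nil
		}
		return ctrl.Result{}, err
	}
	return ctrl.Result{}, nil
}
```

Returning a nil error keeps controller-runtime from logging `Reconciler error`, while the requeue retries the update against the newer object version on the next pass.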
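Likewise, the Prometheus/ServiceMonitor watch described in the second entry lives in the kai-operator's Config controller, not in this diff. A minimal sketch of the idea, assuming the prometheus-operator API types are registered in the scheme and that a startup check against the API server has already determined whether the CRDs are installed; the `kaiv1.Config` type, its import path, and the reconciler stub are placeholders, not the real kai-operator code:

```go
package config

import (
	"context"

	monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
	ctrl "sigs.k8s.io/controller-runtime"

	// Placeholder import: stands in for the operator's Config API package.
	kaiv1 "example.com/kai-operator/api/v1"
)

// ConfigReconciler is a stub standing in for the operator's Config controller.
type ConfigReconciler struct{}

func (r *ConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	return ctrl.Result{}, nil
}

// SetupWithManager registers the controller. prometheusCRDsInstalled is the
// result of a startup check against the API server, so watches on owned
// Prometheus/ServiceMonitor objects are only added when the CRDs exist.
func (r *ConfigReconciler) SetupWithManager(mgr ctrl.Manager, prometheusCRDsInstalled bool) error {
	b := ctrl.NewControllerManagedBy(mgr).
		For(&kaiv1.Config{})
	if prometheusCRDsInstalled {
		// Owned objects trigger reconciliation of their owner, so deleting or
		// editing a Prometheus/ServiceMonitor re-runs the Config reconcile.
		b = b.Owns(&monitoringv1.Prometheus{}).
			Owns(&monitoringv1.ServiceMonitor{})
	}
	return b.Complete(r)
}
```

Gating the `Owns` registrations on the startup check avoids the failure mode where the types are present in the scheme but the CRDs were never installed in the cluster.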
13 changes: 13 additions & 0 deletions pkg/podgroupcontroller/controllers/metadata/pod.go
@@ -26,6 +26,15 @@ func GetPodMetadata(
) (*PodMetadata, error) {
	var err error

	if isTerminalPod(pod) {
		// DRA ResourceClaims of terminal pods are deleted by the DRA driver, and
		// the pod no longer requests or holds any resources, so skip the lookup.
		return &PodMetadata{
			RequestedResources: v1.ResourceList{},
			AllocatedResources: v1.ResourceList{},
		}, nil
	}

	draClaims, err := commonresources.FetchPodResourceClaims(ctx, pod, kubeClient, draAPIVersion)
	if err != nil {
		return nil, err
@@ -57,6 +66,10 @@ func isActivePod(pod *v1.Pod) bool {
	return pod.Status.Phase == v1.PodPending || pod.Status.Phase == v1.PodRunning
}

func isTerminalPod(pod *v1.Pod) bool {
	return pod.Status.Phase == v1.PodSucceeded || pod.Status.Phase == v1.PodFailed
}

func isAllocatedPod(pod *v1.Pod) bool {
	if pod.Status.Phase == v1.PodPending {
		return isPodScheduled(pod)
85 changes: 85 additions & 0 deletions pkg/podgroupcontroller/controllers/metadata/pod_test.go
@@ -4,9 +4,17 @@
package metadata

import (
"context"
"testing"

"github.com/stretchr/testify/assert"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
"k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/client/fake"
)

func TestIsPodAllocated(t *testing.T) {
@@ -96,6 +104,83 @@ func TestIsPodAllocated(t *testing.T) {
	}
}

func TestIsTerminalPod(t *testing.T) {
	tests := []struct {
		name           string
		pod            *v1.Pod
		expectedResult bool
	}{
		{
			"pending pod",
			&v1.Pod{Status: v1.PodStatus{Phase: v1.PodPending}},
			false,
		},
		{
			"running pod",
			&v1.Pod{Status: v1.PodStatus{Phase: v1.PodRunning}},
			false,
		},
		{
			"succeeded pod",
			&v1.Pod{Status: v1.PodStatus{Phase: v1.PodSucceeded}},
			true,
		},
		{
			"failed pod",
			&v1.Pod{Status: v1.PodStatus{Phase: v1.PodFailed}},
			true,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := isTerminalPod(tt.pod)
			if tt.expectedResult != result {
				t.Errorf("isTerminalPod() failed. test name: %s, expected: %v, actual: %v",
					tt.name, tt.expectedResult, result)
			}
		})
	}
}

// TestGetPodMetadata_TerminalPodSkipsResourceClaimLookup verifies that pods
// in Succeeded/Failed phases do not trigger a ResourceClaim lookup. The DRA
// driver removes per-pod ResourceClaims when pods reach a terminal phase, so
// fetching them on every reconcile would always fail and produce spurious
// error logs (issue #1529).
func TestGetPodMetadata_TerminalPodSkipsResourceClaimLookup(t *testing.T) {
	tests := []struct {
		name  string
		phase v1.PodPhase
	}{
		{"succeeded pod with missing claim", v1.PodSucceeded},
		{"failed pod with missing claim", v1.PodFailed},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			pod := &v1.Pod{
				ObjectMeta: metav1.ObjectMeta{Name: "test-pod", Namespace: "default"},
				Spec: v1.PodSpec{
					ResourceClaims: []v1.PodResourceClaim{
						{Name: "gpu", ResourceClaimName: ptr.To("missing-claim")},
					},
				},
				Status: v1.PodStatus{Phase: tt.phase},
			}

			scheme := runtime.NewScheme()
			utilruntime.Must(clientgoscheme.AddToScheme(scheme))
			kubeClient := fake.NewClientBuilder().WithScheme(scheme).Build()

			meta, err := GetPodMetadata(context.Background(), pod, kubeClient, "V1")
			assert.NoError(t, err)
			assert.NotNil(t, meta)
			assert.Empty(t, meta.RequestedResources)
			assert.Empty(t, meta.AllocatedResources)
		})
	}
}

func TestIsActivePod(t *testing.T) {
	tests := []struct {
		name string