1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -41,6 +41,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
- Suppressed noisy `Reconciler error` logs and `PodGrouperWarning` events on transient PodGroup update conflicts. The podgrouper now treats `IsConflict` errors as expected and silently requeues the reconcile instead of surfacing the apiserver's "object has been modified" message.
- Fixed kai-operator not reconciling on Prometheus and ServiceMonitor changes. The Config controller now watches owned `Prometheus` and `ServiceMonitor` resources, so deletions and drift trigger reconciliation. CRD presence is checked at startup against the API server (the scheme-only check used previously could not detect missing CRDs), and the watch is registered only when the CRDs are installed. [#877](https://github.com/kai-scheduler/KAI-Scheduler/issues/877)
- Added `before-hook-creation` to the `crd-upgrader` Helm hook delete policy so failed hook Jobs no longer block subsequent `helm upgrade --install` retries. Aligns with the policy already used by the chart's other hook resources. [#1404](https://github.com/kai-scheduler/KAI-Scheduler/issues/1404)
- Fixed `podgroupcontroller` logging spurious errors on every reconcile of completed/failed pods: it tried to fetch DRA `ResourceClaim` objects that the DRA driver had already deleted. Terminal pods now skip the ResourceClaim lookup entirely, mirroring the scheduler-side fix in [#1456](https://github.com/kai-scheduler/KAI-Scheduler/pull/1456). [#1529](https://github.com/kai-scheduler/KAI-Scheduler/issues/1529)

## [v0.14.0] - 2026-03-30

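The conflict-handling change in the first changelog entry above is not part of this diff. As a rough sketch of the pattern it describes — the helper name and signature below are hypothetical, only `apierrors.IsConflict`, `client.Client`, and `ctrl.Result` are real apimachinery/controller-runtime APIs — a reconciler can treat update conflicts as expected and requeue instead of returning the error:

```go
package podgrouper

import (
	"context"

	apierrors "k8s.io/apimachinery/pkg/api/errors"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// updateWithConflictRequeue is a hypothetical helper illustrating the pattern:
// try the update, and on a transient conflict requeue silently instead of
// returning the error (which controller-runtime would log as "Reconciler error").
func updateWithConflictRequeue(ctx context.Context, c client.Client, obj client.Object) (ctrl.Result, error) {
	if err := c.Update(ctx, obj); err != nil {
		if apierrors.IsConflict(err) {
			// "the object has been modified" is expected when several writers
			// race on the same PodGroup; retry on a later reconcile instead of
			// surfacing the error.
			return ctrl.Result{Requeue: true}, nil
		}
		return ctrl.Result{}, err
	}
	return ctrl.Result{}, nil
}
```

Returning a nil error keeps controller-runtime from logging `Reconciler error`, while the requeue retries the update against the newer object version on the next pass.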
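Likewise, the Prometheus/ServiceMonitor watch described in the second entry lives in the kai-operator's Config controller, not in this diff. A minimal sketch of the idea, assuming the prometheus-operator API types are registered in the scheme and that a startup check against the API server has already determined whether the CRDs are installed; the `kaiv1.Config` type, its import path, and the reconciler stub are placeholders, not the real kai-operator code:

```go
package config

import (
	"context"

	monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
	ctrl "sigs.k8s.io/controller-runtime"

	// Placeholder import: stands in for the operator's Config API package.
	kaiv1 "example.com/kai-operator/api/v1"
)

// ConfigReconciler is a stub standing in for the operator's Config controller.
type ConfigReconciler struct{}

func (r *ConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	return ctrl.Result{}, nil
}

// SetupWithManager registers the controller. prometheusCRDsInstalled is the
// result of a startup check against the API server, so watches on owned
// Prometheus/ServiceMonitor objects are only added when the CRDs exist.
func (r *ConfigReconciler) SetupWithManager(mgr ctrl.Manager, prometheusCRDsInstalled bool) error {
	b := ctrl.NewControllerManagedBy(mgr).
		For(&kaiv1.Config{})
	if prometheusCRDsInstalled {
		// Owned objects trigger reconciliation of their owner, so deleting or
		// editing a Prometheus/ServiceMonitor re-runs the Config reconcile.
		b = b.Owns(&monitoringv1.Prometheus{}).
			Owns(&monitoringv1.ServiceMonitor{})
	}
	return b.Complete(r)
}
```

Gating the `Owns` registrations on the startup check avoids the failure mode where the types are present in the scheme but the CRDs were never installed in the cluster.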
13 changes: 13 additions & 0 deletions pkg/podgroupcontroller/controllers/metadata/pod.go
@@ -26,6 +26,15 @@ func GetPodMetadata(
) (*PodMetadata, error) {
	var err error

	if isTerminalPod(pod) {
		// DRA ResourceClaims of terminal pods are deleted by the DRA driver, and
		// the pod no longer requests or holds any resources, so skip the lookup.
		return &PodMetadata{
			RequestedResources: v1.ResourceList{},
			AllocatedResources: v1.ResourceList{},
		}, nil
	}

	draClaims, err := commonresources.FetchPodResourceClaims(ctx, pod, kubeClient, draAPIVersion)
	if err != nil {
		return nil, err
@@ -57,6 +66,10 @@ func isActivePod(pod *v1.Pod) bool {
	return pod.Status.Phase == v1.PodPending || pod.Status.Phase == v1.PodRunning
}

func isTerminalPod(pod *v1.Pod) bool {
	return pod.Status.Phase == v1.PodSucceeded || pod.Status.Phase == v1.PodFailed
}

func isAllocatedPod(pod *v1.Pod) bool {
	if pod.Status.Phase == v1.PodPending {
		return isPodScheduled(pod)
85 changes: 85 additions & 0 deletions pkg/podgroupcontroller/controllers/metadata/pod_test.go
@@ -4,9 +4,17 @@
package metadata

import (
"context"
"testing"

"github.com/stretchr/testify/assert"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
"k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/client/fake"
)

func TestIsPodAllocated(t *testing.T) {
@@ -96,6 +104,83 @@ func TestIsPodAllocated(t *testing.T) {
	}
}

func TestIsTerminalPod(t *testing.T) {
	tests := []struct {
		name           string
		pod            *v1.Pod
		expectedResult bool
	}{
		{
			"pending pod",
			&v1.Pod{Status: v1.PodStatus{Phase: v1.PodPending}},
			false,
		},
		{
			"running pod",
			&v1.Pod{Status: v1.PodStatus{Phase: v1.PodRunning}},
			false,
		},
		{
			"succeeded pod",
			&v1.Pod{Status: v1.PodStatus{Phase: v1.PodSucceeded}},
			true,
		},
		{
			"failed pod",
			&v1.Pod{Status: v1.PodStatus{Phase: v1.PodFailed}},
			true,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := isTerminalPod(tt.pod)
			if tt.expectedResult != result {
				t.Errorf("isTerminalPod() failed. test name: %s, expected: %v, actual: %v",
					tt.name, tt.expectedResult, result)
			}
		})
	}
}

// TestGetPodMetadata_TerminalPodSkipsResourceClaimLookup verifies that pods
// in Succeeded/Failed phases do not trigger a ResourceClaim lookup. The DRA
// driver removes per-pod ResourceClaims when pods reach a terminal phase, so
// fetching them on every reconcile would always fail and produce spurious
// error logs (issue #1529).
func TestGetPodMetadata_TerminalPodSkipsResourceClaimLookup(t *testing.T) {
	tests := []struct {
		name  string
		phase v1.PodPhase
	}{
		{"succeeded pod with missing claim", v1.PodSucceeded},
		{"failed pod with missing claim", v1.PodFailed},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			pod := &v1.Pod{
				ObjectMeta: metav1.ObjectMeta{Name: "test-pod", Namespace: "default"},
				Spec: v1.PodSpec{
					ResourceClaims: []v1.PodResourceClaim{
						{Name: "gpu", ResourceClaimName: ptr.To("missing-claim")},
					},
				},
				Status: v1.PodStatus{Phase: tt.phase},
			}

			scheme := runtime.NewScheme()
			utilruntime.Must(clientgoscheme.AddToScheme(scheme))
			kubeClient := fake.NewClientBuilder().WithScheme(scheme).Build()

			meta, err := GetPodMetadata(context.Background(), pod, kubeClient, "V1")
			assert.NoError(t, err)
			assert.NotNil(t, meta)
			assert.Empty(t, meta.RequestedResources)
			assert.Empty(t, meta.AllocatedResources)
		})
	}
}

func TestIsActivePod(t *testing.T) {
	tests := []struct {
		name string