Skip to content

Commit 3e3a712

Browse files
committed
fix: recover from stale-cache conflict on ConstraintTemplatePodStatus update
The ConstraintTemplate reconciler reads the per-pod ConstraintTemplatePodStatus through the controller-runtime informer cache and then writes it back with the delegating client. When the same ConstraintTemplate is reconciled twice in rapid succession (e.g. a CT update followed by the owned-CRD update event re-enqueuing the CT), the cache can briefly serve a stale resourceVersion and the next Update fails with a 409 Conflict, producing a noisy "update ct pod status error" log on every occurrence. Wrap the three PodStatus Update call sites in a retry helper that, on Conflict, refetches the PodStatus via mgr.GetAPIReader() (uncached) to avoid looping against the same stale cache view, re-applies the desired Status, and retries. Mirrors the retry-on-conflict pattern already used in generateCRD for the ConstraintTemplate annotation update. Fixes #4595
1 parent e250d0d commit 3e3a712

2 files changed

Lines changed: 229 additions & 3 deletions

File tree

pkg/controller/constrainttemplate/constrainttemplate_controller.go

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,7 @@ func newReconciler(mgr manager.Manager, cfClient *constraintclient.Client, wm *w
193193
r := newStatsReporter()
194194
reconciler := &ReconcileConstraintTemplate{
195195
Client: mgr.GetClient(),
196+
apiReader: mgr.GetAPIReader(),
196197
scheme: mgr.GetScheme(),
197198
cfClient: cfClient,
198199
watcher: w,
@@ -284,6 +285,10 @@ var _ reconcile.Reconciler = &ReconcileConstraintTemplate{}
284285
// ReconcileConstraintTemplate reconciles a ConstraintTemplate object.
285286
type ReconcileConstraintTemplate struct {
286287
client.Client
288+
// apiReader is an uncached reader used to refetch the latest version of a ConstraintTemplatePodStatus when an
289+
// optimistic-concurrency conflict is hit on update. The default reconciler client reads through the informer cache,
290+
// which can briefly return a stale resourceVersion right after a successful write and cause spurious 409 conflicts.
291+
apiReader client.Reader
287292
scheme *runtime.Scheme
288293
watcher *watch.Registrar
289294
statusWatcher *watch.Registrar
@@ -394,7 +399,7 @@ func (r *ReconcileConstraintTemplate) Reconcile(ctx context.Context, request rec
394399
createErr := &v1beta1.CreateCRDError{Code: ErrCreateCode, Message: err.Error()}
395400
status.Status.Errors = append(status.Status.Errors, createErr)
396401

397-
if updateErr := r.Update(ctx, status); updateErr != nil {
402+
if updateErr := r.updatePodStatusWithRetry(ctx, status); updateErr != nil {
398403
logger.Error(updateErr, "update status error")
399404
return reconcile.Result{Requeue: true}, nil
400405
}
@@ -454,7 +459,7 @@ func (r *ReconcileConstraintTemplate) reportErrorOnCTStatus(ctx context.Context,
454459
Message: fmt.Sprintf("%s: %s", message, err),
455460
}
456461
status.Status.Errors = append(status.Status.Errors, createErr)
457-
if err2 := r.Update(ctx, status); err2 != nil {
462+
if err2 := r.updatePodStatusWithRetry(ctx, status); err2 != nil {
458463
return errorpkg.Wrap(err, fmt.Sprintf("Could not update status: %s", err2))
459464
}
460465
return err
@@ -522,7 +527,7 @@ func (r *ReconcileConstraintTemplate) handleUpdate(
522527
if err != nil {
523528
return reconcile.Result{}, err
524529
}
525-
if err := r.Update(ctx, status); err != nil {
530+
if err := r.updatePodStatusWithRetry(ctx, status); err != nil {
526531
logger.Error(err, "update ct pod status error")
527532
return reconcile.Result{Requeue: true}, nil
528533
}
@@ -587,6 +592,38 @@ func (r *ReconcileConstraintTemplate) deleteAllStatus(ctx context.Context, ctNam
587592
return nil
588593
}
589594

595+
// updatePodStatusWithRetry updates a ConstraintTemplatePodStatus and transparently recovers from optimistic-concurrency
596+
// conflicts caused by a stale informer cache. The reconciler's default client reads through the cache, so when the same
597+
// ConstraintTemplate is reconciled twice in rapid succession (for example: CT update followed by the owned-CRD update
598+
// event re-enqueuing the CT) the second reconcile can observe a PodStatus with an older resourceVersion than the one
599+
// already on the API server, and the subsequent Update fails with a 409 Conflict.
600+
//
601+
// On Conflict we refetch the PodStatus via the uncached APIReader, re-apply our desired Status, and attempt the Update
602+
// once more inline before yielding to RetryOnConflict's backoff. This collapses the common case (stale cache, no other
603+
// writer racing us) into a single fast retry. If that inline attempt still conflicts, the error is returned to
604+
// RetryOnConflict which will back off and try again. Non-conflict errors are returned as-is. The supplied status
605+
// pointer is updated in place with the latest object so callers can continue to mutate it after a successful return.
606+
func (r *ReconcileConstraintTemplate) updatePodStatusWithRetry(ctx context.Context, status *statusv1beta1.ConstraintTemplatePodStatus) error {
607+
desiredStatus := status.Status
608+
key := client.ObjectKeyFromObject(status)
609+
return retry.RetryOnConflict(retry.DefaultBackoff, func() error {
610+
err := r.Update(ctx, status)
611+
if err == nil {
612+
return nil
613+
}
614+
if !apierrors.IsConflict(err) {
615+
return err
616+
}
617+
latest := &statusv1beta1.ConstraintTemplatePodStatus{}
618+
if getErr := r.apiReader.Get(ctx, key, latest); getErr != nil {
619+
return getErr
620+
}
621+
latest.Status = desiredStatus
622+
*status = *latest
623+
return r.Update(ctx, status)
624+
})
625+
}
626+
590627
func (r *ReconcileConstraintTemplate) getOrCreatePodStatus(ctx context.Context, ctName string) (*statusv1beta1.ConstraintTemplatePodStatus, error) {
591628
statusObj := &statusv1beta1.ConstraintTemplatePodStatus{}
592629
sName, err := statusv1beta1.KeyForConstraintTemplate(util.GetPodName(), ctName)
Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
/*
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
http://www.apache.org/licenses/LICENSE-2.0
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License.
14+
*/
15+
16+
package constrainttemplate
17+
18+
import (
19+
"context"
20+
"testing"
21+
22+
"github.com/open-policy-agent/frameworks/constraint/pkg/apis/templates/v1beta1"
23+
statusv1beta1 "github.com/open-policy-agent/gatekeeper/v3/apis/status/v1beta1"
24+
"github.com/stretchr/testify/require"
25+
apierrors "k8s.io/apimachinery/pkg/api/errors"
26+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
27+
"k8s.io/apimachinery/pkg/runtime"
28+
"k8s.io/apimachinery/pkg/types"
29+
"sigs.k8s.io/controller-runtime/pkg/client"
30+
"sigs.k8s.io/controller-runtime/pkg/client/fake"
31+
)
32+
33+
// staleClient wraps a controller-runtime client and serves Get requests from a snapshot taken at construction time,
34+
// while forwarding Update/Create/Delete/List to the real (live) client. This simulates a controller-runtime informer
35+
// cache that has not yet observed the latest write to the backing store, which is the underlying cause of the spurious
36+
// "the object has been modified" conflicts we hit in the ConstraintTemplate reconciler.
37+
type staleClient struct {
38+
client.Client
39+
stale map[client.ObjectKey]*statusv1beta1.ConstraintTemplatePodStatus
40+
}
41+
42+
func (s *staleClient) Get(ctx context.Context, key client.ObjectKey, obj client.Object, opts ...client.GetOption) error {
43+
if cts, ok := obj.(*statusv1beta1.ConstraintTemplatePodStatus); ok {
44+
if cached, ok := s.stale[key]; ok {
45+
cached.DeepCopyInto(cts)
46+
return nil
47+
}
48+
}
49+
return s.Client.Get(ctx, key, obj, opts...)
50+
}
51+
52+
func newPodStatusForTest(name string) *statusv1beta1.ConstraintTemplatePodStatus {
53+
return &statusv1beta1.ConstraintTemplatePodStatus{
54+
TypeMeta: metav1.TypeMeta{
55+
APIVersion: statusv1beta1.GroupVersion.String(),
56+
Kind: "ConstraintTemplatePodStatus",
57+
},
58+
ObjectMeta: metav1.ObjectMeta{
59+
Name: name,
60+
Namespace: "gatekeeper-system",
61+
},
62+
}
63+
}
64+
65+
func newSchemeForTest(t *testing.T) *runtime.Scheme {
66+
t.Helper()
67+
scheme := runtime.NewScheme()
68+
require.NoError(t, statusv1beta1.AddToScheme(scheme))
69+
require.NoError(t, v1beta1.AddToScheme(scheme))
70+
return scheme
71+
}
72+
73+
// TestUpdatePodStatusWithRetry_StaleCacheRecovers reproduces the race that produced "update ct pod status error" in
74+
// production: the reconciler's cached client returns a stale resourceVersion for the PodStatus, so the first Update
75+
// returns a 409 Conflict; the helper must refetch via the uncached APIReader and successfully retry.
76+
func TestUpdatePodStatusWithRetry_StaleCacheRecovers(t *testing.T) {
77+
ctx := context.Background()
78+
scheme := newSchemeForTest(t)
79+
80+
initial := newPodStatusForTest("pod-template-status")
81+
initial.Status.ID = "initial"
82+
83+
// Live store: the API-server-equivalent that holds the latest resourceVersion.
84+
live := fake.NewClientBuilder().
85+
WithScheme(scheme).
86+
WithObjects(initial.DeepCopy()).
87+
Build()
88+
89+
// Capture the resourceVersion the fake client assigned to the initial object on Build(). We can't read it off
90+
// `initial` because WithObjects deep-copies its input before stamping a resourceVersion on the stored copy.
91+
storedBefore := &statusv1beta1.ConstraintTemplatePodStatus{}
92+
require.NoError(t, live.Get(ctx, client.ObjectKeyFromObject(initial), storedBefore))
93+
staleResourceVersion := storedBefore.ResourceVersion
94+
require.NotEmpty(t, staleResourceVersion, "test sanity: fake client must stamp a resourceVersion on stored objects")
95+
96+
// Simulate an out-of-band update advancing the resourceVersion on the API server while the controller's informer
97+
// cache still serves the old version.
98+
latestOnAPIServer := storedBefore.DeepCopy()
99+
latestOnAPIServer.Status.ID = "out-of-band"
100+
require.NoError(t, live.Update(ctx, latestOnAPIServer))
101+
require.NotEqual(t, staleResourceVersion, latestOnAPIServer.ResourceVersion, "test sanity: Update must bump resourceVersion")
102+
103+
// The stale informer view: same spec the cache observed at storedBefore, pinned to the older resourceVersion.
104+
cachedView := storedBefore.DeepCopy()
105+
cachedView.ResourceVersion = staleResourceVersion
106+
107+
cached := &staleClient{
108+
Client: live,
109+
stale: map[client.ObjectKey]*statusv1beta1.ConstraintTemplatePodStatus{
110+
client.ObjectKeyFromObject(initial): cachedView,
111+
},
112+
}
113+
114+
r := &ReconcileConstraintTemplate{
115+
Client: cached,
116+
apiReader: live,
117+
scheme: scheme,
118+
}
119+
120+
// The reconciler reads the PodStatus through the cached client and gets the stale version, then mutates and tries
121+
// to write. The first Update must fail with Conflict (server side rejects the old resourceVersion); the helper must
122+
// recover via the uncached apiReader.
123+
status := &statusv1beta1.ConstraintTemplatePodStatus{}
124+
require.NoError(t, cached.Get(ctx, client.ObjectKeyFromObject(initial), status))
125+
require.Equal(t, staleResourceVersion, status.ResourceVersion, "test sanity: cached view is pinned to the older resourceVersion")
126+
require.Equal(t, "initial", status.Status.ID, "test sanity: cached view still reflects the pre-out-of-band spec")
127+
status.Status.ID = "desired-by-reconciler"
128+
129+
require.NoError(t, r.updatePodStatusWithRetry(ctx, status))
130+
131+
final := &statusv1beta1.ConstraintTemplatePodStatus{}
132+
require.NoError(t, live.Get(ctx, client.ObjectKeyFromObject(initial), final))
133+
require.Equal(t, "desired-by-reconciler", final.Status.ID, "desired status must be applied on top of the latest resourceVersion")
134+
}
135+
136+
// TestUpdatePodStatusWithRetry_NonConflictErrorReturned ensures that errors other than 409 Conflict are returned to the
137+
// caller unchanged and not silently swallowed by the retry loop.
138+
func TestUpdatePodStatusWithRetry_NonConflictErrorReturned(t *testing.T) {
139+
ctx := context.Background()
140+
scheme := newSchemeForTest(t)
141+
142+
live := fake.NewClientBuilder().WithScheme(scheme).Build()
143+
144+
r := &ReconcileConstraintTemplate{
145+
Client: live,
146+
apiReader: live,
147+
scheme: scheme,
148+
}
149+
150+
// PodStatus does not exist in the live store, so Update must fail with NotFound. The helper must surface that error
151+
// without retrying.
152+
missing := newPodStatusForTest("missing-status")
153+
missing.Status.ID = "anything"
154+
155+
err := r.updatePodStatusWithRetry(ctx, missing)
156+
require.Error(t, err)
157+
require.True(t, apierrors.IsNotFound(err), "expected NotFound, got %v", err)
158+
}
159+
160+
// TestUpdatePodStatusWithRetry_HappyPath ensures that a normal update with the latest resourceVersion succeeds without
161+
// any retry overhead.
162+
func TestUpdatePodStatusWithRetry_HappyPath(t *testing.T) {
163+
ctx := context.Background()
164+
scheme := newSchemeForTest(t)
165+
166+
initial := newPodStatusForTest("happy-status")
167+
initial.Status.ID = "before"
168+
169+
live := fake.NewClientBuilder().
170+
WithScheme(scheme).
171+
WithObjects(initial.DeepCopy()).
172+
Build()
173+
174+
r := &ReconcileConstraintTemplate{
175+
Client: live,
176+
apiReader: live,
177+
scheme: scheme,
178+
}
179+
180+
current := &statusv1beta1.ConstraintTemplatePodStatus{}
181+
require.NoError(t, live.Get(ctx, client.ObjectKeyFromObject(initial), current))
182+
current.Status.ID = "after"
183+
184+
require.NoError(t, r.updatePodStatusWithRetry(ctx, current))
185+
186+
final := &statusv1beta1.ConstraintTemplatePodStatus{}
187+
require.NoError(t, live.Get(ctx, types.NamespacedName{Name: initial.Name, Namespace: initial.Namespace}, final))
188+
require.Equal(t, "after", final.Status.ID)
189+
}

0 commit comments

Comments
 (0)