Skip to content

Commit 36db108

Browse files
pedjakclaude
andcommitted
Set Progressing to Succeeded on ClusterExtensionRevision only after availability probes pass
Previously, `Progressing` reason was set to `Succeeded` as soon as the rollout transition completed, not taking into account availability probes of the last phase. This meant setting `.spec.progressDeadlineMinutes` could not catch workloads that were applied to the cluster but never became available, due to for example bad image reference. Now `Progressing` stays at `RollingOut` during transition and is only set to `Succeeded` once probes confirm availability, allowing the progress deadline to correctly expire when a workload fails to become available in time. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 4121321 commit 36db108

File tree

3 files changed

+54
-16
lines changed

3 files changed

+54
-16
lines changed

internal/operator-controller/controllers/clusterextensionrevision_controller.go

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ import (
88
"errors"
99
"fmt"
1010
"strings"
11-
"sync"
1211
"time"
1312

1413
appsv1 "k8s.io/api/apps/v1"
@@ -20,6 +19,7 @@ import (
2019
"k8s.io/apimachinery/pkg/runtime/schema"
2120
"k8s.io/apimachinery/pkg/types"
2221
"k8s.io/apimachinery/pkg/util/sets"
22+
"k8s.io/utils/clock"
2323
"pkg.package-operator.run/boxcutter"
2424
"pkg.package-operator.run/boxcutter/machinery"
2525
machinerytypes "pkg.package-operator.run/boxcutter/machinery/types"
@@ -48,9 +48,7 @@ type ClusterExtensionRevisionReconciler struct {
4848
Client client.Client
4949
RevisionEngineFactory RevisionEngineFactory
5050
TrackingCache trackingCache
51-
// track if we have queued up the reconciliation that detects eventual progress deadline issues
52-
// keys is revision UUID, value is boolean
53-
progressDeadlineCheckInFlight sync.Map
51+
Clock clock.Clock
5452
}
5553

5654
type trackingCache interface {
@@ -86,15 +84,15 @@ func (c *ClusterExtensionRevisionReconciler) Reconcile(ctx context.Context, req
8684
// check if we reached the progress deadline only if the revision is still progressing and has not succeeded yet
8785
if isStillProgressing && !succeeded {
8886
timeout := time.Duration(pd) * time.Minute
89-
if time.Since(existingRev.CreationTimestamp.Time) > timeout {
87+
if c.Clock.Since(existingRev.CreationTimestamp.Time) > timeout {
9088
// progress deadline reached, reset any errors and stop reconciling this revision
9189
markAsNotProgressing(reconciledRev, ocv1.ReasonProgressDeadlineExceeded, fmt.Sprintf("Revision has not rolled out for %d minutes.", pd))
9290
reconcileErr = nil
9391
res = ctrl.Result{}
94-
} else if _, found := c.progressDeadlineCheckInFlight.Load(existingRev.GetUID()); !found && reconcileErr == nil {
95-
// if we haven't already queued up a reconcile to check for progress deadline, queue one up, but only once
96-
c.progressDeadlineCheckInFlight.Store(existingRev.GetUID(), true)
97-
res = ctrl.Result{RequeueAfter: timeout}
92+
} else if reconcileErr == nil {
93+
requeueAfter := existingRev.CreationTimestamp.Time.Add(timeout).Add(2 * time.Second).Sub(c.Clock.Now()).Round(time.Second)
94+
l.Info(fmt.Sprintf("ProgressDeadline not exceeded, requeue after ~%v to check again.", requeueAfter))
95+
res = ctrl.Result{RequeueAfter: requeueAfter}
9896
}
9997
}
10098
}
@@ -208,9 +206,7 @@ func (c *ClusterExtensionRevisionReconciler) reconcile(ctx context.Context, cer
208206
}
209207

210208
revVersion := cer.GetAnnotations()[labels.BundleVersionKey]
211-
if !rres.InTransition() {
212-
markAsProgressing(cer, ocv1.ReasonSucceeded, fmt.Sprintf("Revision %s has rolled out.", revVersion))
213-
} else {
209+
if rres.InTransition() {
214210
markAsProgressing(cer, ocv1.ReasonRollingOut, fmt.Sprintf("Revision %s is rolling out.", revVersion))
215211
}
216212

@@ -231,6 +227,7 @@ func (c *ClusterExtensionRevisionReconciler) reconcile(ctx context.Context, cer
231227
}
232228
}
233229

230+
markAsProgressing(cer, ocv1.ReasonSucceeded, fmt.Sprintf("Revision %s has rolled out.", revVersion))
234231
markAsAvailable(cer, ocv1.ClusterExtensionRevisionReasonProbesSucceeded, "Objects are available and pass all probes.")
235232

236233
// We'll probably only want to remove this once we are done updating the ClusterExtension conditions
@@ -333,6 +330,7 @@ func (c *ClusterExtensionRevisionReconciler) SetupWithManager(mgr ctrl.Manager)
333330
return true
334331
},
335332
}
333+
c.Clock = clock.RealClock{}
336334
return ctrl.NewControllerManagedBy(mgr).
337335
For(
338336
&ocv1.ClusterExtensionRevision{},

internal/operator-controller/controllers/clusterextensionrevision_controller_test.go

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ import (
1616
"k8s.io/apimachinery/pkg/runtime/schema"
1717
"k8s.io/apimachinery/pkg/types"
1818
"k8s.io/apimachinery/pkg/util/sets"
19+
"k8s.io/utils/clock"
20+
testing2 "k8s.io/utils/clock/testing"
1921
"pkg.package-operator.run/boxcutter"
2022
"pkg.package-operator.run/boxcutter/machinery"
2123
machinerytypes "pkg.package-operator.run/boxcutter/machinery/types"
@@ -372,6 +374,7 @@ func Test_ClusterExtensionRevisionReconciler_Reconcile_RevisionReconciliation(t
372374
name: "set Progressing:True:Succeeded once transition rollout is finished",
373375
revisionResult: mockRevisionResult{
374376
inTransition: false,
377+
isComplete: true,
375378
},
376379
reconcilingRevisionName: clusterExtensionRevisionName,
377380
existingObjs: func() []client.Object {
@@ -380,7 +383,7 @@ func Test_ClusterExtensionRevisionReconciler_Reconcile_RevisionReconciliation(t
380383
meta.SetStatusCondition(&rev1.Status.Conditions, metav1.Condition{
381384
Type: ocv1.TypeProgressing,
382385
Status: metav1.ConditionTrue,
383-
Reason: ocv1.ReasonSucceeded,
386+
Reason: ocv1.ReasonRollingOut,
384387
Message: "Revision 1.0.0 is rolling out.",
385388
ObservedGeneration: 1,
386389
})
@@ -945,16 +948,19 @@ func Test_ClusterExtensionRevisionReconciler_Reconcile_ProgressDeadline(t *testi
945948
validate func(*testing.T, client.Client)
946949
reconcileErr error
947950
reconcileResult ctrl.Result
951+
clock clock.Clock
948952
}{
949953
{
950954
name: "progressing set to false when progress deadline is exceeded",
951955
existingObjs: func() []client.Object {
952956
ext := newTestClusterExtension()
953957
rev1 := newTestClusterExtensionRevision(t, clusterExtensionRevisionName, ext, testScheme)
954958
rev1.Spec.ProgressDeadlineMinutes = 1
955-
rev1.CreationTimestamp = metav1.NewTime(time.Now().Add(-61 * time.Second))
959+
rev1.CreationTimestamp = metav1.NewTime(time.Date(2022, 1, 1, 0, 0, 0, 0, time.UTC))
956960
return []client.Object{rev1, ext}
957961
},
962+
// 61sec elapsed since the creation of the revision
963+
clock: testing2.NewFakeClock(time.Date(2022, 1, 1, 0, 1, 1, 0, time.UTC)),
958964
revisionResult: &mockRevisionResult{
959965
inTransition: true,
960966
},
@@ -975,13 +981,14 @@ func Test_ClusterExtensionRevisionReconciler_Reconcile_ProgressDeadline(t *testi
975981
ext := newTestClusterExtension()
976982
rev1 := newTestClusterExtensionRevision(t, clusterExtensionRevisionName, ext, testScheme)
977983
rev1.Spec.ProgressDeadlineMinutes = 1
978-
rev1.CreationTimestamp = metav1.NewTime(time.Now())
984+
rev1.CreationTimestamp = metav1.NewTime(time.Date(2021, 1, 1, 0, 0, 0, 0, time.UTC))
979985
return []client.Object{rev1, ext}
980986
},
987+
clock: testing2.NewFakeClock(time.Date(2021, 1, 1, 0, 0, 0, 0, time.UTC)),
981988
revisionResult: &mockRevisionResult{
982989
inTransition: true,
983990
},
984-
reconcileResult: ctrl.Result{RequeueAfter: 1 * time.Minute},
991+
reconcileResult: ctrl.Result{RequeueAfter: 62 * time.Second},
985992
validate: func(t *testing.T, c client.Client) {
986993
rev := &ocv1.ClusterExtensionRevision{}
987994
err := c.Get(t.Context(), client.ObjectKey{
@@ -1049,6 +1056,7 @@ func Test_ClusterExtensionRevisionReconciler_Reconcile_ProgressDeadline(t *testi
10491056
TrackingCache: &mockTrackingCache{
10501057
client: testClient,
10511058
},
1059+
Clock: tc.clock,
10521060
}).Reconcile(t.Context(), ctrl.Request{
10531061
NamespacedName: types.NamespacedName{
10541062
Name: clusterExtensionRevisionName,

test/e2e/features/install.feature

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -383,6 +383,38 @@ Feature: Install ClusterExtension
383383
invalid ClusterExtension configuration: invalid configuration: unknown field "watchNamespace"
384384
"""
385385

386+
@BoxcutterRuntime
387+
@ProgressDeadline
388+
Scenario: Report ClusterExtension as not progressing if the rollout does not become available within given timeout
389+
Given min value for ClusterExtension .spec.progressDeadlineMinutes is set to 1
390+
And min value for ClusterExtensionRevision .spec.progressDeadlineMinutes is set to 1
391+
When ClusterExtension is applied
392+
"""
393+
apiVersion: olm.operatorframework.io/v1
394+
kind: ClusterExtension
395+
metadata:
396+
name: ${NAME}
397+
spec:
398+
namespace: ${TEST_NAMESPACE}
399+
progressDeadlineMinutes: 1
400+
serviceAccount:
401+
name: olm-sa
402+
source:
403+
sourceType: Catalog
404+
catalog:
405+
packageName: test
406+
version: 1.0.2
407+
selector:
408+
matchLabels:
409+
"olm.operatorframework.io/metadata.name": test-catalog
410+
"""
411+
Then ClusterExtensionRevision "${NAME}-1" reports Progressing as False with Reason ProgressDeadlineExceeded
412+
And ClusterExtension reports Progressing as False with Reason ProgressDeadlineExceeded and Message:
413+
"""
414+
Revision has not rolled out for 1 minutes.
415+
"""
416+
And ClusterExtension reports Progressing transition between 1 and 2 minutes since its creation
417+
386418
@BoxcutterRuntime
387419
@ProgressDeadline
388420
Scenario: Report ClusterExtension as not progressing if the rollout does not complete within given timeout

0 commit comments

Comments
 (0)