Skip to content

Commit 93b4145

Browse files
committed
✨ Add ReconcileTimeouts metric to track ReconciliationTimeout timeouts.
This PR adds a new metric controller_runtime_reconcile_timeouts_total to track when a controller's reconciliation has reached a ReconciliationTimeout. This provides visibility into when reconcile operations time out due to the controller-runtime wrapper timeout, allowing users to alert / monitor unexpectedly long running controller reconiliations.
1 parent 6724282 commit 93b4145

File tree

3 files changed

+152
-2
lines changed

3 files changed

+152
-2
lines changed

pkg/internal/controller/controller.go

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,10 @@ import (
3939
"sigs.k8s.io/controller-runtime/pkg/source"
4040
)
4141

42+
// errReconciliationTimeout is the error used as the cause when the ReconciliationTimeout guardrail fires.
43+
// This allows us to distinguish wrapper timeouts from user-initiated context cancellations.
44+
var errReconciliationTimeout = errors.New("reconciliation timeout")
45+
4246
// Options are the arguments for creating a new Controller.
4347
type Options[request comparable] struct {
4448
// Reconciler is a function that can be called at any time with the Name / Namespace of an object and
@@ -207,13 +211,26 @@ func (c *Controller[request]) Reconcile(ctx context.Context, req request) (_ rec
207211
}
208212
}()
209213

214+
var timeoutCause error
210215
if c.ReconciliationTimeout > 0 {
216+
timeoutCause = errReconciliationTimeout
211217
var cancel context.CancelFunc
212-
ctx, cancel = context.WithTimeout(ctx, c.ReconciliationTimeout)
218+
ctx, cancel = context.WithTimeoutCause(ctx, c.ReconciliationTimeout, timeoutCause)
213219
defer cancel()
214220
}
215221

216-
return c.Do.Reconcile(ctx, req)
222+
res, err := c.Do.Reconcile(ctx, req)
223+
224+
// Check if the reconciliation timed out due to our wrapper timeout guardrail.
225+
// We check ctx.Err() == context.DeadlineExceeded first to ensure the context was actually
226+
// cancelled due to a deadline (not parent cancellation or other reasons), then verify it was
227+
// our specific timeout cause. This prevents false positives from parent context cancellations
228+
// or other timeout scenarios.
229+
if timeoutCause != nil && ctx.Err() == context.DeadlineExceeded && errors.Is(context.Cause(ctx), timeoutCause) {
230+
ctrlmetrics.ReconcileTimeouts.WithLabelValues(c.Name).Inc()
231+
}
232+
233+
return res, err
217234
}
218235

219236
// Watch implements controller.Controller.
@@ -437,6 +454,7 @@ func (c *Controller[request]) initMetrics() {
437454
ctrlmetrics.ReconcileErrors.WithLabelValues(c.Name).Add(0)
438455
ctrlmetrics.TerminalReconcileErrors.WithLabelValues(c.Name).Add(0)
439456
ctrlmetrics.ReconcilePanics.WithLabelValues(c.Name).Add(0)
457+
ctrlmetrics.ReconcileTimeouts.WithLabelValues(c.Name).Add(0)
440458
ctrlmetrics.WorkerCount.WithLabelValues(c.Name).Set(float64(c.MaxConcurrentReconciles))
441459
ctrlmetrics.ActiveWorkers.WithLabelValues(c.Name).Set(0)
442460
}

pkg/internal/controller/controller_test.go

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,128 @@ var _ = Describe("controller", func() {
155155
Expect(err).To(Equal(context.DeadlineExceeded))
156156
})
157157

158+
Context("prometheus metric reconcile_timeouts", func() {
159+
var reconcileTimeouts dto.Metric
160+
161+
BeforeEach(func() {
162+
ctrlmetrics.ReconcileTimeouts.Reset()
163+
reconcileTimeouts.Reset()
164+
ctrl.Name = testControllerName
165+
Expect(ctrlmetrics.ReconcileTimeouts.WithLabelValues(ctrl.Name).Write(&reconcileTimeouts)).To(Succeed())
166+
Expect(reconcileTimeouts.GetCounter().GetValue()).To(Equal(0.0))
167+
})
168+
169+
It("should increment when ReconciliationTimeout context timeout hits DeadlineExceeded", func(ctx SpecContext) {
170+
ctrl.ReconciliationTimeout = time.Duration(1)
171+
ctrl.Do = reconcile.Func(func(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) {
172+
<-ctx.Done()
173+
return reconcile.Result{}, ctx.Err()
174+
})
175+
_, err := ctrl.Reconcile(ctx,
176+
reconcile.Request{NamespacedName: types.NamespacedName{Namespace: "foo", Name: "bar"}})
177+
Expect(err).To(HaveOccurred())
178+
Expect(err).To(Equal(context.DeadlineExceeded))
179+
180+
Expect(ctrlmetrics.ReconcileTimeouts.WithLabelValues(ctrl.Name).Write(&reconcileTimeouts)).To(Succeed())
181+
Expect(reconcileTimeouts.GetCounter().GetValue()).To(Equal(1.0))
182+
})
183+
184+
It("should not increment when user code cancels context earlier than the ReconciliationTimeout", func(ctx SpecContext) {
185+
ctrl.ReconciliationTimeout = 10 * time.Second
186+
userCancelCause := errors.New("user cancellation")
187+
ctrl.Do = reconcile.Func(func(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) {
188+
// User code creates its own timeout with a different cause
189+
userCtx, cancel := context.WithTimeoutCause(ctx, time.Millisecond, userCancelCause)
190+
defer cancel()
191+
<-userCtx.Done()
192+
return reconcile.Result{}, context.Cause(userCtx)
193+
})
194+
_, err := ctrl.Reconcile(ctx,
195+
reconcile.Request{NamespacedName: types.NamespacedName{Namespace: "foo", Name: "bar"}})
196+
Expect(err).To(HaveOccurred())
197+
Expect(errors.Is(err, userCancelCause)).To(BeTrue())
198+
199+
// Metric should not be incremented because the context timeout didn't hit DeadlineExceeded
200+
Expect(ctrlmetrics.ReconcileTimeouts.WithLabelValues(ctrl.Name).Write(&reconcileTimeouts)).To(Succeed())
201+
Expect(reconcileTimeouts.GetCounter().GetValue()).To(Equal(0.0))
202+
})
203+
204+
It("should not increment when reconciliation completes before timeout", func(ctx SpecContext) {
205+
ctrl.ReconciliationTimeout = 10 * time.Second
206+
ctrl.Do = reconcile.Func(func(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) {
207+
// Reconcile completes successfully before timeout
208+
return reconcile.Result{}, nil
209+
})
210+
_, err := ctrl.Reconcile(ctx,
211+
reconcile.Request{NamespacedName: types.NamespacedName{Namespace: "foo", Name: "bar"}})
212+
Expect(err).NotTo(HaveOccurred())
213+
214+
// Metric should not be incremented because the timeout was not exceeded
215+
Expect(ctrlmetrics.ReconcileTimeouts.WithLabelValues(ctrl.Name).Write(&reconcileTimeouts)).To(Succeed())
216+
Expect(reconcileTimeouts.GetCounter().GetValue()).To(Equal(0.0))
217+
})
218+
219+
It("should increment multiple times when multiple reconciles timeout", func(ctx SpecContext) {
220+
ctrl.ReconciliationTimeout = time.Duration(1)
221+
ctrl.Do = reconcile.Func(func(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) {
222+
<-ctx.Done()
223+
return reconcile.Result{}, ctx.Err()
224+
})
225+
226+
const numTimeouts = 3
227+
// Call Reconcile multiple times, each should timeout and increment the metric
228+
for i := range numTimeouts {
229+
_, err := ctrl.Reconcile(ctx,
230+
reconcile.Request{NamespacedName: types.NamespacedName{Namespace: "foo", Name: fmt.Sprintf("bar%d", i)}})
231+
Expect(err).To(HaveOccurred())
232+
Expect(err).To(Equal(context.DeadlineExceeded))
233+
}
234+
235+
// Metric should be incremented 3 times
236+
Expect(ctrlmetrics.ReconcileTimeouts.WithLabelValues(ctrl.Name).Write(&reconcileTimeouts)).To(Succeed())
237+
Expect(reconcileTimeouts.GetCounter().GetValue()).To(Equal(float64(numTimeouts)))
238+
})
239+
240+
It("should not increment when parent context is cancelled", func(ctx SpecContext) {
241+
parentCtx, cancel := context.WithCancel(ctx)
242+
defer cancel()
243+
244+
ctrl.ReconciliationTimeout = 10 * time.Second
245+
ctrl.Do = reconcile.Func(func(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) {
246+
// Wait for parent cancellation
247+
<-ctx.Done()
248+
return reconcile.Result{}, ctx.Err()
249+
})
250+
251+
// Cancel parent context immediately
252+
cancel()
253+
254+
_, err := ctrl.Reconcile(parentCtx,
255+
reconcile.Request{NamespacedName: types.NamespacedName{Namespace: "foo", Name: "bar"}})
256+
Expect(err).To(HaveOccurred())
257+
Expect(err).To(Equal(context.Canceled))
258+
259+
// Metric should not be incremented because the wrapper timeout didn't fire
260+
Expect(ctrlmetrics.ReconcileTimeouts.WithLabelValues(ctrl.Name).Write(&reconcileTimeouts)).To(Succeed())
261+
Expect(reconcileTimeouts.GetCounter().GetValue()).To(Equal(0.0))
262+
})
263+
264+
It("should not increment when ReconciliationTimeout is zero", func(ctx SpecContext) {
265+
// Ensure ReconciliationTimeout is zero (not set)
266+
ctrl.ReconciliationTimeout = 0
267+
ctrl.Do = reconcile.Func(func(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) {
268+
return reconcile.Result{}, nil
269+
})
270+
_, err := ctrl.Reconcile(ctx,
271+
reconcile.Request{NamespacedName: types.NamespacedName{Namespace: "foo", Name: "bar"}})
272+
Expect(err).NotTo(HaveOccurred())
273+
274+
// Metric should not be incremented because ReconciliationTimeout is not set
275+
Expect(ctrlmetrics.ReconcileTimeouts.WithLabelValues(ctrl.Name).Write(&reconcileTimeouts)).To(Succeed())
276+
Expect(reconcileTimeouts.GetCounter().GetValue()).To(Equal(0.0))
277+
})
278+
})
279+
158280
It("should not configure a timeout if ReconciliationTimeout is zero", func(ctx SpecContext) {
159281
ctrl.Do = reconcile.Func(func(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) {
160282
defer GinkgoRecover()

pkg/internal/controller/metrics/metrics.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,15 @@ var (
8080
Name: "controller_runtime_active_workers",
8181
Help: "Number of currently used workers per controller",
8282
}, []string{"controller"})
83+
84+
// ReconcileTimeouts is a prometheus counter metric which holds the total
85+
// number of reconciliations that timed out due to the ReconciliationTimeout
86+
// context timeout. This metric only increments when the wrapper timeout fires,
87+
// not when user reconcilers cancels the context or completes before the timeout.
88+
ReconcileTimeouts = prometheus.NewCounterVec(prometheus.CounterOpts{
89+
Name: "controller_runtime_reconcile_timeouts_total",
90+
Help: "Total number of reconciliation timeouts per controller",
91+
}, []string{"controller"})
8392
)
8493

8594
func init() {
@@ -91,6 +100,7 @@ func init() {
91100
ReconcileTime,
92101
WorkerCount,
93102
ActiveWorkers,
103+
ReconcileTimeouts,
94104
// expose process metrics like CPU, Memory, file descriptor usage etc.
95105
collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}),
96106
// expose all Go runtime metrics like GC stats, memory stats etc.

0 commit comments

Comments
 (0)