Skip to content

Commit 7ce21b3

Browse files
authored
Annotations to mark exit codes as being terminal (non-retryable) (#197)
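Enable AppWrappers to be optionally annotated with the container exit codes that are terminal (retry is disabled if any of them occurs). A companion annotation lists the retryable exit codes instead, in which case any other non-zero exit code is treated as terminal.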
1 parent 990335b commit 7ce21b3

File tree

2 files changed

+82
-18
lines changed

2 files changed

+82
-18
lines changed

api/v1beta2/appwrapper_types.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,8 @@ const (
172172
ForcefulDeletionGracePeriodAnnotation = "workload.codeflare.dev.appwrapper/forcefulDeletionGracePeriodDuration"
173173
DeletionOnFailureGracePeriodAnnotation = "workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration"
174174
SuccessTTLAnnotation = "workload.codeflare.dev.appwrapper/successTTLDuration"
175+
TerminalExitCodesAnnotation = "workload.codeflare.dev.appwrapper/terminalExitCodes"
176+
RetryableExitCodesAnnotation = "workload.codeflare.dev.appwrapper/retryableExitCodes"
175177
)
176178

177179
//+kubebuilder:object:root=true

internal/controller/appwrapper/appwrapper_controller.go

Lines changed: 80 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"context"
2121
"fmt"
2222
"strconv"
23+
"strings"
2324
"time"
2425

2526
batchv1 "k8s.io/api/batch/v1"
@@ -57,11 +58,12 @@ type AppWrapperReconciler struct {
5758
}
5859

5960
// podStatusSummary is the Pod-level status summary returned by getPodStatus and
// consulted by Reconcile alongside the Component-level summary.
// NOTE(review): the int32 fields look like per-phase Pod counts; only the
// increments of succeeded and failed are visible in this diff — confirm the
// meaning of expected, pending and running against getPodStatus.
type podStatusSummary struct {
60-
expected int32
61-
pending int32
62-
running int32
63-
succeeded int32
64-
failed int32
61+
expected int32
62+
pending int32
63+
running int32
64+
succeeded int32
65+
failed int32
66+
// terminalFailure is set by getPodStatus when a failed Pod has a container
// that terminated with a non-zero exit code that is either listed in the
// terminal exit codes annotation or missing from a non-empty retryable exit
// codes annotation. resetOrFail skips the retry path when it is true.
terminalFailure bool
6567
}
6668

6769
type componentStatusSummary struct {
@@ -215,7 +217,7 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
215217
if fatal {
216218
return r.updateStatus(ctx, aw, workloadv1beta2.AppWrapperFailed) // always move to failed on fatal error
217219
} else {
218-
return r.resetOrFail(ctx, aw)
220+
return r.resetOrFail(ctx, aw, false)
219221
}
220222
}
221223
return r.updateStatus(ctx, aw, workloadv1beta2.AppWrapperRunning)
@@ -225,11 +227,15 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
225227
return r.updateStatus(ctx, aw, workloadv1beta2.AppWrapperSuspending) // begin undeployment
226228
}
227229

228-
// First, check the Component-level status of the workload
230+
// Gather status information at the Component and Pod level.
229231
compStatus, err := r.getComponentStatus(ctx, aw)
230232
if err != nil {
231233
return ctrl.Result{}, err
232234
}
235+
podStatus, err := r.getPodStatus(ctx, aw)
236+
if err != nil {
237+
return ctrl.Result{}, err
238+
}
233239

234240
// Detect externally deleted components and transition to Failed with no GracePeriod or retry
235241
if compStatus.deployed != compStatus.expected {
@@ -251,13 +257,7 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
251257
Reason: "FailedComponent",
252258
Message: fmt.Sprintf("Found %v failed components", compStatus.failed),
253259
})
254-
return r.resetOrFail(ctx, aw)
255-
}
256-
257-
// Second, check the Pod-level status of the workload
258-
podStatus, err := r.getPodStatus(ctx, aw)
259-
if err != nil {
260-
return ctrl.Result{}, err
260+
return r.resetOrFail(ctx, aw, podStatus.terminalFailure)
261261
}
262262

263263
// Handle Success
@@ -295,7 +295,7 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
295295
if now.Before(deadline) {
296296
return ctrl.Result{RequeueAfter: deadline.Sub(now)}, r.Status().Update(ctx, aw)
297297
} else {
298-
return r.resetOrFail(ctx, aw)
298+
return r.resetOrFail(ctx, aw, podStatus.terminalFailure)
299299
}
300300
}
301301

@@ -330,7 +330,7 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
330330
Reason: "InsufficientPodsReady",
331331
Message: podDetailsMessage,
332332
})
333-
return r.resetOrFail(ctx, aw)
333+
return r.resetOrFail(ctx, aw, podStatus.terminalFailure)
334334
}
335335

336336
case workloadv1beta2.AppWrapperSuspending: // undeploying components
@@ -473,9 +473,9 @@ func (r *AppWrapperReconciler) updateStatus(ctx context.Context, aw *workloadv1b
473473
return ctrl.Result{}, nil
474474
}
475475

476-
func (r *AppWrapperReconciler) resetOrFail(ctx context.Context, aw *workloadv1beta2.AppWrapper) (ctrl.Result, error) {
476+
func (r *AppWrapperReconciler) resetOrFail(ctx context.Context, aw *workloadv1beta2.AppWrapper, terminalFailure bool) (ctrl.Result, error) {
477477
maxRetries := r.retryLimit(ctx, aw)
478-
if aw.Status.Retries < maxRetries {
478+
if !terminalFailure && aw.Status.Retries < maxRetries {
479479
aw.Status.Retries += 1
480480
return r.updateStatus(ctx, aw, workloadv1beta2.AppWrapperResetting)
481481
} else {
@@ -508,6 +508,40 @@ func (r *AppWrapperReconciler) getPodStatus(ctx context.Context, aw *workloadv1b
508508
summary.succeeded += 1
509509
case v1.PodFailed:
510510
summary.failed += 1
511+
if terminalCodes := r.terminalExitCodes(ctx, aw); len(terminalCodes) > 0 {
512+
for _, containerStatus := range pod.Status.ContainerStatuses {
513+
if containerStatus.State.Terminated != nil {
514+
exitCode := containerStatus.State.Terminated.ExitCode
515+
if exitCode != 0 {
516+
for _, ec := range terminalCodes {
517+
if ec == int(exitCode) {
518+
summary.terminalFailure = true
519+
break
520+
}
521+
}
522+
}
523+
}
524+
}
525+
}
526+
if retryableCodes := r.retryableExitCodes(ctx, aw); len(retryableCodes) > 0 {
527+
for _, containerStatus := range pod.Status.ContainerStatuses {
528+
if containerStatus.State.Terminated != nil {
529+
exitCode := containerStatus.State.Terminated.ExitCode
530+
if exitCode != 0 {
531+
terminal := true
532+
for _, ec := range retryableCodes {
533+
if ec == int(exitCode) {
534+
terminal = false
535+
break
536+
}
537+
}
538+
if terminal {
539+
summary.terminalFailure = terminal
540+
}
541+
}
542+
}
543+
}
544+
}
511545
}
512546
}
513547

@@ -749,6 +783,34 @@ func (r *AppWrapperReconciler) timeToLiveAfterSucceededDuration(ctx context.Cont
749783
return r.Config.FaultTolerance.SuccessTTL
750784
}
751785

786+
func (r *AppWrapperReconciler) terminalExitCodes(_ context.Context, aw *workloadv1beta2.AppWrapper) []int {
787+
ans := []int{}
788+
if exitCodeAnn, ok := aw.Annotations[workloadv1beta2.TerminalExitCodesAnnotation]; ok {
789+
exitCodes := strings.Split(exitCodeAnn, ",")
790+
for _, str := range exitCodes {
791+
exitCode, err := strconv.Atoi(str)
792+
if err == nil {
793+
ans = append(ans, exitCode)
794+
}
795+
}
796+
}
797+
return ans
798+
}
799+
800+
func (r *AppWrapperReconciler) retryableExitCodes(_ context.Context, aw *workloadv1beta2.AppWrapper) []int {
801+
ans := []int{}
802+
if exitCodeAnn, ok := aw.Annotations[workloadv1beta2.RetryableExitCodesAnnotation]; ok {
803+
exitCodes := strings.Split(exitCodeAnn, ",")
804+
for _, str := range exitCodes {
805+
exitCode, err := strconv.Atoi(str)
806+
if err == nil {
807+
ans = append(ans, exitCode)
808+
}
809+
}
810+
}
811+
return ans
812+
}
813+
752814
func clearCondition(aw *workloadv1beta2.AppWrapper, condition workloadv1beta2.AppWrapperCondition, reason string, message string) {
753815
if meta.IsStatusConditionTrue(aw.Status.Conditions, string(condition)) {
754816
meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{

0 commit comments

Comments
 (0)