@@ -20,6 +20,7 @@ import (
20
20
"context"
21
21
"fmt"
22
22
"strconv"
23
+ "strings"
23
24
"time"
24
25
25
26
batchv1 "k8s.io/api/batch/v1"
@@ -57,11 +58,12 @@ type AppWrapperReconciler struct {
57
58
}
58
59
59
60
// podStatusSummary aggregates what getPodStatus observed about the Pods of a
// single AppWrapper.
type podStatusSummary struct {
	// expected is the number of Pods the workload should have.
	// NOTE(review): where this is computed is not visible here — confirm in getPodStatus.
	expected int32
	// pending and running are per-phase Pod counts.
	// NOTE(review): presumably incremented for the corresponding Pod phases — confirm in getPodStatus.
	pending int32
	running int32
	// succeeded and failed are incremented by getPodStatus once per Pod
	// counted in that phase.
	succeeded int32
	failed    int32
	// terminalFailure is set when a failed Pod has a terminated container whose
	// non-zero exit code is listed as terminal, or, when a retryable list is
	// configured, is absent from that list. resetOrFail skips any remaining
	// retries when it is true.
	terminalFailure bool
}
66
68
67
69
type componentStatusSummary struct {
@@ -215,7 +217,7 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
215
217
if fatal {
216
218
return r .updateStatus (ctx , aw , workloadv1beta2 .AppWrapperFailed ) // always move to failed on fatal error
217
219
} else {
218
- return r .resetOrFail (ctx , aw )
220
+ return r .resetOrFail (ctx , aw , false )
219
221
}
220
222
}
221
223
return r .updateStatus (ctx , aw , workloadv1beta2 .AppWrapperRunning )
@@ -225,11 +227,15 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
225
227
return r .updateStatus (ctx , aw , workloadv1beta2 .AppWrapperSuspending ) // begin undeployment
226
228
}
227
229
228
- // First, check the Component-level status of the workload
230
+ // Gather status information at the Component and Pod level.
229
231
compStatus , err := r .getComponentStatus (ctx , aw )
230
232
if err != nil {
231
233
return ctrl.Result {}, err
232
234
}
235
+ podStatus , err := r .getPodStatus (ctx , aw )
236
+ if err != nil {
237
+ return ctrl.Result {}, err
238
+ }
233
239
234
240
// Detect externally deleted components and transition to Failed with no GracePeriod or retry
235
241
if compStatus .deployed != compStatus .expected {
@@ -251,13 +257,7 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
251
257
Reason : "FailedComponent" ,
252
258
Message : fmt .Sprintf ("Found %v failed components" , compStatus .failed ),
253
259
})
254
- return r .resetOrFail (ctx , aw )
255
- }
256
-
257
- // Second, check the Pod-level status of the workload
258
- podStatus , err := r .getPodStatus (ctx , aw )
259
- if err != nil {
260
- return ctrl.Result {}, err
260
+ return r .resetOrFail (ctx , aw , podStatus .terminalFailure )
261
261
}
262
262
263
263
// Handle Success
@@ -295,7 +295,7 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
295
295
if now .Before (deadline ) {
296
296
return ctrl.Result {RequeueAfter : deadline .Sub (now )}, r .Status ().Update (ctx , aw )
297
297
} else {
298
- return r .resetOrFail (ctx , aw )
298
+ return r .resetOrFail (ctx , aw , podStatus . terminalFailure )
299
299
}
300
300
}
301
301
@@ -330,7 +330,7 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
330
330
Reason : "InsufficientPodsReady" ,
331
331
Message : podDetailsMessage ,
332
332
})
333
- return r .resetOrFail (ctx , aw )
333
+ return r .resetOrFail (ctx , aw , podStatus . terminalFailure )
334
334
}
335
335
336
336
case workloadv1beta2 .AppWrapperSuspending : // undeploying components
@@ -473,9 +473,9 @@ func (r *AppWrapperReconciler) updateStatus(ctx context.Context, aw *workloadv1b
473
473
return ctrl.Result {}, nil
474
474
}
475
475
476
- func (r * AppWrapperReconciler ) resetOrFail (ctx context.Context , aw * workloadv1beta2.AppWrapper ) (ctrl.Result , error ) {
476
+ func (r * AppWrapperReconciler ) resetOrFail (ctx context.Context , aw * workloadv1beta2.AppWrapper , terminalFailure bool ) (ctrl.Result , error ) {
477
477
maxRetries := r .retryLimit (ctx , aw )
478
- if aw .Status .Retries < maxRetries {
478
+ if ! terminalFailure && aw .Status .Retries < maxRetries {
479
479
aw .Status .Retries += 1
480
480
return r .updateStatus (ctx , aw , workloadv1beta2 .AppWrapperResetting )
481
481
} else {
@@ -508,6 +508,40 @@ func (r *AppWrapperReconciler) getPodStatus(ctx context.Context, aw *workloadv1b
508
508
summary .succeeded += 1
509
509
case v1 .PodFailed :
510
510
summary .failed += 1
511
+ if terminalCodes := r .terminalExitCodes (ctx , aw ); len (terminalCodes ) > 0 {
512
+ for _ , containerStatus := range pod .Status .ContainerStatuses {
513
+ if containerStatus .State .Terminated != nil {
514
+ exitCode := containerStatus .State .Terminated .ExitCode
515
+ if exitCode != 0 {
516
+ for _ , ec := range terminalCodes {
517
+ if ec == int (exitCode ) {
518
+ summary .terminalFailure = true
519
+ break
520
+ }
521
+ }
522
+ }
523
+ }
524
+ }
525
+ }
526
+ if retryableCodes := r .retryableExitCodes (ctx , aw ); len (retryableCodes ) > 0 {
527
+ for _ , containerStatus := range pod .Status .ContainerStatuses {
528
+ if containerStatus .State .Terminated != nil {
529
+ exitCode := containerStatus .State .Terminated .ExitCode
530
+ if exitCode != 0 {
531
+ terminal := true
532
+ for _ , ec := range retryableCodes {
533
+ if ec == int (exitCode ) {
534
+ terminal = false
535
+ break
536
+ }
537
+ }
538
+ if terminal {
539
+ summary .terminalFailure = terminal
540
+ }
541
+ }
542
+ }
543
+ }
544
+ }
511
545
}
512
546
}
513
547
@@ -749,6 +783,34 @@ func (r *AppWrapperReconciler) timeToLiveAfterSucceededDuration(ctx context.Cont
749
783
return r .Config .FaultTolerance .SuccessTTL
750
784
}
751
785
786
+ func (r * AppWrapperReconciler ) terminalExitCodes (_ context.Context , aw * workloadv1beta2.AppWrapper ) []int {
787
+ ans := []int {}
788
+ if exitCodeAnn , ok := aw .Annotations [workloadv1beta2 .TerminalExitCodesAnnotation ]; ok {
789
+ exitCodes := strings .Split (exitCodeAnn , "," )
790
+ for _ , str := range exitCodes {
791
+ exitCode , err := strconv .Atoi (str )
792
+ if err == nil {
793
+ ans = append (ans , exitCode )
794
+ }
795
+ }
796
+ }
797
+ return ans
798
+ }
799
+
800
+ func (r * AppWrapperReconciler ) retryableExitCodes (_ context.Context , aw * workloadv1beta2.AppWrapper ) []int {
801
+ ans := []int {}
802
+ if exitCodeAnn , ok := aw .Annotations [workloadv1beta2 .RetryableExitCodesAnnotation ]; ok {
803
+ exitCodes := strings .Split (exitCodeAnn , "," )
804
+ for _ , str := range exitCodes {
805
+ exitCode , err := strconv .Atoi (str )
806
+ if err == nil {
807
+ ans = append (ans , exitCode )
808
+ }
809
+ }
810
+ }
811
+ return ans
812
+ }
813
+
752
814
func clearCondition (aw * workloadv1beta2.AppWrapper , condition workloadv1beta2.AppWrapperCondition , reason string , message string ) {
753
815
if meta .IsStatusConditionTrue (aw .Status .Conditions , string (condition )) {
754
816
meta .SetStatusCondition (& aw .Status .Conditions , metav1.Condition {
0 commit comments