diff --git a/api/v1alpha1/nodehealthcheck_types.go b/api/v1alpha1/nodehealthcheck_types.go index 8cf43635..2e46ff2f 100644 --- a/api/v1alpha1/nodehealthcheck_types.go +++ b/api/v1alpha1/nodehealthcheck_types.go @@ -34,6 +34,11 @@ const ( ConditionReasonDisabledTemplateInvalid = "RemediationTemplateInvalid" // ConditionReasonEnabled is the condition reason for type Disabled and status False ConditionReasonEnabled = "NodeHealthCheckEnabled" + + // ConditionTypeStormActive is the condition type used when NHC will get disabled + ConditionTypeStormActive = "StormActive" + // ConditionReasonStormThresholdChange is the condition reason for a storm change from active to inactive and vice versa + ConditionReasonStormThresholdChange = "HealthyNodeThresholdChange" ) // NHCPhase is the string used for NHC.Status.Phase @@ -107,6 +112,23 @@ type NodeHealthCheckSpec struct { //+operator-sdk:csv:customresourcedefinitions:type=spec MaxUnhealthy *intstr.IntOrString `json:"maxUnhealthy,omitempty"` + // StormTerminationDelay introduces a configurable delay after storm recovery + // exit criteria are satisfied (for example, when the number of healthy nodes + // rises above the configured minHealthy constraint). While this + // delay is in effect, NHC remains in storm recovery mode and does not create + // new remediations. Once the delay elapses, storm recovery mode exits and normal + // remediation resumes. + // + // Expects a string of decimal numbers each with optional fraction and a unit + // suffix, e.g. "300ms", "1.5h" or "2h45m". Valid time units are "ns", "us" + // (or "µs"), "ms", "s", "m", "h". + // + //+kubebuilder:validation:Pattern="^([0-9]+(\\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$" + //+kubebuilder:validation:Type=string + //+optional + //+operator-sdk:csv:customresourcedefinitions:type=spec + StormTerminationDelay *metav1.Duration `json:"stormTerminationDelay,omitempty"` + // RemediationTemplate is a reference to a remediation template // provided by an infrastructure provider. // @@ -266,6 +288,25 @@ type NodeHealthCheckStatus struct { //+operator-sdk:csv:customresourcedefinitions:type=status,xDescriptors="urn:alm:descriptor:io.kubernetes.phase:reason" Reason string `json:"reason,omitempty"` + // StormRecoveryStartTime records when storm recovery mode was activated. + // This field is set when StormRecoveryActive becomes true and helps track + // how long the system has been in storm recovery mode. + // + //+optional + //+kubebuilder:validation:Type=string + //+kubebuilder:validation:Format=date-time + //+operator-sdk:csv:customresourcedefinitions:type=status + StormRecoveryStartTime *metav1.Time `json:"stormRecoveryStartTime,omitempty"` + + // StormTerminationStartTime records when storm recovery mode regained the minHealthy/maxUnhealthy constraint + // and the storm is about to end (after NodeHealthCheckSpec.StormTerminationDelay has passed). + // + //+optional + //+kubebuilder:validation:Type=string + //+kubebuilder:validation:Format=date-time + //+operator-sdk:csv:customresourcedefinitions:type=status + StormTerminationStartTime *metav1.Time `json:"stormTerminationStartTime,omitempty"` + // LastUpdateTime is the last time the status was updated. // //+optional diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index b72af3d4..20638b1b 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -21,8 +21,8 @@ limitations under the License. 
package v1alpha1 import ( - v1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + corev1 "k8s.io/api/core/v1" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/util/intstr" ) @@ -122,9 +122,14 @@ func (in *NodeHealthCheckSpec) DeepCopyInto(out *NodeHealthCheckSpec) { *out = new(intstr.IntOrString) **out = **in } + if in.StormTerminationDelay != nil { + in, out := &in.StormTerminationDelay, &out.StormTerminationDelay + *out = new(v1.Duration) + **out = **in + } if in.RemediationTemplate != nil { in, out := &in.RemediationTemplate, &out.RemediationTemplate - *out = new(v1.ObjectReference) + *out = new(corev1.ObjectReference) **out = **in } if in.EscalatingRemediations != nil { @@ -139,7 +144,7 @@ func (in *NodeHealthCheckSpec) DeepCopyInto(out *NodeHealthCheckSpec) { } if in.HealthyDelay != nil { in, out := &in.HealthyDelay, &out.HealthyDelay - *out = new(metav1.Duration) + *out = new(v1.Duration) **out = **in } } @@ -180,18 +185,26 @@ func (in *NodeHealthCheckStatus) DeepCopyInto(out *NodeHealthCheckStatus) { } if in.InFlightRemediations != nil { in, out := &in.InFlightRemediations, &out.InFlightRemediations - *out = make(map[string]metav1.Time, len(*in)) + *out = make(map[string]v1.Time, len(*in)) for key, val := range *in { (*out)[key] = *val.DeepCopy() } } if in.Conditions != nil { in, out := &in.Conditions, &out.Conditions - *out = make([]metav1.Condition, len(*in)) + *out = make([]v1.Condition, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.StormRecoveryStartTime != nil { + in, out := &in.StormRecoveryStartTime, &out.StormRecoveryStartTime + *out = (*in).DeepCopy() + } + if in.StormTerminationStartTime != nil { + in, out := &in.StormTerminationStartTime, &out.StormTerminationStartTime + *out = (*in).DeepCopy() + } if in.LastUpdateTime != nil { in, out := &in.LastUpdateTime, &out.LastUpdateTime *out = (*in).DeepCopy() diff --git a/bundle/manifests/node-healthcheck-operator.clusterserviceversion.yaml b/bundle/manifests/node-healthcheck-operator.clusterserviceversion.yaml index c90f70ab..c56b8189 100644 --- a/bundle/manifests/node-healthcheck-operator.clusterserviceversion.yaml +++ b/bundle/manifests/node-healthcheck-operator.clusterserviceversion.yaml @@ -137,6 +137,16 @@ spec: to work with an empty selector, which matches all nodes." displayName: Selector path: selector + - description: "StormTerminationDelay introduces a configurable delay after + storm recovery exit criteria are satisfied (for example, when the number + of healthy nodes rises above the configured minHealthy constraint). While + this delay is in effect, NHC remains in storm recovery mode and does not + create new remediations. Once the delay elapses, storm recovery mode exits + and normal remediation resumes. \n Expects a string of decimal numbers each + with optional fraction and a unit suffix, e.g. \"300ms\", \"1.5h\" or \"2h45m\". + Valid time units are \"ns\", \"us\" (or \"µs\"), \"ms\", \"s\", \"m\", \"h\"." + displayName: Storm Termination Delay + path: stormTerminationDelay - description: UnhealthyConditions contains a list of the conditions that determine whether a node is considered unhealthy. The conditions are combined in a logical OR, i.e. if any of the conditions is met, the node is unhealthy. @@ -189,6 +199,16 @@ spec: path: reason x-descriptors: - urn:alm:descriptor:io.kubernetes.phase:reason + - description: StormRecoveryStartTime records when storm recovery mode was activated. 
+ This field is set when StormRecoveryActive becomes true and helps track + how long the system has been in storm recovery mode. + displayName: Storm Recovery Start Time + path: stormRecoveryStartTime + - description: StormTerminationStartTime records when storm recovery mode regained + the minHealthy/maxUnhealthy constraint and the storm is about to end (after + NodeHealthCheckSpec.StormTerminationDelay has passed). + displayName: Storm Termination Start Time + path: stormTerminationStartTime - description: UnhealthyNodes tracks currently unhealthy nodes and their remediations. displayName: Unhealthy Nodes path: unhealthyNodes diff --git a/bundle/manifests/remediation.medik8s.io_nodehealthchecks.yaml b/bundle/manifests/remediation.medik8s.io_nodehealthchecks.yaml index 2b8d47e6..0d3f2a6d 100644 --- a/bundle/manifests/remediation.medik8s.io_nodehealthchecks.yaml +++ b/bundle/manifests/remediation.medik8s.io_nodehealthchecks.yaml @@ -273,6 +273,20 @@ spec: type: object type: object x-kubernetes-map-type: atomic + stormTerminationDelay: + description: |- + StormTerminationDelay introduces a configurable delay after storm recovery + exit criteria are satisfied (for example, when the number of healthy nodes + rises above the configured minHealthy constraint). While this + delay is in effect, NHC remains in storm recovery mode and does not create + new remediations. Once the delay elapses, storm recovery mode exits and normal + remediation resumes. + + Expects a string of decimal numbers each with optional fraction and a unit + suffix, e.g. "300ms", "1.5h" or "2h45m". Valid time units are "ns", "us" + (or "µs"), "ms", "s", "m", "h". + pattern: ^([0-9]+(\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$ + type: string unhealthyConditions: default: - duration: 300s @@ -417,6 +431,19 @@ spec: reason: description: Reason explains the current phase in more detail. type: string + stormRecoveryStartTime: + description: |- + StormRecoveryStartTime records when storm recovery mode was activated. + This field is set when StormRecoveryActive becomes true and helps track + how long the system has been in storm recovery mode. + format: date-time + type: string + stormTerminationStartTime: + description: |- + StormTerminationStartTime records when storm recovery mode regained the minHealthy/maxUnhealthy constraint + and the storm is about to end (after NodeHealthCheckSpec.StormTerminationDelay has passed). + format: date-time + type: string unhealthyNodes: description: UnhealthyNodes tracks currently unhealthy nodes and their remediations. diff --git a/config/crd/bases/remediation.medik8s.io_nodehealthchecks.yaml b/config/crd/bases/remediation.medik8s.io_nodehealthchecks.yaml index 10ff44cb..0c5a46fb 100644 --- a/config/crd/bases/remediation.medik8s.io_nodehealthchecks.yaml +++ b/config/crd/bases/remediation.medik8s.io_nodehealthchecks.yaml @@ -271,6 +271,20 @@ spec: type: object type: object x-kubernetes-map-type: atomic + stormTerminationDelay: + description: |- + StormTerminationDelay introduces a configurable delay after storm recovery + exit criteria are satisfied (for example, when the number of healthy nodes + rises above the configured minHealthy constraint). While this + delay is in effect, NHC remains in storm recovery mode and does not create + new remediations. Once the delay elapses, storm recovery mode exits and normal + remediation resumes. + + Expects a string of decimal numbers each with optional fraction and a unit + suffix, e.g. "300ms", "1.5h" or "2h45m". 
Valid time units are "ns", "us" + (or "µs"), "ms", "s", "m", "h". + pattern: ^([0-9]+(\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$ + type: string unhealthyConditions: default: - duration: 300s @@ -415,6 +429,19 @@ spec: reason: description: Reason explains the current phase in more detail. type: string + stormRecoveryStartTime: + description: |- + StormRecoveryStartTime records when storm recovery mode was activated. + This field is set when StormRecoveryActive becomes true and helps track + how long the system has been in storm recovery mode. + format: date-time + type: string + stormTerminationStartTime: + description: |- + StormTerminationStartTime records when storm recovery mode regained the minHealthy/maxUnhealthy constraint + and the storm is about to end (after NodeHealthCheckSpec.StormTerminationDelay has passed). + format: date-time + type: string unhealthyNodes: description: UnhealthyNodes tracks currently unhealthy nodes and their remediations. diff --git a/config/manifests/base/bases/node-healthcheck-operator.clusterserviceversion.yaml b/config/manifests/base/bases/node-healthcheck-operator.clusterserviceversion.yaml index fc707711..3d7b4323 100644 --- a/config/manifests/base/bases/node-healthcheck-operator.clusterserviceversion.yaml +++ b/config/manifests/base/bases/node-healthcheck-operator.clusterserviceversion.yaml @@ -97,6 +97,16 @@ spec: to work with an empty selector, which matches all nodes." displayName: Selector path: selector + - description: "StormTerminationDelay introduces a configurable delay after + storm recovery exit criteria are satisfied (for example, when the number + of healthy nodes rises above the configured minHealthy constraint). While + this delay is in effect, NHC remains in storm recovery mode and does not + create new remediations. Once the delay elapses, storm recovery mode exits + and normal remediation resumes. \n Expects a string of decimal numbers each + with optional fraction and a unit suffix, e.g. \"300ms\", \"1.5h\" or \"2h45m\". + Valid time units are \"ns\", \"us\" (or \"µs\"), \"ms\", \"s\", \"m\", \"h\"." + displayName: Storm Termination Delay + path: stormTerminationDelay - description: UnhealthyConditions contains a list of the conditions that determine whether a node is considered unhealthy. The conditions are combined in a logical OR, i.e. if any of the conditions is met, the node is unhealthy. @@ -149,6 +159,16 @@ spec: path: reason x-descriptors: - urn:alm:descriptor:io.kubernetes.phase:reason + - description: StormRecoveryStartTime records when storm recovery mode was activated. + This field is set when StormRecoveryActive becomes true and helps track + how long the system has been in storm recovery mode. + displayName: Storm Recovery Start Time + path: stormRecoveryStartTime + - description: StormTerminationStartTime records when storm recovery mode regained + the minHealthy/maxUnhealthy constraint and the storm is about to end (after + NodeHealthCheckSpec.StormTerminationDelay has passed). + displayName: Storm Termination Start Time + path: stormTerminationStartTime - description: UnhealthyNodes tracks currently unhealthy nodes and their remediations. 
displayName: Unhealthy Nodes path: unhealthyNodes diff --git a/controllers/nodehealthcheck_controller.go b/controllers/nodehealthcheck_controller.go index 79b9b4c4..a018e4f3 100644 --- a/controllers/nodehealthcheck_controller.go +++ b/controllers/nodehealthcheck_controller.go @@ -42,6 +42,7 @@ import ( "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/tools/record" "k8s.io/utils/pointer" + "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" @@ -334,7 +335,8 @@ func (r *NodeHealthCheckReconciler) Reconcile(ctx context.Context, req ctrl.Requ // check if we have enough healthy nodes skipRemediation := false - if minHealthy, err := getMinHealthy(nhc, len(selectedNodes)); err != nil { + minHealthy, err := getMinHealthy(nhc, len(selectedNodes)) + if err != nil { log.Error(err, "failed to calculate min healthy allowed nodes", "minHealthy", nhc.Spec.MinHealthy, "maxUnhealthy", nhc.Spec.MaxUnhealthy, "observedNodes", nhc.Status.ObservedNodes) return result, err @@ -345,6 +347,18 @@ func (r *NodeHealthCheckReconciler) Reconcile(ctx context.Context, req ctrl.Requ skipRemediation = true } + // check if we have enough healthy nodes and manage storm recovery + stormRecoveryActive, requeueAfter := r.evaluateStormRecovery(nhc, minHealthy) + updateRequeueAfter(&result, requeueAfter) + + // skipping remediation due to active storm + if stormRecoveryActive && !skipRemediation { + msg := fmt.Sprint("Storm recovery active: skipping creation of new remediations") + r.Log.Info(msg) + commonevents.WarningEvent(r.Recorder, nhc, utils.EventReasonRemediationSkipped, msg) + skipRemediation = true + } + // remediate unhealthy nodes for _, node := range matchingNodes { @@ -909,3 +923,135 @@ func getMinHealthy(nhc *remediationv1alpha1.NodeHealthCheck, total int) (int, er } return 0, fmt.Errorf("one of minHealthy and maxUnhealthy should be specified") } + +func (r *NodeHealthCheckReconciler) shouldStartStormRecovery(nhc *remediationv1alpha1.NodeHealthCheck, minHealthy int) bool { + if utils.IsConditionTrue(nhc.Status.Conditions, remediationv1alpha1.ConditionTypeStormActive, remediationv1alpha1.ConditionReasonStormThresholdChange) { + return false + } + + return !isMinHealthyConstraintSatisfied(nhc, minHealthy) +} + +func (r *NodeHealthCheckReconciler) shouldExistStormRecovery(nhc *remediationv1alpha1.NodeHealthCheck, minHealthy int) bool { + isActive := utils.IsConditionTrue(nhc.Status.Conditions, remediationv1alpha1.ConditionTypeStormActive, remediationv1alpha1.ConditionReasonStormThresholdChange) + isDelayElapsed := false + if nhc.Status.StormTerminationStartTime != nil { + isDelayElapsed = time.Now().After(nhc.Status.StormTerminationStartTime.Time.Add(nhc.Spec.StormTerminationDelay.Duration)) + } + shouldExist := isActive && isMinHealthyConstraintSatisfied(nhc, minHealthy) && isDelayElapsed + return shouldExist +} + +func (r *NodeHealthCheckReconciler) shouldSetStormExitDelay(nhc *remediationv1alpha1.NodeHealthCheck, minHealthy int) bool { + isActive := utils.IsConditionTrue(nhc.Status.Conditions, remediationv1alpha1.ConditionTypeStormActive, remediationv1alpha1.ConditionReasonStormThresholdChange) + isHealthSatisfied := isMinHealthyConstraintSatisfied(nhc, minHealthy) + isDelayUnSet := nhc.Status.StormTerminationStartTime == nil + return isActive && isHealthSatisfied && isDelayUnSet +} + +func isMinHealthyConstraintSatisfied(nhc *remediationv1alpha1.NodeHealthCheck, minHealthy int) bool { + healthyCount := 0 + if 
nhc.Status.HealthyNodes != nil {
+		healthyCount = *nhc.Status.HealthyNodes
+	}
+
+	return healthyCount >= minHealthy
+}
+
+// evaluateStormRecovery updates the storm recovery state (active or not) and returns whether it is currently active, along with an optional requeue duration
+func (r *NodeHealthCheckReconciler) evaluateStormRecovery(nhc *remediationv1alpha1.NodeHealthCheck, minHealthy int) (bool, *time.Duration) {
+	if nhc.Spec.StormTerminationDelay == nil {
+		return false, nil
+	}
+	// Check if we should start storm recovery
+	shouldStart := r.shouldStartStormRecovery(nhc, minHealthy)
+	// Check if we should exit storm recovery now
+	shouldExit := r.shouldExistStormRecovery(nhc, minHealthy)
+	// Check if we should start the delay count for storm recovery exit
+	shouldSetDelay := r.shouldSetStormExitDelay(nhc, minHealthy)
+	// Update storm recovery status
+	if shouldStart {
+		r.updateStormRecoveryStatus(nhc, activate)
+	} else if shouldExit {
+		r.updateStormRecoveryStatus(nhc, deactivate)
+	} else if shouldSetDelay {
+		r.updateStormRecoveryStatus(nhc, setDelay)
+	}
+
+	isStormActive := utils.IsConditionTrue(nhc.Status.Conditions, remediationv1alpha1.ConditionTypeStormActive, remediationv1alpha1.ConditionReasonStormThresholdChange)
+
+	return isStormActive, calculateRequeue(isStormActive, nhc)
+}
+
+func calculateRequeue(active bool, nhc *remediationv1alpha1.NodeHealthCheck) *time.Duration {
+	if !active || nhc.Status.StormTerminationStartTime == nil {
+		return nil
+	}
+	elapsedTime := time.Since(nhc.Status.StormTerminationStartTime.Time)
+	// time left before storm should finish
+	timeLeft := nhc.Spec.StormTerminationDelay.Duration - elapsedTime
+	// Add some requeue buffer
+	var requeueAfter *time.Duration
+	if timeLeft < 0 {
+		requeueAfter = ptr.To(time.Second)
+	} else {
+		requeueAfter = ptr.To(timeLeft + time.Second)
+	}
+	return requeueAfter
+}
+
+func (r *NodeHealthCheckReconciler) getRemediationCount(nhc *remediationv1alpha1.NodeHealthCheck) int {
+	inProgressRemediations := 0
+	for _, unhealthyNode := range nhc.Status.UnhealthyNodes {
+		if len(unhealthyNode.Remediations) > 0 {
+			inProgressRemediations++
+		}
+	}
+	return inProgressRemediations
+}
+
+type stormMode int
+
+const (
+	activate stormMode = iota
+	deactivate
+	setDelay
+)
+
+func (r *NodeHealthCheckReconciler) updateStormRecoveryStatus(nhc *remediationv1alpha1.NodeHealthCheck, sm stormMode) {
+
+	switch sm {
+	case activate:
+		{
+			meta.SetStatusCondition(&nhc.Status.Conditions, metav1.Condition{
+				Type:    remediationv1alpha1.ConditionTypeStormActive,
+				Status:  metav1.ConditionTrue,
+				Reason:  remediationv1alpha1.ConditionReasonStormThresholdChange,
+				Message: "Storm recovery mode is active, preventing new remediations until it ends",
+			})
+			now := metav1.Time{Time: currentTime()}
+			nhc.Status.StormRecoveryStartTime = &now
+			r.Log.Info("Storm recovery mode activated", "nhc", nhc.Name)
+			commonevents.WarningEvent(r.Recorder, nhc, "StormRecoveryStarted", "Storm recovery mode activated - delaying remediation until threshold is reached")
+		}
+	case deactivate:
+		{
+			meta.SetStatusCondition(&nhc.Status.Conditions, metav1.Condition{
+				Type:    remediationv1alpha1.ConditionTypeStormActive,
+				Status:  metav1.ConditionFalse,
+				Reason:  remediationv1alpha1.ConditionReasonStormThresholdChange,
+				Message: "Storm recovery mode is deactivated, remediations can occur normally",
+			})
+			nhc.Status.StormRecoveryStartTime = nil
+			nhc.Status.StormTerminationStartTime = nil
+			r.Log.Info("Storm recovery mode deactivated", "nhc", nhc.Name)
+			commonevents.NormalEvent(r.Recorder, nhc, "StormRecoveryEnded", "Storm recovery mode deactivated - normal remediation resumed")
+		}
+	case setDelay:
+		now := metav1.Time{Time: currentTime()}
+		nhc.Status.StormTerminationStartTime = &now
+		r.Log.Info("Storm recovery regained health, starting the delay countdown until storm recovery ends", "nhc", nhc.Name)
+		commonevents.WarningEvent(r.Recorder, nhc, "StormRecoveryDelayStarted", "Storm recovery mode will exit after delay")
+	}
+
+}
diff --git a/controllers/nodehealthcheck_controller_test.go b/controllers/nodehealthcheck_controller_test.go
index 97b56ae8..4f411d29 100644
--- a/controllers/nodehealthcheck_controller_test.go
+++ b/controllers/nodehealthcheck_controller_test.go
@@ -1959,6 +1959,101 @@ var _ = Describe("Node Health Check CR", func() {
 
 	})
 
+	Context("Storm Recovery", func() {
+		BeforeEach(func() {
+			underTest = newNodeHealthCheckWithStormRecovery()
+			setupObjects(3, 4, true) // 3 unhealthy, 4 healthy = 7 total
+		})
+		When("consecutive storms are triggered", func() {
+			It("they should start and finish according to the delay and min/max criteria", func() {
+				// Phase 1: Verify initial state - normal operation
+				By("verifying initial normal operation")
+				// 7 total, 4 healthy, 3 unhealthy, 3 remediations
+				Eventually(func(g Gomega) {
+					g.Expect(k8sClient.Get(context.Background(), client.ObjectKeyFromObject(underTest), underTest)).To(Succeed())
+					g.Expect(*underTest.Status.HealthyNodes).To(Equal(4))
+					g.Expect(len(underTest.Status.UnhealthyNodes)).To(Equal(3))
+					g.Expect(utils.IsConditionSet(underTest.Status.Conditions, v1alpha1.ConditionTypeStormActive, v1alpha1.ConditionReasonStormThresholdChange)).To(BeFalse())
+					g.Expect(getRemediationsCount(underTest)).To(Equal(3))
+				}, "5s", "1s").Should(Succeed())
+
+				// Phase 2: Make the fourth node unhealthy - triggers first storm recovery
+				By("making one more node unhealthy - triggers storm recovery")
+				mockNodeGettingUnhealthy("healthy-worker-node-1")
+				// wait for node to turn unhealthy
+				Eventually(func(g Gomega) {
+					g.Expect(k8sClient.Get(context.Background(), client.ObjectKeyFromObject(underTest), underTest)).To(Succeed())
+					g.Expect(len(underTest.Status.UnhealthyNodes)).To(Equal(4))
+				}, "15s", "100ms").Should(Succeed())
+
+				// Verify Storm recovery is activated
+				// 7 total, 3 healthy, 4 unhealthy, 3 remediations (additional remediation not created because of min healthy constraint)
+				Eventually(func(g Gomega) {
+					g.Expect(k8sClient.Get(context.Background(), client.ObjectKeyFromObject(underTest), underTest)).To(Succeed())
+					g.Expect(*underTest.Status.HealthyNodes).To(Equal(3))
+					g.Expect(len(underTest.Status.UnhealthyNodes)).To(Equal(4))
+					g.Expect(getRemediationsCount(underTest)).To(Equal(3))
+					g.Expect(utils.IsConditionTrue(underTest.Status.Conditions, v1alpha1.ConditionTypeStormActive, v1alpha1.ConditionReasonStormThresholdChange)).To(BeTrue())
+					g.Expect(underTest.Status.StormRecoveryStartTime).ToNot(BeNil())
+				}, "5s", "100ms").Should(Succeed())
+
+				// Phase 3: Recover one node - storm recovery remains active due to 2-second delay
+				By("recovering one node - storm recovery remains active due to 2-second delay")
+				mockNodeGettingHealthy("unhealthy-worker-node-1")
+
+				// wait for node to recover
+				Eventually(func(g Gomega) {
+					g.Expect(k8sClient.Get(context.Background(), client.ObjectKeyFromObject(underTest), underTest)).To(Succeed())
+					g.Expect(*underTest.Status.HealthyNodes).To(Equal(4))
+				}, "5s", "100ms").Should(Succeed())
+				// 7 total, 4 healthy, 3 unhealthy, 2 remediations (one removed due to healthy node), 1 remediation is pending to be created (not created because of storm)
+				
Consistently(func(g Gomega) { + g.Expect(k8sClient.Get(context.Background(), client.ObjectKeyFromObject(underTest), underTest)).To(Succeed()) + g.Expect(*underTest.Status.HealthyNodes).To(Equal(4)) + g.Expect(len(underTest.Status.UnhealthyNodes)).To(Equal(3)) + g.Expect(getRemediationsCount(underTest)).To(Equal(2)) + g.Expect(utils.IsConditionTrue(underTest.Status.Conditions, v1alpha1.ConditionTypeStormActive, v1alpha1.ConditionReasonStormThresholdChange)).To(BeTrue()) + }, "2s", "100ms").Should(Succeed()) + //expected termination time of the first storm + firstStormTerminationTime := underTest.Status.StormTerminationStartTime.Time.Add(underTest.Spec.StormTerminationDelay.Duration) + // Phase 4: wait for the delay to pass + time.Sleep(time.Millisecond * 1500) + //Expect Storm Recovery mode to end + // 7 total, 4 healthy, 3 unhealthy, 3 remediations (pending remediation created when storm is done) + Eventually(func(g Gomega) { + g.Expect(k8sClient.Get(context.Background(), client.ObjectKeyFromObject(underTest), underTest)).To(Succeed()) + g.Expect(utils.IsConditionSet(underTest.Status.Conditions, v1alpha1.ConditionTypeStormActive, v1alpha1.ConditionReasonStormThresholdChange)).To(BeTrue()) + g.Expect(utils.IsConditionTrue(underTest.Status.Conditions, v1alpha1.ConditionTypeStormActive, v1alpha1.ConditionReasonStormThresholdChange)).To(BeFalse()) + g.Expect(getRemediationsCount(underTest)).To(Equal(3)) + }, "1500ms", "100ms").Should(Succeed()) + + // Phase 5: Make the forth node unhealthy - triggers the second storm + By("making the 4th node unhealthy - triggers second storm recovery") + mockNodeGettingUnhealthy("healthy-worker-node-2") + // wait for node to turn unhealthy + Eventually(func(g Gomega) { + g.Expect(k8sClient.Get(context.Background(), client.ObjectKeyFromObject(underTest), underTest)).To(Succeed()) + g.Expect(len(underTest.Status.UnhealthyNodes)).To(Equal(4)) + }, "15s", "100ms").Should(Succeed()) + + // Verify Second Storm mode is activated + // 7 total, 3 healthy, 4 unhealthy, 3 remediations (additional remediation not created because of min healthy constraint) + Eventually(func(g Gomega) { + g.Expect(k8sClient.Get(context.Background(), client.ObjectKeyFromObject(underTest), underTest)).To(Succeed()) + g.Expect(*underTest.Status.HealthyNodes).To(Equal(3)) + g.Expect(len(underTest.Status.UnhealthyNodes)).To(Equal(4)) + g.Expect(getRemediationsCount(underTest)).To(Equal(3)) + g.Expect(utils.IsConditionTrue(underTest.Status.Conditions, v1alpha1.ConditionTypeStormActive, v1alpha1.ConditionReasonStormThresholdChange)).To(BeTrue()) + g.Expect(underTest.Status.StormRecoveryStartTime).ToNot(BeNil()) + // Verifies StormRecoveryStartTime is updated properly when a second storm starts + g.Expect(underTest.Status.StormRecoveryStartTime.Time.After(firstStormTerminationTime)).To(BeTrue()) + // Verifies StormTerminationStartTime of the first storm is cleared + g.Expect(underTest.Status.StormTerminationStartTime).To(BeNil()) + }, "5s", "100ms").Should(Succeed()) + }) + }) + }) + }) // TODO move to new suite in utils package @@ -2572,6 +2667,28 @@ func findRemediationCRForTemplate(nodeName string, nhc *v1alpha1.NodeHealthCheck return nil } +func getRemediationsCount(nhc *v1alpha1.NodeHealthCheck) int { + if nhc.Spec.RemediationTemplate != nil { + return getRemediationsCountForTemplate(nhc, *nhc.Spec.RemediationTemplate) + } + + count := 0 + for _, escalationRemediation := range nhc.Spec.EscalatingRemediations { + count += getRemediationsCountForTemplate(nhc, 
escalationRemediation.RemediationTemplate)
+	}
+
+	return count
+}
+
+func getRemediationsCountForTemplate(nhc *v1alpha1.NodeHealthCheck, templateRef v1.ObjectReference) int {
+	baseCr := newBaseCR(nhc, templateRef)
+	crList := &unstructured.UnstructuredList{Object: baseCr.Object}
+	if err := k8sClient.List(ctx, crList); err == nil {
+		return len(crList.Items)
+	}
+	return 0
+}
+
 func newBaseCR(nhc *v1alpha1.NodeHealthCheck, templateRef v1.ObjectReference) unstructured.Unstructured {
 	crKind := templateRef.Kind[:len(templateRef.Kind)-len("template")]
 	crs := newBaseCRs(nhc)
@@ -2658,6 +2775,15 @@ func newNodeHealthCheck() *v1alpha1.NodeHealthCheck {
 	}
 }
 
+func newNodeHealthCheckWithStormRecovery() *v1alpha1.NodeHealthCheck {
+	nhc := newNodeHealthCheck()
+	// 7-node cluster: minHealthy=4, stormTerminationDelay=2s
+	minHealthy := intstr.FromInt(4)
+	nhc.Spec.MinHealthy = &minHealthy
+	nhc.Spec.StormTerminationDelay = &metav1.Duration{Duration: 2 * time.Second}
+	return nhc
+}
+
 func newNodes(unhealthy int, healthy int, isControlPlane bool, unhealthyNow bool) []client.Object {
 	o := make([]client.Object, 0, healthy+unhealthy)
 	roleName := "-worker"
@@ -2799,3 +2925,11 @@ func mockNodeGettingHealthy(unhealthyNodeName string) {
 	err = k8sClient.Status().Update(context.Background(), node)
 	Expect(err).ToNot(HaveOccurred())
 }
+
+func mockNodeGettingUnhealthy(healthyNodeName string) {
+	node := &v1.Node{}
+	Expect(k8sClient.Get(context.Background(), client.ObjectKey{Name: healthyNodeName}, node)).To(Succeed())
+	node.Status.Conditions[0].Status = v1.ConditionFalse
+	node.Status.Conditions[0].LastTransitionTime = metav1.Now()
+	Expect(k8sClient.Status().Update(context.Background(), node)).To(Succeed())
+}
diff --git a/controllers/utils/conditions.go b/controllers/utils/conditions.go
index 3767fdf8..cc99a6f0 100644
--- a/controllers/utils/conditions.go
+++ b/controllers/utils/conditions.go
@@ -77,11 +77,17 @@ func isHealthy(unhealthyConditions []unhealthyCondition, nodeConditions []corev1
 // IsConditionTrue return true when the conditions contain a condition of given type and reason with status true
 func IsConditionTrue(conditions []metav1.Condition, conditionType string, reason string) bool {
-	condition := meta.FindStatusCondition(conditions, conditionType)
-	if condition == nil {
+	if !IsConditionSet(conditions, conditionType, reason) {
 		return false
 	}
-	if condition.Status != metav1.ConditionTrue {
+	condition := meta.FindStatusCondition(conditions, conditionType)
+	return condition.Status == metav1.ConditionTrue
+}
+
+// IsConditionSet returns true when the conditions contain a condition of given type and reason
+func IsConditionSet(conditions []metav1.Condition, conditionType string, reason string) bool {
+	condition := meta.FindStatusCondition(conditions, conditionType)
+	if condition == nil {
 		return false
 	}
 	if condition.Reason != reason {
diff --git a/docs/configuration.md b/docs/configuration.md
index bd9d3110..cf69c533 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -59,6 +59,7 @@ spec:
 | _pauseRequests_ | no | n/a | A string list. See details below. |
 | _unhealthyConditions_ | no | `[{type: Ready, status: False, duration: 300s},{type: Ready, status: Unknown, duration: 300s}]` | List of UnhealthyCondition, which defines node unhealthiness. See details below. |
 | _healthyDelay_ | no | 0 | The time before NHC would allow a node to be healthy again. A negative value means that NHC will never consider the node healthy, and manual intervention is expected. |
+| _stormTerminationDelay_ | no | n/a | Optional duration. When the minHealthy/maxUnhealthy constraint is satisfied again during storm recovery, NHC waits this additional time before exiting storm recovery mode. While waiting, no new remediations are created. Example: 30s, 2m, 1h. |
 
 ### Selector
@@ -224,21 +225,117 @@ There are two methods for manual intervention:
 - Once the node meets all other healthy criteria, NHC will delete the remediation.medik8s.io/manually-confirmed-healthy annotation from the Node, and proceed with deleting the remediation CR for that node.
 - This approach provides a precise, node-specific mechanism for an administrator to signal that a node is healthy and ready to exit the healthyDelay period, without affecting the healthyDelay configuration for other nodes under the same NodeHealthCheck CR.
+## Storm Recovery
+
+Storm recovery is an **optional advanced feature** that provides system stabilization through remediation restraint during mass failure scenarios.
+
+### Concept & Philosophy
+
+**Core Principle**: Storm recovery is about **system stabilization through remediation restraint**.
+
+**The Philosophy**:
+- **Recognition**: When many nodes fail, creating more remediations may be counterproductive
+- **Assumption**: The system needs time to stabilize before additional remediation attempts
+- **Strategy**: Temporary restraint to prevent making a bad situation worse
+- **Acceptance**: Some scenarios may be beyond automated recovery (same as minHealthy)
+
+**Real-World Example**:
+```
+Scenario: 20 nodes, minHealthy=11, stormTerminationDelay=30s
+
+Normal operation: 15 healthy, 5 unhealthy → 5 remediations created
+5 additional nodes go down and the storm hits: 10 healthy (<11) → storm recovery activates, new remediations are blocked (existing continue)
+2 nodes regain health: 12 healthy (≥11) → start the 30s delay timer (3 remediations left, 8 unhealthy nodes)
+Exit: after the 30s delay elapses → storm recovery deactivates, remediations are created for the remaining 5 unhealthy nodes
+```
+
+### How Storm Recovery Works
+
+**The Key Insight**: Storm recovery provides a **controlled exit strategy** when the maximum number of remediations is reached due to the minHealthy/maxUnhealthy constraint.
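+
+For illustration, here is a minimal sketch of a `NodeHealthCheck` resource that enables storm recovery for the scenario above. The selector and remediation template values are placeholders (assumed for this example, not part of this change); the storm-related settings are `minHealthy` and `stormTerminationDelay`:
+
+```yaml
+apiVersion: remediation.medik8s.io/v1alpha1
+kind: NodeHealthCheck
+metadata:
+  name: nhc-storm-example
+spec:
+  # at least 11 of the 20 selected nodes must be healthy before new remediations are created
+  minHealthy: 11
+  # keep storm recovery active for another 30s after the minHealthy constraint is satisfied again
+  stormTerminationDelay: 30s
+  # placeholder selector and remediation template - adjust for your cluster
+  selector:
+    matchExpressions:
+      - key: node-role.kubernetes.io/control-plane
+        operator: DoesNotExist
+  remediationTemplate:
+    apiVersion: self-node-remediation.medik8s.io/v1alpha1
+    kind: SelfNodeRemediationTemplate
+    namespace: openshift-workload-availability
+    name: self-node-remediation-automatic-strategy-template
+```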
+ +**Normal minHealthy Behavior**: +``` +If (Maximum number of remediation is reached): + → Block creation of new remediations + → Existing remediations continue + → Wait until current remediations are below maximum before creating more (as defined by minHealthy/maxUnhealthy) +``` + +**Storm Recovery Enhancement**: +``` +If (minHealthy/maxUnhealthy blocks creation of new remediations AND stormTerminationDelay is configured): + → Enter storm recovery mode + → Block creation of new remediations while storm is active + → When constraint is satisfied (healthyNodes ≥ minHealthy or maxUnhealthy satisfied): + - Start stormTerminationDelay timer + - Exit storm recovery after delay + - Resume normal remediation creation +``` + +### When to Use Storm Recovery + +**Storm Recovery is Appropriate When**: +- Your infrastructure can be overwhelmed by too many concurrent remediations +- You want controlled recovery during mass failure events +- You prefer "wait and see" over "aggressive remediation" during storms +- You accept that some failure scenarios are beyond automated recovery + +**Storm Recovery is NOT Appropriate When**: +- You need aggressive remediation regardless of system load +- Your infrastructure can handle unlimited concurrent remediations +- You can't accept any remediation delays during mass failures + +### Configuring Storm Recovery + +- Enable by setting `spec.stormTerminationDelay` on the `NodeHealthCheck` CR. +- Choose a delay that matches your environment's stabilization time. While the delay is counting down, new remediations remain blocked. +- If `stormTerminationDelay` is not set, storm recovery mode is disabled and only the standard `minHealthy`/`maxUnhealthy` gating applies. + +### Storm Recovery States + +**State 1: Normal Operation** +``` +healthyNodes ≥ minHealthy +→ Create remediations for unhealthy nodes (as allowed by minHealthy/maxUnhealthy) +``` + +**State 2: Storm Recovery Active** +``` +healthyNodes < minHealthy +→ Block new remediations +``` + +**State 3: Regaining Health (Delay)** +``` +healthyNodes ≥ minHealthy +→ Start stormTerminationDelay timer +→ Continue blocking new remediations until delay elapses +``` + +**State 4: Storm Recovery Exit** +``` +after stormTerminationDelay +→ Exit storm recovery mode +→ Resume creating remediations +``` + ## NodeHealthCheck Status The status section of the NodeHealthCheck custom resource provides detailed information about what the operator is doing. It contains these fields: -| Field | Description | -|------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| _observedNodes_ | The number of nodes observed according to the selector. | -| _healthyNodes_ | The number of observed healthy nodes. | -| _inFlightRemediations_ | ** DEPRECATED ** A list of "timestamp - node name" pairs of ongoing remediations. Replaced by unhealthyNodes. | -| _unhealthyNodes_ | A list of unhealthy nodes and their remediations. See details below. | -| _conditions_ | A list of conditions representing NHC's current state. Currently the only used type is "Disabled", and it is true when the controller detects problems which prevent it to work correctly. See the [workflow page](./workflow.md) for further information. | -| _phase_ | A short human readable representation of NHC's current state. 
Known phases are Disabled, Paused, Remediating and Enabled. | -| _reason_ | A longer human readable explanation of the phase. | +| Field | Description | +|-----------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| _observedNodes_ | The number of nodes observed according to the selector. | +| _healthyNodes_ | The number of observed healthy nodes. | +| _inFlightRemediations_ | ** DEPRECATED ** A list of "timestamp - node name" pairs of ongoing remediations. Replaced by unhealthyNodes. | +| _unhealthyNodes_ | A list of unhealthy nodes and their remediations. See details below. | +| _conditions_ | A list of conditions representing NHC's current state. Currently the only used type is "Disabled", and it is true when the controller detects problems which prevent it to work correctly. See the [workflow page](./workflow.md) for further information. | +| _phase_ | A short human readable representation of NHC's current state. Known phases are Disabled, Paused, Remediating and Enabled. | +| _reason_ | A longer human readable explanation of the phase. | +| _stormRecoveryStartTime_ | Timestamp when storm recovery mode was activated. Present when storm mode is/was active. | +| _stormTerminationStartTime_ | Timestamp when minHealthy/maxUnhealthy constraint was satisfied and the exit delay countdown started. Present while storm exit delay is in progress. | ### UnhealthyNodes