Skip to content

Commit 61eb63f

Browse files
committed
Implementing a storm recovery mechanics which will postponed creation of new remediation as long as the remediation "storm" is active.
Signed-off-by: Michael Shitrit <[email protected]>
1 parent 2d87424 commit 61eb63f

11 files changed

+691
-34
lines changed

api/v1alpha1/nodehealthcheck_types.go

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,18 @@ type NodeHealthCheckSpec struct {
107107
//+operator-sdk:csv:customresourcedefinitions:type=spec
108108
MaxUnhealthy *intstr.IntOrString `json:"maxUnhealthy,omitempty"`
109109

110+
// StormRecoveryThreshold defines the number of unhealthy nodes at which storm recovery mode should exit.
111+
// When the number of unhealthy nodes drops to this threshold or below, the storm recovery mode will deactivate,
112+
// allowing creation of new remediations.
113+
//
114+
// This threshold must be less than (totalNodes - minHealthy) to prevent permanent storm recovery lock.
115+
// This parameter is optional and when not specified, the original minHealthy/maxUnhealthy behavior is preserved.
116+
//
117+
//+optional
118+
//+kubebuilder:validation:Minimum=0
119+
//+operator-sdk:csv:customresourcedefinitions:type=spec
120+
StormRecoveryThreshold *int `json:"stormRecoveryThreshold,omitempty"`
121+
110122
// RemediationTemplate is a reference to a remediation template
111123
// provided by an infrastructure provider.
112124
//
@@ -266,6 +278,24 @@ type NodeHealthCheckStatus struct {
266278
//+operator-sdk:csv:customresourcedefinitions:type=status,xDescriptors="urn:alm:descriptor:io.kubernetes.phase:reason"
267279
Reason string `json:"reason,omitempty"`
268280

281+
// StormRecoveryActive indicates if storm recovery mode is currently active.
282+
// Storm recovery mode is activated when the number of healthy nodes drops to or below minHealthy,
283+
// and remediation is delayed until the number of unhealthy nodes reaches stormRecoveryThreshold.
284+
//
285+
//+optional
286+
//+operator-sdk:csv:customresourcedefinitions:type=status
287+
StormRecoveryActive *bool `json:"stormRecoveryActive,omitempty"`
288+
289+
// StormRecoveryStartTime records when storm recovery mode was activated.
290+
// This field is set when StormRecoveryActive becomes true and helps track
291+
// how long the system has been in storm recovery mode.
292+
//
293+
//+optional
294+
//+kubebuilder:validation:Type=string
295+
//+kubebuilder:validation:Format=date-time
296+
//+operator-sdk:csv:customresourcedefinitions:type=status
297+
StormRecoveryStartTime *metav1.Time `json:"stormRecoveryStartTime,omitempty"`
298+
269299
// LastUpdateTime is the last time the status was updated.
270300
//
271301
//+optional

api/v1alpha1/nodehealthcheck_webhook.go

Lines changed: 73 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ import (
2727
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2828
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
2929
"k8s.io/apimachinery/pkg/runtime"
30-
"k8s.io/apimachinery/pkg/util/errors"
30+
utilerrors "k8s.io/apimachinery/pkg/util/errors"
3131
"k8s.io/apimachinery/pkg/util/intstr"
3232
ctrl "sigs.k8s.io/controller-runtime"
3333
"sigs.k8s.io/controller-runtime/pkg/client"
@@ -106,8 +106,9 @@ func (v *customValidator) ValidateDelete(_ context.Context, obj runtime.Object)
106106
}
107107

108108
func (v *customValidator) validate(ctx context.Context, nhc *NodeHealthCheck) error {
109-
aggregated := errors.NewAggregate([]error{
109+
aggregated := utilerrors.NewAggregate([]error{
110110
ValidateMinHealthyMaxUnhealthy(nhc),
111+
v.validateStormRecoveryThreshold(ctx, nhc),
111112
v.validateSelector(nhc),
112113
v.validateMutualRemediations(nhc),
113114
v.validateEscalatingRemediations(ctx, nhc),
@@ -152,7 +153,7 @@ func (v *customValidator) validateEscalatingRemediations(ctx context.Context, nh
152153
return nil
153154
}
154155

155-
aggregated := errors.NewAggregate([]error{
156+
aggregated := utilerrors.NewAggregate([]error{
156157
v.validateEscalatingRemediationsUniqueOrder(nhc),
157158
v.validateEscalatingRemediationsTimeout(nhc),
158159
v.validateEscalatingRemediationsUniqueRemediator(ctx, nhc),
@@ -262,3 +263,72 @@ func ValidateMinHealthyMaxUnhealthy(nhc *NodeHealthCheck) error {
262263
}
263264
return nil
264265
}
266+
267+
func (v *customValidator) validateStormRecoveryThreshold(ctx context.Context, nhc *NodeHealthCheck) error {
268+
// StormRecoveryThreshold is optional, so skip validation if not specified
269+
if nhc.Spec.StormRecoveryThreshold == nil {
270+
return nil
271+
}
272+
273+
selector, err := metav1.LabelSelectorAsSelector(&nhc.Spec.Selector)
274+
if err != nil {
275+
return fmt.Errorf("invalid selector for storm recovery validation: %v", err)
276+
}
277+
278+
var nodes corev1.NodeList
279+
// Fetch nodes matching the selector to get actual total count for comprehensive validation
280+
if err := v.Client.List(ctx, &nodes, client.MatchingLabelsSelector{Selector: selector}); err != nil {
281+
// Storm recovery validation requires actual node count - cannot proceed without it
282+
return fmt.Errorf("failed to fetch nodes for storm recovery validation: %v", err)
283+
}
284+
285+
totalNodes := len(nodes.Items)
286+
if totalNodes == 0 {
287+
return fmt.Errorf("no nodes match the selector, cannot validate storm recovery threshold")
288+
}
289+
290+
// Get the storm recovery threshold value
291+
stormThreshold := *nhc.Spec.StormRecoveryThreshold
292+
293+
// Calculate minHealthy directly to validate the critical constraint
294+
minHealthy, err := GetMinHealthy(nhc, totalNodes)
295+
if err != nil {
296+
nodehealthchecklog.Error(err, "failed to calculate the number of minimum healthy nodes")
297+
return err
298+
}
299+
300+
// Critical validation: stormRecoveryThreshold < (totalNodes - minHealthy)
301+
// This ensures storm recovery can actually be exited and prevents permanent storm recovery lock
302+
maxAllowedStormThreshold := totalNodes - minHealthy
303+
if stormThreshold >= maxAllowedStormThreshold {
304+
return fmt.Errorf("stormRecoveryThreshold (%d) must be less than (totalNodes - minHealthy) = (%d - %d) = %d to prevent permanent storm recovery lock",
305+
stormThreshold, totalNodes, minHealthy, maxAllowedStormThreshold)
306+
}
307+
308+
return nil
309+
}
310+
311+
func GetMinHealthy(nhc *NodeHealthCheck, total int) (int, error) {
312+
err := ValidateMinHealthyMaxUnhealthy(nhc)
313+
if err != nil {
314+
return 0, err
315+
}
316+
if nhc.Spec.MinHealthy != nil {
317+
minHealthy, err := intstr.GetScaledValueFromIntOrPercent(nhc.Spec.MinHealthy, total, true)
318+
if minHealthy < 0 && err == nil {
319+
err = fmt.Errorf("minHealthy is negative: %d", minHealthy)
320+
}
321+
return minHealthy, err
322+
}
323+
if nhc.Spec.MaxUnhealthy != nil {
324+
maxUnhealthy, err := intstr.GetScaledValueFromIntOrPercent(nhc.Spec.MaxUnhealthy, total, true)
325+
if maxUnhealthy < 0 && err == nil {
326+
err = fmt.Errorf("maxUnhealthy is negative: %d", maxUnhealthy)
327+
}
328+
if maxUnhealthy > total && err == nil {
329+
err = fmt.Errorf("maxUnhealthy is greater than the number of selected nodes: %d", maxUnhealthy)
330+
}
331+
return total - maxUnhealthy, err
332+
}
333+
return 0, fmt.Errorf("one of minHealthy and maxUnhealthy should be specified")
334+
}

api/v1alpha1/nodehealthcheck_webhook_test.go

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package v1alpha1
22

33
import (
44
"context"
5+
"fmt"
56
"time"
67

78
. "github.com/onsi/ginkgo/v2"
@@ -428,6 +429,119 @@ var _ = Describe("NodeHealthCheck Validation", func() {
428429
Expect(nhc.isRemediating()).To(BeTrue())
429430
})
430431
})
432+
433+
Context("Storm Recovery Validation", func() {
434+
When("valid storm recovery configuration", func() {
435+
BeforeEach(func() {
436+
// Setup: 10 nodes, minHealthy=60% (6), stormRecoveryThreshold=2
437+
// Valid because: 2 < (10-6) = 4
438+
mh := intstr.FromString("60%")
439+
nhc.Spec.MinHealthy = &mh
440+
stormThreshold := 2
441+
nhc.Spec.StormRecoveryThreshold = &stormThreshold
442+
// Keep the RemediationTemplate from parent BeforeEach
443+
nhc.Spec.RemediationTemplate = &v1.ObjectReference{
444+
Kind: "R",
445+
Namespace: "dummy",
446+
Name: "r",
447+
APIVersion: "r",
448+
}
449+
nhc.Spec.Selector = metav1.LabelSelector{
450+
MatchExpressions: []metav1.LabelSelectorRequirement{
451+
{
452+
Key: "node-role.kubernetes.io/control-plane",
453+
Operator: metav1.LabelSelectorOpDoesNotExist,
454+
},
455+
},
456+
}
457+
458+
// Mock 10 nodes
459+
mockValidatorClient.listFunc = func(ctx context.Context, nodesList client.ObjectList, opts ...client.ListOption) error {
460+
nodeList := nodesList.(*v1.NodeList)
461+
nodeList.Items = make([]v1.Node, 10)
462+
for i := 0; i < 10; i++ {
463+
nodeList.Items[i] = v1.Node{
464+
ObjectMeta: metav1.ObjectMeta{Name: fmt.Sprintf("node-%d", i)},
465+
}
466+
}
467+
return nil
468+
}
469+
})
470+
471+
It("should be allowed", func() {
472+
Expect(validator.validate(context.Background(), nhc)).To(Succeed())
473+
})
474+
})
475+
476+
When("storm recovery threshold too high with percentage minHealthy", func() {
477+
BeforeEach(func() {
478+
// Setup: 5 nodes, minHealthy=60% (3), stormRecoveryThreshold=3
479+
// Invalid because: 3 >= (5-3) = 2
480+
mh := intstr.FromString("60%")
481+
nhc.Spec.MinHealthy = &mh
482+
stormThreshold := 3
483+
nhc.Spec.StormRecoveryThreshold = &stormThreshold
484+
// Keep the RemediationTemplate from parent BeforeEach
485+
nhc.Spec.RemediationTemplate = &v1.ObjectReference{
486+
Kind: "R",
487+
Namespace: "dummy",
488+
Name: "r",
489+
APIVersion: "r",
490+
}
491+
492+
// Mock 5 nodes
493+
mockValidatorClient.listFunc = func(ctx context.Context, nodesList client.ObjectList, opts ...client.ListOption) error {
494+
nodeList := nodesList.(*v1.NodeList)
495+
nodeList.Items = make([]v1.Node, 5)
496+
for i := 0; i < 5; i++ {
497+
nodeList.Items[i] = v1.Node{
498+
ObjectMeta: metav1.ObjectMeta{Name: fmt.Sprintf("node-%d", i)},
499+
}
500+
}
501+
return nil
502+
}
503+
})
504+
505+
It("should be denied", func() {
506+
Expect(validator.validate(context.Background(), nhc)).To(MatchError(ContainSubstring("stormRecoveryThreshold (3) must be less than (totalNodes - minHealthy) = (5 - 3) = 2 to prevent permanent storm recovery lock")))
507+
})
508+
})
509+
510+
When("storm recovery threshold too high with fixed maxUnhealthy", func() {
511+
BeforeEach(func() {
512+
// Setup: 6 nodes, maxUnhealthy=2 (minHealthy=4), stormRecoveryThreshold=3
513+
// Invalid because: 3 >= (6-4) = 2
514+
nhc.Spec.MinHealthy = nil
515+
maxUnhealthy := intstr.FromInt(2)
516+
nhc.Spec.MaxUnhealthy = &maxUnhealthy
517+
stormThreshold := 3
518+
nhc.Spec.StormRecoveryThreshold = &stormThreshold
519+
// Keep the RemediationTemplate from parent BeforeEach
520+
nhc.Spec.RemediationTemplate = &v1.ObjectReference{
521+
Kind: "R",
522+
Namespace: "dummy",
523+
Name: "r",
524+
APIVersion: "r",
525+
}
526+
527+
// Mock 6 nodes
528+
mockValidatorClient.listFunc = func(ctx context.Context, nodesList client.ObjectList, opts ...client.ListOption) error {
529+
nodeList := nodesList.(*v1.NodeList)
530+
nodeList.Items = make([]v1.Node, 6)
531+
for i := 0; i < 6; i++ {
532+
nodeList.Items[i] = v1.Node{
533+
ObjectMeta: metav1.ObjectMeta{Name: fmt.Sprintf("node-%d", i)},
534+
}
535+
}
536+
return nil
537+
}
538+
})
539+
540+
It("should be denied", func() {
541+
Expect(validator.validate(context.Background(), nhc)).To(MatchError(ContainSubstring("stormRecoveryThreshold (3) must be less than (totalNodes - minHealthy) = (6 - 4) = 2 to prevent permanent storm recovery lock")))
542+
})
543+
})
544+
})
431545
})
432546
})
433547

api/v1alpha1/zz_generated.deepcopy.go

Lines changed: 15 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bundle/manifests/node-healthcheck-operator.clusterserviceversion.yaml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,15 @@ spec:
137137
to work with an empty selector, which matches all nodes."
138138
displayName: Selector
139139
path: selector
140+
- description: "StormRecoveryThreshold defines the number of unhealthy nodes
141+
at which storm recovery mode should exit. When the number of unhealthy nodes
142+
drops to this threshold or below, the storm recovery mode will deactivate,
143+
allowing creation of new remediations. \n This threshold must be less than
144+
(totalNodes - minHealthy) to prevent permanent storm recovery lock. This
145+
parameter is optional and when not specified, the original minHealthy/maxUnhealthy
146+
behavior is preserved."
147+
displayName: Storm Recovery Threshold
148+
path: stormRecoveryThreshold
140149
- description: UnhealthyConditions contains a list of the conditions that determine
141150
whether a node is considered unhealthy. The conditions are combined in
142151
a logical OR, i.e. if any of the conditions is met, the node is unhealthy.
@@ -189,6 +198,17 @@ spec:
189198
path: reason
190199
x-descriptors:
191200
- urn:alm:descriptor:io.kubernetes.phase:reason
201+
- description: StormRecoveryActive indicates if storm recovery mode is currently
202+
active. Storm recovery mode is activated when the number of healthy nodes
203+
drops to or below minHealthy, and remediation is delayed until the number
204+
of unhealthy nodes reaches stormRecoveryThreshold.
205+
displayName: Storm Recovery Active
206+
path: stormRecoveryActive
207+
- description: StormRecoveryStartTime records when storm recovery mode was activated.
208+
This field is set when StormRecoveryActive becomes true and helps track
209+
how long the system has been in storm recovery mode.
210+
displayName: Storm Recovery Start Time
211+
path: stormRecoveryStartTime
192212
- description: UnhealthyNodes tracks currently unhealthy nodes and their remediations.
193213
displayName: Unhealthy Nodes
194214
path: unhealthyNodes

bundle/manifests/remediation.medik8s.io_nodehealthchecks.yaml

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,17 @@ spec:
280280
type: object
281281
type: object
282282
x-kubernetes-map-type: atomic
283+
stormRecoveryThreshold:
284+
description: |-
285+
StormRecoveryThreshold defines the number of unhealthy nodes at which storm recovery mode should exit.
286+
When the number of unhealthy nodes drops to this threshold or below, the storm recovery mode will deactivate,
287+
allowing creation of new remediations.
288+
289+
290+
This threshold must be less than (totalNodes - minHealthy) to prevent permanent storm recovery lock.
291+
This parameter is optional and when not specified, the original minHealthy/maxUnhealthy behavior is preserved.
292+
minimum: 0
293+
type: integer
283294
unhealthyConditions:
284295
default:
285296
- duration: 300s
@@ -438,6 +449,19 @@ spec:
438449
reason:
439450
description: Reason explains the current phase in more detail.
440451
type: string
452+
stormRecoveryActive:
453+
description: |-
454+
StormRecoveryActive indicates if storm recovery mode is currently active.
455+
Storm recovery mode is activated when the number of healthy nodes drops to or below minHealthy,
456+
and remediation is delayed until the number of unhealthy nodes reaches stormRecoveryThreshold.
457+
type: boolean
458+
stormRecoveryStartTime:
459+
description: |-
460+
StormRecoveryStartTime records when storm recovery mode was activated.
461+
This field is set when StormRecoveryActive becomes true and helps track
462+
how long the system has been in storm recovery mode.
463+
format: date-time
464+
type: string
441465
unhealthyNodes:
442466
description: UnhealthyNodes tracks currently unhealthy nodes and their
443467
remediations.

0 commit comments

Comments
 (0)