Skip to content

Commit 021946b

Browse files
committed
Implementing a storm recovery mechanism which postpones the creation of new remediations as long as the "storm" is active.
Signed-off-by: Michael Shitrit <[email protected]>
1 parent 2d87424 commit 021946b

10 files changed

+533
-19
lines changed

api/v1alpha1/nodehealthcheck_types.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,11 @@ const (
3434
ConditionReasonDisabledTemplateInvalid = "RemediationTemplateInvalid"
3535
// ConditionReasonEnabled is the condition reason for type Disabled and status False
3636
ConditionReasonEnabled = "NodeHealthCheckEnabled"
37+
38+
// ConditionTypeStormActive is the condition type used to indicate whether NHC is in storm recovery mode
39+
ConditionTypeStormActive = "StormActive"
40+
// ConditionReasonStormThresholdChange is the condition reason for a storm change from active to inactive and vice versa
41+
ConditionReasonStormThresholdChange = "HealthyNodeThresholdChange"
3742
)
3843

3944
// NHCPhase is the string used for NHC.Status.Phase
@@ -107,6 +112,23 @@ type NodeHealthCheckSpec struct {
107112
//+operator-sdk:csv:customresourcedefinitions:type=spec
108113
MaxUnhealthy *intstr.IntOrString `json:"maxUnhealthy,omitempty"`
109114

115+
// StormTerminationDelay introduces a configurable delay after storm recovery
116+
// exit criteria are satisfied (for example, when the number of healthy nodes
117+
// rises above the configured minHealthy constraint). While this
118+
// delay is in effect, NHC remains in storm recovery mode and does not create
119+
// new remediations. Once the delay elapses, storm recovery mode exits and normal
120+
// remediation resumes.
121+
//
122+
// Expects a string of decimal numbers each with optional fraction and a unit
123+
// suffix, e.g. "300ms", "1.5h" or "2h45m". Valid time units are "ns", "us"
124+
// (or "µs"), "ms", "s", "m", "h".
125+
//
126+
//+kubebuilder:validation:Pattern="^([0-9]+(\\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$"
127+
//+kubebuilder:validation:Type=string
128+
//+optional
129+
//+operator-sdk:csv:customresourcedefinitions:type=spec
130+
StormTerminationDelay *metav1.Duration `json:"stormTerminationDelay,omitempty"`
131+
110132
// RemediationTemplate is a reference to a remediation template
111133
// provided by an infrastructure provider.
112134
//
@@ -266,6 +288,25 @@ type NodeHealthCheckStatus struct {
266288
//+operator-sdk:csv:customresourcedefinitions:type=status,xDescriptors="urn:alm:descriptor:io.kubernetes.phase:reason"
267289
Reason string `json:"reason,omitempty"`
268290

291+
// StormRecoveryStartTime records when storm recovery mode was activated.
292+
// This field is set when the StormActive condition becomes true and helps track
293+
// how long the system has been in storm recovery mode.
294+
//
295+
//+optional
296+
//+kubebuilder:validation:Type=string
297+
//+kubebuilder:validation:Format=date-time
298+
//+operator-sdk:csv:customresourcedefinitions:type=status
299+
StormRecoveryStartTime *metav1.Time `json:"stormRecoveryStartTime,omitempty"`
300+
301+
// StormTerminationStartTime records when the minHealthy/maxUnhealthy constraint was satisfied again during storm recovery mode
302+
// and the storm is about to end (after NodeHealthCheckSpec.StormTerminationDelay has passed).
303+
//
304+
//+optional
305+
//+kubebuilder:validation:Type=string
306+
//+kubebuilder:validation:Format=date-time
307+
//+operator-sdk:csv:customresourcedefinitions:type=status
308+
StormTerminationStartTime *metav1.Time `json:"stormTerminationStartTime,omitempty"`
309+
269310
// LastUpdateTime is the last time the status was updated.
270311
//
271312
//+optional

api/v1alpha1/zz_generated.deepcopy.go

Lines changed: 19 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bundle/manifests/node-healthcheck-operator.clusterserviceversion.yaml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,16 @@ spec:
137137
to work with an empty selector, which matches all nodes."
138138
displayName: Selector
139139
path: selector
140+
- description: "StormTerminationDelay introduces a configurable delay after
141+
storm recovery exit criteria are satisfied (for example, when the number
142+
of healthy nodes rises above the configured minHealthy constraint). While
143+
this delay is in effect, NHC remains in storm recovery mode and does not
144+
create new remediations. Once the delay elapses, storm recovery mode exits
145+
and normal remediation resumes. \n Expects a string of decimal numbers each
146+
with optional fraction and a unit suffix, e.g. \"300ms\", \"1.5h\" or \"2h45m\".
147+
Valid time units are \"ns\", \"us\" (or \"µs\"), \"ms\", \"s\", \"m\", \"h\"."
148+
displayName: Storm Termination Delay
149+
path: stormTerminationDelay
140150
- description: UnhealthyConditions contains a list of the conditions that determine
141151
whether a node is considered unhealthy. The conditions are combined in
142152
a logical OR, i.e. if any of the conditions is met, the node is unhealthy.
@@ -189,6 +199,16 @@ spec:
189199
path: reason
190200
x-descriptors:
191201
- urn:alm:descriptor:io.kubernetes.phase:reason
202+
- description: StormRecoveryStartTime records when storm recovery mode was activated.
203+
This field is set when StormRecoveryActive becomes true and helps track
204+
how long the system has been in storm recovery mode.
205+
displayName: Storm Recovery Start Time
206+
path: stormRecoveryStartTime
207+
- description: StormTerminationStartTime records when storm recovery mode regained
208+
the minHealthy/maxUnhealthy constraint and the storm is about to end (after
209+
NodeHealthCheckSpec.StormTerminationDelay has passed).
210+
displayName: Storm Termination Start Time
211+
path: stormTerminationStartTime
192212
- description: UnhealthyNodes tracks currently unhealthy nodes and their remediations.
193213
displayName: Unhealthy Nodes
194214
path: unhealthyNodes

bundle/manifests/remediation.medik8s.io_nodehealthchecks.yaml

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,21 @@ spec:
280280
type: object
281281
type: object
282282
x-kubernetes-map-type: atomic
283+
stormTerminationDelay:
284+
description: |-
285+
StormTerminationDelay introduces a configurable delay after storm recovery
286+
exit criteria are satisfied (for example, when the number of healthy nodes
287+
rises above the configured minHealthy constraint). While this
288+
delay is in effect, NHC remains in storm recovery mode and does not create
289+
new remediations. Once the delay elapses, storm recovery mode exits and normal
290+
remediation resumes.
291+
292+
293+
Expects a string of decimal numbers each with optional fraction and a unit
294+
suffix, e.g. "300ms", "1.5h" or "2h45m". Valid time units are "ns", "us"
295+
(or "µs"), "ms", "s", "m", "h".
296+
pattern: ^([0-9]+(\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$
297+
type: string
283298
unhealthyConditions:
284299
default:
285300
- duration: 300s
@@ -438,6 +453,19 @@ spec:
438453
reason:
439454
description: Reason explains the current phase in more detail.
440455
type: string
456+
stormRecoveryStartTime:
457+
description: |-
458+
StormRecoveryStartTime records when storm recovery mode was activated.
459+
This field is set when StormRecoveryActive becomes true and helps track
460+
how long the system has been in storm recovery mode.
461+
format: date-time
462+
type: string
463+
stormTerminationStartTime:
464+
description: |-
465+
StormTerminationStartTime records when storm recovery mode regained the minHealthy/maxUnhealthy constraint
466+
and the storm is about to end (after NodeHealthCheckSpec.StormTerminationDelay has passed).
467+
format: date-time
468+
type: string
441469
unhealthyNodes:
442470
description: UnhealthyNodes tracks currently unhealthy nodes and their
443471
remediations.

config/crd/bases/remediation.medik8s.io_nodehealthchecks.yaml

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,21 @@ spec:
278278
type: object
279279
type: object
280280
x-kubernetes-map-type: atomic
281+
stormTerminationDelay:
282+
description: |-
283+
StormTerminationDelay introduces a configurable delay after storm recovery
284+
exit criteria are satisfied (for example, when the number of healthy nodes
285+
rises above the configured minHealthy constraint). While this
286+
delay is in effect, NHC remains in storm recovery mode and does not create
287+
new remediations. Once the delay elapses, storm recovery mode exits and normal
288+
remediation resumes.
289+
290+
291+
Expects a string of decimal numbers each with optional fraction and a unit
292+
suffix, e.g. "300ms", "1.5h" or "2h45m". Valid time units are "ns", "us"
293+
(or "µs"), "ms", "s", "m", "h".
294+
pattern: ^([0-9]+(\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$
295+
type: string
281296
unhealthyConditions:
282297
default:
283298
- duration: 300s
@@ -436,6 +451,19 @@ spec:
436451
reason:
437452
description: Reason explains the current phase in more detail.
438453
type: string
454+
stormRecoveryStartTime:
455+
description: |-
456+
StormRecoveryStartTime records when storm recovery mode was activated.
457+
This field is set when StormRecoveryActive becomes true and helps track
458+
how long the system has been in storm recovery mode.
459+
format: date-time
460+
type: string
461+
stormTerminationStartTime:
462+
description: |-
463+
StormTerminationStartTime records when storm recovery mode regained the minHealthy/maxUnhealthy constraint
464+
and the storm is about to end (after NodeHealthCheckSpec.StormTerminationDelay has passed).
465+
format: date-time
466+
type: string
439467
unhealthyNodes:
440468
description: UnhealthyNodes tracks currently unhealthy nodes and their
441469
remediations.

config/manifests/base/bases/node-healthcheck-operator.clusterserviceversion.yaml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,16 @@ spec:
9797
to work with an empty selector, which matches all nodes."
9898
displayName: Selector
9999
path: selector
100+
- description: "StormTerminationDelay introduces a configurable delay after
101+
storm recovery exit criteria are satisfied (for example, when the number
102+
of healthy nodes rises above the configured minHealthy constraint). While
103+
this delay is in effect, NHC remains in storm recovery mode and does not
104+
create new remediations. Once the delay elapses, storm recovery mode exits
105+
and normal remediation resumes. \n Expects a string of decimal numbers each
106+
with optional fraction and a unit suffix, e.g. \"300ms\", \"1.5h\" or \"2h45m\".
107+
Valid time units are \"ns\", \"us\" (or \"µs\"), \"ms\", \"s\", \"m\", \"h\"."
108+
displayName: Storm Termination Delay
109+
path: stormTerminationDelay
100110
- description: UnhealthyConditions contains a list of the conditions that determine
101111
whether a node is considered unhealthy. The conditions are combined in
102112
a logical OR, i.e. if any of the conditions is met, the node is unhealthy.
@@ -149,6 +159,16 @@ spec:
149159
path: reason
150160
x-descriptors:
151161
- urn:alm:descriptor:io.kubernetes.phase:reason
162+
- description: StormRecoveryStartTime records when storm recovery mode was activated.
163+
This field is set when StormRecoveryActive becomes true and helps track
164+
how long the system has been in storm recovery mode.
165+
displayName: Storm Recovery Start Time
166+
path: stormRecoveryStartTime
167+
- description: StormTerminationStartTime records when storm recovery mode regained
168+
the minHealthy/maxUnhealthy constraint and the storm is about to end (after
169+
NodeHealthCheckSpec.StormTerminationDelay has passed).
170+
displayName: Storm Termination Start Time
171+
path: stormTerminationStartTime
152172
- description: UnhealthyNodes tracks currently unhealthy nodes and their remediations.
153173
displayName: Unhealthy Nodes
154174
path: unhealthyNodes

0 commit comments

Comments
 (0)