Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions api/v1alpha1/nodehealthcheck_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ const (
ConditionReasonDisabledTemplateInvalid = "RemediationTemplateInvalid"
// ConditionReasonEnabled is the condition reason for type Disabled and status False
ConditionReasonEnabled = "NodeHealthCheckEnabled"

// ConditionTypeStormActive is the condition type used to indicate whether storm recovery mode is active
ConditionTypeStormActive = "StormActive"
// ConditionReasonStormThresholdChange is the condition reason for a storm change from active to inactive and vice versa
ConditionReasonStormThresholdChange = "HealthyNodeThresholdChange"
)

// NHCPhase is the string used for NHC.Status.Phase
Expand Down Expand Up @@ -107,6 +112,23 @@ type NodeHealthCheckSpec struct {
//+operator-sdk:csv:customresourcedefinitions:type=spec
MaxUnhealthy *intstr.IntOrString `json:"maxUnhealthy,omitempty"`

// StormTerminationDelay introduces a configurable delay after storm recovery
// exit criteria are satisfied (for example, when the number of healthy nodes
// rises above the configured minHealthy constraint). While this
// delay is in effect, NHC remains in storm recovery mode and does not create
// new remediations. Once the delay elapses, storm recovery mode exits and normal
// remediation resumes.
//
// Expects a string of decimal numbers each with optional fraction and a unit
// suffix, e.g. "300ms", "1.5h" or "2h45m". Valid time units are "ns", "us"
// (or "µs"), "ms", "s", "m", "h".
//
//+kubebuilder:validation:Pattern="^([0-9]+(\\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$"
//+kubebuilder:validation:Type=string
//+optional
//+operator-sdk:csv:customresourcedefinitions:type=spec
StormTerminationDelay *metav1.Duration `json:"stormTerminationDelay,omitempty"`

// RemediationTemplate is a reference to a remediation template
// provided by an infrastructure provider.
//
Expand Down Expand Up @@ -266,6 +288,25 @@ type NodeHealthCheckStatus struct {
//+operator-sdk:csv:customresourcedefinitions:type=status,xDescriptors="urn:alm:descriptor:io.kubernetes.phase:reason"
Reason string `json:"reason,omitempty"`

// StormRecoveryStartTime records when storm recovery mode was activated.
// This field is set when the StormActive condition becomes true and helps track
// how long the system has been in storm recovery mode.
//
//+optional
//+kubebuilder:validation:Type=string
//+kubebuilder:validation:Format=date-time
//+operator-sdk:csv:customresourcedefinitions:type=status
StormRecoveryStartTime *metav1.Time `json:"stormRecoveryStartTime,omitempty"`

// StormTerminationStartTime records when the minHealthy/maxUnhealthy constraint was satisfied again
// while in storm recovery mode. Storm recovery mode exits once
// NodeHealthCheckSpec.StormTerminationDelay has elapsed since this time.
//
//+optional
//+kubebuilder:validation:Type=string
//+kubebuilder:validation:Format=date-time
//+operator-sdk:csv:customresourcedefinitions:type=status
StormTerminationStartTime *metav1.Time `json:"stormTerminationStartTime,omitempty"`

// LastUpdateTime is the last time the status was updated.
//
//+optional
Expand Down
25 changes: 19 additions & 6 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,16 @@ spec:
to work with an empty selector, which matches all nodes."
displayName: Selector
path: selector
- description: "StormTerminationDelay introduces a configurable delay after
storm recovery exit criteria are satisfied (for example, when the number
of healthy nodes rises above the configured minHealthy constraint). While
this delay is in effect, NHC remains in storm recovery mode and does not
create new remediations. Once the delay elapses, storm recovery mode exits
and normal remediation resumes. \n Expects a string of decimal numbers each
with optional fraction and a unit suffix, e.g. \"300ms\", \"1.5h\" or \"2h45m\".
Valid time units are \"ns\", \"us\" (or \"µs\"), \"ms\", \"s\", \"m\", \"h\"."
displayName: Storm Termination Delay
path: stormTerminationDelay
- description: UnhealthyConditions contains a list of the conditions that determine
whether a node is considered unhealthy. The conditions are combined in
a logical OR, i.e. if any of the conditions is met, the node is unhealthy.
Expand Down Expand Up @@ -189,6 +199,16 @@ spec:
path: reason
x-descriptors:
- urn:alm:descriptor:io.kubernetes.phase:reason
- description: StormRecoveryStartTime records when storm recovery mode was activated.
This field is set when the StormActive condition becomes true and helps track
how long the system has been in storm recovery mode.
displayName: Storm Recovery Start Time
path: stormRecoveryStartTime
- description: StormTerminationStartTime records when storm recovery mode regained
the minHealthy/maxUnhealthy constraint and the storm is about to end (after
NodeHealthCheckSpec.StormTerminationDelay has passed).
displayName: Storm Termination Start Time
path: stormTerminationStartTime
- description: UnhealthyNodes tracks currently unhealthy nodes and their remediations.
displayName: Unhealthy Nodes
path: unhealthyNodes
Expand Down
27 changes: 27 additions & 0 deletions bundle/manifests/remediation.medik8s.io_nodehealthchecks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,20 @@ spec:
type: object
type: object
x-kubernetes-map-type: atomic
stormTerminationDelay:
description: |-
StormTerminationDelay introduces a configurable delay after storm recovery
exit criteria are satisfied (for example, when the number of healthy nodes
rises above the configured minHealthy constraint). While this
delay is in effect, NHC remains in storm recovery mode and does not create
new remediations. Once the delay elapses, storm recovery mode exits and normal
remediation resumes.

Expects a string of decimal numbers each with optional fraction and a unit
suffix, e.g. "300ms", "1.5h" or "2h45m". Valid time units are "ns", "us"
(or "µs"), "ms", "s", "m", "h".
pattern: ^([0-9]+(\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$
type: string
unhealthyConditions:
default:
- duration: 300s
Expand Down Expand Up @@ -417,6 +431,19 @@ spec:
reason:
description: Reason explains the current phase in more detail.
type: string
stormRecoveryStartTime:
description: |-
StormRecoveryStartTime records when storm recovery mode was activated.
This field is set when the StormActive condition becomes true and helps track
how long the system has been in storm recovery mode.
format: date-time
type: string
stormTerminationStartTime:
description: |-
StormTerminationStartTime records when storm recovery mode regained the minHealthy/maxUnhealthy constraint
and the storm is about to end (after NodeHealthCheckSpec.StormTerminationDelay has passed).
format: date-time
type: string
unhealthyNodes:
description: UnhealthyNodes tracks currently unhealthy nodes and their
remediations.
Expand Down
27 changes: 27 additions & 0 deletions config/crd/bases/remediation.medik8s.io_nodehealthchecks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,20 @@ spec:
type: object
type: object
x-kubernetes-map-type: atomic
stormTerminationDelay:
description: |-
StormTerminationDelay introduces a configurable delay after storm recovery
exit criteria are satisfied (for example, when the number of healthy nodes
rises above the configured minHealthy constraint). While this
delay is in effect, NHC remains in storm recovery mode and does not create
new remediations. Once the delay elapses, storm recovery mode exits and normal
remediation resumes.

Expects a string of decimal numbers each with optional fraction and a unit
suffix, e.g. "300ms", "1.5h" or "2h45m". Valid time units are "ns", "us"
(or "µs"), "ms", "s", "m", "h".
pattern: ^([0-9]+(\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$
type: string
unhealthyConditions:
default:
- duration: 300s
Expand Down Expand Up @@ -415,6 +429,19 @@ spec:
reason:
description: Reason explains the current phase in more detail.
type: string
stormRecoveryStartTime:
description: |-
StormRecoveryStartTime records when storm recovery mode was activated.
This field is set when the StormActive condition becomes true and helps track
how long the system has been in storm recovery mode.
format: date-time
type: string
stormTerminationStartTime:
description: |-
StormTerminationStartTime records when storm recovery mode regained the minHealthy/maxUnhealthy constraint
and the storm is about to end (after NodeHealthCheckSpec.StormTerminationDelay has passed).
format: date-time
type: string
unhealthyNodes:
description: UnhealthyNodes tracks currently unhealthy nodes and their
remediations.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,16 @@ spec:
to work with an empty selector, which matches all nodes."
displayName: Selector
path: selector
- description: "StormTerminationDelay introduces a configurable delay after
storm recovery exit criteria are satisfied (for example, when the number
of healthy nodes rises above the configured minHealthy constraint). While
this delay is in effect, NHC remains in storm recovery mode and does not
create new remediations. Once the delay elapses, storm recovery mode exits
and normal remediation resumes. \n Expects a string of decimal numbers each
with optional fraction and a unit suffix, e.g. \"300ms\", \"1.5h\" or \"2h45m\".
Valid time units are \"ns\", \"us\" (or \"µs\"), \"ms\", \"s\", \"m\", \"h\"."
displayName: Storm Termination Delay
path: stormTerminationDelay
- description: UnhealthyConditions contains a list of the conditions that determine
whether a node is considered unhealthy. The conditions are combined in
a logical OR, i.e. if any of the conditions is met, the node is unhealthy.
Expand Down Expand Up @@ -149,6 +159,16 @@ spec:
path: reason
x-descriptors:
- urn:alm:descriptor:io.kubernetes.phase:reason
- description: StormRecoveryStartTime records when storm recovery mode was activated.
This field is set when the StormActive condition becomes true and helps track
how long the system has been in storm recovery mode.
displayName: Storm Recovery Start Time
path: stormRecoveryStartTime
- description: StormTerminationStartTime records when storm recovery mode regained
the minHealthy/maxUnhealthy constraint and the storm is about to end (after
NodeHealthCheckSpec.StormTerminationDelay has passed).
displayName: Storm Termination Start Time
path: stormTerminationStartTime
- description: UnhealthyNodes tracks currently unhealthy nodes and their remediations.
displayName: Unhealthy Nodes
path: unhealthyNodes
Expand Down
Loading