From 545a39da21ddbe6ea058d18cbd0baf2e9fae8e19 Mon Sep 17 00:00:00 2001 From: Yaroslav Borbat Date: Tue, 22 Jul 2025 16:21:38 +0300 Subject: [PATCH 1/3] add alert KubeNodeAwaitingWorkloadEvacuationBeforeShutdown Signed-off-by: Yaroslav Borbat --- monitoring/prometheus-rules/node.yaml | 50 +++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 monitoring/prometheus-rules/node.yaml diff --git a/monitoring/prometheus-rules/node.yaml b/monitoring/prometheus-rules/node.yaml new file mode 100644 index 0000000000..6cac1ba1fb --- /dev/null +++ b/monitoring/prometheus-rules/node.yaml @@ -0,0 +1,50 @@ +- alert: KubeNodeAwaitingWorkloadEvacuationBeforeShutdown + expr: kube_node_status_condition{condition="GracefulShutdownPostpone", status="true"} == 1 + labels: + severity_level: "6" + tier: cluster + for: 5m + annotations: + plk_protocol_extent_version: "1" + plk_markup_format: "markdown" + plk_create_group_if_not_exists__node_maintenance: "NodeMaintenance,tier=~tier,prometheus=deckhouse,kubernetes=~kubernetes" + plk_grouped_by__node_maintenance: "NodeMaintenance,tier=~tier,prometheus=deckhouse,kubernetes=~kubernetes" + summary: Node is awaiting workload evacuation before safe shutdown. + description: | + The node `{{ $labels.node }}` has activated graceful shutdown protection and **cannot be safely powered off** until workloads (e.g., VirtualMachines) are evacuated. + + ### What Is Happening? + A shutdown request was issued, but the system intercepted it to prevent data loss or VM downtime. + The `GracefulShutdownPostpone` condition is now active — this means: + - The node is **intentionally blocking abrupt power-off**. + - You must **manually evict VirtualMachines** or other critical workloads before proceeding. + + This is expected behavior for nodes running VMs and ensures safe maintenance. + + ### Required Action + To proceed with node shutdown: + 1. 
**List VMs running on the node and check if they are migratable**: + ```bash + d8 k get virtualmachine -A -o jsonpath='{range .items[?(@.status.nodeName=="'{{ $labels.node }}'")]}{.metadata.namespace}/{.metadata.name}{"\t"}Migratable={.status.conditions[?(@.type=="Migratable")].status}{"\n"}{end}' + ``` + This command shows a list like: + ```bash + default/vm-name Migratable=True + prod/vm-beta Migratable=False + ``` + 2. **For each VM**: + **If Migratable=True**, **migrate the VM to another node**: + ```bash + d8 v evict -n <namespace> <vm-name> + ``` + > This migrates the VM to another node without guest OS downtime. + + **If Migratable=False**, **restart the VM**: + ```bash + d8 v restart -n <namespace> <vm-name> + ``` + > This restarts the VM. + Some VMs cannot run on other nodes because they have specific storage or network requirements. + In such cases, these VMs must be stopped. + + 3. Once all VMs are migrated, restarted or stopped, the node will automatically continue shutting down. From 7fc69a30ecbeebc96504dee5f9554795cccedea5 Mon Sep 17 00:00:00 2001 From: Yaroslav Borbat Date: Tue, 22 Jul 2025 18:32:00 +0300 Subject: [PATCH 2/3] add alert KubeNodeAwaitingWorkloadEvacuationBeforeShutdown Signed-off-by: Yaroslav Borbat --- monitoring/prometheus-rules/node.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/monitoring/prometheus-rules/node.yaml b/monitoring/prometheus-rules/node.yaml index 6cac1ba1fb..8505c15769 100644 --- a/monitoring/prometheus-rules/node.yaml +++ b/monitoring/prometheus-rules/node.yaml @@ -1,5 +1,10 @@ - alert: KubeNodeAwaitingWorkloadEvacuationBeforeShutdown - expr: kube_node_status_condition{condition="GracefulShutdownPostpone", status="true"} == 1 + expr: | + ( + kube_node_status_condition{condition="GracefulShutdownPostpone", status="true"} == 1 + and on(node) + sum by (node) (d8_virtualization_virtualmachine_status_phase{phase="Running"}) > 0 + ) labels: severity_level: "6" tier: cluster From 035228ca93d5b88174b4fdb8f8150efcefd4f3ee Mon Sep 17
00:00:00 2001 From: Yaroslav Borbat Date: Tue, 22 Jul 2025 18:35:02 +0300 Subject: [PATCH 3/3] add alert KubeNodeAwaitingWorkloadEvacuationBeforeShutdown Signed-off-by: Yaroslav Borbat --- monitoring/prometheus-rules/node.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/monitoring/prometheus-rules/node.yaml b/monitoring/prometheus-rules/node.yaml index 8505c15769..2860a4f309 100644 --- a/monitoring/prometheus-rules/node.yaml +++ b/monitoring/prometheus-rules/node.yaml @@ -1,4 +1,4 @@ -- alert: KubeNodeAwaitingWorkloadEvacuationBeforeShutdown +- alert: KubeNodeAwaitingVirtualMachinesEvictionBeforeShutdown expr: | ( kube_node_status_condition{condition="GracefulShutdownPostpone", status="true"} == 1 @@ -16,13 +16,13 @@ plk_grouped_by__node_maintenance: "NodeMaintenance,tier=~tier,prometheus=deckhouse,kubernetes=~kubernetes" summary: Node is awaiting workload evacuation before safe shutdown. description: | - The node `{{ $labels.node }}` has activated graceful shutdown protection and **cannot be safely powered off** until workloads (e.g., VirtualMachines) are evacuated. + The node `{{ $labels.node }}` has activated graceful shutdown protection and **cannot be safely powered off** until workloads (e.g., VirtualMachines) are evicted. ### What Is Happening? A shutdown request was issued, but the system intercepted it to prevent data loss or VM downtime. The `GracefulShutdownPostpone` condition is now active — this means: - The node is **intentionally blocking abrupt power-off**. - - You must **manually evict VirtualMachines** or other critical workloads before proceeding. + - You must **manually evict VirtualMachines** before proceeding. This is expected behavior for nodes running VMs and ensures safe maintenance.