From 545a39da21ddbe6ea058d18cbd0baf2e9fae8e19 Mon Sep 17 00:00:00 2001
From: Yaroslav Borbat <yaroslav.borbat@flant.com>
Date: Tue, 22 Jul 2025 16:21:38 +0300
Subject: [PATCH 1/3] add alert
 KubeNodeAwaitingWorkloadEvacuationBeforeShutdown

Signed-off-by: Yaroslav Borbat <yaroslav.borbat@flant.com>
---
 monitoring/prometheus-rules/node.yaml | 50 +++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)
 create mode 100644 monitoring/prometheus-rules/node.yaml
diff --git a/monitoring/prometheus-rules/node.yaml b/monitoring/prometheus-rules/node.yaml
new file mode 100644
index 0000000000..6cac1ba1fb
--- /dev/null
+++ b/monitoring/prometheus-rules/node.yaml
@@ -0,0 +1,50 @@
+- alert: KubeNodeAwaitingWorkloadEvacuationBeforeShutdown
+  expr: kube_node_status_condition{condition="GracefulShutdownPostpone", status="true"} == 1
+  labels:
+    severity_level: "6"
+    tier: cluster
+  for: 5m
+  annotations:
+    plk_protocol_extent_version: "1"
+    plk_markup_format: "markdown"
+    plk_create_group_if_not_exists__node_maintenance: "NodeMaintenance,tier=~tier,prometheus=deckhouse,kubernetes=~kubernetes"
+    plk_grouped_by__node_maintenance: "NodeMaintenance,tier=~tier,prometheus=deckhouse,kubernetes=~kubernetes"
+    summary: Node is awaiting workload evacuation before safe shutdown.
+    description: |
+      The node `{{ $labels.node }}` has activated graceful shutdown protection and **cannot be safely powered off** until workloads (e.g., VirtualMachines) are evacuated.
+
+      ### What Is Happening?
+      A shutdown request was issued, but the system intercepted it to prevent data loss or VM downtime.
+      The `GracefulShutdownPostpone` condition is now active — this means:
+      - The node is **intentionally blocking abrupt power-off**.
+      - You must **manually evict VirtualMachines** or other critical workloads before proceeding.
+
+      This is expected behavior for nodes running VMs and ensures safe maintenance.
+
+      ### Required Action
+      To proceed with node shutdown:
+      1. **List VMs running on the node and check if they are migratable**:
+        ```bash
+        d8 k get virtualmachine -A -o jsonpath='{range .items[?(@.status.nodeName=="'{{ $labels.node }}'")]}{.metadata.namespace}/{.metadata.name}{"\t"}Migratable={.status.conditions[?(@.type=="Migratable")].status}{"\n"}{end}'
+        ```
+        This command shows a list like:
+        ```bash
+        default/vm-name	Migratable=True
+        prod/vm-beta    Migratable=False
+        ```
+      2. **For each VM**:
+        **If Migratable=True**, **migrate the VM to another node**:
+        ```bash
+        d8 v evict <vm-name> -n <namespace>
+        ```
+        > This migrates the VM to another node without guest OS downtime.
+
+        **If Migratable=False**, **restart the VM**:
+        ```bash
+        d8 v restart <vm-name> -n <namespace>
+        ```
+        > This restarts the VM.
+        Some VMs cannot run on other nodes because they have specific storage or network requirements.
+        In such cases, these VMs must be stopped.
+
+      3. Once all VMs are migrated, restarted or stopped, the node will automatically continue shutting down.

From 7fc69a30ecbeebc96504dee5f9554795cccedea5 Mon Sep 17 00:00:00 2001
From: Yaroslav Borbat <yaroslav.borbat@flant.com>
Date: Tue, 22 Jul 2025 18:32:00 +0300
Subject: [PATCH 2/3] add alert
 KubeNodeAwaitingWorkloadEvacuationBeforeShutdown

Signed-off-by: Yaroslav Borbat <yaroslav.borbat@flant.com>
---
 monitoring/prometheus-rules/node.yaml | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/monitoring/prometheus-rules/node.yaml b/monitoring/prometheus-rules/node.yaml
index 6cac1ba1fb..8505c15769 100644
--- a/monitoring/prometheus-rules/node.yaml
+++ b/monitoring/prometheus-rules/node.yaml
@@ -1,5 +1,10 @@
 - alert: KubeNodeAwaitingWorkloadEvacuationBeforeShutdown
-  expr: kube_node_status_condition{condition="GracefulShutdownPostpone", status="true"} == 1
+  expr: |
+    (
+      kube_node_status_condition{condition="GracefulShutdownPostpone", status="true"} == 1
+      and on(node)
+      sum by (node) (d8_virtualization_virtualmachine_status_phase{phase="Running"}) > 0
+    )
   labels:
     severity_level: "6"
     tier: cluster

From 035228ca93d5b88174b4fdb8f8150efcefd4f3ee Mon Sep 17 00:00:00 2001
From: Yaroslav Borbat <yaroslav.borbat@flant.com>
Date: Tue, 22 Jul 2025 18:35:02 +0300
Subject: [PATCH 3/3] add alert
 KubeNodeAwaitingWorkloadEvacuationBeforeShutdown

Signed-off-by: Yaroslav Borbat <yaroslav.borbat@flant.com>
---
 monitoring/prometheus-rules/node.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/monitoring/prometheus-rules/node.yaml b/monitoring/prometheus-rules/node.yaml
index 8505c15769..2860a4f309 100644
--- a/monitoring/prometheus-rules/node.yaml
+++ b/monitoring/prometheus-rules/node.yaml
@@ -1,4 +1,4 @@
-- alert: KubeNodeAwaitingWorkloadEvacuationBeforeShutdown
+- alert: KubeNodeAwaitingVirtualMachinesEvictionBeforeShutdown
   expr: |
     (
       kube_node_status_condition{condition="GracefulShutdownPostpone", status="true"} == 1
@@ -16,13 +16,13 @@
     plk_grouped_by__node_maintenance: "NodeMaintenance,tier=~tier,prometheus=deckhouse,kubernetes=~kubernetes"
     summary: Node is awaiting workload evacuation before safe shutdown.
     description: |
-      The node `{{ $labels.node }}` has activated graceful shutdown protection and **cannot be safely powered off** until workloads (e.g., VirtualMachines) are evacuated.
+      The node `{{ $labels.node }}` has activated graceful shutdown protection and **cannot be safely powered off** until workloads (e.g., VirtualMachines) are evicted.
 
       ### What Is Happening?
       A shutdown request was issued, but the system intercepted it to prevent data loss or VM downtime.
       The `GracefulShutdownPostpone` condition is now active — this means:
       - The node is **intentionally blocking abrupt power-off**.
-      - You must **manually evict VirtualMachines** or other critical workloads before proceeding.
+      - You must **manually evict VirtualMachines** before proceeding.
 
       This is expected behavior for nodes running VMs and ensures safe maintenance.