From 9488c8e46ccea428b99b7debdd401fcf0cf47c67 Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Wed, 13 Nov 2024 15:37:51 +0000
Subject: [PATCH] High cpu usage alert now only applies to control and login nodes

---
Reviewer notes (free-form text placed between the three-dash separator and
the first file diff; it is not part of the commit message or of the change):

* helm.yml hunk: adds a NodeCPUHighUsage entry next to the existing
  KubeSchedulerDown / KubeProxyDown / KubeControllerManagerDown entries,
  together with the in-file comment "Replaced with appliance specific
  versions".
  NOTE(review): presumably a `true` entry here switches off the chart's
  stock rule of that name; the enclosing key is outside this hunk, so
  confirm against the full file.
* prometheus.yml hunk: appends a NodeCPUHighUsage rule to
  prometheus_extra_alerting_rules. Its expr selects only series whose
  `instance` label matches ".+(-control|-login).*", compares the summed
  non-idle CPU rate (as a percentage) against 90, and must hold for 15m
  before firing with severity "info".
* The description string is wrapped in a Jinja raw block so that the
  double-brace `$labels.instance` and `printf` placeholders are passed
  through literally instead of being rendered by Ansible.
* This copy was rebuilt from a whitespace-collapsed single line: the
  leading indentation of the hunk lines below follows conventional 2-space
  YAML nesting and must be checked against the target files before applying
  (`git apply --ignore-whitespace` tolerates context whitespace mismatches).

 .../roles/kube_prometheus_stack/defaults/main/helm.yml    | 2 ++
 .../common/inventory/group_vars/all/prometheus.yml        | 8 ++++++++
 2 files changed, 10 insertions(+)

diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml
index e9c04632f..22fadb7b3 100644
--- a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml
+++ b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml
@@ -51,6 +51,8 @@ kube_prometheus_stack_release_defaults:
       KubeSchedulerDown: true
       KubeProxyDown: true
       KubeControllerManagerDown: true
+      # Replaced with appliance specific versions
+      NodeCPUHighUsage: true
   prometheus:
     service:
       type: NodePort
diff --git a/environments/common/inventory/group_vars/all/prometheus.yml b/environments/common/inventory/group_vars/all/prometheus.yml
index 351aedbe7..52cdbe4a7 100644
--- a/environments/common/inventory/group_vars/all/prometheus.yml
+++ b/environments/common/inventory/group_vars/all/prometheus.yml
@@ -44,3 +44,11 @@ prometheus_extra_alerting_rules:
     expr: "slurm_nodes_down > 0\n"
     labels:
       severity: critical
+  - alert: NodeCPUHighUsage
+    annotations:
+      description: '{% raw %}CPU usage at {{ $labels.instance }} has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.{% endraw %}'
+      summary: High CPU usage.
+    expr: sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter",mode!="idle",instance=~".+(-control|-login).*"}[2m]))) * 100 > 90
+    for: 15m
+    labels:
+      severity: info