Merge pull request #458 from rh-rahulshetty/KONFLUX-6427-add-cluster-info

feat(KONFLUX-6427): update perf alert description
kubasikus authored Jan 20, 2025
2 parents ef3d713 + e0477f5 commit 90c3c23
Showing 2 changed files with 56 additions and 48 deletions.
16 changes: 8 additions & 8 deletions rhobs/alerting/data_plane/prometheus.performance_alerts.yaml
@@ -20,7 +20,7 @@ spec:
summary: >-
ETCD slow file system synchronization.
description: >-
-10 minutes avg. 99th etcd fsync latency on {{$labels.pod}} higher than 40ms.
+10 minutes avg. 99th etcd fsync latency on {{$labels.pod}} higher than 40ms in cluster {{ $labels.source_cluster }}.
alert_routing_key: perfandscale

- alert: EtcdCommitLatency
@@ -33,7 +33,7 @@ spec:
summary: >-
ETCD slow writes observed.
description: >-
-10 minutes avg. 99th etcd commit latency on {{$labels.pod}} higher than 40ms.
+10 minutes avg. 99th etcd commit latency on {{$labels.pod}} higher than 40ms in cluster {{ $labels.source_cluster }}.
alert_routing_key: perfandscale

- alert: EtcdProposalFailures
@@ -46,7 +46,7 @@ spec:
summary: >-
ETCD raft proposal failures.
description: >-
-Etcd high number of failed proposals on pod {{ $labels.pod }}
+Etcd high number of failed proposals on pod {{ $labels.pod }} in cluster {{ $labels.source_cluster }}.
alert_routing_key: perfandscale

- alert: EtcdSlowNetworkRTT
@@ -59,7 +59,7 @@ spec:
summary: >-
High RTT latency on ETCD cluster member requests.
description: >-
-99th etcd RTT latency rate on {{$labels.pod}} higher than 0.1
+99th etcd RTT latency rate on {{$labels.pod}} higher than 0.1 in cluster {{ $labels.source_cluster }}.
alert_routing_key: perfandscale

# KubeAPI Alerts
@@ -73,21 +73,21 @@ spec:
summary: >-
Kubernetes Job slow completion.
description: >-
-Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} running for long duration.
+Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} running for long duration in cluster {{ $labels.source_cluster }}.
alert_routing_key: perfandscale

# Node based Alerts
- alert: NodeHighCPU
expr: |
-(100 * avg(1 - rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance)) > 95
+(100 * avg(1 - rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance, source_cluster)) > 95
for: 10m
labels:
severity: warning
annotations:
summary: >-
Node High CPU Usage.
description: >-
-CPU Usage is {{$value}}% on node {{ $labels.instance }}
+CPU Usage is {{$value}}% on node {{ $labels.instance }} in cluster {{ $labels.source_cluster }}.
alert_routing_key: perfandscale

- alert: NodeHighMemory
@@ -100,5 +100,5 @@ spec:
summary: >-
Node High Memory Usage.
description: >-
-Memory Usage is {{$value}}% on node {{ $labels.instance }}
+Memory Usage is {{$value}}% on node {{ $labels.instance }} in cluster {{ $labels.source_cluster }}.
alert_routing_key: perfandscale
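
Note on the NodeHighCPU expression change above: avg ... by (instance) keeps only the instance label on the result, so a {{ $labels.source_cluster }} reference in the description would render as an empty string. Adding source_cluster to the by() clause preserves the label on the alert series, which is why this alert needed an expression change on top of the description update. The remaining expressions are unchanged in these hunks, which suggests they already preserve the label. A minimal before/after sketch of the pattern, copied from the diff above:

    # Before: aggregation drops source_cluster; the annotation template renders empty.
    (100 * avg(1 - rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance)) > 95

    # After: source_cluster survives aggregation and is available to annotations.
    (100 * avg(1 - rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance, source_cluster)) > 95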
88 changes: 48 additions & 40 deletions test/promql/tests/data_plane/performance_test.yaml
@@ -8,10 +8,10 @@ tests:
- interval: 1m
input_series:
# Average Fsync latency higher than 40ms, so it will be alerted.
-- series: 'etcd_disk_wal_fsync_duration_seconds_bucket{le="0.05", pod="etcd-pod-1"}'
+- series: 'etcd_disk_wal_fsync_duration_seconds_bucket{le="0.05", pod="etcd-pod-1", source_cluster="cluster01"}'
values: '0+0.01x15'

-- series: 'etcd_disk_wal_fsync_duration_seconds_bucket{le="+Inf", pod="etcd-pod-1"}'
+- series: 'etcd_disk_wal_fsync_duration_seconds_bucket{le="+Inf", pod="etcd-pod-1", source_cluster="cluster01"}'
values: '0+0.1x15'

alert_rule_test:
@@ -21,15 +21,16 @@ tests:
- exp_labels:
severity: warning
pod: etcd-pod-1
+source_cluster: cluster01
exp_annotations:
summary: "ETCD slow file system synchronization."
description: "10 minutes avg. 99th etcd fsync latency on etcd-pod-1 higher than 40ms."
description: "10 minutes avg. 99th etcd fsync latency on etcd-pod-1 higher than 40ms in cluster cluster01."
alert_routing_key: perfandscale

- interval: 1m
input_series:
# Average Fsync latency is below alert threshold, so it will not be alerted.
-- series: 'etcd_disk_wal_fsync_duration_seconds_bucket{le="0.01", pod="etcd-pod-2"}'
+- series: 'etcd_disk_wal_fsync_duration_seconds_bucket{le="0.01", pod="etcd-pod-2", source_cluster="cluster01"}'
values: '2+0.01x15'

alert_rule_test:
@@ -40,10 +41,10 @@ tests:
- interval: 1m
input_series:
# Average commit latency higher than 40ms, so it will be alerted.
-- series: 'etcd_disk_backend_commit_duration_seconds_bucket{le="0.05", pod="etcd-pod-1"}'
+- series: 'etcd_disk_backend_commit_duration_seconds_bucket{le="0.05", pod="etcd-pod-1", source_cluster="cluster01"}'
values: '0+0.01x15'

-- series: 'etcd_disk_backend_commit_duration_seconds_bucket{le="+Inf", pod="etcd-pod-1"}'
+- series: 'etcd_disk_backend_commit_duration_seconds_bucket{le="+Inf", pod="etcd-pod-1", source_cluster="cluster01"}'
values: '0+0.5x15'

alert_rule_test:
Expand All @@ -53,15 +54,16 @@ tests:
- exp_labels:
severity: warning
pod: etcd-pod-1
+source_cluster: cluster01
exp_annotations:
summary: "ETCD slow writes observed."
description: "10 minutes avg. 99th etcd commit latency on etcd-pod-1 higher than 40ms."
description: "10 minutes avg. 99th etcd commit latency on etcd-pod-1 higher than 40ms in cluster cluster01."
alert_routing_key: perfandscale

- interval: 1m
input_series:
# Average commit latency is below alert threshold count, so it will not be alerted.
-- series: 'etcd_disk_backend_commit_duration_seconds_bucket{le="0.03", pod="etcd-pod-2"}'
+- series: 'etcd_disk_backend_commit_duration_seconds_bucket{le="0.03", pod="etcd-pod-2", source_cluster="cluster01"}'
values: '2+0.01x15'

alert_rule_test:
@@ -71,13 +73,13 @@ tests:
- interval: 1m
input_series:
# Etcd cluster member RTT delay is over 0.1, so it will be alerted.
-- series: 'etcd_network_peer_round_trip_time_seconds_bucket{le="0.01", pod="etcd-pod-1"}'
+- series: 'etcd_network_peer_round_trip_time_seconds_bucket{le="0.01", pod="etcd-pod-1", source_cluster="cluster01"}'
values: '0.1+0.3x15'

-- series: 'etcd_network_peer_round_trip_time_seconds_bucket{le="0.2", pod="etcd-pod-1"}'
+- series: 'etcd_network_peer_round_trip_time_seconds_bucket{le="0.2", pod="etcd-pod-1", source_cluster="cluster01"}'
values: '0.1+0.5x15'

-- series: 'etcd_network_peer_round_trip_time_seconds_bucket{le="+Inf", pod="etcd-pod-1"}'
+- series: 'etcd_network_peer_round_trip_time_seconds_bucket{le="+Inf", pod="etcd-pod-1", source_cluster="cluster01"}'
values: '0.1+0.2x15'

alert_rule_test:
@@ -87,15 +89,16 @@ tests:
- exp_labels:
severity: warning
pod: etcd-pod-1
+source_cluster: cluster01
exp_annotations:
summary: "High RTT latency on ETCD cluster member requests."
description: "99th etcd RTT latency rate on etcd-pod-1 higher than 0.1"
description: "99th etcd RTT latency rate on etcd-pod-1 higher than 0.1 in cluster cluster01."
alert_routing_key: perfandscale

- interval: 1m
input_series:
# Etcd cluster member RTT delay is below alert threshold rate, so it will not be alerted.
-- series: 'etcd_network_peer_round_trip_time_seconds_bucket{le="0.1", pod="etcd-pod-2"}'
+- series: 'etcd_network_peer_round_trip_time_seconds_bucket{le="0.1", pod="etcd-pod-2", source_cluster="cluster01"}'
values: '0.1+0.01x15'

alert_rule_test:
@@ -106,7 +109,7 @@ tests:
- interval: 1m
input_series:
# Increase in Etcd raft proposal failures over time, so it will be alerted.
-- series: 'etcd_server_proposals_failed_total{pod="etcd-pod-1"}'
+- series: 'etcd_server_proposals_failed_total{pod="etcd-pod-1", source_cluster="cluster01"}'
values: '0+5x59'

alert_rule_test:
@@ -116,15 +119,16 @@ tests:
- exp_labels:
severity: warning
pod: etcd-pod-1
+source_cluster: cluster01
exp_annotations:
summary: "ETCD raft proposal failures."
description: "Etcd high number of failed proposals on pod etcd-pod-1"
description: "Etcd high number of failed proposals on pod etcd-pod-1 in cluster cluster01."
alert_routing_key: perfandscale

- interval: 1m
input_series:
# Etcd raft proposals failures within threshold limit, so it will not be alerted.
-- series: 'etcd_server_proposals_failed_total{pod="etcd-pod-1"}'
+- series: 'etcd_server_proposals_failed_total{pod="etcd-pod-1", source_cluster="cluster01"}'
values: '0+5x30 1+0x29'

alert_rule_test:
@@ -135,15 +139,15 @@ tests:
- interval: 1m
input_series:
# Run status of 2 jobs are not available, so it will be alerted.
-- series: 'kube_job_spec_completions{namespace="default", job_name="example-job"}'
+- series: 'kube_job_spec_completions{namespace="default", job_name="example-job", source_cluster="cluster01"}'
values: '3+0x15'

# Status: 0 job succeeded
-- series: 'kube_job_status_succeeded{namespace="default", job_name="example-job"}'
+- series: 'kube_job_status_succeeded{namespace="default", job_name="example-job", source_cluster="cluster01"}'
values: '0+0x15'

# Status: 1 jobs failed
-- series: 'kube_job_status_failed{namespace="default", job_name="example-job"}'
+- series: 'kube_job_status_failed{namespace="default", job_name="example-job", source_cluster="cluster01"}'
values: '1+0x15'

alert_rule_test:
@@ -154,23 +158,24 @@ tests:
severity: warning
job_name: example-job
namespace: default
+source_cluster: cluster01
exp_annotations:
summary: "Kubernetes Job slow completion."
description: "Kubernetes Job default/example-job running for long duration."
description: "Kubernetes Job default/example-job running for long duration in cluster cluster01."
alert_routing_key: perfandscale

- interval: 1m
input_series:
# Run status of 1 job is not available, so it will be alerted.
-- series: 'kube_job_spec_completions{namespace="default", job_name="example-job"}'
+- series: 'kube_job_spec_completions{namespace="default", job_name="example-job", source_cluster="cluster01"}'
values: '3+0x20'

# Status: 2 job succeeded
-- series: 'kube_job_status_succeeded{namespace="default", job_name="example-job"}'
+- series: 'kube_job_status_succeeded{namespace="default", job_name="example-job", source_cluster="cluster01"}'
values: '2+0x20'

# Status: 0 jobs failed
-- series: 'kube_job_status_failed{namespace="default", job_name="example-job"}'
+- series: 'kube_job_status_failed{namespace="default", job_name="example-job", source_cluster="cluster01"}'
values: '0+0x20'

alert_rule_test:
@@ -181,23 +186,24 @@ tests:
severity: warning
job_name: example-job
namespace: default
+source_cluster: cluster01
exp_annotations:
summary: "Kubernetes Job slow completion."
description: "Kubernetes Job default/example-job running for long duration."
description: "Kubernetes Job default/example-job running for long duration in cluster cluster01."
alert_routing_key: perfandscale

- interval: 1m
input_series:
# Run status of all jobs are available, so it will not be alerted.
-- series: 'kube_job_spec_completions{namespace="default", job_name="example-job"}'
+- series: 'kube_job_spec_completions{namespace="default", job_name="example-job", source_cluster="cluster01"}'
values: '3+0x20'

# Status: 3 job succeeded
-- series: 'kube_job_status_succeeded{namespace="default", job_name="example-job"}'
+- series: 'kube_job_status_succeeded{namespace="default", job_name="example-job", source_cluster="cluster01"}'
values: '3+0x20'

# Status: 0 jobs failed
-- series: 'kube_job_status_failed{namespace="default", job_name="example-job"}'
+- series: 'kube_job_status_failed{namespace="default", job_name="example-job", source_cluster="cluster01"}'
values: '0+0x20'

alert_rule_test:
@@ -207,15 +213,15 @@ tests:
- interval: 1m
input_series:
# Run status of all jobs are available, so it will not be alerted.
-- series: 'kube_job_spec_completions{namespace="default", job_name="example-job"}'
+- series: 'kube_job_spec_completions{namespace="default", job_name="example-job", source_cluster="cluster01"}'
values: '3+0x20'

# Status: 2 job succeeded
-- series: 'kube_job_status_succeeded{namespace="default", job_name="example-job"}'
+- series: 'kube_job_status_succeeded{namespace="default", job_name="example-job", source_cluster="cluster01"}'
values: '2+0x20'

# Status: 0 jobs failed
-- series: 'kube_job_status_failed{namespace="default", job_name="example-job"}'
+- series: 'kube_job_status_failed{namespace="default", job_name="example-job", source_cluster="cluster01"}'
values: '1+0x20'

alert_rule_test:
@@ -226,9 +232,9 @@ tests:
- interval: 1m
input_series:
# Node is running with 100% CPU usage, so it will be alerted.
-- series: 'node_cpu_seconds_total{instance="instance1", mode="idle"}'
+- series: 'node_cpu_seconds_total{instance="instance1", mode="idle", source_cluster="cluster01"}'
values: '0.01+0x15'
-- series: 'node_cpu_seconds_total{instance="instance2", mode="idle"}'
+- series: 'node_cpu_seconds_total{instance="instance2", mode="idle", source_cluster="cluster01"}'
values: '0.85+0x10'

alert_rule_test:
@@ -238,17 +244,18 @@ tests:
- exp_labels:
severity: warning
instance: instance1
+source_cluster: cluster01
exp_annotations:
summary: "Node High CPU Usage."
description: "CPU Usage is 100% on node instance1"
description: "CPU Usage is 100% on node instance1 in cluster cluster01."
alert_routing_key: perfandscale

- interval: 1m
input_series:
# Node is running under 95% CPU usage, so it will not be alerted.
-- series: 'node_cpu_seconds_total{instance="instance1", mode="idle"}'
+- series: 'node_cpu_seconds_total{instance="instance1", mode="idle", source_cluster="cluster01"}'
values: '0.75-0.1x10'
-- series: 'node_cpu_seconds_total{instance="instance2", mode="idle"}'
+- series: 'node_cpu_seconds_total{instance="instance2", mode="idle", source_cluster="cluster01"}'
values: '0.85+0x10'

alert_rule_test:
@@ -258,9 +265,9 @@ tests:
- interval: 1m
input_series:
# Node is running with 98% memory usage, so it will be alerted.
-- series: 'node_memory_MemTotal_bytes{instance="instance1"}'
+- series: 'node_memory_MemTotal_bytes{instance="instance1", source_cluster="cluster01"}'
values: '100+0x9'
-- series: 'node_memory_MemAvailable_bytes{instance="instance1"}'
+- series: 'node_memory_MemAvailable_bytes{instance="instance1", source_cluster="cluster01"}'
values: '2+0x9'

alert_rule_test:
@@ -270,17 +277,18 @@ tests:
- exp_labels:
severity: warning
instance: instance1
+source_cluster: cluster01
exp_annotations:
summary: "Node High Memory Usage."
description: "Memory Usage is 98% on node instance1"
description: "Memory Usage is 98% on node instance1 in cluster cluster01."
alert_routing_key: perfandscale

- interval: 1m
input_series:
# Node is running under 90% memory usage, so it will not be alerted.
-- series: 'node_memory_MemTotal_bytes{instance="instance2"}'
+- series: 'node_memory_MemTotal_bytes{instance="instance2", source_cluster="cluster01"}'
values: '100x9'
-- series: 'node_memory_MemAvailable_bytes{instance="instance2"}'
+- series: 'node_memory_MemAvailable_bytes{instance="instance2", source_cluster="cluster01"}'
values: '98x9'

alert_rule_test:
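These unit tests use promtool's rule-test format: input_series values use the expanding notation a+bxc, which starts at a and appends c further samples, each b higher (so '0+0.01x15' yields 16 samples from 0 to 0.15, and '3+0x20' holds constant at 3), and exp_labels / exp_annotations must match the fired alert exactly, which is why every expected label set and description above gains the source_cluster text byte for byte. A sketch of a local run, assuming the test file's rule_files entry points at a plain rule group extracted from the PrometheusRule object (the exact CI invocation is not part of this commit):

    # Run the promtool unit tests shipped in this commit.
    promtool test rules test/promql/tests/data_plane/performance_test.yaml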
