feat(KONFLUX-6427): update perf alert description #458

Merged
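Summary: this change appends the source cluster to each data-plane performance alert description via the {{ $labels.source_cluster }} template variable, adds source_cluster to the NodeHighCPU aggregation so the label survives the avg, and extends the promtool unit tests to set and assert the new label on every case.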
16 changes: 8 additions & 8 deletions rhobs/alerting/data_plane/prometheus.performance_alerts.yaml
@@ -20,7 +20,7 @@ spec:
summary: >-
ETCD slow file system synchronization.
description: >-
- 10 minutes avg. 99th etcd fsync latency on {{$labels.pod}} higher than 40ms.
+ 10 minutes avg. 99th etcd fsync latency on {{$labels.pod}} higher than 40ms in cluster {{ $labels.source_cluster }}.
alert_routing_key: perfandscale

- alert: EtcdCommitLatency
@@ -33,7 +33,7 @@ spec:
summary: >-
ETCD slow writes observed.
description: >-
- 10 minutes avg. 99th etcd commit latency on {{$labels.pod}} higher than 40ms.
+ 10 minutes avg. 99th etcd commit latency on {{$labels.pod}} higher than 40ms in cluster {{ $labels.source_cluster }}.
alert_routing_key: perfandscale

- alert: EtcdProposalFailures
@@ -46,7 +46,7 @@ spec:
summary: >-
ETCD raft proposal failures.
description: >-
- Etcd high number of failed proposals on pod {{ $labels.pod }}
+ Etcd high number of failed proposals on pod {{ $labels.pod }} in cluster {{ $labels.source_cluster }}.
alert_routing_key: perfandscale

- alert: EtcdSlowNetworkRTT
@@ -59,7 +59,7 @@ spec:
summary: >-
High RTT latency on ETCD cluster member requests.
description: >-
- 99th etcd RTT latency rate on {{$labels.pod}} higher than 0.1
+ 99th etcd RTT latency rate on {{$labels.pod}} higher than 0.1 in cluster {{ $labels.source_cluster }}.
alert_routing_key: perfandscale

# KubeAPI Alerts
@@ -73,21 +73,21 @@ spec:
summary: >-
Kubernetes Job slow completion.
description: >-
- Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} running for long duration.
+ Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} running for long duration in cluster {{ $labels.source_cluster }}.
alert_routing_key: perfandscale

# Node based Alerts
- alert: NodeHighCPU
expr: |
- (100 * avg(1 - rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance)) > 95
+ (100 * avg(1 - rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance, source_cluster)) > 95
for: 10m
labels:
severity: warning
annotations:
summary: >-
Node High CPU Usage.
description: >-
- CPU Usage is {{$value}}% on node {{ $labels.instance }}
+ CPU Usage is {{$value}}% on node {{ $labels.instance }} in cluster {{ $labels.source_cluster }}.
alert_routing_key: perfandscale

- alert: NodeHighMemory
@@ -100,5 +100,5 @@ spec:
summary: >-
Node High Memory Usage.
description: >-
- Memory Usage is {{$value}}% on node {{ $labels.instance }}
+ Memory Usage is {{$value}}% on node {{ $labels.instance }} in cluster {{ $labels.source_cluster }}.
alert_routing_key: perfandscale
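Note on the one expression change above: in PromQL, avg ... by (...) keeps only the labels listed in the grouping clause, so without source_cluster there, the NodeHighCPU alert would fire without the label, {{ $labels.source_cluster }} would render empty, and identically named instances from different clusters would be averaged together. A minimal sketch of the difference, reusing the metric from the rule (not an excerpt from this repo):

  # Drops source_cluster: one result per instance, label unavailable to the template.
  100 * avg(1 - rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance)

  # Keeps source_cluster: one result per (instance, source_cluster) pair,
  # so {{ $labels.source_cluster }} resolves in the alert description.
  100 * avg(1 - rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance, source_cluster)

The other alerts need no expression change because their descriptions template labels straight off the underlying series, which already carry source_cluster.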
88 changes: 48 additions & 40 deletions test/promql/tests/data_plane/performance_test.yaml
@@ -8,10 +8,10 @@ tests:
- interval: 1m
input_series:
# Average Fsync latency higher than 40ms, so it will be alerted.
- - series: 'etcd_disk_wal_fsync_duration_seconds_bucket{le="0.05", pod="etcd-pod-1"}'
+ - series: 'etcd_disk_wal_fsync_duration_seconds_bucket{le="0.05", pod="etcd-pod-1", source_cluster="cluster01"}'
values: '0+0.01x15'

- - series: 'etcd_disk_wal_fsync_duration_seconds_bucket{le="+Inf", pod="etcd-pod-1"}'
+ - series: 'etcd_disk_wal_fsync_duration_seconds_bucket{le="+Inf", pod="etcd-pod-1", source_cluster="cluster01"}'
values: '0+0.1x15'

alert_rule_test:
@@ -21,15 +21,16 @@
- exp_labels:
severity: warning
pod: etcd-pod-1
+ source_cluster: cluster01
exp_annotations:
summary: "ETCD slow file system synchronization."
description: "10 minutes avg. 99th etcd fsync latency on etcd-pod-1 higher than 40ms."
description: "10 minutes avg. 99th etcd fsync latency on etcd-pod-1 higher than 40ms in cluster cluster01."
alert_routing_key: perfandscale

- interval: 1m
input_series:
# Average Fsync latency is below alert threshold, so it will not be alerted.
- - series: 'etcd_disk_wal_fsync_duration_seconds_bucket{le="0.01", pod="etcd-pod-2"}'
+ - series: 'etcd_disk_wal_fsync_duration_seconds_bucket{le="0.01", pod="etcd-pod-2", source_cluster="cluster01"}'
values: '2+0.01x15'

alert_rule_test:
@@ -40,10 +41,10 @@
- interval: 1m
input_series:
# Average commit latency higher than 40ms, so it will be alerted.
- - series: 'etcd_disk_backend_commit_duration_seconds_bucket{le="0.05", pod="etcd-pod-1"}'
+ - series: 'etcd_disk_backend_commit_duration_seconds_bucket{le="0.05", pod="etcd-pod-1", source_cluster="cluster01"}'
values: '0+0.01x15'

- - series: 'etcd_disk_backend_commit_duration_seconds_bucket{le="+Inf", pod="etcd-pod-1"}'
+ - series: 'etcd_disk_backend_commit_duration_seconds_bucket{le="+Inf", pod="etcd-pod-1", source_cluster="cluster01"}'
values: '0+0.5x15'

alert_rule_test:
@@ -53,15 +54,16 @@
- exp_labels:
severity: warning
pod: etcd-pod-1
+ source_cluster: cluster01
exp_annotations:
summary: "ETCD slow writes observed."
description: "10 minutes avg. 99th etcd commit latency on etcd-pod-1 higher than 40ms."
description: "10 minutes avg. 99th etcd commit latency on etcd-pod-1 higher than 40ms in cluster cluster01."
alert_routing_key: perfandscale

- interval: 1m
input_series:
# Average commit latency is below alert threshold count, so it will not be alerted.
- - series: 'etcd_disk_backend_commit_duration_seconds_bucket{le="0.03", pod="etcd-pod-2"}'
+ - series: 'etcd_disk_backend_commit_duration_seconds_bucket{le="0.03", pod="etcd-pod-2", source_cluster="cluster01"}'
values: '2+0.01x15'

alert_rule_test:
@@ -71,13 +73,13 @@
- interval: 1m
input_series:
# Etcd cluster member RTT delay is over 0.1, so it will be alerted.
- - series: 'etcd_network_peer_round_trip_time_seconds_bucket{le="0.01", pod="etcd-pod-1"}'
+ - series: 'etcd_network_peer_round_trip_time_seconds_bucket{le="0.01", pod="etcd-pod-1", source_cluster="cluster01"}'
values: '0.1+0.3x15'

- - series: 'etcd_network_peer_round_trip_time_seconds_bucket{le="0.2", pod="etcd-pod-1"}'
+ - series: 'etcd_network_peer_round_trip_time_seconds_bucket{le="0.2", pod="etcd-pod-1", source_cluster="cluster01"}'
values: '0.1+0.5x15'

- - series: 'etcd_network_peer_round_trip_time_seconds_bucket{le="+Inf", pod="etcd-pod-1"}'
+ - series: 'etcd_network_peer_round_trip_time_seconds_bucket{le="+Inf", pod="etcd-pod-1", source_cluster="cluster01"}'
values: '0.1+0.2x15'

alert_rule_test:
@@ -87,15 +89,16 @@
- exp_labels:
severity: warning
pod: etcd-pod-1
+ source_cluster: cluster01
exp_annotations:
summary: "High RTT latency on ETCD cluster member requests."
description: "99th etcd RTT latency rate on etcd-pod-1 higher than 0.1"
description: "99th etcd RTT latency rate on etcd-pod-1 higher than 0.1 in cluster cluster01."
alert_routing_key: perfandscale

- interval: 1m
input_series:
# Etcd cluster member RTT delay is below alert threshold rate, so it will not be alerted.
- - series: 'etcd_network_peer_round_trip_time_seconds_bucket{le="0.1", pod="etcd-pod-2"}'
+ - series: 'etcd_network_peer_round_trip_time_seconds_bucket{le="0.1", pod="etcd-pod-2", source_cluster="cluster01"}'
values: '0.1+0.01x15'

alert_rule_test:
@@ -106,7 +109,7 @@
- interval: 1m
input_series:
# Increase in Etcd raft proposal failures over time, so it will be alerted.
- - series: 'etcd_server_proposals_failed_total{pod="etcd-pod-1"}'
+ - series: 'etcd_server_proposals_failed_total{pod="etcd-pod-1", source_cluster="cluster01"}'
values: '0+5x59'

alert_rule_test:
@@ -116,15 +119,16 @@
- exp_labels:
severity: warning
pod: etcd-pod-1
+ source_cluster: cluster01
exp_annotations:
summary: "ETCD raft proposal failures."
description: "Etcd high number of failed proposals on pod etcd-pod-1"
description: "Etcd high number of failed proposals on pod etcd-pod-1 in cluster cluster01."
alert_routing_key: perfandscale

- interval: 1m
input_series:
# Etcd raft proposals failures within threshold limit, so it will not be alerted.
- - series: 'etcd_server_proposals_failed_total{pod="etcd-pod-1"}'
+ - series: 'etcd_server_proposals_failed_total{pod="etcd-pod-1", source_cluster="cluster01"}'
values: '0+5x30 1+0x29'

alert_rule_test:
@@ -135,15 +139,15 @@
- interval: 1m
input_series:
# Run status of 2 jobs are not available, so it will be alerted.
- - series: 'kube_job_spec_completions{namespace="default", job_name="example-job"}'
+ - series: 'kube_job_spec_completions{namespace="default", job_name="example-job", source_cluster="cluster01"}'
values: '3+0x15'

# Status: 0 job succeeded
- - series: 'kube_job_status_succeeded{namespace="default", job_name="example-job"}'
+ - series: 'kube_job_status_succeeded{namespace="default", job_name="example-job", source_cluster="cluster01"}'
values: '0+0x15'

# Status: 1 jobs failed
- - series: 'kube_job_status_failed{namespace="default", job_name="example-job"}'
+ - series: 'kube_job_status_failed{namespace="default", job_name="example-job", source_cluster="cluster01"}'
values: '1+0x15'

alert_rule_test:
@@ -154,23 +158,24 @@
severity: warning
job_name: example-job
namespace: default
+ source_cluster: cluster01
exp_annotations:
summary: "Kubernetes Job slow completion."
description: "Kubernetes Job default/example-job running for long duration."
description: "Kubernetes Job default/example-job running for long duration in cluster cluster01."
alert_routing_key: perfandscale

- interval: 1m
input_series:
# Run status of 1 job is not available, so it will be alerted.
- - series: 'kube_job_spec_completions{namespace="default", job_name="example-job"}'
+ - series: 'kube_job_spec_completions{namespace="default", job_name="example-job", source_cluster="cluster01"}'
values: '3+0x20'

# Status: 2 job succeeded
- - series: 'kube_job_status_succeeded{namespace="default", job_name="example-job"}'
+ - series: 'kube_job_status_succeeded{namespace="default", job_name="example-job", source_cluster="cluster01"}'
values: '2+0x20'

# Status: 0 jobs failed
- - series: 'kube_job_status_failed{namespace="default", job_name="example-job"}'
+ - series: 'kube_job_status_failed{namespace="default", job_name="example-job", source_cluster="cluster01"}'
values: '0+0x20'

alert_rule_test:
@@ -181,23 +186,24 @@
severity: warning
job_name: example-job
namespace: default
+ source_cluster: cluster01
exp_annotations:
summary: "Kubernetes Job slow completion."
description: "Kubernetes Job default/example-job running for long duration."
description: "Kubernetes Job default/example-job running for long duration in cluster cluster01."
alert_routing_key: perfandscale

- interval: 1m
input_series:
# Run status of all jobs are available, so it will not be alerted.
- - series: 'kube_job_spec_completions{namespace="default", job_name="example-job"}'
+ - series: 'kube_job_spec_completions{namespace="default", job_name="example-job", source_cluster="cluster01"}'
values: '3+0x20'

# Status: 3 job succeeded
- - series: 'kube_job_status_succeeded{namespace="default", job_name="example-job"}'
+ - series: 'kube_job_status_succeeded{namespace="default", job_name="example-job", source_cluster="cluster01"}'
values: '3+0x20'

# Status: 0 jobs failed
- - series: 'kube_job_status_failed{namespace="default", job_name="example-job"}'
+ - series: 'kube_job_status_failed{namespace="default", job_name="example-job", source_cluster="cluster01"}'
values: '0+0x20'

alert_rule_test:
@@ -207,15 +213,15 @@
- interval: 1m
input_series:
# Run status of all jobs are available, so it will not be alerted.
- - series: 'kube_job_spec_completions{namespace="default", job_name="example-job"}'
+ - series: 'kube_job_spec_completions{namespace="default", job_name="example-job", source_cluster="cluster01"}'
values: '3+0x20'

# Status: 2 job succeeded
- - series: 'kube_job_status_succeeded{namespace="default", job_name="example-job"}'
+ - series: 'kube_job_status_succeeded{namespace="default", job_name="example-job", source_cluster="cluster01"}'
values: '2+0x20'

# Status: 0 jobs failed
- - series: 'kube_job_status_failed{namespace="default", job_name="example-job"}'
+ - series: 'kube_job_status_failed{namespace="default", job_name="example-job", source_cluster="cluster01"}'
values: '1+0x20'

alert_rule_test:
@@ -226,9 +232,9 @@
- interval: 1m
input_series:
# Node is running with 100% CPU usage, so it will be alerted.
- series: 'node_cpu_seconds_total{instance="instance1", mode="idle"}'
- series: 'node_cpu_seconds_total{instance="instance1", mode="idle", source_cluster="cluster01"}'
values: '0.01+0x15'
- series: 'node_cpu_seconds_total{instance="instance2", mode="idle"}'
- series: 'node_cpu_seconds_total{instance="instance2", mode="idle", source_cluster="cluster01"}'
values: '0.85+0x10'

alert_rule_test:
@@ -238,17 +244,18 @@
- exp_labels:
severity: warning
instance: instance1
+ source_cluster: cluster01
exp_annotations:
summary: "Node High CPU Usage."
description: "CPU Usage is 100% on node instance1"
description: "CPU Usage is 100% on node instance1 in cluster cluster01."
alert_routing_key: perfandscale

- interval: 1m
input_series:
# Node is running under 95% CPU usage, so it will not be alerted.
- series: 'node_cpu_seconds_total{instance="instance1", mode="idle"}'
- series: 'node_cpu_seconds_total{instance="instance1", mode="idle", source_cluster="cluster01"}'
values: '0.75-0.1x10'
- series: 'node_cpu_seconds_total{instance="instance2", mode="idle"}'
- series: 'node_cpu_seconds_total{instance="instance2", mode="idle", source_cluster="cluster01"}'
values: '0.85+0x10'

alert_rule_test:
@@ -258,9 +265,9 @@
- interval: 1m
input_series:
# Node is running with 98% memory usage, so it will be alerted.
- - series: 'node_memory_MemTotal_bytes{instance="instance1"}'
+ - series: 'node_memory_MemTotal_bytes{instance="instance1", source_cluster="cluster01"}'
values: '100+0x9'
- - series: 'node_memory_MemAvailable_bytes{instance="instance1"}'
+ - series: 'node_memory_MemAvailable_bytes{instance="instance1", source_cluster="cluster01"}'
values: '2+0x9'

alert_rule_test:
@@ -270,17 +277,18 @@
- exp_labels:
severity: warning
instance: instance1
+ source_cluster: cluster01
exp_annotations:
summary: "Node High Memory Usage."
description: "Memory Usage is 98% on node instance1"
description: "Memory Usage is 98% on node instance1 in cluster cluster01."
alert_routing_key: perfandscale

- interval: 1m
input_series:
# Node is running under 90% memory usage, so it will not be alerted.
- - series: 'node_memory_MemTotal_bytes{instance="instance2"}'
+ - series: 'node_memory_MemTotal_bytes{instance="instance2", source_cluster="cluster01"}'
values: '100x9'
- - series: 'node_memory_MemAvailable_bytes{instance="instance2"}'
+ - series: 'node_memory_MemAvailable_bytes{instance="instance2", source_cluster="cluster01"}'
values: '98x9'

alert_rule_test:
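A note for readers of the test changes: the values strings use promtool's expanding notation, where 'a+bxn' starts at a and appends n more samples, each incremented by b per evaluation interval; these files are typically exercised with promtool test rules. A minimal sketch of one updated case, reusing the series from the first fsync test (only the source_cluster label is new relative to the old file):

  # '0+0.01x15' expands to 0, 0.01, 0.02, ..., 0.15 (one sample per 1m interval).
  - series: 'etcd_disk_wal_fsync_duration_seconds_bucket{le="0.05", pod="etcd-pod-1", source_cluster="cluster01"}'
    values: '0+0.01x15'
  # The +Inf bucket grows ten times faster, so most observations land above the
  # 50ms bucket boundary, pushing the 99th percentile over the 40ms threshold.
  - series: 'etcd_disk_wal_fsync_duration_seconds_bucket{le="+Inf", pod="etcd-pod-1", source_cluster="cluster01"}'
    values: '0+0.1x15'

Because source_cluster is now set on every input series and asserted in every exp_labels block, these tests would fail if a rule's aggregation silently dropped the label, the same failure mode the NodeHighCPU by-clause fix guards against. The alert expressions are mostly collapsed in this diff; the _bucket metrics suggest the latency alerts compute a 99th percentile with histogram_quantile over rate()d buckets, but that is an assumption, not something shown here.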