
Commit fd8ae81

prometheus: add consistent alert labels (#786)
1 parent 574801d · commit fd8ae81

2 files changed: +39 −12 lines

base/prometheus/prometheus.ConfigMap.yaml (+36 −10)

@@ -235,6 +235,7 @@ data:
         metrics_path: /alertmanager/metrics
         static_configs:
         - targets: ['127.0.0.1:9093']
+  # TODO: migrate these rules to the generator - https://github.com/sourcegraph/sourcegraph/issues/12117
   alert_rules.yml: |
     groups:
     - name: alert.rules
@@ -245,9 +246,10 @@ data:
         labels:
           level: critical
           severity: page
+          service_name: "{{ $labels.app }}"
+          name: "{{ $labels.alertname }}"
+          description: "Pods missing from {{ $labels.app }}: {{ $value }}"
         annotations:
-          description: 'Pods missing from {{`{{`}} $labels.app {{`}}`}}: {{`{{`}} $value
-            {{`}}`}}'
           help: Alerts when pods are missing.
           summary: Pods missing from {{`{{`}} $labels.app {{`}}`}}
       - alert: NoPodsRunning
@@ -256,9 +258,10 @@ data:
         labels:
           level: critical
           severity: page
+          service_name: "{{ $labels.app }}"
+          name: "{{ $labels.alertname }}"
+          description: "No pods are running for {{ $labels.app }}: {{ $value }}"
         annotations:
-          description: 'No pods are running for {{`{{`}} $labels.app {{`}}`}}: {{`{{`}}
-            $value {{`}}`}}'
           help: Alerts when no pods are running for a service.
           summary: No pods are running for {{`{{`}} $labels.app {{`}}`}}
       - alert: ProdPageLoadLatency
@@ -267,35 +270,41 @@ data:
         labels:
           level: critical
           severity: page
+          service_name: "{{ $labels.app }}"
+          name: "{{ $labels.alertname }}"
+          description: "Page load latency > 20s (90th percentile over all routes; current value: {{ $value }}s)"
         annotations:
-          description: 'Page load latency > 20s (90th percentile over all routes; current
-            value: {{`{{`}}$value{{`}}`}}s)'
           help: Alerts when the page load latency is too high.
           summary: High page load latency
       - alert: GoroutineLeak
         expr: go_goroutines >= 10000
         for: 10m
         labels:
           level: warn
+          service_name: "{{ $labels.app }}"
+          name: "{{ $labels.alertname }}"
+          description: "{{ $labels.app }} has more than 10k goroutines. This is probably a regression causing a goroutine leak"
         annotations:
-          description: '{{`{{`}} $labels.app {{`}}`}} has more than 10k goroutines. This
-            is probably a regression causing a goroutine leak'
           help: Alerts when a service has excessive running goroutines.
           summary: Excessive number of goroutines
       - alert: FSINodesRemainingLow
         expr: sum by(instance) (container_fs_inodes_total{pod_name!=""}) > 3e+06
         labels:
           level: critical
           severity: page
+          service_name: "{{ $labels.app }}"
+          name: "{{ $labels.alertname }}"
+          description: "{{ $labels.instance }} is using {{ $value | humanize }} inodes"
         annotations:
-          description: '{{`{{`}}$labels.instance{{`}}`}} is using {{`{{`}}humanize $value{{`}}`}}
-            inodes'
           help: Alerts when a node's remaining FS inodes are low.
           summary: '{{`{{`}}$labels.instance{{`}}`}} remaining fs inodes is low'
       - alert: DiskSpaceLow
         expr: node:k8snode_filesystem_avail_bytes:ratio < 0.1
         labels:
           level: warn
+          service_name: "{{ $labels.app }}"
+          name: "{{ $labels.alertname }}"
+          description: "{{ $labels.exported_name }} has less than 10% available disk space"
         annotations:
           help: Alerts when a node has less than 10% available disk space.
           summary: '{{`{{`}}$labels.exported_name{{`}}`}} has less than 10% available
@@ -305,12 +314,20 @@ data:
         labels:
           level: critical
           severity: page
+          service_name: "{{ $labels.app }}"
+          name: "{{ $labels.alertname }}"
+          description: "{{ $labels.exported_name }} has less than 5% available disk space"
         annotations:
           help: Alerts when a node has less than 5% available disk space.
           summary: Critical! {{`{{`}}$labels.exported_name{{`}}`}} has less than 5% available
             disk space
       - alert: GitserverDiskSpaceLow
         expr: src_gitserver_disk_space_available / src_gitserver_disk_space_total < 0.1
+        labels:
+          level: warning
+          service_name: "{{ $labels.app }}"
+          name: "{{ $labels.alertname }}"
+          description: "gitserver {{ $labels.instance }} disk space is less than 10% of available disk space"
         annotations:
           help: Alerts when gitserverdisk space is low.
           summary: gitserver {{`{{`}}$labels.instance{{`}}`}} disk space is less than 10% of available disk space
@@ -319,6 +336,9 @@ data:
         labels:
           level: critical
           severity: page
+          service_name: "{{ $labels.app }}"
+          name: "{{ $labels.alertname }}"
+          description: "gitserver {{ $labels.instance }} disk space is less than 5% of available disk space"
         annotations:
           help: Alerts when gitserverdisk space is critically low.
           summary: Critical! gitserver {{`{{`}}$labels.instance{{`}}`}} disk space is less than 5% of available disk space
@@ -327,6 +347,9 @@ data:
         for: 20m
         labels:
           level: warn
+          service_name: "{{ $labels.app }}"
+          name: "{{ $labels.alertname }}"
+          description: Error ratio exceeds 10%
         annotations:
           help: Alerts when the search service has more than 10% of requests failing.
           summary: Error ratio exceeds 10%
@@ -335,6 +358,9 @@ data:
           > 20000
         labels:
           level: warn
+          service_name: "{{ $labels.app }}"
+          name: "{{ $labels.alertname }}"
+          description: "{{ $labels.job }} in {{ $labels.ns }} is probably leaking metrics (unbounded attribute)"
         annotations:
           help: Alerts when a service is probably leaking metrics (unbounded attribute).
           summary: '{{`{{`}}$labels.job{{`}}`}} in {{`{{`}}$labels.ns{{`}}`}} is probably
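
For reference, this is roughly how one of these rules (GoroutineLeak) reads once the patch is applied, assembled from the hunk above; indentation is approximate and the rest of the rule group is omitted:

      - alert: GoroutineLeak
        expr: go_goroutines >= 10000
        for: 10m
        labels:
          level: warn
          # New consistent labels added by this commit
          service_name: "{{ $labels.app }}"
          name: "{{ $labels.alertname }}"
          description: "{{ $labels.app }} has more than 10k goroutines. This is probably a regression causing a goroutine leak"
        annotations:
          help: Alerts when a service has excessive running goroutines.
          summary: Excessive number of goroutines

Because Alertmanager groups and routes on labels rather than annotations, giving every rule the same service_name, name, and description labels lets downstream routing and notification templates handle all of these alerts uniformly instead of relying on per-rule annotation formats.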

kubectl-apply-all.sh (+3 −2)

@@ -6,9 +6,10 @@
 # * When the cluster is first created
 # * Whenever the configuration for any resource has been updated
 #
-# The --prune flag is destructive and should always be used
-# in conjunction with -f base and -l deploy=sourcegraph. Otherwise, it will delete all resources
+# The --prune flag is destructive and should always be used
+# in conjunction with -f base and -l deploy=sourcegraph. Otherwise, it will delete all resources
 # previously created by create or apply that are not specified in the command.
 #
 # Apply the base Soucegraph deployment
+# shellcheck disable=SC2068
 kubectl apply --prune -l deploy=sourcegraph -f base --recursive $@
