Skip to content

Commit 574801d

Browse files
authored
prometheus: add builtin alertmanager, labels.level for builtin alerts (#784)
1 parent 37b460a commit 574801d

File tree

1 file changed

+23
-0
lines changed

1 file changed

+23
-0
lines changed

base/prometheus/prometheus.ConfigMap.yaml

Lines changed: 23 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,10 @@ data:
1313
- source_labels: [__meta_kubernetes_service_name]
1414
regex: alertmanager
1515
action: keep
16+
# bundled alertmanager, started by prom-wrapper
17+
- static_configs:
18+
- targets: ['127.0.0.1:9093']
19+
path_prefix: /alertmanager
1620
1721
rule_files:
1822
- '*_rules.yml'
@@ -226,6 +230,11 @@ data:
226230
- source_labels: [__meta_kubernetes_pod_name]
227231
action: replace
228232
target_label: kubernetes_pod_name
233+
234+
- job_name: 'builtin-alertmanager'
235+
metrics_path: /alertmanager/metrics
236+
static_configs:
237+
- targets: ['127.0.0.1:9093']
229238
alert_rules.yml: |
230239
groups:
231240
- name: alert.rules
@@ -234,6 +243,7 @@ data:
234243
expr: app:up:ratio{app!=""} < 0.9
235244
for: 10m
236245
labels:
246+
level: critical
237247
severity: page
238248
annotations:
239249
description: 'Pods missing from {{`{{`}} $labels.app {{`}}`}}: {{`{{`}} $value
@@ -244,6 +254,7 @@ data:
244254
expr: app:up:ratio{app!=""} < 0.1
245255
for: 2m
246256
labels:
257+
level: critical
247258
severity: page
248259
annotations:
249260
description: 'No pods are running for {{`{{`}} $labels.app {{`}}`}}: {{`{{`}}
@@ -254,6 +265,7 @@ data:
254265
expr: histogram_quantile(0.9, sum by(le) (rate(src_http_request_duration_seconds_bucket{job="sourcegraph-frontend",route!="raw"}[10m])))
255266
> 20
256267
labels:
268+
level: critical
257269
severity: page
258270
annotations:
259271
description: 'Page load latency > 20s (90th percentile over all routes; current
@@ -263,6 +275,8 @@ data:
263275
- alert: GoroutineLeak
264276
expr: go_goroutines >= 10000
265277
for: 10m
278+
labels:
279+
level: warn
266280
annotations:
267281
description: '{{`{{`}} $labels.app {{`}}`}} has more than 10k goroutines. This
268282
is probably a regression causing a goroutine leak'
@@ -271,6 +285,7 @@ data:
271285
- alert: FSINodesRemainingLow
272286
expr: sum by(instance) (container_fs_inodes_total{pod_name!=""}) > 3e+06
273287
labels:
288+
level: critical
274289
severity: page
275290
annotations:
276291
description: '{{`{{`}}$labels.instance{{`}}`}} is using {{`{{`}}humanize $value{{`}}`}}
@@ -279,13 +294,16 @@ data:
279294
summary: '{{`{{`}}$labels.instance{{`}}`}} remaining fs inodes is low'
280295
- alert: DiskSpaceLow
281296
expr: node:k8snode_filesystem_avail_bytes:ratio < 0.1
297+
labels:
298+
level: warn
282299
annotations:
283300
help: Alerts when a node has less than 10% available disk space.
284301
summary: '{{`{{`}}$labels.exported_name{{`}}`}} has less than 10% available
285302
disk space'
286303
- alert: DiskSpaceLowCritical
287304
expr: node:k8snode_filesystem_avail_bytes:ratio{exported_name=~".*prod.*"} < 0.05
288305
labels:
306+
level: critical
289307
severity: page
290308
annotations:
291309
help: Alerts when a node has less than 5% available disk space.
@@ -299,19 +317,24 @@ data:
299317
- alert: GitserverDiskSpaceLowCritical
300318
expr: src_gitserver_disk_space_available / src_gitserver_disk_space_total < 0.05
301319
labels:
320+
level: critical
302321
severity: page
303322
annotations:
304323
help: Alerts when gitserverdisk space is critically low.
305324
summary: Critical! gitserver {{`{{`}}$labels.instance{{`}}`}} disk space is less than 5% of available disk space
306325
- alert: SearcherErrorRatioTooHigh
307326
expr: searcher_errors:ratio10m > 0.1
308327
for: 20m
328+
labels:
329+
level: warn
309330
annotations:
310331
help: Alerts when the search service has more than 10% of requests failing.
311332
summary: Error ratio exceeds 10%
312333
- alert: PrometheusMetricsBloat
313334
expr: http_response_size_bytes{handler="prometheus",job!="kubernetes-apiservers",job!="kubernetes-nodes",quantile="0.5"}
314335
> 20000
336+
labels:
337+
level: warn
315338
annotations:
316339
help: Alerts when a service is probably leaking metrics (unbounded attribute).
317340
summary: '{{`{{`}}$labels.job{{`}}`}} in {{`{{`}}$labels.ns{{`}}`}} is probably

0 commit comments

Comments
 (0)