             - source_labels: [__meta_kubernetes_service_name]
               regex: alertmanager
               action: keep
+        # bundled alertmanager, started by prom-wrapper
+        - static_configs:
+            - targets: ['127.0.0.1:9093']
+          path_prefix: /alertmanager
 
     rule_files:
       - '*_rules.yml'
@@ -226,6 +230,11 @@ data:
           - source_labels: [__meta_kubernetes_pod_name]
             action: replace
             target_label: kubernetes_pod_name
+
+      - job_name: 'builtin-alertmanager'
+        metrics_path: /alertmanager/metrics
+        static_configs:
+          - targets: ['127.0.0.1:9093']
   alert_rules.yml : |
     groups:
       - name: alert.rules
@@ -234,6 +243,7 @@ data:
             expr: app:up:ratio{app!=""} < 0.9
             for: 10m
             labels:
+              level: critical
               severity: page
             annotations:
               description: 'Pods missing from {{`{{`}} $labels.app {{`}}`}}: {{`{{`}} $value
@@ -244,6 +254,7 @@ data:
             expr: app:up:ratio{app!=""} < 0.1
             for: 2m
             labels:
+              level: critical
               severity: page
             annotations:
               description: 'No pods are running for {{`{{`}} $labels.app {{`}}`}}: {{`{{`}}
@@ -254,6 +265,7 @@ data:
             expr: histogram_quantile(0.9, sum by(le) (rate(src_http_request_duration_seconds_bucket{job="sourcegraph-frontend",route!="raw"}[10m])))
               > 20
             labels:
+              level: critical
               severity: page
             annotations:
               description: 'Page load latency > 20s (90th percentile over all routes; current
@@ -263,6 +275,8 @@ data:
           - alert: GoroutineLeak
             expr: go_goroutines >= 10000
             for: 10m
+            labels:
+              level: warn
             annotations:
               description: '{{`{{`}} $labels.app {{`}}`}} has more than 10k goroutines. This
                 is probably a regression causing a goroutine leak'
@@ -271,6 +285,7 @@ data:
           - alert: FSINodesRemainingLow
             expr: sum by(instance) (container_fs_inodes_total{pod_name!=""}) > 3e+06
             labels:
+              level: critical
               severity: page
             annotations:
               description: '{{`{{`}}$labels.instance{{`}}`}} is using {{`{{`}}humanize $value{{`}}`}}
@@ -279,13 +294,16 @@ data:
               summary: '{{`{{`}}$labels.instance{{`}}`}} remaining fs inodes is low'
           - alert: DiskSpaceLow
             expr: node:k8snode_filesystem_avail_bytes:ratio < 0.1
+            labels:
+              level: warn
             annotations:
               help: Alerts when a node has less than 10% available disk space.
               summary: '{{`{{`}}$labels.exported_name{{`}}`}} has less than 10% available
                 disk space'
           - alert: DiskSpaceLowCritical
             expr: node:k8snode_filesystem_avail_bytes:ratio{exported_name=~".*prod.*"} < 0.05
             labels:
+              level: critical
               severity: page
             annotations:
               help: Alerts when a node has less than 5% available disk space.
@@ -299,19 +317,24 @@ data:
           - alert: GitserverDiskSpaceLowCritical
             expr: src_gitserver_disk_space_available / src_gitserver_disk_space_total < 0.05
             labels:
+              level: critical
               severity: page
             annotations:
               help: Alerts when gitserverdisk space is critically low.
               summary: Critical! gitserver {{`{{`}}$labels.instance{{`}}`}} disk space is less than 5% of available disk space
           - alert: SearcherErrorRatioTooHigh
             expr: searcher_errors:ratio10m > 0.1
             for: 20m
+            labels:
+              level: warn
             annotations:
               help: Alerts when the search service has more than 10% of requests failing.
               summary: Error ratio exceeds 10%
           - alert: PrometheusMetricsBloat
             expr: http_response_size_bytes{handler="prometheus",job!="kubernetes-apiservers",job!="kubernetes-nodes",quantile="0.5"}
               > 20000
+            labels:
+              level: warn
             annotations:
               help: Alerts when a service is probably leaking metrics (unbounded attribute).
               summary: '{{`{{`}}$labels.job{{`}}`}} in {{`{{`}}$labels.ns{{`}}`}} is probably
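
For orientation, here is an abridged sketch of the configuration these hunks produce, not the literal file contents: the bundled Alertmanager that prom-wrapper starts on 127.0.0.1:9093 behind the /alertmanager path prefix is registered both as an alerting target and as its own scrape job, and each alert rule gains a level label (critical or warn) alongside any existing severity: page label. Indentation and the selected example rule are illustrative.

    # Sketch assembled from the hunks above (from prometheus.yml):
    alerting:
      alertmanagers:
        # bundled alertmanager, started by prom-wrapper
        - static_configs:
            - targets: ['127.0.0.1:9093']
          path_prefix: /alertmanager

    scrape_configs:
      # Prometheus also scrapes the bundled Alertmanager's own metrics.
      - job_name: 'builtin-alertmanager'
        metrics_path: /alertmanager/metrics
        static_configs:
          - targets: ['127.0.0.1:9093']

    # From alert_rules.yml: every rule now carries a level label, e.g.:
    groups:
      - name: alert.rules
        rules:
          - alert: GitserverDiskSpaceLowCritical
            expr: src_gitserver_disk_space_available / src_gitserver_disk_space_total < 0.05
            labels:
              level: critical
              severity: page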