@@ -235,6 +235,7 @@ data:
235
235
metrics_path: /alertmanager/metrics
236
236
static_configs:
237
237
- targets: ['127.0.0.1:9093']
238
+ # TODO: migrate these rules to the generator - https://github.com/sourcegraph/sourcegraph/issues/12117
238
239
alert_rules.yml : |
239
240
groups:
240
241
- name: alert.rules
@@ -245,9 +246,10 @@ data:
245
246
labels:
246
247
level: critical
247
248
severity: page
249
+ service_name: "{{ $labels.app }}"
250
+ name: "{{ $labels.alertname }}"
251
+ description: "Pods missing from {{ $labels.app }}: {{ $value }}"
248
252
annotations:
249
- description: 'Pods missing from {{`{{`}} $labels.app {{`}}`}}: {{`{{`}} $value
250
- {{`}}`}}'
251
253
help: Alerts when pods are missing.
252
254
summary: Pods missing from {{`{{`}} $labels.app {{`}}`}}
253
255
- alert: NoPodsRunning
@@ -256,9 +258,10 @@ data:
256
258
labels:
257
259
level: critical
258
260
severity: page
261
+ service_name: "{{ $labels.app }}"
262
+ name: "{{ $labels.alertname }}"
263
+ description: "No pods are running for {{ $labels.app }}: {{ $value }}"
259
264
annotations:
260
- description: 'No pods are running for {{`{{`}} $labels.app {{`}}`}}: {{`{{`}}
261
- $value {{`}}`}}'
262
265
help: Alerts when no pods are running for a service.
263
266
summary: No pods are running for {{`{{`}} $labels.app {{`}}`}}
264
267
- alert: ProdPageLoadLatency
@@ -267,35 +270,41 @@ data:
267
270
labels:
268
271
level: critical
269
272
severity: page
273
+ service_name: "{{ $labels.app }}"
274
+ name: "{{ $labels.alertname }}"
275
+ description: "Page load latency > 20s (90th percentile over all routes; current value: {{ $value }}s)"
270
276
annotations:
271
- description: 'Page load latency > 20s (90th percentile over all routes; current
272
- value: {{`{{`}}$value{{`}}`}}s)'
273
277
help: Alerts when the page load latency is too high.
274
278
summary: High page load latency
275
279
- alert: GoroutineLeak
276
280
expr: go_goroutines >= 10000
277
281
for: 10m
278
282
labels:
279
283
level: warn
284
+ service_name: "{{ $labels.app }}"
285
+ name: "GoroutineLeak"  # literal on purpose: $labels in a rule template holds only the series labels, so $labels.alertname expands to an empty string
286
+ description: "{{ $labels.app }} has more than 10k goroutines. This is probably a regression causing a goroutine leak"
280
287
annotations:
281
- description: '{{`{{`}} $labels.app {{`}}`}} has more than 10k goroutines. This
282
- is probably a regression causing a goroutine leak'
283
288
help: Alerts when a service has excessive running goroutines.
284
289
summary: Excessive number of goroutines
285
290
- alert: FSINodesRemainingLow
286
291
expr: sum by(instance) (container_fs_inodes_total{pod_name!=""}) > 3e+06
287
292
labels:
288
293
level: critical
289
294
severity: page
295
+ service_name: "{{ $labels.app }}"  # NOTE(review): expr aggregates with sum by(instance), so only the instance label survives and $labels.app is empty here - pick a fixed service name or key on instance
296
+ name: "FSINodesRemainingLow"  # literal on purpose: $labels in a rule template holds only the series labels, so $labels.alertname expands to an empty string
297
+ description: "{{ $labels.instance }} is using {{ $value | humanize }} inodes"  # NOTE(review): a label containing $value changes whenever the value changes, which presumably creates a new alert each time - confirm this belongs in labels rather than annotations
290
298
annotations:
291
- description: '{{`{{`}}$labels.instance{{`}}`}} is using {{`{{`}}humanize $value{{`}}`}}
292
- inodes'
293
299
help: Alerts when a node's remaining FS inodes are low.
294
300
summary: '{{`{{`}}$labels.instance{{`}}`}} remaining fs inodes is low'
295
301
- alert: DiskSpaceLow
296
302
expr: node:k8snode_filesystem_avail_bytes:ratio < 0.1
297
303
labels:
298
304
level: warn
305
+ service_name: "{{ $labels.app }}"
306
+ name: "{{ $labels.alertname }}"
307
+ description: "{{ $labels.exported_name }} has less than 10% available disk space"
299
308
annotations:
300
309
help: Alerts when a node has less than 10% available disk space.
301
310
summary: '{{`{{`}}$labels.exported_name{{`}}`}} has less than 10% available
@@ -305,12 +314,20 @@ data:
305
314
labels:
306
315
level: critical
307
316
severity: page
317
+ service_name: "{{ $labels.app }}"
318
+ name: "{{ $labels.alertname }}"
319
+ description: "{{ $labels.exported_name }} has less than 5% available disk space"
308
320
annotations:
309
321
help: Alerts when a node has less than 5% available disk space.
310
322
summary: Critical! {{`{{`}}$labels.exported_name{{`}}`}} has less than 5% available
311
323
disk space
312
324
- alert: GitserverDiskSpaceLow
313
325
expr: src_gitserver_disk_space_available / src_gitserver_disk_space_total < 0.1
326
+ labels:
327
+ level: warn
328
+ service_name: "{{ $labels.app }}"
329
+ name: "GitserverDiskSpaceLow"  # literal on purpose: $labels in a rule template holds only the series labels, so $labels.alertname expands to an empty string
330
+ description: "gitserver {{ $labels.instance }} disk space is less than 10% of available disk space"
314
331
annotations:
315
332
help: Alerts when gitserverdisk space is low.
316
333
summary: gitserver {{`{{`}}$labels.instance{{`}}`}} disk space is less than 10% of available disk space
@@ -319,6 +336,9 @@ data:
319
336
labels:
320
337
level: critical
321
338
severity: page
339
+ service_name: "{{ $labels.app }}"
340
+ name: "{{ $labels.alertname }}"
341
+ description: "gitserver {{ $labels.instance }} disk space is less than 5% of available disk space"
322
342
annotations:
323
343
help: Alerts when gitserverdisk space is critically low.
324
344
summary: Critical! gitserver {{`{{`}}$labels.instance{{`}}`}} disk space is less than 5% of available disk space
@@ -327,6 +347,9 @@ data:
327
347
for: 20m
328
348
labels:
329
349
level: warn
350
+ service_name: "{{ $labels.app }}"
351
+ name: "{{ $labels.alertname }}"
352
+ description: Error ratio exceeds 10%
330
353
annotations:
331
354
help: Alerts when the search service has more than 10% of requests failing.
332
355
summary: Error ratio exceeds 10%
@@ -335,6 +358,9 @@ data:
335
358
> 20000
336
359
labels:
337
360
level: warn
361
+ service_name: "{{ $labels.app }}"
362
+ name: "{{ $labels.alertname }}"
363
+ description: "{{ $labels.job }} in {{ $labels.ns }} is probably leaking metrics (unbounded attribute)"
338
364
annotations:
339
365
help: Alerts when a service is probably leaking metrics (unbounded attribute).
340
366
summary: '{{`{{`}}$labels.job{{`}}`}} in {{`{{`}}$labels.ns{{`}}`}} is probably
0 commit comments