
Commit fd8ae81

prometheus: add consistent alert labels (#786)
1 parent 574801d · commit fd8ae81

2 files changed: +39 −12 lines

base/prometheus/prometheus.ConfigMap.yaml (+36 −10)

@@ -235,6 +235,7 @@ data:
         metrics_path: /alertmanager/metrics
         static_configs:
         - targets: ['127.0.0.1:9093']
+  # TODO: migrate these rules to the generator - https://github.com/sourcegraph/sourcegraph/issues/12117
   alert_rules.yml: |
     groups:
     - name: alert.rules
@@ -245,9 +246,10 @@ data:
         labels:
           level: critical
           severity: page
+          service_name: "{{ $labels.app }}"
+          name: "{{ $labels.alertname }}"
+          description: "Pods missing from {{ $labels.app }}: {{ $value }}"
         annotations:
-          description: 'Pods missing from {{`{{`}} $labels.app {{`}}`}}: {{`{{`}} $value
-            {{`}}`}}'
           help: Alerts when pods are missing.
           summary: Pods missing from {{`{{`}} $labels.app {{`}}`}}
       - alert: NoPodsRunning
@@ -256,9 +258,10 @@ data:
         labels:
           level: critical
           severity: page
+          service_name: "{{ $labels.app }}"
+          name: "{{ $labels.alertname }}"
+          description: "No pods are running for {{ $labels.app }}: {{ $value }}"
         annotations:
-          description: 'No pods are running for {{`{{`}} $labels.app {{`}}`}}: {{`{{`}}
-            $value {{`}}`}}'
           help: Alerts when no pods are running for a service.
           summary: No pods are running for {{`{{`}} $labels.app {{`}}`}}
       - alert: ProdPageLoadLatency
@@ -267,35 +270,41 @@ data:
         labels:
           level: critical
           severity: page
+          service_name: "{{ $labels.app }}"
+          name: "{{ $labels.alertname }}"
+          description: "Page load latency > 20s (90th percentile over all routes; current value: {{ $value }}s)"
         annotations:
-          description: 'Page load latency > 20s (90th percentile over all routes; current
-            value: {{`{{`}}$value{{`}}`}}s)'
           help: Alerts when the page load latency is too high.
           summary: High page load latency
       - alert: GoroutineLeak
         expr: go_goroutines >= 10000
         for: 10m
         labels:
           level: warn
+          service_name: "{{ $labels.app }}"
+          name: "{{ $labels.alertname }}"
+          description: "{{ $labels.app }} has more than 10k goroutines. This is probably a regression causing a goroutine leak"
         annotations:
-          description: '{{`{{`}} $labels.app {{`}}`}} has more than 10k goroutines. This
-            is probably a regression causing a goroutine leak'
           help: Alerts when a service has excessive running goroutines.
           summary: Excessive number of goroutines
       - alert: FSINodesRemainingLow
         expr: sum by(instance) (container_fs_inodes_total{pod_name!=""}) > 3e+06
         labels:
           level: critical
           severity: page
+          service_name: "{{ $labels.app }}"
+          name: "{{ $labels.alertname }}"
+          description: "{{ $labels.instance }} is using {{ $value | humanize }} inodes"
         annotations:
-          description: '{{`{{`}}$labels.instance{{`}}`}} is using {{`{{`}}humanize $value{{`}}`}}
-            inodes'
           help: Alerts when a node's remaining FS inodes are low.
           summary: '{{`{{`}}$labels.instance{{`}}`}} remaining fs inodes is low'
       - alert: DiskSpaceLow
         expr: node:k8snode_filesystem_avail_bytes:ratio < 0.1
         labels:
           level: warn
+          service_name: "{{ $labels.app }}"
+          name: "{{ $labels.alertname }}"
+          description: "{{ $labels.exported_name }} has less than 10% available disk space"
         annotations:
           help: Alerts when a node has less than 10% available disk space.
           summary: '{{`{{`}}$labels.exported_name{{`}}`}} has less than 10% available
@@ -305,12 +314,20 @@ data:
         labels:
           level: critical
           severity: page
+          service_name: "{{ $labels.app }}"
+          name: "{{ $labels.alertname }}"
+          description: "{{ $labels.exported_name }} has less than 5% available disk space"
         annotations:
           help: Alerts when a node has less than 5% available disk space.
           summary: Critical! {{`{{`}}$labels.exported_name{{`}}`}} has less than 5% available
             disk space
       - alert: GitserverDiskSpaceLow
         expr: src_gitserver_disk_space_available / src_gitserver_disk_space_total < 0.1
+        labels:
+          level: warning
+          service_name: "{{ $labels.app }}"
+          name: "{{ $labels.alertname }}"
+          description: "gitserver {{ $labels.instance }} disk space is less than 10% of available disk space"
         annotations:
           help: Alerts when gitserverdisk space is low.
           summary: gitserver {{`{{`}}$labels.instance{{`}}`}} disk space is less than 10% of available disk space
@@ -319,6 +336,9 @@ data:
         labels:
           level: critical
           severity: page
+          service_name: "{{ $labels.app }}"
+          name: "{{ $labels.alertname }}"
+          description: "gitserver {{ $labels.instance }} disk space is less than 5% of available disk space"
         annotations:
           help: Alerts when gitserverdisk space is critically low.
           summary: Critical! gitserver {{`{{`}}$labels.instance{{`}}`}} disk space is less than 5% of available disk space
@@ -327,6 +347,9 @@ data:
         for: 20m
         labels:
           level: warn
+          service_name: "{{ $labels.app }}"
+          name: "{{ $labels.alertname }}"
+          description: Error ratio exceeds 10%
         annotations:
           help: Alerts when the search service has more than 10% of requests failing.
           summary: Error ratio exceeds 10%
@@ -335,6 +358,9 @@ data:
           > 20000
         labels:
           level: warn
+          service_name: "{{ $labels.app }}"
+          name: "{{ $labels.alertname }}"
+          description: "{{ $labels.job }} in {{ $labels.ns }} is probably leaking metrics (unbounded attribute)"
         annotations:
           help: Alerts when a service is probably leaking metrics (unbounded attribute).
           summary: '{{`{{`}}$labels.job{{`}}`}} in {{`{{`}}$labels.ns{{`}}`}} is probably
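
For reference, this is roughly how one of these rules (GoroutineLeak) reads once the patch is applied, assembled from the hunk above; indentation is approximate and the rest of the rule group is omitted:

      - alert: GoroutineLeak
        expr: go_goroutines >= 10000
        for: 10m
        labels:
          level: warn
          # New consistent labels added by this commit
          service_name: "{{ $labels.app }}"
          name: "{{ $labels.alertname }}"
          description: "{{ $labels.app }} has more than 10k goroutines. This is probably a regression causing a goroutine leak"
        annotations:
          help: Alerts when a service has excessive running goroutines.
          summary: Excessive number of goroutines

Because Alertmanager groups and routes on labels rather than annotations, giving every rule the same service_name, name, and description labels lets downstream routing and notification templates handle all of these alerts uniformly instead of relying on per-rule annotation formats.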

kubectl-apply-all.sh (+3 −2)

@@ -6,9 +6,10 @@
 # * When the cluster is first created
 # * Whenever the configuration for any resource has been updated
 #
-# The --prune flag is destructive and should always be used
-# in conjunction with -f base and -l deploy=sourcegraph. Otherwise, it will delete all resources
+# The --prune flag is destructive and should always be used
+# in conjunction with -f base and -l deploy=sourcegraph. Otherwise, it will delete all resources
 # previously created by create or apply that are not specified in the command.
 #
 # Apply the base Soucegraph deployment
+# shellcheck disable=SC2068
 kubectl apply --prune -l deploy=sourcegraph -f base --recursive $@
