Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions infra/alertmanager/config/alertmanager.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Alertmanager-wide settings.
global:
  resolve_timeout: 5m

# Root route: with no child routes defined here, every alert is handed to
# the `default` receiver. Alerts sharing the same alertname, job and
# severity labels are batched into a single notification group.
route:
  receiver: default
  group_by:
    - "alertname"
    - "job"
    - "severity"
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 2h

# NOTE(review): the `default` receiver has no integration (email, webhook,
# ...) in the lines visible here, so notifications routed to it do not
# appear to be delivered anywhere - confirm this is intentional.
receivers:
  - name: default
Comment thread
luaraggio marked this conversation as resolved.

# Mute warning-level alerts while a critical alert carrying the same
# alertname, job and instance label values is firing.
# NOTE(review): this only matches when one alertname is emitted at both
# severities; confirm the rule files define such pairs.
inhibit_rules:
  - source_matchers:
      - 'severity="critical"'
    target_matchers:
      - 'severity="warning"'
    equal:
      - "alertname"
      - "job"
      - "instance"
19 changes: 17 additions & 2 deletions infra/docker/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ services:
image: prom/prometheus:v2.52.0
volumes:
- prometheus_data:/prometheus
- ../prometheus/config/prometheus.yml:/etc/prometheus/prometheus.yml:ro
- ../prometheus/config:/etc/prometheus:ro
command:
- --config.file=/etc/prometheus/prometheus.yml
- --storage.tsdb.path=/prometheus
Expand All @@ -156,7 +156,6 @@ services:
expose:
- "9090"


node-exporter:
container_name: node-exporter
image: prom/node-exporter:v1.8.1
Expand Down Expand Up @@ -244,6 +243,21 @@ services:
networks:
- grafana-monitoring-network

# Alertmanager service. It is only exposed on the internal monitoring
# network (no host port is published) and keeps its state in the
# `alertmanager_data` named volume.
alertmanager:
  image: prom/alertmanager:v0.27.0
  container_name: alertmanager
  command:
    - --config.file=/etc/alertmanager/alertmanager.yml
    - --storage.path=/alertmanager
  volumes:
    # The configuration file is mounted read-only, consistent with how the
    # Prometheus configuration is mounted in this compose file; the
    # container has no reason to modify it.
    - ../alertmanager/config/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
    - alertmanager_data:/alertmanager
  expose:
    - "9093"
  restart: unless-stopped
  networks:
    - grafana-monitoring-network

redis:
image: redis:7-alpine
ports:
Expand Down Expand Up @@ -444,6 +458,7 @@ volumes:
minio_data:
prometheus_data:
grafana_data:
alertmanager_data:

networks:
transcendence-network:
Expand Down
51 changes: 51 additions & 0 deletions infra/prometheus/config/alerts.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
groups:
# Availability rules: both fire as critical once the failing condition has
# held for two minutes.
- name: availability
  rules:
  # A scrape target reports up == 0.
  - alert: TargetDown
    expr: up == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Target down: {{ $labels.job }}"
      description: "The target {{ $labels.instance }} has been down for more than 2 minutes."

  # A probe under the `blackbox-http` job reports probe_success == 0.
  # NOTE(review): the job name must match the scrape job defined in
  # prometheus.yml, which is not visible here - confirm.
  - alert: HttpProbeFailed
    expr: probe_success{job="blackbox-http"} == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "HTTP probe failed: {{ $labels.instance }}"
      description: "Blackbox probe for {{ $labels.instance }} has failed for more than 2 minutes."

# Host resource rules: warnings raised after the condition has held for
# ten minutes.
- name: host-resources
  rules:
  # CPU busy percentage per instance, derived from the idle-mode counter.
  # rate() is used instead of irate(): irate() only looks at the last two
  # samples in the window, so short dips can reset the `for` timer and the
  # alert may never fire under sustained but spiky load.
  - alert: HostHighCpuUsage
    expr: 100 - (avg by(instance)(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "High CPU usage on {{ $labels.instance }}"
      description: "CPU usage has been above 85% for 10 minutes."

  # Used memory percentage, computed as 1 - MemAvailable / MemTotal.
  - alert: HostHighMemoryUsage
    expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 90
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "High memory usage on {{ $labels.instance }}"
      description: "Memory usage has been above 90% for 10 minutes."

# Container rules.
- name: containers
  rules:
  # Fires when a container's start timestamp changed more than twice in
  # the last 15 minutes. changes() counts value changes of the series;
  # increase() is meant for counters and, applied to this timestamp
  # metric, yields the number of seconds between starts, so a single
  # restart would already exceed the threshold.
  # NOTE(review): assumes container_start_time_seconds is the cAdvisor
  # start-time gauge and that restarts keep the same series - confirm.
  - alert: ContainerRestartSpike
    expr: changes(container_start_time_seconds[15m]) > 2
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "Container restart spike"
      description: "Container {{ $labels.name }} restarted multiple times in 15 minutes."
Comment thread
luaraggio marked this conversation as resolved.
9 changes: 9 additions & 0 deletions infra/prometheus/config/prometheus.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,15 @@ global:
external_labels:
monitor: 'codelab-monitor'

# Rule files to load; the path is given relative to this configuration.
rule_files:
  - 'alerts.yml'

# Alert delivery: a single statically configured Alertmanager target,
# addressed by its service name and port 9093.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'alertmanager:9093'

scrape_configs:
- job_name: 'prometheus'
static_configs:
Expand Down
Loading