From 2306f3b10e3c839743ed006ec1789604ef3c757f Mon Sep 17 00:00:00 2001
From: Nick Larew
Date: Wed, 6 Aug 2025 14:12:06 -0500
Subject: [PATCH 1/2] (EAI-1247) Prometheus alerts for 429 rate limit errors

---
Review notes (this section is not part of the commit message):

* Adds two alerts on HTTP 429 responses from the docs-chat container to both
  environment files: HighRateLimitErrors (severity warning, for 2m) and
  CriticalRateLimitErrors (severity critical, for 1m).
* Each expr is "(rate(...) > X) or (increase(...) > Y)". The sample value of
  the alert is the per-second rate when the first clause matched and the raw
  increase count when only the second clause matched. The description used to
  label it "errors/second" unconditionally; it now states which unit applies.
  Only the description text changed, so line counts and hunk headers are
  unchanged and the context used by PATCH 2/2 is untouched.
* NOTE(review): the description reads {{ $labels.service }} while the
  neighbouring rule in the context lines references the job label - confirm
  these series actually carry a "service" label.
* NOTE(review): leading whitespace in this copy was reconstructed; confirm the
  indentation against the target files before applying. The blob ids on the
  "index" lines are those of the original commit and predate the description
  change.

 .../environments/production.yml               | 37 +++++++++++++++++++
 .../environments/staging.yml                  | 37 +++++++++++++++++++
 2 files changed, 74 insertions(+)

diff --git a/packages/chatbot-server-mongodb-public/environments/production.yml b/packages/chatbot-server-mongodb-public/environments/production.yml
index 57fc586d9..18694d479 100644
--- a/packages/chatbot-server-mongodb-public/environments/production.yml
+++ b/packages/chatbot-server-mongodb-public/environments/production.yml
@@ -103,3 +103,40 @@ prometheusRules:
       annotations:
         summary: High HTTP 500 Error rate on {$labels.job}
         description: Too many HTTP 500 Errors on {$labels.job} in the last 5 minutes
+    - alert: HighRateLimitErrors
+      expr: |
+        (
+          rate(http_requests_total{namespace="docs",container="docs-chat",code="429"}[5m]) > 0.1
+        ) or (
+          increase(http_requests_total{namespace="docs",container="docs-chat",code="429"}[10m]) > 10
+        )
+      for: 2m
+      labels:
+        severity: warning
+        category: performance
+      annotations:
+        summary: "High rate of 429 (rate limit) errors detected"
+        description: |
+          Service {{ $labels.service }} is experiencing high rate limit errors.
+          Current value: {{ $value | printf "%.2f" }} (errors/second if the rate() clause matched, otherwise error count from the increase() clause)
+          This may indicate:
+          - Client retry storms
+          - Insufficient rate limiting configuration
+          - Upstream service throttling
+    - alert: CriticalRateLimitErrors
+      expr: |
+        (
+          rate(http_requests_total{namespace="docs",container="docs-chat",code="429"}[5m]) > 1.0
+        ) or (
+          increase(http_requests_total{namespace="docs",container="docs-chat",code="429"}[5m]) > 50
+        )
+      for: 1m
+      labels:
+        severity: critical
+        category: performance
+      annotations:
+        summary: "Critical rate of 429 (rate limit) errors"
+        description: |
+          Service {{ $labels.service }} is experiencing critical rate limit errors.
+          Current value: {{ $value | printf "%.2f" }} (errors/second if the rate() clause matched, otherwise error count from the increase() clause)
+          Immediate investigation required - service may be degraded.
diff --git a/packages/chatbot-server-mongodb-public/environments/staging.yml b/packages/chatbot-server-mongodb-public/environments/staging.yml
index 40c14da88..3d94308ed 100644
--- a/packages/chatbot-server-mongodb-public/environments/staging.yml
+++ b/packages/chatbot-server-mongodb-public/environments/staging.yml
@@ -103,3 +103,40 @@ prometheusRules:
       annotations:
         summary: High HTTP 500 Error rate on {$labels.job}
         description: Too many HTTP 500 Errors on {$labels.job} in the last 5 minutes
+    - alert: HighRateLimitErrors
+      expr: |
+        (
+          rate(http_requests_total{namespace="docs",container="docs-chat",code="429"}[5m]) > 0.1
+        ) or (
+          increase(http_requests_total{namespace="docs",container="docs-chat",code="429"}[10m]) > 10
+        )
+      for: 2m
+      labels:
+        severity: warning
+        category: performance
+      annotations:
+        summary: "High rate of 429 (rate limit) errors detected"
+        description: |
+          Service {{ $labels.service }} is experiencing high rate limit errors.
+          Current value: {{ $value | printf "%.2f" }} (errors/second if the rate() clause matched, otherwise error count from the increase() clause)
+          This may indicate:
+          - Client retry storms
+          - Insufficient rate limiting configuration
+          - Upstream service throttling
+    - alert: CriticalRateLimitErrors
+      expr: |
+        (
+          rate(http_requests_total{namespace="docs",container="docs-chat",code="429"}[5m]) > 1.0
+        ) or (
+          increase(http_requests_total{namespace="docs",container="docs-chat",code="429"}[5m]) > 50
+        )
+      for: 1m
+      labels:
+        severity: critical
+        category: performance
+      annotations:
+        summary: "Critical rate of 429 (rate limit) errors"
+        description: |
+          Service {{ $labels.service }} is experiencing critical rate limit errors.
+          Current value: {{ $value | printf "%.2f" }} (errors/second if the rate() clause matched, otherwise error count from the increase() clause)
+          Immediate investigation required - service may be degraded.
From 04f55dbcca0c3a25480e9efdcc27ce9018542f92 Mon Sep 17 00:00:00 2001
From: Nick Larew
Date: Wed, 6 Aug 2025 14:36:38 -0500
Subject: [PATCH 2/2] consistent windows

---
Review notes (this section is not part of the commit message):

* Changes the increase() window of the second clause in HighRateLimitErrors
  from 10m to 5m in both environment files, so both clauses of that rule now
  look at the same 5m range.
* Per the Prometheus documentation, increase() over a range is rate() over
  that range multiplied by the number of seconds in the range. With both
  clauses at 5m, "increase(...[5m]) > 10" is therefore the same condition as
  "rate(...[5m]) > 10/300" (about 0.033/s), which is already satisfied
  whenever "rate(...[5m]) > 0.1" is. The rate clause no longer changes when
  the alert fires; it only decides whether the reported value is a rate or a
  count.
  NOTE(review): confirm that an effective threshold of more than 10 errors in
  5 minutes is what is intended for the warning alert.
* NOTE(review): leading whitespace in this copy was reconstructed; confirm the
  indentation against the target files before applying.

 .../chatbot-server-mongodb-public/environments/production.yml | 2 +-
 packages/chatbot-server-mongodb-public/environments/staging.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/packages/chatbot-server-mongodb-public/environments/production.yml b/packages/chatbot-server-mongodb-public/environments/production.yml
index 18694d479..ff4529f16 100644
--- a/packages/chatbot-server-mongodb-public/environments/production.yml
+++ b/packages/chatbot-server-mongodb-public/environments/production.yml
@@ -108,7 +108,7 @@ prometheusRules:
         (
           rate(http_requests_total{namespace="docs",container="docs-chat",code="429"}[5m]) > 0.1
         ) or (
-          increase(http_requests_total{namespace="docs",container="docs-chat",code="429"}[10m]) > 10
+          increase(http_requests_total{namespace="docs",container="docs-chat",code="429"}[5m]) > 10
         )
       for: 2m
       labels:
diff --git a/packages/chatbot-server-mongodb-public/environments/staging.yml b/packages/chatbot-server-mongodb-public/environments/staging.yml
index 3d94308ed..891dee4b4 100644
--- a/packages/chatbot-server-mongodb-public/environments/staging.yml
+++ b/packages/chatbot-server-mongodb-public/environments/staging.yml
@@ -108,7 +108,7 @@ prometheusRules:
         (
           rate(http_requests_total{namespace="docs",container="docs-chat",code="429"}[5m]) > 0.1
         ) or (
-          increase(http_requests_total{namespace="docs",container="docs-chat",code="429"}[10m]) > 10
+          increase(http_requests_total{namespace="docs",container="docs-chat",code="429"}[5m]) > 10
         )
       for: 2m
       labels: