From 0f30657169eaf2b665b36155889accc8379ea880 Mon Sep 17 00:00:00 2001
From: Xavier Vello
Date: Fri, 17 Mar 2023 11:25:31 +0100
Subject: [PATCH 1/3] alerts: add paging CeleryQueueDepth monitor

---
 charts/posthog/Chart.yaml  |  2 +-
 charts/posthog/values.yaml | 16 ++++++++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/charts/posthog/Chart.yaml b/charts/posthog/Chart.yaml
index 0b177685..20e4237d 100644
--- a/charts/posthog/Chart.yaml
+++ b/charts/posthog/Chart.yaml
@@ -11,7 +11,7 @@ type: application
 
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
-version: 30.10.3
+version: 30.10.4
 
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application.
diff --git a/charts/posthog/values.yaml b/charts/posthog/values.yaml
index 53480ab9..1d74fac0 100644
--- a/charts/posthog/values.yaml
+++ b/charts/posthog/values.yaml
@@ -2240,6 +2240,22 @@ prometheus:
               Check https://github.com/PostHog/product-internal/blob/main/infrastructure/runbooks/pipeline/graphile.md
               for more context and steps to recovery.
 
+        - alert: CeleryQueueDepth
+          expr: (max (posthog_celery_queue_depth)) > 1000
+          for: 10m
+          labels:
+            rotation: common
+            severity: critical
+          annotations:
+            summary: Celery job execution delayed for more than 5 minutes.
+            description: |
+              The Celery jobs queue (stored in Redis) is filling up faster than it is consumed. This impacts our
+              monitoring, as some paging monitors depend on metrics exported by Celery jobs.
+              - make sure posthog-worker pods are healthy and check their logs
+              - look for recent code changes to posthog/celery.py and rollback if needed
+              - check the health of Postgres and Clickhouse, many jobs query them and could hang if these
+                stores get slow
+
         - alert: KafkaDiskCritical
           expr: min by (instance) (aws_msk_node_filesystem_free_bytes{mountpoint="/kafka/datalogs"}) < 536870912000
           for: 2m

From a1e33e6145fc71d8850a67faa5e8ceb38987c1a9 Mon Sep 17 00:00:00 2001
From: Xavier Vello
Date: Fri, 17 Mar 2023 12:11:25 +0100
Subject: [PATCH 2/3] fix summary

---
 charts/posthog/values.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/charts/posthog/values.yaml b/charts/posthog/values.yaml
index 1d74fac0..4a841918 100644
--- a/charts/posthog/values.yaml
+++ b/charts/posthog/values.yaml
@@ -2247,7 +2247,7 @@ prometheus:
             rotation: common
             severity: critical
           annotations:
-            summary: Celery job execution delayed for more than 5 minutes.
+            summary: Celery job execution delayed for more than 10 minutes.
             description: |
               The Celery jobs queue (stored in Redis) is filling up faster than it is consumed. This impacts our
               monitoring, as some paging monitors depend on metrics exported by Celery jobs.
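Note: the CeleryQueueDepth alert above pages once max(posthog_celery_queue_depth) stays above 1000 for 10 minutes. For context, here is a minimal sketch of how a gauge like posthog_celery_queue_depth could be produced; this is not the actual posthog/celery.py code, and it assumes a Redis broker and the default Celery queue name "celery".

```python
# Hypothetical sketch only: NOT the actual posthog/celery.py implementation.
# Assumes the Celery broker is Redis and the default queue is the "celery" list.
import os
import time

import redis
from prometheus_client import Gauge, start_http_server

CELERY_QUEUE_DEPTH = Gauge(
    "posthog_celery_queue_depth",
    "Number of tasks waiting in the Celery broker queue (Redis list length).",
)


def report_queue_depth(client: redis.Redis) -> int:
    """Read the Celery broker list length and export it as a Prometheus gauge."""
    depth = client.llen("celery")  # pending tasks sit in a plain Redis list
    CELERY_QUEUE_DEPTH.set(depth)
    return depth


if __name__ == "__main__":
    # Expose metrics and refresh the gauge periodically; the port and REDIS_URL
    # default are illustrative assumptions.
    client = redis.Redis.from_url(os.environ.get("REDIS_URL", "redis://localhost:6379/0"))
    start_http_server(8001)
    while True:
        report_queue_depth(client)
        time.sleep(30)
```

With a metric like this scraped by Prometheus, the alert expression simply compares the highest reported depth across workers against the 1000-task threshold.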
From 0a920a03de80ebf587604947ff150836b370c8c4 Mon Sep 17 00:00:00 2001
From: Xavier Vello
Date: Fri, 17 Mar 2023 12:20:39 +0100
Subject: [PATCH 3/3] alerts: add CeleryNotReporting paging alert

---
 charts/posthog/values.yaml | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/charts/posthog/values.yaml b/charts/posthog/values.yaml
index 4a841918..ee0a6f43 100644
--- a/charts/posthog/values.yaml
+++ b/charts/posthog/values.yaml
@@ -2256,6 +2256,20 @@ prometheus:
               - check the health of Postgres and Clickhouse, many jobs query them and could hang if these
                 stores get slow
 
+        - alert: CeleryNotReporting
+          expr: absent_over_time(posthog_celery_queue_depth[10m]) > 0
+          labels:
+            rotation: common
+            severity: critical
+          annotations:
+            summary: Celery has not pushed metrics for more than 10 minutes.
+            description: |
+              The posthog-worker pods have not reported points for the posthog_celery_queue_depth metric
+              in the last 10 minutes. Because many of our alerts rely on metrics reported by Celery jobs,
+              this is a critical alert (other issues could be hidden).
+              - make sure posthog-worker pods are healthy and check their logs
+              - look for recent code changes to posthog/celery.py and rollback if needed
+
         - alert: KafkaDiskCritical
           expr: min by (instance) (aws_msk_node_filesystem_free_bytes{mountpoint="/kafka/datalogs"}) < 536870912000
           for: 2m
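Note: absent_over_time() returns a single series with value 1 only when posthog_celery_queue_depth has no samples at all in the 10-minute window, so the alert fires precisely when Celery stops exporting. Below is a minimal sketch of evaluating that same expression against the Prometheus HTTP API, e.g. while tuning the window; the PROMETHEUS_URL value is an assumption about the monitoring stack.

```python
# Hypothetical sketch: evaluate the CeleryNotReporting expression by hand.
# PROMETHEUS_URL is an assumed in-cluster address; adjust for your setup.
import requests

PROMETHEUS_URL = "http://prometheus.monitoring.svc:9090"


def celery_is_silent(window: str = "10m") -> bool:
    """Return True when posthog_celery_queue_depth has no samples in the window.

    absent_over_time() only produces a result when the metric is completely
    absent over the range, so an empty result set means Celery is still reporting.
    """
    query = f"absent_over_time(posthog_celery_queue_depth[{window}]) > 0"
    resp = requests.get(
        f"{PROMETHEUS_URL}/api/v1/query", params={"query": query}, timeout=10
    )
    resp.raise_for_status()
    return len(resp.json()["data"]["result"]) > 0


if __name__ == "__main__":
    print("Celery silent for 10m:", celery_is_silent())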