From fbdbac2fca97056bd5da76835b0d83d870cdfe91 Mon Sep 17 00:00:00 2001
From: Kevin Meinhardt <kmeinhardt@mozilla.com>
Date: Wed, 19 Mar 2025 13:32:11 +0100
Subject: [PATCH 1/3] Enhance health check workflows and scripts

- Added output for fork detection in health_check_completed.yml.
- Updated health_check.yml to ensure health check file existence before proceeding.
- Refactored health_check.py to use a session with retries for HTTP requests, improving error handling.
- Modified tests in test_health_check.py to validate new error handling and retry logic.
---
 .github/workflows/health_check.yml           |  5 ++
 .github/workflows/health_check_completed.yml | 14 +++-
 scripts/health_check.py                      | 54 ++++++------
 tests/make/test_health_check.py              | 86 +++++++++++++-------
 4 files changed, 98 insertions(+), 61 deletions(-)

diff --git a/.github/workflows/health_check.yml b/.github/workflows/health_check.yml
index 36b826694d5c..36a662fff75d 100644
--- a/.github/workflows/health_check.yml
+++ b/.github/workflows/health_check.yml
@@ -42,6 +42,11 @@ jobs:
         if: steps.health_check.outcome == 'failure'
         shell: bash
         run: |
+          if [ ! -f ${{ env.health_check_file }} ]; then
+            echo "Health check file is missing from previous step"
+            exit 1
+          fi
+
           # Create the message blocks file
           ./scripts/health_check_blocks.py \
           --input ${{ env.health_check_file }} \
diff --git a/.github/workflows/health_check_completed.yml b/.github/workflows/health_check_completed.yml
index 87848cf500cc..6b42804eecdc 100644
--- a/.github/workflows/health_check_completed.yml
+++ b/.github/workflows/health_check_completed.yml
@@ -8,6 +8,10 @@ on:
 jobs:
   context:
     runs-on: ubuntu-latest
+
+    outputs:
+      is_fork: ${{ steps.context.outputs.is_fork }}
+
     steps:
       - uses: actions/checkout@v4
 
@@ -16,7 +20,9 @@ jobs:
         uses: ./.github/actions/context
 
   health_check_failure_notification:
+    needs: context
     if: |
+      needs.context.outputs.is_fork == 'false' &&
       github.event.workflow_run.event == 'schedule' &&
       github.event.workflow_run.conclusion == 'failure'
     runs-on: ubuntu-latest
@@ -33,13 +39,13 @@ jobs:
           slack_token: ${{ secrets.SLACK_TOKEN }}
           slack_channel: ${{ secrets.SLACK_ADDONS_PRODUCTION_CHANNEL }}
           emoji: ':x:'
-          actor: ${{ vars.slack_actor }}
+          actor: ${{ vars.SLACK_ACTOR }}
           conclusion: ${{ github.event.workflow_run.conclusion }}
           workflow_id: ${{ github.event.workflow_run.id }}
-          workflow_url: ${{ github.event.workflow_run.url }}
+          workflow_url: ${{ github.event.workflow_run.html_url }}
           event: ${{ github.event.workflow_run.event }}
           env: ci
-          ref: ''
-          ref_link: ''
+          ref: ${{ github.event.workflow_run.repository.name }}
+          ref_link: ${{ github.event.workflow_run.html_url }}
 
 
diff --git a/scripts/health_check.py b/scripts/health_check.py
index 1e67d2038000..f051b59c52ca 100755
--- a/scripts/health_check.py
+++ b/scripts/health_check.py
@@ -4,8 +4,11 @@
 import json
 import time
 from enum import Enum
+from functools import cached_property
 
-import requests
+from requests import Session
+from requests.adapters import HTTPAdapter
+from urllib3.util import Retry
 
 
 ENV_ENUM = Enum(
@@ -25,34 +28,29 @@ def __init__(self, env: ENV_ENUM, verbose: bool = False):
         self.environment = ENV_ENUM[env]
         self.verbose = verbose
 
-    def _fetch(self, path: str):
-        url = f'{self.environment.value}/{path}'
-        if self.verbose:
-            print(f'Requesting {url} for {self.environment.name}')
-
-        data = None
-        # We return 500 if any of the monitors are failing.
-        # So instead of raising, we should try to form valid JSON
-        # and determine if we should raise later based on the json values.
-        try:
-            response = requests.get(url, allow_redirects=False)
-            data = response.json()
-        except (requests.exceptions.HTTPError, json.JSONDecodeError) as e:
-            if self.verbose:
-                print(
-                    {
-                        'error': e,
-                        'data': data,
-                        'response': response,
-                    }
-                )
-
-        if data is None:
-            return {}
-
+    @cached_property
+    def client(self):
+        session = Session()
+        retries = Retry(
+            total=5,
+            backoff_factor=0.1,
+            status_forcelist=[502, 503, 504],
+            allowed_methods={'GET'},
+        )
+        adapter = HTTPAdapter(max_retries=retries)
+        session.mount(self.environment.value, adapter)
+        return session
+
+    def log(self, *args):
         if self.verbose:
-            print(json.dumps(data, indent=2))
+            print(*args)
 
+    def _fetch(self, path: str):
+        url = f'{self.environment.value}/{path}'
+        self.log(f'Requesting {url} for {self.environment.name}')
+        response = self.client.get(url, allow_redirects=False)
+        data = response.json()
+        self.log(json.dumps(data, indent=2))
         return {'url': url, 'data': data}
 
     def version(self):
@@ -65,7 +63,7 @@ def monitors(self):
         return self._fetch('services/__heartbeat__')
 
 
-def main(env: ENV_ENUM, verbose: bool, retries: int = 0, attempt: int = 0):
+def main(env: ENV_ENUM, verbose: bool = False, retries: int = 0, attempt: int = 0):
     fetcher = Fetcher(env, verbose)
 
     version_data = fetcher.version()
diff --git a/tests/make/test_health_check.py b/tests/make/test_health_check.py
index 69a162456c4c..857b4b33fb0a 100644
--- a/tests/make/test_health_check.py
+++ b/tests/make/test_health_check.py
@@ -1,5 +1,8 @@
+from json.decoder import JSONDecodeError
 from unittest import TestCase
+from unittest.mock import call, patch
 
+import requests
 import responses
 
 from scripts.health_check import main
@@ -26,7 +29,7 @@ def test_basic(self):
         self.mock_url(
             'services/__heartbeat__', status=200, json=self._monitor('two', True, '')
         )
-        results, _ = main('local', False)
+        main('local')
 
     def test_missing_version(self):
         self.mock_url('__version__', status=500)
@@ -35,8 +38,8 @@ def test_missing_version(self):
             'services/__heartbeat__', status=200, json=self._monitor('two', True, '')
         )
 
-        results, _ = main('local', False)
-        self.assertEqual(results['version'], {})
+        with self.assertRaises(JSONDecodeError):
+            main('local')
 
     def test_invalid_version(self):
         self.mock_url('__version__', status=200, body='{not valid json')
@@ -45,8 +48,8 @@ def test_invalid_version(self):
             'services/__heartbeat__', status=200, json=self._monitor('two', True, '')
         )
 
-        results, _ = main('local', False)
-        self.assertEqual(results['version'], {})
+        with self.assertRaises(JSONDecodeError):
+            main('local')
 
     def test_missing_heartbeat(self):
         self.mock_url('__heartbeat__', status=500)
@@ -55,8 +58,8 @@ def test_missing_heartbeat(self):
             'services/__heartbeat__', status=200, json=self._monitor('two', True, '')
         )
 
-        results, _ = main('local', False)
-        self.assertEqual(results['heartbeat'], {})
+        with self.assertRaises(JSONDecodeError):
+            main('local')
 
     def test_failing_heartbeat(self):
         failing_monitor = self._monitor('fail', False, 'Service is down')
@@ -65,25 +68,16 @@ def test_failing_heartbeat(self):
         self.mock_url('__version__', status=200, json={'version': '1.0.0'})
         self.mock_url('services/__heartbeat__', status=200, json=success_monitor)
 
-        results, has_failures = main('local', False)
+        results, has_failures = main('local')
         self.assertTrue(has_failures)
-        # Check for failing monitors
-        failing_monitors = []
-        for monitor_type, monitor_data in results.items():
-            if monitor_type == 'version':
-                continue
-            for name, details in monitor_data.get('data', {}).items():
-                if details.get('state') is False:
-                    failing_monitors.append(f'{monitor_type}.{name}')
-        self.assertIn('heartbeat.fail', failing_monitors)
 
     def test_missing_monitors(self):
         self.mock_url('services/__heartbeat__', status=500)
         self.mock_url('__version__', status=200, json={'version': '1.0.0'})
         self.mock_url('__heartbeat__', status=200, json=self._monitor('one', True, ''))
 
-        results, _ = main('local', False)
-        self.assertEqual(results['monitors'], {})
+        with self.assertRaises(JSONDecodeError):
+            main('local')
 
     def test_failing_monitors(self):
         failing_monitor = self._monitor('fail', False, 'Service is down')
@@ -92,14 +86,48 @@ def test_failing_monitors(self):
         self.mock_url('__version__', status=200, json={'version': '1.0.0'})
         self.mock_url('__heartbeat__', status=200, json=success_monitor)
 
-        results, has_failures = main('local', False)
+        results, has_failures = main('local')
         self.assertTrue(has_failures)
-        # Check for failing monitors
-        failing_monitors = []
-        for monitor_type, monitor_data in results.items():
-            if monitor_type == 'version':
-                continue
-            for name, details in monitor_data.get('data', {}).items():
-                if details.get('state') is False:
-                    failing_monitors.append(f'{monitor_type}.{name}')
-        self.assertIn('monitors.fail', failing_monitors)
+
+    def test_request_retries(self):
+        count = 0
+
+        def increment_count(request):
+            nonlocal count
+            count += 1
+            return (503, {}, '')
+
+        # Mock the get request to fail with a 503 status
+        self.mock_url('__version__', status=503)
+        self.mock_url('__heartbeat__', status=503)
+        self.mock_url('services/__heartbeat__', status=503)
+
+        responses.add_callback(
+            responses.GET,
+            self._url('__version__'),
+            callback=increment_count,
+        )
+
+        with self.assertRaises(requests.exceptions.RequestException):
+            main('local', retries=0)
+
+        assert count == 5
+
+    @patch('scripts.health_check.main', wraps=main)
+    def test_retry_failures(self, mock_main):
+        self.mock_url('__version__', status=200, json={'version': '1.0.0'})
+        self.mock_url('__heartbeat__', status=200, json=self._monitor('one', True, ''))
+        self.mock_url(
+            'services/__heartbeat__',
+            json=self._monitor('fail', False, 'Service is down'),
+        )
+
+        main('local', retries=2)
+
+        # Should be called 3 times total - initial call plus 2 retries
+        self.assertEqual(mock_main.call_count, 2)
+
+        # Verify retry attempts were made with incrementing attempt numbers
+        mock_main.assert_has_calls(
+            [call('local', False, 2, 1), call('local', False, 2, 2)]
+        )

From d9a287f92055f4844abfc30d20d6490ef32f204a Mon Sep 17 00:00:00 2001
From: Kevin Meinhardt <kmeinhardt@mozilla.com>
Date: Wed, 19 Mar 2025 13:45:22 +0100
Subject: [PATCH 2/3] Add dummy waffle switch to test healthcheck e2e in any
 environment

---
 .github/workflows/health_check.yml            |  7 +++++
 .../amo/migrations/0002_auto_20250319_1240.py | 16 +++++++++++
 src/olympia/amo/monitors.py                   |  7 +++++
 src/olympia/amo/tests/test_views.py           | 27 +++++++++++++++++++
 src/olympia/amo/views.py                      |  2 +-
 5 files changed, 58 insertions(+), 1 deletion(-)
 create mode 100644 src/olympia/amo/migrations/0002_auto_20250319_1240.py

diff --git a/.github/workflows/health_check.yml b/.github/workflows/health_check.yml
index 36a662fff75d..176c6cf24e82 100644
--- a/.github/workflows/health_check.yml
+++ b/.github/workflows/health_check.yml
@@ -32,6 +32,13 @@ jobs:
           target: development
           version: local
           run: |
+            # On local, we want to ensure there are failures to test
+            # the later steps with so set the waffle switch to on.
+            if [ "${{ matrix.environment }}" = "local" ]; then
+              echo "Set waffle switch dummy-monitor-fails to true"
+              ./manage.py waffle_switch dummy-monitor-fails on
+            fi
+
             ./scripts/health_check.py \
             --env ${{ matrix.environment }} \
             --verbose \
diff --git a/src/olympia/amo/migrations/0002_auto_20250319_1240.py b/src/olympia/amo/migrations/0002_auto_20250319_1240.py
new file mode 100644
index 000000000000..4a58ab0a9ca1
--- /dev/null
+++ b/src/olympia/amo/migrations/0002_auto_20250319_1240.py
@@ -0,0 +1,16 @@
+# Generated by Django 4.2.20 on 2025-03-19 12:40
+
+from django.db import migrations
+
+from olympia.core.db.migrations import CreateWaffleSwitch
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('amo', '0001_initial'),
+    ]
+
+    operations = [
+        CreateWaffleSwitch('dummy-monitor-fails'),
+    ]
diff --git a/src/olympia/amo/monitors.py b/src/olympia/amo/monitors.py
index 8b74e161855e..718d561e2549 100644
--- a/src/olympia/amo/monitors.py
+++ b/src/olympia/amo/monitors.py
@@ -9,6 +9,7 @@
 import celery
 import MySQLdb
 import requests
+import waffle
 from django_statsd.clients import statsd
 from kombu import Connection
 from PIL import Image
@@ -34,6 +35,12 @@ def execute_checks(checks: list[str], verbose: bool = False):
     return status_summary
 
 
+def dummy_monitor():
+    if waffle.switch_is_active('dummy-monitor-fails'):
+        return 'Dummy monitor failed', None
+    return '', None
+
+
 def localdev_web():
     """
     Used in local development environments to determine if the web container
diff --git a/src/olympia/amo/tests/test_views.py b/src/olympia/amo/tests/test_views.py
index 3b15a4000f93..98e623299b5d 100644
--- a/src/olympia/amo/tests/test_views.py
+++ b/src/olympia/amo/tests/test_views.py
@@ -16,6 +16,7 @@
 import pytest
 from lxml import etree
 from pyquery import PyQuery as pq
+from waffle.testutils import override_switch
 
 from olympia import amo, core
 from olympia.access import acl
@@ -401,6 +402,19 @@ def test_front_heartbeat_failure(self):
         assert response.status_code >= 500
         assert response.json()['database']['status'] == 'boom'
 
+    def test_front_heartbeat_dummy_monitor_failure(self):
+        url = reverse('amo.front_heartbeat')
+        response = self.client.get(url)
+
+        assert response.status_code == 200
+        self.assertTrue(response.json()['dummy_monitor']['state'])
+
+        with override_switch('dummy-monitor-fails', True):
+            response = self.client.get(url)
+
+            assert response.status_code >= 500
+            assert response.json()['dummy_monitor']['status'] == 'Dummy monitor failed'
+
     def test_services_heartbeat_success(self):
         response = self.client.get(reverse('amo.services_heartbeat'))
         assert response.status_code == 200
@@ -413,6 +427,19 @@ def test_services_heartbeat_failure(self):
         assert response.status_code >= 500
         assert response.json()['rabbitmq']['status'] == 'boom'
 
+    def test_services_heartbeat_dummy_monitor_failure(self):
+        url = reverse('amo.services_heartbeat')
+        response = self.client.get(url)
+
+        assert response.status_code == 200
+        self.assertTrue(response.json()['dummy_monitor']['state'])
+
+        with override_switch('dummy-monitor-fails', True):
+            response = self.client.get(url)
+
+            assert response.status_code >= 500
+            assert response.json()['dummy_monitor']['status'] == 'Dummy monitor failed'
+
 
 class TestCORS(TestCase):
     fixtures = ('base/addon_3615',)
diff --git a/src/olympia/amo/views.py b/src/olympia/amo/views.py
index 1f5a05b372c4..eeee3380068d 100644
--- a/src/olympia/amo/views.py
+++ b/src/olympia/amo/views.py
@@ -31,7 +31,7 @@
 
 
 def _exec_monitors(checks: list[str]):
-    status_summary = monitors.execute_checks(checks)
+    status_summary = monitors.execute_checks([*checks, 'dummy_monitor'])
     status_code = 200 if all(a['state'] for a in status_summary.values()) else 500
     return JsonResponse(status_summary, status=status_code)
 

From f865ffcccd7230ad5ec52da5b556ba7e9cedbf26 Mon Sep 17 00:00:00 2001
From: Kevin Meinhardt <kmeinhardt7@gmail.com>
Date: Thu, 20 Mar 2025 11:51:19 +0100
Subject: [PATCH 3/3] CI Completed should use [ci] env for all notifications

---
 .github/workflows/ci_completed.yml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci_completed.yml b/.github/workflows/ci_completed.yml
index fd7133639dd2..9f79f6943c41 100644
--- a/.github/workflows/ci_completed.yml
+++ b/.github/workflows/ci_completed.yml
@@ -30,6 +30,7 @@ jobs:
       conclusion: ${{ steps.ref.outputs.conclusion }}
       commit_short: ${{ steps.ref.outputs.commit_short }}
       emoji: ${{ steps.ref.outputs.emoji }}
+      env: ci
 
     steps:
       - uses: actions/checkout@v4
@@ -75,7 +76,7 @@ jobs:
           workflow_id: ${{ needs.context.outputs.workflow_id }}
           workflow_url: ${{ needs.context.outputs.workflow_url }}
           event: ${{ needs.context.outputs.event }}
-          env: dev
+          env: ${{ needs.context.outputs.env }}
           ref: ${{ format('{0} ({1}) {2}', needs.context.outputs.branch, needs.context.outputs.commit_short, needs.context.outputs.title) }}
           ref_link: ${{ format('{0}/commit/{1}', needs.context.outputs.repo_url, needs.context.outputs.sha) }}
 
@@ -96,6 +97,6 @@ jobs:
           workflow_id: ${{ needs.context.outputs.workflow_id }}
           workflow_url: ${{ needs.context.outputs.workflow_url }}
           event: ${{ needs.context.outputs.event }}
-          env: production
+          env: ${{ needs.context.outputs.env }}
           ref: ${{ format('{0}', needs.context.outputs.branch) }}
           ref_link: ${{ format('{0}/releases/tag/{1}', needs.context.outputs.repo_url, needs.context.outputs.branch) }}