From fbdbac2fca97056bd5da76835b0d83d870cdfe91 Mon Sep 17 00:00:00 2001 From: Kevin Meinhardt Date: Wed, 19 Mar 2025 13:32:11 +0100 Subject: [PATCH 1/3] Enhance health check workflows and scripts - Added output for fork detection in health_check_completed.yml. - Updated health_check.yml to ensure health check file existence before proceeding. - Refactored health_check.py to use a session with retries for HTTP requests, improving error handling. - Modified tests in test_health_check.py to validate new error handling and retry logic. --- .github/workflows/health_check.yml | 5 ++ .github/workflows/health_check_completed.yml | 14 +++- scripts/health_check.py | 54 ++++++------ tests/make/test_health_check.py | 86 +++++++++++++------- 4 files changed, 98 insertions(+), 61 deletions(-) diff --git a/.github/workflows/health_check.yml b/.github/workflows/health_check.yml index 36b826694d5c..36a662fff75d 100644 --- a/.github/workflows/health_check.yml +++ b/.github/workflows/health_check.yml @@ -42,6 +42,11 @@ jobs: if: steps.health_check.outcome == 'failure' shell: bash run: | + if [ ! -f ${{ env.health_check_file }} ]; then + echo "Health check file is missing from previous step" + exit 1 + fi + # Create the message blocks file ./scripts/health_check_blocks.py \ --input ${{ env.health_check_file }} \ diff --git a/.github/workflows/health_check_completed.yml b/.github/workflows/health_check_completed.yml index 87848cf500cc..6b42804eecdc 100644 --- a/.github/workflows/health_check_completed.yml +++ b/.github/workflows/health_check_completed.yml @@ -8,6 +8,10 @@ on: jobs: context: runs-on: ubuntu-latest + + outputs: + is_fork: ${{ steps.context.outputs.is_fork }} + steps: - uses: actions/checkout@v4 @@ -16,7 +20,9 @@ jobs: uses: ./.github/actions/context health_check_failure_notification: + needs: context if: | + needs.context.outputs.is_fork == 'false' && github.event.workflow_run.event == 'schedule' && github.event.workflow_run.conclusion == 'failure' runs-on: ubuntu-latest @@ -33,13 +39,13 @@ jobs: slack_token: ${{ secrets.SLACK_TOKEN }} slack_channel: ${{ secrets.SLACK_ADDONS_PRODUCTION_CHANNEL }} emoji: ':x:' - actor: ${{ vars.slack_actor }} + actor: ${{ vars.SLACK_ACTOR }} conclusion: ${{ github.event.workflow_run.conclusion }} workflow_id: ${{ github.event.workflow_run.id }} - workflow_url: ${{ github.event.workflow_run.url }} + workflow_url: ${{ github.event.workflow_run.html_url }} event: ${{ github.event.workflow_run.event }} env: ci - ref: '' - ref_link: '' + ref: ${{ github.event.workflow_run.repository.name }} + ref_link: ${{ github.event.workflow_run.html_url }} diff --git a/scripts/health_check.py b/scripts/health_check.py index 1e67d2038000..f051b59c52ca 100755 --- a/scripts/health_check.py +++ b/scripts/health_check.py @@ -4,8 +4,11 @@ import json import time from enum import Enum +from functools import cached_property -import requests +from requests import Session +from requests.adapters import HTTPAdapter +from urllib3.util import Retry ENV_ENUM = Enum( @@ -25,34 +28,29 @@ def __init__(self, env: ENV_ENUM, verbose: bool = False): self.environment = ENV_ENUM[env] self.verbose = verbose - def _fetch(self, path: str): - url = f'{self.environment.value}/{path}' - if self.verbose: - print(f'Requesting {url} for {self.environment.name}') - - data = None - # We return 500 if any of the monitors are failing. - # So instead of raising, we should try to form valid JSON - # and determine if we should raise later based on the json values. - try: - response = requests.get(url, allow_redirects=False) - data = response.json() - except (requests.exceptions.HTTPError, json.JSONDecodeError) as e: - if self.verbose: - print( - { - 'error': e, - 'data': data, - 'response': response, - } - ) - - if data is None: - return {} - + @cached_property + def client(self): + session = Session() + retries = Retry( + total=5, + backoff_factor=0.1, + status_forcelist=[502, 503, 504], + allowed_methods={'GET'}, + ) + adapter = HTTPAdapter(max_retries=retries) + session.mount(self.environment.value, adapter) + return session + + def log(self, *args): if self.verbose: - print(json.dumps(data, indent=2)) + print(*args) + def _fetch(self, path: str): + url = f'{self.environment.value}/{path}' + self.log(f'Requesting {url} for {self.environment.name}') + response = self.client.get(url, allow_redirects=False) + data = response.json() + self.log(json.dumps(data, indent=2)) return {'url': url, 'data': data} def version(self): @@ -65,7 +63,7 @@ def monitors(self): return self._fetch('services/__heartbeat__') -def main(env: ENV_ENUM, verbose: bool, retries: int = 0, attempt: int = 0): +def main(env: ENV_ENUM, verbose: bool = False, retries: int = 0, attempt: int = 0): fetcher = Fetcher(env, verbose) version_data = fetcher.version() diff --git a/tests/make/test_health_check.py b/tests/make/test_health_check.py index 69a162456c4c..857b4b33fb0a 100644 --- a/tests/make/test_health_check.py +++ b/tests/make/test_health_check.py @@ -1,5 +1,8 @@ +from json.decoder import JSONDecodeError from unittest import TestCase +from unittest.mock import call, patch +import requests import responses from scripts.health_check import main @@ -26,7 +29,7 @@ def test_basic(self): self.mock_url( 'services/__heartbeat__', status=200, json=self._monitor('two', True, '') ) - results, _ = main('local', False) + main('local') def test_missing_version(self): self.mock_url('__version__', status=500) @@ -35,8 +38,8 @@ def test_missing_version(self): 'services/__heartbeat__', status=200, json=self._monitor('two', True, '') ) - results, _ = main('local', False) - self.assertEqual(results['version'], {}) + with self.assertRaises(JSONDecodeError): + main('local') def test_invalid_version(self): self.mock_url('__version__', status=200, body='{not valid json') @@ -45,8 +48,8 @@ def test_invalid_version(self): 'services/__heartbeat__', status=200, json=self._monitor('two', True, '') ) - results, _ = main('local', False) - self.assertEqual(results['version'], {}) + with self.assertRaises(JSONDecodeError): + main('local') def test_missing_heartbeat(self): self.mock_url('__heartbeat__', status=500) @@ -55,8 +58,8 @@ def test_missing_heartbeat(self): 'services/__heartbeat__', status=200, json=self._monitor('two', True, '') ) - results, _ = main('local', False) - self.assertEqual(results['heartbeat'], {}) + with self.assertRaises(JSONDecodeError): + main('local') def test_failing_heartbeat(self): failing_monitor = self._monitor('fail', False, 'Service is down') @@ -65,25 +68,16 @@ def test_failing_heartbeat(self): self.mock_url('__version__', status=200, json={'version': '1.0.0'}) self.mock_url('services/__heartbeat__', status=200, json=success_monitor) - results, has_failures = main('local', False) + results, has_failures = main('local') self.assertTrue(has_failures) - # Check for failing monitors - failing_monitors = [] - for monitor_type, monitor_data in results.items(): - if monitor_type == 'version': - continue - for name, details in monitor_data.get('data', {}).items(): - if details.get('state') is False: - failing_monitors.append(f'{monitor_type}.{name}') - self.assertIn('heartbeat.fail', failing_monitors) def test_missing_monitors(self): self.mock_url('services/__heartbeat__', status=500) self.mock_url('__version__', status=200, json={'version': '1.0.0'}) self.mock_url('__heartbeat__', status=200, json=self._monitor('one', True, '')) - results, _ = main('local', False) - self.assertEqual(results['monitors'], {}) + with self.assertRaises(JSONDecodeError): + main('local') def test_failing_monitors(self): failing_monitor = self._monitor('fail', False, 'Service is down') @@ -92,14 +86,48 @@ def test_failing_monitors(self): self.mock_url('__version__', status=200, json={'version': '1.0.0'}) self.mock_url('__heartbeat__', status=200, json=success_monitor) - results, has_failures = main('local', False) + results, has_failures = main('local') self.assertTrue(has_failures) - # Check for failing monitors - failing_monitors = [] - for monitor_type, monitor_data in results.items(): - if monitor_type == 'version': - continue - for name, details in monitor_data.get('data', {}).items(): - if details.get('state') is False: - failing_monitors.append(f'{monitor_type}.{name}') - self.assertIn('monitors.fail', failing_monitors) + + def test_request_retries(self): + count = 0 + + def increment_count(request): + nonlocal count + count += 1 + return (503, {}, '') + + # Mock the get request to fail with a 503 status + self.mock_url('__version__', status=503) + self.mock_url('__heartbeat__', status=503) + self.mock_url('services/__heartbeat__', status=503) + + responses.add_callback( + responses.GET, + self._url('__version__'), + callback=increment_count, + ) + + with self.assertRaises(requests.exceptions.RequestException): + main('local', retries=0) + + assert count == 5 + + @patch('scripts.health_check.main', wraps=main) + def test_retry_failures(self, mock_main): + self.mock_url('__version__', status=200, json={'version': '1.0.0'}) + self.mock_url('__heartbeat__', status=200, json=self._monitor('one', True, '')) + self.mock_url( + 'services/__heartbeat__', + json=self._monitor('fail', False, 'Service is down'), + ) + + main('local', retries=2) + + # Should be called 3 times total - initial call plus 2 retries + self.assertEqual(mock_main.call_count, 2) + + # Verify retry attempts were made with incrementing attempt numbers + mock_main.assert_has_calls( + [call('local', False, 2, 1), call('local', False, 2, 2)] + ) From d9a287f92055f4844abfc30d20d6490ef32f204a Mon Sep 17 00:00:00 2001 From: Kevin Meinhardt Date: Wed, 19 Mar 2025 13:45:22 +0100 Subject: [PATCH 2/3] Add dummy waffle switch to test healthcheck e2e in any environment --- .github/workflows/health_check.yml | 7 +++++ .../amo/migrations/0002_auto_20250319_1240.py | 16 +++++++++++ src/olympia/amo/monitors.py | 7 +++++ src/olympia/amo/tests/test_views.py | 27 +++++++++++++++++++ src/olympia/amo/views.py | 2 +- 5 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 src/olympia/amo/migrations/0002_auto_20250319_1240.py diff --git a/.github/workflows/health_check.yml b/.github/workflows/health_check.yml index 36a662fff75d..176c6cf24e82 100644 --- a/.github/workflows/health_check.yml +++ b/.github/workflows/health_check.yml @@ -32,6 +32,13 @@ jobs: target: development version: local run: | + # On local, we want to ensure there are failures to test + # the later steps with so set the waffle switch to on. + if [ "${{ matrix.environment }}" = "local" ]; then + echo "Set waffle switch dummy-monitor-fails to true" + ./manage.py waffle_switch dummy-monitor-fails on + fi + ./scripts/health_check.py \ --env ${{ matrix.environment }} \ --verbose \ diff --git a/src/olympia/amo/migrations/0002_auto_20250319_1240.py b/src/olympia/amo/migrations/0002_auto_20250319_1240.py new file mode 100644 index 000000000000..4a58ab0a9ca1 --- /dev/null +++ b/src/olympia/amo/migrations/0002_auto_20250319_1240.py @@ -0,0 +1,16 @@ +# Generated by Django 4.2.20 on 2025-03-19 12:40 + +from django.db import migrations + +from olympia.core.db.migrations import CreateWaffleSwitch + + +class Migration(migrations.Migration): + + dependencies = [ + ('amo', '0001_initial'), + ] + + operations = [ + CreateWaffleSwitch('dummy-monitor-fails'), + ] diff --git a/src/olympia/amo/monitors.py b/src/olympia/amo/monitors.py index 8b74e161855e..718d561e2549 100644 --- a/src/olympia/amo/monitors.py +++ b/src/olympia/amo/monitors.py @@ -9,6 +9,7 @@ import celery import MySQLdb import requests +import waffle from django_statsd.clients import statsd from kombu import Connection from PIL import Image @@ -34,6 +35,12 @@ def execute_checks(checks: list[str], verbose: bool = False): return status_summary +def dummy_monitor(): + if waffle.switch_is_active('dummy-monitor-fails'): + return 'Dummy monitor failed', None + return '', None + + def localdev_web(): """ Used in local development environments to determine if the web container diff --git a/src/olympia/amo/tests/test_views.py b/src/olympia/amo/tests/test_views.py index 3b15a4000f93..98e623299b5d 100644 --- a/src/olympia/amo/tests/test_views.py +++ b/src/olympia/amo/tests/test_views.py @@ -16,6 +16,7 @@ import pytest from lxml import etree from pyquery import PyQuery as pq +from waffle.testutils import override_switch from olympia import amo, core from olympia.access import acl @@ -401,6 +402,19 @@ def test_front_heartbeat_failure(self): assert response.status_code >= 500 assert response.json()['database']['status'] == 'boom' + def test_front_heartbeat_dummy_monitor_failure(self): + url = reverse('amo.front_heartbeat') + response = self.client.get(url) + + assert response.status_code == 200 + self.assertTrue(response.json()['dummy_monitor']['state']) + + with override_switch('dummy-monitor-fails', True): + response = self.client.get(url) + + assert response.status_code >= 500 + assert response.json()['dummy_monitor']['status'] == 'Dummy monitor failed' + def test_services_heartbeat_success(self): response = self.client.get(reverse('amo.services_heartbeat')) assert response.status_code == 200 @@ -413,6 +427,19 @@ def test_services_heartbeat_failure(self): assert response.status_code >= 500 assert response.json()['rabbitmq']['status'] == 'boom' + def test_services_heartbeat_dummy_monitor_failure(self): + url = reverse('amo.services_heartbeat') + response = self.client.get(url) + + assert response.status_code == 200 + self.assertTrue(response.json()['dummy_monitor']['state']) + + with override_switch('dummy-monitor-fails', True): + response = self.client.get(url) + + assert response.status_code >= 500 + assert response.json()['dummy_monitor']['status'] == 'Dummy monitor failed' + class TestCORS(TestCase): fixtures = ('base/addon_3615',) diff --git a/src/olympia/amo/views.py b/src/olympia/amo/views.py index 1f5a05b372c4..eeee3380068d 100644 --- a/src/olympia/amo/views.py +++ b/src/olympia/amo/views.py @@ -31,7 +31,7 @@ def _exec_monitors(checks: list[str]): - status_summary = monitors.execute_checks(checks) + status_summary = monitors.execute_checks([*checks, 'dummy_monitor']) status_code = 200 if all(a['state'] for a in status_summary.values()) else 500 return JsonResponse(status_summary, status=status_code) From f865ffcccd7230ad5ec52da5b556ba7e9cedbf26 Mon Sep 17 00:00:00 2001 From: Kevin Meinhardt Date: Thu, 20 Mar 2025 11:51:19 +0100 Subject: [PATCH 3/3] CI Completed should use [ci] env for all notifications --- .github/workflows/ci_completed.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci_completed.yml b/.github/workflows/ci_completed.yml index fd7133639dd2..9f79f6943c41 100644 --- a/.github/workflows/ci_completed.yml +++ b/.github/workflows/ci_completed.yml @@ -30,6 +30,7 @@ jobs: conclusion: ${{ steps.ref.outputs.conclusion }} commit_short: ${{ steps.ref.outputs.commit_short }} emoji: ${{ steps.ref.outputs.emoji }} + env: ci steps: - uses: actions/checkout@v4 @@ -75,7 +76,7 @@ jobs: workflow_id: ${{ needs.context.outputs.workflow_id }} workflow_url: ${{ needs.context.outputs.workflow_url }} event: ${{ needs.context.outputs.event }} - env: dev + env: ${{ needs.context.outputs.env }} ref: ${{ format('{0} ({1}) {2}', needs.context.outputs.branch, needs.context.outputs.commit_short, needs.context.outputs.title) }} ref_link: ${{ format('{0}/commit/{1}', needs.context.outputs.repo_url, needs.context.outputs.sha) }} @@ -96,6 +97,6 @@ jobs: workflow_id: ${{ needs.context.outputs.workflow_id }} workflow_url: ${{ needs.context.outputs.workflow_url }} event: ${{ needs.context.outputs.event }} - env: production + env: ${{ needs.context.outputs.env }} ref: ${{ format('{0}', needs.context.outputs.branch) }} ref_link: ${{ format('{0}/releases/tag/{1}', needs.context.outputs.repo_url, needs.context.outputs.branch) }}