Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhance health check workflows and scripts #23190

Merged
merged 3 commits into from
Mar 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions .github/workflows/ci_completed.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ jobs:
conclusion: ${{ steps.ref.outputs.conclusion }}
commit_short: ${{ steps.ref.outputs.commit_short }}
emoji: ${{ steps.ref.outputs.emoji }}
env: ci

steps:
- uses: actions/checkout@v4
Expand Down Expand Up @@ -75,7 +76,7 @@ jobs:
workflow_id: ${{ needs.context.outputs.workflow_id }}
workflow_url: ${{ needs.context.outputs.workflow_url }}
event: ${{ needs.context.outputs.event }}
env: dev
env: ${{ needs.context.outputs.env }}
ref: ${{ format('{0} ({1}) {2}', needs.context.outputs.branch, needs.context.outputs.commit_short, needs.context.outputs.title) }}
ref_link: ${{ format('{0}/commit/{1}', needs.context.outputs.repo_url, needs.context.outputs.sha) }}

Expand All @@ -96,6 +97,6 @@ jobs:
workflow_id: ${{ needs.context.outputs.workflow_id }}
workflow_url: ${{ needs.context.outputs.workflow_url }}
event: ${{ needs.context.outputs.event }}
env: production
env: ${{ needs.context.outputs.env }}
ref: ${{ format('{0}', needs.context.outputs.branch) }}
ref_link: ${{ format('{0}/releases/tag/{1}', needs.context.outputs.repo_url, needs.context.outputs.branch) }}
12 changes: 12 additions & 0 deletions .github/workflows/health_check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,13 @@ jobs:
target: development
version: local
run: |
# On local, we want to ensure there are failures to test
# the later steps with so set the waffle switch to on.
if [ "${{ matrix.environment }}" = "local" ]; then
echo "Set waffle switch dummy-monitor-fails to true"
./manage.py waffle_switch dummy-monitor-fails on
fi

./scripts/health_check.py \
--env ${{ matrix.environment }} \
--verbose \
Expand All @@ -42,6 +49,11 @@ jobs:
if: steps.health_check.outcome == 'failure'
shell: bash
run: |
if [ ! -f ${{ env.health_check_file }} ]; then
echo "Health check file is missing from previous step"
exit 1
fi

# Create the message blocks file
./scripts/health_check_blocks.py \
--input ${{ env.health_check_file }} \
Expand Down
14 changes: 10 additions & 4 deletions .github/workflows/health_check_completed.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@ on:
jobs:
context:
runs-on: ubuntu-latest

outputs:
is_fork: ${{ steps.context.outputs.is_fork }}

steps:
- uses: actions/checkout@v4

Expand All @@ -16,7 +20,9 @@ jobs:
uses: ./.github/actions/context

health_check_failure_notification:
needs: context
if: |
needs.context.outputs.is_fork == 'false' &&
github.event.workflow_run.event == 'schedule' &&
github.event.workflow_run.conclusion == 'failure'
runs-on: ubuntu-latest
Expand All @@ -33,13 +39,13 @@ jobs:
slack_token: ${{ secrets.SLACK_TOKEN }}
slack_channel: ${{ secrets.SLACK_ADDONS_PRODUCTION_CHANNEL }}
emoji: ':x:'
actor: ${{ vars.slack_actor }}
actor: ${{ vars.SLACK_ACTOR }}
conclusion: ${{ github.event.workflow_run.conclusion }}
workflow_id: ${{ github.event.workflow_run.id }}
workflow_url: ${{ github.event.workflow_run.url }}
workflow_url: ${{ github.event.workflow_run.html_url }}
event: ${{ github.event.workflow_run.event }}
env: ci
ref: ''
ref_link: ''
ref: ${{ github.event.workflow_run.repository.name }}
ref_link: ${{ github.event.workflow_run.html_url }}


54 changes: 26 additions & 28 deletions scripts/health_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,11 @@
import json
import time
from enum import Enum
from functools import cached_property

import requests
from requests import Session
from requests.adapters import HTTPAdapter
from urllib3.util import Retry


ENV_ENUM = Enum(
Expand All @@ -25,34 +28,29 @@ def __init__(self, env: ENV_ENUM, verbose: bool = False):
self.environment = ENV_ENUM[env]
self.verbose = verbose

def _fetch(self, path: str):
url = f'{self.environment.value}/{path}'
if self.verbose:
print(f'Requesting {url} for {self.environment.name}')

data = None
# We return 500 if any of the monitors are failing.
# So instead of raising, we should try to form valid JSON
# and determine if we should raise later based on the json values.
try:
response = requests.get(url, allow_redirects=False)
data = response.json()
except (requests.exceptions.HTTPError, json.JSONDecodeError) as e:
if self.verbose:
print(
{
'error': e,
'data': data,
'response': response,
}
)

if data is None:
return {}

@cached_property
def client(self):
session = Session()
retries = Retry(
total=5,
backoff_factor=0.1,
status_forcelist=[502, 503, 504],
allowed_methods={'GET'},
)
adapter = HTTPAdapter(max_retries=retries)
session.mount(self.environment.value, adapter)
return session

def log(self, *args):
if self.verbose:
print(json.dumps(data, indent=2))
print(*args)

def _fetch(self, path: str):
url = f'{self.environment.value}/{path}'
self.log(f'Requesting {url} for {self.environment.name}')
response = self.client.get(url, allow_redirects=False)
data = response.json()
self.log(json.dumps(data, indent=2))
return {'url': url, 'data': data}

def version(self):
Expand All @@ -65,7 +63,7 @@ def monitors(self):
return self._fetch('services/__heartbeat__')


def main(env: ENV_ENUM, verbose: bool, retries: int = 0, attempt: int = 0):
def main(env: ENV_ENUM, verbose: bool = False, retries: int = 0, attempt: int = 0):
fetcher = Fetcher(env, verbose)

version_data = fetcher.version()
Expand Down
16 changes: 16 additions & 0 deletions src/olympia/amo/migrations/0002_auto_20250319_1240.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Generated by Django 4.2.20 on 2025-03-19 12:40

from django.db import migrations

from olympia.core.db.migrations import CreateWaffleSwitch


class Migration(migrations.Migration):

dependencies = [
('amo', '0001_initial'),
]

operations = [
CreateWaffleSwitch('dummy-monitor-fails'),
]
7 changes: 7 additions & 0 deletions src/olympia/amo/monitors.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import celery
import MySQLdb
import requests
import waffle
from django_statsd.clients import statsd
from kombu import Connection
from PIL import Image
Expand All @@ -34,6 +35,12 @@ def execute_checks(checks: list[str], verbose: bool = False):
return status_summary


def dummy_monitor():
if waffle.switch_is_active('dummy-monitor-fails'):
return 'Dummy monitor failed', None
return '', None


def localdev_web():
"""
Used in local development environments to determine if the web container
Expand Down
27 changes: 27 additions & 0 deletions src/olympia/amo/tests/test_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import pytest
from lxml import etree
from pyquery import PyQuery as pq
from waffle.testutils import override_switch

from olympia import amo, core
from olympia.access import acl
Expand Down Expand Up @@ -401,6 +402,19 @@ def test_front_heartbeat_failure(self):
assert response.status_code >= 500
assert response.json()['database']['status'] == 'boom'

def test_front_heartbeat_dummy_monitor_failure(self):
url = reverse('amo.front_heartbeat')
response = self.client.get(url)

assert response.status_code == 200
self.assertTrue(response.json()['dummy_monitor']['state'])

with override_switch('dummy-monitor-fails', True):
response = self.client.get(url)

assert response.status_code >= 500
assert response.json()['dummy_monitor']['status'] == 'Dummy monitor failed'

def test_services_heartbeat_success(self):
response = self.client.get(reverse('amo.services_heartbeat'))
assert response.status_code == 200
Expand All @@ -413,6 +427,19 @@ def test_services_heartbeat_failure(self):
assert response.status_code >= 500
assert response.json()['rabbitmq']['status'] == 'boom'

def test_services_heartbeat_dummy_monitor_failure(self):
url = reverse('amo.services_heartbeat')
response = self.client.get(url)

assert response.status_code == 200
self.assertTrue(response.json()['dummy_monitor']['state'])

with override_switch('dummy-monitor-fails', True):
response = self.client.get(url)

assert response.status_code >= 500
assert response.json()['dummy_monitor']['status'] == 'Dummy monitor failed'


class TestCORS(TestCase):
fixtures = ('base/addon_3615',)
Expand Down
2 changes: 1 addition & 1 deletion src/olympia/amo/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@


def _exec_monitors(checks: list[str]):
status_summary = monitors.execute_checks(checks)
status_summary = monitors.execute_checks([*checks, 'dummy_monitor'])
status_code = 200 if all(a['state'] for a in status_summary.values()) else 500
return JsonResponse(status_summary, status=status_code)

Expand Down
Loading
Loading