Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add GitHub Actions health check workflow #23036

Merged
merged 7 commits into from
Mar 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 11 additions & 3 deletions .github/actions/run-docker/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ inputs:
logs:
description: 'Show logs'
required: false
data_backup_skip:
skip_data_seed:
description: 'Skip data seeding'
required: false
default: 'true'
Expand All @@ -32,6 +32,8 @@ runs:
using: 'composite'
steps:
- name: Run Docker Container
id: run
continue-on-error: true
shell: bash
run: |
# Start the specified services
Expand All @@ -41,7 +43,7 @@ runs:
DOCKER_TARGET="${{ inputs.target }}" \
OLYMPIA_UID="$(id -u)" \
OLYMPIA_DEPS="${{ inputs.deps }}" \
DATA_BACKUP_SKIP="${{ inputs.data_backup_skip }}" \
SKIP_DATA_SEED="${{ inputs.skip_data_seed }}" \
DOCKER_WAIT="true"


Expand All @@ -54,4 +56,10 @@ runs:
- name: Logs
if: ${{ inputs.logs }}
shell: bash
run: docker compose logs
run: |
docker compose logs

# If the run command failed, exit with a non-zero exit code
if [ ${{ steps.run.outcome }} != 'success' ]; then
exit 1
fi
1 change: 0 additions & 1 deletion .github/workflows/_test_check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -160,5 +160,4 @@ jobs:
- uses: ./.github/actions/run-docker
with:
version: ${{ inputs.version }}
data_backup_skip: false
run: echo true
30 changes: 30 additions & 0 deletions .github/workflows/health_check.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
name: Health Check

on:
# Run the workflow test on push events
push:
# Run the main workflow on workflow_dispatch or schedule
workflow_dispatch:
schedule:
# Every 5 minutes
- cron: '*/5 * * * *'

jobs:
health_check:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
# On push, only check the local environment; on schedule/dispatch,
# check the deployed dev/stage/prod environments.
environment: ${{fromJson(github.event_name == 'push' && '["local"]' || '["dev","stage","prod"]')}}

steps:
- uses: actions/checkout@v4

- uses: ./.github/actions/run-docker
with:
target: development
version: local
# Probes __version__, __heartbeat__ and services/__heartbeat__
# for the selected environment via scripts/health_check.py.
run: ./scripts/health_check.py --env ${{ matrix.environment }} --verbose



2 changes: 1 addition & 1 deletion Makefile-os
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ DOCKER_WAIT ?=
export DOCKER_COMMIT ?=
export DOCKER_BUILD ?=
export DOCKER_VERSION ?=
export DATA_BACKUP_SKIP ?=
export SKIP_DATA_SEED ?=
override DOCKER_MYSQLD_VOLUME = addons-server_data_mysqld

export FXA_CLIENT_ID ?=
Expand Down
2 changes: 1 addition & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ x-env-mapping: &env
- HISTCONTROL=erasedups
- ENV=local
- CIRCLECI
- DATA_BACKUP_SKIP
- SKIP_DATA_SEED
- FXA_CLIENT_ID
- FXA_CLIENT_SECRET

Expand Down
2 changes: 1 addition & 1 deletion docker/nginx/addons.conf
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ server {
try_files $uri @olympia;
}

location /__version__ {
location ~ ^/(__version__|__heartbeat__)(\?.*)?$ {
try_files $uri @olympia;
}

Expand Down
110 changes: 110 additions & 0 deletions scripts/health_check.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
#!/usr/bin/env python3

import argparse
import json
import time
from enum import Enum

import requests


# Map each environment name to the base URL its health endpoints live under.
ENV_ENUM = Enum(
    'ENV',
    {
        'dev': 'https://addons-dev.allizom.org',
        'stage': 'https://addons.allizom.org',
        'prod': 'https://addons.mozilla.org',
        # Local runs target the nginx container as set in docker-compose.yml.
        'local': 'http://nginx',
    },
)


class Fetcher:
    """Fetch the JSON health endpoints of an addons-server environment.

    Exposes one method per endpoint: ``version`` (``__version__``),
    ``heartbeat`` (``__heartbeat__``) and ``monitors``
    (``services/__heartbeat__``).
    """

    def __init__(self, env: ENV_ENUM, verbose: bool = False):
        # `env` is the member *name* ('dev', 'stage', 'prod' or 'local'),
        # resolved to the corresponding base URL via the enum.
        self.environment = ENV_ENUM[env]
        self.verbose = verbose

    def _fetch(self, path: str) -> dict[str, str] | None:
        """GET ``<base>/<path>`` and return the parsed JSON body.

        Returns None when the response body is not valid JSON (or an
        HTTPError is raised); network-level errors such as connection
        failures propagate to the caller, which retries.
        """
        url = f'{self.environment.value}/{path}'
        if self.verbose:
            print(f'Requesting {url} for {self.environment.name}')

        data = None
        # Keep `response` defined so the except block below cannot hit
        # an UnboundLocalError if requests.get() itself raises.
        response = None
        # We return 500 if any of the monitors are failing.
        # So instead of raising, we should try to form valid JSON
        # and determine if we should raise later based on the json values.
        try:
            # A timeout prevents the health check from hanging forever
            # on an unresponsive host; a hung check is worse than a failed one.
            response = requests.get(url, allow_redirects=False, timeout=30)
            data = response.json()
        except (requests.exceptions.HTTPError, json.JSONDecodeError) as e:
            if self.verbose:
                print(
                    {
                        'error': e,
                        'data': data,
                        'response': response,
                    }
                )

        if self.verbose and data is not None:
            print(json.dumps(data, indent=2))

        return data

    def version(self):
        """Return the deployed version info from ``__version__``."""
        return self._fetch('__version__')

    def heartbeat(self):
        """Return the service heartbeat states from ``__heartbeat__``."""
        return self._fetch('__heartbeat__')

    def monitors(self):
        """Return the extended monitor states from ``services/__heartbeat__``."""
        return self._fetch('services/__heartbeat__')


def main(env: ENV_ENUM, verbose: bool = False):
    """Run all health checks for `env`, raising ValueError on any failure."""
    fetcher = Fetcher(env, verbose)

    # Fetch every endpoint up front; insertion order fixes which
    # missing-data error is reported first.
    responses = {
        'version': fetcher.version(),
        'heartbeat': fetcher.heartbeat(),
        'monitors': fetcher.monitors(),
    }

    for endpoint, payload in responses.items():
        if payload is None:
            raise ValueError(f'Error fetching {endpoint} data')

    # Merge both heartbeat payloads and collect every monitor whose
    # state flag is explicitly False.
    merged = {**responses['heartbeat'], **responses['monitors']}
    failing_monitors = []
    for name, monitor in merged.items():
        if monitor['state'] is False:
            failing_monitors.append(name)

    if failing_monitors:
        raise ValueError(f'Some monitors are failing {failing_monitors}')


if __name__ == '__main__':
    # Distinct names for the parser and the parsed namespace
    # (the original reused `args` for both).
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--env', type=str, choices=list(ENV_ENUM.__members__.keys()), required=True
    )
    parser.add_argument('--verbose', action='store_true')
    parser.add_argument('--retries', type=int, default=3)
    parsed = parser.parse_args()

    # Retry with exponential backoff (2, 4, 8, ... seconds);
    # re-raise on the final attempt so the process exits non-zero.
    for attempt in range(1, parsed.retries + 1):
        try:
            main(parsed.env, parsed.verbose)
            break
        except Exception as e:
            print(f'Error: {e}')
            if attempt == parsed.retries:
                raise
            time.sleep(2**attempt)
4 changes: 3 additions & 1 deletion settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
DBBACKUP_CONNECTOR_MAPPING = {
'olympia.core.db.mysql': 'dbbackup.db.mysql.MysqlDumpConnector',
}
DATA_BACKUP_SKIP = os.environ.get('DATA_BACKUP_SKIP', False)
SKIP_DATA_SEED = os.environ.get('SKIP_DATA_SEED', False)

# Override logging config to enable DEBUG logs for (almost) everything.
LOGGING['root']['level'] = logging.DEBUG
Expand Down Expand Up @@ -196,3 +196,5 @@ def insert_debug_toolbar_middleware(middlewares):
'static_url_prefix': 'bundle',
}
}

MEMCACHE_MIN_SERVER_COUNT = 1
6 changes: 5 additions & 1 deletion src/olympia/amo/management/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,11 @@ def handle(self, *args, **options):
'test': '',
'uploads': '',
},
'uploads': '',
'uploads': {
'addon_icons': '',
'previews': '',
'userpics': '',
},
},
}

Expand Down
51 changes: 26 additions & 25 deletions src/olympia/amo/management/commands/initialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,32 +46,33 @@ def handle(self, *args, **options):
# Always ensure "olympia" database exists and is accessible.
call_command('monitors', services=['olympia_database', 'elastic'])

# If we are not skipping data backup
# then run the logic to ensure the DB is ready.
if not settings.DATA_BACKUP_SKIP:
# If DB empty or we are explicitly cleaning, then bail with data_seed.
if options.get('clean') or not self.local_admin_exists():
call_command('data_seed')
# Otherwise, we're working with a pre-existing DB.
if (
# If we are not skipping data seeding
not settings.SKIP_DATA_SEED
# and we are either explicitly cleaning or loading a fresh db
and (options.get('clean') or not self.local_admin_exists())
):
call_command('data_seed')
# Otherwise, we're working with a pre-existing DB.
else:
load = options.get('load')
# We always migrate the DB.
logging.info('Migrating...')
call_command('migrate', '--noinput')

# If we specify a specific backup, simply load that.
if load:
call_command('data_load', '--name', load)
# We should reindex even if no data is loaded/modified
# because we might have a fresh instance of elasticsearch
else:
load = options.get('load')
# We always migrate the DB.
logging.info('Migrating...')
call_command('migrate', '--noinput')

# If we specify a specific backup, simply load that.
if load:
call_command('data_load', '--name', load)
# We should reindex even if no data is loaded/modified
# because we might have a fresh instance of elasticsearch
else:
call_command(
'reindex', '--wipe', '--force', '--noinput', '--skip-if-exists'
)

# By now, we expect the database to exist, and to be migrated
# so our database tables should be accessible
call_command('monitors', services=['database'])
call_command(
'reindex', '--wipe', '--force', '--noinput', '--skip-if-exists'
)

# By now, we expect the database to exist, and to be migrated
# so our database tables should be accessible
call_command('monitors', services=['database'])

# Ensure that the storage directories exist.
self.make_storage(clean=False)
Expand Down
34 changes: 22 additions & 12 deletions src/olympia/amo/management/commands/monitors.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,29 +22,39 @@ def add_arguments(self, parser):
default=5,
help='Number of attempts to check the services',
)
parser.add_argument(
'--verbose',
action='store_true',
help='Verbose output',
)

def handle(self, *args, **options):
attempts = options.get('attempts')
services = options.get('services')

verbose = options.get('verbose')
self.logger.info(f'services: {services}')

if not services:
raise CommandError('No services specified')

failing_services = services.copy()

current = 0

services_to_check = set(services.copy())

while current < attempts:
current += 1
self.logger.info(f'Checking services {services} for the {current} time')
status_summary = monitors.execute_checks(services)
failing_services = [
service
for service, result in status_summary.items()
if result['state'] is False
]
self.logger.info(
f'Checking services {services_to_check} for the {current} time'
)
status_summary = monitors.execute_checks(list(services_to_check), verbose)

failing_services = {}

for service, result in status_summary.items():
if result['state'] is True and service in services_to_check:
services_to_check.remove(service)
else:
failing_services[service] = result

if len(failing_services) > 0:
self.logger.error('Some services are failing: %s', failing_services)
Expand All @@ -54,7 +64,7 @@ def handle(self, *args, **options):
else:
break

if len(failing_services) > 0:
raise CommandError(f'Some services are failing: {failing_services}')
if len(services_to_check) > 0:
raise CommandError(f'Some services are failing: {list(services_to_check)}')
else:
self.logger.info(f'All services are healthy {services}')
Loading
Loading