Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add GitHub Actions health check workflow #23036

Merged
merged 7 commits into from
Mar 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 11 additions & 3 deletions .github/actions/run-docker/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ inputs:
logs:
description: 'Show logs'
required: false
data_backup_skip:
skip_data_seed:
description: 'Skip data seeding'
required: false
default: 'true'
Expand All @@ -32,6 +32,8 @@ runs:
using: 'composite'
steps:
- name: Run Docker Container
id: run
continue-on-error: true
shell: bash
run: |
# Start the specified services
Expand All @@ -41,7 +43,7 @@ runs:
DOCKER_TARGET="${{ inputs.target }}" \
OLYMPIA_UID="$(id -u)" \
OLYMPIA_DEPS="${{ inputs.deps }}" \
DATA_BACKUP_SKIP="${{ inputs.data_backup_skip }}" \
SKIP_DATA_SEED="${{ inputs.skip_data_seed }}" \
DOCKER_WAIT="true"


Expand All @@ -54,4 +56,10 @@ runs:
- name: Logs
if: ${{ inputs.logs }}
shell: bash
run: docker compose logs
run: |
docker compose logs

# If the run command failed, exit with a non-zero exit code
if [ ${{ steps.run.outcome }} != 'success' ]; then
exit 1
fi
1 change: 0 additions & 1 deletion .github/workflows/_test_check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -160,5 +160,4 @@ jobs:
- uses: ./.github/actions/run-docker
with:
version: ${{ inputs.version }}
data_backup_skip: false
run: echo true
30 changes: 30 additions & 0 deletions .github/workflows/health_check.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
name: Health Check

on:
# Run the workflow test on push events
push:
# Run the main workflow on workflow_dispatch or schedule
workflow_dispatch:
schedule:
# Every 5 minutes
- cron: '*/5 * * * *'

jobs:
health_check:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
# On push, only check the local environment; on schedule/dispatch,
# check the deployed dev/stage/prod environments.
environment: ${{fromJson(github.event_name == 'push' && '["local"]' || '["dev","stage","prod"]')}}

steps:
- uses: actions/checkout@v4

- uses: ./.github/actions/run-docker
with:
target: development
version: local
# Probes __version__, __heartbeat__ and services/__heartbeat__
# for the selected environment via scripts/health_check.py.
run: ./scripts/health_check.py --env ${{ matrix.environment }} --verbose



2 changes: 1 addition & 1 deletion Makefile-os
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ DOCKER_WAIT ?=
export DOCKER_COMMIT ?=
export DOCKER_BUILD ?=
export DOCKER_VERSION ?=
export DATA_BACKUP_SKIP ?=
export SKIP_DATA_SEED ?=
override DOCKER_MYSQLD_VOLUME = addons-server_data_mysqld

export FXA_CLIENT_ID ?=
Expand Down
2 changes: 1 addition & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ x-env-mapping: &env
- HISTCONTROL=erasedups
- ENV=local
- CIRCLECI
- DATA_BACKUP_SKIP
- SKIP_DATA_SEED
- FXA_CLIENT_ID
- FXA_CLIENT_SECRET

Expand Down
2 changes: 1 addition & 1 deletion docker/nginx/addons.conf
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ server {
try_files $uri @olympia;
}

location /__version__ {
location ~ ^/(__version__|__heartbeat__)(\?.*)?$ {
try_files $uri @olympia;
}

Expand Down
110 changes: 110 additions & 0 deletions scripts/health_check.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
#!/usr/bin/env python3

import argparse
import json
import time
from enum import Enum

import requests


# Map each environment name to the base URL its health endpoints live under.
ENV_ENUM = Enum(
    'ENV',
    {
        'dev': 'https://addons-dev.allizom.org',
        'stage': 'https://addons.allizom.org',
        'prod': 'https://addons.mozilla.org',
        # Local runs target the nginx container as set in docker-compose.yml.
        'local': 'http://nginx',
    },
)


class Fetcher:
    """Fetch the JSON health endpoints of an addons-server environment.

    Exposes one method per endpoint: ``version`` (``__version__``),
    ``heartbeat`` (``__heartbeat__``) and ``monitors``
    (``services/__heartbeat__``).
    """

    def __init__(self, env: ENV_ENUM, verbose: bool = False):
        # `env` is the member *name* ('dev', 'stage', 'prod' or 'local'),
        # resolved to the corresponding base URL via the enum.
        self.environment = ENV_ENUM[env]
        self.verbose = verbose

    def _fetch(self, path: str) -> dict[str, str] | None:
        """GET ``<base>/<path>`` and return the parsed JSON body.

        Returns None when the response body is not valid JSON (or an
        HTTPError is raised); network-level errors such as connection
        failures propagate to the caller, which retries.
        """
        url = f'{self.environment.value}/{path}'
        if self.verbose:
            print(f'Requesting {url} for {self.environment.name}')

        data = None
        # Keep `response` defined so the except block below cannot hit
        # an UnboundLocalError if requests.get() itself raises.
        response = None
        # We return 500 if any of the monitors are failing.
        # So instead of raising, we should try to form valid JSON
        # and determine if we should raise later based on the json values.
        try:
            # A timeout prevents the health check from hanging forever
            # on an unresponsive host; a hung check is worse than a failed one.
            response = requests.get(url, allow_redirects=False, timeout=30)
            data = response.json()
        except (requests.exceptions.HTTPError, json.JSONDecodeError) as e:
            if self.verbose:
                print(
                    {
                        'error': e,
                        'data': data,
                        'response': response,
                    }
                )

        if self.verbose and data is not None:
            print(json.dumps(data, indent=2))

        return data

    def version(self):
        """Return the deployed version info from ``__version__``."""
        return self._fetch('__version__')

    def heartbeat(self):
        """Return the service heartbeat states from ``__heartbeat__``."""
        return self._fetch('__heartbeat__')

    def monitors(self):
        """Return the extended monitor states from ``services/__heartbeat__``."""
        return self._fetch('services/__heartbeat__')


def main(env: ENV_ENUM, verbose: bool = False):
    """Run all health checks for `env`, raising ValueError on any failure."""
    fetcher = Fetcher(env, verbose)

    # Fetch every endpoint up front; insertion order fixes which
    # missing-data error is reported first.
    responses = {
        'version': fetcher.version(),
        'heartbeat': fetcher.heartbeat(),
        'monitors': fetcher.monitors(),
    }

    for endpoint, payload in responses.items():
        if payload is None:
            raise ValueError(f'Error fetching {endpoint} data')

    # Merge both heartbeat payloads and collect every monitor whose
    # state flag is explicitly False.
    merged = {**responses['heartbeat'], **responses['monitors']}
    failing_monitors = []
    for name, monitor in merged.items():
        if monitor['state'] is False:
            failing_monitors.append(name)

    if failing_monitors:
        raise ValueError(f'Some monitors are failing {failing_monitors}')


if __name__ == '__main__':
    # Distinct names for the parser and the parsed namespace
    # (the original reused `args` for both).
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--env', type=str, choices=list(ENV_ENUM.__members__.keys()), required=True
    )
    parser.add_argument('--verbose', action='store_true')
    parser.add_argument('--retries', type=int, default=3)
    parsed = parser.parse_args()

    # Retry with exponential backoff (2, 4, 8, ... seconds);
    # re-raise on the final attempt so the process exits non-zero.
    for attempt in range(1, parsed.retries + 1):
        try:
            main(parsed.env, parsed.verbose)
            break
        except Exception as e:
            print(f'Error: {e}')
            if attempt == parsed.retries:
                raise
            time.sleep(2**attempt)
4 changes: 3 additions & 1 deletion settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
DBBACKUP_CONNECTOR_MAPPING = {
'olympia.core.db.mysql': 'dbbackup.db.mysql.MysqlDumpConnector',
}
DATA_BACKUP_SKIP = os.environ.get('DATA_BACKUP_SKIP', False)
SKIP_DATA_SEED = os.environ.get('SKIP_DATA_SEED', False)

# Override logging config to enable DEBUG logs for (almost) everything.
LOGGING['root']['level'] = logging.DEBUG
Expand Down Expand Up @@ -196,3 +196,5 @@ def insert_debug_toolbar_middleware(middlewares):
'static_url_prefix': 'bundle',
}
}

MEMCACHE_MIN_SERVER_COUNT = 1
6 changes: 5 additions & 1 deletion src/olympia/amo/management/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,11 @@ def handle(self, *args, **options):
'test': '',
'uploads': '',
},
'uploads': '',
'uploads': {
'addon_icons': '',
'previews': '',
'userpics': '',
},
},
}

Expand Down
51 changes: 26 additions & 25 deletions src/olympia/amo/management/commands/initialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,32 +46,33 @@ def handle(self, *args, **options):
# Always ensure "olympia" database exists and is accessible.
call_command('monitors', services=['olympia_database', 'elastic'])

# If we are not skipping data backup
# then run the logic to ensure the DB is ready.
if not settings.DATA_BACKUP_SKIP:
# If DB empty or we are explicitly cleaning, then bail with data_seed.
if options.get('clean') or not self.local_admin_exists():
call_command('data_seed')
# Otherwise, we're working with a pre-existing DB.
if (
# If we are not skipping data seeding
not settings.SKIP_DATA_SEED
# and we are either explicitly cleaning or loading a fresh db
and (options.get('clean') or not self.local_admin_exists())
):
call_command('data_seed')
# Otherwise, we're working with a pre-existing DB.
else:
load = options.get('load')
# We always migrate the DB.
logging.info('Migrating...')
call_command('migrate', '--noinput')

# If we specify a specific backup, simply load that.
if load:
call_command('data_load', '--name', load)
# We should reindex even if no data is loaded/modified
# because we might have a fresh instance of elasticsearch
else:
load = options.get('load')
# We always migrate the DB.
logging.info('Migrating...')
call_command('migrate', '--noinput')

# If we specify a specific backup, simply load that.
if load:
call_command('data_load', '--name', load)
# We should reindex even if no data is loaded/modified
# because we might have a fresh instance of elasticsearch
else:
call_command(
'reindex', '--wipe', '--force', '--noinput', '--skip-if-exists'
)

# By now, we expect the database to exist, and to be migrated
# so our database tables should be accessible
call_command('monitors', services=['database'])
call_command(
'reindex', '--wipe', '--force', '--noinput', '--skip-if-exists'
)

# By now, we expect the database to exist, and to be migrated
# so our database tables should be accessible
call_command('monitors', services=['database'])

# Ensure that the storage directories exist.
self.make_storage(clean=False)
Expand Down
34 changes: 22 additions & 12 deletions src/olympia/amo/management/commands/monitors.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,29 +22,39 @@ def add_arguments(self, parser):
default=5,
help='Number of attempts to check the services',
)
parser.add_argument(
'--verbose',
action='store_true',
help='Verbose output',
)

def handle(self, *args, **options):
attempts = options.get('attempts')
services = options.get('services')

verbose = options.get('verbose')
self.logger.info(f'services: {services}')

if not services:
raise CommandError('No services specified')

failing_services = services.copy()

current = 0

services_to_check = set(services.copy())

while current < attempts:
current += 1
self.logger.info(f'Checking services {services} for the {current} time')
status_summary = monitors.execute_checks(services)
failing_services = [
service
for service, result in status_summary.items()
if result['state'] is False
]
self.logger.info(
f'Checking services {services_to_check} for the {current} time'
)
status_summary = monitors.execute_checks(list(services_to_check), verbose)

failing_services = {}

for service, result in status_summary.items():
if result['state'] is True and service in services_to_check:
services_to_check.remove(service)
else:
failing_services[service] = result

if len(failing_services) > 0:
self.logger.error('Some services are failing: %s', failing_services)
Expand All @@ -54,7 +64,7 @@ def handle(self, *args, **options):
else:
break

if len(failing_services) > 0:
raise CommandError(f'Some services are failing: {failing_services}')
if len(services_to_check) > 0:
raise CommandError(f'Some services are failing: {list(services_to_check)}')
else:
self.logger.info(f'All services are healthy {services}')
Loading
Loading