Skip to content

Commit ef19c7e

Browse files
committed
Add GitHub Actions health check workflow
Introduces a new GitHub Actions workflow to periodically check service health and version information for specified endpoints. The workflow includes: - Scheduled runs every 5 minutes - Checks version information - Monitors service status - Raises an error if any services are down
1 parent 8b491c0 commit ef19c7e

File tree

13 files changed

+317
-50
lines changed

13 files changed

+317
-50
lines changed

.github/workflows/health_check.yml

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
name: Health Check
2+
3+
on:
4+
# Run the workflow test on push events
5+
push:
6+
# Run the main workflow on workflow_dispatch or schedule
7+
workflow_dispatch:
8+
schedule:
9+
# Every 5 minutes
10+
- cron: '*/5 * * * *'
11+
12+
jobs:
13+
health_check:
14+
runs-on: ubuntu-latest
15+
strategy:
16+
fail-fast: false
17+
matrix:
18+
environment: ${{fromJson(github.event_name == 'push' && '["local"]' || '["dev","stage","prod"]')}}
19+
20+
steps:
21+
- uses: actions/checkout@v4
22+
23+
- uses: ./.github/actions/run-docker
24+
with:
25+
target: development
26+
version: local
27+
logs: true
28+
run: ./scripts/health_check.py --env ${{ matrix.environment }} --verbose
29+
30+
31+

docker/nginx/addons.conf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ server {
4949
try_files $uri @olympia;
5050
}
5151

52-
location /__version__ {
52+
location ~ ^/(__version__|__heartbeat__|__healthcheck__)(\?.*)?$ {
5353
try_files $uri @olympia;
5454
}
5555

scripts/health_check.py

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
#!/usr/bin/env python3
2+
3+
import argparse
4+
import json
5+
from enum import Enum
6+
import time
7+
8+
import requests
9+
10+
11+
ENV_ENUM = Enum(
12+
'ENV',
13+
[
14+
('dev', 'https://addons-dev.allizom.org'),
15+
('stage', 'https://addons.allizom.org'),
16+
('prod', 'https://addons.mozilla.org'),
17+
# For local environments hit the nginx container as set in docker-compose.yml
18+
('local', 'http://nginx'),
19+
],
20+
)
21+
22+
23+
class Fetcher:
24+
def __init__(self, env: ENV_ENUM, verbose: bool = False):
25+
self.environment = ENV_ENUM[env]
26+
self.verbose = verbose
27+
28+
def _fetch(self, path: str) -> dict[str, str] | None:
29+
url = f'{self.environment.value}/{path}'
30+
if self.verbose:
31+
print(f'Requesting {url} for {self.environment.name}')
32+
33+
data = None
34+
# We return 500 if any of the monitors are failing.
35+
# So instead of raising, we should try to form valid JSON
36+
# and determine if we should raise later based on the json values.
37+
try:
38+
response = requests.get(url, allow_redirects=False)
39+
data = response.json()
40+
except (requests.exceptions.HTTPError, json.JSONDecodeError) as e:
41+
if self.verbose:
42+
print({
43+
'error': e,
44+
'data': data,
45+
'response': response,
46+
})
47+
48+
if self.verbose and data is not None:
49+
print(json.dumps(data, indent=2))
50+
51+
return data
52+
53+
def version(self):
54+
return self._fetch('__version__')
55+
56+
def healthcheck(self):
57+
return self._fetch('__healthcheck__?verbose=true')
58+
59+
60+
def main(env: str, verbose: bool = False):
    """Run one health check pass against ``env``.

    Fetches the version and healthcheck payloads and raises when either is
    missing or when any monitor reports a failing state.

    Args:
        env: Environment name understood by ``Fetcher``.
        verbose: Forwarded to ``Fetcher`` to print requests and payloads.

    Raises:
        ValueError: If either payload could not be fetched, or if at least
            one monitor has ``state`` set to ``False``.
    """
    fetcher = Fetcher(env, verbose)

    version_data = fetcher.version()
    healthcheck_data = fetcher.healthcheck()

    if version_data is None:
        raise ValueError('Error fetching version data')

    if healthcheck_data is None:
        raise ValueError('Error fetching healthcheck data')

    # Report only the monitors that are actually failing, not every key.
    failing = [
        name
        for name, monitor in healthcheck_data.items()
        if monitor['state'] is False
    ]
    if failing:
        raise ValueError(f'Some monitors are failing {failing}')
75+
76+
77+
if __name__ == '__main__':
    # Command-line entry point: run the health check with retries and exit
    # non-zero when every attempt fails, so the CI job is marked as failed.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--env', type=str, choices=list(ENV_ENUM.__members__.keys()), required=True
    )
    parser.add_argument('--verbose', action='store_true')
    parser.add_argument('--retries', type=int, default=3)
    args = parser.parse_args()

    for attempt in range(1, args.retries + 1):
        try:
            main(args.env, args.verbose)
            break
        except Exception as e:
            print(f'Error: {e}')
            # Exponential backoff between attempts; sleeping after the last
            # attempt would only delay the failure report.
            if attempt < args.retries:
                time.sleep(2**attempt)
    else:
        # Reached only when no attempt succeeded (the loop never hit `break`),
        # including the case where --retries is less than 1.
        raise SystemExit(f'Health check failed after {args.retries} attempts')

settings.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,3 +204,5 @@ def insert_debug_toolbar_middleware(middlewares):
204204
'static_url_prefix': 'bundle',
205205
}
206206
}
207+
208+
MEMCACHE_MIN_SERVER_COUNT = 1

src/olympia/amo/management/__init__.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,11 @@ def handle(self, *args, **options):
165165
'test': '',
166166
'uploads': '',
167167
},
168-
'uploads': '',
168+
'uploads': {
169+
'addon_icons': '',
170+
'previews': '',
171+
'userpics': '',
172+
},
169173
},
170174
}
171175

src/olympia/amo/management/commands/monitors.py

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -22,29 +22,39 @@ def add_arguments(self, parser):
2222
default=5,
2323
help='Number of attempts to check the services',
2424
)
25+
parser.add_argument(
26+
'--verbose',
27+
action='store_true',
28+
help='Verbose output',
29+
)
2530

2631
def handle(self, *args, **options):
2732
attempts = options.get('attempts')
2833
services = options.get('services')
29-
34+
verbose = options.get('verbose')
3035
self.logger.info(f'services: {services}')
3136

3237
if not services:
3338
raise CommandError('No services specified')
3439

35-
failing_services = services.copy()
36-
3740
current = 0
3841

42+
services_to_check = set(services.copy())
43+
3944
while current < attempts:
4045
current += 1
41-
self.logger.info(f'Checking services {services} for the {current} time')
42-
status_summary = monitors.execute_checks(services)
43-
failing_services = [
44-
service
45-
for service, result in status_summary.items()
46-
if result['state'] is False
47-
]
46+
self.logger.info(
47+
f'Checking services {services_to_check} for the {current} time'
48+
)
49+
status_summary = monitors.execute_checks(list(services_to_check), verbose)
50+
51+
failing_services = {}
52+
53+
for service, result in status_summary.items():
54+
if result['state'] is True and service in services_to_check:
55+
services_to_check.remove(service)
56+
else:
57+
failing_services[service] = result
4858

4959
if len(failing_services) > 0:
5060
self.logger.error('Some services are failing: %s', failing_services)
@@ -54,7 +64,7 @@ def handle(self, *args, **options):
5464
else:
5565
break
5666

57-
if len(failing_services) > 0:
58-
raise CommandError(f'Some services are failing: {failing_services}')
67+
if len(services_to_check) > 0:
68+
raise CommandError(f'Some services are failing: {list(services_to_check)}')
5969
else:
6070
self.logger.info(f'All services are healthy {services}')

src/olympia/amo/monitors.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,15 @@
2222
monitor_log = olympia.core.logger.getLogger('z.monitor')
2323

2424

25-
def execute_checks(checks: list[str]):
25+
def execute_checks(checks: list[str], verbose: bool = False):
2626
status_summary = {}
2727
for check in checks:
2828
with statsd.timer('monitor.%s' % check):
29-
status, _ = globals()[check]()
29+
status, results = globals()[check]()
3030
# state is a string. If it is empty, that means everything is fine.
3131
status_summary[check] = {'state': not status, 'status': status}
32+
if verbose:
33+
status_summary[check]['results'] = results
3234
return status_summary
3335

3436

@@ -40,7 +42,7 @@ def localdev_web():
4042
"""
4143
status = ''
4244
try:
43-
response = requests.get('http://127.0.0.1:8002/__version__')
45+
response = requests.get('http://nginx/__version__')
4446
response.raise_for_status()
4547
except Exception as e:
4648
status = f'Failed to ping web: {e}'
@@ -94,9 +96,11 @@ def memcache():
9496
s.close()
9597

9698
memcache_results.append((ip, port, result))
97-
if not using_twemproxy and len(memcache_results) < 2:
98-
status = ('2+ memcache servers are required. %s available') % len(
99-
memcache_results
99+
expected_count = settings.MEMCACHE_MIN_SERVER_COUNT
100+
if not using_twemproxy and len(memcache_results) < expected_count:
101+
status = ('%d+ memcache servers are required. %d available') % (
102+
expected_count,
103+
len(memcache_results),
100104
)
101105
monitor_log.warning(status)
102106

@@ -285,6 +289,10 @@ def database():
285289

286290

287291
def remotesettings():
292+
# TODO: We should be able to check remote settings
293+
# connectivity if the credentials are set.
294+
if settings.ENV in ('local', 'test'):
295+
return '', None
288296
# check Remote Settings connectivity.
289297
# Since the blocklist filter task is performed by
290298
# a worker, and since workers have different network

src/olympia/amo/tests/test_commands.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -519,7 +519,9 @@ def walk_keys(root, dir_dict):
519519
os.path.join(settings.STORAGE_ROOT, 'shared_storage/tmp/preview'),
520520
os.path.join(settings.STORAGE_ROOT, 'shared_storage/tmp/test'),
521521
os.path.join(settings.STORAGE_ROOT, 'shared_storage/tmp/uploads'),
522-
os.path.join(settings.STORAGE_ROOT, 'shared_storage/uploads'),
522+
os.path.join(settings.STORAGE_ROOT, 'shared_storage/uploads/addon_icons'),
523+
os.path.join(settings.STORAGE_ROOT, 'shared_storage/uploads/previews'),
524+
os.path.join(settings.STORAGE_ROOT, 'shared_storage/uploads/userpics'),
523525
]
524526

525527
for key in keys:
@@ -723,7 +725,7 @@ def test_monitors_fails_after_specified_attempts(self):
723725
def test_monitors_succeeds_after_specified_attempts(self):
724726
succeed_after = 3
725727

726-
def mock_handler(services):
728+
def mock_handler(services, _verbose):
727729
state = self.mock_execute_checks.call_count >= succeed_after
728730
return {service: {'state': state} for service in services}
729731

@@ -733,7 +735,7 @@ def mock_handler(services):
733735
def test_monitors_succeeds_after_all_services_are_healthy(self):
734736
succeed_after = 3
735737

736-
def mock_handler(_):
738+
def mock_handler(_a, _b):
737739
state = self.mock_execute_checks.call_count >= succeed_after
738740
return {
739741
'database': {'state': state},

src/olympia/amo/tests/test_monitor.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ def test_remotesettings_success(self):
110110
obtained, _ = monitors.remotesettings()
111111
assert obtained == ''
112112

113+
@override_settings(ENV='production')
113114
def test_remotesettings_bad_credentials(self):
114115
responses.add(
115116
responses.GET,
@@ -126,6 +127,7 @@ def test_remotesettings_bad_credentials(self):
126127
obtained, _ = monitors.remotesettings()
127128
assert 'Invalid credentials' in obtained
128129

130+
@override_settings(ENV='production')
129131
def test_remotesettings_fail(self):
130132
responses.add(
131133
responses.GET,
@@ -164,7 +166,7 @@ def test_cinder_fail(self):
164166
def test_localdev_web_fail(self):
165167
responses.add(
166168
responses.GET,
167-
'http://127.0.0.1:8002/__version__',
169+
'http://nginx/__version__',
168170
status=500,
169171
)
170172
status, _ = monitors.localdev_web()
@@ -173,7 +175,7 @@ def test_localdev_web_fail(self):
173175
def test_localdev_web_success(self):
174176
responses.add(
175177
responses.GET,
176-
'http://127.0.0.1:8002/__version__',
178+
'http://nginx/__version__',
177179
status=200,
178180
)
179181
status, _ = monitors.localdev_web()

src/olympia/amo/urls.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
re_path(r'^services/', include(shared_services_api_patterns)),
3535
re_path(r'^__version__$', views.version, name='version.json'),
3636
re_path(r'^__heartbeat__$', views.front_heartbeat, name='amo.front_heartbeat'),
37+
re_path(r'^__healthcheck__$', views.healthcheck, name='amo.healthcheck'),
3738
re_path(
3839
r'^opensearch\.xml$',
3940
TemplateView.as_view(

0 commit comments

Comments
 (0)