Skip to content

Commit 2216913

Browse files
monyarm authored and PetarKirov committed
feat(healthcheck): add healthcheck module with liveness and readiness probes
1 parent 8a7271c commit 2216913

File tree

2 files changed

+282
-0
lines changed

2 files changed

+282
-0
lines changed

modules/default.nix

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,6 @@
99
./secrets.nix
1010
./mcl-disko
1111
./pharos
12+
./healthcheck
1213
];
1314
}

modules/healthcheck/default.nix

Lines changed: 281 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,281 @@
1+
{ ... }:
2+
{
3+
flake.modules.nixos.healthcheck =
4+
# /etc/nixos/modules/systemd-healthcheck.nix
5+
{
6+
config,
7+
lib,
8+
pkgs,
9+
...
10+
}:
11+
let
12+
# Builds the submodule definition holding the options of one probe kind.
#
# `x` is the probe name ("liveness" or "readiness"). It is interpolated into
# the option descriptions, selects the default `interval`, and decides whether
# the readiness-only status message options are included.
mkProbeOptions = x: {
  options =
    {
      enable = lib.mkEnableOption "the ${x} probe";

      # No default: only read by the generated scripts when the probe is enabled.
      command = lib.mkOption {
        type = lib.types.str;
        description = "The command to execute for the ${x} check. Any necessary programs should be added to the healthcheck.runtimePackages option.";
      };

      initialDelay = lib.mkOption {
        # Passed to `sleep`, which rejects negative durations, so reject them
        # at evaluation time instead of at service start.
        type = lib.types.ints.unsigned;
        default = 15;
        description = "Seconds to wait after the service is up before the first ${x} probe.";
      };

      interval = lib.mkOption {
        # Passed to `sleep` between probe attempts; must not be negative.
        type = lib.types.ints.unsigned;
        default = if x == "liveness" then 30 else 2;
        description = "How often (in seconds) to perform the ${x} probe.";
      };

      timeout = lib.mkOption {
        # Passed to `timeout <n>s`; must not be negative.
        type = lib.types.ints.unsigned;
        default = 10;
        description = "Seconds after which the ${x} probe command times out.";
      };
      retryCount = lib.mkOption {
        # -1 is the only meaningful negative value (infinite retries).
        type = lib.types.addCheck lib.types.int (n: n >= -1);
        default = 10;
        description = "Number of times to retry the ${x} probe before considering it failed. (-1 means infinite retries)";
      };
    }
    // lib.optionalAttrs (x == "readiness") {
      statusWaitingMessage = lib.mkOption {
        type = lib.types.str;
        default = "Service starting, waiting for ready signal...";
        description = "The status message to send to systemd while waiting.";
      };

      # NOTE(review): the readiness script in this module discards the probe
      # output and passes this message to systemd-notify as-is, so %OUTPUT%
      # does not appear to be substituted anywhere — confirm and either
      # implement the substitution or drop the sentence from the description.
      statusReadyMessage = lib.mkOption {
        type = lib.types.str;
        default = "Service is ready.";
        description = ''
          The status message to send when the service is ready.
          Use %OUTPUT% to substitute the output of the check command.
        '';
      };
    };
};
62+
63+
# Submodule options for the liveness probe, which this module runs from a timer-activated `<service>-liveness-check` unit.
64+
livenessProbeOptions = mkProbeOptions "liveness";
65+
66+
# Submodule options for the readiness probe, which this module uses to signal service readiness via systemd-notify.
67+
readinessProbeOptions = mkProbeOptions "readiness";
68+
in
69+
{
70+
71+
config =
72+
let
73+
# Subset of `config.mcl.services` whose `healthcheck` is set (not null).
servicesWithHealthcheck =
  let
    hasHealthcheck = _name: service: service.healthcheck != null;
  in
  lib.filterAttrs hasHealthcheck config.mcl.services;
76+
in
77+
{
78+
# One assertion per service with an enabled readiness probe: the wrapper
# script needs `healthcheck.exec` to know which command to start.
assertions = lib.mapAttrsToList (
  serviceName:
  { healthcheck, ... }:
  lib.mkIf (healthcheck != null && healthcheck.readiness-probe.enable) {
    message = "When healthcheck.readiness-probe is enabled, you must define `healthcheck.exec` with the service command. (${serviceName})";
    assertion = healthcheck.exec != null;
  }
) servicesWithHealthcheck;
88+
systemd = {
89+
# One `<service>-liveness-check` timer per service with an enabled liveness
# probe. The timer is pulled in by the main service (`wantedBy`) and activates
# the matching `<service>-liveness-check.service`.
timers = lib.mapAttrs' (
  mainServiceName: serviceConfig:
  let
    cfg = serviceConfig.healthcheck;
  in
  {
    name = "${mainServiceName}-liveness-check";
    value = lib.mkIf (cfg != null && cfg.liveness-probe.enable) {
      description = "Timer for ${mainServiceName} liveness probe";
      timerConfig = {
        Unit = "${mainServiceName}-liveness-check.service";
        # systemd refuses to load a timer that has no On*= trigger at all.
        # The check script handles the initial delay and the probe interval
        # itself, so it only has to be started once, shortly after the timer
        # is activated together with the main service.
        OnActiveSec = "1s";
      };
      wantedBy = [ "${mainServiceName}.service" ];
    };
  }
) servicesWithHealthcheck;
105+
106+
services =
107+
let
108+
# Overrides applied to every main service whose readiness probe is enabled.
# The service becomes `Type=notify` and its ExecStart is replaced by a wrapper
# that runs the readiness probe in the background and the real command
# (`healthcheck.exec`) in the foreground.
mainServices = lib.mapAttrs (
  mainServiceName: serviceConfig:
  let
    cfg = serviceConfig.healthcheck;
  in
  (lib.mkIf (cfg.readiness-probe.enable) (
    let
      probeCfg = cfg.readiness-probe;
    in
    {
      # We have to force it to be a notify service, in order to use systemd-notify.
      serviceConfig.Type = lib.mkForce "notify";
      # systemd-notify is called from the backgrounded probe, i.e. not from the
      # main PID. With the Type=notify default (NotifyAccess=main) systemd
      # would drop those messages and the unit would never become ready.
      serviceConfig.NotifyAccess = lib.mkDefault "all";
      # If the TimeoutStartSec is not infinity, it can cause the service to fail, because the readiness probe is considered part of the startup.
      serviceConfig.TimeoutStartSec = lib.mkForce "infinity";

      # We replace the ExecStart with a script that runs the readiness probe in the background, and the original service command in the foreground.
      serviceConfig.ExecStart =
        let
          scriptPath = lib.makeBinPath (
            [
              pkgs.systemd
              pkgs.curl
              pkgs.gawk
            ]
            ++ (cfg.runtimePackages or [ ])
            ++ (serviceConfig.path or [ ])
          );
        in
        lib.mkForce (
          # writeShellScript already emits the shebang line.
          pkgs.writeShellScript "${mainServiceName}-readiness-check" ''
            set -o nounset
            export PATH="${scriptPath}:$PATH"

            # PID of this wrapper (the unit's main process). The background
            # probe needs it to abort the start-up when it gives up.
            main_pid=$$

            check() {
              echo "Health check: starting background readiness probe for ${mainServiceName}."
              sleep ${toString probeCfg.initialDelay}
              retryCount=${toString probeCfg.retryCount}
              while true; do
                if (timeout ${toString probeCfg.timeout}s ${probeCfg.command} &> /dev/null); then
                  echo "Health check: probe successful. Notifying systemd that the service is ready."
                  # The message is shell-escaped so quotes or dollar signs in it cannot break the script.
                  systemd-notify --ready --status=${lib.escapeShellArg probeCfg.statusReadyMessage}
                  return 0
                else
                  echo "Health check: probe not successful. Notifying systemd that the service is still waiting. Retrying in ${toString probeCfg.interval} seconds..."
                  systemd-notify --status=${lib.escapeShellArg probeCfg.statusWaitingMessage}
                  if [[ ''${retryCount} -ne -1 ]]; then
                    retryCount=$((retryCount - 1))
                    if [[ ''${retryCount} -le 0 ]]; then
                      echo "Health check: probe failed after maximum retries. Exiting."
                      # A plain exit only ends this background subshell and would
                      # leave the unit activating forever (TimeoutStartSec=infinity).
                      # Terminate the wrapper so systemd marks the start as failed.
                      kill -TERM "$main_pid"
                      exit 1
                    fi
                  fi
                fi
                sleep ${toString probeCfg.interval}
              done
            }

            if [[ -n "''${NOTIFY_SOCKET:-}" ]]; then
              check &
            else
              echo "Health check: NOTIFY_SOCKET not set. Cannot run readiness probe." >&2
              exit 1
            fi

            ${cfg.exec}
          ''
        );
    }
  ))
) servicesWithHealthcheck;
179+
# One `<service>-liveness-check` unit per service with an enabled liveness
# probe. The unit runs a loop that executes the probe command every
# `interval` seconds and restarts the main service whenever the probe fails.
healthCheckServices = lib.mapAttrs' (
  mainServiceName: serviceConfig:
  let
    cfg = serviceConfig.healthcheck;
  in
  {
    name = "${mainServiceName}-liveness-check";
    value = lib.mkIf (cfg != null && cfg.liveness-probe.enable) (
      let
        probeCfg = cfg.liveness-probe;
        # writeShellScript already emits the shebang line.
        checkScript = pkgs.writeShellScript "liveness-check" ''
          maxRetries=${toString probeCfg.retryCount}
          retryCount=$maxRetries
          sleep ${toString probeCfg.initialDelay}
          echo "Executing liveness probe for ${mainServiceName}..."
          # If the command fails, explicitly restart the main service
          while true; do
            if (timeout ${toString probeCfg.timeout}s ${probeCfg.command} &> /dev/null); then
              # Healthy again: restore the retry budget so that only
              # consecutive failures count towards giving up.
              retryCount=$maxRetries
            else
              echo "(timeout ${toString probeCfg.timeout}s ${probeCfg.command})"
              echo "Liveness probe for ${mainServiceName} failed. Triggering restart..."
              ${lib.getExe' pkgs.systemd "systemctl"} restart ${mainServiceName}.service &
              # A retryCount of -1 means "never give up".
              if [[ ''${retryCount} -ne -1 ]]; then
                retryCount=$((retryCount - 1))
                if [[ ''${retryCount} -le 0 ]]; then
                  echo "Liveness probe failed after maximum retries. Exiting."
                  exit 1
                fi
              fi
            fi
            sleep ${toString probeCfg.interval}
          done
        '';
      in
      {
        description = "Liveness check for ${mainServiceName}";
        # This check needs systemctl in its path.
        path = [ pkgs.systemd ] ++ (cfg.runtimePackages or [ ]);
        serviceConfig = {
          Type = "oneshot";
          ExecStart = "${checkScript}";
        };
      }
    );
  }
) servicesWithHealthcheck;
224+
in
225+
mainServices // healthCheckServices;
226+
};
227+
};
228+
229+
# Per-service health check declarations, keyed by systemd service name.
# Each entry may carry a `healthcheck` submodule; `null` (the default) means
# the service gets no probes.
options.mcl.services = lib.mkOption {
  default = { };
  description = "Services (keyed by systemd service name) that may declare health checks.";
  type = lib.types.attrsOf (
    lib.types.submodule (
      { ... }:
      {
        options = {
          healthcheck = lib.mkOption {
            default = null;
            description = "Declarative health checks for this systemd service.";
            type = lib.types.nullOr (
              lib.types.submodule {
                options = {
                  # Programs to add to the PATH for the health check.
                  runtimePackages = lib.mkOption {
                    type = lib.types.listOf lib.types.package;
                    default = [ ];
                    description = "Additional programs to add to the PATH for health checks.";
                  };

                  # The main command for the service, required when readiness-probe is on.
                  # Nullable with a null default so that liveness-only services
                  # need not set it, and so that the module's
                  # `healthcheck.exec != null` assertion can actually report a
                  # missing value instead of a generic "option used but not
                  # defined" evaluation error.
                  exec = lib.mkOption {
                    type = lib.types.nullOr lib.types.str;
                    default = null;
                    description = ''
                      The actual command to run for the service.
                      This MUST be used instead of `script` or `serviceConfig.ExecStart`
                      when `readiness-probe.enable` is true.
                    '';
                  };

                  # The new readiness probe that uses the notify pattern.
                  readiness-probe = lib.mkOption {
                    type = lib.types.submodule readinessProbeOptions;
                    default = { };
                    description = "Readiness probe settings; readiness is reported to systemd via systemd-notify.";
                  };

                  # The liveness probe (timer-based).
                  liveness-probe = lib.mkOption {
                    type = lib.types.submodule livenessProbeOptions;
                    default = { };
                    description = "Liveness probe settings; a failing probe triggers a restart of the service.";
                  };
                };
              }
            );
          };
        };

      }
    )
  );
};
280+
};
281+
}

0 commit comments

Comments
 (0)