Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use dbus to tell when systemd has completed an action for us #3805

Closed
wants to merge 16 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
291 changes: 174 additions & 117 deletions daemons/execd/execd_commands.c
Original file line number Diff line number Diff line change
Expand Up @@ -818,6 +818,57 @@ client_disconnect_cleanup(const char *client_id)
}
}

#if SUPPORT_SYSTEMD
/* Callback invoked when systemd reports that a job has finished.
 *
 * Finds the in-flight action for unit_name and, for start and stop actions
 * only, translates systemd's result string into the command's result, then
 * finalizes both the action and the command.
 *
 * job_id     Not referenced in this function
 * bus_path   Not referenced in this function
 * unit_name  Unit name used to look up the matching in-flight action
 * result     Job result string: "done" maps to PCMK_OCF_OK/PCMK_EXEC_DONE,
 *            "timeout" maps to PCMK_OCF_UNKNOWN_ERROR/PCMK_EXEC_TIMEOUT, and
 *            anything else maps to PCMK_OCF_UNKNOWN_ERROR/PCMK_EXEC_ERROR
 * user_data  Not referenced in this function
 *
 * NOTE(review): presumably driven by systemd's JobRemoved() signal via the
 * callback registered with services__set_systemd_callback() -- confirm in
 * the services library.
 */
void
handle_systemd_job_complete(int job_id, const char *bus_path, const char *unit_name,
const char *result, void *user_data)
{
svc_action_t *action = NULL;
lrmd_cmd_t *cmd = NULL;
lrmd_rsc_t *rsc = NULL;

/* Get an inflight operation that matches unit_name. If there's no match,
* systemd is telling us about a job for a unit that we didn't invoke and
* therefore don't care about. We can just return in that case.
*/
action = services__systemd_get_inflight_op(unit_name);

if (action == NULL) {
return;
}

/* NOTE(review): cb_data is dereferenced below without a NULL check --
 * confirm it is always set for in-flight systemd actions.
 */
cmd = action->cb_data;
/* The resource is looked up by ID and may be NULL (no ID, or no table entry);
 * it is passed as-is to cmd_finalize() at the end.
 */
rsc = cmd->rsc_id ? g_hash_table_lookup(rsc_list, cmd->rsc_id) : NULL;

/* Actions besides Start and Stop are not supported right now */
/* NOTE(review): this early return leaves the action and command unfinalized
 * here -- presumably another code path completes them; verify.
 */
if (!pcmk__strcase_any_of(cmd->action, PCMK_ACTION_START, PCMK_ACTION_STOP, NULL)) {
return;
}

/* The result string is compared with pcmk__str_none flags */
if (pcmk__str_eq(result, "done", pcmk__str_none)) {
pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);

} else if (pcmk__str_eq(result, "timeout", pcmk__str_none)) {
pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
PCMK_EXEC_TIMEOUT,
"Investigate reason for timeout, and adjust "
"configured operation timeout if necessary");

} else {
/* FIXME: Should I handle additional results here instead of globbing
* them together into a generic error?
*/
pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
PCMK_EXEC_ERROR, NULL);
}
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note the FIXME here. There's several other possibilities systemd could give us - canceled, failed, dependency, and skipped are all documented. These could be broken out into separate errors.


pcmk__set_result_output(&(cmd->result), services__grab_stdout(action),
services__grab_stderr(action));
services__finalize_async_op(action);
cmd_finalize(cmd, rsc);
}
#endif

static void
action_complete(svc_action_t * action)
{
Expand All @@ -828,6 +879,9 @@ action_complete(svc_action_t * action)
#ifdef PCMK__TIME_USE_CGT
const char *rclass = NULL;
bool goagain = false;
int time_sum = 0;
int timeout_left = 0;
int delay = 0;
#endif

if (!cmd) {
Expand Down Expand Up @@ -862,142 +916,145 @@ action_complete(svc_action_t * action)
#endif
}

if (pcmk__str_eq(rclass, PCMK_RESOURCE_CLASS_SYSTEMD, pcmk__str_casei)) {
if (pcmk__result_ok(&(cmd->result))
&& pcmk__strcase_any_of(cmd->action, PCMK_ACTION_START,
PCMK_ACTION_STOP, NULL)) {
/* systemd returns from start and stop actions after the action
* begins, not after it completes. We have to jump through a few
* hoops so that we don't report 'complete' to the rest of pacemaker
* until it's actually done.
*/
if (!pcmk__str_eq(rclass, PCMK_RESOURCE_CLASS_SYSTEMD, pcmk__str_casei)) {
goto finalize;
}

if (pcmk__result_ok(&(cmd->result)) &&
pcmk__strcase_any_of(cmd->action, PCMK_ACTION_START, PCMK_ACTION_STOP, NULL)) {
/* Getting results for when a start or stop action completes is now
* handled by watching for JobRemoved() signals from systemd and
* reacting to them. So, we can bypass the rest of the code in this
* function for those actions.
*
* @TODO When monitors are handled in the same way, this function
* can either be drastically simplified or done away with entirely.
*/
return;

} else if (cmd->result.execution_status == PCMK_EXEC_PENDING &&
pcmk__str_any_of(cmd->action, PCMK_ACTION_MONITOR, PCMK_ACTION_STATUS, NULL) &&
cmd->interval_ms == 0 &&
cmd->real_action == NULL) {
/* If the state is Pending at the time of probe, execute follow-up monitor. */
goagain = true;
cmd->real_action = cmd->action;
cmd->action = pcmk__str_copy(PCMK_ACTION_MONITOR);
} else if (cmd->real_action != NULL) {
// This is follow-up monitor to check whether start/stop/probe(monitor) completed
if (cmd->result.execution_status == PCMK_EXEC_PENDING) {
goagain = true;
cmd->real_action = cmd->action;
cmd->action = pcmk__str_copy(PCMK_ACTION_MONITOR);

} else if (cmd->result.execution_status == PCMK_EXEC_PENDING &&
pcmk__str_any_of(cmd->action, PCMK_ACTION_MONITOR, PCMK_ACTION_STATUS, NULL) &&
cmd->interval_ms == 0 &&
cmd->real_action == NULL) {
/* If the state is Pending at the time of probe, execute follow-up monitor. */

} else if (pcmk__result_ok(&(cmd->result))
&& pcmk__str_eq(cmd->real_action, PCMK_ACTION_STOP,
pcmk__str_casei)) {
goagain = true;
cmd->real_action = cmd->action;
cmd->action = pcmk__str_copy(PCMK_ACTION_MONITOR);
} else if (cmd->real_action != NULL) {
// This is follow-up monitor to check whether start/stop/probe(monitor) completed
if (cmd->result.execution_status == PCMK_EXEC_PENDING) {
goagain = true;

} else if (pcmk__result_ok(&(cmd->result))
&& pcmk__str_eq(cmd->real_action, PCMK_ACTION_STOP,
pcmk__str_casei)) {
goagain = true;

} else {
int time_sum = time_diff_ms(NULL, &(cmd->t_first_run));
int timeout_left = cmd->timeout_orig - time_sum;

crm_debug("%s systemd %s is now complete (elapsed=%dms, "
"remaining=%dms): %s (%d)",
cmd->rsc_id, cmd->real_action, time_sum, timeout_left,
crm_exit_str(cmd->result.exit_status),
cmd->result.exit_status);
cmd_original_times(cmd);

// Monitors may return "not running", but start/stop shouldn't
if ((cmd->result.execution_status == PCMK_EXEC_DONE)
&& (cmd->result.exit_status == PCMK_OCF_NOT_RUNNING)) {

if (pcmk__str_eq(cmd->real_action, PCMK_ACTION_START,
pcmk__str_casei)) {
cmd->result.exit_status = PCMK_OCF_UNKNOWN_ERROR;
} else if (pcmk__str_eq(cmd->real_action, PCMK_ACTION_STOP,
pcmk__str_casei)) {
cmd->result.exit_status = PCMK_OCF_OK;
}
} else {
int time_sum = time_diff_ms(NULL, &(cmd->t_first_run));
int timeout_left = cmd->timeout_orig - time_sum;

crm_debug("%s systemd %s is now complete (elapsed=%dms, "
"remaining=%dms): %s (%d)",
cmd->rsc_id, cmd->real_action, time_sum, timeout_left,
crm_exit_str(cmd->result.exit_status),
cmd->result.exit_status);
cmd_original_times(cmd);

// Monitors may return "not running", but start/stop shouldn't
if ((cmd->result.execution_status == PCMK_EXEC_DONE)
&& (cmd->result.exit_status == PCMK_OCF_NOT_RUNNING)) {

if (pcmk__str_eq(cmd->real_action, PCMK_ACTION_START,
pcmk__str_casei)) {
cmd->result.exit_status = PCMK_OCF_UNKNOWN_ERROR;
} else if (pcmk__str_eq(cmd->real_action, PCMK_ACTION_STOP,
pcmk__str_casei)) {
cmd->result.exit_status = PCMK_OCF_OK;
}
}
} else if (pcmk__str_any_of(cmd->action, PCMK_ACTION_MONITOR, PCMK_ACTION_STATUS, NULL) &&
(cmd->interval_ms > 0)) {
/* For monitors, excluding follow-up monitors, */
/* if the pending state persists from the first notification until its timeout, */
/* it will be treated as a timeout. */

if ((cmd->result.execution_status == PCMK_EXEC_PENDING) &&
(cmd->last_notify_op_status == PCMK_EXEC_PENDING)) {
int time_left = time(NULL) - (cmd->epoch_rcchange + (cmd->timeout_orig/1000));

if (time_left >= 0) {
crm_notice("Giving up on %s %s (rc=%d): monitor pending timeout (first pending notification=%s timeout=%ds)",
cmd->rsc_id, cmd->action,
cmd->result.exit_status, pcmk__trim(ctime(&cmd->epoch_rcchange)), cmd->timeout_orig);
pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
PCMK_EXEC_TIMEOUT,
"Investigate reason for timeout, and adjust "
"configured operation timeout if necessary");
cmd_original_times(cmd);
}
}
} else if (pcmk__str_any_of(cmd->action, PCMK_ACTION_MONITOR, PCMK_ACTION_STATUS, NULL) &&
(cmd->interval_ms > 0)) {
/* For monitors, excluding follow-up monitors, */
/* if the pending state persists from the first notification until its timeout, */
/* it will be treated as a timeout. */

if ((cmd->result.execution_status == PCMK_EXEC_PENDING) &&
(cmd->last_notify_op_status == PCMK_EXEC_PENDING)) {
int time_left = time(NULL) - (cmd->epoch_rcchange + (cmd->timeout_orig/1000));

if (time_left >= 0) {
crm_notice("Giving up on %s %s (rc=%d): monitor pending timeout (first pending notification=%s timeout=%ds)",
cmd->rsc_id, cmd->action,
cmd->result.exit_status, pcmk__trim(ctime(&cmd->epoch_rcchange)), cmd->timeout_orig);
pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
PCMK_EXEC_TIMEOUT,
"Investigate reason for timeout, and adjust "
"configured operation timeout if necessary");
cmd_original_times(cmd);
}
}
}
#endif

#ifdef PCMK__TIME_USE_CGT
if (goagain) {
int time_sum = time_diff_ms(NULL, &(cmd->t_first_run));
int timeout_left = cmd->timeout_orig - time_sum;
int delay = cmd->timeout_orig / 10;
if (!goagain) {
goto finalize;
}

if(delay >= timeout_left && timeout_left > 20) {
delay = timeout_left/2;
}
time_sum = time_diff_ms(NULL, &(cmd->t_first_run));
timeout_left = cmd->timeout_orig - time_sum;
delay = cmd->timeout_orig / 10;

delay = QB_MIN(2000, delay);
if (delay < timeout_left) {
cmd->start_delay = delay;
cmd->timeout = timeout_left;

if (pcmk__result_ok(&(cmd->result))) {
crm_debug("%s %s may still be in progress: re-scheduling (elapsed=%dms, remaining=%dms, start_delay=%dms)",
cmd->rsc_id, cmd->real_action, time_sum, timeout_left, delay);

} else if (cmd->result.execution_status == PCMK_EXEC_PENDING) {
crm_info("%s %s is still in progress: re-scheduling (elapsed=%dms, remaining=%dms, start_delay=%dms)",
cmd->rsc_id, cmd->action, time_sum, timeout_left, delay);

} else {
crm_notice("%s %s failed: %s: Re-scheduling (remaining "
"timeout %s) " QB_XS
" exitstatus=%d elapsed=%dms start_delay=%dms)",
cmd->rsc_id, cmd->action,
crm_exit_str(cmd->result.exit_status),
pcmk__readable_interval(timeout_left),
cmd->result.exit_status, time_sum, delay);
}
if (delay >= timeout_left && timeout_left > 20) {
delay = timeout_left/2;
}

cmd_reset(cmd);
if(rsc) {
rsc->active = NULL;
}
schedule_lrmd_cmd(rsc, cmd);
delay = QB_MIN(2000, delay);
if (delay < timeout_left) {
cmd->start_delay = delay;
cmd->timeout = timeout_left;

/* Don't finalize cmd, we're not done with it yet */
return;
if (pcmk__result_ok(&(cmd->result))) {
crm_debug("%s %s may still be in progress: re-scheduling (elapsed=%dms, remaining=%dms, start_delay=%dms)",
cmd->rsc_id, cmd->real_action, time_sum, timeout_left, delay);

} else if (cmd->result.execution_status == PCMK_EXEC_PENDING) {
crm_info("%s %s is still in progress: re-scheduling (elapsed=%dms, remaining=%dms, start_delay=%dms)",
cmd->rsc_id, cmd->action, time_sum, timeout_left, delay);

} else {
crm_notice("Giving up on %s %s (rc=%d): timeout (elapsed=%dms, remaining=%dms)",
cmd->rsc_id,
(cmd->real_action? cmd->real_action : cmd->action),
cmd->result.exit_status, time_sum, timeout_left);
pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
PCMK_EXEC_TIMEOUT,
"Investigate reason for timeout, and adjust "
"configured operation timeout if necessary");
cmd_original_times(cmd);
crm_notice("%s %s failed: %s: Re-scheduling (remaining "
"timeout %s) " QB_XS
" exitstatus=%d elapsed=%dms start_delay=%dms)",
cmd->rsc_id, cmd->action,
crm_exit_str(cmd->result.exit_status),
pcmk__readable_interval(timeout_left),
cmd->result.exit_status, time_sum, delay);
}

cmd_reset(cmd);
if (rsc) {
rsc->active = NULL;
}
schedule_lrmd_cmd(rsc, cmd);

/* Don't finalize cmd, we're not done with it yet */
return;

} else {
crm_notice("Giving up on %s %s (rc=%d): timeout (elapsed=%dms, remaining=%dms)",
cmd->rsc_id,
(cmd->real_action? cmd->real_action : cmd->action),
cmd->result.exit_status, time_sum, timeout_left);
pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
PCMK_EXEC_TIMEOUT,
"Investigate reason for timeout, and adjust "
"configured operation timeout if necessary");
cmd_original_times(cmd);
}
#endif

finalize:
pcmk__set_result_output(&(cmd->result), services__grab_stdout(action),
services__grab_stderr(action));
cmd_finalize(cmd, rsc);
Expand Down
7 changes: 6 additions & 1 deletion daemons/execd/pacemaker-execd.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright 2012-2024 the Pacemaker project contributors
* Copyright 2012-2025 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
Expand All @@ -16,6 +16,7 @@
#include <crm/crm.h>
#include <crm/common/xml.h>
#include <crm/services.h>
#include <crm/services_internal.h>
#include <crm/common/cmdline_internal.h>
#include <crm/common/ipc.h>
#include <crm/common/ipc_internal.h>
Expand Down Expand Up @@ -547,6 +548,10 @@ main(int argc, char **argv, char **envp)
ipc_proxy_init();
#endif

#if SUPPORT_SYSTEMD
services__set_systemd_callback(handle_systemd_job_complete, NULL);
#endif

mainloop_add_signal(SIGTERM, lrmd_shutdown);
mainloop = g_main_loop_new(NULL, FALSE);
crm_notice("Pacemaker " EXECD_TYPE " executor successfully started and accepting connections");
Expand Down
8 changes: 7 additions & 1 deletion daemons/execd/pacemaker-execd.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright 2012-2024 the Pacemaker project contributors
* Copyright 2012-2025 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
Expand Down Expand Up @@ -88,6 +88,12 @@ stonith_t *get_stonith_connection(void);
*/
void stonith_connection_failed(void);

#if SUPPORT_SYSTEMD
/* Callback for a completed systemd job; main() registers it with
 * services__set_systemd_callback(), passing NULL as the user data.
 *
 * unit_name selects the in-flight action to complete, and result is systemd's
 * result string for the job ("done" and "timeout" are handled specifically;
 * any other value is reported as a generic error). job_id, bus_path and
 * user_data are accepted but not referenced by the definition.
 */
void handle_systemd_job_complete(int job_id, const char *bus_path,
const char *unit_name, const char *result,
void *user_data);
#endif

#ifdef PCMK__COMPILE_REMOTE
void ipc_proxy_init(void);
void ipc_proxy_cleanup(void);
Expand Down
Loading