-
Notifications
You must be signed in to change notification settings - Fork 348
Use dbus to tell when systemd has completed an action for us #3805
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
Closed
Changes from all commits
Commits
Show all changes
16 commits
Select commit
Hold shift + click to select a range
aa3e74c
Refactor: libcrmservice: systemd_init should return a bool.
clumens a497f6a
Refactor: daemons: Fix whitespace problems in execd_commands.c.
clumens 9a11cb8
Refactor: daemons: Get rid of an unnecessary #endif/#ifdef.
clumens 8b4c66f
Refactor: daemons: Unindent a block of code in action_complete.
clumens 8985e8b
Refactor: daemons: Unindent the goagain block in action_complete.
clumens a9edc52
Refactor: libservices: Add the systemd unit name to opaque data.
clumens 31170b9
Feature: libcrmservice: Subscribe to systemd dbus signals.
clumens 396dfad
Feature: libcrmservice: Only respond to JobRemoved signals.
clumens 041a18e
Feature: libcrmservices: Add a systemd job complete callback.
clumens 1590694
Feature: systemd: Don't initiate a timer for start actions.
clumens 94ee566
Feature: daemons: Don't handle start actions in action_complete.
clumens 791cabf
Refactor: libcrmservices: Add a function to get an inflight op.
clumens 8fc6d68
Refactor: libcrmservices: Expose services__finalize_async_op.
clumens bf13de0
Feature: daemons: Use dbus to detect systemd resource startup.
clumens 1b6c951
Feature: daemons: Use dbus to detect systemd resources stopping.
clumens c0d99b9
Fix: libcrmservices: Add another place to finalize Start/Stop ops.
clumens File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -818,6 +818,57 @@ client_disconnect_cleanup(const char *client_id) | |
} | ||
} | ||
|
||
#if SUPPORT_SYSTEMD | ||
void | ||
handle_systemd_job_complete(int job_id, const char *bus_path, const char *unit_name, | ||
const char *result, void *user_data) | ||
{ | ||
svc_action_t *action = NULL; | ||
lrmd_cmd_t *cmd = NULL; | ||
lrmd_rsc_t *rsc = NULL; | ||
|
||
/* Get an inflight operation that matches unit_name. If there's no match, | ||
* systemd is telling us about a job for a unit that we didn't invoke and | ||
* therefore don't care about. We can just return in that case. | ||
*/ | ||
action = services__systemd_get_inflight_op(unit_name); | ||
|
||
if (action == NULL) { | ||
return; | ||
} | ||
|
||
cmd = action->cb_data; | ||
rsc = cmd->rsc_id ? g_hash_table_lookup(rsc_list, cmd->rsc_id) : NULL; | ||
|
||
/* Actions besides Start and Stop are not supported right now */ | ||
if (!pcmk__strcase_any_of(cmd->action, PCMK_ACTION_START, PCMK_ACTION_STOP, NULL)) { | ||
return; | ||
} | ||
|
||
if (pcmk__str_eq(result, "done", pcmk__str_none)) { | ||
pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL); | ||
|
||
} else if (pcmk__str_eq(result, "timeout", pcmk__str_none)) { | ||
pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR, | ||
PCMK_EXEC_TIMEOUT, | ||
"Investigate reason for timeout, and adjust " | ||
"configured operation timeout if necessary"); | ||
|
||
} else { | ||
/* FIXME: Should I handle additional results here instead of globbing | ||
* them together into a generic error? | ||
*/ | ||
pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR, | ||
PCMK_EXEC_ERROR, NULL); | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Note the FIXME here. There are several other possibilities systemd could give us - canceled, failed, dependency, and skipped are all documented. These could be broken out into separate errors. |
||
|
||
pcmk__set_result_output(&(cmd->result), services__grab_stdout(action), | ||
clumens marked this conversation as resolved.
Show resolved
Hide resolved
|
||
services__grab_stderr(action)); | ||
services__finalize_async_op(action); | ||
cmd_finalize(cmd, rsc); | ||
} | ||
#endif | ||
clumens marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
static void | ||
action_complete(svc_action_t * action) | ||
{ | ||
|
@@ -828,6 +879,9 @@ action_complete(svc_action_t * action) | |
#ifdef PCMK__TIME_USE_CGT | ||
const char *rclass = NULL; | ||
bool goagain = false; | ||
int time_sum = 0; | ||
int timeout_left = 0; | ||
int delay = 0; | ||
#endif | ||
|
||
if (!cmd) { | ||
|
@@ -862,142 +916,145 @@ action_complete(svc_action_t * action) | |
#endif | ||
} | ||
|
||
if (pcmk__str_eq(rclass, PCMK_RESOURCE_CLASS_SYSTEMD, pcmk__str_casei)) { | ||
if (pcmk__result_ok(&(cmd->result)) | ||
&& pcmk__strcase_any_of(cmd->action, PCMK_ACTION_START, | ||
PCMK_ACTION_STOP, NULL)) { | ||
/* systemd returns from start and stop actions after the action | ||
* begins, not after it completes. We have to jump through a few | ||
* hoops so that we don't report 'complete' to the rest of pacemaker | ||
* until it's actually done. | ||
*/ | ||
if (!pcmk__str_eq(rclass, PCMK_RESOURCE_CLASS_SYSTEMD, pcmk__str_casei)) { | ||
goto finalize; | ||
} | ||
|
||
if (pcmk__result_ok(&(cmd->result)) && | ||
clumens marked this conversation as resolved.
Show resolved
Hide resolved
|
||
pcmk__strcase_any_of(cmd->action, PCMK_ACTION_START, PCMK_ACTION_STOP, NULL)) { | ||
/* Getting results for when a start or stop action completes is now | ||
* handled by watching for JobRemoved() signals from systemd and | ||
* reacting to them. So, we can bypass the rest of the code in this | ||
* function for those actions. | ||
* | ||
* @TODO When monitors are handled in the same way, this function | ||
* can either be drastically simplified or done away with entirely. | ||
*/ | ||
return; | ||
|
||
} else if (cmd->result.execution_status == PCMK_EXEC_PENDING && | ||
pcmk__str_any_of(cmd->action, PCMK_ACTION_MONITOR, PCMK_ACTION_STATUS, NULL) && | ||
cmd->interval_ms == 0 && | ||
cmd->real_action == NULL) { | ||
/* If the state is Pending at the time of probe, execute follow-up monitor. */ | ||
goagain = true; | ||
cmd->real_action = cmd->action; | ||
cmd->action = pcmk__str_copy(PCMK_ACTION_MONITOR); | ||
} else if (cmd->real_action != NULL) { | ||
// This is follow-up monitor to check whether start/stop/probe(monitor) completed | ||
if (cmd->result.execution_status == PCMK_EXEC_PENDING) { | ||
goagain = true; | ||
cmd->real_action = cmd->action; | ||
cmd->action = pcmk__str_copy(PCMK_ACTION_MONITOR); | ||
|
||
} else if (cmd->result.execution_status == PCMK_EXEC_PENDING && | ||
pcmk__str_any_of(cmd->action, PCMK_ACTION_MONITOR, PCMK_ACTION_STATUS, NULL) && | ||
cmd->interval_ms == 0 && | ||
cmd->real_action == NULL) { | ||
/* If the state is Pending at the time of probe, execute follow-up monitor. */ | ||
|
||
} else if (pcmk__result_ok(&(cmd->result)) | ||
&& pcmk__str_eq(cmd->real_action, PCMK_ACTION_STOP, | ||
pcmk__str_casei)) { | ||
goagain = true; | ||
cmd->real_action = cmd->action; | ||
cmd->action = pcmk__str_copy(PCMK_ACTION_MONITOR); | ||
} else if (cmd->real_action != NULL) { | ||
// This is follow-up monitor to check whether start/stop/probe(monitor) completed | ||
if (cmd->result.execution_status == PCMK_EXEC_PENDING) { | ||
goagain = true; | ||
|
||
} else if (pcmk__result_ok(&(cmd->result)) | ||
&& pcmk__str_eq(cmd->real_action, PCMK_ACTION_STOP, | ||
pcmk__str_casei)) { | ||
goagain = true; | ||
|
||
} else { | ||
int time_sum = time_diff_ms(NULL, &(cmd->t_first_run)); | ||
int timeout_left = cmd->timeout_orig - time_sum; | ||
|
||
crm_debug("%s systemd %s is now complete (elapsed=%dms, " | ||
"remaining=%dms): %s (%d)", | ||
cmd->rsc_id, cmd->real_action, time_sum, timeout_left, | ||
crm_exit_str(cmd->result.exit_status), | ||
cmd->result.exit_status); | ||
cmd_original_times(cmd); | ||
|
||
// Monitors may return "not running", but start/stop shouldn't | ||
if ((cmd->result.execution_status == PCMK_EXEC_DONE) | ||
&& (cmd->result.exit_status == PCMK_OCF_NOT_RUNNING)) { | ||
|
||
if (pcmk__str_eq(cmd->real_action, PCMK_ACTION_START, | ||
pcmk__str_casei)) { | ||
cmd->result.exit_status = PCMK_OCF_UNKNOWN_ERROR; | ||
} else if (pcmk__str_eq(cmd->real_action, PCMK_ACTION_STOP, | ||
pcmk__str_casei)) { | ||
cmd->result.exit_status = PCMK_OCF_OK; | ||
} | ||
} else { | ||
int time_sum = time_diff_ms(NULL, &(cmd->t_first_run)); | ||
int timeout_left = cmd->timeout_orig - time_sum; | ||
|
||
crm_debug("%s systemd %s is now complete (elapsed=%dms, " | ||
"remaining=%dms): %s (%d)", | ||
cmd->rsc_id, cmd->real_action, time_sum, timeout_left, | ||
crm_exit_str(cmd->result.exit_status), | ||
cmd->result.exit_status); | ||
cmd_original_times(cmd); | ||
|
||
// Monitors may return "not running", but start/stop shouldn't | ||
if ((cmd->result.execution_status == PCMK_EXEC_DONE) | ||
&& (cmd->result.exit_status == PCMK_OCF_NOT_RUNNING)) { | ||
|
||
if (pcmk__str_eq(cmd->real_action, PCMK_ACTION_START, | ||
pcmk__str_casei)) { | ||
cmd->result.exit_status = PCMK_OCF_UNKNOWN_ERROR; | ||
} else if (pcmk__str_eq(cmd->real_action, PCMK_ACTION_STOP, | ||
pcmk__str_casei)) { | ||
cmd->result.exit_status = PCMK_OCF_OK; | ||
} | ||
} | ||
} else if (pcmk__str_any_of(cmd->action, PCMK_ACTION_MONITOR, PCMK_ACTION_STATUS, NULL) && | ||
(cmd->interval_ms > 0)) { | ||
/* For monitors, excluding follow-up monitors, */ | ||
/* if the pending state persists from the first notification until its timeout, */ | ||
/* it will be treated as a timeout. */ | ||
|
||
if ((cmd->result.execution_status == PCMK_EXEC_PENDING) && | ||
(cmd->last_notify_op_status == PCMK_EXEC_PENDING)) { | ||
int time_left = time(NULL) - (cmd->epoch_rcchange + (cmd->timeout_orig/1000)); | ||
|
||
if (time_left >= 0) { | ||
crm_notice("Giving up on %s %s (rc=%d): monitor pending timeout (first pending notification=%s timeout=%ds)", | ||
cmd->rsc_id, cmd->action, | ||
cmd->result.exit_status, pcmk__trim(ctime(&cmd->epoch_rcchange)), cmd->timeout_orig); | ||
pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR, | ||
PCMK_EXEC_TIMEOUT, | ||
"Investigate reason for timeout, and adjust " | ||
"configured operation timeout if necessary"); | ||
cmd_original_times(cmd); | ||
} | ||
} | ||
} else if (pcmk__str_any_of(cmd->action, PCMK_ACTION_MONITOR, PCMK_ACTION_STATUS, NULL) && | ||
(cmd->interval_ms > 0)) { | ||
/* For monitors, excluding follow-up monitors, */ | ||
/* if the pending state persists from the first notification until its timeout, */ | ||
/* it will be treated as a timeout. */ | ||
|
||
if ((cmd->result.execution_status == PCMK_EXEC_PENDING) && | ||
(cmd->last_notify_op_status == PCMK_EXEC_PENDING)) { | ||
int time_left = time(NULL) - (cmd->epoch_rcchange + (cmd->timeout_orig/1000)); | ||
|
||
if (time_left >= 0) { | ||
crm_notice("Giving up on %s %s (rc=%d): monitor pending timeout (first pending notification=%s timeout=%ds)", | ||
cmd->rsc_id, cmd->action, | ||
clumens marked this conversation as resolved.
Show resolved
Hide resolved
|
||
cmd->result.exit_status, pcmk__trim(ctime(&cmd->epoch_rcchange)), cmd->timeout_orig); | ||
pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR, | ||
PCMK_EXEC_TIMEOUT, | ||
"Investigate reason for timeout, and adjust " | ||
"configured operation timeout if necessary"); | ||
cmd_original_times(cmd); | ||
} | ||
} | ||
} | ||
#endif | ||
clumens marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
#ifdef PCMK__TIME_USE_CGT | ||
if (goagain) { | ||
int time_sum = time_diff_ms(NULL, &(cmd->t_first_run)); | ||
int timeout_left = cmd->timeout_orig - time_sum; | ||
int delay = cmd->timeout_orig / 10; | ||
if (!goagain) { | ||
goto finalize; | ||
} | ||
|
||
if(delay >= timeout_left && timeout_left > 20) { | ||
delay = timeout_left/2; | ||
} | ||
time_sum = time_diff_ms(NULL, &(cmd->t_first_run)); | ||
timeout_left = cmd->timeout_orig - time_sum; | ||
delay = cmd->timeout_orig / 10; | ||
|
||
delay = QB_MIN(2000, delay); | ||
if (delay < timeout_left) { | ||
cmd->start_delay = delay; | ||
cmd->timeout = timeout_left; | ||
|
||
if (pcmk__result_ok(&(cmd->result))) { | ||
crm_debug("%s %s may still be in progress: re-scheduling (elapsed=%dms, remaining=%dms, start_delay=%dms)", | ||
cmd->rsc_id, cmd->real_action, time_sum, timeout_left, delay); | ||
|
||
} else if (cmd->result.execution_status == PCMK_EXEC_PENDING) { | ||
crm_info("%s %s is still in progress: re-scheduling (elapsed=%dms, remaining=%dms, start_delay=%dms)", | ||
cmd->rsc_id, cmd->action, time_sum, timeout_left, delay); | ||
|
||
} else { | ||
crm_notice("%s %s failed: %s: Re-scheduling (remaining " | ||
"timeout %s) " QB_XS | ||
" exitstatus=%d elapsed=%dms start_delay=%dms)", | ||
cmd->rsc_id, cmd->action, | ||
crm_exit_str(cmd->result.exit_status), | ||
pcmk__readable_interval(timeout_left), | ||
cmd->result.exit_status, time_sum, delay); | ||
} | ||
if (delay >= timeout_left && timeout_left > 20) { | ||
delay = timeout_left/2; | ||
} | ||
|
||
cmd_reset(cmd); | ||
if(rsc) { | ||
rsc->active = NULL; | ||
} | ||
schedule_lrmd_cmd(rsc, cmd); | ||
delay = QB_MIN(2000, delay); | ||
if (delay < timeout_left) { | ||
cmd->start_delay = delay; | ||
cmd->timeout = timeout_left; | ||
|
||
/* Don't finalize cmd, we're not done with it yet */ | ||
return; | ||
if (pcmk__result_ok(&(cmd->result))) { | ||
crm_debug("%s %s may still be in progress: re-scheduling (elapsed=%dms, remaining=%dms, start_delay=%dms)", | ||
cmd->rsc_id, cmd->real_action, time_sum, timeout_left, delay); | ||
|
||
} else if (cmd->result.execution_status == PCMK_EXEC_PENDING) { | ||
crm_info("%s %s is still in progress: re-scheduling (elapsed=%dms, remaining=%dms, start_delay=%dms)", | ||
cmd->rsc_id, cmd->action, time_sum, timeout_left, delay); | ||
|
||
} else { | ||
crm_notice("Giving up on %s %s (rc=%d): timeout (elapsed=%dms, remaining=%dms)", | ||
cmd->rsc_id, | ||
(cmd->real_action? cmd->real_action : cmd->action), | ||
cmd->result.exit_status, time_sum, timeout_left); | ||
pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR, | ||
PCMK_EXEC_TIMEOUT, | ||
"Investigate reason for timeout, and adjust " | ||
"configured operation timeout if necessary"); | ||
cmd_original_times(cmd); | ||
crm_notice("%s %s failed: %s: Re-scheduling (remaining " | ||
"timeout %s) " QB_XS | ||
" exitstatus=%d elapsed=%dms start_delay=%dms)", | ||
cmd->rsc_id, cmd->action, | ||
crm_exit_str(cmd->result.exit_status), | ||
pcmk__readable_interval(timeout_left), | ||
cmd->result.exit_status, time_sum, delay); | ||
} | ||
|
||
cmd_reset(cmd); | ||
if (rsc) { | ||
rsc->active = NULL; | ||
} | ||
schedule_lrmd_cmd(rsc, cmd); | ||
|
||
/* Don't finalize cmd, we're not done with it yet */ | ||
return; | ||
|
||
} else { | ||
crm_notice("Giving up on %s %s (rc=%d): timeout (elapsed=%dms, remaining=%dms)", | ||
cmd->rsc_id, | ||
(cmd->real_action? cmd->real_action : cmd->action), | ||
cmd->result.exit_status, time_sum, timeout_left); | ||
pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR, | ||
PCMK_EXEC_TIMEOUT, | ||
"Investigate reason for timeout, and adjust " | ||
"configured operation timeout if necessary"); | ||
cmd_original_times(cmd); | ||
} | ||
#endif | ||
|
||
finalize: | ||
pcmk__set_result_output(&(cmd->result), services__grab_stdout(action), | ||
services__grab_stderr(action)); | ||
cmd_finalize(cmd, rsc); | ||
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.