Skip to content

Commit 5fd0c47

Browse files
committed
Add error to point user to slurm resume log
(cherry picked from commit 84ec039)
1 parent e45dfd8 commit 5fd0c47

File tree

2 files changed

+6
-2
lines changed

2 files changed

+6
-2
lines changed

src/slurm_plugin/clustermgtd.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1262,7 +1262,8 @@ def _reset_timeout_expired_compute_resources(
12621262
return
12631263
log.info(
12641264
"The following compute resources are in down state due to insufficient capacity: %s, "
1265-
"compute resources will be reset after insufficient capacity timeout (%s seconds) expired",
1265+
"compute resources will be reset after insufficient capacity timeout (%s seconds) expired. "
1266+
"Check the slurm_resume log for ec2 error codes.",
12661267
self._insufficient_capacity_compute_resources,
12671268
self._config.insufficient_capacity_timeout,
12681269
)

src/slurm_plugin/resume.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,10 @@ def _resume(arg_nodes, resume_config, slurm_resume):
227227
print_with_count(failed_nodes),
228228
)
229229
for error_code, node_list in instance_manager.failed_nodes.items():
230-
_handle_failed_nodes(node_list, reason=f"(Code:{error_code})Failure when resuming nodes")
230+
_handle_failed_nodes(
231+
node_list,
232+
reason=f"(Code:{error_code})Failure when resuming nodes - Check the slurm_resume log for ec2 error codes",
233+
)
231234

232235
event_publisher = ClusterEventPublisher.create_with_default_publisher(
233236
event_logger,

0 commit comments

Comments
 (0)