Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ ignore =
W503,
# N818: exception name should be named with an Error suffix
N818
# B042: Exception class with `__init__` should pass all args to `super().__init__()` in order to work with `copy.copy()`.
# Affected by false positive, https://github.com/PyCQA/flake8-bugbear/issues/525
B042
exclude =
.tox,
.git,
Expand Down
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@ aws-parallelcluster-node CHANGELOG

This file is used to list changes made in each version of the aws-parallelcluster-node package.

3.15.0
------

**CHANGES**
- Direct users to the slurm_resume log to see EC2 error codes if no instances are launched.

3.14.0
------

Expand Down
3 changes: 2 additions & 1 deletion src/slurm_plugin/clustermgtd.py
Original file line number Diff line number Diff line change
Expand Up @@ -1262,7 +1262,8 @@ def _reset_timeout_expired_compute_resources(
return
log.info(
"The following compute resources are in down state due to insufficient capacity: %s, "
"compute resources will be reset after insufficient capacity timeout (%s seconds) expired",
"compute resources will be reset after insufficient capacity timeout (%s seconds) expired. "
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[Test] Can we reflect this change in the corresponding unit test, the same way you did for the resume script?

"Check the slurm_resume log for EC2 error codes.",
self._insufficient_capacity_compute_resources,
self._config.insufficient_capacity_timeout,
)
Expand Down
6 changes: 5 additions & 1 deletion src/slurm_plugin/resume.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,11 @@ def _resume(arg_nodes, resume_config, slurm_resume):
print_with_count(failed_nodes),
)
for error_code, node_list in instance_manager.failed_nodes.items():
_handle_failed_nodes(node_list, reason=f"(Code:{error_code})Failure when resuming nodes")
_handle_failed_nodes(
node_list,
reason=f"(Code:{error_code})Failure when resuming nodes - "
f"Check the slurm_resume log for EC2 error codes",
)

event_publisher = ClusterEventPublisher.create_with_default_publisher(
event_logger,
Expand Down
7 changes: 7 additions & 0 deletions tests/slurm_plugin/test_clustermgtd.py
Original file line number Diff line number Diff line change
Expand Up @@ -3533,6 +3533,13 @@ def test_reset_timeout_expired_compute_resources(
assert_that(cluster_manager._insufficient_capacity_compute_resources).is_equal_to(
expected_insufficient_capacity_compute_resources
)

if expected_insufficient_capacity_compute_resources:
assert (
"compute resources will be reset after insufficient capacity timeout (20 seconds) expired. "
"Check the slurm_resume log for EC2 error codes."
) in caplog.text

if expected_power_save_node_list:
power_save_mock.assert_called_with(
expected_power_save_node_list, reason="Enabling node since insufficient capacity timeout expired"
Expand Down
6 changes: 5 additions & 1 deletion tests/slurm_plugin/test_resume.py
Original file line number Diff line number Diff line change
Expand Up @@ -448,7 +448,11 @@ def test_resume_launch(
if expected_failed_nodes:
for error_code, nodeset in expected_failed_nodes.items():
mock_handle_failed_nodes_calls.append(
call(nodeset, reason=f"(Code:{error_code})Failure when resuming nodes")
call(
nodeset,
reason=f"(Code:{error_code})Failure when resuming nodes - "
f"Check the slurm_resume log for EC2 error codes",
)
)
mock_handle_failed_nodes.assert_has_calls(mock_handle_failed_nodes_calls)
mock_terminate_instances.assert_called_with(ANY, mock_resume_config.terminate_max_batch_size)
Expand Down