diff --git a/README.md b/README.md index 23690dce..00d5b25a 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ CycleCloud Slurm Clusters in Azure This project sets up an auto-scaling Slurm cluster Slurm is a highly configurable open source workload manager. See the [Slurm project site](https://www.schedmd.com/) for an overview. # Table of Contents: -1. [Managing Slurm Clusters in 4.0.4](#managing-slurm-clusters) +1. [Managing Slurm Clusters in 4.0.5](#managing-slurm-clusters) 1. [Making Cluster Changes](#making-cluster-changes) 2. [No longer pre-creating execute nodes](#no-longer-pre-creating-execute-nodes) 3. [Creating additional partitions](#creating-additional-partitions) @@ -36,7 +36,7 @@ Slurm is a highly configurable open source workload manager. See the [Slurm proj 8. [Capturing logs and configuration for troubleshooting](#capturing-logs-and-configuration-data-for-troubleshooting) 6. [Contributing](#contributing) --- -## Managing Slurm Clusters in 4.0.4 +## Managing Slurm Clusters in 4.0.5 ### Making Cluster Changes In CycleCloud, cluster changes can be made using the "Edit" dialog from the cluster page in the GUI or from the CycleCloud CLI. Cluster topology changes, such as new partitions, generally require editing and re-importing the cluster template. This can be applied to live, running clusters as well as terminated clusters. It is also possible to import changes as a new Template for future cluster creation via the GUI. @@ -362,12 +362,12 @@ Cyclecloud Slurm clusters now include prolog and epilog scripts to enable and cl ### Setting KeepAlive -Added in 4.0.4: If the KeepAlive attribute is set in the CycleCloud UI, then the azslurmd will add that node's name to the `SuspendExcNodes` attribute via scontrol. Note that it is required that `ReconfigFlags=KeepPowerSaveSettings` is set in the slurm.conf, as is the default as of 4.0.4. Once KeepALive is set back to false, `azslurmd` will then remove this node from `SuspendExcNodes`. 
+Added in 4.0.5: If the KeepAlive attribute is set in the CycleCloud UI, then `azslurmd` will add that node's name to the `SuspendExcNodes` attribute via scontrol. Note that it is required that `ReconfigFlags=KeepPowerSaveSettings` is set in the slurm.conf, as is the default as of 4.0.5. Once KeepAlive is set back to false, `azslurmd` will then remove this node from `SuspendExcNodes`. If a node is added to `SuspendExcNodes` either via `azslurm keep_alive` or via the scontrol command, then `azslurmd` will not remove this node from the `SuspendExcNodes` if KeepAlive is false in CycleCloud. However, if the node is later set to KeepAlive as true in the UI then `azslurmd` will then remove it from `SuspendExcNodes` when the node is set back to KeepAlive is false. ### Slurmrestd -As of version 4.0.4, `slurmrestd` is automatically configured and started on the scheduler node and scheduler-ha node for all Slurm clusters. This REST API service provides programmatic access to Slurm functionality, allowing external applications and tools to interact with the cluster. For more information on the Slurm REST API, see the [official Slurm REST API documentation](https://slurm.schedmd.com/rest_api.html). +As of version 4.0.5, `slurmrestd` is automatically configured and started on the scheduler node and scheduler-ha node for all Slurm clusters. This REST API service provides programmatic access to Slurm functionality, allowing external applications and tools to interact with the cluster. For more information on the Slurm REST API, see the [official Slurm REST API documentation](https://slurm.schedmd.com/rest_api.html). 
### Node Health Checks diff --git a/azure-slurm-install/setup.py b/azure-slurm-install/setup.py index 1e27a006..e79d61d7 100644 --- a/azure-slurm-install/setup.py +++ b/azure-slurm-install/setup.py @@ -7,7 +7,7 @@ from setuptools.command.test import Command from setuptools.command.test import test as TestCommand # noqa: N812 -__version__ = "4.0.4" +__version__ = "4.0.5" CWD = os.path.dirname(os.path.abspath(__file__)) diff --git a/azure-slurm/setup.py b/azure-slurm/setup.py index 67733884..b45d1644 100644 --- a/azure-slurm/setup.py +++ b/azure-slurm/setup.py @@ -7,7 +7,7 @@ from setuptools.command.test import Command from setuptools.command.test import test as TestCommand # noqa: N812 -__version__ = "4.0.4" +__version__ = "4.0.5" CWD = os.path.dirname(os.path.abspath(__file__)) diff --git a/azure-slurm/slurmcc/cli.py b/azure-slurm/slurmcc/cli.py index b8a9d706..356b9b7e 100644 --- a/azure-slurm/slurmcc/cli.py +++ b/azure-slurm/slurmcc/cli.py @@ -39,7 +39,7 @@ from . import topology -VERSION = "4.0.4" +VERSION = "4.0.5" def csv_list(x: str) -> List[str]: diff --git a/project.ini b/project.ini index c57ed536..b36f33f0 100644 --- a/project.ini +++ b/project.ini @@ -1,11 +1,11 @@ [project] name = slurm label = Slurm -version = 4.0.4 +version = 4.0.5 type = scheduler [blobs] -Files = azure-slurm-pkg-4.0.4.tar.gz, azure-slurm-install-pkg-4.0.4.tar.gz +Files = azure-slurm-pkg-4.0.5.tar.gz, azure-slurm-install-pkg-4.0.5.tar.gz [spec scheduler] run_list = role[slurm_scheduler_role] diff --git a/specs/default/chef/site-cookbooks/slurm/attributes/default.rb b/specs/default/chef/site-cookbooks/slurm/attributes/default.rb index 31f60f37..928cd441 100644 --- a/specs/default/chef/site-cookbooks/slurm/attributes/default.rb +++ b/specs/default/chef/site-cookbooks/slurm/attributes/default.rb @@ -1,6 +1,6 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
-default[:slurm][:autoscale_version] = "4.0.4" +default[:slurm][:autoscale_version] = "4.0.5" default[:slurm][:version] = "23.11.9-1" default[:slurm][:user][:name] = 'slurm' default[:slurm][:cyclecloud_api] = "cyclecloud_api-8.4.1-py2.py3-none-any.whl" diff --git a/specs/default/chef/site-cookbooks/slurm/metadata.rb b/specs/default/chef/site-cookbooks/slurm/metadata.rb index b8f27936..29132aad 100644 --- a/specs/default/chef/site-cookbooks/slurm/metadata.rb +++ b/specs/default/chef/site-cookbooks/slurm/metadata.rb @@ -4,7 +4,7 @@ license 'All Rights Reserved' description 'Installs/Configures slurm' long_description 'Installs/Configures slurm' -version '4.0.4' +version '4.0.5' chef_version '>= 12.1' if respond_to?(:chef_version) %w{ cuser cshared }.each {|c| depends c} diff --git a/specs/default/cluster-init/files/install-non-scheduler.sh b/specs/default/cluster-init/files/install-non-scheduler.sh index e07ce1dc..3aff8a19 100644 --- a/specs/default/cluster-init/files/install-non-scheduler.sh +++ b/specs/default/cluster-init/files/install-non-scheduler.sh @@ -5,7 +5,7 @@ mode=$1 echo $mode | grep -Eqw "login|execute" || (echo "Usage: $0 [login|execute]" && exit 1) do_install=$(jetpack config slurm.do_install True) -install_pkg=$(jetpack config slurm.install_pkg azure-slurm-install-pkg-4.0.4.tar.gz) +install_pkg=$(jetpack config slurm.install_pkg azure-slurm-install-pkg-4.0.5.tar.gz) slurm_project_name=$(jetpack config slurm.project_name slurm) diff --git a/specs/scheduler/cluster-init/scripts/00-install.sh b/specs/scheduler/cluster-init/scripts/00-install.sh index 262e7d9f..be54465a 100644 --- a/specs/scheduler/cluster-init/scripts/00-install.sh +++ b/specs/scheduler/cluster-init/scripts/00-install.sh @@ -2,8 +2,8 @@ set -e do_install=$(jetpack config slurm.do_install True) -install_pkg=$(jetpack config slurm.install_pkg azure-slurm-install-pkg-4.0.4.tar.gz) -autoscale_pkg=$(jetpack config slurm.autoscale_pkg azure-slurm-pkg-4.0.4.tar.gz) +install_pkg=$(jetpack 
config slurm.install_pkg azure-slurm-install-pkg-4.0.5.tar.gz) +autoscale_pkg=$(jetpack config slurm.autoscale_pkg azure-slurm-pkg-4.0.5.tar.gz) slurm_project_name=$(jetpack config slurm.project_name slurm) find_python3() { diff --git a/templates/slurm-cs.txt b/templates/slurm-cs.txt index d502c7d3..ebc423a2 100644 --- a/templates/slurm-cs.txt +++ b/templates/slurm-cs.txt @@ -22,8 +22,8 @@ Autoscale = $Autoscale [[[configuration]]] - slurm.install_pkg = azure-slurm-install-pkg-4.0.4.tar.gz - slurm.autoscale_pkg = azure-slurm-pkg-4.0.4.tar.gz + slurm.install_pkg = azure-slurm-install-pkg-4.0.5.tar.gz + slurm.autoscale_pkg = azure-slurm-pkg-4.0.5.tar.gz slurm.version = $configuration_slurm_version slurm.accounting.enabled = $configuration_slurm_accounting_enabled @@ -40,7 +40,7 @@ Autoscale = $Autoscale # For fast spin-up after Deallocate, force an immediate re-converge on boot cyclecloud.converge_on_boot = false - [[[cluster-init cyclecloud/slurm:default:4.0.4]]] + [[[cluster-init cyclecloud/slurm:default:4.0.5]]] Optional = true diff --git a/templates/slurm.txt b/templates/slurm.txt index 3a126f7e..1d618e61 100644 --- a/templates/slurm.txt +++ b/templates/slurm.txt @@ -71,7 +71,7 @@ Autoscale = $Autoscale [[[cluster-init cyclecloud/healthagent:default]]] [[[cluster-init cyclecloud/monitoring:default]]] - [[[cluster-init cyclecloud/slurm:default:4.0.4]]] + [[[cluster-init cyclecloud/slurm:default:4.0.5]]] [[[volume boot]]] Size = ${ifThenElse(BootDiskSize > 0, BootDiskSize, undefined)} @@ -126,7 +126,7 @@ Autoscale = $Autoscale [[[cluster-init cyclecloud/healthagent:default]]] [[[cluster-init cyclecloud/monitoring:default]]] - [[[cluster-init cyclecloud/slurm:scheduler:4.0.4]]] + [[[cluster-init cyclecloud/slurm:scheduler:4.0.5]]] [[[network-interface eth0]]] AssociatePublicIpAddress = $UsePublicNetwork @@ -191,7 +191,7 @@ Autoscale = $Autoscale [[[cluster-init cyclecloud/healthagent:default]]] [[[cluster-init cyclecloud/monitoring:default]]] - [[[cluster-init 
cyclecloud/slurm:login:4.0.4]]] + [[[cluster-init cyclecloud/slurm:login:4.0.5]]] [[[configuration]]] slurm.role = login autoscale.enabled = false @@ -209,7 +209,7 @@ Autoscale = $Autoscale [[[cluster-init cyclecloud/healthagent:default]]] [[[cluster-init cyclecloud/monitoring:default]]] - [[[cluster-init cyclecloud/slurm:execute:4.0.4]]] + [[[cluster-init cyclecloud/slurm:execute:4.0.5]]] [[[network-interface eth0]]] AssociatePublicIpAddress = $ExecuteNodesPublic