diff --git a/CHANGELOG.md b/CHANGELOG.md index c38550d19..3f77e04ae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,7 +31,6 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Remove `berkshelf`. All cookbooks are local and do not need `berkshelf` dependency management. - Add support for GB200 instance types. - Install nvidia-imex for all OSs except AL2. -- Install nvidia-fabricmanager for ARM instances for all OSs except AL2. **BUG FIXES** - Fix a race condition in CloudWatch Agent startup that could cause nodes bootstrap failures. diff --git a/cookbooks/aws-parallelcluster-platform/libraries/nvidia.rb b/cookbooks/aws-parallelcluster-platform/libraries/nvidia.rb index 62d52e6ea..88765b1a9 100644 --- a/cookbooks/aws-parallelcluster-platform/libraries/nvidia.rb +++ b/cookbooks/aws-parallelcluster-platform/libraries/nvidia.rb @@ -34,3 +34,7 @@ def get_device_ids # NVSwitch device id is 10de:2941 for P6e instance { 'a100' => '10de:1af1', 'h100' => '10de:22a3', 'b200' => '10de:2901', 'gb200' => '10de:2941' } end + +def is_gb200_node? + get_nvswitch_count(get_device_ids['gb200']) > 1 +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_amazon2.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_amazon2.rb index 957d9122c..375dcb02c 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_amazon2.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_amazon2.rb @@ -28,7 +28,3 @@ def fabric_manager_version def platform 'rhel7' end - -def _fabric_manager_enabled - !arm_instance? && _nvidia_enabled -end diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb index 0ee7a33a5..e9d9c6d64 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb @@ -31,7 +31,7 @@ action :configure do # Start nvidia fabric manager on NVSwitch enabled systems - if get_nvswitches > 1 + if get_nvswitches > 1 && !is_gb200_node? service 'nvidia-fabricmanager' do action %i(start enable) supports status: true @@ -40,7 +40,8 @@ end def _fabric_manager_enabled - _nvidia_enabled + # NVIDIA Fabric Manager not present on ARM + !arm_instance? && _nvidia_enabled end def _nvidia_enabled diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb index fbd4c2c6d..d9754af3e 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb @@ -31,7 +31,7 @@ action :configure do return unless imex_installed? && node['cluster']['node_type'] == "ComputeFleet" # Start nvidia-imex on p6e-gb200 and only on ComputeFleet - if get_nvswitch_count(get_device_ids['gb200']) > 1 || enable_force_configuration? + if is_gb200_node? || enable_force_configuration? # For each Compute Resource, we generate a unique NVIDIA IMEX configuration file, # if one doesn't already exist in a common, shared location. template nvidia_imex_nodes_conf_file do diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb index 0f8bf3e84..9fa1ce4fd 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb @@ -118,55 +118,45 @@ def self.configure(chef_run) end describe 'fabric_manager:_fabric_manager_enabled' do - for_all_oses do |platform, version| - context "on #{platform}#{version}" do - context 'when on arm' do - cached(:chef_run) do - allow_any_instance_of(Object).to receive(:arm_instance?).and_return(true) - ChefSpec::SoloRunner.new(step_into: ['fabric_manager'], platform: platform, version: version) - end - cached(:resource) do - ConvergeFabricManager.setup(chef_run, nvidia_enabled: true) - chef_run.find_resource('fabric_manager', 'setup') - end - if platform == 'amazon' && version == '2' - it "is not enabled" do - expect(resource._fabric_manager_enabled).to eq(false) - end - else - it "is enabled" do - expect(resource._fabric_manager_enabled).to eq(true) - end - end - end + context 'when on arm' do + cached(:chef_run) do + allow_any_instance_of(Object).to receive(:arm_instance?).and_return(true) + ChefSpec::SoloRunner.new(step_into: ['fabric_manager']) + end + cached(:resource) do + ConvergeFabricManager.setup(chef_run, nvidia_enabled: true) + chef_run.find_resource('fabric_manager', 'setup') + end + it "is not enabled" do + expect(resource._fabric_manager_enabled).to eq(false) + end + end - context 'when not on arm' do - cached(:chef_run) do - allow_any_instance_of(Object).to receive(:arm_instance?).and_return(false) - ChefSpec::SoloRunner.new(step_into: ['fabric_manager']) - end + context 'when not on arm' do + cached(:chef_run) do + allow_any_instance_of(Object).to receive(:arm_instance?).and_return(false) + ChefSpec::SoloRunner.new(step_into: ['fabric_manager']) + end - context 'when nvidia enabled' do - cached(:resource) do - ConvergeFabricManager.setup(chef_run, nvidia_enabled: true) - chef_run.find_resource('fabric_manager', 'setup') - end + context 'when nvidia enabled' do + cached(:resource) do + ConvergeFabricManager.setup(chef_run, nvidia_enabled: true) + chef_run.find_resource('fabric_manager', 'setup') + end - it "is enabled" do - expect(resource._fabric_manager_enabled).to eq(true) - end - end + it "is enabled" do + expect(resource._fabric_manager_enabled).to eq(true) + end + end - context 'when nvidia not enabled' do - cached(:resource) do - ConvergeFabricManager.setup(chef_run, nvidia_enabled: false) - chef_run.find_resource('fabric_manager', 'setup') - end + context 'when nvidia not enabled' do + cached(:resource) do + ConvergeFabricManager.setup(chef_run, nvidia_enabled: false) + chef_run.find_resource('fabric_manager', 'setup') + end - it "is not enabled" do - expect(resource._fabric_manager_enabled).to eq(false) - end - end + it "is not enabled" do + expect(resource._fabric_manager_enabled).to eq(false) end end end @@ -227,44 +217,60 @@ def self.configure(chef_run) end describe 'fabric_manager:configure' do + let(:output_of_shell) { double('shell_out') } cached(:nvidia_driver_version) { 'nvidia_driver_version' } - - for_all_oses do |platform, version| - context "on #{platform}#{version}" do - cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-535' : 'nvidia-fabric-manager' } - cached(:fabric_manager_version) { platform == 'ubuntu' ? "#{nvidia_driver_version}" : nvidia_driver_version } - - context('when nvswithes are > 1') do - cached(:chef_run) do - stubs_for_resource('fabric_manager') do |res| - allow(res).to receive(:get_nvswitches).and_return(2) + [true, false].each do |is_gb200| + for_all_oses do |platform, version| + context "on #{platform}#{version} on #{is_gb200} gb200 node" do + cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-535' : 'nvidia-fabric-manager' } + cached(:fabric_manager_version) { platform == 'ubuntu' ? "#{nvidia_driver_version}" : nvidia_driver_version } + + context('when nvswithes are > 1') do + cached(:chef_run) do + stubs_for_provider('fabric_manager') do |res| + allow(res).to receive(:get_nvswitches).and_return(2) + allow(res).to receive(:get_device_ids).and_return({ 'gb200' => 'test' }) + allow(res).to receive(:is_gb200_node?).and_return(is_gb200) + allow(res).to receive(:shell_out).and_return(output_of_shell) + end + runner = runner(platform: platform, version: version, step_into: ['fabric_manager']) + ConvergeFabricManager.configure(runner) end - runner = runner(platform: platform, version: version, step_into: ['fabric_manager']) - ConvergeFabricManager.configure(runner) - end - it 'configures fabric manager' do - is_expected.to configure_fabric_manager('configure') - end + it 'configures fabric manager' do + is_expected.to configure_fabric_manager('configure') + end - it 'starts nvidia-fabricmanager service' do - is_expected.to start_service('nvidia-fabricmanager') - .with_action(%i(start enable)) - .with_supports({ status: true }) + if is_gb200 + it 'does not start nvidia-fabricmanager service' do + is_expected.not_to start_service('nvidia-fabricmanager') + .with_action(%i(start enable)) + .with_supports({ status: true }) + end + else + it 'starts nvidia-fabricmanager service' do + is_expected.to start_service('nvidia-fabricmanager') + .with_action(%i(start enable)) + .with_supports({ status: true }) + end + end end - end - context('when nvswithes are not > 1') do - cached(:chef_run) do - stubs_for_resource('fabric_manager') do |res| - allow(res).to receive(:get_nvswitches).and_return(1) + context('when nvswithes are not > 1') do + cached(:chef_run) do + stubs_for_provider('fabric_manager[configure]') do |res| + allow(res).to receive(:get_nvswitches).and_return(1) + allow(res).to receive(:get_device_ids).and_return({ 'gb200' => 'test' }) + allow(res).to receive(:is_gb200_node?).and_return(is_gb200) + allow(res).to receive(:shell_out).and_return(output_of_shell) + end + runner = runner(platform: platform, version: version, step_into: ['fabric_manager']) + ConvergeFabricManager.configure(runner) end - runner = runner(platform: platform, version: version, step_into: ['fabric_manager']) - ConvergeFabricManager.configure(runner) - end - it "doesn't start nvidia-fabricmanager service" do - is_expected.not_to start_service('nvidia-fabricmanager') + it "doesn't start nvidia-fabricmanager service" do + is_expected.not_to start_service('nvidia-fabricmanager') + end end end end diff --git a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_fabric_manager_spec.rb b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_fabric_manager_spec.rb index 1f6653ad7..242ce90e5 100644 --- a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_fabric_manager_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_fabric_manager_spec.rb @@ -10,7 +10,7 @@ # See the License for the specific language governing permissions and limitations under the License. control 'tag:install_expected_versions_of_nvidia_fabric_manager_installed' do - only_if { !os_properties.arm? && ['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) && !os_properties.alinux2? } + only_if { !os_properties.arm? && ['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) } describe package(node['cluster']['nvidia']['fabricmanager']['package']) do it { should be_installed }