Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ This file is used to list changes made in each version of the AWS ParallelCluste
- Remove `berkshelf`. All cookbooks are local and do not need `berkshelf` dependency management.
- Add support for GB200 instance types.
- Install nvidia-imex for all OSs except AL2.
- Install nvidia-fabricmanager for ARM instances for all OSs except AL2.

**BUG FIXES**
- Fix a race condition in CloudWatch Agent startup that could cause node bootstrap failures.
Expand Down
4 changes: 4 additions & 0 deletions cookbooks/aws-parallelcluster-platform/libraries/nvidia.rb
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,7 @@ def get_device_ids
# NVSwitch device id is 10de:2941 for P6e instance
{ 'a100' => '10de:1af1', 'h100' => '10de:22a3', 'b200' => '10de:2901', 'gb200' => '10de:2941' }
end

# Tells whether this node counts as a GB200 node.
#
# Reads the 'gb200' entry from get_device_ids and hands it to
# get_nvswitch_count; the node qualifies when that count is greater than 1.
#
# @return the result of comparing the reported count against 1
def is_gb200_node?
  gb200_device_id = get_device_ids['gb200']
  get_nvswitch_count(gb200_device_id) > 1
end
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,3 @@ def fabric_manager_version
# Returns the fixed platform identifier string 'rhel7'.
# NOTE(review): how callers consume this value is not visible here — confirm before changing it.
def platform
'rhel7'
end

# Reports whether fabric manager is enabled for this node.
#
# Always false when arm_instance? is truthy (in which case _nvidia_enabled is
# not consulted); otherwise the answer is whatever _nvidia_enabled returns.
def _fabric_manager_enabled
  return false if arm_instance?

  _nvidia_enabled
end
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@

action :configure do
# Start nvidia fabric manager on NVSwitch enabled systems
if get_nvswitches > 1
if get_nvswitches > 1 && !is_gb200_node?
service 'nvidia-fabricmanager' do
action %i(start enable)
supports status: true
Expand All @@ -40,7 +40,8 @@
end

# Reports whether fabric manager is enabled for this node.
#
# Returns false when arm_instance? is truthy; otherwise returns the value of
# _nvidia_enabled. _nvidia_enabled is evaluated at most once, and not at all
# on ARM, because `&&` short-circuits.
def _fabric_manager_enabled
  # NVIDIA Fabric Manager not present on ARM
  !arm_instance? && _nvidia_enabled
end

def _nvidia_enabled
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
action :configure do
return unless imex_installed? && node['cluster']['node_type'] == "ComputeFleet"
# Start nvidia-imex on p6e-gb200 and only on ComputeFleet
if get_nvswitch_count(get_device_ids['gb200']) > 1 || enable_force_configuration?
if is_gb200_node? || enable_force_configuration?
# For each Compute Resource, we generate a unique NVIDIA IMEX configuration file,
# if one doesn't already exist in a common, shared location.
template nvidia_imex_nodes_conf_file do
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -118,55 +118,45 @@ def self.configure(chef_run)
end

describe 'fabric_manager:_fabric_manager_enabled' do
for_all_oses do |platform, version|
context "on #{platform}#{version}" do
context 'when on arm' do
cached(:chef_run) do
allow_any_instance_of(Object).to receive(:arm_instance?).and_return(true)
ChefSpec::SoloRunner.new(step_into: ['fabric_manager'], platform: platform, version: version)
end
cached(:resource) do
ConvergeFabricManager.setup(chef_run, nvidia_enabled: true)
chef_run.find_resource('fabric_manager', 'setup')
end
if platform == 'amazon' && version == '2'
it "is not enabled" do
expect(resource._fabric_manager_enabled).to eq(false)
end
else
it "is enabled" do
expect(resource._fabric_manager_enabled).to eq(true)
end
end
end
context 'when on arm' do
cached(:chef_run) do
allow_any_instance_of(Object).to receive(:arm_instance?).and_return(true)
ChefSpec::SoloRunner.new(step_into: ['fabric_manager'])
end
cached(:resource) do
ConvergeFabricManager.setup(chef_run, nvidia_enabled: true)
chef_run.find_resource('fabric_manager', 'setup')
end
it "is not enabled" do
expect(resource._fabric_manager_enabled).to eq(false)
end
end

context 'when not on arm' do
cached(:chef_run) do
allow_any_instance_of(Object).to receive(:arm_instance?).and_return(false)
ChefSpec::SoloRunner.new(step_into: ['fabric_manager'])
end
context 'when not on arm' do
cached(:chef_run) do
allow_any_instance_of(Object).to receive(:arm_instance?).and_return(false)
ChefSpec::SoloRunner.new(step_into: ['fabric_manager'])
end

context 'when nvidia enabled' do
cached(:resource) do
ConvergeFabricManager.setup(chef_run, nvidia_enabled: true)
chef_run.find_resource('fabric_manager', 'setup')
end
context 'when nvidia enabled' do
cached(:resource) do
ConvergeFabricManager.setup(chef_run, nvidia_enabled: true)
chef_run.find_resource('fabric_manager', 'setup')
end

it "is enabled" do
expect(resource._fabric_manager_enabled).to eq(true)
end
end
it "is enabled" do
expect(resource._fabric_manager_enabled).to eq(true)
end
end

context 'when nvidia not enabled' do
cached(:resource) do
ConvergeFabricManager.setup(chef_run, nvidia_enabled: false)
chef_run.find_resource('fabric_manager', 'setup')
end
context 'when nvidia not enabled' do
cached(:resource) do
ConvergeFabricManager.setup(chef_run, nvidia_enabled: false)
chef_run.find_resource('fabric_manager', 'setup')
end

it "is not enabled" do
expect(resource._fabric_manager_enabled).to eq(false)
end
end
it "is not enabled" do
expect(resource._fabric_manager_enabled).to eq(false)
end
end
end
Expand Down Expand Up @@ -227,44 +217,60 @@ def self.configure(chef_run)
end

describe 'fabric_manager:configure' do
let(:output_of_shell) { double('shell_out') }
cached(:nvidia_driver_version) { 'nvidia_driver_version' }

for_all_oses do |platform, version|
context "on #{platform}#{version}" do
cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-535' : 'nvidia-fabric-manager' }
cached(:fabric_manager_version) { platform == 'ubuntu' ? "#{nvidia_driver_version}" : nvidia_driver_version }

context('when nvswithes are > 1') do
cached(:chef_run) do
stubs_for_resource('fabric_manager') do |res|
allow(res).to receive(:get_nvswitches).and_return(2)
[true, false].each do |is_gb200|
for_all_oses do |platform, version|
context "on #{platform}#{version} on #{is_gb200} gb200 node" do
cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-535' : 'nvidia-fabric-manager' }
cached(:fabric_manager_version) { platform == 'ubuntu' ? "#{nvidia_driver_version}" : nvidia_driver_version }

context('when nvswithes are > 1') do
cached(:chef_run) do
stubs_for_provider('fabric_manager') do |res|
allow(res).to receive(:get_nvswitches).and_return(2)
allow(res).to receive(:get_device_ids).and_return({ 'gb200' => 'test' })
allow(res).to receive(:is_gb200_node?).and_return(is_gb200)
allow(res).to receive(:shell_out).and_return(output_of_shell)
end
runner = runner(platform: platform, version: version, step_into: ['fabric_manager'])
ConvergeFabricManager.configure(runner)
end
runner = runner(platform: platform, version: version, step_into: ['fabric_manager'])
ConvergeFabricManager.configure(runner)
end

it 'configures fabric manager' do
is_expected.to configure_fabric_manager('configure')
end
it 'configures fabric manager' do
is_expected.to configure_fabric_manager('configure')
end

it 'starts nvidia-fabricmanager service' do
is_expected.to start_service('nvidia-fabricmanager')
.with_action(%i(start enable))
.with_supports({ status: true })
if is_gb200
it 'does not start nvidia-fabricmanager service' do
is_expected.not_to start_service('nvidia-fabricmanager')
.with_action(%i(start enable))
.with_supports({ status: true })
end
else
it 'starts nvidia-fabricmanager service' do
is_expected.to start_service('nvidia-fabricmanager')
.with_action(%i(start enable))
.with_supports({ status: true })
end
end
end
end

context('when nvswithes are not > 1') do
cached(:chef_run) do
stubs_for_resource('fabric_manager') do |res|
allow(res).to receive(:get_nvswitches).and_return(1)
context('when nvswithes are not > 1') do
cached(:chef_run) do
stubs_for_provider('fabric_manager[configure]') do |res|
allow(res).to receive(:get_nvswitches).and_return(1)
allow(res).to receive(:get_device_ids).and_return({ 'gb200' => 'test' })
allow(res).to receive(:is_gb200_node?).and_return(is_gb200)
allow(res).to receive(:shell_out).and_return(output_of_shell)
end
runner = runner(platform: platform, version: version, step_into: ['fabric_manager'])
ConvergeFabricManager.configure(runner)
end
runner = runner(platform: platform, version: version, step_into: ['fabric_manager'])
ConvergeFabricManager.configure(runner)
end

it "doesn't start nvidia-fabricmanager service" do
is_expected.not_to start_service('nvidia-fabricmanager')
it "doesn't start nvidia-fabricmanager service" do
is_expected.not_to start_service('nvidia-fabricmanager')
end
end
end
end
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
# See the License for the specific language governing permissions and limitations under the License.

control 'tag:install_expected_versions_of_nvidia_fabric_manager_installed' do
only_if { !os_properties.arm? && ['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) && !os_properties.alinux2? }
only_if { !os_properties.arm? && ['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) }

describe package(node['cluster']['nvidia']['fabricmanager']['package']) do
it { should be_installed }
Expand Down
Loading