Skip to content

Commit 7d13c38

Browse files
himani2411 and Himani Anil Deshpande authored
[Gb200]Skip Fabric Manager installation on Gb200 instance (#3015)
* Revert "[Fabric] Install NVIDIA Fabric manager for ARM instances (#3014)"

  This reverts commit f516bba.

* [Fabric] We do not enable Nvidia Fabric manager for Gb200 instance

---------

Co-authored-by: Himani Anil Deshpande <[email protected]>
1 parent f516bba commit 7d13c38

File tree

7 files changed, with 89 additions and 83 deletions.

CHANGELOG.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ This file is used to list changes made in each version of the AWS ParallelCluste
3131
- Remove `berkshelf`. All cookbooks are local and do not need `berkshelf` dependency management.
3232
- Add support for GB200 instance types.
3333
- Install nvidia-imex for all OSs except AL2.
34-
- Install nvidia-fabricmanager for ARM instances for all OSs except AL2.
3534

3635
**BUG FIXES**
3736
- Fix a race condition in CloudWatch Agent startup that could cause nodes bootstrap failures.

cookbooks/aws-parallelcluster-platform/libraries/nvidia.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,3 +34,7 @@ def get_device_ids
3434
# NVSwitch device id is 10de:2941 for P6e instance
3535
{ 'a100' => '10de:1af1', 'h100' => '10de:22a3', 'b200' => '10de:2901', 'gb200' => '10de:2941' }
3636
end
37+
38+
def is_gb200_node?
39+
get_nvswitch_count(get_device_ids['gb200']) > 1
40+
end

cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_amazon2.rb

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,3 @@ def fabric_manager_version
2828
def platform
2929
'rhel7'
3030
end
31-
32-
def _fabric_manager_enabled
33-
!arm_instance? && _nvidia_enabled
34-
end

cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131

3232
action :configure do
3333
# Start nvidia fabric manager on NVSwitch enabled systems
34-
if get_nvswitches > 1
34+
if get_nvswitches > 1 && !is_gb200_node?
3535
service 'nvidia-fabricmanager' do
3636
action %i(start enable)
3737
supports status: true
@@ -40,7 +40,8 @@
4040
end
4141

4242
def _fabric_manager_enabled
43-
_nvidia_enabled
43+
# NVIDIA Fabric Manager not present on ARM
44+
!arm_instance? && _nvidia_enabled
4445
end
4546

4647
def _nvidia_enabled

cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
action :configure do
3232
return unless imex_installed? && node['cluster']['node_type'] == "ComputeFleet"
3333
# Start nvidia-imex on p6e-gb200 and only on ComputeFleet
34-
if get_nvswitch_count(get_device_ids['gb200']) > 1 || enable_force_configuration?
34+
if is_gb200_node? || enable_force_configuration?
3535
# For each Compute Resource, we generate a unique NVIDIA IMEX configuration file,
3636
# if one doesn't already exist in a common, shared location.
3737
template nvidia_imex_nodes_conf_file do

cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb

Lines changed: 80 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -118,55 +118,45 @@ def self.configure(chef_run)
118118
end
119119

120120
describe 'fabric_manager:_fabric_manager_enabled' do
121-
for_all_oses do |platform, version|
122-
context "on #{platform}#{version}" do
123-
context 'when on arm' do
124-
cached(:chef_run) do
125-
allow_any_instance_of(Object).to receive(:arm_instance?).and_return(true)
126-
ChefSpec::SoloRunner.new(step_into: ['fabric_manager'], platform: platform, version: version)
127-
end
128-
cached(:resource) do
129-
ConvergeFabricManager.setup(chef_run, nvidia_enabled: true)
130-
chef_run.find_resource('fabric_manager', 'setup')
131-
end
132-
if platform == 'amazon' && version == '2'
133-
it "is not enabled" do
134-
expect(resource._fabric_manager_enabled).to eq(false)
135-
end
136-
else
137-
it "is enabled" do
138-
expect(resource._fabric_manager_enabled).to eq(true)
139-
end
140-
end
141-
end
121+
context 'when on arm' do
122+
cached(:chef_run) do
123+
allow_any_instance_of(Object).to receive(:arm_instance?).and_return(true)
124+
ChefSpec::SoloRunner.new(step_into: ['fabric_manager'])
125+
end
126+
cached(:resource) do
127+
ConvergeFabricManager.setup(chef_run, nvidia_enabled: true)
128+
chef_run.find_resource('fabric_manager', 'setup')
129+
end
130+
it "is not enabled" do
131+
expect(resource._fabric_manager_enabled).to eq(false)
132+
end
133+
end
142134

143-
context 'when not on arm' do
144-
cached(:chef_run) do
145-
allow_any_instance_of(Object).to receive(:arm_instance?).and_return(false)
146-
ChefSpec::SoloRunner.new(step_into: ['fabric_manager'])
147-
end
135+
context 'when not on arm' do
136+
cached(:chef_run) do
137+
allow_any_instance_of(Object).to receive(:arm_instance?).and_return(false)
138+
ChefSpec::SoloRunner.new(step_into: ['fabric_manager'])
139+
end
148140

149-
context 'when nvidia enabled' do
150-
cached(:resource) do
151-
ConvergeFabricManager.setup(chef_run, nvidia_enabled: true)
152-
chef_run.find_resource('fabric_manager', 'setup')
153-
end
141+
context 'when nvidia enabled' do
142+
cached(:resource) do
143+
ConvergeFabricManager.setup(chef_run, nvidia_enabled: true)
144+
chef_run.find_resource('fabric_manager', 'setup')
145+
end
154146

155-
it "is enabled" do
156-
expect(resource._fabric_manager_enabled).to eq(true)
157-
end
158-
end
147+
it "is enabled" do
148+
expect(resource._fabric_manager_enabled).to eq(true)
149+
end
150+
end
159151

160-
context 'when nvidia not enabled' do
161-
cached(:resource) do
162-
ConvergeFabricManager.setup(chef_run, nvidia_enabled: false)
163-
chef_run.find_resource('fabric_manager', 'setup')
164-
end
152+
context 'when nvidia not enabled' do
153+
cached(:resource) do
154+
ConvergeFabricManager.setup(chef_run, nvidia_enabled: false)
155+
chef_run.find_resource('fabric_manager', 'setup')
156+
end
165157

166-
it "is not enabled" do
167-
expect(resource._fabric_manager_enabled).to eq(false)
168-
end
169-
end
158+
it "is not enabled" do
159+
expect(resource._fabric_manager_enabled).to eq(false)
170160
end
171161
end
172162
end
@@ -227,44 +217,60 @@ def self.configure(chef_run)
227217
end
228218

229219
describe 'fabric_manager:configure' do
220+
let(:output_of_shell) { double('shell_out') }
230221
cached(:nvidia_driver_version) { 'nvidia_driver_version' }
231-
232-
for_all_oses do |platform, version|
233-
context "on #{platform}#{version}" do
234-
cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-535' : 'nvidia-fabric-manager' }
235-
cached(:fabric_manager_version) { platform == 'ubuntu' ? "#{nvidia_driver_version}" : nvidia_driver_version }
236-
237-
context('when nvswithes are > 1') do
238-
cached(:chef_run) do
239-
stubs_for_resource('fabric_manager') do |res|
240-
allow(res).to receive(:get_nvswitches).and_return(2)
222+
[true, false].each do |is_gb200|
223+
for_all_oses do |platform, version|
224+
context "on #{platform}#{version} on #{is_gb200} gb200 node" do
225+
cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-535' : 'nvidia-fabric-manager' }
226+
cached(:fabric_manager_version) { platform == 'ubuntu' ? "#{nvidia_driver_version}" : nvidia_driver_version }
227+
228+
context('when nvswithes are > 1') do
229+
cached(:chef_run) do
230+
stubs_for_provider('fabric_manager') do |res|
231+
allow(res).to receive(:get_nvswitches).and_return(2)
232+
allow(res).to receive(:get_device_ids).and_return({ 'gb200' => 'test' })
233+
allow(res).to receive(:is_gb200_node?).and_return(is_gb200)
234+
allow(res).to receive(:shell_out).and_return(output_of_shell)
235+
end
236+
runner = runner(platform: platform, version: version, step_into: ['fabric_manager'])
237+
ConvergeFabricManager.configure(runner)
241238
end
242-
runner = runner(platform: platform, version: version, step_into: ['fabric_manager'])
243-
ConvergeFabricManager.configure(runner)
244-
end
245239

246-
it 'configures fabric manager' do
247-
is_expected.to configure_fabric_manager('configure')
248-
end
240+
it 'configures fabric manager' do
241+
is_expected.to configure_fabric_manager('configure')
242+
end
249243

250-
it 'starts nvidia-fabricmanager service' do
251-
is_expected.to start_service('nvidia-fabricmanager')
252-
.with_action(%i(start enable))
253-
.with_supports({ status: true })
244+
if is_gb200
245+
it 'does not start nvidia-fabricmanager service' do
246+
is_expected.not_to start_service('nvidia-fabricmanager')
247+
.with_action(%i(start enable))
248+
.with_supports({ status: true })
249+
end
250+
else
251+
it 'starts nvidia-fabricmanager service' do
252+
is_expected.to start_service('nvidia-fabricmanager')
253+
.with_action(%i(start enable))
254+
.with_supports({ status: true })
255+
end
256+
end
254257
end
255-
end
256258

257-
context('when nvswithes are not > 1') do
258-
cached(:chef_run) do
259-
stubs_for_resource('fabric_manager') do |res|
260-
allow(res).to receive(:get_nvswitches).and_return(1)
259+
context('when nvswithes are not > 1') do
260+
cached(:chef_run) do
261+
stubs_for_provider('fabric_manager[configure]') do |res|
262+
allow(res).to receive(:get_nvswitches).and_return(1)
263+
allow(res).to receive(:get_device_ids).and_return({ 'gb200' => 'test' })
264+
allow(res).to receive(:is_gb200_node?).and_return(is_gb200)
265+
allow(res).to receive(:shell_out).and_return(output_of_shell)
266+
end
267+
runner = runner(platform: platform, version: version, step_into: ['fabric_manager'])
268+
ConvergeFabricManager.configure(runner)
261269
end
262-
runner = runner(platform: platform, version: version, step_into: ['fabric_manager'])
263-
ConvergeFabricManager.configure(runner)
264-
end
265270

266-
it "doesn't start nvidia-fabricmanager service" do
267-
is_expected.not_to start_service('nvidia-fabricmanager')
271+
it "doesn't start nvidia-fabricmanager service" do
272+
is_expected.not_to start_service('nvidia-fabricmanager')
273+
end
268274
end
269275
end
270276
end

cookbooks/aws-parallelcluster-platform/test/controls/nvidia_fabric_manager_spec.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
# See the License for the specific language governing permissions and limitations under the License.
1111

1212
control 'tag:install_expected_versions_of_nvidia_fabric_manager_installed' do
13-
only_if { !os_properties.arm? && ['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) && !os_properties.alinux2? }
13+
only_if { !os_properties.arm? && ['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) }
1414

1515
describe package(node['cluster']['nvidia']['fabricmanager']['package']) do
1616
it { should be_installed }

0 commit comments

Comments
 (0)