Skip to content

Commit 6e3c1c5

Browse files
author
Himani Anil Deshpande
committed
[GB200] Skip IMEX configuration file creation if the shared directory does not exist
* The directory is created as part of AMI creation. * Skip starting the IMEX service if the service file does not exist.
1 parent 253fc5f commit 6e3c1c5

File tree

2 files changed

+142
-103
lines changed

2 files changed

+142
-103
lines changed

cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
group 'root'
4141
mode '0755'
4242
action :create_if_missing
43+
only_if { Dir.exist?(node['cluster']['nvidia']['imex']['shared_dir']) }
4344
end
4445

4546
template nvidia_imex_main_conf_file do
@@ -49,6 +50,7 @@
4950
mode '0755'
5051
action :create_if_missing
5152
variables(imex_nodes_config_file_path: nvidia_imex_nodes_conf_file)
53+
only_if { Dir.exist?(node['cluster']['nvidia']['imex']['shared_dir']) }
5254
end
5355

5456
template "/etc/systemd/system/#{nvidia_imex_service}.service" do
@@ -58,11 +60,13 @@
5860
mode '0644'
5961
action :create
6062
variables(imex_main_config_file_path: nvidia_imex_main_conf_file)
63+
only_if { Dir.exist?(node['cluster']['nvidia']['imex']['shared_dir']) }
6164
end
6265

6366
service nvidia_imex_service do # enables and starts the IMEX systemd service
6467
action %i(enable start)
6568
supports status: true
69+
only_if { ::File.exist?("/etc/systemd/system/#{nvidia_imex_service}.service") } # skip when the unit file is absent; its template is itself guarded on the shared dir existing
6670
end
6771
end
6872
end

cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb

Lines changed: 138 additions & 103 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
nvidia_version = "1.2.3"
44
SOURCE_DIR = 'SOURCE_DIR'.freeze
55
nvidia_imex_shared_dir = "SHARED_DIR/nvidia-imex"
6+
imex_service_file = "/etc/systemd/system/nvidia-imex.service"
67
imex_binary = '/usr/bin/nvidia-imex'
78
imex_ctl_binary = '/usr/bin/nvidia-imex-ctl'
89
queue_name = 'queue-name'
@@ -296,118 +297,152 @@ def self.configure(chef_run)
296297

297298
describe 'nvidia_imex:configure' do
298299
[%w(false), [false], %w(no), %w(true), [true], %w(yes)].each do |force_indicator|
299-
for_all_oses do |platform, version|
300-
context "on #{platform}#{version} with force_configuration #{force_indicator}" do
301-
context "when nvidia-imex binary is not installed" do
302-
cached(:chef_run) do
303-
stubs_for_resource('nvidia_imex') do |res|
304-
allow(res).to receive(:imex_installed?).and_return(false)
305-
end
306-
runner = runner(platform: platform, version: version, step_into: ['nvidia_imex'])
307-
ConvergeNvidiaImex.configure(runner)
308-
end
309-
cached(:node) { chef_run.node }
310-
311-
it 'does not configure nvidia-imex' do
312-
is_expected.not_to configure_nvidia_imex('nvidia-imex')
313-
end
314-
end
315-
316-
%w(HeadNode LoginNode ComputeFleet).each do |node_type|
317-
context "when get_nvswitch_count > 1 on #{node_type} node" do
318-
cached(:chef_run) do
319-
stubs_for_provider('nvidia_imex[configure]') do |pro|
320-
allow(pro).to receive(:imex_installed?).and_return(true)
321-
allow(pro).to receive(:get_device_ids).and_return({ 'gb200' => 'test' })
322-
allow(pro).to receive(:get_nvswitch_count).with('test').and_return(4)
323-
allow(pro).to receive(:enable_force_configuration?).and_return(force_indicator)
300+
[true, false].each do |shared_dir_exists|
301+
[true, false].each do |imex_service_file_exists|
302+
for_all_oses do |platform, version|
303+
context "on #{platform}#{version} with force_configuration #{force_indicator} with shared_dir existence #{shared_dir_exists}" do
304+
context "when nvidia-imex binary is not installed" do
305+
cached(:chef_run) do
306+
stubs_for_resource('nvidia_imex') do |res|
307+
allow(res).to receive(:imex_installed?).and_return(false)
308+
allow(Dir).to receive(:exist?).with(nvidia_imex_shared_dir).and_return(shared_dir_exists)
309+
allow(File).to receive(:exist?).with(imex_service_file).and_return(imex_service_file_exists)
310+
end
311+
runner = runner(platform: platform, version: version, step_into: ['nvidia_imex'])
312+
ConvergeNvidiaImex.configure(runner)
324313
end
325-
runner(platform: platform, version: version, step_into: ['nvidia_imex'])
326-
end
327-
cached(:node) { chef_run.node }
314+
cached(:node) { chef_run.node }
328315

329-
before do
330-
chef_run.node.override['cluster']['region'] = 'aws_region'
331-
chef_run.node.override['cluster']['nvidia']['imex']['force_configuration'] = force_indicator
332-
chef_run.node.override['cluster']['nvidia']['imex']['shared_dir'] = nvidia_imex_shared_dir
333-
chef_run.node.override['cluster']['node_type'] = node_type
334-
chef_run.node.override['cluster']['scheduler_queue_name'] = queue_name
335-
chef_run.node.override['cluster']['scheduler_compute_resource_name'] = compute_resource_name
336-
337-
ConvergeNvidiaImex.configure(chef_run)
338-
end
339-
340-
if (platform == 'amazon' && version == '2') || %w(HeadNode LoginNode).include?(node_type)
341316
it 'does not configure nvidia-imex' do
342-
is_expected.not_to create_if_missing_template("#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg")
343-
.with(source: 'nvidia-imex/nvidia-imex-nodes.erb')
344-
.with(user: 'root')
345-
.with(group: 'root')
346-
.with(mode: '0755')
347-
is_expected.not_to create_if_missing_template("#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg")
348-
.with(source: 'nvidia-imex/nvidia-imex-config.erb')
349-
.with(user: 'root')
350-
.with(group: 'root')
351-
.with(mode: '0755')
352-
.with(variables: { imex_nodes_config_file_path: "#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg" })
353-
is_expected.not_to create_template("/etc/systemd/system/nvidia-imex.service")
354-
.with(source: 'nvidia-imex/nvidia-imex.service.erb')
355-
.with(user: 'root')
356-
.with(group: 'root')
357-
.with(mode: '0644')
358-
.with(variables: { imex_main_config_file_path: "#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg" })
359-
is_expected.not_to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true })
360-
end
361-
else
362-
it 'it starts nvidia-imex service' do
363-
is_expected.to create_if_missing_template("#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg")
364-
.with(source: 'nvidia-imex/nvidia-imex-nodes.erb')
365-
.with(user: 'root')
366-
.with(group: 'root')
367-
.with(mode: '0755')
368-
is_expected.to create_if_missing_template("#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg")
369-
.with(source: 'nvidia-imex/nvidia-imex-config.erb')
370-
.with(user: 'root')
371-
.with(group: 'root')
372-
.with(mode: '0755')
373-
.with(variables: { imex_nodes_config_file_path: "#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg" })
374-
is_expected.to create_template("/etc/systemd/system/nvidia-imex.service")
375-
.with(source: 'nvidia-imex/nvidia-imex.service.erb')
376-
.with(user: 'root')
377-
.with(group: 'root')
378-
.with(mode: '0644')
379-
.with(variables: { imex_main_config_file_path: "#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg" })
380-
is_expected.to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true })
317+
is_expected.not_to configure_nvidia_imex('nvidia-imex')
381318
end
382319
end
383-
end
384-
end
385320

386-
context "when get_nvswitch_count <= 1" do
387-
cached(:chef_run) do
388-
stubs_for_provider('nvidia_imex[configure]') do |pro|
389-
allow(pro).to receive(:imex_installed?).and_return(true)
390-
allow(pro).to receive(:get_device_ids).and_return({ 'gb200' => 'test' })
391-
allow(pro).to receive(:get_nvswitch_count).with('test').and_return(1)
392-
allow(pro).to receive(:enable_force_configuration?).and_return(force_indicator)
321+
%w(HeadNode LoginNode ComputeFleet).each do |node_type|
322+
context "when get_nvswitch_count > 1 on #{node_type} node" do
323+
cached(:chef_run) do
324+
stubs_for_provider('nvidia_imex[configure]') do |pro|
325+
allow(pro).to receive(:imex_installed?).and_return(true)
326+
allow(pro).to receive(:get_device_ids).and_return({ 'gb200' => 'test' })
327+
allow(pro).to receive(:get_nvswitch_count).with('test').and_return(4)
328+
allow(pro).to receive(:enable_force_configuration?).and_return(force_indicator)
329+
allow(Dir).to receive(:exist?).with(nvidia_imex_shared_dir).and_return(shared_dir_exists)
330+
allow(File).to receive(:exist?).with(imex_service_file).and_return(imex_service_file_exists)
331+
end
332+
runner(platform: platform, version: version, step_into: ['nvidia_imex'])
333+
end
334+
cached(:node) { chef_run.node }
335+
336+
before do
337+
chef_run.node.override['cluster']['region'] = 'aws_region'
338+
chef_run.node.override['cluster']['nvidia']['imex']['force_configuration'] = force_indicator
339+
chef_run.node.override['cluster']['nvidia']['imex']['shared_dir'] = nvidia_imex_shared_dir
340+
chef_run.node.override['cluster']['node_type'] = node_type
341+
chef_run.node.override['cluster']['scheduler_queue_name'] = queue_name
342+
chef_run.node.override['cluster']['scheduler_compute_resource_name'] = compute_resource_name
343+
344+
ConvergeNvidiaImex.configure(chef_run)
345+
end
346+
347+
if (platform == 'amazon' && version == '2') || %w(HeadNode LoginNode).include?(node_type)
348+
it 'does not configure nvidia-imex' do
349+
is_expected.not_to create_if_missing_template("#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg")
350+
.with(source: 'nvidia-imex/nvidia-imex-nodes.erb')
351+
.with(user: 'root')
352+
.with(group: 'root')
353+
.with(mode: '0755')
354+
is_expected.not_to create_if_missing_template("#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg")
355+
.with(source: 'nvidia-imex/nvidia-imex-config.erb')
356+
.with(user: 'root')
357+
.with(group: 'root')
358+
.with(mode: '0755')
359+
.with(variables: { imex_nodes_config_file_path: "#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg" })
360+
is_expected.not_to create_template(imex_service_file)
361+
.with(source: 'nvidia-imex/nvidia-imex.service.erb')
362+
.with(user: 'root')
363+
.with(group: 'root')
364+
.with(mode: '0644')
365+
.with(variables: { imex_main_config_file_path: "#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg" })
366+
is_expected.not_to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true })
367+
end
368+
else
369+
it 'it starts nvidia-imex service' do
370+
if shared_dir_exists
371+
is_expected.to create_if_missing_template("#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg")
372+
.with(source: 'nvidia-imex/nvidia-imex-nodes.erb')
373+
.with(user: 'root')
374+
.with(group: 'root')
375+
.with(mode: '0755')
376+
is_expected.to create_if_missing_template("#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg")
377+
.with(source: 'nvidia-imex/nvidia-imex-config.erb')
378+
.with(user: 'root')
379+
.with(group: 'root')
380+
.with(mode: '0755')
381+
.with(variables: { imex_nodes_config_file_path: "#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg" })
382+
is_expected.to create_template(imex_service_file)
383+
.with(source: 'nvidia-imex/nvidia-imex.service.erb')
384+
.with(user: 'root')
385+
.with(group: 'root')
386+
.with(mode: '0644')
387+
.with(variables: { imex_main_config_file_path: "#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg" })
388+
else
389+
is_expected.not_to create_if_missing_template("#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg")
390+
.with(source: 'nvidia-imex/nvidia-imex-nodes.erb')
391+
.with(user: 'root')
392+
.with(group: 'root')
393+
.with(mode: '0755')
394+
is_expected.not_to create_if_missing_template("#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg")
395+
.with(source: 'nvidia-imex/nvidia-imex-config.erb')
396+
.with(user: 'root')
397+
.with(group: 'root')
398+
.with(mode: '0755')
399+
.with(variables: { imex_nodes_config_file_path: "#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg" })
400+
is_expected.not_to create_template(imex_service_file)
401+
.with(source: 'nvidia-imex/nvidia-imex.service.erb')
402+
.with(user: 'root')
403+
.with(group: 'root')
404+
.with(mode: '0644')
405+
.with(variables: { imex_main_config_file_path: "#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg" })
406+
end
407+
if imex_service_file_exists
408+
is_expected.to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true })
409+
else
410+
is_expected.not_to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true })
411+
end
412+
end
413+
end
414+
end
393415
end
394-
runner = runner(platform: platform, version: version, step_into: ['nvidia_imex'])
395-
ConvergeNvidiaImex.configure(runner)
396-
end
397-
cached(:node) { chef_run.node }
398416

399-
before do
400-
chef_run.node.override['cluster']['region'] = 'aws_region'
401-
chef_run.node.override['cluster']['nvidia']['imex']['force_configuration'] = force_indicator
402-
end
417+
context "when get_nvswitch_count <= 1" do
418+
cached(:chef_run) do
419+
stubs_for_provider('nvidia_imex[configure]') do |pro|
420+
allow(pro).to receive(:imex_installed?).and_return(true)
421+
allow(pro).to receive(:get_device_ids).and_return({ 'gb200' => 'test' })
422+
allow(pro).to receive(:get_nvswitch_count).with('test').and_return(1)
423+
allow(pro).to receive(:enable_force_configuration?).and_return(force_indicator)
424+
allow(Dir).to receive(:exist?).with(nvidia_imex_shared_dir).and_return(shared_dir_exists)
425+
allow(File).to receive(:exist?).with(imex_service_file).and_return(imex_service_file_exists)
426+
end
427+
runner = runner(platform: platform, version: version, step_into: ['nvidia_imex'])
428+
ConvergeNvidiaImex.configure(runner)
429+
end
430+
cached(:node) { chef_run.node }
403431

404-
if ['true', 'yes', true].include?(force_indicator)
405-
it 'does configure nvidia-imex' do
406-
is_expected.to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true })
407-
end
408-
else
409-
it 'does not configure nvidia-imex' do
410-
is_expected.not_to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true })
432+
before do
433+
chef_run.node.override['cluster']['region'] = 'aws_region'
434+
chef_run.node.override['cluster']['nvidia']['imex']['force_configuration'] = force_indicator
435+
end
436+
437+
if ['true', 'yes', true].include?(force_indicator) && imex_service_file_exists
438+
it 'does configure nvidia-imex' do
439+
is_expected.to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true })
440+
end
441+
else
442+
it 'does not configure nvidia-imex' do
443+
is_expected.not_to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true })
444+
end
445+
end
411446
end
412447
end
413448
end

0 commit comments

Comments
 (0)