Skip to content

Commit 2e9d1fe

Browse files
author
Himani Anil Deshpande
committed
[Gb200] Support IMEX configuration to be local to a node
* Remove creation of the `/opt/parallelcluster/shared/nvidia-imex` directory.
* Keep the default paths `/etc/nvidia-imex/nodes_config.cfg` and `/etc/nvidia-imex/config.cfg` for the IMEX configuration.
* Create `/etc/nvidia-imex/nodes_config.cfg` at configure time only if it is missing, to avoid IMEX start failures.
* Update the unit tests.
1 parent 253fc5f commit 2e9d1fe

File tree

3 files changed

+91
-60
lines changed

3 files changed

+91
-60
lines changed

cookbooks/aws-parallelcluster-platform/attributes/platform.rb

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,9 @@
2424
end
2525

2626
# nvidia-imex
27-
default['cluster']['nvidia']['imex']['shared_dir'] = "#{node['cluster']['shared_dir']}/nvidia-imex"
27+
default['cluster']['nvidia']['imex']['conf_dir'] = "/etc/nvidia-imex"
28+
default['cluster']['nvidia']['imex']['main_config'] = "#{node['cluster']['nvidia']['imex']['conf_dir']}/config.cfg"
29+
default['cluster']['nvidia']['imex']['nodes_config'] = "#{node['cluster']['nvidia']['imex']['conf_dir']}/nodes_config.cfg"
2830
default['cluster']['nvidia']['imex']['force_configuration'] = false
2931

3032
# NVIDIA NVLSM

cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb

Lines changed: 35 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -19,21 +19,50 @@
1919
return unless nvidia_enabled_or_installed?
2020
return if on_docker? || imex_installed? || aws_region.start_with?("us-iso")
2121

22-
directory node['cluster']['nvidia']['imex']['shared_dir']
23-
2422
action_install_imex
23+
24+
action_create_configuration_files
2525
# Save Imex version in Node Attributes for InSpec Tests
2626
node.default['cluster']['nvidia']['imex']['version'] = nvidia_imex_full_version
2727
node.default['cluster']['nvidia']['imex']['package'] = nvidia_imex_package
2828
node_attributes 'dump node attributes'
2929
end
3030

31+
action :create_configuration_files do
32+
# Create or update the IMEX configuration files when ParallelCluster installs IMEX
33+
template nvidia_imex_nodes_conf_file do
34+
source 'nvidia-imex/nvidia-imex-nodes.erb'
35+
owner 'root'
36+
group 'root'
37+
mode '0755'
38+
action :create
39+
end
40+
41+
template nvidia_imex_main_conf_file do
42+
source 'nvidia-imex/nvidia-imex-config.erb'
43+
owner 'root'
44+
group 'root'
45+
mode '0755'
46+
action :create
47+
variables(imex_nodes_config_file_path: nvidia_imex_nodes_conf_file)
48+
end
49+
50+
# Keep the nvidia-imex.service file in this location so that the pcluster-configured service file takes precedence.
51+
template "/etc/systemd/system/#{nvidia_imex_service}.service" do
52+
source 'nvidia-imex/nvidia-imex.service.erb'
53+
owner 'root'
54+
group 'root'
55+
mode '0644'
56+
action :create
57+
variables(imex_main_config_file_path: nvidia_imex_main_conf_file)
58+
end
59+
end
60+
3161
action :configure do
3262
return unless imex_installed? && node['cluster']['node_type'] == "ComputeFleet"
3363
# Start nvidia-imex on p6e-gb200 and only on ComputeFleet
3464
if is_gb200_node? || enable_force_configuration?
35-
# For each Compute Resource, we generate a unique NVIDIA IMEX configuration file,
36-
# if one doesn't already exist in a common, shared location.
65+
# Create the nodes config file if it is missing; otherwise the IMEX service will not start
3766
template nvidia_imex_nodes_conf_file do
3867
source 'nvidia-imex/nvidia-imex-nodes.erb'
3968
owner 'root'
@@ -42,24 +71,6 @@
4271
action :create_if_missing
4372
end
4473

45-
template nvidia_imex_main_conf_file do
46-
source 'nvidia-imex/nvidia-imex-config.erb'
47-
owner 'root'
48-
group 'root'
49-
mode '0755'
50-
action :create_if_missing
51-
variables(imex_nodes_config_file_path: nvidia_imex_nodes_conf_file)
52-
end
53-
54-
template "/etc/systemd/system/#{nvidia_imex_service}.service" do
55-
source 'nvidia-imex/nvidia-imex.service.erb'
56-
owner 'root'
57-
group 'root'
58-
mode '0644'
59-
action :create
60-
variables(imex_main_config_file_path: nvidia_imex_main_conf_file)
61-
end
62-
6374
service nvidia_imex_service do
6475
action %i(enable start)
6576
supports status: true
@@ -92,11 +103,11 @@ def nvidia_enabled_or_installed?
92103
end
93104

94105
def nvidia_imex_main_conf_file
95-
"#{node['cluster']['nvidia']['imex']['shared_dir']}/config_#{node['cluster']['scheduler_queue_name']}_#{node['cluster']['scheduler_compute_resource_name']}.cfg"
106+
"#{node['cluster']['nvidia']['imex']['main_config']}"
96107
end
97108

98109
def nvidia_imex_nodes_conf_file
99-
"#{node['cluster']['nvidia']['imex']['shared_dir']}/nodes_config_#{node['cluster']['scheduler_queue_name']}_#{node['cluster']['scheduler_compute_resource_name']}.cfg"
110+
"#{node['cluster']['nvidia']['imex']['nodes_config']}"
100111
end
101112

102113
def enable_force_configuration?

cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb

Lines changed: 53 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,12 @@
22

33
nvidia_version = "1.2.3"
44
SOURCE_DIR = 'SOURCE_DIR'.freeze
5-
nvidia_imex_shared_dir = "SHARED_DIR/nvidia-imex"
5+
nvidia_imex_dir = "/etc/nvidia-imex"
6+
imex_main_conf_file = "#{nvidia_imex_dir}/config.cfg"
7+
imex_nodes_conf_file = "#{nvidia_imex_dir}/nodes_config.cfg"
8+
imex_service_file = "/etc/systemd/system/nvidia-imex.service"
69
imex_binary = '/usr/bin/nvidia-imex'
710
imex_ctl_binary = '/usr/bin/nvidia-imex-ctl'
8-
queue_name = 'queue-name'
9-
compute_resource_name = 'compute-resource-name'
1011
cluster_artifacts_s3_url = 'https://aws_region-aws-parallelcluster.s3.aws_region.AWS_DOMAIN'
1112

1213
class ConvergeNvidiaImex
@@ -18,6 +19,14 @@ def self.install(chef_run)
1819
end
1920
end
2021

22+
def self.create_configuration_files(chef_run)
23+
chef_run.converge_dsl('aws-parallelcluster-platform') do
24+
nvidia_imex 'create_configuration_files' do
25+
action :create_configuration_files
26+
end
27+
end
28+
end
29+
2130
def self.configure(chef_run)
2231
chef_run.converge_dsl('aws-parallelcluster-platform') do
2332
nvidia_imex 'configure' do
@@ -231,7 +240,7 @@ def self.configure(chef_run)
231240
cached(:node) { chef_run.node }
232241

233242
before do
234-
chef_run.node.override['cluster']['nvidia']['imex']['shared_dir'] = nvidia_imex_shared_dir
243+
chef_run.node.override['cluster']['nvidia']['imex']['shared_dir'] = nvidia_imex_dir
235244
chef_run.node.override['cluster']['artifacts_s3_url'] = cluster_artifacts_s3_url
236245
chef_run.node.override['cluster']['region'] = 'aws_region'
237246
chef_run.node.override['cluster']['sources_dir'] = SOURCE_DIR
@@ -241,7 +250,6 @@ def self.configure(chef_run)
241250
end
242251
if platform == 'amazon' && version == '2'
243252
it 'does not install nvidia-imex' do
244-
is_expected.not_to create_directory(nvidia_imex_shared_dir)
245253
is_expected.not_to install_install_packages('Install nvidia-imex')
246254
.with(packages: "#{nvidia_imex_name}")
247255
.with(action: %i(install))
@@ -254,7 +262,6 @@ def self.configure(chef_run)
254262
else
255263

256264
it 'installs nvidia-imex' do
257-
is_expected.to create_directory(nvidia_imex_shared_dir)
258265
if platform == 'ubuntu'
259266
is_expected.to create_if_missing_remote_file("#{SOURCE_DIR}/#{nvidia_imex_package}-#{nvidia_imex_version}.deb").with(
260267
source: "#{cluster_artifacts_s3_url}/dependencies/nvidia_imex/#{url_suffix}.deb",
@@ -294,6 +301,41 @@ def self.configure(chef_run)
294301
end
295302
end
296303

304+
describe 'nvidia_imex:create_configuration_files' do
305+
for_all_oses do |platform, version|
306+
context "on #{platform}#{version}" do
307+
cached(:chef_run) do
308+
runner = runner(platform: platform, version: version, step_into: ['nvidia_imex'])
309+
runner.node.override['cluster']['nvidia']['imex']['conf_dir'] = nvidia_imex_dir
310+
runner.node.override['cluster']['nvidia']['imex']['main_config'] = imex_main_conf_file
311+
runner.node.override['cluster']['nvidia']['imex']['nodes_config'] = imex_nodes_conf_file
312+
ConvergeNvidiaImex.create_configuration_files(runner)
313+
end
314+
cached(:node) { chef_run.node }
315+
316+
it 'does create Imex configuration files' do
317+
is_expected.to create_template("#{imex_nodes_conf_file}")
318+
.with(source: 'nvidia-imex/nvidia-imex-nodes.erb')
319+
.with(user: 'root')
320+
.with(group: 'root')
321+
.with(mode: '0755')
322+
is_expected.to create_template("#{imex_main_conf_file}")
323+
.with(source: 'nvidia-imex/nvidia-imex-config.erb')
324+
.with(user: 'root')
325+
.with(group: 'root')
326+
.with(mode: '0755')
327+
.with(variables: { imex_nodes_config_file_path: "#{imex_nodes_conf_file}" })
328+
is_expected.to create_template(imex_service_file)
329+
.with(source: 'nvidia-imex/nvidia-imex.service.erb')
330+
.with(user: 'root')
331+
.with(group: 'root')
332+
.with(mode: '0644')
333+
.with(variables: { imex_main_config_file_path: "#{imex_main_conf_file}" })
334+
end
335+
end
336+
end
337+
end
338+
297339
describe 'nvidia_imex:configure' do
298340
[%w(false), [false], %w(no), %w(true), [true], %w(yes)].each do |force_indicator|
299341
for_all_oses do |platform, version|
@@ -329,54 +371,30 @@ def self.configure(chef_run)
329371
before do
330372
chef_run.node.override['cluster']['region'] = 'aws_region'
331373
chef_run.node.override['cluster']['nvidia']['imex']['force_configuration'] = force_indicator
332-
chef_run.node.override['cluster']['nvidia']['imex']['shared_dir'] = nvidia_imex_shared_dir
374+
chef_run.node.override['cluster']['nvidia']['imex']['conf_dir'] = nvidia_imex_dir
375+
chef_run.node.override['cluster']['nvidia']['imex']['main_config'] = imex_main_conf_file
376+
chef_run.node.override['cluster']['nvidia']['imex']['nodes_config'] = imex_nodes_conf_file
333377
chef_run.node.override['cluster']['node_type'] = node_type
334-
chef_run.node.override['cluster']['scheduler_queue_name'] = queue_name
335-
chef_run.node.override['cluster']['scheduler_compute_resource_name'] = compute_resource_name
336378

337379
ConvergeNvidiaImex.configure(chef_run)
338380
end
339381

340382
if (platform == 'amazon' && version == '2') || %w(HeadNode LoginNode).include?(node_type)
341383
it 'does not configure nvidia-imex' do
342-
is_expected.not_to create_if_missing_template("#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg")
384+
is_expected.not_to create_if_missing_template("#{imex_nodes_conf_file}")
343385
.with(source: 'nvidia-imex/nvidia-imex-nodes.erb')
344386
.with(user: 'root')
345387
.with(group: 'root')
346388
.with(mode: '0755')
347-
is_expected.not_to create_if_missing_template("#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg")
348-
.with(source: 'nvidia-imex/nvidia-imex-config.erb')
349-
.with(user: 'root')
350-
.with(group: 'root')
351-
.with(mode: '0755')
352-
.with(variables: { imex_nodes_config_file_path: "#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg" })
353-
is_expected.not_to create_template("/etc/systemd/system/nvidia-imex.service")
354-
.with(source: 'nvidia-imex/nvidia-imex.service.erb')
355-
.with(user: 'root')
356-
.with(group: 'root')
357-
.with(mode: '0644')
358-
.with(variables: { imex_main_config_file_path: "#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg" })
359389
is_expected.not_to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true })
360390
end
361391
else
362392
it 'it starts nvidia-imex service' do
363-
is_expected.to create_if_missing_template("#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg")
393+
is_expected.to create_if_missing_template("#{imex_nodes_conf_file}")
364394
.with(source: 'nvidia-imex/nvidia-imex-nodes.erb')
365395
.with(user: 'root')
366396
.with(group: 'root')
367397
.with(mode: '0755')
368-
is_expected.to create_if_missing_template("#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg")
369-
.with(source: 'nvidia-imex/nvidia-imex-config.erb')
370-
.with(user: 'root')
371-
.with(group: 'root')
372-
.with(mode: '0755')
373-
.with(variables: { imex_nodes_config_file_path: "#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg" })
374-
is_expected.to create_template("/etc/systemd/system/nvidia-imex.service")
375-
.with(source: 'nvidia-imex/nvidia-imex.service.erb')
376-
.with(user: 'root')
377-
.with(group: 'root')
378-
.with(mode: '0644')
379-
.with(variables: { imex_main_config_file_path: "#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg" })
380398
is_expected.to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true })
381399
end
382400
end

0 commit comments

Comments
 (0)