Skip to content

Commit 253fc5f

Browse files
committed
[B200] Add support for P6-B200 instance type.
In particular, this consists of installing InfiniBand packages, loading InfiniBand kernel modules at boot time, and installing the NVIDIA NVLink Subnet Manager.
1 parent 7bf075c commit 253fc5f

File tree

15 files changed

+527
-3
lines changed

15 files changed

+527
-3
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste
88

99
**ENHANCEMENTS**
1010
- Add support for P6e-GB200 instances. ParallelCluster sets up Slurm topology plugin to handle P6e-GB200 UltraServers. See limitations section for important additional setup requirements.
11+
- Add support for P6-B200 instances for all OSs except AL2.
1112
- Add `build-image` support for Amazon Linux 2023 AMIs based on kernel 6.12 (in addition to 6.1).
1213

1314
**LIMITATIONS**

cookbooks/aws-parallelcluster-platform/attributes/platform.rb

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@
2727
default['cluster']['nvidia']['imex']['shared_dir'] = "#{node['cluster']['shared_dir']}/nvidia-imex"
2828
default['cluster']['nvidia']['imex']['force_configuration'] = false
2929

30+
# NVIDIA NVLSM
31+
default['cluster']['nvidia']['nvlsm']['enabled'] = true
32+
3033
# DCV
3134
default['cluster']['dcv']['authenticator']['user'] = "dcvextauth"
3235
default['cluster']['dcv']['authenticator']['user_id'] = node['cluster']['reserved_base_uid'] + 3
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
ib_umad

cookbooks/aws-parallelcluster-platform/libraries/nvidia.rb

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,12 @@ def get_nvswitch_count(device_id)
2727
end
2828

2929
def get_device_ids
30-
# A100 (P4), H100(P5), B200(P6) and GB200()p6e) systems have NVSwitches
30+
# A100 (P4), H100(P5), B200(P6) and GB200(p6e) systems have NVSwitches
3131
# NVSwitch device id is 10de:1af1 for P4 instance
3232
# NVSwitch device id is 10de:22a3 for P5 instance
33+
# NVSwitch device id is 10de:2901 for P6 instance
3334
# NVSwitch device id is 10de:2941 for P6e instance
34-
{ 'a100' => '10de:1af1', 'h100' => '10de:22a3', 'gb200' => '10de:2941' }
35+
{ 'a100' => '10de:1af1', 'h100' => '10de:22a3', 'b200' => '10de:2901', 'gb200' => '10de:2941' }
3536
end
3637

3738
def is_gb200_node?

cookbooks/aws-parallelcluster-platform/recipes/install/nvidia_install.rb

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121

2222
gdrcopy 'Install Nvidia gdrcopy'
2323

24+
nvidia_nvlsm 'Install Nvidia NVLink Subnet Manager'
25+
2426
fabric_manager 'Install Nvidia Fabric Manager'
2527

2628
nvidia_dcgm 'install Nvidia datacenter-gpu-manager'
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# frozen_string_literal: true
2+
3+
# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License").
6+
# You may not use this file except in compliance with the License.
7+
# A copy of the License is located at
8+
#
9+
# http://aws.amazon.com/apache2.0/
10+
#
11+
# or in the "LICENSE.txt" file accompanying this file.
12+
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
13+
# See the License for the specific language governing permissions and limitations under the License.
14+
15+
provides :nvidia_nvlsm, platform: 'amazon' do |node|
16+
node['platform_version'].to_i == 2023
17+
end
18+
19+
use 'partial/_nvidia_nvlsm_common.rb'
20+
use 'partial/_nvidia_nvlsm_rhel.rb'
21+
22+
def platform
23+
"amzn#{node['platform_version'].to_i}"
24+
end
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# frozen_string_literal: true
2+
3+
# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License").
6+
# You may not use this file except in compliance with the License.
7+
# A copy of the License is located at
8+
#
9+
# http://aws.amazon.com/apache2.0/
10+
#
11+
# or in the "LICENSE.txt" file accompanying this file.
12+
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
13+
# See the License for the specific language governing permissions and limitations under the License.
14+
15+
provides :nvidia_nvlsm, platform: 'amazon', platform_version: '2'
16+
17+
use 'partial/_nvidia_nvlsm_common.rb'
18+
use 'partial/_nvidia_nvlsm_rhel.rb'
19+
20+
def nvlsm_installation_enabled?
21+
# NVLSM is not supported on Amazon Linux 2.
22+
false
23+
end
24+
25+
def platform
26+
"amzn#{node['platform_version'].to_i}"
27+
end
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# frozen_string_literal: true
2+
3+
# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License").
6+
# You may not use this file except in compliance with the License.
7+
# A copy of the License is located at
8+
#
9+
# http://aws.amazon.com/apache2.0/
10+
#
11+
# or in the "LICENSE.txt" file accompanying this file.
12+
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
13+
# See the License for the specific language governing permissions and limitations under the License.
14+
15+
provides :nvidia_nvlsm, platform: 'redhat' do |node|
16+
node['platform_version'].to_i >= 8
17+
end
18+
19+
use 'partial/_nvidia_nvlsm_common.rb'
20+
use 'partial/_nvidia_nvlsm_rhel.rb'
21+
22+
def platform
23+
"rhel#{node['platform_version'].to_i}"
24+
end
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# frozen_string_literal: true
2+
3+
# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License").
6+
# You may not use this file except in compliance with the License.
7+
# A copy of the License is located at
8+
#
9+
# http://aws.amazon.com/apache2.0/
10+
#
11+
# or in the "LICENSE.txt" file accompanying this file.
12+
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
13+
# See the License for the specific language governing permissions and limitations under the License.
14+
15+
provides :nvidia_nvlsm, platform: 'rocky' do |node|
16+
node['platform_version'].to_i >= 8
17+
end
18+
19+
use 'partial/_nvidia_nvlsm_common.rb'
20+
use 'partial/_nvidia_nvlsm_rhel.rb'
21+
22+
def platform
23+
"rhel#{node['platform_version'].to_i}"
24+
end
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# frozen_string_literal: true
2+
3+
# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License").
6+
# You may not use this file except in compliance with the License.
7+
# A copy of the License is located at
8+
#
9+
# http://aws.amazon.com/apache2.0/
10+
#
11+
# or in the "LICENSE.txt" file accompanying this file.
12+
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
13+
# See the License for the specific language governing permissions and limitations under the License.
14+
15+
provides :nvidia_nvlsm, platform: 'ubuntu' do |node|
16+
node['platform_version'].to_i >= 22
17+
end
18+
19+
use 'partial/_nvidia_nvlsm_common.rb'
20+
use 'partial/_nvidia_nvlsm_debian.rb'
21+
22+
def platform
23+
"ubuntu#{node['platform_version'].delete('.')}"
24+
end

0 commit comments

Comments
 (0)