Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 76 additions & 0 deletions cime_config/allactive/config_pesall.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2526,4 +2526,80 @@
</pes>
</mach>
</grid>
<!-- PE layouts for the ne256np4 (pg2) atmosphere grid on NERSC GPU
     machines (Perlmutter GPU and its test/dev twins).  Three sizes
     (S/M/L) of the same SCREAM+ELM+MPASSI+MPASO fully coupled layout:
     ATM ranks are spread with stride 16 across the node set while the
     CPU-side components fill the other MPI slots.
     NOTE(review): negative ntasks values appear to follow the CIME
     convention of "task count in nodes" (-64 => 64 nodes' worth of
     tasks), consistent with the "64 nodes"/"128 nodes"/"256 nodes"
     wording in the <comment> elements - confirm against CIME docs. -->
<grid name="a%ne256np4.pg2.*">
<mach name="pm-gpu|muller-gpu|alvarez-gpu">
<pes compset=".*SCREAM.*ELM.*MPASSI.*MPASO.*" pesize="S">
<comment>pm-gpu ne256 for fully coupled cases with ATM on GPU, MPAS on CPU -- WCYCLXX2010 64 nodes, 64x1</comment>
<MAX_MPITASKS_PER_NODE>64</MAX_MPITASKS_PER_NODE>
<ntasks>
<ntasks_atm>256</ntasks_atm>
<ntasks_cpl>-64</ntasks_cpl>
<ntasks_lnd>-64</ntasks_lnd>
<ntasks_rof>-64</ntasks_rof>
<ntasks_ice>-64</ntasks_ice>
<ntasks_ocn>-64</ntasks_ocn>
</ntasks>
<!-- Stub components pinned at PE 1, presumably to keep them off the
     ATM root rank at PE 0 - confirm. -->
<rootpe>
<rootpe_glc>1</rootpe_glc>
<rootpe_wav>1</rootpe_wav>
<rootpe_esp>1</rootpe_esp>
<rootpe_iac>1</rootpe_iac>
</rootpe>
<!-- ATM placed on every 16th MPI slot; excl_stride_atm mirrors the
     stride, presumably so other components avoid those slots - TODO
     confirm excl_stride semantics against CIME documentation. -->
<pstrid>
<pstrid_atm>16</pstrid_atm>
</pstrid>
<excl_stride>
<excl_stride_atm>16</excl_stride_atm>
</excl_stride>
</pes>
<!-- "M": double of "S" - 512 ATM ranks at stride 16 over 128 nodes. -->
<pes compset=".*SCREAM.*ELM.*MPASSI.*MPASO.*" pesize="M">
<comment>pm-gpu ne256 for fully coupled cases with ATM on GPU, MPAS on CPU -- WCYCLXX2010 128 nodes, 64x1</comment>
<MAX_MPITASKS_PER_NODE>64</MAX_MPITASKS_PER_NODE>
<ntasks>
<ntasks_atm>512</ntasks_atm>
<ntasks_cpl>-128</ntasks_cpl>
<ntasks_lnd>-128</ntasks_lnd>
<ntasks_rof>-128</ntasks_rof>
<ntasks_ice>-128</ntasks_ice>
<ntasks_ocn>-128</ntasks_ocn>
</ntasks>
<rootpe>
<rootpe_glc>1</rootpe_glc>
<rootpe_wav>1</rootpe_wav>
<rootpe_esp>1</rootpe_esp>
<rootpe_iac>1</rootpe_iac>
</rootpe>
<pstrid>
<pstrid_atm>16</pstrid_atm>
</pstrid>
<excl_stride>
<excl_stride_atm>16</excl_stride_atm>
</excl_stride>
</pes>
<!-- "L": double of "M" - 1024 ATM ranks at stride 16 over 256 nodes. -->
<pes compset=".*SCREAM.*ELM.*MPASSI.*MPASO.*" pesize="L">
<comment>pm-gpu ne256 for fully coupled cases with ATM on GPU, MPAS on CPU -- WCYCLXX2010 256 nodes, 64x1</comment>
<MAX_MPITASKS_PER_NODE>64</MAX_MPITASKS_PER_NODE>
<ntasks>
<ntasks_atm>1024</ntasks_atm>
<ntasks_cpl>-256</ntasks_cpl>
<ntasks_lnd>-256</ntasks_lnd>
<ntasks_rof>-256</ntasks_rof>
<ntasks_ice>-256</ntasks_ice>
<ntasks_ocn>-256</ntasks_ocn>
</ntasks>
<rootpe>
<rootpe_glc>1</rootpe_glc>
<rootpe_wav>1</rootpe_wav>
<rootpe_esp>1</rootpe_esp>
<rootpe_iac>1</rootpe_iac>
</rootpe>
<pstrid>
<pstrid_atm>16</pstrid_atm>
</pstrid>
<excl_stride>
<excl_stride_atm>16</excl_stride_atm>
</excl_stride>
</pes>
</mach>
</grid>
</config_pes>
10 changes: 1 addition & 9 deletions cime_config/machines/config_batch.xml
Original file line number Diff line number Diff line change
Expand Up @@ -404,15 +404,7 @@
<directives>
<directive> --constraint=gpu</directive>
</directives>
<directives COMPSET="!.*MMF.*" compiler="gnugpu">
<directive> --gpus-per-node=4</directive>
<directive> --gpu-bind=none</directive>
</directives>
<directives COMPSET=".*MMF.*" compiler="gnugpu">
<directive> --gpus-per-task=1</directive>
<directive> --gpu-bind=map_gpu:0,1,2,3</directive>
</directives>
<directives compiler="nvidiagpu">
<directives compiler=".*gpu">
<directive> --gpus-per-node=4</directive>
<directive> --gpu-bind=none</directive>
</directives>
Expand Down
7 changes: 5 additions & 2 deletions cime_config/machines/config_machines.xml
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,7 @@
<arg name="thread_count">-c $SHELL{echo 128/`./xmlquery --value MAX_MPITASKS_PER_NODE`|bc}</arg>
<arg name="binding"> $SHELL{if [ 64 -ge `./xmlquery --value MAX_MPITASKS_PER_NODE` ]; then echo "--cpu-bind=cores"; else echo "--cpu-bind=threads";fi;} </arg>
<arg name="placement"> -m plane=$SHELL{echo `./xmlquery --value MAX_MPITASKS_PER_NODE`}</arg>
<arg name="gpu-bind"> $SHELL{mpn=`./xmlquery --value MAX_MPITASKS_PER_NODE`; if [ 64 -le $mpn ]; then echo $CIMEROOT/../cime_config/machines/scripts/pm-gpu_set_affinity_npergpu.sh $mpn; fi;} </arg>
</arguments>
</mpirun>
<module_system type="module" allow_error="true">
Expand Down Expand Up @@ -726,7 +727,8 @@
<arg name="thread_count">-c $SHELL{echo 128/`./xmlquery --value MAX_MPITASKS_PER_NODE`|bc}</arg>
<arg name="binding"> $SHELL{if [ 64 -ge `./xmlquery --value MAX_MPITASKS_PER_NODE` ]; then echo "--cpu-bind=cores"; else echo "--cpu-bind=threads";fi;} </arg>
<arg name="placement"> -m plane=$SHELL{echo `./xmlquery --value MAX_MPITASKS_PER_NODE`}</arg>
</arguments>
<arg name="gpu-bind"> $SHELL{mpn=`./xmlquery --value MAX_MPITASKS_PER_NODE`; if [ 64 -le $mpn ]; then echo $CIMEROOT/../cime_config/machines/scripts/pm-gpu_set_affinity_npergpu.sh $mpn; fi;} </arg>
</arguments>
</mpirun>
<module_system type="module" allow_error="true">
<init_path lang="perl">/opt/cray/pe/lmod/8.7.19/init/perl</init_path>
Expand Down Expand Up @@ -1084,7 +1086,8 @@
<arg name="thread_count">-c $SHELL{echo 128/`./xmlquery --value MAX_MPITASKS_PER_NODE`|bc}</arg>
<arg name="binding"> $SHELL{if [ 64 -ge `./xmlquery --value MAX_MPITASKS_PER_NODE` ]; then echo "--cpu-bind=cores"; else echo "--cpu-bind=threads";fi;} </arg>
<arg name="placement"> -m plane=$SHELL{echo `./xmlquery --value MAX_MPITASKS_PER_NODE`}</arg>
</arguments>
<arg name="gpu-bind"> $SHELL{mpn=`./xmlquery --value MAX_MPITASKS_PER_NODE`; if [ 64 -le $mpn ]; then echo $CIMEROOT/../cime_config/machines/scripts/pm-gpu_set_affinity_npergpu.sh $mpn; fi;} </arg>
</arguments>
</mpirun>
<module_system type="module" allow_error="true">
<init_path lang="perl">/opt/cray/pe/lmod/8.7.19/init/perl</init_path>
Expand Down
41 changes: 41 additions & 0 deletions cime_config/machines/scripts/pm-gpu_set_affinity_npergpu.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/bin/bash

# Runtime launcher wrapper designed to enforce round-robin GPU affinity for high-density MPI jobs (e.g., MPN=64 on a 4-GPU node).
# It ensures optimal resource sharing and prevents device contention by partitioning MPI ranks into subgroups per device.

# example with mpn=tasks_per_node=64 or 64 MPI's per node:
#+------------------+--------------------------+--------------+
#| Local Rank Range | Logic: (Rank / 16) % 4 | Assigned GPU |
#+------------------+--------------------------+--------------+
#| 00 - 15 | 0 / 16 ... 15 / 16 = 0 | dev0 |
#| 16 - 31 | 16 / 16 ... 31 / 16 = 1 | dev1 |
#| 32 - 47 | 32 / 16 ... 47 / 16 = 2 | dev2 |
#| 48 - 63 | 48 / 16 ... 63 / 16 = 3 | dev3 |
#+------------------+--------------------------+--------------+

# Get total MPI tasks per node from first argument
tasks_per_node=$1

# Dynamically detect the number of GPUs on this node
num_gpus=$(nvidia-smi -L | wc -l)
#num_gpus=4

# Calculate how many tasks share each GPU
# If 64 tasks and 4 GPUs, tasks_per_gpu = 16
tasks_per_gpu=$(( ${tasks_per_node} / ${num_gpus} ))

# Use 0 if SLURM_LOCALID is not set
local_id=${SLURM_LOCALID:-0}

# Assign GPU based on Local Rank
# The modulo (%) handles edge cases if tasks_per_node isn't perfectly divisible
gpu=$(( (${local_id} / ${tasks_per_gpu}) % ${num_gpus} ))

export CUDA_VISIBLE_DEVICES=$gpu

#printf '?RANK= %s LOCAL_RANK= %s gpu= %s?\n' ${SLURM_PROCID} ${SLURM_LOCALID} ${gpu}
#echo "num_gpus=${num_gpus} Rank ${SLURM_PROCID} (Local ${SLURM_LOCALID}) assigned to GPU ${gpu}"

# Clean up arguments and launch the application
shift
exec "$@"
Loading