diff --git a/cime_config/allactive/config_pesall.xml b/cime_config/allactive/config_pesall.xml index d313390ca4f3..520a5bbd3382 100644 --- a/cime_config/allactive/config_pesall.xml +++ b/cime_config/allactive/config_pesall.xml @@ -2526,4 +2526,80 @@ + + + + pm-gpu ne256 for fully coupled cases with ATM on GPU, MPAS on CPU -- WCYCLXX2010 64 nodes, 64x1 + 64 + + 256 + -64 + -64 + -64 + -64 + -64 + + + 1 + 1 + 1 + 1 + + + 16 + + + 16 + + + + pm-gpu ne256 for fully coupled cases with ATM on GPU, MPAS on CPU -- WCYCLXX2010 128 nodes, 64x1 + 64 + + 512 + -128 + -128 + -128 + -128 + -128 + + + 1 + 1 + 1 + 1 + + + 16 + + + 16 + + + + pm-gpu ne256 for fully coupled cases with ATM on GPU, MPAS on CPU -- WCYCLXX2010 256 nodes, 64x1 + 64 + + 1024 + -256 + -256 + -256 + -256 + -256 + + + 1 + 1 + 1 + 1 + + + 16 + + + 16 + + + + diff --git a/cime_config/machines/config_batch.xml b/cime_config/machines/config_batch.xml index d5809e2fd9c3..75a1c866fc43 100644 --- a/cime_config/machines/config_batch.xml +++ b/cime_config/machines/config_batch.xml @@ -404,15 +404,7 @@ --constraint=gpu - - --gpus-per-node=4 - --gpu-bind=none - - - --gpus-per-task=1 - --gpu-bind=map_gpu:0,1,2,3 - - + --gpus-per-node=4 --gpu-bind=none diff --git a/cime_config/machines/config_machines.xml b/cime_config/machines/config_machines.xml index b4d86cb7e861..8fe3816a909f 100644 --- a/cime_config/machines/config_machines.xml +++ b/cime_config/machines/config_machines.xml @@ -367,6 +367,7 @@ -c $SHELL{echo 128/`./xmlquery --value MAX_MPITASKS_PER_NODE`|bc} $SHELL{if [ 64 -ge `./xmlquery --value MAX_MPITASKS_PER_NODE` ]; then echo "--cpu-bind=cores"; else echo "--cpu-bind=threads";fi;} -m plane=$SHELL{echo `./xmlquery --value MAX_MPITASKS_PER_NODE`} + $SHELL{mpn=`./xmlquery --value MAX_MPITASKS_PER_NODE`; if [ 64 -le $mpn ]; then echo $CIMEROOT/../cime_config/machines/scripts/pm-gpu_set_affinity_npergpu.sh $mpn; fi;} @@ -726,7 +727,8 @@ -c $SHELL{echo 128/`./xmlquery --value MAX_MPITASKS_PER_NODE`|bc} $SHELL{if [ 
64 -ge `./xmlquery --value MAX_MPITASKS_PER_NODE` ]; then echo "--cpu-bind=cores"; else echo "--cpu-bind=threads";fi;} -m plane=$SHELL{echo `./xmlquery --value MAX_MPITASKS_PER_NODE`} - + $SHELL{mpn=`./xmlquery --value MAX_MPITASKS_PER_NODE`; if [ 64 -le $mpn ]; then echo $CIMEROOT/../cime_config/machines/scripts/pm-gpu_set_affinity_npergpu.sh $mpn; fi;} + /opt/cray/pe/lmod/8.7.19/init/perl @@ -1084,7 +1086,8 @@ -c $SHELL{echo 128/`./xmlquery --value MAX_MPITASKS_PER_NODE`|bc} $SHELL{if [ 64 -ge `./xmlquery --value MAX_MPITASKS_PER_NODE` ]; then echo "--cpu-bind=cores"; else echo "--cpu-bind=threads";fi;} -m plane=$SHELL{echo `./xmlquery --value MAX_MPITASKS_PER_NODE`} - + $SHELL{mpn=`./xmlquery --value MAX_MPITASKS_PER_NODE`; if [ 64 -le $mpn ]; then echo $CIMEROOT/../cime_config/machines/scripts/pm-gpu_set_affinity_npergpu.sh $mpn; fi;} + /opt/cray/pe/lmod/8.7.19/init/perl diff --git a/cime_config/machines/scripts/pm-gpu_set_affinity_npergpu.sh b/cime_config/machines/scripts/pm-gpu_set_affinity_npergpu.sh new file mode 100755 index 000000000000..689fbcc0bbcf --- /dev/null +++ b/cime_config/machines/scripts/pm-gpu_set_affinity_npergpu.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +# Runtime launcher wrapper designed to enforce round-robin GPU affinity for high-density MPI jobs (e.g., MPN=64 on a 4-GPU node). +# It ensures optimal resource sharing and prevents device contention by partitioning MPI ranks into subgroups per device. + +# example with mpn=tasks_per_node=64 or 64 MPI's per node: +#+------------------+--------------------------+--------------+ +#| Local Rank Range | Logic: (Rank / 16) % 4 | Assigned GPU | +#+------------------+--------------------------+--------------+ +#| 00 - 15 | 0 / 16 ... 15 / 16 = 0 | dev0 | +#| 16 - 31 | 16 / 16 ... 31 / 16 = 1 | dev1 | +#| 32 - 47 | 32 / 16 ... 47 / 16 = 2 | dev2 | +#| 48 - 63 | 48 / 16 ... 
# 63 / 16 = 3 | dev3 |
#+------------------+--------------------------+--------------+

# Map a node-local MPI rank onto a GPU device index, round-robin in
# contiguous sub-groups (see table above).
#   $1 = tasks per node, $2 = number of GPUs, $3 = node-local rank
# Prints the assigned device index on stdout.
assign_gpu() {
  local tpn=$1 ngpu=$2 lid=$3 per_gpu
  per_gpu=$(( tpn / ngpu ))
  # Clamp to 1 so that ngpu > tpn (per_gpu == 0) cannot cause a
  # division-by-zero in the rank->device mapping below.
  (( per_gpu > 0 )) || per_gpu=1
  # The modulo (%) handles edge cases if tpn isn't perfectly divisible.
  printf '%d' $(( (lid / per_gpu) % ngpu ))
}

main() {
  # Total MPI tasks per node, passed by the srun command line built in
  # config_machines.xml.
  local tasks_per_node=$1
  shift

  # Dynamically detect the number of GPUs on this node.  Fall back to 4
  # (a pm-gpu node has 4 devices) when nvidia-smi is missing or reports
  # nothing, so the arithmetic below never divides by zero.
  local num_gpus
  num_gpus=$(nvidia-smi -L 2>/dev/null | wc -l)
  (( num_gpus > 0 )) || num_gpus=4

  # Node-local rank; use 0 if SLURM_LOCALID is not set.
  local local_id=${SLURM_LOCALID:-0}

  local gpu
  gpu=$(assign_gpu "$tasks_per_node" "$num_gpus" "$local_id")
  export CUDA_VISIBLE_DEVICES=$gpu

  #printf '?RANK= %s LOCAL_RANK= %s gpu= %s?\n' "${SLURM_PROCID:-}" "$local_id" "$gpu"

  # Launch the wrapped application in place of this shell.
  exec "$@"
}

# Nothing to wrap when invoked with no arguments; otherwise run.
(( $# == 0 )) || main "$@"