Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 76 additions & 0 deletions cime_config/allactive/config_pesall.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2526,4 +2526,80 @@
</pes>
</mach>
</grid>
<!-- PE layouts for the ne256np4 (pg2) atmosphere grid on NERSC GPU
     machines (Perlmutter GPU and its test/dev twins).  Three sizes
     (S/M/L) of the same SCREAM+ELM+MPASSI+MPASO fully coupled layout:
     ATM ranks are spread with stride 16 across the node set while the
     CPU-side components fill the other MPI slots.
     NOTE(review): negative ntasks values appear to follow the CIME
     convention of "task count in nodes" (-64 => 64 nodes' worth of
     tasks), consistent with the "64 nodes"/"128 nodes"/"256 nodes"
     wording in the <comment> elements - confirm against CIME docs. -->
<grid name="a%ne256np4.pg2.*">
<mach name="pm-gpu|muller-gpu|alvarez-gpu">
<pes compset=".*SCREAM.*ELM.*MPASSI.*MPASO.*" pesize="S">
<comment>pm-gpu ne256 for fully coupled cases with ATM on GPU, MPAS on CPU -- WCYCLXX2010 64 nodes, 64x1</comment>
<MAX_MPITASKS_PER_NODE>64</MAX_MPITASKS_PER_NODE>
<ntasks>
<ntasks_atm>256</ntasks_atm>
<ntasks_cpl>-64</ntasks_cpl>
<ntasks_lnd>-64</ntasks_lnd>
<ntasks_rof>-64</ntasks_rof>
<ntasks_ice>-64</ntasks_ice>
<ntasks_ocn>-64</ntasks_ocn>
</ntasks>
<!-- Stub components pinned at PE 1, presumably to keep them off the
     ATM root rank at PE 0 - confirm. -->
<rootpe>
<rootpe_glc>1</rootpe_glc>
<rootpe_wav>1</rootpe_wav>
<rootpe_esp>1</rootpe_esp>
<rootpe_iac>1</rootpe_iac>
</rootpe>
<!-- ATM placed on every 16th MPI slot; excl_stride_atm mirrors the
     stride, presumably so other components avoid those slots - TODO
     confirm excl_stride semantics against CIME documentation. -->
<pstrid>
<pstrid_atm>16</pstrid_atm>
</pstrid>
<excl_stride>
<excl_stride_atm>16</excl_stride_atm>
</excl_stride>
</pes>
<!-- "M": double of "S" - 512 ATM ranks at stride 16 over 128 nodes. -->
<pes compset=".*SCREAM.*ELM.*MPASSI.*MPASO.*" pesize="M">
<comment>pm-gpu ne256 for fully coupled cases with ATM on GPU, MPAS on CPU -- WCYCLXX2010 128 nodes, 64x1</comment>
<MAX_MPITASKS_PER_NODE>64</MAX_MPITASKS_PER_NODE>
<ntasks>
<ntasks_atm>512</ntasks_atm>
<ntasks_cpl>-128</ntasks_cpl>
<ntasks_lnd>-128</ntasks_lnd>
<ntasks_rof>-128</ntasks_rof>
<ntasks_ice>-128</ntasks_ice>
<ntasks_ocn>-128</ntasks_ocn>
</ntasks>
<rootpe>
<rootpe_glc>1</rootpe_glc>
<rootpe_wav>1</rootpe_wav>
<rootpe_esp>1</rootpe_esp>
<rootpe_iac>1</rootpe_iac>
</rootpe>
<pstrid>
<pstrid_atm>16</pstrid_atm>
</pstrid>
<excl_stride>
<excl_stride_atm>16</excl_stride_atm>
</excl_stride>
</pes>
<!-- "L": double of "M" - 1024 ATM ranks at stride 16 over 256 nodes. -->
<pes compset=".*SCREAM.*ELM.*MPASSI.*MPASO.*" pesize="L">
<comment>pm-gpu ne256 for fully coupled cases with ATM on GPU, MPAS on CPU -- WCYCLXX2010 256 nodes, 64x1</comment>
<MAX_MPITASKS_PER_NODE>64</MAX_MPITASKS_PER_NODE>
<ntasks>
<ntasks_atm>1024</ntasks_atm>
<ntasks_cpl>-256</ntasks_cpl>
<ntasks_lnd>-256</ntasks_lnd>
<ntasks_rof>-256</ntasks_rof>
<ntasks_ice>-256</ntasks_ice>
<ntasks_ocn>-256</ntasks_ocn>
</ntasks>
<rootpe>
<rootpe_glc>1</rootpe_glc>
<rootpe_wav>1</rootpe_wav>
<rootpe_esp>1</rootpe_esp>
<rootpe_iac>1</rootpe_iac>
</rootpe>
<pstrid>
<pstrid_atm>16</pstrid_atm>
</pstrid>
<excl_stride>
<excl_stride_atm>16</excl_stride_atm>
</excl_stride>
</pes>
</mach>
</grid>
</config_pes>
10 changes: 1 addition & 9 deletions cime_config/machines/config_batch.xml
Original file line number Diff line number Diff line change
Expand Up @@ -404,15 +404,7 @@
<directives>
<directive> --constraint=gpu</directive>
</directives>
<directives COMPSET="!.*MMF.*" compiler="gnugpu">
<directive> --gpus-per-node=4</directive>
<directive> --gpu-bind=none</directive>
</directives>
<directives COMPSET=".*MMF.*" compiler="gnugpu">
<directive> --gpus-per-task=1</directive>
<directive> --gpu-bind=map_gpu:0,1,2,3</directive>
</directives>
<directives compiler="nvidiagpu">
<directives compiler=".*gpu">
<directive> --gpus-per-node=4</directive>
<directive> --gpu-bind=none</directive>
</directives>
Expand Down
7 changes: 5 additions & 2 deletions cime_config/machines/config_machines.xml
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,7 @@
<arg name="thread_count">-c $SHELL{echo 128/`./xmlquery --value MAX_MPITASKS_PER_NODE`|bc}</arg>
<arg name="binding"> $SHELL{if [ 64 -ge `./xmlquery --value MAX_MPITASKS_PER_NODE` ]; then echo "--cpu-bind=cores"; else echo "--cpu-bind=threads";fi;} </arg>
<arg name="placement"> -m plane=$SHELL{echo `./xmlquery --value MAX_MPITASKS_PER_NODE`}</arg>
<arg name="gpu-bind"> $SHELL{mpn=`./xmlquery --value MAX_MPITASKS_PER_NODE`; if [ 64 -le $mpn ]; then echo $CIMEROOT/../cime_config/machines/scripts/pm-gpu_set_affinity_npergpu.sh $mpn; fi;} </arg>
</arguments>
</mpirun>
<module_system type="module" allow_error="true">
Expand Down Expand Up @@ -726,7 +727,8 @@
<arg name="thread_count">-c $SHELL{echo 128/`./xmlquery --value MAX_MPITASKS_PER_NODE`|bc}</arg>
<arg name="binding"> $SHELL{if [ 64 -ge `./xmlquery --value MAX_MPITASKS_PER_NODE` ]; then echo "--cpu-bind=cores"; else echo "--cpu-bind=threads";fi;} </arg>
<arg name="placement"> -m plane=$SHELL{echo `./xmlquery --value MAX_MPITASKS_PER_NODE`}</arg>
</arguments>
<arg name="gpu-bind"> $SHELL{mpn=`./xmlquery --value MAX_MPITASKS_PER_NODE`; if [ 64 -le $mpn ]; then echo $CIMEROOT/../cime_config/machines/scripts/pm-gpu_set_affinity_npergpu.sh $mpn; fi;} </arg>
</arguments>
</mpirun>
<module_system type="module" allow_error="true">
<init_path lang="perl">/opt/cray/pe/lmod/8.7.19/init/perl</init_path>
Expand Down Expand Up @@ -1084,7 +1086,8 @@
<arg name="thread_count">-c $SHELL{echo 128/`./xmlquery --value MAX_MPITASKS_PER_NODE`|bc}</arg>
<arg name="binding"> $SHELL{if [ 64 -ge `./xmlquery --value MAX_MPITASKS_PER_NODE` ]; then echo "--cpu-bind=cores"; else echo "--cpu-bind=threads";fi;} </arg>
<arg name="placement"> -m plane=$SHELL{echo `./xmlquery --value MAX_MPITASKS_PER_NODE`}</arg>
</arguments>
<arg name="gpu-bind"> $SHELL{mpn=`./xmlquery --value MAX_MPITASKS_PER_NODE`; if [ 64 -le $mpn ]; then echo $CIMEROOT/../cime_config/machines/scripts/pm-gpu_set_affinity_npergpu.sh $mpn; fi;} </arg>
</arguments>
</mpirun>
<module_system type="module" allow_error="true">
<init_path lang="perl">/opt/cray/pe/lmod/8.7.19/init/perl</init_path>
Expand Down
41 changes: 41 additions & 0 deletions cime_config/machines/scripts/pm-gpu_set_affinity_npergpu.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/bin/bash

# Runtime launcher wrapper designed to enforce round-robin GPU affinity for high-density MPI jobs (e.g., MPN=64 on a 4-GPU node).
# It ensures optimal resource sharing and prevents device contention by partitioning MPI ranks into subgroups per device.

# example with mpn=tasks_per_node=64 or 64 MPI's per node:
#+------------------+--------------------------+--------------+
#| Local Rank Range | Logic: (Rank / 16) % 4 | Assigned GPU |
#+------------------+--------------------------+--------------+
#| 00 - 15 | 0 / 16 ... 15 / 16 = 0 | dev0 |
#| 16 - 31 | 16 / 16 ... 31 / 16 = 1 | dev1 |
#| 32 - 47 | 32 / 16 ... 47 / 16 = 2 | dev2 |
#| 48 - 63 | 48 / 16 ... 63 / 16 = 3 | dev3 |
#+------------------+--------------------------+--------------+

# Get total MPI tasks per node from first argument
tasks_per_node=$1

# Dynamically detect the number of GPUs on this node
num_gpus=$(nvidia-smi -L | wc -l)
#num_gpus=4

# Calculate how many tasks share each GPU
# If 64 tasks and 4 GPUs, tasks_per_gpu = 16
tasks_per_gpu=$(( ${tasks_per_node} / ${num_gpus} ))

# Use 0 if SLURM_LOCALID is not set
local_id=${SLURM_LOCALID:-0}

# Assign GPU based on Local Rank
# The modulo (%) handles edge cases if tasks_per_node isn't perfectly divisible
gpu=$(( (${local_id} / ${tasks_per_gpu}) % ${num_gpus} ))

export CUDA_VISIBLE_DEVICES=$gpu

#printf '?RANK= %s LOCAL_RANK= %s gpu= %s?\n' ${SLURM_PROCID} ${SLURM_LOCALID} ${gpu}
#echo "num_gpus=${num_gpus} Rank ${SLURM_PROCID} (Local ${SLURM_LOCALID}) assigned to GPU ${gpu}"

# Clean up arguments and launch the application
shift
exec "$@"
Loading