diff --git a/cime_config/allactive/config_pesall.xml b/cime_config/allactive/config_pesall.xml
index d313390ca4f3..520a5bbd3382 100644
--- a/cime_config/allactive/config_pesall.xml
+++ b/cime_config/allactive/config_pesall.xml
@@ -2526,4 +2526,80 @@
+
+
+
+ pm-gpu ne256 for fully coupled cases with ATM on GPU, MPAS on CPU -- WCYCLXX2010 64 nodes, 64x1
+ 64
+
+ 256
+ -64
+ -64
+ -64
+ -64
+ -64
+
+
+ 1
+ 1
+ 1
+ 1
+
+
+ 16
+
+
+ 16
+
+
+
+ pm-gpu ne256 for fully coupled cases with ATM on GPU, MPAS on CPU -- WCYCLXX2010 128 nodes, 64x1
+ 64
+
+ 512
+ -128
+ -128
+ -128
+ -128
+ -128
+
+
+ 1
+ 1
+ 1
+ 1
+
+
+ 16
+
+
+ 16
+
+
+
+ pm-gpu ne256 for fully coupled cases with ATM on GPU, MPAS on CPU -- WCYCLXX2010 256 nodes, 64x1
+ 64
+
+ 1024
+ -256
+ -256
+ -256
+ -256
+ -256
+
+
+ 1
+ 1
+ 1
+ 1
+
+
+ 16
+
+
+ 16
+
+
+
+
diff --git a/cime_config/machines/config_batch.xml b/cime_config/machines/config_batch.xml
index d5809e2fd9c3..75a1c866fc43 100644
--- a/cime_config/machines/config_batch.xml
+++ b/cime_config/machines/config_batch.xml
@@ -404,15 +404,7 @@
--constraint=gpu
-
- --gpus-per-node=4
- --gpu-bind=none
-
-
- --gpus-per-task=1
- --gpu-bind=map_gpu:0,1,2,3
-
-
+
--gpus-per-node=4
--gpu-bind=none
diff --git a/cime_config/machines/config_machines.xml b/cime_config/machines/config_machines.xml
index b4d86cb7e861..8fe3816a909f 100644
--- a/cime_config/machines/config_machines.xml
+++ b/cime_config/machines/config_machines.xml
@@ -367,6 +367,7 @@
-c $SHELL{echo 128/`./xmlquery --value MAX_MPITASKS_PER_NODE`|bc}
$SHELL{if [ 64 -ge `./xmlquery --value MAX_MPITASKS_PER_NODE` ]; then echo "--cpu-bind=cores"; else echo "--cpu-bind=threads";fi;}
-m plane=$SHELL{echo `./xmlquery --value MAX_MPITASKS_PER_NODE`}
+ $SHELL{mpn=`./xmlquery --value MAX_MPITASKS_PER_NODE`; if [ 64 -le $mpn ]; then echo $CIMEROOT/../cime_config/machines/scripts/pm-gpu_set_affinity_npergpu.sh $mpn; fi;}
@@ -726,7 +727,8 @@
-c $SHELL{echo 128/`./xmlquery --value MAX_MPITASKS_PER_NODE`|bc}
$SHELL{if [ 64 -ge `./xmlquery --value MAX_MPITASKS_PER_NODE` ]; then echo "--cpu-bind=cores"; else echo "--cpu-bind=threads";fi;}
-m plane=$SHELL{echo `./xmlquery --value MAX_MPITASKS_PER_NODE`}
-
+ $SHELL{mpn=`./xmlquery --value MAX_MPITASKS_PER_NODE`; if [ 64 -le $mpn ]; then echo $CIMEROOT/../cime_config/machines/scripts/pm-gpu_set_affinity_npergpu.sh $mpn; fi;}
+
/opt/cray/pe/lmod/8.7.19/init/perl
@@ -1084,7 +1086,8 @@
-c $SHELL{echo 128/`./xmlquery --value MAX_MPITASKS_PER_NODE`|bc}
$SHELL{if [ 64 -ge `./xmlquery --value MAX_MPITASKS_PER_NODE` ]; then echo "--cpu-bind=cores"; else echo "--cpu-bind=threads";fi;}
-m plane=$SHELL{echo `./xmlquery --value MAX_MPITASKS_PER_NODE`}
-
+ $SHELL{mpn=`./xmlquery --value MAX_MPITASKS_PER_NODE`; if [ 64 -le $mpn ]; then echo $CIMEROOT/../cime_config/machines/scripts/pm-gpu_set_affinity_npergpu.sh $mpn; fi;}
+
/opt/cray/pe/lmod/8.7.19/init/perl
diff --git a/cime_config/machines/scripts/pm-gpu_set_affinity_npergpu.sh b/cime_config/machines/scripts/pm-gpu_set_affinity_npergpu.sh
new file mode 100755
index 000000000000..689fbcc0bbcf
--- /dev/null
+++ b/cime_config/machines/scripts/pm-gpu_set_affinity_npergpu.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+# Runtime launcher wrapper designed to enforce round-robin GPU affinity for high-density MPI jobs (e.g., MPN=64 on a 4-GPU node).
+# It ensures optimal resource sharing and prevents device contention by partitioning MPI ranks into subgroups per device.
+
+# example with mpn=tasks_per_node=64 or 64 MPI's per node:
+#+------------------+--------------------------+--------------+
+#| Local Rank Range | Logic: (Rank / 16) % 4   | Assigned GPU |
+#+------------------+--------------------------+--------------+
+#| 00 - 15          | 0 / 16 ... 15 / 16 = 0   | dev0         |
+#| 16 - 31          | 16 / 16 ... 31 / 16 = 1  | dev1         |
+#| 32 - 47          | 32 / 16 ... 47 / 16 = 2  | dev2         |
+#| 48 - 63          | 48 / 16 ... 63 / 16 = 3  | dev3         |
+#+------------------+--------------------------+--------------+
+
+# Get total MPI tasks per node from first argument; default to 1 if missing
+tasks_per_node=${1:-1}
+
+# Dynamically detect the number of GPUs on this node.
+# Fall back to 4 (a pm-gpu node has 4 devices) if nvidia-smi is unavailable or fails.
+num_gpus=$(nvidia-smi -L 2>/dev/null | wc -l)
+if [ "${num_gpus}" -le 0 ]; then num_gpus=4; fi
+
+# Calculate how many tasks share each GPU
+# If 64 tasks and 4 GPUs, tasks_per_gpu = 16
+tasks_per_gpu=$(( tasks_per_node / num_gpus ))
+# Guard: if tasks_per_node < num_gpus the quotient is 0, which would divide by zero below
+if [ "${tasks_per_gpu}" -le 0 ]; then tasks_per_gpu=1; fi
+
+# Use 0 if SLURM_LOCALID is not set
+local_id=${SLURM_LOCALID:-0}
+
+# Assign GPU based on Local Rank
+# The modulo (%) handles edge cases if tasks_per_node isn't perfectly divisible
+gpu=$(( (local_id / tasks_per_gpu) % num_gpus ))
+
+export CUDA_VISIBLE_DEVICES=$gpu
+
+#printf '?RANK= %s LOCAL_RANK= %s gpu= %s?\n' ${SLURM_PROCID} ${SLURM_LOCALID} ${gpu}
+#echo "num_gpus=${num_gpus} Rank ${SLURM_PROCID} (Local ${SLURM_LOCALID}) assigned to GPU ${gpu}"
+
+# Clean up arguments and launch the application
+shift
+exec "$@"