awslabs · KeitaW · May 6, 2026 · May 1, 2026 · paragao · May 1, 2026
diff --git a/micro-benchmarks/nccl-tests/kubernetes/nccl-tests-gb200.yaml b/micro-benchmarks/nccl-tests/kubernetes/nccl-tests-gb200.yaml
@@ -60,18 +60,6 @@ spec:
             - NCCL_NVLS_ENABLE=1
             - -x
             - NCCL_MNNVL_ENABLE=1
-            - -x
-            - NCCL_BUFFSIZE=8388608
-            - -x
-            - NCCL_P2P_NET_CHUNKSIZE=524288
-            - -x
-            - NCCL_TUNER_PLUGIN=/opt/aws-ofi-nccl/install/lib/libnccl-ofi-tuner.so
-            - --mca
-            - btl
-            - tcp,self
-            - --mca
-            - btl_tcp_if_exclude
-            - lo,docker0,veth_def_agent
             - /opt/nccl-tests/build/all_reduce_perf
             - -b
             - "8"

diff --git a/micro-benchmarks/nccl-tests/kubernetes/nccl-tests.yaml b/micro-benchmarks/nccl-tests/kubernetes/nccl-tests.yaml
@@ -42,21 +42,6 @@ spec:
             - FI_EFA_FORK_SAFE=1
             - -x
             - NCCL_DEBUG=INFO
-            - -x
-            - NCCL_BUFFSIZE=8388608
-            - -x
-            - NCCL_P2P_NET_CHUNKSIZE=524288
-            - -x
-            - NCCL_TUNER_PLUGIN=/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu/libnccl-ofi-tuner.so
-            - --mca
-            - pml
-            - ^ucx
-            - --mca
-            - btl
-            - tcp,self
-            - --mca
-            - btl_tcp_if_exclude
-            - lo,docker0,veth_def_agent
             - /opt/nccl-tests/build/all_reduce_perf
             - -b
             - "8"

diff --git a/micro-benchmarks/nccl-tests/slurm/nccl-tests-ami.sbatch b/micro-benchmarks/nccl-tests/slurm/nccl-tests-ami.sbatch
@@ -20,28 +20,12 @@ ADDITIONAL_LD_LIBRARY_PATH=${2:-/usr/local/cuda-13.0/lib}
 mpirun -N 1 bash -c 'echo $(hostname) ➡️ $(cat /sys/devices/virtual/dmi/id/board_asset_tag | tr -d " ")'
 
 
-### NCCL_BUFFSIZE increase the send queue depth and can turn NCCL communications into non-blocking.
-### https://www.usenix.org/system/files/atc23-choi.pdf
-
-### NCCL_P2P_NET_CHUNKSIZE Improve performance by increasing buffer size for Send/Recv, Gather, Scatter and Alltoall communications
-### https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/p2p.html
-
-### Improve performance for AllReduce by selecting specific protocol and algorithm for specific
-### message size and number of ranks.
-### More information https://github.com/aws/aws-ofi-nccl/wiki/Algorithm-and-Protocol-Tuner-for-AWS.
-
 # run all_reduce test
 mpirun -n $((8 * SLURM_JOB_NUM_NODES)) -N 8 \
         -x FI_PROVIDER=efa \
 	-x FI_EFA_FORK_SAFE=1 \
 	-x LD_LIBRARY_PATH=$ADDITIONAL_LD_LIBRARY_PATH:/opt/amazon/efa/lib:/opt/amazon/openmpi/lib:/opt/amazon/ofi-nccl/lib:/usr/local/lib:/usr/lib:$LD_LIBRARY_PATH \
 	-x NCCL_DEBUG=INFO \
 	-x NCCL_SOCKET_IFNAME=^docker,lo,veth \
-	-x NCCL_BUFFSIZE=8388608 \
-	-x NCCL_P2P_NET_CHUNKSIZE=524288 \
-	-x NCCL_TUNER_PLUGIN=/opt/amazon/ofi-nccl/lib/libnccl-ofi-tuner.so \
-	--mca pml ^ucx \
-	--mca btl tcp,self \
-	--mca btl_tcp_if_exclude lo,docker0,veth_def_agent \
 	--bind-to none ${ALL_REDUCE_BINARY} -b 8 -e 16G -f 2 -g 1 -c 1 -n 100
 
diff --git a/micro-benchmarks/nccl-tests/slurm/nccl-tests-container.sbatch b/micro-benchmarks/nccl-tests/slurm/nccl-tests-container.sbatch
@@ -35,18 +35,6 @@ export FI_EFA_FORK_SAFE=1
 ## NCCL Environment variables
 export NCCL_DEBUG=INFO
 
-### Increase the send queue depth and can turn NCCL communications into non-blocking.
-### https://www.usenix.org/system/files/atc23-choi.pdf
-export NCCL_BUFFSIZE=8388608
-### Improve performance by increasing buffer size for Send/Recv, Gather, Scatter and Alltoall communications
-### https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/p2p.html
-export NCCL_P2P_NET_CHUNKSIZE=524288
-
-### Improve performance for AllReduce by selecting specific protocol and algorithm for specific
-### message size and number of ranks.
-### More information https://github.com/aws/aws-ofi-nccl/wiki/Algorithm-and-Protocol-Tuner-for-AWS.
-export NCCL_TUNER_PLUGIN=/opt/amazon/ofi-nccl/lib/$(uname -m)-linux-gnu/libnccl-ofi-tuner.so
-
 
 declare -a ARGS=(
     --container-image $IMAGE

diff --git a/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/nccl-tests-ami.sbatch b/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/nccl-tests-ami.sbatch
@@ -140,7 +140,4 @@ mpirun -n $((SLURM_NTASKS_PER_NODE * SLURM_JOB_NUM_NODES)) -N $SLURM_NTASKS_PER_
         -x NCCL_TESTS_SPLIT_MASK=${SPLIT_MASK} \
         ${NCCL_DEBUG_FLAG} \
         ${HOSTFILE_OPTS} \
-        --mca pml ^ucx \
-        --mca btl tcp,self \
-        --mca btl_tcp_if_exclude lo,docker0,veth_def_agent \
         --bind-to none ${TEST_BINARY} -b 8 -e 16G -f 2 -g 1 -c 1 -n 100