diff --git a/micro-benchmarks/nccl-tests/kubernetes/nccl-tests-gb200.yaml b/micro-benchmarks/nccl-tests/kubernetes/nccl-tests-gb200.yaml index 7ac71fa7e..41c5c7f99 100644 --- a/micro-benchmarks/nccl-tests/kubernetes/nccl-tests-gb200.yaml +++ b/micro-benchmarks/nccl-tests/kubernetes/nccl-tests-gb200.yaml @@ -60,18 +60,6 @@ spec: - NCCL_NVLS_ENABLE=1 - -x - NCCL_MNNVL_ENABLE=1 - - -x - - NCCL_BUFFSIZE=8388608 - - -x - - NCCL_P2P_NET_CHUNKSIZE=524288 - - -x - - NCCL_TUNER_PLUGIN=/opt/aws-ofi-nccl/install/lib/libnccl-ofi-tuner.so - - --mca - - btl - - tcp,self - - --mca - - btl_tcp_if_exclude - - lo,docker0,veth_def_agent - /opt/nccl-tests/build/all_reduce_perf - -b - "8" diff --git a/micro-benchmarks/nccl-tests/kubernetes/nccl-tests.yaml b/micro-benchmarks/nccl-tests/kubernetes/nccl-tests.yaml index e30cbf355..7f22dd678 100644 --- a/micro-benchmarks/nccl-tests/kubernetes/nccl-tests.yaml +++ b/micro-benchmarks/nccl-tests/kubernetes/nccl-tests.yaml @@ -42,21 +42,6 @@ spec: - FI_EFA_FORK_SAFE=1 - -x - NCCL_DEBUG=INFO - - -x - - NCCL_BUFFSIZE=8388608 - - -x - - NCCL_P2P_NET_CHUNKSIZE=524288 - - -x - - NCCL_TUNER_PLUGIN=/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu/libnccl-ofi-tuner.so - - --mca - - pml - - ^ucx - - --mca - - btl - - tcp,self - - --mca - - btl_tcp_if_exclude - - lo,docker0,veth_def_agent - /opt/nccl-tests/build/all_reduce_perf - -b - "8" diff --git a/micro-benchmarks/nccl-tests/slurm/nccl-tests-ami.sbatch b/micro-benchmarks/nccl-tests/slurm/nccl-tests-ami.sbatch index 8e7549799..1181cd967 100644 --- a/micro-benchmarks/nccl-tests/slurm/nccl-tests-ami.sbatch +++ b/micro-benchmarks/nccl-tests/slurm/nccl-tests-ami.sbatch @@ -20,16 +20,6 @@ ADDITIONAL_LD_LIBRARY_PATH=${2:-/usr/local/cuda-13.0/lib} mpirun -N 1 bash -c 'echo $(hostname) ➡️ $(cat /sys/devices/virtual/dmi/id/board_asset_tag | tr -d " ")' -### NCCL_BUFFSIZE increase the send queue depth and can turn NCCL communications into non-blocking. -### https://www.usenix.org/system/files/atc23-choi.pdf - -### NCCL_P2P_NET_CHUNKSIZE Improve performance by increasing buffer size for Send/Recv, Gather, Scatter and Alltoall communications -### https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/p2p.html - -### Improve performance for AllReduce by selecting specific protocol and algorithm for specific -### message size and number of ranks. -### More information https://github.com/aws/aws-ofi-nccl/wiki/Algorithm-and-Protocol-Tuner-for-AWS. - # run all_reduce test mpirun -n $((8 * SLURM_JOB_NUM_NODES)) -N 8 \ -x FI_PROVIDER=efa \ @@ -37,11 +27,5 @@ mpirun -n $((8 * SLURM_JOB_NUM_NODES)) -N 8 \ -x LD_LIBRARY_PATH=$ADDITIONAL_LD_LIBRARY_PATH:/opt/amazon/efa/lib:/opt/amazon/openmpi/lib:/opt/amazon/ofi-nccl/lib:/usr/local/lib:/usr/lib:$LD_LIBRARY_PATH \ -x NCCL_DEBUG=INFO \ -x NCCL_SOCKET_IFNAME=^docker,lo,veth \ - -x NCCL_BUFFSIZE=8388608 \ - -x NCCL_P2P_NET_CHUNKSIZE=524288 \ - -x NCCL_TUNER_PLUGIN=/opt/amazon/ofi-nccl/lib/libnccl-ofi-tuner.so \ - --mca pml ^ucx \ - --mca btl tcp,self \ - --mca btl_tcp_if_exclude lo,docker0,veth_def_agent \ --bind-to none ${ALL_REDUCE_BINARY} -b 8 -e 16G -f 2 -g 1 -c 1 -n 100 diff --git a/micro-benchmarks/nccl-tests/slurm/nccl-tests-container.sbatch b/micro-benchmarks/nccl-tests/slurm/nccl-tests-container.sbatch index 98093e7a8..408f78168 100644 --- a/micro-benchmarks/nccl-tests/slurm/nccl-tests-container.sbatch +++ b/micro-benchmarks/nccl-tests/slurm/nccl-tests-container.sbatch @@ -35,18 +35,6 @@ export FI_EFA_FORK_SAFE=1 ## NCCL Environment variables export NCCL_DEBUG=INFO -### Increase the send queue depth and can turn NCCL communications into non-blocking. -### https://www.usenix.org/system/files/atc23-choi.pdf -export NCCL_BUFFSIZE=8388608 -### Improve performance by increasing buffer size for Send/Recv, Gather, Scatter and Alltoall communications -### https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/p2p.html -export NCCL_P2P_NET_CHUNKSIZE=524288 - -### Improve performance for AllReduce by selecting specific protocol and algorithm for specific -### message size and number of ranks. -### More information https://github.com/aws/aws-ofi-nccl/wiki/Algorithm-and-Protocol-Tuner-for-AWS. -export NCCL_TUNER_PLUGIN=/opt/amazon/ofi-nccl/lib/$(uname -m)-linux-gnu/libnccl-ofi-tuner.so - declare -a ARGS=( --container-image $IMAGE diff --git a/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/nccl-tests-ami.sbatch b/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/nccl-tests-ami.sbatch index a301c12e4..181640f45 100644 --- a/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/nccl-tests-ami.sbatch +++ b/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/nccl-tests-ami.sbatch @@ -140,7 +140,4 @@ mpirun -n $((SLURM_NTASKS_PER_NODE * SLURM_JOB_NUM_NODES)) -N $SLURM_NTASKS_PER_ -x NCCL_TESTS_SPLIT_MASK=${SPLIT_MASK} \ ${NCCL_DEBUG_FLAG} \ ${HOSTFILE_OPTS} \ - --mca pml ^ucx \ - --mca btl tcp,self \ - --mca btl_tcp_if_exclude lo,docker0,veth_def_agent \ --bind-to none ${TEST_BINARY} -b 8 -e 16G -f 2 -g 1 -c 1 -n 100