Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 0 additions & 12 deletions micro-benchmarks/nccl-tests/kubernetes/nccl-tests-gb200.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -60,18 +60,6 @@ spec:
- NCCL_NVLS_ENABLE=1
- -x
- NCCL_MNNVL_ENABLE=1
- -x
- NCCL_BUFFSIZE=8388608
- -x
- NCCL_P2P_NET_CHUNKSIZE=524288
- -x
- NCCL_TUNER_PLUGIN=/opt/aws-ofi-nccl/install/lib/libnccl-ofi-tuner.so
- --mca
- btl
- tcp,self
- --mca
- btl_tcp_if_exclude
- lo,docker0,veth_def_agent
- /opt/nccl-tests/build/all_reduce_perf
- -b
- "8"
Expand Down
15 changes: 0 additions & 15 deletions micro-benchmarks/nccl-tests/kubernetes/nccl-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,21 +42,6 @@ spec:
- FI_EFA_FORK_SAFE=1
- -x
- NCCL_DEBUG=INFO
- -x
- NCCL_BUFFSIZE=8388608
- -x
- NCCL_P2P_NET_CHUNKSIZE=524288
- -x
- NCCL_TUNER_PLUGIN=/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu/libnccl-ofi-tuner.so
- --mca
- pml
- ^ucx
- --mca
- btl
- tcp,self
- --mca
- btl_tcp_if_exclude
- lo,docker0,veth_def_agent
- /opt/nccl-tests/build/all_reduce_perf
- -b
- "8"
Expand Down
16 changes: 0 additions & 16 deletions micro-benchmarks/nccl-tests/slurm/nccl-tests-ami.sbatch
Original file line number Diff line number Diff line change
Expand Up @@ -20,28 +20,12 @@ ADDITIONAL_LD_LIBRARY_PATH=${2:-/usr/local/cuda-13.0/lib}
mpirun -N 1 bash -c 'echo $(hostname) ➡️ $(cat /sys/devices/virtual/dmi/id/board_asset_tag | tr -d " ")'


### NCCL_BUFFSIZE increases the send queue depth and can make NCCL communications non-blocking.
### https://www.usenix.org/system/files/atc23-choi.pdf

### NCCL_P2P_NET_CHUNKSIZE improves performance by increasing the buffer size for Send/Recv, Gather, Scatter and Alltoall communications.
### https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/p2p.html

### Improve AllReduce performance by selecting a specific protocol and algorithm for a given
### message size and number of ranks.
### More information: https://github.com/aws/aws-ofi-nccl/wiki/Algorithm-and-Protocol-Tuner-for-AWS

# run all_reduce test
mpirun -n $((8 * SLURM_JOB_NUM_NODES)) -N 8 \
-x FI_PROVIDER=efa \
-x FI_EFA_FORK_SAFE=1 \
-x LD_LIBRARY_PATH=$ADDITIONAL_LD_LIBRARY_PATH:/opt/amazon/efa/lib:/opt/amazon/openmpi/lib:/opt/amazon/ofi-nccl/lib:/usr/local/lib:/usr/lib:$LD_LIBRARY_PATH \
-x NCCL_DEBUG=INFO \
-x NCCL_SOCKET_IFNAME=^docker,lo,veth \
-x NCCL_BUFFSIZE=8388608 \
-x NCCL_P2P_NET_CHUNKSIZE=524288 \
-x NCCL_TUNER_PLUGIN=/opt/amazon/ofi-nccl/lib/libnccl-ofi-tuner.so \
--mca pml ^ucx \
--mca btl tcp,self \
--mca btl_tcp_if_exclude lo,docker0,veth_def_agent \
--bind-to none ${ALL_REDUCE_BINARY} -b 8 -e 16G -f 2 -g 1 -c 1 -n 100

12 changes: 0 additions & 12 deletions micro-benchmarks/nccl-tests/slurm/nccl-tests-container.sbatch
Original file line number Diff line number Diff line change
Expand Up @@ -35,18 +35,6 @@ export FI_EFA_FORK_SAFE=1
## NCCL Environment variables
export NCCL_DEBUG=INFO
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This shouldn't be set to INFO. Change it to WARN, and add a comment that users who require detailed data should change it to INFO. INFO-level logging can take a toll on the final results.


### Increases the send queue depth and can make NCCL communications non-blocking.
### https://www.usenix.org/system/files/atc23-choi.pdf
export NCCL_BUFFSIZE=8388608
### Improve performance by increasing buffer size for Send/Recv, Gather, Scatter and Alltoall communications
### https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/p2p.html
export NCCL_P2P_NET_CHUNKSIZE=524288

### Improve AllReduce performance by selecting a specific protocol and algorithm for a given
### message size and number of ranks.
### More information: https://github.com/aws/aws-ofi-nccl/wiki/Algorithm-and-Protocol-Tuner-for-AWS
export NCCL_TUNER_PLUGIN=/opt/amazon/ofi-nccl/lib/$(uname -m)-linux-gnu/libnccl-ofi-tuner.so


declare -a ARGS=(
--container-image $IMAGE
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,4 @@ mpirun -n $((SLURM_NTASKS_PER_NODE * SLURM_JOB_NUM_NODES)) -N $SLURM_NTASKS_PER_
-x NCCL_TESTS_SPLIT_MASK=${SPLIT_MASK} \
${NCCL_DEBUG_FLAG} \
${HOSTFILE_OPTS} \
--mca pml ^ucx \
--mca btl tcp,self \
--mca btl_tcp_if_exclude lo,docker0,veth_def_agent \
--bind-to none ${TEST_BINARY} -b 8 -e 16G -f 2 -g 1 -c 1 -n 100
Loading