-
Notifications
You must be signed in to change notification settings - Fork 112
/
Copy path0.NemoMegatron-aws-optimized.Dockerfile
140 lines (119 loc) · 4.91 KB
/
0.NemoMegatron-aws-optimized.Dockerfile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
# DOCKER_BUILDKIT=1 docker build --progress plain -t aws-nemo-megatron:latest .
# Base image: nvcr.io/nvidia/nemo at tag 24.12.
FROM nvcr.io/nvidia/nemo:24.12

# Versions of the components installed or built further down. Each one can be
# overridden at build time with `--build-arg <NAME>=<value>`.
ARG AWS_OFI_NCCL_VERSION=v1.13.2-aws
ARG EFA_INSTALLER_VERSION=1.37.0
ARG GDRCOPY_VERSION=v2.4.1
ARG NCCL_TESTS_VERSION=v2.13.10
ARG NCCL_VERSION=v2.23.4-1
# Refresh the APT package index and upgrade the packages already installed.
RUN apt-get update -y \
    && apt-get upgrade -y

# Uninstall the ibverbs/mlx5 and NCCL packages shipped with the base image,
# including any that are marked as held.
# (presumably so the EFA and NCCL builds installed later are the ones in use)
RUN apt-get remove -y --allow-change-held-packages \
    ibverbs-utils libibverbs-dev libibverbs1 libmlx5-1 \
    libnccl-dev libnccl2

# Delete the HPC-X tree, the MPI tree under /usr/local and the HPC-X linker
# configuration, then rebuild the dynamic linker cache.
RUN rm -rf /opt/hpcx /usr/local/mpi /etc/ld.so.conf.d/hpcx.conf \
    && ldconfig

# Set OPAL_PREFIX to the empty string; it is given its final value near the
# end of this file.
ENV OPAL_PREFIX=
# Install the build toolchain, debugging tools and the OpenSSH client/server.
#
# `apt-get update` runs in the same layer as `apt-get install`, so the install
# never uses a package index that was cached in an older layer (which leads to
# 404s once the mirrors have moved on).
#
# DEBIAN_FRONTEND is set inline so it does not leak into the runtime
# environment of the image.
#
# NOTE(review): --allow-unauthenticated disables APT signature verification for
# this install. It is kept to avoid breaking the build; confirm whether a
# repository in the base image still needs it and drop the flag if not.
RUN apt-get update -y \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
        apt-utils \
        autoconf \
        automake \
        build-essential \
        check \
        cmake \
        curl \
        debhelper \
        devscripts \
        git \
        gcc \
        gdb \
        kmod \
        libsubunit-dev \
        libtool \
        openssh-client \
        openssh-server \
        pkg-config \
        python3-distutils \
        vim
# Purge every installed package whose name matches cuda-compat-*.
# The pattern is quoted so it reaches apt-get verbatim; unquoted, the shell
# would expand it first if a matching file name existed in the working directory.
RUN apt-get purge -y 'cuda-compat-*'
# Create /var/run/sshd (no error if it already exists).
RUN mkdir -p /var/run/sshd
# SSH configuration edits, applied in order:
#   1. /etc/ssh/ssh_config: any StrictHostKeyChecking line preceded by a space
#      or '#' is rewritten so that it reads "StrictHostKeyChecking no".
#   2. /etc/ssh/ssh_config: a "UserKnownHostsFile /dev/null" line is appended.
#   3. /etc/ssh/sshd_config: a commented-out "#StrictModes ..." line is
#      replaced by "StrictModes no".
# NOTE(review): these settings turn off host-key verification and sshd's
# file-permission checks; presumably intended for passwordless SSH between
# cluster nodes - confirm this is acceptable for the deployment.
RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config
# Prepend the CUPTI, Open MPI, NCCL, EFA and aws-ofi-nccl library directories
# (plus /usr/local/lib) to the dynamic linker search path, and the Open MPI and
# EFA binary directories to PATH. Some of these directories are only populated
# by later build steps.
# Written in `key=value` form; the space-separated `ENV key value` form is
# deprecated.
ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/lib:$LD_LIBRARY_PATH
ENV PATH=/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH
# Bootstrap pip with get-pip.py, then install the AWS CLI and pynvml.
#   * curl -f makes an HTTP error fail the build instead of saving an error
#     page that python3 would then try to execute; -sS hides the progress
#     meter but still prints errors; -L follows redirects.
#   * The bootstrap script is deleted in the same layer so it does not remain
#     in the image.
#   * --no-cache-dir keeps pip's download cache out of the layer.
RUN curl -fsSL https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py \
    && python3 /tmp/get-pip.py \
    && rm -f /tmp/get-pip.py \
    && pip3 install --no-cache-dir awscli pynvml
#################################################
## Install NVIDIA GDRCopy
##
## NOTE: if `nccl-tests` or `/opt/gdrcopy/bin/sanity -v` crashes with incompatible version, ensure
## that the cuda-compat-xx-x package is the latest.
##
## Only the requested tag is fetched (--depth 1), and the source tree is removed
## in the same layer once `make install` has copied the results to /opt/gdrcopy,
## so the checkout does not stay in the image.
RUN git clone --depth 1 -b ${GDRCOPY_VERSION} https://github.com/NVIDIA/gdrcopy.git /tmp/gdrcopy \
    && cd /tmp/gdrcopy \
    && make prefix=/opt/gdrcopy install \
    && cd / \
    && rm -rf /tmp/gdrcopy
# Expose the GDRCopy install to the dynamic linker, the compiler and the shell.
# Written in `key=value` form (the space-separated form is deprecated).
# For LIBRARY_PATH and CPATH the `${VAR:+:$VAR}` form appends ":<old value>"
# only when the variable already has a value; a plain ":$VAR" would leave a
# trailing ':' when it is unset, and an empty entry in these lists means
# "the current directory".
ENV LD_LIBRARY_PATH=/opt/gdrcopy/lib:$LD_LIBRARY_PATH \
    LIBRARY_PATH=/opt/gdrcopy/lib${LIBRARY_PATH:+:$LIBRARY_PATH} \
    CPATH=/opt/gdrcopy/include${CPATH:+:$CPATH} \
    PATH=/opt/gdrcopy/bin:$PATH
#################################################
## Install EFA installer
##
## Downloads the installer tarball for EFA_INSTALLER_VERSION into $HOME, unpacks
## it and runs efa_installer.sh with kernel-module installation, limits.conf
## changes and verification skipped (--skip-kmod --skip-limit-conf --no-verify).
##   * curl -f fails the build on an HTTP error instead of saving an error page
##     that tar would then reject with a confusing message.
##   * Both the unpacked directory and the tarball are removed in the same
##     layer so neither remains in the image.
RUN cd "$HOME" \
    && curl -fsSLO https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
    && tar -xf aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
    && cd aws-efa-installer \
    && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
    && cd "$HOME" \
    && rm -rf aws-efa-installer aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz
###################################################
## Build NCCL from source
##
## The NCCL_VERSION tag is cloned into /opt/nccl and built in place with the
## `src.build` target, generating code for compute capabilities 8.0, 8.6, 8.9
## and 9.0.
RUN git clone --branch ${NCCL_VERSION} https://github.com/NVIDIA/nccl.git /opt/nccl \
    && cd /opt/nccl \
    && gencode="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90" \
    && make -j "$(nproc)" src.build \
        CUDA_HOME=/usr/local/cuda \
        NVCC_GENCODE="${gencode}"
###################################################
## Install AWS-OFI-NCCL plugin
##
## libhwloc-dev is installed first (presumably a build dependency of the
## plugin). `apt-get update` runs in the same layer so the install does not
## depend on a package index cached by an earlier layer.
RUN apt-get update -y \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y libhwloc-dev
## The release tarball and the directory it unpacks to are named after the
## version without its leading "v". `${VAR#v}` strips exactly that one leading
## character and is plain POSIX, so no bash-specific shell is needed.
##   * curl -f fails the build on an HTTP error (e.g. an unknown version)
##     instead of saving an error page for tar to choke on.
##   * The source tree and tarball are removed in the same layer that created
##     them.
RUN plugin_version="${AWS_OFI_NCCL_VERSION#v}" \
    && curl -fsSLO https://github.com/aws/aws-ofi-nccl/releases/download/${AWS_OFI_NCCL_VERSION}/aws-ofi-nccl-${plugin_version}.tar.gz \
    && tar -xf aws-ofi-nccl-${plugin_version}.tar.gz \
    && cd aws-ofi-nccl-${plugin_version} \
    && ./configure --prefix=/opt/aws-ofi-nccl/install \
        --with-mpi=/opt/amazon/openmpi \
        --with-libfabric=/opt/amazon/efa \
        --with-cuda=/usr/local/cuda \
        --enable-platform-aws \
    && make -j "$(nproc)" \
    && make install \
    && cd .. \
    && rm -rf aws-ofi-nccl-${plugin_version} aws-ofi-nccl-${plugin_version}.tar.gz
## Keep the RUN shell for the remaining instructions (and for derived images)
## explicitly set to /bin/sh, as before.
SHELL ["/bin/sh", "-c"]
###################################################
## Build NCCL-tests
##
## The NCCL_TESTS_VERSION tag is cloned into /opt/nccl-tests and built in place
## with MPI support, against the Open MPI under /opt/amazon/openmpi/ and the
## NCCL build under /opt/nccl/build, for compute capabilities 8.0, 8.6, 8.9
## and 9.0.
RUN git clone --branch ${NCCL_TESTS_VERSION} https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \
    && cd /opt/nccl-tests \
    && gencode="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90" \
    && make -j "$(nproc)" \
        NCCL_HOME=/opt/nccl/build \
        CUDA_HOME=/usr/local/cuda \
        MPI_HOME=/opt/amazon/openmpi/ \
        MPI=1 \
        NVCC_GENCODE="${gencode}"
# Delete the APT package index files left behind by the earlier apt-get steps.
# NOTE(review): because this runs in its own layer it only removes the files
# from the final filesystem view; the layers that downloaded them still carry
# the data, so the image does not get smaller.
RUN rm -rf /var/lib/apt/lists/*
## Set Open MPI variables to exclude network interface and conduit.
##   OMPI_MCA_pml                 "^ucx": the UCX PML is excluded
##   OMPI_MCA_btl                 only the tcp and self BTLs are used
##   OMPI_MCA_btl_tcp_if_exclude  interfaces the TCP BTL must not use
##   OPAL_PREFIX                  location of the Open MPI installation
##   NCCL_SOCKET_IFNAME           "^...": interface prefixes NCCL must not use
## Every continued line ends with " \" so that variables stay separated no
## matter how the following line is indented; without the space the next
## assignment is glued onto the previous value.
ENV OMPI_MCA_pml=^ucx \
    OMPI_MCA_btl=tcp,self \
    OMPI_MCA_btl_tcp_if_exclude=lo,docker0,veth_def_agent \
    OPAL_PREFIX=/opt/amazon/openmpi \
    NCCL_SOCKET_IFNAME=^docker,lo,veth
## Turn off PMIx Error https://github.com/open-mpi/ompi/issues/7516
ENV PMIX_MCA_gds=hash
## Set LD_PRELOAD for NCCL library
## Every dynamically linked process in the container preloads the libnccl.so
## built under /opt/nccl. Written in `key=value` form; the space-separated
## `ENV key value` form is deprecated.
ENV LD_PRELOAD=/opt/nccl/build/lib/libnccl.so