diff --git a/Dockerfile-infinityhub-hpc b/Dockerfile-infinityhub-hpc
new file mode 100644
index 00000000..9ce0380c
--- /dev/null
+++ b/Dockerfile-infinityhub-hpc
@@ -0,0 +1,88 @@
+ARG BASE_IMAGE
+FROM ${BASE_IMAGE}
+# Install non-interactively: a docker build has no TTY to answer apt's confirmation prompt.
+# MAY NOT BE IMPORTANT ANYMORE
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y rocm-libs
+
+
+# THIS FIX IS FOR SAWMILL, UNCLEAR IF NECESSARY FOR GENERAL USERS
+#TODO: is this necessary?
+RUN apt-get remove -y openmpi ucx
+#Let's remove existing /opt/ompi; and, link to our version.
+RUN rm -rf /opt/ompi
+RUN ln -s /container/ompi /opt
+COPY dockerfile_scripts /tmp/det_dockerfile_scripts
+# NOTE(review): $UBUNTU_VERSION used below is not declared as an ARG in this file; presumably the base image sets it -- confirm.
+# SHOULDN'T NEED TO SET SOME OF THESE VARIABLES
+#USING OFI
+#TODO: up until line 63 should be a separate shell script
+ARG WITH_MPI=1
+ARG WITH_OFI=1
+ARG WITH_MPICH
+ARG UCX_INSTALL_DIR=/container/ucx
+ARG OMPI_INSTALL_DIR=/container/ompi
+ARG MPICH_INSTALL_DIR=/container/mpich
+ARG OFI_INSTALL_DIR=/container/ofi
+ARG OMPI_WITH_CUDA=0
+ARG OMPI_WITH_ROCM=1
+RUN if [ "$WITH_MPI" = "1" ]; then /tmp/det_dockerfile_scripts/ompi_rocm.sh "$UBUNTU_VERSION" "$WITH_OFI" "$OMPI_WITH_ROCM" "$WITH_MPICH"; fi
+
+# Make sure OMPI/UCX show up in the right paths
+ARG VERBS_LIB_DIR=/usr/lib/libibverbs
+ARG UCX_LIB_DIR=${UCX_INSTALL_DIR}/lib:${UCX_INSTALL_DIR}/lib64
+ARG UCX_PATH_DIR=${UCX_INSTALL_DIR}/bin
+ARG OFI_LIB_DIR=${OFI_INSTALL_DIR}/lib:${OFI_INSTALL_DIR}/lib64
+ARG OFI_PATH_DIR=${OFI_INSTALL_DIR}/bin
+ARG OMPI_LIB_DIR=${OMPI_INSTALL_DIR}/lib
+ARG OMPI_PATH_DIR=${OMPI_INSTALL_DIR}/bin
+ARG MPICH_LIB_DIR=${MPICH_INSTALL_DIR}/lib
+ARG MPICH_PATH_DIR=${MPICH_INSTALL_DIR}/bin
+
+# Set up UCX_LIBS and OFI_LIBS
+ENV UCX_LIBS="${VERBS_LIB_DIR}:${UCX_LIB_DIR}:${OMPI_LIB_DIR}:"
+ENV OFI_LIBS="${VERBS_LIB_DIR}:${OFI_LIB_DIR}:${MPICH_LIB_DIR}:"
+
+# If WITH_OFI is true, then set EXTRA_LIBS to OFI libs, else set to empty string
+ENV EXTRA_LIBS="${WITH_OFI:+${OFI_LIBS}}"
+
+# If EXTRA_LIBS is empty, set to UCX libs, else leave as OFI libs
+ENV EXTRA_LIBS="${EXTRA_LIBS:-${UCX_LIBS}}"
+
+# But, only add them if WITH_MPI
+ENV LD_LIBRARY_PATH=${WITH_MPI:+$EXTRA_LIBS}$LD_LIBRARY_PATH
+ARG CONDA="${PATH}"
+#USING OFI
+ENV PATH=${WITH_OFI:+$PATH:${WITH_MPI:+$OFI_PATH_DIR:$MPICH_PATH_DIR}}
+# When WITH_OFI is empty the line above clears PATH; CONDA (PATH saved before that step) is the fallback used below.
+#USING UCX
+ENV PATH=${PATH:-$CONDA:${WITH_MPI:+$UCX_PATH_DIR:$OMPI_PATH_DIR}}
+
+ENV PATH=$OMPI_PATH_DIR:$OFI_INSTALL_DIR:$PATH
+
+# Enable running OMPI as root
+ENV OMPI_ALLOW_RUN_AS_ROOT=${WITH_MPI:+1}
+ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=${WITH_MPI:+1}
+
+
+ARG AWS_PLUGIN_INSTALL_DIR=/container/aws
+ARG WITH_AWS_TRACE
+ARG INTERNAL_AWS_DS
+ARG INTERNAL_AWS_PATH
+ARG ROCM_DIR=/opt/rocm
+ENV ROCM_DIR=$ROCM_DIR
+RUN if [ "$WITH_OFI" = "1" ]; then /tmp/det_dockerfile_scripts/build_aws_rocm.sh "$WITH_OFI" "$WITH_AWS_TRACE" "$WITH_MPICH"; fi
+ENV LD_LIBRARY_PATH=${WITH_OFI:+$AWS_PLUGIN_INSTALL_DIR:}$LD_LIBRARY_PATH
+
+# Set an entrypoint that can scrape up the host libfabric.so and then
+# run the user command. This is intended to enable performant execution
+# on non-IB systems that have a proprietary libfabric.
+
+ARG WITH_RCCL=1
+ENV WITH_RCCL=$WITH_RCCL
+ARG WITH_NFS_WORKAROUND=1
+ENV WITH_NFS_WORKAROUND=$WITH_NFS_WORKAROUND
+
+RUN mkdir -p /container/bin && cp /tmp/det_dockerfile_scripts/scrape_libs.sh /container/bin
+ENTRYPOINT ["/container/bin/scrape_libs.sh"]
+
+RUN rm -r /tmp/*
diff --git a/Dockerfile-infinityhub-pytorch b/Dockerfile-infinityhub-pytorch
new file mode 100644
index 00000000..1d50ad7b
--- /dev/null
+++ b/Dockerfile-infinityhub-pytorch
@@ -0,0 +1,74 @@
+ARG BASE_IMAGE
+FROM ${BASE_IMAGE}
+#why no highlighting?
+ENV PYTHONUNBUFFERED=1 PYTHONFAULTHANDLER=1 PYTHONHASHSEED=0 TT=0
+
+RUN mkdir -p /var/run/sshd
+RUN rm /etc/apt/sources.list.d/rocm.list
+RUN pip install --upgrade pip
+
+COPY dockerfile_scripts /tmp/det_dockerfile_scripts
+
+RUN apt-get update && /tmp/det_dockerfile_scripts/install_deb_packages.sh
+RUN python --version
+#RUN /tmp/det_dockerfile_scripts/install_deb_packages.sh
+
+# LIBFABRIC ISSUE
+# USE CONDA FOR WORKAROUND
+#TODO: MAY NOT BE A PROBLEM ANYMORE?
+# protect this image from slurm
+#ENV PATH="/opt/conda/envs/py_3.8/bin:${PATH}"
+ARG CONDA="${PATH}"
+
+#RUN exit 1
+# Install fixed version of FFI package for Ubuntu 20.04.
+# This is done after above stuff to make sure we get right version.
+RUN /tmp/det_dockerfile_scripts/install_package_fixes.sh
+RUN /tmp/det_dockerfile_scripts/add_det_nobody_user.sh
+RUN /tmp/det_dockerfile_scripts/install_libnss_determined.sh
+
+# determined is installed and immediately removed; presumably this is only meant to leave its dependencies in the image -- confirm.
+RUN pip install determined && pip uninstall -y determined
+# Refresh the package index in the same layer as the install so a cached index layer can never go stale.
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y cargo
+
+RUN python -m pip install -r /tmp/det_dockerfile_scripts/additional-requirements-rocm.txt
+
+RUN python -m pip install -r /tmp/det_dockerfile_scripts/notebook-requirements.txt && \
+    jupyter labextension disable "@jupyterlab/apputils-extension:announcements"
+
+ENV JUPYTER_CONFIG_DIR=/run/determined/jupyter/config
+ENV JUPYTER_DATA_DIR=/run/determined/jupyter/data
+ENV JUPYTER_RUNTIME_DIR=/run/determined/jupyter/runtime
+
+#ENV HSA_FORCE_FINE_GRAIN_PCIE=1 #TODO: check if this is necessary
+
+#RUN ldconfig #TODO: check if this is necessary
+RUN echo A
+#TODO: finish iterating here, preferably turn it into a shell script.
+ARG DEEPSPEED_PIP
+ARG DS_BUILD_CUTLASS_OPS=0
+ENV DS_BUILD_CUTLASS_OPS=0
+# DEEPSPEED_PIP only gates the steps below: when it is non-empty, DeepSpeed is built from a git checkout, not installed from that pip spec.
+#RUN if [ -n "$DEEPSPEED_PIP" ]; then DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev&& git clone https://github.com/ROCmSoftwarePlatform/triton.git && cd triton && git checkout triton-mlir && cd python && pip3 install ninja cmake && python setup.py install;fi
+#RUN if [ -n "$DEEPSPEED_PIP" ]; then DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev&& git clone https://github.com/ROCmSoftwarePlatform/triton.git && cd triton && git checkout triton-mlir && cd python && pip3 install ninja cmake && python setup.py install;fi
+RUN if [ -n "$DEEPSPEED_PIP" ]; then DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev && pip3 install ninja cmake && pip3 install triton==2.3.1; fi
+#RUN if [ -n "$DEEPSPEED_PIP" ]; then DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev&& python -m pip install pydantic==1.10.11 && git clone https://github.com/microsoft/DeepSpeed.git && cd DeepSpeed && DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_EVOFORMER_ATTN=0 python3 setup.py build && python3 setup.py install && python -m deepspeed.env_report; fi
+RUN if [ -n "$DEEPSPEED_PIP" ]; then DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev&& python -m pip install pydantic==1.10.11 && git clone https://github.com/microsoft/DeepSpeed.git && cd DeepSpeed && DS_BUILD_OPS=1 DS_BUILD_RAGGED_DEVICE_OPS=0 DS_BUILD_CUTLASS_OPS=0 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_EVOFORMER_ATTN=0 DS_BUILD_RANDOM_LTD=0 DS_BUILD_FUSED_ADAM=0 DS_BUILD_CCL_COMM=0 python3 setup.py build && python3 setup.py install && python -m deepspeed.env_report; fi
+#RUN if [ -n "$DEEPSPEED_PIP" ]; then DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev&& python -m pip install pydantic==1.10.11 && git clone https://github.com/ROCmSoftwarePlatform/DeepSpeed.git && cd DeepSpeed && python3 setup.py build && python3 setup.py install && python -m deepspeed.env_report; fi
+RUN if [ -n "$DEEPSPEED_PIP" ]; then python -m deepspeed.env_report ; fi
+RUN if [ -n "$DEEPSPEED_PIP" ]; then pip list | grep -i deepspeed; fi
+RUN echo "$DEEPSPEED_PIP"
+# The version specifier must be quoted: an unquoted >= is a shell redirection to a file named "=0.19".
+RUN pip install "tokenizers>=0.19"
+RUN pip install transformers==4.43.3
+RUN bash -o pipefail -c "curl -fsS https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash"
+#RUN exit 1
+# MIOPEN_DEBUG_SAVE_TEMP_DIR is required to prevent
+# PAD-133
+ENV MIOPEN_DEBUG_SAVE_TEMP_DIR=1
+
+CMD ["/bin/bash"]
+USER root
+
+RUN rm -r /tmp/*
diff --git a/Makefile b/Makefile
index 3ec47d21..06b63606 100644
--- a/Makefile
+++ b/Makefile
@@ -13,6 +13,11 @@ CPU_PREFIX_310 := $(REGISTRY_REPO):py-3.10-
 CUDA_113_PREFIX := $(REGISTRY_REPO):cuda-11.3-
 CUDA_118_PREFIX := $(REGISTRY_REPO):cuda-11.8-
 ROCM_56_PREFIX := $(REGISTRY_REPO):rocm-5.6-
+ROCM_57_PREFIX := $(REGISTRY_REPO):rocm-5.7-
+ROCM_60_PREFIX := $(REGISTRY_REPO):rocm-6.0-
+ROCM_61_PREFIX := $(REGISTRY_REPO):rocm-6.1-
+ROCM_60_TF_PREFIX := tensorflow-infinity-hub:tensorflow-infinity-hub
+
 
 CPU_SUFFIX := -cpu
 CUDA_SUFFIX := -cuda
@@ -140,6 +145,15 @@ NGC_PYTORCH_HPC_REPO := pytorch-ngc-hpc-dev
 NGC_TF_REPO := tensorflow-ngc-dev
 NGC_TF_HPC_REPO := tensorflow-ngc-hpc-dev
 
+INFINITYHUB_PYTORCH_PREFIX := rocm/pytorch
+INFINITYHUB_TENSORFLOW_PREFIX := rocm/tensorflow
+INFINITYHUB_PYTORCH_VERSION := 2.1.2
+INFINITYHUB_TENSORFLOW_VERSION :=
+export INFINITYHUB_PYTORCH_REPO := pytorch-infinityhub-dev
+INFINITYHUB_PYTORCH_HPC_REPO := pytorch-infinityhub-hpc-dev
+INFINITYHUB_TF_REPO := tensorflow-infinityhub-dev
+INFINITYHUB_TF_HPC_REPO := tensorflow-infinityhub-hpc-dev
+
 # build hpc together since hpc is dependent on the normal build
 .PHONY: build-pytorch-ngc
 build-pytorch-ngc:
@@ -163,39 +177,157 @@ build-tensorflow-ngc:
 	-t $(DOCKERHUB_REGISTRY)/$(NGC_TF_HPC_REPO):$(SHORT_GIT_HASH) \
 	.
 
+# Earlier variants of the infinityhub build command, kept for reference only (plain comments, no line continuations).
+# DOCKER_BUILDKIT=0 docker build --shm-size='1gb' -f Dockerfile-infinityhub-pytorch
+# docker build --shm-size='1gb' -f Dockerfile-infinityhub-pytorch
+#     --build-arg TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.0"
+# NOTE: recipes expand $(DEEPSPEED_VERSION) when they run, so a later reassignment in this Makefile overrides the value set here.
+DEEPSPEED_VERSION := 0.13.0
+export ROCM61_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED := $(ROCM_61_PREFIX)pytorch-2.0-tf-2.10-rocm-deepspeed
+.PHONY: build-pytorch-infinityhub
+build-pytorch-infinityhub:
+	docker build --shm-size='1gb' -f Dockerfile-infinityhub-pytorch \
+		--build-arg BASE_IMAGE="rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2" \
+		--build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
+		--build-arg TORCH_PIP="$(TORCH_PIP_DEEPSPEED_GPU)" \
+		--build-arg TORCH_TB_PROFILER_PIP="$(TORCH_TB_PROFILER_PIP)" \
+		--build-arg APEX_GIT="https://github.com/determined-ai/apex.git@3caf0f40c92e92b40051d3afff8568a24b8be28d" \
+		--build-arg DEEPSPEED_PIP="deepspeed==$(DEEPSPEED_VERSION)" \
+		-t $(DOCKERHUB_REGISTRY)/$(INFINITYHUB_PYTORCH_REPO)-$(SHORT_GIT_HASH) \
+		.
+	docker build --shm-size='1gb' -f Dockerfile-infinityhub-hpc \
+		--build-arg BASE_IMAGE=$(DOCKERHUB_REGISTRY)/$(INFINITYHUB_PYTORCH_REPO)-$(SHORT_GIT_HASH) \
+		--build-arg WITH_MPICH=$(WITH_MPICH) \
+		-t $(DOCKERHUB_REGISTRY)/$(INFINITYHUB_PYTORCH_HPC_REPO)-$(SHORT_GIT_HASH) \
+		.
+# PyTorch 1.13 + TF 2.10 image; the MPI flavour in the name follows WITH_MPICH (mpich when 1, ompi otherwise).
+# NOTE(review): the variables are named ROCM61_* but use ROCM_60_PREFIX and a rocm6.0 base image -- confirm the intended name.
 ifeq ($(WITH_MPICH),1)
-ROCM56_TORCH13_MPI :=pytorch-1.3-tf-2.10-rocm-mpich
+ROCM61_TORCH13_MPI :=pytorch-1.3-tf-2.10-rocm-mpich
 else
-ROCM56_TORCH13_MPI :=pytorch-1.3-tf-2.10-rocm-ompi
+ROCM61_TORCH13_MPI :=pytorch-1.3-tf-2.10-rocm-ompi
 endif
-export ROCM56_TORCH13_TF_ENVIRONMENT_NAME := $(ROCM_56_PREFIX)$(ROCM56_TORCH13_MPI)
-.PHONY: build-pytorch13-tf210-rocm56
-build-pytorch13-tf210-rocm56:
+export ROCM61_TORCH13_TF_ENVIRONMENT_NAME := $(ROCM_60_PREFIX)$(ROCM61_TORCH13_MPI)
+.PHONY: build-pytorch13-tf210-rocm60
+build-pytorch13-tf210-rocm60:
 	docker build -f Dockerfile-default-rocm \
-		--build-arg BASE_IMAGE="rocm/pytorch:rocm5.6_ubuntu20.04_py3.8_pytorch_1.13.1"\
-		--build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
-		--build-arg HOROVOD_PIP="horovod==0.28.1" \
-		--build-arg WITH_MPICH=$(WITH_MPICH) \
-		-t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH13_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
-		-t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH13_TF_ENVIRONMENT_NAME)-$(VERSION) \
-		.
+		--build-arg BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_1.13.1" \
+		--build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
+		--build-arg HOROVOD_PIP="horovod==0.28.1" \
+		--build-arg WITH_MPICH=$(WITH_MPICH) \
+		-t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH13_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
+		-t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH13_TF_ENVIRONMENT_NAME)-$(VERSION) \
+		.
+# This target has its own ROCM60_* variables: recipes expand variables when they run, so sharing ROCM61_TORCH_TF_ENVIRONMENT_NAME with the rocm61 target made both targets tag the same image name.
+# NOTE(review): this rocm60 target builds from a rocm6.1 base image -- confirm that is intended.
 
 ifeq ($(WITH_MPICH),1)
-ROCM56_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-mpich
+ROCM60_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-mpich
 else
-ROCM56_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-ompi
+ROCM60_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-ompi
 endif
-export ROCM56_TORCH_TF_ENVIRONMENT_NAME := $(ROCM_56_PREFIX)$(ROCM56_TORCH_MPI)
-.PHONY: build-pytorch20-tf210-rocm56
-build-pytorch20-tf210-rocm56:
+
+export ROCM60_TORCH_TF_ENVIRONMENT_NAME := $(ROCM_60_PREFIX)$(ROCM60_TORCH_MPI)
+.PHONY: build-pytorch20-tf210-rocm60
+build-pytorch20-tf210-rocm60:
 	docker build -f Dockerfile-default-rocm \
-		--build-arg BASE_IMAGE="rocm/pytorch:rocm5.6_ubuntu20.04_py3.8_pytorch_2.0.1" \
-		--build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
-		--build-arg HOROVOD_PIP="horovod==0.28.1" \
+		--build-arg BASE_IMAGE="rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2" \
+		--build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
+		--build-arg HOROVOD_PIP="0" \
 		--build-arg WITH_MPICH=$(WITH_MPICH) \
-		-t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
-		-t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH_TF_ENVIRONMENT_NAME)-$(VERSION) \
-		.
+		-t $(DOCKERHUB_REGISTRY)/$(ROCM60_TORCH_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
+		-t $(DOCKERHUB_REGISTRY)/$(ROCM60_TORCH_TF_ENVIRONMENT_NAME)-$(VERSION) \
+		.
+# ROCm 6.1 images built from Dockerfile-default-rocm; the MPI flavour in each name follows WITH_MPICH.
+
+
+ifeq ($(WITH_MPICH),1)
+ROCM61_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-mpich
+else
+ROCM61_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-ompi
+endif
+export ROCM61_TORCH_TF_ENVIRONMENT_NAME := $(ROCM_61_PREFIX)$(ROCM61_TORCH_MPI)
+.PHONY: build-pytorch20-tf210-rocm61
+build-pytorch20-tf210-rocm61:
+	docker build -f Dockerfile-default-rocm \
+		--build-arg BASE_IMAGE="rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2" \
+		--build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
+		--build-arg HOROVOD_PIP="0" \
+		--build-arg WITH_MPICH=$(WITH_MPICH) \
+		-t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
+		-t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_TF_ENVIRONMENT_NAME)-$(VERSION) \
+		.
+
+ifeq ($(WITH_MPICH),1)
+ROCM61_TORCH_MPI :=pytorch-3.10-rocm-mpich
+else
+ROCM61_TORCH_MPI :=pytorch-3.10-rocm-ompi
+endif
+export ROCM61_TORCH_ENVIRONMENT_NAME := $(ROCM_61_PREFIX)$(ROCM61_TORCH_MPI)
+.PHONY: build-pytorch20-rocm61
+build-pytorch20-rocm61:
+	docker build -f Dockerfile-default-rocm \
+		--build-arg BASE_IMAGE="rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2" \
+		--build-arg TENSORFLOW_PIP="0" \
+		--build-arg HOROVOD_PIP="0" \
+		--build-arg WITH_MPICH=$(WITH_MPICH) \
+		-t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
+		-t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_ENVIRONMENT_NAME)-$(VERSION) \
+		.
+
+
+# NOTE(review): the target below is named tf210-rocm60 but builds from a rocm6.1 / tf2.15 base image -- confirm.
+export ROCM60_TF_ENVIRONMENT_NAME := $(ROCM_60_TF_PREFIX)
+.PHONY: build-tf210-rocm60
+build-tf210-rocm60:
+	docker build -f Dockerfile-tensorflow-rocm \
+		--build-arg BASE_IMAGE="rocm/tensorflow:rocm6.1-py3.9-tf2.15-dev" \
+		--build-arg HOROVOD_PIP="0" \
+		--build-arg WITH_MPICH=$(WITH_MPICH) \
+		-t $(DOCKERHUB_REGISTRY)/$(ROCM60_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
+		-t $(DOCKERHUB_REGISTRY)/$(ROCM60_TF_ENVIRONMENT_NAME)-$(VERSION) \
+		.
+
+# Image names and the torch pip spec for the CUDA 11.3 DeepSpeed environments.
+export GPU_DEEPSPEED_ENVIRONMENT_NAME := $(CUDA_113_PREFIX)pytorch-1.10-deepspeed-$(DEEPSPEED_VERSION)$(GPU_SUFFIX)
+export GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME := $(CUDA_113_PREFIX)pytorch-1.10-gpt-neox-deepspeed$(GPU_SUFFIX)
+export TORCH_PIP_DEEPSPEED_GPU := torch==1.10.2+cu113 torchvision==0.11.3+cu113 torchaudio==0.10.2+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
+# NOTE(review): the ROCm target below passes the cu113 torch spec above as TORCH_PIP -- confirm Dockerfile-default-rocm does not install it.
+export ROCM57_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED := $(ROCM_57_PREFIX)pytorch-2.0-tf-2.10-rocm-deepspeed
+.PHONY: build-pytorch20-tf210-rocm57-deepspeed
+build-pytorch20-tf210-rocm57-deepspeed:
+	docker build --shm-size='1gb' -f Dockerfile-default-rocm \
+		--build-arg BASE_IMAGE="rocm/pytorch:rocm5.7_ubuntu20.04_py3.9_pytorch_2.1.1" \
+		--build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
+		--build-arg HOROVOD_PIP="horovod==0.28.1" \
+		--build-arg TORCH_PIP="$(TORCH_PIP_DEEPSPEED_GPU)" \
+		--build-arg TORCH_TB_PROFILER_PIP="$(TORCH_TB_PROFILER_PIP)" \
+		--build-arg TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.0" \
+		--build-arg APEX_GIT="https://github.com/determined-ai/apex.git@3caf0f40c92e92b40051d3afff8568a24b8be28d" \
+		--build-arg DEEPSPEED_PIP="deepspeed==$(DEEPSPEED_VERSION)" \
+		--build-arg WITH_MPICH=$(WITH_MPICH) \
+		-t $(DOCKERHUB_REGISTRY)/$(ROCM57_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED)-$(SHORT_GIT_HASH) \
+		-t $(DOCKERHUB_REGISTRY)/$(ROCM57_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED)-$(VERSION) \
+		.
+# ROCm 6.1 DeepSpeed image built from Dockerfile-default-rocm; tagged with both the short git hash and VERSION.
+export ROCM61_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED := $(ROCM_61_PREFIX)pytorch-2.0-tf-2.10-rocm-deepspeed
+.PHONY: build-pytorch20-tf210-rocm61-deepspeed
+build-pytorch20-tf210-rocm61-deepspeed:
+	docker build --shm-size='1gb' -f Dockerfile-default-rocm \
+		--build-arg BASE_IMAGE="rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2" \
+		--build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
+		--build-arg HOROVOD_PIP="0" \
+		--build-arg TORCH_PIP="$(TORCH_PIP_DEEPSPEED_GPU)" \
+		--build-arg TORCH_TB_PROFILER_PIP="$(TORCH_TB_PROFILER_PIP)" \
+		--build-arg TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.0" \
+		--build-arg APEX_GIT="https://github.com/determined-ai/apex.git@3caf0f40c92e92b40051d3afff8568a24b8be28d" \
+		--build-arg DEEPSPEED_PIP="deepspeed==$(DEEPSPEED_VERSION)" \
+		--build-arg WITH_MPICH=$(WITH_MPICH) \
+		-t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED)-$(SHORT_GIT_HASH) \
+		-t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED)-$(VERSION) \
+		.
+
+
 
 DEEPSPEED_VERSION := 0.8.3
 export GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME := deepspeed-cuda-gpt-neox
diff --git a/VERSION b/VERSION
index 8df3f459..c9ec1d54 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-0.33.1
+0.33.2
diff --git a/dockerfile_scripts/additional-requirements-rocm.txt b/dockerfile_scripts/additional-requirements-rocm.txt
index 27585b8e..61e0ef58 100644
--- a/dockerfile_scripts/additional-requirements-rocm.txt
+++ b/dockerfile_scripts/additional-requirements-rocm.txt
@@ -1,16 +1,23 @@
 attrdict3
 pandas
 matplotlib
-tensorflow-datasets==1.3.2
-Keras-Preprocessing[image]
 # TODO(DET-4259) Remove this when we fix the circular dependency with the main repo.
 petname
 azure-storage-blob
 Pillow>=8.3.2,<=9.5.0
 analytics-python
-nvidia-ml-py
+# google-api-python-client -> google-api-core -> googleapis-common-protos -> protobuf
+# Horovod cannot build with protobuf > 3.20.x
+# latest google-api-python-client requires protobuf >= 3.20.1
 protobuf<=3.20.3
 tensorboard==2.10.1
-pynvml
-tokenizers==0.13.0
-huggingface-hub==0.16.4
+tokenizers>=0.19.0
+huggingface-hub>=0.20.1
+# The packages below are needed for the benchmarks; they really belong in that workflow's startup-hook.sh.
+accelerate>=0.31.0
+datasets
+sentencepiece
+evaluate
+scikit-learn
+# transformers is pinned to the same version that Dockerfile-infinityhub-pytorch installs explicitly.
+transformers==4.43.3