From d54724a0fc3181819748ad8477af58b402ac7d07 Mon Sep 17 00:00:00 2001 From: Mikhail Kardash Date: Thu, 27 Jun 2024 12:05:21 -0700 Subject: [PATCH 1/5] feat: add infinityhub dockerfiles --- Dockerfile-infinityhub-hpc | 87 +++++++++++++++++++ Dockerfile-infinityhub-pytorch | 60 +++++++++++++ .../additional-requirements-rocm.txt | 16 ++-- 3 files changed, 158 insertions(+), 5 deletions(-) create mode 100644 Dockerfile-infinityhub-hpc create mode 100644 Dockerfile-infinityhub-pytorch diff --git a/Dockerfile-infinityhub-hpc b/Dockerfile-infinityhub-hpc new file mode 100644 index 00000000..f8753c34 --- /dev/null +++ b/Dockerfile-infinityhub-hpc @@ -0,0 +1,87 @@ +ARG BASE_IMAGE +FROM ${BASE_IMAGE} + +# MAY NOT BE IMPORTANT ANYMORE +RUN apt install rocm-libs + + +# THIS FIX IS FOR SAWMILL, UNCLEAR IF NECESSARY FOR GENERAL USERS +#TODO: is this necessary? +RUN apt remove -y openmpi ucx +#Let's remove existing /opt/ompi; and, link to our version. +RUN rm -rf /opt/ompi +RUN ln -s /container/ompi /opt + +# SHOULDN'T NEED TO SET SOME OF THESE VARIABLES +#USING OFI +#TODO: up until line 63 should be a separate shell script +ARG WITH_MPI=1 +ARG WITH_OFI=1 +ARG WITH_MPICH +ARG UCX_INSTALL_DIR=/container/ucx +ARG OMPI_INSTALL_DIR=/container/ompi +ARG MPICH_INSTALL_DIR=/container/mpich +ARG OFI_INSTALL_DIR=/container/ofi +ARG OMPI_WITH_CUDA=0 +ARG OMPI_WITH_ROCM=1 +RUN if [ "$WITH_MPI" = "1" ]; then /tmp/det_dockerfile_scripts/ompi_rocm.sh "$UBUNTU_VERSION" "$WITH_OFI" "$OMPI_WITH_ROCM" "$WITH_MPICH"; fi + +# Make sure OMPI/UCX show up in the right paths +ARG VERBS_LIB_DIR=/usr/lib/libibverbs +ARG UCX_LIB_DIR=${UCX_INSTALL_DIR}/lib:${UCX_INSTALL_DIR}/lib64 +ARG UCX_PATH_DIR=${UCX_INSTALL_DIR}/bin +ARG OFI_LIB_DIR=${OFI_INSTALL_DIR}/lib:${OFI_INSTALL_DIR}/lib64 +ARG OFI_PATH_DIR=${OFI_INSTALL_DIR}/bin +ARG OMPI_LIB_DIR=${OMPI_INSTALL_DIR}/lib +ARG OMPI_PATH_DIR=${OMPI_INSTALL_DIR}/bin +ARG MPICH_LIB_DIR=${MPICH_INSTALL_DIR}/lib +ARG MPICH_PATH_DIR=${MPICH_INSTALL_DIR}/bin + +# Set up UCX_LIBS and OFI_LIBS +ENV UCX_LIBS="${VERBS_LIB_DIR}:${UCX_LIB_DIR}:${OMPI_LIB_DIR}:" +ENV OFI_LIBS="${VERBS_LIB_DIR}:${OFI_LIB_DIR}:${MPICH_LIB_DIR}:" + +# If WITH_OFI is true, then set EXTRA_LIBS to OFI libs, else set to empty string +ENV EXTRA_LIBS="${WITH_OFI:+${OFI_LIBS}}" + +# If EXTRA_LIBS is empty, set to UCX libs, else leave as OFI libs +ENV EXTRA_LIBS="${EXTRA_LIBS:-${UCX_LIBS}}" + +# But, only add them if WITH_MPI +ENV LD_LIBRARY_PATH=${WITH_MPI:+$EXTRA_LIBS}$LD_LIBRARY_PATH + +#USING OFI +ENV PATH=${WITH_OFI:+$PATH:${WITH_MPI:+$OFI_PATH_DIR:$MPICH_PATH_DIR}} + +#USING UCX +ENV PATH=${PATH:-$CONDA:${WITH_MPI:+$UCX_PATH_DIR:$OMPI_PATH_DIR}} + +ENV PATH=$OMPI_PATH_DIR:$OFI_INSTALL_DIR:$PATH + +# Enable running OMPI as root +ENV OMPI_ALLOW_RUN_AS_ROOT ${WITH_MPI:+1} +ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM ${WITH_MPI:+1} + + +ARG AWS_PLUGIN_INSTALL_DIR=/container/aws +ARG WITH_AWS_TRACE +ARG INTERNAL_AWS_DS +ARG INTERNAL_AWS_PATH +ARG ROCM_DIR=/opt/rocm +ENV ROCM_DIR $ROCM_DIR +RUN if [ "$WITH_OFI" = "1" ]; then /tmp/det_dockerfile_scripts/build_aws_rocm.sh "$WITH_OFI" "$WITH_AWS_TRACE" "$WITH_MPICH"; fi +ENV LD_LIBRARY_PATH=${WITH_OFI:+$AWS_PLUGIN_INSTALL_DIR:}$LD_LIBRARY_PATH + +# Set an entrypoint that can scrape up the host libfabric.so and then +# run the user command. This is intended to enable performant execution +# on non-IB systems that have a proprietary libfabric. + +ARG WITH_RCCL=1 +ENV WITH_RCCL=$WITH_RCCL +ARG WITH_NFS_WORKAROUND=1 +ENV WITH_NFS_WORKAROUND=$WITH_NFS_WORKAROUND + +RUN mkdir -p /container/bin && cp /tmp/det_dockerfile_scripts/scrape_libs.sh /container/bin +ENTRYPOINT ["/container/bin/scrape_libs.sh"] + +RUN rm -r /tmp/* diff --git a/Dockerfile-infinityhub-pytorch b/Dockerfile-infinityhub-pytorch new file mode 100644 index 00000000..7aa59adf --- /dev/null +++ b/Dockerfile-infinityhub-pytorch @@ -0,0 +1,60 @@ +ARG BASE_IMAGE +FROM ${BASE_IMAGE} + +ENV PYTHONUNBUFFERED=1 PYTHONFAULTHANDLER=1 PYTHONHASHSEED=0 + +RUN mkdir -p /var/run/sshd +RUN rm /etc/apt/sources.list.d/rocm.list +RUN pip install pip install --upgrade pip + +COPY dockerfile_scripts /tmp/det_dockerfile_scripts + +RUN /tmp/det_dockerfile_scripts/install_deb_packages.sh + +# LIBFABRIC ISSUE +# USE CONDA FOR WORKAROUND +#TODO: MAY NOT BE A PROBLEM ANYMORE? +# protect this image from slurm +ENV PATH="/opt/conda/envs/py_3.8/bin:${PATH}" +ARG CONDA="${PATH}" + +# Install fixed version of FFI package for Ubuntu 20.04. +# This is done after above stuff to make sure we get right version. +RUN /tmp/det_dockerfile_scripts/install_package_fixes.sh +RUN /tmp/det_dockerfile_scripts/add_det_nobody_user.sh +RUN /tmp/det_dockerfile_scripts/install_libnss_determined.sh + + +RUN pip install determined && pip uninstall -y determined + +RUN python -m pip install -r /tmp/det_dockerfile_scripts/additional-requirements-rocm.txt + +RUN python -m pip install -r /tmp/det_dockerfile_scripts/notebook-requirements.txt && \ + jupyter labextension disable "@jupyterlab/apputils-extension:announcements" + +ENV JUPYTER_CONFIG_DIR=/run/determined/jupyter/config +ENV JUPYTER_DATA_DIR=/run/determined/jupyter/data +ENV JUPYTER_RUNTIME_DIR=/run/determined/jupyter/runtime + +#ENV HSA_FORCE_FINE_GRAIN_PCIE=1 #TODO: check if this is necessary + +#RUN ldconfig #TODO: check if this is necessary + +#TODO: finish iterating here, preferably turn it into a shell script. +RUN if [ -n "$DEEPSPEED_PIP" ]; then DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev&& git clone https://github.com/ROCmSoftwarePlatform/triton.git && cd triton && git checkout triton-mlir && cd python && pip3 install ninja cmake && pip install -e .;fi +#RUN if [ -n "$DEEPSPEED_PIP" ]; then DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev&& python -m pip install pydantic==1.10.11 && git clone https://github.com/ROCmSoftwarePlatform/DeepSpeed.git && cd DeepSpeed && python3 setup.py build && python3 setup.py install && python -m deepspeed.env_report; fi +RUN if [ -n "$DEEPSPEED_PIP" ]; then DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev&& python -m pip install pydantic==1.10.11 && git clone https://github.com/ROCmSoftwarePlatform/DeepSpeed.git ; fi +#RUN if [ -n "$DEEPSPEED_PIP" ]; then cd DeepSpeed && python3 setup.py build && python3 setup.py install && python -m deepspeed.env_report; fi +RUN if [ -n "$DEEPSPEED_PIP" ]; then cd DeepSpeed && DS_BUILD_SPARSE_ATTN=0 DS_BUILD_RANDOM_LTD=0 DS_BUILD_EVOFORMER_ATTN=0 python3 setup.py build ; fi +RUN if [ -n "$DEEPSPEED_PIP" ]; then cd DeepSpeed && python3 setup.py install && echo TIME FOR DEEPSPEED ENV REPORT || TIME FOR DEEPSPEED ENV REPORT && python -m deepspeed.env_report; fi +#RUN exit 1 +#RUN if [ -n "$DEEPSPEED_PIP" ]; then python -m deepspeed.env_report ; fi + +# MIOPEN_DEBUG_SAVE_TEMP_DIR is required to prevent +# PAD-133 +ENV MIOPEN_DEBUG_SAVE_TEMP_DIR=1 + +CMD ["/bin/bash"] +USER root + +RUN rm -r /tmp/* diff --git a/dockerfile_scripts/additional-requirements-rocm.txt b/dockerfile_scripts/additional-requirements-rocm.txt index 27585b8e..95207da6 100644 --- a/dockerfile_scripts/additional-requirements-rocm.txt +++ b/dockerfile_scripts/additional-requirements-rocm.txt @@ -1,16 +1,22 @@ attrdict3 pandas matplotlib -tensorflow-datasets==1.3.2 -Keras-Preprocessing[image] # TODO(DET-4259) Remove this when we fix the circular dependency with the main repo. petname azure-storage-blob Pillow>=8.3.2,<=9.5.0 analytics-python -nvidia-ml-py +# google-api-python-client -> google-api-core -> googleapis-common-protos -> protobuf +# Horovod cannot build with protobuf > 3.20.x +# latest google-api-python-client requires protobuf >= 3.20.1 protobuf<=3.20.3 tensorboard==2.10.1 -pynvml tokenizers==0.13.0 -huggingface-hub==0.16.4 +huggingface-hub==0.16.4 +# necessary for benchmarks, but really should go into startup-hook.sh for that workflow +accelerate>=0.12.0 +datasets +sentencepiece +evaluate +scikit-learn +transformers From 12d78ec68e3bf89467784354dea3843ad3f7fca1 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 15 Jul 2024 21:41:10 +0000 Subject: [PATCH 2/5] Fixed minor issues from the Bifurcation; there is likely some more cleanup needed in the Makefile. --- Dockerfile-infinityhub-hpc | 1 + Dockerfile-infinityhub-pytorch | 13 +-- Makefile | 185 +++++++++++++++++++++++++++++---- 3 files changed, 167 insertions(+), 32 deletions(-) diff --git a/Dockerfile-infinityhub-hpc b/Dockerfile-infinityhub-hpc index f8753c34..9ce0380c 100644 --- a/Dockerfile-infinityhub-hpc +++ b/Dockerfile-infinityhub-hpc @@ -11,6 +11,7 @@ RUN apt remove -y openmpi ucx #Let's remove existing /opt/ompi; and, link to our version. RUN rm -rf /opt/ompi RUN ln -s /container/ompi /opt +COPY dockerfile_scripts /tmp/det_dockerfile_scripts # SHOULDN'T NEED TO SET SOME OF THESE VARIABLES #USING OFI diff --git a/Dockerfile-infinityhub-pytorch b/Dockerfile-infinityhub-pytorch index 7aa59adf..c8d95a64 100644 --- a/Dockerfile-infinityhub-pytorch +++ b/Dockerfile-infinityhub-pytorch @@ -1,6 +1,6 @@ ARG BASE_IMAGE FROM ${BASE_IMAGE} - +#why no highlighting? ENV PYTHONUNBUFFERED=1 PYTHONFAULTHANDLER=1 PYTHONHASHSEED=0 RUN mkdir -p /var/run/sshd @@ -41,14 +41,9 @@ ENV JUPYTER_RUNTIME_DIR=/run/determined/jupyter/runtime #RUN ldconfig #TODO: check if this is necessary #TODO: finish iterating here, preferably turn it into a shell script. -RUN if [ -n "$DEEPSPEED_PIP" ]; then DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev&& git clone https://github.com/ROCmSoftwarePlatform/triton.git && cd triton && git checkout triton-mlir && cd python && pip3 install ninja cmake && pip install -e .;fi -#RUN if [ -n "$DEEPSPEED_PIP" ]; then DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev&& python -m pip install pydantic==1.10.11 && git clone https://github.com/ROCmSoftwarePlatform/DeepSpeed.git && cd DeepSpeed && python3 setup.py build && python3 setup.py install && python -m deepspeed.env_report; fi -RUN if [ -n "$DEEPSPEED_PIP" ]; then DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev&& python -m pip install pydantic==1.10.11 && git clone https://github.com/ROCmSoftwarePlatform/DeepSpeed.git ; fi -#RUN if [ -n "$DEEPSPEED_PIP" ]; then cd DeepSpeed && python3 setup.py build && python3 setup.py install && python -m deepspeed.env_report; fi -RUN if [ -n "$DEEPSPEED_PIP" ]; then cd DeepSpeed && DS_BUILD_SPARSE_ATTN=0 DS_BUILD_RANDOM_LTD=0 DS_BUILD_EVOFORMER_ATTN=0 python3 setup.py build ; fi -RUN if [ -n "$DEEPSPEED_PIP" ]; then cd DeepSpeed && python3 setup.py install && echo TIME FOR DEEPSPEED ENV REPORT || TIME FOR DEEPSPEED ENV REPORT && python -m deepspeed.env_report; fi -#RUN exit 1 -#RUN if [ -n "$DEEPSPEED_PIP" ]; then python -m deepspeed.env_report ; fi +RUN if [ -n "$DEEPSPEED_PIP" ]; then DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev&& git clone https://github.com/ROCmSoftwarePlatform/triton.git && cd triton && git checkout triton-mlir && cd python && pip3 install ninja cmake && python setup.py install;fi +RUN if [ -n "$DEEPSPEED_PIP" ]; then DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev&& python -m pip install pydantic==1.10.11 && git clone https://github.com/ROCmSoftwarePlatform/DeepSpeed.git && cd DeepSpeed && python3 setup.py build && python3 setup.py install && python -m deepspeed.env_report; fi +RUN if [ -n "$DEEPSPEED_PIP" ]; then python -m deepspeed.env_report ; fi # MIOPEN_DEBUG_SAVE_TEMP_DIR is required to prevent # PAD-133 diff --git a/Makefile b/Makefile index 3ec47d21..cd9bea73 100644 --- a/Makefile +++ b/Makefile @@ -13,6 +13,11 @@ CPU_PREFIX_310 := $(REGISTRY_REPO):py-3.10- CUDA_113_PREFIX := $(REGISTRY_REPO):cuda-11.3- CUDA_118_PREFIX := $(REGISTRY_REPO):cuda-11.8- ROCM_56_PREFIX := $(REGISTRY_REPO):rocm-5.6- +ROCM_57_PREFIX := $(REGISTRY_REPO):rocm-5.7- +ROCM_60_PREFIX := $(REGISTRY_REPO):rocm-6.0- +ROCM_61_PREFIX := $(REGISTRY_REPO):rocm-6.1- +ROCM_60_TF_PREFIX := tensorflow-infinity-hub:tensorflow-infinity-hub + CPU_SUFFIX := -cpu CUDA_SUFFIX := -cuda @@ -140,6 +145,15 @@ NGC_PYTORCH_HPC_REPO := pytorch-ngc-hpc-dev NGC_TF_REPO := tensorflow-ngc-dev NGC_TF_HPC_REPO := tensorflow-ngc-hpc-dev +INFINITYHUB_PYTORCH_PREFIX := rocm/pytorch +INFINITYHUB_TENSORFLOW_PREFIX := rocm/tensorflow +INFINITYHUB_PYTORCH_VERSION := 2.1.2 +INFINITYHUB_TENSORFLOW_VERSION := +export INFINITYHUB_PYTORCH_REPO := pytorch-infinityhub-dev +INFINITYHUB_PYTORCH_HPC_REPO := pytorch-infinityhub-hpc-dev +INFINITYHUB_TF_REPO := tensorflow-infinityhub-dev +INFINITYHUB_TF_HPC_REPO := tensorflow-infinityhub-hpc-dev + # build hpc together since hpc is dependent on the normal build .PHONY: build-pytorch-ngc build-pytorch-ngc: @@ -163,39 +177,164 @@ build-tensorflow-ngc: -t $(DOCKERHUB_REGISTRY)/$(NGC_TF_HPC_REPO):$(SHORT_GIT_HASH) \ . + +export ROCM61_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED := $(ROCM_61_PREFIX)pytorch-2.0-tf-2.10-rocm-deepspeed +.PHONY: build-pytorch-infinityhub +build-pytorch-infinityhub: + docker build --shm-size='1gb' -f Dockerfile-infinityhub-pytorch \ + --build-arg BASE_IMAGE="rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2" \ + --build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \ + --build-arg TORCH_PIP="$(TORCH_PIP_DEEPSPEED_GPU)" \ + --build-arg TORCH_TB_PROFILER_PIP="$(TORCH_TB_PROFILER_PIP)" \ + --build-arg TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.0" \ + --build-arg APEX_GIT="https://github.com/determined-ai/apex.git@3caf0f40c92e92b40051d3afff8568a24b8be28d" \ + --build-arg DEEPSPEED_PIP="deepspeed==$(DEEPSPEED_VERSION)" \ + -t $(DOCKERHUB_REGISTRY)/$(INFINITYHUB_PYTORCH_REPO)-$(SHORT_GIT_HASH) \ + . + docker build --shm-size='1gb' -f Dockerfile-infinityhub-hpc \ + --build-arg BASE_IMAGE=$(DOCKERHUB_REGISTRY)/$(INFINITYHUB_PYTORCH_REPO)-$(SHORT_GIT_HASH) \ + --build-arg WITH_MPICH=$(WITH_MPICH) \ + -t $(DOCKERHUB_REGISTRY)/$(INFINITYHUB_PYTORCH_HPC_REPO)-$(SHORT_GIT_HASH) \ + . + + ifeq ($(WITH_MPICH),1) -ROCM56_TORCH13_MPI :=pytorch-1.3-tf-2.10-rocm-mpich +ROCM61_TORCH13_MPI :=pytorch-1.3-tf-2.10-rocm-mpich else -ROCM56_TORCH13_MPI :=pytorch-1.3-tf-2.10-rocm-ompi +ROCM61_TORCH13_MPI :=pytorch-1.3-tf-2.10-rocm-ompi endif -export ROCM56_TORCH13_TF_ENVIRONMENT_NAME := $(ROCM_56_PREFIX)$(ROCM56_TORCH13_MPI) -.PHONY: build-pytorch13-tf210-rocm56 -build-pytorch13-tf210-rocm56: +export ROCM61_TORCH13_TF_ENVIRONMENT_NAME := $(ROCM_60_PREFIX)$(ROCM61_TORCH13_MPI) +.PHONY: build-pytorch13-tf210-rocm60 +build-pytorch13-tf210-rocm60: docker build -f Dockerfile-default-rocm \ - --build-arg BASE_IMAGE="rocm/pytorch:rocm5.6_ubuntu20.04_py3.8_pytorch_1.13.1"\ - --build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \ - --build-arg HOROVOD_PIP="horovod==0.28.1" \ - --build-arg WITH_MPICH=$(WITH_MPICH) \ - -t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH13_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \ - -t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH13_TF_ENVIRONMENT_NAME)-$(VERSION) \ - . + --build-arg BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_1.13.1" \ + --build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \ + --build-arg HOROVOD_PIP="horovod==0.28.1" \ + --build-arg WITH_MPICH=$(WITH_MPICH) \ + -t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH13_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \ + -t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH13_TF_ENVIRONMENT_NAME)-$(VERSION) \ + . + + ifeq ($(WITH_MPICH),1) -ROCM56_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-mpich +ROCM61_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-mpich else -ROCM56_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-ompi +ROCM61_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-ompi endif -export ROCM56_TORCH_TF_ENVIRONMENT_NAME := $(ROCM_56_PREFIX)$(ROCM56_TORCH_MPI) -.PHONY: build-pytorch20-tf210-rocm56 -build-pytorch20-tf210-rocm56: + #--build-arg BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" \ + #?? --build-arg BASE_IMAGE="amdih/uif-pytorch:uif1.2_rocm5.6.1_vai3.5_py3.8_pytorch1.13" \ + #--build-arg BASE_IMAGE="rocm/pytorch:rocm6.0.2_ubuntu22.04_py3.10_pytorch_2.1.2" \ + +export ROCM61_TORCH_TF_ENVIRONMENT_NAME := $(ROCM_60_PREFIX)$(ROCM61_TORCH_MPI) +.PHONY: build-pytorch20-tf210-rocm60 +build-pytorch20-tf210-rocm60: docker build -f Dockerfile-default-rocm \ - --build-arg BASE_IMAGE="rocm/pytorch:rocm5.6_ubuntu20.04_py3.8_pytorch_2.0.1" \ - --build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \ - --build-arg HOROVOD_PIP="horovod==0.28.1" \ + --build-arg BASE_IMAGE="rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2" \ + --build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \ + --build-arg HOROVOD_PIP="0" \ --build-arg WITH_MPICH=$(WITH_MPICH) \ - -t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \ - -t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH_TF_ENVIRONMENT_NAME)-$(VERSION) \ - . + -t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \ + -t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_TF_ENVIRONMENT_NAME)-$(VERSION) \ + . + + + +ifeq ($(WITH_MPICH),1) +ROCM61_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-mpich +else +ROCM61_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-ompi +endif +export ROCM61_TORCH_TF_ENVIRONMENT_NAME := $(ROCM_61_PREFIX)$(ROCM61_TORCH_MPI) +.PHONY: build-pytorch20-tf210-rocm61 +build-pytorch20-tf210-rocm61: + docker build -f Dockerfile-default-rocm \ + --build-arg BASE_IMAGE="rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2" \ + --build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \ + --build-arg HOROVOD_PIP="0" \ + --build-arg WITH_MPICH=$(WITH_MPICH) \ + -t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \ + -t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_TF_ENVIRONMENT_NAME)-$(VERSION) \ + . + +ifeq ($(WITH_MPICH),1) +ROCM61_TORCH_MPI :=pytorch-3.10-rocm-mpich +else +ROCM61_TORCH_MPI :=pytorch-3.10-rocm-ompi +endif +export ROCM61_TORCH_ENVIRONMENT_NAME := $(ROCM_61_PREFIX)$(ROCM61_TORCH_MPI) +.PHONY: build-pytorch20-rocm61 +build-pytorch20-rocm61: + docker build -f Dockerfile-default-rocm \ + --build-arg BASE_IMAGE="rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2" \ + --build-arg TENSORFLOW_PIP="0" \ + --build-arg HOROVOD_PIP="0" \ + --build-arg WITH_MPICH=$(WITH_MPICH) \ + -t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \ + -t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_ENVIRONMENT_NAME)-$(VERSION) \ + . + + + + +#export ROCM60_TF_ENVIRONMENT_NAME := $(ROCM_60_PREFIX)$(ROCM61_TORCH_MPI) +export ROCM60_TF_ENVIRONMENT_NAME := $(ROCM_60_TF_PREFIX) + #--build-arg BASE_IMAGE="rocm/tensorflow:rocm6.0-tf2.12-dev" \ +.PHONY: build-tf210-rocm60 +build-tf210-rocm60: + docker build -f Dockerfile-tensorflow-rocm \ + --build-arg BASE_IMAGE="rocm/tensorflow:rocm6.1-py3.9-tf2.15-dev" \ + --build-arg HOROVOD_PIP="0" \ + --build-arg WITH_MPICH=$(WITH_MPICH) \ + -t $(DOCKERHUB_REGISTRY)/$(ROCM60_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \ + -t $(DOCKERHUB_REGISTRY)/$(ROCM60_TF_ENVIRONMENT_NAME)-$(VERSION) \ + . + + +DEEPSPEED_VERSION := 0.8.3 +export GPU_DEEPSPEED_ENVIRONMENT_NAME := $(CUDA_113_PREFIX)pytorch-1.10-deepspeed-$(DEEPSPEED_VERSION)$(GPU_SUFFIX) +export GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME := $(CUDA_113_PREFIX)pytorch-1.10-gpt-neox-deepspeed$(GPU_SUFFIX) +export TORCH_PIP_DEEPSPEED_GPU := torch==1.10.2+cu113 torchvision==0.11.3+cu113 torchaudio==0.10.2+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html + +export ROCM57_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED := $(ROCM_57_PREFIX)pytorch-2.0-tf-2.10-rocm-deepspeed +.PHONY: build-pytorch20-tf210-rocm57-deepspeed +build-pytorch20-tf210-rocm57-deepspeed: + docker build --shm-size='1gb' -f Dockerfile-default-rocm \ + --build-arg BASE_IMAGE="rocm/pytorch:rocm5.7_ubuntu20.04_py3.9_pytorch_2.1.1" \ + --build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \ + --build-arg HOROVOD_PIP="horovod==0.28.1" \ + --build-arg TORCH_PIP="$(TORCH_PIP_DEEPSPEED_GPU)" \ + --build-arg TORCH_TB_PROFILER_PIP="$(TORCH_TB_PROFILER_PIP)" \ + --build-arg TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.0" \ + --build-arg APEX_GIT="https://github.com/determined-ai/apex.git@3caf0f40c92e92b40051d3afff8568a24b8be28d" \ + --build-arg DEEPSPEED_PIP="deepspeed==$(DEEPSPEED_VERSION)" \ + --build-arg WITH_MPICH=$(WITH_MPICH) \ + -t $(DOCKERHUB_REGISTRY)/$(ROCM57_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED)-$(SHORT_GIT_HASH) \ + -t $(DOCKERHUB_REGISTRY)/$(ROCM57_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED)-$(VERSION) \ + . + + + #--build-arg BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" \ + #DOCKER_BUILDKIT=0 docker build --shm-size='1gb' -f Dockerfile-default-rocm \ + +export ROCM61_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED := $(ROCM_61_PREFIX)pytorch-2.0-tf-2.10-rocm-deepspeed +.PHONY: build-pytorch20-tf210-rocm61-deepspeed +build-pytorch20-tf210-rocm61-deepspeed: + docker build --shm-size='1gb' -f Dockerfile-default-rocm \ + --build-arg BASE_IMAGE="rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2" \ + --build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \ + --build-arg HOROVOD_PIP="0" \ + --build-arg TORCH_PIP="$(TORCH_PIP_DEEPSPEED_GPU)" \ + --build-arg TORCH_TB_PROFILER_PIP="$(TORCH_TB_PROFILER_PIP)" \ + --build-arg TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.0" \ + --build-arg APEX_GIT="https://github.com/determined-ai/apex.git@3caf0f40c92e92b40051d3afff8568a24b8be28d" \ + --build-arg DEEPSPEED_PIP="deepspeed==$(DEEPSPEED_VERSION)" \ + --build-arg WITH_MPICH=$(WITH_MPICH) \ + -t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED)-$(SHORT_GIT_HASH) \ + -t $(DOCKERHUB_REGISTRY)/$(ROCM61_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED)-$(VERSION) \ + . + + DEEPSPEED_VERSION := 0.8.3 export GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME := deepspeed-cuda-gpt-neox From 3d95ef166b749cc3227e17d35fdae9cba8e2e94d Mon Sep 17 00:00:00 2001 From: will-HPE Date: Tue, 16 Jul 2024 19:44:07 +0000 Subject: [PATCH 3/5] cleaned up some commented out clode. --- Makefile | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/Makefile b/Makefile index cd9bea73..ab1e6d15 100644 --- a/Makefile +++ b/Makefile @@ -222,9 +222,6 @@ ROCM61_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-mpich else ROCM61_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-ompi endif - #--build-arg BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" \ - #?? --build-arg BASE_IMAGE="amdih/uif-pytorch:uif1.2_rocm5.6.1_vai3.5_py3.8_pytorch1.13" \ - #--build-arg BASE_IMAGE="rocm/pytorch:rocm6.0.2_ubuntu22.04_py3.10_pytorch_2.1.2" \ export ROCM61_TORCH_TF_ENVIRONMENT_NAME := $(ROCM_60_PREFIX)$(ROCM61_TORCH_MPI) .PHONY: build-pytorch20-tf210-rocm60 @@ -277,10 +274,7 @@ build-pytorch20-rocm61: -#export ROCM60_TF_ENVIRONMENT_NAME := $(ROCM_60_PREFIX)$(ROCM61_TORCH_MPI) export ROCM60_TF_ENVIRONMENT_NAME := $(ROCM_60_TF_PREFIX) - #--build-arg BASE_IMAGE="rocm/tensorflow:rocm6.0-tf2.12-dev" \ -.PHONY: build-tf210-rocm60 build-tf210-rocm60: docker build -f Dockerfile-tensorflow-rocm \ --build-arg BASE_IMAGE="rocm/tensorflow:rocm6.1-py3.9-tf2.15-dev" \ @@ -313,10 +307,6 @@ build-pytorch20-tf210-rocm57-deepspeed: -t $(DOCKERHUB_REGISTRY)/$(ROCM57_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED)-$(VERSION) \ . - - #--build-arg BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" \ - #DOCKER_BUILDKIT=0 docker build --shm-size='1gb' -f Dockerfile-default-rocm \ - export ROCM61_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED := $(ROCM_61_PREFIX)pytorch-2.0-tf-2.10-rocm-deepspeed .PHONY: build-pytorch20-tf210-rocm61-deepspeed build-pytorch20-tf210-rocm61-deepspeed: From e58c7c507ccc2214f25e67ca948e59b00af3a5a5 Mon Sep 17 00:00:00 2001 From: will-HPE Date: Tue, 16 Jul 2024 21:11:00 +0000 Subject: [PATCH 4/5] bumped VERSION. --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 8df3f459..c9ec1d54 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.33.1 +0.33.2 From a1137b325c4d87b08858a9f63fa339908d5a18c1 Mon Sep 17 00:00:00 2001 From: Will B Date: Mon, 5 Aug 2024 19:22:09 +0000 Subject: [PATCH 5/5] fixed deepspeed; many changes working toward support of huggingface --- Dockerfile-infinityhub-pytorch | 33 +++++++++++++++---- Makefile | 7 ++-- .../additional-requirements-rocm.txt | 9 ++--- 3 files changed, 36 insertions(+), 13 deletions(-) diff --git a/Dockerfile-infinityhub-pytorch b/Dockerfile-infinityhub-pytorch index c8d95a64..1d50ad7b 100644 --- a/Dockerfile-infinityhub-pytorch +++ b/Dockerfile-infinityhub-pytorch @@ -1,23 +1,26 @@ ARG BASE_IMAGE FROM ${BASE_IMAGE} #why no highlighting? -ENV PYTHONUNBUFFERED=1 PYTHONFAULTHANDLER=1 PYTHONHASHSEED=0 +ENV PYTHONUNBUFFERED=1 PYTHONFAULTHANDLER=1 PYTHONHASHSEED=0 TT=0 RUN mkdir -p /var/run/sshd RUN rm /etc/apt/sources.list.d/rocm.list -RUN pip install pip install --upgrade pip +RUN pip install --upgrade pip COPY dockerfile_scripts /tmp/det_dockerfile_scripts -RUN /tmp/det_dockerfile_scripts/install_deb_packages.sh +RUN apt-get update && /tmp/det_dockerfile_scripts/install_deb_packages.sh +RUN python --version +#RUN /tmp/det_dockerfile_scripts/install_deb_packages.sh # LIBFABRIC ISSUE # USE CONDA FOR WORKAROUND #TODO: MAY NOT BE A PROBLEM ANYMORE? # protect this image from slurm -ENV PATH="/opt/conda/envs/py_3.8/bin:${PATH}" +#ENV PATH="/opt/conda/envs/py_3.8/bin:${PATH}" ARG CONDA="${PATH}" +#RUN exit 1 # Install fixed version of FFI package for Ubuntu 20.04. # This is done after above stuff to make sure we get right version. RUN /tmp/det_dockerfile_scripts/install_package_fixes.sh @@ -26,6 +29,8 @@ RUN /tmp/det_dockerfile_scripts/install_libnss_determined.sh RUN pip install determined && pip uninstall -y determined +RUN apt update +RUN DEBIAN_FRONTEND=noninteractive apt-get install -y cargo RUN python -m pip install -r /tmp/det_dockerfile_scripts/additional-requirements-rocm.txt @@ -39,12 +44,26 @@ ENV JUPYTER_RUNTIME_DIR=/run/determined/jupyter/runtime #ENV HSA_FORCE_FINE_GRAIN_PCIE=1 #TODO: check if this is necessary #RUN ldconfig #TODO: check if this is necessary - +RUN echo A #TODO: finish iterating here, preferably turn it into a shell script. -RUN if [ -n "$DEEPSPEED_PIP" ]; then DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev&& git clone https://github.com/ROCmSoftwarePlatform/triton.git && cd triton && git checkout triton-mlir && cd python && pip3 install ninja cmake && python setup.py install;fi -RUN if [ -n "$DEEPSPEED_PIP" ]; then DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev&& python -m pip install pydantic==1.10.11 && git clone https://github.com/ROCmSoftwarePlatform/DeepSpeed.git && cd DeepSpeed && python3 setup.py build && python3 setup.py install && python -m deepspeed.env_report; fi +ARG DEEPSPEED_PIP +ARG DS_BUILD_CUTLASS_OPS=0 +ENV DS_BUILD_CUTLASS_OPS=0 + +#RUN if [ -n "$DEEPSPEED_PIP" ]; then DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev&& git clone https://github.com/ROCmSoftwarePlatform/triton.git && cd triton && git checkout triton-mlir && cd python && pip3 install ninja cmake && python setup.py install;fi +#RUN if [ -n "$DEEPSPEED_PIP" ]; then DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev&& git clone https://github.com/ROCmSoftwarePlatform/triton.git && cd triton && git checkout triton-mlir && cd python && pip3 install ninja cmake && python setup.py install;fi +RUN if [ -n "$DEEPSPEED_PIP" ]; then DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev&&pip3 install ninja cmake;pip3 install triton==2.3.1;fi +#RUN if [ -n "$DEEPSPEED_PIP" ]; then DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev&& python -m pip install pydantic==1.10.11 && git clone https://github.com/microsoft/DeepSpeed.git && cd DeepSpeed && DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_EVOFORMER_ATTN=0 python3 setup.py build && python3 setup.py install && python -m deepspeed.env_report; fi +RUN if [ -n "$DEEPSPEED_PIP" ]; then DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev&& python -m pip install pydantic==1.10.11 && git clone https://github.com/microsoft/DeepSpeed.git && cd DeepSpeed && DS_BUILD_OPS=1 DS_BUILD_RAGGED_DEVICE_OPS=0 DS_BUILD_CUTLASS_OPS=0 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_EVOFORMER_ATTN=0 DS_BUILD_RANDOM_LTD=0 DS_BUILD_FUSED_ADAM=0 DS_BUILD_CCL_COMM=0 python3 setup.py build && python3 setup.py install && python -m deepspeed.env_report; fi +#RUN if [ -n "$DEEPSPEED_PIP" ]; then DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev&& python -m pip install pydantic==1.10.11 && git clone https://github.com/ROCmSoftwarePlatform/DeepSpeed.git && cd DeepSpeed && python3 setup.py build && python3 setup.py install && python -m deepspeed.env_report; fi RUN if [ -n "$DEEPSPEED_PIP" ]; then python -m deepspeed.env_report ; fi +RUN pip list | grep -i deepspeed +RUN echo "$DEEPSPEED_PIP" +RUN pip install tokenizers>=0.19 +RUN pip install transformers==4.43.3 +RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash +#RUN exit 1 # MIOPEN_DEBUG_SAVE_TEMP_DIR is required to prevent # PAD-133 ENV MIOPEN_DEBUG_SAVE_TEMP_DIR=1 diff --git a/Makefile b/Makefile index ab1e6d15..06b63606 100644 --- a/Makefile +++ b/Makefile @@ -178,6 +178,11 @@ build-tensorflow-ngc: . + #DOCKER_BUILDKIT=0 docker build --shm-size='1gb' -f Dockerfile-infinityhub-pytorch \ + #docker build --shm-size='1gb' -f Dockerfile-infinityhub-pytorch \ + --build-arg TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.0" \ + +DEEPSPEED_VERSION := 0.13.0 export ROCM61_TORCH_TF_ENVIRONMENT_NAME_DEEPSPEED := $(ROCM_61_PREFIX)pytorch-2.0-tf-2.10-rocm-deepspeed .PHONY: build-pytorch-infinityhub build-pytorch-infinityhub: @@ -186,7 +191,6 @@ build-pytorch-infinityhub: --build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \ --build-arg TORCH_PIP="$(TORCH_PIP_DEEPSPEED_GPU)" \ --build-arg TORCH_TB_PROFILER_PIP="$(TORCH_TB_PROFILER_PIP)" \ - --build-arg TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.0" \ --build-arg APEX_GIT="https://github.com/determined-ai/apex.git@3caf0f40c92e92b40051d3afff8568a24b8be28d" \ --build-arg DEEPSPEED_PIP="deepspeed==$(DEEPSPEED_VERSION)" \ -t $(DOCKERHUB_REGISTRY)/$(INFINITYHUB_PYTORCH_REPO)-$(SHORT_GIT_HASH) \ @@ -285,7 +289,6 @@ build-tf210-rocm60: . -DEEPSPEED_VERSION := 0.8.3 export GPU_DEEPSPEED_ENVIRONMENT_NAME := $(CUDA_113_PREFIX)pytorch-1.10-deepspeed-$(DEEPSPEED_VERSION)$(GPU_SUFFIX) export GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME := $(CUDA_113_PREFIX)pytorch-1.10-gpt-neox-deepspeed$(GPU_SUFFIX) export TORCH_PIP_DEEPSPEED_GPU := torch==1.10.2+cu113 torchvision==0.11.3+cu113 torchaudio==0.10.2+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html diff --git a/dockerfile_scripts/additional-requirements-rocm.txt b/dockerfile_scripts/additional-requirements-rocm.txt index 95207da6..61e0ef58 100644 --- a/dockerfile_scripts/additional-requirements-rocm.txt +++ b/dockerfile_scripts/additional-requirements-rocm.txt @@ -11,12 +11,13 @@ analytics-python # latest google-api-python-client requires protobuf >= 3.20.1 protobuf<=3.20.3 tensorboard==2.10.1 -tokenizers==0.13.0 -huggingface-hub==0.16.4 +tokenizers>=0.19.0 +huggingface-hub>=0.20.1 # necessary for benchmarks, but really should go into startup-hook.sh for that workflow -accelerate>=0.12.0 +accelerate>=0.31.0 datasets sentencepiece evaluate scikit-learn -transformers +#transformers +transformers==4.43.3