Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -206,10 +206,10 @@ build-pytorch10-tf27-rocm50:
-t $(DOCKERHUB_REGISTRY)/$(ROCM50_TORCH_TF_ENVIRONMENT_NAME)-$(VERSION) \
.

DEEPSPEED_VERSION := 0.8.3
export GPU_DEEPSPEED_ENVIRONMENT_NAME := $(CUDA_113_PREFIX)pytorch-1.10-deepspeed-$(DEEPSPEED_VERSION)$(GPU_SUFFIX)
export GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME := $(CUDA_113_PREFIX)pytorch-1.10-gpt-neox-deepspeed$(GPU_SUFFIX)
export TORCH_PIP_DEEPSPEED_GPU := torch==1.10.2+cu113 torchvision==0.11.3+cu113 torchaudio==0.10.2+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
# DeepSpeed version string; it is embedded in GPU_DEEPSPEED_ENVIRONMENT_NAME below.
DEEPSPEED_VERSION := 0.9.2
# Image names for the DeepSpeed GPU environments. Only the plain DeepSpeed name
# carries $(DEEPSPEED_VERSION); the gpt-neox name does not.
# NOTE(review): the "pytorch-1.12" part of both names is hard-coded and should
# presumably track the torch pin in TORCH_PIP_DEEPSPEED_GPU -- confirm when bumping.
export GPU_DEEPSPEED_ENVIRONMENT_NAME := $(CUDA_113_PREFIX)pytorch-1.12-deepspeed-$(DEEPSPEED_VERSION)$(GPU_SUFFIX)
export GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME := $(CUDA_113_PREFIX)pytorch-1.12-gpt-neox-deepspeed$(GPU_SUFFIX)
# pip arguments for torch: the pinned +cu113 build, with -f pointing pip at the
# PyTorch cu113 wheel index.
export TORCH_PIP_DEEPSPEED_GPU := torch==1.12.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
# Pinned torch-tb-profiler requirement; passed to docker build as the
# TORCH_TB_PROFILER_PIP build argument.
export TORCH_TB_PROFILER_PIP := torch-tb-profiler==0.4.1

# This builds the DeepSpeed environment from upstream microsoft/DeepSpeed.
Expand Down Expand Up @@ -239,8 +239,8 @@ build-gpt-neox-deepspeed-gpu: build-gpu-cuda-113-base
--build-arg TORCH_TB_PROFILER_PIP="$(TORCH_TB_PROFILER_PIP)" \
--build-arg TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.0" \
--build-arg APEX_GIT="https://github.com/determined-ai/apex.git@3caf0f40c92e92b40051d3afff8568a24b8be28d" \
--build-arg "$(NCCL_BUILD_ARG)" \
--build-arg DEEPSPEED_PIP="git+https://github.com/determined-ai/deepspeed.git@eleuther_dai" \
--build-arg DET_BUILD_NCCL="" \
--build-arg DEEPSPEED_PIP="git+https://github.com/determined-ai/deepspeed.git@determined2#egg=deepspeed" \
-t $(DOCKERHUB_REGISTRY)/$(GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
-t $(DOCKERHUB_REGISTRY)/$(GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME)-$(VERSION) \
-t $(NGC_REGISTRY)/$(GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
Expand Down
29 changes: 26 additions & 3 deletions dockerfile_scripts/install_deepspeed.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,30 @@
set -e

DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev
# Triton is needed to build deepspeed's sparse_attn operation.
python -m pip install triton==1.0.0
DS_BUILD_OPS=1 python -m pip install $DEEPSPEED_PIP --no-binary deepspeed
# Not building sparse attn operation which depends on a very old version of triton
DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 python -m pip install $DEEPSPEED_PIP --no-binary deepspeed
python -m deepspeed.env_report

# Build gpt-neox and its dependencies, but only when the gpt-neox flavour of
# deepspeed (a DEEPSPEED_PIP spec containing "determined2") is being installed.
if [[ "$DEEPSPEED_PIP" == *"determined2"* ]]; then
    # Single absolute checkout location used by every step below. The clone
    # used to go to the relative path "gpt-neox" while later steps read
    # "/gpt-neox", which only worked when the working directory was "/".
    GPT_NEOX_DIR=/gpt-neox

    # Triton is needed for flash attn
    python -m pip install triton==2.0.0.dev20221202
    # This is a dependency of gpt-neox
    DEBIAN_FRONTEND=noninteractive apt-get install -y mpich
    # Need this to avoid `AttributeError: module 'distutils' has no attribute 'version'` when importing tensorboard. See https://github.com/pytorch/pytorch/issues/69894.
    python -m pip install setuptools==59.5.0

    # Install gpt-neox and dependencies
    git clone -b determined2 https://github.com/determined-ai/gpt-neox.git "$GPT_NEOX_DIR"
    python "$GPT_NEOX_DIR/megatron/fused_kernels/setup.py" install

    # Exclude DeeperSpeed reinstall since the version in requirements is not pinned.
    # The command substitution is intentionally unquoted: each remaining line of
    # the requirements file is passed to pip as separate arguments.
    # NOTE(review): this relies on the file holding only plain requirement
    # specifiers (no comments or markers containing spaces) -- confirm.
    python -m pip install $(grep -ivE "DeeperSpeed" "$GPT_NEOX_DIR/requirements/requirements.txt")
    python -m pip install -r "$GPT_NEOX_DIR/requirements/requirements-flashattention.txt"

    # Download sample data. These are two separate commands on purpose: in an
    # `a && b` list a failure of `a` does not trigger `set -e`, so a failed
    # download would otherwise be silently ignored.
    gsutil cp -r gs://determined-ai-public-datasets/text_data "$GPT_NEOX_DIR"
    mv "$GPT_NEOX_DIR/text_data" "$GPT_NEOX_DIR/data"

    # Modify permissions to enable example to run in nonroot mode
    chmod -R 777 "$GPT_NEOX_DIR"
    # NOTE(review): a numeric 777 on /tmp likely also clears its sticky bit -- confirm this is intended.
    chmod -R 777 /tmp
fi