Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 67 additions & 11 deletions buildkite/test-template-ci.j2
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,16 @@ steps:
{% endif %}
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- |
#!/bin/bash
# Create buildx builder with docker-container driver if it doesn't exist
if ! docker buildx inspect vllm-builder > /dev/null 2>&1; then
echo "Creating buildx builder..."
docker buildx create --name vllm-builder --driver docker-container --use --bootstrap
else
echo "Using existing buildx builder..."
docker buildx use vllm-builder
fi
- |
#!/bin/bash
if [[ -z $(docker manifest inspect {{ docker_image }}) ]]; then
Expand All @@ -401,21 +411,23 @@ steps:
exit 0
fi
- >
docker build --file docker/Dockerfile
docker buildx build --file docker/Dockerfile
--build-arg max_jobs=16
--build-arg buildkite_commit=$BUILDKITE_COMMIT
--build-arg USE_SCCACHE=1
--build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0"
--build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a"
--cache-from type=registry,ref=public.ecr.aws/q9t5s3a7/vllm-ci-{% if branch == "main" %}postmerge{% else %}test{% endif %}-repo:buildcache
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @rzabarazesh, I see that you currently have a separate buildcache that you cache from and cache to. However, the example run in your PR description caches from the :latest image and does not use --cache-to. I am just wondering why you chose to change this? My PR (#174) also has a separate build cache and uses --cache-from and --cache-to; however, I am also seeing a lot of overhead from --cache-to, so I am wondering if your original approach might be better?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The "example ci" build in the PR description is outdated. At that point I was trying another approach with BUILDKIT_INLINE_CACHE.
Basically this:

        docker build --file docker/Dockerfile
        --build-arg max_jobs=16
        --build-arg buildkite_commit=$BUILDKITE_COMMIT
        --build-arg USE_SCCACHE=1
        --build-arg BUILDKIT_INLINE_CACHE=1
        --build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0"
        --build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a"
        --cache-from public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:latest
        {% if branch != "main" %}
        --build-arg VLLM_USE_PRECOMPILED={{ vllm_use_precompiled | default("0") }}
        --build-arg VLLM_DOCKER_BUILD_CONTEXT=1{% if vllm_use_precompiled is defined and vllm_use_precompiled == "1" %}
        --build-arg USE_FLASHINFER_PREBUILT_WHEEL=true{% endif %}
        {% endif %}
        --tag {{ docker_image }}
        --target test
        --progress plain .
      - "docker push {{ docker_image }}"
      {% if branch == "main" %}
      - "docker tag {{ docker_image }} {{ docker_image_latest }}"
      - "docker push {{ docker_image_latest }}"
      {% endif %}

--cache-to type=registry,ref=public.ecr.aws/q9t5s3a7/vllm-ci-{% if branch == "main" %}postmerge{% else %}test{% endif %}-repo:buildcache,mode=max
{% if branch != "main" %}
--build-arg VLLM_USE_PRECOMPILED={{ vllm_use_precompiled | default("0") }}
--build-arg VLLM_DOCKER_BUILD_CONTEXT=1{% if vllm_use_precompiled is defined and vllm_use_precompiled == "1" %}
--build-arg USE_FLASHINFER_PREBUILT_WHEEL=true{% endif %}
{% endif %}
--tag {{ docker_image }}
--target test
--progress plain .
- "docker push {{ docker_image }}"
--progress plain
--push .
{% if branch == "main" %}
- "docker tag {{ docker_image }} {{ docker_image_latest }}"
- "docker push {{ docker_image_latest }}"
Expand Down Expand Up @@ -444,6 +456,16 @@ steps:
{% endif %}
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- |
#!/bin/bash
# Create buildx builder with docker-container driver if it doesn't exist
if ! docker buildx inspect vllm-builder > /dev/null 2>&1; then
echo "Creating buildx builder..."
docker buildx create --name vllm-builder --driver docker-container --use --bootstrap
else
echo "Using existing buildx builder..."
docker buildx use vllm-builder
fi
- |
#!/bin/bash
if [[ -z $(docker manifest inspect {{ docker_image_cu118 }}) ]]; then
Expand All @@ -453,21 +475,23 @@ steps:
exit 0
fi
- >
docker build
docker buildx build
--file docker/Dockerfile
--build-arg max_jobs=16
--build-arg buildkite_commit=$BUILDKITE_COMMIT
--build-arg USE_SCCACHE=1
--build-arg CUDA_VERSION=11.8.0
--cache-from type=registry,ref=public.ecr.aws/q9t5s3a7/vllm-ci-{% if branch == "main" %}postmerge{% else %}test{% endif %}-repo:buildcache-cu118
--cache-to type=registry,ref=public.ecr.aws/q9t5s3a7/vllm-ci-{% if branch == "main" %}postmerge{% else %}test{% endif %}-repo:buildcache-cu118,mode=max
{% if branch != "main" %}
--build-arg VLLM_USE_PRECOMPILED={{ vllm_use_precompiled | default("0") }}
--build-arg VLLM_DOCKER_BUILD_CONTEXT=1{% if vllm_use_precompiled is defined and vllm_use_precompiled == "1" %}
--build-arg USE_FLASHINFER_PREBUILT_WHEEL=true{% endif %}
{% endif %}
--tag {{ docker_image_cu118 }}
--target test
--progress plain .
- "docker push {{ docker_image_cu118 }}"
--progress plain
--push .
env:
DOCKER_BUILDKIT: "1"
retry:
Expand All @@ -488,6 +512,16 @@ steps:
{% endif %}
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- |
#!/bin/bash
# Create buildx builder with docker-container driver if it doesn't exist
if ! docker buildx inspect vllm-builder > /dev/null 2>&1; then
echo "Creating buildx builder..."
docker buildx create --name vllm-builder --driver docker-container --use --bootstrap
else
echo "Using existing buildx builder..."
docker buildx use vllm-builder
fi
- |
#!/bin/bash
if [[ -z $(docker manifest inspect {{ docker_image_cpu }}) ]]; then
Expand All @@ -496,8 +530,18 @@ steps:
echo "Image found"
exit 0
fi
- "docker build --file docker/Dockerfile.cpu --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --tag {{ docker_image_cpu }} --target vllm-test --progress plain ."
- "docker push {{ docker_image_cpu }}"
- >
docker buildx build --file docker/Dockerfile.cpu
--build-arg max_jobs=16
--build-arg buildkite_commit=$BUILDKITE_COMMIT
--build-arg VLLM_CPU_AVX512BF16=true
--build-arg VLLM_CPU_AVX512VNNI=true
--cache-from type=registry,ref=public.ecr.aws/q9t5s3a7/vllm-ci-{% if branch == "main" %}postmerge{% else %}test{% endif %}-repo:buildcache-cpu
--cache-to type=registry,ref=public.ecr.aws/q9t5s3a7/vllm-ci-{% if branch == "main" %}postmerge{% else %}test{% endif %}-repo:buildcache-cpu,mode=max
--tag {{ docker_image_cpu }}
--target vllm-test
--progress plain
--push .
env:
DOCKER_BUILDKIT: "1"
retry:
Expand Down Expand Up @@ -574,6 +618,16 @@ steps:
timeout_in_minutes: 360
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- |
#!/bin/bash
# Create buildx builder with docker-container driver if it doesn't exist
if ! docker buildx inspect vllm-builder > /dev/null 2>&1; then
echo "Creating buildx builder..."
docker buildx create --name vllm-builder --driver docker-container --use --bootstrap
else
echo "Using existing buildx builder..."
docker buildx use vllm-builder
fi
- |
#!/bin/bash
if [[ -z $(docker manifest inspect {{ docker_image_torch_nightly }}) ]]; then
Expand All @@ -583,15 +637,17 @@ steps:
exit 0
fi
- >
docker build
docker buildx build
--file docker/Dockerfile.nightly_torch
--build-arg max_jobs=16
--build-arg buildkite_commit=$BUILDKITE_COMMIT
--build-arg USE_SCCACHE=1
--cache-from type=registry,ref=public.ecr.aws/q9t5s3a7/vllm-ci-{% if branch == "main" %}postmerge{% else %}test{% endif %}-repo:buildcache-nightly
--cache-to type=registry,ref=public.ecr.aws/q9t5s3a7/vllm-ci-{% if branch == "main" %}postmerge{% else %}test{% endif %}-repo:buildcache-nightly,mode=max
--tag {{ docker_image_torch_nightly }}
--target test
--progress plain .
- "docker push {{ docker_image_torch_nightly }}"
--progress plain
--push .
env:
DOCKER_BUILDKIT: "1"
retry:
Expand Down