# Patch summary (text before the first "diff --git" header is commentary, not patch content).
#
# buildkite/test-template-ci.j2
#   * AMD mirror steps take their Buildkite queue from step.agent_pool, falling
#     back to amd_mi325_1 when the step does not set one. This replaces the
#     hard-coded label -> queue if/elif chain.
#   * The step label uses the same fallback, so it names the queue the job is
#     actually routed to instead of rendering an empty/"None" prefix.
#   * soft_fail is false only for "Regression Test" and "Engine Test"; every
#     other AMD mirror step is soft_fail: true.
#
# buildkite/test-template-fastcheck.j2
#   * The AMD image build step becomes a hard failure (soft_fail: false).
#   * The "Basic Correctness Test" block/label titles and queue follow the same
#     step.agent_pool-with-fallback rule as above.
#
# NOTE(review): leading whitespace on every line below was reconstructed from a
# whitespace-collapsed copy of this patch -- confirm it against the target files
# (or apply with whitespace-insensitive matching) before use.
# NOTE(review): the "index" hashes are carried over unchanged and do not reflect
# the edited added lines; regenerate the patch if exact blob ids matter.
diff --git a/buildkite/test-template-ci.j2 b/buildkite/test-template-ci.j2
index 9be92f3d..981ed1b7 100644
--- a/buildkite/test-template-ci.j2
+++ b/buildkite/test-template-ci.j2
@@ -471,24 +471,24 @@ steps:
 
     {% for step in steps %}
     {% if step.mirror_hardwares and mirror_hw in step.mirror_hardwares %}
-      - label: "AMD MI300: {{ step.label }}"
+      - label: "{{ step.agent_pool or 'amd_mi325_1' }}: {{ step.label }}"
         depends_on: amd-build
         agents:
-    {% if step.label and step.label=="Benchmarks" or step.label=="Kernels Attention Test %N" or step.label=="LoRA Test %N" or step.label=="Kernels Quantization Test %N" %}
-          queue: amd_mi325_8
-    {% elif step.label=="Distributed Tests (4 GPUs)" or step.label=="2 Node Tests (4 GPUs in total)" or step.label=="Multi-step Tests (4 GPUs)" or step.label=="Pipeline Parallelism Test" or step.label=="LoRA TP Test (Distributed)" %}
-          queue: amd_mi325_4
-    {% elif step.label=="Distributed Comm Ops Test" or step.label=="Distributed Tests (2 GPUs)" or step.label=="Plugin Tests (2 GPUs)" or step.label=="Weight Loading Multiple GPU Test" or step.label=="Weight Loading Multiple GPU Test - Large Models" %}
-          queue: amd_mi325_2
-    {% else %}
-          queue: amd_mi325_1
-    {% endif%}
+    {% if step.agent_pool %}
+          queue: {{ step.agent_pool }}
+    {% else %}
+          queue: amd_mi325_1
+    {% endif %}
         command: bash .buildkite/scripts/hardware_ci/run-amd-test.sh "(command rocm-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}"
         env:
           DOCKER_BUILDKIT: "1"
         priority: 100
+    {% if step.label in ["Regression Test", "Engine Test"] %}
         soft_fail: false
-    {% endif %}
+    {% else %}
+        soft_fail: true
+    {% endif %}
+    {% endif %}
     {% endfor %}
     {% for step in steps %}
     # removed because of lack of HW resources: step.label and step.label=="Benchmarks" or step.label=="Pipeline Parallelism Test" or
diff --git a/buildkite/test-template-fastcheck.j2 b/buildkite/test-template-fastcheck.j2
index 49c3d593..0bba294e 100644
--- a/buildkite/test-template-fastcheck.j2
+++ b/buildkite/test-template-fastcheck.j2
@@ -326,7 +326,7 @@ steps:
     steps:
       - label: "AMD: :docker: build image with {{mirror_hw}}"
         depends_on: ~
-        soft_fail: true
+        soft_fail: false
         commands:
           - "docker build --build-arg max_jobs=16 --build-arg REMOTE_VLLM=1 --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942' --build-arg VLLM_BRANCH=$BUILDKITE_COMMIT --tag {{ docker_image_amd }} -f docker/Dockerfile.rocm --target test --no-cache --progress plain ."
           - "docker push {{ docker_image_amd }}"
@@ -346,14 +346,18 @@ steps:
     {% for step in steps %}
     {% if step.mirror_hardwares and mirror_hw in step.mirror_hardwares %}
     {% if step.label and step.label=="Basic Correctness Test" %}
-      - block: "Run AMD MI300: {{ step.label }} with {{mirror_hw}}"
+      - block: "{{ step.agent_pool or 'amd_mi325_1' }}: {{ step.label }}"
         key: block-amd-{{ step.label | replace(" ", "-") | lower | replace("(", "") | replace(")", "") | replace("%", "") | replace(",", "-") | replace("+", "-") }}
         depends_on: amd-build
 
-      - label: "AMD MI300: {{ step.label }} with {{mirror_hw}}"
+      - label: "{{ step.agent_pool or 'amd_mi325_1' }}: {{ step.label }}"
         depends_on: block-amd-{{ step.label | replace(" ", "-") | lower | replace("(", "") | replace(")", "") | replace("%", "") | replace(",", "-") | replace("+", "-") }}
         agents:
-          queue: amd_mi300_1
+    {% if step.agent_pool %}
+          queue: {{ step.agent_pool }}
+    {% else %}
+          queue: amd_mi325_1
+    {% endif %}
         command: bash .buildkite/scripts/hardware_ci/run-amd-test.sh "(command rocm-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}"
         env:
           DOCKER_BUILDKIT: "1"