Skip to content

Commit 23f1dcc

Browse files
authored
Merge branch 'main' into rmccormick/sglang_metrics
Signed-off-by: Keiven C <[email protected]>
2 parents 2678f28 + 6d62fc7 commit 23f1dcc

File tree

158 files changed

+8932
-2642
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

158 files changed

+8932
-2642
lines changed

.github/actions/docker-build/action.yml

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@ inputs:
99
description: 'Target to build'
1010
required: false
1111
default: 'runtime'
12+
platform:
13+
description: 'Docker platform to build on, ie. linux/amd64'
14+
required: false
15+
default: 'linux/amd64'
1216
image_tag:
1317
description: 'Custom image tag (optional, defaults to framework:latest)'
1418
required: false
@@ -63,18 +67,21 @@ runs:
6367
SCCACHE_S3_BUCKET: ${{ inputs.sccache_s3_bucket }}
6468
AWS_ACCESS_KEY_ID: ${{ inputs.aws_access_key_id }}
6569
AWS_SECRET_ACCESS_KEY: ${{ inputs.aws_secret_access_key }}
70+
PLATFORM: ${{ inputs.platform }}
6671
run: |
6772
# Determine image tag
6873
if [ -n "${{ inputs.image_tag }}" ]; then
6974
IMAGE_TAG="${{ inputs.image_tag }}"
7075
else
7176
IMAGE_TAG="${{ inputs.framework }}:latest"
7277
fi
73-
echo "image_tag=$IMAGE_TAG" >> $GITHUB_OUTPUT
7478
79+
echo "image_tag=$IMAGE_TAG" >> $GITHUB_OUTPUT
7580
./container/build.sh --tag "$IMAGE_TAG" \
7681
--target ${{ inputs.target }} \
82+
--vllm-max-jobs 10 \
7783
--framework ${{ inputs.framework }} \
84+
--platform ${{ inputs.platform }} \
7885
--use-sccache \
7986
--sccache-bucket "$SCCACHE_S3_BUCKET" \
8087
--sccache-region "$AWS_DEFAULT_REGION"
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
# Composite action: tag a locally built Docker image and push it to AWS ECR
# and/or Azure Container Registry (ACR), depending on the aws_push/azure_push
# flags. Exposes the pushed Name:Tag as the `image_tag` output.
name: 'Docker Tag and Push'
description: 'Tag and Push Docker Images'

inputs:
  local_image:
    description: 'Local Image Name:Tag'
    required: true
  push_tag:
    description: 'Target Name:Tag'
    required: true
  aws_push:
    description: 'Push to AWS Boolean'
    required: false
    default: 'false'
  azure_push:
    description: 'Push to Azure Container Registry (ACR) Boolean'
    required: false
    default: 'false'
  aws_account_id:
    description: 'AWS Account ID'
    required: false
  aws_default_region:
    description: 'AWS Default Region'
    required: false
  aws_access_key_id:
    description: 'AWS Access Key ID'
    required: false
  aws_secret_access_key:
    description: 'AWS Secret Access Key'
    required: false
  azure_acr_hostname:
    description: 'Azure ACR hostname'
    required: false
  azure_acr_user:
    description: 'Azure ACR user'
    required: false
  azure_acr_password:
    description: 'Azure ACR password'
    required: false

outputs:
  image_tag:
    description: 'Image Tag'
    value: ${{ inputs.push_tag }}

runs:
  using: "composite"
  steps:
    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v3

    - name: Install awscli
      if: ${{ inputs.aws_push == 'true' }}
      shell: bash
      run: |
        # Download and unpack in a scratch directory so the archive and the
        # unpacked installer never land in the checked-out workspace.
        workdir="$(mktemp -d)"
        # -f makes curl fail on HTTP errors instead of saving an error page.
        curl -fsSL "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "${workdir}/awscliv2.zip"
        unzip -q "${workdir}/awscliv2.zip" -d "${workdir}"
        # --update lets the installer succeed when AWS CLI v2 is already present.
        sudo "${workdir}/aws/install" --update
        rm -rf "${workdir}"

    - name: ECR Login
      if: ${{ inputs.aws_push == 'true' }}
      shell: bash
      # Inputs are handed to the script through env so their contents are
      # treated as data by the shell rather than spliced into the script text.
      env:
        ECR_HOSTNAME: ${{ inputs.aws_account_id }}.dkr.ecr.${{ inputs.aws_default_region }}.amazonaws.com
        ECR_REGION: ${{ inputs.aws_default_region }}
        ECR_ACCESS_KEY_ID: ${{ inputs.aws_access_key_id }}
        ECR_SECRET_ACCESS_KEY: ${{ inputs.aws_secret_access_key }}
      run: |
        # Use the key inputs only when the caller supplied both; otherwise
        # whatever credentials the runner already provides stay in effect.
        if [ -n "${ECR_ACCESS_KEY_ID}" ] && [ -n "${ECR_SECRET_ACCESS_KEY}" ]; then
          export AWS_ACCESS_KEY_ID="${ECR_ACCESS_KEY_ID}"
          export AWS_SECRET_ACCESS_KEY="${ECR_SECRET_ACCESS_KEY}"
        fi
        aws ecr get-login-password --region "${ECR_REGION}" \
          | docker login --username AWS --password-stdin "${ECR_HOSTNAME}"

    - name: ACR Login
      if: ${{ inputs.azure_push == 'true' }}
      shell: bash
      # The password goes through env (not inline interpolation) so quotes or
      # shell metacharacters in it cannot break or alter the command.
      env:
        AZURE_ACR_HOSTNAME: ${{ inputs.azure_acr_hostname }}
        AZURE_ACR_USER: ${{ inputs.azure_acr_user }}
        AZURE_ACR_PASSWORD: ${{ inputs.azure_acr_password }}
      run: |
        printf '%s' "${AZURE_ACR_PASSWORD}" \
          | docker login "${AZURE_ACR_HOSTNAME}" --username "${AZURE_ACR_USER}" --password-stdin

    - name: ECR Tag and Push
      if: ${{ inputs.aws_push == 'true' }}
      shell: bash
      env:
        LOCAL_IMAGE: ${{ inputs.local_image }}
        PUSH_TAG: ${{ inputs.push_tag }}
        ECR_HOSTNAME: ${{ inputs.aws_account_id }}.dkr.ecr.${{ inputs.aws_default_region }}.amazonaws.com
      run: |
        docker tag "${LOCAL_IMAGE}" "${ECR_HOSTNAME}/${PUSH_TAG}"
        docker push "${ECR_HOSTNAME}/${PUSH_TAG}"

    - name: ACR Tag and Push
      if: ${{ inputs.azure_push == 'true' }}
      shell: bash
      env:
        LOCAL_IMAGE: ${{ inputs.local_image }}
        PUSH_TAG: ${{ inputs.push_tag }}
        AZURE_ACR_HOSTNAME: ${{ inputs.azure_acr_hostname }}
      run: |
        docker tag "${LOCAL_IMAGE}" "${AZURE_ACR_HOSTNAME}/${PUSH_TAG}"
        docker push "${AZURE_ACR_HOSTNAME}/${PUSH_TAG}"

.github/workflows/container-validation-backends.yml

Lines changed: 115 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ on:
88
branches:
99
- main
1010
- "pull-request/[0-9]+"
11+
- release/*.*.*
1112

1213
concurrency:
1314
group: ${{ github.workflow }}-build-test-${{ github.ref_name || github.run_id }}
@@ -37,9 +38,16 @@ jobs:
3738
echo '${{ toJson(needs) }}' | jq -e 'to_entries | map(.value.result) | all(. as $result | ["success", "skipped"] | any($result == .))'
3839
3940
vllm:
40-
runs-on: gpu-l40-amd64
4141
needs: changed-files
4242
if: needs.changed-files.outputs.has_code_changes == 'true'
43+
strategy:
44+
fail-fast: false
45+
matrix:
46+
platform:
47+
- { arch: amd64, runner: gpu-l40-amd64 }
48+
- { arch: arm64, runner: cpu-arm-r8g-4xlarge }
49+
name: vllm (${{ matrix.platform.arch }})
50+
runs-on: ${{ matrix.platform.runner }}
4351
steps:
4452
- name: Checkout code
4553
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
@@ -49,22 +57,53 @@ jobs:
4957
with:
5058
framework: vllm
5159
target: runtime
60+
platform: 'linux/${{ matrix.platform.arch }}'
5261
ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
5362
ci_token: ${{ secrets.CI_TOKEN }}
5463
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
5564
sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
5665
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
5766
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
58-
- name: Run tests
67+
- name: Docker Tag and Push
68+
uses: ./.github/actions/docker-tag-push
69+
with:
70+
local_image: ${{ steps.build-image.outputs.image_tag }}
71+
push_tag: ai-dynamo/dynamo:${{ github.sha }}-vllm-${{ matrix.platform.arch }}
72+
# OPS-1145: Switch aws_push to true
73+
aws_push: 'false'
74+
azure_push: 'true'
75+
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
76+
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
77+
azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
78+
azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
79+
azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
80+
- name: Run unit tests
81+
if: ${{ matrix.platform.arch != 'arm64' }}
82+
uses: ./.github/actions/pytest
83+
with:
84+
image_tag: ${{ steps.build-image.outputs.image_tag }}
85+
pytest_marks: "unit and vllm and gpu_1"
86+
- name: Run e2e tests
87+
if: ${{ matrix.platform.arch != 'arm64' }}
5988
uses: ./.github/actions/pytest
6089
with:
6190
image_tag: ${{ steps.build-image.outputs.image_tag }}
6291
pytest_marks: "e2e and vllm and gpu_1 and not slow"
6392

6493
sglang:
65-
runs-on: gpu-l40-amd64
6694
needs: changed-files
6795
if: needs.changed-files.outputs.has_code_changes == 'true'
96+
# OPS-1140: Uncomment this for sglang arm switch to wideep
97+
# strategy:
98+
# fail-fast: false
99+
# matrix:
100+
# platform:
101+
# - { arch: amd64, runner: gpu-l40-amd64 }
102+
# - { arch: arm64, runner: cpu-arm-r8g-4xlarge }
103+
# name: sglang (${{ matrix.platform.arch }})
104+
# runs-on: ${{ matrix.platform.runner }}
105+
# OPS-1140: Remove this runs-on line, replaced with the above line
106+
runs-on: gpu-l40-amd64
68107
steps:
69108
- name: Checkout repository
70109
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
@@ -74,22 +113,49 @@ jobs:
74113
with:
75114
framework: sglang
76115
target: runtime
116+
platform: 'linux/amd64'
117+
# OPS-1140: Replace the above line with the uncommented below line
118+
# platform: 'linux/${{ matrix.platform.arch }}'
77119
ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
78120
ci_token: ${{ secrets.CI_TOKEN }}
79121
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
80122
sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
81123
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
82124
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
125+
- name: Docker Tag and Push
126+
uses: ./.github/actions/docker-tag-push
127+
with:
128+
local_image: ${{ steps.build-image.outputs.image_tag }}
129+
push_tag: ai-dynamo/dynamo:${{ github.sha }}-sglang-amd64
130+
# OPS-1140: Replace the above line with the uncommented below line
131+
# push_tag: ai-dynamo/dynamo:${{ github.sha }}-sglang-${{ matrix.platform.arch }}
132+
# OPS-1145: Switch aws_push to true
133+
aws_push: 'false'
134+
azure_push: 'true'
135+
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
136+
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
137+
azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
138+
azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
139+
azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
83140
- name: Run tests
141+
# OPS-1140: Uncomment the below line
142+
# if: ${{ matrix.platform.arch != 'arm64' }}
84143
uses: ./.github/actions/pytest
85144
with:
86145
image_tag: ${{ steps.build-image.outputs.image_tag }}
87146
pytest_marks: "e2e and sglang and gpu_1"
88147

89148
trtllm:
90-
runs-on: gpu-l40-amd64
91149
needs: changed-files
92150
if: needs.changed-files.outputs.has_code_changes == 'true'
151+
strategy:
152+
fail-fast: false
153+
matrix:
154+
platform:
155+
- { arch: amd64, runner: gpu-l40-amd64 }
156+
- { arch: arm64, runner: cpu-arm-r8g-4xlarge }
157+
name: trtllm (${{ matrix.platform.arch }})
158+
runs-on: ${{ matrix.platform.runner }}
93159
steps:
94160
- name: Checkout code
95161
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
@@ -99,14 +165,59 @@ jobs:
99165
with:
100166
framework: trtllm
101167
target: runtime
168+
platform: 'linux/${{ matrix.platform.arch }}'
102169
ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
103170
ci_token: ${{ secrets.CI_TOKEN }}
104171
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
105172
sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
106173
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
107174
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
175+
- name: Docker Tag and Push
176+
uses: ./.github/actions/docker-tag-push
177+
with:
178+
local_image: ${{ steps.build-image.outputs.image_tag }}
179+
push_tag: ai-dynamo/dynamo:${{ github.sha }}-trtllm-${{ matrix.platform.arch }}
180+
# OPS-1145: Switch aws_push to true
181+
aws_push: 'false'
182+
azure_push: 'true'
183+
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
184+
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
185+
azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
186+
azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
187+
azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
108188
- name: Run tests
189+
if: ${{ matrix.platform.arch != 'arm64' }}
109190
uses: ./.github/actions/pytest
110191
with:
111192
image_tag: ${{ steps.build-image.outputs.image_tag }}
112193
pytest_marks: "e2e and trtllm_marker and gpu_1 and not slow"
194+
# Upload metrics for this workflow and all its jobs.
# Runs after backend-status-check regardless of its result and executes
# .github/workflows/upload_complete_workflow_metrics.py with the index names
# and token supplied through the environment.
upload-workflow-metrics:
  name: Upload Workflow Metrics
  runs-on: gitlab
  if: always()  # Always run, even if other jobs fail
  needs: [backend-status-check]  # Wait for the status check which waits for all build jobs

  steps:
    - name: Check out repository
      # Pinned to the same commit as the other checkout steps in this workflow.
      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0

    - name: Set up Python
      uses: actions/setup-python@v4
      with:
        python-version: '3.x'

    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        python -m pip install requests

    - name: Upload Complete Workflow Metrics
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        WORKFLOW_INDEX: ${{ secrets.WORKFLOW_INDEX }}
        JOB_INDEX: ${{ secrets.JOB_INDEX }}
        STEPS_INDEX: ${{ secrets.STEPS_INDEX }}
      run: |
        # Run the enhanced metrics upload script
        python3 .github/workflows/upload_complete_workflow_metrics.py

0 commit comments

Comments
 (0)