Skip to content

Commit dccdadd

Browse files
committed
Merge branch 'main' into chi/model_package_2
2 parents d979c60 + 895422d commit dccdadd

68 files changed

Lines changed: 4692 additions & 387 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.github/workflows/linux_cuda_ci.yml

Lines changed: 15 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -27,9 +27,9 @@ jobs:
2727
build_config: Release
2828
architecture: x64
2929
dockerfile_path: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda
30-
docker_build_args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc14:20251017.1'
31-
docker_image_repo: onnxruntimecuda12manylinuxbuild
32-
extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --parallel --nvcc_threads 4 --flash_nvcc_threads 4 --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --enable_cuda_profiling --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
30+
docker_build_args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda13_x64_almalinux8_gcc14:20251107.1'
31+
docker_image_repo: onnxruntimecuda13manylinuxbuild
32+
extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --parallel --nvcc_threads 4 --flash_nvcc_threads 4 --cuda_version=13.0 --cuda_home=/usr/local/cuda-13.0 --cudnn_home=/usr/local/cuda-13.0 --enable_cuda_profiling --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
3333
python_path_prefix: 'PATH=/opt/python/cp310-cp310/bin:$PATH'
3434
run_tests: false # <<< Do not run tests in this job
3535
upload_build_output: true # <<< Upload the build/Release directory
@@ -57,8 +57,8 @@ jobs:
5757
id: build_docker_image_step
5858
with:
5959
dockerfile: ${{ github.workspace }}/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda
60-
image-name: ghcr.io/microsoft/onnxruntime/onnxruntimecuda12manylinuxbuild
61-
build-args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc14:20251017.1'
60+
image-name: ghcr.io/microsoft/onnxruntime/onnxruntimecuda13manylinuxbuild
61+
build-args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda13_x64_almalinux8_gcc14:20251107.1'
6262
push: true
6363
azure-container-registry-name: onnxruntimebuildcache
6464
env:
@@ -91,6 +91,15 @@ jobs:
9191
echo "Warning: perms.txt not found in artifact."
9292
fi
9393
94+
# Verify the GPU is accessible inside Docker before running the full test suite.
95+
# If the NVIDIA Container Toolkit fails to expose /dev/nvidia* devices,
96+
# tests will fail with "CUDA failure 100" and waste 10+ minutes.
97+
- name: Verify GPU access in Docker
98+
run: |
99+
docker run --rm --gpus all \
100+
"${{ steps.build_docker_image_step.outputs.full-image-name }}" \
101+
nvidia-smi
102+
94103
# --- Run Tests using the downloaded build ---
95104
# The run-build-script-in-docker action mounts ${{ runner.temp }} to /onnxruntime_src/build
96105
# So build.py --build_dir build/Release inside the container correctly finds the artifacts.
@@ -102,5 +111,5 @@ jobs:
102111
build_config: Release
103112
mode: 'test' # Set mode to test
104113
execution_providers: 'cuda'
105-
extra_build_flags: '--use_binskim_compliant_compile_flags --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --enable_cuda_profiling --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
114+
extra_build_flags: '--use_binskim_compliant_compile_flags --cuda_version=13.0 --cuda_home=/usr/local/cuda-13.0 --cudnn_home=/usr/local/cuda-13.0 --enable_cuda_profiling --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
106115
python_path_prefix: 'PATH=/opt/python/cp310-cp310/bin:$PATH'

.github/workflows/linux_cuda_plugin_ci.yml

Lines changed: 21 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -26,17 +26,17 @@ jobs:
2626
build_config: Release
2727
architecture: x64
2828
dockerfile_path: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda
29-
docker_build_args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc14:20251017.1'
30-
docker_image_repo: onnxruntimecuda12manylinuxbuild
29+
docker_build_args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda13_x64_almalinux8_gcc14:20251107.1'
30+
docker_image_repo: onnxruntimecuda13manylinuxbuild
3131
extra_build_flags: >-
3232
--use_binskim_compliant_compile_flags
3333
--build_wheel
3434
--parallel
3535
--nvcc_threads 4
3636
--flash_nvcc_threads 4
37-
--cuda_version=12.8
38-
--cuda_home=/usr/local/cuda-12.8
39-
--cudnn_home=/usr/local/cuda-12.8
37+
--cuda_version=13.0
38+
--cuda_home=/usr/local/cuda-13.0
39+
--cudnn_home=/usr/local/cuda-13.0
4040
--enable_cuda_profiling
4141
--cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
4242
--cmake_extra_defines onnxruntime_QUICK_BUILD=ON
@@ -67,8 +67,8 @@ jobs:
6767
id: build_docker_image_step
6868
with:
6969
dockerfile: ${{ github.workspace }}/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda
70-
image-name: ghcr.io/microsoft/onnxruntime/onnxruntimecuda12manylinuxbuild
71-
build-args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc14:20251017.1'
70+
image-name: ghcr.io/microsoft/onnxruntime/onnxruntimecuda13manylinuxbuild
71+
build-args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda13_x64_almalinux8_gcc14:20251107.1'
7272
push: true
7373
azure-container-registry-name: onnxruntimebuildcache
7474
env:
@@ -100,6 +100,15 @@ jobs:
100100
echo "Warning: perms.txt not found in artifact."
101101
fi
102102
103+
# Verify the GPU is accessible inside Docker before running the full test suite.
104+
# If the NVIDIA Container Toolkit fails to expose /dev/nvidia* devices,
105+
# tests will fail with "CUDA failure 100" and waste 10+ minutes.
106+
- name: Verify GPU access in Docker
107+
run: |
108+
docker run --rm --gpus all \
109+
"${{ steps.build_docker_image_step.outputs.full-image-name }}" \
110+
nvidia-smi
111+
103112
# --- Install the ORT wheel and run CUDA plugin EP tests ---
104113
- name: Run CUDA Plugin EP Python Tests
105114
run: |
@@ -111,6 +120,11 @@ jobs:
111120
bash -c "
112121
set -ex
113122
export PATH=/opt/python/cp312-cp312/bin:\$PATH
123+
# Ensure libcudart.so.13 is findable regardless of host-runner NVIDIA Container Toolkit configuration.
124+
# The CUDA runtime library lives in the container image at /usr/local/cuda-13.0/lib64, but the
125+
# LD_LIBRARY_PATH may not include this path when the runner's NVIDIA toolkit only mounts driver
126+
# libraries at /usr/local/nvidia/lib64.
127+
export LD_LIBRARY_PATH=/usr/local/cuda-13.0/lib64:\${LD_LIBRARY_PATH:-}
114128
115129
# Install the ORT wheel
116130
python -m pip install /build/Release/Release/dist/onnxruntime*.whl

.github/workflows/windows_cuda.yml

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -157,6 +157,7 @@ jobs:
157157
runs-on: [
158158
"self-hosted",
159159
"1ES.Pool=onnxruntime-github-Win2022-GPU-A10",
160+
"1ES.ImageOverride=onnxruntime-Win-CPU-VS2022-Latest-NVMe-x64-test",
160161
"JobId=windows-cuda-test-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}"
161162
]
162163
steps:
@@ -222,6 +223,13 @@ jobs:
222223
with:
223224
whl-directory: ${{ runner.temp }}\build\RelWithDebInfo\RelWithDebInfo\dist
224225

226+
# Verify the GPU is accessible before running the full test suite.
227+
# If the NVIDIA driver is not available, tests will fail with
228+
# "CUDA failure 100" and waste significant time.
229+
- name: Verify GPU access
230+
shell: pwsh
231+
run: nvidia-smi
232+
225233
- name: Run Tests
226234
working-directory: ${{ runner.temp }}
227235
run: |

.github/workflows/windows_cuda_plugin.yml

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -127,6 +127,7 @@ jobs:
127127
runs-on: [
128128
"self-hosted",
129129
"1ES.Pool=onnxruntime-github-Win2022-GPU-A10",
130+
"1ES.ImageOverride=onnxruntime-Win-CPU-VS2022-Latest-NVMe-x64-test",
130131
"JobId=windows-cuda-plugin-test-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}"
131132
]
132133
steps:
@@ -187,6 +188,13 @@ jobs:
187188
with:
188189
whl-directory: ${{ runner.temp }}\build\Release\Release\dist
189190

191+
# Verify the GPU is accessible before running the full test suite.
192+
# If the NVIDIA driver is not available, tests will fail with
193+
# "CUDA failure 100" and waste significant time.
194+
- name: Verify GPU access
195+
shell: pwsh
196+
run: nvidia-smi
197+
190198
- name: Run CUDA Plugin EP Python Tests
191199
working-directory: ${{ github.workspace }}\onnxruntime\test\python\transformers
192200
shell: pwsh

cmake/deps.txt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,7 @@ googletest;https://github.com/google/googletest/archive/refs/tags/v1.17.0.zip;f6
3030
#xnnpack 2025.06.22
3131
googlexnnpack;https://github.com/google/XNNPACK/archive/3cf85e705098622d59056dcb8f5f963ea7bb0a00.zip;6f6bbba627241f89463ca845febaf063982b34fe
3232
json;https://github.com/nlohmann/json/archive/refs/tags/v3.11.3.zip;5e88795165cc8590138d1f47ce94ee567b85b4d6
33-
microsoft_gsl;https://github.com/microsoft/GSL/archive/refs/tags/v4.0.0.zip;cf368104cd22a87b4dd0c80228919bb2df3e2a14
33+
microsoft_gsl;https://github.com/microsoft/GSL/archive/refs/tags/v4.2.1.zip;1094e3bb7a8af763dcb136ccd676e6e75e614eec
3434
microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.250325.1.zip;826c8bd47c2258ec61b8b218e031e5b33d27f761
3535
mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41
3636
mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063

cmake/external/onnxruntime_external_deps.cmake

Lines changed: 17 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -405,24 +405,16 @@ if (CPUINFO_SUPPORTED)
405405
endif()
406406
endif()
407407

408-
if(onnxruntime_USE_CUDA)
409-
onnxruntime_fetchcontent_declare(
410-
GSL
411-
URL ${DEP_URL_microsoft_gsl}
412-
URL_HASH SHA1=${DEP_SHA1_microsoft_gsl}
413-
PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/gsl/1064.patch
414-
EXCLUDE_FROM_ALL
415-
FIND_PACKAGE_ARGS 4.0 NAMES Microsoft.GSL
416-
)
417-
else()
418-
onnxruntime_fetchcontent_declare(
419-
GSL
420-
URL ${DEP_URL_microsoft_gsl}
421-
URL_HASH SHA1=${DEP_SHA1_microsoft_gsl}
422-
EXCLUDE_FROM_ALL
423-
FIND_PACKAGE_ARGS 4.0 NAMES Microsoft.GSL
424-
)
425-
endif()
408+
onnxruntime_fetchcontent_declare(
409+
GSL
410+
URL ${DEP_URL_microsoft_gsl}
411+
URL_HASH SHA1=${DEP_SHA1_microsoft_gsl}
412+
# Stringify fix for GSL_SUPPRESS on MSVC (C4875). Remove when GSL ships a release
413+
# containing microsoft/GSL#1213 (commit 543d0dd).
414+
PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/gsl/1213.patch
415+
EXCLUDE_FROM_ALL
416+
FIND_PACKAGE_ARGS 4.0 NAMES Microsoft.GSL
417+
)
426418
set(GSL_TARGET "Microsoft.GSL::GSL")
427419
set(GSL_INCLUDE_DIR "$<TARGET_PROPERTY:${GSL_TARGET},INTERFACE_INCLUDE_DIRECTORIES>")
428420
onnxruntime_fetchcontent_makeavailable(GSL)
@@ -624,10 +616,17 @@ endif()
624616
if(onnxruntime_ENABLE_TRAINING OR (onnxruntime_ENABLE_TRAINING_APIS AND onnxruntime_BUILD_UNIT_TESTS))
625617
# Once code under orttraining/orttraining/models dir is removed "onnxruntime_ENABLE_TRAINING" should be removed from
626618
# this conditional
619+
if(Patch_FOUND)
620+
set(ONNXRUNTIME_CXXOPTS_PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/cxxopts/gcc-15-compat.patch)
621+
else()
622+
set(ONNXRUNTIME_CXXOPTS_PATCH_COMMAND "")
623+
endif()
624+
627625
onnxruntime_fetchcontent_declare(
628626
cxxopts
629627
URL ${DEP_URL_cxxopts}
630628
URL_HASH SHA1=${DEP_SHA1_cxxopts}
629+
PATCH_COMMAND ${ONNXRUNTIME_CXXOPTS_PATCH_COMMAND}
631630
EXCLUDE_FROM_ALL
632631
FIND_PACKAGE_ARGS NAMES cxxopts
633632
)

cmake/onnxruntime_mlas.cmake

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -55,6 +55,7 @@ onnxruntime_add_static_library(onnxruntime_mlas
5555
${MLAS_SRC_DIR}/qlutgemm.cpp
5656
${MLAS_SRC_DIR}/sqnbitgemm_q8_block.h
5757
${MLAS_SRC_DIR}/flashattn.cpp
58+
${MLAS_SRC_DIR}/flashattn_qkv.cpp
5859
${MLAS_SRC_DIR}/qkv_quant.cpp
5960
${MLAS_SRC_DIR}/cast.cpp
6061
${MLAS_SRC_DIR}/layernorm.cpp

cmake/onnxruntime_python.cmake

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -242,6 +242,7 @@ if (onnxruntime_USE_CUDA AND NOT WIN32)
242242
)
243243
include(cutlass)
244244
target_include_directories(onnxruntime_pybind11_state PRIVATE ${cutlass_SOURCE_DIR}/include)
245+
target_link_libraries(onnxruntime_pybind11_state PRIVATE CUDA::cudart)
245246
endif()
246247
if (onnxruntime_USE_CUDA AND WIN32)
247248
target_compile_definitions(onnxruntime_pybind11_state PRIVATE ORT_NO_CUDA_IN_PYBIND)

cmake/patches/cxxopts/gcc-15-compat.patch

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,13 @@
1+
diff --git a/include/cxxopts.hpp b/include/cxxopts.hpp
2+
index 991ba3fc..a2e71faf 100644
3+
--- a/include/cxxopts.hpp
4+
+++ b/include/cxxopts.hpp
5+
@@ -25,6 +25,7 @@
6+
#ifndef CXXOPTS_HPP_INCLUDED
7+
#define CXXOPTS_HPP_INCLUDED
8+
9+
+#include <cstdint>
10+
#include <cstring>
11+
#include <cctype>
12+
#include <exception>
13+

cmake/patches/gsl/1064.patch

Lines changed: 0 additions & 26 deletions
This file was deleted.

0 commit comments

Comments
 (0)