microsoft
diff --git a/.github/workflows/linux_cuda_ci.yml b/.github/workflows/linux_cuda_ci.yml
Lines changed: 15 additions & 6 deletions
diff --git a/.github/workflows/linux_cuda_plugin_ci.yml b/.github/workflows/linux_cuda_plugin_ci.yml
Lines changed: 21 additions & 7 deletions
diff --git a/.github/workflows/windows_cuda.yml b/.github/workflows/windows_cuda.yml
Lines changed: 8 additions & 0 deletions
diff --git a/.github/workflows/windows_cuda_plugin.yml b/.github/workflows/windows_cuda_plugin.yml
Lines changed: 8 additions & 0 deletions
diff --git a/cmake/deps.txt b/cmake/deps.txt
Lines changed: 1 addition & 1 deletion
diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake
Lines changed: 17 additions & 18 deletions
diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake
Lines changed: 1 addition & 0 deletions
diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake
Lines changed: 1 addition & 0 deletions
diff --git a/cmake/patches/cxxopts/gcc-15-compat.patch b/cmake/patches/cxxopts/gcc-15-compat.patch
Lines changed: 13 additions & 0 deletions
diff --git a/cmake/patches/gsl/1064.patch b/cmake/patches/gsl/1064.patch
Lines changed: 0 additions & 26 deletions
@@ -27,9 +27,9 @@ jobs:
       build_config: Release
       architecture: x64
       dockerfile_path: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda
-      docker_build_args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc14:20251017.1'
-      docker_image_repo: onnxruntimecuda12manylinuxbuild
-      extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --parallel --nvcc_threads 4 --flash_nvcc_threads 4 --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --enable_cuda_profiling --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
+      docker_build_args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda13_x64_almalinux8_gcc14:20251107.1'
+      docker_image_repo: onnxruntimecuda13manylinuxbuild
+      extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --parallel --nvcc_threads 4 --flash_nvcc_threads 4 --cuda_version=13.0 --cuda_home=/usr/local/cuda-13.0 --cudnn_home=/usr/local/cuda-13.0 --enable_cuda_profiling --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
       python_path_prefix: 'PATH=/opt/python/cp310-cp310/bin:$PATH'
       run_tests: false            # <<< Do not run tests in this job
       upload_build_output: true   # <<< Upload the build/Release directory
@@ -57,8 +57,8 @@ jobs:
         id: build_docker_image_step
         with:
           dockerfile: ${{ github.workspace }}/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda
-          image-name: ghcr.io/microsoft/onnxruntime/onnxruntimecuda12manylinuxbuild
-          build-args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc14:20251017.1'
+          image-name: ghcr.io/microsoft/onnxruntime/onnxruntimecuda13manylinuxbuild
+          build-args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda13_x64_almalinux8_gcc14:20251107.1'
           push: true
           azure-container-registry-name: onnxruntimebuildcache
         env:
@@ -91,6 +91,15 @@ jobs:
             echo "Warning: perms.txt not found in artifact."
           fi
 
+      # Verify the GPU is accessible inside Docker before running the full test suite.
+      # If the NVIDIA Container Toolkit fails to expose /dev/nvidia* devices,
+      # tests will fail with "CUDA failure 100" and waste 10+ minutes.
+      - name: Verify GPU access in Docker
+        run: |
+          docker run --rm --gpus all \
+            "${{ steps.build_docker_image_step.outputs.full-image-name }}" \
+            nvidia-smi
+
       # --- Run Tests using the downloaded build ---
       # The run-build-script-in-docker action mounts ${{ runner.temp }} to /onnxruntime_src/build
       # So build.py --build_dir build/Release inside the container correctly finds the artifacts.
@@ -102,5 +111,5 @@ jobs:
           build_config: Release
           mode: 'test' # Set mode to test
           execution_providers: 'cuda'
-          extra_build_flags: '--use_binskim_compliant_compile_flags --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --enable_cuda_profiling --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
+          extra_build_flags: '--use_binskim_compliant_compile_flags --cuda_version=13.0 --cuda_home=/usr/local/cuda-13.0 --cudnn_home=/usr/local/cuda-13.0 --enable_cuda_profiling --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
           python_path_prefix: 'PATH=/opt/python/cp310-cp310/bin:$PATH'
@@ -26,17 +26,17 @@ jobs:
       build_config: Release
       architecture: x64
       dockerfile_path: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda
-      docker_build_args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc14:20251017.1'
-      docker_image_repo: onnxruntimecuda12manylinuxbuild
+      docker_build_args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda13_x64_almalinux8_gcc14:20251107.1'
+      docker_image_repo: onnxruntimecuda13manylinuxbuild
       extra_build_flags: >-
         --use_binskim_compliant_compile_flags
         --build_wheel
         --parallel
         --nvcc_threads 4
         --flash_nvcc_threads 4
-        --cuda_version=12.8
-        --cuda_home=/usr/local/cuda-12.8
-        --cudnn_home=/usr/local/cuda-12.8
+        --cuda_version=13.0
+        --cuda_home=/usr/local/cuda-13.0
+        --cudnn_home=/usr/local/cuda-13.0
         --enable_cuda_profiling
         --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
         --cmake_extra_defines onnxruntime_QUICK_BUILD=ON
@@ -67,8 +67,8 @@ jobs:
         id: build_docker_image_step
         with:
           dockerfile: ${{ github.workspace }}/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda
-          image-name: ghcr.io/microsoft/onnxruntime/onnxruntimecuda12manylinuxbuild
-          build-args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc14:20251017.1'
+          image-name: ghcr.io/microsoft/onnxruntime/onnxruntimecuda13manylinuxbuild
+          build-args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda13_x64_almalinux8_gcc14:20251107.1'
           push: true
           azure-container-registry-name: onnxruntimebuildcache
         env:
@@ -100,6 +100,15 @@ jobs:
             echo "Warning: perms.txt not found in artifact."
           fi
 
+      # Verify the GPU is accessible inside Docker before running the full test suite.
+      # If the NVIDIA Container Toolkit fails to expose /dev/nvidia* devices,
+      # tests will fail with "CUDA failure 100" and waste 10+ minutes.
+      - name: Verify GPU access in Docker
+        run: |
+          docker run --rm --gpus all \
+            "${{ steps.build_docker_image_step.outputs.full-image-name }}" \
+            nvidia-smi
+
       # --- Install the ORT wheel and run CUDA plugin EP tests ---
       - name: Run CUDA Plugin EP Python Tests
         run: |
@@ -111,6 +120,11 @@ jobs:
             bash -c "
               set -ex
               export PATH=/opt/python/cp312-cp312/bin:\$PATH
+              # Ensure libcudart.so.13 is findable regardless of host-runner NVIDIA Container Toolkit configuration.
+              # The CUDA runtime library lives in the container image at /usr/local/cuda-13.0/lib64, but the
+              # LD_LIBRARY_PATH may not include this path when the runner's NVIDIA toolkit only mounts driver
+              # libraries at /usr/local/nvidia/lib64.
+              export LD_LIBRARY_PATH=/usr/local/cuda-13.0/lib64:\${LD_LIBRARY_PATH:-}
 
               # Install the ORT wheel
               python -m pip install /build/Release/Release/dist/onnxruntime*.whl
 
@@ -157,6 +157,7 @@ jobs:
     runs-on: [
       "self-hosted",
       "1ES.Pool=onnxruntime-github-Win2022-GPU-A10",
+      "1ES.ImageOverride=onnxruntime-Win-CPU-VS2022-Latest-NVMe-x64-test",
       "JobId=windows-cuda-test-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}"
     ]
     steps:
@@ -222,6 +223,13 @@ jobs:
         with:
           whl-directory: ${{ runner.temp }}\build\RelWithDebInfo\RelWithDebInfo\dist
 
+      # Verify the GPU is accessible before running the full test suite.
+      # If the NVIDIA driver is not available, tests will fail with
+      # "CUDA failure 100" and waste significant time.
+      - name: Verify GPU access
+        shell: pwsh
+        run: nvidia-smi
+
       - name: Run Tests
         working-directory: ${{ runner.temp }}
         run: |
 
@@ -127,6 +127,7 @@ jobs:
     runs-on: [
       "self-hosted",
       "1ES.Pool=onnxruntime-github-Win2022-GPU-A10",
+      "1ES.ImageOverride=onnxruntime-Win-CPU-VS2022-Latest-NVMe-x64-test",
       "JobId=windows-cuda-plugin-test-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}"
     ]
     steps:
@@ -187,6 +188,13 @@ jobs:
         with:
           whl-directory: ${{ runner.temp }}\build\Release\Release\dist
 
+      # Verify the GPU is accessible before running the full test suite.
+      # If the NVIDIA driver is not available, tests will fail with
+      # "CUDA failure 100" and waste significant time.
+      - name: Verify GPU access
+        shell: pwsh
+        run: nvidia-smi
+
       - name: Run CUDA Plugin EP Python Tests
         working-directory: ${{ github.workspace }}\onnxruntime\test\python\transformers
         shell: pwsh
 
@@ -30,7 +30,7 @@ googletest;https://github.com/google/googletest/archive/refs/tags/v1.17.0.zip;f6
 #xnnpack 2025.06.22
 googlexnnpack;https://github.com/google/XNNPACK/archive/3cf85e705098622d59056dcb8f5f963ea7bb0a00.zip;6f6bbba627241f89463ca845febaf063982b34fe
 json;https://github.com/nlohmann/json/archive/refs/tags/v3.11.3.zip;5e88795165cc8590138d1f47ce94ee567b85b4d6
-microsoft_gsl;https://github.com/microsoft/GSL/archive/refs/tags/v4.0.0.zip;cf368104cd22a87b4dd0c80228919bb2df3e2a14
+microsoft_gsl;https://github.com/microsoft/GSL/archive/refs/tags/v4.2.1.zip;1094e3bb7a8af763dcb136ccd676e6e75e614eec
 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.250325.1.zip;826c8bd47c2258ec61b8b218e031e5b33d27f761
 mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41
 mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063
 
@@ -405,24 +405,16 @@ if (CPUINFO_SUPPORTED)
   endif()
 endif()
 
-if(onnxruntime_USE_CUDA)
-  onnxruntime_fetchcontent_declare(
-    GSL
-    URL ${DEP_URL_microsoft_gsl}
-    URL_HASH SHA1=${DEP_SHA1_microsoft_gsl}
-    PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/gsl/1064.patch
-    EXCLUDE_FROM_ALL
-    FIND_PACKAGE_ARGS 4.0 NAMES Microsoft.GSL
-  )
-else()
-  onnxruntime_fetchcontent_declare(
-    GSL
-    URL ${DEP_URL_microsoft_gsl}
-    URL_HASH SHA1=${DEP_SHA1_microsoft_gsl}
-    EXCLUDE_FROM_ALL
-    FIND_PACKAGE_ARGS 4.0 NAMES Microsoft.GSL
-  )
-endif()
+onnxruntime_fetchcontent_declare(
+  GSL
+  URL ${DEP_URL_microsoft_gsl}
+  URL_HASH SHA1=${DEP_SHA1_microsoft_gsl}
+  # Stringify fix for GSL_SUPPRESS on MSVC (C4875). Remove when GSL ships a release
+  # containing microsoft/GSL#1213 (commit 543d0dd).
+  PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/gsl/1213.patch
+  EXCLUDE_FROM_ALL
+  FIND_PACKAGE_ARGS 4.0 NAMES Microsoft.GSL
+)
 set(GSL_TARGET "Microsoft.GSL::GSL")
 set(GSL_INCLUDE_DIR "$<TARGET_PROPERTY:${GSL_TARGET},INTERFACE_INCLUDE_DIRECTORIES>")
 onnxruntime_fetchcontent_makeavailable(GSL)
@@ -624,10 +616,17 @@ endif()
 if(onnxruntime_ENABLE_TRAINING OR (onnxruntime_ENABLE_TRAINING_APIS AND onnxruntime_BUILD_UNIT_TESTS))
   # Once code under orttraining/orttraining/models dir is removed "onnxruntime_ENABLE_TRAINING" should be removed from
   # this conditional
+  if(Patch_FOUND)
+    set(ONNXRUNTIME_CXXOPTS_PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/cxxopts/gcc-15-compat.patch)
+  else()
+    set(ONNXRUNTIME_CXXOPTS_PATCH_COMMAND "")
+  endif()
+
   onnxruntime_fetchcontent_declare(
     cxxopts
     URL ${DEP_URL_cxxopts}
     URL_HASH SHA1=${DEP_SHA1_cxxopts}
+    PATCH_COMMAND ${ONNXRUNTIME_CXXOPTS_PATCH_COMMAND}
     EXCLUDE_FROM_ALL
     FIND_PACKAGE_ARGS NAMES cxxopts
   )
 
@@ -55,6 +55,7 @@ onnxruntime_add_static_library(onnxruntime_mlas
   ${MLAS_SRC_DIR}/qlutgemm.cpp
   ${MLAS_SRC_DIR}/sqnbitgemm_q8_block.h
   ${MLAS_SRC_DIR}/flashattn.cpp
+  ${MLAS_SRC_DIR}/flashattn_qkv.cpp
   ${MLAS_SRC_DIR}/qkv_quant.cpp
   ${MLAS_SRC_DIR}/cast.cpp
   ${MLAS_SRC_DIR}/layernorm.cpp
 
@@ -242,6 +242,7 @@ if (onnxruntime_USE_CUDA AND NOT WIN32)
   )
   include(cutlass)
   target_include_directories(onnxruntime_pybind11_state PRIVATE ${cutlass_SOURCE_DIR}/include)
+  target_link_libraries(onnxruntime_pybind11_state PRIVATE CUDA::cudart)
 endif()
 if (onnxruntime_USE_CUDA AND WIN32)
   target_compile_definitions(onnxruntime_pybind11_state PRIVATE ORT_NO_CUDA_IN_PYBIND)
 
@@ -0,0 +1,13 @@
+diff --git a/include/cxxopts.hpp b/include/cxxopts.hpp
+index 991ba3fc..a2e71faf 100644
+--- a/include/cxxopts.hpp
++++ b/include/cxxopts.hpp
+@@ -25,6 +25,7 @@
+ #ifndef CXXOPTS_HPP_INCLUDED
+ #define CXXOPTS_HPP_INCLUDED
+ 
++#include <cstdint>
+ #include <cstring>
+ #include <cctype>
+ #include <exception>
+
Original file line number	Diff line number	Diff line change
`@@ -242,6 +242,7 @@ if (onnxruntime_USE_CUDA AND NOT WIN32)`
`242`	`242`	`)`
`243`	`243`	`include(cutlass)`
`244`	`244`	`target_include_directories(onnxruntime_pybind11_state PRIVATE ${cutlass_SOURCE_DIR}/include)`
	`245`	`+ target_link_libraries(onnxruntime_pybind11_state PRIVATE CUDA::cudart)`
`245`	`246`	`endif()`
`246`	`247`	`if (onnxruntime_USE_CUDA AND WIN32)`
`247`	`248`	`target_compile_definitions(onnxruntime_pybind11_state PRIVATE ORT_NO_CUDA_IN_PYBIND)`