Skip to content

Commit 52c88df

Browse files
committed
Merge branch 'master' into sync_msft_14112025
2 parents 10af800 + 8fe4804 commit 52c88df

File tree

185 files changed

+4369
-1467
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

185 files changed

+4369
-1467
lines changed

.github/workflows/reusable_linux_build.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ jobs:
7878
uses: actions/checkout@v5
7979

8080
- name: Set up Python ${{ inputs.python_version }}
81+
if: inputs.architecture != 'arm64'
8182
uses: actions/setup-python@v6
8283
with:
8384
python-version: ${{ inputs.python_version }}

cmake/CMakeLists.txt

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1454,7 +1454,12 @@ if (onnxruntime_USE_CUDA)
14541454
message(STATUS "CUDA Toolkit version is greater or equal than 12.8, enable -DENABLE_FP4 flag")
14551455
endif()
14561456

1457-
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xfatbin=-compress-all")
1457+
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0")
1458+
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xfatbin=-compress-all -compress-mode=size")
1459+
else()
1460+
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xfatbin=-compress-all")
1461+
endif()
1462+
14581463
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
14591464
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --Werror default-stream-launch")
14601465

@@ -1654,9 +1659,25 @@ if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Linux")
16541659
endif()
16551660
endif()
16561661

1657-
#Now the 'onnxruntime_EXTERNAL_LIBRARIES' variable should be sealed. It will be used in onnxruntime.cmake which will be included in the next.
1658-
#The order of the following targets matters. Right depends on left. If target A appears before target B. Then A.cmake can not use variables defined in B.cmake.
1659-
set(ONNXRUNTIME_CMAKE_FILES onnxruntime_flatbuffers onnxruntime_common onnxruntime_mlas onnxruntime_graph onnxruntime_lora onnxruntime_framework onnxruntime_util onnxruntime_providers onnxruntime_optimizer onnxruntime_session ${ONNXRUNTIME_EAGER_CMAKE_FILE_NAME})
1662+
# From this point on, onnxruntime_EXTERNAL_LIBRARIES should no longer be modified. It will be used in
1663+
# onnxruntime.cmake which will be included next.
1664+
1665+
# The order of the CMake file names (which exclude the ".cmake" suffix) in ONNXRUNTIME_CMAKE_FILES matters. Later CMake
1666+
# files may depend on earlier ones but earlier ones cannot depend on later ones. For example, if A appears before B,
1667+
# then A.cmake cannot use variables defined in B.cmake.
1668+
set(ONNXRUNTIME_CMAKE_FILES
1669+
onnxruntime_flatbuffers
1670+
onnxruntime_common
1671+
onnxruntime_mlas
1672+
onnxruntime_graph
1673+
onnxruntime_lora
1674+
onnxruntime_framework
1675+
onnxruntime_util
1676+
onnxruntime_providers
1677+
onnxruntime_optimizer
1678+
onnxruntime_session
1679+
${ONNXRUNTIME_EAGER_CMAKE_FILE_NAME}
1680+
)
16601681

16611682
if (onnxruntime_USE_WINML)
16621683
# WINML uses and depends on the shared lib. Note: You can build WINML without DML and you will get a

cmake/deps.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,5 +56,5 @@ extensions;https://github.com/microsoft/onnxruntime-extensions/archive/c24b7bab0
5656
directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e
5757
cudnn_frontend;https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.12.0.zip;7e733cfdc410d777b76122d64232499205589a96
5858
dawn;https://github.com/google/dawn/archive/13c1635a14574ebb7116b56a69f5519301417fda.zip;0aadd28fc385cf7d657d5fc70a352372d2d3c76a
59-
kleidiai;https://github.com/ARM-software/kleidiai/archive/refs/tags/v1.10.0.tar.gz;11b62149cb2514b3b9069cc435c3aa7a4e82b97a
59+
kleidiai;https://github.com/ARM-software/kleidiai/archive/refs/tags/v1.15.0.tar.gz;62ccd24ab60bcef68766440fb42d79071ac2a5d2
6060
duktape;https://github.com/svaarala/duktape/releases/download/v2.7.0/duktape-2.7.0.tar.xz;8200c8e417dbab7adcc12c4dbdef7651cfc55794

cmake/external/onnxruntime_external_deps.cmake

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -751,9 +751,18 @@ if (onnxruntime_USE_WEBGPU)
751751
# Some build warnings are not allowed to be disabled in project level.
752752
${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/dawn/dawn_binskim.patch &&
753753

754-
# Android devices doesn't seem to allow fp16 in uniforms so the WebGPU EP has to manually handle passing an fp32
755-
# in the uniform and converting to fp16 before using.
756-
${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/dawn/uniform_and_storage_buffer_16_bit_access.patch)
754+
# The uniform_and_storage_buffer_16_bit_access.patch contains the following changes:
755+
#
756+
# - (private) Android devices don't seem to allow fp16 in uniforms so the WebGPU EP has to manually handle passing an fp32
757+
# in the uniform and converting to fp16 before using.
758+
${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/dawn/uniform_and_storage_buffer_16_bit_access.patch &&
759+
760+
# The safari_polyfill.patch contains the following changes:
761+
#
762+
# - (private) Fix compatibility issues with Safari. Contains the following changes:
763+
# - Polyfill for `device.adapterInfo` (which is `undefined` in Safari v26.0)
764+
#
765+
${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/dawn/safari_polyfill.patch)
757766

758767
onnxruntime_fetchcontent_declare(
759768
dawn

cmake/onnxruntime_mlas.cmake

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ set(MLAS_ROOT ${ONNXRUNTIME_ROOT}/core/mlas)
55
set(MLAS_SRC_DIR ${MLAS_ROOT}/lib)
66
set(MLAS_INC_DIR ${MLAS_ROOT}/inc)
77

8+
89
# mlas_private_compile_definitions contains compile definitions that are private to onnxruntime_mlas and targets which
910
# use internal MLAS headers like mlasi.h.
1011
set(mlas_private_compile_definitions)
@@ -285,6 +286,15 @@ function(setup_kleidiai)
285286
list(APPEND onnxruntime_EXTERNAL_LIBRARIES kleidiai)
286287
set(onnxruntime_EXTERNAL_LIBRARIES ${onnxruntime_EXTERNAL_LIBRARIES} PARENT_SCOPE)
287288

289+
# Enabling onnxruntime_KLEIDIAI_DEBUG_LOGGING implies both DEBUG and KERNEL messages, so it defines KLEIDIAI_KERNEL too.
290+
if(onnxruntime_KLEIDIAI_DEBUG_LOGGING)
291+
target_compile_definitions(onnxruntime_mlas PRIVATE KLEIDIAI_DEBUG=1)
292+
target_compile_definitions(onnxruntime_mlas PRIVATE KLEIDIAI_KERNEL=1)
293+
endif()
294+
if(onnxruntime_KLEIDIAI_KERNEL_LOGGING)
295+
target_compile_definitions(onnxruntime_mlas PRIVATE KLEIDIAI_KERNEL=1)
296+
endif()
297+
288298
if (NOT onnxruntime_BUILD_SHARED_LIB)
289299
install(TARGETS kleidiai EXPORT ${PROJECT_NAME}Targets
290300
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}

cmake/onnxruntime_unittests.cmake

Lines changed: 48 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -588,6 +588,7 @@ set (onnxruntime_shared_lib_test_SRC
588588
${ONNXRUNTIME_SHARED_LIB_TEST_SRC_DIR}/test_nontensor_types.cc
589589
${ONNXRUNTIME_SHARED_LIB_TEST_SRC_DIR}/test_ort_format_models.cc
590590
${ONNXRUNTIME_SHARED_LIB_TEST_SRC_DIR}/test_run_options.cc
591+
${ONNXRUNTIME_SHARED_LIB_TEST_SRC_DIR}/test_runtime_path.cc
591592
${ONNXRUNTIME_SHARED_LIB_TEST_SRC_DIR}/test_session_options.cc
592593
${ONNXRUNTIME_SHARED_LIB_TEST_SRC_DIR}/utils.h
593594
${ONNXRUNTIME_SHARED_LIB_TEST_SRC_DIR}/utils.cc
@@ -1518,9 +1519,48 @@ endif()
15181519
target_link_libraries(onnxruntime_mocked_allocator PRIVATE ${GSL_TARGET})
15191520
set_target_properties(onnxruntime_mocked_allocator PROPERTIES FOLDER "ONNXRuntimeTest")
15201521

1522+
# onnxruntime_runtime_path_test_shared_library
1523+
block()
1524+
set(onnxruntime_runtime_path_test_shared_library_src
1525+
"${TEST_SRC_DIR}/shared_lib/runtime_path_test_shared_library/runtime_path_test_shared_library.h"
1526+
"${TEST_SRC_DIR}/shared_lib/runtime_path_test_shared_library/runtime_path_test_shared_library.cc")
1527+
1528+
onnxruntime_add_shared_library(onnxruntime_runtime_path_test_shared_library
1529+
${onnxruntime_runtime_path_test_shared_library_src})
1530+
1531+
target_link_libraries(onnxruntime_runtime_path_test_shared_library PRIVATE
1532+
onnxruntime_common cpuinfo ${CMAKE_DL_LIBS})
1533+
target_include_directories(onnxruntime_runtime_path_test_shared_library PRIVATE ${ONNXRUNTIME_ROOT})
1534+
1535+
if(UNIX)
1536+
if (APPLE)
1537+
set(onnxruntime_runtime_path_test_shared_library_link_flags "-Xlinker -dead_strip")
1538+
elseif (NOT CMAKE_SYSTEM_NAME MATCHES "AIX")
1539+
string(CONCAT onnxruntime_runtime_path_test_shared_library_link_flags
1540+
"-Xlinker --version-script=${TEST_SRC_DIR}/shared_lib/runtime_path_test_shared_library/runtime_path_test_shared_library.lds "
1541+
"-Xlinker --no-undefined -Xlinker --gc-sections -z noexecstack")
1542+
endif()
1543+
else()
1544+
set(onnxruntime_runtime_path_test_shared_library_link_flags
1545+
"-DEF:${TEST_SRC_DIR}/shared_lib/runtime_path_test_shared_library/runtime_path_test_shared_library.def")
1546+
endif()
1547+
1548+
set_property(TARGET onnxruntime_runtime_path_test_shared_library APPEND_STRING PROPERTY LINK_FLAGS
1549+
${onnxruntime_runtime_path_test_shared_library_link_flags})
1550+
1551+
set_target_properties(onnxruntime_runtime_path_test_shared_library PROPERTIES FOLDER "ONNXRuntimeTest")
1552+
source_group(TREE ${TEST_SRC_DIR} FILES ${onnxruntime_runtime_path_test_shared_library_src})
1553+
endblock()
1554+
15211555
#################################################################
15221556
# test inference using shared lib
1523-
set(onnxruntime_shared_lib_test_LIBS onnxruntime_mocked_allocator onnxruntime_test_utils onnxruntime_common onnx_proto)
1557+
set(onnxruntime_shared_lib_test_LIBS
1558+
onnxruntime_mocked_allocator
1559+
onnxruntime_test_utils
1560+
onnxruntime_common
1561+
onnx_proto
1562+
onnxruntime_runtime_path_test_shared_library)
1563+
15241564
if(NOT WIN32)
15251565
if(onnxruntime_USE_SNPE)
15261566
list(APPEND onnxruntime_shared_lib_test_LIBS onnxruntime_providers_snpe)
@@ -1576,6 +1616,10 @@ endif()
15761616
target_compile_definitions(onnxruntime_shared_lib_test PRIVATE USE_DUMMY_EXA_DEMANGLE=1)
15771617
endif()
15781618

1619+
if (CMAKE_SYSTEM_NAME MATCHES "AIX" AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
1620+
set_target_properties(onnxruntime_shared_lib_test PROPERTIES ENABLE_EXPORTS 1)
1621+
endif()
1622+
15791623
if (IOS)
15801624
add_custom_command(
15811625
TARGET onnxruntime_shared_lib_test POST_BUILD
@@ -1978,8 +2022,7 @@ endif()
19782022

19792023
# Build library that can be used with RegisterExecutionProviderLibrary and automatic EP selection
19802024
# We need a shared lib build to use that as a dependency for the test library
1981-
# Currently we only have device discovery on Windows so no point building the test app on other platforms.
1982-
if (WIN32 AND onnxruntime_BUILD_SHARED_LIB AND
2025+
if (onnxruntime_BUILD_SHARED_LIB AND
19832026
NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND
19842027
NOT onnxruntime_MINIMAL_BUILD)
19852028

@@ -1991,7 +2034,7 @@ if (WIN32 AND onnxruntime_BUILD_SHARED_LIB AND
19912034
"${TEST_SRC_DIR}/autoep/library/plugin_ep_utils.h")
19922035
onnxruntime_add_shared_library_module(example_plugin_ep ${onnxruntime_autoep_test_library_src})
19932036
target_include_directories(example_plugin_ep PRIVATE ${REPO_ROOT}/include/onnxruntime/core/session)
1994-
target_link_libraries(example_plugin_ep PRIVATE onnxruntime)
2037+
target_link_libraries(example_plugin_ep PRIVATE onnxruntime ${GSL_TARGET})
19952038

19962039
if(UNIX)
19972040
if (APPLE)
@@ -2024,7 +2067,7 @@ if (WIN32 AND onnxruntime_BUILD_SHARED_LIB AND
20242067
"${TEST_SRC_DIR}/autoep/library/example_plugin_ep_virt_gpu/ep.cc")
20252068
onnxruntime_add_shared_library_module(example_plugin_ep_virt_gpu ${onnxruntime_autoep_test_example_plugin_ep_virt_gpu_src})
20262069
target_include_directories(example_plugin_ep_virt_gpu PRIVATE ${REPO_ROOT}/include/onnxruntime/core/session)
2027-
target_link_libraries(example_plugin_ep_virt_gpu PRIVATE onnxruntime)
2070+
target_link_libraries(example_plugin_ep_virt_gpu PRIVATE onnxruntime ${GSL_TARGET})
20282071

20292072
if(UNIX)
20302073
if (APPLE)
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
diff --git a/third_party/emdawnwebgpu/pkg/webgpu/src/library_webgpu.js b/third_party/emdawnwebgpu/pkg/webgpu/src/library_webgpu.js
2+
index a3ce6dc732..7119eae424 100644
3+
--- a/third_party/emdawnwebgpu/pkg/webgpu/src/library_webgpu.js
4+
+++ b/third_party/emdawnwebgpu/pkg/webgpu/src/library_webgpu.js
5+
@@ -903,6 +903,12 @@ var LibraryWebGPU = {
6+
stackRestore(sp);
7+
};
8+
9+
+ // Polyfill `device.adapterInfo` if not present (Safari v26.0).
10+
+ // See https://bugs.webkit.org/show_bug.cgi?id=301878
11+
+ if (!('adapterInfo' in device)) {
12+
+ device.adapterInfo = adapter.info;
13+
+ }
14+
+
15+
_emwgpuOnRequestDeviceCompleted(futureId, {{{ gpu.RequestDeviceStatus.Success }}},
16+
{{{ gpu.passAsPointer('devicePtr') }}}, {{{ gpu.NULLPTR }}});
17+
}, (ex) => {

cmake/vcpkg-ports/onnx/portfile.cmake

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@ vcpkg_cmake_configure(
3636
"-DProtobuf_PROTOC_EXECUTABLE:FILEPATH=${PROTOC}"
3737
-DONNX_ML=ON
3838
-DONNX_USE_PROTOBUF_SHARED_LIBS=${USE_PROTOBUF_SHARED}
39-
-DONNX_USE_LITE_PROTO=OFF
4039
-DONNX_USE_MSVC_STATIC_RUNTIME=${USE_STATIC_RUNTIME}
4140
-DONNX_BUILD_TESTS=OFF
4241
-DONNX_BUILD_BENCHMARKS=OFF

csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -866,13 +866,14 @@ static NativeMethods()
866866
internal class NativeLib
867867
{
868868
#if __ANDROID__
869-
// define the library name required for android
869+
// Define the library name required for Android
870870
internal const string DllName = "libonnxruntime.so";
871871
#elif __IOS__
872-
// define the library name required for iOS
872+
// Define the library name required for iOS
873873
internal const string DllName = "__Internal";
874874
#else
875-
internal const string DllName = "onnxruntime";
875+
// Note: the file name in the ONNX Runtime NuGet package must be onnxruntime.dll, not onnxruntime.DLL (the Windows filesystem can be case sensitive).
876+
internal const string DllName = "onnxruntime.dll";
876877
#endif
877878
}
878879

@@ -2951,7 +2952,9 @@ internal static class OrtExtensionsNativeMethods
29512952
#elif __IOS__
29522953
internal const string ExtensionsDllName = "__Internal";
29532954
#else
2954-
internal const string ExtensionsDllName = "ortextensions";
2955+
// For desktop platforms, explicitly specify the DLL name with extension to avoid
2956+
// issues on case-sensitive filesystems. See NativeLib.DllName for detailed explanation.
2957+
internal const string ExtensionsDllName = "ortextensions.dll";
29552958
#endif
29562959

29572960
[DllImport(ExtensionsDllName, CharSet = CharSet.Ansi,

docs/ContribOperators.md

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4534,10 +4534,19 @@ This version of the operator has been available since version 1 of the 'com.micr
45344534

45354535
Quantized mixture of experts (MoE).
45364536

4537-
Only weights are quantized with symmetric quantization.
45384537
The quantized weights are stored in column major order per expert.
45394538
The quantization block size can be specified. If not provided, column wise quantization is used.
45404539

4540+
The formula of linear dequantization of the quantized weights using scale and (optionally) zero-point is:
4541+
dequantized_weight = (quantized_weight - zero_point) * scale
4542+
When zero_point is not provided, the default value is 2^(bits-1): 8 for 4 bits, 128 for 8 bits.
4543+
4544+
If block_size is provided, both hidden_size and inter_size must be divisible by the block size, and
4545+
the dequantization is performed per block of size block_size along the K (input feature) dimension.
4546+
4547+
If block_size and zero_point are provided, both hidden_size and inter_size must be divisible by block_size * pack_size,
4548+
where pack_size = 8 / expert_weight_bits.
4549+
45414550
The SwiGLU (Swish-Gated Linear Unit) activation function is like:
45424551
g = xW + b
45434552
l = xV + c
@@ -4579,7 +4588,7 @@ This version of the operator has been available since version 1 of the 'com.micr
45794588
<dd>Whether to use sparse mixer</dd>
45804589
</dl>
45814590

4582-
#### Inputs (7 - 11)
4591+
#### Inputs (7 - 14)
45834592

45844593
<dl>
45854594
<dt><tt>input</tt> : T</dt>
@@ -4604,6 +4613,12 @@ This version of the operator has been available since version 1 of the 'com.micr
46044613
<dd>2D optional tensor with shape (num_experts, inter_size), or 3D optional tensor with shape (num_experts, inter_size, hidden_size / block_size) when block_size is provided.</dd>
46054614
<dt><tt>fc3_experts_bias</tt> (optional) : T</dt>
46064615
<dd>2D optional tensor with shape (num_experts, inter_size)</dd>
4616+
<dt><tt>fc1_zero_points</tt> (optional) : T1</dt>
4617+
<dd>2D tensor with shape (num_experts, fusion_size * inter_size / pack_size), or 3D tensor with shape (num_experts, fusion_size * inter_size, hidden_size / block_size / pack_size) when block_size is provided.</dd>
4618+
<dt><tt>fc2_zero_points</tt> (optional) : T1</dt>
4619+
<dd>2D tensor with shape (num_experts, hidden_size / pack_size), or 3D tensor with shape (num_experts, hidden_size, inter_size / block_size / pack_size) when block_size is provided.</dd>
4620+
<dt><tt>fc3_zero_points</tt> (optional) : T1</dt>
4621+
<dd>2D optional tensor with shape (num_experts, inter_size / pack_size), or 3D optional tensor with shape (num_experts, inter_size, hidden_size / block_size / pack_size) when block_size is provided.</dd>
46074622
</dl>
46084623

46094624
#### Outputs

0 commit comments

Comments
 (0)