diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 99f28439db53a..989d1022f1d7b 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -21,6 +21,7 @@ #include "core/providers/openvino/ov_versions/capability.h" #include "core/providers/openvino/qdq_transformations/qdq_stripping.h" #include "core/providers/openvino/qdq_transformations/qdq_scales_fix.h" +#include "../../framework/tensorprotoutils.h" namespace onnxruntime { namespace openvino_ep { @@ -453,6 +454,80 @@ static void DumpOpenVINOEPModel([[maybe_unused]] const std::filesystem::path& on #endif } +// this is a helper function to set the data fields, it duplicates ExternalDataInfo::SetExternalLocationToProto +// but we cannot use that function as it is not part of public provider api. +static void SetExternalDataFields(ONNX_NAMESPACE::TensorProto* proto_init, const void* data_ptr, int64_t data_size) { + static constexpr const char* ORT_INTERNAL_MEM_INITIALIZER = "*/_ORT_MEM_ADDR_/*"; + auto* external_data = proto_init->mutable_external_data(); + bool found_location = false, found_offset = false, found_length = false; + const int ext_data_size = external_data->size(); + proto_init->set_data_location(ONNX_NAMESPACE::TensorProto_DataLocation::TensorProto_DataLocation_EXTERNAL); + + for (int j = 0; j < ext_data_size; ++j) { + auto& ext_entry = external_data->at(j); + auto& key = *ext_entry.mutable_key(); + if (key == "location") { + *ext_entry.mutable_value() = ORT_INTERNAL_MEM_INITIALIZER; + found_location = true; + } else if (key == "offset") { + *ext_entry.mutable_value() = std::to_string(reinterpret_cast<uintptr_t>(data_ptr)); + found_offset = true; + } else if (key == "length") { + *ext_entry.mutable_value() = std::to_string(data_size); + found_length = true; + } + } + + if (!found_location) { + auto* new_entry = external_data->Add(); + *new_entry->mutable_key() = 
"location"; + *new_entry->mutable_value() = ORT_INTERNAL_MEM_INITIALIZER; + } + if (!found_offset) { + auto* new_entry = external_data->Add(); + *new_entry->mutable_key() = "offset"; + *new_entry->mutable_value() = std::to_string(reinterpret_cast<uintptr_t>(data_ptr)); + } + if (!found_length) { + auto* new_entry = external_data->Add(); + *new_entry->mutable_key() = "length"; + *new_entry->mutable_value() = std::to_string(data_size); + } +} + +static void ReadExternalDataFields(const ONNX_NAMESPACE::TensorProto* src_init, std::string& location, size_t& offset, size_t& length) { + // Remove constness as we need to use mutable_external_data() to get the entries to read. + // The entries themselves are not modified... + auto& mutable_proto = *const_cast<ONNX_NAMESPACE::TensorProto*>(src_init); + auto* entry_protos = mutable_proto.mutable_external_data(); + for (int i = 0; i < entry_protos->size(); i++) { + auto& string_entry_proto{entry_protos->at(i)}; + const auto& pb_key{*(string_entry_proto.mutable_key())}; + const auto& pb_value{*(string_entry_proto.mutable_value())}; + if (pb_key == "location") { + location = pb_value; + } else if (pb_key == "offset") { + const auto res = std::from_chars(pb_value.data(), pb_value.data() + pb_value.size(), offset); + if (res.ec != std::errc()) { + std::ostringstream err_msg; + err_msg << "External data in memory has invalid offset field: " + << src_init->name() << "], location: " << location + << ", offset: " << pb_value; + ORT_THROW(err_msg.str()); + } + } else if (pb_key == "length") { + const auto res = std::from_chars(pb_value.data(), pb_value.data() + pb_value.size(), length); + if (res.ec != std::errc()) { + std::ostringstream err_msg; + err_msg << "External data in memory has invalid length field: " + << src_init->name() << "], location: " << location + << ", length: " << pb_value; + ORT_THROW(err_msg.str()); + } + } + } +} + std::unique_ptr<ONNX_NAMESPACE::ModelProto> BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, const onnxruntime::GraphViewer& subgraph, @@ 
-529,12 +604,98 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, return model_proto; } else { LOGS_DEFAULT(INFO) << "[OpenVINO-EP] OVEP QDQ optimization pass is disabled"; + + // scan ext initializers: + std::unordered_map<std::string, std::pair<size_t, size_t>> external_initializers_offset_and_length; + std::string tempLocation; + size_t extInitializerTotalSize = 0; + if (session_context_.has_external_weights) { + auto allInitializers = subgraph.GetAllInitializedTensors(); + for (auto& [name, tp] : allInitializers) { + if (utils::HasExternalDataInMemory(*tp)) { + size_t offset = 0; + size_t length = 0; + ReadExternalDataFields(tp, tempLocation, offset, length); + extInitializerTotalSize += length; + external_initializers_offset_and_length[name] = {offset, length}; + } + } + } + + // when we have external weights in memory, the model proto will actually embed those + // and bloat the serialized string. We can avoid that by not including the data in the proto + // but then we have to update those initializers and set the external_data fields to mem_addr tag... + // proto is limited to 2GB, but let's use 32MB as threshold to be conservative and still gain some memory reductions. 
+#if (((OPENVINO_VERSION_MAJOR == 2025) && (OPENVINO_VERSION_MINOR > 3)) || (OPENVINO_VERSION_MAJOR > 2025)) + constexpr size_t MAX_EMBEDDED_INITIALIZER_SIZE = 1024 * 1024 * 32; + const bool include_initializer_data_in_proto = !(session_context_.has_external_weights && + external_initializers_offset_and_length.size() > 1 && + extInitializerTotalSize >= MAX_EMBEDDED_INITIALIZER_SIZE); +#else + const bool include_initializer_data_in_proto = true; +#endif + + auto model = subgraph.CreateModel(logger); auto model_proto = model->ToProto(); model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); - subgraph.ToProto(*model_proto->mutable_graph(), true, true); + subgraph.ToProto(*model_proto->mutable_graph(), /*include_initializers*/true, + /*include_outer_scope_args*/true, /*execution_order*/0, /*include_initializer_data*/include_initializer_data_in_proto); + print_model_proto_duration(); + + if (!include_initializer_data_in_proto) { + LOGS(logger, INFO) << "Initializer data is not included in the model proto. 
Updating metadata..., total size " << extInitializerTotalSize / (1024 * 1024) << " MB in " << external_initializers_offset_and_length.size() << " initializers"; + auto* graph_proto = model_proto->mutable_graph(); + auto* proto_initializers = graph_proto->mutable_initializer(); + + std::unordered_map<std::string, ONNX_NAMESPACE::TensorProto*> proto_initializer_map; + for (int i = 0, n = proto_initializers->size(); i < n; ++i) { + auto& proto_init = proto_initializers->at(i); + proto_initializer_map[proto_init.name()] = &proto_init; + } + + for (const auto& [name, src_init] : subgraph.GetAllInitializedTensors()) { + auto it = proto_initializer_map.find(name); + if (it == proto_initializer_map.end()) + continue; + + auto* proto_init = it->second; + + // If the proto initializer is missing data, fill it in + if (!proto_init->has_raw_data() && src_init->has_raw_data()) { + *proto_init->mutable_raw_data() = src_init->raw_data(); + } + + // Only set in-memory external_data fields if the data is in memory + if (src_init->has_raw_data()) { + LOGS(logger, VERBOSE) << "In-memory initializer RAW: " + << src_init->name() + << ", data_type: " << src_init->data_type() + << ", raw_data size: " << src_init->raw_data().size(); + + SetExternalDataFields(proto_init, src_init->raw_data().data(), src_init->raw_data().size()); + } else if (onnxruntime::utils::HasExternalDataInMemory(*src_init)) { + auto it_ext = external_initializers_offset_and_length.find(name); + if (it_ext == external_initializers_offset_and_length.end()) { + std::ostringstream err_msg; + err_msg << "Initializer marked as external in memory but missing offset/length info: " << src_init->name(); + ORT_THROW(err_msg.str()); + } + const size_t offset = it_ext->second.first; + const size_t length = it_ext->second.second; + + LOGS(logger, VERBOSE) << "In-memory initializer EXT: " << src_init->name() << ", size: " << length; + + SetExternalDataFields(proto_init, (const void*)offset, length); + } else { + LOGS(logger, VERBOSE) << "File-based initializer: " << 
src_init->name() << ", data_type: " << src_init->data_type(); + } + } + } + DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node); + return model_proto; } } diff --git a/onnxruntime/test/providers/openvino/openvino_ep_ext_init.cc b/onnxruntime/test/providers/openvino/openvino_ep_ext_init.cc new file mode 100644 index 0000000000000..21ec61c2d2e3f --- /dev/null +++ b/onnxruntime/test/providers/openvino/openvino_ep_ext_init.cc @@ -0,0 +1,215 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include <filesystem> +#include <fstream> +#include <vector> + +#include "core/session/onnxruntime_cxx_api.h" + +#include "test/util/include/test/test_environment.h" +#include "test/unittest_util/qdq_test_utils.h" + +#include "gtest/gtest.h" +#include "gmock/gmock.h" +#include "onnxruntime_session_options_config_keys.h" + +using namespace ONNX_NAMESPACE; +using namespace onnxruntime::logging; + +extern std::unique_ptr<Ort::Env> ort_env; + +class OVEP_ExtInit_Tests : public ::testing::TestWithParam<std::string> {}; + +namespace { + +std::vector<char> LoadFileToMemory(const std::string& path) { + std::ifstream file(path, std::ios::binary | std::ios::ate); + if (!file.is_open()) { + return std::vector<char>(); + } + std::streamsize size = file.tellg(); + file.seekg(0, std::ios::beg); + std::vector<char> buffer(static_cast<size_t>(size)); + if (!file.read(reinterpret_cast<char*>(buffer.data()), size)) { + return std::vector<char>(); + } + return buffer; +} + +auto ProbeDevice(const std::string& device) { + static std::map<std::string, bool> is_present; + if (is_present.find(device) == is_present.end()) { + Ort::SessionOptions sessionOptions; + std::unordered_map<std::string, std::string> ov_options; + ov_options["device_type"] = device; + try { + sessionOptions.AppendExecutionProvider_OpenVINO_V2(ov_options); + is_present[device] = true; + } catch (...) 
{ + is_present[device] = false; + } + } + return is_present[device]; +} +} // namespace + +namespace onnxruntime { +namespace test { + +// this test requires OV 2025.4+ to run, currently CI uses OV 2025.2, so the test will be disabled until OV is updated +TEST_P(OVEP_ExtInit_Tests, DISABLED_ModelFromExtInit) { + const auto& device = GetParam(); + if (!ProbeDevice(device)) + GTEST_SKIP() << device + " is not available on this machine"; + + // Model and weights file paths + const std::string model_path = "ovep_ext_init_test.onnx"; + const std::string weights_path = "ovep_ext_init_test.onnx.data"; + const size_t num_initializers = 8; + const size_t floats_per_initializer = 64 * 1024 * 1024; // 64 million floats per initializer, 256MB + const size_t total_floats = num_initializers * floats_per_initializer; + const size_t total_bytes = total_floats * sizeof(float); + // min size threshold for new logic with ext initializers + ASSERT_GE(total_bytes, 32 * 1024 * 1024); + + // 1. Create initializers + std::vector<std::vector<float>> initializer_data; + for (size_t i = 0; i < num_initializers; ++i) + initializer_data.emplace_back(floats_per_initializer, static_cast<float>(i + 1)); // W0:1, W1:2... + + // 2. 
Build ONNX model with 8 external initializers, and 8 ADD nodes + { + ModelProto model_proto; + model_proto.set_ir_version(7); + model_proto.set_producer_name("openvino_extinit_test"); + model_proto.set_producer_version("1.0"); + model_proto.set_domain(""); + model_proto.set_model_version(1); + + auto* graph = model_proto.mutable_graph(); + graph->set_name("TestGraph"); + + // Input: shape [floats_per_initializer] + auto* input = graph->add_input(); + input->set_name("X"); + auto* input_type = input->mutable_type()->mutable_tensor_type(); + input_type->set_elem_type(TensorProto_DataType_FLOAT); + input_type->mutable_shape()->add_dim()->set_dim_value(floats_per_initializer); + + // Output: shape [floats_per_initializer] + auto* output = graph->add_output(); + output->set_name("Y"); + auto* output_type = output->mutable_type()->mutable_tensor_type(); + output_type->set_elem_type(TensorProto_DataType_FLOAT); + output_type->mutable_shape()->add_dim()->set_dim_value(floats_per_initializer); + + auto* opset_import = model_proto.add_opset_import(); + opset_import->set_domain(""); + opset_import->set_version(19); + + // Add initializers as external data + size_t offset = 0; + std::vector<std::string> initializer_names; + for (size_t i = 0; i < num_initializers; ++i) { + std::string name = "W" + std::to_string(i); + initializer_names.push_back(name); + TensorProto* initializer = graph->add_initializer(); + initializer->set_name(name); + initializer->set_data_type(TensorProto_DataType_FLOAT); + initializer->add_dims(floats_per_initializer); + initializer->set_data_location(ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL); + auto* ext = initializer->add_external_data(); + ext->set_key("location"); + ext->set_value(weights_path); + ext = initializer->add_external_data(); + ext->set_key("offset"); + ext->set_value(std::to_string(offset)); + ext = initializer->add_external_data(); + ext->set_key("length"); + ext->set_value(std::to_string(floats_per_initializer * sizeof(float))); + offset += 
floats_per_initializer * sizeof(float); + } + + // nodes: X -> Add with Init[0] -> ... -> output Y + std::string prev_output = "X"; + std::string node_output; + for (size_t i = 0; i < num_initializers; ++i) { + node_output = (i == num_initializers - 1) ? "Y" : "A" + std::to_string(i); + auto* add_node = graph->add_node(); + add_node->set_op_type("Add"); + add_node->add_input(prev_output); + add_node->add_input(initializer_names[i]); + add_node->add_output(node_output); + prev_output = node_output; + } + + // Save model + std::ofstream model_file(model_path, std::ios::binary); + ASSERT_TRUE(model_proto.SerializeToOstream(&model_file)); + model_file.close(); + } + + // 3. Save weights file (concatenate all initializers) + { + std::ofstream weights_file(weights_path, std::ios::binary); + ASSERT_TRUE(weights_file.is_open()); + for (const auto& w : initializer_data) { + weights_file.write(reinterpret_cast<const char*>(w.data()), w.size() * sizeof(float)); + } + weights_file.close(); + } + + // 4. Load model and weights into memory + std::vector<char> model_data = LoadFileToMemory(model_path); + std::vector<char> weights_data = LoadFileToMemory(weights_path); + + // 5. Prepare external initializer info + PathString weights_name_path(weights_path.begin(), weights_path.end()); + std::vector<PathString> names_path = {weights_name_path}; + std::vector<char*> buffers = {reinterpret_cast<char*>(weights_data.data())}; + std::vector<size_t> buffer_sizes = {weights_data.size()}; + + // 6. Set up session options with OpenVINO + Ort::SessionOptions session_options; + session_options.AddConfigEntry(kOrtSessionOptionsDisableCPUEPFallback, "1"); + session_options.SetIntraOpNumThreads(1); + std::unordered_map<std::string, std::string> ov_options = { {"device_type", device } }; + session_options.AppendExecutionProvider_OpenVINO_V2(ov_options); + session_options.AddExternalInitializersFromFilesInMemory(names_path, buffers, buffer_sizes); + + // 7. Create session from memory + Ort::Session session(*ort_env, model_data.data(), model_data.size(), session_options); + + // 8. 
Run inference to verify weights are loaded + std::vector<float> input_data(floats_per_initializer, 2.0f); + std::vector<int64_t> input_shape = {static_cast<int64_t>(floats_per_initializer)}; + Ort::MemoryInfo mem_info = Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtDeviceAllocator, OrtMemTypeDefault); + Ort::Value input_tensor = Ort::Value::CreateTensor<float>(mem_info, input_data.data(), input_data.size(), input_shape.data(), input_shape.size()); + + std::vector<const char*> input_names = {"X"}; + std::vector<const char*> output_names = {"Y"}; + std::vector<Ort::Value> output_tensors(1); + + session.Run(Ort::RunOptions{nullptr}, input_names.data(), &input_tensor, 1, output_names.data(), output_tensors.data(), 1); + + // Check output: should be input + W0 + W1 + W2... + auto* out_data = output_tensors[0].GetTensorMutableData<float>(); + float expected = input_data[0]; + for (size_t i = 0; i < num_initializers; ++i) { + expected += initializer_data[i][0]; + } + + for (size_t i = 0; i < floats_per_initializer; ++i) + ASSERT_FLOAT_EQ(out_data[i], expected); + + // Cleanup + std::filesystem::remove(model_path); + std::filesystem::remove(weights_path); +} +INSTANTIATE_TEST_SUITE_P(OVEP_Tests, + OVEP_ExtInit_Tests, + ::testing::Values("CPU", "GPU", "NPU")); + +} // namespace test +} // namespace onnxruntime