diff --git a/cmake/onnxruntime_providers_openvino.cmake b/cmake/onnxruntime_providers_openvino.cmake
index 606ab1fe5ba89..f1a2e752f7965 100644
--- a/cmake/onnxruntime_providers_openvino.cmake
+++ b/cmake/onnxruntime_providers_openvino.cmake
@@ -13,8 +13,8 @@
   # Header paths
   find_package(OpenVINO REQUIRED COMPONENTS Runtime ONNX)
-  if(OpenVINO_VERSION VERSION_LESS 2024.4)
-    message(FATAL_ERROR "OpenVINO 2024.4 and newer are supported. Please, use latest OpenVINO release")
+  if(OpenVINO_VERSION VERSION_LESS 2024.5)
+    message(FATAL_ERROR "OpenVINO 2024.5 and newer are supported. Please use the latest OpenVINO release")
   endif()

   if(OpenVINO_VERSION VERSION_GREATER_EQUAL 2024.4)
@@ -30,7 +30,7 @@
   endif()

   list(APPEND OPENVINO_LIB_LIST openvino::frontend::onnx openvino::runtime ${PYTHON_LIBRARIES})
-  if ((DEFINED ENV{OPENCL_LIBS}) AND (DEFINED ENV{OPENCL_INCS}))
+  if ((DEFINED ENV{OPENCL_LIBS}) AND (DEFINED ENV{OPENCL_INCS}) AND onnxruntime_USE_OPENVINO_GPU)
     add_definitions(-DIO_BUFFER_ENABLED=1)
     list(APPEND OPENVINO_LIB_LIST $ENV{OPENCL_LIBS})
   endif()
@@ -86,4 +86,4 @@
   set_target_properties(onnxruntime_providers_openvino PROPERTIES
     MAP_IMPORTED_CONFIG_RELEASE RelWithDebInfo
     MAP_IMPORTED_CONFIG_DEBUG RelWithDebInfo
-  )
\ No newline at end of file
+  )
diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc
index a0bcf953938d9..16a92b43adaf6 100644
--- a/onnxruntime/core/providers/openvino/backend_manager.cc
+++ b/onnxruntime/core/providers/openvino/backend_manager.cc
@@ -10,8 +10,10 @@
 #include
 #include
 #include
+#include

 #include "core/providers/shared_library/provider_api.h"
+#include "core/providers/openvino/ov_versions/capability.h"
 #include "core/providers/openvino/contexts.h"
 #include "core/providers/openvino/backend_manager.h"
 #include "core/providers/openvino/ibackend.h"
@@ -21,8 +23,8 @@
 namespace onnxruntime {
 namespace openvino_ep {

-GlobalContext& BackendManager::GetGlobalContext() {
-  return global_context_;
+SessionContext& BackendManager::GetSessionContext() {
+  return session_context_;
 }

 ov::CompiledModel& BackendManager::GetOVCompiledModel() {
@@ -30,75 +32,95 @@ ov::CompiledModel& BackendManager::GetOVCompiledModel() {
   return (ov_ptr);
 }

-BackendManager::BackendManager(const GlobalContext& global_context,
+BackendManager::BackendManager(SessionContext& session_context,
+                               SharedContext& shared_context,
                                const onnxruntime::Node& fused_node,
                                const onnxruntime::GraphViewer& subgraph,
                                const logging::Logger& logger,
-                               EPCtxHandler& ep_ctx_handle_) {
-  global_context_ = global_context;
-
-  openvino_sdk_version_ = std::to_string(global_context_.OpenVINO_Version.at(0)) + "." +
-                          std::to_string(global_context_.OpenVINO_Version.at(1));
-  if (ep_ctx_handle_.CheckForOVEPCtxNode(subgraph, openvino_sdk_version_)) {
-    if (ep_ctx_handle_.ImportBlobFromEPCtxModel(subgraph, global_context_.ep_context_embed_mode) != Status::OK())
-      ORT_THROW("Import blob from model failed");
-  }
+                               EPCtxHandler& ep_ctx_handle) : ep_ctx_handle_(ep_ctx_handle),
+                                                              session_context_(session_context),
+                                                              shared_context_{shared_context} {
+  subgraph_context_.is_ep_ctx_graph = ep_ctx_handle_.CheckForOVEPCtxNodeInGraph(subgraph);
+
+  subgraph_context_.model_precision = [&](const GraphViewer& graph_viewer) {
+    // return empty if graph has no inputs or if types are not one of FP32/FP16
+    // else assume the type of the first input
+    if (graph_viewer.GetInputs().empty()) {
+      return "";
+    } else {
+      auto input_type = graph_viewer.GetInputs()[0]->TypeAsProto()->tensor_type().elem_type();
+      if (session_context_.precision == "ACCURACY" &&
+          session_context_.device_type.find("GPU") != std::string::npos) {
+        if (input_type == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT) {
+          return "FP32";
+        } else if (input_type == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT16) {
+          return "FP16";
+        }
+      }
+    }
+    return "";
+  }(subgraph);

   // Save the indexes of graph inputs among fused_node's inputDefs
   // (which also contains initializers).
-  auto node_input_defs = fused_node.InputDefs();
-  int i = 0;
-  for (auto idef : node_input_defs) {
-    subgraph_context_.input_names.insert({idef->Name(), i});
-    i++;
+  for (uint32_t index = 0; const auto& node : subgraph.GetInputs()) {
+    subgraph_context_.input_names.insert({node->Name(), index++});
   }

-  const std::vector<const NodeArg*>& graph_inputs = subgraph.GetInputs();
-  for (auto input : graph_inputs) {
-    auto it = subgraph_context_.input_names.find(input->Name());
-    if (it == subgraph_context_.input_names.end()) {
-      ORT_THROW("Input not found in the input defs list");
-    }
-    int index = it->second;
-    subgraph_context_.input_indexes.push_back(index);
+  for (uint32_t index = 0; const auto& node : subgraph.GetOutputs()) {
+    subgraph_context_.output_names.insert({node->Name(), index++});
   }

-  auto graph_outputs_defs = fused_node.OutputDefs();
-  i = 0;
-  for (auto output_def : graph_outputs_defs) {
-    subgraph_context_.output_names.insert({output_def->Name(), i});
-    i++;
-  }
   subgraph_context_.subgraph_name = fused_node.Name();
+
+  ptr_stream_t model_stream;
   std::unique_ptr<ONNX_NAMESPACE::ModelProto> model_proto;
-  if (!ep_ctx_handle_.IsValidOVEPCtxGraph()) {
+  if (subgraph_context_.is_ep_ctx_graph) {
+    model_stream = ep_ctx_handle_.GetModelBlobStream(session_context_.so_context_file_path, subgraph);
+  } else {
     model_proto = GetModelProtoFromFusedNode(fused_node, subgraph, logger);
   }
-  std::string device_type = openvino_ep::BackendManager::GetGlobalContext().device_type;
+  std::string device_type = session_context_.device_type;
+
+  auto& sw = shared_context_.shared_weights;
+  if (session_context_.so_share_ep_contexts) {
+    std::filesystem::path weight_filename = session_context_.onnx_model_path_name.parent_path();
+    if (sw.external_weight_filename.empty() && !sw.metadata.empty()) {
+      // Reasonable assumption that all metadata entries have the same external file location
+      sw.external_weight_filename = sw.metadata.begin()->second.location;
+    }
+    weight_filename /= sw.external_weight_filename;
+    std::ifstream weight_file(weight_filename);
+
+    if (weight_file) {
+      if (!sw.mapped_weights) {
+        sw.mapped_weights = std::make_unique<SharedContext::SharedWeights::WeightsFile>(weight_filename);
+      }
+      backend_utils::CreateOVTensors(session_context_.device_type, sw.metadata, *sw.mapped_weights);
+    }
+  }

   if (ModelHasSymbolicInputDims(subgraph)) {
     subgraph_context_.has_dynamic_input_shape = true;
     LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Model has symbolic input dims";
-    ORT_ENFORCE(!global_context_.enable_qdq_optimizer,
-                "QDQ stripping should not be enabled for models with dynamic input shapes. "
-                "Set enable_qdq_optimizer to False");
-    if ((GetGlobalContext().device_type.find("CPU") != std::string::npos ||
-         GetGlobalContext().device_type.find("GPU") != std::string::npos) &&
-        !GetGlobalContext().disable_dynamic_shapes) {
+    if ((session_context_.device_type.find("CPU") != std::string::npos ||
+         session_context_.device_type.find("GPU") != std::string::npos) &&
+        !session_context_.disable_dynamic_shapes) {
       LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Starting backend initialization. "
                          << "Creating backend Dynamic Shapes";
       try {
         concrete_backend_ = BackendFactory::MakeBackend(model_proto,
-                                                        GetGlobalContext(),
+                                                        session_context_,
                                                         subgraph_context_,
-                                                        ep_ctx_handle_);
+                                                        shared_context_,
+                                                        model_stream);
       } catch (std::string const& msg) {
         ORT_THROW(msg);
       }
       LOGS_DEFAULT(INFO) << "[OpenVINO-EP] "
                          << "Backend created for graph " << subgraph_context_.subgraph_name;
     } else {
-      // Only cache model_proto in global to rewrite the model with input shapes at runtime.
+      // Only cache model_proto in session context to rewrite the model with input shapes at runtime.
       // For dynamic backend creation
       model_proto_ = std::move(model_proto);
     }
@@ -112,14 +134,15 @@ BackendManager::BackendManager(const GlobalContext& global_context,
     // OV NPU plugin is supported with fallback to OV CPU upon compilation failures.
     try {
       concrete_backend_ = BackendFactory::MakeBackend(model_proto,
-                                                      GetGlobalContext(),
+                                                      session_context_,
                                                       subgraph_context_,
-                                                      ep_ctx_handle_);
+                                                      shared_context_,
+                                                      model_stream);
     } catch (const OnnxRuntimeException& ex) {
       std::string exception_str = ex.what();
       bool eligible_for_cpu_fallback = device_type.find("NPU") != std::string::npos &&
-                                       !GetGlobalContext().disable_cpu_fallback &&
-                                       !ep_ctx_handle_.IsValidOVEPCtxGraph();
+                                       !session_context_.so_disable_cpu_ep_fallback &&
+                                       !subgraph_context_.is_ep_ctx_graph;
 #if defined(OPENVINO_DISABLE_NPU_FALLBACK)
       eligible_for_cpu_fallback = false;
 #else
@@ -127,13 +150,14 @@ BackendManager::BackendManager(const GlobalContext& global_context,
        LOGS_DEFAULT(VERBOSE) << exception_str;
        LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU."
<< "Falling back to OV CPU for execution"; - GetGlobalContext().device_type = "CPU"; - GetGlobalContext().precision_str = "FP32"; + session_context_.device_type = "CPU"; + session_context_.precision = "FP32"; try { concrete_backend_ = BackendFactory::MakeBackend(model_proto, - GetGlobalContext(), + session_context_, subgraph_context_, - ep_ctx_handle_); + shared_context_, + model_stream); } catch (std::string const& msg) { ORT_THROW(msg); } @@ -165,9 +189,8 @@ BackendManager::BackendManager(const GlobalContext& global_context, } } } - if (global_context_.export_ep_ctx_blob && !ep_ctx_handle_.IsValidOVEPCtxGraph()) { - auto status = onnxruntime::openvino_ep::BackendManager::ExportCompiledBlobAsEPCtxNode(subgraph, - logger); + if (session_context_.so_context_enable && !subgraph_context_.is_ep_ctx_graph) { + auto status = onnxruntime::openvino_ep::BackendManager::ExportCompiledBlobAsEPCtxNode(subgraph); if ((!status.IsOK())) { ORT_THROW(status); } @@ -178,9 +201,8 @@ BackendManager::BackendManager(const GlobalContext& global_context, // precompiled blob is set. If that's the case: // By default, create model in embed mode where the blob stream is exported as data within // the EPContext node. -Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphViewer& graph_body_viewer, - const logging::Logger& logger) { - if (GetGlobalContext().disable_dynamic_shapes && subgraph_context_.has_dynamic_input_shape) { +Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphViewer& graph_body_viewer) { + if (session_context_.disable_dynamic_shapes && subgraph_context_.has_dynamic_input_shape) { std::string exception_str = "Exporting dynamically compiled models at runtime is not supported. " "Cannot export blobs of dynamic models that request static shape inference. " @@ -188,47 +210,48 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie ORT_THROW(exception_str); } - std::string model_blob_str; - auto compiled_model = concrete_backend_->GetOVCompiledModel(); - std::string graph_name = ""; - // Epctx file path from SO is mapped to cache_dir variable for OVEP for readability - if (!global_context_.cache_dir.empty()) { - graph_name = global_context_.cache_dir; - } else { - graph_name = global_context_.onnx_model_path_name; - // Remove extension so we can append suffix to form the complete name of output graph - size_t dot = global_context_.onnx_model_path_name.find_last_of("."); - graph_name = graph_name.substr(0, dot); - if (dot != std::string::npos) graph_name += "_ctx.onnx"; - } - // If embed_mode, then pass on the serialized blob // If not embed_mode, dump the blob here and only pass on the path to the blob - if (global_context_.ep_context_embed_mode) { + std::string model_blob_str; + auto compiled_model = concrete_backend_->GetOVCompiledModel(); + if (session_context_.so_context_embed_mode) { // Internal blob std::ostringstream model_blob_stream; compiled_model.export_model(model_blob_stream); model_blob_str = std::move(model_blob_stream).str(); if (model_blob_str.empty()) { ORT_THROW("Model blob stream is empty after exporting the compiled model."); } - } else { - // Remove extension so we can append suffix to form the complete name of output graph - auto blob_name = graph_name.substr(0, graph_name.find_last_of(".")); - std::ofstream blob_file(blob_name + ".blob", + } else { // External blob + // Build name by combining EpCtx model name (if available) and subgraph name. 
+    // name is not available when creating a session from memory
+    auto name = session_context_.so_context_file_path.stem().string();
+    if (!name.empty() && !graph_body_viewer.ModelPath().empty()) {
+      name = graph_body_viewer.ModelPath().stem().string();
+    }
+    if (!name.empty()) {
+      name += "_";
+    }
+    name += subgraph_context_.subgraph_name;
+
+    std::filesystem::path blob_filename = session_context_.so_context_file_path;
+    if (blob_filename.empty()) {
+      blob_filename = session_context_.onnx_model_path_name;
+    }
+    blob_filename = blob_filename.parent_path() / name;
+    blob_filename.replace_extension("blob");
+    std::ofstream blob_file(blob_filename,
                             std::ios::out | std::ios::trunc | std::ios::binary);
     if (!blob_file) {
       ORT_THROW("Unable to open file for epctx model dump.");
     }
     compiled_model.export_model(blob_file);
-    model_blob_str = blob_name + ".blob";
+    model_blob_str = blob_filename.filename().string();
   }

-  ORT_RETURN_IF_ERROR(ep_ctx_handle_.ExportEPCtxModel(graph_body_viewer,
-                                                      graph_name,
-                                                      logger,
-                                                      global_context_.ep_context_embed_mode,
-                                                      std::move(model_blob_str),
-                                                      openvino_sdk_version_));
+  ORT_RETURN_IF_ERROR(ep_ctx_handle_.AddOVEPCtxNodeToGraph(graph_body_viewer,
+                                                           subgraph_context_.subgraph_name,
+                                                           session_context_.so_context_embed_mode,
+                                                           std::move(model_blob_str)));

   return Status::OK();
 }
@@ -236,8 +259,8 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie
 bool BackendManager::ModelHasBatchedInputs(const ONNX_NAMESPACE::ModelProto& model_proto) const {
   bool has_batched_inputs = true;

-  for (int i = 0; i < static_cast<int>(subgraph_context_.input_indexes.size()); i++) {
-    auto& input = model_proto.graph().input(subgraph_context_.input_indexes[i]);
+  for (const auto& [name, index] : subgraph_context_.input_names) {
+    auto& input = model_proto.graph().input(index);

     // Batch-process only raw image inputs (NCHW or NHWC layouts)
     auto& shape = input.type().tensor_type().shape();
@@ -251,8 +274,8 @@ bool BackendManager::ModelHasBatchedInputs(const ONNX_NAMESPACE::ModelProto& mod
       break;
     }

-    for (int index = 1; index < 4; index++) {
-      if (shape.dim(index).value_case() != shape.dim(0).kDimValue) {
+    for (int dim_index = 1; dim_index < 4; dim_index++) {
+      if (shape.dim(dim_index).value_case() != shape.dim(0).kDimValue) {
         has_batched_inputs = false;
         break;
       }
@@ -299,27 +322,20 @@ static bool IsQDQGraph(const onnxruntime::GraphViewer& graph_viewer) {
   return false;
 }

-static void DumpOpenVINOEPModel(std::string onnx_model_path_name,
+static void DumpOpenVINOEPModel(const std::filesystem::path& onnx_model_path_name,
                                 ONNX_NAMESPACE::ModelProto* model_proto,
                                 const onnxruntime::Node& fused_node) {
   if (openvino_ep::backend_utils::IsDebugEnabled()) {
-    auto model_name = onnx_model_path_name.empty() ? "unknown.onnx" : std::move(onnx_model_path_name);
-#ifdef _WIN32
-    size_t slash = model_name.find_last_of("\\");
-#else
-    size_t slash = model_name.find_last_of("/");
-#endif
-    model_name = model_name.substr(slash + 1, std::string::npos);
-    size_t dot = model_name.find_last_of(".");
-    model_name = model_name.substr(0, dot);
+    auto model_name = onnx_model_path_name.empty() ? "unknown.onnx" : onnx_model_path_name.filename();
"unknown.onnx" : onnx_model_path_name.filename(); - std::string subgraph_name = fused_node.Name(); + const auto& subgraph_name = fused_node.Name(); size_t dash = subgraph_name.find_last_of("-"); - subgraph_name = subgraph_name.substr(dash, std::string::npos); - - const std::string name = model_name + subgraph_name + ".onnx"; + if (dash != std::string::npos) { + auto new_name = model_name.stem().string() + subgraph_name.substr(dash, std::string::npos); + model_name.replace_filename(new_name); + } - std::fstream dump(name, std::ios::out | std::ios::trunc | std::ios::binary); + std::fstream dump(model_name, std::ios::out | std::ios::trunc | std::ios::binary); model_proto->SerializeToOstream(dump); } } @@ -344,17 +360,18 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, } }; + const auto& onnx_model_path_name = subgraph.ModelPath(); // QDQ stripping enabled only for the NPU - if (global_context_.device_type.find("NPU") != std::string::npos && - global_context_.enable_qdq_optimizer && + if (session_context_.device_type.find("NPU") != std::string::npos && + session_context_.enable_qdq_optimizer && IsQDQGraph(subgraph)) { LOGS_DEFAULT(INFO) << "[OpenVINO-EP] QDQ optimization pass status: 1"; std::unique_ptr model; - Status status = CreateModelWithStrippedQDQNodes(subgraph, logger, model); + Status status = CreateModelWithStrippedQDQNodes(subgraph, logger, session_context_.so_share_ep_contexts, model, shared_context_.shared_weights); auto model_proto = model->ToProto(); model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); print_model_proto_duration(); - DumpOpenVINOEPModel(global_context_.onnx_model_path_name, model_proto.get(), fused_node); + DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node); ORT_ENFORCE(status.IsOK(), status.ErrorMessage()); return model_proto; } else { @@ -364,7 +381,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); subgraph.ToProto(*model_proto->mutable_graph(), true, true); print_model_proto_duration(); - DumpOpenVINOEPModel(global_context_.onnx_model_path_name, model_proto.get(), fused_node); + DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node); return model_proto; } } @@ -456,16 +473,17 @@ void BackendManager::Compute(OrtKernelContext* context) { // by rewriting the model to static shaped model at runtime based on input shape. // disable_dynamic_shapes is always set to true for OV NPU plugin. 
   if (subgraph_context_.has_dynamic_input_shape &&
-      !GetGlobalContext().disable_dynamic_shapes &&
-      (GetGlobalContext().device_type.find("CPU") != std::string::npos ||
-       GetGlobalContext().device_type.find("GPU") != std::string::npos)) {
+      !session_context_.disable_dynamic_shapes &&
+      (session_context_.device_type.find("CPU") != std::string::npos ||
+       session_context_.device_type.find("GPU") != std::string::npos)) {
     concrete_backend_->Infer(context);
   } else if (subgraph_context_.has_dynamic_input_shape) {
     std::vector<std::vector<int64_t>> tensor_shapes = GetInputTensorShapes(ctx);
-    auto key = MakeMapKeyString(tensor_shapes, GetGlobalContext().device_type);
+    auto key = MakeMapKeyString(tensor_shapes, session_context_.device_type);
     std::shared_ptr<IBackend> dynamic_backend;
     auto search = backend_map_.find(key);
     if (search == backend_map_.end()) {
+      ptr_stream_t model_stream;
       LOGS_DEFAULT(INFO) << "[OpenVINO-EP] "
                          << "Creating dynamic backend for key: " << key;
       LOGS_DEFAULT(INFO) << "[OpenVINO-EP] "
@@ -473,28 +491,30 @@ void BackendManager::Compute(OrtKernelContext* context) {
       auto modelproto_with_concrete_shapes = ReWriteInputShapeInfo(*model_proto_, tensor_shapes);
       try {
         dynamic_backend = BackendFactory::MakeBackend(modelproto_with_concrete_shapes,
-                                                      GetGlobalContext(),
+                                                      session_context_,
                                                       subgraph_context_,
-                                                      ep_ctx_handle_);
+                                                      shared_context_,
+                                                      model_stream);
       } catch (const OnnxRuntimeException& ex) {
         // Build option disables fallback to CPU on compilation failures with NPU.
 #if defined(OPENVINO_DISABLE_NPU_FALLBACK)
         LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU.";
         ORT_THROW(ex.what());
 #else
-        if (GetGlobalContext().device_type.find("NPU") != std::string::npos &&
-            !GetGlobalContext().disable_cpu_fallback) {
+        if (session_context_.device_type.find("NPU") != std::string::npos &&
+            !session_context_.so_disable_cpu_ep_fallback) {
          LOGS_DEFAULT(WARNING) << ex.what();
          LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU."
<< "Falling back to OV CPU for execution"; - GetGlobalContext().device_type = "CPU"; - GetGlobalContext().precision_str = "FP32"; - key = MakeMapKeyString(tensor_shapes, GetGlobalContext().device_type); + session_context_.device_type = "CPU"; + session_context_.precision = "FP32"; + key = MakeMapKeyString(tensor_shapes, session_context_.device_type); try { dynamic_backend = BackendFactory::MakeBackend(modelproto_with_concrete_shapes, - GetGlobalContext(), + session_context_, subgraph_context_, - ep_ctx_handle_); + shared_context_, + model_stream); } catch (std::string const& msg) { ORT_THROW(msg); } @@ -524,6 +544,8 @@ void BackendManager::Compute(OrtKernelContext* context) { } void BackendManager::ShutdownBackendManager() { + backend_map_.clear(); + concrete_backend_.reset(); } } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/backend_manager.h b/onnxruntime/core/providers/openvino/backend_manager.h index 5ec462afd9d01..cdc27701ec2e6 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.h +++ b/onnxruntime/core/providers/openvino/backend_manager.h @@ -19,17 +19,16 @@ namespace openvino_ep { // Singleton class that manages all the backends class BackendManager { public: - BackendManager(const GlobalContext& global_context, + BackendManager(SessionContext& session_context, + SharedContext& shared_context, const onnxruntime::Node& fused_node, const onnxruntime::GraphViewer& subgraph, const logging::Logger& logger, EPCtxHandler& ctx_handle); void Compute(OrtKernelContext* context); void ShutdownBackendManager(); - void SetGlobalCotext(const GlobalContext& global_context); - GlobalContext& GetGlobalContext(); - Status ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphViewer& subgraph, - const logging::Logger& logger); + SessionContext& GetSessionContext(); + Status ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphViewer& subgraph); ov::CompiledModel& GetOVCompiledModel(); private: @@ -52,9 +51,9 @@ class BackendManager { std::shared_ptr concrete_backend_; std::map> backend_map_; SubGraphContext subgraph_context_; - GlobalContext global_context_; - EPCtxHandler ep_ctx_handle_{}; - std::string openvino_sdk_version_{}; + EPCtxHandler& ep_ctx_handle_; + SessionContext& session_context_; + SharedContext& shared_context_; }; } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index d6f408228f2bf..acc3f120b270b 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -1,13 +1,16 @@ // Copyright (C) Intel Corporation // Licensed under the MIT License - #include #include #include #include +#include +#include + #include "openvino/pass/convert_fp32_to_fp16.hpp" #include "openvino/pass/constant_folding.hpp" +#include "openvino/runtime/intel_npu/level_zero/level_zero.hpp" #include "core/providers/shared_library/provider_api.h" #include "core/providers/openvino/backend_utils.h" #include "core/providers/openvino/ov_interface.h" @@ -16,6 +19,105 @@ using Exception = ov::Exception; namespace onnxruntime { namespace openvino_ep { + +SharedContext::SharedWeights::WeightsFile::WeightsFile(std::filesystem::path filename) : file_(filename, std::ios::in | std::ios::binary) { + try { + file_.exceptions(std::ifstream::failbit | std::ifstream::badbit); + weights_size_ = file_.seekg(0, std::ios::end).tellg(); + } catch (std::ifstream::failure& e) { + ORT_THROW("Error: Failed to open weight file at ", 
filename.string(), " ", e.what()); + } +} + +void SharedContext::SharedWeights::WeightsFile::load_weights(size_t file_offset, void* data, size_t size) { + ORT_ENFORCE(file_offset < weights_size_ && size <= weights_size_ && (file_offset <= weights_size_ - size), "Error: File offset is out of bounds."); + file_.seekg(file_offset); + file_.read(reinterpret_cast(data), size); +} + +std::ostream& operator<<(std::ostream& stream, const SharedContext::SharedWeights::Metadata::Map& metadata) { + try { + stream << metadata.size(); + + // Write each key-value pair + // Put elements in separate lines to facilitate reading + for (const auto& [key, value] : metadata) { + stream << std::endl + << key.name; + stream << std::endl + << value.location; + stream << std::endl + << value.data_offset; + stream << std::endl + << value.size; + stream << std::endl + << value.dimensions.size(); + for (const auto& dim : value.dimensions) { + stream << std::endl + << dim; + } + stream << std::endl + << value.element_type; + } + } catch (const Exception& e) { + ORT_THROW("Error: Failed to write map data.", e.what()); + } catch (...) { + ORT_THROW("Error: Failed to write map data."); + } + + ORT_ENFORCE(stream.good(), "Error: Failed to write map data."); + return stream; +} + +std::istream& operator>>(std::istream& stream, SharedContext::SharedWeights::Metadata::Map& metadata) { + size_t map_size{0}; + try { + stream >> map_size; + + while (!stream.eof()) { + SharedContext::SharedWeights::Metadata::Key key; + SharedContext::SharedWeights::Metadata::Value value; + stream >> key.name; + stream >> value.location; + stream >> value.data_offset; + stream >> value.size; + size_t num_dimensions; + stream >> num_dimensions; + + if (stream.fail()) { + ORT_THROW("Error: Failed to read num_dimensions from stream."); + } + + constexpr size_t MAX_SAFE_DIMENSIONS = 1024; + + size_t safe_num_dimensions = num_dimensions; + + if (num_dimensions == 0 || safe_num_dimensions > MAX_SAFE_DIMENSIONS) { + ORT_THROW("Invalid number of dimensions provided."); + } + try { + value.dimensions.resize(safe_num_dimensions); + } catch (const std::bad_alloc&) { + ORT_THROW("Error: Memory allocation failed while resizing dimensions."); + } + + for (auto& dim : value.dimensions) { + stream >> dim; + } + stream >> value.element_type; + metadata.emplace(key, value); + } + } catch (const Exception& e) { + ORT_THROW("Error: Failed to read map data.", e.what()); + } catch (...) 
{ + ORT_THROW("Error: Failed to read map data."); + } + + ORT_ENFORCE(metadata.size() == map_size, "Error: Inconsistent map data."); + + return stream; +} + namespace backend_utils { bool IsDebugEnabled() { @@ -34,23 +136,18 @@ bool IsCILogEnabled() { return false; } -struct static_cast_int64 { - template // T1 models type statically convertible to T - int64_t operator()(const T1& x) const { return static_cast(x); } -}; - std::shared_ptr -CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, const GlobalContext& global_context, +CreateOVModel(const std::string model, + const SessionContext& session_context, std::map>& const_outputs_map) { if (IsCILogEnabled()) { std::cout << "CreateNgraphFunc" << std::endl; } - const std::string model = model_proto.SerializeAsString(); try { - auto ov_model = global_context.ie_core.ReadModel(model, global_context.onnx_model_path_name); + auto ov_model = OVCore::ReadModel(model, session_context.onnx_model_path_name.string()); // Check for Constant Folding - if ((global_context.device_type != "NPU") && !global_context.is_wholly_supported_graph) { + if ((session_context.device_type != "NPU") && !session_context.is_wholly_supported_graph) { ov::pass::ConstantFolding pass_const_obj; pass_const_obj.run_on_model(ov_model); auto& results = const_cast(ov_model.get()->get_results()); @@ -82,7 +179,7 @@ Ort::UnownedValue GetOutputTensor(Ort::KernelContext& context, size_t batch_size, OVInferRequestPtr infer_request, std::string output_name, - std::unordered_map output_names) { + const SubGraphContext::string_index_map_t& output_names) { auto graph_output_blob = infer_request->GetTensor(output_name); auto graph_output_dims = graph_output_blob->get_shape(); @@ -107,7 +204,7 @@ GetOutputTensor(Ort::KernelContext& context, size_t batch_size, Ort::UnownedValue GetOutputTensor(Ort::KernelContext& context, std::string output_name, - std::unordered_map output_names, + const SubGraphContext::string_index_map_t& output_names, std::shared_ptr node) { // Find position of '/' in the output_name auto pos = output_name.find("/"); @@ -129,13 +226,13 @@ GetOutputTensor(Ort::KernelContext& context, return context.GetOutput(index, output_shape.get(), num_dims); } -int GetFirstAvailableDevice(GlobalContext& global_context) { +int GetFirstAvailableDevice(SessionContext& session_context) { int i = 0; // Get the first available VAD-M device and set the device to busy while (i < 8) { - bool device = global_context.deviceAvailableList[i]; + bool device = session_context.deviceAvailableList[i]; if (device) { - global_context.deviceAvailableList[i] = false; + session_context.deviceAvailableList[i] = false; break; } i++; @@ -144,9 +241,9 @@ int GetFirstAvailableDevice(GlobalContext& global_context) { // make all remaining devices free if (i == 8) { i = 0; - global_context.deviceAvailableList[i] = false; + session_context.deviceAvailableList[i] = false; for (int j = 1; j < 8; j++) { - global_context.deviceAvailableList[j] = true; + session_context.deviceAvailableList[j] = true; } } return i; @@ -155,23 +252,23 @@ int GetFirstAvailableDevice(GlobalContext& global_context) { void FillOutputsWithConstantData(std::shared_ptr node, Ort::UnownedValue& out_tensor) { switch (node->get_element_type()) { case ov::element::Type_t::f32: { - FillOutputHelper(out_tensor, node); + FillOutputHelper(out_tensor, std::move(node)); break; } case ov::element::Type_t::boolean: { - FillOutputHelper(out_tensor, node); + FillOutputHelper(out_tensor, std::move(node)); break; } case ov::element::Type_t::i32: { - 
-      FillOutputHelper<int32_t>(out_tensor, node);
+      FillOutputHelper<int32_t>(out_tensor, std::move(node));
       break;
     }
     case ov::element::Type_t::i64: {
-      FillOutputHelper<int64_t>(out_tensor, node);
+      FillOutputHelper<int64_t>(out_tensor, std::move(node));
       break;
     }
     case ov::element::Type_t::f16: {
-      FillOutputHelper<ov::float16>(out_tensor, node);
+      FillOutputHelper<ov::float16>(out_tensor, std::move(node));
       break;
     }
     default:
@@ -267,6 +364,78 @@ void printPerformanceCounts(OVInferRequestPtr request, std::ostream& stream, std
   printPerformanceCounts(performanceMap, stream, std::move(deviceName));
 }

+ov::element::Type GetOpenVINOElementType(ONNX_NAMESPACE::TensorProto_DataType dt) {
+  static std::unordered_map<ONNX_NAMESPACE::TensorProto_DataType, ov::element::Type> map{
+      {ONNX_NAMESPACE::TensorProto_DataType_FLOAT, ov::element::f32},
+      {ONNX_NAMESPACE::TensorProto_DataType_UINT8, ov::element::u8},
+      {ONNX_NAMESPACE::TensorProto_DataType_INT8, ov::element::i8},
+      {ONNX_NAMESPACE::TensorProto_DataType_UINT16, ov::element::u16},
+      {ONNX_NAMESPACE::TensorProto_DataType_INT16, ov::element::i16},
+      {ONNX_NAMESPACE::TensorProto_DataType_INT32, ov::element::i32},
+      {ONNX_NAMESPACE::TensorProto_DataType_INT64, ov::element::i64},
+      {ONNX_NAMESPACE::TensorProto_DataType_STRING, ov::element::string},
+      {ONNX_NAMESPACE::TensorProto_DataType_BOOL, ov::element::boolean},
+      {ONNX_NAMESPACE::TensorProto_DataType_FLOAT16, ov::element::f16},
+      {ONNX_NAMESPACE::TensorProto_DataType_DOUBLE, ov::element::f64},
+      {ONNX_NAMESPACE::TensorProto_DataType_UINT32, ov::element::u32},
+      {ONNX_NAMESPACE::TensorProto_DataType_UINT64, ov::element::u64},
+      //{ONNX_NAMESPACE::TensorProto_DataType_COMPLEX64, ov::element::undefined},
+      //{ONNX_NAMESPACE::TensorProto_DataType_COMPLEX128, ov::element::undefined},
+      {ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16, ov::element::bf16},
+      //{ONNX_NAMESPACE::TensorProto_DataType_FLOAT8E4M3FN, ov::element::undefined},
+      //{ONNX_NAMESPACE::TensorProto_DataType_FLOAT8E4M3FNUZ, ov::element::undefined},
+      {ONNX_NAMESPACE::TensorProto_DataType_FLOAT8E5M2, ov::element::f8e5m2},
+      //{ONNX_NAMESPACE::TensorProto_DataType_FLOAT8E5M2FNUZ, ov::element::undefined},
+      {ONNX_NAMESPACE::TensorProto_DataType_UINT4, ov::element::u4},
+      {ONNX_NAMESPACE::TensorProto_DataType_INT4, ov::element::i4},
+  };
+
+  if (auto result = map.find(dt); result != map.end()) {
+    return result->second;
+  } else {
+    throw std::runtime_error("Unsupported ONNX data type: " + std::to_string(dt));
+  }
+}
+
+// Function to handle tensor creation from external data
+void CreateOVTensors(const std::string& device_name,
+                     SharedContext::SharedWeights::Metadata::Map& metadata_map,
+                     SharedContext::SharedWeights::WeightsFile& weights) {
+  for (auto& [key, value] : metadata_map) {
+    if (value.tensor) continue;
+
+    // Get element data type
+    auto onnx_element_type = (ONNX_NAMESPACE::TensorProto_DataType)value.element_type;
+
+    ov::element::Type ov_elementType = GetOpenVINOElementType(onnx_element_type);  // Map to OpenVINO data type
+
+    // Create OpenVINO Tensor
+    if (device_name == "NPU") {
+      // Use remote tensors
+      auto npu_context = OVCore::Get().get_default_context("NPU").as<ov::intel_npu::level_zero::ZeroContext>();
+      auto&& remote_tensor = npu_context.create_l0_host_tensor(ov_elementType, value.dimensions, ov::intel_npu::TensorType::INPUT);
+
+      // Copy data to remote tensor
+      weights.load_weights(value.data_offset, remote_tensor.get(), value.size);
+      value.tensor = std::make_shared<ov::Tensor>(remote_tensor);
+    } else {
+      // Use vanilla tensors
+      value.tensor = std::make_shared<ov::Tensor>(ov_elementType, value.dimensions);
+      weights.load_weights(value.data_offset, value.tensor->data(), value.size);
+    }
+    ORT_ENFORCE(value.tensor->get_byte_size() == value.size, "Unexpected tensor size mismatch");
+  }
+}
+
+void DestroyOVTensors(SharedContext::SharedWeights::Metadata::Map& metadata_map) {
+  for (auto& [key, value] : metadata_map) {
+    if (value.tensor) {
+      value.tensor.reset();
+    }
+  }
+  metadata_map.clear();
+}
+
 }  // namespace backend_utils
 }  // namespace openvino_ep
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/openvino/backend_utils.h b/onnxruntime/core/providers/openvino/backend_utils.h
index 9d58e1ca73abb..a4e6fc0828f79 100644
--- a/onnxruntime/core/providers/openvino/backend_utils.h
+++ b/onnxruntime/core/providers/openvino/backend_utils.h
@@ -10,6 +10,7 @@
 #include
 #include
 #include
+#include

 #include "core/session/onnxruntime_cxx_api.h"
 #include "core/providers/openvino/contexts.h"
@@ -34,7 +35,7 @@ bool IsDebugEnabled();
 // Internal diagnostic function.
 bool IsCILogEnabled();

-int GetFirstAvailableDevice(GlobalContext& global_context);
+int GetFirstAvailableDevice(SessionContext& session_context);

 void FillOutputsWithConstantData(std::shared_ptr<ov::Node> node, Ort::UnownedValue& out_tensor);

@@ -44,14 +45,14 @@ void FillOutputHelper(Ort::UnownedValue& out_tensor, std::shared_ptr<ov::Node> n
 Ort::UnownedValue
 GetOutputTensor(Ort::KernelContext& context,
                 std::string output_name,
-                std::unordered_map<std::string, int> output_names,
+                const SubGraphContext::string_index_map_t& output_names,
                 std::shared_ptr<ov::Node> node);

 Ort::UnownedValue
 GetOutputTensor(Ort::KernelContext& context, size_t batch_size,
                 OVInferRequestPtr infer_request,
                 std::string output_name,
-                std::unordered_map<std::string, int> output_names);
+                const SubGraphContext::string_index_map_t& output_names);

 void FillInputBlob(OVTensorPtr inputBlob, size_t batch_slice_idx,
                    std::string input_name, Ort::KernelContext& context,
@@ -61,10 +62,15 @@ void FillOutputBlob(OVTensorPtr outputBlob, Ort::UnownedValue& output_tensor,
                     size_t batch_slice_idx);

 std::shared_ptr<OVNetwork>
-CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto,
-              const GlobalContext& global_context,
+CreateOVModel(const std::string model,
+              const SessionContext& session_context,
               std::map<std::string, std::shared_ptr<ov::Node>>& const_outputs_map);

+void CreateOVTensors(const std::string& device_name,
+                     SharedContext::SharedWeights::Metadata::Map& metadata_map,
+                     SharedContext::SharedWeights::WeightsFile& weights);
+void DestroyOVTensors(SharedContext::SharedWeights::Metadata::Map& metadata_map);
+
 void printPerformanceCounts(const std::vector<OVProfilingInfo>& performanceMap,
                             std::ostream& stream, std::string deviceName);
diff --git a/onnxruntime/core/providers/openvino/backends/backend_factory.cc b/onnxruntime/core/providers/openvino/backends/backend_factory.cc
index b7e4aed6e7e18..6c1ed9aa42727 100644
--- a/onnxruntime/core/providers/openvino/backends/backend_factory.cc
+++ b/onnxruntime/core/providers/openvino/backends/backend_factory.cc
@@ -12,10 +12,11 @@ namespace openvino_ep {
 std::shared_ptr<IBackend>
 BackendFactory::MakeBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_proto,
-                            GlobalContext& global_context,
+                            SessionContext& session_context,
                             const SubGraphContext& subgraph_context,
-                            EPCtxHandler& ep_ctx_handle) {
-  std::string type = global_context.device_type;
+                            SharedContext& shared_context,
+                            ptr_stream_t& model_stream) {
+  std::string type = session_context.device_type;
   if (type == "CPU" || type.find("GPU") != std::string::npos ||
       type.find("NPU") != std::string::npos ||
       type.find("HETERO") != std::string::npos ||
@@ -23,7 +24,7 @@ BackendFactory::MakeBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_p
       type.find("AUTO") != std::string::npos) {
    std::shared_ptr<IBackend> concrete_backend_;
    try {
-      concrete_backend_ = std::make_shared<BasicBackend>(model_proto, global_context, subgraph_context, ep_ctx_handle);
+      concrete_backend_ = std::make_shared<BasicBackend>(model_proto, session_context, subgraph_context, shared_context, model_stream);
     } catch (std::string const& msg) {
       ORT_THROW(msg);
     }
@@ -32,5 +33,6 @@ BackendFactory::MakeBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_p
     ORT_THROW("[OpenVINO-EP] Backend factory error: Unknown backend type: " + type);
   }
 }
+
 }  // namespace openvino_ep
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc
index 45639293344d8..a6a848c542b12 100644
--- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc
+++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc
@@ -21,13 +21,12 @@ namespace openvino_ep {
 using namespace backend_utils;

 BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_proto,
-                           GlobalContext& global_context,
+                           SessionContext& session_context,
                            const SubGraphContext& subgraph_context,
-                           EPCtxHandler& ep_ctx_handle)
-    : global_context_(global_context), subgraph_context_(subgraph_context) {
-  std::string& hw_target = global_context_.device_type;
-
-  is_ep_ctx_graph_ = ep_ctx_handle.IsValidOVEPCtxGraph();
+                           SharedContext& shared_context,
+                           ptr_stream_t& model_stream)
+    : session_context_{session_context}, subgraph_context_{subgraph_context}, shared_context_{shared_context} {
+  std::string& hw_target = session_context_.device_type;

   if (ValidateSubgraph(const_outputs_map_))
     return;
@@ -37,7 +36,7 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_pr
   PopulateConfigValue(device_config);

   // Enable caching
-  EnableCaching(device_config);
+  EnableCaching();

   // Setting OpenCL queue throttling for GPU
   EnableGPUThrottling(device_config);
@@ -59,78 +58,90 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_pr
   }

   try {
-    std::string dev_prec = global_context.device_type + "_" + global_context_.precision_str;
-
-    if (global_context.is_wholly_supported_graph) {  // Full graph is supported
+    // IO_BUFFER is enabled on GPU HW.
+    // Pre-requisite is provider_option "context" must be set
 #if defined(IO_BUFFER_ENABLED)
-    if (is_ep_ctx_graph_) {
-      std::istringstream model_stream(ep_ctx_handle.GetModelBlobString());
-      exe_network_ = global_context_.ie_core.ImportModel(model_stream,
-                                                         remote_context_,
-                                                         subgraph_context_.subgraph_name);
-    } else if ((global_context.device_type.find("GPU") != std::string::npos) &&
-               (global_context_.context != nullptr)) {
-      LOGS_DEFAULT(INFO) << log_tag << "IO Buffering Enabled";
-      cl_context ctx = static_cast<cl_context>(global_context_.context);
-      remote_context_ = new ov::intel_gpu::ocl::ClContext(global_context_.ie_core.Get(), ctx);
-      ie_cnn_network_ = CreateOVModel(model_proto, global_context_, subgraph_context_, const_outputs_map_);
-      exe_network_ = global_context_.ie_core.CompileModel(
-          ie_cnn_network_, remote_context_, subgraph_context_.subgraph_name);
-    } else {
-      ie_cnn_network_ = CreateOVModel(model_proto, global_context_, subgraph_context_, const_outputs_map_);
-      exe_network_ = global_context_.ie_core.CompileModel(
-          ie_cnn_network_, hw_target, device_config, subgraph_context_.subgraph_name);
+    cl_context ctx = static_cast<cl_context>(session_context_.context);
+    remote_context_ = new ov::intel_gpu::ocl::ClContext(OVCore::Get(), ctx);
+    if (subgraph_context_.is_ep_ctx_graph) {
+      exe_network_ = OVCore::ImportModel(*model_stream,
+                                         remote_context_,
+                                         subgraph_context_.subgraph_name);
+      model_stream.reset();  // Delete stream after it is no longer needed
+    } else {
+      std::shared_ptr<const OVNetwork> ov_model;
+      {
+        const std::string model = model_proto->SerializeAsString();
+        if (!subgraph_context.has_dynamic_input_shape) {
+          delete model_proto.release();
+        }
+        ov_model = CreateOVModel(model, session_context_, const_outputs_map_);
       }
+      LOGS_DEFAULT(INFO) << log_tag << "IO Buffering Enabled";
+      exe_network_ = OVCore::CompileModel(
+          ov_model, remote_context_, subgraph_context_.subgraph_name);
+    }
 #else  // !IO_BUFFER_ENABLED
-    std::string prec_str = (global_context_.precision_str != "ACCURACY") ? global_context_.precision_str : global_context_.model_precision;
-    if (is_ep_ctx_graph_) {
-      // If the blob is held in an EPContext node, then skip FE+Compile
-      // and directly move on to creating a backend with the executable blob
-      exe_network_ = global_context_.ie_core.ImportModel(ep_ctx_handle.GetModelBlobStream(),
-                                                         hw_target,
-                                                         device_config,
-                                                         global_context_.ep_context_embed_mode,
-                                                         subgraph_context_.subgraph_name);
-    } else if (global_context_.export_ep_ctx_blob &&
-               hw_target.find("NPU") != std::string::npos &&
-               !global_context_.has_external_weights) {
-      std::shared_ptr<const OVNetwork> ov_model;
-      {
-        const std::string model = model_proto->SerializeAsString();
-        if (!subgraph_context.has_dynamic_input_shape) {
-          delete model_proto.release();
-        }
-        ov_model = global_context_.ie_core.Get().read_model(model, ov::Tensor());
-      }
-      exe_network_ = OVExeNetwork(global_context_.ie_core.Get().compile_model(ov_model, hw_target, device_config));
-    } else if (!global_context_.has_external_weights &&
-               (!subgraph_context_.has_dynamic_input_shape) &&
-               ((hw_target.find("AUTO") == std::string::npos) ||
-                (global_context_.OpenVINO_Version.at(0) >= 2024 && global_context_.OpenVINO_Version.at(1) > 2))) {
-      // Optimized OV compile_model API is supported with AUTO from version 2024.3 and above
-      // Inputs with static dimenstions
+    auto auto_unified_compile = ((hw_target.find("AUTO") == std::string::npos) ||
+                                 (session_context_.OpenVINO_Version.at(0) >= 2024 &&
+                                  session_context_.OpenVINO_Version.at(1) > 2));
+    if (subgraph_context_.is_ep_ctx_graph) {
+      // If the blob is held in an EPContext node, then skip FE+Compile
+      // and directly move on to creating a backend with the executable blob
+      exe_network_ = OVCore::ImportModel(*model_stream,
+                                         hw_target,
+                                         device_config,
+                                         subgraph_context_.subgraph_name);
+      model_stream.reset();  // Delete stream after it is no longer needed
+    } else if (!session_context_.has_external_weights &&
+               !subgraph_context_.has_dynamic_input_shape &&
+               !session_context_.so_context_enable &&
+               auto_unified_compile) {
+      // Unified OV compile_model is efficient when ov model caching is enabled
+      // Unified OV compile_model API is supported with AUTO from version 2024.3 and above
+      // Inputs with static dimensions
+      // Not enabled for models with external weights and when ep context is set.
+      const std::string model = model_proto->SerializeAsString();
+      exe_network_ = OVCore::CompileModel(model,
+                                          hw_target,
+                                          device_config,
+                                          subgraph_context_.subgraph_name);
+    } else {  // For all other types use ov::ov_core read_model() to generate OV IR
+              // followed by ov::ov_core compile_model()
+      std::shared_ptr<const OVNetwork> ov_model;
+      {
         const std::string model = model_proto->SerializeAsString();
-        exe_network_ = global_context_.ie_core.CompileModel(model,
-                                                            hw_target,
-                                                            device_config,
-                                                            subgraph_context_.subgraph_name);
-    } else {  // For all other types use ov::Model Type
-      auto ov_model = CreateOVModel(*model_proto, global_context_, const_outputs_map_);
-      exe_network_ = global_context_.ie_core.CompileModel(
-          ov_model, hw_target, device_config, subgraph_context_.subgraph_name);
+        if (!subgraph_context.has_dynamic_input_shape) {
+          delete model_proto.release();
+        }
+        ov_model = CreateOVModel(std::move(model), session_context_, const_outputs_map_);
       }
-#endif
-    } else {  // Full graph is not supported
-      auto ov_model = CreateOVModel(*model_proto, global_context_, const_outputs_map_);
-      exe_network_ = global_context_.ie_core.CompileModel(
+      exe_network_ = OVCore::CompileModel(
           ov_model, hw_target, device_config, subgraph_context_.subgraph_name);
     }
+#endif
     LOGS_DEFAULT(INFO) << log_tag << "Loaded model to the plugin";
   } catch (const char* msg) {
     ORT_THROW(msg);
   }
-  size_t num_infer_req = (global_context_.num_of_threads > 0) ? global_context_.num_of_threads : 1;
-  inferRequestsQueue_ = std::unique_ptr<InferRequestsQueue>(new InferRequestsQueue(exe_network_, num_infer_req));
+
+  int num_infer_req = (session_context_.num_of_threads > 0) ? session_context_.num_of_threads : 1;
+  std::function<void(OVInferRequestPtr)> initializer = [](OVInferRequestPtr) {};
+  auto metadata = shared_context_.shared_weights.metadata;
+  if (session_context_.so_share_ep_contexts) {
+    initializer = [&metadata](OVInferRequestPtr ir_ptr) {
+      const auto input_count = ir_ptr->GetNumInputs();
+      for (auto i = 0u; i < input_count; i++) {
+        using Key = SharedContext::SharedWeights::Metadata::Key;
+        const auto tensor_key = Key{ir_ptr->GetInputTensorName(i)};
+        if (metadata.contains(tensor_key)) {
+          auto& value = metadata.at(tensor_key);
+          ir_ptr->SetTensor(tensor_key.name, value.tensor);
+        }
+      }
+    };
+  }
+  inferRequestsQueue_ = std::unique_ptr<InferRequestsQueue>(new InferRequestsQueue(exe_network_, num_infer_req, std::move(initializer)));
 }

 bool BasicBackend::ValidateSubgraph(std::map<std::string, std::shared_ptr<ov::Node>>& const_outputs_map) {
@@ -146,21 +157,21 @@ bool BasicBackend::ValidateSubgraph(std::map<std::string, std::shared_ptr<ov::No
-  if (global_context_.precision_str.find("ACCURACY") != std::string::npos &&
-      global_context_.device_type.find("GPU") != std::string::npos) {
-    if (global_context_.OpenVINO_Version.at(0) >= 2024) {
+  if (session_context_.precision.find("ACCURACY") != std::string::npos &&
+      session_context_.device_type.find("GPU") != std::string::npos) {
+    if (session_context_.OpenVINO_Version.at(0) >= 2024) {
       device_config.emplace(ov::hint::inference_precision(ov::element::undefined));
       device_config.emplace(ov::hint::execution_mode(ov::hint::ExecutionMode::ACCURACY));
     } else {
-      if (global_context_.model_precision != "")
-        device_config.emplace(ov::hint::inference_precision(global_context_.model_precision));
+      if (!subgraph_context_.model_precision.empty())
+        device_config.emplace(ov::hint::inference_precision(subgraph_context_.model_precision));
     }
   }
 #ifndef NDEBUG
@@ -171,10 +182,10 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) {

   // Set a priority level for the current workload for preemption; default priority is "DEFAULT"
   // CPU Plugin doesn't support workload priority
-  if (global_context_.device_type.find("CPU") == std::string::npos)
-    device_config.emplace(ov::hint::model_priority(global_context_.model_priority));
+  if (session_context_.device_type.find("CPU") == std::string::npos)
+    device_config.emplace(ov::hint::model_priority(session_context_.model_priority));

-  if (global_context_.device_type.find("NPU") != std::string::npos) {
+  if (session_context_.device_type.find("NPU") != std::string::npos) {
     std::pair<std::string, ov::Any> device_property;
     device_property = std::make_pair("NPU_COMPILER_TYPE", "DRIVER");
@@ -184,16 +195,16 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) {
     }
     device_config.emplace(ov::device::properties("NPU", device_property));
 #if (((OPENVINO_VERSION_MAJOR == 2024) && (OPENVINO_VERSION_MINOR > 3)) || (OPENVINO_VERSION_MAJOR > 2024))
-    if (global_context_.export_ep_ctx_blob) {
-      global_context_.ie_core.Get().set_property("NPU", ov::intel_npu::bypass_umd_caching(true));
+    if (session_context_.so_context_enable) {
+      OVCore::Get().set_property("NPU", ov::intel_npu::bypass_umd_caching(true));
     }
 #endif
   }

-  if (!global_context_.load_config.empty()) {
-    const std::map<std::string, ov::AnyMap>& target_config = global_context_.load_config;
+  if (!session_context_.load_config.empty()) {
+    const std::map<std::string, ov::AnyMap>& target_config = session_context_.load_config;

-    if (global_context_.device_type.find("NPU") != std::string::npos) {
+    if (session_context_.device_type.find("NPU") != std::string::npos) {
       auto npuw_config = target_config.at("NPU");

       // Check if "NPU_USE_NPUW" exists and is set to "YES"
@@ -253,7 +264,7 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) {
           continue;
         }
         if (is_supported_and_mutable(key, supported_properties)) {
-          global_context_.ie_core.Get().set_property(device, ov::AnyMap{{key, value}});
+          OVCore::Get().set_property(device, ov::AnyMap{{key, value}});
         } else {
           LOGS_DEFAULT(WARNING) << "WARNING: Property \"" << key << "\" is either unsupported in current OpenVINO version"
@@ -264,50 +275,44 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) {
     };

     // Check if the device type is AUTO, HETERO, or MULTI
-    if (global_context_.device_type.find("AUTO") == 0 ||
-        global_context_.device_type.find("HETERO") == 0 ||
-        global_context_.device_type.find("MULTI") == 0) {
+    if (session_context_.device_type.find("AUTO") == 0 ||
+        session_context_.device_type.find("HETERO") == 0 ||
+        session_context_.device_type.find("MULTI") == 0) {
       // Parse individual devices (e.g., "AUTO:CPU,GPU" -> ["CPU", "GPU"])
-      auto individual_devices = parse_individual_devices(global_context_.device_type);
+      auto individual_devices = parse_individual_devices(session_context_.device_type);

       // Set properties only for individual devices (e.g., "CPU", "GPU")
       for (const std::string& device : individual_devices) {
         if (target_config.count(device)) {
           // Get supported properties for each individual device
-          auto device_properties = global_context_.ie_core.Get().get_property(device, ov::supported_properties);
+          auto device_properties = OVCore::Get().get_property(device, ov::supported_properties);

           // Set properties for the device
           set_target_properties(device, target_config.at(device), device_properties);
         }
       }
     } else {
-      if (target_config.count(global_context_.device_type)) {
-        auto supported_properties = global_context_.ie_core.Get().get_property(global_context_.device_type,
-                                                                               ov::supported_properties);
-        set_target_properties(global_context_.device_type,
-                              target_config.at(global_context_.device_type), supported_properties);
+      if (target_config.count(session_context_.device_type)) {
+        auto supported_properties = OVCore::Get().get_property(session_context_.device_type,
+                                                               ov::supported_properties);
+        set_target_properties(session_context_.device_type,
+                              target_config.at(session_context_.device_type), supported_properties);
       }
     }
   }
 }

-void BasicBackend::EnableCaching(ov::AnyMap& device_config) {
+void BasicBackend::EnableCaching() {
   // cache_dir argument has no effect when working with an embed-mode EPContext Graph
-  if (is_ep_ctx_graph_) return;
+  if (subgraph_context_.is_ep_ctx_graph) return;

-  if (!global_context_.cache_dir.empty() && !global_context_.export_ep_ctx_blob) {
+  if (!session_context_.cache_dir.empty() && !session_context_.so_context_enable) {
     LOGS_DEFAULT(INFO) << log_tag << "Enables Caching";
-    if (global_context_.device_type.find("AUTO:GPU") != std::string::npos) {
-      std::pair<std::string, ov::Any> device_property;
-      device_property = std::make_pair("CACHE_DIR", global_context_.cache_dir);
-      device_config.emplace(ov::device::properties("GPU", device_property));
-    } else {
-      global_context_.ie_core.SetCache(global_context_.cache_dir);
-    }
+    OVCore::SetCache(session_context_.cache_dir.string());
   }
 }

 void BasicBackend::EnableGPUThrottling(ov::AnyMap& device_config) {
-  if (global_context_.enable_opencl_throttling == true &&
-      global_context_.device_type.find("GPU") != std::string::npos) {
+  if (session_context_.enable_opencl_throttling == true &&
+      session_context_.device_type.find("GPU") != std::string::npos) {
     LOGS_DEFAULT(INFO) << log_tag << "Enabled OpenCL queue throttling for GPU device";
     std::pair<std::string, ov::Any> device_property;
     device_property = std::make_pair("PLUGIN_THROTTLE", "1");
@@ -318,61 +323,56 @@ void BasicBackend::EnableGPUThrottling(ov::AnyMap& device_config) {
 void BasicBackend::EnableStreams() {
   // Return silently for NPU as it's currently treated as a read-only flag by the NPU plugin
   // and throws an exception for the same
-  if (global_context_.device_type.find("NPU") != std::string::npos)
+  if (session_context_.device_type.find("NPU") != std::string::npos)
     return;

   // Streams can be set only if the device is not one of AUTO, MULTI, or HETERO
   // Throw an exception if the user tries to set num_streams for these devices
-  if ((global_context_.device_type.find("MULTI") != std::string::npos) ||
-      (global_context_.device_type.find("HETERO") != std::string::npos) ||
-      (global_context_.device_type.find("AUTO") != std::string::npos)) {
-    if (global_context_.num_streams != 1) {
+  if ((session_context_.device_type.find("MULTI") != std::string::npos) ||
+      (session_context_.device_type.find("HETERO") != std::string::npos) ||
+      (session_context_.device_type.find("AUTO") != std::string::npos)) {
+    if (session_context_.num_streams != 1) {
       ORT_THROW(log_tag + "Cannot set NUM_STREAMS to " +
-                std::to_string(global_context_.num_streams) + " for device " + global_context_.device_type);
+                std::to_string(session_context_.num_streams) + " for device " + session_context_.device_type);
     }
     // Do nothing
   } else {
-    global_context_.ie_core.SetStreams(global_context_.device_type, global_context_.num_streams);
+    OVCore::SetStreams(session_context_.device_type, session_context_.num_streams);
   }
 }

 void BasicBackend::SetNumThreads(ov::AnyMap& device_config) {
   // inference_num_threads is applicable only for the CPU device
-  if (global_context_.device_type.find("CPU") != std::string::npos)
-    device_config.emplace(ov::inference_num_threads(static_cast<int>(global_context_.num_of_threads)));
+  if (session_context_.device_type.find("CPU") != std::string::npos)
+    device_config.emplace(ov::inference_num_threads(session_context_.num_of_threads));
 }
 // Starts an asynchronous inference request for data in slice indexed by batch_slice_idx on
 // an Infer Request indexed by infer_req_idx
 void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferRequestPtr infer_request) {
   try {
-    auto graph_input_info = exe_network_.Get().inputs();
-    int input_idx = 0;
-    for (auto input_info_iter = graph_input_info.begin();
-         input_info_iter != graph_input_info.end(); ++input_info_iter) {
-      auto input_names = input_info_iter->get_names();
-      std::string onnx_input_name;
-      std::string input_name;
-      // use names retrieved from original ONNX model to assign the right onnx input name for the graph
-      for (auto it = subgraph_context_.input_names.begin(); it != subgraph_context_.input_names.end(); ++it) {
-        if (it->second == input_idx) {
-          onnx_input_name = it->first;
+    auto ov_input_info = exe_network_.Get().inputs();
+
+    // Loop over subgraph original input names to find the correspondent OV input name
+    for (const auto& [onnx_input_name, onnx_input_index] : subgraph_context_.input_names) {
+      std::string input_name{};
+      uint32_t input_idx = 0;
+      for (uint32_t index = 0; const auto& ov_input : ov_input_info) {
+        if (ov_input.get_names().contains(onnx_input_name)) {
+          input_name = onnx_input_name;
+          input_idx = index;
           break;
         }
+        index++;
       }
-      // using the input name retrieved from ONNX original to match with the input names returned by OV tensors
-      if (input_names.find(onnx_input_name) != input_names.end()) {
-        input_name = std::move(onnx_input_name);
-      } else {
-        ORT_THROW(log_tag +
-                  "Input names mismatch between OpenVINO and ONNX. " + onnx_input_name +
+
+      ORT_ENFORCE(!input_name.empty(), log_tag,
+                  "Input names mismatch between OpenVINO and ONNX. ", onnx_input_name,
                   " doesn't exist in the list of OpenVINO input tensor names");
-      }
+
       size_t batch_slice_idx = 0;
       if (subgraph_context_.has_dynamic_input_shape &&
-          !global_context_.disable_dynamic_shapes &&
-          (global_context_.device_type.find("CPU") != std::string::npos ||
-           global_context_.device_type.find("GPU") != std::string::npos)) {
+          !session_context_.disable_dynamic_shapes &&
+          (session_context_.device_type.find("CPU") != std::string::npos ||
+           session_context_.device_type.find("GPU") != std::string::npos)) {
        auto tensor = context.GetInput(subgraph_context_.input_names.at(input_name));
        auto tensor_info = tensor.GetTensorTypeAndShapeInfo();
        auto tensor_shape = tensor_info.GetShape();
@@ -384,10 +384,10 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque
           input_tensor_shape[tensor_iter] = *i;
           tensor_iter += 1;
         }
-        const auto& input = graph_input_info.at(input_idx);
+        const auto& input = ov_input_info.at(input_idx);
         OVTensorPtr tensor_ptr;
         // avoid input copies on the CPU device
-        if (global_context_.device_type.find("CPU") != std::string::npos) {
+        if (session_context_.device_type.find("CPU") != std::string::npos) {
           tensor_ptr = std::make_shared<ov::Tensor>(input.get_element_type(), input_tensor_shape,
                                                     (void*)tensor_data);
         } else {
@@ -401,8 +401,8 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque
           ORT_THROW(msg);
         }
       } else {
-        if ((global_context_.device_type.find("CPU") != std::string::npos ||
-             global_context_.device_type.find("GPU") != std::string::npos)) {
+        if ((session_context_.device_type.find("CPU") != std::string::npos ||
+             session_context_.device_type.find("GPU") != std::string::npos)) {
           OVTensorPtr graph_input_blob;
           try {
             graph_input_blob = infer_request->GetTensor(input_name);
@@ -417,7 +417,7 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque
BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque if ((it == ort_ov_tensor_map.end()) || (it != ort_ov_tensor_map.end() && (it->second.ort_ptr != tensor.GetTensorRawData()))) { ov_tensor_data_t ov_tensor_data; - const auto& input = graph_input_info.at(input_idx); + const auto& input = ov_input_info.at(input_idx); ov_tensor_data.tensor_ptr = std::make_shared(input.get_element_type(), input.get_shape(), const_cast(tensor.GetTensorRawData())); @@ -432,9 +432,9 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque } } } - input_idx++; - } - if (global_context_.device_type.find("NPU") != std::string::npos) { + } // Loop subgraph original input names + + if (session_context_.device_type.find("NPU") != std::string::npos) { // Set the output blob as remote blob auto graph_output_info = exe_network_.Get().outputs(); auto output_idx = 0; @@ -628,8 +628,8 @@ void BasicBackend::CompleteAsyncInference(Ort::KernelContext& context, OVInferRe " doesn't exist in the " "list of OpenVINO output tensor names"); } - if ((global_context_.device_type.find("CPU") != std::string::npos || - global_context_.device_type.find("GPU") != std::string::npos)) { + if ((session_context_.device_type.find("CPU") != std::string::npos || + session_context_.device_type.find("GPU") != std::string::npos)) { try { graph_output_blob = infer_request->GetTensor(output_name); } catch (const char* msg) { @@ -703,8 +703,8 @@ void BasicBackend::Infer(OrtKernelContext* ctx) { OVInferRequestPtr infer_request; infer_request = inferRequestsQueue_->getIdleRequest(); #ifdef IO_BUFFER_ENABLED - if ((global_context_.device_type.find("GPU") != std::string::npos) && - (global_context_.context != nullptr) && global_context_.is_wholly_supported_graph) { + if ((session_context_.device_type.find("GPU") != std::string::npos) && + (session_context_.context != nullptr) && session_context_.is_wholly_supported_graph) { try { StartRemoteAsyncInference(context, infer_request); } catch (std::string const& msg) { @@ -748,7 +748,7 @@ void BasicBackend::Infer(OrtKernelContext* ctx) { #ifndef IO_BUFFER_ENABLED // Printing performance counts is disabled when IO_BUFFER_ENABLED if (openvino_ep::backend_utils::IsDebugEnabled()) { inferRequestsQueue_->printstatus(); // Printing the elements of infer_requests_ vector pool only in debug mode - std::string& hw_target = global_context_.device_type; + std::string& hw_target = session_context_.device_type; printPerformanceCounts(std::move(infer_request_), std::cout, hw_target); } #endif diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.h b/onnxruntime/core/providers/openvino/backends/basic_backend.h index 3fcf6e4384d52..7d905f4a1e2f7 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.h +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.h @@ -12,6 +12,7 @@ #include #include #include +#include #include "core/session/onnxruntime_cxx_api.h" #include "core/providers/openvino/contexts.h" @@ -30,11 +31,13 @@ class InferRequestsQueue; class BasicBackend : public IBackend { public: BasicBackend(std::unique_ptr& model_proto, - GlobalContext& global_context, + SessionContext& session_context, const SubGraphContext& subgraph_context, - EPCtxHandler& ep_ctx_handle); + SharedContext& shared_context, + ptr_stream_t& model_stream); void Infer(OrtKernelContext* context) override; + ~BasicBackend() override = default; ov::CompiledModel& GetOVCompiledModel() override { return exe_network_.Get(); } @@ -43,7 +46,7 @@ class 
BasicBackend : public IBackend { void PopulateCompiledDirectory(std::string, std::string&, std::string&, bool&); bool ValidateSubgraph(std::map>& const_outputs_map); void PopulateConfigValue(ov::AnyMap& device_config); - void EnableCaching(ov::AnyMap& device_config); + void EnableCaching(); void EnableGPUThrottling(ov::AnyMap& device_config); void EnableStreams(); void SetNumThreads(ov::AnyMap& device_config); @@ -55,13 +58,13 @@ class BasicBackend : public IBackend { void CompleteAsyncInference(Ort::KernelContext& context, std::shared_ptr infer_request); - GlobalContext& global_context_; + SessionContext& session_context_; SubGraphContext subgraph_context_; + SharedContext& shared_context_; mutable std::mutex compute_lock_; OVExeNetwork exe_network_; std::map> const_outputs_map_; std::unique_ptr inferRequestsQueue_; - bool is_ep_ctx_graph_{false}; #if defined IO_BUFFER_ENABLED OVRemoteContextPtr remote_context_; #endif @@ -72,10 +75,11 @@ class BasicBackend : public IBackend { class InferRequestsQueue { public: - InferRequestsQueue(OVExeNetwork& net, size_t nireq) { + InferRequestsQueue(OVExeNetwork& net, size_t nireq, std::function initializer) { OVInferRequestPtr infer_request; for (size_t id = 0; id < nireq; id++) { infer_request = std::make_shared(net.CreateInferRequest()); + initializer(infer_request); infer_requests_.push_back(infer_request); } } diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index 4f970bc7bc287..66fcb8025ad8d 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -7,50 +7,122 @@ #include #include #include +#include +#include +#include "core/common/common.h" #include "core/providers/openvino/ov_interface.h" namespace onnxruntime { namespace openvino_ep { +namespace fs = std::filesystem; + +struct SharedContext { + struct SharedWeights { + struct Metadata { + struct Key { + std::string name; + bool operator==(const Key&) const = default; + }; + struct Hash { + std::size_t operator()(const Key& key) const noexcept { + return std::hash()(key.name); + } + }; + struct Value { + std::string location; + unsigned int data_offset; + unsigned int size; + std::vector dimensions; + std::int32_t element_type; + std::shared_ptr tensor; + }; + using Map = std::unordered_map; + friend std::ostream& operator<<(std::ostream& right, const Metadata::Map& metadata); + friend std::istream& operator>>(std::istream& right, Metadata::Map& metadata); + }; + + struct WeightsFile { + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(WeightsFile); + WeightsFile() = delete; + explicit WeightsFile(std::filesystem::path filename); + + void load_weights(size_t file_offset, void* data, size_t size); + + private: + std::ifstream file_; + size_t weights_size_; + }; + + fs::path external_weight_filename; + std::unique_ptr mapped_weights; + Metadata::Map metadata; + } shared_weights; +}; + +using config_t = std::map; + +struct ProviderInfo { + std::string device_type{""}; // [device_type]: Overrides the accelerator hardware type and + // precision with these values at runtime. + std::string precision{""}; // [precision]: Sets the inference precision for execution. + // Supported precision for devices are + // CPU=FP32, GPU=FP32,FP16, NPU=FP16. + // Not setting precision will execute with optimized precision for + // best inference latency. set Precision=ACCURACY for executing + // models with input precision for best accuracy. 
+ uint32_t num_of_threads{0}; // [num_of_threads]: Overrides the accelerator default value of + // number of threads with this value at runtime. + config_t load_config{}; // JSON config map to load custom OV parameters. + fs::path cache_dir{""}; // [cache_dir]: specify the path to + // dump and load the blobs for the model caching/kernel caching + // (GPU) feature. If blob files are already present, + // it will be directly loaded. + std::string model_priority{"DEFAULT"}; // High-level OpenVINO model priority hint + // Defines what model should be provided with more performant + // bounded resource first + uint32_t num_streams{1}; // [num_streams]: Option that specifies the number of parallel + // inference requests to be processed on a given `device_type`. + // Overrides the accelerator default value of number of streams + // with this value at runtime. + void* context{nullptr}; // OpenCL context + bool enable_opencl_throttling{false}; // [enable_opencl_throttling]: Enables OpenCL queue throttling for + // GPU device (Reduces CPU Utilization when using GPU) + bool disable_dynamic_shapes{false}; // [disable_dynamic_shapes]: Rewrite dynamic shaped models to + // static shape at runtime and execute. + bool enable_qdq_optimizer{false}; // Enables QDQ pruning for efficient inference latency with NPU + bool so_context_enable{false}; // ORT session option + bool so_disable_cpu_ep_fallback{false}; // ORT session option + bool so_context_embed_mode{false}; // ORT session option + bool so_share_ep_contexts{false}; // ORT session option + fs::path so_context_file_path{}; // ORT session option +}; + // Holds context applicable to the entire EP instance. -struct GlobalContext { - OVCore ie_core; - bool is_wholly_supported_graph = false; - bool enable_opencl_throttling = false; - bool disable_dynamic_shapes = false; - bool ep_context_embed_mode = false; - bool export_ep_ctx_blob = false; - bool enable_qdq_optimizer = false; - bool disable_cpu_fallback = false; - bool has_external_weights = false; - size_t num_of_threads; - std::string device_type; - std::string precision_str; - std::string model_precision; - std::string cache_dir; - std::map load_config; - std::string model_priority = "DEFAULT"; - int num_streams; +struct SessionContext : ProviderInfo { + SessionContext(const ProviderInfo& info) : ProviderInfo{info} {} std::vector deviceAvailableList = {true, true, true, true, true, true, true, true}; - std::string onnx_model_name; - std::string onnx_model_path_name; - int onnx_opset_version; - void* context = 0; - bool use_api_2; - std::vector OpenVINO_Version = {}; // Ov Major and OV minor version from OV headers + std::filesystem::path onnx_model_path_name; + uint32_t onnx_opset_version{0}; + mutable bool is_wholly_supported_graph = false; // Value is set to mutable to modify from capability + mutable bool has_external_weights = false; // Value is set to mutable to modify from capability + const std::vector OpenVINO_Version = {OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR}; + const std::string openvino_sdk_version = std::to_string(OPENVINO_VERSION_MAJOR) + "." + std::to_string(OPENVINO_VERSION_MINOR); }; // Holds context specific to subgraph. 
struct SubGraphContext { + using string_index_map_t = std::unordered_map; bool has_dynamic_input_shape = false; bool enable_batching = false; bool set_npu_config = false; bool is_constant = false; void* context = 0; std::string subgraph_name; - std::vector input_indexes; - std::unordered_map input_names; - std::unordered_map output_names; + string_index_map_t input_names; + string_index_map_t output_names; + std::string model_precision; + bool is_ep_ctx_graph = false; }; } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/ibackend.h b/onnxruntime/core/providers/openvino/ibackend.h index 7a2d6f4e8cd69..04d1f52cbf834 100644 --- a/onnxruntime/core/providers/openvino/ibackend.h +++ b/onnxruntime/core/providers/openvino/ibackend.h @@ -4,6 +4,7 @@ #pragma once #include +#include #define ORT_API_MANUAL_INIT #include "core/session/onnxruntime_cxx_api.h" #include "core/providers/openvino/onnx_ctx_model_helper.h" @@ -15,15 +16,17 @@ class IBackend { public: virtual void Infer(OrtKernelContext* context) = 0; virtual ov::CompiledModel& GetOVCompiledModel() = 0; + virtual ~IBackend() = default; }; - +using ptr_stream_t = std::unique_ptr; class BackendFactory { public: static std::shared_ptr MakeBackend(std::unique_ptr& model_proto, - GlobalContext& global_context, + SessionContext& session_context, const SubGraphContext& subgraph_context, - EPCtxHandler& ctx_handle); + SharedContext& shared_context, + ptr_stream_t& model_stream); }; } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc index 6d159db3b390d..7bd4f8d96cc55 100644 --- a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc +++ b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc @@ -11,25 +11,45 @@ namespace onnxruntime { namespace openvino_ep { +EPCtxHandler::EPCtxHandler(std::string ov_sdk_version, const logging::Logger& logger) : openvino_sdk_version_(std::move(ov_sdk_version)), logger_(logger) { + epctx_model_ = Model::Create("ovep_context_model", false, logger_); +} + /* Export the serialized blob string embedded onto an EPContext Node * along with other metadata necessary to validate the graph on import */ -Status EPCtxHandler::ExportEPCtxModel(const GraphViewer& graph_viewer, - const std::string& graph_name, - const logging::Logger& logger, - const bool& ep_context_embed_mode, - std::string&& model_blob_str, - const std::string& openvino_sdk_version) const { - auto& metadata = graph_viewer.GetGraph().GetModel().MetaData(); - auto model_build = graph_viewer.CreateModel(logger, metadata); - auto& graph_build = model_build->MainGraph(); +Status EPCtxHandler::ExportEPCtxModel(const std::string& model_name) { + // Serialize modelproto to string + auto model_proto = epctx_model_->ToProto(); + model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); + + // Finally, dump the model + std::ofstream epctx_onnx_model(model_name, + std::ios::out | std::ios::trunc | std::ios::binary); + if (!epctx_onnx_model) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unable to create epctx onnx model file"); + } + + if (!model_proto->SerializeToOstream(epctx_onnx_model)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to serialize model to file"); + } + LOGS_DEFAULT(VERBOSE) << "[OpenVINO EP] Export blob as EPContext Node"; + + return Status::OK(); +} + +Status EPCtxHandler::AddOVEPCtxNodeToGraph(const GraphViewer& graph_viewer, + const std::string& graph_name, + const bool embed_mode, + 
std::string&& model_blob_str) const { + auto& graph = epctx_model_->MainGraph(); // Get graph inputs and outputs const auto& viewer_inputs = graph_viewer.GetInputs(); const auto& viewer_outputs = graph_viewer.GetOutputs(); std::vector inputs(viewer_inputs.size()), outputs(viewer_outputs.size()); - auto transform_f = [&](const onnxruntime::NodeArg* iter) { return &graph_build.GetOrCreateNodeArg(iter->Name(), iter->TypeAsProto()); }; + auto transform_f = [&](const onnxruntime::NodeArg* iter) { return &graph.GetOrCreateNodeArg(iter->Name(), iter->TypeAsProto()); }; auto fill_vectors = [transform_f](auto& src, auto& dst) { std::transform(src.begin(), src.end(), dst.begin(), transform_f); }; @@ -46,7 +66,7 @@ Status EPCtxHandler::ExportEPCtxModel(const GraphViewer& graph_viewer, auto embed_mode_attr = ONNX_NAMESPACE::AttributeProto::Create(); embed_mode_attr->set_name(EMBED_MODE); embed_mode_attr->set_type(onnx::AttributeProto_AttributeType_INT); - embed_mode_attr->set_i(ep_context_embed_mode); + embed_mode_attr->set_i(embed_mode); node_attributes->emplace(EMBED_MODE, std::move(*embed_mode_attr)); // ep context @@ -60,7 +80,7 @@ Status EPCtxHandler::ExportEPCtxModel(const GraphViewer& graph_viewer, auto sdk_version_attr = ONNX_NAMESPACE::AttributeProto::Create(); sdk_version_attr->set_name(EP_SDK_VER); sdk_version_attr->set_type(onnx::AttributeProto_AttributeType_STRING); - sdk_version_attr->set_s(openvino_sdk_version); + sdk_version_attr->set_s(openvino_sdk_version_); node_attributes->emplace(EP_SDK_VER, std::move(*sdk_version_attr)); // source @@ -70,73 +90,70 @@ Status EPCtxHandler::ExportEPCtxModel(const GraphViewer& graph_viewer, source_attr->set_s(kOpenVINOExecutionProvider); node_attributes->emplace(SOURCE, std::move(*source_attr)); } - // Create EP context node - graph_build.AddNode(graph_name, EPCONTEXT_OP, "", inputs, outputs, std::move(*node_attributes), kMSDomain); - ORT_ENFORCE(graph_build.Resolve().IsOK()); - { - // Serialize modelproto to string - auto model_proto = model_build->ToProto(); - model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); - - // Finally, dump the model - std::ofstream epctx_onnx_model(graph_name, - std::ios::out | std::ios::trunc | std::ios::binary); - if (!epctx_onnx_model) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unable to create epctx onnx model file"); - } + // Create EP context node + graph.AddNode(graph_name, EPCONTEXT_OP, "", inputs, outputs, std::move(*node_attributes), kMSDomain); - if (!model_proto->SerializeToOstream(epctx_onnx_model)) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to serialize model to file"); - } - } - LOGS_DEFAULT(VERBOSE) << "[OpenVINO EP] Export blob as EPContext Node"; + ORT_ENFORCE(graph.Resolve().IsOK()); return Status::OK(); } -Status EPCtxHandler::ImportBlobFromEPCtxModel(const GraphViewer& graph_viewer, bool& ep_context_embed_mode) { - auto node = graph_viewer.GetNode(0); +std::unique_ptr EPCtxHandler::GetModelBlobStream(const std::filesystem::path& so_context_file_path, const GraphViewer& graph_viewer) const { + auto first_index = *graph_viewer.GetNodesInTopologicalOrder().begin(); + auto node = graph_viewer.GetNode(first_index); + ORT_ENFORCE(node != nullptr); auto& attrs = node->GetAttributes(); - ORT_ENFORCE(attrs.count(EP_CACHE_CONTEXT) > 0); - - ep_cache_context_attribute_ = &attrs.at(EP_CACHE_CONTEXT); - ep_context_embed_mode = static_cast(attrs.at(EMBED_MODE).i()); - LOGS_DEFAULT(VERBOSE) << "[OpenVINO EP] Read blob from EPContext Node"; + ORT_ENFORCE(attrs.count(EP_CACHE_CONTEXT) == 
1); + const auto& ep_cache_context = attrs.at(EP_CACHE_CONTEXT).s(); - is_valid_ep_ctx_graph_ = true; - return Status::OK(); -} + ORT_ENFORCE(attrs.count(EMBED_MODE) == 1); + bool embed_mode = static_cast(attrs.at(EMBED_MODE).i()); -const std::string& EPCtxHandler::GetModelBlobStream() const { - static std::string empty; - if (ep_cache_context_attribute_ != nullptr) { - return ep_cache_context_attribute_->s(); + std::unique_ptr result; + if (embed_mode) { + result.reset((std::istream*)new std::istringstream(ep_cache_context)); } else { - return empty; + auto blob_filepath = so_context_file_path; + if (blob_filepath.empty() && !graph_viewer.ModelPath().empty()) { + blob_filepath = graph_viewer.ModelPath(); + } + blob_filepath = blob_filepath.parent_path() / ep_cache_context; + ORT_ENFORCE(std::filesystem::exists(blob_filepath), "Blob file not found: ", blob_filepath.string()); + result.reset((std::istream*)new std::ifstream(blob_filepath, std::ios_base::binary | std::ios_base::in)); } + LOGS_DEFAULT(VERBOSE) << "[OpenVINO EP] Read blob from EPContext Node"; + return result; } -bool EPCtxHandler::CheckForOVEPCtxNode(const GraphViewer& graph_viewer, std::string openvino_sdk_version) const { - for (int i = 0; i < graph_viewer.MaxNodeIndex(); ++i) { - auto node = graph_viewer.GetNode(i); - auto& attrs = node->GetAttributes(); - - // Check for correct Op Type, EP SOURCE, and SDK version - if (node != nullptr && node->OpType() == EPCONTEXT_OP) { - if (attrs.at(SOURCE).s() == kOpenVINOExecutionProvider) { - if (attrs.at(EP_SDK_VER).s() == openvino_sdk_version) { - return true; - } else { - ORT_THROW("[Invalid Graph] Versions of OpenVINO used to export blob (" + attrs.at(EP_SDK_VER).s() + - ") and current runtime (" + openvino_sdk_version + ") don't match."); - } - } +bool EPCtxHandler::CheckForOVEPCtxNodeInGraph(const GraphViewer& graph_viewer) const { + if (graph_viewer.NumberOfNodes() == 1) { + auto first_index = *graph_viewer.GetNodesInTopologicalOrder().begin(); + if (auto node = graph_viewer.GetNode(first_index); (node != nullptr) && CheckForOVEPCtxNode(*node)) { + return true; } } return false; } +bool EPCtxHandler::CheckForOVEPCtxNode(const Node& node) const { + // Check for correct Op Type, EP SOURCE, and SDK version + if (node.OpType() == EPCONTEXT_OP) { + auto& attrs = node.GetAttributes(); + bool result = (attrs.count(SOURCE) == 1) && (attrs.at(SOURCE).s() == kOpenVINOExecutionProvider); + result &= (attrs.count(EP_SDK_VER) == 1) && (attrs.at(EP_SDK_VER).s() == openvino_sdk_version_); + result &= attrs.count(EMBED_MODE) == 1; + result &= attrs.count(EP_CACHE_CONTEXT) == 1; + return result; + } + return false; +} + +InlinedVector EPCtxHandler::GetEPCtxNodes() const { + const auto& epctx_nodes{epctx_model_->MainGraph().Nodes()}; + return InlinedVector(epctx_nodes.begin(), epctx_nodes.end()); +} + } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h index caab33b7db775..ff978bd6534d8 100644 --- a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h +++ b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h @@ -22,22 +22,22 @@ static const char SOURCE[] = "source"; class EPCtxHandler { public: - EPCtxHandler() = default; - EPCtxHandler(const EPCtxHandler&) = delete; - Status ExportEPCtxModel(const GraphViewer& graph_viewer, - const std::string& graph_name, - const logging::Logger& logger, - const bool& ep_context_embed_mode, - std::string&& 
model_blob_str, - const std::string& openvino_sdk_version) const; - Status ImportBlobFromEPCtxModel(const GraphViewer& graph_viewer, bool& ep_context_embed_mode); - bool CheckForOVEPCtxNode(const GraphViewer& graph_viewer, std::string openvino_sdk_version) const; - bool IsValidOVEPCtxGraph() const { return is_valid_ep_ctx_graph_; } - const std::string& GetModelBlobStream() const; + EPCtxHandler(std::string ov_sdk_version, const logging::Logger& logger); + EPCtxHandler(const EPCtxHandler&) = delete; // No copy constructor + Status ExportEPCtxModel(const std::string& model_name); + bool CheckForOVEPCtxNodeInGraph(const GraphViewer& graph_viewer) const; + bool CheckForOVEPCtxNode(const Node& node) const; + Status AddOVEPCtxNodeToGraph(const GraphViewer& graph_viewer, + const std::string& graph_name, + const bool embed_mode, + std::string&& model_blob_str) const; + std::unique_ptr GetModelBlobStream(const std::filesystem::path& so_context_file_path, const GraphViewer& graph_viewer) const; + InlinedVector GetEPCtxNodes() const; private: - bool is_valid_ep_ctx_graph_{false}; - const onnx::AttributeProto* ep_cache_context_attribute_; + const std::string openvino_sdk_version_; + std::unique_ptr epctx_model_; + const logging::Logger& logger_; }; } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index 72a188108adef..22477611ce25b 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -11,62 +11,135 @@ #include "core/providers/openvino/backend_manager.h" #include "core/providers/openvino/onnx_ctx_model_helper.h" #include "core/providers/openvino/ov_versions/capability.h" +#include "core/providers/openvino/qdq_transformations/qdq_stripping.h" #include "core/session/onnxruntime_session_options_config_keys.h" #include "openvino/core/version.hpp" #ifdef USE_OVEP_NPU_MEMORY #include "core/providers/openvino/ov_allocator.h" #endif -#define MEMCPY_S(dest, src, destsz, srcsz) memcpy(dest, src, std::min(destsz, srcsz)) - namespace onnxruntime { +namespace openvino_ep { -OpenVINOExecutionProvider::OpenVINOExecutionProvider(const OpenVINOExecutionProviderInfo& info) - : IExecutionProvider{onnxruntime::kOpenVINOExecutionProvider} { - InitProviderOrtApi(); +// Parking this code here for now before it's moved to the factory +#if defined OPENVINO_CONFIG_HETERO || defined OPENVINO_CONFIG_MULTI || defined OPENVINO_CONFIG_AUTO +static std::vector parseDevices(const std::string& device_string, + const std::vector& available_devices) { + std::string comma_separated_devices = device_string; + if (comma_separated_devices.find(":") != std::string::npos) { + comma_separated_devices = comma_separated_devices.substr(comma_separated_devices.find(":") + 1); + } + auto devices = split(comma_separated_devices, ','); + if (devices.size() < 2) { + print_build_options(); + ORT_THROW("Invalid device string: " + device_string); + } + std::set dev_options = {"CPU", "GPU", "NPU"}; + + for (auto& device : available_devices) { + if (dev_options.find(device) == dev_options.end()) { + auto dev_options_update = dev_options.emplace(device); + } + } + + for (const std::string& dev : devices) { + if (!std::count(dev_options.begin(), dev_options.end(), dev)) { + print_build_options(); + ORT_THROW("Invalid device string: " + device_string); + } + } + return devices; +} +#endif + +// Parking this code here for now before 
it's moved to the factory +void AdjustProviderInfo(ProviderInfo& info) { + std::set ov_supported_device_types = {"CPU", "GPU", + "GPU.0", "GPU.1", "NPU"}; + + std::vector available_devices = OVCore::GetAvailableDevices(); + + for (auto& device : available_devices) { + if (ov_supported_device_types.find(device) == ov_supported_device_types.end()) { + ov_supported_device_types.emplace(device); + } + } - global_context_ = std::make_unique(); - global_context_->device_type = info.device_type_; - global_context_->precision_str = info.precision_; - global_context_->cache_dir = info.cache_dir_; - global_context_->load_config = info.load_config_; - global_context_->model_priority = info.model_priority_; - global_context_->num_streams = info.num_streams_; - global_context_->context = info.context_; - global_context_->enable_opencl_throttling = info.enable_opencl_throttling_; - global_context_->disable_dynamic_shapes = info.disable_dynamic_shapes_; - global_context_->num_of_threads = info.num_of_threads_; - global_context_->OpenVINO_Version = {OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR}; - global_context_->export_ep_ctx_blob = info.export_ep_ctx_blob_; - global_context_->enable_qdq_optimizer = info.enable_qdq_optimizer_; - global_context_->disable_cpu_fallback = info.disable_cpu_fallback_; - global_context_->ep_context_embed_mode = info.so_epctx_embed_mode_; + if (info.device_type == "") { + LOGS_DEFAULT(INFO) << "[OpenVINO-EP]" + << "No runtime device selection option provided."; +#if defined OPENVINO_CONFIG_CPU + info.device_type = "CPU"; + info.precision = "FP32"; +#elif defined OPENVINO_CONFIG_GPU + info.device_type = "GPU"; + info.precision = "FP16"; +#elif defined OPENVINO_CONFIG_NPU + info.device_type = "NPU"; + info.precision = "FP16"; +#elif defined OPENVINO_CONFIG_HETERO || defined OPENVINO_CONFIG_MULTI || defined OPENVINO_CONFIG_AUTO +#ifdef DEVICE_NAME +#define DEVICE DEVICE_NAME +#endif + std::string dev_type = DEVICE; + + if (dev_type.find("HETERO") == 0 || dev_type.find("MULTI") == 0 || dev_type.find("AUTO") == 0) { + std::vector devices = parseDevices(dev_type, available_devices); + info.precision = "FP16"; + if (devices[0] == "CPU") { + info.precision = "FP32"; + } + info.device_type = std::move(dev_type); + } +#endif + } else if (ov_supported_device_types.find(info.device_type) != ov_supported_device_types.end()) { + // device_type is already a supported device; keep it as-is. + } +#if defined OPENVINO_CONFIG_HETERO || defined OPENVINO_CONFIG_MULTI || defined OPENVINO_CONFIG_AUTO + else if (info.device_type.find("HETERO") == 0 || info.device_type.find("MULTI") == 0 || info.device_type.find("AUTO") == 0) { + std::ignore = parseDevices(info.device_type, available_devices); // validate only; device_type is kept as-is + } +#endif + else { + ORT_THROW("Invalid device string: " + info.device_type); + } + LOGS_DEFAULT(INFO) << "[OpenVINO-EP]" + << "Choosing Device: " << info.device_type << " , Precision: " << info.precision; +} + +OpenVINOExecutionProvider::OpenVINOExecutionProvider(const ProviderInfo& info, SharedContext& shared_context) + : IExecutionProvider{onnxruntime::kOpenVINOExecutionProvider}, + session_context_(info), + shared_context_{shared_context}, + ep_ctx_handle_{session_context_.openvino_sdk_version, *GetLogger()} { + InitProviderOrtApi(); // to check if target device is available - // using ie_core capability GetAvailableDevices to fetch list of devices plugged in - if (info.cache_dir_.empty()) { + // using OVCore capability GetAvailableDevices to fetch 
list of devices plugged in + if (info.cache_dir.empty()) { bool device_found = false; - std::vector available_devices = global_context_->ie_core.GetAvailableDevices(); + std::vector available_devices = OVCore::GetAvailableDevices(); // Checking for device_type configuration - if (info.device_type_ != "") { - if (info.device_type_.find("HETERO") != std::string::npos || - info.device_type_.find("MULTI") != std::string::npos || - info.device_type_.find("AUTO") != std::string::npos) { + if (info.device_type != "") { + if (info.device_type.find("HETERO") != std::string::npos || + info.device_type.find("MULTI") != std::string::npos || + info.device_type.find("AUTO") != std::string::npos) { device_found = true; } else { for (const std::string& device : available_devices) { - if (device.rfind(info.device_type_, 0) == 0) { - if (info.device_type_.find("GPU") != std::string::npos && (info.precision_ == "FP32" || - info.precision_ == "FP16" || - info.precision_ == "ACCURACY")) { + if (device.rfind(info.device_type, 0) == 0) { + if (info.device_type.find("GPU") != std::string::npos && (info.precision == "FP32" || + info.precision == "FP16" || + info.precision == "ACCURACY")) { device_found = true; break; } - if (info.device_type_ == "CPU" && (info.precision_ == "FP32")) { + if (info.device_type == "CPU" && (info.precision == "FP32")) { device_found = true; break; } - if (info.device_type_.find("NPU") != std::string::npos) { + if (info.device_type.find("NPU") != std::string::npos) { device_found = true; break; } @@ -75,99 +148,101 @@ OpenVINOExecutionProvider::OpenVINOExecutionProvider(const OpenVINOExecutionProv } } if (!device_found) { - ORT_THROW("[ERROR] [OpenVINO] Specified device - " + info.device_type_ + " is not available"); + ORT_THROW("[ERROR] [OpenVINO] Specified device - " + info.device_type + " is not available"); } } } +OpenVINOExecutionProvider::~OpenVINOExecutionProvider() { + for (auto& backend_manager : backend_managers_) { + backend_manager.ShutdownBackendManager(); + } + backend_managers_.clear(); +} + std::vector> OpenVINOExecutionProvider::GetCapability(const GraphViewer& graph_viewer, const IKernelLookup& /*kernel_lookup*/) const { std::vector> result; - std::string openvino_sdk_version = std::to_string(global_context_->OpenVINO_Version.at(0)) + "." 
+ - std::to_string(global_context_->OpenVINO_Version.at(1)); - - // Check for valid ctx node and maintain state for validity - if (ep_ctx_handle_.CheckForOVEPCtxNode(graph_viewer, std::move(openvino_sdk_version))) - ORT_ENFORCE(graph_viewer.NumberOfNodes() == 1, - "[Invalid Graph] EPContext Model with OpenVINO compiled blob should not have more than one node."); - // Enable CI Logs if (!(GetEnvironmentVar("ORT_OPENVINO_ENABLE_CI_LOG").empty())) { std::cout << "In the OpenVINO EP" << std::endl; } - global_context_->onnx_model_path_name = graph_viewer.ModelPath().string(); - - global_context_->onnx_opset_version = - graph_viewer.DomainToVersionMap().at(kOnnxDomain); - - global_context_->model_precision = [&](const GraphViewer& graph_viewer) { - // return empty if graph has no inputs or if types are not one of FP32/FP16 - // else assume the type of the first input - if (graph_viewer.GetInputs().empty()) { - return ""; - } else { - auto input_type = graph_viewer.GetInputs()[0]->TypeAsProto()->tensor_type().elem_type(); - if (global_context_->precision_str == "ACCURACY" && - global_context_->device_type.find("GPU") != std::string::npos) { - if (input_type == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT) { - return "FP32"; - } else if (input_type == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT16) { - return "FP16"; - } - } - } - return ""; - }(graph_viewer); - - openvino_ep::GetCapability obj(graph_viewer, - global_context_->device_type, - global_context_->enable_qdq_optimizer); + openvino_ep::GetCapability obj(ep_ctx_handle_, + graph_viewer, + session_context_.device_type, + session_context_.enable_qdq_optimizer); result = obj.Execute(); - - global_context_->is_wholly_supported_graph = obj.IsWhollySupportedGraph(); - global_context_->has_external_weights = obj.HasExternalWeights(); - + session_context_.is_wholly_supported_graph = obj.IsWhollySupportedGraph(); + session_context_.has_external_weights = obj.HasExternalWeights(); return result; } common::Status OpenVINOExecutionProvider::Compile( const std::vector& fused_nodes, std::vector& node_compute_funcs) { + auto& logger = *GetLogger(); + Status status = Status::OK(); + + if (!fused_nodes.empty()) { + // Assume these properties are constant for all the model subgraphs, otherwise move to SubGraphContext + const auto& graph_body_viewer_0 = fused_nodes[0].filtered_graph.get(); + session_context_.onnx_model_path_name = graph_body_viewer_0.ModelPath().string(); + session_context_.onnx_opset_version = + graph_body_viewer_0.DomainToVersionMap().at(kOnnxDomain); + } + + // Temporary code to read metadata before it moves to the .bin + auto& metadata = shared_context_.shared_weights.metadata; + if (session_context_.so_share_ep_contexts && metadata.empty()) { + // Metadata is always read from model location, this could be a source or epctx model + fs::path metadata_filename = session_context_.onnx_model_path_name.parent_path() / "metadata.bin"; + std::ifstream file(metadata_filename, std::ios::binary); + if (file) { + file >> metadata; + } + } + + struct OpenVINOEPFunctionState { + AllocateFunc allocate_func = nullptr; + DestroyFunc destroy_func = nullptr; + AllocatorHandle allocator_handle = nullptr; + BackendManager& backend_manager; + }; + for (const FusedNodeAndGraph& fused_node_graph : fused_nodes) { const GraphViewer& graph_body_viewer = fused_node_graph.filtered_graph; const Node& fused_node = fused_node_graph.fused_node; NodeComputeInfo compute_info; - global_context_->use_api_2 = true; - // During backend 
creation, we check if user wants to use precompiled blob onnx model or the original model // For precompiled blob, directly load the model instead of compiling the model // For original model, check if the user wants to export a model with pre-compiled blob - std::shared_ptr backend_manager = - std::make_shared(*global_context_, - fused_node, - graph_body_viewer, - *GetLogger(), - ep_ctx_handle_); - backend_manager_ = backend_manager; + auto& backend_manager = backend_managers_.emplace_back(session_context_, + shared_context_, + fused_node, + graph_body_viewer, + logger, + ep_ctx_handle_); + compute_info.create_state_func = - [backend_manager](ComputeContext* context, FunctionState* state) { - OpenVINOEPFunctionState* p = new OpenVINOEPFunctionState(); - p->allocate_func = context->allocate_func; - p->destroy_func = context->release_func; - p->allocator_handle = context->allocator_handle; - p->backend_manager = backend_manager; + [&backend_manager](ComputeContext* context, FunctionState* state) { + OpenVINOEPFunctionState* p = new OpenVINOEPFunctionState{ + .allocate_func = context->allocate_func, + .destroy_func = context->release_func, + .allocator_handle = context->allocator_handle, + .backend_manager = backend_manager}; *state = static_cast(p); return 0; }; + compute_info.compute_func = [](FunctionState state, const OrtApi* /* api */, OrtKernelContext* context) { auto function_state = static_cast(state); try { - function_state->backend_manager->Compute(context); + function_state->backend_manager.Compute(context); } catch (const std::exception& ex) { return common::Status(common::ONNXRUNTIME, common::FAIL, ex.what()); } @@ -181,19 +256,42 @@ common::Status OpenVINOExecutionProvider::Compile( delete function_state; } }; - node_compute_funcs.push_back(compute_info); + + node_compute_funcs.push_back(std::move(compute_info)); + + if (!status.IsOK()) { + break; + } } - return Status::OK(); + if (session_context_.so_share_ep_contexts) { + fs::path metadata_filename; + if (session_context_.so_context_file_path.empty()) { + metadata_filename = session_context_.onnx_model_path_name.parent_path() / "metadata.bin"; + } else { + metadata_filename = session_context_.so_context_file_path.parent_path() / "metadata.bin"; + } + + // Metadata is generated only for shared contexts + // If saving metadata, save it to the provided path or use the original model path + // Multiple calls to Compile() will update the metadata and for the last call + // the resulting file will contain the aggregated content + std::ofstream file(metadata_filename, std::ios::binary); + if (file) { + file << metadata; + } + } + + return status; } #ifdef USE_OVEP_NPU_MEMORY std::vector OpenVINOExecutionProvider::CreatePreferredAllocators() { - if (global_context_->device_type.find("NPU") != std::string::npos) { + if (session_context_.device_type.find("NPU") != std::string::npos) { AllocatorCreationInfo npu_allocator_info{ [this](OrtDevice::DeviceId device_id) { return std::make_unique( - global_context_->ie_core.Get(), + OVCore::Get(), OrtDevice::NPU, device_id, OpenVINO_RT_NPU); @@ -232,8 +330,10 @@ common::Status OpenVINOExecutionProvider::SetEpDynamicOptions(gsl::spanGetOVCompiledModel(); - ov_compiled_model.set_property(ov::workload_type(workload_type)); + for (auto& backend : backend_managers_) { + ov::CompiledModel& ov_compiled_model = backend.GetOVCompiledModel(); + ov_compiled_model.set_property(ov::workload_type(workload_type)); + } } } else { // Handle unknown options @@ -242,4 +342,10 @@ common::Status 
OpenVINOExecutionProvider::SetEpDynamicOptions(gsl::span OpenVINOExecutionProvider::GetEpContextNodes() const { + return ep_ctx_handle_.GetEPCtxNodes(); +} + +} // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.h b/onnxruntime/core/providers/openvino/openvino_execution_provider.h index d5c22a4e2a9e4..75f4ef9f8ecc8 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.h +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.h @@ -13,15 +13,10 @@ #include #include "core/providers/openvino/backend_manager.h" +#include "core/providers/openvino/contexts.h" namespace onnxruntime { - -struct OVDevices { - ov::Core core; - std::vector get_ov_devices() const { - return core.get_available_devices(); - } -}; +namespace openvino_ep { static void print_build_options() { std::cout << "[ERROR] INVALID DEVICE BUILD TYPE SPECIFIED" << std::endl; @@ -47,139 +42,11 @@ static std::vector split(const std::string& s, char delim) { return result; } -static std::vector parseDevices(const std::string& device_string, - const std::vector& available_devices) { - std::string comma_separated_devices = device_string; - if (comma_separated_devices.find(":") != std::string::npos) { - comma_separated_devices = comma_separated_devices.substr(comma_separated_devices.find(":") + 1); - } - auto devices = split(comma_separated_devices, ','); - if (devices.size() < 2) { - print_build_options(); - ORT_THROW("Invalid device string: " + device_string); - } - std::set dev_options = {"CPU", "GPU", "NPU"}; - - for (auto& device : available_devices) { - if (dev_options.find(device) == dev_options.end()) { - auto dev_options_update = dev_options.emplace(device); - } - } - - for (const std::string& dev : devices) { - if (!std::count(dev_options.begin(), dev_options.end(), dev)) { - print_build_options(); - ORT_THROW("Invalid device string: " + device_string); - } - } - return devices; -} - -// Information needed to construct OpenVINO execution providers. 
-struct OpenVINOExecutionProviderInfo { - std::string device_type_{""}; - std::string precision_{""}; - size_t num_of_threads_{0}; - std::map load_config_{}; - std::string cache_dir_{""}; - std::string model_priority_{""}; - int num_streams_{1}; - void* context_{NULL}; - bool enable_opencl_throttling_{false}; - bool disable_dynamic_shapes_{false}; - bool export_ep_ctx_blob_{false}; - bool enable_qdq_optimizer_{false}; - bool disable_cpu_fallback_{false}; - bool so_epctx_embed_mode_{false}; - - OpenVINOExecutionProviderInfo() = delete; - - explicit OpenVINOExecutionProviderInfo(std::string dev_type, const std::string& precision, - size_t num_of_threads, - const std::map& load_config, - const std::string& cache_dir, - const std::string& model_priority, int num_streams, - void* context, bool enable_opencl_throttling, - bool disable_dynamic_shapes, bool export_ep_ctx_blob, - bool enable_qdq_optimizer, bool disable_cpu_fallback, - bool so_epctx_embed_mode) - : precision_(std::move(precision)), - num_of_threads_(num_of_threads), - load_config_(std::move(load_config)), - cache_dir_(std::move(cache_dir)), - model_priority_(std::move(model_priority)), - num_streams_(num_streams), - context_(context), - enable_opencl_throttling_(enable_opencl_throttling), - disable_dynamic_shapes_(disable_dynamic_shapes), - export_ep_ctx_blob_(export_ep_ctx_blob), - enable_qdq_optimizer_(enable_qdq_optimizer), - disable_cpu_fallback_(disable_cpu_fallback), - so_epctx_embed_mode_{so_epctx_embed_mode} { - std::set ov_supported_device_types = {"CPU", "GPU", - "GPU.0", "GPU.1", "NPU"}; - - OVDevices devices; - std::vector available_devices = devices.get_ov_devices(); - - for (auto& device : available_devices) { - if (ov_supported_device_types.find(device) == ov_supported_device_types.end()) { - ov_supported_device_types.emplace(device); - } - } - - if (dev_type == "") { - LOGS_DEFAULT(INFO) << "[OpenVINO-EP]" - << "No runtime device selection option provided."; -#if defined OPENVINO_CONFIG_CPU - device_type_ = "CPU"; - precision_ = "FP32"; -#elif defined OPENVINO_CONFIG_GPU - device_type_ = "GPU"; - precision_ = "FP16"; -#elif defined OPENVINO_CONFIG_NPU - device_type_ = "NPU"; - precision_ = "FP16"; -#elif defined OPENVINO_CONFIG_HETERO || defined OPENVINO_CONFIG_MULTI || defined OPENVINO_CONFIG_AUTO -#ifdef DEVICE_NAME -#define DEVICE DEVICE_NAME -#endif - dev_type = DEVICE; - - if (dev_type.find("HETERO") == 0 || dev_type.find("MULTI") == 0 || dev_type.find("AUTO") == 0) { - std::vector devices = parseDevices(dev_type, available_devices); - precision_ = "FP16"; - if (devices[0] == "CPU") { - precision_ = "FP32"; - } - device_type_ = std::move(dev_type); - } -#endif - } else if (ov_supported_device_types.find(dev_type) != ov_supported_device_types.end()) { - device_type_ = std::move(dev_type); - } else if (dev_type.find("HETERO") == 0 || dev_type.find("MULTI") == 0 || dev_type.find("AUTO") == 0) { - std::vector devices = parseDevices(dev_type, available_devices); - device_type_ = std::move(dev_type); - } else { - ORT_THROW("Invalid device string: " + dev_type); - } - LOGS_DEFAULT(INFO) << "[OpenVINO-EP]" - << "Choosing Device: " << device_type_ << " , Precision: " << precision_; - } -}; - -struct OpenVINOEPFunctionState { - AllocateFunc allocate_func = nullptr; - DestroyFunc destroy_func = nullptr; - AllocatorHandle allocator_handle = nullptr; - std::shared_ptr backend_manager; -}; - // Logical device representation. 
class OpenVINOExecutionProvider : public IExecutionProvider { public: - explicit OpenVINOExecutionProvider(const OpenVINOExecutionProviderInfo& info); - ~OpenVINOExecutionProvider() = default; + explicit OpenVINOExecutionProvider(const ProviderInfo& info, SharedContext& shared_context); + ~OpenVINOExecutionProvider(); std::vector> GetCapability(const GraphViewer& graph_viewer, @@ -194,13 +61,18 @@ class OpenVINOExecutionProvider : public IExecutionProvider { const void* GetExecutionHandle() const noexcept override { return nullptr; } + + const InlinedVector GetEpContextNodes() const override; + #ifdef USE_OVEP_NPU_MEMORY std::vector CreatePreferredAllocators() override; #endif private: - std::unique_ptr global_context_; - std::shared_ptr backend_manager_; - openvino_ep::EPCtxHandler ep_ctx_handle_{}; + SessionContext session_context_; + SharedContext& shared_context_; + std::list backend_managers_; // EP session owns the backend objects + EPCtxHandler ep_ctx_handle_; }; +} // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index 879d2399e68af..1c2d857b6252d 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -7,203 +7,212 @@ #include "core/providers/openvino/openvino_provider_factory.h" #include "core/providers/openvino/openvino_execution_provider.h" #include "core/providers/openvino/openvino_provider_factory_creator.h" +#include "core/providers/openvino/contexts.h" +#include "core/providers/openvino/backend_utils.h" #include "core/session/onnxruntime_session_options_config_keys.h" #include "nlohmann/json.hpp" namespace onnxruntime { -struct OpenVINOProviderFactory : IExecutionProviderFactory { - OpenVINOProviderFactory(const std::string& device_type, const std::string& precision, - size_t num_of_threads, - const std::map& load_config, const std::string& cache_dir, - const std::string& model_priority, int num_streams, void* context, - bool enable_opencl_throttling, bool disable_dynamic_shapes, - bool enable_qdq_optimizer, const ConfigOptions& config_options) - : device_type_(device_type), - precision_(precision), - num_of_threads_(num_of_threads), - load_config_(load_config), - cache_dir_(cache_dir), - model_priority_(model_priority), - num_streams_(num_streams), - context_(context), - enable_opencl_throttling_(enable_opencl_throttling), - disable_dynamic_shapes_(disable_dynamic_shapes), - enable_qdq_optimizer_(enable_qdq_optimizer), - config_options_(config_options) {} +namespace openvino_ep { +void ParseConfigOptions(ProviderInfo& pi, const ConfigOptions& config_options) { + pi.so_disable_cpu_ep_fallback = config_options.GetConfigOrDefault(kOrtSessionOptionsDisableCPUEPFallback, "0") == "1"; + pi.so_context_enable = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1"; + pi.so_context_embed_mode = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEmbedMode, "0") == "1"; + pi.so_share_ep_contexts = config_options.GetConfigOrDefault(kOrtSessionOptionShareEpContexts, "0") == "1"; + pi.so_context_file_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, ""); +} - ~OpenVINOProviderFactory() override {} +void* ParseUint64(const ProviderOptions& provider_options, std::string option_name) { + if (provider_options.contains(option_name)) { + uint64_t number = 
std::strtoull(provider_options.at(option_name).data(), nullptr, 16); + return reinterpret_cast(number); + } else { + return nullptr; + } +} - std::unique_ptr CreateProvider() override; +bool ParseBooleanOption(const ProviderOptions& provider_options, std::string option_name) { + if (provider_options.contains(option_name)) { + const auto& value = provider_options.at(option_name); + if (value == "true" || value == "True") { + return true; + } else if (value == "false" || value == "False") { + return false; + } else { + ORT_THROW("[ERROR] [OpenVINO-EP] ", option_name, " should be a boolean.\n"); + } + } + return false; +} - private: - std::string device_type_; - std::string precision_; - size_t num_of_threads_; - const std::map load_config_; - std::string cache_dir_; - std::string model_priority_; - int num_streams_; - void* context_; - bool enable_opencl_throttling_; - bool disable_dynamic_shapes_; - bool enable_qdq_optimizer_; - const ConfigOptions& config_options_; -}; +std::string ParseDeviceType(const ProviderOptions& provider_options, std::string option_name) { + const std::vector ov_available_devices = OVCore::GetAvailableDevices(); + + std::set ov_supported_device_types = {"CPU", "GPU", + "GPU.0", "GPU.1", "NPU"}; + std::set deprecated_device_types = {"CPU_FP32", "GPU_FP32", + "GPU.0_FP32", "GPU.1_FP32", "GPU_FP16", + "GPU.0_FP16", "GPU.1_FP16"}; + + // Expand set of supported device with OV devices + ov_supported_device_types.insert(ov_available_devices.begin(), ov_available_devices.end()); -std::unique_ptr OpenVINOProviderFactory::CreateProvider() { - bool so_disable_cpu_fallback = config_options_.GetConfigOrDefault(kOrtSessionOptionsDisableCPUEPFallback, "0") == "1"; - bool so_export_ep_ctx_blob = config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1"; - bool so_epctx_embed_mode = config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextEmbedMode, "0") == "1"; - std::string so_cache_path = config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "").c_str(); - - if (so_export_ep_ctx_blob && !so_cache_path.empty()) { - cache_dir_ = std::move(so_cache_path); - auto file_path = std::filesystem::path(cache_dir_); - // ep_context_file_path_ file extension must be .onnx - if (file_path.extension().generic_string() == ".onnx") { - // ep_context_file_path_ must be provided as a directory, create it if doesn't exist - auto parent_path = file_path.parent_path(); - if (!parent_path.empty() && !std::filesystem::is_directory(parent_path) && - !std::filesystem::create_directory(parent_path)) { - ORT_THROW("[ERROR] [OpenVINO] Failed to create directory : " + - file_path.parent_path().generic_string() + " \n"); + if (provider_options.contains(option_name)) { + const auto& selected_device = provider_options.at("device_type"); + + if (deprecated_device_types.contains(selected_device)) { + // Deprecated device and precision is handled together at ParsePrecision + return selected_device; + } + + if (!((ov_supported_device_types.contains(selected_device)) || + (selected_device.find("HETERO:") == 0) || + (selected_device.find("MULTI:") == 0) || + (selected_device.find("AUTO:") == 0))) { + ORT_THROW( + "[ERROR] [OpenVINO] You have selected wrong configuration value for the key 'device_type'. " + "Select from 'CPU', 'GPU', 'NPU', 'GPU.x' where x = 0,1,2 and so on or from" + " HETERO/MULTI/AUTO options available. 
\n"); + } + LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Choosing Device: " << selected_device; + return selected_device; + } else { + std::string default_device; + + // Take default behavior from project configuration +#if defined OPENVINO_CONFIG_CPU + default_device = "CPU"; +#elif defined OPENVINO_CONFIG_GPU + default_device = "GPU"; +#elif defined OPENVINO_CONFIG_NPU + default_device = "NPU"; +#elif defined OPENVINO_CONFIG_HETERO || defined OPENVINO_CONFIG_MULTI || defined OPENVINO_CONFIG_AUTO + default_device = DEVICE_NAME; + + // Validate that devices passed are valid + int delimit = device_type.find(":"); + const auto& devices = device_type.substr(delimit + 1); + auto device_list = split(devices, ','); + for (const auto& device : devices) { + if (!ov_supported_device_types.contains(device)) { + ORT_THROW("[ERROR] [OpenVINO] Invalid device selected: ", device); } - } else { - ORT_THROW("[ERROR] [OpenVINO] Invalid ep_ctx_file_path" + cache_dir_ + " \n"); } +#endif + + LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Choosing Device: " << default_device; + return default_device; } +} + +// Depends on ProviderOptions. +std::string ParsePrecision(const ProviderOptions& provider_options, std::string& device_type, const std::string& option_name) { + using DeviceName = std::string; + using DefaultValue = std::string; + using ValidValues = std::list; + using foo = std::pair; + using ParserHelper = std::map; + ParserHelper helper = { + {"GPU", {"FP16", {"FP16", "FP32"}}}, + {"NPU", {"FP16", {"FP16"}}}, + {"CPU", {"FP32", {"FP32"}}}, + }; + + std::set deprecated_device_types = {"CPU_FP32", "GPU_FP32", + "GPU.0_FP32", "GPU.1_FP32", "GPU_FP16", + "GPU.0_FP16", "GPU.1_FP16"}; + + if (provider_options.contains(option_name)) { + // Start by checking if the device_type is a normal valid one + if (helper.contains(device_type)) { + auto const& valid_values = helper[device_type].second; + const auto& precision = provider_options.at(option_name); + if (precision == "ACCURACY") { + return valid_values.back(); // Return highest supported precision + } else { + if (std::find(valid_values.begin(), valid_values.end(), precision) != valid_values.end()) { + return precision; // Return precision selected if valid + } else { + auto value_iter = valid_values.begin(); + std::string valid_values_joined = *value_iter; + // Append 2nd and up, if only one then ++value_iter is same as end() + for (++value_iter; value_iter != valid_values.end(); ++value_iter) { + valid_values_joined += ", " + *value_iter; + } - OpenVINOExecutionProviderInfo info(device_type_, precision_, num_of_threads_, load_config_, - cache_dir_, model_priority_, num_streams_, context_, enable_opencl_throttling_, - disable_dynamic_shapes_, so_export_ep_ctx_blob, enable_qdq_optimizer_, - so_disable_cpu_fallback, so_epctx_embed_mode); - return std::make_unique(info); + ORT_THROW("[ERROR] [OpenVINO] Unsupported inference precision is selected. ", device_type, " only supports", valid_values_joined, ".\n"); + } + } + } else if (deprecated_device_types.contains(device_type)) { + LOGS_DEFAULT(WARNING) << "[OpenVINO] Selected 'device_type' " + device_type + " is deprecated. \n" + << "Update the 'device_type' to specified types 'CPU', 'GPU', 'GPU.0', " + << "'GPU.1', 'NPU' or from" + << " HETERO/MULTI/AUTO options and set 'precision' separately. 
\n"; + int delimit = device_type.find("_"); + device_type = device_type.substr(0, delimit); + return device_type.substr(delimit + 1); + } + } + // Return default + return helper[device_type].first; } -} // namespace onnxruntime +void ParseProviderOptions([[maybe_unused]] ProviderInfo& result, [[maybe_unused]] const ProviderOptions& config_options) {} + +struct OpenVINOProviderFactory : IExecutionProviderFactory { + OpenVINOProviderFactory(ProviderInfo provider_info, SharedContext& shared_context) + : provider_info_(std::move(provider_info)), shared_context_(shared_context) {} + + ~OpenVINOProviderFactory() override {} + + std::unique_ptr CreateProvider() override { + return std::make_unique(provider_info_, shared_context_); + } + + private: + ProviderInfo provider_info_; + SharedContext& shared_context_; +}; -namespace onnxruntime { struct ProviderInfo_OpenVINO_Impl : ProviderInfo_OpenVINO { std::vector GetAvailableDevices() const override { - openvino_ep::OVCore ie_core; - return ie_core.GetAvailableDevices(); + return OVCore::GetAvailableDevices(); } -} g_info; +}; struct OpenVINO_Provider : Provider { - void* GetInfo() override { return &g_info; } + void* GetInfo() override { return &info_; } std::shared_ptr CreateExecutionProviderFactory(const void* void_params) override { // Extract the void_params into ProviderOptions and ConfigOptions - typedef std::pair ConfigBuffer; + using ConfigBuffer = std::pair; const ConfigBuffer* buffer = reinterpret_cast(void_params); - auto& provider_options_map = *buffer->first; - const ConfigOptions& config_options = buffer->second; - - std::string device_type = ""; // [device_type]: Overrides the accelerator hardware type and - // precision with these values at runtime. - std::string precision = ""; // [precision]: Sets the inference precision for execution. - // Supported precision for devices are - // CPU=FP32, GPU=FP32,FP16, NPU=FP16. - // Not setting precision will execute with optimized precision for - // best inference latency. set Precision=ACCURACY for executing - // models with input precision for best accuracy. - int num_of_threads = 0; // [num_of_threads]: Overrides the accelerator default value of - // number of threads with this value at runtime. - std::map load_config; // JSON config map to load custom OV parameters. - std::string cache_dir = ""; // [cache_dir]: specify the path to - // dump and load the blobs for the model caching/kernel caching - // (GPU) feature. If blob files are already present, - // it will be directly loaded. - std::string model_priority = "DEFAULT"; // High-level OpenVINO model priority hint - // Defines what model should be provided with more performant - // bounded resource first - int num_streams = 1; // [num_streams]: Option that specifies the number of parallel - // inference requests to be processed on a given `device_type`. - // Overrides the accelerator default value of number of streams - // with this value at runtime. 
-    bool enable_opencl_throttling = false;  // [enable_opencl_throttling]: Enables OpenCL queue throttling for
-                                            // GPU device (Reduces CPU Utilization when using GPU)
-
-    bool enable_qdq_optimizer = false;       // Enables QDQ pruning for efficient inference latency with NPU
-
-    void* context = nullptr;
+    const auto& provider_options = *buffer->first;
+    const auto& config_options = buffer->second;
+
+    ProviderInfo pi;
     std::string bool_flag = "";
-    if (provider_options_map.find("device_type") != provider_options_map.end()) {
-      device_type = provider_options_map.at("device_type").c_str();
-
-      std::set ov_supported_device_types = {"CPU", "GPU",
-                                            "GPU.0", "GPU.1", "NPU"};
-      std::set deprecated_device_types = {"CPU_FP32", "GPU_FP32",
-                                          "GPU.0_FP32", "GPU.1_FP32", "GPU_FP16",
-                                          "GPU.0_FP16", "GPU.1_FP16"};
-      OVDevices devices;
-      std::vector available_devices = devices.get_ov_devices();
-
-      for (auto& device : available_devices) {
-        if (ov_supported_device_types.find(device) == ov_supported_device_types.end()) {
-          ov_supported_device_types.emplace(device);
-        }
-      }
-      if (deprecated_device_types.find(device_type) != deprecated_device_types.end()) {
-        std::string deprecated_device = device_type;
-        auto delimit = device_type.find("_");
-        device_type = deprecated_device.substr(0, delimit);
-        precision = deprecated_device.substr(delimit + 1);
-        LOGS_DEFAULT(WARNING) << "[OpenVINO] Selected 'device_type' " + deprecated_device + " is deprecated. \n"
-                              << "Update the 'device_type' to specified types 'CPU', 'GPU', 'GPU.0', "
-                              << "'GPU.1', 'NPU' or from"
-                              << " HETERO/MULTI/AUTO options and set 'precision' separately. \n";
-      }
-      if (!((ov_supported_device_types.find(device_type) != ov_supported_device_types.end()) ||
-            (device_type.find("HETERO:") == 0) ||
-            (device_type.find("MULTI:") == 0) ||
-            (device_type.find("AUTO:") == 0))) {
-        ORT_THROW(
-            "[ERROR] [OpenVINO] You have selected wrong configuration value for the key 'device_type'. "
-            "Select from 'CPU', 'GPU', 'NPU', 'GPU.x' where x = 0,1,2 and so on or from"
-            " HETERO/MULTI/AUTO options available. \n");
-      }
-    }
-    if (provider_options_map.find("device_id") != provider_options_map.end()) {
-      std::string dev_id = provider_options_map.at("device_id").c_str();
+
+    pi.device_type = ParseDeviceType(provider_options, "device_type");
+
+    if (provider_options.contains("device_id")) {
+      std::string dev_id = provider_options.at("device_id").data();
       LOGS_DEFAULT(WARNING) << "[OpenVINO] The option 'device_id' is deprecated. "
                             << "Upgrade to set device_type and precision session options.\n";
       if (dev_id == "CPU" || dev_id == "GPU" || dev_id == "NPU") {
-        device_type = std::move(dev_id);
+        pi.device_type = std::move(dev_id);
       } else {
         ORT_THROW("[ERROR] [OpenVINO] Unsupported device_id is selected. Select from available options.");
       }
     }
-    if (provider_options_map.find("precision") != provider_options_map.end()) {
-      precision = provider_options_map.at("precision").c_str();
-    }
-    if (device_type.find("GPU") != std::string::npos) {
-      if (precision == "") {
-        precision = "FP16";
-      } else if (precision != "ACCURACY" && precision != "FP16" && precision != "FP32") {
-        ORT_THROW("[ERROR] [OpenVINO] Unsupported inference precision is selected. GPU only supports FP32 / FP16. \n");
-      }
-    } else if (device_type.find("NPU") != std::string::npos) {
-      if (precision == "" || precision == "ACCURACY" || precision == "FP16") {
-        precision = "FP16";
-      } else {
-        ORT_THROW("[ERROR] [OpenVINO] Unsupported inference precision is selected. NPU only supported FP16. \n");
-      }
-    } else if (device_type.find("CPU") != std::string::npos) {
-      if (precision == "" || precision == "ACCURACY" || precision == "FP32") {
-        precision = "FP32";
-      } else {
-        ORT_THROW("[ERROR] [OpenVINO] Unsupported inference precision is selected. CPU only supports FP32 . \n");
-      }
+    if (provider_options.contains("cache_dir")) {
+      pi.cache_dir = provider_options.at("cache_dir");
     }

-    if (provider_options_map.find("cache_dir") != provider_options_map.end()) {
-      cache_dir = provider_options_map.at("cache_dir");
-    }
+    pi.precision = ParsePrecision(provider_options, pi.device_type, "precision");

-    if (provider_options_map.find("load_config") != provider_options_map.end()) {
+    if (provider_options.contains("load_config")) {
       auto parse_config = [&](const std::string& config_str) -> std::map<std::string, ov::AnyMap> {
         // If the config string is empty, return an empty map and skip processing
         if (config_str.empty()) {
@@ -262,116 +271,96 @@ struct OpenVINO_Provider : Provider {
         return target_map;
       };

-      load_config = parse_config(provider_options_map.at("load_config"));
+      pi.load_config = parse_config(provider_options.at("load_config"));
     }

-    if (provider_options_map.find("context") != provider_options_map.end()) {
-      std::string str = provider_options_map.at("context");
-      uint64_t number = std::strtoull(str.c_str(), nullptr, 16);
-      context = reinterpret_cast(number);
+    pi.context = ParseUint64(provider_options, "context");
+#if defined(IO_BUFFER_ENABLED)
+    // a valid context must be provided to enable IO Buffer optimizations
+    if (pi.context == nullptr) {
+#undef IO_BUFFER_ENABLED
+#define IO_BUFFER_ENABLED = 0
+      LOGS_DEFAULT(WARNING) << "Context is not set. Disabling IO Buffer optimization";
     }
+#endif

-    if (provider_options_map.find("num_of_threads") != provider_options_map.end()) {
-      if (!std::all_of(provider_options_map.at("num_of_threads").begin(),
-                       provider_options_map.at("num_of_threads").end(), ::isdigit)) {
+    if (provider_options.contains("num_of_threads")) {
+      if (!std::all_of(provider_options.at("num_of_threads").begin(),
+                       provider_options.at("num_of_threads").end(), ::isdigit)) {
         ORT_THROW("[ERROR] [OpenVINO-EP] Number of threads should be a number. \n");
       }
-      num_of_threads = std::stoi(provider_options_map.at("num_of_threads"));
-      if (num_of_threads <= 0) {
-        num_of_threads = 1;
+      pi.num_of_threads = std::stoi(provider_options.at("num_of_threads"));
+      if (pi.num_of_threads <= 0) {
+        pi.num_of_threads = 1;
         LOGS_DEFAULT(WARNING) << "[OpenVINO-EP] The value for the key 'num_of_threads' should be in the positive range.\n "
                               << "Executing with num_of_threads=1";
       }
     }

-    if (provider_options_map.find("model_priority") != provider_options_map.end()) {
-      model_priority = provider_options_map.at("model_priority").c_str();
+    if (provider_options.contains("model_priority")) {
+      pi.model_priority = provider_options.at("model_priority").data();
       std::vector<std::string> supported_priorities({"LOW", "MEDIUM", "HIGH", "DEFAULT"});
       if (std::find(supported_priorities.begin(), supported_priorities.end(),
-                    model_priority) == supported_priorities.end()) {
-        model_priority = "DEFAULT";
+                    pi.model_priority) == supported_priorities.end()) {
+        pi.model_priority = "DEFAULT";
         LOGS_DEFAULT(WARNING) << "[OpenVINO-EP] The value for the key 'model_priority' "
                               << "is not one of LOW, MEDIUM, HIGH, DEFAULT. "
                               << "Executing with model_priority=DEFAULT";
       }
     }
-    if (provider_options_map.find("num_streams") != provider_options_map.end()) {
-      num_streams = std::stoi(provider_options_map.at("num_streams"));
-      if (num_streams <= 0) {
-        num_streams = 1;
+    if (provider_options.contains("num_streams")) {
+      pi.num_streams = std::stoi(provider_options.at("num_streams"));
+      if (pi.num_streams <= 0) {
+        pi.num_streams = 1;
         LOGS_DEFAULT(WARNING) << "[OpenVINO-EP] The value for the key 'num_streams' should be in the range of 1-8.\n "
                               << "Executing with num_streams=1";
       }
     }
-    if (provider_options_map.find("enable_opencl_throttling") != provider_options_map.end()) {
-      bool_flag = provider_options_map.at("enable_opencl_throttling");
-      if (bool_flag == "true" || bool_flag == "True")
-        enable_opencl_throttling = true;
-      else if (bool_flag == "false" || bool_flag == "False")
-        enable_opencl_throttling = false;
-      bool_flag = "";
-    }
+    pi.enable_opencl_throttling = ParseBooleanOption(provider_options, "enable_opencl_throttling");

-    if (provider_options_map.find("enable_qdq_optimizer") != provider_options_map.end()) {
-      bool_flag = provider_options_map.at("enable_qdq_optimizer");
-      if (bool_flag == "true" || bool_flag == "True")
-        enable_qdq_optimizer = true;
-      else if (bool_flag == "false" || bool_flag == "False")
-        enable_qdq_optimizer = false;
-      else
-        ORT_THROW("[ERROR] [OpenVINO-EP] enable_qdq_optimiser should be a boolean.\n");
-      bool_flag = "";
-    }
+    pi.enable_qdq_optimizer = ParseBooleanOption(provider_options, "enable_qdq_optimizer");

-    // [disable_dynamic_shapes]: Rewrite dynamic shaped models to static shape at runtime and execute.
-    // Always true for NPU plugin.
-    bool disable_dynamic_shapes = false;
-    if (device_type.find("NPU") != std::string::npos) {
-      disable_dynamic_shapes = true;
+    pi.disable_dynamic_shapes = ParseBooleanOption(provider_options, "disable_dynamic_shapes");
+
+    ParseConfigOptions(pi, config_options);
+
+    // disable_dynamic_shapes is always forced to true for the NPU plugin.
+ if (pi.device_type.find("NPU") != std::string::npos) { + pi.disable_dynamic_shapes = true; } - if (provider_options_map.find("disable_dynamic_shapes") != provider_options_map.end()) { - bool_flag = provider_options_map.at("disable_dynamic_shapes"); - if (bool_flag == "true" || bool_flag == "True") { - disable_dynamic_shapes = true; - } else if (bool_flag == "false" || bool_flag == "False") { - if (device_type.find("NPU") != std::string::npos) { - disable_dynamic_shapes = true; - LOGS_DEFAULT(INFO) << "[OpenVINO-EP] The value for the key 'disable_dynamic_shapes' will be set to " - << "TRUE for NPU backend.\n "; - } else { - disable_dynamic_shapes = false; - } - } - bool_flag = ""; + + // Append values to config to support weight-as-inputs conversion for shared contexts + if (pi.so_share_ep_contexts) { + ov::AnyMap map; + map["NPU_COMPILATION_MODE_PARAMS"] = "enable-wd-blockarg-input=true compute-layers-with-higher-precision=Sqrt,Power,ReduceSum"; + pi.load_config["NPU"] = std::move(map); } - return std::make_shared(device_type, - precision, - num_of_threads, - load_config, - cache_dir, - model_priority, - num_streams, - context, - enable_opencl_throttling, - disable_dynamic_shapes, - enable_qdq_optimizer, - config_options); + return std::make_shared(pi, shared_context_); } void Initialize() override { + OVCore::Initialize(); } void Shutdown() override { + backend_utils::DestroyOVTensors(shared_context_.shared_weights.metadata); + OVCore::Teardown(); } -} g_provider; + private: + SharedContext shared_context_; + ProviderInfo_OpenVINO_Impl info_; +}; // OpenVINO_Provider + +} // namespace openvino_ep } // namespace onnxruntime extern "C" { ORT_API(onnxruntime::Provider*, GetProvider) { - return &onnxruntime::g_provider; + static onnxruntime::openvino_ep::OpenVINO_Provider g_provider; + return &g_provider; } } diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index 12ab7ecede031..4c656bceff550 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -13,7 +13,16 @@ using Exception = ov::Exception; namespace onnxruntime { namespace openvino_ep { -const std::string log_tag = "[OpenVINO-EP] "; +static const std::string log_tag = "[OpenVINO-EP] "; +static std::unique_ptr g_core; + +void OVCore::Initialize() { + g_core = std::make_unique(); +} + +void OVCore::Teardown() { + g_core.reset(); +} #ifndef NDEBUG void printDebugInfo(const ov::CompiledModel& obj) { @@ -46,7 +55,7 @@ void printDebugInfo(const ov::CompiledModel& obj) { } #endif -std::shared_ptr OVCore::ReadModel(const std::string& model, const std::string& model_path) const { +std::shared_ptr OVCore::ReadModel(const std::string& model, const std::string& model_path) { try { std::istringstream modelStringStream(model); std::istream& modelStream = modelStringStream; @@ -77,7 +86,7 @@ OVExeNetwork OVCore::CompileModel(std::shared_ptr& ie_cnn_netwo const std::string& name) { ov::CompiledModel obj; try { - obj = oe.compile_model(ie_cnn_network, hw_target, device_config); + obj = Get().compile_model(ie_cnn_network, hw_target, device_config); #ifndef NDEBUG printDebugInfo(obj); #endif @@ -96,7 +105,7 @@ OVExeNetwork OVCore::CompileModel(const std::string& onnx_model, const std::string& name) { ov::CompiledModel obj; try { - obj = oe.compile_model(onnx_model, ov::Tensor(), hw_target, device_config); + obj = Get().compile_model(onnx_model, ov::Tensor(), hw_target, device_config); #ifndef NDEBUG 
printDebugInfo(obj); #endif @@ -109,22 +118,13 @@ OVExeNetwork OVCore::CompileModel(const std::string& onnx_model, } } -OVExeNetwork OVCore::ImportModel(const std::string& model_string, +OVExeNetwork OVCore::ImportModel(std::istream& model_stream, std::string hw_target, const ov::AnyMap& device_config, - bool embed_mode, std::string name) { try { ov::CompiledModel obj; - if (embed_mode) { - std::istringstream model_stream(model_string); - obj = oe.import_model(model_stream, hw_target, device_config); - } else { - std::ifstream modelStream(model_string, std::ios_base::binary | std::ios_base::in); - obj = oe.import_model(modelStream, - hw_target, - {}); - } + obj = Get().import_model(model_stream, hw_target, device_config); #ifndef NDEBUG printDebugInfo(obj); #endif @@ -138,7 +138,12 @@ OVExeNetwork OVCore::ImportModel(const std::string& model_string, } void OVCore::SetCache(const std::string& cache_dir_path) { - oe.set_property(ov::cache_dir(cache_dir_path)); + Get().set_property(ov::cache_dir(cache_dir_path)); +} + +inline ov::Core& OVCore::Get() { + ORT_ENFORCE(g_core); + return *g_core; } #ifdef IO_BUFFER_ENABLED @@ -174,12 +179,12 @@ OVExeNetwork OVCore::ImportModel(std::shared_ptr model_strea #endif std::vector OVCore::GetAvailableDevices() { - auto available_devices = oe.get_available_devices(); + auto available_devices = Get().get_available_devices(); return available_devices; } void OVCore::SetStreams(const std::string& device_type, int num_streams) { - oe.set_property(device_type, {ov::num_streams(num_streams)}); + Get().set_property(device_type, {ov::num_streams(num_streams)}); } OVInferRequest OVExeNetwork::CreateInferRequest() { @@ -206,7 +211,18 @@ OVTensorPtr OVInferRequest::GetTensor(const std::string& input_name) { } } -void OVInferRequest::SetTensor(std::string name, OVTensorPtr& blob) { +std::string OVInferRequest::GetInputTensorName(uint32_t index) { + try { + const auto& model = ovInfReq.get_compiled_model(); + return *model.input(index).get_names().begin(); + } catch (const Exception& e) { + ORT_THROW(log_tag + " Cannot access IE Blob for input number: ", index, e.what()); + } catch (...) 
{ + ORT_THROW(log_tag + " Cannot access IE Blob for input number: ", index); + } +} + +void OVInferRequest::SetTensor(const std::string& name, OVTensorPtr& blob) { try { ovInfReq.set_tensor(name, *(blob.get())); } catch (const Exception& e) { @@ -216,6 +232,10 @@ void OVInferRequest::SetTensor(std::string name, OVTensorPtr& blob) { } } +uint32_t OVInferRequest::GetNumInputs() { + return ovInfReq.get_compiled_model().inputs().size(); +} + void OVInferRequest::StartAsync() { try { ovInfReq.start_async(); diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h index c3417003f8e1f..53b814094438e 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.h +++ b/onnxruntime/core/providers/openvino/ov_interface.h @@ -37,40 +37,40 @@ typedef ov::intel_gpu::ocl::ClContext* OVRemoteContextPtr; typedef ov::RemoteContext OVRemoteContext; #endif -class OVCore { - ov::Core oe; +struct OVCore { + static void Initialize(); + static void Teardown(); - public: // OV Interface For Reading Model - std::shared_ptr ReadModel(const std::string& model_stream, const std::string& model_path) const; + static std::shared_ptr ReadModel(const std::string& model_stream, const std::string& model_path); + // OV Interface for Compiling OV Model Type - OVExeNetwork CompileModel(std::shared_ptr& ie_cnn_network, - std::string& hw_target, - ov::AnyMap& device_config, - const std::string& name); + static OVExeNetwork CompileModel(std::shared_ptr& ie_cnn_network, + std::string& hw_target, + ov::AnyMap& device_config, + const std::string& name); // OV Interface for Fast Compile - OVExeNetwork CompileModel(const std::string& onnx_model, - std::string& hw_target, - ov::AnyMap& device_config, - const std::string& name); + static OVExeNetwork CompileModel(const std::string& onnx_model, + std::string& hw_target, + ov::AnyMap& device_config, + const std::string& name); // OV Interface for Import model Stream - OVExeNetwork ImportModel(const std::string& model_string, - std::string hw_target, - const ov::AnyMap& device_config, - bool embed_mode, - std::string name); + static OVExeNetwork ImportModel(std::istream& model_stream, + std::string hw_target, + const ov::AnyMap& device_config, + std::string name); #ifdef IO_BUFFER_ENABLED - OVExeNetwork CompileModel(std::shared_ptr& model, - OVRemoteContextPtr context, - std::string name); - OVExeNetwork ImportModel(std::shared_ptr model_stream, - OVRemoteContextPtr context, - std::string name); + static OVExeNetwork CompileModel(std::shared_ptr& model, + OVRemoteContextPtr context, + std::string name); + static OVExeNetwork ImportModel(std::shared_ptr model_stream, + OVRemoteContextPtr context, + std::string name); #endif - std::vector GetAvailableDevices(); - void SetCache(const std::string& cache_dir_path); - ov::Core& Get() { return oe; } - void SetStreams(const std::string& device_type, int num_streams); + static std::vector GetAvailableDevices(); + static void SetCache(const std::string& cache_dir_path); + inline static ov::Core& Get(); + static void SetStreams(const std::string& device_type, int num_streams); }; class OVExeNetwork { @@ -87,8 +87,10 @@ class OVInferRequest { ov::InferRequest ovInfReq; public: + uint32_t GetNumInputs(); OVTensorPtr GetTensor(const std::string& name); - void SetTensor(std::string name, OVTensorPtr& blob); + std::string GetInputTensorName(uint32_t index); + void SetTensor(const std::string& name, OVTensorPtr& blob); void StartAsync(); void Infer(); void WaitRequest(); diff --git 
a/onnxruntime/core/providers/openvino/ov_versions/capability.cc b/onnxruntime/core/providers/openvino/ov_versions/capability.cc
index 3e780f74145ae..d56687f868c3d 100644
--- a/onnxruntime/core/providers/openvino/ov_versions/capability.cc
+++ b/onnxruntime/core/providers/openvino/ov_versions/capability.cc
@@ -2,6 +2,7 @@
 // Licensed under the MIT License
 #include
 #include
+#include

 #include "core/providers/shared_library/provider_api.h"
 #include "core/providers/openvino/backend_utils.h"
@@ -26,23 +27,27 @@ namespace onnxruntime {
 namespace openvino_ep {

 // Constructor
-GetCapability::GetCapability(const GraphViewer& graph_viewer_param,
+GetCapability::GetCapability(const EPCtxHandler& ep_ctx_handler,
+                             const GraphViewer& graph_viewer_param,
                              const std::string device_type_param,
-                             const bool enable_qdq_optimizer)
-    : graph_viewer_(graph_viewer_param), device_type_(device_type_param) {
+                             const bool enable_qdq_optimizer) : ep_ctx_handler_(ep_ctx_handler),
+                                                                graph_viewer_(graph_viewer_param),
+                                                                device_type_(std::move(device_type_param)) {
   bool npu_qdq_optimizer_enabled = false;
   if (device_type_.find("NPU") != std::string::npos) {
     device_type_ = "CPU";
     if (enable_qdq_optimizer) npu_qdq_optimizer_enabled = true;
   }
-#if OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 4
-  data_ops_ = new DataOps(graph_viewer_, V_2024_4, device_type_, npu_qdq_optimizer_enabled);
-#elif OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 5
+#if OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 5
   data_ops_ = new DataOps(graph_viewer_, V_2024_5, device_type_, npu_qdq_optimizer_enabled);
+#elif OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 6
+  data_ops_ = new DataOps(graph_viewer_, V_2024_6, device_type_, npu_qdq_optimizer_enabled);
 #elif OPENVINO_VERSION_MAJOR == 2025 && OPENVINO_VERSION_MINOR == 0
   data_ops_ = new DataOps(graph_viewer_, V_2025_0, device_type_, npu_qdq_optimizer_enabled);
+#elif OPENVINO_VERSION_MAJOR == 2025 && OPENVINO_VERSION_MINOR == 1
+  data_ops_ = new DataOps(graph_viewer_, V_2025_1, device_type_, npu_qdq_optimizer_enabled);
 #else
-  data_ops_ = new DataOps(graph_viewer_, V_2025_0, device_type_, npu_qdq_optimizer_enabled);
+  data_ops_ = new DataOps(graph_viewer_, V_2025_1, device_type_, npu_qdq_optimizer_enabled);
 #endif
 }

@@ -54,6 +59,28 @@ std::vector<std::unique_ptr<ComputeCapability>> GetCapability::Execute() {
     return result;
   }

+  auto Iterable2String = []<typename U, typename V>(U& strings, const V& node_args) {
+    constexpr bool has_name = requires(V v) {
+      (*v.begin())->Name();
+    };
+    for (const auto& arg : node_args) {
+      if constexpr (has_name) {
+        strings.push_back(arg->Name());
+      } else {
+        strings.push_back(arg);
+      }
+    }
+  };
+
+  // Check for EpContext nodes
+  const auto& nodes = graph_viewer_.GetNodesInTopologicalOrder();
+
+  // If all the nodes have been accounted for then no more processing is needed
+  if (result.size() == nodes.size()) {
+    is_wholly_supported_graph_ = true;
+    return result;
+  }
+
   // This is a list of initializers that nGraph considers as constants. Example weights, reshape shape etc.
std::unordered_set<std::string> ng_required_initializers;
@@ -62,8 +89,8 @@ std::vector<std::unique_ptr<ComputeCapability>> GetCapability::Execute() {
     if (openvino_ep::backend_utils::IsDebugEnabled()) {
       std::cout << "No of unsupported nodes " << unsupported_nodes.size() << std::endl;
       for (size_t i = 0; i < unsupported_nodes.size(); i++) {
-        const Node* node = graph_viewer_.GetNode(unsupported_nodes[i]);
-        std::cout << "Unsupported node op " << node->OpType() << std::endl;
+        const Node* unode = graph_viewer_.GetNode(unsupported_nodes[i]);
+        std::cout << "Unsupported node op " << unode->OpType() << std::endl;
       }
     }
 #endif
@@ -73,8 +100,7 @@ std::vector<std::unique_ptr<ComputeCapability>> GetCapability::Execute() {
       std::vector<std::string> inputs;
       std::vector<std::string> outputs;
       // Fill inputs with names
-      std::for_each(graph_viewer_.GetInputs().begin(), graph_viewer_.GetInputs().end(),
-                    [&inputs](const NodeArg* node_arg) { inputs.push_back(node_arg->Name()); });
+      Iterable2String(inputs, graph_viewer_.GetInputs());

       /* In scenarios, when there are no inputs or all inputs being initializers,
          ConstantFolding optimization in onnxruntime pre-computes the value.*/
       if (inputs.empty()) {
         return result;
       }

-      const std::vector& nodes = graph_viewer_.GetNodesInTopologicalOrder();
-
       const Node* node = graph_viewer_.GetNode(nodes[0]);

       // Handle cases where lone, recurring Ops in smaller models cannot be supported in OpenVINO
@@ -103,12 +127,10 @@
       }

       // Initializers need to be part of meta_def->inputs
-      std::for_each(ng_required_initializers.begin(), ng_required_initializers.end(),
-                    [&inputs](const std::string& initializer) { inputs.push_back(initializer); });
+      Iterable2String(inputs, ng_required_initializers);

       // Fill outputs with names
-      std::for_each(graph_viewer_.GetOutputs().begin(), graph_viewer_.GetOutputs().end(),
-                    [&outputs](const NodeArg* node_arg) { outputs.push_back(node_arg->Name()); });
+      Iterable2String(outputs, graph_viewer_.GetOutputs());

       // Create and add this graph to result.
AppendClusterToSubGraph(graph_viewer_.GetNodesInTopologicalOrder(), inputs, outputs, result);
@@ -148,9 +170,15 @@
     int no_of_clusters = 0;

     for (auto this_cluster : connected_clusters) {
-      // If subgraph has less then three, graph is considered trivial
+      // If the subgraph has fewer than three nodes, it is considered trivial, unless it is an EPContext cluster
       if (this_cluster.size() < 3) {
-        continue;
+        bool is_epctx_node = false;
+        for (auto node_idx : this_cluster) {
+          if (graph_viewer_.GetNode(node_idx)->OpType() == "EPContext")
+            is_epctx_node = true;
+        }
+        if (!is_epctx_node)
+          continue;
       }

       std::vector<std::string> cluster_graph_inputs, cluster_inputs, cluster_outputs;
@@ -166,16 +194,6 @@
       // Omitting zero dim subgraphs
       for (auto index : this_cluster) {
         const Node* node = graph_viewer_.GetNode(index);
-        if (data_ops_->DoNotOmitSubGraph(node->OpType())) {
-          for (const auto& input : node->InputDefs()) {
-            const auto& input_name = input->Name();
-            auto it = find(cluster_graph_inputs.begin(), cluster_graph_inputs.end(), input_name);
-            if (it != cluster_graph_inputs.end()) {
-              omit_subgraph = true;
-              break;
-            }
-          }
-        }

         if (node->OpType() == "Conv" || node->OpType() == "Identity") {
           const auto& output_name = node->OutputDefs()[0]->Name();
@@ -213,7 +231,6 @@
       }
       LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Supported subgraphs on OpenVINO: " << no_of_clusters;
     }
-
   return result;
 }
diff --git a/onnxruntime/core/providers/openvino/ov_versions/capability.h b/onnxruntime/core/providers/openvino/ov_versions/capability.h
index 2f87c4c73d892..364e79a76f154 100644
--- a/onnxruntime/core/providers/openvino/ov_versions/capability.h
+++ b/onnxruntime/core/providers/openvino/ov_versions/capability.h
@@ -6,12 +6,14 @@
 #include
 #include
 #include "core/providers/openvino/ov_versions/data_ops.h"
+#include "core/providers/openvino/onnx_ctx_model_helper.h"

 namespace onnxruntime {
 namespace openvino_ep {

 class GetCapability {
  private:
+  const EPCtxHandler& ep_ctx_handler_;
   const GraphViewer& graph_viewer_;
   std::string device_type_;
   DataOps* data_ops_;
@@ -19,7 +21,8 @@ class GetCapability {
   bool has_external_weights_ = false;

  public:
-  GetCapability(const GraphViewer& graph_viewer_param,
+  GetCapability(const EPCtxHandler& ep_ctx_handler,
+                const GraphViewer& graph_viewer_param,
                 const std::string device_type_param,
                 const bool enable_qdq_optimizer);
   virtual std::vector<std::unique_ptr<ComputeCapability>> Execute();
diff --git a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc
index f118f057ac11e..2f0dd458cc349 100644
--- a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc
+++ b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc
@@ -388,7 +388,7 @@ void DataOps::populate_op_mode_supported() {

   // populate unsupportedmode_t
   {
-    UnsupportedOpMode obj = {{V_2024_1, V_2024_2, V_2024_3, V_2024_4, V_2024_5, V_2025_0},
+    UnsupportedOpMode obj = {{V_2024_1, V_2024_2, V_2024_3, V_2024_4, V_2024_5, V_2024_6, V_2025_0, V_2025_1},
                              [this](const Node* node, const InitializedTensorSet&) {
                                // If the Input of ReduceMax op is UINT8, it is rejected (Due to output mismatch)
                                for (size_t i = 0; i < node->InputDefs().size(); i++) {
@@ -404,7 +404,7 @@ void DataOps::populate_op_mode_supported() {
   }
   {
     UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2,
-                              V_2024_3, V_2024_4, V_2024_5, V_2025_0},
+                              V_2024_3, V_2024_4, V_2024_5, V_2024_6, V_2025_0, V_2025_1},
                             [this](const Node* node, const InitializedTensorSet&) {
                               const auto& input_arg = node->InputDefs()[1];
                               auto shape = input_arg->Shape();
@@ -422,7 +422,7 @@ void DataOps::populate_op_mode_supported() {
   }
   {
     UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2,
-                              V_2024_3, V_2024_4, V_2024_5, V_2025_0},
+                              V_2024_3, V_2024_4, V_2024_5, V_2024_6, V_2025_0, V_2025_1},
                             [this](const Node* node, const InitializedTensorSet&) {
                               // If the operator is unsqueeze
                               // If axes is an input, then we cannot produce a static graph.
@@ -437,8 +437,8 @@ void DataOps::populate_op_mode_supported() {
     op_list_.insert({"Unsqueeze", obj});
   }
   {
-    UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2, V_2024_3, V_2024_4, V_2024_5,
-                              V_2025_0},
+    UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2, V_2024_3, V_2024_4, V_2024_5, V_2024_6,
+                              V_2025_0, V_2025_1},
                             [this](const Node* node, const InitializedTensorSet&) {
                               // check for attributes
                               auto& upsample_attr = node->GetAttributes();
diff --git a/onnxruntime/core/providers/openvino/ov_versions/data_ops.h b/onnxruntime/core/providers/openvino/ov_versions/data_ops.h
index 07fa36f355d55..cf7d834d6cfc7 100644
--- a/onnxruntime/core/providers/openvino/ov_versions/data_ops.h
+++ b/onnxruntime/core/providers/openvino/ov_versions/data_ops.h
@@ -33,7 +33,9 @@ enum versionNum {
   V_2024_3,
   V_2024_4,
   V_2024_5,
-  V_2025_0
+  V_2024_6,
+  V_2025_0,
+  V_2025_1
 };

 using VersionNum = enum versionNum;

@@ -82,7 +84,7 @@ class DataOps {
           const std::string dev_id, const bool npu_qdq_optimizer_enabled)
       : graph_viewer_(graph_viewer_param),
         version_id_(ver),
-        device_id_(dev_id),
+        device_id_(std::move(dev_id)),
         npu_qdq_optimizer_enabled_(npu_qdq_optimizer_enabled) {
     populate_op_mode_supported();
     populate_types_supported();
diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc
index e021edc878709..4d513c0533ff1 100644
--- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc
+++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc
@@ -56,7 +56,7 @@ static NodeArg& ProcessNodeUnitIO(onnxruntime::Graph& dst_graph,
                                   std::set<std::string>& initializers_to_keep,
                                   const NodeUnitIODef& io_def) {
   const std::string& name = io_def.node_arg.Name();
-  const ONNX_NAMESPACE::TypeProto* orig_type_proto = io_def.node_arg.TypeAsProto();
+  const auto* orig_type_proto = io_def.node_arg.TypeAsProto();

   // Handle quantized input or output. Convert to float type.
   if (io_def.quant_param.has_value()) {
@@ -68,11 +68,11 @@ static NodeArg& ProcessNodeUnitIO(onnxruntime::Graph& dst_graph,
     ORT_ENFORCE(tensor_proto_iter != src_initializers.end(),
                 "Unable to find scale initializer ", scale_initializer_name);

-    const ONNX_NAMESPACE::TensorProto* scale_tensor_proto = tensor_proto_iter->second;
+    const auto* scale_tensor_proto = tensor_proto_iter->second;
     int32_t float_type = scale_tensor_proto->data_type();

     // Now set the arg type to the float type of scale.
Could be one of float/float16/bfloat16 - std::unique_ptr type_proto = ONNX_NAMESPACE::TypeProto::Create(); + auto type_proto = ONNX_NAMESPACE::TypeProto::Create(); type_proto->copy_from(orig_type_proto); type_proto->mutable_tensor_type()->set_elem_type(float_type); @@ -457,7 +457,7 @@ static void AddStandaloneNodeUnit(onnxruntime::Graph& dst_graph, const onnxrunti if (duplicate_dq && GetQDQDataType(&node_unit.GetNode()) != DT_UINT16 && GetQDQDataType(&node_unit.GetNode()) != DT_INT16) { std::string orig_dq_name = node_unit.Outputs()[0].node_arg.Name(); // ex: dql_output/duplicated - std::unique_ptr type_proto = ONNX_NAMESPACE::TypeProto::Create(); + auto type_proto = ONNX_NAMESPACE::TypeProto::Create(); type_proto->copy_from(node_unit.Inputs()[0].node_arg.TypeAsProto()); type_proto->mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); orig_dq_name.erase(orig_dq_name.find(DuplicateDQ), std::string::npos); // ex: dql_output @@ -625,10 +625,54 @@ static void AddQDQNodeUnit(onnxruntime::Graph& dst_graph, KeepInitsInDstGraph(initializers_to_keep, src_graph, &target_node); } +static void AddInitializerAsInput(onnxruntime::Graph& dst_graph, + InlinedVector& accumulated_inputs, + const onnxruntime::GraphViewer& src_graph, + const std::string& initializer_name) { + // Get the initializer from source graph + const auto& src_initializers = src_graph.GetAllInitializedTensors(); + auto init_iter = src_initializers.find(initializer_name); + + if (init_iter == src_initializers.end()) { + // Initializer not found + return; + } + + const auto* tensor_proto = init_iter->second; + + // Create TypeProto for the initializer + auto type_proto = ONNX_NAMESPACE::TypeProto::Create(); + auto* tensor_type = type_proto->mutable_tensor_type(); + tensor_type->set_elem_type(tensor_proto->data_type()); + + for (int i = 0; i < tensor_proto->dims_size(); ++i) { + tensor_type->mutable_shape()->add_dim()->set_dim_value(tensor_proto->dims().Get(i)); + } + + // Create NodeArg for the initializer + auto& input_arg = dst_graph.GetOrCreateNodeArg(initializer_name, type_proto.get()); + + // Check if input already exists in accumulated inputs + bool input_exists = false; + for (const auto* existing_input : accumulated_inputs) { + if (existing_input->Name() == initializer_name) { + input_exists = true; + break; + } + } + + if (!input_exists) { + // Add to accumulated inputs + accumulated_inputs.push_back(&input_arg); + } +} + // Creates a new model without the DQ/Q operators in the src graph. Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, const logging::Logger& logger, - /*out*/ std::unique_ptr& model) { + bool enable_ovep_weight_sharing, + /*out*/ std::unique_ptr& model, + /*out*/ sw& shared_weights) { // NOTE: This function is a re-implementation of GraphViewerToProto() in core/graph/graph_proto_serializer.cc // with the following differences: // - Uses onnxruntime::Graph APIs instead of onnx::GraphProto APIs. 
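For context: the shared-weights metadata collected in the next hunk follows ONNX's external-data convention, where a TensorProto carries a location/offset/length triple as key/value string pairs instead of inline bytes. The following is a minimal standalone sketch of reading that triple, assuming only ONNX's generated protobuf API (onnx.pb.h); the struct and function names are illustrative and not part of this change:

    // Illustrative sketch (hypothetical names): reading the external-data
    // triple that an ONNX initializer stores instead of raw tensor bytes.
    #include <cstdint>
    #include <string>
    #include "onnx/onnx_pb.h"

    struct ExternalWeightInfo {
      std::string location;  // file holding the raw bytes, relative to the model
      uint64_t offset = 0;   // byte offset of this tensor's payload in that file
      uint64_t length = 0;   // payload size in bytes
    };

    // Returns false when the tensor's data is stored inline in the model proto.
    static bool GetExternalWeightInfo(const onnx::TensorProto& proto,
                                      ExternalWeightInfo& out) {
      if (!proto.has_data_location() ||
          proto.data_location() != onnx::TensorProto_DataLocation_EXTERNAL) {
        return false;
      }
      // external_data is a repeated key/value field: "location", "offset", "length".
      for (const auto& entry : proto.external_data()) {
        if (entry.key() == "location") {
          out.location = entry.value();
        } else if (entry.key() == "offset") {
          out.offset = std::stoull(entry.value());
        } else if (entry.key() == "length") {
          out.length = std::stoull(entry.value());
        }
      }
      return true;
    }

The `insert_metadata` lambda in the hunk below performs the same extraction through the provider-bridge wrapper types.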
@@ -665,7 +709,12 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph,
       dst_graph_outputs.push_back(&ep_graph_output_arg);
   }

-  dst_graph.SetInputs(dst_graph_inputs);
+  // Will set inputs after deciding the fate of all internal and external initializers;
+  // the accumulated_inputs container will store the inputs of the original graph and initializers with external data
+  InlinedVector<const NodeArg*> accumulated_inputs;
+  accumulated_inputs.reserve(dst_graph_inputs.size());
+
+  // dst_graph.SetInputs(dst_graph_inputs);
   dst_graph.SetOutputs(dst_graph_outputs);

   // TODO(sspintel): add Graph::SetName() provider api
@@ -723,9 +772,7 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph,
     seen_node_units.insert(node_unit);
   }

-  //
-  // Copy initializers to dst graph.
-  //
+  // Copy initializers to dst graph.

   std::unordered_set<std::string> current_scope_initializer_set;

@@ -738,26 +785,93 @@
   }
   std::sort(const_inits.begin(), const_inits.end());

+  // Initialize the map used to create metadata for initializers with external weights
+  auto& metadata = shared_weights.metadata;
+
+  const auto& insert_metadata = [&metadata](const ONNX_NAMESPACE::TensorProto& proto) {
+    sw::Metadata::Map::key_type key{proto.name()};
+    sw::Metadata::Map::mapped_type value{};
+
+    using mutable_proto_t = ONNX_NAMESPACE::TensorProto*;
+    auto& mutable_proto = *const_cast<mutable_proto_t>(&proto);
+    auto* entry_protos = mutable_proto.mutable_external_data();
+    for (int i = 0; i < entry_protos->size(); i++) {
+      auto& string_entry_proto{entry_protos->at(i)};
+      const auto& pb_key{*(string_entry_proto.mutable_key())};
+      const auto& pb_value{*(string_entry_proto.mutable_value())};
+      if (pb_key == "location") {
+        value.location = pb_value;
+      } else if (pb_key == "offset") {
+        value.data_offset = std::stoul(pb_value);
+      } else if (pb_key == "length") {
+        value.size = std::stoul(pb_value);
+      }
+    }
+    value.element_type = proto.data_type();
+    value.dimensions.resize(proto.dims_size());
+    for (uint32_t index = 0; auto& dim : value.dimensions) {
+      dim = proto.dims()[index++];
+    }
+
+    metadata.emplace(key, std::move(value));
+  };
+
+  // Handle constant initializers
   for (auto& it : const_inits) {
-    if (initializers_to_keep.count(it))
-      dst_graph.AddInitializedTensor(*(initializers.at(it)));
+    const auto& initializer_tensor = *initializers.at(it);
+
+    // Check if the initializer has external data
+    if (initializer_tensor.has_data_location() &&
+        initializer_tensor.data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL &&
+        enable_ovep_weight_sharing) {
+      insert_metadata(initializer_tensor);
+
+      // Add the initializer with external data as an input
+      AddInitializerAsInput(dst_graph, accumulated_inputs, src_graph, it);
+
+    } else {
+      // Add as an initialized tensor if it does not have external data
+      if (initializers_to_keep.count(it))
+        dst_graph.AddInitializedTensor(*(initializers.at(it)));
+    }
+
     current_scope_initializer_set.insert(it);
   }

-  // handle outer scope value which is a constant initializer
+  // Handle outer-scope constant initializers
   for (auto& node_idx : src_graph.GetNodesInTopologicalOrder()) {
     const auto& node = src_graph.GetNode(node_idx);
     for (const auto& input : node->InputDefs()) {
       if (current_scope_initializer_set.find(input->Name()) != current_scope_initializer_set.end()) {
         continue;
       }
+
       if (src_graph.IsConstantInitializer(input->Name(), true)) {
-        if (initializers_to_keep.count(input->Name()))
-          dst_graph.AddInitializedTensor(*(src_graph.GetConstantInitializer(input->Name(), true)));
+        const auto& initializer_tensor = *src_graph.GetConstantInitializer(input->Name(), true);
+        // Check if the initializer has external data
+        if (initializer_tensor.has_data_location() &&
+            initializer_tensor.data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL &&
+            enable_ovep_weight_sharing) {
+          insert_metadata(initializer_tensor);
+
+          // Add the initializer as an input if it has external data
+          AddInitializerAsInput(dst_graph, accumulated_inputs, src_graph, input->Name());
+
+        } else {
+          // Add as an initialized tensor if it does not have external data
+          if (initializers_to_keep.count(input->Name())) {
+            dst_graph.AddInitializedTensor(*(src_graph.GetConstantInitializer(input->Name(), true)));
+          }
+        }
+
         current_scope_initializer_set.insert(input->Name());
       }
     }
   }

+  accumulated_inputs.insert(accumulated_inputs.end(), dst_graph_inputs.begin(), dst_graph_inputs.end());
+
+  // Set all inputs (the original inputs and the initializers promoted to inputs) of the destination Graph
+  dst_graph.SetInputs(accumulated_inputs);

   // Validate graph, remove unnecessary initializers, and run type/shape inference.
   ORT_RETURN_IF_ERROR(dst_graph.Resolve());
diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.h b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.h
index 94a8eb4d5da17..02831525cba32 100644
--- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.h
+++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.h
@@ -5,14 +5,20 @@
 #include

 #include "core/providers/shared_library/provider_api.h"
+#include "core/providers/openvino/contexts.h"

 namespace onnxruntime {
 namespace openvino_ep {

+using sw = SharedContext::SharedWeights;
+
 // Creates a new model without the DQ/Q operators in the src graph as per pre-defined rulesets
 Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph,
                                        const logging::Logger& logger,
-                                       /*out*/ std::unique_ptr<Model>& model);
+                                       bool enable_ovep_weight_sharing,
+                                       /*out*/ std::unique_ptr<Model>& model,
+                                       /*out*/ sw& shared_weights);
+bool dumpMetaDataMapToBinary(const sw::Metadata::Map& shared_weights, const std::string& filename);
 }  // namespace openvino_ep
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
index e434935343663..4feedd75f8004 100644
--- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
+++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
@@ -991,7 +991,8 @@ struct Model final {
                                       const IOnnxRuntimeOpSchemaRegistryList* local_registries,
                                       const logging::Logger& logger) {
     return g_host->Model__construct(std::move(model_proto), model_path, local_registries, logger);
   }
-  static std::unique_ptr<Model> Create(const std::string& graph_name, bool is_onnx_domain_only, const logging::Logger& logger) {
+  static std::unique_ptr<Model> Create(const std::string& graph_name, bool is_onnx_domain_only,
+                                       const logging::Logger& logger) {
     return g_host->Model__construct(graph_name, is_onnx_domain_only, logger);
   }
   static void operator delete(void* p) { g_host->Model__operator_delete(reinterpret_cast<Model*>(p)); }
diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc
index 3a694ac6f8e5e..f36345cdabf64 100644
--- a/onnxruntime/core/session/provider_bridge_ort.cc
+++ b/onnxruntime/core/session/provider_bridge_ort.cc
@@ -1178,7 +1178,8 @@ struct ProviderHostImpl : ProviderHost {
                                          const logging::Logger& logger)
override { return std::make_unique(model_proto, model_path, local_registries, logger); } - std::unique_ptr Model__construct(const std::string& graph_name, bool is_onnx_domain_only, + std::unique_ptr Model__construct(const std::string& graph_name, + bool is_onnx_domain_only, const logging::Logger& logger) override { return std::make_unique(graph_name, is_onnx_domain_only, logger); } diff --git a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc index 272ea37fcc70c..d224246b98e5b 100644 --- a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc +++ b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc @@ -522,6 +522,7 @@ "^test_affine_grid_3d_align_corners_expanded", "^test_affine_grid_3d", "^test_affine_grid_3d_expanded", + "^test_dynamicquantizelinear_expanded_cpu", "^test_operator_permute2", "^test_operator_repeat", "^test_operator_repeat_dim_overflow",
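Taken together, the refactor keeps the option surface that applications reach through the public API. A minimal usage sketch, assuming an ONNX Runtime build with the OpenVINO EP enabled and a C++ API version that exposes AppendExecutionProvider_OpenVINO_V2; the model path and option values below are illustrative:

    #include <string>
    #include <unordered_map>
    #include "onnxruntime_cxx_api.h"

    int main() {
      Ort::Env env{ORT_LOGGING_LEVEL_WARNING, "ovep-demo"};
      Ort::SessionOptions session_options;

      // These keys land in the ProviderOptions map handled by
      // OpenVINO_Provider::CreateExecutionProviderFactory above.
      std::unordered_map<std::string, std::string> ov_options{
          {"device_type", "GPU"},   // checked against the supported device list
          {"precision", "FP16"},    // GPU accepts FP32, FP16, or ACCURACY
          {"num_of_threads", "4"},  // digits only; values <= 0 fall back to 1
          {"load_config", R"({"GPU": {"PERFORMANCE_HINT": "LATENCY"}})"},
      };
      session_options.AppendExecutionProvider_OpenVINO_V2(ov_options);

      Ort::Session session{env, ORT_TSTR("model.onnx"), session_options};
      // ... bind inputs and call session.Run() as usual ...
      return 0;
    }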