diff --git a/.github/workflows/linux_openvino_ci_intel.yml b/.github/workflows/linux_openvino_ci_intel.yml new file mode 100644 index 0000000000000..985d014994877 --- /dev/null +++ b/.github/workflows/linux_openvino_ci_intel.yml @@ -0,0 +1,45 @@ +name: Linux OpenVINO CI + +on: + push: + branches: [ main, 'rel-*' ] + pull_request: + branches: ['**' ] + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + packages: write # Needed if the reusable workflow pushes images + attestations: write # Optional: for artifact attestations if enabled + id-token: write # Optional: may be needed for OIDC authentication (e.g., ACR) + +jobs: + build_test_openvino: + name: Build and Test OpenVINO EP (AlmaLinux8, Py3.12) + # Use the reusable workflow like the other Linux CI pipelines + uses: ./.github/workflows/reusable_linux_build_intel.yml + with: + pool_name: "onnxruntime-github-Ubuntu2204-AMD-CPU" + build_config: Release + # Architecture: OpenVINO only supports Intel x64 + architecture: x64 + dockerfile_path: tools/ci_build/github/linux/docker/inference/x86_64/python/openvino/Dockerfile + docker_image_repo: onnxruntimeopenvino + + execution_providers: 'openvino' + + extra_build_flags: '--use_openvino CPU --enable_generic_interface --build_shared_lib' + + # Python Path Prefix: Set the correct Python 3.12 path inside the manylinux container + python_path_prefix: 'PATH=/opt/python/cp312-cp312/bin:$PATH' + + run_tests: true + upload_build_output: false + + # Secrets: Pass the necessary GitHub token + secrets: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/reusable_linux_build_intel.yml b/.github/workflows/reusable_linux_build_intel.yml new file mode 100644 index 0000000000000..a9b718bb2e736 --- /dev/null +++ b/.github/workflows/reusable_linux_build_intel.yml @@ -0,0 +1,183 @@ +name: Reusable Linux CPU/GPU Build and Test + +on: + workflow_call: + inputs: + pool_name: + description: 'The specific 1ES pool name (e.g., onnxruntime-github-Ubuntu2204-AMD-CPU)' + required: true + type: string + build_config: + description: 'Build configuration (Debug or Release)' + required: true + type: string + architecture: + description: 'Target architecture (x64 or arm64)' + required: true + type: string + dockerfile_path: + description: 'Path to the Dockerfile relative to the workspace root' + required: true + type: string + docker_image_repo: + description: 'Name for the Docker image repository' + required: true + type: string + docker_build_args: + description: 'Arguments to pass to the docker image build command' + required: false + type: string + default: '' + execution_providers: + description: 'Space-separated list of execution providers to enable (passed to build.py)' + required: false + type: string + default: '' + extra_build_flags: + description: 'Additional flags for the build.py script (appended after EP flags)' + required: false + type: string + default: '' + python_path_prefix: + description: 'Optional prefix to add to the PATH for python command (e.g., PATH=/opt/python/cp310-cp310/bin:$PATH)' + required: false + type: string + default: '' + python_version: + description: 'Python version to set up on the runner host' + required: false + type: string + default: '3.x' + run_tests: + description: 'Whether to execute the test suite after building' + required: false + type: boolean + default: true + upload_build_output: + description: 'Whether to upload the build output directory as an artifact (used when 
tests are skipped)' + required: false + type: boolean + default: false + secrets: + GH_TOKEN: + description: 'GitHub token for accessing actions/packages' + required: true + +jobs: + build_test_pipeline: + runs-on: [self-hosted, Linux, X64] + permissions: + contents: read + packages: write + attestations: write + id-token: write + steps: + - name: Checkout code + uses: actions/checkout@v5 + + - name: Set up Python ${{ inputs.python_version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python_version }} + + - name: Build Docker Image (${{ inputs.architecture }} / ${{ inputs.build_config }}) + uses: microsoft/onnxruntime-github-actions/build-docker-image@v0.0.7 + id: build_docker_image_step + with: + dockerfile: ${{ github.workspace }}/${{ inputs.dockerfile_path }} + image-name: ghcr.io/microsoft/onnxruntime/${{ inputs.docker_image_repo }} + build-args: ${{ inputs.docker_build_args }} + push: true + azure-container-registry-name: onnxruntimebuildcache + env: + GITHUB_TOKEN: ${{ secrets.GH_TOKEN }} + + - name: Export GitHub Actions cache environment variables + uses: actions/github-script@v7 + with: + script: | + core.exportVariable('ACTIONS_CACHE_URL', process.env.ACTIONS_CACHE_URL || ''); + core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || ''); + # ------------- Update Step (CMake Generation) ------------- + - name: Generate Build Files (CMake) (${{ inputs.architecture }} / ${{ inputs.build_config }}) + id: update_step + uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.7 + with: + docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name }} + build_config: ${{ inputs.build_config }} + mode: 'update' + execution_providers: ${{ inputs.execution_providers }} # Pass down EP list + extra_build_flags: ${{ inputs.extra_build_flags }} + python_path_prefix: ${{ inputs.python_path_prefix }} + + # ------------- Build Step (Compilation) ------------- + - name: Build ONNX Runtime (${{ inputs.architecture }} / ${{ inputs.build_config }}) + id: build_step + uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.7 + with: + docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name }} + build_config: ${{ inputs.build_config }} + mode: 'build' + execution_providers: ${{ inputs.execution_providers }} # Pass down EP list + extra_build_flags: ${{ inputs.extra_build_flags }} + python_path_prefix: ${{ inputs.python_path_prefix }} + + # ------------- Test Step ------------- + - name: Test ONNX Runtime (${{ inputs.architecture }} / ${{ inputs.build_config }}) + id: test_step + if: inputs.run_tests == true + uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.7 + with: + docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name }} + build_config: ${{ inputs.build_config }} + mode: 'test' + execution_providers: ${{ inputs.execution_providers }} # Pass down EP list + extra_build_flags: ${{ inputs.extra_build_flags }} + python_path_prefix: ${{ inputs.python_path_prefix }} + + # ------------- Prepare Artifact Step ------------- + - name: Prepare Build Output for Upload + if: inputs.upload_build_output == true + shell: bash + run: | + #!/bin/bash + set -e -x + BUILD_DIR="${{ runner.temp }}/${{ inputs.build_config }}" + if [ ! -d "${BUILD_DIR}" ]; then + echo "Error: Build directory ${BUILD_DIR} not found. Cannot prepare artifact." 
+ exit 1 + fi + echo "--- Cleaning build directory: ${BUILD_DIR} ---" + rm -rf "${BUILD_DIR}/onnxruntime" || true + rm -rf "${BUILD_DIR}/pybind11" || true + rm -rf "${BUILD_DIR}/vcpkg_installed" || true + rm -f "${BUILD_DIR}/models" || true + DEPS_DIR="${BUILD_DIR}/_deps" + if [ -d "${DEPS_DIR}" ]; then + echo "Cleaning ${DEPS_DIR}, keeping onnx-src..." + find "${DEPS_DIR}" -mindepth 1 ! -regex "^${DEPS_DIR}/onnx-src\(/.*\)?$" -delete + else + echo "${DEPS_DIR} does not exist, skipping deps cleanup." + fi + echo "--- Saving executable permissions ---" + cd "${BUILD_DIR}" + find . -executable -type f -printf '%p\n' > perms.txt + echo "--- Cleanup and permission saving complete for ${BUILD_DIR} ---" + + # ------------- Upload Build Output Step ------------- + - name: Upload Build Output Artifact + if: inputs.upload_build_output == true + uses: actions/upload-artifact@v4 + with: + name: build-output-${{ inputs.architecture }}-${{ inputs.build_config }} + path: ${{ runner.temp }}/${{ inputs.build_config }} + if-no-files-found: error + + # ------------- Upload Log on Build Failure Step ------------- + - name: Upload VCPKG Manifest Install Log on Update or Build Failure + if: steps.update_step.outcome == 'failure' || steps.build_step.outcome == 'failure' + uses: actions/upload-artifact@v4 + with: + name: vcpkg-manifest-install-log-${{ inputs.architecture }}-${{ inputs.build_config }} + path: ${{ runner.temp }}/${{ inputs.build_config }}/${{ inputs.build_config }}/vcpkg-manifest-install.log + if-no-files-found: ignore diff --git a/cmake/onnxruntime_providers_openvino.cmake b/cmake/onnxruntime_providers_openvino.cmake index 5a831a106ae08..882fc56d9a40b 100644 --- a/cmake/onnxruntime_providers_openvino.cmake +++ b/cmake/onnxruntime_providers_openvino.cmake @@ -33,6 +33,11 @@ source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_openvino_cc_srcs}) onnxruntime_add_shared_library_module(onnxruntime_providers_openvino ${onnxruntime_providers_openvino_cc_srcs} "${ONNXRUNTIME_ROOT}/core/dll/onnxruntime.rc") + # Propagate leak check define if enabled at top level + if(onnxruntime_ENABLE_MEMLEAK_CHECKER) + target_compile_definitions(onnxruntime_providers_openvino PRIVATE ONNXRUNTIME_ENABLE_MEMLEAK_CHECK) + endif() + onnxruntime_add_include_to_target(onnxruntime_providers_openvino onnxruntime_common onnx nlohmann_json::nlohmann_json) install(FILES ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/providers/openvino/openvino_provider_factory.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/) @@ -51,6 +56,11 @@ target_include_directories(onnxruntime_providers_openvino SYSTEM PUBLIC ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${OpenVINO_INCLUDE_DIR} ${OPENVINO_INCLUDE_DIR_LIST} ${PYTHON_INCLUDE_DIRS} $ENV{OPENCL_INCS} $ENV{OPENCL_INCS}/../../cl_headers/) target_link_libraries(onnxruntime_providers_openvino ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 ${OPENVINO_LIB_LIST} ${ABSEIL_LIBS} Eigen3::Eigen onnx_proto) + # ETW TraceLogging depends on Advapi32 on Windows + if(WIN32) + target_link_libraries(onnxruntime_providers_openvino advapi32) + endif() + target_compile_definitions(onnxruntime_providers_openvino PRIVATE FILE_NAME=\"onnxruntime_providers_openvino.dll\") if(MSVC) diff --git a/onnxruntime/core/dll/dllmain.cc b/onnxruntime/core/dll/dllmain.cc index 7cc00fa4ca74a..9e50c6e07738f 100644 --- a/onnxruntime/core/dll/dllmain.cc +++ b/onnxruntime/core/dll/dllmain.cc @@ -30,6 +30,10 @@ BOOL APIENTRY DllMain(HMODULE /*hModule*/, if (lpvReserved != nullptr) { g_is_shutting_down = true; 
// do not do cleanup if process termination scenario +#if defined(ONNXRUNTIME_ENABLE_MEMLEAK_CHECK) + // In leak-check builds we still want protobuf shutdown to avoid flagged leaks. + ::google::protobuf::ShutdownProtobufLibrary(); +#endif } else { // Cleanup protobuf library. // NOTE: it might be too early to do so, as all function local statics and global objects are not destroyed yet. diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 68d15bdfdcee0..712f3c5faafbe 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -20,7 +21,9 @@ #include "core/providers/openvino/ov_interface.h" #include "core/providers/openvino/ov_versions/capability.h" #include "core/providers/openvino/qdq_transformations/qdq_stripping.h" +#include "core/providers/openvino/exceptions.h" #include "core/providers/openvino/qdq_transformations/qdq_scales_fix.h" +#include "../../framework/tensorprotoutils.h" namespace onnxruntime { namespace openvino_ep { @@ -35,6 +38,10 @@ ov::CompiledModel BackendManager::GetOVCompiledModel() { return ov::CompiledModel(); } +static bool ShouldExportEpContext(const SessionContext& session_context, const SubGraphContext& subgraph_context) { + return session_context.so_context_enable && (subgraph_context.is_ep_ctx_ovir_encapsulated || !subgraph_context.is_ep_ctx_graph); +} + BackendManager::BackendManager(SessionContext& session_context, SharedContext& shared_context, const onnxruntime::Node& fused_node, @@ -42,7 +49,7 @@ BackendManager::BackendManager(SessionContext& session_context, const logging::Logger& logger, EPCtxHandler& ep_ctx_handle) : ep_ctx_handle_(ep_ctx_handle), session_context_(session_context), - shared_context_{shared_context} { + shared_context_(shared_context) { subgraph_context_.is_ep_ctx_graph = ep_ctx_handle_.CheckForOVEPCtxNodeInGraph(subgraph); // If the graph contains a OVIR wrapped node, we check if it has matching xml file name attribute subgraph_context_.is_ep_ctx_ovir_encapsulated = ep_ctx_handle_.CheckEPCacheContextAttribute(subgraph, @@ -82,6 +89,10 @@ BackendManager::BackendManager(SessionContext& session_context, subgraph_context_.subgraph_name = fused_node.Name(); + if (ModelHasSymbolicInputDims(subgraph)) { + subgraph_context_.has_dynamic_input_shape = true; + } + ptr_stream_t model_stream; std::unique_ptr model_proto; if (subgraph_context_.is_ep_ctx_graph) { @@ -101,25 +112,7 @@ BackendManager::BackendManager(SessionContext& session_context, } std::string device_type = session_context_.device_type; - auto& sw = shared_context_.shared_weights; - if (session_context_.so_share_ep_contexts && !sw.metadata.empty()) { - std::filesystem::path weight_filename = session_context_.onnx_model_path_name.parent_path(); - if (sw.external_weight_filename.empty()) { - // Reasonable assumption that all metadata entries have the same external file location - sw.external_weight_filename = sw.metadata.begin()->second.location; - } - weight_filename /= sw.external_weight_filename; - std::ifstream weight_file(weight_filename); - - ORT_ENFORCE(weight_file, "Initializer file not found: ", weight_filename.string()); - if (!sw.mapped_weights) { - sw.mapped_weights = std::make_unique(weight_filename); - } - backend_utils::CreateOVTensors(session_context_.device_type, sw.metadata, *sw.mapped_weights); - } - - if (ModelHasSymbolicInputDims(subgraph)) 
{ - subgraph_context_.has_dynamic_input_shape = true; + if (subgraph_context_.has_dynamic_input_shape) { LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Model has symbolic input dims"; if ((!session_context_.disable_dynamic_shapes && (session_context_.device_type.find("CPU") != std::string::npos || @@ -153,48 +146,21 @@ BackendManager::BackendManager(SessionContext& session_context, subgraph_context_.has_dynamic_input_shape = false; // OV NPU plugin is supported with fallback to OV CPU upon compilation failures. - try { - concrete_backend_ = BackendFactory::MakeBackend(model_proto, - session_context_, - subgraph_context_, - shared_context_, - model_stream); - } catch (const OnnxRuntimeException& ex) { - std::string exception_str = ex.what(); - - if (session_context_.device_type.find("NPU") != std::string::npos && - exception_str.find("intel_npu") != std::string::npos) { - // Handle NPU device related errors -#ifndef NDEBUG - ORT_THROW(exception_str + "\nModel needs to be recompiled\n"); -#else - std::string error_message = "UNKNOWN NPU ERROR"; - std::string error_code = "code 0x0"; - std::regex error_message_pattern(R"(\bZE_\w*\b)"); - std::regex error_code_pattern("code 0x[0-9a-fA-F]+"); - std::smatch matches; - if (std::regex_search(exception_str, matches, error_message_pattern)) { - error_message = matches[0]; - } - if (std::regex_search(exception_str, matches, error_code_pattern)) { - error_code = matches[0]; - } - throw std::runtime_error(error_message + ", " + error_code + "\nModel needs to be recompiled\n"); -#endif - } else { - ORT_THROW(exception_str); - } - } + concrete_backend_ = BackendFactory::MakeBackend(model_proto, + session_context_, + subgraph_context_, + shared_context_, + model_stream); } - if (session_context_.so_context_enable && - (subgraph_context_.is_ep_ctx_ovir_encapsulated || !subgraph_context_.is_ep_ctx_graph)) { + + if (ShouldExportEpContext(session_context_, subgraph_context_)) { if (concrete_backend_) { - auto status = onnxruntime::openvino_ep::BackendManager::ExportCompiledBlobAsEPCtxNode(subgraph); - if (!status.IsOK()) { - ORT_THROW(status); - } + shared_context_.AddNativeBlob(subgraph_context_.subgraph_name, concrete_backend_->GetOVCompiledModel()); } else { - ORT_THROW("[OpenVINO-EP] Cannot export compiled blob as EPCtx Node: Backend not initialized."); + ORT_THROW( + "Exporting dynamically compiled models at runtime is not supported. " + "Cannot export blobs of dynamic models that request static shape inference. " + "To export this model, set disable_dynamic_shapes to False"); } } } @@ -203,13 +169,9 @@ BackendManager::BackendManager(SessionContext& session_context, // precompiled blob is set. If that's the case: // By default, create model in embed mode where the blob stream is exported as data within // the EPContext node. -Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphViewer& graph_body_viewer) { - if (session_context_.disable_dynamic_shapes && subgraph_context_.has_dynamic_input_shape) { - std::string exception_str = - "Exporting dynamically compiled models at runtime is not supported. " - "Cannot export blobs of dynamic models that request static shape inference. 
" - "To export this model, set disable_dynamic_shapes to False"; - ORT_THROW(exception_str); +void BackendManager::TryExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphViewer& graph_body_viewer, bool include_embed_data) { + if (!ShouldExportEpContext(session_context_, subgraph_context_) || !concrete_backend_) { + return; } // If embed_mode, then pass on the serialized blob @@ -217,44 +179,22 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie std::string model_blob_str; auto compiled_model = concrete_backend_->GetOVCompiledModel(); if (session_context_.so_context_embed_mode) { // Internal blob - std::ostringstream model_blob_stream; - compiled_model.export_model(model_blob_stream); - model_blob_str = std::move(model_blob_stream).str(); - if (model_blob_str.empty()) { - ORT_THROW("Model blob stream is empty after exporting the compiled model."); + if (include_embed_data) { + std::stringstream ss; + shared_context_.Serialize(ss); + model_blob_str = std::move(ss).str(); } } else { // External blob - // Build name by combining EpCtx model name (if available) and subgraph name. Model - // name is not available in when creating a session from memory - auto name = session_context_.so_context_file_path.stem().string(); - if (name.empty() && !graph_body_viewer.ModelPath().empty()) { - name = graph_body_viewer.ModelPath().stem().string(); - } - ORT_ENFORCE(!name.empty()); - name += "_" + subgraph_context_.subgraph_name; - - std::filesystem::path blob_filename = session_context_.so_context_file_path; - if (blob_filename.empty()) { - blob_filename = session_context_.onnx_model_path_name; - } - blob_filename = blob_filename.parent_path() / (name + ".blob"); - std::ofstream blob_file(blob_filename, - std::ios::out | std::ios::trunc | std::ios::binary); - if (!blob_file) { - std::ostringstream err_msg; - err_msg << "Unable to open file for epctx model dump: " << blob_filename; - ORT_THROW(err_msg.str()); - } - compiled_model.export_model(blob_file); - model_blob_str = blob_filename.filename().string(); + model_blob_str = shared_context_.GetBinPath().filename().string(); } - ORT_RETURN_IF_ERROR(ep_ctx_handle_.AddOVEPCtxNodeToGraph(graph_body_viewer, - subgraph_context_.subgraph_name, - session_context_.so_context_embed_mode, - std::move(model_blob_str))); - - return Status::OK(); + auto status = ep_ctx_handle_.AddOVEPCtxNodeToGraph(graph_body_viewer, + subgraph_context_.subgraph_name, + session_context_.so_context_embed_mode, + std::move(model_blob_str)); + if (!status.IsOK()) { + ORT_THROW("[OpenVINO-EP] Failed to add OVEP EPContext node to the graph: " + status.ErrorMessage()); + } } bool BackendManager::ModelHasBatchedInputs(const ONNX_NAMESPACE::ModelProto& model_proto) const { @@ -382,18 +322,7 @@ static bool IsQDQGraph(const onnxruntime::GraphViewer& graph_viewer) { return false; } -static bool IsModelBF16(const onnxruntime::GraphViewer& graph_viewer) { - const auto& node_indices = graph_viewer.GetNodesInTopologicalOrder(); - for (std::size_t i = 0; i < node_indices.size(); i++) { - gsl::not_null node(graph_viewer.GetNode(node_indices[i])); - for (auto& output : node->OutputDefs()) { - if (output->ToProto().type().tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16) - return true; - } - } - return false; -} - +#if ((OPENVINO_VERSION_MAJOR < 2025) || ((OPENVINO_VERSION_MAJOR == 2025) && (OPENVINO_VERSION_MINOR < 0))) static bool Is16BitTensor(const onnxruntime::NodeArg* node_arg) { const auto* type_proto = node_arg ? 
node_arg->TypeAsProto() : nullptr; return type_proto && type_proto->has_tensor_type() && @@ -431,6 +360,7 @@ static bool IsQDQGraphWithUint16OrInt16(const onnxruntime::GraphViewer& graph_vi } return false; } +#endif static void DumpOpenVINOEPModel([[maybe_unused]] const std::filesystem::path& onnx_model_path_name, [[maybe_unused]] ONNX_NAMESPACE::ModelProto* model_proto, @@ -453,6 +383,80 @@ static void DumpOpenVINOEPModel([[maybe_unused]] const std::filesystem::path& on #endif } +// this is a helper function to set the data fields, it duplicates ExternalDataInfo::SetExternalLocationToProto +// but we cannot use that function as it is not part of public provider api. +static void SetExternalDataFields(ONNX_NAMESPACE::TensorProto* proto_init, const void* data_ptr, int64_t data_size) { + static constexpr const char* ORT_INTERNAL_MEM_INITIALIZER = "*/_ORT_MEM_ADDR_/*"; + auto* external_data = proto_init->mutable_external_data(); + bool found_location = false, found_offset = false, found_length = false; + const int ext_data_size = external_data->size(); + proto_init->set_data_location(ONNX_NAMESPACE::TensorProto_DataLocation::TensorProto_DataLocation_EXTERNAL); + + for (int j = 0; j < ext_data_size; ++j) { + auto& ext_entry = external_data->at(j); + auto& key = *ext_entry.mutable_key(); + if (key == "location") { + *ext_entry.mutable_value() = ORT_INTERNAL_MEM_INITIALIZER; + found_location = true; + } else if (key == "offset") { + *ext_entry.mutable_value() = std::to_string(reinterpret_cast(data_ptr)); + found_offset = true; + } else if (key == "length") { + *ext_entry.mutable_value() = std::to_string(data_size); + found_length = true; + } + } + + if (!found_location) { + auto* new_entry = external_data->Add(); + *new_entry->mutable_key() = "location"; + *new_entry->mutable_value() = ORT_INTERNAL_MEM_INITIALIZER; + } + if (!found_offset) { + auto* new_entry = external_data->Add(); + *new_entry->mutable_key() = "offset"; + *new_entry->mutable_value() = std::to_string(reinterpret_cast(data_ptr)); + } + if (!found_length) { + auto* new_entry = external_data->Add(); + *new_entry->mutable_key() = "length"; + *new_entry->mutable_value() = std::to_string(data_size); + } +} + +static void ReadExternalDataFields(const ONNX_NAMESPACE::TensorProto* src_init, std::string& location, size_t& offset, size_t& length) { + // Remove constness as we need to use mutable_external_data() to get the entries to read. + // The entries themselves are not modified... 
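  // Illustrative shape of the three entries handled below (values are made up; the key names and
  // the in-memory "location" marker match what SetExternalDataFields writes above):
  //   location -> "*/_ORT_MEM_ADDR_/*"   (data lives in memory rather than in a file)
  //   offset   -> "140234567890944"      (decimal string of the data pointer address)
  //   length   -> "4096"                 (byte size of the initializer data)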
+ auto& mutable_proto = *const_cast(src_init); + auto* entry_protos = mutable_proto.mutable_external_data(); + for (int i = 0; i < entry_protos->size(); i++) { + auto& string_entry_proto{entry_protos->at(i)}; + const auto& pb_key{*(string_entry_proto.mutable_key())}; + const auto& pb_value{*(string_entry_proto.mutable_value())}; + if (pb_key == "location") { + location = pb_value; + } else if (pb_key == "offset") { + const auto res = std::from_chars(pb_value.data(), pb_value.data() + pb_value.size(), offset); + if (res.ec != std::errc()) { + std::ostringstream err_msg; + err_msg << "External data in memory has invalid offset field: " + << src_init->name() << "], location: " << location + << ", offset: " << pb_value; + ORT_THROW(err_msg.str()); + } + } else if (pb_key == "length") { + const auto res = std::from_chars(pb_value.data(), pb_value.data() + pb_value.size(), length); + if (res.ec != std::errc()) { + std::ostringstream err_msg; + err_msg << "External data in memory has invalid length field: " + << src_init->name() << "], location: " << location + << ", length: " << pb_value; + ORT_THROW(err_msg.str()); + } + } + } +} + std::unique_ptr BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, const onnxruntime::GraphViewer& subgraph, @@ -490,24 +494,23 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, } #endif - // Check if the graph is QDQ and has int16 or uint16 quantization - // If so, we will apply the QDQ scales fix transformation (for GPU device only) - bool is_qdq_graph_uint16_or_int16 = IsQDQGraphWithUint16OrInt16(subgraph); - const auto& onnx_model_path_name = subgraph.ModelPath(); // QDQ stripping enabled only for the NPU and experimentally on the GPU if ((session_context_.device_type.find("NPU") != std::string::npos) && (enable_ovep_qdq_optimizer || session_context_.so_share_ep_contexts)) { std::unique_ptr model; - Status status = CreateModelWithStrippedQDQNodes(subgraph, logger, session_context_.so_share_ep_contexts, enable_ovep_qdq_optimizer, model, shared_context_.shared_weights); + Status status = CreateModelWithStrippedQDQNodes(subgraph, logger, session_context_.so_share_ep_contexts, enable_ovep_qdq_optimizer, model, shared_context_); auto model_proto = model->ToProto(); model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); print_model_proto_duration(); DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node); ORT_ENFORCE(status.IsOK(), status.ErrorMessage()); return model_proto; - } else if ((session_context_.device_type.find("GPU") != std::string::npos) && - is_qdq_graph_uint16_or_int16) { + } +#if ((OPENVINO_VERSION_MAJOR < 2025) || ((OPENVINO_VERSION_MAJOR == 2025) && (OPENVINO_VERSION_MINOR < 0))) + // Enable OVEP-level QDQ stripping only for OV versions that don't have it + else if ((session_context_.device_type.find("GPU") != std::string::npos) && + IsQDQGraphWithUint16OrInt16(subgraph)) { // Create a copy of the model std::unique_ptr model; Status status = qdq_scales_fix::Transform(subgraph, logger, model); @@ -517,24 +520,103 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node); ORT_ENFORCE(status.IsOK(), status.ErrorMessage()); return model_proto; - } else if (IsModelBF16(subgraph)) { - LOGS_DEFAULT(INFO) << "[OpenVINO-EP] OVEP bfloat16->float16 optimization pass is enabled"; - std::unique_ptr model; - Status status = bfloat16_fix::Transform(subgraph, logger, model); - auto 
model_proto = model->ToProto(); - model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); - print_model_proto_duration(); - DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node); - ORT_ENFORCE(status.IsOK(), status.ErrorMessage()); - return model_proto; - } else { + } +#endif + else { LOGS_DEFAULT(INFO) << "[OpenVINO-EP] OVEP QDQ optimization pass is disabled"; + + // scan ext initializers: + std::unordered_map> external_initializers_offset_and_length; + std::string tempLocation; + size_t extInitializerTotalSize = 0; + if (session_context_.has_external_weights && !subgraph_context_.has_dynamic_input_shape) { + auto allInitializers = subgraph.GetAllInitializedTensors(); + for (auto& [name, tp] : allInitializers) { + if (utils::HasExternalDataInMemory(*tp)) { + size_t offset = 0; + size_t length = 0; + ReadExternalDataFields(tp, tempLocation, offset, length); + extInitializerTotalSize += length; + external_initializers_offset_and_length[name] = {offset, length}; + } + } + } + + // when we have external weights in memory, the model proto will actually embed those + // and bloat the serialized string. We can avoid that by not including the data in the proto + // but then we have to update those initializers and set the external_data fields to mem_addr tag... + // proto is limited to 2GB, but let's use 32MB as threshold to be conservative and still gain some memory reductions. +#if (((OPENVINO_VERSION_MAJOR == 2025) && (OPENVINO_VERSION_MINOR > 3)) || (OPENVINO_VERSION_MAJOR > 2025)) + constexpr size_t MAX_EMBEDDED_INITIALIZER_SIZE = 1024 * 1024 * 32; + const bool include_initializer_data_in_proto = !(session_context_.has_external_weights && + external_initializers_offset_and_length.size() > 1 && + extInitializerTotalSize >= MAX_EMBEDDED_INITIALIZER_SIZE); +#else + const bool include_initializer_data_in_proto = true; +#endif + auto model = subgraph.CreateModel(logger); auto model_proto = model->ToProto(); model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); - subgraph.ToProto(*model_proto->mutable_graph(), true, true); + subgraph.ToProto(*model_proto->mutable_graph(), /*include_initializers*/ true, + /*include_outer_scope_args*/ true, /*execution_order*/ 0, /*include_initializer_data*/ include_initializer_data_in_proto); + print_model_proto_duration(); + + if (!include_initializer_data_in_proto) { + LOGS(logger, INFO) << "Initializer data is not included in the model proto. 
Updating metadata..., total size " << extInitializerTotalSize / (1024 * 1024) << " MB in " << external_initializers_offset_and_length.size() << " initializers"; + auto* graph_proto = model_proto->mutable_graph(); + auto* proto_initializers = graph_proto->mutable_initializer(); + + std::unordered_map proto_initializer_map; + for (int i = 0, n = proto_initializers->size(); i < n; ++i) { + auto& proto_init = proto_initializers->at(i); + proto_initializer_map[proto_init.name()] = &proto_init; + } + + for (const auto& [name, src_init] : subgraph.GetAllInitializedTensors()) { + auto it = proto_initializer_map.find(name); + if (it == proto_initializer_map.end()) + continue; + + auto* proto_init = it->second; + + // If the proto initializer is missing data, fill it in + if (!proto_init->has_raw_data() && src_init->has_raw_data()) { + *proto_init->mutable_raw_data() = src_init->raw_data(); + } + + // Only set in-memory external_data fields if the data is in memory + if (src_init->has_raw_data()) { + LOGS(logger, VERBOSE) << "In-memory initializer RAW: " + << src_init->name() + << ", data_type: " << src_init->data_type() + << ", raw_data size: " << src_init->raw_data().size(); + if (src_init->raw_data().size() > 0) + SetExternalDataFields(proto_init, src_init->raw_data().data(), src_init->raw_data().size()); + else + LOGS(logger, VERBOSE) << "Initializer has empty raw_data: skipping initializer '" << src_init->name() << "'..."; + } else if (onnxruntime::utils::HasExternalDataInMemory(*src_init)) { + auto it_ext = external_initializers_offset_and_length.find(name); + if (it_ext == external_initializers_offset_and_length.end()) { + std::ostringstream err_msg; + err_msg << "Initializer marked as external in memory but missing offset/length info: " << src_init->name(); + ORT_THROW(err_msg.str()); + } + const size_t offset = it_ext->second.first; + const size_t length = it_ext->second.second; + + LOGS(logger, VERBOSE) << "In-memory initializer EXT: " << src_init->name() << ", size: " << length; + + SetExternalDataFields(proto_init, (const void*)offset, length); + } else { + LOGS(logger, VERBOSE) << "File-based initializer: " << src_init->name() << ", data_type: " << src_init->data_type(); + } + } + } + DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node); + return model_proto; } } @@ -672,7 +754,10 @@ void BackendManager::Compute(OrtKernelContext* context) { { std::unique_lock lock(mutex_); - dynamic_backend = backend_map_[key]; + auto it = backend_map_.find(key); + if (it != backend_map_.end()) { + dynamic_backend = it->second; + } } if (!dynamic_backend) { @@ -712,7 +797,24 @@ void BackendManager::Compute(OrtKernelContext* context) { ORT_THROW(msg); } } else { - ORT_THROW(ex.what()); + std::string exception_str = ex.what(); + if (session_context_.so_disable_cpu_ep_fallback) { + std::string error_message = "UNKNOWN NPU ERROR"; + std::string error_code = "code 0x0"; + std::regex error_message_pattern(R"(\bZE_\w*\b)"); + std::regex error_code_pattern("code 0x[0-9a-fA-F]+"); + std::smatch matches; + if (std::regex_search(exception_str, matches, error_message_pattern)) { + error_message = matches[0]; + } + if (std::regex_search(exception_str, matches, error_code_pattern)) { + error_code = matches[0]; + } + std::string suffix = "\nModel failed to compile on NPU. 
Enable CPU fallback or try another device.\n"; + throw std::runtime_error(error_message + ", " + error_code + suffix); + } else { + ORT_THROW(exception_str); + } } #endif } @@ -746,4 +848,4 @@ void BackendManager::RewindKVCache(size_t index) { } } // namespace openvino_ep -} // namespace onnxruntime +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/providers/openvino/backend_manager.h b/onnxruntime/core/providers/openvino/backend_manager.h index f091f95fe1c16..716fe3ef4cc90 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.h +++ b/onnxruntime/core/providers/openvino/backend_manager.h @@ -28,7 +28,7 @@ class BackendManager { void Compute(OrtKernelContext* context); void ShutdownBackendManager(); SessionContext& GetSessionContext(); - Status ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphViewer& subgraph); + void TryExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphViewer& subgraph, bool include_embed_data); ov::CompiledModel GetOVCompiledModel(); void RewindKVCache(size_t index); diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index 7027861f0c4dc..45e518d16686e 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -20,104 +20,6 @@ using Exception = ov::Exception; namespace onnxruntime { namespace openvino_ep { -SharedContext::SharedWeights::WeightsFile::WeightsFile(std::filesystem::path filename) : file_(filename, std::ios::in | std::ios::binary) { - try { - file_.exceptions(std::ifstream::failbit | std::ifstream::badbit); - weights_size_ = file_.seekg(0, std::ios::end).tellg(); - } catch (std::ifstream::failure& e) { - ORT_THROW("Error: Failed to open weight file at ", filename.string(), " ", e.what()); - } -} - -void SharedContext::SharedWeights::WeightsFile::load_weights(size_t file_offset, void* data, size_t size) { - ORT_ENFORCE(file_offset < weights_size_ && size <= weights_size_ && (file_offset <= weights_size_ - size), "Error: File offset is out of bounds."); - file_.seekg(file_offset); - file_.read(reinterpret_cast(data), size); -} - -std::ostream& operator<<(std::ostream& stream, const SharedContext::SharedWeights::Metadata::Map& metadata) { - try { - stream << metadata.size(); - - // Write each key-value pair - // Put elements in separate lines to facilitate reading - for (const auto& [key, value] : metadata) { - stream << std::endl - << key.name; - stream << std::endl - << value.location; - stream << std::endl - << value.data_offset; - stream << std::endl - << value.size; - stream << std::endl - << value.dimensions.size(); - for (const auto& dim : value.dimensions) { - stream << std::endl - << dim; - } - stream << std::endl - << value.element_type; - } - } catch (const Exception& e) { - ORT_THROW("Error: Failed to write map data.", e.what()); - } catch (...) 
{ - ORT_THROW("Error: Failed to write map data."); - } - - ORT_ENFORCE(stream.good(), "Error: Failed to write map data."); - return stream; -} - -std::istream& operator>>(std::istream& stream, SharedContext::SharedWeights::Metadata::Map& metadata) { - size_t map_size{0}; - try { - stream >> map_size; - - while (!stream.eof()) { - SharedContext::SharedWeights::Metadata::Key key; - SharedContext::SharedWeights::Metadata::Value value; - stream >> key.name; - stream >> value.location; - stream >> value.data_offset; - stream >> value.size; - size_t num_dimensions; - stream >> num_dimensions; - - if (stream.fail()) { - ORT_THROW("Error: Failed to read num_dimensions from stream."); - } - - constexpr size_t MAX_SAFE_DIMENSIONS = 1024; - - size_t safe_num_dimensions = num_dimensions; - - if (num_dimensions == 0 || safe_num_dimensions > MAX_SAFE_DIMENSIONS) { - ORT_THROW("Invalid number of dimensions provided."); - } - try { - value.dimensions.resize(safe_num_dimensions); - } catch (const std::bad_alloc&) { - ORT_THROW("Error: Memory allocation failed while resizing dimensions."); - } - - for (auto& dim : value.dimensions) { - stream >> dim; - } - stream >> value.element_type; - metadata.emplace(key, value); - } - } catch (const Exception& e) { - ORT_THROW("Error: Failed to read map data.", e.what()); - } catch (...) { - ORT_THROW("Error: Failed to read map data."); - } - - ORT_ENFORCE(metadata.size() == map_size, "Error: Inconsistent map data."); - - return stream; -} - namespace backend_utils { bool IsDebugEnabled() { @@ -364,82 +266,10 @@ void printPerformanceCounts(const std::vector& performanceMap, } void printPerformanceCounts(OVInferRequestPtr request, std::ostream& stream, std::string deviceName) { - auto performanceMap = request->GetNewObj().get_profiling_info(); + auto performanceMap = request->GetInfReq().get_profiling_info(); printPerformanceCounts(performanceMap, stream, std::move(deviceName)); } -ov::element::Type GetOpenVINOElementType(ONNX_NAMESPACE::TensorProto_DataType dt) { - static std::unordered_map map{ - {ONNX_NAMESPACE::TensorProto_DataType_FLOAT, ov::element::f32}, - {ONNX_NAMESPACE::TensorProto_DataType_UINT8, ov::element::u8}, - {ONNX_NAMESPACE::TensorProto_DataType_INT8, ov::element::i8}, - {ONNX_NAMESPACE::TensorProto_DataType_UINT16, ov::element::u16}, - {ONNX_NAMESPACE::TensorProto_DataType_INT16, ov::element::i16}, - {ONNX_NAMESPACE::TensorProto_DataType_INT32, ov::element::i32}, - {ONNX_NAMESPACE::TensorProto_DataType_INT64, ov::element::i64}, - {ONNX_NAMESPACE::TensorProto_DataType_STRING, ov::element::string}, - {ONNX_NAMESPACE::TensorProto_DataType_BOOL, ov::element::boolean}, - {ONNX_NAMESPACE::TensorProto_DataType_FLOAT16, ov::element::f16}, - {ONNX_NAMESPACE::TensorProto_DataType_DOUBLE, ov::element::f64}, - {ONNX_NAMESPACE::TensorProto_DataType_UINT32, ov::element::u32}, - {ONNX_NAMESPACE::TensorProto_DataType_UINT64, ov::element::u64}, - //{ONNX_NAMESPACE::TensorProto_DataType_COMPLEX64, ov::element::undefined}, - //{ONNX_NAMESPACE::TensorProto_DataType_COMPLEX128, ov::element::undefined}, - {ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16, ov::element::bf16}, - //{ONNX_NAMESPACE::TensorProto_DataType_FLOAT8E4M3FN, ov::element::undefined}, - //{ONNX_NAMESPACE::TensorProto_DataType_FLOAT8E4M3FNUZ, ov::element::undefined}, - {ONNX_NAMESPACE::TensorProto_DataType_FLOAT8E5M2, ov::element::f8e5m2}, - //{ONNX_NAMESPACE::TensorProto_DataType_FLOAT8E5M2FNUZ, ov::element::undefined}, - {ONNX_NAMESPACE::TensorProto_DataType_UINT4, ov::element::u4}, - 
{ONNX_NAMESPACE::TensorProto_DataType_INT4, ov::element::i4}, - }; - - if (auto result = map.find(dt); result != map.end()) { - return result->second; - } else { - throw std::runtime_error("Unsupported ONNX data type: " + std::to_string(dt)); - } -} - -// Function to handle tensor creation from external data -void CreateOVTensors(const std::string& device_name, - SharedContext::SharedWeights::Metadata::Map& metadata_map, - SharedContext::SharedWeights::WeightsFile& weights) { - for (auto& [key, value] : metadata_map) { - if (value.tensor) continue; - - // Get element data type - auto onnx_element_type = (ONNX_NAMESPACE::TensorProto_DataType)value.element_type; - - ov::element::Type ov_elementType = GetOpenVINOElementType(onnx_element_type); // Map to OpenVINO data type - - // Create OpenVINO Tensor - if (device_name == "NPU") { - // Use remote tensors - auto npu_context = OVCore::Get()->core.get_default_context("NPU").as(); - auto&& remote_tensor = npu_context.create_l0_host_tensor(ov_elementType, value.dimensions, ov::intel_npu::TensorType::INPUT); - - // Copy data to remote tensor - weights.load_weights(value.data_offset, remote_tensor.get(), value.size); - value.tensor = std::make_shared(remote_tensor); - } else { - // Use vanilla tensors - value.tensor = std::make_shared(ov_elementType, value.dimensions); - weights.load_weights(value.data_offset, value.tensor->data(), value.size); - } - ORT_ENFORCE(value.tensor->get_byte_size() == value.size, "Unexpected tensor size mismatch"); - } -} - -void DestroyOVTensors(SharedContext::SharedWeights::Metadata::Map& metadata_map) { - for (auto& [key, value] : metadata_map) { - if (value.tensor) { - value.tensor.reset(); - } - } - metadata_map.clear(); -} - bool IsModelStreamXML(std::istream& model_stream) { std::streampos originalPos = model_stream.tellg(); diff --git a/onnxruntime/core/providers/openvino/backend_utils.h b/onnxruntime/core/providers/openvino/backend_utils.h index 27f791c7a5bd1..8ba35e0abd1bc 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.h +++ b/onnxruntime/core/providers/openvino/backend_utils.h @@ -99,11 +99,6 @@ CreateOVModel(std::string&& model, const SessionContext& session_context, std::map>& const_outputs_map); -void CreateOVTensors(const std::string& device_name, - SharedContext::SharedWeights::Metadata::Map& metadata_map, - SharedContext::SharedWeights::WeightsFile& weights); -void DestroyOVTensors(SharedContext::SharedWeights::Metadata::Map& metadata_map); - void printPerformanceCounts(const std::vector& performanceMap, std::ostream& stream, std::string deviceName); diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index 2f174110dd31b..d7fc0553fb1d4 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -138,20 +138,13 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr } int num_infer_req = (session_context_.num_of_threads > 0) ? 
session_context_.num_of_threads : 1; std::function initializer = [](OVInferRequestPtr) {}; - auto metadata = shared_context_.shared_weights.metadata; if (session_context_.so_share_ep_contexts) { - initializer = [&metadata](OVInferRequestPtr ir_ptr) { - const auto input_count = ir_ptr->GetNumInputs(); - for (auto i = 0u; i < input_count; i++) { - using Key = SharedContext::SharedWeights::Metadata::Key; - const auto tensor_key = Key{ir_ptr->GetInputTensorName(i)}; - if (metadata.contains(tensor_key)) { - auto& value = metadata.at(tensor_key); - ir_ptr->SetTensor(tensor_key.name, value.tensor); - } - } + auto model_dir = session_context_.GetModelPath().parent_path(); + initializer = [this, model_dir = std::move(model_dir)](OVInferRequestPtr ir_ptr) { + shared_context_.SetSharedWeightsOnInferRequest(ir_ptr->GetInfReq(), model_dir); }; } + infer_req_pool_ = std::make_unique(exe_network_, num_infer_req, std::move(initializer)); bindings_ = std::make_unique(exe_network_, subgraph_context_, session_context_); } @@ -242,13 +235,13 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { } } -void BasicBackend::EnableCaching() { +void BasicBackend::EnableCaching(ov::AnyMap& device_config) { // cache_dir argument has no effect when working with an embed-mode EPContext Graph if (subgraph_context_.is_ep_ctx_graph) return; if (!session_context_.cache_dir.empty() && !session_context_.so_context_enable) { LOGS_DEFAULT(INFO) << log_tag << "Enables Caching"; - OVCore::Get()->SetCache(session_context_.cache_dir.string()); + device_config.emplace(ov::cache_dir(session_context_.cache_dir.string())); } } @@ -262,7 +255,7 @@ void BasicBackend::EnableGPUThrottling(ov::AnyMap& device_config) { } } -void BasicBackend::EnableStreams() { +void BasicBackend::EnableStreams(ov::AnyMap& device_config) { // Return silently for NPU as it's currently treated as a read-only flag by the NPU plugin // and throws an exception for the same if (session_context_.device_type.find("NPU") != std::string::npos) @@ -279,7 +272,7 @@ void BasicBackend::EnableStreams() { } // Do nothing } else { - OVCore::Get()->SetStreams(session_context_.device_type, session_context_.num_streams); + device_config.emplace(ov::num_streams(session_context_.num_streams)); } } @@ -293,13 +286,13 @@ void BasicBackend::SetOVDeviceConfiguration(ov::AnyMap& device_config) { PopulateConfigValue(device_config); // Enable caching - EnableCaching(); + EnableCaching(device_config); // Setting OpenCL queue throttling for GPU EnableGPUThrottling(device_config); // Enable streams; default=1 unless overridden by user configuration - EnableStreams(); + EnableStreams(device_config); // Set the inference_num_threads property of the CPU SetNumThreads(device_config); diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.h b/onnxruntime/core/providers/openvino/backends/basic_backend.h index 5c75a9ae183e2..2cf3d3faa8b47 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.h +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.h @@ -142,9 +142,9 @@ class BasicBackend : public IBackend { private: bool ValidateSubgraph(std::map>& const_outputs_map); void PopulateConfigValue(ov::AnyMap& device_config); - void EnableCaching(); + void EnableCaching(ov::AnyMap& device_config); void EnableGPUThrottling(ov::AnyMap& device_config); - void EnableStreams(); + void EnableStreams(ov::AnyMap& device_config); void SetNumThreads(ov::AnyMap& device_config); void SetOVDeviceConfiguration(ov::AnyMap& device_config); void 
ValidateOrtDimsAgainstPartialShape(const std::vector& ort_dims, diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index 07b09899ac214..ebb716a64162c 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -13,61 +13,14 @@ #include "core/common/common.h" #include "core/providers/openvino/ov_interface.h" #include "core/providers/shared_library/provider_api.h" +#include "ov_bin_manager.h" +#include "ov_shared_context.h" namespace onnxruntime { namespace openvino_ep { namespace fs = std::filesystem; -class SharedContext : public WeakSingleton { - // Keep the core alive as long as the shared SharedContext are alive. - std::shared_ptr OVCore_; - - public: - SharedContext() : OVCore_(OVCore::Get()) {} - struct SharedWeights { - struct Metadata { - struct Key { - std::string name; - bool operator==(const Key&) const = default; - }; - struct Hash { - std::size_t operator()(const Key& key) const noexcept { - return std::hash()(key.name); - } - }; - struct Value { - std::string location; - unsigned int data_offset; - unsigned int size; - std::vector dimensions; - std::int32_t element_type; - std::shared_ptr tensor; - }; - using Map = std::unordered_map; - friend std::ostream& operator<<(std::ostream& right, const Metadata::Map& metadata); - friend std::istream& operator>>(std::istream& right, Metadata::Map& metadata); - }; - - struct WeightsFile { - ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(WeightsFile); - WeightsFile() = delete; - explicit WeightsFile(std::filesystem::path filename); - - void load_weights(size_t file_offset, void* data, size_t size); - - private: - std::ifstream file_; - size_t weights_size_; - }; - - fs::path external_weight_filename; - std::unique_ptr mapped_weights; - Metadata::Map metadata; - fs::path metadata_filepath; - } shared_weights; -}; - using config_t = std::map; using reshape_t = std::map; using layout_t = std::map; @@ -108,6 +61,7 @@ struct ProviderInfo { bool so_disable_cpu_ep_fallback{false}; // ORT session option bool so_context_embed_mode{false}; // ORT session option bool so_share_ep_contexts{false}; // ORT session option + bool so_stop_share_ep_contexts{false}; // ORT session option fs::path so_context_file_path{}; // ORT session option const ConfigOptions* config_options{NULL}; const std::unordered_set valid_provider_keys = {"device_type", "device_id", "device_luid", "cache_dir", "precision", @@ -115,9 +69,20 @@ struct ProviderInfo { "enable_causallm", "disable_dynamic_shapes", "reshape_input", "layout"}; }; +struct RuntimeConfig { + std::unordered_map options; + std::optional Get(const std::string& key) const { + auto it = options.find(key); + return it != options.end() ? std::optional{it->second} : std::nullopt; + } +}; + // Holds context applicable to the entire EP instance. 
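// It extends ProviderInfo with the resolved model/blob paths and copies the session-option map
// into runtime_config, so a session config entry can be looked up later, e.g.
//   if (auto v = session_context.runtime_config.Get("ep.context_enable")) { /* use *v */ }
// (the key shown is only an illustration of the lookup pattern, not a required option).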
struct SessionContext : ProviderInfo { - SessionContext(const ProviderInfo& info) : ProviderInfo{info} {} + SessionContext(const ProviderInfo& info) : ProviderInfo{info} { + InitRuntimeConfig(); + } + std::vector deviceAvailableList = {true, true, true, true, true, true, true, true}; std::filesystem::path onnx_model_path_name; uint32_t onnx_opset_version{0}; @@ -125,6 +90,31 @@ struct SessionContext : ProviderInfo { mutable bool has_external_weights = false; // Value is set to mutable to modify from capability const std::vector OpenVINO_Version = {OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR}; const std::string openvino_sdk_version = std::to_string(OPENVINO_VERSION_MAJOR) + "." + std::to_string(OPENVINO_VERSION_MINOR); + + RuntimeConfig runtime_config; + + const std::filesystem::path& GetModelPath() const { + return onnx_model_path_name.empty() ? so_context_file_path : onnx_model_path_name; + } + + const std::filesystem::path& GetOutputModelPath() const { + return so_context_file_path.empty() ? onnx_model_path_name : so_context_file_path; + } + + std::filesystem::path GetOutputBinPath() const { + const auto& bin_file_name = GetOutputModelPath(); + if (bin_file_name.empty()) { + return {}; + } + return BinManager::GetBinPathForModel(bin_file_name); + } + + private: + void InitRuntimeConfig() { + if (config_options) { + runtime_config.options = config_options->GetConfigOptionsMap(); + } + } }; // Holds context specific to subgraph. diff --git a/onnxruntime/core/providers/openvino/exceptions.h b/onnxruntime/core/providers/openvino/exceptions.h new file mode 100644 index 0000000000000..140ab1ac688ba --- /dev/null +++ b/onnxruntime/core/providers/openvino/exceptions.h @@ -0,0 +1,88 @@ +// Copyright (C) Intel Corporation +// Licensed under the MIT License + +#pragma once + +#include +#include +#include + +#include "core/common/status.h" + +namespace onnxruntime { +namespace openvino_ep { + +struct ovep_exception : public std::exception { + enum class type { + compile_model, + import_model, + query_prop, + read_model, + unknown, + }; + + ovep_exception(const std::exception& ex, enum class type exception_type) + : message_{ex.what()}, + type_{exception_type}, + error_code_{ze_result_code_from_string(message_)}, + error_name_{ze_result_name_from_string(message_)} {} + + ovep_exception(const std::string& message, enum class type exception_type) + : message_{message}, + type_{exception_type}, + error_code_{ze_result_code_from_string(message)}, + error_name_{ze_result_name_from_string(message)} {} + + const char* what() const noexcept override { + return message_.data(); + } + + uint32_t get_code() const { return error_code_; } + + operator common::Status() const { + common::StatusCategory category_ort{common::ONNXRUNTIME}; + + if (type_ == type::unknown) { + return {category_ort, common::FAIL, message_}; + } + + // Newer drivers + if ((type_ == type::import_model) && + (error_code_ == 0x7800000f /* ZE_RESULT_ERROR_INVALID_NATIVE_BINARY */)) { + std::string message{error_name_ + ", code 0x" + std::to_string(error_code_) + "\nModel needs to be recompiled\n"}; + return {category_ort, common::INVALID_GRAPH, message}; + } + + std::string error_message = "Unhandled exception type: " + std::to_string(static_cast(type_)); + return {category_ort, common::EP_FAIL, error_message}; + } + + protected: + std::string message_; + type type_{type::unknown}; + uint32_t error_code_{0}; + std::string error_name_; + + private: + uint32_t ze_result_code_from_string(const std::string& ov_exception_string) { + uint32_t 
error_code{0}; + std::regex error_code_pattern("code 0x([0-9a-fA-F]+)"); + std::smatch matches; + if (std::regex_search(ov_exception_string, matches, error_code_pattern)) { + std::from_chars(&(*matches[1].first), &(*matches[1].second), error_code, 16); + } + return error_code; + } + std::string ze_result_name_from_string(const std::string& ov_exception_string) { + std::string error_message = "UNKNOWN NPU ERROR"; + std::regex error_message_pattern(R"(\bZE_\w*\b)"); + std::smatch matches; + if (std::regex_search(ov_exception_string, matches, error_message_pattern)) { + error_message = matches[0]; + } + return error_message; + } +}; + +} // namespace openvino_ep +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc index 051a39bd4f205..60a461f7159f3 100644 --- a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc +++ b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc @@ -12,32 +12,11 @@ namespace onnxruntime { namespace openvino_ep { -EPCtxHandler::EPCtxHandler(std::string ov_sdk_version, const logging::Logger& logger) : openvino_sdk_version_(std::move(ov_sdk_version)), logger_(logger) { - epctx_model_ = Model::Create("ovep_context_model", false, logger_); -} - -/* Export the serialized blob string embedded onto an EPContext Node - * along with other metadata necessary to validate the graph on import - */ +EPCtxHandler::EPCtxHandler(std::string ov_sdk_version, const logging::Logger& logger, std::shared_ptr shared_context_manager) + : openvino_sdk_version_(std::move(ov_sdk_version)), logger_(logger), shared_context_manager_(std::move(shared_context_manager)) { + ORT_ENFORCE(shared_context_manager_ != nullptr, "SharedContextManager pointer is null in EPCtxHandler constructor."); -Status EPCtxHandler::ExportEPCtxModel(const std::string& model_name) { - // Serialize modelproto to string - auto model_proto = epctx_model_->ToProto(); - model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); - - // Finally, dump the model - std::ofstream epctx_onnx_model(model_name, - std::ios::out | std::ios::trunc | std::ios::binary); - if (!epctx_onnx_model) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unable to create epctx onnx model file"); - } - - if (!model_proto->SerializeToOstream(epctx_onnx_model)) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to serialize model to file"); - } - LOGS_DEFAULT(VERBOSE) << "[OpenVINO EP] Export blob as EPContext Node"; - - return Status::OK(); + epctx_model_ = Model::Create("ovep_context_model", false, logger_); } Status EPCtxHandler::AddOVEPCtxNodeToGraph(const GraphViewer& graph_viewer, @@ -59,7 +38,7 @@ Status EPCtxHandler::AddOVEPCtxNodeToGraph(const GraphViewer& graph_viewer, // Create EP context node attributes auto node_attributes = ONNX_NAMESPACE::NodeAttributes::Create(); - node_attributes->reserve(4); + node_attributes->reserve(6); { // Create EP context node attributes @@ -70,6 +49,13 @@ Status EPCtxHandler::AddOVEPCtxNodeToGraph(const GraphViewer& graph_viewer, embed_mode_attr->set_i(embed_mode); node_attributes->emplace(EMBED_MODE, std::move(*embed_mode_attr)); + // main context + auto main_graph_attr = ONNX_NAMESPACE::AttributeProto::Create(); + main_graph_attr->set_name(MAIN_CONTEXT); + main_graph_attr->set_type(onnx::AttributeProto_AttributeType_INT); + main_graph_attr->set_i(model_blob_str.empty() ? 
0 : 1); + node_attributes->emplace(MAIN_CONTEXT, std::move(*main_graph_attr)); + // ep context auto ep_cache_context_attr = ONNX_NAMESPACE::AttributeProto::Create(); ep_cache_context_attr->set_name(EP_CACHE_CONTEXT); @@ -90,6 +76,13 @@ Status EPCtxHandler::AddOVEPCtxNodeToGraph(const GraphViewer& graph_viewer, source_attr->set_type(onnx::AttributeProto_AttributeType_STRING); source_attr->set_s(kOpenVINOExecutionProvider); node_attributes->emplace(SOURCE, std::move(*source_attr)); + + // partition name + auto partition_name_attr = ONNX_NAMESPACE::AttributeProto::Create(); + partition_name_attr->set_name(PARTITION_NAME); + partition_name_attr->set_type(onnx::AttributeProto_AttributeType_STRING); + partition_name_attr->set_s(graph_name); + node_attributes->emplace(PARTITION_NAME, std::move(*partition_name_attr)); } // Create EP context node @@ -100,8 +93,7 @@ Status EPCtxHandler::AddOVEPCtxNodeToGraph(const GraphViewer& graph_viewer, return Status::OK(); } -std::unique_ptr -EPCtxHandler::GetModelBlobStream(const std::filesystem::path& so_context_file_path, const GraphViewer& graph_viewer) const { +std::unique_ptr EPCtxHandler::GetModelBlobStream(const std::filesystem::path& so_context_file_path, const GraphViewer& graph_viewer) const { auto first_index = *graph_viewer.GetNodesInTopologicalOrder().begin(); auto node = graph_viewer.GetNode(first_index); ORT_ENFORCE(node != nullptr); @@ -130,16 +122,23 @@ EPCtxHandler::GetModelBlobStream(const std::filesystem::path& so_context_file_pa bool isXML = backend_utils::IsModelStreamXML(*result); std::filesystem::path native_blob_path{}; if (!isXML) { + ORT_ENFORCE(attrs.count(PARTITION_NAME) == 1, "Expected partition name for native ep context node"); + const auto& partition_name = attrs.at(PARTITION_NAME).s(); + // If the model stream is not an XML (i.e. precompiled blob), the OpenVINO SDK version that it was // exported with must match the version that is currently running. native_blob_path = std::move(blob_filepath); ORT_ENFORCE((attrs.count(EP_SDK_VER) == 1) && (attrs.at(EP_SDK_VER).s() == openvino_sdk_version_), "EPCtx blob was exported / is compatible with OpenVINO SDK version " + attrs.at(EP_SDK_VER).s() + ", but OpenVINO SDK version currently in use is " + openvino_sdk_version_); + + result.reset(); // Release the stream as we will get the native blob from SharedContext + auto shared_context = shared_context_manager_->GetOrCreateSharedContext(native_blob_path); + return std::make_unique(shared_context->GetNativeBlobAsStream(partition_name), shared_context->GetNativeBlob(partition_name)); } LOGS_DEFAULT(VERBOSE) << "[OpenVINO EP] Read blob from EPContext Node"; - return std::make_unique(std::move(result), native_blob_path); + return std::make_unique(std::move(result), ov::Tensor()); } bool EPCtxHandler::CheckForOVEPCtxNodeInGraph(const GraphViewer& graph_viewer) const { @@ -196,5 +195,76 @@ bool EPCtxHandler::CheckEPCacheContextAttribute(const GraphViewer& graph_viewer, return false; } +std::shared_ptr EPCtxHandler::Initialize(const std::vector& fused_nodes, const SessionContext& session_context) { + bool has_embed_nodes = false; + bool has_non_embed_nodes = false; + bool has_main_context = false; + + std::shared_ptr shared_context{}; + for (const auto& fused_node_graph : fused_nodes) { + const GraphViewer& graph_viewer = fused_node_graph.filtered_graph; + + // Only process graphs that contain ep context nodes. 
+ if (!CheckForOVEPCtxNodeInGraph(graph_viewer)) { + continue; + } + + auto first_index = *graph_viewer.GetNodesInTopologicalOrder().begin(); + const Node* node = graph_viewer.GetNode(first_index); + ORT_ENFORCE(node != nullptr, "Node pointer is null despite CheckForOVEPCtxNodeInGraph returning true"); + + auto& attrs = node->GetAttributes(); + ORT_ENFORCE(attrs.count(EP_CACHE_CONTEXT) == 1, "EP_CACHE_CONTEXT attribute missing"); + + bool embed_mode = false; + if (attrs.count(EMBED_MODE) == 1) { + embed_mode = static_cast(attrs.at(EMBED_MODE).i()); + } + + bool main_context = true; + if (attrs.count(MAIN_CONTEXT) == 1) { + main_context = static_cast(attrs.at(MAIN_CONTEXT).i()); + } + + has_main_context |= main_context; + has_embed_nodes |= embed_mode; + has_non_embed_nodes |= !embed_mode; + + const std::string& ep_cache_context = attrs.at(EP_CACHE_CONTEXT).s(); + if (embed_mode) { + std::filesystem::path dummy_path{}; + shared_context = shared_context_manager_->GetOrCreateSharedContext(dummy_path); + if (main_context) { + ORT_ENFORCE(!ep_cache_context.empty(), "Embedded EP context is indicated but EP_CACHE_CONTEXT attribute is empty."); + std::istringstream ss(ep_cache_context); + shared_context->Deserialize(ss); + } + } else { + std::filesystem::path ep_context_path = session_context.GetOutputModelPath().parent_path() / ep_cache_context; + if (ep_context_path.extension() != ".xml") { + shared_context = shared_context_manager_->GetOrCreateSharedContext(ep_context_path); + shared_context->Deserialize(); + } + } + } + + ORT_ENFORCE(!(has_embed_nodes && has_non_embed_nodes), + "Mixed embed and non-embed EP context nodes are not supported in a single model."); + ORT_ENFORCE(!(has_embed_nodes && !has_main_context), + "Expected at least one main context node when embedded EP context nodes are present."); + + // No ep context nodes found - create a shared context that can hold native blobs or shared weights. + if (!shared_context) { + if (session_context.so_context_enable && session_context.so_share_ep_contexts) { + // We're creating a shared ep context model get or create the active context. + shared_context = shared_context_manager_->GetOrCreateActiveSharedContext(session_context.GetOutputBinPath()); + } else { + shared_context = shared_context_manager_->GetOrCreateSharedContext(session_context.GetOutputBinPath()); + } + } + + return shared_context; +} + } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h index f207f5014ca1f..fce88005a0605 100644 --- a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h +++ b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h @@ -8,43 +8,49 @@ #include #include "core/providers/shared_library/provider_api.h" +#include "core/framework/execution_provider.h" +#include "ov_shared_context.h" +#include "contexts.h" namespace onnxruntime { namespace openvino_ep { struct ModelBlobWrapper { - ModelBlobWrapper(std::unique_ptr stream, const std::filesystem::path& native_blob_path) : stream_(std::move(stream)), maybe_native_blob_path_(native_blob_path) {} + ModelBlobWrapper(std::unique_ptr stream, const ov::Tensor& tensor) : stream_(std::move(stream)), tensor_(tensor) {} std::unique_ptr stream_; - std::filesystem::path maybe_native_blob_path_; + ov::Tensor tensor_; // May be empty if model blob is provided as stream only. 
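+  // Usage note (summarizing the call sites, not enforced by the struct itself):
+  // EPCtxHandler::GetModelBlobStream fills tensor_ (plus a TensorStream view in stream_)
+  // for precompiled native blobs served by SharedContext, and only stream_ (with an empty
+  // tensor_) for XML/ONNX payloads; OVCore::ImportModel prefers tensor_ when it is non-empty.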
}; // Utilities to handle EPContext node export and parsing of an EPContext node // to create the compiled_model object to infer on static const char EPCONTEXT_OP[] = "EPContext"; static const char EMBED_MODE[] = "embed_mode"; +static const char MAIN_CONTEXT[] = "main_context"; +static const char PARTITION_NAME[] = "partition_name"; static const char EP_CACHE_CONTEXT[] = "ep_cache_context"; static const char EP_SDK_VER[] = "ep_sdk_version"; static const char SOURCE[] = "source"; class EPCtxHandler { public: - EPCtxHandler(std::string ov_sdk_version, const logging::Logger& logger); + EPCtxHandler(std::string ov_sdk_version, const logging::Logger& logger, std::shared_ptr shared_context_manager); EPCtxHandler(const EPCtxHandler&) = delete; // No copy constructor - Status ExportEPCtxModel(const std::string& model_name); - bool CheckForOVEPCtxNodeInGraph(const GraphViewer& graph_viewer) const; + bool CheckForOVEPCtxNodeInGraph(const GraphViewer& subgraph_view) const; bool CheckForOVEPCtxNode(const Node& node) const; - Status AddOVEPCtxNodeToGraph(const GraphViewer& graph_viewer, + Status AddOVEPCtxNodeToGraph(const GraphViewer& subgraph_view, const std::string& graph_name, const bool embed_mode, std::string&& model_blob_str) const; - std::unique_ptr GetModelBlobStream(const std::filesystem::path& so_context_file_path, const GraphViewer& graph_viewer) const; + std::unique_ptr GetModelBlobStream(const std::filesystem::path& so_context_file_path, const GraphViewer& subgraph_view) const; InlinedVector GetEPCtxNodes() const; - bool CheckEPCacheContextAttribute(const GraphViewer& graph_viewer, const std::string& target_attr_extn) const; + bool CheckEPCacheContextAttribute(const GraphViewer& subgraph_view, const std::string& target_attr_extn) const; + std::shared_ptr Initialize(const std::vector& fused_nodes, const SessionContext& session_context); private: const std::string openvino_sdk_version_; std::unique_ptr epctx_model_; const logging::Logger& logger_; + std::shared_ptr shared_context_manager_; }; } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index a0fa885cbfc38..a099f85b2a4b9 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -12,15 +12,19 @@ #include "core/providers/openvino/onnx_ctx_model_helper.h" #include "core/providers/openvino/ov_versions/capability.h" #include "core/providers/openvino/qdq_transformations/qdq_stripping.h" +#include "core/providers/openvino/exceptions.h" #include "core/session/onnxruntime_session_options_config_keys.h" #include "openvino/core/version.hpp" #ifdef USE_OVEP_NPU_MEMORY #include "core/providers/openvino/ov_allocator.h" #endif +#include "ov_interface.h" namespace onnxruntime { namespace openvino_ep { +std::atomic OpenVINOExecutionProvider::global_session_counter_{0}; + // Parking this code here for now before it's moved to the factory #if defined OPENVINO_CONFIG_HETERO || defined OPENVINO_CONFIG_MULTI || defined OPENVINO_CONFIG_AUTO static std::vector parseDevices(const std::string& device_string, @@ -52,12 +56,18 @@ static std::vector parseDevices(const std::string& device_string, } #endif -OpenVINOExecutionProvider::OpenVINOExecutionProvider(const ProviderInfo& info, std::shared_ptr shared_context) +OpenVINOExecutionProvider::OpenVINOExecutionProvider(const ProviderInfo& info) : 
IExecutionProvider{onnxruntime::kOpenVINOExecutionProvider}, session_context_(info), - shared_context_{std::move(shared_context)}, - ep_ctx_handle_{session_context_.openvino_sdk_version, *GetLogger()} { + ov_core_(OVCore::Get()), + shared_context_manager_(SharedContextManager::Get()), + ep_ctx_handle_{session_context_.openvino_sdk_version, *GetLogger(), shared_context_manager_} { InitProviderOrtApi(); +#ifdef _WIN32 + session_id_ = global_session_counter_.fetch_add(1) + 1; + // Trace all runtime options (includes both session and provider options) + OVTracing::Instance().LogAllRuntimeOptions(session_id_, session_context_); +#endif } OpenVINOExecutionProvider::~OpenVINOExecutionProvider() { @@ -94,124 +104,104 @@ common::Status OpenVINOExecutionProvider::Compile( auto& logger = *GetLogger(); Status status = Status::OK(); - bool is_epctx_model = false; - if (!fused_nodes.empty()) { - // Assume these properties are constant for all the model subgraphs, otherwise move to SubGraphContext - const auto& graph_body_viewer_0 = fused_nodes[0].filtered_graph.get(); - session_context_.onnx_model_path_name = graph_body_viewer_0.ModelPath().string(); - session_context_.onnx_opset_version = - graph_body_viewer_0.DomainToVersionMap().at(kOnnxDomain); - - // OVIR wrapped in epctx should be treated as source but this code does not - // This corner case is not in use and will be addressed in a future commit - is_epctx_model = ep_ctx_handle_.CheckForOVEPCtxNodeInGraph(graph_body_viewer_0); - } + try { + if (session_context_.so_context_enable && session_context_.so_context_embed_mode && session_context_.so_share_ep_contexts) { + return Status(common::StatusCategory::ONNXRUNTIME, common::EP_FAIL, + std::string("Invalid EP context configuration: ") + kOrtSessionOptionEpContextEmbedMode + " must be 0 if " + kOrtSessionOptionShareEpContexts + " is 1."); + } - // The block below is executed during EP context model inference - auto& metadata = shared_context_->shared_weights.metadata; // Metadata object in memory - if (session_context_.so_share_ep_contexts && - is_epctx_model && - metadata.empty()) { - fs::path context_model_file_path = session_context_.so_context_file_path; - if (context_model_file_path.empty()) { - // If ep.context_file_path is not set the input model path is used - context_model_file_path = session_context_.onnx_model_path_name; + if (!fused_nodes.empty()) { + // Assume these properties are constant for all the model subgraphs, otherwise move to SubGraphContext + const auto& graph_body_viewer_0 = fused_nodes[0].filtered_graph.get(); + session_context_.onnx_model_path_name = graph_body_viewer_0.ModelPath().string(); + session_context_.onnx_opset_version = + graph_body_viewer_0.DomainToVersionMap().at(kOnnxDomain); } - // Metadata is always read from model location, this could be a source or epctx model - fs::path metadata_filename = context_model_file_path.stem().string() + "_metadata.bin"; - fs::path metadata_file_path = context_model_file_path.parent_path() / metadata_filename; - std::ifstream file(metadata_file_path, std::ios::binary); - ORT_RETURN_IF_NOT(file, "Metadata file was not found: " + metadata_file_path.string()); - shared_context_->shared_weights.metadata_filepath = std::move(metadata_file_path); - file >> metadata; - } + shared_context_ = ep_ctx_handle_.Initialize(fused_nodes, session_context_); + ORT_ENFORCE(shared_context_, + "Failed to create or retrieve SharedContext"); - struct OpenVINOEPFunctionState { - AllocateFunc allocate_func = nullptr; - DestroyFunc destroy_func = 
nullptr; - AllocatorHandle allocator_handle = nullptr; - BackendManager& backend_manager; - }; - - for (const FusedNodeAndGraph& fused_node_graph : fused_nodes) { - const GraphViewer& graph_body_viewer = fused_node_graph.filtered_graph; - const Node& fused_node = fused_node_graph.fused_node; - - NodeComputeInfo compute_info; - - // During backend creation, we check if user wants to use precompiled blob onnx model or the original model - // For precompiled blob, directly load the model instead of compiling the model - // For original model, check if the user wants to export a model with pre-compiled blob - - auto& backend_manager = backend_managers_.emplace_back(session_context_, - *shared_context_, - fused_node, - graph_body_viewer, - logger, - ep_ctx_handle_); - - compute_info.create_state_func = - [&backend_manager](ComputeContext* context, FunctionState* state) { - OpenVINOEPFunctionState* p = new OpenVINOEPFunctionState{ - .allocate_func = context->allocate_func, - .destroy_func = context->release_func, - .allocator_handle = context->allocator_handle, - .backend_manager = backend_manager}; - *state = static_cast(p); - return 0; - }; - - compute_info.compute_func = [](FunctionState state, const OrtApi* /* api */, OrtKernelContext* context) { - auto function_state = static_cast(state); - try { - function_state->backend_manager.Compute(context); - } catch (const std::exception& ex) { - return common::Status(common::ONNXRUNTIME, common::FAIL, ex.what()); - } - return Status::OK(); + struct OpenVINOEPFunctionState { + AllocateFunc allocate_func = nullptr; + DestroyFunc destroy_func = nullptr; + AllocatorHandle allocator_handle = nullptr; + BackendManager& backend_manager; }; - compute_info.release_state_func = - [](FunctionState state) { - if (state) { - OpenVINOEPFunctionState* function_state = static_cast(state); - delete function_state; - } - }; + for (const FusedNodeAndGraph& fused_node_graph : fused_nodes) { + const GraphViewer& graph_body_viewer = fused_node_graph.filtered_graph; + const Node& fused_node = fused_node_graph.fused_node; + + NodeComputeInfo compute_info; + + // During backend creation, we check if user wants to use precompiled blob onnx model or the original model + // For precompiled blob, directly load the model instead of compiling the model + // For original model, check if the user wants to export a model with pre-compiled blob + + auto& backend_manager = backend_managers_.emplace_back(session_context_, + *shared_context_, + fused_node, + graph_body_viewer, + logger, + ep_ctx_handle_); + compute_info.create_state_func = + [&backend_manager](ComputeContext* context, FunctionState* state) { + OpenVINOEPFunctionState* p = new OpenVINOEPFunctionState{ + .allocate_func = context->allocate_func, + .destroy_func = context->release_func, + .allocator_handle = context->allocator_handle, + .backend_manager = backend_manager}; + *state = static_cast(p); + return 0; + }; + + compute_info.compute_func = [](FunctionState state, const OrtApi* /* api */, OrtKernelContext* context) { + auto function_state = static_cast(state); + try { + function_state->backend_manager.Compute(context); + } catch (const std::exception& ex) { + return common::Status(common::ONNXRUNTIME, common::FAIL, ex.what()); + } + return Status::OK(); + }; - node_compute_funcs.push_back(std::move(compute_info)); + compute_info.release_state_func = + [](FunctionState state) { + if (state) { + OpenVINOEPFunctionState* function_state = static_cast(state); + delete function_state; + } + }; - if (!status.IsOK()) { - 
break; + node_compute_funcs.push_back(std::move(compute_info)); } - } - // The block below is executed during EP context model generation - if (session_context_.so_context_enable && - session_context_.so_share_ep_contexts && - !metadata.empty()) { - // For models after the first the metadata name comes from the shared context - fs::path metadata_file_path = shared_context_->shared_weights.metadata_filepath; - if (metadata_file_path.empty()) { - metadata_file_path = session_context_.so_context_file_path; - std::string name_append{"_metadata.bin"}; - if (metadata_file_path.empty()) { - metadata_file_path = session_context_.onnx_model_path_name; - name_append = "_ctx" + name_append; + // Export compiled blobs as EPContext nodes if context enable is set + if (session_context_.so_context_enable) { + auto backend_it = backend_managers_.begin(); + bool is_first = true; + + for (const auto& fused_node_graph : fused_nodes) { + const GraphViewer& graph_body_viewer = fused_node_graph.filtered_graph; + + // Set include_embed_data to true only for the first backend manager + backend_it->TryExportCompiledBlobAsEPCtxNode(graph_body_viewer, is_first); + + is_first = false; + ++backend_it; } - auto metadata_filename = metadata_file_path.stem().string() + name_append; - metadata_file_path.replace_filename(metadata_filename); - shared_context_->shared_weights.metadata_filepath = metadata_file_path; - } - // Metadata is generated only for shared contexts - // If saving metadata then save it to the provided path or use the original model path - // Multiple calls to Compile() will update the metadata and for the last call - // the resulting file will contain the aggregated content - std::ofstream file{metadata_file_path, std::ios::binary}; - ORT_RETURN_IF_NOT(file, "Metadata file could not be written: ", metadata_file_path); - file << metadata; + // bit clunky ideally we should try to fold this into ep context handler + if (!session_context_.so_context_embed_mode) { + shared_context_->Serialize(); + if (session_context_.so_stop_share_ep_contexts) { + shared_context_manager_->ClearActiveSharedContext(); + } + } + } + } catch (const ovep_exception& ex) { + status = ex; } return status; diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.h b/onnxruntime/core/providers/openvino/openvino_execution_provider.h index 020aec16e507c..a343ad34cae50 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.h +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.h @@ -11,9 +11,17 @@ #include #include #include +#include #include "core/providers/openvino/backend_manager.h" #include "core/providers/openvino/contexts.h" +#include "ov_shared_context.h" +#include "ov_bin_manager.h" +#include "ov_interface.h" + +#ifdef _WIN32 +#include "core/providers/openvino/ov_tracing.h" +#endif namespace onnxruntime { namespace openvino_ep { @@ -45,7 +53,7 @@ static std::vector split(const std::string& s, char delim) { // Logical device representation. 
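+// Note on ownership (as wired up in the constructor and Compile()): the provider holds
+// shared_ptr handles to the OVCore and SharedContextManager singletons, presumably so both
+// stay alive for as long as this session's backends do, while shared_context_ itself is
+// resolved per model via EPCtxHandler::Initialize().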
class OpenVINOExecutionProvider : public IExecutionProvider { public: - explicit OpenVINOExecutionProvider(const ProviderInfo& info, std::shared_ptr shared_context); + explicit OpenVINOExecutionProvider(const ProviderInfo& info); ~OpenVINOExecutionProvider(); std::vector> @@ -71,9 +79,16 @@ class OpenVINOExecutionProvider : public IExecutionProvider { #endif private: SessionContext session_context_; + std::shared_ptr ov_core_; + std::shared_ptr shared_context_manager_; std::shared_ptr shared_context_; + std::list backend_managers_; // EP session owns the backend objects EPCtxHandler ep_ctx_handle_; + + // Tracing and session tracking + uint32_t session_id_{0}; + static std::atomic global_session_counter_; }; } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/openvino_provider_dllmain.cc b/onnxruntime/core/providers/openvino/openvino_provider_dllmain.cc new file mode 100644 index 0000000000000..08f9cc065aaae --- /dev/null +++ b/onnxruntime/core/providers/openvino/openvino_provider_dllmain.cc @@ -0,0 +1,51 @@ +// Copyright (c) Intel Corporation. +// Licensed under the MIT License. +#ifdef _WIN32 + +#include +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wignored-qualifiers" +#pragma GCC diagnostic ignored "-Wunused-parameter" +#else +#endif +#include +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif +#include + +// Reuse the global shutdown indicator (do NOT set it here; that is owned by the core DLL). +extern std::atomic g_is_shutting_down; + +// NOTE: +// This DllMain exists because the OpenVINO provider DLL statically links protobuf independently +// of the core onnxruntime DLL. The core DLL's DllMain won't clean up this copy. +// We perform protobuf shutdown on dynamic unload, and (optionally) during process termination +// when memory leak checking is enabled. +BOOL APIENTRY DllMain(HMODULE /*hModule*/, + DWORD ul_reason_for_call, + LPVOID lpvReserved) { + switch (ul_reason_for_call) { + case DLL_PROCESS_ATTACH: + case DLL_THREAD_ATTACH: + case DLL_THREAD_DETACH: + break; + case DLL_PROCESS_DETACH: + // Windows API doc says: "When handling DLL_PROCESS_DETACH, a DLL should free resources such as heap memory only if the DLL is being unloaded dynamically" + if (lpvReserved != nullptr) { + // Process termination. Normally skipped for speed/safety, + // but in leak-check builds we reclaim protobuf heap. +#if defined(ONNXRUNTIME_ENABLE_MEMLEAK_CHECK) + ::google::protobuf::ShutdownProtobufLibrary(); +#endif + } else { + // Dynamic unload: safe to clean up. 
+ ::google::protobuf::ShutdownProtobufLibrary(); + } + break; + } + return TRUE; +} + +#endif // defined(_WIN32) diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index 1a10d9849d5cc..7eb5b062fe7c8 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -16,6 +16,7 @@ #include "core/session/onnxruntime_session_options_config_keys.h" #include "nlohmann/json.hpp" #include "core/providers/openvino/openvino_parser_utils.h" +#include "ov_interface.h" namespace onnxruntime { namespace openvino_ep { @@ -28,6 +29,7 @@ void ParseConfigOptions(ProviderInfo& pi) { pi.so_context_embed_mode = pi.config_options->GetConfigOrDefault(kOrtSessionOptionEpContextEmbedMode, "0") == "1"; pi.so_share_ep_contexts = pi.config_options->GetConfigOrDefault(kOrtSessionOptionShareEpContexts, "0") == "1"; pi.so_context_file_path = pi.config_options->GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, ""); + pi.so_stop_share_ep_contexts = pi.config_options->GetConfigOrDefault(kOrtSessionOptionStopShareEpContexts, "0") == "1"; if (pi.so_share_ep_contexts) { ov::AnyMap map; @@ -187,6 +189,36 @@ std::string ParseDeviceType(std::shared_ptr ov_core, const ProviderOptio void ParseProviderOptions([[maybe_unused]] ProviderInfo& result, [[maybe_unused]] const ProviderOptions& config_options) {} +static void ParseInnerMap(const nlohmann::json& json_map, ov::AnyMap& inner_map, size_t level = 0) { + const size_t max_levels = 8; + if (level >= max_levels) { + ORT_THROW("ParseInnerMap: load_config can have only up to " + std::to_string(max_levels) + + " levels of nested maps. Current level = " + std::to_string(level)); + } + + if (!json_map.is_object()) { + ORT_THROW("ParseInnerMap: Expected an object as input"); + } + + for (auto& [inner_key, inner_value] : json_map.items()) { + if (inner_value.is_string()) { + inner_map[inner_key] = ov::Any(inner_value.get()); + } else if (inner_value.is_number_integer()) { + inner_map[inner_key] = ov::Any(inner_value.get()); + } else if (inner_value.is_number_float()) { + inner_map[inner_key] = ov::Any(inner_value.get()); + } else if (inner_value.is_boolean()) { + inner_map[inner_key] = ov::Any(inner_value.get()); + } else if (inner_value.is_object()) { + auto inner_inner_map = ov::AnyMap(); + ParseInnerMap(inner_value, inner_inner_map, level + 1); + inner_map[inner_key] = std::move(inner_inner_map); + } else { + ORT_THROW("load_config: unsupported JSON value type=" + std::string(inner_value.type_name()) + ", for key=" + inner_key); + } + } +} + // Initializes a ProviderInfo struct from a ProviderOptions map and a ConfigOptions map. static void ParseProviderInfo(const ProviderOptions& provider_options, const ConfigOptions* config_options, @@ -266,19 +298,7 @@ static void ParseProviderInfo(const ProviderOptions& provider_options, ORT_THROW("Invalid JSON structure: Expected an object for device properties."); } - for (auto& [inner_key, inner_value] : value.items()) { - if (inner_value.is_string()) { - inner_map[inner_key] = inner_value.get(); - } else if (inner_value.is_number_integer()) { - inner_map[inner_key] = inner_value.get(); - } else if (inner_value.is_number_float()) { - inner_map[inner_key] = inner_value.get(); - } else if (inner_value.is_boolean()) { - inner_map[inner_key] = inner_value.get(); - } else { - LOGS_DEFAULT(WARNING) << "Unsupported JSON value type for key: " << inner_key << ". 
Skipping key."; - } - } + ParseInnerMap(value, inner_map); target_map[key] = std::move(inner_map); } } catch (const nlohmann::json::parse_error& e) { @@ -362,14 +382,14 @@ static void ParseProviderInfo(const ProviderOptions& provider_options, } struct OpenVINOProviderFactory : IExecutionProviderFactory { - OpenVINOProviderFactory(ProviderInfo provider_info, std::shared_ptr shared_context) - : provider_info_(std::move(provider_info)), shared_context_(std::move(shared_context)) {} + OpenVINOProviderFactory(ProviderInfo provider_info, std::shared_ptr ov_core) + : provider_info_(std::move(provider_info)), ov_core_(ov_core) {} ~OpenVINOProviderFactory() override {} std::unique_ptr CreateProvider() override { ParseConfigOptions(provider_info_); - return std::make_unique(provider_info_, shared_context_); + return std::make_unique(provider_info_); } // Called by InferenceSession when registering EPs. Allows creation of an EP instance that is initialized with @@ -402,7 +422,7 @@ struct OpenVINOProviderFactory : IExecutionProviderFactory { ParseProviderInfo(provider_options, &config_options, provider_info); ParseConfigOptions(provider_info); - auto ov_ep = std::make_unique(provider_info, shared_context_); + auto ov_ep = std::make_unique(provider_info); ov_ep->SetLogger(reinterpret_cast(&session_logger)); return ov_ep; } @@ -413,14 +433,14 @@ struct OpenVINOProviderFactory : IExecutionProviderFactory { std::unique_ptr CreateProvider_V2(const OrtSessionOptions& /*session_options*/, const OrtLogger& session_logger) { ProviderInfo provider_info = provider_info_; - auto ov_ep = std::make_unique(provider_info, shared_context_); + auto ov_ep = std::make_unique(provider_info); ov_ep->SetLogger(reinterpret_cast(&session_logger)); return ov_ep; } private: ProviderInfo provider_info_; - std::shared_ptr shared_context_; + std::shared_ptr ov_core_; }; struct ProviderInfo_OpenVINO_Impl : ProviderInfo_OpenVINO { @@ -445,7 +465,7 @@ struct OpenVINO_Provider : Provider { ProviderInfo pi; ParseProviderInfo(provider_options, config_options, pi); - return std::make_shared(pi, SharedContext::Get()); + return std::make_shared(pi, OVCore::Get()); } Status CreateIExecutionProvider(const OrtHardwareDevice* const* /*devices*/, @@ -552,7 +572,7 @@ struct OpenVINO_Provider : Provider { ParseConfigOptions(pi); // Create and return the execution provider - auto factory = std::make_unique(pi, SharedContext::Get()); + auto factory = std::make_unique(pi, OVCore::Get()); ep = factory->CreateProvider_V2(session_options, logger); return Status::OK(); } diff --git a/onnxruntime/core/providers/openvino/ov_bin_manager.cc b/onnxruntime/core/providers/openvino/ov_bin_manager.cc new file mode 100644 index 0000000000000..88a50377281bc --- /dev/null +++ b/onnxruntime/core/providers/openvino/ov_bin_manager.cc @@ -0,0 +1,428 @@ +// Copyright (C) Intel Corporation +// Licensed under the MIT License + +#include "ov_bin_manager.h" +#include "ov_shared_context.h" +#include +#include "core/providers/shared_library/provider_api.h" // for ORT_VERSION and kOpenVINOExecutionProvider + +namespace onnxruntime { +namespace openvino_ep { + +static inline uint64_t AlignUp(uint64_t value, uint64_t alignment) { + return (value + alignment - 1) / alignment * alignment; +} + +// Custom streambuf that wraps an ov::Tensor's memory +// Provides us a std::istream interface over the tensor data without copying. +// Only supports input operations. 
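+// The TensorStream wrapper below keeps the ov::Tensor alive for the lifetime of the stream,
+// so GetNativeBlobAsStream can expose a std::istream view over a blob that is memory-mapped
+// from the bin file (or held in the embedded data vector) without copying it.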
+class TensorStreamBuf : public std::streambuf { + public: + explicit TensorStreamBuf(ov::Tensor& tensor) { + char* data = const_cast(tensor.data()); + size_t size = tensor.get_byte_size(); + setg(data, data, data + size); + } + + protected: + // Override seekoff for proper seeking support + std::streampos seekoff(std::streamoff off, std::ios_base::seekdir dir, std::ios_base::openmode which) override { + if (which & std::ios_base::in) { + char* new_pos = nullptr; + switch (dir) { + case std::ios_base::beg: + new_pos = eback() + off; + break; + case std::ios_base::cur: + new_pos = gptr() + off; + break; + case std::ios_base::end: + new_pos = egptr() + off; + break; + default: + return std::streampos(std::streamoff(-1)); + } + + if (new_pos >= eback() && new_pos <= egptr()) { + setg(eback(), new_pos, egptr()); + return std::streampos(new_pos - eback()); + } + } + return std::streampos(std::streamoff(-1)); + } + + // Override seekpos for proper seeking support + std::streampos seekpos(std::streampos pos, std::ios_base::openmode which) override { + return seekoff(std::streamoff(pos), std::ios_base::beg, which); + } +}; + +// Custom istream that owns the tensor to ensure proper lifetime management +class TensorStream : public std::istream { + public: + explicit TensorStream(ov::Tensor tensor) + : std::istream(&buf_), + tensor_(std::move(tensor)), + buf_(tensor_) {} + + private: + ov::Tensor tensor_; // Keep tensor alive + TensorStreamBuf buf_; // Buffer wrapping tensor data +}; + +/* + Logical layout of the single binary file: + [Header] + [BSON Metadata] ← Contains blob_metadata_map with data_offset and size for each blob + [Padding to 64K alignment] ← Blob section starts here (64K aligned) + [Blob 1] ← BSON blob_metadata_map["blob_name"].data_offset points here + [Padding to 64K alignment] ← Each blob end is 64K aligned + [Blob 2] ← BSON blob_metadata_map["blob_name2"].data_offset points here + [Padding to 64K alignment] + [Blob 3] ← BSON blob_metadata_map["blob_name3"].data_offset points here + ... + + BSON Schema: + { + "version": , // BSON schema version (semver format) + "producer": , // Producer identifier (e.g., "onnxruntime-openvino-ep-plugin") + "weights_metadata_map": { // Map of ONNX tensor names to external weight file metadata + "": { + "location": , // Relative path to external weights file + "data_offset": , // Offset within external weights file + "size": // Size of weight data in bytes + }, + ... + }, + "blob_metadata_map": { // Map of blob names to compiled model blob metadata + "": { + "data_offset": , // Absolute file offset to blob data (64K aligned) + "size": // Actual blob data size (excluding padding) + }, + ... + } + } + + Note: data_offset values in blob_metadata_map are absolute file positions. + size values exclude alignment padding bytes. 
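+
+  Worked offset example (illustrative numbers, not part of the format itself):
+    header_t is 5 * 8 = 40 bytes; suppose the BSON metadata ends at offset 5,000.
+    The first blob then starts at AlignUp(5000, 65536) = 65,536; if it is 100,000
+    bytes long it ends at 165,536, and the next blob starts at
+    AlignUp(165536, 65536) = 196,608.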
+*/ + +// "OVEP_BIN" in little-endian (memory will read as 'O','V','E','P','_','B','I','N') +constexpr uint64_t kMagicNumber = 0x4E49425F5045564FULL; + +enum class BinVersion : uint64_t { + v1 = 1, + current = v1 +}; + +struct header_t { + uint64_t magic; + uint64_t version; + uint64_t header_size; + uint64_t bson_start_offset; + uint64_t bson_size; +}; + +constexpr uint64_t kBlobAlignment = 64 * 1024; + +// BSON field names +namespace BSONFields { +constexpr const char* kVersion = "version"; +constexpr const char* kProducer = "producer"; +constexpr const char* kWeightsMetadata = "weights_metadata_map"; +constexpr const char* kBlobMetadata = "blob_metadata_map"; +constexpr const char* kLocation = "location"; +constexpr const char* kDataOffset = "data_offset"; +constexpr const char* kSize = "size"; +constexpr const char* kCurrentBsonVersion = "1.0.0"; +constexpr const char* kProducerName = "onnxruntime-openvino-ep-" ORT_VERSION; +} // namespace BSONFields + +template +constexpr std::underlying_type_t to_underlying(E e) noexcept { + static_assert(std::is_enum_v, "to_underlying requires an enum type"); + return static_cast>(e); +} + +void BinManager::AddNativeBlob(const std::string& name, const ov::CompiledModel& compiled_model) { + std::unique_lock lock(mutex_); + native_blobs_[name] = BlobContainer{.compiled_model = compiled_model, .tensor = {}, .data = {}, .serialized_info = {0, 0}}; +} + +ov::Tensor BinManager::GetNativeBlob(const std::string& blob_name) { + std::unique_lock lock(mutex_); + + auto it = native_blobs_.find(blob_name); + ORT_ENFORCE(it != native_blobs_.end(), "Blob not found for ", blob_name); + + auto& blob_container = it->second; + if (blob_container.tensor) { + return blob_container.tensor; + } + + ORT_ENFORCE(blob_container.serialized_info.size > 0 || !blob_container.data.empty(), + "Blob has no serialization info or embedded data for ", blob_name); + + if (!external_bin_path_.value_or("").empty() && !mapped_bin_) { + // Use ov::read_tensor_data to create a memory-mapped tensor from external file + mapped_bin_ = ov::read_tensor_data(external_bin_path_.value()); + } + + if (mapped_bin_) { + // Create a tensor from memory-mapped external file + blob_container.tensor = ov::Tensor( + ov::element::u8, + ov::Shape{blob_container.serialized_info.size}, + mapped_bin_.data() + blob_container.serialized_info.file_offset); + } else { + // Create a tensor from embedded data vector + blob_container.tensor = ov::Tensor( + ov::element::u8, + ov::Shape{blob_container.data.size()}, + blob_container.data.data()); + } + + return blob_container.tensor; +} + +std::unique_ptr BinManager::GetNativeBlobAsStream(const std::string& blob_name) { + return std::make_unique(GetNativeBlob(blob_name)); +} + +std::filesystem::path BinManager::GetBinPathForModel(const std::filesystem::path& model_path) { + ORT_ENFORCE(!model_path.empty()); + return model_path.parent_path() / (model_path.stem().string() + "_" + kOpenVINOExecutionProvider + ".bin"); +} + +void BinManager::Serialize(std::shared_ptr shared_context) { + auto path = GetExternalBinPath(); + std::ofstream stream(path, std::ios::out | std::ios::binary); + ORT_ENFORCE(stream.is_open(), "Failed to open file for serialization: " + path.string()); + Serialize(stream, shared_context); +} + +void BinManager::Deserialize(std::shared_ptr shared_context) { + auto path = GetExternalBinPath(); + std::ifstream stream(path, std::ios::in | std::ios::binary); + ORT_ENFORCE(stream.is_open(), "Failed to open file for deserialization: " + path.string()); + 
Deserialize(stream, shared_context); +} + +void BinManager::Serialize(std::ostream& stream, std::shared_ptr shared_context) { + std::shared_lock ul(mutex_); + + auto metadata = shared_context ? shared_context->GetMetadataCopy() : SharedContext::Metadata::Map{}; + if (metadata.empty() && native_blobs_.empty()) { + return; // Nothing to serialize + } + + const auto stream_start = stream.tellp(); + + auto write_alignment_padding = [&stream](uint64_t current_pos, uint64_t alignment) { + uint64_t aligned_position = AlignUp(current_pos, alignment); + uint64_t padding_size = aligned_position - current_pos; + if (padding_size > 0) { + std::vector padding(padding_size, 0); + stream.write(padding.data(), padding.size()); + ORT_ENFORCE(stream.good(), "Error: Failed to write alignment padding."); + } + }; + + // Reserve space for header (will be updated later) + header_t header{}; + header.magic = kMagicNumber; + header.version = to_underlying(BinVersion::current); + header.header_size = sizeof(header_t); + stream.write(reinterpret_cast(&header), sizeof(header)); + ORT_ENFORCE(stream.good(), "Error: Failed to write header."); + + // Build JSON metadata + nlohmann::json j; + j[BSONFields::kVersion] = BSONFields::kCurrentBsonVersion; + j[BSONFields::kProducer] = BSONFields::kProducerName; + + // Add weights metadata as a map (from SharedContext if available) + if (!metadata.empty()) { + nlohmann::json weights_map = nlohmann::json::object(); + for (const auto& [key, value] : metadata) { + nlohmann::json weight_entry; + weight_entry[BSONFields::kLocation] = value.serialized.location.string(); + weight_entry[BSONFields::kDataOffset] = value.serialized.data_offset; + weight_entry[BSONFields::kSize] = value.serialized.size; + weights_map[key] = weight_entry; + } + j[BSONFields::kWeightsMetadata] = weights_map; + } + + // Add blob metadata with placeholder values as a map (will be updated after writing blobs) + nlohmann::json blob_map = nlohmann::json::object(); + for (const auto& [key, value] : native_blobs_) { + nlohmann::json blob_entry; + auto max_val = std::numeric_limits::max(); + // Placehold max size since we don't know actual offsets/sizes yet, and if they aren't max they might serialize smaller. 
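+    // Reserving the widest integer encoding here means the BSON document rewritten
+    // after the blobs are emitted can only stay the same size or shrink, which the
+    // ORT_ENFORCE on updated_bson_data.size() below depends on.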
+ blob_entry[BSONFields::kDataOffset] = max_val; + blob_entry[BSONFields::kSize] = max_val; + blob_map[key] = blob_entry; + } + j[BSONFields::kBlobMetadata] = blob_map; + + // Write BSON metadata (will be rewritten later with correct blob info) + header.bson_start_offset = stream.tellp(); + + size_t orig_bson_size; + { + std::vector bson_data = nlohmann::json::to_bson(j); + orig_bson_size = bson_data.size(); + stream.write(reinterpret_cast(bson_data.data()), bson_data.size()); + ORT_ENFORCE(stream.good(), "Error: Failed to write BSON data."); + } + uint64_t bson_end = stream.tellp(); + + write_alignment_padding(bson_end, kBlobAlignment); + + // Write blob data and capture actual offsets/sizes + for (auto& [blob_name, value] : native_blobs_) { + uint64_t blob_start = stream.tellp(); + value.compiled_model.export_model(stream); + ORT_ENFORCE(stream.good(), "Error: Failed to write blob data for ", blob_name); + // Seek to end of stream after writing in case export model didn't leave us there + stream.seekp(0, std::ios::end); + uint64_t blob_end = stream.tellp(); + uint64_t blob_size = blob_end - blob_start; + + // Update the BlobContainer + BSON with serialization info + value.serialized_info.file_offset = blob_start; + value.serialized_info.size = blob_size; + j[BSONFields::kBlobMetadata][blob_name][BSONFields::kDataOffset] = blob_start; + j[BSONFields::kBlobMetadata][blob_name][BSONFields::kSize] = blob_size; + + write_alignment_padding(blob_end, kBlobAlignment); + } + + // Rewrite BSON metadata with correct blob info + std::vector updated_bson_data = nlohmann::json::to_bson(j); + ORT_ENFORCE(updated_bson_data.size() <= orig_bson_size, + "Error: BSON size larger after updating blob info. Original: ", orig_bson_size, + " Updated: ", updated_bson_data.size()); + + stream.seekp(header.bson_start_offset); + stream.write(reinterpret_cast(updated_bson_data.data()), updated_bson_data.size()); + ORT_ENFORCE(stream.good(), "Error: Failed to rewrite BSON data."); + bson_end = stream.tellp(); + header.bson_size = bson_end - header.bson_start_offset; + + // Update header with BSON offsets + stream.seekp(stream_start); + stream.write(reinterpret_cast(&header), sizeof(header)); + ORT_ENFORCE(stream.good(), "Error: Failed to update header."); + + stream.seekp(0, std::ios::end); // Move to end after writing. +} + +void BinManager::Deserialize(std::istream& stream, std::shared_ptr shared_context) { + try { + DeserializeImpl(stream, shared_context); + } catch (const std::exception& e) { + ORT_THROW(e.what(), "\nCould not deserialize binary data. This could mean the bin is corrupted or incompatible. Try re-generating ep context cache."); + } +} + +void BinManager::DeserializeImpl(std::istream& stream, const std::shared_ptr& shared_context) { + // Read and validate header + header_t header{}; + + stream.read(reinterpret_cast(&header), sizeof(header)); + ORT_ENFORCE(stream.good(), "Error: Failed to read header."); + ORT_ENFORCE(header.magic == kMagicNumber, "Error: Invalid magic number. 
Expected: 0x", std::hex, kMagicNumber, " Got: 0x", header.magic); + ORT_ENFORCE(header.version == to_underlying(BinVersion::current), "Error: Unsupported file version: ", header.version); + ORT_ENFORCE(header.header_size == sizeof(header_t), "Error: Header size mismatch."); + + // Seek to BSON metadata and read it + stream.seekg(header.bson_start_offset); + ORT_ENFORCE(stream.good(), "Error: Failed to seek to BSON metadata."); + + // Parse BSON + nlohmann::json j; + { + std::vector bson_data(header.bson_size); + stream.read(reinterpret_cast(bson_data.data()), header.bson_size); + j = nlohmann::json::from_bson(bson_data); + } + + // Validate BSON version (check major version compatibility) + ORT_ENFORCE(j.contains(BSONFields::kVersion), "Error: Missing version in BSON metadata."); + auto bson_version = j[BSONFields::kVersion].get(); + + // Extract major version from semver strings (format: "major.minor.patch") + auto get_major_version = [](const std::string& version) -> int { + size_t dot_pos = version.find('.'); + if (dot_pos == std::string::npos) return -1; + try { + return std::stoi(version.substr(0, dot_pos)); + } catch (...) { + return -1; + } + }; + + int file_major = get_major_version(bson_version); + int current_major = get_major_version(BSONFields::kCurrentBsonVersion); + + ORT_ENFORCE(file_major >= 0 && current_major >= 0, + "Error: Invalid BSON version format. Expected: ", BSONFields::kCurrentBsonVersion, + " Got: ", bson_version); + ORT_ENFORCE(file_major == current_major, + "Error: Incompatible BSON schema major version. Expected: ", current_major, + " Got: ", file_major, " (full version: ", bson_version, ")"); + + // Parse weights metadata and populate SharedContext if available + if (j.contains(BSONFields::kWeightsMetadata)) { + ORT_ENFORCE(shared_context, "Error: Bin contains shared weights metadata but no SharedContext was provided during deserialization."); + const auto& weights_map = j[BSONFields::kWeightsMetadata]; + if (weights_map.is_object()) { + for (const auto& [weight_name, weight_entry] : weights_map.items()) { + auto location = weight_entry[BSONFields::kLocation].get(); + auto data_offset = weight_entry[BSONFields::kDataOffset].get(); + auto size = weight_entry[BSONFields::kSize].get(); + shared_context->AddExternalWeight(weight_name, data_offset, size, location); + } + } + } + + // Parse blob metadata + ORT_ENFORCE(j.contains(BSONFields::kBlobMetadata), "Error: Missing blob metadata in BSON."); + const auto& blob_map = j[BSONFields::kBlobMetadata]; + ORT_ENFORCE(blob_map.is_object(), "Error: Blob metadata must be an object."); + + // Determine if we're deserializing from an external file or embedded stream + const bool has_external_file = !external_bin_path_.value_or("").empty(); + + std::unique_lock lock(mutex_); + for (const auto& [blob_name, blob_entry] : blob_map.items()) { + uint64_t blob_offset = blob_entry[BSONFields::kDataOffset].get(); + uint64_t blob_size = blob_entry[BSONFields::kSize].get(); + + BlobContainer container; + container.serialized_info.file_offset = blob_offset; + container.serialized_info.size = blob_size; + + // If no external file, extract blob data into vector + if (!has_external_file) { + // Seek to blob offset and read data into vector + auto current_pos = stream.tellg(); + stream.seekg(blob_offset); + ORT_ENFORCE(stream.good(), "Error: Failed to seek to blob data for ", blob_name); + + container.data.resize(blob_size); + stream.read(reinterpret_cast(container.data.data()), blob_size); + ORT_ENFORCE(stream.good(), "Error: Failed to 
read blob data for ", blob_name); + + // Restore stream position + stream.seekg(current_pos); + } + + native_blobs_[blob_name] = std::move(container); + } +} + +} // namespace openvino_ep +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/ov_bin_manager.h b/onnxruntime/core/providers/openvino/ov_bin_manager.h new file mode 100644 index 0000000000000..b50cfc460ec96 --- /dev/null +++ b/onnxruntime/core/providers/openvino/ov_bin_manager.h @@ -0,0 +1,76 @@ +// Copyright (C) Intel Corporation +// Licensed under the MIT License + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "openvino/runtime/core.hpp" +#include "weak_singleton.h" + +namespace onnxruntime { +namespace openvino_ep { + +// Forward declaration +class SharedContext; + +// Manages native compiled model blobs and binary file serialization/deserialization +class BinManager { + public: + BinManager() = default; + BinManager(const std::filesystem::path& external_bin_path) : external_bin_path_(external_bin_path) {} + ~BinManager() = default; + + // Blob management + void AddNativeBlob(const std::string& name, const ov::CompiledModel& compiled_model); + ov::Tensor GetNativeBlob(const std::string& blob_name); + std::unique_ptr GetNativeBlobAsStream(const std::string& blob_name); + + // Serialization/Deserialization + void Serialize(std::ostream& stream, std::shared_ptr shared_context = nullptr); + void Deserialize(std::istream& stream, std::shared_ptr shared_context = nullptr); + + void Serialize(std::shared_ptr shared_context = nullptr); + void Deserialize(std::shared_ptr shared_context = nullptr); + + // Path management + void TrySetExternalBinPath(const std::filesystem::path& bin_path) { + std::unique_lock lock(mutex_); + if (!external_bin_path_) { + external_bin_path_ = bin_path; + } + } + std::filesystem::path GetExternalBinPath() const { + std::shared_lock lock(mutex_); + return external_bin_path_.value_or(""); + } + + static std::filesystem::path GetBinPathForModel(const std::filesystem::path& model_path); + + private: + struct BlobContainer { + ov::CompiledModel compiled_model; + ov::Tensor tensor; + std::vector data; // For embedded blobs when no external file exists + struct { + uint64_t file_offset{0}; + uint64_t size{0}; + } serialized_info; + }; + + void DeserializeImpl(std::istream& stream, const std::shared_ptr& shared_context); + + mutable std::shared_mutex mutex_; + std::optional external_bin_path_; + ov::Tensor mapped_bin_; + std::unordered_map native_blobs_; +}; + +} // namespace openvino_ep +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/ov_factory.cc b/onnxruntime/core/providers/openvino/ov_factory.cc index 2853cc17726ab..5119c611d3f3d 100644 --- a/onnxruntime/core/providers/openvino/ov_factory.cc +++ b/onnxruntime/core/providers/openvino/ov_factory.cc @@ -16,7 +16,7 @@ #include "onnxruntime_c_api.h" #include "ov_factory.h" #include "openvino/openvino.hpp" -#include "ov_interface.h" +#include "weak_singleton.h" using namespace onnxruntime::openvino_ep; using ov_core_singleton = onnxruntime::openvino_ep::WeakSingleton; diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index 899845d4890cf..23be3447b8799 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -12,16 +12,21 @@ #include "core/providers/openvino/backends/basic_backend.h" #include 
"core/providers/openvino/ov_stateful_patch_utils.h" #include "core/providers/openvino/onnx_ctx_model_helper.h" +#include "core/providers/openvino/exceptions.h" namespace onnxruntime { namespace openvino_ep { -template +template inline auto OvExceptionBoundary(Func&& func, std::format_string&& fmt, Args&&... args) { try { return func(); } catch (const ov::Exception& e) { - ORT_THROW(log_tag + std::vformat(fmt.get(), std::make_format_args(args...)) + ": " + std::string(e.what())); + if constexpr (typed) { + throw ovep_exception(e, ovep_exception::type::import_model); + } else { + ORT_THROW(log_tag + std::vformat(fmt.get(), std::make_format_args(args...)) + ": " + std::string(e.what())); + } } catch (...) { ORT_THROW(log_tag + std::vformat(fmt.get(), std::make_format_args(args...))); } @@ -70,7 +75,7 @@ std::optional queryOVProperty(const std::string& property, const std::stri } std::shared_ptr OVCore::ReadModel(std::string&& model, const std::string& model_path) { - return OvExceptionBoundary([&]() { + return OvExceptionBoundary([&]() { std::istringstream modelStringStream(std::move(model)); std::istream& modelStream = modelStringStream; // Try to load with FrontEndManager @@ -88,7 +93,7 @@ std::shared_ptr OVCore::ReadModel(std::string&& model, const std::str ORT_THROW(log_tag + "Unknown exception while Reading network"); } }, - "Exception while Reading network"); + "Exception while Reading network"); } OVExeNetwork OVCore::StatefulCompileModel(std::shared_ptr& model, @@ -156,7 +161,7 @@ OVExeNetwork OVCore::CompileModel(std::shared_ptr& ie_cnn_netwo ov::AnyMap& device_config, bool enable_causallm, const std::string& name) { - return OvExceptionBoundary([&]() { + return OvExceptionBoundary([&]() { OVExeNetwork exe; if (enable_causallm) { auto mutable_model = ie_cnn_network->clone(); @@ -172,14 +177,14 @@ OVExeNetwork OVCore::CompileModel(std::shared_ptr& ie_cnn_netwo return exe; }, - "Exception while Loading Network for graph {}", name); + "Exception while Loading Network for graph {}", name); } OVExeNetwork OVCore::CompileModel(const std::string& onnx_model, std::string& hw_target, ov::AnyMap& device_config, const std::string& name) { - return OvExceptionBoundary([&]() { + return OvExceptionBoundary([&]() { ov::CompiledModel obj; obj = core.compile_model(onnx_model, ov::Tensor(), hw_target, device_config); @@ -189,23 +194,23 @@ OVExeNetwork OVCore::CompileModel(const std::string& onnx_model, OVExeNetwork exe(obj, hw_target); return exe; }, - "Exception while Loading Network for graph {}", name); + "Exception while Loading Network for graph {}", name); } OVExeNetwork OVCore::ImportModel(ModelBlobWrapper& model_blob, std::string hw_target, const ov::AnyMap& device_config, std::string name) { - return OvExceptionBoundary([&]() { + return OvExceptionBoundary([&]() { ov::CompiledModel obj; #if (OPENVINO_VERSION_MAJOR > 2025 || (OPENVINO_VERSION_MAJOR == 2025 && OPENVINO_VERSION_MINOR >= 3)) - if (!model_blob.maybe_native_blob_path_.empty()) { - obj = core.import_model(ov::read_tensor_data(model_blob.maybe_native_blob_path_), hw_target, device_config); + if (model_blob.tensor_) { + obj = core.import_model(model_blob.tensor_, hw_target, device_config); } else { obj = core.import_model(*model_blob.stream_, hw_target, device_config); } #else - obj = core.import_model(*model_blob.stream_, hw_target, device_config); + obj = core.import_model(*model_blob.stream_, hw_target, device_config); #endif OVExeNetwork exe(obj, hw_target); @@ -214,7 +219,7 @@ OVExeNetwork OVCore::ImportModel(ModelBlobWrapper& 
model_blob, #endif return exe; }, - "Exception while Loading Network for graph {}", name); + "Exception while Loading Network for graph {}", name); } OVExeNetwork OVCore::ImportEPCtxOVIREncapsulation(std::istream& model_stream, @@ -222,7 +227,7 @@ OVExeNetwork OVCore::ImportEPCtxOVIREncapsulation(std::istream& model_stream, const ov::AnyMap& device_config, bool enable_causallm, std::filesystem::path model_file_path) { - return OvExceptionBoundary([&]() { + return OvExceptionBoundary([&]() { OVExeNetwork exe; bool isXML = backend_utils::IsModelStreamXML(model_stream); @@ -267,7 +272,7 @@ OVExeNetwork OVCore::ImportEPCtxOVIREncapsulation(std::istream& model_stream, #endif return exe; }, - "Exception while Loading Network from OVIR model file: {}", model_file_path.string()); + "Exception while Loading Network from OVIR model file: {}", model_file_path.string()); } void OVCore::SetCache(const std::string& cache_dir_path) { @@ -317,7 +322,7 @@ void OVCore::SetStreams(const std::string& device_type, int num_streams) { } std::shared_ptr OVExeNetwork::CreateInferRequest() { - return OvExceptionBoundary([&]() { + return OvExceptionBoundary([&]() { auto infReq = compiled_model_obj.create_infer_request(); std::shared_ptr ovInfReq; if (is_stateful_causallm) { @@ -328,31 +333,31 @@ std::shared_ptr OVExeNetwork::CreateInferRequest() { return ovInfReq; }, - "Exception while creating InferRequest object"); + "Exception while creating InferRequest object"); } OVTensorPtr OVInferRequest::GetTensor(const std::string& input_name) { - return OvExceptionBoundary([&]() { + return OvExceptionBoundary([&]() { auto tobj = ovInfReq.get_tensor(input_name); OVTensorPtr blob = std::make_shared(tobj); return blob; }, - " Cannot access IE Blob for input: {}", input_name); + " Cannot access IE Blob for input: {}", input_name); } std::string OVInferRequest::GetInputTensorName(uint32_t index) { - return OvExceptionBoundary([&]() -> const std::string& { + return OvExceptionBoundary([&]() { const auto& model = ovInfReq.get_compiled_model(); return *model.input(index).get_names().begin(); }, - " Cannot access IE Blob for input number: {}", index); + " Cannot access IE Blob for input number: {}", index); } void OVInferRequest::SetTensor(const std::string& name, OVTensorPtr& blob) { - OvExceptionBoundary([&]() { + OvExceptionBoundary([&]() { ovInfReq.set_tensor(name, *(blob.get())); }, - " Cannot set Remote Blob for output: {}", name); + " Cannot set Remote Blob for output: {}", name); } uint32_t OVInferRequest::GetNumInputs() { @@ -360,20 +365,51 @@ uint32_t OVInferRequest::GetNumInputs() { } void OVInferRequest::Infer() { - OvExceptionBoundary([&]() { + OvExceptionBoundary([&]() { ovInfReq.infer(); }, - "In Error Couldn't start Inference"); + "In Error Couldn't start Inference"); } StatefulOVInferRequest::StatefulOVInferRequest(ov::InferRequest infer_request, std::string device) : OVInferRequest(std::move(infer_request)), target_device(device) { bool gpu_or_npu = ((device.find("NPU") != std::string::npos) || (device.find("GPU") != std::string::npos)); - if (gpu_or_npu) { + + _npu_logits_slice_required = IsNPULogitsSliceRequired(); + + // check if there is input_ids tensors and if the tensor type is int64, + // because logic prefill_use_full_chat_history is only for specific inputs and data type + auto input_ids_opt = FindTensor("input_ids"); + if (gpu_or_npu && input_ids_opt.has_value() && input_ids_opt->get_element_type() == ov::element::i64) { prefill_use_full_chat_history = true; } } +static inline bool 
IsNPUWSliceOutEnabled(const ov::CompiledModel& compiled_model) { + auto slice_out_val = compiled_model.get_property("NPUW_SLICE_OUT"); + if (!slice_out_val.empty()) { + if (slice_out_val.is()) { + return (slice_out_val.as() == "YES"); + } else if (slice_out_val.is()) { + return slice_out_val.as(); + } + } + + return false; +} + +bool StatefulOVInferRequest::IsNPULogitsSliceRequired() { + if (target_device.find("NPU") != std::string::npos) { + const auto& model = ovInfReq.get_compiled_model(); + // If NPUW_SLICE_OUT is enabled, it means that it's not required to slice within OVEP. + // Otherwise, if NPUW_SLICE_OUT is NOT enabled, then we need to perform some explicit logit + // slicing in OVEP. + return !IsNPUWSliceOutEnabled(model); + } + + return false; +} + void StatefulOVInferRequest::FillTensor(const std::string& tensor_name, const ov::element::Type& type, const std::vector& shape, int32_t fill_value) { ov::Tensor tensor = ov::Tensor(type, shape); @@ -514,5 +550,46 @@ void StatefulOVInferRequest::RewindKVCache(size_t index) { } } } + +OVTensorPtr StatefulOVInferRequest::GetTensor(const std::string& input_name) { + + auto tobj = OVInferRequest::GetTensor(input_name); + + if (_npu_logits_slice_required) { + if (input_name == "logits") { + if (tobj->get_shape().size() != 3) { + ORT_THROW(log_tag + std::format("Expected logits to have shape of rank 3, but it has shape of rank {}", + tobj->get_shape().size())); + } + + // When _npu_logits_slice_required is true, it means that prefill may produce logits of shape: + // [, sequence_length, ] + // (Where 'sequence_length' is number of input tokens to prefill) + // But, ORT GenAI is expecting to receive logits of shape: + // [, 1, ] + // In this case, detect when shape[1] is not 1. When it is, create a slice of shape [, 1, ] + if (tobj->get_shape()[1] > 1) { + return OvExceptionBoundary([&]() { + const ov::Coordinate begin = {0, tobj->get_shape()[1] - 1, 0}; + const ov::Coordinate end = {tobj->get_shape()[0], tobj->get_shape()[1], tobj->get_shape()[2]}; + auto sliced_tensor = ov::Tensor(*tobj, begin, end); + if (sliced_tensor.is_continuous()) { + OVTensorPtr blob = std::make_shared(sliced_tensor); + return blob; + } else { + auto continuous_sliced_tensor = ov::Tensor(sliced_tensor.get_element_type(), sliced_tensor.get_shape()); + sliced_tensor.copy_to(continuous_sliced_tensor); + OVTensorPtr blob = std::make_shared(continuous_sliced_tensor); + return blob; + } + }, + "Could not create sliced logits tensor"); + } + } + } + + return tobj; +} + } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h index 38ea883078e85..8fc28b8885e5d 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.h +++ b/onnxruntime/core/providers/openvino/ov_interface.h @@ -18,9 +18,21 @@ #include "openvino/frontend/manager.hpp" #include "openvino/core/dimension.hpp" #include "openvino/core/partial_shape.hpp" +#include "weak_singleton.h" #include +// Helper macro to test OpenVINO version at compile time. +// Usage: #if OPENVINO_VERSION_AT_LEAST(2025, 3) +// Falls back to 0 if OPENVINO_VERSION_MAJOR/MINOR are not defined. 
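+// For example, the long-form guard used elsewhere in the EP,
+//   #if (OPENVINO_VERSION_MAJOR > 2025 || (OPENVINO_VERSION_MAJOR == 2025 && OPENVINO_VERSION_MINOR >= 3))
+// could be written as
+//   #if OPENVINO_VERSION_AT_LEAST(2025, 3)
+// (identical behaviour when the version macros are defined; the helper evaluates to 0 otherwise).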
+#if defined(OPENVINO_VERSION_MAJOR) && defined(OPENVINO_VERSION_MINOR) +#define OPENVINO_VERSION_AT_LEAST(major, minor) \ + ((OPENVINO_VERSION_MAJOR > (major)) || \ + (OPENVINO_VERSION_MAJOR == (major) && OPENVINO_VERSION_MINOR >= (minor))) +#else +#define OPENVINO_VERSION_AT_LEAST(major, minor) 0 +#endif + namespace onnxruntime { namespace openvino_ep { class OVCore; @@ -36,32 +48,6 @@ typedef std::shared_ptr OVTensorPtr; std::optional queryOVProperty(const std::string& property, const std::string& device_type); -template -class WeakSingleton { - public: - static std::shared_ptr Get() { - static std::weak_ptr instance; - static std::mutex mutex; - - auto ptr = instance.lock(); - if (!ptr) { - std::lock_guard lock(mutex); - // ensure another thread didn't create an instance while this thread was waiting - ptr = instance.lock(); - if (!ptr) { - ptr = std::make_shared(); - instance = ptr; - } - } - return ptr; - } - - protected: - WeakSingleton() = default; - virtual ~WeakSingleton() = default; - ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(WeakSingleton); -}; - struct OVCore : WeakSingleton { ov::Core core; @@ -124,7 +110,7 @@ class OVInferRequest { public: uint32_t GetNumInputs(); - OVTensorPtr GetTensor(const std::string& name); + virtual OVTensorPtr GetTensor(const std::string& name); std::string GetInputTensorName(uint32_t index); // Set tensor call infer req tensor if ort_ptr differs from last set ptr. @@ -144,7 +130,7 @@ class OVInferRequest { virtual void Infer(); explicit OVInferRequest(ov::InferRequest obj) : ovInfReq(std::move(obj)) {} OVInferRequest() : ovInfReq(ov::InferRequest()) {} - ov::InferRequest& GetNewObj() { + ov::InferRequest& GetInfReq() { return ovInfReq; } virtual void RewindKVCache([[maybe_unused]] size_t index) {} @@ -161,6 +147,7 @@ class StatefulOVInferRequest : public OVInferRequest { void CacheTensor(const std::string& tensor_name, std::vector& cache); void SetTensorFromCache(const std::string& tensor_name, const std::vector& cache_data); std::optional FindTensor(const std::string& tensor_name); + OVTensorPtr GetTensor(const std::string& name) override; private: void PreProcessInferRequest(); @@ -171,6 +158,9 @@ class StatefulOVInferRequest : public OVInferRequest { bool prefill_use_full_chat_history = false; std::vector cached_input_ids; std::vector cached_position_ids; + + bool IsNPULogitsSliceRequired(); + bool _npu_logits_slice_required = false; }; } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/ov_shared_context.cc b/onnxruntime/core/providers/openvino/ov_shared_context.cc new file mode 100644 index 0000000000000..f48284d0cc974 --- /dev/null +++ b/onnxruntime/core/providers/openvino/ov_shared_context.cc @@ -0,0 +1,136 @@ +// Copyright (C) Intel Corporation +// Licensed under the MIT License + +#include "ov_shared_context.h" +#include "ov_interface.h" + +#include "openvino/runtime/intel_npu/level_zero/level_zero.hpp" +#include "openvino/core/type/element_type.hpp" + +namespace onnxruntime { +namespace openvino_ep { + +SharedContext::SharedContext(std::filesystem::path bin_path) + : bin_path_(std::move(bin_path)), + bin_manager_(bin_path_) { +} + +static bool InRange(size_t offset, size_t size, size_t total_size) { + return (offset < total_size) && (size <= total_size) && (offset <= total_size - size); +} + +// Weights file handling +SharedContext::WeightsFile::WeightsFile(const std::filesystem::path& filename) : file_(filename, std::ios::in | std::ios::binary), file_path_(filename) { + try { + file_.exceptions(std::ifstream::failbit | 
std::ifstream::badbit); + weights_size_ = std::filesystem::file_size(filename); + } catch (std::exception& e) { + ORT_THROW("Error: Failed to open weight file at ", filename.string(), " ", e.what()); + } +} + +void SharedContext::WeightsFile::LoadWeights(size_t file_offset, void* data, size_t size) { + ORT_ENFORCE(InRange(file_offset, size, weights_size_), "Error: File offset is out of bounds."); + file_.seekg(file_offset); + file_.read(static_cast(data), size); +} + +void* SharedContext::WeightsFile::TryGetOrCreateDeviceMapping(std::optional& remote_context) { + std::string dev_name{}; + if (remote_context) { + dev_name = remote_context->get_device_name(); + } + + auto [it, inserted] = imported_device_tensors_.emplace(dev_name, MappingContainer{}); + if (inserted) { + if (dev_name == "NPU") { + // try to import the memory mapped file to remote tensor +#if (OPENVINO_VERSION_MAJOR > 2025 || (OPENVINO_VERSION_MAJOR == 2025 && OPENVINO_VERSION_MINOR >= 3)) + ORT_ENFORCE(remote_context, "Error: Remote context is required for NPU device."); + auto npu_context = remote_context->as(); + auto&& l0_tensor = npu_context.create_tensor(ov::element::Type_t::u8, {weights_size_}, ov::intel_npu::FileDescriptor(file_path_)); + it->second = MappingContainer{.ptr_ = l0_tensor.get(), .tensor_ = l0_tensor}; +#endif + } else if (dev_name.empty()) { + // CPU/virtual device case, create a CPU tensor memory mapped from file + auto&& mmaped_tensor = ov::read_tensor_data(file_path_); + it->second = MappingContainer{.ptr_ = mmaped_tensor.data(), .tensor_ = mmaped_tensor}; + } + } + + return it->second.ptr_; +} + +void SharedContext::LoadTensorFromFile( + Metadata::Value& value, + const std::filesystem::path& model_dir, + std::optional& remote_context, + const ov::element::Type& element_type, + const ov::Shape& dimensions) { + const auto weights_location = model_dir / value.serialized.location; + auto& weights_file = weight_files_[weights_location]; + if (!weights_file) { + weights_file = std::make_unique(weights_location); + } + + ov::Tensor tensor; + uint8_t* mmaped_weights = static_cast(weights_file->TryGetOrCreateDeviceMapping(remote_context)); + if (mmaped_weights) { + // We have memory mapped weights. Create a Tensor view into it for this value. + ORT_ENFORCE(InRange(value.serialized.data_offset, value.serialized.size, weights_file->Size()), "File offset + size outside of external initializer file"); + void* mmapped_offset = static_cast(mmaped_weights + value.serialized.data_offset); + tensor = ov::Tensor(element_type, dimensions, mmapped_offset); + } else { + ORT_ENFORCE(remote_context, "Unexpected: Don't have remote context and memory mapped weights is null!"); + // Can't mmap the file to device tensor, create a host tensor and copy the data + tensor = remote_context->create_host_tensor(element_type, dimensions); + ORT_ENFORCE(tensor.get_byte_size() == value.serialized.size, "Remote tensor size mismatch"); + weights_file->LoadWeights(value.serialized.data_offset, tensor.data(), value.serialized.size); + } + + ORT_ENFORCE(tensor.get_byte_size() == value.serialized.size, "Tensor size mismatch"); + value.tensor = std::make_shared(std::move(tensor)); +} + +void SharedContext::SetSharedWeightsOnInferRequest(ov::InferRequest& ir, const std::filesystem::path& model_dir) { + auto&& compiled_model = ir.get_compiled_model(); + std::optional opt_remote_ctx; + try { + opt_remote_ctx = compiled_model.get_context(); + } catch (ov::Exception&) { + // CPU may not have a remote context. 
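+    // Note: get_context() throws for plugins that expose no remote context (e.g. the
+    // CPU plugin). In that case opt_remote_ctx stays empty and LoadTensorFromFile
+    // falls back to the memory-mapped host-tensor path instead of a device tensor.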
+ } + + std::unique_lock ul(mutex_); + for (const auto& input : compiled_model.inputs()) { + const std::string tensor_name = *input.get_names().begin(); + + auto it = metadata_.find(tensor_name); + if (it == metadata_.end()) continue; // No shared weight for this tensor + auto& value = it->second; + + if (!value.tensor) { + LoadTensorFromFile(value, model_dir, opt_remote_ctx, input.get_element_type(), input.get_shape()); + } + ir.set_tensor(tensor_name, *value.tensor); + } +} + +void SharedContext::Serialize(std::ostream& stream) { + bin_manager_.Serialize(stream, shared_from_this()); +} + +void SharedContext::Deserialize(std::istream& stream) { + bin_manager_.Deserialize(stream, shared_from_this()); +} + +void SharedContext::Serialize() { + bin_manager_.Serialize(shared_from_this()); +} + +void SharedContext::Deserialize() { + bin_manager_.Deserialize(shared_from_this()); +} + +} // namespace openvino_ep +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/ov_shared_context.h b/onnxruntime/core/providers/openvino/ov_shared_context.h new file mode 100644 index 0000000000000..aee6d5570d8fa --- /dev/null +++ b/onnxruntime/core/providers/openvino/ov_shared_context.h @@ -0,0 +1,163 @@ +// Copyright (C) Intel Corporation +// Licensed under the MIT License + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "openvino/runtime/core.hpp" +#include "ov_bin_manager.h" +#include "weak_singleton.h" + +namespace onnxruntime { +namespace openvino_ep { + +class SharedContext : public std::enable_shared_from_this { + public: + explicit SharedContext(std::filesystem::path bin_path); + SharedContext() : SharedContext("") {} + + struct Metadata { + struct Value { + struct { + std::filesystem::path location{}; + size_t data_offset{0}; + size_t size{0}; + } serialized; + + std::shared_ptr tensor; + }; + using Map = std::unordered_map; + }; + + bool IsSharedWeight(const std::string& name) const { + std::shared_lock lock(mutex_); + return metadata_.contains(name); + } + + void AddExternalWeight(const std::string& name, size_t offset, size_t size, const std::filesystem::path& location) { + Metadata::Value value; + value.serialized.data_offset = offset; + value.serialized.size = size; + value.serialized.location = location; + std::unique_lock lock(mutex_); + metadata_[name] = std::move(value); + } + + Metadata::Map GetMetadataCopy() const { + std::shared_lock lock(mutex_); + return metadata_; + } + + void SetSharedWeightsOnInferRequest(ov::InferRequest& ir, const std::filesystem::path& model_dir); + + void AddNativeBlob(const std::string& name, const ov::CompiledModel& compiled_model) { + bin_manager_.AddNativeBlob(name, compiled_model); + } + + ov::Tensor GetNativeBlob(const std::string& blob_name) { + return bin_manager_.GetNativeBlob(blob_name); + } + + std::unique_ptr GetNativeBlobAsStream(const std::string& blob_name) { + return bin_manager_.GetNativeBlobAsStream(blob_name); + } + + void Serialize(std::ostream& stream); + void Deserialize(std::istream& stream); + void Serialize(); + void Deserialize(); + + std::filesystem::path GetBinPath() const { + return bin_manager_.GetExternalBinPath(); + } + + static std::filesystem::path GetBinPathForModel(const std::filesystem::path& model_path) { + return BinManager::GetBinPathForModel(model_path); + } + + private: + struct WeightsFile { + ORT_DISALLOW_COPY_AND_ASSIGNMENT(WeightsFile); + WeightsFile() = delete; + virtual ~WeightsFile() = default; + explicit WeightsFile(const 
std::filesystem::path& filename); + void LoadWeights(size_t file_offset, void* data, size_t size); + void* TryGetOrCreateDeviceMapping(std::optional& remote_context); + size_t Size() const { return weights_size_; } + + private: + std::ifstream file_; + std::filesystem::path file_path_; + size_t weights_size_; + struct MappingContainer { + void* ptr_{nullptr}; + ov::Tensor tensor_; + }; + std::map imported_device_tensors_; + }; + + void LoadTensorFromFile( + Metadata::Value& value, + const std::filesystem::path& model_dir, + std::optional& remote_context, + const ov::element::Type& element_type, + const ov::Shape& dimensions); + + mutable std::shared_mutex mutex_; + std::filesystem::path bin_path_; + BinManager bin_manager_; + std::unordered_map> weight_files_; + Metadata::Map metadata_; +}; + +class SharedContextManager : public WeakSingleton { + public: + std::shared_ptr GetOrCreateActiveSharedContext(const std::filesystem::path& model_path) { + std::lock_guard lock(mutex_); + if (active_context_) { + return active_context_; + } + auto [it, inserted] = contexts_.try_emplace(model_path, nullptr); + if (inserted) { + it->second = std::make_shared(model_path); + } + active_context_ = it->second; + active_context_path_ = model_path; + return it->second; + } + + std::shared_ptr GetOrCreateSharedContext(const std::filesystem::path& model_path) { + std::lock_guard lock(mutex_); + auto [it, inserted] = contexts_.try_emplace(model_path, nullptr); + if (inserted) { + it->second = std::make_shared(model_path); + } + return it->second; + } + + void ClearActiveSharedContext() { + std::lock_guard lock(mutex_); + if (active_context_) { + contexts_.erase(active_context_path_); + active_context_path_.clear(); + } + active_context_ = nullptr; + } + + private: + mutable std::mutex mutex_; + std::unordered_map> contexts_; + std::shared_ptr active_context_; + std::filesystem::path active_context_path_; +}; + +} // namespace openvino_ep +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc index b48b0efde7ab6..c4ec47534d009 100644 --- a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc +++ b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc @@ -2,6 +2,8 @@ // Licensed under the MIT License #include "core/providers/openvino/ov_stateful_patch_utils.h" +#include "core/providers/shared_library/provider_api.h" +#include "core/common/common.h" namespace onnxruntime { namespace openvino_ep { @@ -59,6 +61,17 @@ bool ModelHasInputOutputNames(std::shared_ptr model, const std::strin return false; } +std::string GetInputOutputName(std::shared_ptr ov_model, + const std::vector& candidate_names) { + for (const auto& name : candidate_names) { + if (ModelHasInputOutputNames(ov_model, name)) { + return name; + } + } + // Return the first candidate as default if none are found + return candidate_names.empty() ? 
"" : candidate_names[0]; +} + void FuseCacheReorder(std::shared_ptr ov_model, std::vector& not_kv_inputs, const std::vector& key_value_input_names, @@ -67,10 +80,15 @@ void FuseCacheReorder(std::shared_ptr ov_model, throw std::runtime_error("Model already has fused cache"); } - std::string main_input_name = "inputs_embeds"; - if (ModelHasInputOutputNames(ov_model, "input_ids")) { - main_input_name = "input_ids"; - } + // Define input name candidates in priority order + const std::vector input_name_candidates = { + "inputs_embeds", // Default fallback + "input_ids", // Most common + "input_hidden_states", // Alternative + "/model/embed_tokens/Gather_output_0" // Specific model type + }; + + std::string main_input_name = GetInputOutputName(ov_model, input_name_candidates); auto input_batch = ov_model->input(main_input_name).get_partial_shape()[0]; @@ -116,21 +134,109 @@ void MakeStateful(std::shared_ptr& ov_model, manager.run_passes(ov_model); } -// Converted to C++ from below reference URL: -// https://github.com/huggingface/optimum-intel/blob/main/optimum/exporters/openvino/stateful.py#L281 -void PatchStatefulDecoder(std::shared_ptr model) { +// Helper function to extract KV patterns from output names dynamically +// +// Example: Given output names ["present_key_cross_0", "present_key_cross_1", "present_value_cross_0", "present_value_cross_1", "logits"] +// key_value_output_names = ["present_key_cross_0", "present_key_cross_1", "present_value_cross_0", "present_value_cross_1"] +// unique_patterns = {"key_cross", "value_cross"} +std::pair, std::unordered_set> ExtractKVPatternsFromOutputs(const std::shared_ptr& model) { + std::vector key_value_output_names; + std::unordered_set unique_patterns; + + const std::string prefix = "present_"; + const size_t prefix_len = prefix.length(); + for (const ov::Output& output : model->outputs()) { + const auto& names = output.get_names(); + for (const auto& name : names) { + if (name.find(prefix) == 0 && name.length() > prefix_len) { + size_t last_underscore_pos = name.rfind('_'); + // Extract pattern between "present_" and the last underscore + if (last_underscore_pos != std::string::npos && last_underscore_pos > prefix_len) { + std::string pattern = name.substr(prefix_len, last_underscore_pos - prefix_len); + if (!pattern.empty()) { + unique_patterns.insert(pattern); + key_value_output_names.push_back(name); + } + } + break; + } + } + } + + if (unique_patterns.size() > 2) { + ORT_THROW("More than two unique KV patterns found in output names."); + } + return std::make_pair(key_value_output_names, unique_patterns); +} + +// Main function to extract KV tensors using dynamic pattern matching +// +// Example: Given input names ["input_ids", "attention_mask", "past_key_cross_0", "past_key_cross_1", "past_value_cross_0", "past_value_cross_1"] +// kv_patterns = {"key_cross", "value_cross"} +// +// key_value_input_names = ["past_key_cross_0", "past_key_cross_1", "past_value_cross_0", "past_value_cross_1"] +// not_kv_inputs = ["input_ids", "attention_mask"] +std::pair, std::vector> ExtractInputKVTensors( + const std::shared_ptr& model, const std::unordered_set& kv_patterns) { + std::vector key_value_input_names; std::vector not_kv_inputs; + + if (kv_patterns.empty()) { + // Fallback: use original substring matching + for (const ov::Output& input : model->inputs()) { + const auto& names = input.get_names(); + const std::string input_name = input.get_any_name(); + + bool is_kv_input = false; + for (const auto& name : names) { + if (name.find("key_values") != 
std::string::npos || + name.find("keys") != std::string::npos || + name.find("values") != std::string::npos) { + key_value_input_names.push_back(name); + is_kv_input = true; + break; + } + } + + if (!is_kv_input) { + not_kv_inputs.push_back(input_name); + } + } + + return std::make_pair(key_value_input_names, not_kv_inputs); + } + + // Inline helper function to check if name is matched with provided pattern followed by "_%d" + auto matches_pattern = [](const std::string& name, const std::string& pattern) -> bool { + size_t pos = name.find(pattern); + if (pos == std::string::npos) { + return false; + } + + size_t after_pattern = pos + pattern.length(); + if (after_pattern >= name.length() || name[after_pattern] != '_') { + return false; + } + + std::string suffix = name.substr(after_pattern + 1); + return !suffix.empty() && std::all_of(suffix.begin(), suffix.end(), ::isdigit); + }; + for (const ov::Output& input : model->inputs()) { auto& names = input.get_names(); - bool found = false; - for (auto& name : names) { - if (name.find("key_values") != std::string::npos) { - key_value_input_names.push_back(name); - found = true; - break; + + // Check if any input name contains either key or value pattern + for (const auto& name : names) { + for (const auto& pattern : kv_patterns) { + if (matches_pattern(name, pattern)) { + key_value_input_names.push_back(name); + found = true; + break; + } } + if (found) break; } if (!found) { @@ -138,20 +244,25 @@ void PatchStatefulDecoder(std::shared_ptr model) { } } - std::vector key_value_output_names; - for (const ov::Output& output : model->outputs()) { - auto& names = output.get_names(); - for (auto& name : names) { - if (name.find("present") != std::string::npos) { - key_value_output_names.push_back(name); - break; - } - } - } + return std::make_pair(key_value_input_names, not_kv_inputs); +} + +// Updated PatchStatefulDecoder function +void PatchStatefulDecoder(std::shared_ptr model) { + // Use the dynamic pattern-based extraction logic + auto [key_value_output_names, extracted_patterns] = ExtractKVPatternsFromOutputs(model); + auto [key_value_input_names, not_kv_inputs] = ExtractInputKVTensors(model, extracted_patterns); if (key_value_input_names.empty() || key_value_output_names.empty()) { - std::cout << "no key_value_input_names or key_value_output_names found" << std::endl; - return; + ORT_THROW("No key_value_input_names or key_value_output_names found"); + } + + if (key_value_input_names.size() != key_value_output_names.size()) { + ORT_THROW("Found different sizes between key_value_input_names (", + key_value_input_names.size(), + ") and key_value_output_names (", + key_value_output_names.size(), + "). They couldn't be paired."); } // By default, batch is the 0 - th but chatglm uses 1 - st dimension as batch @@ -295,13 +406,6 @@ void UpdateNPUConfig(ov::AnyMap& config, const KVAxesPosition& kv_pos, const KVD RenameKey(config, "PREFILL_HINT", "NPUW_LLM_PREFILL_HINT"); RenameKey(config, "GENERATE_CONFIG", "NPUW_LLM_GENERATE_CONFIG"); RenameKey(config, "GENERATE_HINT", "NPUW_LLM_GENERATE_HINT"); - - const size_t npuw_context_len_threshold = 2048; - if ((kv_desc.max_prompt_len + kv_desc.min_response_len) >= npuw_context_len_threshold) { - // This improves accuracy for generation sequences that exceed 2k tokens. 
- config["++NPUW_LLM_PREFILL_CONFIG"] = ov::AnyMap{{"NPUW_DEVICES", "NPU,CPU"}, {"NPUW_ONLINE_AVOID", "P:SinCos/NPU"}}; - config["++NPUW_LLM_GENERATE_CONFIG"] = ov::AnyMap{{"NPUW_DEVICES", "NPU,CPU"}, {"NPUW_ONLINE_AVOID", "P:SinCos/NPU"}}; - } } std::optional PopOptionNew(ov::AnyMap& config, const std::string& option_name) { diff --git a/onnxruntime/core/providers/openvino/ov_tracing.cc b/onnxruntime/core/providers/openvino/ov_tracing.cc new file mode 100644 index 0000000000000..79109552f3df6 --- /dev/null +++ b/onnxruntime/core/providers/openvino/ov_tracing.cc @@ -0,0 +1,228 @@ +// Copyright (c) Intel Corporation. All rights reserved. +// Licensed under the MIT License. +#include "core/providers/openvino/ov_tracing.h" + +#ifdef _WIN32 +#include +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 26440) +#endif +#include +#include +#include "core/platform/windows/TraceLoggingConfig.h" + +TRACELOGGING_DEFINE_PROVIDER( + ov_tracing_provider_handle, + "Intel.ML.ONNXRuntime.OpenVINO", + // {"b5a8c2e1-4d7f-4a3b-9c2e-1f8e5a6b7c9d"} + (0xb5a8c2e1, 0x4d7f, 0x4a3b, 0x9c, 0x2e, 0x1f, 0x8e, 0x5a, 0x6b, 0x7c, 0x9d), + TraceLoggingOptionMicrosoftTelemetry()); + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +namespace { +std::string EscapeJsonString(const std::string& input) { + std::string escaped; + // Reserve extra space for escaping + escaped.reserve(input.size() + input.size() / 5); + + for (char c : input) { + switch (c) { + case '\"': + escaped += "\\\""; + break; + case '\\': + escaped += "\\\\"; + break; + case '\b': + escaped += "\\b"; + break; + case '\f': + escaped += "\\f"; + break; + case '\n': + escaped += "\\n"; + break; + case '\r': + escaped += "\\r"; + break; + case '\t': + escaped += "\\t"; + break; + default: + if (static_cast(c) < 0x20) { + char unicode_escape[7]; + sprintf_s(unicode_escape, sizeof(unicode_escape), "\\u%04x", static_cast(c)); + escaped += unicode_escape; + } else { + escaped += c; + } + break; + } + } + return escaped; +} +} // namespace + +namespace onnxruntime { +namespace openvino_ep { + +std::mutex OVTracing::mutex_; +std::mutex OVTracing::provider_change_mutex_; +uint32_t OVTracing::global_register_count_ = 0; +bool OVTracing::enabled_ = true; +UCHAR OVTracing::level_ = 0; +UINT64 OVTracing::keyword_ = 0; +std::vector OVTracing::callbacks_; +std::mutex OVTracing::callbacks_mutex_; + +OVTracing::OVTracing() { + std::lock_guard lock(mutex_); + if (global_register_count_ == 0) { + HRESULT hr = TraceLoggingRegisterEx(ov_tracing_provider_handle, ORT_TL_EtwEnableCallback, nullptr); + if (SUCCEEDED(hr)) { + global_register_count_ += 1; + } + } +} + +OVTracing::~OVTracing() noexcept { + // Clean up TraceLogging, only hold mutex_ + try { + std::lock_guard lock(mutex_); + if (global_register_count_ > 0) { + global_register_count_ -= 1; + if (global_register_count_ == 0) { + TraceLoggingUnregister(ov_tracing_provider_handle); + } + } + } catch (...) { + // Suppress exceptions in destructor + } + + // Clean up callbacks, only hold callbacks_mutex_ + try { + std::lock_guard lock_callbacks(callbacks_mutex_); + callbacks_.clear(); + } catch (...) 
{ + // Suppress exceptions in destructor + } +} + +OVTracing& OVTracing::Instance() { + static OVTracing instance; + return instance; +} + +bool OVTracing::IsEnabled() const { + std::lock_guard lock(provider_change_mutex_); + return enabled_; +} + +UCHAR OVTracing::Level() const { + std::lock_guard lock(provider_change_mutex_); + return level_; +} + +UINT64 OVTracing::Keyword() const { + std::lock_guard lock(provider_change_mutex_); + return keyword_; +} + +void OVTracing::LogAllRuntimeOptions(uint32_t session_id, const SessionContext& ctx) const { + if (!IsEnabled()) return; + + // Log OpenVINO SDK version separately + TraceLoggingWrite(ov_tracing_provider_handle, "OV.SDK.Version", + TraceLoggingLevel(WINEVENT_LEVEL_VERBOSE), + TraceLoggingUInt32(session_id, "session_id"), + TraceLoggingString(ctx.openvino_sdk_version.c_str(), "openvino_sdk_version")); + + constexpr std::string_view provider_prefix = "ep.openvinoexecutionprovider."; + std::ostringstream provider_opts; + std::ostringstream session_opts; + bool provider_first = true; + bool session_first = true; + + provider_opts << "{"; + session_opts << "{"; + + // Segregate options based on prefix + for (const auto& [key, value] : ctx.runtime_config.options) { + if (!value.empty()) { + if (key.starts_with(provider_prefix)) { + // Provider option + if (!provider_first) provider_opts << ","; + provider_opts << "\"" << key << "\":\"" << EscapeJsonString(value) << "\""; + provider_first = false; + } else { + // Session option + if (!session_first) session_opts << ","; + session_opts << "\"" << key << "\":\"" << EscapeJsonString(value) << "\""; + session_first = false; + } + } + } + + provider_opts << "}"; + session_opts << "}"; + + // Log provider options only if there are any + if (!provider_first) { + TraceLoggingWrite(ov_tracing_provider_handle, "OVEP.Provider.Options", + TraceLoggingLevel(WINEVENT_LEVEL_VERBOSE), + TraceLoggingUInt32(session_id, "session_id"), + TraceLoggingString(provider_opts.str().c_str(), "provider_options")); + } + + // Log session options only if there are any + if (!session_first) { + TraceLoggingWrite(ov_tracing_provider_handle, "OVEP.Session.Options", + TraceLoggingLevel(WINEVENT_LEVEL_VERBOSE), + TraceLoggingUInt32(session_id, "session_id"), + TraceLoggingString(session_opts.str().c_str(), "session_options")); + } +} + +void OVTracing::RegisterInternalCallback(const EtwInternalCallback& callback) { + std::lock_guard lock_callbacks(callbacks_mutex_); + callbacks_.push_back(&callback); +} + +void OVTracing::UnregisterInternalCallback(const EtwInternalCallback& callback) { + std::lock_guard lock_callbacks(callbacks_mutex_); + auto new_end = std::remove_if(callbacks_.begin(), callbacks_.end(), + [&callback](const EtwInternalCallback* ptr) { + return ptr == &callback; + }); + callbacks_.erase(new_end, callbacks_.end()); +} + +void NTAPI OVTracing::ORT_TL_EtwEnableCallback( + _In_ LPCGUID SourceId, _In_ ULONG IsEnabled, _In_ UCHAR Level, _In_ ULONGLONG MatchAnyKeyword, + _In_ ULONGLONG MatchAllKeyword, _In_opt_ PEVENT_FILTER_DESCRIPTOR FilterData, _In_opt_ PVOID CallbackContext) { + { + std::lock_guard lock(provider_change_mutex_); + enabled_ = (IsEnabled != 0); + level_ = Level; + keyword_ = MatchAnyKeyword; + } + // Release lock before invoking callbacks to prevent deadlock + InvokeCallbacks(SourceId, IsEnabled, Level, MatchAnyKeyword, MatchAllKeyword, FilterData, CallbackContext); +} + +void OVTracing::InvokeCallbacks(LPCGUID SourceId, ULONG IsEnabled, UCHAR Level, ULONGLONG MatchAnyKeyword, + ULONGLONG 
MatchAllKeyword, PEVENT_FILTER_DESCRIPTOR FilterData, PVOID CallbackContext) { + std::lock_guard lock_callbacks(callbacks_mutex_); + for (const auto& callback : callbacks_) { + (*callback)(SourceId, IsEnabled, Level, MatchAnyKeyword, MatchAllKeyword, FilterData, CallbackContext); + } +} + +} // namespace openvino_ep +} // namespace onnxruntime + +#endif // defined(_WIN32) diff --git a/onnxruntime/core/providers/openvino/ov_tracing.h b/onnxruntime/core/providers/openvino/ov_tracing.h new file mode 100644 index 0000000000000..b558695d6f7c7 --- /dev/null +++ b/onnxruntime/core/providers/openvino/ov_tracing.h @@ -0,0 +1,64 @@ +// Copyright (c) Intel Corporation. All rights reserved. +// Licensed under the MIT License. +#pragma once + +#ifdef _WIN32 +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include "core/providers/openvino/contexts.h" + +TRACELOGGING_DECLARE_PROVIDER(ov_tracing_provider_handle); + +namespace onnxruntime { +namespace openvino_ep { + +class OVTracing { + public: + static OVTracing& Instance(); + bool IsEnabled() const; + unsigned char Level() const; + UINT64 Keyword() const; + + void LogAllRuntimeOptions(uint32_t session_id, const SessionContext& ctx) const; + + using EtwInternalCallback = std::function; + static void RegisterInternalCallback(const EtwInternalCallback& callback); + static void UnregisterInternalCallback(const EtwInternalCallback& callback); + + private: + OVTracing(); + ~OVTracing(); + OVTracing(const OVTracing&) = delete; + OVTracing& operator=(const OVTracing&) = delete; + OVTracing(OVTracing&&) = delete; + OVTracing& operator=(OVTracing&&) = delete; + + static std::mutex mutex_; + static uint32_t global_register_count_; + static bool enabled_; + static std::vector callbacks_; + static std::mutex callbacks_mutex_; + static std::mutex provider_change_mutex_; + static UCHAR level_; + static ULONGLONG keyword_; + + static void InvokeCallbacks(LPCGUID, ULONG, UCHAR, ULONGLONG, ULONGLONG, PEVENT_FILTER_DESCRIPTOR, PVOID); + static void NTAPI ORT_TL_EtwEnableCallback(_In_ LPCGUID, _In_ ULONG, _In_ UCHAR, _In_ ULONGLONG, + _In_ ULONGLONG, _In_opt_ PEVENT_FILTER_DESCRIPTOR, _In_opt_ PVOID); +}; + +} // namespace openvino_ep +} // namespace onnxruntime + +#endif // defined(_WIN32) diff --git a/onnxruntime/core/providers/openvino/ov_versions/capability.cc b/onnxruntime/core/providers/openvino/ov_versions/capability.cc index 1893700cab09c..40036212ca125 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/capability.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/capability.cc @@ -41,14 +41,16 @@ GetCapability::GetCapability(const EPCtxHandler& ep_ctx_handler, npu_qdq_optimizer_enabled = true; // see data_ops.cc ~615 where we check for int16 types for gpu, this may change to a better approach later } -#if OPENVINO_VERSION_MAJOR == 2025 && OPENVINO_VERSION_MINOR == 0 - data_ops_ = new DataOps(graph_viewer_, V_2025_0, device_type_, npu_qdq_optimizer_enabled); -#elif OPENVINO_VERSION_MAJOR == 2025 && OPENVINO_VERSION_MINOR == 1 - data_ops_ = new DataOps(graph_viewer_, V_2025_1, device_type_, npu_qdq_optimizer_enabled); +#if OPENVINO_VERSION_MAJOR == 2025 && OPENVINO_VERSION_MINOR == 1 + data_ops_ = std::make_unique(graph_viewer_, V_2025_1, device_type_, npu_qdq_optimizer_enabled); #elif OPENVINO_VERSION_MAJOR == 2025 && OPENVINO_VERSION_MINOR == 2 - data_ops_ = new DataOps(graph_viewer_, V_2025_2, device_type_, npu_qdq_optimizer_enabled); + data_ops_ = std::make_unique(graph_viewer_, 
V_2025_2, device_type_, npu_qdq_optimizer_enabled); +#elif OPENVINO_VERSION_MAJOR == 2025 && OPENVINO_VERSION_MINOR == 3 + data_ops_ = std::make_unique(graph_viewer_, V_2025_3, device_type_, npu_qdq_optimizer_enabled); +#elif OPENVINO_VERSION_MAJOR == 2025 && OPENVINO_VERSION_MINOR == 4 + data_ops_ = std::make_unique(graph_viewer_, V_2025_4, device_type_, npu_qdq_optimizer_enabled); #else - data_ops_ = new DataOps(graph_viewer_, V_2025_2, device_type_, npu_qdq_optimizer_enabled); + data_ops_ = std::make_unique(graph_viewer_, V_2025_4, device_type_, npu_qdq_optimizer_enabled); #endif } @@ -179,7 +181,7 @@ std::vector> GetCapability::Execute() { omit_subgraph = false; } else if (j < total_clusters - 1) { bool append_node = false; - while (j < total_clusters && !append_node) { + while (j < total_clusters - 1 && !append_node) { j = j + 1; append_node = AddTrivialClusterToNextClusterIfConnected(graph_viewer_, index, connected_clusters[j]); } diff --git a/onnxruntime/core/providers/openvino/ov_versions/capability.h b/onnxruntime/core/providers/openvino/ov_versions/capability.h index 364e79a76f154..3974bdc3b8ff9 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/capability.h +++ b/onnxruntime/core/providers/openvino/ov_versions/capability.h @@ -16,7 +16,7 @@ class GetCapability { const EPCtxHandler& ep_ctx_handler_; const GraphViewer& graph_viewer_; std::string device_type_; - DataOps* data_ops_; + std::unique_ptr data_ops_; bool is_wholly_supported_graph_ = false; bool has_external_weights_ = false; diff --git a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc index f848b89ed10c8..373b2121a9b60 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc @@ -96,6 +96,7 @@ std::vector supported_op_mode = { {"Atanh", V_2020_4, {"CPU"}}, {"Atanh", V_2022_1, {"GPU"}}, {"Attention", V_2023_0, {"CPU", "GPU"}}, + {"GroupQueryAttention", V_2025_1, {"GPU"}}, {"AveragePool", V_2020_4, {"CPU", "GPU"}}, {"BatchNormalization", V_2020_4, {"CPU", "GPU"}}, {"BiasGelu", V_2023_0, {"CPU", "GPU"}}, @@ -407,7 +408,7 @@ void DataOps::populate_op_mode_supported() { // populate unsupportedmode_t { - UnsupportedOpMode obj = {{V_2024_1, V_2024_2, V_2024_3, V_2024_4, V_2024_5, V_2024_6, V_2025_0, V_2025_1, V_2025_2}, + UnsupportedOpMode obj = {{V_2024_1, V_2024_2, V_2024_3, V_2024_4, V_2024_5, V_2024_6, V_2025_0, V_2025_1, V_2025_2, V_2025_3, V_2025_4}, [this](const Node* node, const InitializedTensorSet&) { // If the Input of ReduceMax op is UINT8, it is rejected (Due to output mismatch) for (size_t i = 0; i < node->InputDefs().size(); i++) { @@ -424,7 +425,7 @@ void DataOps::populate_op_mode_supported() { { UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2, V_2024_3, V_2024_4, V_2024_5, V_2024_6, V_2025_0, V_2025_1, - V_2025_2}, + V_2025_2, V_2025_3, V_2025_4}, [this](const Node* node, const InitializedTensorSet&) { const auto& input_args = node->InputDefs(); const auto& input_arg = (input_args.size() > 1) ? 
input_args[1] : input_args[0]; @@ -444,7 +445,7 @@ void DataOps::populate_op_mode_supported() { { UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2, V_2024_3, V_2024_4, V_2024_5, V_2024_6, V_2025_0, V_2025_1, - V_2025_2}, + V_2025_2, V_2025_3, V_2025_4}, [this](const Node* node, const InitializedTensorSet&) { // If the operator is unsqueeze // If axes is an input, then we cannot produce a static graph. @@ -460,7 +461,7 @@ void DataOps::populate_op_mode_supported() { } { UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2, V_2024_3, V_2024_4, V_2024_5, - V_2024_6, V_2025_0, V_2025_1, V_2025_2}, + V_2024_6, V_2025_0, V_2025_1, V_2025_2, V_2025_3, V_2025_4}, [this](const Node* node, const InitializedTensorSet&) { // check for attributes auto& upsample_attr = node->GetAttributes(); @@ -560,9 +561,7 @@ bool DataOps::type_is_supported(const NodeArg* node_arg, bool is_initializer) { } auto dtype = type_proto->tensor_type().elem_type(); - // Enable bfloat16 -> float16 on-the-fly conversion - if (dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_BFLOAT16 || - dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT16 || + if (dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT16 || dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT16) return true; if (is_initializer) { diff --git a/onnxruntime/core/providers/openvino/ov_versions/data_ops.h b/onnxruntime/core/providers/openvino/ov_versions/data_ops.h index 95905e010541e..cf6290ee07921 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/data_ops.h +++ b/onnxruntime/core/providers/openvino/ov_versions/data_ops.h @@ -36,7 +36,9 @@ enum versionNum { V_2024_6, V_2025_0, V_2025_1, - V_2025_2 + V_2025_2, + V_2025_3, + V_2025_4 }; using VersionNum = enum versionNum; diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp index 3a0db44bca7bc..84d391a3f2ff3 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp @@ -4,7 +4,6 @@ #include "qdq_scales_fix.h" #include "core/providers/openvino/ov_protobuf_utils.h" #include "core/framework/ort_value.h" -#include "core/common/float16.h" #include #include @@ -463,11 +462,35 @@ struct CustomGraph { } if (!is_prev_input) { - for (const auto& edge : output_edges) { + if (prev.node_ptr->OutputDefs()[0]->Type() != dq_node_ref.OutputDefs()[0]->Type()) { + NodeArg& output = original_graph.GetOrCreateNodeArg(prev.node_name + "_cast_0", dq_node_ref.OutputDefs()[0]->TypeAsProto()); + std::string cast_node_name = prev.node_ptr->OutputDefs()[0]->Name() + "_cast"; + InlinedVector input_args = {const_cast(prev.node_ptr->OutputDefs()[0])}; + InlinedVector output_args = {&output}; + Node& cast_node = original_graph.AddNode(cast_node_name, "Cast", "", input_args, output_args, nullptr, ""); + auto type_str = dq_node_ref.OutputDefs()[0]->Type(); + ORT_ENFORCE(type_str != nullptr, "Type string is null in QDQ scales fix."); + auto type_cast = type_str->find("tensor(float)") != std::string::npos ? 
onnx::TensorProto_DataType_FLOAT : onnx::TensorProto_DataType_FLOAT16; + ORT_ENFORCE((type_cast == onnx::TensorProto_DataType_FLOAT) || (type_str->find("tensor(float16)") != std::string::npos), + "QDQ type misalignment, expected float32 or float16 output"); + cast_node.AddAttribute("to", static_cast(type_cast)); original_graph.AddEdge(prev.node_ptr->Index(), - std::get<0>(edge), + cast_node.Index(), prev_output_index, - std::get<2>(edge)); + 0); + for (const auto& edge : output_edges) { + original_graph.AddEdge(cast_node.Index(), + std::get<0>(edge), + 0, + std::get<2>(edge)); + } + } else { + for (const auto& edge : output_edges) { + original_graph.AddEdge(prev.node_ptr->Index(), + std::get<0>(edge), + prev_output_index, + std::get<2>(edge)); + } } } } @@ -931,54 +954,5 @@ Status Transform(const GraphViewer& src_graph_viewer, return status; } } // namespace qdq_scales_fix - -namespace bfloat16_fix { -void replace_bf16_with_fp16(qdq_scales_fix::CustomGraph& gen_graph) { - for (auto& const_node : gen_graph.original_graph.Nodes()) { - auto node = const_cast(const_node); - if (node->OpType() == "Cast") { - for (auto& [name, const_attribute] : node->GetAttributes()) { - auto& attribute = const_cast(const_attribute); - if (name == "to" && attribute.type() == ONNX_NAMESPACE::AttributeProto_AttributeType_INT) - if (attribute.i() == ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16) - attribute.set_i(ONNX_NAMESPACE::TensorProto_DataType_FLOAT16); - } - } - for (auto& output : node->OutputDefs()) { - auto& output_proto = const_cast(output->ToProto().type()); - if (output_proto.mutable_tensor_type()->elem_type() == ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16) - output_proto.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT16); - } - } - - const auto& init_set = gen_graph.original_graph.GetAllInitializedTensors(); - for (auto& [key, const_tensor_proto] : init_set) { - auto tensor_proto = const_cast(const_tensor_proto); - auto dt = tensor_proto->data_type(); - if (dt == ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16) { - auto raw_data = tensor_proto->has_raw_data() ? 
reinterpret_cast(tensor_proto->mutable_raw_data()->data()) : nullptr; - if (raw_data) { - tensor_proto->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT16); - std::int64_t size = 1; - for (int i = 0; i < tensor_proto->dims_size(); ++i) - size *= tensor_proto->dims()[i]; - for (std::int64_t i = 0; i < size; ++i) { - raw_data[i] = onnxruntime::MLFloat16(onnxruntime::BFloat16::FromBits(raw_data[i])).val; - } - } - } - } -} - -Status Transform(const GraphViewer& src_graph_viewer, - const logging::Logger& logger, - /*out*/ std::unique_ptr& model) { - auto status = qdq_scales_fix::copy_model(src_graph_viewer, logger, model); - auto g = qdq_scales_fix::generate_graph_from_onnx(model->MainGraph()); - - replace_bf16_with_fp16(g); - return status; -} -} // namespace bfloat16_fix } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.h b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.h index 2182850d96c43..c54c531e1bd40 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.h +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.h @@ -15,10 +15,5 @@ Status Transform(const GraphViewer& src_graph, const logging::Logger& logger, /*out*/ std::unique_ptr& model); } -namespace bfloat16_fix { -Status Transform(const GraphViewer& src_graph, - const logging::Logger& logger, - /*out*/ std::unique_ptr& model); -} } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc index e010851f22e50..2e5bb7b8c86be 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc @@ -704,7 +704,7 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, bool enable_ovep_weight_sharing, bool enable_ovep_qdq_optimizer, /*out*/ std::unique_ptr& model, - /*out*/ sw& shared_weights) { + /*out*/ SharedContext& shared_context) { // NOTE: This function is a re-implementation of GraphViewerToProto() in core/graph/graph_proto_serializer.cc // with the following differences: // - Uses onnxruntime::Graph APIs instead of onnx::GraphProto APIs. 
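For orientation, the refactor in this file routes external-initializer metadata into the SharedContext introduced by ov_shared_context.h above, replacing the removed SharedContext::SharedWeights map; the next hunk records one entry per externally stored weight. A minimal sketch of the intended flow, using only names that appear in this change (the model directory, weight name, offset/size/location values and the compiled_model setup are placeholders, not taken from the patch):

    // Sketch only: register one external weight, then bind all shared weights
    // onto an OpenVINO infer request once a compiled model exists.
    std::filesystem::path model_dir = "/path/to/model_dir";  // placeholder
    auto shared_ctx = SharedContextManager::Get()->GetOrCreateSharedContext(model_dir / "model.onnx");

    // During QDQ stripping, each initializer kept as external data is recorded by name.
    shared_ctx->AddExternalWeight("layer0.weight",            // placeholder name
                                  /*offset=*/0,
                                  /*size=*/4096,
                                  /*location=*/"model.onnx.data");  // placeholder file

    // Graph inputs whose names match recorded weights receive the shared tensors.
    ov::InferRequest infer_request = compiled_model.create_infer_request();
    shared_ctx->SetSharedWeightsOnInferRequest(infer_request, model_dir);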
@@ -824,34 +824,28 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, }); // initialize map for creating metadata for initilizers with external weights - auto& metadata = shared_weights.metadata; - - const auto& insert_metadata = [&metadata](const ONNX_NAMESPACE::TensorProto& proto) { - sw::Metadata::Map::key_type key{proto.name()}; - sw::Metadata::Map::mapped_type value{}; + const auto& add_shared_weight = [&shared_context](const ONNX_NAMESPACE::TensorProto& proto) { using mutable_proto_t = ONNX_NAMESPACE::TensorProto*; auto& mutable_proto = *const_cast(&proto); auto* entry_protos = mutable_proto.mutable_external_data(); + + std::string location = ""; + size_t data_offset = 0, size = 0; for (int i = 0; i < entry_protos->size(); i++) { auto& string_entry_proto{entry_protos->at(i)}; const auto& pb_key{*(string_entry_proto.mutable_key())}; const auto& pb_value{*(string_entry_proto.mutable_value())}; if (pb_key == "location") { - value.location = pb_value; + location = pb_value; } else if (pb_key == "offset") { - value.data_offset = std::stoul(pb_value); + data_offset = std::stoul(pb_value); } else if (pb_key == "length") { - value.size = std::stoul(pb_value); + size = std::stoul(pb_value); } } - value.element_type = proto.data_type(); - value.dimensions.resize(proto.dims_size()); - for (uint32_t index = 0; auto& dim : value.dimensions) { - dim = proto.dims()[index++]; - } - metadata.emplace(key, std::move(value)); + shared_context.AddExternalWeight(proto.name(), data_offset, size, location); }; // Handle initializers @@ -871,7 +865,7 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, if (!is_quant_param) { // This is actual weight data - so to convert to input for weight sharing - insert_metadata(initializer_tensor); + add_shared_weight(initializer_tensor); AddInitializerAsInput(dst_graph, accumulated_inputs, src_graph, name); } else { // This is a quantization parameter - keep as initializer even if external @@ -912,7 +906,7 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, if (!init_with_data && utils::HasExternalData(initializer_tensor) && enable_ovep_weight_sharing) { - insert_metadata(initializer_tensor); + add_shared_weight(initializer_tensor); // Add initializer as input if it has external data AddInitializerAsInput(dst_graph, accumulated_inputs, src_graph, input->Name()); diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.h b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.h index 53de0fd019311..e649b3ec71943 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.h +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.h @@ -10,7 +10,7 @@ namespace onnxruntime { namespace openvino_ep { -using sw = SharedContext::SharedWeights; +class SharedContext; // Creates a new model without the DQ/Q operators in the src graph as per pre-defined rulesets Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, @@ -18,8 +18,7 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, bool enable_ovep_weight_sharing, bool enable_ovep_qdq_optimizer, /*out*/ std::unique_ptr& model, - /*out*/ sw& shared_weights); + /*out*/ SharedContext& shared_context); -bool dumpMetaDataMapToBinary(const sw::Metadata::Map& shared_weights, const std::string& filename); } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/weak_singleton.h 
b/onnxruntime/core/providers/openvino/weak_singleton.h new file mode 100644 index 0000000000000..949ed1b527c60 --- /dev/null +++ b/onnxruntime/core/providers/openvino/weak_singleton.h @@ -0,0 +1,40 @@ +// Copyright (C) Intel Corporation +// Licensed under the MIT License + +#pragma once + +#include +#include +#include "core/common/common.h" + +namespace onnxruntime { +namespace openvino_ep { + +template +class WeakSingleton { + public: + static std::shared_ptr Get() { + static std::weak_ptr instance; + static std::mutex mutex; + + auto ptr = instance.lock(); + if (!ptr) { + std::lock_guard lock(mutex); + // ensure another thread didn't create an instance while this thread was waiting + ptr = instance.lock(); + if (!ptr) { + ptr = std::make_shared(); + instance = ptr; + } + } + return ptr; + } + + protected: + WeakSingleton() = default; + virtual ~WeakSingleton() = default; + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(WeakSingleton); +}; + +} // namespace openvino_ep +} // namespace onnxruntime diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index 19c636ba6aff1..7195bfbc77bab 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -2508,9 +2508,9 @@ ORT_API_STATUS_IMPL(OrtApis::SessionOptionsAppendExecutionProvider_OpenVINO_V2, // arbitrary length to validate the key/value. adjust if/when needed. // TODO: are any other input validation checks required here (and in the other functions that process // provider options)? - if (strlen(provider_options_keys[i]) > 1024 || strlen(provider_options_values[i]) > 1024) { + if (strlen(provider_options_keys[i]) > 1024 || strlen(provider_options_values[i]) > 2048) { return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, - "Maximum string length for a provider options key/value is 1024."); + "Maximum string length for a provider options key is 1024 and value is 2048."); } provider_options[provider_options_keys[i]] = provider_options_values[i]; diff --git a/onnxruntime/python/onnxruntime_inference_collection.py b/onnxruntime/python/onnxruntime_inference_collection.py index 4c3313046457c..91216473bcad2 100644 --- a/onnxruntime/python/onnxruntime_inference_collection.py +++ b/onnxruntime/python/onnxruntime_inference_collection.py @@ -397,6 +397,16 @@ def run_with_iobinding(self, iobinding, run_options=None): """ self._sess.run_with_iobinding(iobinding._iobinding, run_options) + def set_ep_dynamic_options(self, options: dict[str, str]): + """ + Set dynamic options for execution providers. + + :param options: Dictionary of key-value pairs where both keys and values are strings. + These options will be passed to the execution providers to modify + their runtime behavior. 
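+
+        Example (the key shown is one option understood by some execution providers;
+        available keys and values are EP-specific):
+
+            session.set_ep_dynamic_options({"ep.dynamic.workload_type": "Efficient"})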
+ """ + self._sess.set_ep_dynamic_options(options) + def get_tuning_results(self): return self._sess.get_tuning_results() diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index c548f3df4fb27..92cf6b085c01e 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -1083,7 +1083,7 @@ static std::shared_ptr CreateExecutionProviderFactory ProviderOptions OV_provider_options_map; const std::unordered_set valid_provider_keys = {"device_type", "device_id", "device_luid", "cache_dir", "precision", "load_config", "context", "num_of_threads", "model_priority", "num_streams", "enable_opencl_throttling", "enable_qdq_optimizer", - "enable_causallm", "disable_dynamic_shapes", "reshape_input"}; + "enable_causallm", "disable_dynamic_shapes", "reshape_input", "layout"}; auto it = provider_options_map.find(type); if (it != provider_options_map.end()) { for (auto option : it->second) { @@ -1892,7 +1892,7 @@ void addObjectMethods(py::module& m, ExecutionProviderRegistrationFn ep_registra py::class_ py_sync_stream(m, "OrtSyncStream", R"pbdoc(Represents a synchronization stream for model inference.)pbdoc"); - py_sync_stream.def("get_handle", [](OrtSyncStream* stream) -> uintptr_t { + py_sync_stream.def("get_handle", [](OrtSyncStream* stream) -> uintptr_t { Ort::UnownedSyncStream ort_stream(stream); return reinterpret_cast(ort_stream.GetHandle()); }, R"pbdoc(SyncStream handle that can be converted to a string and added to SessionOptions)pbdoc"); @@ -2006,7 +2006,7 @@ for model inference.)pbdoc"); .def_property_readonly("allocator_type", [](const OrtMemoryInfo* mem_info) -> OrtAllocatorType { return mem_info->alloc_type; }, R"pbdoc(Allocator type)pbdoc") .def_property_readonly("device_mem_type", [](const OrtMemoryInfo* mem_info) -> OrtDeviceMemoryType { auto mem_type = mem_info->device.MemType(); - return (mem_type == OrtDevice::MemType::DEFAULT) ? + return (mem_type == OrtDevice::MemType::DEFAULT) ? 
OrtDeviceMemoryType_DEFAULT: OrtDeviceMemoryType_HOST_ACCESSIBLE ; }, R"pbdoc(Device memory type (Device or Host accessible).)pbdoc") .def_property_readonly("device_vendor_id", [](const OrtMemoryInfo* mem_info) -> uint32_t { return mem_info->device.Vendor(); }); @@ -2748,7 +2748,7 @@ including arg name, arg type (contains both type and shape).)pbdoc") auto res = sess->GetSessionHandle()->GetModelMetadata(); OrtPybindThrowIfError(res.first); return *(res.second); }, py::return_value_policy::reference_internal) - .def_property_readonly("input_meminfos", [](const PyInferenceSession* sess) -> py::list { + .def_property_readonly("input_meminfos", [](const PyInferenceSession* sess) -> py::list { Ort::ConstSession session(reinterpret_cast(sess->GetSessionHandle())); auto inputs_mem_info = session.GetMemoryInfoForInputs(); py::list result; @@ -2757,7 +2757,7 @@ including arg name, arg type (contains both type and shape).)pbdoc") result.append(py::cast(p_info, py::return_value_policy::reference)); } return result; }) - .def_property_readonly("output_meminfos", [](const PyInferenceSession* sess) -> py::list { + .def_property_readonly("output_meminfos", [](const PyInferenceSession* sess) -> py::list { Ort::ConstSession session(reinterpret_cast(sess->GetSessionHandle())); auto outputs_mem_info = session.GetMemoryInfoForOutputs(); py::list result; @@ -2810,6 +2810,53 @@ including arg name, arg type (contains both type and shape).)pbdoc") ORT_THROW("TunableOp and get_tuning_results are not supported in this build."); #endif }) + .def("set_ep_dynamic_options", [](PyInferenceSession* sess, const py::dict& options) { + std::vector keys; + std::vector values; + std::vector key_strings; + std::vector value_strings; + + // Reserve space to avoid reallocations + key_strings.reserve(options.size()); + value_strings.reserve(options.size()); + keys.reserve(options.size()); + values.reserve(options.size()); + + // Convert Python dict to C-style arrays + for (const auto& item : options) { + key_strings.emplace_back(py::str(item.first)); + value_strings.emplace_back(py::str(item.second)); + keys.push_back(key_strings.back().c_str()); + values.push_back(value_strings.back().c_str()); + } + + if (keys.empty()) { + ORT_THROW("No options were provided"); + } + + auto status = sess->GetSessionHandle()->SetEpDynamicOptions( + gsl::make_span(keys.data(), keys.size()), + gsl::make_span(values.data(), values.size())); + + if (!status.IsOK()) { + ORT_THROW("Failed to set EP dynamic options: " + status.ErrorMessage()); + } }, + R"pbdoc(Set dynamic options for execution providers. + + Args: + options (dict): Dictionary of key-value pairs where both keys and values are strings. + These options will be passed to the execution providers to modify + their runtime behavior. + + Example: + session.set_ep_dynamic_options({ + "option1": "value1", + "option2": "value2" + }) + + Raises: + RuntimeError: If no options are provided or if setting the options fails. 
+ )pbdoc") .def("set_tuning_results", [](PyInferenceSession* sess, py::list results, bool error_on_invalid) -> void { #if !defined(ORT_MINIMAL_BUILD) std::vector tuning_results; diff --git a/onnxruntime/test/contrib_ops/quantize_ops_test.cc b/onnxruntime/test/contrib_ops/quantize_ops_test.cc index db685967ae5ff..de10f14ef4538 100644 --- a/onnxruntime/test/contrib_ops/quantize_ops_test.cc +++ b/onnxruntime/test/contrib_ops/quantize_ops_test.cc @@ -287,6 +287,7 @@ TEST(QuantizeLinearContribOpTest, QuantizeLinear_per_tensor_float_int8) { 127, -127, 127, -128, 127, -128}); + test.SetOutputAbsErr("y", 1.0f); // Disable Tensorrt EP due to error: node1_quantize_scale_node: out of bounds channel axis 1. Number of input dimensions is 1. test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } @@ -311,6 +312,7 @@ TEST(QuantizeLinearContribOpTest, QuantizeLinear_per_tensor_float_uint16) { 32769, 32765, 65535, 0, 65535, 0}); + test.SetOutputAbsErr("y", 1.0f); // Disable Tensorrt EP due to error: unsupported data type test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); diff --git a/onnxruntime/test/providers/cpu/controlflow/loop_test.cc b/onnxruntime/test/providers/cpu/controlflow/loop_test.cc index 07cd2114372dd..0bed6b6e9abee 100644 --- a/onnxruntime/test/providers/cpu/controlflow/loop_test.cc +++ b/onnxruntime/test/providers/cpu/controlflow/loop_test.cc @@ -828,7 +828,8 @@ TEST(Loop, Opset11WithNoVariadicInputsAndOutputs) { test.AddOutput("loop_scan_out", {1}, {1.0f}); // Disable TensorRT on unsupported data type BOOL - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); + // Disable OpenVino for floating nodes + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider}); } // Test a combination of things: diff --git a/onnxruntime/test/providers/cpu/math/clip_test.cc b/onnxruntime/test/providers/cpu/math/clip_test.cc index c1452ab686279..7a4af4f4f504a 100644 --- a/onnxruntime/test/providers/cpu/math/clip_test.cc +++ b/onnxruntime/test/providers/cpu/math/clip_test.cc @@ -99,7 +99,8 @@ TEST(MathOpTest, Clip_Default_int64) { -5, 9, 82}); // TensorRT does not support Clip opset 12 yet. 
- test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); + // Skipping for OpenVINO because of the following error: Expected equality of these values: cur_expected[i] Which is: 11 cur_actual[i] Which is: 0 + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider}); } TEST(MathOpTest, Clip_Default_uint64) { diff --git a/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc b/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc index 289e94397fb39..ed67b531ef394 100644 --- a/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc @@ -853,6 +853,9 @@ TEST(CastOpTest, Int32ToInt4x2OddNumberOfElements) { } TEST(CastOpTest, Int32ToInt4x2EmptyTensor) { + if (DefaultOpenVINOExecutionProvider().get() != nullptr) { + GTEST_SKIP() << "The OpenVINO not support 0 size input"; + } // GIVEN const std::vector empty_shape{0}; const std::vector empty_input = {}; diff --git a/onnxruntime/test/providers/cpu/tensor/concat_op_test.cc b/onnxruntime/test/providers/cpu/tensor/concat_op_test.cc index b5e13c6377ccb..5f08b6df6785d 100644 --- a/onnxruntime/test/providers/cpu/tensor/concat_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/concat_op_test.cc @@ -73,6 +73,7 @@ TEST(ConcatOpTest, Concat1D_2) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, // TensorRT: no support for dynamic shape tensor kNnapiExecutionProvider, // NNAPI: concat does not support 0 size input + kOpenVINOExecutionProvider, // OpenVINO: does not support 0 size input kQnnExecutionProvider}); // QNN: not support dynamic shape tensor } @@ -118,6 +119,7 @@ TEST(ConcatOpTest, Concat2D_3) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, // TensorRT: no support for dynamic shape tensor kNnapiExecutionProvider, // NNAPI: concat does not support 0 size input + kOpenVINOExecutionProvider, // OpenVINO: does not support 0 size input kQnnExecutionProvider}); // QNN: not support dynamic shape tensor } diff --git a/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc b/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc index 46acb5a730a78..18eec7d1b42a3 100644 --- a/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc @@ -448,6 +448,7 @@ TEST(QuantizeLinearOpTest, Uint16) { 32769, 32765, 65535, 0, 65535, 0}); + test.SetOutputAbsErr("y", 1.0f); // Disable Tensorrt EP due to error: unsupported data type test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); @@ -477,6 +478,7 @@ TEST(QuantizeLinearOpTest, Int16) { 32767, -32768, 32767, -32768, 32767, -32768}); + test.SetOutputAbsErr("y", 1.0f); // Disable Tensorrt EP due to error: unsupported data type test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); @@ -501,6 +503,7 @@ TEST(QuantizeLinearOpTest, Int4) { test.AddOutput("y", dims, {Int4x2(-8, -7), Int4x2(-1, 1), Int4x2(2, 7), Int4x2(7, unused_val)}); + test.SetOutputAbsErr("y", 1.0f); test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } @@ -568,6 +571,7 @@ TEST(QuantizeLinearOpTest, OddLarge_Int4) { test.AddInput("scale", {}, {scale}, true); test.AddInput("zero_point", {}, {Int4x2(zp, unused_val)}, true); test.AddOutput("y", dims, output); + test.SetOutputAbsErr("y", 1.0f); test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } 
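A note on the test.SetOutputAbsErr("y", 1.0f) additions in these quantization tests: the change itself does not state a reason, but a tolerance of one quantization step is what you would expect if an execution provider breaks rounding ties differently from the reference (the ONNX QuantizeLinear spec rounds ties to even). A self-contained illustration of the off-by-one this can produce:

    #include <cmath>
    #include <cstdio>

    int main() {
      // A value landing exactly half-way between two integers of the quantization grid,
      // e.g. x / scale + zero_point.
      float scaled = 2.5f;
      // Ties-to-even (the default floating-point rounding mode) gives 2 ...
      int ties_to_even = static_cast<int>(std::nearbyint(scaled));
      // ... ties-away-from-zero gives 3: a one-step difference, hence the 1.0f tolerance.
      int ties_away = static_cast<int>(std::round(scaled));
      std::printf("ties_to_even=%d ties_away=%d\n", ties_to_even, ties_away);
      return 0;
    }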
@@ -594,6 +598,7 @@ TEST(QuantizeLinearOpTest, OddLarge_UInt4) { test.AddInput("scale", {}, {scale}, true); test.AddInput("zero_point", {}, {UInt4x2(zp, unused_val)}, true); test.AddOutput("y", dims, output); + test.SetOutputAbsErr("y", 1.0f); test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } @@ -611,6 +616,7 @@ TEST(QuantizeLinearOpTest, Int8_NegativeZeroPoint) { test.AddInput("y_scale", {}, {.039215686f}); test.AddInput("y_zero_point", {}, {-23}); test.AddOutput("y", dims, {-23, 28, 53, 104, 127, -74, -128, -128}); + test.SetOutputAbsErr("y", 1.0f); // Disable Tensorrt EP due to the error, node1_quantize_scale_node: out of bounds channel axis 1. Number of input dimensions is 1. test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } @@ -628,6 +634,7 @@ TEST(QuantizeLinearOpTest, Int8_PositiveZeroPoint) { test.AddInput("y_scale", {}, {.039215686f}); test.AddInput("y_zero_point", {}, {23}); test.AddOutput("y", dims, {23, 74, 99, 127, 127, -28, -104, -128}); + test.SetOutputAbsErr("y", 1.0f); // Disable Tensorrt EP due to error:node1_quantize_scale_node: out of bounds channel axis 1. Number of input dimensions is 1. test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } diff --git a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc index bb053bc37ce30..f3b0695bdbd9c 100644 --- a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc @@ -308,6 +308,7 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_uint8) { std::vector Y = {2, 4}; test.AddOutput("Y", {N, static_cast(H * scales[1]), static_cast(W * scales[2]), C}, Y); + test.SetOutputAbsErr("Y", 1.0f); // CUDA: result mismatch due to not implementing NHWC support // ROCm: results mismatch test.Run(OpTester::ExpectResult::kExpectSuccess, "", @@ -647,6 +648,7 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_pytorch_half_pixe std::vector Y = {1, 7, 12}; test.AddOutput("Y", {N, sizes[1], sizes[2], C}, Y); + test.SetOutputAbsErr("Y", 1.0f); // CUDA: result mismatch due to not implementing NHWC support // ROCm: results mismatch // DML: results mismatch diff --git a/onnxruntime/test/providers/cpu/tensor/slice_op.test.cc b/onnxruntime/test/providers/cpu/tensor/slice_op.test.cc index 5b2865a3feed7..657f3fe9c127a 100644 --- a/onnxruntime/test/providers/cpu/tensor/slice_op.test.cc +++ b/onnxruntime/test/providers/cpu/tensor/slice_op.test.cc @@ -540,6 +540,10 @@ TEST(SliceTest, Slice1D_ReverseAllAxes_1) { GTEST_SKIP() << "Skipping because of the following error: Expected output shape [{4}] did not match run output shape [{0}] for output"; } + if (DefaultOpenVINOExecutionProvider().get() != nullptr) { + GTEST_SKIP() << "Skipping because of the following error: The input ends do not support int max when step is negative."; + } + RunSliceTest({4}, {1.0f, 2.0f, 3.0f, 4.0f}, {-1}, diff --git a/onnxruntime/test/providers/openvino/openvino_ep_bfloat16_pass_test.cc b/onnxruntime/test/providers/openvino/openvino_ep_bfloat16_pass_test.cc deleted file mode 100644 index 105a35011a78d..0000000000000 --- a/onnxruntime/test/providers/openvino/openvino_ep_bfloat16_pass_test.cc +++ /dev/null @@ -1,116 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
- -#include -#include -#include - -#include "core/session/onnxruntime_cxx_api.h" -#include "core/common/float16.h" - -#include "test/util/include/test/test_environment.h" -#include "test/unittest_util/qdq_test_utils.h" - -#include "gtest/gtest.h" -#include "gmock/gmock.h" - -using namespace ONNX_NAMESPACE; -using namespace onnxruntime::logging; - -extern std::unique_ptr ort_env; - -class OVEP_BF16_Tests : public ::testing::TestWithParam {}; - -namespace detail { -auto ConstructModel() { - using namespace onnxruntime; - using namespace test; - - std::unordered_map domain_to_version; - domain_to_version[kOnnxDomain] = 19; - Model model("Bfloat16Tester", true, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), - domain_to_version, {}, DefaultLoggingManager().DefaultLogger()); - - Graph& graph = model.MainGraph(); - ModelTestBuilder builder(graph); - auto dim = 4; - std::vector input_data(dim, 1.0f); - auto* input = builder.MakeInput({dim}, input_data); - builder.graph_.SetInputs({input}); - - auto* cast_to_bf16 = builder.MakeIntermediate(); - Node& cast_node = builder.AddNode("Cast", {input}, {cast_to_bf16}, ""); - cast_node.AddAttribute("to", static_cast(ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16)); - - std::vector weight_data(dim * dim); - for (std::size_t i = 0; i < weight_data.size(); ++i) - weight_data[i] = onnxruntime::BFloat16(static_cast(i % dim) / dim); - auto* weights = builder.MakeInitializer({dim, dim}, weight_data); - - auto* matmul_out = builder.MakeIntermediate(); - builder.AddNode("MatMul", {cast_to_bf16, weights}, {matmul_out}); - - std::vector weight_data_2(dim * dim); - for (std::size_t i = 0; i < weight_data_2.size(); ++i) - weight_data_2[i] = onnxruntime::BFloat16(static_cast(i % dim) / dim); - auto* weights_2 = builder.MakeInitializer({dim, dim}, weight_data_2); - - auto* matmul_out_2 = builder.MakeIntermediate(); - builder.AddNode("MatMul", {matmul_out, weights_2}, {matmul_out_2}); - - auto* output = builder.MakeOutput(); - Node& cast2_node = builder.AddNode("Cast", {matmul_out_2}, {output}); - cast2_node.AddAttribute("to", static_cast(ONNX_NAMESPACE::TensorProto_DataType_FLOAT)); - - builder.SetGraphOutputs(); - auto st = model.MainGraph().Resolve(); - if (st != Status::OK()) - throw std::runtime_error(st.ErrorMessage()); - return model; -} - -auto ProbeDevice(const std::string& device) { - static std::map is_present; - if (is_present.find(device) == is_present.end()) { - Ort::SessionOptions sessionOptions; - std::unordered_map ov_options; - ov_options["device_type"] = device; - try { - sessionOptions.AppendExecutionProvider_OpenVINO_V2(ov_options); - is_present[device] = true; - } catch (...) { - is_present[device] = false; - } - } - return is_present[device]; -} -} // namespace detail - -namespace onnxruntime { -namespace test { - -TEST_P(OVEP_BF16_Tests, TestModelConversion) { - Ort::SessionOptions sessionOptions; - std::unordered_map ov_options; - const auto& device = GetParam(); - if (!::detail::ProbeDevice(device)) - GTEST_SKIP() << device + " is not available on this machine"; - - ov_options["device_type"] = device; - auto model = ::detail::ConstructModel(); - sessionOptions.AppendExecutionProvider_OpenVINO_V2(ov_options); - - std::string model_data; - model.ToProto().SerializeToString(&model_data); - auto model_data_span = AsByteSpan(model_data.data(), model_data.size()); - try { - Ort::Session session(*ort_env, model_data_span.data(), model_data_span.size(), sessionOptions); - } catch (...) 
{ - FAIL(); - } -} -INSTANTIATE_TEST_SUITE_P(OVEP_Tests, - OVEP_BF16_Tests, - ::testing::Values("CPU", "GPU", "NPU")); -} // namespace test -} // namespace onnxruntime diff --git a/onnxruntime/test/providers/openvino/openvino_ep_ext_init.cc b/onnxruntime/test/providers/openvino/openvino_ep_ext_init.cc new file mode 100644 index 0000000000000..139d9c0aaf2b1 --- /dev/null +++ b/onnxruntime/test/providers/openvino/openvino_ep_ext_init.cc @@ -0,0 +1,215 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include +#include +#include + +#include "core/session/onnxruntime_cxx_api.h" + +#include "test/util/include/test/test_environment.h" +#include "test/unittest_util/qdq_test_utils.h" + +#include "gtest/gtest.h" +#include "gmock/gmock.h" +#include "onnxruntime_session_options_config_keys.h" + +using namespace ONNX_NAMESPACE; +using namespace onnxruntime::logging; + +extern std::unique_ptr ort_env; + +class OVEP_ExtInit_Tests : public ::testing::TestWithParam {}; + +namespace { + +std::vector LoadFileToMemory(const std::string& path) { + std::ifstream file(path, std::ios::binary | std::ios::ate); + if (!file.is_open()) { + return std::vector(); + } + std::streamsize size = file.tellg(); + file.seekg(0, std::ios::beg); + std::vector buffer(static_cast(size)); + if (!file.read(reinterpret_cast(buffer.data()), size)) { + return std::vector(); + } + return buffer; +} + +auto ProbeDevice(const std::string& device) { + static std::map is_present; + if (is_present.find(device) == is_present.end()) { + Ort::SessionOptions sessionOptions; + std::unordered_map ov_options; + ov_options["device_type"] = device; + try { + sessionOptions.AppendExecutionProvider_OpenVINO_V2(ov_options); + is_present[device] = true; + } catch (...) { + is_present[device] = false; + } + } + return is_present[device]; +} +} // namespace + +namespace onnxruntime { +namespace test { + +// This test requires OV 2025.4+ to run; CI currently uses OV 2025.2, so the test is disabled until OV is updated +TEST_P(OVEP_ExtInit_Tests, DISABLED_ModelFromExtInit) { + const auto& device = GetParam(); + if (!ProbeDevice(device)) + GTEST_SKIP() << device + " is not available on this machine"; + + // Model and weights file paths + const std::string model_path = "ovep_ext_init_test.onnx"; + const std::string weights_path = "ovep_ext_init_test.onnx.data"; + const size_t num_initializers = 8; + const size_t floats_per_initializer = 64 * 1024 * 1024; // 64 million floats per initializer (256 MB each) + const size_t total_floats = num_initializers * floats_per_initializer; + const size_t total_bytes = total_floats * sizeof(float); + // min size threshold for new logic with ext initializers + ASSERT_GE(total_bytes, 32 * 1024 * 1024); + + // 1. Create initializers + std::vector> initializer_data; + for (size_t i = 0; i < num_initializers; ++i) + initializer_data.emplace_back(floats_per_initializer, static_cast(i + 1)); // W0:1, W1:2... + + // 2.
Build ONNX model with 8 external initializers and a chain of 8 Add nodes + { + ModelProto model_proto; + model_proto.set_ir_version(7); + model_proto.set_producer_name("openvino_extinit_test"); + model_proto.set_producer_version("1.0"); + model_proto.set_domain(""); + model_proto.set_model_version(1); + + auto* graph = model_proto.mutable_graph(); + graph->set_name("TestGraph"); + + // Input: shape [floats_per_initializer] + auto* input = graph->add_input(); + input->set_name("X"); + auto* input_type = input->mutable_type()->mutable_tensor_type(); + input_type->set_elem_type(TensorProto_DataType_FLOAT); + input_type->mutable_shape()->add_dim()->set_dim_value(floats_per_initializer); + + // Output: shape [floats_per_initializer] + auto* output = graph->add_output(); + output->set_name("Y"); + auto* output_type = output->mutable_type()->mutable_tensor_type(); + output_type->set_elem_type(TensorProto_DataType_FLOAT); + output_type->mutable_shape()->add_dim()->set_dim_value(floats_per_initializer); + + auto* opset_import = model_proto.add_opset_import(); + opset_import->set_domain(""); + opset_import->set_version(19); + + // Add initializers as external data + size_t offset = 0; + std::vector initializer_names; + for (size_t i = 0; i < num_initializers; ++i) { + std::string name = "W" + std::to_string(i); + initializer_names.push_back(name); + TensorProto* initializer = graph->add_initializer(); + initializer->set_name(name); + initializer->set_data_type(TensorProto_DataType_FLOAT); + initializer->add_dims(floats_per_initializer); + initializer->set_data_location(ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL); + auto* ext = initializer->add_external_data(); + ext->set_key("location"); + ext->set_value(weights_path); + ext = initializer->add_external_data(); + ext->set_key("offset"); + ext->set_value(std::to_string(offset)); + ext = initializer->add_external_data(); + ext->set_key("length"); + ext->set_value(std::to_string(floats_per_initializer * sizeof(float))); + offset += floats_per_initializer * sizeof(float); + } + + // nodes: X -> Add with Init[0] -> ... -> output Y + std::string prev_output = "X"; + std::string node_output; + for (size_t i = 0; i < num_initializers; ++i) { + node_output = (i == num_initializers - 1) ? "Y" : "A" + std::to_string(i); + auto* add_node = graph->add_node(); + add_node->set_op_type("Add"); + add_node->add_input(prev_output); + add_node->add_input(initializer_names[i]); + add_node->add_output(node_output); + prev_output = node_output; + } + + // Save model + std::ofstream model_file(model_path, std::ios::binary); + ASSERT_TRUE(model_proto.SerializeToOstream(&model_file)); + model_file.close(); + } + + // 3. Save weights file (concatenate all initializers) + { + std::ofstream weights_file(weights_path, std::ios::binary); + ASSERT_TRUE(weights_file.is_open()); + for (const auto& w : initializer_data) { + weights_file.write(reinterpret_cast(w.data()), w.size() * sizeof(float)); + } + weights_file.close(); + } + + // 4. Load model and weights into memory + std::vector model_data = LoadFileToMemory(model_path); + std::vector weights_data = LoadFileToMemory(weights_path); + + // 5. Prepare external initializer info + PathString weights_name_path(weights_path.begin(), weights_path.end()); + std::vector names_path = {weights_name_path}; + std::vector buffers = {reinterpret_cast(weights_data.data())}; + std::vector buffer_sizes = {weights_data.size()}; + + // 6.
Set up session options with OpenVINO + Ort::SessionOptions session_options; + session_options.AddConfigEntry(kOrtSessionOptionsDisableCPUEPFallback, "1"); + session_options.SetIntraOpNumThreads(1); + std::unordered_map ov_options = {{"device_type", device}}; + session_options.AppendExecutionProvider_OpenVINO_V2(ov_options); + session_options.AddExternalInitializersFromFilesInMemory(names_path, buffers, buffer_sizes); + + // 7. Create session from memory + Ort::Session session(*ort_env, model_data.data(), model_data.size(), session_options); + + // 8. Run inference to verify weights are loaded + std::vector input_data(floats_per_initializer, 2.0f); + std::vector input_shape = {static_cast(floats_per_initializer)}; + Ort::MemoryInfo mem_info = Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtDeviceAllocator, OrtMemTypeDefault); + Ort::Value input_tensor = Ort::Value::CreateTensor(mem_info, input_data.data(), input_data.size(), input_shape.data(), input_shape.size()); + + std::vector input_names = {"X"}; + std::vector output_names = {"Y"}; + std::vector output_tensors(1); + + session.Run(Ort::RunOptions{nullptr}, input_names.data(), &input_tensor, 1, output_names.data(), output_tensors.data(), 1); + + // Check output: should be input + W0 + W1 + W2... + auto* out_data = output_tensors[0].GetTensorMutableData(); + float expected = input_data[0]; + for (size_t i = 0; i < num_initializers; ++i) { + expected += initializer_data[i][0]; + } + + for (size_t i = 0; i < floats_per_initializer; ++i) + ASSERT_FLOAT_EQ(out_data[i], expected); + + // Cleanup + std::filesystem::remove(model_path); + std::filesystem::remove(weights_path); +} +INSTANTIATE_TEST_SUITE_P(OVEP_Tests, + OVEP_ExtInit_Tests, + ::testing::Values("CPU", "GPU", "NPU")); + +} // namespace test +} // namespace onnxruntime
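The OVEP_ExtInit_Tests case above exercises two session-option calls that an application uses when a model's weights live in an external .onnx.data file that has already been read into memory: AppendExecutionProvider_OpenVINO_V2 routes supported nodes to OpenVINO, and AddExternalInitializersFromFilesInMemory hands ORT the weight bytes so it never opens the file from disk. A condensed, hypothetical sketch of that setup outside the test harness; the file name, helper signature, and the "CPU" device choice are placeholders and not part of this change:

#include <string>
#include <unordered_map>
#include <vector>
#include "onnxruntime_cxx_api.h"

// Hypothetical helper: create a session for a model whose external weights
// were already loaded into `weights_bytes` by the caller.
Ort::Session CreateSessionWithInMemoryWeights(Ort::Env& env,
                                              const std::vector<char>& model_bytes,
                                              std::vector<char>& weights_bytes) {
  Ort::SessionOptions so;

  // Route supported nodes to the OpenVINO EP ("GPU" or "NPU" works the same way).
  std::unordered_map<std::string, std::string> ov_options = {{"device_type", "CPU"}};
  so.AppendExecutionProvider_OpenVINO_V2(ov_options);

  // The file name must match the "location" key stored in the model's
  // external_data entries; ORT then resolves each initializer's offset and
  // length inside the supplied buffer instead of reading the file from disk.
  std::vector<std::basic_string<ORTCHAR_T>> file_names = {ORT_TSTR("model.onnx.data")};
  std::vector<char*> buffers = {weights_bytes.data()};
  std::vector<size_t> lengths = {weights_bytes.size()};
  so.AddExternalInitializersFromFilesInMemory(file_names, buffers, lengths);

  // The model itself can also come from memory.
  return Ort::Session(env, model_bytes.data(), model_bytes.size(), so);
}

The test above drives the same calls with buffers produced by LoadFileToMemory and then checks the output of the Add chain, which confirms the external weights were actually resolved from the in-memory buffer.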
diff --git a/onnxruntime/test/unittest_util/checkers.cc b/onnxruntime/test/unittest_util/checkers.cc index 7b2a5a4a4ff2f..d4b30cd11f1a0 100644 --- a/onnxruntime/test/unittest_util/checkers.cc +++ b/onnxruntime/test/unittest_util/checkers.cc @@ -225,17 +225,27 @@ template <> struct TensorCheck { void operator()(const Tensor& expected, const Tensor& actual, const ValidateOutputParams& params, const std::string& /*provider_type*/) const { - ORT_UNUSED_PARAMETER(params); + const bool has_abs_err = params.absolute_error.has_value(); + Tensor expected_sorted, actual_sorted; const Int4x2* cur_expected; const Int4x2* cur_actual; const auto size = narrow(actual.Shape().Size()); cur_expected = expected.Data(); cur_actual = actual.Data(); + double threshold = 0.0f; + if (has_abs_err) { + threshold = *(params.absolute_error); + } for (size_t i = 0; i < size; ++i) { size_t r = i >> 1; size_t c = i & 0x1; - EXPECT_EQ(cur_expected[r].GetElem(c), cur_actual[r].GetElem(c)) << "i:" << i; + // TODO: the relative error is not used for int4 yet. + if (has_abs_err) { + EXPECT_NEAR(cur_expected[r].GetElem(c), cur_actual[r].GetElem(c), threshold) << "i:" << i; + } else { + EXPECT_EQ(cur_expected[r].GetElem(c), cur_actual[r].GetElem(c)) << "i:" << i; + } } } }; @@ -244,17 +254,28 @@ template <> struct TensorCheck { void operator()(const Tensor& expected, const Tensor& actual, const ValidateOutputParams& params, const std::string& /*provider_type*/) const { - ORT_UNUSED_PARAMETER(params); + const bool has_abs_err = params.absolute_error.has_value(); + Tensor expected_sorted, actual_sorted; const UInt4x2* cur_expected; const UInt4x2* cur_actual; const auto size = narrow(actual.Shape().Size()); cur_expected = expected.Data(); cur_actual = actual.Data(); - for (size_t i = 0; i < size; ++i) { + double threshold = 0.0f; + if (has_abs_err) { + threshold = *(params.absolute_error); + } + + for (size_t i = 0; i < static_cast(size); ++i) { size_t r = i >> 1; size_t c = i & 0x1; - EXPECT_EQ(cur_expected[r].GetElem(c), cur_actual[r].GetElem(c)) << "i:" << i; + // TODO: the relative error is not used for int4 yet. + if (has_abs_err) { + EXPECT_NEAR(cur_expected[r].GetElem(c), cur_actual[r].GetElem(c), threshold) << "i:" << i; + } else { + EXPECT_EQ(cur_expected[r].GetElem(c), cur_actual[r].GetElem(c)) << "i:" << i; + } } } }; @@ -292,7 +313,7 @@ struct TensorCheck { // For any other EPs, we still expect an exact match for the results // TODO: Verify if DML can possibly have a ROUNDING_MODE parameter and conform to the other EPs #41968513 if ((provider_type == kNnapiExecutionProvider || provider_type == kDmlExecutionProvider || - provider_type == kXnnpackExecutionProvider) && + provider_type == kXnnpackExecutionProvider || provider_type == kOpenVINOExecutionProvider) && (has_abs_err || has_rel_err)) { double threshold = has_abs_err ? *(params.absolute_error) : 0.0; @@ -357,6 +378,49 @@ struct TensorCheck { } }; +template <> +struct TensorCheck { + void operator()(const Tensor& expected, + const Tensor& actual, + const ValidateOutputParams& params, + const std::string&) const { + const bool has_abs_err = params.absolute_error.has_value(); + const bool has_rel_err = params.relative_error.has_value(); + + Tensor expected_sorted, actual_sorted; + const uint16_t* cur_expected; + const uint16_t* cur_actual; + const auto size = actual.Shape().Size(); + if (params.sort_output) { + sort_expected_and_actual_buffers(expected, expected_sorted, actual, actual_sorted); + cur_expected = expected_sorted.Data(); + cur_actual = actual_sorted.Data(); + } else { + cur_expected = expected.Data(); + cur_actual = actual.Data(); + } + + if (has_abs_err || has_rel_err) { + double threshold = has_abs_err ? *(params.absolute_error) + : 0.0; + + for (int64_t i = 0; i < size; ++i) { + if (has_rel_err) { + EXPECT_NEAR(cur_expected[i], cur_actual[i], + *(params.relative_error) * cur_expected[i]) // expected[i] is unsigned, can't be negative + << "i:" << i; + } else { // has_abs_err + EXPECT_NEAR(cur_expected[i], cur_actual[i], threshold) << "i:" << i; + } + } + } else { + for (int64_t i = 0; i < size; ++i) { + EXPECT_EQ(cur_expected[i], cur_actual[i]) << "i:" << i; + } + } + } +}; + template <> struct TensorCheck { void operator()(const Tensor& expected,