diff --git a/cmake/onnxruntime_providers_openvino.cmake b/cmake/onnxruntime_providers_openvino.cmake
index 606ab1fe5ba89..f1a2e752f7965 100644
--- a/cmake/onnxruntime_providers_openvino.cmake
+++ b/cmake/onnxruntime_providers_openvino.cmake
@@ -13,8 +13,8 @@
   # Header paths
   find_package(OpenVINO REQUIRED COMPONENTS Runtime ONNX)
-  if(OpenVINO_VERSION VERSION_LESS 2024.4)
-    message(FATAL_ERROR "OpenVINO 2024.4 and newer are supported. Please, use latest OpenVINO release")
+  if(OpenVINO_VERSION VERSION_LESS 2024.5)
+    message(FATAL_ERROR "OpenVINO 2024.5 and newer are supported. Please use the latest OpenVINO release")
   endif()

   if(OpenVINO_VERSION VERSION_GREATER_EQUAL 2024.4)
@@ -30,7 +30,7 @@
   endif()

   list(APPEND OPENVINO_LIB_LIST openvino::frontend::onnx openvino::runtime ${PYTHON_LIBRARIES})
-  if ((DEFINED ENV{OPENCL_LIBS}) AND (DEFINED ENV{OPENCL_INCS}))
+  if ((DEFINED ENV{OPENCL_LIBS}) AND (DEFINED ENV{OPENCL_INCS}) AND onnxruntime_USE_OPENVINO_GPU)
     add_definitions(-DIO_BUFFER_ENABLED=1)
     list(APPEND OPENVINO_LIB_LIST $ENV{OPENCL_LIBS})
   endif()
@@ -86,4 +86,4 @@
   set_target_properties(onnxruntime_providers_openvino PROPERTIES
     MAP_IMPORTED_CONFIG_RELEASE RelWithDebInfo
     MAP_IMPORTED_CONFIG_DEBUG RelWithDebInfo
-  )
\ No newline at end of file
+  )
diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc
index a0bcf953938d9..16a92b43adaf6 100644
--- a/onnxruntime/core/providers/openvino/backend_manager.cc
+++ b/onnxruntime/core/providers/openvino/backend_manager.cc
@@ -10,8 +10,10 @@
 #include
 #include
 #include
+#include

 #include "core/providers/shared_library/provider_api.h"
+#include "core/providers/openvino/ov_versions/capability.h"
 #include "core/providers/openvino/contexts.h"
 #include "core/providers/openvino/backend_manager.h"
 #include "core/providers/openvino/ibackend.h"
@@ -21,8 +23,8 @@
 namespace onnxruntime {
 namespace openvino_ep {

-GlobalContext& BackendManager::GetGlobalContext() {
-  return global_context_;
+SessionContext& BackendManager::GetSessionContext() {
+  return session_context_;
 }

 ov::CompiledModel& BackendManager::GetOVCompiledModel() {
@@ -30,75 +32,95 @@ ov::CompiledModel& BackendManager::GetOVCompiledModel() {
   return (ov_ptr);
 }

-BackendManager::BackendManager(const GlobalContext& global_context,
+BackendManager::BackendManager(SessionContext& session_context,
+                               SharedContext& shared_context,
                                const onnxruntime::Node& fused_node,
                                const onnxruntime::GraphViewer& subgraph,
                                const logging::Logger& logger,
-                               EPCtxHandler& ep_ctx_handle_) {
-  global_context_ = global_context;
-
-  openvino_sdk_version_ = std::to_string(global_context_.OpenVINO_Version.at(0)) + "." +
-                          std::to_string(global_context_.OpenVINO_Version.at(1));
-  if (ep_ctx_handle_.CheckForOVEPCtxNode(subgraph, openvino_sdk_version_)) {
-    if (ep_ctx_handle_.ImportBlobFromEPCtxModel(subgraph, global_context_.ep_context_embed_mode) != Status::OK())
-      ORT_THROW("Import blob from model failed");
-  }
+                               EPCtxHandler& ep_ctx_handle) : ep_ctx_handle_(ep_ctx_handle),
+                                                              session_context_(session_context),
+                                                              shared_context_{shared_context} {
+  subgraph_context_.is_ep_ctx_graph = ep_ctx_handle_.CheckForOVEPCtxNodeInGraph(subgraph);
+
+  subgraph_context_.model_precision = [&](const GraphViewer& graph_viewer) {
+    // return empty if graph has no inputs or if types are not one of FP32/FP16
+    // else assume the type of the first input
+    if (graph_viewer.GetInputs().empty()) {
+      return "";
+    } else {
+      auto input_type = graph_viewer.GetInputs()[0]->TypeAsProto()->tensor_type().elem_type();
+      if (session_context_.precision == "ACCURACY" &&
+          session_context_.device_type.find("GPU") != std::string::npos) {
+        if (input_type == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT) {
+          return "FP32";
+        } else if (input_type == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT16) {
+          return "FP16";
+        }
+      }
+    }
+    return "";
+  }(subgraph);

   // Save the indexes of graph inputs among fused_node's inputDefs
   // (which also contains initializers).
-  auto node_input_defs = fused_node.InputDefs();
-  int i = 0;
-  for (auto idef : node_input_defs) {
-    subgraph_context_.input_names.insert({idef->Name(), i});
-    i++;
+  for (uint32_t index = 0; const auto& node : subgraph.GetInputs()) {
+    subgraph_context_.input_names.insert({node->Name(), index++});
   }

-  const std::vector<const NodeArg*>& graph_inputs = subgraph.GetInputs();
-  for (auto input : graph_inputs) {
-    auto it = subgraph_context_.input_names.find(input->Name());
-    if (it == subgraph_context_.input_names.end()) {
-      ORT_THROW("Input not found in the input defs list");
-    }
-    int index = it->second;
-    subgraph_context_.input_indexes.push_back(index);
+  for (uint32_t index = 0; const auto& node : subgraph.GetOutputs()) {
+    subgraph_context_.output_names.insert({node->Name(), index++});
   }

-  auto graph_outputs_defs = fused_node.OutputDefs();
-  i = 0;
-  for (auto output_def : graph_outputs_defs) {
-    subgraph_context_.output_names.insert({output_def->Name(), i});
-    i++;
-  }
   subgraph_context_.subgraph_name = fused_node.Name();
+
+  ptr_stream_t model_stream;
   std::unique_ptr<ONNX_NAMESPACE::ModelProto> model_proto;
-  if (!ep_ctx_handle_.IsValidOVEPCtxGraph()) {
+  if (subgraph_context_.is_ep_ctx_graph) {
+    model_stream = ep_ctx_handle_.GetModelBlobStream(session_context_.so_context_file_path, subgraph);
+  } else {
     model_proto = GetModelProtoFromFusedNode(fused_node, subgraph, logger);
   }
-  std::string device_type = openvino_ep::BackendManager::GetGlobalContext().device_type;
+  std::string device_type = session_context_.device_type;
+
+  auto& sw = shared_context_.shared_weights;
+  if (session_context_.so_share_ep_contexts) {
+    std::filesystem::path weight_filename = session_context_.onnx_model_path_name.parent_path();
+    if (sw.external_weight_filename.empty() && !sw.metadata.empty()) {
+      // Reasonable assumption that all metadata entries have the same external file location
+      sw.external_weight_filename = sw.metadata.begin()->second.location;
+    }
+    weight_filename /= sw.external_weight_filename;
+    std::ifstream weight_file(weight_filename);
+
+    if (weight_file) {
+      if (!sw.mapped_weights) {
+        sw.mapped_weights = std::make_unique<SharedContext::SharedWeights::WeightsFile>(weight_filename);
+      }
+      backend_utils::CreateOVTensors(session_context_.device_type, sw.metadata, *sw.mapped_weights);
+    }
+  }

   if (ModelHasSymbolicInputDims(subgraph)) {
     subgraph_context_.has_dynamic_input_shape = true;
     LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Model has symbolic input dims";
-    ORT_ENFORCE(!global_context_.enable_qdq_optimizer,
-                "QDQ stripping should not be enabled for models with dynamic input shapes. "
-                "Set enable_qdq_optimizer to False");
-    if ((GetGlobalContext().device_type.find("CPU") != std::string::npos ||
-         GetGlobalContext().device_type.find("GPU") != std::string::npos) &&
-        !GetGlobalContext().disable_dynamic_shapes) {
+    if ((session_context_.device_type.find("CPU") != std::string::npos ||
+         session_context_.device_type.find("GPU") != std::string::npos) &&
+        !session_context_.disable_dynamic_shapes) {
       LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Starting backend initialization. "
                          << "Creating backend Dynamic Shapes";
       try {
         concrete_backend_ = BackendFactory::MakeBackend(model_proto,
-                                                        GetGlobalContext(),
+                                                        session_context_,
                                                         subgraph_context_,
-                                                        ep_ctx_handle_);
+                                                        shared_context_,
+                                                        model_stream);
       } catch (std::string const& msg) {
         ORT_THROW(msg);
       }
       LOGS_DEFAULT(INFO) << "[OpenVINO-EP] "
                          << "Backend created for graph " << subgraph_context_.subgraph_name;
     } else {
-      // Only cache model_proto in global to rewrite the model with input shapes at runtime.
+      // Only cache model_proto in session context to rewrite the model with input shapes at runtime.
       // For dynamic backend creation
       model_proto_ = std::move(model_proto);
     }
@@ -112,14 +134,15 @@ BackendManager::BackendManager(const GlobalContext& global_context,
     // OV NPU plugin is supported with fallback to OV CPU upon compilation failures.
     try {
       concrete_backend_ = BackendFactory::MakeBackend(model_proto,
-                                                      GetGlobalContext(),
+                                                      session_context_,
                                                       subgraph_context_,
-                                                      ep_ctx_handle_);
+                                                      shared_context_,
+                                                      model_stream);
     } catch (const OnnxRuntimeException& ex) {
       std::string exception_str = ex.what();
       bool eligible_for_cpu_fallback = device_type.find("NPU") != std::string::npos &&
-                                       !GetGlobalContext().disable_cpu_fallback &&
-                                       !ep_ctx_handle_.IsValidOVEPCtxGraph();
+                                       !session_context_.so_disable_cpu_ep_fallback &&
+                                       !subgraph_context_.is_ep_ctx_graph;
 #if defined(OPENVINO_DISABLE_NPU_FALLBACK)
       eligible_for_cpu_fallback = false;
 #else
@@ -127,13 +150,14 @@ BackendManager::BackendManager(const GlobalContext& global_context,
        LOGS_DEFAULT(VERBOSE) << exception_str;
        LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU."
<< "Falling back to OV CPU for execution"; - GetGlobalContext().device_type = "CPU"; - GetGlobalContext().precision_str = "FP32"; + session_context_.device_type = "CPU"; + session_context_.precision = "FP32"; try { concrete_backend_ = BackendFactory::MakeBackend(model_proto, - GetGlobalContext(), + session_context_, subgraph_context_, - ep_ctx_handle_); + shared_context_, + model_stream); } catch (std::string const& msg) { ORT_THROW(msg); } @@ -165,9 +189,8 @@ BackendManager::BackendManager(const GlobalContext& global_context, } } } - if (global_context_.export_ep_ctx_blob && !ep_ctx_handle_.IsValidOVEPCtxGraph()) { - auto status = onnxruntime::openvino_ep::BackendManager::ExportCompiledBlobAsEPCtxNode(subgraph, - logger); + if (session_context_.so_context_enable && !subgraph_context_.is_ep_ctx_graph) { + auto status = onnxruntime::openvino_ep::BackendManager::ExportCompiledBlobAsEPCtxNode(subgraph); if ((!status.IsOK())) { ORT_THROW(status); } @@ -178,9 +201,8 @@ BackendManager::BackendManager(const GlobalContext& global_context, // precompiled blob is set. If that's the case: // By default, create model in embed mode where the blob stream is exported as data within // the EPContext node. -Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphViewer& graph_body_viewer, - const logging::Logger& logger) { - if (GetGlobalContext().disable_dynamic_shapes && subgraph_context_.has_dynamic_input_shape) { +Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphViewer& graph_body_viewer) { + if (session_context_.disable_dynamic_shapes && subgraph_context_.has_dynamic_input_shape) { std::string exception_str = "Exporting dynamically compiled models at runtime is not supported. " "Cannot export blobs of dynamic models that request static shape inference. " @@ -188,47 +210,48 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie ORT_THROW(exception_str); } - std::string model_blob_str; - auto compiled_model = concrete_backend_->GetOVCompiledModel(); - std::string graph_name = ""; - // Epctx file path from SO is mapped to cache_dir variable for OVEP for readability - if (!global_context_.cache_dir.empty()) { - graph_name = global_context_.cache_dir; - } else { - graph_name = global_context_.onnx_model_path_name; - // Remove extension so we can append suffix to form the complete name of output graph - size_t dot = global_context_.onnx_model_path_name.find_last_of("."); - graph_name = graph_name.substr(0, dot); - if (dot != std::string::npos) graph_name += "_ctx.onnx"; - } - // If embed_mode, then pass on the serialized blob // If not embed_mode, dump the blob here and only pass on the path to the blob - if (global_context_.ep_context_embed_mode) { + std::string model_blob_str; + auto compiled_model = concrete_backend_->GetOVCompiledModel(); + if (session_context_.so_context_embed_mode) { // Internal blob std::ostringstream model_blob_stream; compiled_model.export_model(model_blob_stream); model_blob_str = std::move(model_blob_stream).str(); if (model_blob_str.empty()) { ORT_THROW("Model blob stream is empty after exporting the compiled model."); } - } else { - // Remove extension so we can append suffix to form the complete name of output graph - auto blob_name = graph_name.substr(0, graph_name.find_last_of(".")); - std::ofstream blob_file(blob_name + ".blob", + } else { // External blob + // Build name by combining EpCtx model name (if available) and subgraph name. 
+    // name is not available when creating a session from memory
+    auto name = session_context_.so_context_file_path.stem().string();
+    if (!name.empty() && !graph_body_viewer.ModelPath().empty()) {
+      name = graph_body_viewer.ModelPath().stem().string();
+    }
+    if (!name.empty()) {
+      name += "_";
+    }
+    name += subgraph_context_.subgraph_name;
+
+    std::filesystem::path blob_filename = session_context_.so_context_file_path;
+    if (blob_filename.empty()) {
+      blob_filename = session_context_.onnx_model_path_name;
+    }
+    blob_filename = blob_filename.parent_path() / name;
+    blob_filename.replace_extension("blob");
+    std::ofstream blob_file(blob_filename,
                             std::ios::out | std::ios::trunc | std::ios::binary);
     if (!blob_file) {
       ORT_THROW("Unable to open file for epctx model dump.");
     }
     compiled_model.export_model(blob_file);
-    model_blob_str = blob_name + ".blob";
+    model_blob_str = blob_filename.filename().string();
   }

-  ORT_RETURN_IF_ERROR(ep_ctx_handle_.ExportEPCtxModel(graph_body_viewer,
-                                                      graph_name,
-                                                      logger,
-                                                      global_context_.ep_context_embed_mode,
-                                                      std::move(model_blob_str),
-                                                      openvino_sdk_version_));
+  ORT_RETURN_IF_ERROR(ep_ctx_handle_.AddOVEPCtxNodeToGraph(graph_body_viewer,
+                                                           subgraph_context_.subgraph_name,
+                                                           session_context_.so_context_embed_mode,
+                                                           std::move(model_blob_str)));

   return Status::OK();
 }
@@ -236,8 +259,8 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie
 bool BackendManager::ModelHasBatchedInputs(const ONNX_NAMESPACE::ModelProto& model_proto) const {
   bool has_batched_inputs = true;

-  for (int i = 0; i < static_cast<int>(subgraph_context_.input_indexes.size()); i++) {
-    auto& input = model_proto.graph().input(subgraph_context_.input_indexes[i]);
+  for (const auto& [name, index] : subgraph_context_.input_names) {
+    auto& input = model_proto.graph().input(index);

     // Batch-process only raw image inputs (NCHW or NHWC layouts)
     auto& shape = input.type().tensor_type().shape();
@@ -251,8 +274,8 @@ bool BackendManager::ModelHasBatchedInputs(const ONNX_NAMESPACE::ModelProto& mod
       break;
     }

-    for (int index = 1; index < 4; index++) {
-      if (shape.dim(index).value_case() != shape.dim(0).kDimValue) {
+    for (int dim_index = 1; dim_index < 4; dim_index++) {
+      if (shape.dim(dim_index).value_case() != shape.dim(0).kDimValue) {
         has_batched_inputs = false;
         break;
       }
@@ -299,27 +322,20 @@ static bool IsQDQGraph(const onnxruntime::GraphViewer& graph_viewer) {
   return false;
 }

-static void DumpOpenVINOEPModel(std::string onnx_model_path_name,
+static void DumpOpenVINOEPModel(const std::filesystem::path& onnx_model_path_name,
                                 ONNX_NAMESPACE::ModelProto* model_proto,
                                 const onnxruntime::Node& fused_node) {
   if (openvino_ep::backend_utils::IsDebugEnabled()) {
-    auto model_name = onnx_model_path_name.empty() ? "unknown.onnx" : std::move(onnx_model_path_name);
-#ifdef _WIN32
-    size_t slash = model_name.find_last_of("\\");
-#else
-    size_t slash = model_name.find_last_of("/");
-#endif
-    model_name = model_name.substr(slash + 1, std::string::npos);
-    size_t dot = model_name.find_last_of(".");
-    model_name = model_name.substr(0, dot);
+    auto model_name = onnx_model_path_name.empty() ? "unknown.onnx" : onnx_model_path_name.filename();
"unknown.onnx" : onnx_model_path_name.filename(); - std::string subgraph_name = fused_node.Name(); + const auto& subgraph_name = fused_node.Name(); size_t dash = subgraph_name.find_last_of("-"); - subgraph_name = subgraph_name.substr(dash, std::string::npos); - - const std::string name = model_name + subgraph_name + ".onnx"; + if (dash != std::string::npos) { + auto new_name = model_name.stem().string() + subgraph_name.substr(dash, std::string::npos); + model_name.replace_filename(new_name); + } - std::fstream dump(name, std::ios::out | std::ios::trunc | std::ios::binary); + std::fstream dump(model_name, std::ios::out | std::ios::trunc | std::ios::binary); model_proto->SerializeToOstream(dump); } } @@ -344,17 +360,18 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, } }; + const auto& onnx_model_path_name = subgraph.ModelPath(); // QDQ stripping enabled only for the NPU - if (global_context_.device_type.find("NPU") != std::string::npos && - global_context_.enable_qdq_optimizer && + if (session_context_.device_type.find("NPU") != std::string::npos && + session_context_.enable_qdq_optimizer && IsQDQGraph(subgraph)) { LOGS_DEFAULT(INFO) << "[OpenVINO-EP] QDQ optimization pass status: 1"; std::unique_ptr model; - Status status = CreateModelWithStrippedQDQNodes(subgraph, logger, model); + Status status = CreateModelWithStrippedQDQNodes(subgraph, logger, session_context_.so_share_ep_contexts, model, shared_context_.shared_weights); auto model_proto = model->ToProto(); model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); print_model_proto_duration(); - DumpOpenVINOEPModel(global_context_.onnx_model_path_name, model_proto.get(), fused_node); + DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node); ORT_ENFORCE(status.IsOK(), status.ErrorMessage()); return model_proto; } else { @@ -364,7 +381,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); subgraph.ToProto(*model_proto->mutable_graph(), true, true); print_model_proto_duration(); - DumpOpenVINOEPModel(global_context_.onnx_model_path_name, model_proto.get(), fused_node); + DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node); return model_proto; } } @@ -456,16 +473,17 @@ void BackendManager::Compute(OrtKernelContext* context) { // by rewriting the model to static shaped model at runtime based on input shape. // disable_dynamic_shapes is always set to true for OV NPU plugin. 
   if (subgraph_context_.has_dynamic_input_shape &&
-      !GetGlobalContext().disable_dynamic_shapes &&
-      (GetGlobalContext().device_type.find("CPU") != std::string::npos ||
-       GetGlobalContext().device_type.find("GPU") != std::string::npos)) {
+      !session_context_.disable_dynamic_shapes &&
+      (session_context_.device_type.find("CPU") != std::string::npos ||
+       session_context_.device_type.find("GPU") != std::string::npos)) {
     concrete_backend_->Infer(context);
   } else if (subgraph_context_.has_dynamic_input_shape) {
     std::vector<std::vector<int64_t>> tensor_shapes = GetInputTensorShapes(ctx);
-    auto key = MakeMapKeyString(tensor_shapes, GetGlobalContext().device_type);
+    auto key = MakeMapKeyString(tensor_shapes, session_context_.device_type);
     std::shared_ptr<IBackend> dynamic_backend;
     auto search = backend_map_.find(key);
     if (search == backend_map_.end()) {
+      ptr_stream_t model_stream;
       LOGS_DEFAULT(INFO) << "[OpenVINO-EP] "
                          << "Creating dynamic backend for key: " << key;
       LOGS_DEFAULT(INFO) << "[OpenVINO-EP] "
@@ -473,28 +491,30 @@ void BackendManager::Compute(OrtKernelContext* context) {
       auto modelproto_with_concrete_shapes = ReWriteInputShapeInfo(*model_proto_, tensor_shapes);
       try {
         dynamic_backend = BackendFactory::MakeBackend(modelproto_with_concrete_shapes,
-                                                      GetGlobalContext(),
+                                                      session_context_,
                                                       subgraph_context_,
-                                                      ep_ctx_handle_);
+                                                      shared_context_,
+                                                      model_stream);
       } catch (const OnnxRuntimeException& ex) {
         // Build option disables fallback to CPU on compilation failures with NPU.
 #if defined(OPENVINO_DISABLE_NPU_FALLBACK)
         LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU.";
         ORT_THROW(ex.what());
 #else
-        if (GetGlobalContext().device_type.find("NPU") != std::string::npos &&
-            !GetGlobalContext().disable_cpu_fallback) {
+        if (session_context_.device_type.find("NPU") != std::string::npos &&
+            !session_context_.so_disable_cpu_ep_fallback) {
          LOGS_DEFAULT(WARNING) << ex.what();
          LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU."
<< "Falling back to OV CPU for execution"; - GetGlobalContext().device_type = "CPU"; - GetGlobalContext().precision_str = "FP32"; - key = MakeMapKeyString(tensor_shapes, GetGlobalContext().device_type); + session_context_.device_type = "CPU"; + session_context_.precision = "FP32"; + key = MakeMapKeyString(tensor_shapes, session_context_.device_type); try { dynamic_backend = BackendFactory::MakeBackend(modelproto_with_concrete_shapes, - GetGlobalContext(), + session_context_, subgraph_context_, - ep_ctx_handle_); + shared_context_, + model_stream); } catch (std::string const& msg) { ORT_THROW(msg); } @@ -524,6 +544,8 @@ void BackendManager::Compute(OrtKernelContext* context) { } void BackendManager::ShutdownBackendManager() { + backend_map_.clear(); + concrete_backend_.reset(); } } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/backend_manager.h b/onnxruntime/core/providers/openvino/backend_manager.h index 5ec462afd9d01..cdc27701ec2e6 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.h +++ b/onnxruntime/core/providers/openvino/backend_manager.h @@ -19,17 +19,16 @@ namespace openvino_ep { // Singleton class that manages all the backends class BackendManager { public: - BackendManager(const GlobalContext& global_context, + BackendManager(SessionContext& session_context, + SharedContext& shared_context, const onnxruntime::Node& fused_node, const onnxruntime::GraphViewer& subgraph, const logging::Logger& logger, EPCtxHandler& ctx_handle); void Compute(OrtKernelContext* context); void ShutdownBackendManager(); - void SetGlobalCotext(const GlobalContext& global_context); - GlobalContext& GetGlobalContext(); - Status ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphViewer& subgraph, - const logging::Logger& logger); + SessionContext& GetSessionContext(); + Status ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphViewer& subgraph); ov::CompiledModel& GetOVCompiledModel(); private: @@ -52,9 +51,9 @@ class BackendManager { std::shared_ptr concrete_backend_; std::map> backend_map_; SubGraphContext subgraph_context_; - GlobalContext global_context_; - EPCtxHandler ep_ctx_handle_{}; - std::string openvino_sdk_version_{}; + EPCtxHandler& ep_ctx_handle_; + SessionContext& session_context_; + SharedContext& shared_context_; }; } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index d6f408228f2bf..acc3f120b270b 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -1,13 +1,16 @@ // Copyright (C) Intel Corporation // Licensed under the MIT License - #include #include #include #include +#include +#include + #include "openvino/pass/convert_fp32_to_fp16.hpp" #include "openvino/pass/constant_folding.hpp" +#include "openvino/runtime/intel_npu/level_zero/level_zero.hpp" #include "core/providers/shared_library/provider_api.h" #include "core/providers/openvino/backend_utils.h" #include "core/providers/openvino/ov_interface.h" @@ -16,6 +19,105 @@ using Exception = ov::Exception; namespace onnxruntime { namespace openvino_ep { + +SharedContext::SharedWeights::WeightsFile::WeightsFile(std::filesystem::path filename) : file_(filename, std::ios::in | std::ios::binary) { + try { + file_.exceptions(std::ifstream::failbit | std::ifstream::badbit); + weights_size_ = file_.seekg(0, std::ios::end).tellg(); + } catch (std::ifstream::failure& e) { + ORT_THROW("Error: Failed to open weight file at ", 
filename.string(), " ", e.what()); + } +} + +void SharedContext::SharedWeights::WeightsFile::load_weights(size_t file_offset, void* data, size_t size) { + ORT_ENFORCE(file_offset < weights_size_ && size <= weights_size_ && (file_offset <= weights_size_ - size), "Error: File offset is out of bounds."); + file_.seekg(file_offset); + file_.read(reinterpret_cast(data), size); +} + +std::ostream& operator<<(std::ostream& stream, const SharedContext::SharedWeights::Metadata::Map& metadata) { + try { + stream << metadata.size(); + + // Write each key-value pair + // Put elements in separate lines to facilitate reading + for (const auto& [key, value] : metadata) { + stream << std::endl + << key.name; + stream << std::endl + << value.location; + stream << std::endl + << value.data_offset; + stream << std::endl + << value.size; + stream << std::endl + << value.dimensions.size(); + for (const auto& dim : value.dimensions) { + stream << std::endl + << dim; + } + stream << std::endl + << value.element_type; + } + } catch (const Exception& e) { + ORT_THROW("Error: Failed to write map data.", e.what()); + } catch (...) { + ORT_THROW("Error: Failed to write map data."); + } + + ORT_ENFORCE(stream.good(), "Error: Failed to write map data."); + return stream; +} + +std::istream& operator>>(std::istream& stream, SharedContext::SharedWeights::Metadata::Map& metadata) { + size_t map_size{0}; + try { + stream >> map_size; + + while (!stream.eof()) { + SharedContext::SharedWeights::Metadata::Key key; + SharedContext::SharedWeights::Metadata::Value value; + stream >> key.name; + stream >> value.location; + stream >> value.data_offset; + stream >> value.size; + size_t num_dimensions; + stream >> num_dimensions; + + if (stream.fail()) { + ORT_THROW("Error: Failed to read num_dimensions from stream."); + } + + constexpr size_t MAX_SAFE_DIMENSIONS = 1024; + + size_t safe_num_dimensions = num_dimensions; + + if (num_dimensions == 0 || safe_num_dimensions > MAX_SAFE_DIMENSIONS) { + ORT_THROW("Invalid number of dimensions provided."); + } + try { + value.dimensions.resize(safe_num_dimensions); + } catch (const std::bad_alloc&) { + ORT_THROW("Error: Memory allocation failed while resizing dimensions."); + } + + for (auto& dim : value.dimensions) { + stream >> dim; + } + stream >> value.element_type; + metadata.emplace(key, value); + } + } catch (const Exception& e) { + ORT_THROW("Error: Failed to read map data.", e.what()); + } catch (...) 
{ + ORT_THROW("Error: Failed to read map data."); + } + + ORT_ENFORCE(metadata.size() == map_size, "Error: Inconsistent map data."); + + return stream; +} + namespace backend_utils { bool IsDebugEnabled() { @@ -34,23 +136,18 @@ bool IsCILogEnabled() { return false; } -struct static_cast_int64 { - template // T1 models type statically convertible to T - int64_t operator()(const T1& x) const { return static_cast(x); } -}; - std::shared_ptr -CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, const GlobalContext& global_context, +CreateOVModel(const std::string model, + const SessionContext& session_context, std::map>& const_outputs_map) { if (IsCILogEnabled()) { std::cout << "CreateNgraphFunc" << std::endl; } - const std::string model = model_proto.SerializeAsString(); try { - auto ov_model = global_context.ie_core.ReadModel(model, global_context.onnx_model_path_name); + auto ov_model = OVCore::ReadModel(model, session_context.onnx_model_path_name.string()); // Check for Constant Folding - if ((global_context.device_type != "NPU") && !global_context.is_wholly_supported_graph) { + if ((session_context.device_type != "NPU") && !session_context.is_wholly_supported_graph) { ov::pass::ConstantFolding pass_const_obj; pass_const_obj.run_on_model(ov_model); auto& results = const_cast(ov_model.get()->get_results()); @@ -82,7 +179,7 @@ Ort::UnownedValue GetOutputTensor(Ort::KernelContext& context, size_t batch_size, OVInferRequestPtr infer_request, std::string output_name, - std::unordered_map output_names) { + const SubGraphContext::string_index_map_t& output_names) { auto graph_output_blob = infer_request->GetTensor(output_name); auto graph_output_dims = graph_output_blob->get_shape(); @@ -107,7 +204,7 @@ GetOutputTensor(Ort::KernelContext& context, size_t batch_size, Ort::UnownedValue GetOutputTensor(Ort::KernelContext& context, std::string output_name, - std::unordered_map output_names, + const SubGraphContext::string_index_map_t& output_names, std::shared_ptr node) { // Find position of '/' in the output_name auto pos = output_name.find("/"); @@ -129,13 +226,13 @@ GetOutputTensor(Ort::KernelContext& context, return context.GetOutput(index, output_shape.get(), num_dims); } -int GetFirstAvailableDevice(GlobalContext& global_context) { +int GetFirstAvailableDevice(SessionContext& session_context) { int i = 0; // Get the first available VAD-M device and set the device to busy while (i < 8) { - bool device = global_context.deviceAvailableList[i]; + bool device = session_context.deviceAvailableList[i]; if (device) { - global_context.deviceAvailableList[i] = false; + session_context.deviceAvailableList[i] = false; break; } i++; @@ -144,9 +241,9 @@ int GetFirstAvailableDevice(GlobalContext& global_context) { // make all remaining devices free if (i == 8) { i = 0; - global_context.deviceAvailableList[i] = false; + session_context.deviceAvailableList[i] = false; for (int j = 1; j < 8; j++) { - global_context.deviceAvailableList[j] = true; + session_context.deviceAvailableList[j] = true; } } return i; @@ -155,23 +252,23 @@ int GetFirstAvailableDevice(GlobalContext& global_context) { void FillOutputsWithConstantData(std::shared_ptr node, Ort::UnownedValue& out_tensor) { switch (node->get_element_type()) { case ov::element::Type_t::f32: { - FillOutputHelper(out_tensor, node); + FillOutputHelper(out_tensor, std::move(node)); break; } case ov::element::Type_t::boolean: { - FillOutputHelper(out_tensor, node); + FillOutputHelper(out_tensor, std::move(node)); break; } case ov::element::Type_t::i32: { - 
-      FillOutputHelper<int32_t>(out_tensor, node);
+      FillOutputHelper<int32_t>(out_tensor, std::move(node));
       break;
     }
     case ov::element::Type_t::i64: {
-      FillOutputHelper<int64_t>(out_tensor, node);
+      FillOutputHelper<int64_t>(out_tensor, std::move(node));
       break;
     }
     case ov::element::Type_t::f16: {
-      FillOutputHelper<ov::float16>(out_tensor, node);
+      FillOutputHelper<ov::float16>(out_tensor, std::move(node));
       break;
     }
     default:
@@ -267,6 +364,78 @@ void printPerformanceCounts(OVInferRequestPtr request, std::ostream& stream, std
   printPerformanceCounts(performanceMap, stream, std::move(deviceName));
 }

+ov::element::Type GetOpenVINOElementType(ONNX_NAMESPACE::TensorProto_DataType dt) {
+  static std::unordered_map<ONNX_NAMESPACE::TensorProto_DataType, ov::element::Type> map{
+      {ONNX_NAMESPACE::TensorProto_DataType_FLOAT, ov::element::f32},
+      {ONNX_NAMESPACE::TensorProto_DataType_UINT8, ov::element::u8},
+      {ONNX_NAMESPACE::TensorProto_DataType_INT8, ov::element::i8},
+      {ONNX_NAMESPACE::TensorProto_DataType_UINT16, ov::element::u16},
+      {ONNX_NAMESPACE::TensorProto_DataType_INT16, ov::element::i16},
+      {ONNX_NAMESPACE::TensorProto_DataType_INT32, ov::element::i32},
+      {ONNX_NAMESPACE::TensorProto_DataType_INT64, ov::element::i64},
+      {ONNX_NAMESPACE::TensorProto_DataType_STRING, ov::element::string},
+      {ONNX_NAMESPACE::TensorProto_DataType_BOOL, ov::element::boolean},
+      {ONNX_NAMESPACE::TensorProto_DataType_FLOAT16, ov::element::f16},
+      {ONNX_NAMESPACE::TensorProto_DataType_DOUBLE, ov::element::f64},
+      {ONNX_NAMESPACE::TensorProto_DataType_UINT32, ov::element::u32},
+      {ONNX_NAMESPACE::TensorProto_DataType_UINT64, ov::element::u64},
+      //{ONNX_NAMESPACE::TensorProto_DataType_COMPLEX64, ov::element::undefined},
+      //{ONNX_NAMESPACE::TensorProto_DataType_COMPLEX128, ov::element::undefined},
+      {ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16, ov::element::bf16},
+      //{ONNX_NAMESPACE::TensorProto_DataType_FLOAT8E4M3FN, ov::element::undefined},
+      //{ONNX_NAMESPACE::TensorProto_DataType_FLOAT8E4M3FNUZ, ov::element::undefined},
+      {ONNX_NAMESPACE::TensorProto_DataType_FLOAT8E5M2, ov::element::f8e5m2},
+      //{ONNX_NAMESPACE::TensorProto_DataType_FLOAT8E5M2FNUZ, ov::element::undefined},
+      {ONNX_NAMESPACE::TensorProto_DataType_UINT4, ov::element::u4},
+      {ONNX_NAMESPACE::TensorProto_DataType_INT4, ov::element::i4},
+  };
+
+  if (auto result = map.find(dt); result != map.end()) {
+    return result->second;
+  } else {
+    throw std::runtime_error("Unsupported ONNX data type: " + std::to_string(dt));
+  }
+}
+
+// Function to handle tensor creation from external data
+void CreateOVTensors(const std::string& device_name,
+                     SharedContext::SharedWeights::Metadata::Map& metadata_map,
+                     SharedContext::SharedWeights::WeightsFile& weights) {
+  for (auto& [key, value] : metadata_map) {
+    if (value.tensor) continue;
+
+    // Get element data type
+    auto onnx_element_type = (ONNX_NAMESPACE::TensorProto_DataType)value.element_type;
+
+    ov::element::Type ov_elementType = GetOpenVINOElementType(onnx_element_type);  // Map to OpenVINO data type
+
+    // Create OpenVINO Tensor
+    if (device_name == "NPU") {
+      // Use remote tensors
+      auto npu_context = OVCore::Get().get_default_context("NPU").as<ov::intel_npu::level_zero::ZeroContext>();
+      auto&& remote_tensor = npu_context.create_l0_host_tensor(ov_elementType, value.dimensions, ov::intel_npu::TensorType::INPUT);
+
+      // Copy data to remote tensor
+      weights.load_weights(value.data_offset, remote_tensor.get(), value.size);
+      value.tensor = std::make_shared<ov::Tensor>(remote_tensor);
+    } else {
+      // Use vanilla tensors
+      value.tensor = std::make_shared<ov::Tensor>(ov_elementType, value.dimensions);
+      weights.load_weights(value.data_offset, value.tensor->data(), value.size);
+    }
+    ORT_ENFORCE(value.tensor->get_byte_size() == value.size, "Unexpected tensor size mismatch");
+  }
+}
+
+void DestroyOVTensors(SharedContext::SharedWeights::Metadata::Map& metadata_map) {
+  for (auto& [key, value] : metadata_map) {
+    if (value.tensor) {
+      value.tensor.reset();
+    }
+  }
+  metadata_map.clear();
+}
+
 }  // namespace backend_utils
 }  // namespace openvino_ep
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/openvino/backend_utils.h b/onnxruntime/core/providers/openvino/backend_utils.h
index 9d58e1ca73abb..a4e6fc0828f79 100644
--- a/onnxruntime/core/providers/openvino/backend_utils.h
+++ b/onnxruntime/core/providers/openvino/backend_utils.h
@@ -10,6 +10,7 @@
 #include
 #include
 #include
+#include

 #include "core/session/onnxruntime_cxx_api.h"
 #include "core/providers/openvino/contexts.h"
@@ -34,7 +35,7 @@ bool IsDebugEnabled();
 // Internal diagnostic function.
 bool IsCILogEnabled();

-int GetFirstAvailableDevice(GlobalContext& global_context);
+int GetFirstAvailableDevice(SessionContext& session_context);

 void FillOutputsWithConstantData(std::shared_ptr<ov::Node> node, Ort::UnownedValue& out_tensor);

@@ -44,14 +45,14 @@ void FillOutputHelper(Ort::UnownedValue& out_tensor, std::shared_ptr<ov::Node> n
 Ort::UnownedValue
 GetOutputTensor(Ort::KernelContext& context,
                 std::string output_name,
-                std::unordered_map<std::string, int> output_names,
+                const SubGraphContext::string_index_map_t& output_names,
                 std::shared_ptr<ov::Node> node);

 Ort::UnownedValue
 GetOutputTensor(Ort::KernelContext& context, size_t batch_size,
                 OVInferRequestPtr infer_request,
                 std::string output_name,
-                std::unordered_map<std::string, int> output_names);
+                const SubGraphContext::string_index_map_t& output_names);

 void FillInputBlob(OVTensorPtr inputBlob, size_t batch_slice_idx,
                    std::string input_name, Ort::KernelContext& context,
@@ -61,10 +62,15 @@ void FillOutputBlob(OVTensorPtr outputBlob, Ort::UnownedValue& output_tensor,
                     size_t batch_slice_idx);

 std::shared_ptr<OVNetwork>
-CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto,
-              const GlobalContext& global_context,
+CreateOVModel(const std::string model,
+              const SessionContext& session_context,
               std::map<std::string, std::shared_ptr<ov::Node>>& const_outputs_map);

+void CreateOVTensors(const std::string& device_name,
+                     SharedContext::SharedWeights::Metadata::Map& metadata_map,
+                     SharedContext::SharedWeights::WeightsFile& weights);
+void DestroyOVTensors(SharedContext::SharedWeights::Metadata::Map& metadata_map);
+
 void printPerformanceCounts(const std::vector<OVProfilingInfo>& performanceMap,
                             std::ostream& stream, std::string deviceName);
diff --git a/onnxruntime/core/providers/openvino/backends/backend_factory.cc b/onnxruntime/core/providers/openvino/backends/backend_factory.cc
index b7e4aed6e7e18..6c1ed9aa42727 100644
--- a/onnxruntime/core/providers/openvino/backends/backend_factory.cc
+++ b/onnxruntime/core/providers/openvino/backends/backend_factory.cc
@@ -12,10 +12,11 @@ namespace openvino_ep {
 std::shared_ptr<IBackend>
 BackendFactory::MakeBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_proto,
-                            GlobalContext& global_context,
+                            SessionContext& session_context,
                             const SubGraphContext& subgraph_context,
-                            EPCtxHandler& ep_ctx_handle) {
-  std::string type = global_context.device_type;
+                            SharedContext& shared_context,
+                            ptr_stream_t& model_stream) {
+  std::string type = session_context.device_type;
   if (type == "CPU" || type.find("GPU") != std::string::npos ||
       type.find("NPU") != std::string::npos ||
       type.find("HETERO") != std::string::npos ||
@@ -23,7 +24,7 @@ BackendFactory::MakeBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_p
       type.find("AUTO") != std::string::npos) {
    std::shared_ptr<IBackend> concrete_backend_;
    try {
-      concrete_backend_ = std::make_shared<BasicBackend>(model_proto, global_context, subgraph_context, ep_ctx_handle);
+      concrete_backend_ = std::make_shared<BasicBackend>(model_proto, session_context, subgraph_context, shared_context, model_stream);
     } catch (std::string const& msg) {
       ORT_THROW(msg);
     }
@@ -32,5 +33,6 @@ BackendFactory::MakeBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_p
     ORT_THROW("[OpenVINO-EP] Backend factory error: Unknown backend type: " + type);
   }
 }
+
 }  // namespace openvino_ep
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc
index 45639293344d8..a6a848c542b12 100644
--- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc
+++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc
@@ -21,13 +21,12 @@ namespace openvino_ep {
 using namespace backend_utils;

 BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_proto,
-                           GlobalContext& global_context,
+                           SessionContext& session_context,
                            const SubGraphContext& subgraph_context,
-                           EPCtxHandler& ep_ctx_handle)
-    : global_context_(global_context), subgraph_context_(subgraph_context) {
-  std::string& hw_target = global_context_.device_type;
-
-  is_ep_ctx_graph_ = ep_ctx_handle.IsValidOVEPCtxGraph();
+                           SharedContext& shared_context,
+                           ptr_stream_t& model_stream)
+    : session_context_{session_context}, subgraph_context_{subgraph_context}, shared_context_{shared_context} {
+  std::string& hw_target = session_context_.device_type;

   if (ValidateSubgraph(const_outputs_map_))
     return;
@@ -37,7 +36,7 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_pr
   PopulateConfigValue(device_config);

   // Enable caching
-  EnableCaching(device_config);
+  EnableCaching();

   // Setting OpenCL queue throttling for GPU
   EnableGPUThrottling(device_config);
@@ -59,78 +58,90 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_pr
   }

   try {
-    std::string dev_prec = global_context.device_type + "_" + global_context_.precision_str;
-
-    if (global_context.is_wholly_supported_graph) {  // Full graph is supported
+    // IO_BUFFER is enabled on GPU HW.
+    // Pre-requisite is provider_option "context" must be set
 #if defined(IO_BUFFER_ENABLED)
-    if (is_ep_ctx_graph_) {
-      std::istringstream model_stream(ep_ctx_handle.GetModelBlobString());
-      exe_network_ = global_context_.ie_core.ImportModel(model_stream,
-                                                         remote_context_,
-                                                         subgraph_context_.subgraph_name);
-    } else if ((global_context.device_type.find("GPU") != std::string::npos) &&
-               (global_context_.context != nullptr)) {
-      LOGS_DEFAULT(INFO) << log_tag << "IO Buffering Enabled";
-      cl_context ctx = static_cast<cl_context>(global_context_.context);
-      remote_context_ = new ov::intel_gpu::ocl::ClContext(global_context_.ie_core.Get(), ctx);
-      ie_cnn_network_ = CreateOVModel(model_proto, global_context_, subgraph_context_, const_outputs_map_);
-      exe_network_ = global_context_.ie_core.CompileModel(
-          ie_cnn_network_, remote_context_, subgraph_context_.subgraph_name);
-    } else {
-      ie_cnn_network_ = CreateOVModel(model_proto, global_context_, subgraph_context_, const_outputs_map_);
-      exe_network_ = global_context_.ie_core.CompileModel(
-          ie_cnn_network_, hw_target, device_config, subgraph_context_.subgraph_name);
+    cl_context ctx = static_cast<cl_context>(session_context_.context);
+    remote_context_ = new ov::intel_gpu::ocl::ClContext(OVCore::Get(), ctx);
+    if (subgraph_context_.is_ep_ctx_graph) {
+      exe_network_ = OVCore::ImportModel(*model_stream,
+                                         remote_context_,
+                                         subgraph_context_.subgraph_name);
+      model_stream.reset();  // Delete stream after it is no longer needed
+    } else {
+      std::shared_ptr<const OVNetwork> ov_model;
+      {
+        const std::string model = model_proto->SerializeAsString();
+        if (!subgraph_context.has_dynamic_input_shape) {
+          delete model_proto.release();
+        }
+        ov_model = CreateOVModel(model, session_context_, const_outputs_map_);
       }
+      LOGS_DEFAULT(INFO) << log_tag << "IO Buffering Enabled";
+      exe_network_ = OVCore::CompileModel(
+          ov_model, remote_context_, subgraph_context_.subgraph_name);
+    }
 #else  // !IO_BUFFER_ENABLED
-    std::string prec_str = (global_context_.precision_str != "ACCURACY") ? global_context_.precision_str : global_context_.model_precision;
-    if (is_ep_ctx_graph_) {
-      // If the blob is held in an EPContext node, then skip FE+Compile
-      // and directly move on to creating a backend with the executable blob
-      exe_network_ = global_context_.ie_core.ImportModel(ep_ctx_handle.GetModelBlobStream(),
-                                                         hw_target,
-                                                         device_config,
-                                                         global_context_.ep_context_embed_mode,
-                                                         subgraph_context_.subgraph_name);
-    } else if (global_context_.export_ep_ctx_blob &&
-               hw_target.find("NPU") != std::string::npos &&
-               !global_context_.has_external_weights) {
-      std::shared_ptr<const OVNetwork> ov_model;
-      {
-        const std::string model = model_proto->SerializeAsString();
-        if (!subgraph_context.has_dynamic_input_shape) {
-          delete model_proto.release();
-        }
-        ov_model = global_context_.ie_core.Get().read_model(model, ov::Tensor());
-      }
-      exe_network_ = OVExeNetwork(global_context_.ie_core.Get().compile_model(ov_model, hw_target, device_config));
-    } else if (!global_context_.has_external_weights &&
-               (!subgraph_context_.has_dynamic_input_shape) &&
-               ((hw_target.find("AUTO") == std::string::npos) ||
-                (global_context_.OpenVINO_Version.at(0) >= 2024 && global_context_.OpenVINO_Version.at(1) > 2))) {
-      // Optimized OV compile_model API is supported with AUTO from version 2024.3 and above
-      // Inputs with static dimenstions
+    auto auto_unified_compile = ((hw_target.find("AUTO") == std::string::npos) ||
+                                 (session_context_.OpenVINO_Version.at(0) >= 2024 &&
+                                  session_context_.OpenVINO_Version.at(1) > 2));
+    if (subgraph_context_.is_ep_ctx_graph) {
+      // If the blob is held in an EPContext node, then skip FE+Compile
+      // and directly move on to creating a backend with the executable blob
+      exe_network_ = OVCore::ImportModel(*model_stream,
+                                         hw_target,
+                                         device_config,
+                                         subgraph_context_.subgraph_name);
+      model_stream.reset();  // Delete stream after it is no longer needed
+    } else if (!session_context_.has_external_weights &&
+               !subgraph_context_.has_dynamic_input_shape &&
+               !session_context_.so_context_enable &&
+               auto_unified_compile) {
+      // Unified OV compile_model is efficient when ov model caching is enabled
+      // Unified OV compile_model API is supported with AUTO from version 2024.3 and above
+      // Inputs with static dimensions
+      // Not enabled for models with external weights and when ep context is set.
+      const std::string model = model_proto->SerializeAsString();
+      exe_network_ = OVCore::CompileModel(model,
+                                          hw_target,
+                                          device_config,
+                                          subgraph_context_.subgraph_name);
+    } else {  // For all other types use ov::ov_core read_model() to generate OV IR
+              // followed by ov::ov_core compile_model()
+      std::shared_ptr<const OVNetwork> ov_model;
+      {
         const std::string model = model_proto->SerializeAsString();
-        exe_network_ = global_context_.ie_core.CompileModel(model,
-                                                            hw_target,
-                                                            device_config,
-                                                            subgraph_context_.subgraph_name);
-    } else {  // For all other types use ov::Model Type
-      auto ov_model = CreateOVModel(*model_proto, global_context_, const_outputs_map_);
-      exe_network_ = global_context_.ie_core.CompileModel(
-          ov_model, hw_target, device_config, subgraph_context_.subgraph_name);
+        if (!subgraph_context.has_dynamic_input_shape) {
+          delete model_proto.release();
+        }
+        ov_model = CreateOVModel(std::move(model), session_context_, const_outputs_map_);
       }
-#endif
-    } else {  // Full graph is not supported
-      auto ov_model = CreateOVModel(*model_proto, global_context_, const_outputs_map_);
-      exe_network_ = global_context_.ie_core.CompileModel(
+      exe_network_ = OVCore::CompileModel(
           ov_model, hw_target, device_config, subgraph_context_.subgraph_name);
     }
+#endif
     LOGS_DEFAULT(INFO) << log_tag << "Loaded model to the plugin";
   } catch (const char* msg) {
     ORT_THROW(msg);
   }
-  size_t num_infer_req = (global_context_.num_of_threads > 0) ? global_context_.num_of_threads : 1;
-  inferRequestsQueue_ = std::unique_ptr<InferRequestsQueue>(new InferRequestsQueue(exe_network_, num_infer_req));
+
+  int num_infer_req = (session_context_.num_of_threads > 0) ? session_context_.num_of_threads : 1;
+  std::function<void(OVInferRequestPtr)> initializer = [](OVInferRequestPtr) {};
+  auto metadata = shared_context_.shared_weights.metadata;
+  if (session_context_.so_share_ep_contexts) {
+    initializer = [&metadata](OVInferRequestPtr ir_ptr) {
+      const auto input_count = ir_ptr->GetNumInputs();
+      for (auto i = 0u; i < input_count; i++) {
+        using Key = SharedContext::SharedWeights::Metadata::Key;
+        const auto tensor_key = Key{ir_ptr->GetInputTensorName(i)};
+        if (metadata.contains(tensor_key)) {
+          auto& value = metadata.at(tensor_key);
+          ir_ptr->SetTensor(tensor_key.name, value.tensor);
+        }
+      }
+    };
+  }
+  inferRequestsQueue_ = std::unique_ptr<InferRequestsQueue>(new InferRequestsQueue(exe_network_, num_infer_req, std::move(initializer)));
 }

 bool BasicBackend::ValidateSubgraph(std::map<std::string, std::shared_ptr<ov::Node>>& const_outputs_map) {
@@ -146,21 +157,21 @@ bool BasicBackend::ValidateSubgraph(std::map<std::string, std::shared_ptr<ov::No
-  if (global_context_.precision_str.find("ACCURACY") != std::string::npos &&
-      global_context_.device_type.find("GPU") != std::string::npos) {
-    if (global_context_.OpenVINO_Version.at(0) >= 2024) {
+  if (session_context_.precision.find("ACCURACY") != std::string::npos &&
+      session_context_.device_type.find("GPU") != std::string::npos) {
+    if (session_context_.OpenVINO_Version.at(0) >= 2024) {
       device_config.emplace(ov::hint::inference_precision(ov::element::undefined));
       device_config.emplace(ov::hint::execution_mode(ov::hint::ExecutionMode::ACCURACY));
     } else {
-      if (global_context_.model_precision != "")
-        device_config.emplace(ov::hint::inference_precision(global_context_.model_precision));
+      if (!subgraph_context_.model_precision.empty())
+        device_config.emplace(ov::hint::inference_precision(subgraph_context_.model_precision));
     }
   }
 #ifndef NDEBUG
@@ -171,10 +182,10 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) {

   // Set a priority level for the current workload for preemption; default priority is "DEFAULT"
   // CPU Plugin doesn't support workload priority
-  if (global_context_.device_type.find("CPU") == std::string::npos)
-    device_config.emplace(ov::hint::model_priority(global_context_.model_priority));
+  if (session_context_.device_type.find("CPU") == std::string::npos)
+    device_config.emplace(ov::hint::model_priority(session_context_.model_priority));

-  if (global_context_.device_type.find("NPU") != std::string::npos) {
+  if (session_context_.device_type.find("NPU") != std::string::npos) {
     std::pair<std::string, ov::Any> device_property;
     device_property = std::make_pair("NPU_COMPILER_TYPE", "DRIVER");
@@ -184,16 +195,16 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) {
     }
     device_config.emplace(ov::device::properties("NPU", device_property));
 #if (((OPENVINO_VERSION_MAJOR == 2024) && (OPENVINO_VERSION_MINOR > 3)) || (OPENVINO_VERSION_MAJOR > 2024))
-    if (global_context_.export_ep_ctx_blob) {
-      global_context_.ie_core.Get().set_property("NPU", ov::intel_npu::bypass_umd_caching(true));
+    if (session_context_.so_context_enable) {
+      OVCore::Get().set_property("NPU", ov::intel_npu::bypass_umd_caching(true));
     }
 #endif
   }

-  if (!global_context_.load_config.empty()) {
-    const std::map<std::string, ov::AnyMap>& target_config = global_context_.load_config;
+  if (!session_context_.load_config.empty()) {
+    const std::map<std::string, ov::AnyMap>& target_config = session_context_.load_config;

-    if (global_context_.device_type.find("NPU") != std::string::npos) {
+    if (session_context_.device_type.find("NPU") != std::string::npos) {
       auto npuw_config = target_config.at("NPU");

       // Check if "NPU_USE_NPUW" exists and is set to "YES"
@@ -253,7 +264,7 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) {
           continue;
         }
         if (is_supported_and_mutable(key, supported_properties)) {
-          global_context_.ie_core.Get().set_property(device, ov::AnyMap{{key, value}});
+          OVCore::Get().set_property(device, ov::AnyMap{{key, value}});
         } else {
           LOGS_DEFAULT(WARNING) << "WARNING: Property \"" << key << "\" is either unsupported in current OpenVINO version"
@@ -264,50 +275,44 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) {
     };

     // Check if the device type is AUTO, HETERO, or MULTI
-    if (global_context_.device_type.find("AUTO") == 0 ||
-        global_context_.device_type.find("HETERO") == 0 ||
-        global_context_.device_type.find("MULTI") == 0) {
+    if (session_context_.device_type.find("AUTO") == 0 ||
+        session_context_.device_type.find("HETERO") == 0 ||
+        session_context_.device_type.find("MULTI") == 0) {
       // Parse individual devices (e.g., "AUTO:CPU,GPU" -> ["CPU", "GPU"])
-      auto individual_devices = parse_individual_devices(global_context_.device_type);
+      auto individual_devices = parse_individual_devices(session_context_.device_type);

       // Set properties only for individual devices (e.g., "CPU", "GPU")
       for (const std::string& device : individual_devices) {
         if (target_config.count(device)) {
           // Get supported properties for each individual device
-          auto device_properties = global_context_.ie_core.Get().get_property(device, ov::supported_properties);
+          auto device_properties = OVCore::Get().get_property(device, ov::supported_properties);

           // Set properties for the device
           set_target_properties(device, target_config.at(device), device_properties);
         }
       }
     } else {
-      if (target_config.count(global_context_.device_type)) {
-        auto supported_properties = global_context_.ie_core.Get().get_property(global_context_.device_type,
-                                                                               ov::supported_properties);
-        set_target_properties(global_context_.device_type,
-                              target_config.at(global_context_.device_type), supported_properties);
+      if (target_config.count(session_context_.device_type)) {
+        auto supported_properties = OVCore::Get().get_property(session_context_.device_type,
+                                                               ov::supported_properties);
+        set_target_properties(session_context_.device_type,
+                              target_config.at(session_context_.device_type), supported_properties);
       }
     }
   }
 }

-void BasicBackend::EnableCaching(ov::AnyMap& device_config) {
+void BasicBackend::EnableCaching() {
   // cache_dir argument has no effect when working with an embed-mode EPContext Graph
-  if (is_ep_ctx_graph_) return;
+  if (subgraph_context_.is_ep_ctx_graph) return;

-  if (!global_context_.cache_dir.empty() && !global_context_.export_ep_ctx_blob) {
+  if (!session_context_.cache_dir.empty() && !session_context_.so_context_enable) {
     LOGS_DEFAULT(INFO) << log_tag << "Enables Caching";
-    if (global_context_.device_type.find("AUTO:GPU") != std::string::npos) {
-      std::pair<std::string, ov::Any> device_property;
-      device_property = std::make_pair("CACHE_DIR", global_context_.cache_dir);
-      device_config.emplace(ov::device::properties("GPU", device_property));
-    } else {
-      global_context_.ie_core.SetCache(global_context_.cache_dir);
-    }
+    OVCore::SetCache(session_context_.cache_dir.string());
   }
 }

 void BasicBackend::EnableGPUThrottling(ov::AnyMap& device_config) {
-  if (global_context_.enable_opencl_throttling == true &&
-      global_context_.device_type.find("GPU") != std::string::npos) {
+  if (session_context_.enable_opencl_throttling == true &&
+      session_context_.device_type.find("GPU") != std::string::npos) {
     LOGS_DEFAULT(INFO) << log_tag << "Enabled OpenCL queue throttling for GPU device";
     std::pair<std::string, ov::Any> device_property;
     device_property = std::make_pair("PLUGIN_THROTTLE", "1");
@@ -318,61 +323,56 @@ void BasicBackend::EnableGPUThrottling(ov::AnyMap& device_config) {
 void BasicBackend::EnableStreams() {
   // Return silently for NPU as it's currently treated as a read-only flag by the NPU plugin
   // and throws an exception for the same
-  if (global_context_.device_type.find("NPU") != std::string::npos)
+  if (session_context_.device_type.find("NPU") != std::string::npos)
     return;

   // Streams can be set only if the device is not one of AUTO, MULTI, or HETERO
   // Throw an exception if the user tries to set num_streams for these devices
-  if ((global_context_.device_type.find("MULTI") != std::string::npos) ||
-      (global_context_.device_type.find("HETERO") != std::string::npos) ||
-      (global_context_.device_type.find("AUTO") != std::string::npos)) {
-    if (global_context_.num_streams != 1) {
+  if ((session_context_.device_type.find("MULTI") != std::string::npos) ||
+      (session_context_.device_type.find("HETERO") != std::string::npos) ||
+      (session_context_.device_type.find("AUTO") != std::string::npos)) {
+    if (session_context_.num_streams != 1) {
       ORT_THROW(log_tag + "Cannot set NUM_STREAMS to " +
-                std::to_string(global_context_.num_streams) + " for device " + global_context_.device_type);
+                std::to_string(session_context_.num_streams) + " for device " + session_context_.device_type);
     }
     // Do nothing
   } else {
-    global_context_.ie_core.SetStreams(global_context_.device_type, global_context_.num_streams);
+    OVCore::SetStreams(session_context_.device_type, session_context_.num_streams);
   }
 }

 void BasicBackend::SetNumThreads(ov::AnyMap& device_config) {
   // inference_num_threads is applicable only for the CPU device
-  if (global_context_.device_type.find("CPU") != std::string::npos)
-    device_config.emplace(ov::inference_num_threads(static_cast<int>(global_context_.num_of_threads)));
+  if (session_context_.device_type.find("CPU") != std::string::npos)
+    device_config.emplace(ov::inference_num_threads(session_context_.num_of_threads));
 }
 // Starts an asynchronous inference request for data in slice indexed by batch_slice_idx on
 // an Infer Request indexed by infer_req_idx
 void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferRequestPtr infer_request) {
   try {
-    auto graph_input_info = exe_network_.Get().inputs();
-    int input_idx = 0;
-    for (auto input_info_iter = graph_input_info.begin();
-         input_info_iter != graph_input_info.end(); ++input_info_iter) {
-      auto input_names = input_info_iter->get_names();
-      std::string onnx_input_name;
-      std::string input_name;
-      // use names retrieved from original ONNX model to assign the right onnx input name for the graph
-      for (auto it = subgraph_context_.input_names.begin(); it != subgraph_context_.input_names.end(); ++it) {
-        if (it->second == input_idx) {
-          onnx_input_name = it->first;
+    auto ov_input_info = exe_network_.Get().inputs();
+
+    // Loop over subgraph original input names to find the correspondent OV input name
+    for (const auto& [onnx_input_name, onnx_input_index] : subgraph_context_.input_names) {
+      std::string input_name{};
+      uint32_t input_idx = 0;
+      for (uint32_t index = 0; const auto& ov_input : ov_input_info) {
+        if (ov_input.get_names().contains(onnx_input_name)) {
+          input_name = onnx_input_name;
+          input_idx = index;
           break;
         }
+        index++;
       }
-      // using the input name retrieved from ONNX original to match with the input names returned by OV tensors
-      if (input_names.find(onnx_input_name) != input_names.end()) {
-        input_name = std::move(onnx_input_name);
-      } else {
-        ORT_THROW(log_tag +
-                  "Input names mismatch between OpenVINO and ONNX. " + onnx_input_name +
+
+      ORT_ENFORCE(!input_name.empty(), log_tag,
+                  "Input names mismatch between OpenVINO and ONNX. ", onnx_input_name,
                   " doesn't exist in the list of OpenVINO input tensor names");
-      }
+
       size_t batch_slice_idx = 0;
       if (subgraph_context_.has_dynamic_input_shape &&
-          !global_context_.disable_dynamic_shapes &&
-          (global_context_.device_type.find("CPU") != std::string::npos ||
-           global_context_.device_type.find("GPU") != std::string::npos)) {
+          !session_context_.disable_dynamic_shapes &&
+          (session_context_.device_type.find("CPU") != std::string::npos ||
+           session_context_.device_type.find("GPU") != std::string::npos)) {
        auto tensor = context.GetInput(subgraph_context_.input_names.at(input_name));
        auto tensor_info = tensor.GetTensorTypeAndShapeInfo();
        auto tensor_shape = tensor_info.GetShape();
@@ -384,10 +384,10 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque
           input_tensor_shape[tensor_iter] = *i;
           tensor_iter += 1;
         }
-        const auto& input = graph_input_info.at(input_idx);
+        const auto& input = ov_input_info.at(input_idx);
         OVTensorPtr tensor_ptr;
         // avoid input copies on the CPU device
-        if (global_context_.device_type.find("CPU") != std::string::npos) {
+        if (session_context_.device_type.find("CPU") != std::string::npos) {
           tensor_ptr = std::make_shared<ov::Tensor>(input.get_element_type(), input_tensor_shape,
                                                     (void*)tensor_data);
         } else {
@@ -401,8 +401,8 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque
           ORT_THROW(msg);
         }
       } else {
-        if ((global_context_.device_type.find("CPU") != std::string::npos ||
-             global_context_.device_type.find("GPU") != std::string::npos)) {
+        if ((session_context_.device_type.find("CPU") != std::string::npos ||
+             session_context_.device_type.find("GPU") != std::string::npos)) {
           OVTensorPtr graph_input_blob;
           try {
             graph_input_blob = infer_request->GetTensor(input_name);
@@ -417,7 +417,7 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque
BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque if ((it == ort_ov_tensor_map.end()) || (it != ort_ov_tensor_map.end() && (it->second.ort_ptr != tensor.GetTensorRawData()))) { ov_tensor_data_t ov_tensor_data; - const auto& input = graph_input_info.at(input_idx); + const auto& input = ov_input_info.at(input_idx); ov_tensor_data.tensor_ptr = std::make_shared(input.get_element_type(), input.get_shape(), const_cast(tensor.GetTensorRawData())); @@ -432,9 +432,9 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque } } } - input_idx++; - } - if (global_context_.device_type.find("NPU") != std::string::npos) { + } // Loop subgraph original input names + + if (session_context_.device_type.find("NPU") != std::string::npos) { // Set the output blob as remote blob auto graph_output_info = exe_network_.Get().outputs(); auto output_idx = 0; @@ -628,8 +628,8 @@ void BasicBackend::CompleteAsyncInference(Ort::KernelContext& context, OVInferRe " doesn't exist in the " "list of OpenVINO output tensor names"); } - if ((global_context_.device_type.find("CPU") != std::string::npos || - global_context_.device_type.find("GPU") != std::string::npos)) { + if ((session_context_.device_type.find("CPU") != std::string::npos || + session_context_.device_type.find("GPU") != std::string::npos)) { try { graph_output_blob = infer_request->GetTensor(output_name); } catch (const char* msg) { @@ -703,8 +703,8 @@ void BasicBackend::Infer(OrtKernelContext* ctx) { OVInferRequestPtr infer_request; infer_request = inferRequestsQueue_->getIdleRequest(); #ifdef IO_BUFFER_ENABLED - if ((global_context_.device_type.find("GPU") != std::string::npos) && - (global_context_.context != nullptr) && global_context_.is_wholly_supported_graph) { + if ((session_context_.device_type.find("GPU") != std::string::npos) && + (session_context_.context != nullptr) && session_context_.is_wholly_supported_graph) { try { StartRemoteAsyncInference(context, infer_request); } catch (std::string const& msg) { @@ -748,7 +748,7 @@ void BasicBackend::Infer(OrtKernelContext* ctx) { #ifndef IO_BUFFER_ENABLED // Printing performance counts is disabled when IO_BUFFER_ENABLED if (openvino_ep::backend_utils::IsDebugEnabled()) { inferRequestsQueue_->printstatus(); // Printing the elements of infer_requests_ vector pool only in debug mode - std::string& hw_target = global_context_.device_type; + std::string& hw_target = session_context_.device_type; printPerformanceCounts(std::move(infer_request_), std::cout, hw_target); } #endif diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.h b/onnxruntime/core/providers/openvino/backends/basic_backend.h index 3fcf6e4384d52..7d905f4a1e2f7 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.h +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.h @@ -12,6 +12,7 @@ #include #include #include +#include #include "core/session/onnxruntime_cxx_api.h" #include "core/providers/openvino/contexts.h" @@ -30,11 +31,13 @@ class InferRequestsQueue; class BasicBackend : public IBackend { public: BasicBackend(std::unique_ptr& model_proto, - GlobalContext& global_context, + SessionContext& session_context, const SubGraphContext& subgraph_context, - EPCtxHandler& ep_ctx_handle); + SharedContext& shared_context, + ptr_stream_t& model_stream); void Infer(OrtKernelContext* context) override; + ~BasicBackend() override = default; ov::CompiledModel& GetOVCompiledModel() override { return exe_network_.Get(); } @@ -43,7 +46,7 @@ class 
BasicBackend : public IBackend { void PopulateCompiledDirectory(std::string, std::string&, std::string&, bool&); bool ValidateSubgraph(std::map>& const_outputs_map); void PopulateConfigValue(ov::AnyMap& device_config); - void EnableCaching(ov::AnyMap& device_config); + void EnableCaching(); void EnableGPUThrottling(ov::AnyMap& device_config); void EnableStreams(); void SetNumThreads(ov::AnyMap& device_config); @@ -55,13 +58,13 @@ class BasicBackend : public IBackend { void CompleteAsyncInference(Ort::KernelContext& context, std::shared_ptr infer_request); - GlobalContext& global_context_; + SessionContext& session_context_; SubGraphContext subgraph_context_; + SharedContext& shared_context_; mutable std::mutex compute_lock_; OVExeNetwork exe_network_; std::map> const_outputs_map_; std::unique_ptr inferRequestsQueue_; - bool is_ep_ctx_graph_{false}; #if defined IO_BUFFER_ENABLED OVRemoteContextPtr remote_context_; #endif @@ -72,10 +75,11 @@ class BasicBackend : public IBackend { class InferRequestsQueue { public: - InferRequestsQueue(OVExeNetwork& net, size_t nireq) { + InferRequestsQueue(OVExeNetwork& net, size_t nireq, std::function initializer) { OVInferRequestPtr infer_request; for (size_t id = 0; id < nireq; id++) { infer_request = std::make_shared(net.CreateInferRequest()); + initializer(infer_request); infer_requests_.push_back(infer_request); } } diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index 4f970bc7bc287..66fcb8025ad8d 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -7,50 +7,122 @@ #include #include #include +#include +#include +#include "core/common/common.h" #include "core/providers/openvino/ov_interface.h" namespace onnxruntime { namespace openvino_ep { +namespace fs = std::filesystem; + +struct SharedContext { + struct SharedWeights { + struct Metadata { + struct Key { + std::string name; + bool operator==(const Key&) const = default; + }; + struct Hash { + std::size_t operator()(const Key& key) const noexcept { + return std::hash()(key.name); + } + }; + struct Value { + std::string location; + unsigned int data_offset; + unsigned int size; + std::vector dimensions; + std::int32_t element_type; + std::shared_ptr tensor; + }; + using Map = std::unordered_map; + friend std::ostream& operator<<(std::ostream& right, const Metadata::Map& metadata); + friend std::istream& operator>>(std::istream& right, Metadata::Map& metadata); + }; + + struct WeightsFile { + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(WeightsFile); + WeightsFile() = delete; + explicit WeightsFile(std::filesystem::path filename); + + void load_weights(size_t file_offset, void* data, size_t size); + + private: + std::ifstream file_; + size_t weights_size_; + }; + + fs::path external_weight_filename; + std::unique_ptr mapped_weights; + Metadata::Map metadata; + } shared_weights; +}; + +using config_t = std::map; + +struct ProviderInfo { + std::string device_type{""}; // [device_type]: Overrides the accelerator hardware type and + // precision with these values at runtime. + std::string precision{""}; // [precision]: Sets the inference precision for execution. + // Supported precision for devices are + // CPU=FP32, GPU=FP32,FP16, NPU=FP16. + // Not setting precision will execute with optimized precision for + // best inference latency. set Precision=ACCURACY for executing + // models with input precision for best accuracy. 
+ uint32_t num_of_threads{0}; // [num_of_threads]: Overrides the accelerator default value of + // number of threads with this value at runtime. + config_t load_config{}; // JSON config map to load custom OV parameters. + fs::path cache_dir{""}; // [cache_dir]: specify the path to + // dump and load the blobs for the model caching/kernel caching + // (GPU) feature. If blob files are already present, + // it will be directly loaded. + std::string model_priority{"DEFAULT"}; // High-level OpenVINO model priority hint + // Defines what model should be provided with more performant + // bounded resource first + uint32_t num_streams{1}; // [num_streams]: Option that specifies the number of parallel + // inference requests to be processed on a given `device_type`. + // Overrides the accelerator default value of number of streams + // with this value at runtime. + void* context{nullptr}; // OpenCL context + bool enable_opencl_throttling{false}; // [enable_opencl_throttling]: Enables OpenCL queue throttling for + // GPU device (Reduces CPU Utilization when using GPU) + bool disable_dynamic_shapes{false}; // [disable_dynamic_shapes]: Rewrite dynamic shaped models to + // static shape at runtime and execute. + bool enable_qdq_optimizer{false}; // Enables QDQ pruning for efficient inference latency with NPU + bool so_context_enable{false}; // ORT session option + bool so_disable_cpu_ep_fallback{false}; // ORT session option + bool so_context_embed_mode{false}; // ORT session option + bool so_share_ep_contexts{false}; // ORT session option + fs::path so_context_file_path{}; // ORT session option +}; + // Holds context applicable to the entire EP instance. -struct GlobalContext { - OVCore ie_core; - bool is_wholly_supported_graph = false; - bool enable_opencl_throttling = false; - bool disable_dynamic_shapes = false; - bool ep_context_embed_mode = false; - bool export_ep_ctx_blob = false; - bool enable_qdq_optimizer = false; - bool disable_cpu_fallback = false; - bool has_external_weights = false; - size_t num_of_threads; - std::string device_type; - std::string precision_str; - std::string model_precision; - std::string cache_dir; - std::map load_config; - std::string model_priority = "DEFAULT"; - int num_streams; +struct SessionContext : ProviderInfo { + SessionContext(const ProviderInfo& info) : ProviderInfo{info} {} std::vector deviceAvailableList = {true, true, true, true, true, true, true, true}; - std::string onnx_model_name; - std::string onnx_model_path_name; - int onnx_opset_version; - void* context = 0; - bool use_api_2; - std::vector OpenVINO_Version = {}; // Ov Major and OV minor version from OV headers + std::filesystem::path onnx_model_path_name; + uint32_t onnx_opset_version{0}; + mutable bool is_wholly_supported_graph = false; // Value is set to mutable to modify from capability + mutable bool has_external_weights = false; // Value is set to mutable to modify from capability + const std::vector OpenVINO_Version = {OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR}; + const std::string openvino_sdk_version = std::to_string(OPENVINO_VERSION_MAJOR) + "." + std::to_string(OPENVINO_VERSION_MINOR); }; // Holds context specific to subgraph. 
struct SubGraphContext { + using string_index_map_t = std::unordered_map; bool has_dynamic_input_shape = false; bool enable_batching = false; bool set_npu_config = false; bool is_constant = false; void* context = 0; std::string subgraph_name; - std::vector input_indexes; - std::unordered_map input_names; - std::unordered_map output_names; + string_index_map_t input_names; + string_index_map_t output_names; + std::string model_precision; + bool is_ep_ctx_graph = false; }; } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/ibackend.h b/onnxruntime/core/providers/openvino/ibackend.h index 7a2d6f4e8cd69..04d1f52cbf834 100644 --- a/onnxruntime/core/providers/openvino/ibackend.h +++ b/onnxruntime/core/providers/openvino/ibackend.h @@ -4,6 +4,7 @@ #pragma once #include +#include #define ORT_API_MANUAL_INIT #include "core/session/onnxruntime_cxx_api.h" #include "core/providers/openvino/onnx_ctx_model_helper.h" @@ -15,15 +16,17 @@ class IBackend { public: virtual void Infer(OrtKernelContext* context) = 0; virtual ov::CompiledModel& GetOVCompiledModel() = 0; + virtual ~IBackend() = default; }; - +using ptr_stream_t = std::unique_ptr; class BackendFactory { public: static std::shared_ptr MakeBackend(std::unique_ptr& model_proto, - GlobalContext& global_context, + SessionContext& session_context, const SubGraphContext& subgraph_context, - EPCtxHandler& ctx_handle); + SharedContext& shared_context, + ptr_stream_t& model_stream); }; } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc index 6d159db3b390d..7bd4f8d96cc55 100644 --- a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc +++ b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc @@ -11,25 +11,45 @@ namespace onnxruntime { namespace openvino_ep { +EPCtxHandler::EPCtxHandler(std::string ov_sdk_version, const logging::Logger& logger) : openvino_sdk_version_(std::move(ov_sdk_version)), logger_(logger) { + epctx_model_ = Model::Create("ovep_context_model", false, logger_); +} + /* Export the serialized blob string embedded onto an EPContext Node * along with other metadata necessary to validate the graph on import */ -Status EPCtxHandler::ExportEPCtxModel(const GraphViewer& graph_viewer, - const std::string& graph_name, - const logging::Logger& logger, - const bool& ep_context_embed_mode, - std::string&& model_blob_str, - const std::string& openvino_sdk_version) const { - auto& metadata = graph_viewer.GetGraph().GetModel().MetaData(); - auto model_build = graph_viewer.CreateModel(logger, metadata); - auto& graph_build = model_build->MainGraph(); +Status EPCtxHandler::ExportEPCtxModel(const std::string& model_name) { + // Serialize modelproto to string + auto model_proto = epctx_model_->ToProto(); + model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); + + // Finally, dump the model + std::ofstream epctx_onnx_model(model_name, + std::ios::out | std::ios::trunc | std::ios::binary); + if (!epctx_onnx_model) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unable to create epctx onnx model file"); + } + + if (!model_proto->SerializeToOstream(epctx_onnx_model)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to serialize model to file"); + } + LOGS_DEFAULT(VERBOSE) << "[OpenVINO EP] Export blob as EPContext Node"; + + return Status::OK(); +} + +Status EPCtxHandler::AddOVEPCtxNodeToGraph(const GraphViewer& graph_viewer, + const std::string& graph_name, + const bool embed_mode, + 
std::string&& model_blob_str) const { + auto& graph = epctx_model_->MainGraph(); // Get graph inputs and outputs const auto& viewer_inputs = graph_viewer.GetInputs(); const auto& viewer_outputs = graph_viewer.GetOutputs(); std::vector inputs(viewer_inputs.size()), outputs(viewer_outputs.size()); - auto transform_f = [&](const onnxruntime::NodeArg* iter) { return &graph_build.GetOrCreateNodeArg(iter->Name(), iter->TypeAsProto()); }; + auto transform_f = [&](const onnxruntime::NodeArg* iter) { return &graph.GetOrCreateNodeArg(iter->Name(), iter->TypeAsProto()); }; auto fill_vectors = [transform_f](auto& src, auto& dst) { std::transform(src.begin(), src.end(), dst.begin(), transform_f); }; @@ -46,7 +66,7 @@ Status EPCtxHandler::ExportEPCtxModel(const GraphViewer& graph_viewer, auto embed_mode_attr = ONNX_NAMESPACE::AttributeProto::Create(); embed_mode_attr->set_name(EMBED_MODE); embed_mode_attr->set_type(onnx::AttributeProto_AttributeType_INT); - embed_mode_attr->set_i(ep_context_embed_mode); + embed_mode_attr->set_i(embed_mode); node_attributes->emplace(EMBED_MODE, std::move(*embed_mode_attr)); // ep context @@ -60,7 +80,7 @@ Status EPCtxHandler::ExportEPCtxModel(const GraphViewer& graph_viewer, auto sdk_version_attr = ONNX_NAMESPACE::AttributeProto::Create(); sdk_version_attr->set_name(EP_SDK_VER); sdk_version_attr->set_type(onnx::AttributeProto_AttributeType_STRING); - sdk_version_attr->set_s(openvino_sdk_version); + sdk_version_attr->set_s(openvino_sdk_version_); node_attributes->emplace(EP_SDK_VER, std::move(*sdk_version_attr)); // source @@ -70,73 +90,70 @@ Status EPCtxHandler::ExportEPCtxModel(const GraphViewer& graph_viewer, source_attr->set_s(kOpenVINOExecutionProvider); node_attributes->emplace(SOURCE, std::move(*source_attr)); } - // Create EP context node - graph_build.AddNode(graph_name, EPCONTEXT_OP, "", inputs, outputs, std::move(*node_attributes), kMSDomain); - ORT_ENFORCE(graph_build.Resolve().IsOK()); - { - // Serialize modelproto to string - auto model_proto = model_build->ToProto(); - model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); - - // Finally, dump the model - std::ofstream epctx_onnx_model(graph_name, - std::ios::out | std::ios::trunc | std::ios::binary); - if (!epctx_onnx_model) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unable to create epctx onnx model file"); - } + // Create EP context node + graph.AddNode(graph_name, EPCONTEXT_OP, "", inputs, outputs, std::move(*node_attributes), kMSDomain); - if (!model_proto->SerializeToOstream(epctx_onnx_model)) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to serialize model to file"); - } - } - LOGS_DEFAULT(VERBOSE) << "[OpenVINO EP] Export blob as EPContext Node"; + ORT_ENFORCE(graph.Resolve().IsOK()); return Status::OK(); } -Status EPCtxHandler::ImportBlobFromEPCtxModel(const GraphViewer& graph_viewer, bool& ep_context_embed_mode) { - auto node = graph_viewer.GetNode(0); +std::unique_ptr EPCtxHandler::GetModelBlobStream(const std::filesystem::path& so_context_file_path, const GraphViewer& graph_viewer) const { + auto first_index = *graph_viewer.GetNodesInTopologicalOrder().begin(); + auto node = graph_viewer.GetNode(first_index); + ORT_ENFORCE(node != nullptr); auto& attrs = node->GetAttributes(); - ORT_ENFORCE(attrs.count(EP_CACHE_CONTEXT) > 0); - - ep_cache_context_attribute_ = &attrs.at(EP_CACHE_CONTEXT); - ep_context_embed_mode = static_cast(attrs.at(EMBED_MODE).i()); - LOGS_DEFAULT(VERBOSE) << "[OpenVINO EP] Read blob from EPContext Node"; + ORT_ENFORCE(attrs.count(EP_CACHE_CONTEXT) == 
1); + const auto& ep_cache_context = attrs.at(EP_CACHE_CONTEXT).s(); - is_valid_ep_ctx_graph_ = true; - return Status::OK(); -} + ORT_ENFORCE(attrs.count(EMBED_MODE) == 1); + bool embed_mode = static_cast(attrs.at(EMBED_MODE).i()); -const std::string& EPCtxHandler::GetModelBlobStream() const { - static std::string empty; - if (ep_cache_context_attribute_ != nullptr) { - return ep_cache_context_attribute_->s(); + std::unique_ptr result; + if (embed_mode) { + result.reset((std::istream*)new std::istringstream(ep_cache_context)); } else { - return empty; + auto blob_filepath = so_context_file_path; + if (blob_filepath.empty() && !graph_viewer.ModelPath().empty()) { + blob_filepath = graph_viewer.ModelPath(); + } + blob_filepath = blob_filepath.parent_path() / ep_cache_context; + ORT_ENFORCE(std::filesystem::exists(blob_filepath), "Blob file not found: ", blob_filepath.string()); + result.reset((std::istream*)new std::ifstream(blob_filepath, std::ios_base::binary | std::ios_base::in)); } + LOGS_DEFAULT(VERBOSE) << "[OpenVINO EP] Read blob from EPContext Node"; + return result; } -bool EPCtxHandler::CheckForOVEPCtxNode(const GraphViewer& graph_viewer, std::string openvino_sdk_version) const { - for (int i = 0; i < graph_viewer.MaxNodeIndex(); ++i) { - auto node = graph_viewer.GetNode(i); - auto& attrs = node->GetAttributes(); - - // Check for correct Op Type, EP SOURCE, and SDK version - if (node != nullptr && node->OpType() == EPCONTEXT_OP) { - if (attrs.at(SOURCE).s() == kOpenVINOExecutionProvider) { - if (attrs.at(EP_SDK_VER).s() == openvino_sdk_version) { - return true; - } else { - ORT_THROW("[Invalid Graph] Versions of OpenVINO used to export blob (" + attrs.at(EP_SDK_VER).s() + - ") and current runtime (" + openvino_sdk_version + ") don't match."); - } - } +bool EPCtxHandler::CheckForOVEPCtxNodeInGraph(const GraphViewer& graph_viewer) const { + if (graph_viewer.NumberOfNodes() == 1) { + auto first_index = *graph_viewer.GetNodesInTopologicalOrder().begin(); + if (auto node = graph_viewer.GetNode(first_index); (node != nullptr) && CheckForOVEPCtxNode(*node)) { + return true; } } return false; } +bool EPCtxHandler::CheckForOVEPCtxNode(const Node& node) const { + // Check for correct Op Type, EP SOURCE, and SDK version + if (node.OpType() == EPCONTEXT_OP) { + auto& attrs = node.GetAttributes(); + bool result = (attrs.count(SOURCE) == 1) && (attrs.at(SOURCE).s() == kOpenVINOExecutionProvider); + result &= (attrs.count(EP_SDK_VER) == 1) && (attrs.at(EP_SDK_VER).s() == openvino_sdk_version_); + result &= attrs.count(EMBED_MODE) == 1; + result &= attrs.count(EP_CACHE_CONTEXT) == 1; + return result; + } + return false; +} + +InlinedVector EPCtxHandler::GetEPCtxNodes() const { + const auto& epctx_nodes{epctx_model_->MainGraph().Nodes()}; + return InlinedVector(epctx_nodes.begin(), epctx_nodes.end()); +} + } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h index caab33b7db775..ff978bd6534d8 100644 --- a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h +++ b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h @@ -22,22 +22,22 @@ static const char SOURCE[] = "source"; class EPCtxHandler { public: - EPCtxHandler() = default; - EPCtxHandler(const EPCtxHandler&) = delete; - Status ExportEPCtxModel(const GraphViewer& graph_viewer, - const std::string& graph_name, - const logging::Logger& logger, - const bool& ep_context_embed_mode, - std::string&& 
model_blob_str, - const std::string& openvino_sdk_version) const; - Status ImportBlobFromEPCtxModel(const GraphViewer& graph_viewer, bool& ep_context_embed_mode); - bool CheckForOVEPCtxNode(const GraphViewer& graph_viewer, std::string openvino_sdk_version) const; - bool IsValidOVEPCtxGraph() const { return is_valid_ep_ctx_graph_; } - const std::string& GetModelBlobStream() const; + EPCtxHandler(std::string ov_sdk_version, const logging::Logger& logger); + EPCtxHandler(const EPCtxHandler&) = delete; // No copy constructor + Status ExportEPCtxModel(const std::string& model_name); + bool CheckForOVEPCtxNodeInGraph(const GraphViewer& graph_viewer) const; + bool CheckForOVEPCtxNode(const Node& node) const; + Status AddOVEPCtxNodeToGraph(const GraphViewer& graph_viewer, + const std::string& graph_name, + const bool embed_mode, + std::string&& model_blob_str) const; + std::unique_ptr GetModelBlobStream(const std::filesystem::path& so_context_file_path, const GraphViewer& graph_viewer) const; + InlinedVector GetEPCtxNodes() const; private: - bool is_valid_ep_ctx_graph_{false}; - const onnx::AttributeProto* ep_cache_context_attribute_; + const std::string openvino_sdk_version_; + std::unique_ptr epctx_model_; + const logging::Logger& logger_; }; } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index 72a188108adef..22477611ce25b 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -11,62 +11,135 @@ #include "core/providers/openvino/backend_manager.h" #include "core/providers/openvino/onnx_ctx_model_helper.h" #include "core/providers/openvino/ov_versions/capability.h" +#include "core/providers/openvino/qdq_transformations/qdq_stripping.h" #include "core/session/onnxruntime_session_options_config_keys.h" #include "openvino/core/version.hpp" #ifdef USE_OVEP_NPU_MEMORY #include "core/providers/openvino/ov_allocator.h" #endif -#define MEMCPY_S(dest, src, destsz, srcsz) memcpy(dest, src, std::min(destsz, srcsz)) - namespace onnxruntime { +namespace openvino_ep { -OpenVINOExecutionProvider::OpenVINOExecutionProvider(const OpenVINOExecutionProviderInfo& info) - : IExecutionProvider{onnxruntime::kOpenVINOExecutionProvider} { - InitProviderOrtApi(); +// Parking this code here for now before it's moved to the factory +#if defined OPENVINO_CONFIG_HETERO || defined OPENVINO_CONFIG_MULTI || defined OPENVINO_CONFIG_AUTO +static std::vector parseDevices(const std::string& device_string, + const std::vector& available_devices) { + std::string comma_separated_devices = device_string; + if (comma_separated_devices.find(":") != std::string::npos) { + comma_separated_devices = comma_separated_devices.substr(comma_separated_devices.find(":") + 1); + } + auto devices = split(comma_separated_devices, ','); + if (devices.size() < 2) { + print_build_options(); + ORT_THROW("Invalid device string: " + device_string); + } + std::set dev_options = {"CPU", "GPU", "NPU"}; + + for (auto& device : available_devices) { + if (dev_options.find(device) == dev_options.end()) { + auto dev_options_update = dev_options.emplace(device); + } + } + + for (const std::string& dev : devices) { + if (!std::count(dev_options.begin(), dev_options.end(), dev)) { + print_build_options(); + ORT_THROW("Invalid device string: " + device_string); + } + } + return devices; +} +#endif + +// Parking this code here for now before 
it's moved to the factory +void AdjustProviderInfo(ProviderInfo& info) { + std::set ov_supported_device_types = {"CPU", "GPU", + "GPU.0", "GPU.1", "NPU"}; + + std::vector available_devices = OVCore::GetAvailableDevices(); + + for (auto& device : available_devices) { + if (ov_supported_device_types.find(device) == ov_supported_device_types.end()) { + ov_supported_device_types.emplace(device); + } + } - global_context_ = std::make_unique(); - global_context_->device_type = info.device_type_; - global_context_->precision_str = info.precision_; - global_context_->cache_dir = info.cache_dir_; - global_context_->load_config = info.load_config_; - global_context_->model_priority = info.model_priority_; - global_context_->num_streams = info.num_streams_; - global_context_->context = info.context_; - global_context_->enable_opencl_throttling = info.enable_opencl_throttling_; - global_context_->disable_dynamic_shapes = info.disable_dynamic_shapes_; - global_context_->num_of_threads = info.num_of_threads_; - global_context_->OpenVINO_Version = {OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR}; - global_context_->export_ep_ctx_blob = info.export_ep_ctx_blob_; - global_context_->enable_qdq_optimizer = info.enable_qdq_optimizer_; - global_context_->disable_cpu_fallback = info.disable_cpu_fallback_; - global_context_->ep_context_embed_mode = info.so_epctx_embed_mode_; + if (info.device_type == "") { + LOGS_DEFAULT(INFO) << "[OpenVINO-EP]" + << "No runtime device selection option provided."; +#if defined OPENVINO_CONFIG_CPU + info.device_type = "CPU"; + info.precision = "FP32"; +#elif defined OPENVINO_CONFIG_GPU + info.device_type = "GPU"; + info.precision = "FP16"; +#elif defined OPENVINO_CONFIG_NPU + info.device_type = "NPU"; + info.precision = "FP16"; +#elif defined OPENVINO_CONFIG_HETERO || defined OPENVINO_CONFIG_MULTI || defined OPENVINO_CONFIG_AUTO +#ifdef DEVICE_NAME +#define DEVICE DEVICE_NAME +#endif + std::string dev_type = DEVICE; + + if (dev_type.find("HETERO") == 0 || dev_type.find("MULTI") == 0 || dev_type.find("AUTO") == 0) { + std::vector devices = parseDevices(dev_type, available_devices); + info.precision = "FP16"; + if (devices[0] == "CPU") { + info.precision = "FP32"; + } + info.device_type = std::move(dev_type); + } +#endif + } else if (ov_supported_device_types.find(info.device_type) != ov_supported_device_types.end()) { + // device_type is already a supported device; keep it as-is. + } +#if defined OPENVINO_CONFIG_HETERO || defined OPENVINO_CONFIG_MULTI || defined OPENVINO_CONFIG_AUTO + else if (info.device_type.find("HETERO") == 0 || info.device_type.find("MULTI") == 0 || info.device_type.find("AUTO") == 0) { + std::ignore = parseDevices(info.device_type, available_devices); // validate only; device_type is kept as-is + } +#endif + else { + ORT_THROW("Invalid device string: " + info.device_type); + } + LOGS_DEFAULT(INFO) << "[OpenVINO-EP]" + << "Choosing Device: " << info.device_type << " , Precision: " << info.precision; +} + +OpenVINOExecutionProvider::OpenVINOExecutionProvider(const ProviderInfo& info, SharedContext& shared_context) + : IExecutionProvider{onnxruntime::kOpenVINOExecutionProvider}, + session_context_(info), + shared_context_{shared_context}, + ep_ctx_handle_{session_context_.openvino_sdk_version, *GetLogger()} { + InitProviderOrtApi(); // to check if target device is available - // using ie_core capability GetAvailableDevices to fetch list of devices plugged in - if (info.cache_dir_.empty()) { + // using OVCore capability GetAvailableDevices to fetch 
list of devices plugged in + if (info.cache_dir.empty()) { bool device_found = false; - std::vector available_devices = global_context_->ie_core.GetAvailableDevices(); + std::vector available_devices = OVCore::GetAvailableDevices(); // Checking for device_type configuration - if (info.device_type_ != "") { - if (info.device_type_.find("HETERO") != std::string::npos || - info.device_type_.find("MULTI") != std::string::npos || - info.device_type_.find("AUTO") != std::string::npos) { + if (info.device_type != "") { + if (info.device_type.find("HETERO") != std::string::npos || + info.device_type.find("MULTI") != std::string::npos || + info.device_type.find("AUTO") != std::string::npos) { device_found = true; } else { for (const std::string& device : available_devices) { - if (device.rfind(info.device_type_, 0) == 0) { - if (info.device_type_.find("GPU") != std::string::npos && (info.precision_ == "FP32" || - info.precision_ == "FP16" || - info.precision_ == "ACCURACY")) { + if (device.rfind(info.device_type, 0) == 0) { + if (info.device_type.find("GPU") != std::string::npos && (info.precision == "FP32" || + info.precision == "FP16" || + info.precision == "ACCURACY")) { device_found = true; break; } - if (info.device_type_ == "CPU" && (info.precision_ == "FP32")) { + if (info.device_type == "CPU" && (info.precision == "FP32")) { device_found = true; break; } - if (info.device_type_.find("NPU") != std::string::npos) { + if (info.device_type.find("NPU") != std::string::npos) { device_found = true; break; } @@ -75,99 +148,101 @@ OpenVINOExecutionProvider::OpenVINOExecutionProvider(const OpenVINOExecutionProv } } if (!device_found) { - ORT_THROW("[ERROR] [OpenVINO] Specified device - " + info.device_type_ + " is not available"); + ORT_THROW("[ERROR] [OpenVINO] Specified device - " + info.device_type + " is not available"); } } } +OpenVINOExecutionProvider::~OpenVINOExecutionProvider() { + for (auto& backend_manager : backend_managers_) { + backend_manager.ShutdownBackendManager(); + } + backend_managers_.clear(); +} + std::vector> OpenVINOExecutionProvider::GetCapability(const GraphViewer& graph_viewer, const IKernelLookup& /*kernel_lookup*/) const { std::vector> result; - std::string openvino_sdk_version = std::to_string(global_context_->OpenVINO_Version.at(0)) + "." 
+ - std::to_string(global_context_->OpenVINO_Version.at(1)); - - // Check for valid ctx node and maintain state for validity - if (ep_ctx_handle_.CheckForOVEPCtxNode(graph_viewer, std::move(openvino_sdk_version))) - ORT_ENFORCE(graph_viewer.NumberOfNodes() == 1, - "[Invalid Graph] EPContext Model with OpenVINO compiled blob should not have more than one node."); - // Enable CI Logs if (!(GetEnvironmentVar("ORT_OPENVINO_ENABLE_CI_LOG").empty())) { std::cout << "In the OpenVINO EP" << std::endl; } - global_context_->onnx_model_path_name = graph_viewer.ModelPath().string(); - - global_context_->onnx_opset_version = - graph_viewer.DomainToVersionMap().at(kOnnxDomain); - - global_context_->model_precision = [&](const GraphViewer& graph_viewer) { - // return empty if graph has no inputs or if types are not one of FP32/FP16 - // else assume the type of the first input - if (graph_viewer.GetInputs().empty()) { - return ""; - } else { - auto input_type = graph_viewer.GetInputs()[0]->TypeAsProto()->tensor_type().elem_type(); - if (global_context_->precision_str == "ACCURACY" && - global_context_->device_type.find("GPU") != std::string::npos) { - if (input_type == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT) { - return "FP32"; - } else if (input_type == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT16) { - return "FP16"; - } - } - } - return ""; - }(graph_viewer); - - openvino_ep::GetCapability obj(graph_viewer, - global_context_->device_type, - global_context_->enable_qdq_optimizer); + openvino_ep::GetCapability obj(ep_ctx_handle_, + graph_viewer, + session_context_.device_type, + session_context_.enable_qdq_optimizer); result = obj.Execute(); - - global_context_->is_wholly_supported_graph = obj.IsWhollySupportedGraph(); - global_context_->has_external_weights = obj.HasExternalWeights(); - + session_context_.is_wholly_supported_graph = obj.IsWhollySupportedGraph(); + session_context_.has_external_weights = obj.HasExternalWeights(); return result; } common::Status OpenVINOExecutionProvider::Compile( const std::vector& fused_nodes, std::vector& node_compute_funcs) { + auto& logger = *GetLogger(); + Status status = Status::OK(); + + if (!fused_nodes.empty()) { + // Assume these properties are constant for all the model subgraphs, otherwise move to SubGraphContext + const auto& graph_body_viewer_0 = fused_nodes[0].filtered_graph.get(); + session_context_.onnx_model_path_name = graph_body_viewer_0.ModelPath().string(); + session_context_.onnx_opset_version = + graph_body_viewer_0.DomainToVersionMap().at(kOnnxDomain); + } + + // Temporary code to read metadata before it moves to the .bin + auto& metadata = shared_context_.shared_weights.metadata; + if (session_context_.so_share_ep_contexts && metadata.empty()) { + // Metadata is always read from model location, this could be a source or epctx model + fs::path metadata_filename = session_context_.onnx_model_path_name.parent_path() / "metadata.bin"; + std::ifstream file(metadata_filename, std::ios::binary); + if (file) { + file >> metadata; + } + } + + struct OpenVINOEPFunctionState { + AllocateFunc allocate_func = nullptr; + DestroyFunc destroy_func = nullptr; + AllocatorHandle allocator_handle = nullptr; + BackendManager& backend_manager; + }; + for (const FusedNodeAndGraph& fused_node_graph : fused_nodes) { const GraphViewer& graph_body_viewer = fused_node_graph.filtered_graph; const Node& fused_node = fused_node_graph.fused_node; NodeComputeInfo compute_info; - global_context_->use_api_2 = true; - // During backend 
creation, we check if user wants to use precompiled blob onnx model or the original model // For precompiled blob, directly load the model instead of compiling the model // For original model, check if the user wants to export a model with pre-compiled blob - std::shared_ptr backend_manager = - std::make_shared(*global_context_, - fused_node, - graph_body_viewer, - *GetLogger(), - ep_ctx_handle_); - backend_manager_ = backend_manager; + auto& backend_manager = backend_managers_.emplace_back(session_context_, + shared_context_, + fused_node, + graph_body_viewer, + logger, + ep_ctx_handle_); + compute_info.create_state_func = - [backend_manager](ComputeContext* context, FunctionState* state) { - OpenVINOEPFunctionState* p = new OpenVINOEPFunctionState(); - p->allocate_func = context->allocate_func; - p->destroy_func = context->release_func; - p->allocator_handle = context->allocator_handle; - p->backend_manager = backend_manager; + [&backend_manager](ComputeContext* context, FunctionState* state) { + OpenVINOEPFunctionState* p = new OpenVINOEPFunctionState{ + .allocate_func = context->allocate_func, + .destroy_func = context->release_func, + .allocator_handle = context->allocator_handle, + .backend_manager = backend_manager}; *state = static_cast(p); return 0; }; + compute_info.compute_func = [](FunctionState state, const OrtApi* /* api */, OrtKernelContext* context) { auto function_state = static_cast(state); try { - function_state->backend_manager->Compute(context); + function_state->backend_manager.Compute(context); } catch (const std::exception& ex) { return common::Status(common::ONNXRUNTIME, common::FAIL, ex.what()); } @@ -181,19 +256,42 @@ common::Status OpenVINOExecutionProvider::Compile( delete function_state; } }; - node_compute_funcs.push_back(compute_info); + + node_compute_funcs.push_back(std::move(compute_info)); + + if (!status.IsOK()) { + break; + } } - return Status::OK(); + if (session_context_.so_share_ep_contexts) { + fs::path metadata_filename; + if (session_context_.so_context_file_path.empty()) { + metadata_filename = session_context_.onnx_model_path_name.parent_path() / "metadata.bin"; + } else { + metadata_filename = session_context_.so_context_file_path.parent_path() / "metadata.bin"; + } + + // Metadata is generated only for shared contexts + // If saving metadata, save it to the provided path or use the original model path + // Multiple calls to Compile() will update the metadata and for the last call + // the resulting file will contain the aggregated content + std::ofstream file(metadata_filename, std::ios::binary); + if (file) { + file << metadata; + } + } + + return status; } #ifdef USE_OVEP_NPU_MEMORY std::vector OpenVINOExecutionProvider::CreatePreferredAllocators() { - if (global_context_->device_type.find("NPU") != std::string::npos) { + if (session_context_.device_type.find("NPU") != std::string::npos) { AllocatorCreationInfo npu_allocator_info{ [this](OrtDevice::DeviceId device_id) { return std::make_unique( - global_context_->ie_core.Get(), + OVCore::Get(), OrtDevice::NPU, device_id, OpenVINO_RT_NPU); @@ -232,8 +330,10 @@ common::Status OpenVINOExecutionProvider::SetEpDynamicOptions(gsl::spanGetOVCompiledModel(); - ov_compiled_model.set_property(ov::workload_type(workload_type)); + for (auto& backend : backend_managers_) { + ov::CompiledModel& ov_compiled_model = backend.GetOVCompiledModel(); + ov_compiled_model.set_property(ov::workload_type(workload_type)); + } } } else { // Handle unknown options @@ -242,4 +342,10 @@ common::Status 
OpenVINOExecutionProvider::SetEpDynamicOptions(gsl::span OpenVINOExecutionProvider::GetEpContextNodes() const { + return ep_ctx_handle_.GetEPCtxNodes(); +} + +} // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.h b/onnxruntime/core/providers/openvino/openvino_execution_provider.h index d5c22a4e2a9e4..75f4ef9f8ecc8 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.h +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.h @@ -13,15 +13,10 @@ #include #include "core/providers/openvino/backend_manager.h" +#include "core/providers/openvino/contexts.h" namespace onnxruntime { - -struct OVDevices { - ov::Core core; - std::vector get_ov_devices() const { - return core.get_available_devices(); - } -}; +namespace openvino_ep { static void print_build_options() { std::cout << "[ERROR] INVALID DEVICE BUILD TYPE SPECIFIED" << std::endl; @@ -47,139 +42,11 @@ static std::vector split(const std::string& s, char delim) { return result; } -static std::vector parseDevices(const std::string& device_string, - const std::vector& available_devices) { - std::string comma_separated_devices = device_string; - if (comma_separated_devices.find(":") != std::string::npos) { - comma_separated_devices = comma_separated_devices.substr(comma_separated_devices.find(":") + 1); - } - auto devices = split(comma_separated_devices, ','); - if (devices.size() < 2) { - print_build_options(); - ORT_THROW("Invalid device string: " + device_string); - } - std::set dev_options = {"CPU", "GPU", "NPU"}; - - for (auto& device : available_devices) { - if (dev_options.find(device) == dev_options.end()) { - auto dev_options_update = dev_options.emplace(device); - } - } - - for (const std::string& dev : devices) { - if (!std::count(dev_options.begin(), dev_options.end(), dev)) { - print_build_options(); - ORT_THROW("Invalid device string: " + device_string); - } - } - return devices; -} - -// Information needed to construct OpenVINO execution providers. 
-struct OpenVINOExecutionProviderInfo { - std::string device_type_{""}; - std::string precision_{""}; - size_t num_of_threads_{0}; - std::map load_config_{}; - std::string cache_dir_{""}; - std::string model_priority_{""}; - int num_streams_{1}; - void* context_{NULL}; - bool enable_opencl_throttling_{false}; - bool disable_dynamic_shapes_{false}; - bool export_ep_ctx_blob_{false}; - bool enable_qdq_optimizer_{false}; - bool disable_cpu_fallback_{false}; - bool so_epctx_embed_mode_{false}; - - OpenVINOExecutionProviderInfo() = delete; - - explicit OpenVINOExecutionProviderInfo(std::string dev_type, const std::string& precision, - size_t num_of_threads, - const std::map& load_config, - const std::string& cache_dir, - const std::string& model_priority, int num_streams, - void* context, bool enable_opencl_throttling, - bool disable_dynamic_shapes, bool export_ep_ctx_blob, - bool enable_qdq_optimizer, bool disable_cpu_fallback, - bool so_epctx_embed_mode) - : precision_(std::move(precision)), - num_of_threads_(num_of_threads), - load_config_(std::move(load_config)), - cache_dir_(std::move(cache_dir)), - model_priority_(std::move(model_priority)), - num_streams_(num_streams), - context_(context), - enable_opencl_throttling_(enable_opencl_throttling), - disable_dynamic_shapes_(disable_dynamic_shapes), - export_ep_ctx_blob_(export_ep_ctx_blob), - enable_qdq_optimizer_(enable_qdq_optimizer), - disable_cpu_fallback_(disable_cpu_fallback), - so_epctx_embed_mode_{so_epctx_embed_mode} { - std::set ov_supported_device_types = {"CPU", "GPU", - "GPU.0", "GPU.1", "NPU"}; - - OVDevices devices; - std::vector available_devices = devices.get_ov_devices(); - - for (auto& device : available_devices) { - if (ov_supported_device_types.find(device) == ov_supported_device_types.end()) { - ov_supported_device_types.emplace(device); - } - } - - if (dev_type == "") { - LOGS_DEFAULT(INFO) << "[OpenVINO-EP]" - << "No runtime device selection option provided."; -#if defined OPENVINO_CONFIG_CPU - device_type_ = "CPU"; - precision_ = "FP32"; -#elif defined OPENVINO_CONFIG_GPU - device_type_ = "GPU"; - precision_ = "FP16"; -#elif defined OPENVINO_CONFIG_NPU - device_type_ = "NPU"; - precision_ = "FP16"; -#elif defined OPENVINO_CONFIG_HETERO || defined OPENVINO_CONFIG_MULTI || defined OPENVINO_CONFIG_AUTO -#ifdef DEVICE_NAME -#define DEVICE DEVICE_NAME -#endif - dev_type = DEVICE; - - if (dev_type.find("HETERO") == 0 || dev_type.find("MULTI") == 0 || dev_type.find("AUTO") == 0) { - std::vector devices = parseDevices(dev_type, available_devices); - precision_ = "FP16"; - if (devices[0] == "CPU") { - precision_ = "FP32"; - } - device_type_ = std::move(dev_type); - } -#endif - } else if (ov_supported_device_types.find(dev_type) != ov_supported_device_types.end()) { - device_type_ = std::move(dev_type); - } else if (dev_type.find("HETERO") == 0 || dev_type.find("MULTI") == 0 || dev_type.find("AUTO") == 0) { - std::vector devices = parseDevices(dev_type, available_devices); - device_type_ = std::move(dev_type); - } else { - ORT_THROW("Invalid device string: " + dev_type); - } - LOGS_DEFAULT(INFO) << "[OpenVINO-EP]" - << "Choosing Device: " << device_type_ << " , Precision: " << precision_; - } -}; - -struct OpenVINOEPFunctionState { - AllocateFunc allocate_func = nullptr; - DestroyFunc destroy_func = nullptr; - AllocatorHandle allocator_handle = nullptr; - std::shared_ptr backend_manager; -}; - // Logical device representation. 
class OpenVINOExecutionProvider : public IExecutionProvider { public: - explicit OpenVINOExecutionProvider(const OpenVINOExecutionProviderInfo& info); - ~OpenVINOExecutionProvider() = default; + explicit OpenVINOExecutionProvider(const ProviderInfo& info, SharedContext& shared_context); + ~OpenVINOExecutionProvider(); std::vector> GetCapability(const GraphViewer& graph_viewer, @@ -194,13 +61,18 @@ class OpenVINOExecutionProvider : public IExecutionProvider { const void* GetExecutionHandle() const noexcept override { return nullptr; } + + const InlinedVector GetEpContextNodes() const override; + #ifdef USE_OVEP_NPU_MEMORY std::vector CreatePreferredAllocators() override; #endif private: - std::unique_ptr global_context_; - std::shared_ptr backend_manager_; - openvino_ep::EPCtxHandler ep_ctx_handle_{}; + SessionContext session_context_; + SharedContext& shared_context_; + std::list backend_managers_; // EP session owns the backend objects + EPCtxHandler ep_ctx_handle_; }; +} // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index 879d2399e68af..1c2d857b6252d 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -7,203 +7,212 @@ #include "core/providers/openvino/openvino_provider_factory.h" #include "core/providers/openvino/openvino_execution_provider.h" #include "core/providers/openvino/openvino_provider_factory_creator.h" +#include "core/providers/openvino/contexts.h" +#include "core/providers/openvino/backend_utils.h" #include "core/session/onnxruntime_session_options_config_keys.h" #include "nlohmann/json.hpp" namespace onnxruntime { -struct OpenVINOProviderFactory : IExecutionProviderFactory { - OpenVINOProviderFactory(const std::string& device_type, const std::string& precision, - size_t num_of_threads, - const std::map& load_config, const std::string& cache_dir, - const std::string& model_priority, int num_streams, void* context, - bool enable_opencl_throttling, bool disable_dynamic_shapes, - bool enable_qdq_optimizer, const ConfigOptions& config_options) - : device_type_(device_type), - precision_(precision), - num_of_threads_(num_of_threads), - load_config_(load_config), - cache_dir_(cache_dir), - model_priority_(model_priority), - num_streams_(num_streams), - context_(context), - enable_opencl_throttling_(enable_opencl_throttling), - disable_dynamic_shapes_(disable_dynamic_shapes), - enable_qdq_optimizer_(enable_qdq_optimizer), - config_options_(config_options) {} +namespace openvino_ep { +void ParseConfigOptions(ProviderInfo& pi, const ConfigOptions& config_options) { + pi.so_disable_cpu_ep_fallback = config_options.GetConfigOrDefault(kOrtSessionOptionsDisableCPUEPFallback, "0") == "1"; + pi.so_context_enable = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1"; + pi.so_context_embed_mode = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEmbedMode, "0") == "1"; + pi.so_share_ep_contexts = config_options.GetConfigOrDefault(kOrtSessionOptionShareEpContexts, "0") == "1"; + pi.so_context_file_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, ""); +} - ~OpenVINOProviderFactory() override {} +void* ParseUint64(const ProviderOptions& provider_options, std::string option_name) { + if (provider_options.contains(option_name)) { + uint64_t number = 
std::strtoull(provider_options.at(option_name).data(), nullptr, 16); + return reinterpret_cast(number); + } else { + return nullptr; + } +} - std::unique_ptr CreateProvider() override; +bool ParseBooleanOption(const ProviderOptions& provider_options, std::string option_name) { + if (provider_options.contains(option_name)) { + const auto& value = provider_options.at(option_name); + if (value == "true" || value == "True") { + return true; + } else if (value == "false" || value == "False") { + return false; + } else { + ORT_THROW("[ERROR] [OpenVINO-EP] ", option_name, " should be a boolean.\n"); + } + } + return false; +} - private: - std::string device_type_; - std::string precision_; - size_t num_of_threads_; - const std::map load_config_; - std::string cache_dir_; - std::string model_priority_; - int num_streams_; - void* context_; - bool enable_opencl_throttling_; - bool disable_dynamic_shapes_; - bool enable_qdq_optimizer_; - const ConfigOptions& config_options_; -}; +std::string ParseDeviceType(const ProviderOptions& provider_options, std::string option_name) { + const std::vector ov_available_devices = OVCore::GetAvailableDevices(); + + std::set ov_supported_device_types = {"CPU", "GPU", + "GPU.0", "GPU.1", "NPU"}; + std::set deprecated_device_types = {"CPU_FP32", "GPU_FP32", + "GPU.0_FP32", "GPU.1_FP32", "GPU_FP16", + "GPU.0_FP16", "GPU.1_FP16"}; + + // Expand set of supported device with OV devices + ov_supported_device_types.insert(ov_available_devices.begin(), ov_available_devices.end()); -std::unique_ptr OpenVINOProviderFactory::CreateProvider() { - bool so_disable_cpu_fallback = config_options_.GetConfigOrDefault(kOrtSessionOptionsDisableCPUEPFallback, "0") == "1"; - bool so_export_ep_ctx_blob = config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1"; - bool so_epctx_embed_mode = config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextEmbedMode, "0") == "1"; - std::string so_cache_path = config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "").c_str(); - - if (so_export_ep_ctx_blob && !so_cache_path.empty()) { - cache_dir_ = std::move(so_cache_path); - auto file_path = std::filesystem::path(cache_dir_); - // ep_context_file_path_ file extension must be .onnx - if (file_path.extension().generic_string() == ".onnx") { - // ep_context_file_path_ must be provided as a directory, create it if doesn't exist - auto parent_path = file_path.parent_path(); - if (!parent_path.empty() && !std::filesystem::is_directory(parent_path) && - !std::filesystem::create_directory(parent_path)) { - ORT_THROW("[ERROR] [OpenVINO] Failed to create directory : " + - file_path.parent_path().generic_string() + " \n"); + if (provider_options.contains(option_name)) { + const auto& selected_device = provider_options.at("device_type"); + + if (deprecated_device_types.contains(selected_device)) { + // Deprecated device and precision is handled together at ParsePrecision + return selected_device; + } + + if (!((ov_supported_device_types.contains(selected_device)) || + (selected_device.find("HETERO:") == 0) || + (selected_device.find("MULTI:") == 0) || + (selected_device.find("AUTO:") == 0))) { + ORT_THROW( + "[ERROR] [OpenVINO] You have selected wrong configuration value for the key 'device_type'. " + "Select from 'CPU', 'GPU', 'NPU', 'GPU.x' where x = 0,1,2 and so on or from" + " HETERO/MULTI/AUTO options available. 
\n"); + } + LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Choosing Device: " << selected_device; + return selected_device; + } else { + std::string default_device; + + // Take default behavior from project configuration +#if defined OPENVINO_CONFIG_CPU + default_device = "CPU"; +#elif defined OPENVINO_CONFIG_GPU + default_device = "GPU"; +#elif defined OPENVINO_CONFIG_NPU + default_device = "NPU"; +#elif defined OPENVINO_CONFIG_HETERO || defined OPENVINO_CONFIG_MULTI || defined OPENVINO_CONFIG_AUTO + default_device = DEVICE_NAME; + + // Validate that devices passed are valid + int delimit = device_type.find(":"); + const auto& devices = device_type.substr(delimit + 1); + auto device_list = split(devices, ','); + for (const auto& device : devices) { + if (!ov_supported_device_types.contains(device)) { + ORT_THROW("[ERROR] [OpenVINO] Invalid device selected: ", device); } - } else { - ORT_THROW("[ERROR] [OpenVINO] Invalid ep_ctx_file_path" + cache_dir_ + " \n"); } +#endif + + LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Choosing Device: " << default_device; + return default_device; } +} + +// Depends on ProviderOptions. +std::string ParsePrecision(const ProviderOptions& provider_options, std::string& device_type, const std::string& option_name) { + using DeviceName = std::string; + using DefaultValue = std::string; + using ValidValues = std::list; + using foo = std::pair; + using ParserHelper = std::map; + ParserHelper helper = { + {"GPU", {"FP16", {"FP16", "FP32"}}}, + {"NPU", {"FP16", {"FP16"}}}, + {"CPU", {"FP32", {"FP32"}}}, + }; + + std::set deprecated_device_types = {"CPU_FP32", "GPU_FP32", + "GPU.0_FP32", "GPU.1_FP32", "GPU_FP16", + "GPU.0_FP16", "GPU.1_FP16"}; + + if (provider_options.contains(option_name)) { + // Start by checking if the device_type is a normal valid one + if (helper.contains(device_type)) { + auto const& valid_values = helper[device_type].second; + const auto& precision = provider_options.at(option_name); + if (precision == "ACCURACY") { + return valid_values.back(); // Return highest supported precision + } else { + if (std::find(valid_values.begin(), valid_values.end(), precision) != valid_values.end()) { + return precision; // Return precision selected if valid + } else { + auto value_iter = valid_values.begin(); + std::string valid_values_joined = *value_iter; + // Append 2nd and up, if only one then ++value_iter is same as end() + for (++value_iter; value_iter != valid_values.end(); ++value_iter) { + valid_values_joined += ", " + *value_iter; + } - OpenVINOExecutionProviderInfo info(device_type_, precision_, num_of_threads_, load_config_, - cache_dir_, model_priority_, num_streams_, context_, enable_opencl_throttling_, - disable_dynamic_shapes_, so_export_ep_ctx_blob, enable_qdq_optimizer_, - so_disable_cpu_fallback, so_epctx_embed_mode); - return std::make_unique(info); + ORT_THROW("[ERROR] [OpenVINO] Unsupported inference precision is selected. ", device_type, " only supports", valid_values_joined, ".\n"); + } + } + } else if (deprecated_device_types.contains(device_type)) { + LOGS_DEFAULT(WARNING) << "[OpenVINO] Selected 'device_type' " + device_type + " is deprecated. \n" + << "Update the 'device_type' to specified types 'CPU', 'GPU', 'GPU.0', " + << "'GPU.1', 'NPU' or from" + << " HETERO/MULTI/AUTO options and set 'precision' separately. 
\n"; + int delimit = device_type.find("_"); + device_type = device_type.substr(0, delimit); + return device_type.substr(delimit + 1); + } + } + // Return default + return helper[device_type].first; } -} // namespace onnxruntime +void ParseProviderOptions([[maybe_unused]] ProviderInfo& result, [[maybe_unused]] const ProviderOptions& config_options) {} + +struct OpenVINOProviderFactory : IExecutionProviderFactory { + OpenVINOProviderFactory(ProviderInfo provider_info, SharedContext& shared_context) + : provider_info_(std::move(provider_info)), shared_context_(shared_context) {} + + ~OpenVINOProviderFactory() override {} + + std::unique_ptr CreateProvider() override { + return std::make_unique(provider_info_, shared_context_); + } + + private: + ProviderInfo provider_info_; + SharedContext& shared_context_; +}; -namespace onnxruntime { struct ProviderInfo_OpenVINO_Impl : ProviderInfo_OpenVINO { std::vector GetAvailableDevices() const override { - openvino_ep::OVCore ie_core; - return ie_core.GetAvailableDevices(); + return OVCore::GetAvailableDevices(); } -} g_info; +}; struct OpenVINO_Provider : Provider { - void* GetInfo() override { return &g_info; } + void* GetInfo() override { return &info_; } std::shared_ptr CreateExecutionProviderFactory(const void* void_params) override { // Extract the void_params into ProviderOptions and ConfigOptions - typedef std::pair ConfigBuffer; + using ConfigBuffer = std::pair; const ConfigBuffer* buffer = reinterpret_cast(void_params); - auto& provider_options_map = *buffer->first; - const ConfigOptions& config_options = buffer->second; - - std::string device_type = ""; // [device_type]: Overrides the accelerator hardware type and - // precision with these values at runtime. - std::string precision = ""; // [precision]: Sets the inference precision for execution. - // Supported precision for devices are - // CPU=FP32, GPU=FP32,FP16, NPU=FP16. - // Not setting precision will execute with optimized precision for - // best inference latency. set Precision=ACCURACY for executing - // models with input precision for best accuracy. - int num_of_threads = 0; // [num_of_threads]: Overrides the accelerator default value of - // number of threads with this value at runtime. - std::map load_config; // JSON config map to load custom OV parameters. - std::string cache_dir = ""; // [cache_dir]: specify the path to - // dump and load the blobs for the model caching/kernel caching - // (GPU) feature. If blob files are already present, - // it will be directly loaded. - std::string model_priority = "DEFAULT"; // High-level OpenVINO model priority hint - // Defines what model should be provided with more performant - // bounded resource first - int num_streams = 1; // [num_streams]: Option that specifies the number of parallel - // inference requests to be processed on a given `device_type`. - // Overrides the accelerator default value of number of streams - // with this value at runtime. 
-    bool enable_opencl_throttling = false;  // [enable_opencl_throttling]: Enables OpenCL queue throttling for
-                                            // GPU device (Reduces CPU Utilization when using GPU)
-
-    bool enable_qdq_optimizer = false;       // Enables QDQ pruning for efficient inference latency with NPU
-
-    void* context = nullptr;
+    const auto& provider_options = *buffer->first;
+    const auto& config_options = buffer->second;
+
+    ProviderInfo pi;
     std::string bool_flag = "";
-    if (provider_options_map.find("device_type") != provider_options_map.end()) {
-      device_type = provider_options_map.at("device_type").c_str();
-
-      std::set ov_supported_device_types = {"CPU", "GPU",
-                                            "GPU.0", "GPU.1", "NPU"};
-      std::set deprecated_device_types = {"CPU_FP32", "GPU_FP32",
-                                          "GPU.0_FP32", "GPU.1_FP32", "GPU_FP16",
-                                          "GPU.0_FP16", "GPU.1_FP16"};
-      OVDevices devices;
-      std::vector available_devices = devices.get_ov_devices();
-
-      for (auto& device : available_devices) {
-        if (ov_supported_device_types.find(device) == ov_supported_device_types.end()) {
-          ov_supported_device_types.emplace(device);
-        }
-      }
-      if (deprecated_device_types.find(device_type) != deprecated_device_types.end()) {
-        std::string deprecated_device = device_type;
-        auto delimit = device_type.find("_");
-        device_type = deprecated_device.substr(0, delimit);
-        precision = deprecated_device.substr(delimit + 1);
-        LOGS_DEFAULT(WARNING) << "[OpenVINO] Selected 'device_type' " + deprecated_device + " is deprecated. \n"
-                              << "Update the 'device_type' to specified types 'CPU', 'GPU', 'GPU.0', "
-                              << "'GPU.1', 'NPU' or from"
-                              << " HETERO/MULTI/AUTO options and set 'precision' separately. \n";
-      }
-      if (!((ov_supported_device_types.find(device_type) != ov_supported_device_types.end()) ||
-            (device_type.find("HETERO:") == 0) ||
-            (device_type.find("MULTI:") == 0) ||
-            (device_type.find("AUTO:") == 0))) {
-        ORT_THROW(
-            "[ERROR] [OpenVINO] You have selected wrong configuration value for the key 'device_type'. "
-            "Select from 'CPU', 'GPU', 'NPU', 'GPU.x' where x = 0,1,2 and so on or from"
-            " HETERO/MULTI/AUTO options available. \n");
-      }
-    }
-    if (provider_options_map.find("device_id") != provider_options_map.end()) {
-      std::string dev_id = provider_options_map.at("device_id").c_str();
+
+    pi.device_type = ParseDeviceType(provider_options, "device_type");
+
+    if (provider_options.contains("device_id")) {
+      std::string dev_id = provider_options.at("device_id").data();
       LOGS_DEFAULT(WARNING) << "[OpenVINO] The option 'device_id' is deprecated. "
                             << "Upgrade to set device_type and precision session options.\n";
       if (dev_id == "CPU" || dev_id == "GPU" || dev_id == "NPU") {
-        device_type = std::move(dev_id);
+        pi.device_type = std::move(dev_id);
       } else {
         ORT_THROW("[ERROR] [OpenVINO] Unsupported device_id is selected. Select from available options.");
       }
     }
-    if (provider_options_map.find("precision") != provider_options_map.end()) {
-      precision = provider_options_map.at("precision").c_str();
-    }
-    if (device_type.find("GPU") != std::string::npos) {
-      if (precision == "") {
-        precision = "FP16";
-      } else if (precision != "ACCURACY" && precision != "FP16" && precision != "FP32") {
-        ORT_THROW("[ERROR] [OpenVINO] Unsupported inference precision is selected. GPU only supports FP32 / FP16. \n");
-      }
-    } else if (device_type.find("NPU") != std::string::npos) {
-      if (precision == "" || precision == "ACCURACY" || precision == "FP16") {
-        precision = "FP16";
-      } else {
-        ORT_THROW("[ERROR] [OpenVINO] Unsupported inference precision is selected. NPU only supported FP16. \n");
-      }
-    } else if (device_type.find("CPU") != std::string::npos) {
-      if (precision == "" || precision == "ACCURACY" || precision == "FP32") {
-        precision = "FP32";
-      } else {
-        ORT_THROW("[ERROR] [OpenVINO] Unsupported inference precision is selected. CPU only supports FP32 . \n");
-      }
+    if (provider_options.contains("cache_dir")) {
+      pi.cache_dir = provider_options.at("cache_dir");
     }

-    if (provider_options_map.find("cache_dir") != provider_options_map.end()) {
-      cache_dir = provider_options_map.at("cache_dir");
-    }
+    pi.precision = ParsePrecision(provider_options, pi.device_type, "precision");

-    if (provider_options_map.find("load_config") != provider_options_map.end()) {
+    if (provider_options.contains("load_config")) {
       auto parse_config = [&](const std::string& config_str) -> std::map<std::string, ov::AnyMap> {
         // If the config string is empty, return an empty map and skip processing
         if (config_str.empty()) {
@@ -262,116 +271,96 @@ struct OpenVINO_Provider : Provider {
         return target_map;
       };

-      load_config = parse_config(provider_options_map.at("load_config"));
+      pi.load_config = parse_config(provider_options.at("load_config"));
     }

-    if (provider_options_map.find("context") != provider_options_map.end()) {
-      std::string str = provider_options_map.at("context");
-      uint64_t number = std::strtoull(str.c_str(), nullptr, 16);
-      context = reinterpret_cast(number);
+    pi.context = ParseUint64(provider_options, "context");
+#if defined(IO_BUFFER_ENABLED)
+    // a valid context must be provided to enable IO Buffer optimizations
+    if (pi.context == nullptr) {
+#undef IO_BUFFER_ENABLED
+#define IO_BUFFER_ENABLED = 0
+      LOGS_DEFAULT(WARNING) << "Context is not set. Disabling IO Buffer optimization";
     }
+#endif

-    if (provider_options_map.find("num_of_threads") != provider_options_map.end()) {
-      if (!std::all_of(provider_options_map.at("num_of_threads").begin(),
-                       provider_options_map.at("num_of_threads").end(), ::isdigit)) {
+    if (provider_options.contains("num_of_threads")) {
+      if (!std::all_of(provider_options.at("num_of_threads").begin(),
+                       provider_options.at("num_of_threads").end(), ::isdigit)) {
         ORT_THROW("[ERROR] [OpenVINO-EP] Number of threads should be a number. \n");
       }
-      num_of_threads = std::stoi(provider_options_map.at("num_of_threads"));
-      if (num_of_threads <= 0) {
-        num_of_threads = 1;
+      pi.num_of_threads = std::stoi(provider_options.at("num_of_threads"));
+      if (pi.num_of_threads <= 0) {
+        pi.num_of_threads = 1;
         LOGS_DEFAULT(WARNING) << "[OpenVINO-EP] The value for the key 'num_of_threads' should be in the positive range.\n "
                               << "Executing with num_of_threads=1";
       }
     }

-    if (provider_options_map.find("model_priority") != provider_options_map.end()) {
-      model_priority = provider_options_map.at("model_priority").c_str();
+    if (provider_options.contains("model_priority")) {
+      pi.model_priority = provider_options.at("model_priority").data();
       std::vector<std::string> supported_priorities({"LOW", "MEDIUM", "HIGH", "DEFAULT"});
       if (std::find(supported_priorities.begin(), supported_priorities.end(),
-                    model_priority) == supported_priorities.end()) {
-        model_priority = "DEFAULT";
+                    pi.model_priority) == supported_priorities.end()) {
+        pi.model_priority = "DEFAULT";
         LOGS_DEFAULT(WARNING) << "[OpenVINO-EP] The value for the key 'model_priority' "
                               << "is not one of LOW, MEDIUM, HIGH, DEFAULT. "
                               << "Executing with model_priority=DEFAULT";
       }
     }
-    if (provider_options_map.find("num_streams") != provider_options_map.end()) {
-      num_streams = std::stoi(provider_options_map.at("num_streams"));
-      if (num_streams <= 0) {
-        num_streams = 1;
+    if (provider_options.contains("num_streams")) {
+      pi.num_streams = std::stoi(provider_options.at("num_streams"));
+      if (pi.num_streams <= 0) {
+        pi.num_streams = 1;
         LOGS_DEFAULT(WARNING) << "[OpenVINO-EP] The value for the key 'num_streams' should be in the range of 1-8.\n "
                               << "Executing with num_streams=1";
       }
     }
-    if (provider_options_map.find("enable_opencl_throttling") != provider_options_map.end()) {
-      bool_flag = provider_options_map.at("enable_opencl_throttling");
-      if (bool_flag == "true" || bool_flag == "True")
-        enable_opencl_throttling = true;
-      else if (bool_flag == "false" || bool_flag == "False")
-        enable_opencl_throttling = false;
-      bool_flag = "";
-    }
+    pi.enable_opencl_throttling = ParseBooleanOption(provider_options, "enable_opencl_throttling");

-    if (provider_options_map.find("enable_qdq_optimizer") != provider_options_map.end()) {
-      bool_flag = provider_options_map.at("enable_qdq_optimizer");
-      if (bool_flag == "true" || bool_flag == "True")
-        enable_qdq_optimizer = true;
-      else if (bool_flag == "false" || bool_flag == "False")
-        enable_qdq_optimizer = false;
-      else
-        ORT_THROW("[ERROR] [OpenVINO-EP] enable_qdq_optimiser should be a boolean.\n");
-      bool_flag = "";
-    }
+    pi.enable_qdq_optimizer = ParseBooleanOption(provider_options, "enable_qdq_optimizer");

-    // [disable_dynamic_shapes]: Rewrite dynamic shaped models to static shape at runtime and execute.
-    // Always true for NPU plugin.
-    bool disable_dynamic_shapes = false;
-    if (device_type.find("NPU") != std::string::npos) {
-      disable_dynamic_shapes = true;
+    pi.disable_dynamic_shapes = ParseBooleanOption(provider_options, "disable_dynamic_shapes");
+
+    ParseConfigOptions(pi, config_options);
+
+    // disable_dynamic_shapes is always forced to true for the NPU plugin.
+ if (pi.device_type.find("NPU") != std::string::npos) { + pi.disable_dynamic_shapes = true; } - if (provider_options_map.find("disable_dynamic_shapes") != provider_options_map.end()) { - bool_flag = provider_options_map.at("disable_dynamic_shapes"); - if (bool_flag == "true" || bool_flag == "True") { - disable_dynamic_shapes = true; - } else if (bool_flag == "false" || bool_flag == "False") { - if (device_type.find("NPU") != std::string::npos) { - disable_dynamic_shapes = true; - LOGS_DEFAULT(INFO) << "[OpenVINO-EP] The value for the key 'disable_dynamic_shapes' will be set to " - << "TRUE for NPU backend.\n "; - } else { - disable_dynamic_shapes = false; - } - } - bool_flag = ""; + + // Append values to config to support weight-as-inputs conversion for shared contexts + if (pi.so_share_ep_contexts) { + ov::AnyMap map; + map["NPU_COMPILATION_MODE_PARAMS"] = "enable-wd-blockarg-input=true compute-layers-with-higher-precision=Sqrt,Power,ReduceSum"; + pi.load_config["NPU"] = std::move(map); } - return std::make_shared(device_type, - precision, - num_of_threads, - load_config, - cache_dir, - model_priority, - num_streams, - context, - enable_opencl_throttling, - disable_dynamic_shapes, - enable_qdq_optimizer, - config_options); + return std::make_shared(pi, shared_context_); } void Initialize() override { + OVCore::Initialize(); } void Shutdown() override { + backend_utils::DestroyOVTensors(shared_context_.shared_weights.metadata); + OVCore::Teardown(); } -} g_provider; + private: + SharedContext shared_context_; + ProviderInfo_OpenVINO_Impl info_; +}; // OpenVINO_Provider + +} // namespace openvino_ep } // namespace onnxruntime extern "C" { ORT_API(onnxruntime::Provider*, GetProvider) { - return &onnxruntime::g_provider; + static onnxruntime::openvino_ep::OpenVINO_Provider g_provider; + return &g_provider; } } diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index 12ab7ecede031..4c656bceff550 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -13,7 +13,16 @@ using Exception = ov::Exception; namespace onnxruntime { namespace openvino_ep { -const std::string log_tag = "[OpenVINO-EP] "; +static const std::string log_tag = "[OpenVINO-EP] "; +static std::unique_ptr g_core; + +void OVCore::Initialize() { + g_core = std::make_unique(); +} + +void OVCore::Teardown() { + g_core.reset(); +} #ifndef NDEBUG void printDebugInfo(const ov::CompiledModel& obj) { @@ -46,7 +55,7 @@ void printDebugInfo(const ov::CompiledModel& obj) { } #endif -std::shared_ptr OVCore::ReadModel(const std::string& model, const std::string& model_path) const { +std::shared_ptr OVCore::ReadModel(const std::string& model, const std::string& model_path) { try { std::istringstream modelStringStream(model); std::istream& modelStream = modelStringStream; @@ -77,7 +86,7 @@ OVExeNetwork OVCore::CompileModel(std::shared_ptr& ie_cnn_netwo const std::string& name) { ov::CompiledModel obj; try { - obj = oe.compile_model(ie_cnn_network, hw_target, device_config); + obj = Get().compile_model(ie_cnn_network, hw_target, device_config); #ifndef NDEBUG printDebugInfo(obj); #endif @@ -96,7 +105,7 @@ OVExeNetwork OVCore::CompileModel(const std::string& onnx_model, const std::string& name) { ov::CompiledModel obj; try { - obj = oe.compile_model(onnx_model, ov::Tensor(), hw_target, device_config); + obj = Get().compile_model(onnx_model, ov::Tensor(), hw_target, device_config); #ifndef NDEBUG 
printDebugInfo(obj); #endif @@ -109,22 +118,13 @@ OVExeNetwork OVCore::CompileModel(const std::string& onnx_model, } } -OVExeNetwork OVCore::ImportModel(const std::string& model_string, +OVExeNetwork OVCore::ImportModel(std::istream& model_stream, std::string hw_target, const ov::AnyMap& device_config, - bool embed_mode, std::string name) { try { ov::CompiledModel obj; - if (embed_mode) { - std::istringstream model_stream(model_string); - obj = oe.import_model(model_stream, hw_target, device_config); - } else { - std::ifstream modelStream(model_string, std::ios_base::binary | std::ios_base::in); - obj = oe.import_model(modelStream, - hw_target, - {}); - } + obj = Get().import_model(model_stream, hw_target, device_config); #ifndef NDEBUG printDebugInfo(obj); #endif @@ -138,7 +138,12 @@ OVExeNetwork OVCore::ImportModel(const std::string& model_string, } void OVCore::SetCache(const std::string& cache_dir_path) { - oe.set_property(ov::cache_dir(cache_dir_path)); + Get().set_property(ov::cache_dir(cache_dir_path)); +} + +inline ov::Core& OVCore::Get() { + ORT_ENFORCE(g_core); + return *g_core; } #ifdef IO_BUFFER_ENABLED @@ -174,12 +179,12 @@ OVExeNetwork OVCore::ImportModel(std::shared_ptr model_strea #endif std::vector OVCore::GetAvailableDevices() { - auto available_devices = oe.get_available_devices(); + auto available_devices = Get().get_available_devices(); return available_devices; } void OVCore::SetStreams(const std::string& device_type, int num_streams) { - oe.set_property(device_type, {ov::num_streams(num_streams)}); + Get().set_property(device_type, {ov::num_streams(num_streams)}); } OVInferRequest OVExeNetwork::CreateInferRequest() { @@ -206,7 +211,18 @@ OVTensorPtr OVInferRequest::GetTensor(const std::string& input_name) { } } -void OVInferRequest::SetTensor(std::string name, OVTensorPtr& blob) { +std::string OVInferRequest::GetInputTensorName(uint32_t index) { + try { + const auto& model = ovInfReq.get_compiled_model(); + return *model.input(index).get_names().begin(); + } catch (const Exception& e) { + ORT_THROW(log_tag + " Cannot access IE Blob for input number: ", index, e.what()); + } catch (...) 
{ + ORT_THROW(log_tag + " Cannot access IE Blob for input number: ", index); + } +} + +void OVInferRequest::SetTensor(const std::string& name, OVTensorPtr& blob) { try { ovInfReq.set_tensor(name, *(blob.get())); } catch (const Exception& e) { @@ -216,6 +232,10 @@ void OVInferRequest::SetTensor(std::string name, OVTensorPtr& blob) { } } +uint32_t OVInferRequest::GetNumInputs() { + return ovInfReq.get_compiled_model().inputs().size(); +} + void OVInferRequest::StartAsync() { try { ovInfReq.start_async(); diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h index c3417003f8e1f..53b814094438e 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.h +++ b/onnxruntime/core/providers/openvino/ov_interface.h @@ -37,40 +37,40 @@ typedef ov::intel_gpu::ocl::ClContext* OVRemoteContextPtr; typedef ov::RemoteContext OVRemoteContext; #endif -class OVCore { - ov::Core oe; +struct OVCore { + static void Initialize(); + static void Teardown(); - public: // OV Interface For Reading Model - std::shared_ptr ReadModel(const std::string& model_stream, const std::string& model_path) const; + static std::shared_ptr ReadModel(const std::string& model_stream, const std::string& model_path); + // OV Interface for Compiling OV Model Type - OVExeNetwork CompileModel(std::shared_ptr& ie_cnn_network, - std::string& hw_target, - ov::AnyMap& device_config, - const std::string& name); + static OVExeNetwork CompileModel(std::shared_ptr& ie_cnn_network, + std::string& hw_target, + ov::AnyMap& device_config, + const std::string& name); // OV Interface for Fast Compile - OVExeNetwork CompileModel(const std::string& onnx_model, - std::string& hw_target, - ov::AnyMap& device_config, - const std::string& name); + static OVExeNetwork CompileModel(const std::string& onnx_model, + std::string& hw_target, + ov::AnyMap& device_config, + const std::string& name); // OV Interface for Import model Stream - OVExeNetwork ImportModel(const std::string& model_string, - std::string hw_target, - const ov::AnyMap& device_config, - bool embed_mode, - std::string name); + static OVExeNetwork ImportModel(std::istream& model_stream, + std::string hw_target, + const ov::AnyMap& device_config, + std::string name); #ifdef IO_BUFFER_ENABLED - OVExeNetwork CompileModel(std::shared_ptr& model, - OVRemoteContextPtr context, - std::string name); - OVExeNetwork ImportModel(std::shared_ptr model_stream, - OVRemoteContextPtr context, - std::string name); + static OVExeNetwork CompileModel(std::shared_ptr& model, + OVRemoteContextPtr context, + std::string name); + static OVExeNetwork ImportModel(std::shared_ptr model_stream, + OVRemoteContextPtr context, + std::string name); #endif - std::vector GetAvailableDevices(); - void SetCache(const std::string& cache_dir_path); - ov::Core& Get() { return oe; } - void SetStreams(const std::string& device_type, int num_streams); + static std::vector GetAvailableDevices(); + static void SetCache(const std::string& cache_dir_path); + inline static ov::Core& Get(); + static void SetStreams(const std::string& device_type, int num_streams); }; class OVExeNetwork { @@ -87,8 +87,10 @@ class OVInferRequest { ov::InferRequest ovInfReq; public: + uint32_t GetNumInputs(); OVTensorPtr GetTensor(const std::string& name); - void SetTensor(std::string name, OVTensorPtr& blob); + std::string GetInputTensorName(uint32_t index); + void SetTensor(const std::string& name, OVTensorPtr& blob); void StartAsync(); void Infer(); void WaitRequest(); diff --git 
a/onnxruntime/core/providers/openvino/ov_versions/capability.cc b/onnxruntime/core/providers/openvino/ov_versions/capability.cc
index 3e780f74145ae..d56687f868c3d 100644
--- a/onnxruntime/core/providers/openvino/ov_versions/capability.cc
+++ b/onnxruntime/core/providers/openvino/ov_versions/capability.cc
@@ -2,6 +2,7 @@
 // Licensed under the MIT License
 #include
 #include
+#include

 #include "core/providers/shared_library/provider_api.h"
 #include "core/providers/openvino/backend_utils.h"
@@ -26,23 +27,27 @@ namespace onnxruntime {
 namespace openvino_ep {

 // Constructor
-GetCapability::GetCapability(const GraphViewer& graph_viewer_param,
+GetCapability::GetCapability(const EPCtxHandler& ep_ctx_handler,
+                             const GraphViewer& graph_viewer_param,
                              const std::string device_type_param,
-                             const bool enable_qdq_optimizer)
-    : graph_viewer_(graph_viewer_param), device_type_(device_type_param) {
+                             const bool enable_qdq_optimizer) : ep_ctx_handler_(ep_ctx_handler),
+                                                                graph_viewer_(graph_viewer_param),
+                                                                device_type_(std::move(device_type_param)) {
   bool npu_qdq_optimizer_enabled = false;
   if (device_type_.find("NPU") != std::string::npos) {
     device_type_ = "CPU";
     if (enable_qdq_optimizer) npu_qdq_optimizer_enabled = true;
   }
-#if OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 4
-  data_ops_ = new DataOps(graph_viewer_, V_2024_4, device_type_, npu_qdq_optimizer_enabled);
-#elif OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 5
+#if OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 5
   data_ops_ = new DataOps(graph_viewer_, V_2024_5, device_type_, npu_qdq_optimizer_enabled);
+#elif OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 6
+  data_ops_ = new DataOps(graph_viewer_, V_2024_6, device_type_, npu_qdq_optimizer_enabled);
 #elif OPENVINO_VERSION_MAJOR == 2025 && OPENVINO_VERSION_MINOR == 0
   data_ops_ = new DataOps(graph_viewer_, V_2025_0, device_type_, npu_qdq_optimizer_enabled);
+#elif OPENVINO_VERSION_MAJOR == 2025 && OPENVINO_VERSION_MINOR == 1
+  data_ops_ = new DataOps(graph_viewer_, V_2025_1, device_type_, npu_qdq_optimizer_enabled);
 #else
-  data_ops_ = new DataOps(graph_viewer_, V_2025_0, device_type_, npu_qdq_optimizer_enabled);
+  data_ops_ = new DataOps(graph_viewer_, V_2025_1, device_type_, npu_qdq_optimizer_enabled);
 #endif
 }

@@ -54,6 +59,28 @@ std::vector<std::unique_ptr<ComputeCapability>> GetCapability::Execute() {
     return result;
   }

+  auto Iterable2String = []<typename U, typename V>(U& strings, const V& node_args) {
+    constexpr bool has_name = requires(V v) {
+      (*v.begin())->Name();
+    };
+    for (const auto& arg : node_args) {
+      if constexpr (has_name) {
+        strings.push_back(arg->Name());
+      } else {
+        strings.push_back(arg);
+      }
+    }
+  };
+
+  // Check for EpContext nodes
+  const auto& nodes = graph_viewer_.GetNodesInTopologicalOrder();
+
+  // If all the nodes have been accounted for then no more processing is needed
+  if (result.size() == nodes.size()) {
+    is_wholly_supported_graph_ = true;
+    return result;
+  }
+
   // This is a list of initializers that nGraph considers as constants. Example weights, reshape shape etc.
std::unordered_set<std::string> ng_required_initializers;
@@ -62,8 +89,8 @@ std::vector<std::unique_ptr<ComputeCapability>> GetCapability::Execute() {
     if (openvino_ep::backend_utils::IsDebugEnabled()) {
       std::cout << "No of unsupported nodes " << unsupported_nodes.size() << std::endl;
       for (size_t i = 0; i < unsupported_nodes.size(); i++) {
-        const Node* node = graph_viewer_.GetNode(unsupported_nodes[i]);
-        std::cout << "Unsupported node op " << node->OpType() << std::endl;
+        const Node* unode = graph_viewer_.GetNode(unsupported_nodes[i]);
+        std::cout << "Unsupported node op " << unode->OpType() << std::endl;
       }
     }
 #endif
@@ -73,8 +100,7 @@ std::vector<std::unique_ptr<ComputeCapability>> GetCapability::Execute() {
       std::vector<std::string> inputs;
       std::vector<std::string> outputs;
       // Fill inputs with names
-      std::for_each(graph_viewer_.GetInputs().begin(), graph_viewer_.GetInputs().end(),
-                    [&inputs](const NodeArg* node_arg) { inputs.push_back(node_arg->Name()); });
+      Iterable2String(inputs, graph_viewer_.GetInputs());

       /* In scenarios, when there are no inputs or all inputs being initializers,
          ConstantFolding optimization in onnxruntime pre-computes the value.*/
       if (inputs.empty()) {
         return result;
       }

-      const std::vector& nodes = graph_viewer_.GetNodesInTopologicalOrder();
-
       const Node* node = graph_viewer_.GetNode(nodes[0]);

       // Handle cases where lone, recurring Ops in smaller models cannot be supported in OpenVINO
@@ -103,12 +127,10 @@
       }

       // Initializers need to be part of meta_def->inputs
-      std::for_each(ng_required_initializers.begin(), ng_required_initializers.end(),
-                    [&inputs](const std::string& initializer) { inputs.push_back(initializer); });
+      Iterable2String(inputs, ng_required_initializers);

       // Fill outputs with names
-      std::for_each(graph_viewer_.GetOutputs().begin(), graph_viewer_.GetOutputs().end(),
-                    [&outputs](const NodeArg* node_arg) { outputs.push_back(node_arg->Name()); });
+      Iterable2String(outputs, graph_viewer_.GetOutputs());

       // Create and add this graph to result.
AppendClusterToSubGraph(graph_viewer_.GetNodesInTopologicalOrder(), inputs, outputs, result);
@@ -148,9 +170,15 @@
     int no_of_clusters = 0;

     for (auto this_cluster : connected_clusters) {
-      // If subgraph has less then three, graph is considered trivial
+      // If the subgraph has fewer than three nodes, it is considered trivial, unless it is an EPContext cluster
       if (this_cluster.size() < 3) {
-        continue;
+        bool is_epctx_node = false;
+        for (auto node_idx : this_cluster) {
+          if (graph_viewer_.GetNode(node_idx)->OpType() == "EPContext")
+            is_epctx_node = true;
+        }
+        if (!is_epctx_node)
+          continue;
       }

       std::vector<std::string> cluster_graph_inputs, cluster_inputs, cluster_outputs;
@@ -166,16 +194,6 @@
       // Omitting zero dim subgraphs
       for (auto index : this_cluster) {
         const Node* node = graph_viewer_.GetNode(index);
-        if (data_ops_->DoNotOmitSubGraph(node->OpType())) {
-          for (const auto& input : node->InputDefs()) {
-            const auto& input_name = input->Name();
-            auto it = find(cluster_graph_inputs.begin(), cluster_graph_inputs.end(), input_name);
-            if (it != cluster_graph_inputs.end()) {
-              omit_subgraph = true;
-              break;
-            }
-          }
-        }

         if (node->OpType() == "Conv" || node->OpType() == "Identity") {
           const auto& output_name = node->OutputDefs()[0]->Name();
@@ -213,7 +231,6 @@
       }
       LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Supported subgraphs on OpenVINO: " << no_of_clusters;
     }
-
   return result;
 }
diff --git a/onnxruntime/core/providers/openvino/ov_versions/capability.h b/onnxruntime/core/providers/openvino/ov_versions/capability.h
index 2f87c4c73d892..364e79a76f154 100644
--- a/onnxruntime/core/providers/openvino/ov_versions/capability.h
+++ b/onnxruntime/core/providers/openvino/ov_versions/capability.h
@@ -6,12 +6,14 @@
 #include
 #include
 #include "core/providers/openvino/ov_versions/data_ops.h"
+#include "core/providers/openvino/onnx_ctx_model_helper.h"

 namespace onnxruntime {
 namespace openvino_ep {

 class GetCapability {
  private:
+  const EPCtxHandler& ep_ctx_handler_;
   const GraphViewer& graph_viewer_;
   std::string device_type_;
   DataOps* data_ops_;
@@ -19,7 +21,8 @@ class GetCapability {
   bool has_external_weights_ = false;

  public:
-  GetCapability(const GraphViewer& graph_viewer_param,
+  GetCapability(const EPCtxHandler& ep_ctx_handler,
+                const GraphViewer& graph_viewer_param,
                 const std::string device_type_param,
                 const bool enable_qdq_optimizer);
   virtual std::vector<std::unique_ptr<ComputeCapability>> Execute();
diff --git a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc
index f118f057ac11e..2f0dd458cc349 100644
--- a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc
+++ b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc
@@ -388,7 +388,7 @@ void DataOps::populate_op_mode_supported() {

   // populate unsupportedmode_t
   {
-    UnsupportedOpMode obj = {{V_2024_1, V_2024_2, V_2024_3, V_2024_4, V_2024_5, V_2025_0},
+    UnsupportedOpMode obj = {{V_2024_1, V_2024_2, V_2024_3, V_2024_4, V_2024_5, V_2024_6, V_2025_0, V_2025_1},
                              [this](const Node* node, const InitializedTensorSet&) {
                                // If the Input of ReduceMax op is UINT8, it is rejected (Due to output mismatch)
                                for (size_t i = 0; i < node->InputDefs().size(); i++) {
@@ -404,7 +404,7 @@ void DataOps::populate_op_mode_supported() {
   }
   {
     UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2,
-                              V_2024_3, V_2024_4, V_2024_5, V_2025_0},
+                              V_2024_3, V_2024_4, V_2024_5, V_2024_6, V_2025_0, V_2025_1},
                             [this](const Node* node, const InitializedTensorSet&) {
                               const auto& input_arg = node->InputDefs()[1];
                               auto shape = input_arg->Shape();
@@ -422,7 +422,7 @@ void DataOps::populate_op_mode_supported() {
   }
   {
     UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2,
-                              V_2024_3, V_2024_4, V_2024_5, V_2025_0},
+                              V_2024_3, V_2024_4, V_2024_5, V_2024_6, V_2025_0, V_2025_1},
                             [this](const Node* node, const InitializedTensorSet&) {
                               // If the operator is unsqueeze
                               // If axes is an input, then we cannot produce a static graph.
@@ -437,8 +437,8 @@ void DataOps::populate_op_mode_supported() {
     op_list_.insert({"Unsqueeze", obj});
   }
   {
-    UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2, V_2024_3, V_2024_4, V_2024_5,
-                              V_2025_0},
+    UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2, V_2024_3, V_2024_4, V_2024_5, V_2024_6,
+                              V_2025_0, V_2025_1},
                             [this](const Node* node, const InitializedTensorSet&) {
                               // check for attributes
                               auto& upsample_attr = node->GetAttributes();
diff --git a/onnxruntime/core/providers/openvino/ov_versions/data_ops.h b/onnxruntime/core/providers/openvino/ov_versions/data_ops.h
index 07fa36f355d55..cf7d834d6cfc7 100644
--- a/onnxruntime/core/providers/openvino/ov_versions/data_ops.h
+++ b/onnxruntime/core/providers/openvino/ov_versions/data_ops.h
@@ -33,7 +33,9 @@ enum versionNum {
   V_2024_3,
   V_2024_4,
   V_2024_5,
-  V_2025_0
+  V_2024_6,
+  V_2025_0,
+  V_2025_1
 };

 using VersionNum = enum versionNum;

@@ -82,7 +84,7 @@ class DataOps {
           const std::string dev_id, const bool npu_qdq_optimizer_enabled)
       : graph_viewer_(graph_viewer_param),
         version_id_(ver),
-        device_id_(dev_id),
+        device_id_(std::move(dev_id)),
         npu_qdq_optimizer_enabled_(npu_qdq_optimizer_enabled) {
     populate_op_mode_supported();
     populate_types_supported();
diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc
index e021edc878709..4d513c0533ff1 100644
--- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc
+++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc
@@ -56,7 +56,7 @@ static NodeArg& ProcessNodeUnitIO(onnxruntime::Graph& dst_graph,
                                   std::set<std::string>& initializers_to_keep,
                                   const NodeUnitIODef& io_def) {
   const std::string& name = io_def.node_arg.Name();
-  const ONNX_NAMESPACE::TypeProto* orig_type_proto = io_def.node_arg.TypeAsProto();
+  const auto* orig_type_proto = io_def.node_arg.TypeAsProto();

   // Handle quantized input or output. Convert to float type.
   if (io_def.quant_param.has_value()) {
@@ -68,11 +68,11 @@ static NodeArg& ProcessNodeUnitIO(onnxruntime::Graph& dst_graph,
     ORT_ENFORCE(tensor_proto_iter != src_initializers.end(),
                 "Unable to find scale initializer ", scale_initializer_name);

-    const ONNX_NAMESPACE::TensorProto* scale_tensor_proto = tensor_proto_iter->second;
+    const auto* scale_tensor_proto = tensor_proto_iter->second;
     int32_t float_type = scale_tensor_proto->data_type();

     // Now set the arg type to the float type of scale.
Could be one of float/float16/bfloat16 - std::unique_ptr type_proto = ONNX_NAMESPACE::TypeProto::Create(); + auto type_proto = ONNX_NAMESPACE::TypeProto::Create(); type_proto->copy_from(orig_type_proto); type_proto->mutable_tensor_type()->set_elem_type(float_type); @@ -457,7 +457,7 @@ static void AddStandaloneNodeUnit(onnxruntime::Graph& dst_graph, const onnxrunti if (duplicate_dq && GetQDQDataType(&node_unit.GetNode()) != DT_UINT16 && GetQDQDataType(&node_unit.GetNode()) != DT_INT16) { std::string orig_dq_name = node_unit.Outputs()[0].node_arg.Name(); // ex: dql_output/duplicated - std::unique_ptr type_proto = ONNX_NAMESPACE::TypeProto::Create(); + auto type_proto = ONNX_NAMESPACE::TypeProto::Create(); type_proto->copy_from(node_unit.Inputs()[0].node_arg.TypeAsProto()); type_proto->mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); orig_dq_name.erase(orig_dq_name.find(DuplicateDQ), std::string::npos); // ex: dql_output @@ -625,10 +625,54 @@ static void AddQDQNodeUnit(onnxruntime::Graph& dst_graph, KeepInitsInDstGraph(initializers_to_keep, src_graph, &target_node); } +static void AddInitializerAsInput(onnxruntime::Graph& dst_graph, + InlinedVector& accumulated_inputs, + const onnxruntime::GraphViewer& src_graph, + const std::string& initializer_name) { + // Get the initializer from source graph + const auto& src_initializers = src_graph.GetAllInitializedTensors(); + auto init_iter = src_initializers.find(initializer_name); + + if (init_iter == src_initializers.end()) { + // Initializer not found + return; + } + + const auto* tensor_proto = init_iter->second; + + // Create TypeProto for the initializer + auto type_proto = ONNX_NAMESPACE::TypeProto::Create(); + auto* tensor_type = type_proto->mutable_tensor_type(); + tensor_type->set_elem_type(tensor_proto->data_type()); + + for (int i = 0; i < tensor_proto->dims_size(); ++i) { + tensor_type->mutable_shape()->add_dim()->set_dim_value(tensor_proto->dims().Get(i)); + } + + // Create NodeArg for the initializer + auto& input_arg = dst_graph.GetOrCreateNodeArg(initializer_name, type_proto.get()); + + // Check if input already exists in accumulated inputs + bool input_exists = false; + for (const auto* existing_input : accumulated_inputs) { + if (existing_input->Name() == initializer_name) { + input_exists = true; + break; + } + } + + if (!input_exists) { + // Add to accumulated inputs + accumulated_inputs.push_back(&input_arg); + } +} + // Creates a new model without the DQ/Q operators in the src graph. Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, const logging::Logger& logger, - /*out*/ std::unique_ptr& model) { + bool enable_ovep_weight_sharing, + /*out*/ std::unique_ptr& model, + /*out*/ sw& shared_weights) { // NOTE: This function is a re-implementation of GraphViewerToProto() in core/graph/graph_proto_serializer.cc // with the following differences: // - Uses onnxruntime::Graph APIs instead of onnx::GraphProto APIs. 
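For context: the shared-weights metadata collected in the next hunk follows ONNX's external-data convention, where a TensorProto carries a location/offset/length triple as key/value string pairs instead of inline bytes. The following is a minimal standalone sketch of reading that triple, assuming only ONNX's generated protobuf API (onnx.pb.h); the struct and function names are illustrative and not part of this change:

    // Illustrative sketch (hypothetical names): reading the external-data
    // triple that an ONNX initializer stores instead of raw tensor bytes.
    #include <cstdint>
    #include <string>
    #include "onnx/onnx_pb.h"

    struct ExternalWeightInfo {
      std::string location;  // file holding the raw bytes, relative to the model
      uint64_t offset = 0;   // byte offset of this tensor's payload in that file
      uint64_t length = 0;   // payload size in bytes
    };

    // Returns false when the tensor's data is stored inline in the model proto.
    static bool GetExternalWeightInfo(const onnx::TensorProto& proto,
                                      ExternalWeightInfo& out) {
      if (!proto.has_data_location() ||
          proto.data_location() != onnx::TensorProto_DataLocation_EXTERNAL) {
        return false;
      }
      // external_data is a repeated key/value field: "location", "offset", "length".
      for (const auto& entry : proto.external_data()) {
        if (entry.key() == "location") {
          out.location = entry.value();
        } else if (entry.key() == "offset") {
          out.offset = std::stoull(entry.value());
        } else if (entry.key() == "length") {
          out.length = std::stoull(entry.value());
        }
      }
      return true;
    }

The `insert_metadata` lambda in the hunk below performs the same extraction through the provider-bridge wrapper types.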
@@ -665,7 +709,12 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph,
       dst_graph_outputs.push_back(&ep_graph_output_arg);
   }

-  dst_graph.SetInputs(dst_graph_inputs);
+  // Will set inputs after deciding the fate of all internal and external initializers;
+  // the accumulated_inputs container will store the inputs of the original graph and initializers with external data
+  InlinedVector<const NodeArg*> accumulated_inputs;
+  accumulated_inputs.reserve(dst_graph_inputs.size());
+
+  // dst_graph.SetInputs(dst_graph_inputs);
   dst_graph.SetOutputs(dst_graph_outputs);

   // TODO(sspintel): add Graph::SetName() provider api
@@ -723,9 +772,7 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph,
     seen_node_units.insert(node_unit);
   }

-  //
-  // Copy initializers to dst graph.
-  //
+  // Copy initializers to dst graph.

   std::unordered_set<std::string> current_scope_initializer_set;

@@ -738,26 +785,93 @@
   }
   std::sort(const_inits.begin(), const_inits.end());

+  // Initialize the map used to create metadata for initializers with external weights
+  auto& metadata = shared_weights.metadata;
+
+  const auto& insert_metadata = [&metadata](const ONNX_NAMESPACE::TensorProto& proto) {
+    sw::Metadata::Map::key_type key{proto.name()};
+    sw::Metadata::Map::mapped_type value{};
+
+    using mutable_proto_t = ONNX_NAMESPACE::TensorProto*;
+    auto& mutable_proto = *const_cast<mutable_proto_t>(&proto);
+    auto* entry_protos = mutable_proto.mutable_external_data();
+    for (int i = 0; i < entry_protos->size(); i++) {
+      auto& string_entry_proto{entry_protos->at(i)};
+      const auto& pb_key{*(string_entry_proto.mutable_key())};
+      const auto& pb_value{*(string_entry_proto.mutable_value())};
+      if (pb_key == "location") {
+        value.location = pb_value;
+      } else if (pb_key == "offset") {
+        value.data_offset = std::stoul(pb_value);
+      } else if (pb_key == "length") {
+        value.size = std::stoul(pb_value);
+      }
+    }
+    value.element_type = proto.data_type();
+    value.dimensions.resize(proto.dims_size());
+    for (uint32_t index = 0; auto& dim : value.dimensions) {
+      dim = proto.dims()[index++];
+    }
+
+    metadata.emplace(key, std::move(value));
+  };
+
+  // Handle constant initializers
   for (auto& it : const_inits) {
-    if (initializers_to_keep.count(it))
-      dst_graph.AddInitializedTensor(*(initializers.at(it)));
+    const auto& initializer_tensor = *initializers.at(it);
+
+    // Check if the initializer has external data
+    if (initializer_tensor.has_data_location() &&
+        initializer_tensor.data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL &&
+        enable_ovep_weight_sharing) {
+      insert_metadata(initializer_tensor);
+
+      // Add the initializer with external data as an input
+      AddInitializerAsInput(dst_graph, accumulated_inputs, src_graph, it);
+
+    } else {
+      // Add as an initialized tensor if it does not have external data
+      if (initializers_to_keep.count(it))
+        dst_graph.AddInitializedTensor(*(initializers.at(it)));
+    }
+
     current_scope_initializer_set.insert(it);
   }

-  // handle outer scope value which is a constant initializer
+  // Handle outer-scope constant initializers
   for (auto& node_idx : src_graph.GetNodesInTopologicalOrder()) {
     const auto& node = src_graph.GetNode(node_idx);
     for (const auto& input : node->InputDefs()) {
       if (current_scope_initializer_set.find(input->Name()) != current_scope_initializer_set.end()) {
         continue;
       }
+
       if (src_graph.IsConstantInitializer(input->Name(), true)) {
-        if (initializers_to_keep.count(input->Name()))
-          dst_graph.AddInitializedTensor(*(src_graph.GetConstantInitializer(input->Name(), true)));
+        const auto& initializer_tensor = *src_graph.GetConstantInitializer(input->Name(), true);
+        // Check if the initializer has external data
+        if (initializer_tensor.has_data_location() &&
+            initializer_tensor.data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL &&
+            enable_ovep_weight_sharing) {
+          insert_metadata(initializer_tensor);
+
+          // Add the initializer as an input if it has external data
+          AddInitializerAsInput(dst_graph, accumulated_inputs, src_graph, input->Name());
+
+        } else {
+          // Add as an initialized tensor if it does not have external data
+          if (initializers_to_keep.count(input->Name())) {
+            dst_graph.AddInitializedTensor(*(src_graph.GetConstantInitializer(input->Name(), true)));
+          }
+        }
+
         current_scope_initializer_set.insert(input->Name());
       }
     }
   }

+  accumulated_inputs.insert(accumulated_inputs.end(), dst_graph_inputs.begin(), dst_graph_inputs.end());
+
+  // Set all inputs (the original inputs and the initializers promoted to inputs) of the destination Graph
+  dst_graph.SetInputs(accumulated_inputs);

   // Validate graph, remove unnecessary initializers, and run type/shape inference.
   ORT_RETURN_IF_ERROR(dst_graph.Resolve());
diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.h b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.h
index 94a8eb4d5da17..02831525cba32 100644
--- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.h
+++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.h
@@ -5,14 +5,20 @@
 #include

 #include "core/providers/shared_library/provider_api.h"
+#include "core/providers/openvino/contexts.h"

 namespace onnxruntime {
 namespace openvino_ep {

+using sw = SharedContext::SharedWeights;
+
 // Creates a new model without the DQ/Q operators in the src graph as per pre-defined rulesets
 Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph,
                                        const logging::Logger& logger,
-                                       /*out*/ std::unique_ptr<Model>& model);
+                                       bool enable_ovep_weight_sharing,
+                                       /*out*/ std::unique_ptr<Model>& model,
+                                       /*out*/ sw& shared_weights);
+bool dumpMetaDataMapToBinary(const sw::Metadata::Map& shared_weights, const std::string& filename);
 }  // namespace openvino_ep
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
index e434935343663..4feedd75f8004 100644
--- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
+++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
@@ -991,7 +991,8 @@ struct Model final {
                                       const IOnnxRuntimeOpSchemaRegistryList* local_registries,
                                       const logging::Logger& logger) {
     return g_host->Model__construct(std::move(model_proto), model_path, local_registries, logger);
   }
-  static std::unique_ptr<Model> Create(const std::string& graph_name, bool is_onnx_domain_only, const logging::Logger& logger) {
+  static std::unique_ptr<Model> Create(const std::string& graph_name, bool is_onnx_domain_only,
+                                       const logging::Logger& logger) {
     return g_host->Model__construct(graph_name, is_onnx_domain_only, logger);
   }
   static void operator delete(void* p) { g_host->Model__operator_delete(reinterpret_cast<Model*>(p)); }
diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc
index 3a694ac6f8e5e..f36345cdabf64 100644
--- a/onnxruntime/core/session/provider_bridge_ort.cc
+++ b/onnxruntime/core/session/provider_bridge_ort.cc
@@ -1178,7 +1178,8 @@ struct ProviderHostImpl : ProviderHost {
                                          const logging::Logger& logger)
override { return std::make_unique(model_proto, model_path, local_registries, logger); } - std::unique_ptr Model__construct(const std::string& graph_name, bool is_onnx_domain_only, + std::unique_ptr Model__construct(const std::string& graph_name, + bool is_onnx_domain_only, const logging::Logger& logger) override { return std::make_unique(graph_name, is_onnx_domain_only, logger); } diff --git a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc index 272ea37fcc70c..d224246b98e5b 100644 --- a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc +++ b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc @@ -522,6 +522,7 @@ "^test_affine_grid_3d_align_corners_expanded", "^test_affine_grid_3d", "^test_affine_grid_3d_expanded", + "^test_dynamicquantizelinear_expanded_cpu", "^test_operator_permute2", "^test_operator_repeat", "^test_operator_repeat_dim_overflow",
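Taken together, the refactor keeps the option surface that applications reach through the public API. A minimal usage sketch, assuming an ONNX Runtime build with the OpenVINO EP enabled and a C++ API version that exposes AppendExecutionProvider_OpenVINO_V2; the model path and option values below are illustrative:

    #include <string>
    #include <unordered_map>
    #include "onnxruntime_cxx_api.h"

    int main() {
      Ort::Env env{ORT_LOGGING_LEVEL_WARNING, "ovep-demo"};
      Ort::SessionOptions session_options;

      // These keys land in the ProviderOptions map handled by
      // OpenVINO_Provider::CreateExecutionProviderFactory above.
      std::unordered_map<std::string, std::string> ov_options{
          {"device_type", "GPU"},   // checked against the supported device list
          {"precision", "FP16"},    // GPU accepts FP32, FP16, or ACCURACY
          {"num_of_threads", "4"},  // digits only; values <= 0 fall back to 1
          {"load_config", R"({"GPU": {"PERFORMANCE_HINT": "LATENCY"}})"},
      };
      session_options.AppendExecutionProvider_OpenVINO_V2(ov_options);

      Ort::Session session{env, ORT_TSTR("model.onnx"), session_options};
      // ... bind inputs and call session.Run() as usual ...
      return 0;
    }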