From 07624709e3ecf8b46189e086edc9db1e7ddf809f Mon Sep 17 00:00:00 2001 From: "Dvoretckii, Mikhail" Date: Fri, 31 Oct 2025 03:21:39 -0700 Subject: [PATCH 01/11] Reorder KV cache using the new gather_by_axis API --- .../core/providers/openvino/ov_interface.cc | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index 85fc4d93d6243..c2b9b1ca350f9 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -461,6 +461,29 @@ void StatefulOVInferRequest::Infer() { OVInferRequest::Infer(); } +void StatefulOVInferRequest::ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) { + // Validate input parameters + if (src_indices.size() != dst_indices.size()) { + ORT_THROW(log_tag + "ReorderKVCache: src_indices and dst_indices must have the same size. " + "Got src_indices.size()=" + std::to_string(src_indices.size()) + + ", dst_indices.size()=" + std::to_string(dst_indices.size())); + } + + LOGS_DEFAULT(INFO) << log_tag << "ReorderKVCache: Reordering OpenVINO-internal KVCache state with " + << src_indices.size() << " index pairs"; + + // Retrieve KVCache states and reorder them based on the provided indices + auto states = ovInfReq.query_state(); + + for (auto& state : states) { + auto start_time = std::chrono::high_resolution_clock::now(); + state.gather_by_axis(src_indices, dst_indices); + auto end_time = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(end_time - start_time).count(); + LOGS_DEFAULT(INFO) << log_tag << "gather_by_axis: " << duration << " microseconds"; + } +} + void StatefulOVInferRequest::RewindKVCache(size_t index) { LOGS_DEFAULT(INFO) << log_tag << "RewindKVCache: Rewinding OpenVINO-internal KVCache state to index=" << index; From 1426c2a4fe5e4beb54112f89437e35b5d0163db1 Mon Sep 17 00:00:00 2001 From: "Dvoretckii, Mikhail" Date: Thu, 13 Nov 2025 06:07:10 -0800 Subject: [PATCH 02/11] Do a ScatterElementsUpdate-based reorder during execution --- .../core/providers/openvino/ov_interface.cc | 34 ++++++++++++++++++- .../core/providers/openvino/ov_interface.h | 2 ++ .../openvino/ov_stateful_patch_utils.cc | 17 ++++++++-- .../openvino/ov_stateful_patch_utils.h | 1 + 4 files changed, 50 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index c2b9b1ca350f9..be6a64dc62a5b 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -421,7 +421,25 @@ std::optional StatefulOVInferRequest::FindTensor(const std::string& void StatefulOVInferRequest::PreProcessInferRequest() { // Workaround: Setting the value here as it cannot be set at the ORT GenAI layer currently. // TODO(ankit): Address this issue and implement the fix at the appropriate layer. - FillTensor("beam_idx", ov::element::i32, {1}, 0); + if (beam_idx_val.size() == 3) { + ov::Tensor beam_idx_tensor = ov::Tensor(ov::element::i32, {3}); + for (int i = 0; i < 3; ++i) { + beam_idx_tensor.data()[i] = int32_t(beam_idx_val[i]); + } + ovInfReq.set_tensor("beam_idx", beam_idx_tensor); + ov::Tensor dst_idx_tensor = ov::Tensor(ov::element::i32, {1, 32, 3, 96}); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 32; ++j) { + for (int k = 0; k < 96; ++k) { + dst_idx_tensor.data()[(j * 3 + i) * 96 + k] = int32_t(dst_idx_val[i]); + } + } + } + ovInfReq.set_tensor("dst_idx", dst_idx_tensor); + } else { + FillTensor("beam_idx", ov::element::i32, {3}, 0); + FillTensor("dst_idx", ov::element::i32, {1, 32, 3, 96}, 0); + } // If 'prefill use full chat history' mode is enabled, we need to cache input_ids and position_ids. if (prefill_use_full_chat_history) { @@ -472,6 +490,19 @@ void StatefulOVInferRequest::ReorderKVCache(const std::vector& src_indic LOGS_DEFAULT(INFO) << log_tag << "ReorderKVCache: Reordering OpenVINO-internal KVCache state with " << src_indices.size() << " index pairs"; + // set beam_idx and dst_idx based on provided values + if (beam_idx_val.size() == 0) { + for (int i = 0; i < 3; ++i) { + beam_idx_val.emplace_back(src_indices[i]); + dst_idx_val.emplace_back(dst_indices[i]); + } + } else { + for (int i = 0; i < 3; ++i) { + beam_idx_val[i] = src_indices[i]; + dst_idx_val[i] = dst_indices[i]; + } + } + /* // Retrieve KVCache states and reorder them based on the provided indices auto states = ovInfReq.query_state(); @@ -482,6 +513,7 @@ void StatefulOVInferRequest::ReorderKVCache(const std::vector& src_indic auto duration = std::chrono::duration_cast(end_time - start_time).count(); LOGS_DEFAULT(INFO) << log_tag << "gather_by_axis: " << duration << " microseconds"; } + */ } void StatefulOVInferRequest::RewindKVCache(size_t index) { diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h index 8765cd040d098..738d13357dc82 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.h +++ b/onnxruntime/core/providers/openvino/ov_interface.h @@ -155,6 +155,8 @@ class StatefulOVInferRequest : public OVInferRequest { bool prefill_use_full_chat_history = false; std::vector cached_input_ids; std::vector cached_position_ids; + std::vector beam_idx_val; + std::vector dst_idx_val; }; } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc index c4ec47534d009..a5b4ea95f6a91 100644 --- a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc +++ b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc @@ -91,13 +91,21 @@ void FuseCacheReorder(std::shared_ptr ov_model, std::string main_input_name = GetInputOutputName(ov_model, input_name_candidates); auto input_batch = ov_model->input(main_input_name).get_partial_shape()[0]; + auto update_shape = ov_model->input(key_value_input_names[0]).get_partial_shape(); + update_shape[2] = 3; - auto beam_idx = std::make_shared(ov::element::i32, ov::PartialShape({std::move(input_batch)})); + auto beam_idx = std::make_shared(ov::element::i32, ov::PartialShape({3})); beam_idx->set_friendly_name("beam_idx"); beam_idx->output(0).get_tensor().add_names({"beam_idx"}); ov_model->add_parameters({beam_idx}); not_kv_inputs.push_back(beam_idx->get_friendly_name()); + auto dst_idx = std::make_shared(ov::element::i32, update_shape); + dst_idx->set_friendly_name("dst_idx"); + dst_idx->output(0).get_tensor().add_names({"dst_idx"}); + ov_model->add_parameters({dst_idx}); + not_kv_inputs.push_back(dst_idx->get_friendly_name()); + // Go over all cache parameters and fuse _reorder_cache with indices provided by the new parameter beam_idx for (const auto& input_name : key_value_input_names) { auto parameter_output_port = ov_model->input(input_name); @@ -106,11 +114,14 @@ void FuseCacheReorder(std::shared_ptr ov_model, auto gather_op = std::make_shared(parameter_output_port, beam_idx, - ov::opset13::Constant::create(ov::element::i64, {}, {gather_dim})); + ov::opset13::Constant::create(ov::element::i64, {}, {2})); + + auto update_op = std::make_shared(parameter_output_port, + dst_idx, gather_op, ov::opset13::Constant::create(ov::element::i64, {}, {2})); // Replace the source output for all consumers of the input tensor for (auto& consumer : consumers) { - consumer.replace_source_output(gather_op->output(0)); + consumer.replace_source_output(update_op->output(0)); } } diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h index 0b89c4ed02e13..11b0cc1dbe9bb 100644 --- a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h +++ b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h @@ -13,6 +13,7 @@ #include "openvino/pass/manager.hpp" #include "openvino/pass/make_stateful.hpp" +#include "openvino/opsets/opset12.hpp" #include "openvino/opsets/opset13.hpp" namespace onnxruntime { From 2945283bd225ead095357048886a347cca095146 Mon Sep 17 00:00:00 2001 From: "Dvoretckii, Mikhail" Date: Thu, 13 Nov 2025 09:47:05 -0800 Subject: [PATCH 03/11] Get variable update lengths from incoming indices --- .../core/providers/openvino/ov_interface.cc | 31 ++++++++----------- .../openvino/ov_stateful_patch_utils.cc | 4 +-- 2 files changed, 14 insertions(+), 21 deletions(-) diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index be6a64dc62a5b..2a6c7ac7c8c9b 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -421,24 +421,24 @@ std::optional StatefulOVInferRequest::FindTensor(const std::string& void StatefulOVInferRequest::PreProcessInferRequest() { // Workaround: Setting the value here as it cannot be set at the ORT GenAI layer currently. // TODO(ankit): Address this issue and implement the fix at the appropriate layer. - if (beam_idx_val.size() == 3) { - ov::Tensor beam_idx_tensor = ov::Tensor(ov::element::i32, {3}); - for (int i = 0; i < 3; ++i) { + if (beam_idx_val.size() > 0) { + ov::Tensor beam_idx_tensor = ov::Tensor(ov::element::i32, {beam_idx_val.size()}); + for (int i = 0; i < beam_idx_val.size(); ++i) { beam_idx_tensor.data()[i] = int32_t(beam_idx_val[i]); } ovInfReq.set_tensor("beam_idx", beam_idx_tensor); - ov::Tensor dst_idx_tensor = ov::Tensor(ov::element::i32, {1, 32, 3, 96}); - for (int i = 0; i < 3; ++i) { + ov::Tensor dst_idx_tensor = ov::Tensor(ov::element::i32, {1, 32, dst_idx_val.size(), 96}); + for (int i = 0; i < dst_idx_val.size(); ++i) { for (int j = 0; j < 32; ++j) { for (int k = 0; k < 96; ++k) { - dst_idx_tensor.data()[(j * 3 + i) * 96 + k] = int32_t(dst_idx_val[i]); + dst_idx_tensor.data()[(j * dst_idx_val.size() + i) * 96 + k] = int32_t(dst_idx_val[i]); } } } ovInfReq.set_tensor("dst_idx", dst_idx_tensor); } else { - FillTensor("beam_idx", ov::element::i32, {3}, 0); - FillTensor("dst_idx", ov::element::i32, {1, 32, 3, 96}, 0); + FillTensor("beam_idx", ov::element::i32, {0}, 0); + FillTensor("dst_idx", ov::element::i32, {1, 32, 0, 96}, 0); } // If 'prefill use full chat history' mode is enabled, we need to cache input_ids and position_ids. @@ -491,16 +491,11 @@ void StatefulOVInferRequest::ReorderKVCache(const std::vector& src_indic << src_indices.size() << " index pairs"; // set beam_idx and dst_idx based on provided values - if (beam_idx_val.size() == 0) { - for (int i = 0; i < 3; ++i) { - beam_idx_val.emplace_back(src_indices[i]); - dst_idx_val.emplace_back(dst_indices[i]); - } - } else { - for (int i = 0; i < 3; ++i) { - beam_idx_val[i] = src_indices[i]; - dst_idx_val[i] = dst_indices[i]; - } + beam_idx_val.clear(); + dst_idx_val.clear(); + for (int i = 0; i < src_indices.size(); ++i) { + beam_idx_val.emplace_back(src_indices[i]); + dst_idx_val.emplace_back(dst_indices[i]); } /* // Retrieve KVCache states and reorder them based on the provided indices diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc index a5b4ea95f6a91..9146a9a206855 100644 --- a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc +++ b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc @@ -90,11 +90,9 @@ void FuseCacheReorder(std::shared_ptr ov_model, std::string main_input_name = GetInputOutputName(ov_model, input_name_candidates); - auto input_batch = ov_model->input(main_input_name).get_partial_shape()[0]; auto update_shape = ov_model->input(key_value_input_names[0]).get_partial_shape(); - update_shape[2] = 3; - auto beam_idx = std::make_shared(ov::element::i32, ov::PartialShape({3})); + auto beam_idx = std::make_shared(ov::element::i32, ov::PartialShape({update_shape[2]})); beam_idx->set_friendly_name("beam_idx"); beam_idx->output(0).get_tensor().add_names({"beam_idx"}); ov_model->add_parameters({beam_idx}); From 5d00226f4d80075213f8e50a683344542a9b45f1 Mon Sep 17 00:00:00 2001 From: "Dvoretckii, Mikhail" Date: Thu, 20 Nov 2025 09:14:57 -0800 Subject: [PATCH 04/11] Make changes to support new KVCache fusion --- .../core/providers/openvino/ov_interface.cc | 18 ++++++++++-------- .../core/providers/openvino/ov_interface.h | 2 +- .../openvino/ov_stateful_patch_utils.cc | 18 +++++++++++++++--- 3 files changed, 26 insertions(+), 12 deletions(-) diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index 2a6c7ac7c8c9b..fcf0ef222a5c0 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -421,12 +421,14 @@ std::optional StatefulOVInferRequest::FindTensor(const std::string& void StatefulOVInferRequest::PreProcessInferRequest() { // Workaround: Setting the value here as it cannot be set at the ORT GenAI layer currently. // TODO(ankit): Address this issue and implement the fix at the appropriate layer. - if (beam_idx_val.size() > 0) { - ov::Tensor beam_idx_tensor = ov::Tensor(ov::element::i32, {beam_idx_val.size()}); - for (int i = 0; i < beam_idx_val.size(); ++i) { - beam_idx_tensor.data()[i] = int32_t(beam_idx_val[i]); + FillTensor("beam_idx", ov::element::i32, {1}, 0); + + if (src_idx_val.size() > 0) { + ov::Tensor src_idx_tensor = ov::Tensor(ov::element::i32, {src_idx_val.size()}); + for (int i = 0; i < src_idx_val.size(); ++i) { + src_idx_tensor.data()[i] = int32_t(src_idx_val[i]); } - ovInfReq.set_tensor("beam_idx", beam_idx_tensor); + ovInfReq.set_tensor("src_idx", src_idx_tensor); ov::Tensor dst_idx_tensor = ov::Tensor(ov::element::i32, {1, 32, dst_idx_val.size(), 96}); for (int i = 0; i < dst_idx_val.size(); ++i) { for (int j = 0; j < 32; ++j) { @@ -437,7 +439,7 @@ void StatefulOVInferRequest::PreProcessInferRequest() { } ovInfReq.set_tensor("dst_idx", dst_idx_tensor); } else { - FillTensor("beam_idx", ov::element::i32, {0}, 0); + FillTensor("src_idx", ov::element::i32, {0}, 0); FillTensor("dst_idx", ov::element::i32, {1, 32, 0, 96}, 0); } @@ -491,10 +493,10 @@ void StatefulOVInferRequest::ReorderKVCache(const std::vector& src_indic << src_indices.size() << " index pairs"; // set beam_idx and dst_idx based on provided values - beam_idx_val.clear(); + src_idx_val.clear(); dst_idx_val.clear(); for (int i = 0; i < src_indices.size(); ++i) { - beam_idx_val.emplace_back(src_indices[i]); + src_idx_val.emplace_back(src_indices[i]); dst_idx_val.emplace_back(dst_indices[i]); } /* diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h index 738d13357dc82..ba43fc4b94bd9 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.h +++ b/onnxruntime/core/providers/openvino/ov_interface.h @@ -155,7 +155,7 @@ class StatefulOVInferRequest : public OVInferRequest { bool prefill_use_full_chat_history = false; std::vector cached_input_ids; std::vector cached_position_ids; - std::vector beam_idx_val; + std::vector src_idx_val; std::vector dst_idx_val; }; diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc index 9146a9a206855..b534064d47c29 100644 --- a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc +++ b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc @@ -90,14 +90,21 @@ void FuseCacheReorder(std::shared_ptr ov_model, std::string main_input_name = GetInputOutputName(ov_model, input_name_candidates); + auto input_batch = ov_model->input(main_input_name).get_partial_shape()[0]; auto update_shape = ov_model->input(key_value_input_names[0]).get_partial_shape(); - auto beam_idx = std::make_shared(ov::element::i32, ov::PartialShape({update_shape[2]})); + auto beam_idx = std::make_shared(ov::element::i32, ov::PartialShape({std::move(input_batch)})); beam_idx->set_friendly_name("beam_idx"); beam_idx->output(0).get_tensor().add_names({"beam_idx"}); ov_model->add_parameters({beam_idx}); not_kv_inputs.push_back(beam_idx->get_friendly_name()); + auto src_idx = std::make_shared(ov::element::i32, ov::PartialShape({update_shape[2]})); + src_idx->set_friendly_name("src_idx"); + src_idx->output(0).get_tensor().add_names({"src_idx"}); + ov_model->add_parameters({src_idx}); + not_kv_inputs.push_back(src_idx->get_friendly_name()); + auto dst_idx = std::make_shared(ov::element::i32, update_shape); dst_idx->set_friendly_name("dst_idx"); dst_idx->output(0).get_tensor().add_names({"dst_idx"}); @@ -112,10 +119,15 @@ void FuseCacheReorder(std::shared_ptr ov_model, auto gather_op = std::make_shared(parameter_output_port, beam_idx, + ov::opset13::Constant::create(ov::element::i64, {}, {gather_dim})); + + auto update_gather_op = + std::make_shared(gather_op, + src_idx, ov::opset13::Constant::create(ov::element::i64, {}, {2})); - auto update_op = std::make_shared(parameter_output_port, - dst_idx, gather_op, ov::opset13::Constant::create(ov::element::i64, {}, {2})); + auto update_op = std::make_shared(gather_op, + dst_idx, update_gather_op, ov::opset13::Constant::create(ov::element::i64, {}, {2})); // Replace the source output for all consumers of the input tensor for (auto& consumer : consumers) { From d884cd56c98e0e566e25aa71882f57b90f53b335 Mon Sep 17 00:00:00 2001 From: "Dvoretckii, Mikhail" Date: Tue, 2 Dec 2025 03:44:43 -0800 Subject: [PATCH 05/11] Add proper include --- onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h | 1 + 1 file changed, 1 insertion(+) diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h index 11b0cc1dbe9bb..76a3065910ee7 100644 --- a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h +++ b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h @@ -13,6 +13,7 @@ #include "openvino/pass/manager.hpp" #include "openvino/pass/make_stateful.hpp" +#include "openvino/opsets/opset3.hpp" #include "openvino/opsets/opset12.hpp" #include "openvino/opsets/opset13.hpp" From 7676b302bfc381ccc6935b0120bb33d6f879b809 Mon Sep 17 00:00:00 2001 From: Kotomi-Du Date: Wed, 15 Oct 2025 18:39:00 -0700 Subject: [PATCH 06/11] add reorder KV cache API --- .../providers/openvino/backend_manager.cc | 6 ++ .../core/providers/openvino/backend_manager.h | 1 + .../openvino/backends/basic_backend.cc | 6 ++ .../openvino/backends/basic_backend.h | 1 + .../core/providers/openvino/ibackend.h | 1 + .../openvino/openvino_execution_provider.cc | 62 +++++++++++++++++++ .../core/providers/openvino/ov_interface.h | 2 + 7 files changed, 79 insertions(+) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index abb5b31b76e44..fe29d075b5829 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -892,5 +892,11 @@ void BackendManager::RewindKVCache(size_t index) { } } +void BackendManager::ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) { + if (concrete_backend_) { + concrete_backend_->ReorderKVCache(src_indices, dst_indices); + } +} + } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/backend_manager.h b/onnxruntime/core/providers/openvino/backend_manager.h index 64dadb6c2151b..474bf2a01a019 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.h +++ b/onnxruntime/core/providers/openvino/backend_manager.h @@ -31,6 +31,7 @@ class BackendManager { void TryExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphViewer& subgraph, bool include_embed_data); ov::CompiledModel GetOVCompiledModel(); void RewindKVCache(size_t index); + void ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices); private: std::unique_ptr GetModelProtoFromFusedNode( diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index d7fc0553fb1d4..d08fa548b388b 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -334,6 +334,12 @@ void BasicBackend::RewindKVCache(size_t index) { }); } +void BasicBackend::ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) { + infer_req_pool_->forEachIdleRequest([&](OVInferRequestPtr& infer_request) { + infer_request->ReorderKVCache(src_indices, dst_indices); + }); +} + void BasicBackend::Infer(OrtKernelContext* ctx) const { Ort::KernelContext context(ctx); diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.h b/onnxruntime/core/providers/openvino/backends/basic_backend.h index 2cf3d3faa8b47..a1b052ea7aa98 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.h +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.h @@ -138,6 +138,7 @@ class BasicBackend : public IBackend { return exe_network_.Get(); } void RewindKVCache(size_t index) override; + void ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) override; private: bool ValidateSubgraph(std::map>& const_outputs_map); diff --git a/onnxruntime/core/providers/openvino/ibackend.h b/onnxruntime/core/providers/openvino/ibackend.h index 365a4625815d6..672fdbc218a78 100644 --- a/onnxruntime/core/providers/openvino/ibackend.h +++ b/onnxruntime/core/providers/openvino/ibackend.h @@ -18,6 +18,7 @@ class IBackend { virtual ov::CompiledModel GetOVCompiledModel() = 0; virtual ~IBackend() = default; virtual void RewindKVCache(size_t index) {} + virtual void ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) {} }; using ptr_stream_t = std::unique_ptr; class BackendFactory { diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index f9c9fa2ea6f48..1d419316d6f7e 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -288,6 +288,68 @@ common::Status OpenVINOExecutionProvider::SetEpDynamicOptions(gsl::span src_indices; + std::vector dst_indices; + + try { + // Parse source indices from comma-separated string + std::stringstream src_stream(src_string); + std::string src_token; + while (std::getline(src_stream, src_token, ',')) { + // Trim whitespace + src_token.erase(0, src_token.find_first_not_of(" \t")); + src_token.erase(src_token.find_last_not_of(" \t") + 1); + + if (!src_token.empty()) { + int64_t index = std::stoll(src_token); + if (index >= 0) { + src_indices.push_back(static_cast(index)); + } else { + LOGS_DEFAULT(WARNING) << "kvcache_reorder src_index is < 0: " << index; + } + } + } + + // Parse destination indices from comma-separated string + std::stringstream dst_stream(dst_string); + std::string dst_token; + while (std::getline(dst_stream, dst_token, ',')) { + // Trim whitespace + dst_token.erase(0, dst_token.find_first_not_of(" \t")); + dst_token.erase(dst_token.find_last_not_of(" \t") + 1); + + if (!dst_token.empty()) { + int64_t index = std::stoll(dst_token); + if (index >= 0) { + dst_indices.push_back(static_cast(index)); + } else { + LOGS_DEFAULT(WARNING) << "kvcache_reorder dst_index is < 0: " << index; + } + } + } + + } catch (const std::exception& e) { + LOGS_DEFAULT(WARNING) << "Conversion for kvcache_reorder string value to int64_t indices failed. " + << "Exception: " << e.what(); + return Status::OK(); + } + + // Trigger KVCache Reorder for target Backend with vector arguments + for (auto& backend : backend_managers_) { + backend.ReorderKVCache(src_indices, dst_indices); + } } else { // Handle unknown options LOGS_DEFAULT(WARNING) << "Unknown key/value pair - ignoring " << key << "/" << value; diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h index ba43fc4b94bd9..9429f0f149a8e 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.h +++ b/onnxruntime/core/providers/openvino/ov_interface.h @@ -132,6 +132,7 @@ class OVInferRequest { return ovInfReq; } virtual void RewindKVCache([[maybe_unused]] size_t index) {} + virtual void ReorderKVCache([[maybe_unused]] const std::vector& src_indices, [[maybe_unused]] const std::vector& dst_indices) {} }; class StatefulOVInferRequest : public OVInferRequest { @@ -140,6 +141,7 @@ class StatefulOVInferRequest : public OVInferRequest { void Infer() override; void RewindKVCache(size_t index) override; + void ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) override; void FillTensor(const std::string& tensor_name, const ov::element::Type& type, const std::vector& shape, int32_t fill_value); void CacheTensor(const std::string& tensor_name, std::vector& cache); From 5432bd4c6c7be08e02a3ca98153e3d3cd5fae8c6 Mon Sep 17 00:00:00 2001 From: Kotomi-Du Date: Fri, 5 Dec 2025 15:02:51 -0800 Subject: [PATCH 07/11] clean up code --- .../core/providers/openvino/ov_interface.cc | 35 ++++++------------- .../core/providers/openvino/ov_interface.h | 4 +-- .../openvino/ov_stateful_patch_utils.cc | 8 ++--- .../openvino/ov_stateful_patch_utils.h | 1 - 4 files changed, 17 insertions(+), 31 deletions(-) diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index fcf0ef222a5c0..dcb8f2d462400 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -423,17 +423,17 @@ void StatefulOVInferRequest::PreProcessInferRequest() { // TODO(ankit): Address this issue and implement the fix at the appropriate layer. FillTensor("beam_idx", ov::element::i32, {1}, 0); - if (src_idx_val.size() > 0) { - ov::Tensor src_idx_tensor = ov::Tensor(ov::element::i32, {src_idx_val.size()}); - for (int i = 0; i < src_idx_val.size(); ++i) { - src_idx_tensor.data()[i] = int32_t(src_idx_val[i]); + if (kv_src_indices.size() > 0) { + ov::Tensor src_idx_tensor = ov::Tensor(ov::element::i32, {kv_src_indices.size()}); + for (int i = 0; i < kv_src_indices.size(); ++i) { + src_idx_tensor.data()[i] = int32_t(kv_src_indices[i]); } ovInfReq.set_tensor("src_idx", src_idx_tensor); - ov::Tensor dst_idx_tensor = ov::Tensor(ov::element::i32, {1, 32, dst_idx_val.size(), 96}); - for (int i = 0; i < dst_idx_val.size(); ++i) { + ov::Tensor dst_idx_tensor = ov::Tensor(ov::element::i32, {1, 32, kv_dst_indices.size(), 96}); + for (int i = 0; i < kv_dst_indices.size(); ++i) { for (int j = 0; j < 32; ++j) { for (int k = 0; k < 96; ++k) { - dst_idx_tensor.data()[(j * dst_idx_val.size() + i) * 96 + k] = int32_t(dst_idx_val[i]); + dst_idx_tensor.data()[(j * kv_dst_indices.size() + i) * 96 + k] = int32_t(kv_dst_indices[i]); } } } @@ -492,25 +492,12 @@ void StatefulOVInferRequest::ReorderKVCache(const std::vector& src_indic LOGS_DEFAULT(INFO) << log_tag << "ReorderKVCache: Reordering OpenVINO-internal KVCache state with " << src_indices.size() << " index pairs"; - // set beam_idx and dst_idx based on provided values - src_idx_val.clear(); - dst_idx_val.clear(); + kv_src_indices.clear(); + kv_dst_indices.clear(); for (int i = 0; i < src_indices.size(); ++i) { - src_idx_val.emplace_back(src_indices[i]); - dst_idx_val.emplace_back(dst_indices[i]); + kv_src_indices.emplace_back(src_indices[i]); + kv_dst_indices.emplace_back(dst_indices[i]); } - /* - // Retrieve KVCache states and reorder them based on the provided indices - auto states = ovInfReq.query_state(); - - for (auto& state : states) { - auto start_time = std::chrono::high_resolution_clock::now(); - state.gather_by_axis(src_indices, dst_indices); - auto end_time = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(end_time - start_time).count(); - LOGS_DEFAULT(INFO) << log_tag << "gather_by_axis: " << duration << " microseconds"; - } - */ } void StatefulOVInferRequest::RewindKVCache(size_t index) { diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h index 9429f0f149a8e..c22039f5dc0b7 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.h +++ b/onnxruntime/core/providers/openvino/ov_interface.h @@ -157,8 +157,8 @@ class StatefulOVInferRequest : public OVInferRequest { bool prefill_use_full_chat_history = false; std::vector cached_input_ids; std::vector cached_position_ids; - std::vector src_idx_val; - std::vector dst_idx_val; + std::vector kv_src_indices; + std::vector kv_dst_indices; }; } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc index b534064d47c29..d831461d97935 100644 --- a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc +++ b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc @@ -121,17 +121,17 @@ void FuseCacheReorder(std::shared_ptr ov_model, beam_idx, ov::opset13::Constant::create(ov::element::i64, {}, {gather_dim})); - auto update_gather_op = + auto updatekv_gather_op = std::make_shared(gather_op, src_idx, ov::opset13::Constant::create(ov::element::i64, {}, {2})); - auto update_op = std::make_shared(gather_op, - dst_idx, update_gather_op, ov::opset13::Constant::create(ov::element::i64, {}, {2})); + auto updatekv_op = std::make_shared(gather_op, + dst_idx, updatekv_gather_op, ov::opset13::Constant::create(ov::element::i64, {}, {2})); // Replace the source output for all consumers of the input tensor for (auto& consumer : consumers) { - consumer.replace_source_output(update_op->output(0)); + consumer.replace_source_output(updatekv_op->output(0)); } } diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h index 76a3065910ee7..11b0cc1dbe9bb 100644 --- a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h +++ b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h @@ -13,7 +13,6 @@ #include "openvino/pass/manager.hpp" #include "openvino/pass/make_stateful.hpp" -#include "openvino/opsets/opset3.hpp" #include "openvino/opsets/opset12.hpp" #include "openvino/opsets/opset13.hpp" From c7f57bb4b7b37844e08705789e6353ad1779c2ec Mon Sep 17 00:00:00 2001 From: Kotomi-Du Date: Mon, 8 Dec 2025 21:29:45 -0800 Subject: [PATCH 08/11] add post process for internal handled inputs --- onnxruntime/core/providers/openvino/ov_interface.cc | 6 ++++++ onnxruntime/core/providers/openvino/ov_interface.h | 1 + 2 files changed, 7 insertions(+) diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index dcb8f2d462400..eb333251437ca 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -479,6 +479,12 @@ void StatefulOVInferRequest::PreProcessInferRequest() { void StatefulOVInferRequest::Infer() { PreProcessInferRequest(); OVInferRequest::Infer(); + PostProcessInferRequest(); +} + +void StatefulOVInferRequest::PostProcessInferRequest() { + kv_src_indices.clear(); + kv_dst_indices.clear(); } void StatefulOVInferRequest::ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) { diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h index c22039f5dc0b7..3d57f67600eb2 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.h +++ b/onnxruntime/core/providers/openvino/ov_interface.h @@ -150,6 +150,7 @@ class StatefulOVInferRequest : public OVInferRequest { private: void PreProcessInferRequest(); + void PostProcessInferRequest(); std::string target_device; // If prefill_use_full_chat_history is true, cache the "input_ids" & "position_ids" tensors, From 8f464d6402518913f6cf5a45b027b20b79ae63a3 Mon Sep 17 00:00:00 2001 From: Kotomi-Du Date: Thu, 11 Dec 2025 20:53:40 -0800 Subject: [PATCH 09/11] disable update_kvcache for npu + pass kv info --- .../openvino/openvino_execution_provider.cc | 2 +- .../core/providers/openvino/ov_interface.cc | 17 +++-- .../openvino/ov_stateful_patch_utils.cc | 70 ++++++++++++------- .../openvino/ov_stateful_patch_utils.h | 5 +- 4 files changed, 58 insertions(+), 36 deletions(-) diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index 1d419316d6f7e..b63ffa537e6b7 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -289,7 +289,7 @@ common::Status OpenVINOExecutionProvider::SetEpDynamicOptions(gsl::span& model, LOGS_DEFAULT(INFO) << log_tag << "Model IsStateful() Status:\t" << (model_status ? "True" : "False"); if (!model_status) { LOGS_DEFAULT(INFO) << log_tag << "Converting from Stateless OV Model to Stateful OV Model" << std::endl; - PatchStatefulDecoder(model); + PatchStatefulDecoder(model, hw_target); } if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) { @@ -422,25 +422,28 @@ void StatefulOVInferRequest::PreProcessInferRequest() { // Workaround: Setting the value here as it cannot be set at the ORT GenAI layer currently. // TODO(ankit): Address this issue and implement the fix at the appropriate layer. FillTensor("beam_idx", ov::element::i32, {1}, 0); - + ov::Shape dst_idx_shape = ovInfReq.get_tensor("dst_idx").get_shape(); + uint64_t kv_num_heads = dst_idx_shape[1]; + uint64_t kv_head_size = dst_idx_shape[3]; if (kv_src_indices.size() > 0) { ov::Tensor src_idx_tensor = ov::Tensor(ov::element::i32, {kv_src_indices.size()}); for (int i = 0; i < kv_src_indices.size(); ++i) { src_idx_tensor.data()[i] = int32_t(kv_src_indices[i]); } ovInfReq.set_tensor("src_idx", src_idx_tensor); - ov::Tensor dst_idx_tensor = ov::Tensor(ov::element::i32, {1, 32, kv_dst_indices.size(), 96}); + + ov::Tensor dst_idx_tensor = ov::Tensor(ov::element::i32, {1, kv_num_heads, kv_dst_indices.size(), kv_head_size}); for (int i = 0; i < kv_dst_indices.size(); ++i) { - for (int j = 0; j < 32; ++j) { - for (int k = 0; k < 96; ++k) { - dst_idx_tensor.data()[(j * kv_dst_indices.size() + i) * 96 + k] = int32_t(kv_dst_indices[i]); + for (int j = 0; j < kv_num_heads; ++j) { + for (int k = 0; k < kv_head_size; ++k) { + dst_idx_tensor.data()[(j * kv_dst_indices.size() + i) * kv_head_size + k] = int32_t(kv_dst_indices[i]); } } } ovInfReq.set_tensor("dst_idx", dst_idx_tensor); } else { FillTensor("src_idx", ov::element::i32, {0}, 0); - FillTensor("dst_idx", ov::element::i32, {1, 32, 0, 96}, 0); + FillTensor("dst_idx", ov::element::i32, {1, kv_num_heads, 0, kv_head_size}, 0); } // If 'prefill use full chat history' mode is enabled, we need to cache input_ids and position_ids. diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc index d831461d97935..d280ad0276330 100644 --- a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc +++ b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc @@ -75,11 +75,16 @@ std::string GetInputOutputName(std::shared_ptr ov_model, void FuseCacheReorder(std::shared_ptr ov_model, std::vector& not_kv_inputs, const std::vector& key_value_input_names, - int gather_dim) { + int gather_dim, + const std::string& device) { if (ModelHasInputOutputNames(ov_model, "beam_idx")) { throw std::runtime_error("Model already has fused cache"); } + // Flag to add Gather+ScatterElementsUpdate subgraph for LLM speculative decoding + // TO-DO: extend to NPU device when OpenVINO NPU has related optimization + bool is_support_speculative_LLM = device.find("GPU") != std::string::npos; + // Define input name candidates in priority order const std::vector input_name_candidates = { "inputs_embeds", // Default fallback @@ -99,17 +104,22 @@ void FuseCacheReorder(std::shared_ptr ov_model, ov_model->add_parameters({beam_idx}); not_kv_inputs.push_back(beam_idx->get_friendly_name()); - auto src_idx = std::make_shared(ov::element::i32, ov::PartialShape({update_shape[2]})); - src_idx->set_friendly_name("src_idx"); - src_idx->output(0).get_tensor().add_names({"src_idx"}); - ov_model->add_parameters({src_idx}); - not_kv_inputs.push_back(src_idx->get_friendly_name()); - - auto dst_idx = std::make_shared(ov::element::i32, update_shape); - dst_idx->set_friendly_name("dst_idx"); - dst_idx->output(0).get_tensor().add_names({"dst_idx"}); - ov_model->add_parameters({dst_idx}); - not_kv_inputs.push_back(dst_idx->get_friendly_name()); + std::shared_ptr src_idx; + std::shared_ptr dst_idx; + + if (is_support_speculative_LLM) { + src_idx = std::make_shared(ov::element::i32, ov::PartialShape({update_shape[2]})); + src_idx->set_friendly_name("src_idx"); + src_idx->output(0).get_tensor().add_names({"src_idx"}); + ov_model->add_parameters({src_idx}); + not_kv_inputs.push_back(src_idx->get_friendly_name()); + + dst_idx = std::make_shared(ov::element::i32, update_shape); + dst_idx->set_friendly_name("dst_idx"); + dst_idx->output(0).get_tensor().add_names({"dst_idx"}); + ov_model->add_parameters({dst_idx}); + not_kv_inputs.push_back(dst_idx->get_friendly_name()); + } // Go over all cache parameters and fuse _reorder_cache with indices provided by the new parameter beam_idx for (const auto& input_name : key_value_input_names) { @@ -121,17 +131,25 @@ void FuseCacheReorder(std::shared_ptr ov_model, beam_idx, ov::opset13::Constant::create(ov::element::i64, {}, {gather_dim})); - auto updatekv_gather_op = - std::make_shared(gather_op, - src_idx, - ov::opset13::Constant::create(ov::element::i64, {}, {2})); - - auto updatekv_op = std::make_shared(gather_op, - dst_idx, updatekv_gather_op, ov::opset13::Constant::create(ov::element::i64, {}, {2})); + std::shared_ptr output_node; + if (is_support_speculative_LLM) { + auto updatekv_gather_op = + std::make_shared(gather_op, + src_idx, + ov::opset13::Constant::create(ov::element::i64, {}, {2})); + + auto updatekv_op = std::make_shared(gather_op, + dst_idx, + updatekv_gather_op, + ov::opset13::Constant::create(ov::element::i64, {}, {2})); + output_node = updatekv_op; + } else { + output_node = gather_op; + } // Replace the source output for all consumers of the input tensor for (auto& consumer : consumers) { - consumer.replace_source_output(updatekv_op->output(0)); + consumer.replace_source_output(output_node->output(0)); } } @@ -269,7 +287,7 @@ std::pair, std::vector> ExtractInputKVTens } // Updated PatchStatefulDecoder function -void PatchStatefulDecoder(std::shared_ptr model) { +void PatchStatefulDecoder(std::shared_ptr model, const std::string& device) { // Use the dynamic pattern-based extraction logic auto [key_value_output_names, extracted_patterns] = ExtractKVPatternsFromOutputs(model); auto [key_value_input_names, not_kv_inputs] = ExtractInputKVTensors(model, extracted_patterns); @@ -279,10 +297,10 @@ void PatchStatefulDecoder(std::shared_ptr model) { } if (key_value_input_names.size() != key_value_output_names.size()) { - ORT_THROW("Found different sizes between key_value_input_names (", - key_value_input_names.size(), - ") and key_value_output_names (", - key_value_output_names.size(), + ORT_THROW("Found different sizes between key_value_input_names (", + key_value_input_names.size(), + ") and key_value_output_names (", + key_value_output_names.size(), "). They couldn't be paired."); } @@ -291,7 +309,7 @@ void PatchStatefulDecoder(std::shared_ptr model) { // batch_dim = 1 if config.model_type == "chatglm" and not hasattr(config, "rope_ratio") else 0 auto batch_dim = 0; - FuseCacheReorder(model, not_kv_inputs, key_value_input_names, batch_dim); + FuseCacheReorder(model, not_kv_inputs, key_value_input_names, batch_dim, device); MakeStateful(model, key_value_input_names, key_value_output_names); } diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h index 11b0cc1dbe9bb..ce7db01063426 100644 --- a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h +++ b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h @@ -26,13 +26,14 @@ bool ModelHasInputOutputNames(std::shared_ptr model, const std::strin void FuseCacheReorder(std::shared_ptr ov_model, std::vector& not_kv_inputs, const std::vector& key_value_input_names, - int gather_dim); + int gather_dim, + const std::string& device = ""); void MakeStateful(std::shared_ptr& ov_model, const std::vector& key_value_input_names, const std::vector& key_value_output_names); -void PatchStatefulDecoder(std::shared_ptr model); +void PatchStatefulDecoder(std::shared_ptr model, const std::string& device = ""); bool HasOpWithType(const std::shared_ptr& function, const std::string& type_name); From 203ee332183d31215f4ca2bce9d03b6e7a09194a Mon Sep 17 00:00:00 2001 From: Kotomi-Du Date: Thu, 11 Dec 2025 22:08:58 -0800 Subject: [PATCH 10/11] refactor code --- .../openvino/openvino_execution_provider.cc | 77 +++++++++---------- .../core/providers/openvino/ov_interface.cc | 8 +- 2 files changed, 40 insertions(+), 45 deletions(-) diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index b63ffa537e6b7..0642ad55b5526 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -293,57 +293,52 @@ common::Status OpenVINOExecutionProvider::SetEpDynamicOptions(gsl::span src_indices; - std::vector dst_indices; - - try { - // Parse source indices from comma-separated string - std::stringstream src_stream(src_string); - std::string src_token; - while (std::getline(src_stream, src_token, ',')) { - // Trim whitespace - src_token.erase(0, src_token.find_first_not_of(" \t")); - src_token.erase(src_token.find_last_not_of(" \t") + 1); - - if (!src_token.empty()) { - int64_t index = std::stoll(src_token); - if (index >= 0) { - src_indices.push_back(static_cast(index)); - } else { - LOGS_DEFAULT(WARNING) << "kvcache_reorder src_index is < 0: " << index; + auto parse_indices = [](const std::string& input, const std::string& index_type) -> std::pair> { + std::vector indices; + std::stringstream stream(input); + std::string token; + + try { + while (std::getline(stream, token, ',')) { + // Trim whitespace + token.erase(0, token.find_first_not_of(" \t")); + token.erase(token.find_last_not_of(" \t") + 1); + + if (!token.empty()) { + int64_t index = std::stoll(token); + if (index >= 0) { + indices.push_back(static_cast(index)); + } else { + return {Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, + "kvcache_reorder " + index_type + " cannot be negative: " + std::to_string(index)), + std::vector()}; + } } } + } catch (const std::exception& e) { + return {Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, + "Failed to parse kvcache_reorder " + index_type + ": " + std::string(e.what())), + std::vector()}; } - // Parse destination indices from comma-separated string - std::stringstream dst_stream(dst_string); - std::string dst_token; - while (std::getline(dst_stream, dst_token, ',')) { - // Trim whitespace - dst_token.erase(0, dst_token.find_first_not_of(" \t")); - dst_token.erase(dst_token.find_last_not_of(" \t") + 1); - - if (!dst_token.empty()) { - int64_t index = std::stoll(dst_token); - if (index >= 0) { - dst_indices.push_back(static_cast(index)); - } else { - LOGS_DEFAULT(WARNING) << "kvcache_reorder dst_index is < 0: " << index; - } - } - } + return {Status::OK(), std::move(indices)}; + }; - } catch (const std::exception& e) { - LOGS_DEFAULT(WARNING) << "Conversion for kvcache_reorder string value to int64_t indices failed. " - << "Exception: " << e.what(); - return Status::OK(); + auto [src_status, src_indices] = parse_indices(src_string, "src_index"); + if (!src_status.IsOK()) { + return src_status; + } + + auto [dst_status, dst_indices] = parse_indices(dst_string, "dst_index"); + if (!dst_status.IsOK()) { + return dst_status; } // Trigger KVCache Reorder for target Backend with vector arguments diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index 8817f791cc8bc..1234b7b4ade3d 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -427,15 +427,15 @@ void StatefulOVInferRequest::PreProcessInferRequest() { uint64_t kv_head_size = dst_idx_shape[3]; if (kv_src_indices.size() > 0) { ov::Tensor src_idx_tensor = ov::Tensor(ov::element::i32, {kv_src_indices.size()}); - for (int i = 0; i < kv_src_indices.size(); ++i) { + for (auto i = 0; i < kv_src_indices.size(); ++i) { src_idx_tensor.data()[i] = int32_t(kv_src_indices[i]); } ovInfReq.set_tensor("src_idx", src_idx_tensor); ov::Tensor dst_idx_tensor = ov::Tensor(ov::element::i32, {1, kv_num_heads, kv_dst_indices.size(), kv_head_size}); - for (int i = 0; i < kv_dst_indices.size(); ++i) { - for (int j = 0; j < kv_num_heads; ++j) { - for (int k = 0; k < kv_head_size; ++k) { + for (auto i = 0; i < kv_dst_indices.size(); ++i) { + for (auto j = 0; j < kv_num_heads; ++j) { + for (auto k = 0; k < kv_head_size; ++k) { dst_idx_tensor.data()[(j * kv_dst_indices.size() + i) * kv_head_size + k] = int32_t(kv_dst_indices[i]); } } From 7d201fa343889ccc9bb23fb43767c0f905eaee97 Mon Sep 17 00:00:00 2001 From: Kotomi-Du Date: Thu, 11 Dec 2025 22:33:46 -0800 Subject: [PATCH 11/11] minor change --- .../core/providers/openvino/ov_interface.cc | 48 +++++++++++-------- .../core/providers/openvino/ov_interface.h | 2 + .../openvino/ov_stateful_patch_utils.cc | 8 ++-- 3 files changed, 33 insertions(+), 25 deletions(-) diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index 1234b7b4ade3d..a1b518298903a 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -361,6 +361,7 @@ void OVInferRequest::Infer() { StatefulOVInferRequest::StatefulOVInferRequest(ov::InferRequest infer_request, std::string device) : OVInferRequest(std::move(infer_request)), target_device(device) { bool gpu_or_npu = ((device.find("NPU") != std::string::npos) || (device.find("GPU") != std::string::npos)); + is_support_kvcache_reorder = device.find("GPU") != std::string::npos; // check if there is input_ids tensors and if the tensor type is int64, // because logic prefill_use_full_chat_history is only for specific inputs and data type @@ -422,28 +423,31 @@ void StatefulOVInferRequest::PreProcessInferRequest() { // Workaround: Setting the value here as it cannot be set at the ORT GenAI layer currently. // TODO(ankit): Address this issue and implement the fix at the appropriate layer. FillTensor("beam_idx", ov::element::i32, {1}, 0); - ov::Shape dst_idx_shape = ovInfReq.get_tensor("dst_idx").get_shape(); - uint64_t kv_num_heads = dst_idx_shape[1]; - uint64_t kv_head_size = dst_idx_shape[3]; - if (kv_src_indices.size() > 0) { - ov::Tensor src_idx_tensor = ov::Tensor(ov::element::i32, {kv_src_indices.size()}); - for (auto i = 0; i < kv_src_indices.size(); ++i) { - src_idx_tensor.data()[i] = int32_t(kv_src_indices[i]); - } - ovInfReq.set_tensor("src_idx", src_idx_tensor); - ov::Tensor dst_idx_tensor = ov::Tensor(ov::element::i32, {1, kv_num_heads, kv_dst_indices.size(), kv_head_size}); - for (auto i = 0; i < kv_dst_indices.size(); ++i) { - for (auto j = 0; j < kv_num_heads; ++j) { - for (auto k = 0; k < kv_head_size; ++k) { - dst_idx_tensor.data()[(j * kv_dst_indices.size() + i) * kv_head_size + k] = int32_t(kv_dst_indices[i]); + if (is_support_kvcache_reorder){ + ov::Shape dst_idx_shape = ovInfReq.get_tensor("dst_idx").get_shape(); + uint64_t kv_num_heads = dst_idx_shape[1]; + uint64_t kv_head_size = dst_idx_shape[3]; + if (kv_src_indices.size() > 0) { + ov::Tensor src_idx_tensor = ov::Tensor(ov::element::i32, {kv_src_indices.size()}); + for (auto i = 0; i < kv_src_indices.size(); ++i) { + src_idx_tensor.data()[i] = int32_t(kv_src_indices[i]); + } + ovInfReq.set_tensor("src_idx", src_idx_tensor); + + ov::Tensor dst_idx_tensor = ov::Tensor(ov::element::i32, {1, kv_num_heads, kv_dst_indices.size(), kv_head_size}); + for (auto i = 0; i < kv_dst_indices.size(); ++i) { + for (auto j = 0; j < kv_num_heads; ++j) { + for (auto k = 0; k < kv_head_size; ++k) { + dst_idx_tensor.data()[(j * kv_dst_indices.size() + i) * kv_head_size + k] = int32_t(kv_dst_indices[i]); + } + } } + ovInfReq.set_tensor("dst_idx", dst_idx_tensor); + } else { + FillTensor("src_idx", ov::element::i32, {0}, 0); + FillTensor("dst_idx", ov::element::i32, {1, kv_num_heads, 0, kv_head_size}, 0); } - } - ovInfReq.set_tensor("dst_idx", dst_idx_tensor); - } else { - FillTensor("src_idx", ov::element::i32, {0}, 0); - FillTensor("dst_idx", ov::element::i32, {1, kv_num_heads, 0, kv_head_size}, 0); } // If 'prefill use full chat history' mode is enabled, we need to cache input_ids and position_ids. @@ -486,8 +490,10 @@ void StatefulOVInferRequest::Infer() { } void StatefulOVInferRequest::PostProcessInferRequest() { - kv_src_indices.clear(); - kv_dst_indices.clear(); + if(is_support_kvcache_reorder){ + kv_src_indices.clear(); + kv_dst_indices.clear(); + } } void StatefulOVInferRequest::ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) { diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h index 3d57f67600eb2..2d70cc505f871 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.h +++ b/onnxruntime/core/providers/openvino/ov_interface.h @@ -158,6 +158,8 @@ class StatefulOVInferRequest : public OVInferRequest { bool prefill_use_full_chat_history = false; std::vector cached_input_ids; std::vector cached_position_ids; + + bool is_support_kvcache_reorder = false; std::vector kv_src_indices; std::vector kv_dst_indices; }; diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc index d280ad0276330..cda2fed1fe3e2 100644 --- a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc +++ b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc @@ -81,9 +81,9 @@ void FuseCacheReorder(std::shared_ptr ov_model, throw std::runtime_error("Model already has fused cache"); } - // Flag to add Gather+ScatterElementsUpdate subgraph for LLM speculative decoding + // Flag to add Gather+ScatterElementsUpdate subgraph to reorder KV cache for LLM speculative decoding // TO-DO: extend to NPU device when OpenVINO NPU has related optimization - bool is_support_speculative_LLM = device.find("GPU") != std::string::npos; + bool is_support_kvcache_reorder = device.find("GPU") != std::string::npos; // Define input name candidates in priority order const std::vector input_name_candidates = { @@ -107,7 +107,7 @@ void FuseCacheReorder(std::shared_ptr ov_model, std::shared_ptr src_idx; std::shared_ptr dst_idx; - if (is_support_speculative_LLM) { + if (is_support_kvcache_reorder) { src_idx = std::make_shared(ov::element::i32, ov::PartialShape({update_shape[2]})); src_idx->set_friendly_name("src_idx"); src_idx->output(0).get_tensor().add_names({"src_idx"}); @@ -132,7 +132,7 @@ void FuseCacheReorder(std::shared_ptr ov_model, ov::opset13::Constant::create(ov::element::i64, {}, {gather_dim})); std::shared_ptr output_node; - if (is_support_speculative_LLM) { + if (is_support_kvcache_reorder) { auto updatekv_gather_op = std::make_shared(gather_op, src_idx,