Fix after rebasing

wine99 · wine99 · commit e180b86fd1c0 · 2025-09-04T17:42:39.000+08:00
- Layouts of cache k and cache v are unified: [seq, n_head, head_size]
- Add CPY and FLASH_ATTN_EXT; flash attn is not used yet
- Skip test-backend-ops due to flash attn test crash
- Add mutex around graph conversion to avoid test-thread-safety failures in the future
- Update NPU config
- Update GPU config to disable SDPA opt to make phi-3 run
diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -73,6 +73,11 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph,
 }
 
 GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph) {
+    if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
+        std::string filename = "cgraph.txt";
+        dump_cgraph(cgraph, filename);
+    }
+
     m_cgraph = cgraph;
     for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
         auto* cur_node = cgraph->nodes[node_n];
@@ -173,49 +178,46 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) {
             break;
         }
         case GGML_OP_CONT: {
-            if (ggml_nelements(node->src[0]) == ggml_nelements(node->src[0]->view_src)) {
-                // The input comes from a PERMUTE
-                m_op_case = 1;
-            } else {
-                // The input comes from a VIEW which is subtensor
-                m_op_case = 2;
-            }
-            break;
-        }
-        case GGML_OP_SET_ROWS: {
-            if (std::string(node->name).find("cache_k") == 0) {
+            if (node->src[0]->op == GGML_OP_PERMUTE) {
                 m_op_case = 1;
-            } else {
+            } else if (node->src[0]->op == GGML_OP_TRANSPOSE) {
                 m_op_case = 2;
+            } else if (node->src[0]->op == GGML_OP_VIEW) {
+                // The input comes from a VIEW which is subtensor
+                m_op_case = 3;
             }
             break;
         }
         case GGML_OP_PERMUTE: {
-            if (node->src[0]->view_src == nullptr) {
-                // Permute Qcur
+            if (node->src[0]->op != GGML_OP_VIEW) {
                 m_op_case = 1;
             } else if (ggml_is_contiguous(node->src[0])) {
                 // Permute cache_k (view)
                 m_op_case = 2;
             } else {
-                // Permute cache_v (view)
+                // Permute cache_v (view), deprecated, cache_v will also fall to case 2
+                m_op_case = 3;
+            }
+            break;
+        }
+        case GGML_OP_MUL_MAT: {
+            if (node->src[0]->op == GGML_OP_CONT && node->src[0]->src[0]->op == GGML_OP_TRANSPOSE) {
+                m_op_case = 2;
+            } else if (node->src[0]->op == GGML_OP_VIEW && node->src[1]->op == GGML_OP_VIEW) {
+                // test-backend-ops case
                 m_op_case = 3;
             }
             break;
         }
         case GGML_OP_GET_ROWS: {
             if (node->src[1]->op == GGML_OP_VIEW) {
                 m_op_case = 2;
-            } else {
-                m_op_case = 1;
             }
             break;
         }
         case GGML_OP_ROPE: {
             if (node->src[0]->op == GGML_OP_VIEW) {
                 m_op_case = 2;
-            } else {
-                m_op_case = 1;
             }
             break;
         }
@@ -270,19 +272,9 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co
     } else if (name.find("cache_k") == 0) {
         input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size};
     } else if (name.find("cache_v") == 0) {
-        input_shape = ov::PartialShape{m_num_heads_kv, m_head_size, m_context_size};
+        input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size};
     } else if (const auto* op = get_tensor_used_op(src); op && op->op == GGML_OP_SET_ROWS) {
-        input_shape = ov::PartialShape{1, 1, -1};
-        if (m_is_static) {
-            if (m_is_first_token) {
-                // Dummy static shape, since the indices are not used in this case
-                input_shape = ov::PartialShape{1};
-            } else if (std::string(op->name).find("cache_k") == 0) {
-                input_shape = ov::PartialShape{1, 1, 1};
-            } else {
-                input_shape = ov::PartialShape{1, 1, m_num_heads_kv * m_head_size};
-            }
-        }
+        input_shape = ov::PartialShape{1, 1, m_is_static ? 1 : -1};
     } else if (src->op == GGML_OP_VIEW) {
         // This case is added to make test-backend-ops work
         input_shape = ov::PartialShape{get_shape(src->view_src)};
diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp
@@ -270,12 +270,14 @@ static bool is_op_unsupported_case(const ggml_tensor* op) {
         }
     }
 
-    if (op->op == GGML_OP_MUL_MAT) {
-        if ((op->src[0]->view_src && op->src[0]->op != GGML_OP_PERMUTE) ||
-            (op->src[1]->view_src && op->src[1]->op != GGML_OP_PERMUTE)) {
-            GGML_LOG_WARN("OpenVINO backend does not support MUL_MAT with view_src tensors that are not PERMUTE\n");
+    if (op->op == GGML_OP_CPY) {
+        if (op->src[1] != op) {
+            GGML_LOG_WARN("OpenVINO backend only supports CPY that is a cast\n");
             return true;
         }
+    }
+
+    if (op->op == GGML_OP_MUL_MAT) {
         if (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F16) {
             // Has accuracy issue, try enabling this and see `test-backend-ops -o "MUL_MAT"`
             GGML_LOG_WARN("OpenVINO backend does not support MUL_MAT with two F16 tensors\n");
@@ -346,7 +348,9 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
                                                  GGML_OP_RMS_NORM,
                                                  GGML_OP_SCALE,
                                                  GGML_OP_SOFT_MAX,
-                                                 GGML_OP_SET_ROWS};
+                                                 GGML_OP_SET_ROWS,
+                                                 GGML_OP_FLASH_ATTN_EXT,
+                                                 GGML_OP_CPY};
     static const std::set<ggml_unary_op> supported_unary_ops{
         GGML_UNARY_OP_SILU,
     };
diff --git a/ggml/src/ggml-openvino/openvino/op/cont.cpp b/ggml/src/ggml-openvino/openvino/op/cont.cpp
@@ -19,7 +19,7 @@ OutputVector translate_cont(const NodeContext& context) {
     num_inputs_check(context, 1, 1);
 
     int op_case = context.get_op_case();
-    FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CONT case");
+    FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported CONT case");
 
     auto src_shape = context.get_input_shape(0).to_shape();
     auto dst_shape = context.get_output_shape(0).to_shape();
@@ -32,6 +32,9 @@ OutputVector translate_cont(const NodeContext& context) {
             context.get_input(0),
             ov::op::v0::Constant::create(ov::element::i64, {dst_shape.size()}, dst_shape),
             false);
+    } else if (op_case == 2) {
+        // The input comes from a TRANSPOSE
+        return {context.get_input(0)};
     } else {
         // The input comes from a VIEW
         res = process_view_input(context, 0);
diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp
@@ -0,0 +1,20 @@
+#include <memory>
+#include <openvino/op/convert.hpp>
+#include "../node_context.hpp"
+#include "../op_table.hpp"
+#include "../utils.hpp"
+
+namespace ov {
+namespace frontend {
+namespace ggml {
+namespace op {
+
+OutputVector translate_cpy(const NodeContext& context) {
+    auto res = std::make_shared<ov::op::v0::Convert>(context.get_input(0), context.get_output_type(0));
+    return rename_outputs_with_suffix({res}, context.get_name());
+}
+
+}  // namespace op
+}  // namespace ggml
+}  // namespace frontend
+}  // namespace ov
diff --git a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp
@@ -0,0 +1,35 @@
+#include <memory>
+#include <openvino/op/convert.hpp>
+#include <openvino/op/scaled_dot_product_attention.hpp>
+#include "../node_context.hpp"
+#include "../op_table.hpp"
+#include "../utils.hpp"
+
+namespace ov {
+namespace frontend {
+namespace ggml {
+namespace op {
+
+OutputVector translate_flash_attn_ext(const NodeContext& context) {
+    num_inputs_check(context, 4, 4);
+    auto q_f32 = context.get_input(0);
+    auto k = context.get_input(1);
+    auto v = context.get_input(2);
+    auto mask = context.get_input(3);
+
+    float* params = reinterpret_cast<float*>(context.get_output_op_params(0));
+    float scale         = params[0];
+    // float max_bias      = params[1];
+    // float logit_softcap = params[2];
+
+    auto q = std::make_shared<ov::op::v0::Convert>(q_f32, ov::element::f16);
+    auto scale_node = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{}, std::vector<float>{scale});
+    auto res = std::make_shared<ov::op::v13::ScaledDotProductAttention>(q, k, v , mask, scale_node, false);
+    auto res_f32 = std::make_shared<ov::op::v0::Convert>(res, ov::element::f32);
+    return rename_outputs_with_suffix({res_f32}, context.get_name());
+}
+
+}  // namespace op
+}  // namespace ggml
+}  // namespace frontend
+}  // namespace ov
diff --git a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp
@@ -21,7 +21,6 @@ OutputVector translate_get_rows(const NodeContext& context) {
     num_inputs_check(context, 2, 2);
 
     int op_case = context.get_op_case();
-    FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CONT case");
 
     Output<Node> res;
     auto data = context.get_input(0);
diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
@@ -27,15 +27,26 @@ namespace op {
 OutputVector translate_mulmat(const NodeContext& context) {
     num_inputs_check(context, 2, 2);
 
+    int op_case = context.get_op_case();
+
     ov::Output<Node> res;
     ov::Output<ov::Node> B = context.get_input(0);
     ov::Output<ov::Node> A = context.get_input(1);
 
+    bool transpose_b = true;
+    if (op_case == 2) {
+        B = B.get_node_shared_ptr()->input_value(0);
+        transpose_b = false;
+    } else if (op_case == 3) {
+        B = process_view_input(context, 0);
+        A = process_view_input(context, 1);
+    }
+
     bool convert_out_type = false;
     if (ov::op::util::is_constant(B.get_node()) && context.get_input_type(0) != context.get_input_type(1)) {
-        B = std::make_shared<ov::op::v0::Convert>(context.get_input(0), context.get_input_type(1));
+        B = std::make_shared<ov::op::v0::Convert>(B, context.get_input_type(1));
     } else if (context.get_input_type(0) != context.get_input_type(1)) {
-        A = std::make_shared<ov::op::v0::Convert>(context.get_input(1), context.get_input_type(0));
+        A = std::make_shared<ov::op::v0::Convert>(A, context.get_input_type(0));
         convert_out_type = true;
     }
 
@@ -72,10 +83,10 @@ OutputVector translate_mulmat(const NodeContext& context) {
         }
 
         if (convert_out_type) {
-            auto result_lp = std::make_shared<ov::op::v0::MatMul>(A, B, false, true);
+            auto result_lp = std::make_shared<ov::op::v0::MatMul>(A, B, false, transpose_b);
             res = std::make_shared<ov::op::v0::Convert>(result_lp, context.get_output_type(0));
         } else {
-            res = std::make_shared<ov::op::v0::MatMul>(A, B, false, true);
+            res = std::make_shared<ov::op::v0::MatMul>(A, B, false, transpose_b);
         }
 
         return rename_outputs_with_suffix({res}, context.get_name());
diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp
@@ -21,13 +21,12 @@ OutputVector translate_permute(const NodeContext& context) {
     num_inputs_check(context, 1, 1);
 
     int op_case = context.get_op_case();
-    FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported CONT case");
+    FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported PERMUTE case");
     ov::Output<Node> res;
 
     if (op_case == 1) {
-        auto perm = argsort_descend(context.get_output_stride(0));
         res = std::make_shared<ov::op::v1::Transpose>(context.get_input(0),
-                                                      ov::op::v0::Constant::create(ov::element::i64, {3}, perm));
+                                                      ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
     } else {
         auto src = context.get_input(0);
         auto attention_size = context.get_input("attention_size");
diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp
@@ -27,7 +27,6 @@ OutputVector translate_rope(const NodeContext& context) {
     num_inputs_check(context, 2, 3);
 
     int op_case = context.get_op_case();
-    FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CONT case");
 
     ov::Output<Node> res;
 
diff --git a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp
@@ -32,21 +32,7 @@ OutputVector translate_set_rows(const NodeContext& context) {
     FRONT_END_OP_CONVERSION_CHECK(dst_shape[0] == 1, "Unsupported shape in SET_ROWS");
 
     if (context.is_static() && context.is_first_token()) {
-        Output<Node> res;
-        if (context.get_op_case() == 2) {
-            res = std::make_shared<ov::op::v1::Reshape>(
-                data,
-                ov::op::v0::Constant::create(
-                    ov::element::i64,
-                    {3},
-                    {context.get_context_size(), context.get_num_heads_kv(), context.get_head_size()}),
-                false);
-            res = std::make_shared<ov::op::v1::Transpose>(
-                res, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 2, 0}));
-        } else {
-            res = data;
-        }
-        return rename_outputs_with_suffix({res}, context.get_name());
+        return rename_outputs_with_suffix({data}, context.get_name());
     }
 
     auto indices = context.get_input(1);
diff --git a/ggml/src/ggml-openvino/openvino/op/softmax.cpp b/ggml/src/ggml-openvino/openvino/op/softmax.cpp
diff --git a/ggml/src/ggml-openvino/openvino/op/transpose.cpp b/ggml/src/ggml-openvino/openvino/op/transpose.cpp
@@ -12,9 +12,8 @@ namespace op {
 OutputVector translate_transpose(const NodeContext& context) {
     num_inputs_check(context, 1, 1);
 
-    auto perm = argsort_descend(context.get_output_stride(0));
     auto res = std::make_shared<ov::op::v1::Transpose>(context.get_input(0),
-                                                       ov::op::v0::Constant::create(ov::element::i64, {3}, perm));
+                                                       ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 2, 1}));
     return rename_outputs_with_suffix({res}, context.get_name());
 }
 
diff --git a/ggml/src/ggml-openvino/openvino/op_table.cpp b/ggml/src/ggml-openvino/openvino/op_table.cpp
@@ -16,25 +16,27 @@ namespace ggml {
 std::unordered_map<std::string, CreatorFunction> get_supported_ops() {
     using namespace ov::op;
     return {
-        {"GGML_OP_ADD",        op::translate_1to1_match_2_inputs<v1::Add>     },
-        {"GGML_OP_ADD1",       op::translate_1to1_match_2_inputs<v1::Add>     },
-        {"GGML_OP_CONT",       op::translate_cont                             },
-        {"GGML_OP_DIV",        op::translate_1to1_match_2_inputs<v1::Divide>  },
-        {"GGML_OP_GET_ROWS",   op::translate_get_rows                         },
-        {"GGML_OP_MUL",        op::translate_1to1_match_2_inputs<v1::Multiply>},
-        {"GGML_OP_MUL_MAT",    op::translate_mulmat                           },
-        {"GGML_OP_PERMUTE",    op::translate_permute                          },
-        {"GGML_OP_RESHAPE",    op::translate_reshape                          },
-        {"GGML_OP_RMS_NORM",   op::translate_rms_norm                         },
-        {"GGML_OP_ROPE",       op::translate_rope                             },
-        {"GGML_OP_SCALE",      op::translate_scale                            },
-        {"GGML_OP_SOFT_MAX",   op::translate_soft_max                         },
-        {"GGML_OP_SUB",        op::translate_1to1_match_2_inputs<v1::Subtract>},
-        {"GGML_OP_TRANSPOSE",  op::translate_transpose                        },
-        {"GGML_UNARY_OP_SILU", op::translate_unary_silu                       },
-        {"GGML_OP_VIEW",       op::translate_view                             },
-        {"GGML_GLU_OP_SWIGLU", op::translate_glu_swiglu                       },
-        {"GGML_OP_SET_ROWS",   op::translate_set_rows                         },
+        {"GGML_OP_ADD",            op::translate_1to1_match_2_inputs<v1::Add>     },
+        {"GGML_OP_ADD1",           op::translate_1to1_match_2_inputs<v1::Add>     },
+        {"GGML_OP_CONT",           op::translate_cont                             },
+        {"GGML_OP_DIV",            op::translate_1to1_match_2_inputs<v1::Divide>  },
+        {"GGML_OP_GET_ROWS",       op::translate_get_rows                         },
+        {"GGML_OP_MUL",            op::translate_1to1_match_2_inputs<v1::Multiply>},
+        {"GGML_OP_MUL_MAT",        op::translate_mulmat                           },
+        {"GGML_OP_PERMUTE",        op::translate_permute                          },
+        {"GGML_OP_RESHAPE",        op::translate_reshape                          },
+        {"GGML_OP_RMS_NORM",       op::translate_rms_norm                         },
+        {"GGML_OP_ROPE",           op::translate_rope                             },
+        {"GGML_OP_SCALE",          op::translate_scale                            },
+        {"GGML_OP_SOFT_MAX",       op::translate_soft_max                         },
+        {"GGML_OP_SUB",            op::translate_1to1_match_2_inputs<v1::Subtract>},
+        {"GGML_OP_TRANSPOSE",      op::translate_transpose                        },
+        {"GGML_UNARY_OP_SILU",     op::translate_unary_silu                       },
+        {"GGML_OP_VIEW",           op::translate_view                             },
+        {"GGML_GLU_OP_SWIGLU",     op::translate_glu_swiglu                       },
+        {"GGML_OP_SET_ROWS",       op::translate_set_rows                         },
+        {"GGML_OP_CPY",            op::translate_cpy                              },
+        {"GGML_OP_FLASH_ATTN_EXT", op::translate_flash_attn_ext                   },
     };
 }
 
diff --git a/ggml/src/ggml-openvino/openvino/op_table.hpp b/ggml/src/ggml-openvino/openvino/op_table.hpp
@@ -26,6 +26,8 @@ GGML_OP_CONVERTER(translate_transpose);
 GGML_OP_CONVERTER(translate_view);
 GGML_OP_CONVERTER(translate_glu_swiglu);
 GGML_OP_CONVERTER(translate_set_rows);
+GGML_OP_CONVERTER(translate_cpy);
+GGML_OP_CONVERTER(translate_flash_attn_ext);
 
 } // namespace op
 
diff --git a/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp b/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp
@@ -40,11 +40,9 @@ FuseToSDPA::FuseToSDPA() {
         auto mask = pattern_to_output[m_mask];
         auto scale = pattern_to_output[m_scale];
 
-        auto v_trans =
-            register_new_node<ov::op::v1::Transpose>(v, ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 2, 1}));
         auto mask_f16 = register_new_node<ov::op::v0::Convert>(mask, ov::element::f16);
         auto scale_f16 = register_new_node<ov::op::v0::Convert>(scale, ov::element::f16);
-        auto sdpa = std::make_shared<ov::op::v13::ScaledDotProductAttention>(q, k, v_trans, mask_f16, scale_f16, false);
+        auto sdpa = std::make_shared<ov::op::v13::ScaledDotProductAttention>(q, k, v, mask_f16, scale_f16, false);
 
         ov::replace_node(m.get_match_root(), sdpa);
         ov::copy_runtime_info(m.get_matched_nodes(), sdpa);
diff --git a/ggml/src/ggml-openvino/openvino/utils.cpp b/ggml/src/ggml-openvino/openvino/utils.cpp
@@ -65,6 +65,7 @@ OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std::
         name += "_";
         name += suffix;
         node->set_friendly_name(name);
+        // std::cout << name << "  " << output.get_partial_shape() << std::endl;
     }
     return outputs;
 }
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h