Skip to content

Commit e180b86

Browse files
committed
Fix after rebasing
- The layouts of cache k and cache v are unified: [seq, n_head, head_size]
- Add CPY and FLASH_ATTN_EXT; flash attn is not used yet
- Skip test-backend-ops due to a flash attn test crash
- Add a mutex around graph conversion to avoid test-thread-safety failures in the future
- Update NPU config
- Update GPU config to disable the SDPA optimization so that phi-3 runs
1 parent 73dfc75 commit e180b86

File tree

18 files changed

+244
-173
lines changed

18 files changed

+244
-173
lines changed

ggml/src/ggml-openvino/ggml-decoder.cpp

Lines changed: 23 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,11 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph,
7373
}
7474

7575
GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph) {
76+
if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
77+
std::string filename = "cgraph.txt";
78+
dump_cgraph(cgraph, filename);
79+
}
80+
7681
m_cgraph = cgraph;
7782
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
7883
auto* cur_node = cgraph->nodes[node_n];
@@ -173,49 +178,46 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) {
173178
break;
174179
}
175180
case GGML_OP_CONT: {
176-
if (ggml_nelements(node->src[0]) == ggml_nelements(node->src[0]->view_src)) {
177-
// The input comes from a PERMUTE
178-
m_op_case = 1;
179-
} else {
180-
// The input comes from a VIEW which is subtensor
181-
m_op_case = 2;
182-
}
183-
break;
184-
}
185-
case GGML_OP_SET_ROWS: {
186-
if (std::string(node->name).find("cache_k") == 0) {
181+
if (node->src[0]->op == GGML_OP_PERMUTE) {
187182
m_op_case = 1;
188-
} else {
183+
} else if (node->src[0]->op == GGML_OP_TRANSPOSE) {
189184
m_op_case = 2;
185+
} else if (node->src[0]->op == GGML_OP_VIEW) {
186+
// The input comes from a VIEW which is subtensor
187+
m_op_case = 3;
190188
}
191189
break;
192190
}
193191
case GGML_OP_PERMUTE: {
194-
if (node->src[0]->view_src == nullptr) {
195-
// Permute Qcur
192+
if (node->src[0]->op != GGML_OP_VIEW) {
196193
m_op_case = 1;
197194
} else if (ggml_is_contiguous(node->src[0])) {
198195
// Permute cache_k (view)
199196
m_op_case = 2;
200197
} else {
201-
// Permute cache_v (view)
198+
// Permute cache_v (view), deprecated, cache_v will also fall to case 2
199+
m_op_case = 3;
200+
}
201+
break;
202+
}
203+
case GGML_OP_MUL_MAT: {
204+
if (node->src[0]->op == GGML_OP_CONT && node->src[0]->src[0]->op == GGML_OP_TRANSPOSE) {
205+
m_op_case = 2;
206+
} else if (node->src[0]->op == GGML_OP_VIEW && node->src[1]->op == GGML_OP_VIEW) {
207+
// test-backend-ops case
202208
m_op_case = 3;
203209
}
204210
break;
205211
}
206212
case GGML_OP_GET_ROWS: {
207213
if (node->src[1]->op == GGML_OP_VIEW) {
208214
m_op_case = 2;
209-
} else {
210-
m_op_case = 1;
211215
}
212216
break;
213217
}
214218
case GGML_OP_ROPE: {
215219
if (node->src[0]->op == GGML_OP_VIEW) {
216220
m_op_case = 2;
217-
} else {
218-
m_op_case = 1;
219221
}
220222
break;
221223
}
@@ -270,19 +272,9 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co
270272
} else if (name.find("cache_k") == 0) {
271273
input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size};
272274
} else if (name.find("cache_v") == 0) {
273-
input_shape = ov::PartialShape{m_num_heads_kv, m_head_size, m_context_size};
275+
input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size};
274276
} else if (const auto* op = get_tensor_used_op(src); op && op->op == GGML_OP_SET_ROWS) {
275-
input_shape = ov::PartialShape{1, 1, -1};
276-
if (m_is_static) {
277-
if (m_is_first_token) {
278-
// Dummy static shape, since the indices are not used in this case
279-
input_shape = ov::PartialShape{1};
280-
} else if (std::string(op->name).find("cache_k") == 0) {
281-
input_shape = ov::PartialShape{1, 1, 1};
282-
} else {
283-
input_shape = ov::PartialShape{1, 1, m_num_heads_kv * m_head_size};
284-
}
285-
}
277+
input_shape = ov::PartialShape{1, 1, m_is_static ? 1 : -1};
286278
} else if (src->op == GGML_OP_VIEW) {
287279
// This case is added to make test-backend-ops work
288280
input_shape = ov::PartialShape{get_shape(src->view_src)};

ggml/src/ggml-openvino/ggml-openvino.cpp

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -270,12 +270,14 @@ static bool is_op_unsupported_case(const ggml_tensor* op) {
270270
}
271271
}
272272

273-
if (op->op == GGML_OP_MUL_MAT) {
274-
if ((op->src[0]->view_src && op->src[0]->op != GGML_OP_PERMUTE) ||
275-
(op->src[1]->view_src && op->src[1]->op != GGML_OP_PERMUTE)) {
276-
GGML_LOG_WARN("OpenVINO backend does not support MUL_MAT with view_src tensors that are not PERMUTE\n");
273+
if (op->op == GGML_OP_CPY) {
274+
if (op->src[1] != op) {
275+
GGML_LOG_WARN("OpenVINO backend only supports CPY that is a cast\n");
277276
return true;
278277
}
278+
}
279+
280+
if (op->op == GGML_OP_MUL_MAT) {
279281
if (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F16) {
280282
// Has accuracy issue, try enabling this and see `test-backend-ops -o "MUL_MAT"`
281283
GGML_LOG_WARN("OpenVINO backend does not support MUL_MAT with two F16 tensors\n");
@@ -346,7 +348,9 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
346348
GGML_OP_RMS_NORM,
347349
GGML_OP_SCALE,
348350
GGML_OP_SOFT_MAX,
349-
GGML_OP_SET_ROWS};
351+
GGML_OP_SET_ROWS,
352+
GGML_OP_FLASH_ATTN_EXT,
353+
GGML_OP_CPY};
350354
static const std::set<ggml_unary_op> supported_unary_ops{
351355
GGML_UNARY_OP_SILU,
352356
};

ggml/src/ggml-openvino/openvino/op/cont.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ OutputVector translate_cont(const NodeContext& context) {
1919
num_inputs_check(context, 1, 1);
2020

2121
int op_case = context.get_op_case();
22-
FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CONT case");
22+
FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported CONT case");
2323

2424
auto src_shape = context.get_input_shape(0).to_shape();
2525
auto dst_shape = context.get_output_shape(0).to_shape();
@@ -32,6 +32,9 @@ OutputVector translate_cont(const NodeContext& context) {
3232
context.get_input(0),
3333
ov::op::v0::Constant::create(ov::element::i64, {dst_shape.size()}, dst_shape),
3434
false);
35+
} else if (op_case == 2) {
36+
// The input comes from a TRANSPOSE
37+
return {context.get_input(0)};
3538
} else {
3639
// The input comes from a VIEW
3740
res = process_view_input(context, 0);
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
#include <memory>
2+
#include <openvino/op/convert.hpp>
3+
#include "../node_context.hpp"
4+
#include "../op_table.hpp"
5+
#include "../utils.hpp"
6+
7+
namespace ov {
8+
namespace frontend {
9+
namespace ggml {
10+
namespace op {
11+
12+
/// Translates a CPY node into an OpenVINO Convert op.
///
/// The first input is converted to the node's declared output element type.
///
/// @param context Node context of the op being translated.
/// @return The converted output, renamed with the node's name as suffix.
OutputVector translate_cpy(const NodeContext& context) {
    const auto target_type = context.get_output_type(0);
    auto converted = std::make_shared<ov::op::v0::Convert>(context.get_input(0), target_type);
    return rename_outputs_with_suffix({converted}, context.get_name());
}
16+
17+
} // namespace op
18+
} // namespace ggml
19+
} // namespace frontend
20+
} // namespace ov
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
#include <memory>
2+
#include <openvino/op/convert.hpp>
3+
#include <openvino/op/scaled_dot_product_attention.hpp>
4+
#include "../node_context.hpp"
5+
#include "../op_table.hpp"
6+
#include "../utils.hpp"
7+
8+
namespace ov {
9+
namespace frontend {
10+
namespace ggml {
11+
namespace op {
12+
13+
/// Translates a FLASH_ATTN_EXT node into an OpenVINO ScaledDotProductAttention (opset 13) op.
///
/// Exactly four inputs are required: 0 = Q, 1 = K, 2 = V, 3 = attention mask.
/// Q is converted to f16 before the attention op, and the attention result is
/// converted to f32 before being returned. K, V and the mask are passed through
/// unchanged (NOTE(review): presumably they are already f16 -- confirm against callers).
///
/// The node's op params are read as floats: [0] = scale, [1] = max_bias,
/// [2] = logit_softcap. Only `scale` is forwarded to the attention op, so any
/// non-zero `max_bias` or `logit_softcap` is rejected as not implemented instead
/// of being silently dropped.
///
/// @param context Node context of the op being translated.
/// @return The f32 attention output, renamed with the node's name as suffix.
OutputVector translate_flash_attn_ext(const NodeContext& context) {
    num_inputs_check(context, 4, 4);
    auto q_f32 = context.get_input(0);
    auto k = context.get_input(1);
    auto v = context.get_input(2);
    auto mask = context.get_input(3);

    const float* params = reinterpret_cast<const float*>(context.get_output_op_params(0));
    const float scale = params[0];
    const float max_bias = params[1];
    const float logit_softcap = params[2];

    // The SDPA op below has no equivalent of these two parameters; translating a node
    // that relies on them would produce wrong attention scores without any diagnostic.
    FRONT_END_CHECK_IMPLEMENTED(max_bias == 0.0f && logit_softcap == 0.0f,
                                "Unsupported FLASH_ATTN_EXT case: max_bias and logit_softcap must be 0");

    auto q = std::make_shared<ov::op::v0::Convert>(q_f32, ov::element::f16);
    // Scalar f16 constant carrying the softmax scale.
    auto scale_node = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{}, std::vector<float>{scale});
    // Last argument is the `causal` flag: masking is driven by the explicit mask input.
    auto res = std::make_shared<ov::op::v13::ScaledDotProductAttention>(q, k, v, mask, scale_node, false);
    auto res_f32 = std::make_shared<ov::op::v0::Convert>(res, ov::element::f32);
    return rename_outputs_with_suffix({res_f32}, context.get_name());
}
31+
32+
} // namespace op
33+
} // namespace ggml
34+
} // namespace frontend
35+
} // namespace ov

ggml/src/ggml-openvino/openvino/op/get_rows.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ OutputVector translate_get_rows(const NodeContext& context) {
2121
num_inputs_check(context, 2, 2);
2222

2323
int op_case = context.get_op_case();
24-
FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CONT case");
2524

2625
Output<Node> res;
2726
auto data = context.get_input(0);

ggml/src/ggml-openvino/openvino/op/mulmat.cpp

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,15 +27,26 @@ namespace op {
2727
OutputVector translate_mulmat(const NodeContext& context) {
2828
num_inputs_check(context, 2, 2);
2929

30+
int op_case = context.get_op_case();
31+
3032
ov::Output<Node> res;
3133
ov::Output<ov::Node> B = context.get_input(0);
3234
ov::Output<ov::Node> A = context.get_input(1);
3335

36+
bool transpose_b = true;
37+
if (op_case == 2) {
38+
B = B.get_node_shared_ptr()->input_value(0);
39+
transpose_b = false;
40+
} else if (op_case == 3) {
41+
B = process_view_input(context, 0);
42+
A = process_view_input(context, 1);
43+
}
44+
3445
bool convert_out_type = false;
3546
if (ov::op::util::is_constant(B.get_node()) && context.get_input_type(0) != context.get_input_type(1)) {
36-
B = std::make_shared<ov::op::v0::Convert>(context.get_input(0), context.get_input_type(1));
47+
B = std::make_shared<ov::op::v0::Convert>(B, context.get_input_type(1));
3748
} else if (context.get_input_type(0) != context.get_input_type(1)) {
38-
A = std::make_shared<ov::op::v0::Convert>(context.get_input(1), context.get_input_type(0));
49+
A = std::make_shared<ov::op::v0::Convert>(A, context.get_input_type(0));
3950
convert_out_type = true;
4051
}
4152

@@ -72,10 +83,10 @@ OutputVector translate_mulmat(const NodeContext& context) {
7283
}
7384

7485
if (convert_out_type) {
75-
auto result_lp = std::make_shared<ov::op::v0::MatMul>(A, B, false, true);
86+
auto result_lp = std::make_shared<ov::op::v0::MatMul>(A, B, false, transpose_b);
7687
res = std::make_shared<ov::op::v0::Convert>(result_lp, context.get_output_type(0));
7788
} else {
78-
res = std::make_shared<ov::op::v0::MatMul>(A, B, false, true);
89+
res = std::make_shared<ov::op::v0::MatMul>(A, B, false, transpose_b);
7990
}
8091

8192
return rename_outputs_with_suffix({res}, context.get_name());

ggml/src/ggml-openvino/openvino/op/permute.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,12 @@ OutputVector translate_permute(const NodeContext& context) {
2121
num_inputs_check(context, 1, 1);
2222

2323
int op_case = context.get_op_case();
24-
FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported CONT case");
24+
FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported PERMUTE case");
2525
ov::Output<Node> res;
2626

2727
if (op_case == 1) {
28-
auto perm = argsort_descend(context.get_output_stride(0));
2928
res = std::make_shared<ov::op::v1::Transpose>(context.get_input(0),
30-
ov::op::v0::Constant::create(ov::element::i64, {3}, perm));
29+
ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
3130
} else {
3231
auto src = context.get_input(0);
3332
auto attention_size = context.get_input("attention_size");

ggml/src/ggml-openvino/openvino/op/rope.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@ OutputVector translate_rope(const NodeContext& context) {
2727
num_inputs_check(context, 2, 3);
2828

2929
int op_case = context.get_op_case();
30-
FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CONT case");
3130

3231
ov::Output<Node> res;
3332

ggml/src/ggml-openvino/openvino/op/set_rows.cpp

Lines changed: 1 addition & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -32,21 +32,7 @@ OutputVector translate_set_rows(const NodeContext& context) {
3232
FRONT_END_OP_CONVERSION_CHECK(dst_shape[0] == 1, "Unsupported shape in SET_ROWS");
3333

3434
if (context.is_static() && context.is_first_token()) {
35-
Output<Node> res;
36-
if (context.get_op_case() == 2) {
37-
res = std::make_shared<ov::op::v1::Reshape>(
38-
data,
39-
ov::op::v0::Constant::create(
40-
ov::element::i64,
41-
{3},
42-
{context.get_context_size(), context.get_num_heads_kv(), context.get_head_size()}),
43-
false);
44-
res = std::make_shared<ov::op::v1::Transpose>(
45-
res, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 2, 0}));
46-
} else {
47-
res = data;
48-
}
49-
return rename_outputs_with_suffix({res}, context.get_name());
35+
return rename_outputs_with_suffix({data}, context.get_name());
5036
}
5137

5238
auto indices = context.get_input(1);

0 commit comments

Comments
 (0)