1
- From 617486784d5394fbb54f4d99a4860a050318a4e8 Mon Sep 17 00:00:00 2001
2
- From: Gian Marco Iodice <gianmarco.iodice@arm.com>
3
- Date: Tue, 16 Jul 2024 17:28:50 +0100
1
+ From 25ba8dfa43e2b4b101b890c88464b638427d3d42 Mon Sep 17 00:00:00 2001
2
+ From: Charles Xu <charles.xu@arm.com>
3
+ Date: Wed, 17 Jul 2024 13:28:18 +0200
4
4
Subject: [PATCH] Use KleidiAI Int4 Matmul micro-kernels in llama.cpp
5
5
6
6
- Update CMake file to fetch the Int4 micro-kernels from the KleidiAI
7
7
repository
8
8
- Implement a KleidiAI backend for llama.cpp
9
+ - Add weight caching feature for KleidiAI
9
10
10
- Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
11
+ Signed-off-by: Charles Xu <charles.xu@arm.com>
11
12
---
12
- CMakeLists.txt | 48 ++++
13
- ggml-alloc.c | 13 ++
14
- ggml-kleidiai.cpp | 560 ++++++++++++++++++++++++++++++++++++++++++++++
13
+ CMakeLists.txt | 52 ++++
14
+ ggml-alloc.c | 13 +
15
+ ggml-kleidiai.cpp | 675 ++++++++++++++++++++++++++++++++++++++++++++++
15
16
ggml-kleidiai.h | 45 ++++
16
- ggml.c | 27 +++
17
+ ggml.c | 27 ++
17
18
llama.cpp | 19 +-
18
- 6 files changed, 711 insertions(+), 1 deletion(-)
19
+ 6 files changed, 830 insertions(+), 1 deletion(-)
19
20
create mode 100644 ggml-kleidiai.cpp
20
21
create mode 100644 ggml-kleidiai.h
21
22
22
23
diff --git a/CMakeLists.txt b/CMakeLists.txt
23
- index 08481334..22504ad2 100644
24
+ index 08481334..07f8f601 100644
24
25
--- a/CMakeLists.txt
25
26
+++ b/CMakeLists.txt
26
- @@ -548,6 +548,53 @@ if (LLAMA_VULKAN)
27
+ @@ -548,6 +548,57 @@ if (LLAMA_VULKAN)
27
28
endif()
28
29
endif()
29
30
@@ -72,12 +73,16 @@ index 08481334..22504ad2 100644
72
73
+ add_compile_definitions(GGML_USE_KLEIDIAI)
73
74
+ add_compile_definitions(GGML_KLEIDIAI_REUSE_MEMORY)
74
75
+
76
+ + if (LLAMA_KLEIDIAI_CACHE)
77
+ + add_compile_definitions(GGML_KLEIDIAI_USE_CACHE)
78
+ + endif()
79
+ +
75
80
+ endif()
76
81
+
77
82
if (LLAMA_HIPBLAS)
78
83
if (NOT EXISTS $ENV{ROCM_PATH})
79
84
if (NOT EXISTS /opt/rocm)
80
- @@ -1268,6 +1315,7 @@ add_library(ggml OBJECT
85
+ @@ -1268,6 +1319,7 @@ add_library(ggml OBJECT
81
86
${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
82
87
${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS}
83
88
${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
@@ -118,10 +123,10 @@ index bd367c42..ed4ce0ae 100644
118
123
if (this_size > max_size) {
119
124
diff --git a/ggml-kleidiai.cpp b/ggml-kleidiai.cpp
120
125
new file mode 100644
121
- index 00000000..6800f63e
126
+ index 00000000..257a0d4c
122
127
--- /dev/null
123
128
+++ b/ggml-kleidiai.cpp
124
- @@ -0,0 +1,560 @@
129
+ @@ -0,0 +1,675 @@
125
130
+ /*
126
131
+ * Copyright (c) 2024 Arm Limited.
127
132
+ *
@@ -160,6 +165,13 @@ index 00000000..6800f63e
160
165
+ #include <string.h>
161
166
+ #include <asm/hwcap.h>
162
167
+ #include <sys/auxv.h>
168
+ + #if defined(GGML_KLEIDIAI_USE_CACHE)
169
+ + #include <cstring>
170
+ + #include <sys/mman.h>
171
+ + #include <sys/stat.h>
172
+ + #include <fcntl.h>
173
+ + #include <unistd.h>
174
+ + #endif
163
175
+
164
176
+ // KleidiAI micro-kernels
165
177
+ #include "kai_matmul_clamp_f32_qsi8d32p_qsi4c32p_interface.h"
@@ -213,6 +225,85 @@ index 00000000..6800f63e
213
225
+ unsigned long int getauxval(unsigned long int __type) __INTRODUCED_IN(18);
214
226
+ #endif
215
227
+
228
+ + #if defined(GGML_KLEIDIAI_USE_CACHE)
229
+ + struct binary_data {
230
+ + void *ptr;
231
+ + size_t size;
232
+ + };
233
+ +
234
+ + struct cached_weight {
235
+ + int fd;
236
+ + binary_data data;
237
+ + };
238
+ +
239
+ + static const char *g_cache_filename = "kai_transformed_weights.cache";
240
+ + static const size_t g_cache_key_size = 16;
241
+ +
242
+ + static struct cached_weight g_kai_cached_weight;
243
+ +
244
+ + static void ggml_kai_open_cached_weight() {
245
+ + if (access(g_cache_filename, F_OK) != 0) {
246
+ + g_kai_cached_weight.fd = open(g_cache_filename, O_RDWR | O_CREAT, 0644);
247
+ + if (g_kai_cached_weight.fd == -1) {
248
+ + GGML_ASSERT(false);
249
+ + }
250
+ + g_kai_cached_weight.data.size = 0;
251
+ + }
252
+ + else {
253
+ + struct stat file_info;
254
+ + g_kai_cached_weight.fd = open(g_cache_filename, O_RDONLY);
255
+ + if (fstat(g_kai_cached_weight.fd, &file_info) == -1) {
256
+ + GGML_ASSERT(false);
257
+ + }
258
+ +
259
+ + g_kai_cached_weight.data.size = file_info.st_size;
260
+ +
261
+ + if (g_kai_cached_weight.data.size > 0) {
262
+ + g_kai_cached_weight.data.ptr = mmap(NULL, g_kai_cached_weight.data.size, PROT_READ, MAP_PRIVATE, g_kai_cached_weight.fd, 0);
263
+ + if (g_kai_cached_weight.data.ptr == MAP_FAILED) {
264
+ + GGML_ASSERT(false);
265
+ + }
266
+ + }
267
+ +
268
+ + }
269
+ + }
270
+ +
271
+ + static void ggml_kai_write_cache_weight(int fd, void *key, size_t key_size, void *data, size_t data_size) {
272
+ + if (write(fd, key, key_size) != static_cast<ssize_t>(key_size)) {
273
+ + GGML_ASSERT(false);
274
+ + }
275
+ +
276
+ + if (write(fd, &data_size, sizeof(size_t)) != sizeof(size_t)) {
277
+ + GGML_ASSERT(false);
278
+ + }
279
+ +
280
+ + if (write(fd, data, data_size) != static_cast<ssize_t>(data_size)) {
281
+ + GGML_ASSERT(false);
282
+ + }
283
+ + }
284
+ +
285
+ + static bool ggml_kai_match_cached_weight(void *token, struct binary_data *data) {
286
+ + char* data_ptr = static_cast<char*>(g_kai_cached_weight.data.ptr);
287
+ + char* end_ptr = data_ptr + g_kai_cached_weight.data.size;
288
+ +
289
+ + while (data_ptr < end_ptr) {
290
+ + void *key = data_ptr;
291
+ + data_ptr += g_cache_key_size;
292
+ +
293
+ + data->size=*(std::size_t*)data_ptr;
294
+ + data_ptr += sizeof(std::size_t);
295
+ +
296
+ + data->ptr = data_ptr;
297
+ + data_ptr += data->size;
298
+ +
299
+ + if (memcmp(token, key, 16) == 0) {
300
+ + return true;
301
+ + }
302
+ + }
303
+ + return false;
304
+ + }
305
+ + #endif
306
+ +
216
307
+ inline bool is_feature_supported(uint64_t features, uint64_t feature_mask) {
217
308
+ return (features & feature_mask);
218
309
+ }
@@ -240,6 +331,10 @@ index 00000000..6800f63e
240
331
+ ggml_kai_free_extra_mem();
241
332
+ initialized = true;
242
333
+ g_kai_loaded = true;
334
+ +
335
+ + #if defined(GGML_KLEIDIAI_USE_CACHE)
336
+ + ggml_kai_open_cached_weight();
337
+ + #endif
243
338
+ }
244
339
+ }
245
340
+
@@ -523,6 +618,20 @@ index 00000000..6800f63e
523
618
+ if (cur->extra == NULL) {
524
619
+ if(cur->type == GGML_TYPE_Q4_0) {
525
620
+
621
+ + #if defined(GGML_KLEIDIAI_USE_CACHE)
622
+ + if (g_kai_cached_weight.data.size > 0) {
623
+ + struct binary_data data;
624
+ + bool matched = ggml_kai_match_cached_weight(cur->data, &data);
625
+ + if (matched) {
626
+ + cur->extra = data.ptr;
627
+ + }
628
+ + else {
629
+ + perror("No match found, please remove the cache file and try again!");
630
+ + GGML_ASSERT(false);
631
+ + }
632
+ + return;
633
+ + }
634
+ + #endif
526
635
+ const size_t original_data_size = ggml_nbytes(cur);
527
636
+ const size_t reshaped_data_sz = rhs_packing_params.packed_size;
528
637
+
@@ -545,6 +654,10 @@ index 00000000..6800f63e
545
654
+ 0,
546
655
+ ¶ms);
547
656
+
657
+ + #if defined(GGML_KLEIDIAI_USE_CACHE)
658
+ + ggml_kai_write_cache_weight(g_kai_cached_weight.fd, cur->data, g_cache_key_size, reshaped_data, reshaped_data_sz);
659
+ + #endif
660
+ +
548
661
+ #if defined(GGML_KLEIDIAI_REUSE_MEMORY)
549
662
+ GGML_ASSERT(reshaped_data_sz <= original_data_size);
550
663
+ memcpy(cur->data, (void *)reshaped_data, ggml_nbytes(cur));
@@ -570,9 +683,9 @@ index 00000000..6800f63e
570
683
+ // tensor->src[1] = second source tensor
571
684
+
572
685
+ ggml_kai_func_t func;
573
- + const bool is_cpu_only = tensor->backend == GGML_BACKEND_TYPE_CPU
574
- + || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU))
575
- + || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
686
+ + const bool is_cpu_only = ggml_backend_buffer_is_host(tensor->buffer)
687
+ + || (tensor->src[0] != nullptr && ggml_backend_buffer_is_host(tensor->src[0]->buffer))
688
+ + || (tensor->src[1] != nullptr && ggml_backend_buffer_is_host(tensor->src[1]->buffer));
576
689
+
577
690
+ if (!is_cpu_only) {
578
691
+ return false;
@@ -604,9 +717,9 @@ index 00000000..6800f63e
604
717
+ // tensor->src[0] = first source tensor
605
718
+ // tensor->src[1] = second source tensor
606
719
+
607
- + const bool is_cpu_only = tensor->backend == GGML_BACKEND_TYPE_CPU
608
- + || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU))
609
- + || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
720
+ + const bool is_cpu_only = ggml_backend_buffer_is_host(tensor->buffer)
721
+ + || (tensor->src[0] != nullptr && ggml_backend_buffer_is_host(tensor->src[0]->buffer))
722
+ + || (tensor->src[1] != nullptr && ggml_backend_buffer_is_host(tensor->src[1]->buffer));
610
723
+
611
724
+ if (!is_cpu_only) {
612
725
+ return false;
@@ -680,6 +793,13 @@ index 00000000..6800f63e
680
793
+ free(g_extra_mem[i]);
681
794
+ }
682
795
+ g_extra_mem_idx = 0;
796
+ +
797
+ + #if defined(GGML_KLEIDIAI_USE_CACHE)
798
+ + if (g_kai_cached_weight.data.size > 0) {
799
+ + munmap(g_kai_cached_weight.data.ptr, g_kai_cached_weight.data.size);
800
+ + }
801
+ + close(g_kai_cached_weight.fd);
802
+ + #endif
683
803
+ }
684
804
+ #endif // defined(__aarch64__) && (defined(__ANDROID__) || defined(__linux__))
685
805
diff --git a/ggml-kleidiai.h b/ggml-kleidiai.h
@@ -845,5 +965,5 @@ index 05591aa4..735dde04 100644
845
965
}
846
966
847
967
- -
848
- 2.25.1
968
+ 2.34.1
849
969
0 commit comments