Commit 307bed2

Merge pull request #140 from chaxu01/feature/my-ML-examples
Update KleidiAI example for llama.cpp
2 parents beee073 + 88d349c

File tree: 2 files changed, +145 −23 lines

kleidiai-examples/llama_cpp/0001-Use-KleidiAI-Int4-Matmul-micro-kernels-in-llama.cpp.patch

Lines changed: 141 additions & 21 deletions
```diff
@@ -1,29 +1,30 @@
-From 617486784d5394fbb54f4d99a4860a050318a4e8 Mon Sep 17 00:00:00 2001
-From: Gian Marco Iodice <gianmarco.iodice@arm.com>
-Date: Tue, 16 Jul 2024 17:28:50 +0100
+From 25ba8dfa43e2b4b101b890c88464b638427d3d42 Mon Sep 17 00:00:00 2001
+From: Charles Xu <charles.xu@arm.com>
+Date: Wed, 17 Jul 2024 13:28:18 +0200
 Subject: [PATCH] Use KleidiAI Int4 Matmul micro-kernels in llama.cpp
 
 - Update CMake file to fetch the Int4 micro-kernels from the KleidiAI
   repository
 - Implement a KleidiAI backend for llama.cpp
+- Add weight caching feature for KleidiAI
 
-Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
+Signed-off-by: Charles Xu <charles.xu@arm.com>
 ---
- CMakeLists.txt    |  48 ++++
- ggml-alloc.c      |  13 ++
- ggml-kleidiai.cpp | 560 ++++++++++++++++++++++++++++++++++++++++++++++
+ CMakeLists.txt    |  52 ++++
+ ggml-alloc.c      |  13 +
+ ggml-kleidiai.cpp | 675 ++++++++++++++++++++++++++++++++++++++++++++++
  ggml-kleidiai.h   |  45 ++++
- ggml.c            |  27 +++
+ ggml.c            |  27 ++
  llama.cpp         |  19 +-
- 6 files changed, 711 insertions(+), 1 deletion(-)
+ 6 files changed, 830 insertions(+), 1 deletion(-)
  create mode 100644 ggml-kleidiai.cpp
  create mode 100644 ggml-kleidiai.h
 
 diff --git a/CMakeLists.txt b/CMakeLists.txt
-index 08481334..22504ad2 100644
+index 08481334..07f8f601 100644
 --- a/CMakeLists.txt
 +++ b/CMakeLists.txt
-@@ -548,6 +548,53 @@ if (LLAMA_VULKAN)
+@@ -548,6 +548,57 @@ if (LLAMA_VULKAN)
       endif()
   endif()
 
@@ -72,12 +73,16 @@ index 08481334..22504ad2 100644
 +    add_compile_definitions(GGML_USE_KLEIDIAI)
 +    add_compile_definitions(GGML_KLEIDIAI_REUSE_MEMORY)
 +
++    if (LLAMA_KLEIDIAI_CACHE)
++        add_compile_definitions(GGML_KLEIDIAI_USE_CACHE)
++    endif()
++
 +endif()
 +
  if (LLAMA_HIPBLAS)
      if (NOT EXISTS $ENV{ROCM_PATH})
          if (NOT EXISTS /opt/rocm)
-@@ -1268,6 +1315,7 @@ add_library(ggml OBJECT
+@@ -1268,6 +1319,7 @@ add_library(ggml OBJECT
              ${GGML_SOURCES_ROCM}      ${GGML_HEADERS_ROCM}
              ${GGML_SOURCES_BLAS}      ${GGML_HEADERS_BLAS}
              ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
@@ -118,10 +123,10 @@ index bd367c42..ed4ce0ae 100644
          if (this_size > max_size) {
 diff --git a/ggml-kleidiai.cpp b/ggml-kleidiai.cpp
 new file mode 100644
-index 00000000..6800f63e
+index 00000000..257a0d4c
 --- /dev/null
 +++ b/ggml-kleidiai.cpp
-@@ -0,0 +1,560 @@
+@@ -0,0 +1,675 @@
 +/*
 + * Copyright (c) 2024 Arm Limited.
 + *
@@ -160,6 +165,13 @@ index 00000000..6800f63e
 +#include <string.h>
 +#include <asm/hwcap.h>
 +#include <sys/auxv.h>
++#if defined(GGML_KLEIDIAI_USE_CACHE)
++#include <cstring>
++#include <sys/mman.h>
++#include <sys/stat.h>
++#include <fcntl.h>
++#include <unistd.h>
++#endif
 +
 +// KleidiAI micro-kernels
 +#include "kai_matmul_clamp_f32_qsi8d32p_qsi4c32p_interface.h"
@@ -213,6 +225,85 @@ index 00000000..6800f63e
 +unsigned long int getauxval(unsigned long int __type) __INTRODUCED_IN(18);
 +#endif
 +
++#if defined(GGML_KLEIDIAI_USE_CACHE)
++struct binary_data {
++    void *ptr;
++    size_t size;
++};
++
++struct cached_weight {
++    int fd;
++    binary_data data;
++};
++
++static const char *g_cache_filename = "kai_transformed_weights.cache";
++static const size_t g_cache_key_size = 16;
++
++static struct cached_weight g_kai_cached_weight;
++
++static void ggml_kai_open_cached_weight() {
++    if (access(g_cache_filename, F_OK) != 0) {
++        g_kai_cached_weight.fd = open(g_cache_filename, O_RDWR | O_CREAT, 0644);
++        if (g_kai_cached_weight.fd == -1) {
++            GGML_ASSERT(false);
++        }
++        g_kai_cached_weight.data.size = 0;
++    }
++    else {
++        struct stat file_info;
++        g_kai_cached_weight.fd = open(g_cache_filename, O_RDONLY);
++        if (fstat(g_kai_cached_weight.fd, &file_info) == -1) {
++            GGML_ASSERT(false);
++        }
++
++        g_kai_cached_weight.data.size = file_info.st_size;
++
++        if (g_kai_cached_weight.data.size > 0) {
++            g_kai_cached_weight.data.ptr = mmap(NULL, g_kai_cached_weight.data.size, PROT_READ, MAP_PRIVATE, g_kai_cached_weight.fd, 0);
++            if (g_kai_cached_weight.data.ptr == MAP_FAILED) {
++                GGML_ASSERT(false);
++            }
++        }
++
++    }
++}
++
++static void ggml_kai_write_cache_weight(int fd, void *key, size_t key_size, void *data, size_t data_size) {
++    if (write(fd, key, key_size) != static_cast<ssize_t>(key_size)) {
++        GGML_ASSERT(false);
++    }
++
++    if (write(fd, &data_size, sizeof(size_t)) != sizeof(size_t)) {
++        GGML_ASSERT(false);
++    }
++
++    if (write(fd, data, data_size) != static_cast<ssize_t>(data_size)) {
++        GGML_ASSERT(false);
++    }
++}
++
++static bool ggml_kai_match_cached_weight(void *token, struct binary_data *data) {
++    char* data_ptr = static_cast<char*>(g_kai_cached_weight.data.ptr);
++    char* end_ptr = data_ptr + g_kai_cached_weight.data.size;
++
++    while (data_ptr < end_ptr) {
++        void *key = data_ptr;
++        data_ptr += g_cache_key_size;
++
++        data->size = *(std::size_t*)data_ptr;
++        data_ptr += sizeof(std::size_t);
++
++        data->ptr = data_ptr;
++        data_ptr += data->size;
++
++        if (memcmp(token, key, 16) == 0) {
++            return true;
++        }
++    }
++    return false;
++}
++#endif
++
 +inline bool is_feature_supported(uint64_t features, uint64_t feature_mask) {
 +    return (features & feature_mask);
 +}
@@ -240,6 +331,10 @@ index 00000000..6800f63e
 +        ggml_kai_free_extra_mem();
 +        initialized = true;
 +        g_kai_loaded = true;
++
++#if defined(GGML_KLEIDIAI_USE_CACHE)
++        ggml_kai_open_cached_weight();
++#endif
 +    }
 +}
 +
@@ -523,6 +618,20 @@ index 00000000..6800f63e
 +    if (cur->extra == NULL) {
 +        if(cur->type == GGML_TYPE_Q4_0) {
 +
++#if defined(GGML_KLEIDIAI_USE_CACHE)
++            if (g_kai_cached_weight.data.size > 0) {
++                struct binary_data data;
++                bool matched = ggml_kai_match_cached_weight(cur->data, &data);
++                if (matched) {
++                    cur->extra = data.ptr;
++                }
++                else {
++                    perror("No match found, please remove the cache file and try again!");
++                    GGML_ASSERT(false);
++                }
++                return;
++            }
++#endif
 +            const size_t original_data_size = ggml_nbytes(cur);
 +            const size_t reshaped_data_sz = rhs_packing_params.packed_size;
 +
@@ -545,6 +654,10 @@ index 00000000..6800f63e
 +                0,
 +                &params);
 +
++#if defined(GGML_KLEIDIAI_USE_CACHE)
++            ggml_kai_write_cache_weight(g_kai_cached_weight.fd, cur->data, g_cache_key_size, reshaped_data, reshaped_data_sz);
++#endif
++
 +#if defined(GGML_KLEIDIAI_REUSE_MEMORY)
 +            GGML_ASSERT(reshaped_data_sz <= original_data_size);
 +            memcpy(cur->data, (void *)reshaped_data, ggml_nbytes(cur));
@@ -570,9 +683,9 @@ index 00000000..6800f63e
 +    // tensor->src[1] = second source tensor
 +
 +    ggml_kai_func_t func;
-+    const bool is_cpu_only = tensor->backend == GGML_BACKEND_TYPE_CPU
-+        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU))
-+        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
++    const bool is_cpu_only = ggml_backend_buffer_is_host(tensor->buffer)
++        || (tensor->src[0] != nullptr && ggml_backend_buffer_is_host(tensor->src[0]->buffer))
++        || (tensor->src[1] != nullptr && ggml_backend_buffer_is_host(tensor->src[0]->buffer));
 +
 +    if (!is_cpu_only) {
 +        return false;
@@ -604,9 +717,9 @@ index 00000000..6800f63e
 +    // tensor->src[0] = first source tensor
 +    // tensor->src[1] = second source tensor
 +
-+    const bool is_cpu_only = tensor->backend == GGML_BACKEND_TYPE_CPU
-+        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU))
-+        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
++    const bool is_cpu_only = ggml_backend_buffer_is_host(tensor->buffer)
++        || (tensor->src[0] != nullptr && ggml_backend_buffer_is_host(tensor->src[0]->buffer))
++        || (tensor->src[1] != nullptr && ggml_backend_buffer_is_host(tensor->src[0]->buffer));
 +
 +    if (!is_cpu_only) {
 +        return false;
@@ -680,6 +793,13 @@ index 00000000..6800f63e
 +        free(g_extra_mem[i]);
 +    }
 +    g_extra_mem_idx = 0;
++
++#if defined(GGML_KLEIDIAI_USE_CACHE)
++    if (g_kai_cached_weight.data.size > 0) {
++        munmap(g_kai_cached_weight.data.ptr, g_kai_cached_weight.data.size);
++    }
++    close(g_kai_cached_weight.fd);
++#endif
 +}
 +#endif // defined(__aarch64__) && (defined(__ANDROID__) || defined(__linux__))
 diff --git a/ggml-kleidiai.h b/ggml-kleidiai.h
@@ -845,5 +965,5 @@ index 05591aa4..735dde04 100644
  }
  
 --
-2.25.1
+2.34.1
 
```

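For reference, the cache file this patch writes (`kai_transformed_weights.cache`) is a flat sequence of records: a 16-byte key copied from the start of a tensor's original weight data, a `size_t` payload length, and then the packed weights themselves. The sketch below is a minimal standalone reader for that layout, mirroring the lookup logic of `ggml_kai_match_cached_weight` above; `find_cached_weight` and the `main` driver are illustrative additions for this write-up, not code from the commit.

```cpp
// Hypothetical standalone reader for the cache layout used by the patch:
// [16-byte key][size_t payload size][payload] ... repeated until EOF.
#include <cstddef>
#include <cstdio>
#include <cstring>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

static const char  *kCacheFile = "kai_transformed_weights.cache"; // file name from the patch
static const size_t kKeySize   = 16;                              // g_cache_key_size in the patch

// Linear scan for a record whose key matches `token`, as the patch does.
// On a hit, returns a pointer into the mapping and sets *out_size.
static const char *find_cached_weight(const char *base, size_t file_size,
                                      const void *token, size_t *out_size) {
    const char *p   = base;
    const char *end = base + file_size;
    while (p + kKeySize + sizeof(size_t) <= end) {
        const char *key = p;
        p += kKeySize;
        size_t payload_size;
        memcpy(&payload_size, p, sizeof(size_t));
        p += sizeof(size_t);
        if (payload_size > (size_t)(end - p)) break; // truncated record: stop
        if (memcmp(token, key, kKeySize) == 0) {
            *out_size = payload_size;
            return p;
        }
        p += payload_size; // skip this payload and try the next record
    }
    return nullptr;
}

int main() {
    int fd = open(kCacheFile, O_RDONLY);
    if (fd == -1) { perror("open"); return 1; }

    struct stat st;
    if (fstat(fd, &st) == -1 || st.st_size == 0) { close(fd); return 1; }

    void *map = mmap(nullptr, (size_t) st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
    if (map == MAP_FAILED) { close(fd); return 1; }

    // In the patch the token is the first 16 bytes of a tensor's original
    // weight buffer; here we just probe with a zeroed placeholder key.
    unsigned char token[kKeySize] = {0};
    size_t size = 0;
    const char *hit = find_cached_weight((const char *) map, (size_t) st.st_size,
                                         token, &size);
    if (hit) printf("hit: %zu-byte cached weight found\n", size);
    else     printf("miss: no record with this key\n");

    munmap(map, (size_t) st.st_size);
    close(fd);
    return 0;
}
```

Because records are keyed only on the first 16 bytes of the original weight data and scanned linearly, the patch treats a lookup miss as a mismatched or stale cache: it asserts and asks you to delete the file rather than attempting any recovery.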
kleidiai-examples/llama_cpp/README.md

Lines changed: 4 additions & 2 deletions
````diff
@@ -97,7 +97,7 @@ mkdir build && cd build
 
 export NDK_PATH="your-android-ndk-path"
 
-cmake -DLLAMA_KLEIDIAI=ON -DCMAKE_TOOLCHAIN_FILE=${NDK_PATH}/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+dotprod -DCMAKE_CXX_FLAGS=-march=armv8.2a+i8mm+dotprod ..
+cmake -DLLAMA_KLEIDIAI=ON -DLLAMA_KLEIDIAI_CACHE=ON -DCMAKE_TOOLCHAIN_FILE=${NDK_PATH}/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+dotprod -DCMAKE_CXX_FLAGS=-march=armv8.2a+i8mm+dotprod ..
 
 make -j4
 ```
@@ -106,10 +106,12 @@ Build the llama.cpp project for Linux®:
 ```bash
 mkdir build && cd build
 
-cmake -DLLAMA_KLEIDIAI=ON -DCMAKE_C_FLAGS=-march=armv8.2-a+dotprod+i8mm -DCMAKE_CXX_FLAGS=-march=armv8.2-a+dotprod+i8mm ..
+cmake -DLLAMA_KLEIDIAI=ON -DLLAMA_KLEIDIAI_CACHE=ON -DCMAKE_C_FLAGS=-march=armv8.2-a+dotprod+i8mm -DCMAKE_CXX_FLAGS=-march=armv8.2-a+dotprod+i8mm ..
 
 make -j4
 ```
+The -DLLAMA_KLEIDIAI_CACHE=ON flag enables weight caching, a KleidiAI backend feature that improves model loading time: because KleidiAI transforms the layout of the original model weights to speed up its matrix-multiplication routines, this option ensures the transformation happens only the first time you run the model.
+To disable weight caching, simply remove the flag from the cmake command.
 
 ### Step 6:
 
````
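Once llama.cpp is built with `-DLLAMA_KLEIDIAI_CACHE=ON`, the first run transforms the Q4_0 weights and writes `kai_transformed_weights.cache` into the current working directory; subsequent runs map the transformed weights straight from that file, skipping the transformation step. A plausible way to observe the effect, assuming a standard llama.cpp invocation of this period (the binary name, model path, and prompt are placeholders):

```bash
# First run: weights are transformed and the cache file is created.
./main -m model.gguf -p "Hello" -n 16

# Second run: transformed weights are read back from the cache, so the
# model should load noticeably faster.
./main -m model.gguf -p "Hello" -n 16

# When switching models, delete the stale cache first; otherwise the
# backend aborts with "No match found, please remove the cache file
# and try again!".
rm kai_transformed_weights.cache
```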