Improve mul_mat performance for big matrices using Accelerate framework

Also: speed up the GELU operator via an F16 cast; multi-thread the NORM operator; disable FLASH_FF in the whisper example.
RWKV · Oct 17, 2022 · d8f64bc · d8f64bc
1 parent ea0ef2a
commit d8f64bc
Show file tree

Hide file tree

Showing 5 changed files with 221 additions and 134 deletions.
diff --git a/README.md b/README.md
@@ -8,24 +8,26 @@ Tensor library for machine learning
 - 16-bit float support
 - Automatic differentiation (work in progress)
 - ADAM and L-BFGS optimizers
-- Optimized for Arm64 architectures (M1) via NEON intrinsics
+- Optimized for Apple silicon via NEON intrinsics and Accelerate framework
 - On x86 architectures utilizes AVX intrinsics
 - No third-party dependencies
 - Zero memory allocations during runtime
 
+*Note that this project is under development and not ready for production use*
+
 ## Whisper inference (example)
 
 With ggml you can efficiently run [Whisper](examples/whisper) inference on the CPU.
 
 Memory requirements:
 
-| Model | Mem |
-| ---   | --- |
-| tiny.en | ~460 MB |
-| base.en | ~620 MB |
-| small.en | ~1.3 GB |
-| medium.en | ~2.8 GB |
-| large | ~4.9 GB |
+| Model  | Disk   | Mem     |
+| ---    | ---    | ---     |
+| tiny   |  75 MB | ~280 MB |
+| base   | 142 MB | ~430 MB |
+| small  | 466 MB | ~1.0 GB |
+| medium | 1.5 GB | ~2.6 GB |
+| large  | 2.9 GB | ~4.7 GB |
 
 ## GPT inference (example)
 

diff --git a/examples/whisper/README.md b/examples/whisper/README.md
@@ -11,11 +11,11 @@ Checkout https://github.com/ggerganov/whisper.cpp
 
 | Model  | Disk   | Mem     |
 | ---    | ---    | ---     |
-| tiny   |  75 MB | ~240 MB |
-| base   | 142 MB | ~380 MB |
-| small  | 466 MB | ~970 MB |
-| medium | 1.5 GB | ~2.5 GB |
-| large  | 2.9 GB | ~4.6 GB |
+| tiny   |  75 MB | ~280 MB |
+| base   | 142 MB | ~430 MB |
+| small  | 466 MB | ~1.0 GB |
+| medium | 1.5 GB | ~2.6 GB |
+| large  | 2.9 GB | ~4.7 GB |
 
 ## ggml format
 

diff --git a/examples/whisper/whisper.cpp b/examples/whisper/whisper.cpp
@@ -15,7 +15,7 @@
 #include <vector>
 
 #define USE_FLASH_ATTN
-#define USE_FLASH_FF
+//#define USE_FLASH_FF
 
 // available whisper models
 enum e_model {
@@ -148,11 +148,11 @@ static const std::map<e_model, size_t> MEM_REQ_ENCODE = {
 };
 
 static const std::map<e_model, size_t> MEM_REQ_ENCODE_LAYER = {
-    { MODEL_TINY,     64ull*MB },
-    { MODEL_BASE,     84ull*MB },
-    { MODEL_SMALL,   128ull*MB },
-    { MODEL_MEDIUM,  172ull*MB },
-    { MODEL_LARGE,   216ull*MB },
+    { MODEL_TINY,    104ull*MB },
+    { MODEL_BASE,    136ull*MB },
+    { MODEL_SMALL,   208ull*MB },
+    { MODEL_MEDIUM,  280ull*MB },
+    { MODEL_LARGE,   354ull*MB },
 };
 
 static const std::map<e_model, size_t> MEM_REQ_DECODE = {

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -36,17 +36,17 @@ endif()
 set(TARGET ggml)
 
 # on APPLE - include Accelerate framework
-#if (APPLE)
-#    find_library(ACCELERATE_FRAMEWORK Accelerate)
-#    if (ACCELERATE_FRAMEWORK)
-#        message(STATUS "Accelerate framework found")
-#
-#        set(GGML_EXTRA_LIBS  ${GGML_EXTRA_LIBS}  ${ACCELERATE_FRAMEWORK})
-#        set(GGML_EXTRA_FLAGS ${GGML_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
-#    else()
-#        message(WARNING "Accelerate framework not found")
-#    endif()
-#endif()
+if (APPLE AND NOT GGML_NO_ACCELERATE)
+    find_library(ACCELERATE_FRAMEWORK Accelerate)
+    if (ACCELERATE_FRAMEWORK)
+        message(STATUS "Accelerate framework found")
+
+        set(GGML_EXTRA_LIBS  ${GGML_EXTRA_LIBS}  ${ACCELERATE_FRAMEWORK})
+        set(GGML_EXTRA_FLAGS ${GGML_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
+    else()
+        message(WARNING "Accelerate framework not found")
+    endif()
+endif()
 
 if (GGML_PERF)
     set(GGML_EXTRA_FLAGS ${GGML_EXTRA_FLAGS} -DGGML_PERF)