
Commit a62170c

ggml : add SSE3 and fp16 conversion lookup table (ggml-org#368)
* Improves WASM performance: on MacBook M1 Pro, I observe 25% faster using Firefox and 35% faster using Chrome
* Add support for SSE3 SIMD
* Add SSE3 to system information
* Add Imath support for fp16-fp32 conversions
* Add Imath to system information
* Wrap Imath calls to avoid static function warnings
* Drop Imath; add lookup table for f16 -> f32 conversions
* Remove TODO comments
* Update SSE3 to new macro arguments
* Correct updated macro definitions
* Prefer static inline where possible
* ggml : static inlines + add public f16 <-> f32 conversions

Co-authored-by: Georgi Gerganov <[email protected]>
1 parent 1944e7c commit a62170c

File tree

4 files changed, +147 -21 lines changed


Makefile

+4
@@ -84,6 +84,10 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
 	ifneq (,$(findstring f16c,$(F16C_M)))
 		CFLAGS += -mf16c
 	endif
+	SSE3_M := $(shell grep "sse3 " /proc/cpuinfo)
+	ifneq (,$(findstring sse3,$(SSE3_M)))
+		CFLAGS += -msse3
+	endif
 else ifeq ($(UNAME_S),Haiku)
 	AVX1_M := $(shell sysinfo -cpu | grep "AVX ")
 	ifneq (,$(findstring avx,$(AVX1_M)))
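For reference, `-msse3` is what makes the compiler predefine `__SSE3__`, which is the macro the ggml.c changes below key off. A minimal, hypothetical sanity check (not part of this commit) could look like:

```c
// sse3_check.c -- hypothetical sanity check, not part of this commit.
// Build with: cc -msse3 sse3_check.c && ./a.out
#include <stdio.h>

int main(void) {
#if defined(__SSE3__)
    printf("__SSE3__ is defined: the SSE3 code paths will be compiled\n");
#else
    printf("__SSE3__ is not defined: ggml falls back to scalar code\n");
#endif
    return 0;
}
```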

ggml.c

+141 -21
@@ -124,13 +124,8 @@ typedef double ggml_float;
 //
 #include <arm_neon.h>
 
-float ggml_fp16_to_fp32(ggml_fp16_t x) {
-    return x;
-}
-
-ggml_fp16_t ggml_fp32_to_fp16(float x) {
-    return x;
-}
+#define GGML_COMPUTE_FP16_TO_FP32(x) (x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) (x)
 
 #define GGML_FP16_TO_FP32(x) (x)
 #define GGML_FP32_TO_FP16(x) (x)
@@ -150,15 +145,9 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
 #endif
 
 #ifdef __F16C__
-float ggml_fp16_to_fp32(ggml_fp16_t h) {
-    return _cvtsh_ss(h);
-}
-ggml_fp16_t ggml_fp32_to_fp16(float f) {
-    return _cvtss_sh(f, 0);
-}
 
-#define GGML_FP16_TO_FP32(x) _cvtsh_ss(x)
-#define GGML_FP32_TO_FP16(x) _cvtss_sh(x, 0)
+#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
 
 #else
 
@@ -183,7 +172,7 @@ static inline uint32_t fp32_to_bits(float f) {
     return fp32.as_bits;
 }
 
-float ggml_fp16_to_fp32(ggml_fp16_t h) {
+static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
     const uint32_t w = (uint32_t) h << 16;
     const uint32_t sign = w & UINT32_C(0x80000000);
     const uint32_t two_w = w + w;
@@ -206,7 +195,7 @@ float ggml_fp16_to_fp32(ggml_fp16_t h) {
     return fp32_from_bits(result);
 }
 
-ggml_fp16_t ggml_fp32_to_fp16(float f) {
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
     const float scale_to_inf = 0x1.0p+112f;
     const float scale_to_zero = 0x1.0p-110f;
@@ -232,8 +221,8 @@ ggml_fp16_t ggml_fp32_to_fp16(float f) {
     return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
 }
 
-#define GGML_FP16_TO_FP32(x) ggml_fp16_to_fp32(x)
-#define GGML_FP32_TO_FP16(x) ggml_fp32_to_fp16(x)
+#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
 
 #endif // __F16C__
 
@@ -249,6 +238,34 @@ static ggml_fp16_t table_gelu_f16[1 << 16];
 // precomputed exp table for f16 (128 KB)
 static ggml_fp16_t table_exp_f16[1 << 16];
 
+// precomputed f32 table for f16 (256 KB)
+static float table_f32_f16[1 << 16];
+
+// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
+// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
+#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
+
+inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
+    uint16_t s;
+    memcpy(&s, &f, sizeof(uint16_t));
+    return table_f32_f16[s];
+}
+
+#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
+#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+
+#endif
+
+// note: do not use these inside ggml.c
+// these are meant to be used via the ggml.h API
+float ggml_fp16_to_fp32(ggml_fp16_t x) {
+    return GGML_FP16_TO_FP32(x);
+}
+
+ggml_fp16_t ggml_fp32_to_fp16(float x) {
+    return GGML_FP32_TO_FP16(x);
+}
+
 //
 // timing
 //
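The idea behind this hunk: every possible fp16 bit pattern (65536 of them) is converted to fp32 once, so the hot-path conversion becomes a single table load via `ggml_lookup_fp16_to_fp32`. Below is a self-contained sketch of the same technique; the names and the software converter used to fill the table are placeholders, not the ggml code (ggml fills its table with `GGML_COMPUTE_FP16_TO_FP32` inside `ggml_init`, as shown further down).

```c
// fp16_lut_sketch.c -- simplified illustration of the lookup-table approach;
// names and the fill-time converter are placeholders, not the ggml API.
#include <stdint.h>
#include <string.h>

// 65536 entries x 4 bytes = 256 KB, one slot per possible fp16 bit pattern
static float table_fp16_to_fp32[1 << 16];

// portable software fp16 -> fp32 conversion, used only to fill the table once
static float fp16_bits_to_fp32(uint16_t h) {
    const uint32_t sign = (uint32_t)(h & 0x8000u) << 16;
    const uint32_t exp  = (h >> 10) & 0x1Fu;
    const uint32_t mant = h & 0x03FFu;
    uint32_t bits;

    if (exp == 0) {
        if (mant == 0) {
            bits = sign;                                   // signed zero
        } else {
            float f = (float) mant * (1.0f / 16777216.0f); // subnormal: mant * 2^-24
            memcpy(&bits, &f, sizeof(bits));
            bits |= sign;
        }
    } else if (exp == 0x1Fu) {
        bits = sign | 0x7F800000u | (mant << 13);          // inf / NaN
    } else {
        bits = sign | ((exp + 112u) << 23) | (mant << 13); // normal number
    }

    float out;
    memcpy(&out, &bits, sizeof(out));
    return out;
}

// one-time initialization, analogous to the loop added to ggml_init()
static void init_fp16_table(void) {
    for (uint32_t i = 0; i < (1u << 16); ++i) {
        table_fp16_to_fp32[i] = fp16_bits_to_fp32((uint16_t) i);
    }
}

// hot path: conversion is a single array index
static inline float lookup_fp16_to_fp32(uint16_t h) {
    return table_fp16_to_fp32[h];
}
```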
@@ -692,6 +709,101 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
 #define GGML_F16_VEC_MUL            GGML_F16x4_MUL
 #define GGML_F16_VEC_REDUCE         GGML_F16x4_REDUCE
 
+#elif defined(__SSE3__)
+
+#define GGML_SIMD
+
+// F32 SSE
+
+#define GGML_F32_STEP 32
+#define GGML_F32_EPR  4
+
+#define GGML_F32x4         __m128
+#define GGML_F32x4_ZERO    _mm_setzero_ps()
+#define GGML_F32x4_SET1(x) _mm_set1_ps(x)
+#define GGML_F32x4_LOAD    _mm_loadu_ps
+#define GGML_F32x4_STORE   _mm_storeu_ps
+#if defined(__FMA__)
+    // TODO: Does this work?
+    #define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)
+#else
+    #define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)
+#endif
+#define GGML_F32x4_ADD     _mm_add_ps
+#define GGML_F32x4_MUL     _mm_mul_ps
+#define GGML_F32x4_REDUCE(res, x)                  \
+{                                                  \
+    for (int i = 0; i < GGML_F32_ARR/2; ++i) {     \
+        x[2*i] = _mm_add_ps(x[2*i], x[2*i+1]);     \
+    }                                              \
+    for (int i = 0; i < GGML_F32_ARR/4; ++i) {     \
+        x[4*i] = _mm_add_ps(x[4*i], x[4*i+2]);     \
+    }                                              \
+    for (int i = 0; i < GGML_F32_ARR/8; ++i) {     \
+        x[8*i] = _mm_add_ps(x[8*i], x[8*i+4]);     \
+    }                                              \
+    const __m128 t0 = _mm_hadd_ps(x[0], x[0]);     \
+    res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0));      \
+}
+// TODO: is this optimal ?
+
+#define GGML_F32_VEC        GGML_F32x4
+#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
+// F16 SSE
+
+#define GGML_F16_STEP 32
+#define GGML_F16_EPR  4
+
+static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) {
+    float tmp[4];
+
+    tmp[0] = GGML_FP16_TO_FP32(x[0]);
+    tmp[1] = GGML_FP16_TO_FP32(x[1]);
+    tmp[2] = GGML_FP16_TO_FP32(x[2]);
+    tmp[3] = GGML_FP16_TO_FP32(x[3]);
+
+    return _mm_loadu_ps(tmp);
+}
+
+static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
+    float arr[4];
+
+    _mm_storeu_ps(arr, y);
+
+    x[0] = GGML_FP32_TO_FP16(arr[0]);
+    x[1] = GGML_FP32_TO_FP16(arr[1]);
+    x[2] = GGML_FP32_TO_FP16(arr[2]);
+    x[3] = GGML_FP32_TO_FP16(arr[3]);
+}
+
+#define GGML_F32Cx4             __m128
+#define GGML_F32Cx4_ZERO        _mm_setzero_ps()
+#define GGML_F32Cx4_SET1(x)     _mm_set1_ps(x)
+#define GGML_F32Cx4_LOAD(x)     __sse_f16x4_load(x)
+#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)
+#define GGML_F32Cx4_FMA         GGML_F32x4_FMA
+#define GGML_F32Cx4_ADD         _mm_add_ps
+#define GGML_F32Cx4_MUL         _mm_mul_ps
+#define GGML_F32Cx4_REDUCE      GGML_F32x4_REDUCE
+
+#define GGML_F16_VEC                GGML_F32Cx4
+#define GGML_F16_VEC_ZERO           GGML_F32Cx4_ZERO
+#define GGML_F16_VEC_SET1           GGML_F32Cx4_SET1
+#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx4_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
+#define GGML_F16_VEC_FMA            GGML_F32Cx4_FMA
+#define GGML_F16_VEC_ADD            GGML_F32Cx4_ADD
+#define GGML_F16_VEC_MUL            GGML_F32Cx4_MUL
+#define GGML_F16_VEC_REDUCE         GGML_F32Cx4_REDUCE
+
 #endif
 
 // GGML_F32_ARR / GGML_F16_ARR
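To see how these macros are meant to compose, here is a hypothetical dot product written directly with the SSE intrinsics the `GGML_F32x4_*` macros expand to (the non-FMA fallback plus the `_mm_hadd_ps` reduction). It is an illustration, not the actual ggml kernel. Accumulating in four-wide registers and reducing once at the end keeps the horizontal add out of the inner loop.

```c
// dot_sse3_sketch.c -- illustrative only: shows what the GGML_F32x4_* macros
// above boil down to in a simple dot product. Not the actual ggml code.
// Build with: cc -msse3 -O2 -c dot_sse3_sketch.c
#include <pmmintrin.h>   // SSE3 (_mm_hadd_ps), pulls in the SSE/SSE2 headers

float dot_f32_sse3(const float *a, const float *b, int n) {
    __m128 acc = _mm_setzero_ps();                 // GGML_F32x4_ZERO

    int i = 0;
    for (; i + 4 <= n; i += 4) {
        const __m128 va = _mm_loadu_ps(a + i);     // GGML_F32x4_LOAD
        const __m128 vb = _mm_loadu_ps(b + i);
        // non-FMA fallback of GGML_F32x4_FMA(acc, va, vb)
        acc = _mm_add_ps(_mm_mul_ps(va, vb), acc);
    }

    // horizontal reduction, as in GGML_F32x4_REDUCE
    const __m128 t0 = _mm_hadd_ps(acc, acc);
    float sum = _mm_cvtss_f32(_mm_hadd_ps(t0, t0));

    // scalar tail for the leftover elements
    for (; i < n; ++i) {
        sum += a[i] * b[i];
    }
    return sum;
}
```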
@@ -1269,15 +1381,15 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
     static bool is_first_call = true;
 
     if (is_first_call) {
-        // initialize GELU and EXP tables
+        // initialize GELU, EXP and F32 tables
         {
             const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
 
             ggml_fp16_t ii;
             for (int i = 0; i < (1 << 16); ++i) {
                 uint16_t ui = i;
                 memcpy(&ii, &ui, sizeof(ii));
-                const float f = GGML_FP16_TO_FP32(ii);
+                const float f = table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii);
                 table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
                 table_exp_f16[i] = GGML_FP32_TO_FP16(exp(f));
             }
@@ -8232,6 +8344,14 @@ int ggml_cpu_has_blas(void) {
 #endif
 }
 
+int ggml_cpu_has_sse3(void) {
+#if defined(__SSE3__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_vsx(void) {
 #if defined(__POWER9_VECTOR__)
     return 1;

ggml.h

+1
@@ -731,6 +731,7 @@ int ggml_cpu_has_f16c(void);
 int ggml_cpu_has_fp16_va(void);
 int ggml_cpu_has_wasm_simd(void);
 int ggml_cpu_has_blas(void);
+int ggml_cpu_has_sse3(void);
 int ggml_cpu_has_vsx(void);
 
 #ifdef __cplusplus
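A small usage sketch of the public API touched by this commit, assuming the `ggml_fp16_to_fp32` / `ggml_fp32_to_fp16` declarations exported by ggml.h (the diff comment above says these are meant to be used via the ggml.h API) and a program linked against ggml:

```c
// api_usage_sketch.c -- hypothetical caller, not part of the commit;
// assumes ggml.h is on the include path and the program links against ggml.
#include <stdio.h>
#include "ggml.h"

int main(void) {
    // round-trip one value through the public f32 <-> f16 conversions
    const float x = 0.333333f;
    const ggml_fp16_t h = ggml_fp32_to_fp16(x);
    const float y = ggml_fp16_to_fp32(h);

    printf("fp32 %f -> fp16 -> fp32 %f\n", x, y);
    printf("SSE3 = %d\n", ggml_cpu_has_sse3());
    return 0;
}
```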

whisper.cpp

+1
@@ -2582,6 +2582,7 @@ const char * whisper_print_system_info(void) {
     s += "FP16_VA = "   + std::to_string(ggml_cpu_has_fp16_va())   + " | ";
     s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
     s += "BLAS = "      + std::to_string(ggml_cpu_has_blas())      + " | ";
+    s += "SSE3 = "      + std::to_string(ggml_cpu_has_sse3())      + " | ";
     s += "VSX = "       + std::to_string(ggml_cpu_has_vsx())       + " | ";
 
     return s.c_str();
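And a minimal sketch of where the new field surfaces for users, assuming whisper.h is on the include path and the program links against whisper/ggml:

```c
// print_sysinfo_sketch.c -- hypothetical caller, not part of the commit.
#include <stdio.h>
#include "whisper.h"

int main(void) {
    // the string now contains an "SSE3 = 0|1" field alongside AVX, BLAS, etc.
    printf("%s\n", whisper_print_system_info());
    return 0;
}
```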
