@@ -124,13 +124,8 @@ typedef double ggml_float;
 //
 #include <arm_neon.h>

-float ggml_fp16_to_fp32(ggml_fp16_t x) {
-    return x;
-}
-
-ggml_fp16_t ggml_fp32_to_fp16(float x) {
-    return x;
-}
+#define GGML_COMPUTE_FP16_TO_FP32(x) (x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) (x)

 #define GGML_FP16_TO_FP32(x) (x)
 #define GGML_FP32_TO_FP16(x) (x)
@@ -150,15 +145,9 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
 #endif

 #ifdef __F16C__
-float ggml_fp16_to_fp32(ggml_fp16_t h) {
-    return _cvtsh_ss(h);
-}
-ggml_fp16_t ggml_fp32_to_fp16(float f) {
-    return _cvtss_sh(f, 0);
-}

-#define GGML_FP16_TO_FP32(x) _cvtsh_ss(x)
-#define GGML_FP32_TO_FP16(x) _cvtss_sh(x, 0)
+#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)

 #else

@@ -183,7 +172,7 @@ static inline uint32_t fp32_to_bits(float f) {
     return fp32.as_bits;
 }

-float ggml_fp16_to_fp32(ggml_fp16_t h) {
+static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
     const uint32_t w = (uint32_t) h << 16;
     const uint32_t sign = w & UINT32_C(0x80000000);
     const uint32_t two_w = w + w;
@@ -206,7 +195,7 @@ float ggml_fp16_to_fp32(ggml_fp16_t h) {
     return fp32_from_bits(result);
 }

-ggml_fp16_t ggml_fp32_to_fp16(float f) {
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
     const float scale_to_inf = 0x1.0p+112f;
     const float scale_to_zero = 0x1.0p-110f;
@@ -232,8 +221,8 @@ ggml_fp16_t ggml_fp32_to_fp16(float f) {
     return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
 }

-#define GGML_FP16_TO_FP32(x) ggml_fp16_to_fp32(x)
-#define GGML_FP32_TO_FP16(x) ggml_fp32_to_fp16(x)
+#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)

 #endif // __F16C__

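A quick sanity check of the scalar fallback above (hypothetical test code, not part of this commit; it assumes compilation inside ggml.c on a path without F16C or NEON, since the helpers are static inline):

// hypothetical: round-trip a value whose binary16 encoding is exact
#include <assert.h>

static void test_fp16_roundtrip(void) {
    const ggml_fp16_t h = ggml_compute_fp32_to_fp16(1.0f);
    assert(h == 0x3C00);                           // 1.0f in IEEE binary16
    assert(ggml_compute_fp16_to_fp32(h) == 1.0f);  // converts back exactly
}
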
@@ -249,6 +238,34 @@ static ggml_fp16_t table_gelu_f16[1 << 16];
 // precomputed exp table for f16 (128 KB)
 static ggml_fp16_t table_exp_f16[1 << 16];

+// precomputed f32 table for f16 (256 KB)
+static float table_f32_f16[1 << 16];
+
+// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
+// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
+#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
+
+inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
+    uint16_t s;
+    memcpy(&s, &f, sizeof(uint16_t));
+    return table_f32_f16[s];
+}
+
+#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
+#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+
+#endif
+
+// note: do not use these inside ggml.c
+// these are meant to be used via the ggml.h API
+float ggml_fp16_to_fp32(ggml_fp16_t x) {
+    return GGML_FP16_TO_FP32(x);
+}
+
+ggml_fp16_t ggml_fp32_to_fp16(float x) {
+    return GGML_FP32_TO_FP16(x);
+}
+
 //
 // timing
 //
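With the table in place, the exported conversions are thin wrappers over the macros. A caller-side sketch of the ggml.h API (hypothetical usage, not part of the commit; assumes the two-field ggml_init_params of this vintage, and note the F32 table is filled on the first ggml_init):

// hypothetical: convert through the public API after initializing ggml
#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = { /*.mem_size =*/ 1024*1024, /*.mem_buffer =*/ NULL };
    struct ggml_context * ctx = ggml_init(params);  // fills table_f32_f16 on first call

    const ggml_fp16_t h = ggml_fp32_to_fp16(3.14159f);
    printf("%.6f\n", ggml_fp16_to_fp32(h));         // prints the fp16-rounded value

    ggml_free(ctx);
    return 0;
}
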
@@ -692,6 +709,101 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
 #define GGML_F16_VEC_MUL             GGML_F16x4_MUL
 #define GGML_F16_VEC_REDUCE          GGML_F16x4_REDUCE

+#elif defined(__SSE3__)
+
+#define GGML_SIMD
+
+// F32 SSE
+
+#define GGML_F32_STEP 32
+#define GGML_F32_EPR  4
+
+#define GGML_F32x4         __m128
+#define GGML_F32x4_ZERO    _mm_setzero_ps()
+#define GGML_F32x4_SET1(x) _mm_set1_ps(x)
+#define GGML_F32x4_LOAD    _mm_loadu_ps
+#define GGML_F32x4_STORE   _mm_storeu_ps
+#if defined(__FMA__)
+    // TODO: Does this work?
+    #define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)
+#else
+    #define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)
+#endif
+#define GGML_F32x4_ADD _mm_add_ps
+#define GGML_F32x4_MUL _mm_mul_ps
+#define GGML_F32x4_REDUCE(res, x)                  \
+{                                                  \
+    for (int i = 0; i < GGML_F32_ARR/2; ++i) {     \
+        x[2*i] = _mm_add_ps(x[2*i], x[2*i+1]);     \
+    }                                              \
+    for (int i = 0; i < GGML_F32_ARR/4; ++i) {     \
+        x[4*i] = _mm_add_ps(x[4*i], x[4*i+2]);     \
+    }                                              \
+    for (int i = 0; i < GGML_F32_ARR/8; ++i) {     \
+        x[8*i] = _mm_add_ps(x[8*i], x[8*i+4]);     \
+    }                                              \
+    const __m128 t0 = _mm_hadd_ps(x[0], x[0]);     \
+    res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0));      \
+}
+// TODO: is this optimal?
+
+#define GGML_F32_VEC        GGML_F32x4
+#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
+// F16 SSE
+
+#define GGML_F16_STEP 32
+#define GGML_F16_EPR  4
+
+static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) {
+    float tmp[4];
+
+    tmp[0] = GGML_FP16_TO_FP32(x[0]);
+    tmp[1] = GGML_FP16_TO_FP32(x[1]);
+    tmp[2] = GGML_FP16_TO_FP32(x[2]);
+    tmp[3] = GGML_FP16_TO_FP32(x[3]);
+
+    return _mm_loadu_ps(tmp);
+}
+
+static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
+    float arr[4];
+
+    _mm_storeu_ps(arr, y);
+
+    x[0] = GGML_FP32_TO_FP16(arr[0]);
+    x[1] = GGML_FP32_TO_FP16(arr[1]);
+    x[2] = GGML_FP32_TO_FP16(arr[2]);
+    x[3] = GGML_FP32_TO_FP16(arr[3]);
+}
+
+#define GGML_F32Cx4             __m128
+#define GGML_F32Cx4_ZERO        _mm_setzero_ps()
+#define GGML_F32Cx4_SET1(x)     _mm_set1_ps(x)
+#define GGML_F32Cx4_LOAD(x)     __sse_f16x4_load(x)
+#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)
+#define GGML_F32Cx4_FMA         GGML_F32x4_FMA
+#define GGML_F32Cx4_ADD         _mm_add_ps
+#define GGML_F32Cx4_MUL         _mm_mul_ps
+#define GGML_F32Cx4_REDUCE      GGML_F32x4_REDUCE
+
+#define GGML_F16_VEC                 GGML_F32Cx4
+#define GGML_F16_VEC_ZERO            GGML_F32Cx4_ZERO
+#define GGML_F16_VEC_SET1            GGML_F32Cx4_SET1
+#define GGML_F16_VEC_LOAD(p, i)      GGML_F32Cx4_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i)  GGML_F32Cx4_STORE(p, r[i])
+#define GGML_F16_VEC_FMA             GGML_F32Cx4_FMA
+#define GGML_F16_VEC_ADD             GGML_F32Cx4_ADD
+#define GGML_F16_VEC_MUL             GGML_F32Cx4_MUL
+#define GGML_F16_VEC_REDUCE          GGML_F32Cx4_REDUCE
+
 #endif

 // GGML_F32_ARR / GGML_F16_ARR
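The three macro tiers (GGML_F32x4_*, GGML_F32_VEC_*, and the STEP/EPR constants) exist so arch-independent loops can be written once per operation. A minimal sketch of how ggml's vectorized routines compose them into a dot product (illustrative, not part of the diff; the helper name is hypothetical, and GGML_F32_ARR = GGML_F32_STEP/GGML_F32_EPR is defined just below this hunk):

// illustrative: portable dot product built from the GGML_F32_VEC macros;
// on SSE this runs GGML_F32_ARR = 8 independent 4-wide accumulators
static inline float vec_dot_f32_sketch(int n, const float * x, const float * y) {
    float sumf = 0.0f;

    const int np = (n & ~(GGML_F32_STEP - 1));   // largest multiple of STEP

    GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };  // rest zero-initialized
    GGML_F32_VEC ax [GGML_F32_ARR];
    GGML_F32_VEC ay [GGML_F32_ARR];

    for (int i = 0; i < np; i += GGML_F32_STEP) {
        for (int j = 0; j < GGML_F32_ARR; j++) {
            ax[j]  = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
            ay[j]  = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
            sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);  // sum += ax*ay
        }
    }

    GGML_F32_VEC_REDUCE(sumf, sum);              // tree-reduce all accumulators

    for (int i = np; i < n; ++i) {               // scalar tail
        sumf += x[i]*y[i];
    }

    return sumf;
}
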
@@ -1269,15 +1381,15 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
     static bool is_first_call = true;

     if (is_first_call) {
-        // initialize GELU and EXP tables
+        // initialize GELU, EXP and F32 tables
         {
             const uint64_t t_start = ggml_time_us(); UNUSED(t_start);

             ggml_fp16_t ii;
             for (int i = 0; i < (1 << 16); ++i) {
                 uint16_t ui = i;
                 memcpy(&ii, &ui, sizeof(ii));
-                const float f = GGML_FP16_TO_FP32(ii);
+                const float f = table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii);
                 table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
                 table_exp_f16[i]  = GGML_FP32_TO_FP16(exp(f));
@@ -8232,6 +8344,14 @@ int ggml_cpu_has_blas(void) {
 #endif
 }

+int ggml_cpu_has_sse3(void) {
+#if defined(__SSE3__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_vsx(void) {
 #if defined(__POWER9_VECTOR__)
     return 1;
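The new probe slots in beside the existing ones. A hypothetical caller-side report, using only functions visible in this diff:

// hypothetical: print the feature flags touched by this commit
printf("SSE3 = %d | BLAS = %d | VSX = %d\n",
       ggml_cpu_has_sse3(), ggml_cpu_has_blas(), ggml_cpu_has_vsx());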