@@ -124,13 +124,8 @@ typedef double ggml_float;
124
124
//
125
125
#include <arm_neon.h>
126
126
127
- float ggml_fp16_to_fp32 (ggml_fp16_t x ) {
128
- return x ;
129
- }
130
-
131
- ggml_fp16_t ggml_fp32_to_fp16 (float x ) {
132
- return x ;
133
- }
127
+ #define GGML_COMPUTE_FP16_TO_FP32 (x ) (x)
128
+ #define GGML_COMPUTE_FP32_TO_FP16 (x ) (x)
134
129
135
130
#define GGML_FP16_TO_FP32 (x ) (x)
136
131
#define GGML_FP32_TO_FP16 (x ) (x)
@@ -150,15 +145,9 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
150
145
#endif
151
146
152
147
#ifdef __F16C__
153
- float ggml_fp16_to_fp32 (ggml_fp16_t h ) {
154
- return _cvtsh_ss (h );
155
- }
156
- ggml_fp16_t ggml_fp32_to_fp16 (float f ) {
157
- return _cvtss_sh (f , 0 );
158
- }
159
148
160
- #define GGML_FP16_TO_FP32 (x ) _cvtsh_ss(x)
161
- #define GGML_FP32_TO_FP16 (x ) _cvtss_sh(x, 0)
149
+ #define GGML_COMPUTE_FP16_TO_FP32 (x ) _cvtsh_ss(x)
150
+ #define GGML_COMPUTE_FP32_TO_FP16 (x ) _cvtss_sh(x, 0)
162
151
163
152
#else
164
153
@@ -183,7 +172,7 @@ static inline uint32_t fp32_to_bits(float f) {
183
172
return fp32 .as_bits ;
184
173
}
185
174
186
- float ggml_fp16_to_fp32 (ggml_fp16_t h ) {
175
+ static inline float ggml_compute_fp16_to_fp32 (ggml_fp16_t h ) {
187
176
const uint32_t w = (uint32_t ) h << 16 ;
188
177
const uint32_t sign = w & UINT32_C (0x80000000 );
189
178
const uint32_t two_w = w + w ;
@@ -206,7 +195,7 @@ float ggml_fp16_to_fp32(ggml_fp16_t h) {
206
195
return fp32_from_bits (result );
207
196
}
208
197
209
- ggml_fp16_t ggml_fp32_to_fp16 (float f ) {
198
+ static inline ggml_fp16_t ggml_compute_fp32_to_fp16 (float f ) {
210
199
#if defined(__STDC_VERSION__ ) && (__STDC_VERSION__ >= 199901L ) || defined(__GNUC__ ) && !defined(__STRICT_ANSI__ )
211
200
const float scale_to_inf = 0x1.0p+112f ;
212
201
const float scale_to_zero = 0x1.0p-110f ;
@@ -232,8 +221,8 @@ ggml_fp16_t ggml_fp32_to_fp16(float f) {
232
221
return (sign >> 16 ) | (shl1_w > UINT32_C (0xFF000000 ) ? UINT16_C (0x7E00 ) : nonsign );
233
222
}
234
223
235
- #define GGML_FP16_TO_FP32 (x ) ggml_fp16_to_fp32 (x)
236
- #define GGML_FP32_TO_FP16 (x ) ggml_fp32_to_fp16 (x)
224
+ #define GGML_COMPUTE_FP16_TO_FP32 (x ) ggml_compute_fp16_to_fp32 (x)
225
+ #define GGML_COMPUTE_FP32_TO_FP16 (x ) ggml_compute_fp32_to_fp16 (x)
237
226
238
227
#endif // __F16C__
239
228
@@ -249,6 +238,34 @@ static ggml_fp16_t table_gelu_f16[1 << 16];
249
238
// precomputed exp table for f16 (128 KB)
250
239
static ggml_fp16_t table_exp_f16 [1 << 16 ];
251
240
241
+ // precomputed f32 table for f16 (256 KB)
242
+ static float table_f32_f16 [1 << 16 ];
243
+
244
+ // On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
245
+ // so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
246
+ #if !defined(GGML_FP16_TO_FP32 ) || !defined(GGML_FP32_TO_FP16 )
247
+
248
+ inline static float ggml_lookup_fp16_to_fp32 (ggml_fp16_t f ) {
249
+ uint16_t s ;
250
+ memcpy (& s , & f , sizeof (uint16_t ));
251
+ return table_f32_f16 [s ];
252
+ }
253
+
254
+ #define GGML_FP16_TO_FP32 (x ) ggml_lookup_fp16_to_fp32(x)
255
+ #define GGML_FP32_TO_FP16 (x ) GGML_COMPUTE_FP32_TO_FP16(x)
256
+
257
+ #endif
258
+
259
+ // note: do not use these inside ggml.c
260
+ // these are meant to be used via the ggml.h API
261
+ float ggml_fp16_to_fp32 (ggml_fp16_t x ) {
262
+ return GGML_FP16_TO_FP32 (x );
263
+ }
264
+
265
+ ggml_fp16_t ggml_fp32_to_fp16 (float x ) {
266
+ return GGML_FP32_TO_FP16 (x );
267
+ }
268
+
252
269
//
253
270
// timing
254
271
//
@@ -692,6 +709,101 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
692
709
#define GGML_F16_VEC_MUL GGML_F16x4_MUL
693
710
#define GGML_F16_VEC_REDUCE GGML_F16x4_REDUCE
694
711
712
+ #elif defined(__SSE3__ )
713
+
714
+ #define GGML_SIMD
715
+
716
+ // F32 SSE
717
+
718
+ #define GGML_F32_STEP 32
719
+ #define GGML_F32_EPR 4
720
+
721
+ #define GGML_F32x4 __m128
722
+ #define GGML_F32x4_ZERO _mm_setzero_ps()
723
+ #define GGML_F32x4_SET1 (x ) _mm_set1_ps(x)
724
+ #define GGML_F32x4_LOAD _mm_loadu_ps
725
+ #define GGML_F32x4_STORE _mm_storeu_ps
726
+ #if defined(__FMA__ )
727
+ // TODO: Does this work?
728
+ #define GGML_F32x4_FMA (a , b , c ) _mm_fmadd_ps(b, c, a)
729
+ #else
730
+ #define GGML_F32x4_FMA (a , b , c ) _mm_add_ps(_mm_mul_ps(b, c), a)
731
+ #endif
732
+ #define GGML_F32x4_ADD _mm_add_ps
733
+ #define GGML_F32x4_MUL _mm_mul_ps
734
+ #define GGML_F32x4_REDUCE (res , x ) \
735
+ { \
736
+ for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
737
+ x[2*i] = _mm_add_ps(x[2*i], x[2*i+1]); \
738
+ } \
739
+ for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
740
+ x[4*i] = _mm_add_ps(x[4*i], x[4*i+2]); \
741
+ } \
742
+ for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
743
+ x[8*i] = _mm_add_ps(x[8*i], x[8*i+4]); \
744
+ } \
745
+ const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
746
+ res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
747
+ }
748
+ // TODO: is this optimal ?
749
+
750
+ #define GGML_F32_VEC GGML_F32x4
751
+ #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
752
+ #define GGML_F32_VEC_SET1 GGML_F32x4_SET1
753
+ #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
754
+ #define GGML_F32_VEC_STORE GGML_F32x4_STORE
755
+ #define GGML_F32_VEC_FMA GGML_F32x4_FMA
756
+ #define GGML_F32_VEC_ADD GGML_F32x4_ADD
757
+ #define GGML_F32_VEC_MUL GGML_F32x4_MUL
758
+ #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
759
+
760
+ // F16 SSE
761
+
762
+ #define GGML_F16_STEP 32
763
+ #define GGML_F16_EPR 4
764
+
765
+ static inline __m128 __sse_f16x4_load (ggml_fp16_t * x ) {
766
+ float tmp [4 ];
767
+
768
+ tmp [0 ] = GGML_FP16_TO_FP32 (x [0 ]);
769
+ tmp [1 ] = GGML_FP16_TO_FP32 (x [1 ]);
770
+ tmp [2 ] = GGML_FP16_TO_FP32 (x [2 ]);
771
+ tmp [3 ] = GGML_FP16_TO_FP32 (x [3 ]);
772
+
773
+ return _mm_loadu_ps (tmp );
774
+ }
775
+
776
+ static inline void __sse_f16x4_store (ggml_fp16_t * x , __m128 y ) {
777
+ float arr [4 ];
778
+
779
+ _mm_storeu_ps (arr , y );
780
+
781
+ x [0 ] = GGML_FP32_TO_FP16 (arr [0 ]);
782
+ x [1 ] = GGML_FP32_TO_FP16 (arr [1 ]);
783
+ x [2 ] = GGML_FP32_TO_FP16 (arr [2 ]);
784
+ x [3 ] = GGML_FP32_TO_FP16 (arr [3 ]);
785
+ }
786
+
787
+ #define GGML_F32Cx4 __m128
788
+ #define GGML_F32Cx4_ZERO _mm_setzero_ps()
789
+ #define GGML_F32Cx4_SET1 (x ) _mm_set1_ps(x)
790
+ #define GGML_F32Cx4_LOAD (x ) __sse_f16x4_load(x)
791
+ #define GGML_F32Cx4_STORE (x , y ) __sse_f16x4_store(x, y)
792
+ #define GGML_F32Cx4_FMA GGML_F32x4_FMA
793
+ #define GGML_F32Cx4_ADD _mm_add_ps
794
+ #define GGML_F32Cx4_MUL _mm_mul_ps
795
+ #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
796
+
797
+ #define GGML_F16_VEC GGML_F32Cx4
798
+ #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
799
+ #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
800
+ #define GGML_F16_VEC_LOAD (p , i ) GGML_F32Cx4_LOAD(p)
801
+ #define GGML_F16_VEC_STORE (p , r , i ) GGML_F32Cx4_STORE(p, r[i])
802
+ #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
803
+ #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
804
+ #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
805
+ #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
806
+
695
807
#endif
696
808
697
809
// GGML_F32_ARR / GGML_F16_ARR
@@ -1269,15 +1381,15 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
1269
1381
static bool is_first_call = true;
1270
1382
1271
1383
if (is_first_call ) {
1272
- // initialize GELU and EXP tables
1384
+ // initialize GELU, EXP and F32 tables
1273
1385
{
1274
1386
const uint64_t t_start = ggml_time_us (); UNUSED (t_start );
1275
1387
1276
1388
ggml_fp16_t ii ;
1277
1389
for (int i = 0 ; i < (1 << 16 ); ++ i ) {
1278
1390
uint16_t ui = i ;
1279
1391
memcpy (& ii , & ui , sizeof (ii ));
1280
- const float f = GGML_FP16_TO_FP32 (ii );
1392
+ const float f = table_f32_f16 [ i ] = GGML_COMPUTE_FP16_TO_FP32 (ii );
1281
1393
table_gelu_f16 [i ] = GGML_FP32_TO_FP16 (ggml_gelu_f32 (f ));
1282
1394
table_exp_f16 [i ] = GGML_FP32_TO_FP16 (exp (f ));
1283
1395
}
@@ -8232,6 +8344,14 @@ int ggml_cpu_has_blas(void) {
8232
8344
#endif
8233
8345
}
8234
8346
8347
// Reports whether this translation unit was compiled with SSE3 enabled.
// Returns 1 when the compiler defines __SSE3__, 0 otherwise. This is a
// compile-time property of the build, not a runtime CPU probe.
int ggml_cpu_has_sse3(void) {
    int enabled = 0;
#if defined(__SSE3__)
    enabled = 1;
#endif
    return enabled;
}
8354
+
8235
8355
int ggml_cpu_has_vsx (void ) {
8236
8356
#if defined(__POWER9_VECTOR__ )
8237
8357
return 1 ;
0 commit comments