Skip to content

Commit 9f8bc0f

Browse files
committed
minor corrections
1 parent 9221479 commit 9f8bc0f

6 files changed

+36
-26
lines changed

simd_test.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -7411,7 +7411,7 @@ int main(int argc, char **argv)
74117411

74127412
l2_errd(inoutd_ref, inoutd2, len);
74137413
/*for(int i = 0; i < len; i++)
7414-
printf("%f %f %f\n",inoutd[i], inoutd_ref[i], inoutd2[i]);*/
7414+
printf("%f %f %f\n",inoutd[i], inoutd_ref[i], inoutd2[i]);*/
74157415
#endif
74167416

74177417
#ifdef AVX

simd_utils.h

+7-8
Original file line numberDiff line numberDiff line change
@@ -408,12 +408,11 @@ static inline v2sd _mm_cvtepi64_pd_signed_custom(v2sid x)
408408
static inline v2sid _mm_cvtpd_epi64_custom(v2sd x)
409409
{
410410
// Signed
411-
#if 0
412-
x = _mm_add_pd(x, _mm_set1_pd(0x0018000000000000));
411+
#if 1
412+
x = _mm_add_pd(x, *(v2sd *) _pd_epi64_mask);
413413
return _mm_sub_epi64(
414414
_mm_castpd_si128(x),
415-
_mm_castpd_si128(_mm_set1_pd(0x0018000000000000))
416-
);
415+
_mm_castpd_si128(*(v2sd *) _pd_epi64_mask));
417416
#else
418417
// Unsigned
419418
x = _mm_add_pd(x, *(v2sd *) _pd_PDEPI64U); //_mm_set1_pd(0x0010000000000000));
@@ -490,22 +489,22 @@ static inline __m256d _mm256_fnmadd_pd_custom(__m256d a, __m256d b, __m256d c)
490489

491490
// https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
492491
// Only works for inputs in the range: [-2^51, 2^51]
493-
/*static inline __m256i _mm256_cvtpd_epi64_custom(__m256d x)
492+
/*
 * Convert four packed doubles to four packed signed 64-bit integers
 * without a native conversion instruction.
 *
 * The bias constant stored in _pd256_epi64_mask is added to x as a double,
 * then the bit pattern of that same constant is subtracted from the result
 * as a 64-bit integer.
 * NOTE(review): the bias is presumably 2^52 + 2^51 (0x0018000000000000 as a
 * number), as in the SSE variant - confirm against the constant's definition.
 * Only valid for inputs in the range [-2^51, 2^51].
 */
static inline __m256i _mm256_cvtpd_epi64_custom(__m256d x)
{
    const __m256d bias = *(v4sd *) _pd256_epi64_mask;
    const __m256i biased_bits = _mm256_castpd_si256(_mm256_add_pd(x, bias));

    return _mm256_sub_epi64(biased_bits, _mm256_castpd_si256(bias));
}
499+
/*
501500
static inline v4sid _mm256_cvtpd_epi64_custom(v4sd x)
502501
{
503502
x = _mm256_add_pd(x, *(v4sd *) _pd256_PDEPI64U);
504503
return _mm256_xor_si256(
505504
_mm256_castpd_si256(x),
506505
_mm256_castpd_si256(*(v4sd *) _pd256_PDEPI64U));
507506
}
508-
507+
*/
509508
static inline v4sd _mm256_cvtepi64_pd_custom(v4sid x)
510509
{
511510
x = _mm256_or_si256(x, _mm256_castpd_si256(*(v4sd *) _pd256_PDEPI64U));

simd_utils_avx512_double.h

+8-8
Original file line numberDiff line numberDiff line change
@@ -1211,7 +1211,7 @@ v8sd log512_pd(v8sd x)
12111211
* where z = 2(x-1)/(x+1)
12121212
*/
12131213
v8sd abse = _mm512_and_pd(e, *(v8sd *) _pd512_pos_sign_mask);
1214-
__mmask8 abseinf2 = _mm512_cmp_pd_mask(abse, *(v8sd *) _pd512_2, _CMP_LT_OS);// FF if < 2
1214+
__mmask8 abseinf2 = _mm512_cmp_pd_mask(abse, *(v8sd *) _pd512_2, _CMP_LT_OS); // FF if < 2
12151215
__mmask8 xinfsqrth = _mm512_cmp_pd_mask(x, *(v8sd *) _pd512_cephes_SQRTHF, _CMP_LT_OS);
12161216

12171217
e = _mm512_mask_blend_pd(xinfsqrth, e, _mm512_sub_pd(e, *(v8sd *) _pd512_1)); // if( x < SQRTH ) e-=1
@@ -1220,15 +1220,15 @@ v8sd log512_pd(v8sd x)
12201220

12211221
// if(x < SQRTH) z_abseinf2 = (x-0.5), else x-1
12221222
tmp_abseinf2 = _mm512_sub_pd(x, *(v8sd *) _pd512_1);
1223-
tmp2_abseinf2 = _mm512_sub_pd(x, *(v8sd *) _pd512_0p5);
1223+
tmp2_abseinf2 = _mm512_sub_pd(x, *(v8sd *) _pd512_0p5);
12241224
z_abseinf2 = _mm512_mask_blend_pd(xinfsqrth, tmp_abseinf2, tmp2_abseinf2);
12251225

12261226
tmp_abseinf2 = _mm512_fmadd_pd(z_abseinf2, *(v8sd *) _pd512_0p5, *(v8sd *) _pd512_0p5);
12271227
tmp2_abseinf2 = _mm512_fmadd_pd(x, *(v8sd *) _pd512_0p5, *(v8sd *) _pd512_0p5);
12281228

12291229
// if(x < SQRTH) y_abseinf2 = z*0.5 + 0.5, else = x*0.5 + 0.5
12301230
y_abseinf2 = _mm512_mask_blend_pd(xinfsqrth, tmp2_abseinf2, tmp_abseinf2);
1231-
1231+
12321232
x_abseinf2 = _mm512_div_pd(z_abseinf2, y_abseinf2); // x = z / y;
12331233
z_abseinf2 = _mm512_mul_pd(x_abseinf2, x_abseinf2); // z = x*x;
12341234

@@ -1245,12 +1245,12 @@ v8sd log512_pd(v8sd x)
12451245
// convert e to double
12461246
// y = e
12471247
z_abseinf2 = _mm512_fmadd_pd(e, *(v8sd *) _pd512_min_212emin4, z_abseinf2); // z = z - y * 2.121944400546905827679e-4;
1248-
z_abseinf2 = _mm512_add_pd(z_abseinf2, x_abseinf2); // z = z + x;
1248+
z_abseinf2 = _mm512_add_pd(z_abseinf2, x_abseinf2); // z = z + x;
12491249

12501250
/* logarithm using log(1+x) = x - .5x**2 + x**3 P(x)/Q(x) */
12511251
v8sd tmp3, tmp4;
12521252
tmp3 = _mm512_fmadd_pd(x, *(v8sd *) _pd512_2, *(v8sd *) _pd512_min1); // x = 2.0*x - 1.0; /* 2x - 1 */
1253-
tmp4 = _mm512_sub_pd(x, *(v8sd *) _pd512_1); // x = x - 1.0;
1253+
tmp4 = _mm512_sub_pd(x, *(v8sd *) _pd512_1); // x = x - 1.0;
12541254
x = _mm512_mask_blend_pd(xinfsqrth, tmp4, tmp3);
12551255

12561256
/* rational form */
@@ -1274,11 +1274,11 @@ v8sd log512_pd(v8sd x)
12741274
// if( e) => no need, if e==0 it still works
12751275
z = _mm512_fmadd_pd(e, *(v8sd *) _pd512_min_212emin4, z); // z = z - e * 2.121944400546905827679e-4;
12761276
y = _mm512_fmadd_pd(z, *(v8sd *) _pd512_min0p5, y); // y = y - 0.5*z;
1277-
z = _mm512_add_pd(x, y); // z = x + y;
1277+
z = _mm512_add_pd(x, y); // z = x + y;
12781278
// if( e) => no need, if e==0 it still works
12791279

1280-
z = _mm512_mask_blend_pd(abseinf2, z, z_abseinf2); // if fabs(e) < 2 z = z_abseinf2
1281-
z = _mm512_fmadd_pd(e, *(v8sd *) _pd512_0p69, z); // z + e * 0.693359375;
1280+
z = _mm512_mask_blend_pd(abseinf2, z, z_abseinf2); // if fabs(e) < 2 z = z_abseinf2
1281+
z = _mm512_fmadd_pd(e, *(v8sd *) _pd512_0p69, z); // z + e * 0.693359375;
12821282

12831283
return (z);
12841284
}

simd_utils_avx_double.h

100755100644
+7-7
Original file line numberDiff line numberDiff line change
@@ -1119,7 +1119,7 @@ v4sd log256_pd(v4sd x)
11191119
* where z = 2(x-1)/(x+1)
11201120
*/
11211121
v4sd abse = _mm256_and_pd(e, *(v4sd *) _pd256_pos_sign_mask);
1122-
v4sd abseinf2 = _mm256_cmp_pd(abse, *(v4sd *) _pd256_2, _CMP_LT_OS);// FF if < 2
1122+
v4sd abseinf2 = _mm256_cmp_pd(abse, *(v4sd *) _pd256_2, _CMP_LT_OS); // FF if < 2
11231123
v4sd xinfsqrth = _mm256_cmp_pd(x, *(v4sd *) _pd256_cephes_SQRTHF, _CMP_LT_OS);
11241124

11251125
e = _mm256_blendv_pd(e, _mm256_sub_pd(e, *(v4sd *) _pd256_1), xinfsqrth); // if( x < SQRTH ) e-=1
@@ -1128,15 +1128,15 @@ v4sd log256_pd(v4sd x)
11281128

11291129
// if(x < SQRTH) z_abseinf2 = (x-0.5), else x-1
11301130
tmp_abseinf2 = _mm256_sub_pd(x, *(v4sd *) _pd256_1);
1131-
tmp2_abseinf2 = _mm256_sub_pd(x, *(v4sd *) _pd256_0p5);
1131+
tmp2_abseinf2 = _mm256_sub_pd(x, *(v4sd *) _pd256_0p5);
11321132
z_abseinf2 = _mm256_blendv_pd(tmp_abseinf2, tmp2_abseinf2, xinfsqrth);
11331133

11341134
tmp_abseinf2 = _mm256_fmadd_pd_custom(z_abseinf2, *(v4sd *) _pd256_0p5, *(v4sd *) _pd256_0p5);
11351135
tmp2_abseinf2 = _mm256_fmadd_pd_custom(x, *(v4sd *) _pd256_0p5, *(v4sd *) _pd256_0p5);
11361136

11371137
// if(x < SQRTH) y_abseinf2 = z*0.5 + 0.5, else = x*0.5 + 0.5
11381138
y_abseinf2 = _mm256_blendv_pd(tmp2_abseinf2, tmp_abseinf2, xinfsqrth);
1139-
1139+
11401140
x_abseinf2 = _mm256_div_pd(z_abseinf2, y_abseinf2); // x = z / y;
11411141
z_abseinf2 = _mm256_mul_pd(x_abseinf2, x_abseinf2); // z = x*x;
11421142

@@ -1153,12 +1153,12 @@ v4sd log256_pd(v4sd x)
11531153
// convert e to double
11541154
// y = e
11551155
z_abseinf2 = _mm256_fmadd_pd_custom(e, *(v4sd *) _pd256_min_212emin4, z_abseinf2); // z = z - y * 2.121944400546905827679e-4;
1156-
z_abseinf2 = _mm256_add_pd(z_abseinf2, x_abseinf2); // z = z + x;
1156+
z_abseinf2 = _mm256_add_pd(z_abseinf2, x_abseinf2); // z = z + x;
11571157

11581158
/* logarithm using log(1+x) = x - .5x**2 + x**3 P(x)/Q(x) */
11591159
v4sd tmp3, tmp4;
11601160
tmp3 = _mm256_fmadd_pd_custom(x, *(v4sd *) _pd256_2, *(v4sd *) _pd256_min1); // x = 2.0*x - 1.0; /* 2x - 1 */
1161-
tmp4 = _mm256_sub_pd(x, *(v4sd *) _pd256_1); // x = x - 1.0;
1161+
tmp4 = _mm256_sub_pd(x, *(v4sd *) _pd256_1); // x = x - 1.0;
11621162
x = _mm256_blendv_pd(tmp4, tmp3, xinfsqrth);
11631163

11641164
/* rational form */
@@ -1182,10 +1182,10 @@ v4sd log256_pd(v4sd x)
11821182
// if( e) => no need, if e==0 it still works
11831183
z = _mm256_fmadd_pd_custom(e, *(v4sd *) _pd256_min_212emin4, z); // z = z - e * 2.121944400546905827679e-4;
11841184
y = _mm256_fmadd_pd_custom(z, *(v4sd *) _pd256_min0p5, y); // y = y - 0.5*z;
1185-
z = _mm256_add_pd(x, y); // z = x + y;
1185+
z = _mm256_add_pd(x, y); // z = x + y;
11861186
// if( e) => no need, if e==0 it still works
11871187

1188-
z = _mm256_blendv_pd(z, z_abseinf2, abseinf2); // if fabs(e) < 2 z = z_abseinf2
1188+
z = _mm256_blendv_pd(z, z_abseinf2, abseinf2); // if fabs(e) < 2 z = z_abseinf2
11891189
z = _mm256_fmadd_pd_custom(e, *(v4sd *) _pd256_0p69, z); // z + e * 0.693359375;
11901190

11911191
return (z);

simd_utils_constants.h

+9
Original file line numberDiff line numberDiff line change
@@ -2115,6 +2115,15 @@ static inline void print2i(__m128i v)
21152115
printf("[%ld, %ld]", p[0], p[1]);
21162116
}
21172117

2118+
/*
 * Debug helper: print the two 64-bit lanes of v in hexadecimal as
 * "[lane0, lane1]", each value space-padded to a width of 16.
 * No trailing newline is emitted.
 */
static inline void print2xi(__m128i v)
{
    int64_t *p = (int64_t *) &v;
#ifndef USE_SSE2
    _mm_empty();
#endif
    /*
     * "%x" expects an unsigned int; passing an int64_t is a format/argument
     * mismatch (undefined behaviour) and usually drops the upper 32 bits.
     * Convert explicitly and use the long long length modifier instead.
     */
    printf("[%16llx, %16llx]", (unsigned long long) p[0], (unsigned long long) p[1]);
}
2126+
21182127
#endif
21192128

21202129
#ifdef AVX

simd_utils_sse_double.h

+4-2
Original file line numberDiff line numberDiff line change
@@ -1172,14 +1172,16 @@ static inline v2sd exp_pd(v2sd x)
11721172
tmp2 = _mm_sub_pd(tmp2, px);
11731173
x = _mm_div_pd(px, tmp2);
11741174
x = _mm_fmadd_pd_custom(x, *(v2sd *) _pd_2, *(v2sd *) _pd_1);
1175-
1175+
// print2(x);
1176+
// print2xi(n);
11761177
/* build 2^n */
11771178
n = _mm_add_epi64(n, *(v2sid *) _pi64_1023);
11781179
n = _mm_slli_epi64(n, 52);
11791180
v2sd pow2n = _mm_castsi128_pd(n);
11801181

11811182
/* multiply by power of 2 */
11821183
x = _mm_mul_pd(x, pow2n);
1184+
// print2(x);printf("\n");
11831185
return (x);
11841186
}
11851187

@@ -1201,7 +1203,7 @@ static inline void exp128d(double *src, double *dst, int len)
12011203
}
12021204

12031205
for (int i = stop_len; i < len; i++) {
1204-
dst[i] = log(src[i]);
1206+
dst[i] = exp(src[i]);
12051207
}
12061208
}
12071209

0 commit comments

Comments
 (0)