@@ -20,41 +20,52 @@ namespace fputil {
20
20
21
21
#define DEFAULT_DOUBLE_SPLIT 27
22
22
23
- using DoubleDouble = LIBC_NAMESPACE::NumberPair<double >;
23
+ template <typename T> struct DefaultSplit ;
24
+ template <> struct DefaultSplit <float > {
25
+ static constexpr size_t VALUE = 12 ;
26
+ };
27
+ template <> struct DefaultSplit <double > {
28
+ static constexpr size_t VALUE = DEFAULT_DOUBLE_SPLIT;
29
+ };
30
+
31
+ using DoubleDouble = NumberPair<double >;
32
+ using FloatFloat = NumberPair<float >;
24
33
25
34
// The output of Dekker's FastTwoSum algorithm is correct, i.e.:
26
35
// r.hi + r.lo = a + b exactly
27
36
// and |r.lo| < eps(r.lo)
28
37
// Assumption: |a| >= |b|, or a = 0.
29
- template <bool FAST2SUM = true >
30
- LIBC_INLINE constexpr DoubleDouble exact_add (double a, double b) {
31
- DoubleDouble r{0.0 , 0.0 };
38
+ template <bool FAST2SUM = true , typename T = double >
39
+ LIBC_INLINE constexpr NumberPair<T> exact_add (T a, T b) {
40
+ NumberPair<T> r{0.0 , 0.0 };
32
41
if constexpr (FAST2SUM) {
33
42
r.hi = a + b;
34
- double t = r.hi - a;
43
+ T t = r.hi - a;
35
44
r.lo = b - t;
36
45
} else {
37
46
r.hi = a + b;
38
- double t1 = r.hi - a;
39
- double t2 = r.hi - t1;
40
- double t3 = b - t1;
41
- double t4 = a - t2;
47
+ T t1 = r.hi - a;
48
+ T t2 = r.hi - t1;
49
+ T t3 = b - t1;
50
+ T t4 = a - t2;
42
51
r.lo = t3 + t4;
43
52
}
44
53
return r;
45
54
}
46
55
47
56
// Assumption: |a.hi| >= |b.hi|
48
- LIBC_INLINE constexpr DoubleDouble add (const DoubleDouble &a,
49
- const DoubleDouble &b) {
50
- DoubleDouble r = exact_add (a.hi , b.hi );
51
- double lo = a.lo + b.lo ;
57
+ template <typename T>
58
+ LIBC_INLINE constexpr NumberPair<T> add (const NumberPair<T> &a,
59
+ const NumberPair<T> &b) {
60
+ NumberPair<T> r = exact_add (a.hi , b.hi );
61
+ T lo = a.lo + b.lo ;
52
62
return exact_add (r.hi , r.lo + lo);
53
63
}
54
64
55
65
// Assumption: |a.hi| >= |b|
56
- LIBC_INLINE constexpr DoubleDouble add (const DoubleDouble &a, double b) {
57
- DoubleDouble r = exact_add<false >(a.hi , b);
66
+ template <typename T>
67
+ LIBC_INLINE constexpr NumberPair<T> add (const NumberPair<T> &a, T b) {
68
+ NumberPair<T> r = exact_add<false >(a.hi , b);
58
69
return exact_add (r.hi , r.lo + a.lo );
59
70
}
60
71
@@ -63,12 +74,12 @@ LIBC_INLINE constexpr DoubleDouble add(const DoubleDouble &a, double b) {
63
74
// Zimmermann, P., "Note on the Veltkamp/Dekker Algorithms with Directed
64
75
// Roundings," https://inria.hal.science/hal-04480440.
65
76
// Default splitting constant = 2^ceil(prec(double)/2) + 1 = 2^27 + 1.
66
- template <size_t N = DEFAULT_DOUBLE_SPLIT >
67
- LIBC_INLINE constexpr DoubleDouble split (double a) {
68
- DoubleDouble r{0.0 , 0.0 };
77
+ template <typename T = double , size_t N = DefaultSplit<T>::VALUE >
78
+ LIBC_INLINE constexpr NumberPair<T> split (T a) {
79
+ NumberPair<T> r{0.0 , 0.0 };
69
80
// CN = 2^N.
70
- constexpr double CN = static_cast <double >(1 << N);
71
- constexpr double C = CN + 1.0 ;
81
+ constexpr T CN = static_cast <T >(1 << N);
82
+ constexpr T C = CN + 1.0 ;
72
83
double t1 = C * a;
73
84
double t2 = a - t1;
74
85
r.hi = t1 + t2;
@@ -77,16 +88,15 @@ LIBC_INLINE constexpr DoubleDouble split(double a) {
77
88
}
78
89
79
90
// Helper for non-fma exact mult where the first number is already split.
80
- template <size_t SPLIT_B = DEFAULT_DOUBLE_SPLIT>
81
- LIBC_INLINE DoubleDouble exact_mult (const DoubleDouble &as, double a,
82
- double b) {
83
- DoubleDouble bs = split<SPLIT_B>(b);
84
- DoubleDouble r{0.0 , 0.0 };
91
+ template <typename T = double , size_t SPLIT_B = DefaultSplit<T>::VALUE>
92
+ LIBC_INLINE NumberPair<T> exact_mult (const NumberPair<T> &as, T a, T b) {
93
+ NumberPair<T> bs = split<T, SPLIT_B>(b);
94
+ NumberPair<T> r{0.0 , 0.0 };
85
95
86
96
r.hi = a * b;
87
- double t1 = as.hi * bs.hi - r.hi ;
88
- double t2 = as.hi * bs.lo + t1;
89
- double t3 = as.lo * bs.hi + t2;
97
+ T t1 = as.hi * bs.hi - r.hi ;
98
+ T t2 = as.hi * bs.lo + t1;
99
+ T t3 = as.lo * bs.hi + t2;
90
100
r.lo = as.lo * bs.lo + t3;
91
101
92
102
return r;
@@ -99,18 +109,18 @@ LIBC_INLINE DoubleDouble exact_mult(const DoubleDouble &as, double a,
99
109
// Using Theorem 1 in the paper above, without FMA instruction, if we restrict
100
110
// the generated constants to precision <= 51, and splitting it by 2^28 + 1,
101
111
// then a * b = r.hi + r.lo is exact for all rounding modes.
102
- template <size_t SPLIT_B = 27 >
103
- LIBC_INLINE DoubleDouble exact_mult (double a, double b) {
104
- DoubleDouble r{0.0 , 0.0 };
112
+ template <typename T = double , size_t SPLIT_B = DefaultSplit<T>::VALUE >
113
+ LIBC_INLINE NumberPair<T> exact_mult (T a, T b) {
114
+ NumberPair<T> r{0.0 , 0.0 };
105
115
106
116
#ifdef LIBC_TARGET_CPU_HAS_FMA
107
117
r.hi = a * b;
108
118
r.lo = fputil::multiply_add (a, b, -r.hi );
109
119
#else
110
120
// Dekker's Product.
111
- DoubleDouble as = split (a);
121
+ NumberPair<T> as = split (a);
112
122
113
- r = exact_mult<SPLIT_B>(as, a, b);
123
+ r = exact_mult<T, SPLIT_B>(as, a, b);
114
124
#endif // LIBC_TARGET_CPU_HAS_FMA
115
125
116
126
return r;
@@ -125,7 +135,7 @@ LIBC_INLINE DoubleDouble quick_mult(double a, const DoubleDouble &b) {
125
135
template <size_t SPLIT_B = 27 >
126
136
LIBC_INLINE DoubleDouble quick_mult (const DoubleDouble &a,
127
137
const DoubleDouble &b) {
128
- DoubleDouble r = exact_mult<SPLIT_B>(a.hi , b.hi );
138
+ DoubleDouble r = exact_mult<double , SPLIT_B>(a.hi , b.hi );
129
139
double t1 = multiply_add (a.hi , b.lo , r.lo );
130
140
double t2 = multiply_add (a.lo , b.hi , t1);
131
141
r.lo = t2;
@@ -157,19 +167,20 @@ LIBC_INLINE DoubleDouble multiply_add<DoubleDouble>(const DoubleDouble &a,
157
167
// rl = q * (ah - bh * rh) + q * (al - bl * rh)
158
168
// as accurate as possible, then the error is bounded by:
159
169
// |(ah + al) / (bh + bl) - (rh + rl)| < O(bl/bh) * (2^-52 + al/ah + bl/bh)
160
- LIBC_INLINE DoubleDouble div (const DoubleDouble &a, const DoubleDouble &b) {
161
- DoubleDouble r;
162
- double q = 1.0 / b.hi ;
170
+ template <typename T>
171
+ LIBC_INLINE NumberPair<T> div (const NumberPair<T> &a, const NumberPair<T> &b) {
172
+ NumberPair<T> r;
173
+ T q = T (1 ) / b.hi ;
163
174
r.hi = a.hi * q;
164
175
165
176
#ifdef LIBC_TARGET_CPU_HAS_FMA
166
- double e_hi = fputil::multiply_add (b.hi , -r.hi , a.hi );
167
- double e_lo = fputil::multiply_add (b.lo , -r.hi , a.lo );
177
+ T e_hi = fputil::multiply_add (b.hi , -r.hi , a.hi );
178
+ T e_lo = fputil::multiply_add (b.lo , -r.hi , a.lo );
168
179
#else
169
- DoubleDouble b_hi_r_hi = fputil::exact_mult (b.hi , -r.hi );
170
- DoubleDouble b_lo_r_hi = fputil::exact_mult (b.lo , -r.hi );
171
- double e_hi = (a.hi + b_hi_r_hi.hi ) + b_hi_r_hi.lo ;
172
- double e_lo = (a.lo + b_lo_r_hi.hi ) + b_lo_r_hi.lo ;
180
+ NumberPair<T> b_hi_r_hi = fputil::exact_mult (b.hi , -r.hi );
181
+ NumberPair<T> b_lo_r_hi = fputil::exact_mult (b.lo , -r.hi );
182
+ T e_hi = (a.hi + b_hi_r_hi.hi ) + b_hi_r_hi.lo ;
183
+ T e_lo = (a.lo + b_lo_r_hi.hi ) + b_lo_r_hi.lo ;
173
184
#endif // LIBC_TARGET_CPU_HAS_FMA
174
185
175
186
r.lo = q * (e_hi + e_lo);
0 commit comments