@@ -31,16 +31,27 @@ where
31
31
let exp_max: i32 = F :: EXP_BIAS as i32 ;
32
32
let exp_min = -( exp_max - 1 ) ;
33
33
34
- // 2 ^ Emax, where Emax is the maximum biased exponent value (1023 for f64)
34
+ // 2 ^ Emax, maximum positive with null significand (0x1p1023 for f64)
35
35
let f_exp_max = F :: from_parts ( false , F :: EXP_BIAS << 1 , zero) ;
36
36
37
- // 2 ^ Emin, where Emin is the minimum biased exponent value ( -1022 for f64)
37
+ // 2 ^ Emin, minimum positive normal with null significand (0x1p -1022 for f64)
38
38
let f_exp_min = F :: from_parts ( false , 1 , zero) ;
39
39
40
- // 2 ^ sig_total_bits, representation of what can be accounted for with subnormals
41
- let f_exp_subnorm = F :: from_parts ( false , sig_total_bits + F :: EXP_BIAS , zero) ;
40
+ // 2 ^ sig_total_bits, moltiplier to normalize subnormals (0x1p53 for f64)
41
+ let f_pow_subnorm = F :: from_parts ( false , sig_total_bits + F :: EXP_BIAS , zero) ;
42
+
43
+ /*
44
+ * The goal is to multiply `x` by a scale factor that applies `n`. However, there are cases
45
+ * where `2^n` is not representable by `F` but the result should be, e.g. `x = 2^Emin` with
46
+ * `n = -EMin + 2` (one out of range of 2^Emax). To get around this, reduce the magnitude of
47
+ * the final scale operation by prescaling by the max/min power representable by `F`.
48
+ */
42
49
43
50
if n > exp_max {
51
+ // Worse case positive `n`: `x` is the minimum subnormal value, the result is `F::MAX`.
52
+ // This can be reached by three scaling multiplications (two here and one final).
53
+ debug_assert ! ( -exp_min + F :: SIG_BITS as i32 + exp_max <= exp_max * 3 ) ;
54
+
44
55
x *= f_exp_max;
45
56
n -= exp_max;
46
57
if n > exp_max {
@@ -51,21 +62,61 @@ where
51
62
}
52
63
}
53
64
} else if n < exp_min {
54
- let mul = f_exp_min * f_exp_subnorm;
55
- let add = ( exp_max - 1 ) - sig_total_bits as i32 ;
65
+ // When scaling toward 0, the prescaling is limited to a value that does not allow `x` to
66
+ // go subnormal. This avoids double rounding.
67
+ if F :: BITS > 16 {
68
+ // `mul` s.t. `!(x * mul).is_subnormal() ∀ x`
69
+ let mul = f_exp_min * f_pow_subnorm;
70
+ let add = -exp_min - sig_total_bits as i32 ;
71
+
72
+ // Worse case negative `n`: `x` is the maximum positive value, the result is `F::MIN`.
73
+ // This must be reachable by three scaling multiplications (two here and one final).
74
+ debug_assert ! ( -exp_min + F :: SIG_BITS as i32 + exp_max <= add * 2 + -exp_min) ;
56
75
57
- x *= mul;
58
- n += add;
59
- if n < exp_min {
60
76
x *= mul;
61
77
n += add;
78
+
62
79
if n < exp_min {
63
- n = exp_min;
80
+ x *= mul;
81
+ n += add;
82
+
83
+ if n < exp_min {
84
+ n = exp_min;
85
+ }
86
+ }
87
+ } else {
88
+ // `f16` is unique compared to other float types in that the difference between the
89
+ // minimum exponent and the significand bits (`add = -exp_min - sig_total_bits`) is
90
+ // small, only three. The above method depend on decrementing `n` by `add` two times;
91
+ // for other float types this works out because `add` is a substantial fraction of
92
+ // the exponent range. For `f16`, however, 3 is relatively small compared to the
93
+ // exponent range (which is 39), so that requires ~10 prescale rounds rather than two.
94
+ //
95
+ // Work aroudn this by using a different algorithm that calculates the prescale
96
+ // dynamically based on the maximum possible value. This adds more operations per round
97
+ // since it needs to construct the scale, but works better in the general case.
98
+ let add = -( n + sig_total_bits as i32 ) . clamp ( exp_min, sig_total_bits as i32 ) ;
99
+ let mul = F :: from_parts ( false , ( F :: EXP_BIAS as i32 - add) as u32 , zero) ;
100
+
101
+ x *= mul;
102
+ n += add;
103
+
104
+ if n < exp_min {
105
+ let add = -( n + sig_total_bits as i32 ) . clamp ( exp_min, sig_total_bits as i32 ) ;
106
+ let mul = F :: from_parts ( false , ( F :: EXP_BIAS as i32 - add) as u32 , zero) ;
107
+
108
+ x *= mul;
109
+ n += add;
110
+
111
+ if n < exp_min {
112
+ n = exp_min;
113
+ }
64
114
}
65
115
}
66
116
}
67
117
68
- x * F :: from_parts ( false , ( F :: EXP_BIAS as i32 + n) as u32 , zero)
118
+ let scale = F :: from_parts ( false , ( F :: EXP_BIAS as i32 + n) as u32 , zero) ;
119
+ x * scale
69
120
}
70
121
71
122
#[ cfg( test) ]
@@ -111,6 +162,12 @@ mod tests {
111
162
assert ! ( scalbn( -F :: NAN , -10 ) . is_nan( ) ) ;
112
163
}
113
164
165
+ #[ test]
166
+ #[ cfg( f16_enabled) ]
167
+ fn spec_test_f16 ( ) {
168
+ spec_test :: < f16 > ( ) ;
169
+ }
170
+
114
171
#[ test]
115
172
fn spec_test_f32 ( ) {
116
173
spec_test :: < f32 > ( ) ;
@@ -120,4 +177,10 @@ mod tests {
120
177
fn spec_test_f64 ( ) {
121
178
spec_test :: < f64 > ( ) ;
122
179
}
180
+
181
+ #[ test]
182
+ #[ cfg( f128_enabled) ]
183
+ fn spec_test_f128 ( ) {
184
+ spec_test :: < f128 > ( ) ;
185
+ }
123
186
}
0 commit comments