Merge pull request #391 from rust-lang/tgross35/f16-f128-scalbn

tgross35 · web-flow · commit 370831d9f025 · 2025-02-05T09:00:53.000-06:00
Add `ldexpf16`, `ldexpf128`, `scalbnf16`, and `scalbnf128`
diff --git a/libm/crates/libm-macros/src/shared.rs b/libm/crates/libm-macros/src/shared.rs
@@ -134,6 +134,13 @@ const ALL_OPERATIONS_NESTED: &[(FloatTy, Signature, Option<Signature>, &[&str])]
         None,
         &["jn", "yn"],
     ),
+    (
+        // `(f16, i32) -> f16`
+        FloatTy::F16,
+        Signature { args: &[Ty::F16, Ty::I32], returns: &[Ty::F16] },
+        None,
+        &["scalbnf16", "ldexpf16"],
+    ),
     (
         // `(f32, i32) -> f32`
         FloatTy::F32,
@@ -148,6 +155,13 @@ const ALL_OPERATIONS_NESTED: &[(FloatTy, Signature, Option<Signature>, &[&str])]
         None,
         &["scalbn", "ldexp"],
     ),
+    (
+        // `(f128, i32) -> f128`
+        FloatTy::F128,
+        Signature { args: &[Ty::F128, Ty::I32], returns: &[Ty::F128] },
+        None,
+        &["scalbnf128", "ldexpf128"],
+    ),
     (
         // `(f32, &mut f32) -> f32` as `(f32) -> (f32, f32)`
         FloatTy::F32,
diff --git a/libm/crates/libm-test/benches/icount.rs b/libm/crates/libm-test/benches/icount.rs
@@ -131,6 +131,8 @@ main!(
     icount_bench_jn_group,
     icount_bench_jnf_group,
     icount_bench_ldexp_group,
+    icount_bench_ldexpf128_group,
+    icount_bench_ldexpf16_group,
     icount_bench_ldexpf_group,
     icount_bench_lgamma_group,
     icount_bench_lgamma_r_group,
@@ -163,6 +165,8 @@ main!(
     icount_bench_roundf16_group,
     icount_bench_roundf_group,
     icount_bench_scalbn_group,
+    icount_bench_scalbnf128_group,
+    icount_bench_scalbnf16_group,
     icount_bench_scalbnf_group,
     icount_bench_sin_group,
     icount_bench_sinf_group,
diff --git a/libm/crates/libm-test/benches/random.rs b/libm/crates/libm-test/benches/random.rs
@@ -133,10 +133,14 @@ libm_macros::for_each_function! {
         | fminf16
         | fmodf128
         | fmodf16
+        | ldexpf128
+        | ldexpf16
         | rintf128
         | rintf16
         | roundf128
         | roundf16
+        | scalbnf128
+        | scalbnf16
         | sqrtf128
         | sqrtf16
         | truncf128
diff --git a/libm/crates/libm-test/src/mpfloat.rs b/libm/crates/libm-test/src/mpfloat.rs
@@ -159,6 +159,8 @@ libm_macros::for_each_function! {
         jnf,
         ldexp,
         ldexpf,
+        ldexpf128,
+        ldexpf16,
         lgamma_r,
         lgammaf_r,
         modf,
@@ -178,6 +180,8 @@ libm_macros::for_each_function! {
         roundf16,
         scalbn,
         scalbnf,
+        scalbnf128,
+        scalbnf16,
         sincos,sincosf,
         trunc,
         truncf,
@@ -351,34 +355,6 @@ macro_rules! impl_op_for_ty {
                 }
             }
 
-            // `ldexp` and `scalbn` are the same for binary floating point, so just forward all
-            // methods.
-            impl MpOp for crate::op::[<ldexp $suffix>]::Routine {
-                type MpTy = <crate::op::[<scalbn $suffix>]::Routine as MpOp>::MpTy;
-
-                fn new_mp() -> Self::MpTy {
-                    <crate::op::[<scalbn $suffix>]::Routine as MpOp>::new_mp()
-                }
-
-                fn run(this: &mut Self::MpTy, input: Self::RustArgs) -> Self::RustRet {
-                    <crate::op::[<scalbn $suffix>]::Routine as MpOp>::run(this, input)
-                }
-            }
-
-            impl MpOp for crate::op::[<scalbn $suffix>]::Routine {
-                type MpTy = MpFloat;
-
-                fn new_mp() -> Self::MpTy {
-                    new_mpfloat::<Self::FTy>()
-                }
-
-                fn run(this: &mut Self::MpTy, input: Self::RustArgs) -> Self::RustRet {
-                    this.assign(input.0);
-                    *this <<= input.1;
-                    prep_retval::<Self::FTy>(this, Ordering::Equal)
-                }
-            }
-
             impl MpOp for crate::op::[<sincos $suffix>]::Routine {
                 type MpTy = (MpFloat, MpFloat);
 
@@ -464,6 +440,35 @@ macro_rules! impl_op_for_ty_all {
                     this.1.assign(input.1);
                     let ord = this.0.rem_assign_round(&this.1, Nearest);
                     prep_retval::<Self::RustRet>(&mut this.0, ord)
+
+                }
+            }
+
+            // `ldexp` and `scalbn` are the same for binary floating point, so just forward all
+            // methods.
+            impl MpOp for crate::op::[<ldexp $suffix>]::Routine {
+                type MpTy = <crate::op::[<scalbn $suffix>]::Routine as MpOp>::MpTy;
+
+                fn new_mp() -> Self::MpTy {
+                    <crate::op::[<scalbn $suffix>]::Routine as MpOp>::new_mp()
+                }
+
+                fn run(this: &mut Self::MpTy, input: Self::RustArgs) -> Self::RustRet {
+                    <crate::op::[<scalbn $suffix>]::Routine as MpOp>::run(this, input)
+                }
+            }
+
+            impl MpOp for crate::op::[<scalbn $suffix>]::Routine {
+                type MpTy = MpFloat;
+
+                fn new_mp() -> Self::MpTy {
+                    new_mpfloat::<Self::FTy>()
+                }
+
+                fn run(this: &mut Self::MpTy, input: Self::RustArgs) -> Self::RustRet {
+                    this.assign(input.0);
+                    *this <<= input.1;
+                    prep_retval::<Self::FTy>(this, Ordering::Equal)
                 }
             }
         }
diff --git a/libm/crates/libm-test/src/precision.rs b/libm/crates/libm-test/src/precision.rs
@@ -551,8 +551,12 @@ fn int_float_common<F1: Float, F2: Float>(
     DEFAULT
 }
 
+#[cfg(f16_enabled)]
+impl MaybeOverride<(f16, i32)> for SpecialCase {}
 impl MaybeOverride<(f32, i32)> for SpecialCase {}
 impl MaybeOverride<(f64, i32)> for SpecialCase {}
+#[cfg(f128_enabled)]
+impl MaybeOverride<(f128, i32)> for SpecialCase {}
 
 impl MaybeOverride<(f32, f32, f32)> for SpecialCase {
     fn check_float<F: Float>(
diff --git a/libm/crates/libm-test/tests/compare_built_musl.rs b/libm/crates/libm-test/tests/compare_built_musl.rs
@@ -95,10 +95,14 @@ libm_macros::for_each_function! {
         fminf16,
         fmodf128,
         fmodf16,
+        ldexpf128,
+        ldexpf16,
         rintf128,
         rintf16,
         roundf128,
         roundf16,
+        scalbnf128,
+        scalbnf16,
         sqrtf128,
         sqrtf16,
         truncf128,
diff --git a/libm/crates/util/src/main.rs b/libm/crates/util/src/main.rs
@@ -102,10 +102,14 @@ fn do_eval(basis: &str, op: &str, inputs: &[&str]) {
             | fminf16
             | fmodf128
             | fmodf16
+            | ldexpf128
+            | ldexpf16
             | rintf128
             | rintf16
             | roundf128
             | roundf16
+            | scalbnf128
+            | scalbnf16
             | sqrtf128
             | sqrtf16
             | truncf128
diff --git a/libm/etc/function-definitions.json b/libm/etc/function-definitions.json
@@ -554,6 +554,18 @@
         ],
         "type": "f32"
     },
+    "ldexpf128": {
+        "sources": [
+            "src/math/ldexpf128.rs"
+        ],
+        "type": "f128"
+    },
+    "ldexpf16": {
+        "sources": [
+            "src/math/ldexpf16.rs"
+        ],
+        "type": "f16"
+    },
     "lgamma": {
         "sources": [
             "src/libm_helper.rs",
@@ -774,6 +786,20 @@
         ],
         "type": "f32"
     },
+    "scalbnf128": {
+        "sources": [
+            "src/math/generic/scalbn.rs",
+            "src/math/scalbnf128.rs"
+        ],
+        "type": "f128"
+    },
+    "scalbnf16": {
+        "sources": [
+            "src/math/generic/scalbn.rs",
+            "src/math/scalbnf16.rs"
+        ],
+        "type": "f16"
+    },
     "sin": {
         "sources": [
             "src/libm_helper.rs",
diff --git a/libm/etc/function-list.txt b/libm/etc/function-list.txt
@@ -79,6 +79,8 @@ jn
 jnf
 ldexp
 ldexpf
+ldexpf128
+ldexpf16
 lgamma
 lgamma_r
 lgammaf
@@ -111,6 +113,8 @@ roundf128
 roundf16
 scalbn
 scalbnf
+scalbnf128
+scalbnf16
 sin
 sincos
 sincosf
diff --git a/libm/src/math/generic/scalbn.rs b/libm/src/math/generic/scalbn.rs
@@ -31,16 +31,27 @@ where
     let exp_max: i32 = F::EXP_BIAS as i32;
     let exp_min = -(exp_max - 1);
 
-    // 2 ^ Emax, where Emax is the maximum biased exponent value (1023 for f64)
+    // 2 ^ Emax, maximum positive with null significand (0x1p1023 for f64)
     let f_exp_max = F::from_parts(false, F::EXP_BIAS << 1, zero);
 
-    // 2 ^ Emin, where Emin is the minimum biased exponent value (-1022 for f64)
+    // 2 ^ Emin, minimum positive normal with null significand (0x1p-1022 for f64)
     let f_exp_min = F::from_parts(false, 1, zero);
 
-    // 2 ^ sig_total_bits, representation of what can be accounted for with subnormals
-    let f_exp_subnorm = F::from_parts(false, sig_total_bits + F::EXP_BIAS, zero);
+    // 2 ^ sig_total_bits, moltiplier to normalize subnormals (0x1p53 for f64)
+    let f_pow_subnorm = F::from_parts(false, sig_total_bits + F::EXP_BIAS, zero);
+
+    /*
+     * The goal is to multiply `x` by a scale factor that applies `n`. However, there are cases
+     * where `2^n` is not representable by `F` but the result should be, e.g. `x = 2^Emin` with
+     * `n = -EMin + 2` (one out of range of 2^Emax). To get around this, reduce the magnitude of
+     * the final scale operation by prescaling by the max/min power representable by `F`.
+     */
 
     if n > exp_max {
+        // Worse case positive `n`: `x`  is the minimum subnormal value, the result is `F::MAX`.
+        // This can be reached by three scaling multiplications (two here and one final).
+        debug_assert!(-exp_min + F::SIG_BITS as i32 + exp_max <= exp_max * 3);
+
         x *= f_exp_max;
         n -= exp_max;
         if n > exp_max {
@@ -51,21 +62,61 @@ where
             }
         }
     } else if n < exp_min {
-        let mul = f_exp_min * f_exp_subnorm;
-        let add = (exp_max - 1) - sig_total_bits as i32;
+        // When scaling toward 0, the prescaling is limited to a value that does not allow `x` to
+        // go subnormal. This avoids double rounding.
+        if F::BITS > 16 {
+            // `mul` s.t. `!(x * mul).is_subnormal() ∀ x`
+            let mul = f_exp_min * f_pow_subnorm;
+            let add = -exp_min - sig_total_bits as i32;
+
+            // Worse case negative `n`: `x`  is the maximum positive value, the result is `F::MIN`.
+            // This must be reachable by three scaling multiplications (two here and one final).
+            debug_assert!(-exp_min + F::SIG_BITS as i32 + exp_max <= add * 2 + -exp_min);
 
-        x *= mul;
-        n += add;
-        if n < exp_min {
             x *= mul;
             n += add;
+
             if n < exp_min {
-                n = exp_min;
+                x *= mul;
+                n += add;
+
+                if n < exp_min {
+                    n = exp_min;
+                }
+            }
+        } else {
+            // `f16` is unique compared to other float types in that the difference between the
+            // minimum exponent and the significand bits (`add = -exp_min - sig_total_bits`) is
+            // small, only three. The above method depend on decrementing `n` by `add` two times;
+            // for other float types this works out because `add` is a substantial fraction of
+            // the exponent range. For `f16`, however, 3 is relatively small compared to the
+            // exponent range (which is 39), so that requires ~10 prescale rounds rather than two.
+            //
+            // Work aroudn this by using a different algorithm that calculates the prescale
+            // dynamically based on the maximum possible value. This adds more operations per round
+            // since it needs to construct the scale, but works better in the general case.
+            let add = -(n + sig_total_bits as i32).clamp(exp_min, sig_total_bits as i32);
+            let mul = F::from_parts(false, (F::EXP_BIAS as i32 - add) as u32, zero);
+
+            x *= mul;
+            n += add;
+
+            if n < exp_min {
+                let add = -(n + sig_total_bits as i32).clamp(exp_min, sig_total_bits as i32);
+                let mul = F::from_parts(false, (F::EXP_BIAS as i32 - add) as u32, zero);
+
+                x *= mul;
+                n += add;
+
+                if n < exp_min {
+                    n = exp_min;
+                }
             }
         }
     }
 
-    x * F::from_parts(false, (F::EXP_BIAS as i32 + n) as u32, zero)
+    let scale = F::from_parts(false, (F::EXP_BIAS as i32 + n) as u32, zero);
+    x * scale
 }
 
 #[cfg(test)]
@@ -111,6 +162,12 @@ mod tests {
         assert!(scalbn(-F::NAN, -10).is_nan());
     }
 
+    #[test]
+    #[cfg(f16_enabled)]
+    fn spec_test_f16() {
+        spec_test::<f16>();
+    }
+
     #[test]
     fn spec_test_f32() {
         spec_test::<f32>();
@@ -120,4 +177,10 @@ mod tests {
     fn spec_test_f64() {
         spec_test::<f64>();
     }
+
+    #[test]
+    #[cfg(f128_enabled)]
+    fn spec_test_f128() {
+        spec_test::<f128>();
+    }
 }
diff --git a/libm/src/math/ldexpf128.rs b/libm/src/math/ldexpf128.rs
@@ -0,0 +1,4 @@
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn ldexpf128(x: f128, n: i32) -> f128 {
+    super::scalbnf128(x, n)
+}
diff --git a/libm/src/math/ldexpf16.rs b/libm/src/math/ldexpf16.rs
@@ -0,0 +1,4 @@
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn ldexpf16(x: f16, n: i32) -> f16 {
+    super::scalbnf16(x, n)
+}
diff --git a/libm/src/math/mod.rs b/libm/src/math/mod.rs
diff --git a/libm/src/math/scalbnf128.rs b/libm/src/math/scalbnf128.rs
diff --git a/libm/src/math/scalbnf16.rs b/libm/src/math/scalbnf16.rs
diff --git a/libm/src/math/support/hex_float.rs b/libm/src/math/support/hex_float.rs