rust-lang · Jan 27, 2022
diff --git a/‎crates/std_float/src/lib.rs
+6-288 b/‎crates/std_float/src/lib.rs
+6-288
diff --git a/‎crates/std_float/src/libm32.rs
+111 b/‎crates/std_float/src/libm32.rs
+111
diff --git a/‎crates/std_float/src/test_libm32.rs
+160 b/‎crates/std_float/src/test_libm32.rs
+160
@@ -11,6 +11,10 @@ use core_simd::simd;
 
 use simd::{LaneCount, Simd, SupportedLaneCount};
 
+mod libm32;
+#[cfg(test)]
+mod test_libm32;
+
 #[cfg(feature = "as_crate")]
 mod experimental {
     pub trait Sealed {}
@@ -115,7 +119,9 @@ pub trait StdFloat: Sealed + Sized {
     /// Returns the floating point's fractional value, with its integer part removed.
     #[must_use = "method returns a new vector and does not mutate the original value"]
     fn fract(self) -> Self;
+}
 
+pub trait StdLibm : StdFloat {
     fn sin(self) -> Self;
 
     fn cos(self) -> Self;
@@ -143,112 +149,6 @@ where
     fn fract(self) -> Self {
         self - self.trunc()
     }
-
-    /// Calculate the sine of the angle
-    /// Note: this is hand-edited from generated scalar code.
-    /// In an ideal world, we would generate this directly by code transformation.
-    #[inline]
-    fn sin(self) -> Self {
-        #[allow(non_snake_case)]
-        let RECIP_2PI = Self::splat(0.15915494);
-
-        let scaled = self * RECIP_2PI;
-        let x = scaled - scaled.round();
-        Self::splat(-12.26885994095919635608)
-            .mul_add(x * x, Self::splat(41.21624105096574396575))
-            .mul_add(x * x, Self::splat(-76.58672703333290836700))
-            .mul_add(x * x, Self::splat(81.59746095374827019356))
-            .mul_add(x * x, Self::splat(-41.34151143437582891705))
-            .mul_add(x * x, Self::splat(6.28318452581127506328))
-            * x
-    }
-
-    fn cos(self) -> Self {
-        #[allow(non_snake_case)]
-        let RECIP_2PI = Self::splat(0.15915494);
-
-        let scaled = self * RECIP_2PI;
-        let x = scaled - scaled.round();
-        Self::splat(6.52865816174499269880)
-            .mul_add(x * x, Self::splat(-25.97327546890330396608))
-            .mul_add(x * x, Self::splat(60.17118230812820383560))
-            .mul_add(x * x, Self::splat(-85.45091743827674607508))
-            .mul_add(x * x, Self::splat(64.93918704099473042873))
-            .mul_add(x * x, Self::splat(-19.73920667935656472596))
-            .mul_add(x * x, Self::splat(1.00000000000000000000))
-    }
-
-    fn tan(self) -> Self {
-        use core::f32::consts::PI;
-        let scaled: Self = self * Self::splat(1.0 / PI);
-        let x: Self = scaled - scaled.round();
-        let recip: Self = (x * x - Self::splat(0.25)).recip();
-        let y: Self = Self::splat(0.01439730036301634345)
-            .mul_add(x * x, Self::splat(0.02101734538976238579))
-            .mul_add(x * x, Self::splat(0.05285888255895108345))
-            .mul_add(x * x, Self::splat(0.13475448281475060771))
-            .mul_add(x * x, Self::splat(0.55773663386075044866))
-            .mul_add(x * x, Self::splat(-0.78539816491781455948))
-            * x;
-        y * recip
-    }
-    
-    fn asin(self) -> Self {
-        use core::f32::consts::PI;
-        let lim: Self = Self::splat(0.9);
-        let c: Self = self.lanes_lt(Self::splat(0.0)).select(Self::splat(-PI / 2.0), Self::splat(PI / 2.0));
-        let s: Self = self.lanes_lt(Self::splat(0.0)).select(Self::splat(-1.0), Self::splat(1.0));
-        let x: Self = (self * self).lanes_lt(lim * lim).select(self, (Self::splat(1.0) - self * self).sqrt());
-        let y: Self = Self::splat(4374.97702992533695457424)
-            .mul_add(x * x, Self::splat(-13781.55764426881951685974))
-            .mul_add(x * x, Self::splat(17105.69475701115952774357))
-            .mul_add(x * x, Self::splat(-10486.64894150265898388567))
-            .mul_add(x * x, Self::splat(3231.76028705607279348342))
-            .mul_add(x * x, Self::splat(-447.56480696327035255708))
-            .mul_add(x * x, Self::splat(21.78206149264184872939))
-            .mul_add(x * x, Self::splat(0.84158415752395745675))
-            * x;
-        (self * self).lanes_lt(lim * lim).select(y, c - y * s)
-    }
-    
-    fn acos(self) -> Self {
-        use core::f32::consts::PI;
-        let lim: Self = Self::splat(0.9);
-        let c: Self = self.lanes_lt(Self::splat(0.0)).select(Self::splat(PI), Self::splat(0.0));
-        let s: Self = self.lanes_lt(Self::splat(0.0)).select(Self::splat(1.0), Self::splat(-1.0));
-        let x: Self = (self * self).lanes_lt(lim * lim).select(self, (Self::splat(1.0) - self * self).sqrt());
-        // let c: Self = select(self < 0.0, PI, 0.0);
-        // let s: Self = select(self < 0.0, 1.0, -1.0);
-        // let x: Self = select(self * self < lim * lim, self, (1.0 - self * self).sqrt());
-        let y: Self = Self::splat(4374.97702992533695457424)
-            .mul_add(x * x, Self::splat(-13781.55764426881951685974))
-            .mul_add(x * x, Self::splat(17105.69475701115952774357))
-            .mul_add(x * x, Self::splat(-10486.64894150265898388567))
-            .mul_add(x * x, Self::splat(3231.76028705607279348342))
-            .mul_add(x * x, Self::splat(-447.56480696327035255708))
-            .mul_add(x * x, Self::splat(21.78206149264184872939))
-            .mul_add(x * x, Self::splat(0.84158415752395745675))
-            * x;
-        (self * self).lanes_lt(lim * lim).select(y, c - y * s)
-    }
-    
-    fn atan(self) -> Self {
-        use core::f32::consts::PI;
-        let lim: Self = Self::splat(1.0);
-        let c: Self = self.lanes_lt(Self::splat(0.0)).select(Self::splat(-PI / 2.0), Self::splat(PI / 2.0));
-        let small = self.abs().lanes_lt(lim);
-        let x: Self = small.select(self, self.recip());
-        let y: Self = Self::splat(95.70126383842530559360)
-            .mul_add(x * x, Self::splat(424.99907022806059540464))
-            .mul_add(x * x, Self::splat(-767.48259680040570156003))
-            .mul_add(x * x, Self::splat(714.51953012224223415829))
-            .mul_add(x * x, Self::splat(-354.32654395426962592865))
-            .mul_add(x * x, Self::splat(83.96179897148539189638))
-            .mul_add(x * x, Self::splat(-6.23958170715441509270))
-            .mul_add(x * x, Self::splat(1.05498514186427524914))
-            * x;
-        small.select(y, c - y)
-    }
 }
 
 impl<const N: usize> StdFloat for Simd<f64, N>
@@ -261,36 +161,6 @@ where
     fn fract(self) -> Self {
         self - self.trunc()
     }
-
-    #[inline]
-    fn sin(self) -> Self {
-        self
-    }
-
-    #[inline]
-    fn cos(self) -> Self {
-        self
-    }
-
-    #[inline]
-    fn tan(self) -> Self {
-        self
-    }
-
-    #[inline]
-    fn asin(self) -> Self {
-        self
-    }
-
-    #[inline]
-    fn acos(self) -> Self {
-        self
-    }
-
-    #[inline]
-    fn atan(self) -> Self {
-        self
-    }
 }
 
 #[cfg(test)]
@@ -311,156 +181,4 @@ mod tests {
         let _ = x2.abs() * x2;
         let _ = x.sin();
     }
-
-    const NUM_ITER: usize = 0x10000;
-
-    macro_rules! test_range {
-        (
-                min: $min: expr,
-                max: $max: expr,
-                limit: $limit: expr,
-                scalar_fn: $scalar_fn: expr,
-                vector_fn: $vector_fn: expr,
-                scalar_type: $scalar_type: ty,
-                vector_type: $vector_type: ty,
-            ) => {{
-            let limit = <$vector_type>::splat($limit);
-            let b = (($max) - ($min)) * (1.0 / NUM_ITER as $scalar_type);
-            let a = $min;
-            let sf = $scalar_fn;
-            let vf = $vector_fn;
-            for i in (0..NUM_ITER / 4) {
-                let fi = (i * 4) as $scalar_type;
-                let x = <$vector_type>::from_array([
-                    (fi + 0.0) * b + a,
-                    (fi + 1.0) * b + a,
-                    (fi + 2.0) * b + a,
-                    (fi + 3.0) * b + a,
-                ]);
-                let yref = <$vector_type>::from_array([sf(x[0]), sf(x[1]), sf(x[2]), sf(x[3])]);
-                let y = vf(x);
-                let e = (y - yref);
-                if !(e.abs().lanes_le(limit)).all() {
-                    panic!("\nx     ={:20.16?}\ne     ={:20.16?}\nlimit ={:20.16?}\nvector={:20.16?}\nscalar={:20.16?}\nvector_fn={}", x, e, limit, y, yref, stringify!($vector_fn));
-                }
-            }
-        }};
-    }
-
-    #[test]
-    fn sin_f32() {
-        use core::f32::consts::PI;
-        let one_ulp = (2.0_f32).powi(-23);
-
-        test_range!(
-            min: -PI/4.0,
-            max: PI/4.0,
-            limit: one_ulp * 1.0,
-            scalar_fn: |x : f32| x.sin(),
-            vector_fn: |x : f32x4| x.sin(),
-            scalar_type: f32,
-            vector_type: f32x4,
-        );
-
-        test_range!(
-            min: -PI/2.0,
-            max: PI/2.0,
-            limit: one_ulp * 2.0,
-            scalar_fn: |x : f32| x.sin(),
-            vector_fn: |x : f32x4| x.sin(),
-            scalar_type: f32,
-            vector_type: f32x4,
-        );
-
-        test_range!(
-            min: -PI,
-            max: PI,
-            limit: one_ulp * 8.0,
-            scalar_fn: |x : f32| x.sin(),
-            vector_fn: |x : f32x4| x.sin(),
-            scalar_type: f32,
-            vector_type: f32x4,
-        );
-    }
-
-    #[test]
-    fn cos_f32() {
-        use core::f32::consts::PI;
-        let one_ulp = (2.0_f32).powi(-23);
-
-        // In the range +/- pi/4 the input has 1 ulp of error.
-        test_range!(
-            min: -PI/4.0,
-            max: PI/4.0,
-            limit: one_ulp * 1.0,
-            scalar_fn: |x : f32| x.cos(),
-            vector_fn: |x : f32x4| x.cos(),
-            scalar_type: f32,
-            vector_type: f32x4,
-        );
-
-        // In the range +/- pi/2 the input and output has 2 ulp of error.
-        test_range!(
-            min: -PI/2.0,
-            max: PI/2.0,
-            limit: one_ulp * 2.0,
-            scalar_fn: |x : f32| x.cos(),
-            vector_fn: |x : f32x4| x.cos(),
-            scalar_type: f32,
-            vector_type: f32x4,
-        );
-
-        // In the range +/- pi the input has 4 ulp of error and the output has 5.
-        // Note that the scalar cos also has this error but the implementation
-        // is different.
-        test_range!(
-            min: -PI,
-            max: PI,
-            limit: one_ulp * 8.0,
-            scalar_fn: |x : f32| x.cos(),
-            vector_fn: |x : f32x4| x.cos(),
-            scalar_type: f32,
-            vector_type: f32x4,
-        );
-    }
-
-    #[test]
-    fn tan_f32() {
-        use core::f32::consts::PI;
-        let one_ulp = (2.0_f32).powi(-23);
-
-        // For the outsides, reciprocal accuracy is important.
-        // Note that the vector function correctly gets -inf for -PI/2
-        // but the scalar function does not.
-        test_range!(
-            min: -PI/2.0 + 0.00001,
-            max: -PI/4.0,
-            limit: one_ulp * 3.0,
-            scalar_fn: |x : f32| x.tan().recip(),
-            vector_fn: |x : f32x4| x.tan().recip(),
-            scalar_type: f32,
-            vector_type: f32x4,
-        );
-
-        // For the insides, absolute accuracy is important.
-        test_range!(
-            min: -PI/4.0,
-            max: PI/4.0,
-            limit: one_ulp * 2.0,
-            scalar_fn: |x : f32| x.tan(),
-            vector_fn: |x : f32x4| x.tan(),
-            scalar_type: f32,
-            vector_type: f32x4,
-        );
-
-        test_range!(
-            min: PI/4.0,
-            max: PI/2.0 - 0.00001,
-            limit: one_ulp * 3.0,
-            scalar_fn: |x : f32| x.tan().recip(),
-            vector_fn: |x : f32x4| x.tan().recip(),
-            scalar_type: f32,
-            vector_type: f32x4,
-        );
-    }
 }
@@ -0,0 +1,111 @@
+#![allow(non_snake_case)]
+use super::StdLibm;
+
+use super::StdFloat;
+
+use super::simd::{LaneCount, Simd, SupportedLaneCount};
+
+impl<const N: usize> StdLibm for Simd<f32, N>
+where
+    LaneCount<N>: SupportedLaneCount,
+{
+    /// Polynomial approximation of the arcsine of each lane.
+    ///
+    /// Lanes whose square is below `0.9 * 0.9` feed the input straight into an
+    /// odd polynomial. The remaining lanes evaluate the same polynomial at
+    /// `sqrt(1 - self * self)` and return `c - y * s`, where `(c, s)` is
+    /// `(-pi/2, -1)` for negative inputs and `(pi/2, 1)` otherwise.
+    #[inline]
+    fn asin(self) -> Self {
+        let half_pi = Self::splat(1.5707964f32);
+        let one = Self::splat(1f32);
+        let bound = Self::splat(0.9f32);
+        let squared = self * self;
+
+        let is_negative = self.lanes_lt(Self::splat(0f32));
+        let is_inner = squared.lanes_lt(bound * bound);
+
+        let offset = is_negative.select(-half_pi, half_pi);
+        let sign = is_negative.select(-one, one);
+        let x = is_inner.select(self, (one - squared).sqrt());
+
+        // Horner evaluation in x^2, multiplied by x at the end (odd polynomial).
+        let x2 = x * x;
+        let y = Self::splat(4374.977f32)
+            .mul_add(x2, Self::splat(-13781.558f32))
+            .mul_add(x2, Self::splat(17105.695f32))
+            .mul_add(x2, Self::splat(-10486.649f32))
+            .mul_add(x2, Self::splat(3231.7603f32))
+            .mul_add(x2, Self::splat(-447.56482f32))
+            .mul_add(x2, Self::splat(21.78206f32))
+            .mul_add(x2, Self::splat(0.84158415f32))
+            * x;
+
+        is_inner.select(y, offset - y * sign)
+    }
+
+    /// Polynomial approximation of the arccosine of each lane.
+    ///
+    /// Uses the same polynomial and the same `0.9` split as `asin`. Inner
+    /// lanes return `pi/2 - y`; outer lanes return `c - y * s`, where
+    /// `(c, s)` is `(pi, 1)` for negative inputs and `(0, -1)` otherwise.
+    #[inline]
+    fn acos(self) -> Self {
+        let half_pi = Self::splat(1.5707964f32);
+        let pi = Self::splat(3.1415927f32);
+        let one = Self::splat(1f32);
+        let bound = Self::splat(0.9f32);
+        let squared = self * self;
+
+        let is_negative = self.lanes_lt(Self::splat(0f32));
+        let is_inner = squared.lanes_lt(bound * bound);
+
+        let offset = is_negative.select(pi, Self::splat(0f32));
+        let sign = is_negative.select(one, -one);
+        let x = is_inner.select(self, (one - squared).sqrt());
+
+        // Horner evaluation in x^2, multiplied by x at the end (odd polynomial).
+        let x2 = x * x;
+        let y = Self::splat(4374.977f32)
+            .mul_add(x2, Self::splat(-13781.558f32))
+            .mul_add(x2, Self::splat(17105.695f32))
+            .mul_add(x2, Self::splat(-10486.649f32))
+            .mul_add(x2, Self::splat(3231.7603f32))
+            .mul_add(x2, Self::splat(-447.56482f32))
+            .mul_add(x2, Self::splat(21.78206f32))
+            .mul_add(x2, Self::splat(0.84158415f32))
+            * x;
+
+        is_inner.select(half_pi - y, offset - y * sign)
+    }
+
+    /// Polynomial approximation of the arctangent of each lane.
+    ///
+    /// Lanes with `|self| < 1` evaluate an odd polynomial at `self`. All other
+    /// lanes evaluate it at `1 / self` and return `c - y`, where `c` is
+    /// `-pi/2` for negative inputs and `pi/2` otherwise.
+    #[inline]
+    fn atan(self) -> Self {
+        let half_pi = Self::splat(1.5707964f32);
+
+        let is_small = self.abs().lanes_lt(Self::splat(1f32));
+        let offset = self
+            .lanes_lt(Self::splat(0f32))
+            .select(-half_pi, half_pi);
+        let x = is_small.select(self, self.recip());
+
+        // Horner evaluation in x^2, multiplied by x at the end (odd polynomial).
+        let x2 = x * x;
+        let y = Self::splat(-95.70126f32)
+            .mul_add(x2, Self::splat(424.99908f32))
+            .mul_add(x2, Self::splat(-767.4826f32))
+            .mul_add(x2, Self::splat(714.51953f32))
+            .mul_add(x2, Self::splat(-354.32654f32))
+            .mul_add(x2, Self::splat(83.9618f32))
+            .mul_add(x2, Self::splat(-6.2395816f32))
+            .mul_add(x2, Self::splat(1.0549852f32))
+            * x;
+
+        is_small.select(y, offset - y)
+    }
+
+    /// Polynomial approximation of the sine of each lane.
+    ///
+    /// The input is multiplied by `0.15915494` (about `1 / (2 * pi)`) and the
+    /// nearest integer is subtracted, giving a reduced argument `x`. An odd
+    /// polynomial in `x` is then evaluated.
+    #[inline]
+    fn sin(self) -> Self {
+        let turns = self * Self::splat(0.15915494f32);
+        let x = turns - turns.round();
+        let x2 = x * x;
+        Self::splat(-12.26886f32)
+            .mul_add(x2, Self::splat(41.21624f32))
+            .mul_add(x2, Self::splat(-76.58672f32))
+            .mul_add(x2, Self::splat(81.59746f32))
+            .mul_add(x2, Self::splat(-41.34151f32))
+            .mul_add(x2, Self::splat(6.2831845f32))
+            * x
+    }
+
+    /// Polynomial approximation of the cosine of each lane.
+    ///
+    /// Uses the same argument reduction as `sin` (scale by `0.15915494`,
+    /// subtract the nearest integer) and evaluates an even polynomial in the
+    /// reduced argument.
+    #[inline]
+    fn cos(self) -> Self {
+        let turns = self * Self::splat(0.15915494f32);
+        let x = turns - turns.round();
+        let x2 = x * x;
+        Self::splat(6.5286584f32)
+            .mul_add(x2, Self::splat(-25.973276f32))
+            .mul_add(x2, Self::splat(60.17118f32))
+            .mul_add(x2, Self::splat(-85.45092f32))
+            .mul_add(x2, Self::splat(64.939186f32))
+            .mul_add(x2, Self::splat(-19.739206f32))
+            .mul_add(x2, Self::splat(1f32))
+    }
+
+    /// Rational approximation of the tangent of each lane.
+    ///
+    /// The input is multiplied by `0.31830987` (about `1 / pi`) and the
+    /// nearest integer is subtracted, giving a reduced argument `x`. The
+    /// result is an odd polynomial in `x` multiplied by `1 / (x * x - 0.25)`;
+    /// that factor's denominator is zero when `x` is `+/- 0.5`.
+    #[inline]
+    fn tan(self) -> Self {
+        let half_turns = self * Self::splat(0.31830987f32);
+        let x = half_turns - half_turns.round();
+        let x2 = x * x;
+        let inv_denominator = Self::splat(1f32) / (x2 - Self::splat(0.25f32));
+        let numerator = Self::splat(0.014397301f32)
+            .mul_add(x2, Self::splat(0.021017345f32))
+            .mul_add(x2, Self::splat(0.05285888f32))
+            .mul_add(x2, Self::splat(0.13475448f32))
+            .mul_add(x2, Self::splat(0.55773664f32))
+            .mul_add(x2, Self::splat(-0.7853982f32))
+            * x;
+        numerator * inv_denominator
+    }
+}
@@ -0,0 +1,160 @@
+/// Number of evenly spaced sample points that `test_range!` evaluates per
+/// range; the macro walks them four at a time.
+const NUM_ITER: usize = 0x10000;
+
+/// Compares a four-lane vector function against a scalar reference over a range.
+///
+/// Samples `NUM_ITER` points `min + k * (max - min) / NUM_ITER` for
+/// `k = 0..NUM_ITER`, four consecutive points per vector, so `max` itself is
+/// never evaluated. For every vector the scalar function is applied lane by
+/// lane to build the reference, and the macro panics with a dump of the
+/// inputs, error, limit and both results as soon as some lane of
+/// `|vector_fn(x) - scalar_fn(x)|` fails the `<= limit` comparison.
+///
+/// `$min` is expanded twice, so it should be a side-effect-free expression.
+/// `NUM_ITER` must be in scope at the call site.
+macro_rules! test_range {
+    (
+        min: $min: expr,
+        max: $max: expr,
+        limit: $limit: expr,
+        scalar_fn: $scalar_fn: expr,
+        vector_fn: $vector_fn: expr,
+        scalar_type: $scalar_type: ty,
+        vector_type: $vector_type: ty,
+    ) => {{
+        let limit = <$vector_type>::splat($limit);
+        // Step between neighbouring sample points and the first sample point.
+        let b = (($max) - ($min)) * (1.0 / NUM_ITER as $scalar_type);
+        let a = $min;
+        let sf = $scalar_fn;
+        let vf = $vector_fn;
+        for i in 0..NUM_ITER / 4 {
+            // Index of the first of the four samples packed into this vector.
+            let fi = (i * 4) as $scalar_type;
+            let x = <$vector_type>::from_array([
+                (fi + 0.0) * b + a,
+                (fi + 1.0) * b + a,
+                (fi + 2.0) * b + a,
+                (fi + 3.0) * b + a,
+            ]);
+            let yref = <$vector_type>::from_array([sf(x[0]), sf(x[1]), sf(x[2]), sf(x[3])]);
+            let y = vf(x);
+            let e = y - yref;
+            if !e.abs().lanes_le(limit).all() {
+                panic!("\nx     ={:20.16?}\ne     ={:20.16?}\nlimit ={:20.16?}\nvector={:20.16?}\nscalar={:20.16?}\nvector_fn={}", x, e, limit, y, yref, stringify!($vector_fn));
+            }
+        }
+    }};
+}
+
+/// Checks the vector `sin` against scalar `f32::sin` on three symmetric
+/// ranges, with a wider error allowance as the range grows.
+#[test]
+fn sin_f32() {
+    use crate::StdLibm;
+    use core::f32::consts::PI;
+    use core_simd::f32x4;
+
+    let one_ulp = 2.0_f32.powi(-23);
+
+    // (half-width of the range, allowed error as a multiple of `one_ulp`)
+    let cases = [(PI / 4.0, 1.0), (PI / 2.0, 2.0), (PI, 8.0)];
+
+    for (half_width, ulps) in cases {
+        test_range!(
+            min: -half_width,
+            max: half_width,
+            limit: one_ulp * ulps,
+            scalar_fn: |x : f32| x.sin(),
+            vector_fn: |x : f32x4| x.sin(),
+            scalar_type: f32,
+            vector_type: f32x4,
+        );
+    }
+}
+
+/// Checks the vector `cos` against scalar `f32::cos` on three symmetric
+/// ranges, with a wider error allowance as the range grows.
+#[test]
+fn cos_f32() {
+    use crate::StdLibm;
+    use core::f32::consts::PI;
+    use core_simd::f32x4;
+
+    let one_ulp = 2.0_f32.powi(-23);
+
+    // (half-width of the range, allowed error as a multiple of `one_ulp`)
+    let cases = [
+        // Within +/- pi/4 the input carries 1 ulp of error.
+        (PI / 4.0, 1.0),
+        // Within +/- pi/2 the input and the output carry 2 ulp of error.
+        (PI / 2.0, 2.0),
+        // Within +/- pi the input carries 4 ulp of error and the output 5.
+        // The scalar cos shows this error too, although it is implemented
+        // differently.
+        (PI, 8.0),
+    ];
+
+    for (half_width, ulps) in cases {
+        test_range!(
+            min: -half_width,
+            max: half_width,
+            limit: one_ulp * ulps,
+            scalar_fn: |x : f32| x.cos(),
+            vector_fn: |x : f32x4| x.cos(),
+            scalar_type: f32,
+            vector_type: f32x4,
+        );
+    }
+}
+
+/// Checks the vector `tan` against scalar `f32::tan`: reciprocals are
+/// compared on the two outer ranges and the values themselves on the inner
+/// range.
+#[test]
+fn tan_f32() {
+    use crate::StdLibm;
+    use core::f32::consts::PI;
+    use core_simd::f32x4;
+
+    let one_ulp = 2.0_f32.powi(-23);
+
+    // Range endpoints; the outermost ones stop 0.00001 short of +/- pi/2.
+    let outer_lo = -PI / 2.0 + 0.00001;
+    let inner_lo = -PI / 4.0;
+    let inner_hi = PI / 4.0;
+    let outer_hi = PI / 2.0 - 0.00001;
+
+    // On the outer ranges the accuracy of the reciprocal is what matters.
+    // At -PI/2 the vector function correctly yields -inf while the scalar
+    // function does not.
+    test_range!(
+        min: outer_lo,
+        max: inner_lo,
+        limit: one_ulp * 3.0,
+        scalar_fn: |x : f32| x.tan().recip(),
+        vector_fn: |x : f32x4| x.tan().recip(),
+        scalar_type: f32,
+        vector_type: f32x4,
+    );
+
+    // On the inner range the absolute accuracy is what matters.
+    test_range!(
+        min: inner_lo,
+        max: inner_hi,
+        limit: one_ulp * 2.0,
+        scalar_fn: |x : f32| x.tan(),
+        vector_fn: |x : f32x4| x.tan(),
+        scalar_type: f32,
+        vector_type: f32x4,
+    );
+
+    test_range!(
+        min: inner_hi,
+        max: outer_hi,
+        limit: one_ulp * 3.0,
+        scalar_fn: |x : f32| x.tan().recip(),
+        vector_fn: |x : f32x4| x.tan().recip(),
+        scalar_type: f32,
+        vector_type: f32x4,
+    );
+}