From 2683197c25d223cda7c2d4dfa552fda4267c8a43 Mon Sep 17 00:00:00 2001
From: quaternic <57393910+quaternic@users.noreply.github.com>
Date: Sun, 20 Jul 2025 22:27:01 +0300
Subject: [PATCH 1/2] substitute truncation for flooring in rem_pio2_large

---
 libm/src/math/rem_pio2_large.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libm/src/math/rem_pio2_large.rs b/libm/src/math/rem_pio2_large.rs
index 6d679bbe9..20d58c41a 100644
--- a/libm/src/math/rem_pio2_large.rs
+++ b/libm/src/math/rem_pio2_large.rs
@@ -11,7 +11,7 @@
  * ====================================================
  */
 
-use super::{floor, scalbn};
+use super::{scalbn, trunc};
 
 // initial value for jk
 const INIT_JK: [usize; 4] = [3, 4, 4, 6];
@@ -290,7 +290,7 @@ pub(crate) fn rem_pio2_large(x: &[f64], y: &mut [f64], e0: i32, prec: usize) ->
 
         /* compute n */
         z = scalbn(z, q0); /* actual value of z */
-        z -= 8.0 * floor(z * 0.125); /* trim off integer >= 8 */
+        z -= 8.0 * trunc(z * 0.125); /* trim off integer >= 8 */
         n = z as i32;
         z -= n as f64;
         ih = 0;

From 37b4f278a2e57cd75d03468e1e33d5b375f33364 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert@folkertdev.nl>
Date: Sun, 13 Jul 2025 13:26:02 +0200
Subject: [PATCH 2/2] implement `floor` and `ceil` with inline assembly on
 `i586`

---
 libm-test/src/precision.rs | 22 ----------
 libm/src/math/arch/i586.rs | 85 +++++++++++++++++++++++++-------------
 2 files changed, 56 insertions(+), 51 deletions(-)

diff --git a/libm-test/src/precision.rs b/libm-test/src/precision.rs
index 32825b15d..3fb8c1b37 100644
--- a/libm-test/src/precision.rs
+++ b/libm-test/src/precision.rs
@@ -271,18 +271,6 @@ impl MaybeOverride<(f32,)> for SpecialCase {
 
 impl MaybeOverride<(f64,)> for SpecialCase {
     fn check_float<F: Float>(input: (f64,), actual: F, expected: F, ctx: &CheckCtx) -> CheckAction {
-        if cfg!(x86_no_sse)
-            && ctx.base_name == BaseName::Ceil
-            && ctx.basis == CheckBasis::Musl
-            && input.0 < 0.0
-            && input.0 > -1.0
-            && expected == F::ZERO
-            && actual == F::ZERO
-        {
-            // musl returns -0.0, we return +0.0
-            return XFAIL("i586 ceil signed zero");
-        }
-
         if cfg!(x86_no_sse)
             && (ctx.base_name == BaseName::Rint || ctx.base_name == BaseName::Roundeven)
             && (expected - actual).abs() <= F::ONE
@@ -292,16 +280,6 @@ impl MaybeOverride<(f64,)> for SpecialCase {
             return XFAIL("i586 rint rounding mode");
         }
 
-        if cfg!(x86_no_sse)
-            && (ctx.fn_ident == Identifier::Ceil || ctx.fn_ident == Identifier::Floor)
-            && expected.eq_repr(F::NEG_ZERO)
-            && actual.eq_repr(F::ZERO)
-        {
-            // FIXME: the x87 implementations do not keep the distinction between -0.0 and 0.0.
-            // See https://github.com/rust-lang/libm/pull/404#issuecomment-2572399955
-            return XFAIL("i586 ceil/floor signed zero");
-        }
-
         if cfg!(x86_no_sse)
             && (ctx.fn_ident == Identifier::Exp10 || ctx.fn_ident == Identifier::Exp2)
         {
diff --git a/libm/src/math/arch/i586.rs b/libm/src/math/arch/i586.rs
index f92b9a2af..b897bd231 100644
--- a/libm/src/math/arch/i586.rs
+++ b/libm/src/math/arch/i586.rs
@@ -1,37 +1,64 @@
 //! Architecture-specific support for x86-32 without SSE2
+//!
+//! `ceil` and `floor` use an alternative implementation on x86, because the
+//! main implementation fails with the x87 FPU used by
+//! Debian i386, probably due to excess precision issues.
+//!
+//! See https://github.com/rust-lang/compiler-builtins/pull/976 for discussion on why these
+//! functions are implemented in this way.
 
-use super::super::fabs;
+// FIXME: when the MSRV allows, use naked functions instead.
 
-/// Use an alternative implementation on x86, because the
-/// main implementation fails with the x87 FPU used by
-/// debian i386, probably due to excess precision issues.
-/// Basic implementation taken from https://github.com/rust-lang/libm/issues/219.
-pub fn ceil(x: f64) -> f64 {
-    if fabs(x).to_bits() < 4503599627370496.0_f64.to_bits() {
-        let truncated = x as i64 as f64;
-        if truncated < x {
-            return truncated + 1.0;
-        } else {
-            return truncated;
-        }
-    } else {
-        return x;
+pub extern "C" fn ceil(mut x: f64) -> f64 {
+    unsafe {
+        core::arch::asm!(
+            "fld qword ptr [{x}]",
+            // `x` is now on the x87 stack; reuse its memory to save the FPU control word.
+            "fstcw [{x}]",
+            // Set rounding control to 0b10 (+∞); other bits match the FNINIT default 0x037f.
+            "mov word ptr [{x} + 2], 0x0b7f",
+            "fldcw [{x} + 2]",
+            // Round to an integer in the rounding mode just loaded.
+            "frndint",
+            // Restore the FPU control word saved above.
+            "fldcw [{x}]",
+            // Pop the rounded value back into `x`, rebalancing the x87 stack.
+            "fstp qword ptr [{x}]",
+            x = in(reg) &mut x,
+            // The x87 stack is touched, so all of st(0)..st(7) must be marked as clobbered.
+            out("st(0)") _, out("st(1)") _,
+            out("st(2)") _, out("st(3)") _,
+            out("st(4)") _, out("st(5)") _,
+            out("st(6)") _, out("st(7)") _,
+            options(nostack),
+        );
     }
+    x
 }
 
-/// Use an alternative implementation on x86, because the
-/// main implementation fails with the x87 FPU used by
-/// debian i386, probably due to excess precision issues.
-/// Basic implementation taken from https://github.com/rust-lang/libm/issues/219.
-pub fn floor(x: f64) -> f64 {
-    if fabs(x).to_bits() < 4503599627370496.0_f64.to_bits() {
-        let truncated = x as i64 as f64;
-        if truncated > x {
-            return truncated - 1.0;
-        } else {
-            return truncated;
-        }
-    } else {
-        return x;
+pub extern "C" fn floor(mut x: f64) -> f64 {
+    unsafe {
+        core::arch::asm!(
+            "fld qword ptr [{x}]",
+            // `x` is now on the x87 stack; reuse its memory to save the FPU control word.
+            "fstcw [{x}]",
+            // Set rounding control to 0b01 (-∞); other bits match the FNINIT default 0x037f.
+            "mov word ptr [{x} + 2], 0x077f",
+            "fldcw [{x} + 2]",
+            // Round to an integer in the rounding mode just loaded.
+            "frndint",
+            // Restore the FPU control word saved above.
+            "fldcw [{x}]",
+            // Pop the rounded value back into `x`, rebalancing the x87 stack.
+            "fstp qword ptr [{x}]",
+            x = in(reg) &mut x,
+            // The x87 stack is touched, so all of st(0)..st(7) must be marked as clobbered.
+            out("st(0)") _, out("st(1)") _,
+            out("st(2)") _, out("st(3)") _,
+            out("st(4)") _, out("st(5)") _,
+            out("st(6)") _, out("st(7)") _,
+            options(nostack),
+        );
     }
+    x
 }