Merge pull request #236 from cuviper/x86-div

cuviper · web-flow · commit 9210db6df633 · 2024-05-06T21:53:20.000Z
Use inline asm! for x86 DIV
diff --git a/src/biguint/convert.rs b/src/biguint/convert.rs
@@ -4,7 +4,7 @@
 use super::{biguint_from_vec, BigUint, ToBigUint};
 
 use super::addition::add2;
-use super::division::div_rem_digit;
+use super::division::{div_rem_digit, FAST_DIV_WIDE};
 use super::multiplication::mac_with_carry;
 
 use crate::big_digit::{self, BigDigit};
@@ -688,16 +688,22 @@ pub(super) fn to_radix_digits_le(u: &BigUint, radix: u32) -> Vec<u8> {
 
     let mut digits = u.clone();
 
-    let (base, power) = get_half_radix_base(radix);
+    // x86 DIV can quickly divide by a full digit; otherwise we choose a divisor
+    // that's suitable for `div_half` to avoid slow `DoubleBigDigit` division.
+    let (base, power) = if FAST_DIV_WIDE {
+        get_radix_base(radix)
+    } else {
+        get_half_radix_base(radix)
+    };
     let radix = radix as BigDigit;
 
     // For very large numbers, the O(n²) loop of repeated `div_rem_digit` dominates the
     // performance. We can mitigate this by dividing into chunks of a larger base first.
     // The threshold for this was chosen by anecdotal performance measurements to
     // approximate where this starts to make a noticeable difference.
     if digits.data.len() >= 64 {
-        let mut big_base = BigUint::from(base * base);
-        let mut big_power = 2usize;
+        let mut big_base = BigUint::from(base);
+        let mut big_power = 1usize;
 
         // Choose a target base length near √n.
         let target_len = digits.data.len().sqrt();
diff --git a/src/biguint/division.rs b/src/biguint/division.rs
@@ -10,12 +10,15 @@ use core::ops::{Div, DivAssign, Rem, RemAssign};
 use num_integer::Integer;
 use num_traits::{CheckedDiv, CheckedEuclid, Euclid, One, ToPrimitive, Zero};
 
+pub(super) const FAST_DIV_WIDE: bool = cfg!(any(target_arch = "x86", target_arch = "x86_64"));
+
 /// Divide a two digit numerator by a one digit divisor, returns quotient and remainder:
 ///
 /// Note: the caller must ensure that both the quotient and remainder will fit into a single digit.
 /// This is _not_ true for an arbitrary numerator/denominator.
 ///
 /// (This function also matches what the x86 divide instruction does).
+#[cfg(any(miri, not(any(target_arch = "x86", target_arch = "x86_64"))))]
 #[inline]
 fn div_wide(hi: BigDigit, lo: BigDigit, divisor: BigDigit) -> (BigDigit, BigDigit) {
     debug_assert!(hi < divisor);
@@ -25,6 +28,34 @@ fn div_wide(hi: BigDigit, lo: BigDigit, divisor: BigDigit) -> (BigDigit, BigDigi
     ((lhs / rhs) as BigDigit, (lhs % rhs) as BigDigit)
 }
 
+/// On x86/x86_64, divide `hi:lo` by `divisor` with a real `div`; returns `(quotient, remainder)`.
+#[cfg(all(not(miri), any(target_arch = "x86", target_arch = "x86_64")))]
+#[inline]
+fn div_wide(hi: BigDigit, lo: BigDigit, divisor: BigDigit) -> (BigDigit, BigDigit) {
+    // This debug assertion covers the potential #DE (divide error) for divisor==0 or a quotient
+    // too large for one register; in release mode that would instead become a target-specific
+    // fault like SIGFPE. This should never occur with the inputs from our few `div_wide` callers.
+    debug_assert!(hi < divisor);
+
+    // SAFETY: The `div` instruction only affects registers, reading the explicit operand as the
+    // divisor, and implicitly reading RDX:RAX or EDX:EAX as the dividend. The result is implicitly
+    // written back to RAX or EAX for the quotient and RDX or EDX for the remainder. No memory is
+    // used, and flags are not preserved.
+    unsafe {
+        let (div, rem);
+
+        core::arch::asm!(
+            "div {}",
+            in(reg) divisor, // the explicit operand: the divisor
+            inout("dx") hi => rem, // high digit of the dividend in, remainder out
+            inout("ax") lo => div, // low digit of the dividend in, quotient out
+            options(pure, nomem, nostack),
+        );
+
+        (div, rem)
+    }
+}
+
 /// For small divisors, we can divide without promoting to `DoubleBigDigit` by
 /// using half-size pieces of digit, like long-division.
 #[inline]
@@ -45,7 +76,7 @@ pub(super) fn div_rem_digit(mut a: BigUint, b: BigDigit) -> (BigUint, BigDigit)
 
     let mut rem = 0;
 
-    if b <= big_digit::HALF {
+    if !FAST_DIV_WIDE && b <= big_digit::HALF {
         for d in a.data.iter_mut().rev() {
             let (q, r) = div_half(rem, *d, b);
             *d = q;
@@ -70,7 +101,7 @@ fn rem_digit(a: &BigUint, b: BigDigit) -> BigDigit {
 
     let mut rem = 0;
 
-    if b <= big_digit::HALF {
+    if !FAST_DIV_WIDE && b <= big_digit::HALF {
         for &digit in a.data.iter().rev() {
             let (_, r) = div_half(rem, digit, b);
             rem = r;
@@ -230,7 +261,7 @@ fn div_rem_core(mut a: BigUint, b: &[BigDigit]) -> (BigUint, BigUint) {
     let mut a0 = 0;
 
     // [b1, b0] are the two most significant digits of the divisor. They never change.
-    let b0 = *b.last().unwrap();
+    let b0 = b[b.len() - 1];
     let b1 = b[b.len() - 2];
 
     let q_len = a.data.len() - b.len() + 1;