Skip to content

Commit 9210db6

Browse files
authored
Merge pull request #236 from cuviper/x86-div
Use inline asm! for x86 DIV
2 parents 636317c + b02188d commit 9210db6

File tree

2 files changed

+44
-7
lines changed

2 files changed

+44
-7
lines changed

src/biguint/convert.rs

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
use super::{biguint_from_vec, BigUint, ToBigUint};
55

66
use super::addition::add2;
7-
use super::division::div_rem_digit;
7+
use super::division::{div_rem_digit, FAST_DIV_WIDE};
88
use super::multiplication::mac_with_carry;
99

1010
use crate::big_digit::{self, BigDigit};
@@ -688,16 +688,22 @@ pub(super) fn to_radix_digits_le(u: &BigUint, radix: u32) -> Vec<u8> {
688688

689689
let mut digits = u.clone();
690690

691-
let (base, power) = get_half_radix_base(radix);
691+
// X86 DIV can quickly divide by a full digit, otherwise we choose a divisor
692+
// that's suitable for `div_half` to avoid slow `DoubleBigDigit` division.
693+
let (base, power) = if FAST_DIV_WIDE {
694+
get_radix_base(radix)
695+
} else {
696+
get_half_radix_base(radix)
697+
};
692698
let radix = radix as BigDigit;
693699

694700
// For very large numbers, the O(n²) loop of repeated `div_rem_digit` dominates the
695701
// performance. We can mitigate this by dividing into chunks of a larger base first.
696702
// The threshold for this was chosen by anecdotal performance measurements to
697703
// approximate where this starts to make a noticeable difference.
698704
if digits.data.len() >= 64 {
699-
let mut big_base = BigUint::from(base * base);
700-
let mut big_power = 2usize;
705+
let mut big_base = BigUint::from(base);
706+
let mut big_power = 1usize;
701707

702708
// Choose a target base length near √n.
703709
let target_len = digits.data.len().sqrt();

src/biguint/division.rs

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,15 @@ use core::ops::{Div, DivAssign, Rem, RemAssign};
1010
use num_integer::Integer;
1111
use num_traits::{CheckedDiv, CheckedEuclid, Euclid, One, ToPrimitive, Zero};
1212

13+
pub(super) const FAST_DIV_WIDE: bool = cfg!(any(target_arch = "x86", target_arch = "x86_64"));
14+
1315
/// Divide a two digit numerator by a one digit divisor, returns quotient and remainder:
1416
///
1517
/// Note: the caller must ensure that both the quotient and remainder will fit into a single digit.
1618
/// This is _not_ true for an arbitrary numerator/denominator.
1719
///
1820
/// (This function also matches what the x86 divide instruction does).
21+
#[cfg(any(miri, not(any(target_arch = "x86", target_arch = "x86_64"))))]
1922
#[inline]
2023
fn div_wide(hi: BigDigit, lo: BigDigit, divisor: BigDigit) -> (BigDigit, BigDigit) {
2124
debug_assert!(hi < divisor);
@@ -25,6 +28,34 @@ fn div_wide(hi: BigDigit, lo: BigDigit, divisor: BigDigit) -> (BigDigit, BigDigi
2528
((lhs / rhs) as BigDigit, (lhs % rhs) as BigDigit)
2629
}
2730

31+
/// x86 and x86_64 can use a real `div` instruction.
32+
#[cfg(all(not(miri), any(target_arch = "x86", target_arch = "x86_64")))]
33+
#[inline]
34+
fn div_wide(hi: BigDigit, lo: BigDigit, divisor: BigDigit) -> (BigDigit, BigDigit) {
35+
// This debug assertion covers the potential #DE for divisor==0 or a quotient too large for one
36+
// register. In release mode, those cases instead raise a target-specific fault like SIGFPE.
37+
// This should never occur with the inputs from our few `div_wide` callers.
38+
debug_assert!(hi < divisor);
39+
40+
// SAFETY: The `div` instruction only affects registers, reading the explicit operand as the
41+
// divisor, and implicitly reading RDX:RAX or EDX:EAX as the dividend. The result is implicitly
42+
// written back to RAX or EAX for the quotient and RDX or EDX for the remainder. No memory is
43+
// used, and flags are not preserved.
44+
unsafe {
45+
let (div, rem);
46+
47+
core::arch::asm!(
48+
"div {}",
49+
in(reg) divisor,
50+
inout("dx") hi => rem,
51+
inout("ax") lo => div,
52+
options(pure, nomem, nostack),
53+
);
54+
55+
(div, rem)
56+
}
57+
}
58+
2859
/// For small divisors, we can divide without promoting to `DoubleBigDigit` by
2960
/// using half-size pieces of digit, like long-division.
3061
#[inline]
@@ -45,7 +76,7 @@ pub(super) fn div_rem_digit(mut a: BigUint, b: BigDigit) -> (BigUint, BigDigit)
4576

4677
let mut rem = 0;
4778

48-
if b <= big_digit::HALF {
79+
if !FAST_DIV_WIDE && b <= big_digit::HALF {
4980
for d in a.data.iter_mut().rev() {
5081
let (q, r) = div_half(rem, *d, b);
5182
*d = q;
@@ -70,7 +101,7 @@ fn rem_digit(a: &BigUint, b: BigDigit) -> BigDigit {
70101

71102
let mut rem = 0;
72103

73-
if b <= big_digit::HALF {
104+
if !FAST_DIV_WIDE && b <= big_digit::HALF {
74105
for &digit in a.data.iter().rev() {
75106
let (_, r) = div_half(rem, digit, b);
76107
rem = r;
@@ -230,7 +261,7 @@ fn div_rem_core(mut a: BigUint, b: &[BigDigit]) -> (BigUint, BigUint) {
230261
let mut a0 = 0;
231262

232263
// [b1, b0] are the two most significant digits of the divisor. They never change.
233-
let b0 = *b.last().unwrap();
264+
let b0 = b[b.len() - 1];
234265
let b1 = b[b.len() - 2];
235266

236267
let q_len = a.data.len() - b.len() + 1;

0 commit comments

Comments
 (0)