-
-
Notifications
You must be signed in to change notification settings - Fork 731
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix #10513 - powmod is slow for ulong #10688
Conversation
Thanks for your pull request and interest in making D better, @Aditya-132! We are looking forward to reviewing it, and you should be hearing from a maintainer soon.
Please see CONTRIBUTING.md for more information. If you have addressed all reviews or aren't sure how to proceed, don't hesitate to ping us with a simple comment. Bugzilla referencesYour PR doesn't reference any Bugzilla issue. If your PR contains non-trivial changes, please reference a Bugzilla issue or create a manual changelog. Testing this PR locallyIf you don't have a local development environment setup, you can use Digger to test this PR: dub run digger -- build "master + phobos#10688" |
@Aditya-132 before I go through it, if you can run and post benchmarks, that'll help with review. Out of curiosity, would you also be able to.do gcc-style asm for gdc and ldc? |
sure |
Please give the PR and commit titles a description based on the issues this solves e.g. |
can you provide any example which is already present in code |
instead of modifying powmod we can diredctly modify mulmod |
So, unlike dmd iasm which As all you're doing is mul, and taking the result from two registers, the equivalent should be
|
TESTING RESULT
Script which i use for testingimport std.stdio;
import std.datetime.stopwatch;
import std.random;
import std.traits;
import std.meta;
import std.algorithm;
import core.time;
// First implementation from the pasted function
static T mulmod1(T)(T a, T b, T c) {
static if (T.sizeof == 8) {
version (D_InlineAsm_X86_64) {
// Fast path for small values
ulong low = void;
ulong high = void;
// Perform 128-bit multiplication: a * b = [high:low]
asm pure @trusted nothrow @nogc
{
mov RAX, a; // Load a into RAX
mul b; // Multiply by b (RDX:RAX = a * b)
mov low, RAX; // Store low 64 bits
mov high, RDX; // Store high 64 bits
}
// Handle overflow for large values
if (high >= c) {
high %= c;
}
if (high == 0) {
return low % c;
}
// Reduce the high 64 bits modulo mod
asm pure @trusted nothrow @nogc
{
mov RAX, high; // Load high part
xor RDX, RDX; // Clear RDX for division
div c; // RAX = high / mod, RDX = high % mod
mov high, RDX; // Store remainder (high % mod)
mov RAX, low; // Load low part
mov RDX, high; // Load reduced high part
div c; // Divide full 128-bit number by mod
mov low, RDX; // Store remainder (final result)
}
return low; // Return (a * b) % mod
}
else {
// Fallback for non-x86_64 platforms
T result = 0;
a %= c;
b %= c;
while (b > 0) {
if (b & 1) {
result = (result + a) % c;
}
a = (a * 2) % c;
b >>= 1;
}
return result;
}
}
else static if (T.sizeof == 4) {
version (D_InlineAsm_X86) {
// Fast path for small values
uint low = void; // Changed to uint for 32-bit
uint high = void; // Changed to uint for 32-bit
// Perform 64-bit multiplication: a * b = [high:low]
asm pure @trusted nothrow @nogc
{
mov EAX, a; // Load a into EAX
mul b; // Multiply by b
mov low, EAX; // Store low bits
mov high, EDX; // Store high bits
}
// Handle overflow for large values
if (high >= c) {
high %= c;
}
if (high == 0) {
return low % c;
}
// Reduce the high bits modulo mod
asm pure @trusted nothrow @nogc
{
mov EAX, high; // Load high part
xor EDX, EDX; // Clear EDX for division
div c; // EAX = high / mod, EDX = high % mod
mov high, EDX; // Store remainder (high % mod)
mov EAX, low; // Load low part
mov EDX, high; // Load reduced high part
div c; // Divide full number by mod
mov low, EDX; // Store remainder (final result)
}
return low; // Return (a * b) % mod
}
else {
// Use 64-bit type for the calculation
ulong result = (cast(ulong)a * cast(ulong)b) % c;
return cast(T)result;
}
}
else {
// Fallback for other sizes
import std.bigint;
return cast(T)((BigInt(a) * BigInt(b)) % BigInt(c));
}
}
// Second implementation from function provided in the query
template Largest(T...) {
static if (T.length == 1)
alias Largest = T[0];
else static if (T[0].sizeof > Largest!(T[1..$]).sizeof)
alias Largest = T[0];
else
alias Largest = Largest!(T[1..$]);
}
static T mulmod2(T)(T a, T b, T c) {
static if (T.sizeof == 8) {
static T addmod(T a, T b, T c) {
b = c - b;
if (a >= b)
return a - b;
else
return c - b + a;
}
T result = 0;
b %= c;
while (a > 0) {
if (a & 1)
result = addmod(result, b, c);
a >>= 1;
b = addmod(b, b, c);
}
return result;
}
else {
alias DoubleT = AliasSeq!(void, ushort, uint, void, ulong)[T.sizeof];
DoubleT result = cast(DoubleT) (cast(DoubleT) a * cast(DoubleT) b);
return result % c;
}
}
// FIX: Explicitly specify the return type
F powmod1(F, G, H)(F x, G n, H m)
if (isUnsigned!F && isUnsigned!G && isUnsigned!H)
{
alias ReturnT = std.traits.Unqual!(Largest!(F, H));
ReturnT base = cast(ReturnT)(x % m);
ReturnT result = 1;
ReturnT modulus = cast(ReturnT)m;
std.traits.Unqual!G exponent = n;
while (exponent > 0) {
if (exponent & 1)
result = mulmod1!ReturnT(result, base, modulus);
base = mulmod1!ReturnT(base, base, modulus);
exponent >>= 1;
}
return cast(F)result;
}
// FIX: Explicitly specify the return type
F powmod2(F, G, H)(F x, G n, H m)
if (isUnsigned!F && isUnsigned!G && isUnsigned!H)
{
alias ReturnT = std.traits.Unqual!(Largest!(F, H));
ReturnT base = cast(ReturnT)(x % m);
ReturnT result = 1;
ReturnT modulus = cast(ReturnT)m;
std.traits.Unqual!G exponent = n;
while (exponent > 0) {
if (exponent & 1)
result = mulmod2!ReturnT(result, base, modulus);
base = mulmod2!ReturnT(base, base, modulus);
exponent >>= 1;
}
return cast(F)result;
}
// DELETED conflicting Unqual template
void main() {
// Run tests first to ensure correctness
writeln("VERIFICATION TESTS:");
runTests();
// Test for 32-bit integers
writeln("\nBENCHMARK 1: Testing mulmod implementations");
writeln("Testing with 32-bit integers:");
benchmarkMulmod!(uint)();
// Test for 64-bit integers
writeln("\nTesting with 64-bit integers:");
benchmarkMulmod!(ulong)();
// Test powmod implementations
writeln("\n\nBENCHMARK 2: Testing powmod implementations");
writeln("Testing with 32-bit integers:");
benchmarkPowmod!(uint)();
writeln("\nTesting with 64-bit integers:");
benchmarkPowmod!(ulong)();
}
void runTests() {
writeln("Running unit tests...");
// Test cases provided in the query
// First set of tests
assert(powmod1(1U, 10U, 3U) == 1);
assert(powmod1(3U, 2U, 6U) == 3);
assert(powmod1(5U, 5U, 15U) == 5);
assert(powmod1(2U, 3U, 5U) == 3);
assert(powmod1(2U, 4U, 5U) == 1);
assert(powmod1(2U, 5U, 5U) == 2);
// Second implementation
assert(powmod2(1U, 10U, 3U) == 1);
assert(powmod2(3U, 2U, 6U) == 3);
assert(powmod2(5U, 5U, 15U) == 5);
assert(powmod2(2U, 3U, 5U) == 3);
assert(powmod2(2U, 4U, 5U) == 1);
assert(powmod2(2U, 5U, 5U) == 2);
// Second set of tests - ulong
ulong a = 18446744073709551615u, b = 20u, c = 18446744073709551610u;
assert(powmod1(a, b, c) == 95367431640625u);
a = 100; b = 7919; c = 18446744073709551557u;
assert(powmod1(a, b, c) == 18223853583554725198u);
a = 117; b = 7919; c = 18446744073709551557u;
assert(powmod1(a, b, c) == 11493139548346411394u);
a = 134; b = 7919; c = 18446744073709551557u;
assert(powmod1(a, b, c) == 10979163786734356774u);
a = 151; b = 7919; c = 18446744073709551557u;
assert(powmod1(a, b, c) == 7023018419737782840u);
a = 168; b = 7919; c = 18446744073709551557u;
assert(powmod1(a, b, c) == 58082701842386811u);
a = 185; b = 7919; c = 18446744073709551557u;
assert(powmod1(a, b, c) == 17423478386299876798u);
a = 202; b = 7919; c = 18446744073709551557u;
assert(powmod1(a, b, c) == 5522733478579799075u);
a = 219; b = 7919; c = 18446744073709551557u;
assert(powmod1(a, b, c) == 15230218982491623487u);
a = 236; b = 7919; c = 18446744073709551557u;
assert(powmod1(a, b, c) == 5198328724976436000u);
a = 0; b = 7919; c = 18446744073709551557u;
assert(powmod1(a, b, c) == 0);
a = 123; b = 0; c = 18446744073709551557u;
assert(powmod1(a, b, c) == 1);
immutable ulong a1 = 253, b1 = 7919, c1 = 18446744073709551557u;
assert(powmod1(a1, b1, c1) == 3883707345459248860u);
// Second implementation
a = 18446744073709551615u; b = 20u; c = 18446744073709551610u;
assert(powmod2(a, b, c) == 95367431640625u);
a = 100; b = 7919; c = 18446744073709551557u;
assert(powmod2(a, b, c) == 18223853583554725198u);
a = 117; b = 7919; c = 18446744073709551557u;
assert(powmod2(a, b, c) == 11493139548346411394u);
a = 134; b = 7919; c = 18446744073709551557u;
assert(powmod2(a, b, c) == 10979163786734356774u);
a = 151; b = 7919; c = 18446744073709551557u;
assert(powmod2(a, b, c) == 7023018419737782840u);
a = 168; b = 7919; c = 18446744073709551557u;
assert(powmod2(a, b, c) == 58082701842386811u);
a = 185; b = 7919; c = 18446744073709551557u;
assert(powmod2(a, b, c) == 17423478386299876798u);
a = 202; b = 7919; c = 18446744073709551557u;
assert(powmod2(a, b, c) == 5522733478579799075u);
a = 219; b = 7919; c = 18446744073709551557u;
assert(powmod2(a, b, c) == 15230218982491623487u);
a = 236; b = 7919; c = 18446744073709551557u;
assert(powmod2(a, b, c) == 5198328724976436000u);
a = 0; b = 7919; c = 18446744073709551557u;
assert(powmod2(a, b, c) == 0);
a = 123; b = 0; c = 18446744073709551557u;
assert(powmod2(a, b, c) == 1);
assert(powmod2(a1, b1, c1) == 3883707345459248860u);
// Third set of tests - uint
uint x = 100, y = 7919, z = 1844674407u;
assert(powmod1(x, y, z) == 1613100340u);
x = 134; y = 7919; z = 1844674407u;
assert(powmod1(x, y, z) == 734956622u);
x = 151; y = 7919; z = 1844674407u;
assert(powmod1(x, y, z) == 1738696945u);
x = 168; y = 7919; z = 1844674407u;
assert(powmod1(x, y, z) == 1247580927u);
x = 185; y = 7919; z = 1844674407u;
assert(powmod1(x, y, z) == 1293855176u);
x = 202; y = 7919; z = 1844674407u;
assert(powmod1(x, y, z) == 1566963682u);
x = 219; y = 7919; z = 1844674407u;
assert(powmod1(x, y, z) == 181227807u);
x = 236; y = 7919; z = 1844674407u;
assert(powmod1(x, y, z) == 217988321u);
x = 253; y = 7919; z = 1844674407u;
assert(powmod1(x, y, z) == 1588843243u);
x = 0; y = 7919; z = 184467u;
assert(powmod1(x, y, z) == 0);
x = 123; y = 0; z = 1844674u;
assert(powmod1(x, y, z) == 1);
// Second implementation
x = 100; y = 7919; z = 1844674407u;
assert(powmod2(x, y, z) == 1613100340u);
x = 134; y = 7919; z = 1844674407u;
assert(powmod2(x, y, z) == 734956622u);
x = 151; y = 7919; z = 1844674407u;
assert(powmod2(x, y, z) == 1738696945u);
x = 168; y = 7919; z = 1844674407u;
assert(powmod2(x, y, z) == 1247580927u);
x = 185; y = 7919; z = 1844674407u;
assert(powmod2(x, y, z) == 1293855176u);
x = 202; y = 7919; z = 1844674407u;
assert(powmod2(x, y, z) == 1566963682u);
x = 219; y = 7919; z = 1844674407u;
assert(powmod2(x, y, z) == 181227807u);
x = 236; y = 7919; z = 1844674407u;
assert(powmod2(x, y, z) == 217988321u);
x = 253; y = 7919; z = 1844674407u;
assert(powmod2(x, y, z) == 1588843243u);
x = 0; y = 7919; z = 184467u;
assert(powmod2(x, y, z) == 0);
x = 123; y = 0; z = 1844674u;
assert(powmod2(x, y, z) == 1);
writeln("All tests passed!");
}
void benchmarkMulmod(T)() {
// Number of iterations for benchmarking
immutable iterations = 1_000_000;
// Generate random test cases
T[] a_values = new T[iterations];
T[] b_values = new T[iterations];
T[] m_values = new T[iterations];
auto rnd = Random(unpredictableSeed);
// Fill arrays with random values
for (size_t i = 0; i < iterations; i++) {
static if (is(T == ulong)) {
a_values[i] = uniform!("[]")(T.min, T.max / 2, rnd);
b_values[i] = uniform!("[]")(T.min, T.max / 2, rnd);
// Avoid modulus being too small or zero
m_values[i] = uniform!("[]")(T.max / 4, T.max - 1, rnd);
} else {
a_values[i] = uniform!("[]")(T.min, T.max, rnd);
b_values[i] = uniform!("[]")(T.min, T.max, rnd);
// Avoid modulus being too small or zero
m_values[i] = uniform!("[]")(T.max / 2, T.max - 1, rnd);
}
}
// Variables to store results to prevent compiler optimizations
T result1 = 0;
T result2 = 0;
// Test first implementation
writeln("Testing first implementation (ASM-based):");
auto sw1 = StopWatch(AutoStart.yes);
for (size_t i = 0; i < iterations; i++) {
result1 ^= mulmod1!T(a_values[i], b_values[i], m_values[i]);
}
sw1.stop();
// Test second implementation
writeln("Testing second implementation (Standard D):");
auto sw2 = StopWatch(AutoStart.yes);
for (size_t i = 0; i < iterations; i++) {
result2 ^= mulmod2!T(a_values[i], b_values[i], m_values[i]);
}
sw2.stop();
// Print results
writefln("First implementation (ASM): %s", sw1.peek());
writefln("Second implementation (D): %s", sw2.peek());
double ratio = (sw2.peek().total!"usecs" * 100.0 / sw1.peek().total!"usecs");
writefln("Performance ratio: %.2f%% (higher means ASM is faster)", ratio);
// Print dummy value to prevent optimization
writefln("Verification values: %s %s", result1, result2);
}
void benchmarkPowmod(T)() {
// Number of iterations for benchmarking
immutable iterations = 10_000;
// Generate random test cases
T[] x_values = new T[iterations];
T[] n_values = new T[iterations];
T[] m_values = new T[iterations];
auto rnd = Random(unpredictableSeed);
// Fill arrays with random values - using realistic ranges
for (size_t i = 0; i < iterations; i++) {
static if (is(T == ulong)) {
x_values[i] = uniform!("[]")(2, 1000000, rnd);
n_values[i] = uniform!("[]")(10, 10000, rnd);
// Avoid modulus being too small or zero
m_values[i] = uniform!("[]")(1000, 10000000, rnd);
} else {
x_values[i] = uniform!("[]")(2, 1000000, rnd);
n_values[i] = uniform!("[]")(10, 10000, rnd);
// Avoid modulus being too small or zero
m_values[i] = uniform!("[]")(1000, 10000000, rnd);
}
}
// Variables to store results to prevent compiler optimizations
T result1 = 0;
T result2 = 0;
// Test first implementation
writeln("Testing first powmod implementation (with ASM mulmod):");
auto sw1 = StopWatch(AutoStart.yes);
for (size_t i = 0; i < iterations; i++) {
result1 ^= powmod1!T(x_values[i], n_values[i], m_values[i]);
}
sw1.stop();
// Test second implementation
writeln("Testing second powmod implementation (with standard D mulmod):");
auto sw2 = StopWatch(AutoStart.yes);
for (size_t i = 0; i < iterations; i++) {
result2 ^= powmod2!T(x_values[i], n_values[i], m_values[i]);
}
sw2.stop();
// Print results
writefln("First implementation (ASM): %s", sw1.peek());
writefln("Second implementation (D): %s", sw2.peek());
double ratio = (sw2.peek().total!"usecs" * 100.0 / sw1.peek().total!"usecs");
writefln("Performance ratio: %.2f%% (higher means ASM is faster)", ratio);
// Print dummy value to prevent optimization
writefln("Verification values: %s %s", result1, result2);
} |
@ibuclaw can you explain why a = 100; b = 7919; c = 18446744073709551557u; |
Anyone has solution of it please tell me i cant unterstand this errors on why they are coming on x86 systems how much i figureout is that this occured due to 32 bits x86 system the value overflowed |
8c7d8c0
to
43c35de
Compare
Yes. The new fallback code is prone to overflow. The original implementation can't be made simpler by removing the |
i have a solution for that like we do in assembly we treat 16bit number in two register(in 8085) instaed of one and then perform operations on it . we can do same here by implementing 128 bits multipication using more registers in 32 bits (its complex) |
I don't understand why you need inline assembler for 32bit powmod, when the original D code should be sufficient?
|
ok then i will make changes for it |
For the slow path, the Running the benchmark locally, the D code measured faster than the inline asm when the modulus was within this bounds. static T mulmod2(T)(T a, T b, T c) {
static if (T.sizeof == 8) {
static T addmod(T a, T b, T c) {
b = c - b;
if (a >= b)
return a - b;
else
return c - b + a;
}
T result = void; // <---- start new snip
if (c <= 0x100000000)
{
result = a * b;
return result % c;
}
result = 0; // <---- end new snip
b %= c;
while (a > 0) {
if (a & 1)
result = addmod(result, b, c);
a >>= 1;
b = addmod(b, b, c);
}
return result;
}
else {
alias DoubleT = AliasSeq!(void, ushort, uint, void, ulong)[T.sizeof];
DoubleT result = cast(DoubleT) (cast(DoubleT) a * cast(DoubleT) b);
return result % c;
}
}
|
ok i agree then what is better way of optimizing it |
So I think it should be sufficient to optimize mulmod in the following way: static T mulmod(T a, T b, T c)
{
static if (T.sizeof == 8)
{
if (c <= 0x100000000)
{
T result = a * b;
return result % c;
}
T result = 0;
version (D_InlineAsm_X86_64)
{
// 128-bit mul with asm
}
else version (GNU_OR_LDC_X86_64)
{
// 128-bit mul with gcc extended asm
}
else
{
// slow addmod() method (does pragma(inline, true) help?)
}
return result;
}
else
{
// DoubleT method
}
} Does this make sense? |
It might even help when |
i think you are correct here this looks similiar to other open source language iplementation of function |
Sorry for inconvenience |
sorry for wasting your time with inLine ASM should have thought of using core.int128 earlier |
Co-authored-by: Iain Buclaw <[email protected]>
Co-authored-by: Iain Buclaw <[email protected]>
6edc6ab
to
ef8a738
Compare
std/math/exponential.d
Outdated
const product128 = mul(Cent(a), Cent(b)); | ||
Cent centRemainder; | ||
udivmod(product128, Cent(c), centRemainder); | ||
return cast(T)(centRemainder.lo); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry @Aditya-132, looks like @kinke wasn't done with changes to core.int128.
With that PR merged, the correct overloads to call are:
const product128 = mul(Cent(a), Cent(b)); | |
Cent centRemainder; | |
udivmod(product128, Cent(c), centRemainder); | |
return cast(T)(centRemainder.lo); | |
const product128 = mul(a, b); | |
T remainder = void; | |
udivmod(product128, c, remainder); | |
return remainder; |
Thanks for the patience. 🙇
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
UFCS should also be possible
T remainder = void;
mul(a, b).udivmod(c, remainder);
return remainder;
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
UFCS should also be possible
T remainder = void; mul(a, b).udivmod(c, remainder); return remainder;
its giving error
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Suggested change are also giving error
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
DMD PR is merged now, please rebase on top of phobos/master.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@kinke done with changes
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So the error is floating point exception?
There was overflow code between the mul and div once, but that disappeared for some reason?
auto product = mul(a, b);
if (product.hi >= c)
product.hi %= c;
T remainder = void;
udivmod(product, c, remainder);
return remainder;
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
should i make above changes in code
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, adding the overflow condition should allow you to use 64bit udivmod.
Co-authored-by: Iain Buclaw <[email protected]>
std/math/exponential.d
Outdated
else | ||
{ | ||
if (a & 1) | ||
result = addmod(result, b, c); | ||
|
||
a >>= 1; | ||
b = addmod(b, b, c); | ||
import core.int128 : Cent, mul, udivmod; | ||
auto product = mul(a, b); | ||
if (product.hi >= c) | ||
product.hi %= c; | ||
T remainder = void; | ||
udivmod(product, c, remainder); | ||
return remainder; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this being rendered wrong or is indentation missing?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done with correction
Is there any specific reason why it fails for [Main / FreeBSD 13.2 x64 (pull_request) |
Unrelated. |
The key optimizations I've made are:
128-bit multiplication via inline assembly:
For 64-bit types on x86_64 platforms, I've added direct use of hardware multiplication that produces a 128-bit result (stored in high:low registers)
This avoids the repeated addition loop in the original algorithm
Efficient modular reduction:
Implemented a reduction algorithm that handles the 128-bit result efficiently
Uses a simplified approach for when the high bits are zero
Early reduction:
The base value is reduced modulo m at the beginning, which improves performance when the base is large and modulus is small
Special case handling:
Added explicit handling for modulus=1 and exponent=0 cases up front
Version checks:
The code falls back to the original algorithm if inline assembly is not available
This ensures portability while providing optimizations where possible