Skip to content

Commit

Permalink
math: add hardware-accelerated inverse floating point square root
Browse files Browse the repository at this point in the history
  • Loading branch information
Kuratius committed Jan 24, 2025
1 parent dd86e8f commit 83517d8
Show file tree
Hide file tree
Showing 2 changed files with 273 additions and 0 deletions.
15 changes: 15 additions & 0 deletions include/nds/arm9/math.h
Original file line number Diff line number Diff line change
Expand Up @@ -415,6 +415,21 @@ static inline uint32_t sqrt64(uint64_t a)
/// 32 bit floating point value
ARM_CODE float hw_sqrtf(float x);

/// 32-bit inverse floating point sqrt
///
/// @warning
/// Subnormal floats are treated as if they were zero.
///
/// @warning
/// Not safe to call inside an interrupt handler.
///
/// @param x
/// Valid 32 bit non-negative floating point value.
///
/// @return
/// 32 bit floating point value
float hw_rsqrtf_asm(float x);

/// 20.12 fixed point cross product.
///
/// Cross product:
Expand Down
258 changes: 258 additions & 0 deletions source/arm9/rsqrt.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,258 @@
// SPDX-License-Identifier: Zlib
//
// Copyright (C) 2025 aikku93


#include <nds/asminc.h>
// 2^N+1 entries in the LUT
#define RECPLUT_LENGTH_BITS 5
#define QUIET_NAN ((255 << 23) | ((1 << 22) | 1))
#define INF (0xFF << 23)
#ifndef ARM9
#error "ARM9 must be defined."
#endif
.syntax unified
.arch armv5te
.cpu arm946e-s
.arm
BEGIN_ASM_FUNC hw_rsqrtf_asm
asrs r1, r0, #23 @ Exponent -> r1, and check for negative and subnormals
bmi .LinputIsNegative
beq .LinputIsSubnormal
bic r0, r0, r1, lsl #23 @ Significand -> r0
add r0, #1 << 23

tst r1, #1 @ Test even/odd exponent
mov r2, #0
lslne r3, r0, #7 @ Even exponent: Keep significand as-is
lsleq r3, r0, #6 @ Odd exponent: Shift significand down one bit
mov ip, #0x04000001 @ &REG_SQRTCNT + 01h -> ip
add ip, #0x000002B0
strd r2, [ip, #8-1] @ Start square root of significand
ldr r2, [ip, #0-1] @ Ensure we use 64-bit mode
tst r2, #1
streq ip, [ip, #0-1]
cmp r1, #255 @ Check for Inf/NaN inputs
beq .LinputIsInfOrNaN

ldr ip, =hw_rsqrtf_recpLUT - 4*(1<<RECPLUT_LENGTH_BITS)
lsr r2, r0, #23 - RECPLUT_LENGTH_BITS
bic r3, r0, r2, lsl #23 - RECPLUT_LENGTH_BITS
add ip, ip, r2, lsl #2 @ Build reciprocal approximation
ldmia ip, {r2,ip}
str lr, [sp, #-8]! @ <- Awkwardly sandwiched to avoid a stall cycle
rsb ip, r2
umull r3, lr, ip, r3
sub r1, #127 @ Remove exponent bias
sub r2, r2, r3, lsr #23 - RECPLUT_LENGTH_BITS
sub r2, r2, lr, lsl #32 - (23 - RECPLUT_LENGTH_BITS)

umull r3, ip, r2, r2 @ Perform Newton iteration
asr r1, #1 @ Exponent /= 2
umull r3, lr, ip, r0
rsb r1, #127 - 1 - 1 @ Restore exponent bias and apply correction
sub r0, r2, r3, lsr #23
sub r0, r0, lr, lsl #32 - 23
mov ip, #0x04000000 @ Wait until square root is ready, then sqrtX - > r2
1: ldr r2, [ip, #0x2B0]
tst r2, #1 << 15
bne 1b
ldr r2, [ip, #0x2B4]

umull r2, r3, r0, r2 @ (sqrtX * inv)|Exponent as result
lsrs r3, #5
adc r0, r3, r1, lsl #23
ldr pc, [sp], #8

.LinputIsNegative:
ldr r0, =QUIET_NAN @ 1 / sqrt(-x) = qNaN
bx lr

.LinputIsSubnormal:
ldr r0, =INF @ 1 / sqrt(+0) = +Inf
bx lr

.LinputIsInfOrNaN:
subs r0, r0, r1, lsl #23 @ 1 / sqrt(+Inf) = 0
ldrne r0, =QUIET_NAN @ 1 / sqrt(NaN) = NaN
bx lr

//.section .dtcm.hw_rsqrtf_recpLUT, "aw", %progbits
.section .rodata.hw_rsqrtf_recpLUT
.balign 4

hw_rsqrtf_recpLUT:
.word 0x80000000,0x7C1F07C2,0x78787879,0x75075076,0x71C71C72,0x6EB3E454,0x6BCA1AF3,0x6906906A
.word 0x66666667,0x63E7063F,0x61861862,0x5F417D06,0x5D1745D2,0x5B05B05C,0x590B2165,0x572620AF
.word 0x55555556,0x5397829D,0x51EB851F,0x50505051,0x4EC4EC4F,0x4D4873ED,0x4BDA12F7,0x4A7904A8
.word 0x4924924A,0x47DC11F8,0x469EE585,0x456C797E,0x44444445,0x4325C53F,0x42108422,0x41041042
.word 0x40000000






/* //C-version for documentation purposes
#include <nds/arm9/math.h>
#define QUIET_NAN ((255 << 23) | ((1 << 22) | 1))
#define INF (0xFF << 23)
// 2^N+1 entries in the LUT
#define RECPLUT_LENGTH_BITS 5
// Mathematica formula:
// Table[Ceiling[2^31/x], {x, 1, 2, 2^-RECPLUT_LENGTH_BITS}]
static const uint32_t recpLUT[32+1] = {
0x80000000,0x7C1F07C2,0x78787879,0x75075076,0x71C71C72,0x6EB3E454,0x6BCA1AF3,0x6906906A,
0x66666667,0x63E7063F,0x61861862,0x5F417D06,0x5D1745D2,0x5B05B05C,0x590B2165,0x572620AF,
0x55555556,0x5397829D,0x51EB851F,0x50505051,0x4EC4EC4F,0x4D4873ED,0x4BDA12F7,0x4A7904A8,
0x4924924A,0x47DC11F8,0x469EE585,0x456C797E,0x44444445,0x4325C53F,0x42108422,0x41041042,
0x40000000
};
ARM_CODE float hw_rsqrtf(float x)
{
union
{
float f;
u32 i;
} xu;
xu.f = x;
// Check exponent special cases
// These really should be `asrs,bmi,beq`
s32 exp = (s32)xu.i >> 23;
if(exp < 0)
{
// 1 / sqrt(-x) = qNaN
xu.i = QUIET_NAN;
return xu.f;
}
else if(exp == 0)
{
// 1 / sqrt(+0) = +Inf
// We could check for subnormals here if we wanted to
xu.i = INF;
return xu.f;
}
// Build the significand and start a square root as
// soon as possible. We shift the 23-bit significand
// up to 62-bit before the square root, which gives
// us a 31-bit output. Also note for odd exponents,
// we need a further shift up because we will drop
// one bit from `exp`, and must account for it.
u32 significand = xu.i &~ ((u32)exp << 23);
significand += 1u << 23;
{
u64 sigBits = (u64)significand << (62-23);
if((exp & 1) == 0) sigBits <<= 1;
REG_SQRT_PARAM = sigBits;
}
if((REG_SQRTCNT & SQRT_MODE_MASK) != SQRT_64)
{
REG_SQRTCNT = SQRT_64;
}
// Check for Inf/NaN inputs
if(exp == 255)
{
// 1 / sqrt(+Inf) = 0
// 1 / sqrt(NaN) = NaN
xu.i = (xu.i == INF) ? 0 : QUIET_NAN;
return xu.f;
}
// First, get a rough estimate for `1/significand`.
// `significand` is in range 2^23 .. 2^24-1, so we
// split the calculation of `2^k/significand` into
// `2^31/(x*2^-23)` to give us a 31-bit reciprocal.
// The reciprocal is 31 bits wide, but is actually
// 54 bits because of the scaling we just did.
// `x*2^-23` gives us a value in range 1 inclusive
// through 2 exclusive, so we use a lookup table
// to get us most of the way to the result (using
// the high bits of the significand), and we then
// linearly interpolate the result to improve it
// somewhat (this is actually more precise than
// applying Newton's method, because we may start
// much closer to the previous lookup entry than
// to the next, which would slow down convergence).
// Note that we can't use a 32bit reciprocal, as
// when we move to the Newton iteration in the next
// step, we could get a reciprocal of 2^32, which
// then gets cast to 32bit to equal 0.
u32 inv;
{
u32 sigInt = (significand >> (23-RECPLUT_LENGTH_BITS));
u32 sigFrac = significand &~ (sigInt << (23-RECPLUT_LENGTH_BITS));
const u32 *lut = recpLUT - (1<<RECPLUT_LENGTH_BITS) + sigInt;
u32 invA = lut[0];
u32 invB = lut[1];
// Note that we use `x - a*(x-y)` rather than the
// usual `x + a*(y-x)`. This is because our entries
// are monotonically /decreasing/, and we want an
// unsigned multiply here.
inv = invA - (u32)((u64)(invA - invB)*sigFrac >> (23-RECPLUT_LENGTH_BITS));
}
// Now apply Newton's method to improve the result,
// but we only actually need a single iteration to
// get us nearly perfect results.
// The recurrence for 1/x is:
// y(n+1) = y(n)*(2 - x*y(n)) = 2*y(n) - x*y(n)*y(n)
// However, in the interests of improved performance,
// we actually /drop/ one bit here, which avoids us
// having to shift the starting value on top of the
// weighted difference. Note that the bit is being
// dropped in the calculation of `inv*inv>>32` for
// the sake of performance, but this doesn't affect
// the precision of the output beyond 1ulp.
inv = inv - (u32)(((u64)inv*inv >> 32) * significand >> 23);
// If we had an odd exponent, we have to remember we
// shifted the significand up by 1 bit, so account for
// it now, by shifting the inverse /down/ 1 bit.
if((exp & 1) == 0) inv >>= 1;
// Because we calculated an inverse, our exponent needs
// to be negated. Further, because we used a square root,
// the exponent also needs to be halved, ensuring to use
// a floor division by 2 since we already accounted for
// odd exponent values. Finally, we need to subtract 1
// from the exponent and shift the significand up one
// bit to account for the bit dropped during sqrt.
// As an optimization, we also subtract 1 from the new
// exponent, because we can then use the fact that the
// significand has the implicit bit already set at this
// point, and we can then just add the fixed exponent.
s32 newExp = -((exp - 127) >> 1) + 127 - 1 - 1;
// Wait for our square root to be ready, as we are
// bottlenecked by it rather than this code on DSi.
while(REG_SQRTCNT & SQRT_BUSY);
u32 sqrtX = REG_SQRT_RESULT;
// Now that we have the inverse and the square root,
// we can calculate rsqrt(x) using:
// rsqrt(x) = (x^0.5)^-1 = x^0.5 * x^-1
// For the final shift down value, we have:
// -`sqrtX` is a Q31 value
// -`inv` is a Q53 value, and relative to a Q23 value
// -We want a Q23 output
// -The output significand needs to shift up 1 bit
// Note that we also apply rounding here to improve the
// precision of the results. We will be within 1ulp of
// the target even without the rounding, but this will
// improve exactnessness from <50% to >95%.
const u32 shift = (31 + 53-23 - 23 - 1);
const u64 bias = (1ull << shift) / 2;
u32 newSignificand = (u32)(((u64)sqrtX * inv + bias) >> shift);
// Finally, build the output float value
xu.i = newSignificand + ((u32)newExp << 23);
return xu.f;
} */

0 comments on commit 83517d8

Please sign in to comment.