[libc][math] Add float-only option for atan2f. #122979
Conversation
✅ With the latest revision this PR passed the C/C++ code formatter.
@llvm/pr-subscribers-libc

Author: None (lntue)

Changes

Patch is 21.39 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/122979.diff

7 Files Affected:
diff --git a/libc/src/__support/FPUtil/double_double.h b/libc/src/__support/FPUtil/double_double.h
index db3c2c8a3d7a6..825038b22290a 100644
--- a/libc/src/__support/FPUtil/double_double.h
+++ b/libc/src/__support/FPUtil/double_double.h
@@ -20,41 +20,52 @@ namespace fputil {
#define DEFAULT_DOUBLE_SPLIT 27
-using DoubleDouble = LIBC_NAMESPACE::NumberPair<double>;
+template <typename T> struct DefaultSplit;
+template <> struct DefaultSplit<float> {
+ static constexpr size_t VALUE = 12;
+};
+template <> struct DefaultSplit<double> {
+ static constexpr size_t VALUE = 27;
+};
+
+using DoubleDouble = NumberPair<double>;
+using FloatFloat = NumberPair<float>;
// The output of Dekker's FastTwoSum algorithm is correct, i.e.:
// r.hi + r.lo = a + b exactly
// and |r.lo| < eps(r.lo)
// Assumption: |a| >= |b|, or a = 0.
-template <bool FAST2SUM = true>
-LIBC_INLINE constexpr DoubleDouble exact_add(double a, double b) {
- DoubleDouble r{0.0, 0.0};
+template <bool FAST2SUM = true, typename T = double>
+LIBC_INLINE constexpr NumberPair<T> exact_add(T a, T b) {
+ NumberPair<T> r{0.0, 0.0};
if constexpr (FAST2SUM) {
r.hi = a + b;
- double t = r.hi - a;
+ T t = r.hi - a;
r.lo = b - t;
} else {
r.hi = a + b;
- double t1 = r.hi - a;
- double t2 = r.hi - t1;
- double t3 = b - t1;
- double t4 = a - t2;
+ T t1 = r.hi - a;
+ T t2 = r.hi - t1;
+ T t3 = b - t1;
+ T t4 = a - t2;
r.lo = t3 + t4;
}
return r;
}
// Assumption: |a.hi| >= |b.hi|
-LIBC_INLINE constexpr DoubleDouble add(const DoubleDouble &a,
- const DoubleDouble &b) {
- DoubleDouble r = exact_add(a.hi, b.hi);
- double lo = a.lo + b.lo;
+template <typename T>
+LIBC_INLINE constexpr NumberPair<T> add(const NumberPair<T> &a,
+ const NumberPair<T> &b) {
+ NumberPair<T> r = exact_add(a.hi, b.hi);
+ T lo = a.lo + b.lo;
return exact_add(r.hi, r.lo + lo);
}
// Assumption: |a.hi| >= |b|
-LIBC_INLINE constexpr DoubleDouble add(const DoubleDouble &a, double b) {
- DoubleDouble r = exact_add<false>(a.hi, b);
+template <typename T>
+LIBC_INLINE constexpr NumberPair<T> add(const NumberPair<T> &a, T b) {
+ NumberPair<T> r = exact_add<false>(a.hi, b);
return exact_add(r.hi, r.lo + a.lo);
}
@@ -63,12 +74,12 @@ LIBC_INLINE constexpr DoubleDouble add(const DoubleDouble &a, double b) {
// Zimmermann, P., "Note on the Veltkamp/Dekker Algorithms with Directed
// Roundings," https://inria.hal.science/hal-04480440.
// Default splitting constant = 2^ceil(prec(double)/2) + 1 = 2^27 + 1.
-template <size_t N = DEFAULT_DOUBLE_SPLIT>
-LIBC_INLINE constexpr DoubleDouble split(double a) {
- DoubleDouble r{0.0, 0.0};
+template <typename T = double, size_t N = DefaultSplit<T>::VALUE>
+LIBC_INLINE constexpr NumberPair<T> split(T a) {
+ NumberPair<T> r{0.0, 0.0};
// CN = 2^N.
- constexpr double CN = static_cast<double>(1 << N);
- constexpr double C = CN + 1.0;
+ constexpr T CN = static_cast<T>(1 << N);
+ constexpr T C = CN + 1.0;
double t1 = C * a;
double t2 = a - t1;
r.hi = t1 + t2;
@@ -77,16 +88,15 @@ LIBC_INLINE constexpr DoubleDouble split(double a) {
}
// Helper for non-fma exact mult where the first number is already split.
-template <size_t SPLIT_B = DEFAULT_DOUBLE_SPLIT>
-LIBC_INLINE DoubleDouble exact_mult(const DoubleDouble &as, double a,
- double b) {
- DoubleDouble bs = split<SPLIT_B>(b);
- DoubleDouble r{0.0, 0.0};
+template <typename T = double, size_t SPLIT_B = DefaultSplit<T>::VALUE>
+LIBC_INLINE NumberPair<T> exact_mult(const NumberPair<T> &as, T a, T b) {
+ NumberPair<T> bs = split<T, SPLIT_B>(b);
+ NumberPair<T> r{0.0, 0.0};
r.hi = a * b;
- double t1 = as.hi * bs.hi - r.hi;
- double t2 = as.hi * bs.lo + t1;
- double t3 = as.lo * bs.hi + t2;
+ T t1 = as.hi * bs.hi - r.hi;
+ T t2 = as.hi * bs.lo + t1;
+ T t3 = as.lo * bs.hi + t2;
r.lo = as.lo * bs.lo + t3;
return r;
@@ -99,18 +109,18 @@ LIBC_INLINE DoubleDouble exact_mult(const DoubleDouble &as, double a,
// Using Theorem 1 in the paper above, without FMA instruction, if we restrict
// the generated constants to precision <= 51, and splitting it by 2^28 + 1,
// then a * b = r.hi + r.lo is exact for all rounding modes.
-template <size_t SPLIT_B = 27>
-LIBC_INLINE DoubleDouble exact_mult(double a, double b) {
- DoubleDouble r{0.0, 0.0};
+template <typename T = double, size_t SPLIT_B = DefaultSplit<T>::VALUE>
+LIBC_INLINE NumberPair<T> exact_mult(T a, T b) {
+ NumberPair<T> r{0.0, 0.0};
#ifdef LIBC_TARGET_CPU_HAS_FMA
r.hi = a * b;
r.lo = fputil::multiply_add(a, b, -r.hi);
#else
// Dekker's Product.
- DoubleDouble as = split(a);
+ NumberPair<T> as = split(a);
- r = exact_mult<SPLIT_B>(as, a, b);
+ r = exact_mult<T, SPLIT_B>(as, a, b);
#endif // LIBC_TARGET_CPU_HAS_FMA
return r;
@@ -125,7 +135,7 @@ LIBC_INLINE DoubleDouble quick_mult(double a, const DoubleDouble &b) {
template <size_t SPLIT_B = 27>
LIBC_INLINE DoubleDouble quick_mult(const DoubleDouble &a,
const DoubleDouble &b) {
- DoubleDouble r = exact_mult<SPLIT_B>(a.hi, b.hi);
+ DoubleDouble r = exact_mult<double, SPLIT_B>(a.hi, b.hi);
double t1 = multiply_add(a.hi, b.lo, r.lo);
double t2 = multiply_add(a.lo, b.hi, t1);
r.lo = t2;
@@ -157,19 +167,20 @@ LIBC_INLINE DoubleDouble multiply_add<DoubleDouble>(const DoubleDouble &a,
// rl = q * (ah - bh * rh) + q * (al - bl * rh)
// as accurate as possible, then the error is bounded by:
// |(ah + al) / (bh + bl) - (rh + rl)| < O(bl/bh) * (2^-52 + al/ah + bl/bh)
-LIBC_INLINE DoubleDouble div(const DoubleDouble &a, const DoubleDouble &b) {
- DoubleDouble r;
- double q = 1.0 / b.hi;
+template <typename T>
+LIBC_INLINE NumberPair<T> div(const NumberPair<T> &a, const NumberPair<T> &b) {
+ NumberPair<T> r;
+ T q = T(1) / b.hi;
r.hi = a.hi * q;
#ifdef LIBC_TARGET_CPU_HAS_FMA
- double e_hi = fputil::multiply_add(b.hi, -r.hi, a.hi);
- double e_lo = fputil::multiply_add(b.lo, -r.hi, a.lo);
+ T e_hi = fputil::multiply_add(b.hi, -r.hi, a.hi);
+ T e_lo = fputil::multiply_add(b.lo, -r.hi, a.lo);
#else
- DoubleDouble b_hi_r_hi = fputil::exact_mult(b.hi, -r.hi);
- DoubleDouble b_lo_r_hi = fputil::exact_mult(b.lo, -r.hi);
- double e_hi = (a.hi + b_hi_r_hi.hi) + b_hi_r_hi.lo;
- double e_lo = (a.lo + b_lo_r_hi.hi) + b_lo_r_hi.lo;
+ NumberPair<T> b_hi_r_hi = fputil::exact_mult(b.hi, -r.hi);
+ NumberPair<T> b_lo_r_hi = fputil::exact_mult(b.lo, -r.hi);
+ T e_hi = (a.hi + b_hi_r_hi.hi) + b_hi_r_hi.lo;
+ T e_lo = (a.lo + b_lo_r_hi.hi) + b_lo_r_hi.lo;
#endif // LIBC_TARGET_CPU_HAS_FMA
r.lo = q * (e_hi + e_lo);
diff --git a/libc/src/__support/macros/optimization.h b/libc/src/__support/macros/optimization.h
index a2634950d431b..253843e5e37aa 100644
--- a/libc/src/__support/macros/optimization.h
+++ b/libc/src/__support/macros/optimization.h
@@ -45,6 +45,7 @@ LIBC_INLINE constexpr bool expects_bool_condition(T value, T expected) {
#define LIBC_MATH_FAST \
(LIBC_MATH_SKIP_ACCURATE_PASS | LIBC_MATH_SMALL_TABLES | \
LIBC_MATH_NO_ERRNO | LIBC_MATH_NO_EXCEPT)
+#define LIBC_MATH_INTERMEDIATE_COMP_IN_FLOAT 0x10
#ifndef LIBC_MATH
#define LIBC_MATH 0
@@ -58,4 +59,8 @@ LIBC_INLINE constexpr bool expects_bool_condition(T value, T expected) {
#define LIBC_MATH_HAS_SMALL_TABLES
#endif
+#if (LIBC_MATH & LIBC_MATH_INTERMEDIATE_COMP_IN_FLOAT)
+#define LIBC_MATH_HAS_INTERMEDIATE_COMP_IN_FLOAT
+#endif
+
#endif // LLVM_LIBC_SRC___SUPPORT_MACROS_OPTIMIZATION_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 9faf46d491426..2bda741b453f5 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -4052,8 +4052,10 @@ add_entrypoint_object(
atan2f.cpp
HDRS
../atan2f.h
+ atan2f_float.h
DEPENDS
.inv_trigf_utils
+ libc.src.__support.FPUtil.double_double
libc.src.__support.FPUtil.fp_bits
libc.src.__support.FPUtil.multiply_add
libc.src.__support.FPUtil.nearest_integer
diff --git a/libc/src/math/generic/atan2f.cpp b/libc/src/math/generic/atan2f.cpp
index db7639396cdd7..5ac2b29438ea9 100644
--- a/libc/src/math/generic/atan2f.cpp
+++ b/libc/src/math/generic/atan2f.cpp
@@ -17,6 +17,14 @@
#include "src/__support/macros/config.h"
#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+#if defined(LIBC_MATH_HAS_SKIP_ACCURATE_PASS) && \
+ defined(LIBC_MATH_HAS_INTERMEDIATE_COMP_IN_FLOAT)
+
+// We use float-float implementation to reduce size.
+#include "src/math/generic/atan2f_float.h"
+
+#else
+
namespace LIBC_NAMESPACE_DECL {
namespace {
@@ -324,3 +332,5 @@ LLVM_LIBC_FUNCTION(float, atan2f, (float y, float x)) {
}
} // namespace LIBC_NAMESPACE_DECL
+
+#endif
diff --git a/libc/src/math/generic/atan2f_float.h b/libc/src/math/generic/atan2f_float.h
new file mode 100644
index 0000000000000..1819a3c3fb0a0
--- /dev/null
+++ b/libc/src/math/generic/atan2f_float.h
@@ -0,0 +1,239 @@
+//===-- Single-precision atan2f function ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/double_double.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/nearest_integer.h"
+#include "src/__support/FPUtil/rounding_mode.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+#include "src/math/atan2f.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace {
+
+using FloatFloat = fputil::FloatFloat;
+
+// atan(i/16) with i = 0..16, generated by Sollya with:
+// > for i from 0 to 16 do {
+// a = round(atan(i/16), SG, RN);
+// b = round(atan(i/16) - a, SG, RN);
+// print("{", b, ",", a, "},");
+// };
+constexpr FloatFloat ATAN_I[17] = {
+ {0.0f, 0.0f},
+ {-0x1.1a6042p-30f, 0x1.ff55bcp-5f},
+ {-0x1.54f424p-30f, 0x1.fd5baap-4f},
+ {0x1.79cb6p-28f, 0x1.7b97b4p-3f},
+ {-0x1.b4dfc8p-29f, 0x1.f5b76p-3f},
+ {-0x1.1f0286p-27f, 0x1.362774p-2f},
+ {0x1.e4defp-30f, 0x1.6f6194p-2f},
+ {0x1.e611fep-29f, 0x1.a64eecp-2f},
+ {0x1.586ed4p-28f, 0x1.dac67p-2f},
+ {-0x1.6499e6p-26f, 0x1.0657eap-1f},
+ {0x1.7bdfd6p-26f, 0x1.1e00bap-1f},
+ {-0x1.98e422p-28f, 0x1.345f02p-1f},
+ {0x1.934f7p-28f, 0x1.4978fap-1f},
+ {0x1.c5a6c6p-27f, 0x1.5d5898p-1f},
+ {0x1.5e118cp-27f, 0x1.700a7cp-1f},
+ {-0x1.1d4eb6p-26f, 0x1.819d0cp-1f},
+ {-0x1.777a5cp-26f, 0x1.921fb6p-1f},
+};
+
+// Approximate atan(x) for |x| <= 2^-5.
+// Using degree-3 Taylor polynomial:
+// P = x - x^3/3
+// Then the absolute error is bounded by:
+// |atan(x) - P(x)| < |x|^5/5 < 2^(-5*5) / 5 < 2^-27.
+// And the relative error is bounded by:
+// |(atan(x) - P(x))/atan(x)| < |x|^4 / 4 < 2^-22.
+// For x = x_hi + x_lo, fully expand the polynomial and drop any terms less than
+// ulp(x_hi^3 / 3) gives us:
+// P(x) ~ x_hi - x_hi^3/3 + x_lo * (1 - x_hi^2)
+FloatFloat atan_eval(const FloatFloat &x) {
+ FloatFloat p;
+ p.hi = x.hi;
+ float x_hi_sq = x.hi * x.hi;
+ // c0 ~ - x_hi^2 / 3
+ float c0 = -0x1.555556p-2f * x_hi_sq;
+ // c1 ~ x_lo * (1 - x_hi^2)
+ float c1 = fputil::multiply_add(x_hi_sq, -x.lo, x.lo);
+ // p.lo ~ - x_hi^3 / 3 + x_lo * (1 - x_hi^2)
+ p.lo = fputil::multiply_add(x.hi, c0, c1);
+ return p;
+}
+
+} // anonymous namespace
+
+// There are several range reduction steps we can take for atan2(y, x) as
+// follow:
+
+// * Range reduction 1: signness
+// atan2(y, x) will return a number between -PI and PI representing the angle
+// forming by the 0x axis and the vector (x, y) on the 0xy-plane.
+// In particular, we have that:
+// atan2(y, x) = atan( y/x ) if x >= 0 and y >= 0 (I-quadrant)
+// = pi + atan( y/x ) if x < 0 and y >= 0 (II-quadrant)
+// = -pi + atan( y/x ) if x < 0 and y < 0 (III-quadrant)
+// = atan( y/x ) if x >= 0 and y < 0 (IV-quadrant)
+// Since atan function is odd, we can use the formula:
+// atan(-u) = -atan(u)
+// to adjust the above conditions a bit further:
+// atan2(y, x) = atan( |y|/|x| ) if x >= 0 and y >= 0 (I-quadrant)
+// = pi - atan( |y|/|x| ) if x < 0 and y >= 0 (II-quadrant)
+// = -pi + atan( |y|/|x| ) if x < 0 and y < 0 (III-quadrant)
+// = -atan( |y|/|x| ) if x >= 0 and y < 0 (IV-quadrant)
+// Which can be simplified to:
+// atan2(y, x) = sign(y) * atan( |y|/|x| ) if x >= 0
+// = sign(y) * (pi - atan( |y|/|x| )) if x < 0
+
+// * Range reduction 2: reciprocal
+// Now that the argument inside atan is positive, we can use the formula:
+// atan(1/x) = pi/2 - atan(x)
+// to make the argument inside atan <= 1 as follow:
+// atan2(y, x) = sign(y) * atan( |y|/|x|) if 0 <= |y| <= x
+// = sign(y) * (pi/2 - atan( |x|/|y| )) if 0 <= x < |y|
+// = sign(y) * (pi - atan( |y|/|x| )) if 0 <= |y| <= -x
+// = sign(y) * (pi/2 + atan( |x|/|y| )) if 0 <= -x < |y|
+
+// * Range reduction 3: look up table.
+// After the previous two range reduction steps, we reduce the problem to
+// compute atan(u) with 0 <= u <= 1, or to be precise:
+// atan( n / d ) where n = min(|x|, |y|) and d = max(|x|, |y|).
+// An accurate polynomial approximation for the whole [0, 1] input range will
+// require a very large degree. To make it more efficient, we reduce the input
+// range further by finding an integer idx such that:
+// | n/d - idx/16 | <= 1/32.
+// In particular,
+// idx := round(2^4 * n/d)
+// Then for the fast pass, we find a polynomial approximation for:
+// atan( n/d ) ~ atan( idx/16 ) + (n/d - idx/16) * Q(n/d - idx/16)
+// with Q(x) = x - x^3/3 being the cubic Taylor polynomial of atan(x).
+// Its error in float-float precision is estimated in Sollya to be:
+// > P = x - x^3/3;
+// > dirtyinfnorm(atan(x) - P, [-2^-5, 2^-5]);
+// 0x1.995...p-28.
+
+LLVM_LIBC_FUNCTION(float, atan2f, (float y, float x)) {
+ using FPBits = typename fputil::FPBits<float>;
+ constexpr float IS_NEG[2] = {1.0f, -1.0f};
+ constexpr FloatFloat ZERO = {0.0f, 0.0f};
+ constexpr FloatFloat MZERO = {-0.0f, -0.0f};
+ constexpr FloatFloat PI = {-0x1.777a5cp-24f, 0x1.921fb6p1f};
+ constexpr FloatFloat MPI = {0x1.777a5cp-24f, -0x1.921fb6p1f};
+ constexpr FloatFloat PI_OVER_4 = {-0x1.777a5cp-26f, 0x1.921fb6p-1f};
+ constexpr FloatFloat PI_OVER_2 = {-0x1.777a5cp-25f, 0x1.921fb6p0f};
+ constexpr FloatFloat MPI_OVER_2 = {0x1.777a5cp-25f, -0x1.921fb6p0f};
+ constexpr FloatFloat THREE_PI_OVER_4 = {-0x1.99bc5cp-28f, 0x1.2d97c8p1f};
+ // Adjustment for constant term:
+ // CONST_ADJ[x_sign][y_sign][recip]
+ constexpr FloatFloat CONST_ADJ[2][2][2] = {
+ {{ZERO, MPI_OVER_2}, {MZERO, MPI_OVER_2}},
+ {{MPI, PI_OVER_2}, {MPI, PI_OVER_2}}};
+
+ FPBits x_bits(x), y_bits(y);
+ bool x_sign = x_bits.sign().is_neg();
+ bool y_sign = y_bits.sign().is_neg();
+ x_bits = x_bits.abs();
+ y_bits = y_bits.abs();
+ uint32_t x_abs = x_bits.uintval();
+ uint32_t y_abs = y_bits.uintval();
+ bool recip = x_abs < y_abs;
+ uint32_t min_abs = recip ? x_abs : y_abs;
+ uint32_t max_abs = !recip ? x_abs : y_abs;
+ unsigned min_exp = static_cast<unsigned>(min_abs >> FPBits::FRACTION_LEN);
+ unsigned max_exp = static_cast<unsigned>(max_abs >> FPBits::FRACTION_LEN);
+
+ float num = FPBits(min_abs).get_val();
+ float den = FPBits(max_abs).get_val();
+
+ // Check for exceptional cases, whether inputs are 0, inf, nan, or close to
+ // overflow, or close to underflow.
+ if (LIBC_UNLIKELY(max_exp > 0xffU - 64U || min_exp < 64U)) {
+ if (x_bits.is_nan() || y_bits.is_nan())
+ return FPBits::quiet_nan().get_val();
+ unsigned x_except = x == 0.0f ? 0 : (FPBits(x_abs).is_inf() ? 2 : 1);
+ unsigned y_except = y == 0.0f ? 0 : (FPBits(y_abs).is_inf() ? 2 : 1);
+
+ // Exceptional cases:
+ // EXCEPT[y_except][x_except][x_is_neg]
+ // with x_except & y_except:
+ // 0: zero
+ // 1: finite, non-zero
+ // 2: infinity
+ constexpr FloatFloat EXCEPTS[3][3][2] = {
+ {{ZERO, PI}, {ZERO, PI}, {ZERO, PI}},
+ {{PI_OVER_2, PI_OVER_2}, {ZERO, ZERO}, {ZERO, PI}},
+ {{PI_OVER_2, PI_OVER_2},
+ {PI_OVER_2, PI_OVER_2},
+ {PI_OVER_4, THREE_PI_OVER_4}},
+ };
+
+ if ((x_except != 1) || (y_except != 1)) {
+ FloatFloat r = EXCEPTS[y_except][x_except][x_sign];
+ return fputil::multiply_add(IS_NEG[y_sign], r.hi, IS_NEG[y_sign] * r.lo);
+ }
+ bool scale_up = min_exp < 64U;
+ bool scale_down = max_exp > 0xffU - 64U;
+ // At least one input is denormal, multiply both numerator and denominator
+ // by some large enough power of 2 to normalize denormal inputs.
+ if (scale_up) {
+ num *= 0x1.0p32f;
+ if (!scale_down)
+ den *= 0x1.0p32f;
+ } else if (scale_down) {
+ den *= 0x1.0p-32f;
+ if (!scale_up)
+ num *= 0x1.0p-32f;
+ }
+
+ min_abs = FPBits(num).uintval();
+ max_abs = FPBits(den).uintval();
+ min_exp = static_cast<unsigned>(min_abs >> FPBits::FRACTION_LEN);
+ max_exp = static_cast<unsigned>(max_abs >> FPBits::FRACTION_LEN);
+ }
+
+ float final_sign = IS_NEG[(x_sign != y_sign) != recip];
+ FloatFloat const_term = CONST_ADJ[x_sign][y_sign][recip];
+ unsigned exp_diff = max_exp - min_exp;
+ // We have the following bound for normalized n and d:
+ // 2^(-exp_diff - 1) < n/d < 2^(-exp_diff + 1).
+ if (LIBC_UNLIKELY(exp_diff > 25)) {
+ return fputil::multiply_add(final_sign, const_term.hi,
+ final_sign * (const_term.lo + num / den));
+ }
+
+ float k = fputil::nearest_integer(16.0f * num / den);
+ unsigned idx = static_cast<unsigned>(k);
+ // k = idx / 16
+ k *= 0x1.0p-4f;
+
+ // Range reduction:
+ // atan(n/d) - atan(k) = atan((n/d - k) / (1 + (n/d) * k))
+ //                     = atan((n - d * k) / (d + n * k))
+ FloatFloat num_k = fputil::exact_mult(num, k);
+ FloatFloat den_k = fputil::exact_mult(den, k);
+
+ // num_dd = n - d * k
+ FloatFloat num_ff = fputil::exact_add(num - den_k.hi, -den_k.lo);
+ // den_dd = d + n * k
+ FloatFloat den_ff = fputil::exact_add(den, num_k.hi);
+ den_ff.lo += num_k.lo;
+
+ // q = (n - d * k) / (d + n * k)
+ FloatFloat q = fputil::div(num_ff, den_ff);
+ // p ~ atan(q)
+ FloatFloat p = atan_eval(q);
+
+ FloatFloat r = fputil::add(const_term, fputil::add(ATAN_I[idx], p));
+ return final_sign * r.hi;
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/range_reduction_double_fma.h b/libc/src/math/generic/range_reduction_double_fma.h
index cab031c28baa1..8e0bc3a42462c 100644
--- a/libc/src/math/generic/range_reduction_double_fma.h
+++ b/libc/src/math/generic/range_reduction_double_fma.h
@@ -33,14 +33,14 @@ LIBC_INLINE unsigned LargeRangeReduction::fast(double x, DoubleDouble &u) {
// 2^62 <= |x_reduced| < 2^(62 + 16) = 2^78
x_reduced = xbits.get_val();
// x * c_hi = ph.hi + ph.lo exactly.
- DoubleDouble ph =
- fputil::exact_mult<SPLIT>(x_reduced, ONE_TWENTY_EIGHT_OVER_PI[idx][0]);
+ DoubleDouble ph = fputil::exact_mult<double, SPLIT>(
+ x_reduced, ONE_TWENTY_EIGHT_OVER_PI[idx][0]);
// x * c_mid = pm.hi + pm.lo exactly.
- DoubleDouble pm =
- fputil::exact_mult<SPLIT>(x_reduced, ONE_TWENTY_EIGHT_OVER_PI[idx][1]);
+ DoubleDouble pm = fputil::exact_mult<double, SPLIT>(
+ x_reduced, ONE_TWENTY_EIGHT_OVER_PI[idx][1]);
// x * c_lo = pl.hi + pl.lo exactly.
- DoubleDouble pl =
- fputil::exact_mult<SPLIT>(x_reduced, ONE_TWENTY_EIGHT_OVER_PI[idx][2]);
+ DoubleDouble pl = fputil::exact_mult<double, SPLIT>(
+ x_reduced, ONE_TWENTY_EIGHT_OVER_PI[idx][2]);
// Extract integral parts and fractional parts of (ph.lo + pm.hi).
double sum_hi = ph.lo + pm.hi;
double kd = fputil::nearest_integer(sum_hi);
diff --git a/libc/src/math/generic/range_reduction_double_nofma.h b/libc/src/math/generic/range_reduction_double_...
[truncated]
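As an aside, the heart of the double_double.h change is templating Dekker's FastTwoSum and the Veltkamp split / Dekker product over the element type so they can be instantiated for float. Below is a minimal standalone sketch of those two building blocks for float; the struct and function names are illustrative only and are not the libc internals.

#include <cstdio>

struct FloatFloat {
  float hi, lo;
};

// FastTwoSum: assuming |a| >= |b| (or a == 0), r.hi + r.lo == a + b exactly.
FloatFloat exact_add(float a, float b) {
  FloatFloat r;
  r.hi = a + b;
  float t = r.hi - a;
  r.lo = b - t;
  return r;
}

// Veltkamp split with N = 12 = ceil(prec(float)/2): a == s.hi + s.lo, and each
// half carries roughly half of float's 24-bit significand.
FloatFloat split(float a) {
  constexpr float C = 0x1.0p12f + 1.0f; // 2^12 + 1
  float t1 = C * a;
  float t2 = a - t1;
  FloatFloat s;
  s.hi = t1 + t2;
  s.lo = a - s.hi;
  return s;
}

// Dekker product: r.hi + r.lo == a * b exactly, without an FMA instruction.
FloatFloat exact_mult(float a, float b) {
  FloatFloat as = split(a);
  FloatFloat bs = split(b);
  FloatFloat r;
  r.hi = a * b;
  float t1 = as.hi * bs.hi - r.hi;
  float t2 = as.hi * bs.lo + t1;
  float t3 = as.lo * bs.hi + t2;
  r.lo = as.lo * bs.lo + t3;
  return r;
}

int main() {
  // A product of two floats has up to 48 significant bits; hi + lo recovers
  // all of them, which a single float cannot.
  FloatFloat p = exact_mult(0x1.234568p0f, 0x1.9e3780p-1f);
  std::printf("hi = %a, lo = %a, hi + lo = %a\n", p.hi, p.lo,
              static_cast<double>(p.hi) + static_cast<double>(p.lo));
  return 0;
}

When FMA is available, the Dekker product collapses to a single multiply plus fputil::multiply_add(a, b, -r.hi), which is exactly what the patched exact_mult does under LIBC_TARGET_CPU_HAS_FMA.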
Can you add a commit description that addresses "why is this patch necessary?" In particular, if it's size related, having some measurement and for which architecture might be useful.
  static constexpr size_t VALUE = 12;
};
template <> struct DefaultSplit<double> {
  static constexpr size_t VALUE = 27;
Suggested change:
-  static constexpr size_t VALUE = 27;
+  static constexpr size_t VALUE = DEFAULT_DOUBLE_SPLIT;
Done.
libc/src/math/generic/atan2f_float.h (Outdated)
        den *= 0x1.0p32f;
    } else if (scale_down) {
      den *= 0x1.0p-32f;
      if (!scale_up)
Technically, if we reach this point, we know scale_up is false because of the else if on L191. The whole conditional block here looks like it could be rewritten as two ternary expressions, but I guess they would have to be wrapped in a check for either being true.
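A sketch of the rewrite suggested above (hypothetical; the function name is made up and this is not necessarily what the patch ended up doing). A fully equivalent ternary form still has to special-case both flags being set, which is what the inner !scale_down / !scale_up checks handle in the original:

// Equivalent to the nested form quoted above, case by case.
inline void normalize(float &num, float &den, bool scale_up, bool scale_down) {
  if (scale_up || scale_down) {
    num *= scale_up ? 0x1.0p32f : 0x1.0p-32f;
    den *= scale_down ? (scale_up ? 1.0f : 0x1.0p-32f)
                      : (scale_up ? 0x1.0p32f : 1.0f);
  }
}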
Done.
some style nits
template <typename T> struct DefaultSplit;
template <> struct DefaultSplit<float> {
  static constexpr size_t VALUE = 12;
};
template <> struct DefaultSplit<double> {
  static constexpr size_t VALUE = DEFAULT_DOUBLE_SPLIT;
};
these should probably be consistent on how the number is defined. Either there should be a macro DEFAULT_FLOAT_SPLIT to match DEFAULT_DOUBLE_SPLIT, or both of them should be just numbers here. Personally I'd lean towards deleting the macro, since this struct already effectively names the value.
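For concreteness, the first alternative mentioned above might look like the sketch below; DEFAULT_FLOAT_SPLIT is hypothetical and does not exist in the patch.

#include <cstddef>

#define DEFAULT_FLOAT_SPLIT 12
#define DEFAULT_DOUBLE_SPLIT 27

template <typename T> struct DefaultSplit;
template <> struct DefaultSplit<float> {
  static constexpr std::size_t VALUE = DEFAULT_FLOAT_SPLIT; // ceil(24 / 2)
};
template <> struct DefaultSplit<double> {
  static constexpr std::size_t VALUE = DEFAULT_DOUBLE_SPLIT; // ceil(53 / 2)
};

static_assert(DefaultSplit<float>::VALUE == 12, "");
static_assert(DefaultSplit<double>::VALUE == 27, "");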
  FloatFloat num_k = fputil::exact_mult(num, k);
  FloatFloat den_k = fputil::exact_mult(den, k);

  // num_dd = n - d * k
these comments should be updated, since these are now _ff instead of _dd.
For targets that have a single-precision FPU but no double-precision FPU, such as the Cortex-M4, using float-float instead of double for the intermediate computations can reduce code size. In that configuration, with the accurate pass skipped, the float-only option for atan2f implemented in this PR reduces the code size of this function by ~1 KB compared to the double-precision version.
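For reference, a sketch of how a build would opt into this path; the command-line form below is an assumption, and only the macro names and the 0x10 bit value come from this patch. Setting both bits in LIBC_MATH makes optimization.h define the corresponding LIBC_MATH_HAS_* feature macros, which atan2f.cpp then uses to select the float-float implementation:

// Hypothetical compile-time configuration, e.g. passed by the build system:
//   -DLIBC_MATH="(LIBC_MATH_SKIP_ACCURATE_PASS | LIBC_MATH_INTERMEDIATE_COMP_IN_FLOAT)"
//
// optimization.h turns the selected bits into feature-test macros:
//   #if (LIBC_MATH & LIBC_MATH_INTERMEDIATE_COMP_IN_FLOAT)
//   #define LIBC_MATH_HAS_INTERMEDIATE_COMP_IN_FLOAT
//   #endif
//
// and atan2f.cpp picks the smaller float-float version when both are present:
#if defined(LIBC_MATH_HAS_SKIP_ACCURATE_PASS) && \
    defined(LIBC_MATH_HAS_INTERMEDIATE_COMP_IN_FLOAT)
#include "src/math/generic/atan2f_float.h"
#else
// ... default double-based implementation ...
#endif

Note that LIBC_MATH_FAST does not include the new 0x10 bit, so the float-float path remains strictly opt-in.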