[libc][math] Add float-only option for atan2f. #122979
Conversation
✅ With the latest revision this PR passed the C/C++ code formatter.
@llvm/pr-subscribers-libc

Author: None (lntue)

Changes

Patch is 21.39 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/122979.diff

7 Files Affected:
diff --git a/libc/src/__support/FPUtil/double_double.h b/libc/src/__support/FPUtil/double_double.h
index db3c2c8a3d7a6..825038b22290a 100644
--- a/libc/src/__support/FPUtil/double_double.h
+++ b/libc/src/__support/FPUtil/double_double.h
@@ -20,41 +20,52 @@ namespace fputil {
#define DEFAULT_DOUBLE_SPLIT 27
-using DoubleDouble = LIBC_NAMESPACE::NumberPair<double>;
+template <typename T> struct DefaultSplit;
+template <> struct DefaultSplit<float> {
+ static constexpr size_t VALUE = 12;
+};
+template <> struct DefaultSplit<double> {
+ static constexpr size_t VALUE = 27;
+};
+
+using DoubleDouble = NumberPair<double>;
+using FloatFloat = NumberPair<float>;
// The output of Dekker's FastTwoSum algorithm is correct, i.e.:
// r.hi + r.lo = a + b exactly
// and |r.lo| < eps(r.lo)
// Assumption: |a| >= |b|, or a = 0.
-template <bool FAST2SUM = true>
-LIBC_INLINE constexpr DoubleDouble exact_add(double a, double b) {
- DoubleDouble r{0.0, 0.0};
+template <bool FAST2SUM = true, typename T = double>
+LIBC_INLINE constexpr NumberPair<T> exact_add(T a, T b) {
+ NumberPair<T> r{0.0, 0.0};
if constexpr (FAST2SUM) {
r.hi = a + b;
- double t = r.hi - a;
+ T t = r.hi - a;
r.lo = b - t;
} else {
r.hi = a + b;
- double t1 = r.hi - a;
- double t2 = r.hi - t1;
- double t3 = b - t1;
- double t4 = a - t2;
+ T t1 = r.hi - a;
+ T t2 = r.hi - t1;
+ T t3 = b - t1;
+ T t4 = a - t2;
r.lo = t3 + t4;
}
return r;
}
// Assumption: |a.hi| >= |b.hi|
-LIBC_INLINE constexpr DoubleDouble add(const DoubleDouble &a,
- const DoubleDouble &b) {
- DoubleDouble r = exact_add(a.hi, b.hi);
- double lo = a.lo + b.lo;
+template <typename T>
+LIBC_INLINE constexpr NumberPair<T> add(const NumberPair<T> &a,
+ const NumberPair<T> &b) {
+ NumberPair<T> r = exact_add(a.hi, b.hi);
+ T lo = a.lo + b.lo;
return exact_add(r.hi, r.lo + lo);
}
// Assumption: |a.hi| >= |b|
-LIBC_INLINE constexpr DoubleDouble add(const DoubleDouble &a, double b) {
- DoubleDouble r = exact_add<false>(a.hi, b);
+template <typename T>
+LIBC_INLINE constexpr NumberPair<T> add(const NumberPair<T> &a, T b) {
+ NumberPair<T> r = exact_add<false>(a.hi, b);
return exact_add(r.hi, r.lo + a.lo);
}
@@ -63,12 +74,12 @@ LIBC_INLINE constexpr DoubleDouble add(const DoubleDouble &a, double b) {
// Zimmermann, P., "Note on the Veltkamp/Dekker Algorithms with Directed
// Roundings," https://inria.hal.science/hal-04480440.
// Default splitting constant = 2^ceil(prec(double)/2) + 1 = 2^27 + 1.
-template <size_t N = DEFAULT_DOUBLE_SPLIT>
-LIBC_INLINE constexpr DoubleDouble split(double a) {
- DoubleDouble r{0.0, 0.0};
+template <typename T = double, size_t N = DefaultSplit<T>::VALUE>
+LIBC_INLINE constexpr NumberPair<T> split(T a) {
+ NumberPair<T> r{0.0, 0.0};
// CN = 2^N.
- constexpr double CN = static_cast<double>(1 << N);
- constexpr double C = CN + 1.0;
+ constexpr T CN = static_cast<T>(1 << N);
+ constexpr T C = CN + 1.0;
double t1 = C * a;
double t2 = a - t1;
r.hi = t1 + t2;
@@ -77,16 +88,15 @@ LIBC_INLINE constexpr DoubleDouble split(double a) {
}
// Helper for non-fma exact mult where the first number is already split.
-template <size_t SPLIT_B = DEFAULT_DOUBLE_SPLIT>
-LIBC_INLINE DoubleDouble exact_mult(const DoubleDouble &as, double a,
- double b) {
- DoubleDouble bs = split<SPLIT_B>(b);
- DoubleDouble r{0.0, 0.0};
+template <typename T = double, size_t SPLIT_B = DefaultSplit<T>::VALUE>
+LIBC_INLINE NumberPair<T> exact_mult(const NumberPair<T> &as, T a, T b) {
+ NumberPair<T> bs = split<T, SPLIT_B>(b);
+ NumberPair<T> r{0.0, 0.0};
r.hi = a * b;
- double t1 = as.hi * bs.hi - r.hi;
- double t2 = as.hi * bs.lo + t1;
- double t3 = as.lo * bs.hi + t2;
+ T t1 = as.hi * bs.hi - r.hi;
+ T t2 = as.hi * bs.lo + t1;
+ T t3 = as.lo * bs.hi + t2;
r.lo = as.lo * bs.lo + t3;
return r;
@@ -99,18 +109,18 @@ LIBC_INLINE DoubleDouble exact_mult(const DoubleDouble &as, double a,
// Using Theorem 1 in the paper above, without FMA instruction, if we restrict
// the generated constants to precision <= 51, and splitting it by 2^28 + 1,
// then a * b = r.hi + r.lo is exact for all rounding modes.
-template <size_t SPLIT_B = 27>
-LIBC_INLINE DoubleDouble exact_mult(double a, double b) {
- DoubleDouble r{0.0, 0.0};
+template <typename T = double, size_t SPLIT_B = DefaultSplit<T>::VALUE>
+LIBC_INLINE NumberPair<T> exact_mult(T a, T b) {
+ NumberPair<T> r{0.0, 0.0};
#ifdef LIBC_TARGET_CPU_HAS_FMA
r.hi = a * b;
r.lo = fputil::multiply_add(a, b, -r.hi);
#else
// Dekker's Product.
- DoubleDouble as = split(a);
+ NumberPair<T> as = split(a);
- r = exact_mult<SPLIT_B>(as, a, b);
+ r = exact_mult<T, SPLIT_B>(as, a, b);
#endif // LIBC_TARGET_CPU_HAS_FMA
return r;
@@ -125,7 +135,7 @@ LIBC_INLINE DoubleDouble quick_mult(double a, const DoubleDouble &b) {
template <size_t SPLIT_B = 27>
LIBC_INLINE DoubleDouble quick_mult(const DoubleDouble &a,
const DoubleDouble &b) {
- DoubleDouble r = exact_mult<SPLIT_B>(a.hi, b.hi);
+ DoubleDouble r = exact_mult<double, SPLIT_B>(a.hi, b.hi);
double t1 = multiply_add(a.hi, b.lo, r.lo);
double t2 = multiply_add(a.lo, b.hi, t1);
r.lo = t2;
@@ -157,19 +167,20 @@ LIBC_INLINE DoubleDouble multiply_add<DoubleDouble>(const DoubleDouble &a,
// rl = q * (ah - bh * rh) + q * (al - bl * rh)
// as accurate as possible, then the error is bounded by:
// |(ah + al) / (bh + bl) - (rh + rl)| < O(bl/bh) * (2^-52 + al/ah + bl/bh)
-LIBC_INLINE DoubleDouble div(const DoubleDouble &a, const DoubleDouble &b) {
- DoubleDouble r;
- double q = 1.0 / b.hi;
+template <typename T>
+LIBC_INLINE NumberPair<T> div(const NumberPair<T> &a, const NumberPair<T> &b) {
+ NumberPair<T> r;
+ T q = T(1) / b.hi;
r.hi = a.hi * q;
#ifdef LIBC_TARGET_CPU_HAS_FMA
- double e_hi = fputil::multiply_add(b.hi, -r.hi, a.hi);
- double e_lo = fputil::multiply_add(b.lo, -r.hi, a.lo);
+ T e_hi = fputil::multiply_add(b.hi, -r.hi, a.hi);
+ T e_lo = fputil::multiply_add(b.lo, -r.hi, a.lo);
#else
- DoubleDouble b_hi_r_hi = fputil::exact_mult(b.hi, -r.hi);
- DoubleDouble b_lo_r_hi = fputil::exact_mult(b.lo, -r.hi);
- double e_hi = (a.hi + b_hi_r_hi.hi) + b_hi_r_hi.lo;
- double e_lo = (a.lo + b_lo_r_hi.hi) + b_lo_r_hi.lo;
+ NumberPair<T> b_hi_r_hi = fputil::exact_mult(b.hi, -r.hi);
+ NumberPair<T> b_lo_r_hi = fputil::exact_mult(b.lo, -r.hi);
+ T e_hi = (a.hi + b_hi_r_hi.hi) + b_hi_r_hi.lo;
+ T e_lo = (a.lo + b_lo_r_hi.hi) + b_lo_r_hi.lo;
#endif // LIBC_TARGET_CPU_HAS_FMA
r.lo = q * (e_hi + e_lo);
diff --git a/libc/src/__support/macros/optimization.h b/libc/src/__support/macros/optimization.h
index a2634950d431b..253843e5e37aa 100644
--- a/libc/src/__support/macros/optimization.h
+++ b/libc/src/__support/macros/optimization.h
@@ -45,6 +45,7 @@ LIBC_INLINE constexpr bool expects_bool_condition(T value, T expected) {
#define LIBC_MATH_FAST \
(LIBC_MATH_SKIP_ACCURATE_PASS | LIBC_MATH_SMALL_TABLES | \
LIBC_MATH_NO_ERRNO | LIBC_MATH_NO_EXCEPT)
+#define LIBC_MATH_INTERMEDIATE_COMP_IN_FLOAT 0x10
#ifndef LIBC_MATH
#define LIBC_MATH 0
@@ -58,4 +59,8 @@ LIBC_INLINE constexpr bool expects_bool_condition(T value, T expected) {
#define LIBC_MATH_HAS_SMALL_TABLES
#endif
+#if (LIBC_MATH & LIBC_MATH_INTERMEDIATE_COMP_IN_FLOAT)
+#define LIBC_MATH_HAS_INTERMEDIATE_COMP_IN_FLOAT
+#endif
+
#endif // LLVM_LIBC_SRC___SUPPORT_MACROS_OPTIMIZATION_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 9faf46d491426..2bda741b453f5 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -4052,8 +4052,10 @@ add_entrypoint_object(
atan2f.cpp
HDRS
../atan2f.h
+ atan2f_float.h
DEPENDS
.inv_trigf_utils
+ libc.src.__support.FPUtil.double_double
libc.src.__support.FPUtil.fp_bits
libc.src.__support.FPUtil.multiply_add
libc.src.__support.FPUtil.nearest_integer
diff --git a/libc/src/math/generic/atan2f.cpp b/libc/src/math/generic/atan2f.cpp
index db7639396cdd7..5ac2b29438ea9 100644
--- a/libc/src/math/generic/atan2f.cpp
+++ b/libc/src/math/generic/atan2f.cpp
@@ -17,6 +17,14 @@
#include "src/__support/macros/config.h"
#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+#if defined(LIBC_MATH_HAS_SKIP_ACCURATE_PASS) && \
+ defined(LIBC_MATH_HAS_INTERMEDIATE_COMP_IN_FLOAT)
+
+// We use float-float implementation to reduce size.
+#include "src/math/generic/atan2f_float.h"
+
+#else
+
namespace LIBC_NAMESPACE_DECL {
namespace {
@@ -324,3 +332,5 @@ LLVM_LIBC_FUNCTION(float, atan2f, (float y, float x)) {
}
} // namespace LIBC_NAMESPACE_DECL
+
+#endif
diff --git a/libc/src/math/generic/atan2f_float.h b/libc/src/math/generic/atan2f_float.h
new file mode 100644
index 0000000000000..1819a3c3fb0a0
--- /dev/null
+++ b/libc/src/math/generic/atan2f_float.h
@@ -0,0 +1,239 @@
+//===-- Single-precision atan2f function ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/double_double.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/nearest_integer.h"
+#include "src/__support/FPUtil/rounding_mode.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+#include "src/math/atan2f.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace {
+
+using FloatFloat = fputil::FloatFloat;
+
+// atan(i/16) with i = 0..16, generated by Sollya with:
+// > for i from 0 to 16 do {
+// a = round(atan(i/16), SG, RN);
+// b = round(atan(i/16) - a, SG, RN);
+// print("{", b, ",", a, "},");
+// };
+constexpr FloatFloat ATAN_I[17] = {
+ {0.0f, 0.0f},
+ {-0x1.1a6042p-30f, 0x1.ff55bcp-5f},
+ {-0x1.54f424p-30f, 0x1.fd5baap-4f},
+ {0x1.79cb6p-28f, 0x1.7b97b4p-3f},
+ {-0x1.b4dfc8p-29f, 0x1.f5b76p-3f},
+ {-0x1.1f0286p-27f, 0x1.362774p-2f},
+ {0x1.e4defp-30f, 0x1.6f6194p-2f},
+ {0x1.e611fep-29f, 0x1.a64eecp-2f},
+ {0x1.586ed4p-28f, 0x1.dac67p-2f},
+ {-0x1.6499e6p-26f, 0x1.0657eap-1f},
+ {0x1.7bdfd6p-26f, 0x1.1e00bap-1f},
+ {-0x1.98e422p-28f, 0x1.345f02p-1f},
+ {0x1.934f7p-28f, 0x1.4978fap-1f},
+ {0x1.c5a6c6p-27f, 0x1.5d5898p-1f},
+ {0x1.5e118cp-27f, 0x1.700a7cp-1f},
+ {-0x1.1d4eb6p-26f, 0x1.819d0cp-1f},
+ {-0x1.777a5cp-26f, 0x1.921fb6p-1f},
+};
+
+// Approximate atan(x) for |x| <= 2^-5.
+// Using degree-3 Taylor polynomial:
+// P = x - x^3/3
+// Then the absolute error is bounded by:
+// |atan(x) - P(x)| < |x|^5/5 < 2^(-5*5) / 5 < 2^-27.
+// And the relative error is bounded by:
+// |(atan(x) - P(x))/atan(x)| < |x|^4 / 4 < 2^-22.
+// For x = x_hi + x_lo, fully expand the polynomial and drop any terms less than
+// ulp(x_hi^3 / 3) gives us:
+// P(x) ~ x_hi - x_hi^3/3 + x_lo * (1 - x_hi^2)
+FloatFloat atan_eval(const FloatFloat &x) {
+ FloatFloat p;
+ p.hi = x.hi;
+ float x_hi_sq = x.hi * x.hi;
+ // c0 ~ - x_hi^2 / 3
+ float c0 = -0x1.555556p-2f * x_hi_sq;
+ // c1 ~ x_lo * (1 - x_hi^2)
+ float c1 = fputil::multiply_add(x_hi_sq, -x.lo, x.lo);
+ // p.lo ~ - x_hi^3 / 3 + x_lo * (1 - x_hi^2)
+ p.lo = fputil::multiply_add(x.hi, c0, c1);
+ return p;
+}
+
+} // anonymous namespace
+
+// There are several range reduction steps we can take for atan2(y, x) as
+// follow:
+
+// * Range reduction 1: signness
+// atan2(y, x) will return a number between -PI and PI representing the angle
+// forming by the 0x axis and the vector (x, y) on the 0xy-plane.
+// In particular, we have that:
+// atan2(y, x) = atan( y/x ) if x >= 0 and y >= 0 (I-quadrant)
+// = pi + atan( y/x ) if x < 0 and y >= 0 (II-quadrant)
+// = -pi + atan( y/x ) if x < 0 and y < 0 (III-quadrant)
+// = atan( y/x ) if x >= 0 and y < 0 (IV-quadrant)
+// Since atan function is odd, we can use the formula:
+// atan(-u) = -atan(u)
+// to adjust the above conditions a bit further:
+// atan2(y, x) = atan( |y|/|x| ) if x >= 0 and y >= 0 (I-quadrant)
+// = pi - atan( |y|/|x| ) if x < 0 and y >= 0 (II-quadrant)
+// = -pi + atan( |y|/|x| ) if x < 0 and y < 0 (III-quadrant)
+// = -atan( |y|/|x| ) if x >= 0 and y < 0 (IV-quadrant)
+// Which can be simplified to:
+// atan2(y, x) = sign(y) * atan( |y|/|x| ) if x >= 0
+// = sign(y) * (pi - atan( |y|/|x| )) if x < 0
+
+// * Range reduction 2: reciprocal
+// Now that the argument inside atan is positive, we can use the formula:
+// atan(1/x) = pi/2 - atan(x)
+// to make the argument inside atan <= 1 as follow:
+// atan2(y, x) = sign(y) * atan( |y|/|x|) if 0 <= |y| <= x
+// = sign(y) * (pi/2 - atan( |x|/|y| )) if 0 <= x < |y|
+// = sign(y) * (pi - atan( |y|/|x| )) if 0 <= |y| <= -x
+// = sign(y) * (pi/2 + atan( |x|/|y| )) if 0 <= -x < |y|
+
+// * Range reduction 3: look up table.
+// After the previous two range reduction steps, we reduce the problem to
+// compute atan(u) with 0 <= u <= 1, or to be precise:
+// atan( n / d ) where n = min(|x|, |y|) and d = max(|x|, |y|).
+// An accurate polynomial approximation for the whole [0, 1] input range will
+// require a very large degree. To make it more efficient, we reduce the input
+// range further by finding an integer idx such that:
+// | n/d - idx/16 | <= 1/32.
+// In particular,
+// idx := round(2^4 * n/d)
+// Then for the fast pass, we find a polynomial approximation for:
+// atan( n/d ) ~ atan( idx/16 ) + (n/d - idx/16) * Q(n/d - idx/16)
+// with Q(x) = x - x^3/3 being the cubic Taylor polynomial of atan(x).
+// Its error in float-float precision is estimated in Sollya to be:
+// > P = x - x^3/3;
+// > dirtyinfnorm(atan(x) - P, [-2^-5, 2^-5]);
+// 0x1.995...p-28.
+
+LLVM_LIBC_FUNCTION(float, atan2f, (float y, float x)) {
+ using FPBits = typename fputil::FPBits<float>;
+ constexpr float IS_NEG[2] = {1.0f, -1.0f};
+ constexpr FloatFloat ZERO = {0.0f, 0.0f};
+ constexpr FloatFloat MZERO = {-0.0f, -0.0f};
+ constexpr FloatFloat PI = {-0x1.777a5cp-24f, 0x1.921fb6p1f};
+ constexpr FloatFloat MPI = {0x1.777a5cp-24f, -0x1.921fb6p1f};
+ constexpr FloatFloat PI_OVER_4 = {-0x1.777a5cp-26f, 0x1.921fb6p-1f};
+ constexpr FloatFloat PI_OVER_2 = {-0x1.777a5cp-25f, 0x1.921fb6p0f};
+ constexpr FloatFloat MPI_OVER_2 = {0x1.777a5cp-25f, -0x1.921fb6p0f};
+ constexpr FloatFloat THREE_PI_OVER_4 = {-0x1.99bc5cp-28f, 0x1.2d97c8p1f};
+ // Adjustment for constant term:
+ // CONST_ADJ[x_sign][y_sign][recip]
+ constexpr FloatFloat CONST_ADJ[2][2][2] = {
+ {{ZERO, MPI_OVER_2}, {MZERO, MPI_OVER_2}},
+ {{MPI, PI_OVER_2}, {MPI, PI_OVER_2}}};
+
+ FPBits x_bits(x), y_bits(y);
+ bool x_sign = x_bits.sign().is_neg();
+ bool y_sign = y_bits.sign().is_neg();
+ x_bits = x_bits.abs();
+ y_bits = y_bits.abs();
+ uint32_t x_abs = x_bits.uintval();
+ uint32_t y_abs = y_bits.uintval();
+ bool recip = x_abs < y_abs;
+ uint32_t min_abs = recip ? x_abs : y_abs;
+ uint32_t max_abs = !recip ? x_abs : y_abs;
+ unsigned min_exp = static_cast<unsigned>(min_abs >> FPBits::FRACTION_LEN);
+ unsigned max_exp = static_cast<unsigned>(max_abs >> FPBits::FRACTION_LEN);
+
+ float num = FPBits(min_abs).get_val();
+ float den = FPBits(max_abs).get_val();
+
+ // Check for exceptional cases, whether inputs are 0, inf, nan, or close to
+ // overflow, or close to underflow.
+ if (LIBC_UNLIKELY(max_exp > 0xffU - 64U || min_exp < 64U)) {
+ if (x_bits.is_nan() || y_bits.is_nan())
+ return FPBits::quiet_nan().get_val();
+ unsigned x_except = x == 0.0f ? 0 : (FPBits(x_abs).is_inf() ? 2 : 1);
+ unsigned y_except = y == 0.0f ? 0 : (FPBits(y_abs).is_inf() ? 2 : 1);
+
+ // Exceptional cases:
+ // EXCEPT[y_except][x_except][x_is_neg]
+ // with x_except & y_except:
+ // 0: zero
+ // 1: finite, non-zero
+ // 2: infinity
+ constexpr FloatFloat EXCEPTS[3][3][2] = {
+ {{ZERO, PI}, {ZERO, PI}, {ZERO, PI}},
+ {{PI_OVER_2, PI_OVER_2}, {ZERO, ZERO}, {ZERO, PI}},
+ {{PI_OVER_2, PI_OVER_2},
+ {PI_OVER_2, PI_OVER_2},
+ {PI_OVER_4, THREE_PI_OVER_4}},
+ };
+
+ if ((x_except != 1) || (y_except != 1)) {
+ FloatFloat r = EXCEPTS[y_except][x_except][x_sign];
+ return fputil::multiply_add(IS_NEG[y_sign], r.hi, IS_NEG[y_sign] * r.lo);
+ }
+ bool scale_up = min_exp < 64U;
+ bool scale_down = max_exp > 0xffU - 64U;
+ // At least one input is denormal, multiply both numerator and denominator
+ // by some large enough power of 2 to normalize denormal inputs.
+ if (scale_up) {
+ num *= 0x1.0p32f;
+ if (!scale_down)
+ den *= 0x1.0p32f;
+ } else if (scale_down) {
+ den *= 0x1.0p-32f;
+ if (!scale_up)
+ num *= 0x1.0p-32f;
+ }
+
+ min_abs = FPBits(num).uintval();
+ max_abs = FPBits(den).uintval();
+ min_exp = static_cast<unsigned>(min_abs >> FPBits::FRACTION_LEN);
+ max_exp = static_cast<unsigned>(max_abs >> FPBits::FRACTION_LEN);
+ }
+
+ float final_sign = IS_NEG[(x_sign != y_sign) != recip];
+ FloatFloat const_term = CONST_ADJ[x_sign][y_sign][recip];
+ unsigned exp_diff = max_exp - min_exp;
+ // We have the following bound for normalized n and d:
+ // 2^(-exp_diff - 1) < n/d < 2^(-exp_diff + 1).
+ if (LIBC_UNLIKELY(exp_diff > 25)) {
+ return fputil::multiply_add(final_sign, const_term.hi,
+ final_sign * (const_term.lo + num / den));
+ }
+
+ float k = fputil::nearest_integer(16.0f * num / den);
+ unsigned idx = static_cast<unsigned>(k);
+ // k = idx / 16
+ k *= 0x1.0p-4f;
+
+ // Range reduction:
+ // atan(n/d) - atan(k) = atan((n/d - k) / (1 + (n/d) * k))
+ //                     = atan((n - d * k) / (d + n * k))
+ FloatFloat num_k = fputil::exact_mult(num, k);
+ FloatFloat den_k = fputil::exact_mult(den, k);
+
+ // num_dd = n - d * k
+ FloatFloat num_ff = fputil::exact_add(num - den_k.hi, -den_k.lo);
+ // den_dd = d + n * k
+ FloatFloat den_ff = fputil::exact_add(den, num_k.hi);
+ den_ff.lo += num_k.lo;
+
+ // q = (n - d * k) / (d + n * k)
+ FloatFloat q = fputil::div(num_ff, den_ff);
+ // p ~ atan(q)
+ FloatFloat p = atan_eval(q);
+
+ FloatFloat r = fputil::add(const_term, fputil::add(ATAN_I[idx], p));
+ return final_sign * r.hi;
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/range_reduction_double_fma.h b/libc/src/math/generic/range_reduction_double_fma.h
index cab031c28baa1..8e0bc3a42462c 100644
--- a/libc/src/math/generic/range_reduction_double_fma.h
+++ b/libc/src/math/generic/range_reduction_double_fma.h
@@ -33,14 +33,14 @@ LIBC_INLINE unsigned LargeRangeReduction::fast(double x, DoubleDouble &u) {
// 2^62 <= |x_reduced| < 2^(62 + 16) = 2^78
x_reduced = xbits.get_val();
// x * c_hi = ph.hi + ph.lo exactly.
- DoubleDouble ph =
- fputil::exact_mult<SPLIT>(x_reduced, ONE_TWENTY_EIGHT_OVER_PI[idx][0]);
+ DoubleDouble ph = fputil::exact_mult<double, SPLIT>(
+ x_reduced, ONE_TWENTY_EIGHT_OVER_PI[idx][0]);
// x * c_mid = pm.hi + pm.lo exactly.
- DoubleDouble pm =
- fputil::exact_mult<SPLIT>(x_reduced, ONE_TWENTY_EIGHT_OVER_PI[idx][1]);
+ DoubleDouble pm = fputil::exact_mult<double, SPLIT>(
+ x_reduced, ONE_TWENTY_EIGHT_OVER_PI[idx][1]);
// x * c_lo = pl.hi + pl.lo exactly.
- DoubleDouble pl =
- fputil::exact_mult<SPLIT>(x_reduced, ONE_TWENTY_EIGHT_OVER_PI[idx][2]);
+ DoubleDouble pl = fputil::exact_mult<double, SPLIT>(
+ x_reduced, ONE_TWENTY_EIGHT_OVER_PI[idx][2]);
// Extract integral parts and fractional parts of (ph.lo + pm.hi).
double sum_hi = ph.lo + pm.hi;
double kd = fputil::nearest_integer(sum_hi);
diff --git a/libc/src/math/generic/range_reduction_double_nofma.h b/libc/src/math/generic/range_reduction_double_...
[truncated]
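As an aside, the heart of the double_double.h change is templating Dekker's FastTwoSum and the Veltkamp split / Dekker product over the element type so they can be instantiated for float. Below is a minimal standalone sketch of those two building blocks for float; the struct and function names are illustrative only and are not the libc internals.

#include <cstdio>

struct FloatFloat {
  float hi, lo;
};

// FastTwoSum: assuming |a| >= |b| (or a == 0), r.hi + r.lo == a + b exactly.
FloatFloat exact_add(float a, float b) {
  FloatFloat r;
  r.hi = a + b;
  float t = r.hi - a;
  r.lo = b - t;
  return r;
}

// Veltkamp split with N = 12 = ceil(prec(float)/2): a == s.hi + s.lo, and each
// half carries roughly half of float's 24-bit significand.
FloatFloat split(float a) {
  constexpr float C = 0x1.0p12f + 1.0f; // 2^12 + 1
  float t1 = C * a;
  float t2 = a - t1;
  FloatFloat s;
  s.hi = t1 + t2;
  s.lo = a - s.hi;
  return s;
}

// Dekker product: r.hi + r.lo == a * b exactly, without an FMA instruction.
FloatFloat exact_mult(float a, float b) {
  FloatFloat as = split(a);
  FloatFloat bs = split(b);
  FloatFloat r;
  r.hi = a * b;
  float t1 = as.hi * bs.hi - r.hi;
  float t2 = as.hi * bs.lo + t1;
  float t3 = as.lo * bs.hi + t2;
  r.lo = as.lo * bs.lo + t3;
  return r;
}

int main() {
  // A product of two floats has up to 48 significant bits; hi + lo recovers
  // all of them, which a single float cannot.
  FloatFloat p = exact_mult(0x1.234568p0f, 0x1.9e3780p-1f);
  std::printf("hi = %a, lo = %a, hi + lo = %a\n", p.hi, p.lo,
              static_cast<double>(p.hi) + static_cast<double>(p.lo));
  return 0;
}

When FMA is available, the Dekker product collapses to a single multiply plus fputil::multiply_add(a, b, -r.hi), which is exactly what the patched exact_mult does under LIBC_TARGET_CPU_HAS_FMA.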
Can you add a commit description that addresses "why is this patch necessary?" In particular, if it's size related, having some measurement and for which architecture might be useful.
  static constexpr size_t VALUE = 12;
};
template <> struct DefaultSplit<double> {
  static constexpr size_t VALUE = 27;
Suggested change:
-  static constexpr size_t VALUE = 27;
+  static constexpr size_t VALUE = DEFAULT_DOUBLE_SPLIT;
Done.
libc/src/math/generic/atan2f_float.h (Outdated)
        den *= 0x1.0p32f;
    } else if (scale_down) {
      den *= 0x1.0p-32f;
      if (!scale_up)
Technically, if we reach this point, we know scale_up is false because of the else if on L191. The whole conditional block here looks like it could be rewritten as two ternary expressions, but I guess they would have to be wrapped in a check for either being true.
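A sketch of the rewrite suggested above (hypothetical; the function name is made up and this is not necessarily what the patch ended up doing). A fully equivalent ternary form still has to special-case both flags being set, which is what the inner !scale_down / !scale_up checks handle in the original:

// Equivalent to the nested form quoted above, case by case.
inline void normalize(float &num, float &den, bool scale_up, bool scale_down) {
  if (scale_up || scale_down) {
    num *= scale_up ? 0x1.0p32f : 0x1.0p-32f;
    den *= scale_down ? (scale_up ? 1.0f : 0x1.0p-32f)
                      : (scale_up ? 0x1.0p32f : 1.0f);
  }
}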
Done.
some style nits
template <typename T> struct DefaultSplit;
template <> struct DefaultSplit<float> {
  static constexpr size_t VALUE = 12;
};
template <> struct DefaultSplit<double> {
  static constexpr size_t VALUE = DEFAULT_DOUBLE_SPLIT;
};
these should probably be consistent on how the number is defined. Either there should be a macro DEFAULT_FLOAT_SPLIT to match DEFAULT_DOUBLE_SPLIT, or both of them should be just numbers here. Personally I'd lean towards deleting the macro, since this struct already effectively names the value.
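For concreteness, the first alternative mentioned above might look like the sketch below; DEFAULT_FLOAT_SPLIT is hypothetical and does not exist in the patch.

#include <cstddef>

#define DEFAULT_FLOAT_SPLIT 12
#define DEFAULT_DOUBLE_SPLIT 27

template <typename T> struct DefaultSplit;
template <> struct DefaultSplit<float> {
  static constexpr std::size_t VALUE = DEFAULT_FLOAT_SPLIT; // ceil(24 / 2)
};
template <> struct DefaultSplit<double> {
  static constexpr std::size_t VALUE = DEFAULT_DOUBLE_SPLIT; // ceil(53 / 2)
};

static_assert(DefaultSplit<float>::VALUE == 12, "");
static_assert(DefaultSplit<double>::VALUE == 27, "");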
  FloatFloat num_k = fputil::exact_mult(num, k);
  FloatFloat den_k = fputil::exact_mult(den, k);

  // num_dd = n - d * k
these comments should be updated, since these are now _ff instead of _dd.
For targets that have a single-precision FPU but no double-precision FPU, such as the Cortex-M4, using float-float instead of double for the intermediate computations can reduce code size. In that configuration, with the accurate pass skipped, the float-only option for atan2f implemented in this PR reduces the code size of this function by ~1 KB compared to the double-precision version.
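For reference, a sketch of how a build would opt into this path; the command-line form below is an assumption, and only the macro names and the 0x10 bit value come from this patch. Setting both bits in LIBC_MATH makes optimization.h define the corresponding LIBC_MATH_HAS_* feature macros, which atan2f.cpp then uses to select the float-float implementation:

// Hypothetical compile-time configuration, e.g. passed by the build system:
//   -DLIBC_MATH="(LIBC_MATH_SKIP_ACCURATE_PASS | LIBC_MATH_INTERMEDIATE_COMP_IN_FLOAT)"
//
// optimization.h turns the selected bits into feature-test macros:
//   #if (LIBC_MATH & LIBC_MATH_INTERMEDIATE_COMP_IN_FLOAT)
//   #define LIBC_MATH_HAS_INTERMEDIATE_COMP_IN_FLOAT
//   #endif
//
// and atan2f.cpp picks the smaller float-float version when both are present:
#if defined(LIBC_MATH_HAS_SKIP_ACCURATE_PASS) && \
    defined(LIBC_MATH_HAS_INTERMEDIATE_COMP_IN_FLOAT)
#include "src/math/generic/atan2f_float.h"
#else
// ... default double-based implementation ...
#endif

Note that LIBC_MATH_FAST does not include the new 0x10 bit, so the float-float path remains strictly opt-in.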