Skip to content

Commit b674a39

Browse files
author
andy-thomason
committed
Inv trig working + exp2, exp.
1 parent a067a6f commit b674a39

File tree

4 files changed

+177
-25
lines changed

4 files changed

+177
-25
lines changed

crates/core_simd/src/round.rs

+24-3
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ use crate::simd::{LaneCount, Simd, SupportedLaneCount};
33

44
macro_rules! implement {
55
{
6-
$type:ty, $int_type:ty
6+
$type:ty, $int_type:ty, $uint_type:ty
77
} => {
88
impl<const LANES: usize> Simd<$type, LANES>
99
where
@@ -29,9 +29,30 @@ macro_rules! implement {
2929
pub fn round_from_int(value: Simd<$int_type, LANES>) -> Self {
3030
unsafe { intrinsics::simd_cast(value) }
3131
}
32+
33+
/// Rounds toward zero and converts to the same-width integer type, assuming that
34+
/// the value is finite and fits in that type.
35+
///
36+
/// # Safety
37+
/// The value must:
38+
///
39+
/// * Not be NaN
40+
/// * Not be infinite
41+
/// * Be representable in the return type, after truncating off its fractional part
42+
#[inline]
43+
pub unsafe fn to_uint_unchecked(self) -> Simd<$uint_type, LANES> {
44+
unsafe { intrinsics::simd_cast(self) }
45+
}
46+
47+
/// Creates a floating-point vector from an unsigned integer vector. Rounds values that are
48+
/// not exactly representable.
49+
#[inline]
50+
pub fn round_from_uint(value: Simd<$uint_type, LANES>) -> Self {
51+
unsafe { intrinsics::simd_cast(value) }
52+
}
3253
}
3354
}
3455
}
3556

36-
implement! { f32, i32 }
37-
implement! { f64, i64 }
57+
implement! { f32, i32, u32 }
58+
implement! { f64, i64, u64 }

crates/std_float/src/lib.rs

+7
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,9 @@ pub trait StdFloat: Sealed + Sized {
122122
}
123123

124124
pub trait StdLibm : StdFloat {
125+
type IntType;
126+
type UintType;
127+
125128
fn sin(self) -> Self;
126129

127130
fn cos(self) -> Self;
@@ -135,6 +138,10 @@ pub trait StdLibm : StdFloat {
135138
fn atan(self) -> Self;
136139

137140
fn atan2(self, x: Self) -> Self;
141+
142+
fn exp2(self) -> Self;
143+
144+
fn exp(self) -> Self;
138145
}
139146

140147
impl<const N: usize> Sealed for Simd<f32, N> where LaneCount<N>: SupportedLaneCount {}

crates/std_float/src/libm32.rs

+60-21
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#![allow(non_snake_case)]
2+
#![doc("This code is automatically generated, do not edit.")]
23
use super::StdLibm;
34

45
use super::StdFloat;
@@ -9,22 +10,36 @@ impl<const N: usize> StdLibm for Simd<f32, N>
910
where
1011
LaneCount<N>: SupportedLaneCount,
1112
{
13+
type IntType = Simd<i32, N>;
14+
type UintType = Simd<u32, N>;
1215
#[inline]
1316
fn asin(self) -> Self {
17+
let PI_BY_2 = Self::splat(1.57079632679489661923);
1418
let arg = self;
15-
arg.atan2((Self::splat(1f32) - arg * arg).sqrt())
19+
let LIM: Self = Self::splat(0.70710678118654752440);
20+
let c: Self = ((arg).lanes_lt(Self::splat(0.0))).select(-PI_BY_2, PI_BY_2);
21+
let s: Self =
22+
((arg).lanes_lt(Self::splat(0.0))).select(-Self::splat(1.0), Self::splat(1.0));
23+
let x: Self =
24+
((arg * arg).lanes_lt(LIM * LIM)).select(arg, (Self::splat(1.0) - arg * arg).sqrt());
25+
let y: Self = (Self::splat(0.11644821f32))
26+
.mul_add(x * x, Self::splat(0.04343228f32))
27+
.mul_add(x * x, Self::splat(0.17078044f32))
28+
.mul_add(x * x, Self::splat(0.99991643f32))
29+
* x;
30+
((arg * arg).lanes_lt(LIM * LIM)).select(y, c - y * s)
1631
}
1732
#[inline]
1833
fn acos(self) -> Self {
19-
let PI_BY_2 = Self::splat(1.5707964f32);
20-
let PI = Self::splat(3.1415927f32);
34+
let PI_BY_2 = Self::splat(1.57079632679489661923);
35+
let PI = Self::splat(3.14159265358979323846);
2136
let arg = self;
22-
let LIM: Self = Self::splat(0.9f32);
23-
let c: Self = ((arg).lanes_lt(Self::splat(0f32))).select(PI, Self::splat(0f32));
37+
let LIM: Self = Self::splat(0.9);
38+
let c: Self = ((arg).lanes_lt(Self::splat(0.0))).select(PI, Self::splat(0.0));
2439
let s: Self =
25-
((arg).lanes_lt(Self::splat(0f32))).select(Self::splat(1f32), -Self::splat(1f32));
40+
((arg).lanes_lt(Self::splat(0.0))).select(Self::splat(1.0), -Self::splat(1.0));
2641
let x: Self =
27-
((arg * arg).lanes_lt(LIM * LIM)).select(arg, (Self::splat(1f32) - arg * arg).sqrt());
42+
((arg * arg).lanes_lt(LIM * LIM)).select(arg, (Self::splat(1.0) - arg * arg).sqrt());
2843
let y: Self = (Self::splat(1.3740137f32))
2944
.mul_add(x * x, -Self::splat(3.1993167f32))
3045
.mul_add(x * x, Self::splat(3.103398f32))
@@ -38,10 +53,10 @@ where
3853
}
3954
#[inline]
4055
fn atan(self) -> Self {
41-
let PI_BY_2 = Self::splat(1.5707964f32);
56+
let PI_BY_2 = Self::splat(1.57079632679489661923);
4257
let arg = self;
43-
let LIM: Self = Self::splat(1f32);
44-
let c: Self = ((arg).lanes_lt(Self::splat(0f32))).select(-PI_BY_2, PI_BY_2);
58+
let LIM: Self = Self::splat(1.0);
59+
let c: Self = ((arg).lanes_lt(Self::splat(0.0))).select(-PI_BY_2, PI_BY_2);
4560
let x: Self = ((arg.abs()).lanes_lt(LIM)).select(arg, arg.recip());
4661
let y: Self = (-Self::splat(0.0039602574f32))
4762
.mul_add(x * x, Self::splat(0.021659138f32))
@@ -56,14 +71,14 @@ where
5671
}
5772
#[inline]
5873
fn atan2(self, x: Self) -> Self {
59-
let PI_BY_2 = Self::splat(1.5707964f32);
60-
let PI = Self::splat(3.1415927f32);
74+
let PI_BY_2 = Self::splat(1.57079632679489661923);
75+
let PI = Self::splat(3.14159265358979323846);
6176
let y = self;
62-
let offset180: Self = ((y).lanes_lt(Self::splat(0f32))).select(-PI, PI);
63-
let x1: Self = ((x).lanes_lt(Self::splat(0f32))).select(-x, x);
64-
let y1: Self = ((x).lanes_lt(Self::splat(0f32))).select(-y, y);
65-
let offset1: Self = ((x).lanes_lt(Self::splat(0f32))).select(offset180, Self::splat(0f32));
66-
let offset90: Self = ((y).lanes_lt(Self::splat(0f32))).select(-PI_BY_2, PI_BY_2);
77+
let offset180: Self = ((y).lanes_lt(Self::splat(0.0))).select(-PI, PI);
78+
let x1: Self = ((x).lanes_lt(Self::splat(0.0))).select(-x, x);
79+
let y1: Self = ((x).lanes_lt(Self::splat(0.0))).select(-y, y);
80+
let offset1: Self = ((x).lanes_lt(Self::splat(0.0))).select(offset180, Self::splat(0.0));
81+
let offset90: Self = ((y).lanes_lt(Self::splat(0.0))).select(-PI_BY_2, PI_BY_2);
6782
let x2: Self = ((y1.abs()).lanes_gt(x1)).select(y1, x1);
6883
let y2: Self = ((y1.abs()).lanes_gt(x1)).select(-x1, y1);
6984
let offset2: Self = ((y1.abs()).lanes_gt(x1)).select(offset1 + offset90, offset1);
@@ -80,8 +95,32 @@ where
8095
y3 + offset2
8196
}
8297
#[inline]
98+
fn exp2(self) -> Self {
99+
let arg = self;
100+
let r: Self = arg.round();
101+
let mul: Self = Self::from_bits(unsafe {
102+
(r.mul_add(Self::splat(8388608.0f32), Self::splat(1065353216.0f32))).to_uint_unchecked()
103+
});
104+
let x: Self = arg - r;
105+
(Self::splat(0.000015310081f32))
106+
.mul_add(x, Self::splat(0.0001547802f32))
107+
.mul_add(x, Self::splat(0.0013333454f32))
108+
.mul_add(x, Self::splat(0.009617995f32))
109+
.mul_add(x, Self::splat(0.05550411f32))
110+
.mul_add(x, Self::splat(0.24022652f32))
111+
.mul_add(x, Self::splat(0.6931472f32))
112+
.mul_add(x, Self::splat(1f32))
113+
* mul
114+
}
115+
#[inline]
116+
fn exp(self) -> Self {
117+
let LOG2_E =Self ::splat (1.442695040888963407359769137464649992339735961996202908859290566914912486673985594186422766333708408);
118+
let arg = self;
119+
(arg * LOG2_E).exp2()
120+
}
121+
#[inline]
83122
fn sin(self) -> Self {
84-
let RECIP_2PI = Self::splat(0.15915494f32);
123+
let RECIP_2PI = Self::splat(0.15915494309189533577);
85124
let arg = self;
86125
let scaled: Self = arg * RECIP_2PI;
87126
let x: Self = scaled - scaled.round();
@@ -95,7 +134,7 @@ where
95134
}
96135
#[inline]
97136
fn cos(self) -> Self {
98-
let RECIP_2PI = Self::splat(0.15915494f32);
137+
let RECIP_2PI = Self::splat(0.15915494309189533577);
99138
let arg = self;
100139
let scaled: Self = arg * RECIP_2PI;
101140
let x: Self = scaled - scaled.round();
@@ -109,11 +148,11 @@ where
109148
}
110149
#[inline]
111150
fn tan(self) -> Self {
112-
let RECIP_PI = Self::splat(0.31830987f32);
151+
let RECIP_PI = Self::splat(0.31830988618379067154);
113152
let arg = self;
114153
let scaled: Self = arg * RECIP_PI;
115154
let x: Self = scaled - scaled.round();
116-
let recip: Self = Self::splat(1f32) / (x * x - Self::splat(0.25f32));
155+
let recip: Self = Self::splat(1.0) / (x * x - Self::splat(0.25));
117156
let y: Self = (Self::splat(0.014397301f32))
118157
.mul_add(x * x, Self::splat(0.021017345f32))
119158
.mul_add(x * x, Self::splat(0.05285888f32))

crates/std_float/src/test_libm32.rs

+86-1
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,17 @@ fn asin_f32() {
169169
test_range!(
170170
min: -1.0,
171171
max: 1.0,
172-
limit: one_ulp * 8.0,
172+
limit: one_ulp * 9.0,
173+
scalar_fn: |x : f32| x.asin(),
174+
vector_fn: |x : f32x4| x.asin(),
175+
scalar_type: f32,
176+
vector_type: f32x4,
177+
);
178+
179+
test_range!(
180+
min: -0.5,
181+
max: 0.5,
182+
limit: one_ulp * 2.0,
173183
scalar_fn: |x : f32| x.asin(),
174184
vector_fn: |x : f32x4| x.asin(),
175185
scalar_type: f32,
@@ -204,3 +214,78 @@ fn atan_f32() {
204214
vector_type: f32x4,
205215
);
206216
}
217+
218+
#[test]
219+
fn acos_f32() {
220+
use core_simd::f32x4;
221+
use crate::StdLibm;
222+
223+
let one_ulp = (2.0_f32).powi(-23);
224+
225+
test_range!(
226+
min: -1.0,
227+
max: 1.0,
228+
limit: one_ulp * 8.0,
229+
scalar_fn: |x : f32| x.acos(),
230+
vector_fn: |x : f32x4| x.acos(),
231+
scalar_type: f32,
232+
vector_type: f32x4,
233+
);
234+
235+
test_range!(
236+
min: -0.5,
237+
max: 0.5,
238+
limit: one_ulp * 2.0,
239+
scalar_fn: |x : f32| x.asin(),
240+
vector_fn: |x : f32x4| x.asin(),
241+
scalar_type: f32,
242+
vector_type: f32x4,
243+
);
244+
}
245+
246+
#[test]
247+
fn exp2_f32() {
248+
use core_simd::f32x4;
249+
use crate::StdLibm;
250+
251+
let one_ulp = (2.0_f32).powi(-23);
252+
253+
test_range!(
254+
min: -2.0,
255+
max: 2.0,
256+
limit: one_ulp * 2.0,
257+
scalar_fn: |x : f32| x.exp2(),
258+
vector_fn: |x : f32x4| x.exp2(),
259+
scalar_type: f32,
260+
vector_type: f32x4,
261+
);
262+
}
263+
264+
#[test]
265+
fn exp_f32() {
266+
use core_simd::f32x4;
267+
use crate::StdLibm;
268+
269+
let one_ulp = (2.0_f32).powi(-23);
270+
271+
test_range!(
272+
min: -2.0,
273+
max: 0.0,
274+
limit: one_ulp * 2.0,
275+
scalar_fn: |x : f32| x.exp(),
276+
vector_fn: |x : f32x4| x.exp(),
277+
scalar_type: f32,
278+
vector_type: f32x4,
279+
);
280+
281+
test_range!(
282+
min: 0.0,
283+
max: 2.0,
284+
limit: one_ulp * 8.0,
285+
scalar_fn: |x : f32| x.exp(),
286+
vector_fn: |x : f32x4| x.exp(),
287+
scalar_type: f32,
288+
vector_type: f32x4,
289+
);
290+
}
291+

0 commit comments

Comments
 (0)