
Commit 4aa9b56

ylpoonlg and ylpoon-arm authored
Add SquareRoot and Logarithm to SVE microbenchmark (#5021)
Co-authored-by: Yat Long Poon <[email protected]>
1 parent 0618600 commit 4aa9b56

File tree

2 files changed: +387 -0 lines changed

Lines changed: 248 additions & 0 deletions
@@ -0,0 +1,248 @@
using System;
using System.Diagnostics;
using System.Numerics;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Extensions;
using BenchmarkDotNet.Configs;
using BenchmarkDotNet.Filters;
using MicroBenchmarks;

namespace SveBenchmarks
{
    [BenchmarkCategory(Categories.Runtime)]
    [OperatingSystemsArchitectureFilter(allowed: true, System.Runtime.InteropServices.Architecture.Arm64)]
    [Config(typeof(Config))]
    public class Logarithm
    {
        private class Config : ManualConfig
        {
            public Config()
            {
                AddFilter(new SimpleFilter(_ => Sve.IsSupported));
            }
        }

        [Params(15, 127, 527, 10015)]
        public int Size;

        private float[] _input;
        private float[] _data;
        private float[] _output;

        [GlobalSetup]
        public virtual void Setup()
        {
            Random rand = new Random(0);
            _input = new float[Size];
            for (int i = 0; i < Size; i++)
            {
                _input[i] = (float)(rand.NextDouble() * (double)Size);
            }

            // Coefficients taken from Arm Optimized-Routines.
            // https://github.com/ARM-software/optimized-routines/blob/v25.07/math/aarch64/advsimd/logf.c
            _data = new float[8]{
                // p0, p1, p3, p5
                BitConverter.UInt32BitsToSingle(0xbe1f39be),
                BitConverter.UInt32BitsToSingle(0x3e2d4d51),
                BitConverter.UInt32BitsToSingle(0x3e4b09a4),
                BitConverter.UInt32BitsToSingle(0x3eaaaebe),
                // p2, p4, p6, ln2
                BitConverter.UInt32BitsToSingle(0xbe27cc9a),
                BitConverter.UInt32BitsToSingle(0xbe800c3e),
                BitConverter.UInt32BitsToSingle(0xbeffffe4),
                BitConverter.UInt32BitsToSingle(0x3f317218),
            };

            _output = new float[Size];
        }

        [GlobalCleanup]
        public virtual void Verify()
        {
            float[] current = (float[])_output.Clone();
            Setup();
            Scalar();
            float[] scalar = (float[])_output.Clone();
            // Check that the result is the same as scalar (within 3ULP).
            for (int i = 0; i < Size; i++)
            {
                int e = (int)(BitConverter.SingleToUInt32Bits(scalar[i]) >> 23 & 0xff);
                if (e == 0) e++;
                float ulpScale = (float)Math.ScaleB(1.0, e - 127 - 23);
                float ulpError = (float)Math.Abs(current[i] - scalar[i]) / ulpScale;
                Debug.Assert(ulpError <= 3);
            }
        }

        [Benchmark]
        public unsafe void Scalar()
        {
            fixed (float* input = _input, output = _output)
            {
                for (int i = 0; i < Size; i++)
                {
                    output[i] = (float)Math.Log(input[i]);
                }
            }
        }

        [Benchmark]
        public unsafe void Vector128Logarithm()
        {
            // Algorithm based on Arm Optimized-Routines.
            // https://github.com/ARM-software/optimized-routines/blob/v25.07/math/aarch64/advsimd/logf.c
            fixed (float* input = _input, output = _output, d = _data)
            {
                int i = 0;
                Vector128<uint> offVec = Vector128.Create(0x3f2aaaabu);

                for (; i <= Size - 4; i += 4)
                {
                    Vector128<float> x = AdvSimd.LoadVector128(input + i);
                    Vector128<uint> u_off = AdvSimd.Subtract(x.AsUInt32(), offVec);

                    Vector64<ushort> cmp = AdvSimd.CompareGreaterThanOrEqual(
                        AdvSimd.SubtractHighNarrowingLower(u_off, Vector128.Create(0xc1555555u)), // u_off - (0x00800000 - 0x3f2aaaab)
                        Vector64.Create((ushort)0x7f00)
                    );

                    // x = 2^n * (1+r), where 2/3 < 1+r < 4/3.
                    Vector128<float> n = AdvSimd.ConvertToSingle(
                        AdvSimd.ShiftRightArithmetic(u_off.AsInt32(), 23)
                    );

                    Vector128<uint> u = AdvSimd.And(u_off, Vector128.Create(0x007fffffu));
                    u = AdvSimd.Add(u, offVec);

                    Vector128<float> r = AdvSimd.Subtract(u.AsSingle(), Vector128.Create(1.0f));
                    // y = log(1+r) + n*ln2.
                    Vector128<float> r2 = AdvSimd.Multiply(r, r);

                    // n*ln2 + r + r2*(P6 + r*P5 + r2*(P4 + r*P3 + r2*(P2 + r*P1 + r2*P0))).
                    Vector128<float> p_0135 = AdvSimd.LoadVector128(&d[0]);
                    Vector128<float> p = AdvSimd.Arm64.FusedMultiplyAddBySelectedScalar(Vector128.Create(d[4]), r, p_0135, 1);
                    Vector128<float> q = AdvSimd.Arm64.FusedMultiplyAddBySelectedScalar(Vector128.Create(d[5]), r, p_0135, 2);
                    Vector128<float> y = AdvSimd.Arm64.FusedMultiplyAddBySelectedScalar(Vector128.Create(d[6]), r, p_0135, 3);
                    p = AdvSimd.Arm64.FusedMultiplyAddBySelectedScalar(p, r2, p_0135, 0);

                    q = AdvSimd.FusedMultiplyAdd(q, r2, p);
                    y = AdvSimd.FusedMultiplyAdd(y, r2, q);
                    p = AdvSimd.FusedMultiplyAdd(r, n, Vector128.Create(d[7]));

                    Vector128<float> outVec = AdvSimd.FusedMultiplyAdd(p, r2, y);

                    // Handle special case.
                    if (cmp.AsUInt64().ToScalar() != 0)
                    {
                        // Restore input x.
                        x = AdvSimd.Add(u_off, offVec).AsSingle();
                        // Widen cmp to 32-bit lanes.
                        Vector128<uint> pCmp = AdvSimd.ZeroExtendWideningLower(cmp);
                        // Use scalar for lanes that are special cases.
                        outVec = Vector128.Create(
                            pCmp[0] != 0 ? (float)Math.Log(x[0]) : outVec[0],
                            pCmp[1] != 0 ? (float)Math.Log(x[1]) : outVec[1],
                            pCmp[2] != 0 ? (float)Math.Log(x[2]) : outVec[2],
                            pCmp[3] != 0 ? (float)Math.Log(x[3]) : outVec[3]
                        );
                    }

                    AdvSimd.Store(output + i, outVec);
                }
                // Handle tail.
                for (; i < Size; i++)
                {
                    output[i] = (float)Math.Log(input[i]);
                }
            }
        }

        [Benchmark]
        public unsafe void SveLogarithm()
        {
            // Algorithm based on Arm Optimized-Routines.
            // https://github.com/ARM-software/optimized-routines/blob/v25.07/math/aarch64/sve/logf.c
            fixed (float* input = _input, output = _output, d = _data)
            {
                int i = 0;
                int cntw = (int)Sve.Count32BitElements();

                Vector<uint> offVec = new Vector<uint>(0x3f2aaaab);

                Vector<uint> pTrue = Sve.CreateTrueMaskUInt32();
                Vector<float> pTruef = Sve.CreateTrueMaskSingle();
                Vector<uint> pLoop = Sve.CreateWhileLessThanMask32Bit(0, Size);
                while (Sve.TestFirstTrue(pTrue, pLoop))
                {
                    Vector<float> x = (Vector<float>)Sve.LoadVector(pLoop, (uint*)(input + i));
                    Vector<uint> u_off = Sve.Subtract((Vector<uint>)x, offVec);

                    // Check for extreme values outside of 0x00800000 and 0x00ffffff.
                    Vector<uint> cmp = Sve.CompareGreaterThanOrEqual(
                        Sve.Subtract(u_off, new Vector<uint>(0xc1555555u)), // u_off - (0x00800000 - 0x3f2aaaab)
                        new Vector<uint>(0x7f000000)
                    );

                    // x = 2^n * (1+r), where 2/3 < 1+r < 4/3.
                    Vector<float> n = Sve.ConvertToSingle(
                        Sve.ShiftRightArithmetic((Vector<int>)u_off, new Vector<uint>(23))
                    );

                    Vector<uint> u = Sve.And(u_off, new Vector<uint>(0x007fffff));
                    u = Sve.Add(u, offVec);

                    Vector<float> r = Sve.Subtract((Vector<float>)u, new Vector<float>(1.0f));

                    // y = log(1+r) + n*ln2.
                    Vector<float> r2 = Sve.Multiply(r, r);

                    // n*ln2 + r + r2*(P6 + r*P5 + r2*(P4 + r*P3 + r2*(P2 + r*P1 + r2*P0))).
                    Vector<float> p_0135 = Sve.LoadVector(pTruef, &d[0]);
                    Vector<float> p = Sve.FusedMultiplyAddBySelectedScalar(new Vector<float>(d[4]), r, p_0135, 1);
                    Vector<float> q = Sve.FusedMultiplyAddBySelectedScalar(new Vector<float>(d[5]), r, p_0135, 2);
                    Vector<float> y = Sve.FusedMultiplyAddBySelectedScalar(new Vector<float>(d[6]), r, p_0135, 3);
                    p = Sve.FusedMultiplyAddBySelectedScalar(p, r2, p_0135, 0);

                    q = Sve.FusedMultiplyAdd(q, r2, p);
                    y = Sve.FusedMultiplyAdd(y, r2, q);
                    p = Sve.FusedMultiplyAdd(r, n, new Vector<float>(d[7]));

                    Vector<float> outVec = Sve.FusedMultiplyAdd(p, r2, y);
                    // Handle special case.
                    if (Sve.TestAnyTrue(pTrue, cmp))
                    {
                        // Restore input x.
                        x = (Vector<float>)Sve.Add(u_off, offVec);
                        // Get the first extreme value.
                        Vector<uint> pElem = Sve.CreateMaskForFirstActiveElement(
                            cmp, Sve.CreateFalseMaskUInt32()
                        );
                        while (Sve.TestAnyTrue(cmp, pElem))
                        {
                            float elem = Sve.ConditionalExtractLastActiveElement(
                                (Vector<float>)pElem, 0, x
                            );
                            // Fallback to scalar for extreme values.
                            elem = (float)Math.Log(elem);
                            Vector<float> y2 = new Vector<float>(elem);
                            // Replace value back to outVec.
                            outVec = Sve.ConditionalSelect((Vector<float>)pElem, y2, outVec);
                            // Get next extreme value.
                            pElem = Sve.CreateMaskForNextActiveElement(cmp, pElem);
                        }
                    }

                    Sve.StoreAndZip(pLoop, (uint*)output + i, (Vector<uint>)outVec);

                    // Handle loop.
                    i += cntw;
                    pLoop = Sve.CreateWhileLessThanMask32Bit(i, Size);
                }
            }
        }

    }
}
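
Note (not part of the commit): the reduction that both Vector128Logarithm and SveLogarithm implement can be restated as a scalar routine. The sketch below is illustrative only; it follows the comments above and the _data layout (d[0..3] = p0, p1, p3, p5; d[4..7] = p2, p4, p6, ln2), and it ignores the special-case lanes that the vector kernels route through Math.Log.

static float LogfSketch(float x, float[] d)
{
    // Shift the mantissa window so that x = 2^n * (1+r) with 2/3 < 1+r < 4/3.
    uint uOff = BitConverter.SingleToUInt32Bits(x) - 0x3f2aaaabu;
    float n = (int)uOff >> 23;
    float r = BitConverter.UInt32BitsToSingle((uOff & 0x007fffffu) + 0x3f2aaaabu) - 1.0f;
    float r2 = r * r;
    // P(r) = P6 + r*P5 + r2*(P4 + r*P3 + r2*(P2 + r*P1 + r2*P0)), as in the kernel comment.
    float p = d[4] + r * d[1] + r2 * d[0];   // P2 + r*P1 + r2*P0
    float q = d[5] + r * d[2] + r2 * p;      // P4 + r*P3 + r2*p
    float y = d[6] + r * d[3] + r2 * q;      // P6 + r*P5 + r2*q
    return (r + n * d[7]) + r2 * y;          // n*ln2 + r + r2*y
}
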
Lines changed: 139 additions & 0 deletions
@@ -0,0 +1,139 @@
using System;
using System.Diagnostics;
using System.Numerics;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Extensions;
using BenchmarkDotNet.Configs;
using BenchmarkDotNet.Filters;
using MicroBenchmarks;

namespace SveBenchmarks
{
    [BenchmarkCategory(Categories.Runtime)]
    [OperatingSystemsArchitectureFilter(allowed: true, System.Runtime.InteropServices.Architecture.Arm64)]
    [Config(typeof(Config))]
    public class SquareRoot
    {
        private class Config : ManualConfig
        {
            public Config()
            {
                AddFilter(new SimpleFilter(_ => Sve.IsSupported));
            }
        }

        [Params(15, 127, 527, 10015)]
        public int Size;

        private float[] _input;
        private float[] _output;

        [GlobalSetup]
        public virtual void Setup()
        {
            _input = ValuesGenerator.Array<float>(Size);
            _output = new float[Size];
        }

        [GlobalCleanup]
        public virtual void Verify()
        {
            float[] current = (float[])_output.Clone();
            Setup();
            Scalar();
            float[] scalar = (float[])_output.Clone();
            // Check that the result is the same as scalar.
            for (int i = 0; i < Size; i++)
            {
                Debug.Assert(current[i] == scalar[i]);
            }
        }

        [Benchmark]
        public unsafe void Scalar()
        {
            fixed (float* input = _input, output = _output)
            {
                for (int i = 0; i < Size; i++)
                {
                    output[i] = (float)Math.Sqrt(input[i]);
                }
            }
        }

        [Benchmark]
        public unsafe void Vector128SquareRoot()
        {
            fixed (float* input = _input, output = _output)
            {
                int i = 0;
                for (; i <= Size - 4; i += 4)
                {
                    Vector128<float> inVec = AdvSimd.LoadVector128(input + i);
                    Vector128<float> outVec = AdvSimd.Arm64.Sqrt(inVec);
                    AdvSimd.Store(output + i, outVec);
                }
                // Handle tail.
                for (; i < Size; i++)
                {
                    output[i] = (float)Math.Sqrt(input[i]);
                }
            }
        }

        [Benchmark]
        public unsafe void SveSquareRoot()
        {
            fixed (float* input = _input, output = _output)
            {
                int i = 0;
                int cntw = (int)Sve.Count32BitElements();

                // We use Vector<uint> for predicates since there are no Vector<float>
                // overloads for TestFirstTrue and CreateWhileLessThanMask etc.
                Vector<uint> pTrue = Sve.CreateTrueMaskUInt32();
                Vector<uint> pLoop = Sve.CreateWhileLessThanMask32Bit(0, Size);
                while (Sve.TestFirstTrue(pTrue, pLoop))
                {
                    // Since pLoop is a Vector<uint> predicate, we load the input as uint array,
                    // then cast it back to Vector<float>.
                    // This is preferable to casting pLoop to Vector<float>, which would cause
                    // an unnecessary conversion from predicate to vector in the codegen.
                    Vector<float> inVec = (Vector<float>)Sve.LoadVector(pLoop, (uint*)(input + i));
                    Vector<float> outVec = Sve.Sqrt(inVec);
                    Sve.StoreAndZip(pLoop, (uint*)output + i, (Vector<uint>)outVec);

                    // Handle loop.
                    i += cntw;
                    pLoop = Sve.CreateWhileLessThanMask32Bit(i, Size);
                }
            }
        }

        [Benchmark]
        public unsafe void SveTail()
        {
            fixed (float* input = _input, output = _output)
            {
                int i = 0;
                int cntw = (int)Sve.Count32BitElements();

                Vector<float> pTrue = Sve.CreateTrueMaskSingle();
                for (; i <= Size - cntw; i += cntw)
                {
                    Vector<float> inVec = Sve.LoadVector(pTrue, input + i);
                    Vector<float> outVec = Sve.Sqrt(inVec);
                    Sve.StoreAndZip(pTrue, output + i, outVec);
                }
                // Handle tail.
                for (; i < Size; i++)
                {
                    output[i] = (float)Math.Sqrt(input[i]);
                }
            }
        }

    }
}
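
Note (not part of the commit): SveSquareRoot above and SveLogarithm in the first file share the same predicate-driven loop, while SveTail instead runs full-width vectors and finishes with a scalar tail. Distilled into a stand-alone sketch with a hypothetical helper name, the shared driver pattern looks like this:

// Hypothetical helper, for illustration only: the while-less-than predicate pLoop
// covers a partial vector on the final iteration, so no scalar tail loop is needed.
static unsafe void SveSqrtPredicated(float* input, float* output, int size)
{
    int i = 0;
    int cntw = (int)Sve.Count32BitElements();
    Vector<uint> pTrue = Sve.CreateTrueMaskUInt32();
    Vector<uint> pLoop = Sve.CreateWhileLessThanMask32Bit(0, size);
    while (Sve.TestFirstTrue(pTrue, pLoop))
    {
        // Load/store through uint pointers because the predicate is a Vector<uint>.
        Vector<float> v = (Vector<float>)Sve.LoadVector(pLoop, (uint*)(input + i));
        Sve.StoreAndZip(pLoop, (uint*)output + i, (Vector<uint>)Sve.Sqrt(v));
        i += cntw;
        pLoop = Sve.CreateWhileLessThanMask32Bit(i, size);
    }
}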
