From 25e991d23a2b5aff52795e2d3fa5cfcd8e228562 Mon Sep 17 00:00:00 2001 From: Yat Long Poon Date: Tue, 7 Oct 2025 10:28:29 +0100 Subject: [PATCH 1/3] Add SquareRoot and Logarithm to SVE microbenchmark --- src/benchmarks/micro/sve/Logarithm.cs | 248 +++++++++++++++++++++++++ src/benchmarks/micro/sve/SquareRoot.cs | 139 ++++++++++++++ 2 files changed, 387 insertions(+) create mode 100644 src/benchmarks/micro/sve/Logarithm.cs create mode 100644 src/benchmarks/micro/sve/SquareRoot.cs diff --git a/src/benchmarks/micro/sve/Logarithm.cs b/src/benchmarks/micro/sve/Logarithm.cs new file mode 100644 index 00000000000..9f44663e2dc --- /dev/null +++ b/src/benchmarks/micro/sve/Logarithm.cs @@ -0,0 +1,248 @@ +using System; +using System.Diagnostics; +using System.Numerics; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.Arm; +using BenchmarkDotNet.Attributes; +using BenchmarkDotNet.Extensions; +using BenchmarkDotNet.Configs; +using BenchmarkDotNet.Filters; +using MicroBenchmarks; + +namespace SveBenchmarks +{ + [BenchmarkCategory(Categories.Runtime)] + [OperatingSystemsArchitectureFilter(allowed: true, System.Runtime.InteropServices.Architecture.Arm64)] + [Config(typeof(Config))] + public class Logarithm + { + private class Config : ManualConfig + { + public Config() + { + AddFilter(new SimpleFilter(_ => Sve.IsSupported)); + } + } + + [Params(15, 127, 527, 10015)] + public int Size; + + private float[] _input; + private float[] _data; + private float[] _output; + + [GlobalSetup] + public virtual void Setup() + { + Random rand = new Random(0); + _input = new float[Size]; + for (int i = 0; i < Size; i++) + { + _input[i] = (float)(rand.NextDouble() * (double)Size); + } + + // Coefficients taken from Arm Optimized-Routines. 
+ // https://github.com/ARM-software/optimized-routines/blob/v25.07/math/aarch64/advsimd/logf.c + _data = new float[8]{ + // p0, p1, p3, p5 + BitConverter.UInt32BitsToSingle(0xbe1f39be), + BitConverter.UInt32BitsToSingle(0x3e2d4d51), + BitConverter.UInt32BitsToSingle(0x3e4b09a4), + BitConverter.UInt32BitsToSingle(0x3eaaaebe), + // p2, p4, p6, ln2 + BitConverter.UInt32BitsToSingle(0xbe27cc9a), + BitConverter.UInt32BitsToSingle(0xbe800c3e), + BitConverter.UInt32BitsToSingle(0xbeffffe4), + BitConverter.UInt32BitsToSingle(0x3f317218), + }; + + _output = new float[Size]; + } + + [GlobalCleanup] + public virtual void Verify() + { + float[] current = (float[])_output.Clone(); + Setup(); + Scalar(); + float[] scalar = (float[])_output.Clone(); + // Check that the result is the same as scalar (within 3ULP). + for (int i = 0; i < Size; i++) + { + int e = (int)(BitConverter.SingleToUInt32Bits(scalar[i]) >> 23 & 0xff); + if (e == 0) e++; + float ulpScale = (float)Math.ScaleB(1.0, e - 127 - 23); + float ulpError = (float)Math.Abs(current[i] - scalar[i]) / ulpScale; + Debug.Assert(ulpError <= 3); + } + } + + [Benchmark] + public unsafe void Scalar() + { + fixed (float* input = _input, output = _output) + { + for (int i = 0; i < Size; i++) + { + output[i] = (float)Math.Log(input[i]); + } + } + } + + [Benchmark] + public unsafe void Vector128Logarithm() + { + // Algorithm based on Arm Optimized-Routines. 
+ // https://github.com/ARM-software/optimized-routines/blob/v25.07/math/aarch64/advsimd/logf.c + fixed (float* input = _input, output = _output, d = _data) + { + int i = 0; + Vector128 offVec = Vector128.Create(0x3f2aaaabu); + + for (; i <= Size - 4; i += 4) + { + Vector128 x = AdvSimd.LoadVector128(input + i); + Vector128 u_off = AdvSimd.Subtract(x.AsUInt32(), offVec); + + Vector64 cmp = AdvSimd.CompareGreaterThanOrEqual( + AdvSimd.SubtractHighNarrowingLower(u_off, Vector128.Create(0xc1555555u)), // u_off - (0x00800000 - 0x3f2aaaab) + Vector64.Create((ushort)0x7f00) + ); + + // x = 2^n * (1+r), where 2/3 < 1+r < 4/3. + Vector128 n = AdvSimd.ConvertToSingle( + AdvSimd.ShiftRightArithmetic(u_off.AsInt32(), 23) + ); + + Vector128 u = AdvSimd.And(u_off, Vector128.Create(0x007fffffu)); + u = AdvSimd.Add(u, offVec); + + Vector128 r = Sve.Subtract(u.AsSingle(), Vector128.Create(1.0f)); + // y = log(1+r) + n*ln2. + Vector128 r2 = AdvSimd.Multiply(r, r); + + // n*ln2 + r + r2*(P6 + r*P5 + r2*(P4 + r*P3 + r2*(P2 + r*P1 + r2*P0))). + Vector128 p_0135 = AdvSimd.LoadVector128(&d[0]); + Vector128 p = AdvSimd.Arm64.FusedMultiplyAddBySelectedScalar(Vector128.Create(d[4]), r, p_0135, 1); + Vector128 q = AdvSimd.Arm64.FusedMultiplyAddBySelectedScalar(Vector128.Create(d[5]), r, p_0135, 2); + Vector128 y = AdvSimd.Arm64.FusedMultiplyAddBySelectedScalar(Vector128.Create(d[6]), r, p_0135, 3); + p = AdvSimd.Arm64.FusedMultiplyAddBySelectedScalar(p, r2, p_0135, 0); + + q = AdvSimd.FusedMultiplyAdd(q, r2, p); + y = AdvSimd.FusedMultiplyAdd(y, r2, q); + p = AdvSimd.FusedMultiplyAdd(r, n, Vector128.Create(d[7])); + + Vector128 outVec = AdvSimd.FusedMultiplyAdd(p, r2, y); + + // Handle special case. + if (cmp.AsUInt64().ToScalar() != 0) + { + // Restore input x. + x = AdvSimd.Add(u_off, offVec).AsSingle(); + // Widen cmp to 32-bit lanes. + Vector128 pCmp = AdvSimd.ZeroExtendWideningLower(cmp); + // Use scalar for lanes that are special cases. + outVec = Vector128.Create( + pCmp[0] != 0 ? 
(float)Math.Log(x[0]) : outVec[0], + pCmp[1] != 0 ? (float)Math.Log(x[1]) : outVec[1], + pCmp[2] != 0 ? (float)Math.Log(x[2]) : outVec[2], + pCmp[3] != 0 ? (float)Math.Log(x[3]) : outVec[3] + ); + } + + AdvSimd.Store(output + i, outVec); + } + // Handle tail. + for (; i < Size; i++) + { + output[i] = (float)Math.Log(input[i]); + } + } + } + + [Benchmark] + public unsafe void SveLogarithm() + { + // Algorithm based on Arm Optimized-Routines. + // https://github.com/ARM-software/optimized-routines/blob/v25.07/math/aarch64/sve/logf.c + fixed (float* input = _input, output = _output, d = _data) + { + int i = 0; + int cntw = (int)Sve.Count32BitElements(); + + Vector offVec = new Vector(0x3f2aaaab); + + Vector pTrue = Sve.CreateTrueMaskUInt32(); + Vector pTruef = Sve.CreateTrueMaskSingle(); + Vector pLoop = Sve.CreateWhileLessThanMask32Bit(0, Size); + while (Sve.TestFirstTrue(pTrue, pLoop)) + { + Vector x = (Vector)Sve.LoadVector(pLoop, (uint*)(input + i)); + Vector u_off = Sve.Subtract((Vector)x, offVec); + + // Check for extreme values outside of 0x00800000 and 0x7f7fffff. + Vector cmp = Sve.CompareGreaterThanOrEqual( + Sve.Subtract(u_off, new Vector(0xc1555555u)), // u_off - (0x00800000 - 0x3f2aaaab) + new Vector(0x7f000000) + ); + + // x = 2^n * (1+r), where 2/3 < 1+r < 4/3. + Vector n = Sve.ConvertToSingle( + Sve.ShiftRightArithmetic((Vector)u_off, new Vector(23)) + ); + + Vector u = Sve.And(u_off, new Vector(0x007fffff)); + u = Sve.Add(u, offVec); + + Vector r = Sve.Subtract((Vector)u, new Vector(1.0f)); + + // y = log(1+r) + n*ln2. + Vector r2 = Sve.Multiply(r, r); + + // n*ln2 + r + r2*(P6 + r*P5 + r2*(P4 + r*P3 + r2*(P2 + r*P1 + r2*P0))). 
+ Vector p_0135 = Sve.LoadVector(pTruef, &d[0]); + Vector p = Sve.FusedMultiplyAddBySelectedScalar(new Vector(d[4]), r, p_0135, 1); + Vector q = Sve.FusedMultiplyAddBySelectedScalar(new Vector(d[5]), r, p_0135, 2); + Vector y = Sve.FusedMultiplyAddBySelectedScalar(new Vector(d[6]), r, p_0135, 3); + p = Sve.FusedMultiplyAddBySelectedScalar(p, r2, p_0135, 0); + + q = Sve.FusedMultiplyAdd(q, r2, p); + y = Sve.FusedMultiplyAdd(y, r2, q); + p = Sve.FusedMultiplyAdd(r, n, new Vector(d[7])); + + Vector outVec = Sve.FusedMultiplyAdd(p, r2, y); + // Handle special case. + if (Sve.TestAnyTrue(pTrue, cmp)) + { + // Restore input x. + x = (Vector)Sve.Add(u_off, offVec); + // Get the first extreme value. + Vector pElem = Sve.CreateMaskForFirstActiveElement( + cmp, Sve.CreateFalseMaskUInt32() + ); + while (Sve.TestAnyTrue(cmp, pElem)) + { + float elem = Sve.ConditionalExtractLastActiveElement( + (Vector)pElem, 0, x + ); + // Fallback to scalar for extreme values. + elem = (float)Math.Log(elem); + Vector y2 = new Vector(elem); + // Replace value back to outVec. + outVec = Sve.ConditionalSelect((Vector)pElem, y2, outVec); + // Get next extreme value. + pElem = Sve.CreateMaskForNextActiveElement(cmp, pElem); + } + } + + Sve.StoreAndZip(pLoop, (uint*)output + i, (Vector)outVec); + + // Handle loop. 
+ i += cntw; + pLoop = Sve.CreateWhileLessThanMask32Bit(i, Size); + } + } + } + + } +} diff --git a/src/benchmarks/micro/sve/SquareRoot.cs b/src/benchmarks/micro/sve/SquareRoot.cs new file mode 100644 index 00000000000..1fcf95fec12 --- /dev/null +++ b/src/benchmarks/micro/sve/SquareRoot.cs @@ -0,0 +1,139 @@ +using System; +using System.Diagnostics; +using System.Numerics; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.Arm; +using BenchmarkDotNet.Attributes; +using BenchmarkDotNet.Extensions; +using BenchmarkDotNet.Configs; +using BenchmarkDotNet.Filters; +using MicroBenchmarks; + +namespace SveBenchmarks +{ + [BenchmarkCategory(Categories.Runtime)] + [OperatingSystemsArchitectureFilter(allowed: true, System.Runtime.InteropServices.Architecture.Arm64)] + [Config(typeof(Config))] + public class SquareRoot + { + private class Config : ManualConfig + { + public Config() + { + AddFilter(new SimpleFilter(_ => Sve.IsSupported)); + } + } + + [Params(15, 127, 527, 10015)] + public int Size; + + private float[] _input; + private float[] _output; + + [GlobalSetup] + public virtual void Setup() + { + _input = ValuesGenerator.Array(Size); + _output = new float[Size]; + } + + [GlobalCleanup] + public virtual void Verify() + { + float[] current = (float[])_output.Clone(); + Setup(); + Scalar(); + float[] scalar = (float[])_output.Clone(); + // Check that the result is the same as scalar. 
+ for (int i = 0; i < Size; i++) + { + Debug.Assert(current[i] == scalar[i]); + } + } + + [Benchmark] + public unsafe void Scalar() + { + fixed (float* input = _input, output = _output) + { + for (int i = 0; i < Size; i++) + { + output[i] = (float)Math.Sqrt(input[i]); + } + } + } + + [Benchmark] + public unsafe void Vector128SquareRoot() + { + fixed (float* input = _input, output = _output) + { + int i = 0; + for (; i <= Size - 4; i += 4) + { + Vector128 inVec = AdvSimd.LoadVector128(input + i); + Vector128 outVec = AdvSimd.Arm64.Sqrt(inVec); + AdvSimd.Store(output + i, outVec); + } + // Handle tail. + for (; i < Size; i++) + { + output[i] = (float)Math.Sqrt(input[i]); + } + } + } + + [Benchmark] + public unsafe void SveSquareRoot() + { + fixed (float* input = _input, output = _output) + { + int i = 0; + int cntw = (int)Sve.Count32BitElements(); + + // We use Vector for predicates since there are no Vector + // overloads for TestFirstTrue and CreateWhileLessThanMask etc. + Vector pTrue = Sve.CreateTrueMaskUInt32(); + Vector pLoop = Sve.CreateWhileLessThanMask32Bit(0, Size); + while (Sve.TestFirstTrue(pTrue, pLoop)) + { + // Since pLoop is a Vector predicate, we load the input as uint array, + // then cast it back to Vector. + // This is preferrable to casting pLoop to Vector, which would cause + // a unnecessary conversion from predicate to vector in the codegen. + Vector inVec = (Vector)Sve.LoadVector(pLoop, (uint*)(input + i)); + Vector outVec = Sve.Sqrt(inVec); + Sve.StoreAndZip(pLoop, (uint*)output + i, (Vector)outVec); + + // Handle loop. 
+ i += cntw; + pLoop = Sve.CreateWhileLessThanMask32Bit(i, Size); + } + } + } + + [Benchmark] + public unsafe void SveTail() + { + fixed (float* input = _input, output = _output) + { + int i = 0; + int cntw = (int)Sve.Count32BitElements(); + + Vector pTrue = Sve.CreateTrueMaskSingle(); + for (; i <= Size - cntw; i += cntw) + { + Vector inVec = Sve.LoadVector(pTrue, input + i); + Vector outVec = Sve.Sqrt(inVec); + Sve.StoreAndZip(pTrue, output + i, outVec); + } + // Handle tail. + for (; i < Size; i++) + { + output[i] = (float)Math.Sqrt(input[i]); + } + } + } + + } +} From d3a9f6a33afc137652899d561671c065c8b1aac7 Mon Sep 17 00:00:00 2001 From: Yat Long Poon Date: Thu, 30 Oct 2025 09:32:40 +0000 Subject: [PATCH 2/3] Fix AdvSimd intrinsic call --- src/benchmarks/micro/sve/Logarithm.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/benchmarks/micro/sve/Logarithm.cs b/src/benchmarks/micro/sve/Logarithm.cs index 9f44663e2dc..55a38a3d048 100644 --- a/src/benchmarks/micro/sve/Logarithm.cs +++ b/src/benchmarks/micro/sve/Logarithm.cs @@ -117,7 +117,7 @@ public unsafe void Vector128Logarithm() Vector128 u = AdvSimd.And(u_off, Vector128.Create(0x007fffffu)); u = AdvSimd.Add(u, offVec); - Vector128 r = Sve.Subtract(u.AsSingle(), Vector128.Create(1.0f)); + Vector128 r = AdvSimd.Subtract(u.AsSingle(), Vector128.Create(1.0f)); // y = log(1+r) + n*ln2. 
Vector128 r2 = AdvSimd.Multiply(r, r); From 1efb11a33a0014998ba2a399c55761f8aec95cb1 Mon Sep 17 00:00:00 2001 From: Yat Long Poon Date: Thu, 30 Oct 2025 10:02:06 +0000 Subject: [PATCH 3/3] Fix typos in comments --- src/benchmarks/micro/sve/SquareRoot.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/benchmarks/micro/sve/SquareRoot.cs b/src/benchmarks/micro/sve/SquareRoot.cs index 1fcf95fec12..b17614e2461 100644 --- a/src/benchmarks/micro/sve/SquareRoot.cs +++ b/src/benchmarks/micro/sve/SquareRoot.cs @@ -99,8 +99,8 @@ public unsafe void SveSquareRoot() { // Since pLoop is a Vector predicate, we load the input as uint array, // then cast it back to Vector. - // This is preferrable to casting pLoop to Vector, which would cause - // a unnecessary conversion from predicate to vector in the codegen. + // This is preferable to casting pLoop to Vector, which would cause + // an unnecessary conversion from predicate to vector in the codegen. Vector inVec = (Vector)Sve.LoadVector(pLoop, (uint*)(input + i)); Vector outVec = Sve.Sqrt(inVec); Sve.StoreAndZip(pLoop, (uint*)output + i, (Vector)outVec);