Skip to content

Commit 63c8f9e

Browse files
authored
Merge pull request #2356 from SixLabors/bp/modeScoreArm
Add ARM version of calculating mode scores
2 parents 5ebc460 + 963d993 commit 63c8f9e

File tree

3 files changed

+194
-20
lines changed

3 files changed

+194
-20
lines changed

src/ImageSharp/Common/Helpers/Numerics.cs

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
using System.Runtime.CompilerServices;
66
using System.Runtime.InteropServices;
77
using System.Runtime.Intrinsics;
8+
using System.Runtime.Intrinsics.Arm;
89
using System.Runtime.Intrinsics.X86;
910

1011
namespace SixLabors.ImageSharp;
@@ -808,6 +809,25 @@ public static int ReduceSum(Vector256<int> accumulator)
808809
return Sse2.ConvertToInt32(vsum);
809810
}
810811

812+
/// <summary>
/// Horizontally adds all four lanes of the vector into a single sum using ARM AdvSimd instructions.
/// </summary>
/// <remarks>
/// When <c>AdvSimd.Arm64</c> is available the across-vector add is used; otherwise the sum is
/// assembled from a widening pairwise add. Neither path checks <c>AdvSimd.IsSupported</c>,
/// so callers are expected to have done that already.
/// </remarks>
/// <param name="accumulator">The accumulator to reduce.</param>
/// <returns>The sum of all elements, converted to <see cref="int"/>.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static int ReduceSumArm(Vector128<uint> accumulator)
{
    if (!AdvSimd.Arm64.IsSupported)
    {
        // No across-vector add here: fold neighbouring lanes into two 64 bit sums,
        // then add lane 0 of each half, which is where the total is read from.
        Vector128<ulong> pairSums = AdvSimd.AddPairwiseWidening(accumulator);
        Vector64<uint> lowerPair = pairSums.GetLower().AsUInt32();
        Vector64<uint> upperPair = pairSums.GetUpper().AsUInt32();
        Vector64<uint> folded = AdvSimd.Add(lowerPair, upperPair);
        return (int)AdvSimd.Extract(folded, 0);
    }

    Vector64<uint> total = AdvSimd.Arm64.AddAcross(accumulator);
    return (int)AdvSimd.Extract(total, 0);
}
830+
811831
/// <summary>
812832
/// Reduces even elements of the vector into one sum.
813833
/// </summary>

src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
using System.Runtime.CompilerServices;
66
using System.Runtime.InteropServices;
77
using System.Runtime.Intrinsics;
8+
using System.Runtime.Intrinsics.Arm;
89
using System.Runtime.Intrinsics.X86;
910

1011
// ReSharper disable InconsistentNaming
@@ -26,6 +27,11 @@ public static int Vp8_Sse16x16(Span<byte> a, Span<byte> b)
2627
return Vp8_Sse16xN_Sse2(a, b, 8);
2728
}
2829

30+
if (AdvSimd.IsSupported)
31+
{
32+
return Vp8_Sse16x16_Neon(a, b);
33+
}
34+
2935
return Vp8_SseNxN(a, b, 16, 16);
3036
}
3137

@@ -43,6 +49,11 @@ public static int Vp8_Sse16x8(Span<byte> a, Span<byte> b)
4349
return Vp8_Sse16xN_Sse2(a, b, 4);
4450
}
4551

52+
if (AdvSimd.IsSupported)
53+
{
54+
return Vp8_Sse16x8_Neon(a, b);
55+
}
56+
4657
return Vp8_SseNxN(a, b, 16, 8);
4758
}
4859

@@ -119,6 +130,11 @@ public static int Vp8_Sse4x4(Span<byte> a, Span<byte> b)
119130
return Numerics.ReduceSum(sum);
120131
}
121132

133+
if (AdvSimd.IsSupported)
134+
{
135+
return Vp8_Sse4x4_Neon(a, b);
136+
}
137+
122138
return Vp8_SseNxN(a, b, 4, 4);
123139
}
124140

@@ -199,6 +215,106 @@ private static int Vp8_Sse16xN_Avx2(Span<byte> a, Span<byte> b, int numPairs)
199215
return Numerics.ReduceSum(sum);
200216
}
201217

218+
/// <summary>
/// Computes the sum of squared differences between two blocks of 16 rows by 16 bytes using AdvSimd.
/// </summary>
/// <remarks>
/// Rows are read at a stride of <c>WebpConstants.Bps</c> bytes through raw pointers,
/// so no bounds checks are applied to either span.
/// </remarks>
/// <param name="a">The first block.</param>
/// <param name="b">The second block.</param>
/// <returns>The accumulated sum of squared differences.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
private static unsafe int Vp8_Sse16x16_Neon(Span<byte> a, Span<byte> b)
{
    Vector128<uint> total = Vector128<uint>.Zero;
    fixed (byte* aPtr = &MemoryMarshal.GetReference(a))
    fixed (byte* bPtr = &MemoryMarshal.GetReference(b))
    {
        for (int row = 0; row < 16; row++)
        {
            // Both blocks share the same row stride.
            int offset = row * WebpConstants.Bps;
            total = AccumulateSSE16Neon(aPtr + offset, bPtr + offset, total);
        }
    }

#if NET7_0_OR_GREATER
    return (int)Vector128.Sum(total);
#else
    return Numerics.ReduceSumArm(total);
#endif
}
239+
240+
/// <summary>
/// Computes the sum of squared differences between two blocks of 8 rows by 16 bytes using AdvSimd.
/// </summary>
/// <remarks>
/// Rows are read at a stride of <c>WebpConstants.Bps</c> bytes through raw pointers,
/// so no bounds checks are applied to either span.
/// </remarks>
/// <param name="a">The first block.</param>
/// <param name="b">The second block.</param>
/// <returns>The accumulated sum of squared differences.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
private static unsafe int Vp8_Sse16x8_Neon(Span<byte> a, Span<byte> b)
{
    Vector128<uint> total = Vector128<uint>.Zero;
    fixed (byte* aPtr = &MemoryMarshal.GetReference(a))
    fixed (byte* bPtr = &MemoryMarshal.GetReference(b))
    {
        for (int row = 0; row < 8; row++)
        {
            // Both blocks share the same row stride.
            int offset = row * WebpConstants.Bps;
            total = AccumulateSSE16Neon(aPtr + offset, bPtr + offset, total);
        }
    }

#if NET7_0_OR_GREATER
    return (int)Vector128.Sum(total);
#else
    return Numerics.ReduceSumArm(total);
#endif
}
261+
262+
/// <summary>
/// Computes the sum of squared differences between two 4x4 byte blocks using AdvSimd.
/// </summary>
/// <param name="a">The first block.</param>
/// <param name="b">The second block.</param>
/// <returns>The sum of squared differences.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
private static int Vp8_Sse4x4_Neon(Span<byte> a, Span<byte> b)
{
    // Each block is gathered into one 128 bit register: 4 rows of 4 bytes.
    Vector128<byte> pixelsA = Load4x4Neon(a).AsByte();
    Vector128<byte> pixelsB = Load4x4Neon(b).AsByte();
    Vector128<byte> delta = AdvSimd.AbsoluteDifference(pixelsA, pixelsB);

    // Square the differences, widening to 16 bit (255 * 255 still fits).
    Vector64<byte> lowerHalf = delta.GetLower();
    Vector64<byte> upperHalf = delta.GetUpper();
    Vector128<ushort> squaresLower = AdvSimd.MultiplyWideningLower(lowerHalf, lowerHalf);
    Vector128<ushort> squaresUpper = AdvSimd.MultiplyWideningLower(upperHalf, upperHalf);

    // Add neighbouring squares while widening to 32 bit, then merge both halves.
    Vector128<uint> total = AdvSimd.Add(
        AdvSimd.AddPairwiseWidening(squaresLower),
        AdvSimd.AddPairwiseWidening(squaresUpper));

#if NET7_0_OR_GREATER
    return (int)Vector128.Sum(total);
#else
    return Numerics.ReduceSumArm(total);
#endif
}
284+
285+
/// <summary>
/// Loads all 4x4 pixels into a single <see cref="Vector128{T}"/>, one row of four bytes per 32 bit lane.
/// </summary>
/// <remarks>
/// Rows are read at a stride of <c>WebpConstants.Bps</c> bytes through a raw pointer,
/// so no bounds checks are applied to <paramref name="src"/>.
/// </remarks>
/// <param name="src">The span positioned at the top-left pixel of the block.</param>
/// <returns>The four rows packed into lanes 0 to 3.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
private static unsafe Vector128<uint> Load4x4Neon(Span<byte> src)
{
    fixed (byte* basePtr = &MemoryMarshal.GetReference(src))
    {
        uint* row0 = (uint*)basePtr;
        uint* row1 = (uint*)(basePtr + WebpConstants.Bps);
        uint* row2 = (uint*)(basePtr + (WebpConstants.Bps * 2));
        uint* row3 = (uint*)(basePtr + (WebpConstants.Bps * 3));

        Vector128<uint> rows = AdvSimd.LoadAndInsertScalar(Vector128<uint>.Zero, 0, row0);
        rows = AdvSimd.LoadAndInsertScalar(rows, 1, row1);
        rows = AdvSimd.LoadAndInsertScalar(rows, 2, row2);
        rows = AdvSimd.LoadAndInsertScalar(rows, 3, row3);
        return rows;
    }
}
299+
300+
/// <summary>
/// Adds the squared differences of 16 bytes read from <paramref name="a"/> and <paramref name="b"/>
/// to the running <paramref name="sum"/>.
/// </summary>
/// <param name="a">Pointer to 16 readable bytes of the first row.</param>
/// <param name="b">Pointer to 16 readable bytes of the second row.</param>
/// <param name="sum">The running per-lane accumulator.</param>
/// <returns>The updated accumulator.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
private static unsafe Vector128<uint> AccumulateSSE16Neon(byte* a, byte* b, Vector128<uint> sum)
{
    Vector128<byte> delta = AdvSimd.AbsoluteDifference(AdvSimd.LoadVector128(a), AdvSimd.LoadVector128(b));

    // Square the differences, widening to 16 bit (255 * 255 still fits).
    Vector64<byte> lowerHalf = delta.GetLower();
    Vector64<byte> upperHalf = delta.GetUpper();
    Vector128<ushort> squaresLower = AdvSimd.MultiplyWideningLower(lowerHalf, lowerHalf);
    Vector128<ushort> squaresUpper = AdvSimd.MultiplyWideningLower(upperHalf, upperHalf);

    // Add neighbouring squares while widening to 32 bit, then merge both halves.
    Vector128<uint> rowSum = AdvSimd.Add(
        AdvSimd.AddPairwiseWidening(squaresLower),
        AdvSimd.AddPairwiseWidening(squaresUpper));

    return AdvSimd.Add(sum, rowSum);
}
317+
202318
[MethodImpl(InliningOptions.ShortMethod)]
203319
private static Vector128<int> SubtractAndAccumulate(Vector128<byte> a, Vector128<byte> b)
204320
{

tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs

Lines changed: 58 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
// Copyright (c) Six Labors.
22
// Licensed under the Six Labors Split License.
33

4+
using System.Runtime.InteropServices;
45
using SixLabors.ImageSharp.Formats.Webp.Lossy;
56
using SixLabors.ImageSharp.Tests.TestUtilities;
67

@@ -222,62 +223,99 @@ private static void RunHadamardTransformTest()
222223
public void HadamardTransform_Works() => RunHadamardTransformTest();
223224

224225
[Fact]
225-
public void TransformTwo_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformTwoTest, HwIntrinsics.AllowAll);
226+
public void TransformTwo_WithHardwareIntrinsics_Works() =>
227+
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformTwoTest, HwIntrinsics.AllowAll);
226228

227229
[Fact]
228-
public void TransformTwo_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformTwoTest, HwIntrinsics.DisableHWIntrinsic);
230+
public void TransformTwo_WithoutHardwareIntrinsics_Works() =>
231+
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformTwoTest, HwIntrinsics.DisableHWIntrinsic);
229232

230233
[Fact]
231-
public void TransformOne_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformOneTest, HwIntrinsics.AllowAll);
234+
public void TransformOne_WithHardwareIntrinsics_Works() =>
235+
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformOneTest, HwIntrinsics.AllowAll);
232236

233237
[Fact]
234-
public void TransformOne_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformOneTest, HwIntrinsics.DisableHWIntrinsic);
238+
public void TransformOne_WithoutHardwareIntrinsics_Works() =>
239+
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformOneTest, HwIntrinsics.DisableHWIntrinsic);
235240

236-
// This will test the AVX2 version.
241+
// This will test the AVX2 or ARM version.
237242
[Fact]
238-
public void Vp8Sse16X16_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.AllowAll);
243+
public void Vp8Sse16X16_WithHardwareIntrinsics_Works() =>
244+
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.AllowAll);
239245

240246
// This will test the SSE2 version.
241247
[Fact]
242-
public void Vp8Sse16X16_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.DisableAVX2);
248+
public void Vp8Sse16X16_WithoutAVX2_Works()
{
    // This test targets the SSE2 code path, so it is a no-op when running on Arm64.
    bool isArm64 = RuntimeInformation.ProcessArchitecture == Architecture.Arm64;
    if (!isArm64)
    {
        FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.DisableAVX2);
    }
}
243257

244258
// This will test the fallback scalar version.
245259
[Fact]
246-
public void Vp8Sse16X16_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.DisableSSE2 | HwIntrinsics.DisableAVX);
260+
public void Vp8Sse16X16_WithoutHwIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.DisableHWIntrinsic);
247261

248-
// This will test the AVX2 version.
262+
// This will test the AVX2 or ARM version.
249263
[Fact]
250-
public void Vp8Sse16X8_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.AllowAll);
264+
public void Vp8Sse16X8_WithHardwareIntrinsics_Works() =>
265+
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.AllowAll);
251266

252267
// This will test the SSE2 version.
253268
[Fact]
254-
public void Vp8Sse16X8_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.DisableAVX2);
269+
public void Vp8Sse16X8_WithoutAVX2_Works()
{
    // This test targets the SSE2 code path, so it is a no-op when running on Arm64.
    bool isArm64 = RuntimeInformation.ProcessArchitecture == Architecture.Arm64;
    if (!isArm64)
    {
        FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.DisableAVX2);
    }
}
255278

256279
// This will test the fallback scalar version.
257280
[Fact]
258-
public void Vp8Sse16X8_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.DisableSSE2 | HwIntrinsics.DisableAVX);
281+
public void Vp8Sse16X8_WithoutHardwareIntrinsics_Works() =>
282+
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.DisableHWIntrinsic);
259283

260-
// This will test the AVX2 version.
284+
// This will test the AVX2 or ARM version.
261285
[Fact]
262-
public void Vp8Sse4X4_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.AllowAll);
286+
public void Vp8Sse4X4_WithHardwareIntrinsics_Works() =>
287+
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.AllowAll);
263288

264289
// This will test the SSE2 version.
265290
[Fact]
266-
public void Vp8Sse4X4_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.DisableAVX2);
291+
public void Vp8Sse4X4_WithoutAVX2_Works()
{
    // This test targets the SSE2 code path, so it is a no-op when running on Arm64.
    bool isArm64 = RuntimeInformation.ProcessArchitecture == Architecture.Arm64;
    if (!isArm64)
    {
        FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.DisableAVX2);
    }
}
267300

268301
// This will test the fallback scalar version.
269302
[Fact]
270-
public void Vp8Sse4X4_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.DisableSSE2 | HwIntrinsics.DisableAVX);
303+
public void Vp8Sse4X4_WithoutHardwareIntrinsics_Works() =>
304+
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.DisableHWIntrinsic);
271305

272306
[Fact]
273-
public void Mean16x4_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.AllowAll);
307+
public void Mean16x4_WithHardwareIntrinsics_Works() =>
308+
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.AllowAll);
274309

275310
[Fact]
276-
public void Mean16x4_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.DisableHWIntrinsic);
311+
public void Mean16x4_WithoutHardwareIntrinsics_Works() =>
312+
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.DisableHWIntrinsic);
277313

278314
[Fact]
279-
public void HadamardTransform_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.AllowAll);
315+
public void HadamardTransform_WithHardwareIntrinsics_Works() =>
316+
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.AllowAll);
280317

281318
[Fact]
282-
public void HadamardTransform_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.DisableHWIntrinsic);
319+
public void HadamardTransform_WithoutHardwareIntrinsics_Works() =>
320+
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.DisableHWIntrinsic);
283321
}

0 commit comments

Comments
 (0)