using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;

// ReSharper disable InconsistentNaming
@@ -26,6 +27,11 @@ public static int Vp8_Sse16x16(Span<byte> a, Span<byte> b)
        return Vp8_Sse16xN_Sse2(a, b, 8);
    }

+    if (AdvSimd.IsSupported)
+    {
+        return Vp8_Sse16x16_Neon(a, b);
+    }
+
    return Vp8_SseNxN(a, b, 16, 16);
}

@@ -43,6 +49,11 @@ public static int Vp8_Sse16x8(Span<byte> a, Span<byte> b)
        return Vp8_Sse16xN_Sse2(a, b, 4);
    }

+    if (AdvSimd.IsSupported)
+    {
+        return Vp8_Sse16x8_Neon(a, b);
+    }
+
    return Vp8_SseNxN(a, b, 16, 8);
}

@@ -119,6 +130,11 @@ public static int Vp8_Sse4x4(Span<byte> a, Span<byte> b)
        return Numerics.ReduceSum(sum);
    }

+    if (AdvSimd.IsSupported)
+    {
+        return Vp8_Sse4x4_Neon(a, b);
+    }
+
    return Vp8_SseNxN(a, b, 4, 4);
}

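As an aside (not part of the diff): every path dispatched above, whether AVX2, SSE2, the new NEON kernels, or the scalar fallback, computes the same quantity, the sum of squared differences over a w-by-h block whose rows are WebpConstants.Bps bytes apart. A minimal scalar sketch of that computation follows; it is not the library's Vp8_SseNxN implementation, and the stride is passed in explicitly because the value of WebpConstants.Bps is not shown in this diff.

static int SseNxNScalar(Span<byte> a, Span<byte> b, int w, int h, int stride)
{
    int sum = 0;
    for (int y = 0; y < h; y++)
    {
        for (int x = 0; x < w; x++)
        {
            // Squared difference of the two pixels at (x, y); rows are `stride` bytes apart.
            int d = a[(y * stride) + x] - b[(y * stride) + x];
            sum += d * d;
        }
    }

    return sum;
}
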
@@ -199,6 +215,106 @@ private static int Vp8_Sse16xN_Avx2(Span<byte> a, Span<byte> b, int numPairs)
    return Numerics.ReduceSum(sum);
}

+[MethodImpl(InliningOptions.ShortMethod)]
+private static unsafe int Vp8_Sse16x16_Neon(Span<byte> a, Span<byte> b)
+{
+    Vector128<uint> sum = Vector128<uint>.Zero;
+    fixed (byte* aRef = &MemoryMarshal.GetReference(a))
+    {
+        fixed (byte* bRef = &MemoryMarshal.GetReference(b))
+        {
+            for (int y = 0; y < 16; y++)
+            {
+                sum = AccumulateSSE16Neon(aRef + (y * WebpConstants.Bps), bRef + (y * WebpConstants.Bps), sum);
+            }
+        }
+    }
+
+#if NET7_0_OR_GREATER
+    return (int)Vector128.Sum(sum);
+#else
+    return Numerics.ReduceSumArm(sum);
+#endif
+}
+
+[MethodImpl(InliningOptions.ShortMethod)]
+private static unsafe int Vp8_Sse16x8_Neon(Span<byte> a, Span<byte> b)
+{
+    Vector128<uint> sum = Vector128<uint>.Zero;
+    fixed (byte* aRef = &MemoryMarshal.GetReference(a))
+    {
+        fixed (byte* bRef = &MemoryMarshal.GetReference(b))
+        {
+            for (int y = 0; y < 8; y++)
+            {
+                sum = AccumulateSSE16Neon(aRef + (y * WebpConstants.Bps), bRef + (y * WebpConstants.Bps), sum);
+            }
+        }
+    }
+
+#if NET7_0_OR_GREATER
+    return (int)Vector128.Sum(sum);
+#else
+    return Numerics.ReduceSumArm(sum);
+#endif
+}
+
+[MethodImpl(InliningOptions.ShortMethod)]
+private static int Vp8_Sse4x4_Neon(Span<byte> a, Span<byte> b)
+{
+    Vector128<byte> a0 = Load4x4Neon(a).AsByte();
+    Vector128<byte> b0 = Load4x4Neon(b).AsByte();
+    Vector128<byte> absDiff = AdvSimd.AbsoluteDifference(a0, b0);
+    Vector64<byte> absDiffLower = absDiff.GetLower().AsByte();
+    Vector64<byte> absDiffUpper = absDiff.GetUpper().AsByte();
+    Vector128<ushort> prod1 = AdvSimd.MultiplyWideningLower(absDiffLower, absDiffLower);
+    Vector128<ushort> prod2 = AdvSimd.MultiplyWideningLower(absDiffUpper, absDiffUpper);
+
+    // pair-wise adds and widen.
+    Vector128<uint> sum1 = AdvSimd.AddPairwiseWidening(prod1);
+    Vector128<uint> sum2 = AdvSimd.AddPairwiseWidening(prod2);
+
+    Vector128<uint> sum = AdvSimd.Add(sum1, sum2);
+#if NET7_0_OR_GREATER
+    return (int)Vector128.Sum(sum);
+#else
+    return Numerics.ReduceSumArm(sum);
+#endif
+}
+
+// Load all 4x4 pixels into a single Vector128<uint>
+[MethodImpl(InliningOptions.ShortMethod)]
+private static unsafe Vector128<uint> Load4x4Neon(Span<byte> src)
+{
+    fixed (byte* srcRef = &MemoryMarshal.GetReference(src))
+    {
+        Vector128<uint> output = Vector128<uint>.Zero;
+        output = AdvSimd.LoadAndInsertScalar(output, 0, (uint*)srcRef);
+        output = AdvSimd.LoadAndInsertScalar(output, 1, (uint*)(srcRef + WebpConstants.Bps));
+        output = AdvSimd.LoadAndInsertScalar(output, 2, (uint*)(srcRef + (WebpConstants.Bps * 2)));
+        output = AdvSimd.LoadAndInsertScalar(output, 3, (uint*)(srcRef + (WebpConstants.Bps * 3)));
+        return output;
+    }
+}
+
+[MethodImpl(InliningOptions.ShortMethod)]
+private static unsafe Vector128<uint> AccumulateSSE16Neon(byte* a, byte* b, Vector128<uint> sum)
+{
+    Vector128<byte> a0 = AdvSimd.LoadVector128(a);
+    Vector128<byte> b0 = AdvSimd.LoadVector128(b);
+
+    Vector128<byte> absDiff = AdvSimd.AbsoluteDifference(a0, b0);
+    Vector64<byte> absDiffLower = absDiff.GetLower();
+    Vector64<byte> absDiffUpper = absDiff.GetUpper();
+    Vector128<ushort> prod1 = AdvSimd.MultiplyWideningLower(absDiffLower, absDiffLower);
+    Vector128<ushort> prod2 = AdvSimd.MultiplyWideningLower(absDiffUpper, absDiffUpper);
+
+    // pair-wise adds and widen.
+    Vector128<uint> sum1 = AdvSimd.AddPairwiseWidening(prod1);
+    Vector128<uint> sum2 = AdvSimd.AddPairwiseWidening(prod2);
+    return AdvSimd.Add(sum, AdvSimd.Add(sum1, sum2));
+}
+
[MethodImpl(InliningOptions.ShortMethod)]
private static Vector128<int> SubtractAndAccumulate(Vector128<byte> a, Vector128<byte> b)
{
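Below the diff, a self-contained sketch (not part of the change) of the NEON pattern the new AccumulateSSE16Neon helper relies on: AbsoluteDifference produces the per-byte |a - b|, MultiplyWideningLower squares the lower and upper eight bytes into 16-bit lanes, and AddPairwiseWidening folds those into 32-bit partial sums that are safe to accumulate across rows. The class and method names are invented for the example; it assumes .NET 7 or later (for Vector128.LoadUnsafe and Vector128.Sum) and checks one 16-byte row against a scalar reference.

using System;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;

static class NeonSseRowSketch
{
    // One 16-byte row of sum-of-squared-differences, following the pattern used in the diff.
    static Vector128<uint> AccumulateRow(Vector128<byte> a, Vector128<byte> b, Vector128<uint> sum)
    {
        Vector128<byte> absDiff = AdvSimd.AbsoluteDifference(a, b);                                    // |a - b| per byte
        Vector128<ushort> sqLo = AdvSimd.MultiplyWideningLower(absDiff.GetLower(), absDiff.GetLower()); // squares of bytes 0..7
        Vector128<ushort> sqHi = AdvSimd.MultiplyWideningLower(absDiff.GetUpper(), absDiff.GetUpper()); // squares of bytes 8..15
        Vector128<uint> partLo = AdvSimd.AddPairwiseWidening(sqLo);                                     // eight ushorts -> four uints
        Vector128<uint> partHi = AdvSimd.AddPairwiseWidening(sqHi);
        return AdvSimd.Add(sum, AdvSimd.Add(partLo, partHi));
    }

    static void Main()
    {
        if (!AdvSimd.IsSupported)
        {
            Console.WriteLine("NEON is not available on this machine.");
            return;
        }

        byte[] a = new byte[16];
        byte[] b = new byte[16];
        var rng = new Random(42);
        rng.NextBytes(a);
        rng.NextBytes(b);

        Vector128<uint> sum = AccumulateRow(Vector128.LoadUnsafe(ref a[0]), Vector128.LoadUnsafe(ref b[0]), Vector128<uint>.Zero);
        uint neon = Vector128.Sum(sum);

        uint scalar = 0;
        for (int i = 0; i < 16; i++)
        {
            int d = a[i] - b[i];
            scalar += (uint)(d * d);
        }

        Console.WriteLine($"neon={neon} scalar={scalar}");
    }
}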