55using System . Runtime . CompilerServices ;
66using System . Runtime . InteropServices ;
77using System . Runtime . Intrinsics ;
8- using System . Runtime . Intrinsics . X86 ;
8+ using SixLabors . ImageSharp . Common . Helpers ;
99
1010namespace SixLabors . ImageSharp . Processing . Processors . Transforms ;
1111
@@ -14,11 +14,18 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms;
1414/// </summary>
1515internal readonly unsafe struct ResizeKernel
1616{
17+ /// <summary>
18+ /// The buffer with the convolution factors.
19+ /// Note that when <see cref="IsHardwareAccelerated"/> is <see langword="true"/>, this is of size 4x that reported in <see cref="Length"/>.
20+ /// </summary>
1721 private readonly float * bufferPtr ;
1822
1923 /// <summary>
2024 /// Initializes a new instance of the <see cref="ResizeKernel"/> struct.
2125 /// </summary>
26+ /// <param name="startIndex">The starting index for the destination row.</param>
27+ /// <param name="bufferPtr">The pointer to the buffer with the convolution factors.</param>
28+ /// <param name="length">The length of the kernel.</param>
2229 [ MethodImpl ( InliningOptions . ShortMethod ) ]
2330 internal ResizeKernel ( int startIndex , float * bufferPtr , int length )
2431 {
@@ -27,6 +34,15 @@ internal ResizeKernel(int startIndex, float* bufferPtr, int length)
2734 this . Length = length ;
2835 }
2936
/// <summary>
/// Gets a value indicating whether the vectorized code paths of this kernel are used.
/// This simply forwards <see cref="Vector256.IsHardwareAccelerated"/>.
/// </summary>
public static bool IsHardwareAccelerated
{
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    get
    {
        return Vector256.IsHardwareAccelerated;
    }
}
45+
3046 /// <summary>
3147 /// Gets the start index for the destination row.
3248 /// </summary>
@@ -53,7 +69,15 @@ public int Length
public Span<float> Values
{
    [MethodImpl(InliningOptions.ShortMethod)]
    get
    {
        // The span must cover the same layout that FillOrCopyAndExpand writes and
        // ConvolveCore reads, so use the struct's shared IsHardwareAccelerated
        // predicate instead of querying Vector256 directly.
        // On the accelerated path the span is 4 floats per kernel entry;
        // otherwise it is one float per entry.
        int floatCount = IsHardwareAccelerated ? this.Length * 4 : this.Length;

        return new Span<float>(this.bufferPtr, floatCount);
    }
}
5882
5983 /// <summary>
@@ -68,73 +92,45 @@ public Vector4 Convolve(Span<Vector4> rowSpan)
6892 [ MethodImpl ( InliningOptions . ShortMethod ) ]
6993 public Vector4 ConvolveCore ( ref Vector4 rowStartRef )
7094 {
71- if ( Avx2 . IsSupported && Fma . IsSupported )
95+ if ( IsHardwareAccelerated )
7296 {
7397 float * bufferStart = this . bufferPtr ;
74- float * bufferEnd = bufferStart + ( this . Length & ~ 3 ) ;
98+ ref Vector4 rowEndRef = ref Unsafe . Add ( ref rowStartRef , this . Length & ~ 3 ) ;
7599 Vector256 < float > result256_0 = Vector256 < float > . Zero ;
76100 Vector256 < float > result256_1 = Vector256 < float > . Zero ;
77- ReadOnlySpan < byte > maskBytes =
78- [
79- 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
80- 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
81- 1 , 0 , 0 , 0 , 1 , 0 , 0 , 0 ,
82- 1 , 0 , 0 , 0 , 1 , 0 , 0 , 0
83- ] ;
84- Vector256 < int > mask = Unsafe . ReadUnaligned < Vector256 < int > > ( ref MemoryMarshal . GetReference ( maskBytes ) ) ;
85101
86- while ( bufferStart < bufferEnd )
102+ while ( Unsafe . IsAddressLessThan ( ref rowStartRef , ref rowEndRef ) )
87103 {
88- // It is important to use a single expression here so that the JIT will correctly use vfmadd231ps
89- // for the FMA operation, and execute it directly on the target register and reading directly from
90- // memory for the first parameter. This skips initializing a SIMD register, and an extra copy.
91- // The code below should compile in the following assembly on .NET 5 x64:
92- //
93- // vmovsd xmm2, [rax] ; load *(double*)bufferStart into xmm2 as [ab, _]
94- // vpermps ymm2, ymm1, ymm2 ; permute as a float YMM register to [a, a, a, a, b, b, b, b]
95- // vfmadd231ps ymm0, ymm2, [r8] ; result256_0 = FMA(pixels, factors) + result256_0
96- //
97- // For tracking the codegen issue with FMA, see: https://github.com/dotnet/runtime/issues/12212.
98- // Additionally, we're also unrolling two computations per each loop iterations to leverage the
99- // fact that most CPUs have two ports to schedule multiply operations for FMA instructions.
100- result256_0 = Fma . MultiplyAdd (
101- Unsafe . As < Vector4 , Vector256 < float > > ( ref rowStartRef ) ,
102- Avx2 . PermuteVar8x32 ( Vector256 . CreateScalarUnsafe ( * ( double * ) bufferStart ) . AsSingle ( ) , mask ) ,
103- result256_0 ) ;
104-
105- result256_1 = Fma . MultiplyAdd (
106- Unsafe . As < Vector4 , Vector256 < float > > ( ref Unsafe . Add ( ref rowStartRef , 2 ) ) ,
107- Avx2 . PermuteVar8x32 ( Vector256 . CreateScalarUnsafe ( * ( double * ) ( bufferStart + 2 ) ) . AsSingle ( ) , mask ) ,
108- result256_1 ) ;
109-
110- bufferStart += 4 ;
111- rowStartRef = ref Unsafe . Add ( ref rowStartRef , 4 ) ;
104+ Vector256 < float > pixels256_0 = Unsafe . As < Vector4 , Vector256 < float > > ( ref rowStartRef ) ;
105+ Vector256 < float > pixels256_1 = Unsafe . As < Vector4 , Vector256 < float > > ( ref Unsafe . Add ( ref rowStartRef , ( nuint ) 2 ) ) ;
106+
107+ result256_0 = Vector256_ . MultiplyAdd ( result256_0 , Vector256 . Load ( bufferStart ) , pixels256_0 ) ;
108+ result256_1 = Vector256_ . MultiplyAdd ( result256_1 , Vector256 . Load ( bufferStart + 8 ) , pixels256_1 ) ;
109+
110+ bufferStart += 16 ;
111+ rowStartRef = ref Unsafe . Add ( ref rowStartRef , ( nuint ) 4 ) ;
112112 }
113113
114- result256_0 = Avx . Add ( result256_0 , result256_1 ) ;
114+ result256_0 += result256_1 ;
115115
116116 if ( ( this . Length & 3 ) >= 2 )
117117 {
118- result256_0 = Fma . MultiplyAdd (
119- Unsafe . As < Vector4 , Vector256 < float > > ( ref rowStartRef ) ,
120- Avx2 . PermuteVar8x32 ( Vector256 . CreateScalarUnsafe ( * ( double * ) bufferStart ) . AsSingle ( ) , mask ) ,
121- result256_0 ) ;
118+ Vector256 < float > pixels256_0 = Unsafe . As < Vector4 , Vector256 < float > > ( ref rowStartRef ) ;
119+ result256_0 = Vector256_ . MultiplyAdd ( result256_0 , Vector256 . Load ( bufferStart ) , pixels256_0 ) ;
122120
123- bufferStart += 2 ;
124- rowStartRef = ref Unsafe . Add ( ref rowStartRef , 2 ) ;
121+ bufferStart += 8 ;
122+ rowStartRef = ref Unsafe . Add ( ref rowStartRef , ( nuint ) 2 ) ;
125123 }
126124
127- Vector128 < float > result128 = Sse . Add ( result256_0 . GetLower ( ) , result256_0 . GetUpper ( ) ) ;
125+ Vector128 < float > result128 = result256_0 . GetLower ( ) + result256_0 . GetUpper ( ) ;
128126
129127 if ( ( this . Length & 1 ) != 0 )
130128 {
131- result128 = Fma . MultiplyAdd (
132- Unsafe . As < Vector4 , Vector128 < float > > ( ref rowStartRef ) ,
133- Vector128 . Create ( * bufferStart ) ,
134- result128 ) ;
129+ Vector128 < float > pixels128 = Unsafe . As < Vector4 , Vector128 < float > > ( ref rowStartRef ) ;
130+ result128 = Vector128_ . MultiplyAdd ( result128 , Vector128 . Load ( bufferStart ) , pixels128 ) ;
135131 }
136132
137- return * ( Vector4 * ) & result128 ;
133+ return result128 . AsVector4 ( ) ;
138134 }
139135 else
140136 {
@@ -149,7 +145,7 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef)
149145 result += rowStartRef * * bufferStart ;
150146
151147 bufferStart ++ ;
152- rowStartRef = ref Unsafe . Add ( ref rowStartRef , 1 ) ;
148+ rowStartRef = ref Unsafe . Add ( ref rowStartRef , ( nuint ) 1 ) ;
153149 }
154150
155151 return result ;
@@ -160,17 +156,32 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef)
160156 /// Copy the contents of <see cref="ResizeKernel"/> altering <see cref="StartIndex"/>
161157 /// to the value <paramref name="left"/>.
162158 /// </summary>
159+ /// <param name="left">The new value for <see cref="StartIndex"/>.</param>
[MethodImpl(InliningOptions.ShortMethod)]
internal ResizeKernel AlterLeftValue(int left)
{
    // Same buffer pointer and length, only the start index differs.
    return new ResizeKernel(left, this.bufferPtr, this.Length);
}
166163
167- internal void Fill ( Span < double > values )
/// <summary>
/// Writes the kernel weights from <paramref name="values"/> into the kernel buffer.
/// When <see cref="IsHardwareAccelerated"/> is <see langword="true"/>, each weight is
/// broadcast into four consecutive floats (one <see cref="Vector4"/> per weight);
/// otherwise the weights are copied as-is.
/// </summary>
/// <param name="values">The kernel weights. Its length must equal <see cref="Length"/>.</param>
internal void FillOrCopyAndExpand(Span<float> values)
{
    DebugGuard.IsTrue(values.Length == this.Length, nameof(values), "ResizeKernel.FillOrCopyAndExpand: values.Length != this.Length!");

    if (IsHardwareAccelerated)
    {
        // Expanded layout: weight w[i] is stored as (w[i], w[i], w[i], w[i]), so the
        // vectorized ConvolveCore can load the factors straight from the buffer.
        Vector4* bufferStart = (Vector4*)this.bufferPtr;
        ref float valuesStart = ref MemoryMarshal.GetReference(values);
        ref float valuesEnd = ref Unsafe.Add(ref valuesStart, values.Length);

        while (Unsafe.IsAddressLessThan(ref valuesStart, ref valuesEnd))
        {
            // Vector4(float) sets all four components to the same value.
            *bufferStart = new Vector4(valuesStart);

            bufferStart++;
            valuesStart = ref Unsafe.Add(ref valuesStart, (nuint)1);
        }
    }
    else
    {
        // Scalar layout: one float per weight.
        values.CopyTo(this.Values);
    }
}
176187}
0 commit comments