Skip to content

Commit ad816ed

Browse files
Merge pull request #2793 from SixLabors/js/resize-map-optimizations
Speed improvements to resize convolution (no vpermps w/ FMA)
2 parents 5fc087a + a96c78d commit ad816ed

File tree

8 files changed

+253
-187
lines changed

8 files changed

+253
-187
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ SixLabors.ImageSharp
88
<div align="center">
99

1010
[![Build Status](https://img.shields.io/github/actions/workflow/status/SixLabors/ImageSharp/build-and-test.yml?branch=main)](https://github.com/SixLabors/ImageSharp/actions)
11-
[![Code coverage](https://codecov.io/gh/SixLabors/ImageSharp/branch/main/graph/badge.svg)](https://codecov.io/gh/SixLabors/ImageSharp)
11+
[![codecov](https://codecov.io/gh/SixLabors/ImageSharp/graph/badge.svg?token=g2WJwz770q)](https://codecov.io/gh/SixLabors/ImageSharp)
1212
[![License: Six Labors Split](https://img.shields.io/badge/license-Six%20Labors%20Split-%23e30183)](https://github.com/SixLabors/ImageSharp/blob/main/LICENSE)
1313
[![Twitter](https://img.shields.io/twitter/url/http/shields.io.svg?style=flat&logo=twitter)](https://twitter.com/intent/tweet?hashtags=imagesharp,dotnet,oss&text=ImageSharp.+A+new+cross-platform+2D+graphics+API+in+C%23&url=https%3a%2f%2fgithub.com%2fSixLabors%2fImageSharp&via=sixlabors)
1414

shared-infrastructure

src/ImageSharp/Common/Helpers/Numerics.cs

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1080,4 +1080,47 @@ public static nuint Vector512Count<TVector>(this Span<float> span)
10801080
public static nuint Vector512Count<TVector>(int length)
10811081
where TVector : struct
10821082
=> (uint)length / (uint)Vector512<TVector>.Count;
1083+
1084+
/// <summary>
/// Divides every element of <paramref name="span"/> by <paramref name="sum"/>, in place.
/// </summary>
/// <param name="span">The <see cref="float"/> values to normalize; modified in place.</param>
/// <param name="sum">The divisor applied to each element (the sum of the values in <paramref name="span"/>).</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void Normalize(Span<float> span, float sum)
{
    // Scalar path: no 256-bit acceleration available.
    if (!Vector256.IsHardwareAccelerated)
    {
        foreach (ref float value in span)
        {
            value /= sum;
        }

        return;
    }

    ref float origin = ref MemoryMarshal.GetReference(span);
    nuint length = (uint)span.Length;
    nuint offset = 0;
    Vector256<float> divisor256 = Vector256.Create(sum);

    // Full groups of eight floats.
    for (nuint wideEnd = length & ~(nuint)7; offset < wideEnd; offset += 8)
    {
        ref Vector256<float> lanes = ref Unsafe.As<float, Vector256<float>>(ref Unsafe.Add(ref origin, offset));
        lanes /= divisor256;
    }

    // At most one group of four floats can remain after the eight-wide pass.
    if ((length & 7) >= 4)
    {
        ref Vector128<float> lanes = ref Unsafe.As<float, Vector128<float>>(ref Unsafe.Add(ref origin, offset));
        lanes /= divisor256.GetLower();
        offset += 4;
    }

    // Up to three trailing floats are divided one at a time.
    for (; offset < length; offset++)
    {
        Unsafe.Add(ref origin, offset) /= sum;
    }
}
10831126
}

src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs

Lines changed: 65 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
using System.Runtime.CompilerServices;
66
using System.Runtime.InteropServices;
77
using System.Runtime.Intrinsics;
8-
using System.Runtime.Intrinsics.X86;
8+
using SixLabors.ImageSharp.Common.Helpers;
99

1010
namespace SixLabors.ImageSharp.Processing.Processors.Transforms;
1111

@@ -14,11 +14,18 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms;
1414
/// </summary>
1515
internal readonly unsafe struct ResizeKernel
1616
{
17+
/// <summary>
18+
/// The buffer with the convolution factors.
19+
/// Note that when <see cref="Vector256.IsHardwareAccelerated"/> is <see langword="true"/>, this is of size 4x that reported in <see cref="Length"/>.
20+
/// </summary>
1721
private readonly float* bufferPtr;
1822

1923
/// <summary>
2024
/// Initializes a new instance of the <see cref="ResizeKernel"/> struct.
2125
/// </summary>
26+
/// <param name="startIndex">The starting index for the destination row.</param>
27+
/// <param name="bufferPtr">The pointer to the buffer with the convolution factors.</param>
28+
/// <param name="length">The length of the kernel.</param>
2229
[MethodImpl(InliningOptions.ShortMethod)]
2330
internal ResizeKernel(int startIndex, float* bufferPtr, int length)
2431
{
@@ -27,6 +34,15 @@ internal ResizeKernel(int startIndex, float* bufferPtr, int length)
2734
this.Length = length;
2835
}
2936

37+
/// <summary>
/// Gets a value indicating whether the vectorized code paths can be used,
/// as reported by <see cref="Vector256.IsHardwareAccelerated"/>.
/// </summary>
public static bool IsHardwareAccelerated
{
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    get
    {
        return Vector256.IsHardwareAccelerated;
    }
}
45+
3046
/// <summary>
3147
/// Gets the start index for the destination row.
3248
/// </summary>
@@ -53,7 +69,15 @@ public int Length
5369
public Span<float> Values
{
    [MethodImpl(InliningOptions.ShortMethod)]
    get
    {
        // With 256-bit acceleration the span covers four floats per kernel entry.
        int count = Vector256.IsHardwareAccelerated ? this.Length * 4 : this.Length;
        return new(this.bufferPtr, count);
    }
}
5882

5983
/// <summary>
@@ -68,73 +92,45 @@ public Vector4 Convolve(Span<Vector4> rowSpan)
6892
[MethodImpl(InliningOptions.ShortMethod)]
6993
public Vector4 ConvolveCore(ref Vector4 rowStartRef)
7094
{
71-
if (Avx2.IsSupported && Fma.IsSupported)
95+
if (IsHardwareAccelerated)
7296
{
7397
float* bufferStart = this.bufferPtr;
74-
float* bufferEnd = bufferStart + (this.Length & ~3);
98+
ref Vector4 rowEndRef = ref Unsafe.Add(ref rowStartRef, this.Length & ~3);
7599
Vector256<float> result256_0 = Vector256<float>.Zero;
76100
Vector256<float> result256_1 = Vector256<float>.Zero;
77-
ReadOnlySpan<byte> maskBytes =
78-
[
79-
0, 0, 0, 0, 0, 0, 0, 0,
80-
0, 0, 0, 0, 0, 0, 0, 0,
81-
1, 0, 0, 0, 1, 0, 0, 0,
82-
1, 0, 0, 0, 1, 0, 0, 0
83-
];
84-
Vector256<int> mask = Unsafe.ReadUnaligned<Vector256<int>>(ref MemoryMarshal.GetReference(maskBytes));
85101

86-
while (bufferStart < bufferEnd)
102+
while (Unsafe.IsAddressLessThan(ref rowStartRef, ref rowEndRef))
87103
{
88-
// It is important to use a single expression here so that the JIT will correctly use vfmadd231ps
89-
// for the FMA operation, and execute it directly on the target register and reading directly from
90-
// memory for the first parameter. This skips initializing a SIMD register, and an extra copy.
91-
// The code below should compile in the following assembly on .NET 5 x64:
92-
//
93-
// vmovsd xmm2, [rax] ; load *(double*)bufferStart into xmm2 as [ab, _]
94-
// vpermps ymm2, ymm1, ymm2 ; permute as a float YMM register to [a, a, a, a, b, b, b, b]
95-
// vfmadd231ps ymm0, ymm2, [r8] ; result256_0 = FMA(pixels, factors) + result256_0
96-
//
97-
// For tracking the codegen issue with FMA, see: https://github.com/dotnet/runtime/issues/12212.
98-
// Additionally, we're also unrolling two computations per each loop iterations to leverage the
99-
// fact that most CPUs have two ports to schedule multiply operations for FMA instructions.
100-
result256_0 = Fma.MultiplyAdd(
101-
Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef),
102-
Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask),
103-
result256_0);
104-
105-
result256_1 = Fma.MultiplyAdd(
106-
Unsafe.As<Vector4, Vector256<float>>(ref Unsafe.Add(ref rowStartRef, 2)),
107-
Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)(bufferStart + 2)).AsSingle(), mask),
108-
result256_1);
109-
110-
bufferStart += 4;
111-
rowStartRef = ref Unsafe.Add(ref rowStartRef, 4);
104+
Vector256<float> pixels256_0 = Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef);
105+
Vector256<float> pixels256_1 = Unsafe.As<Vector4, Vector256<float>>(ref Unsafe.Add(ref rowStartRef, (nuint)2));
106+
107+
result256_0 = Vector256_.MultiplyAdd(result256_0, Vector256.Load(bufferStart), pixels256_0);
108+
result256_1 = Vector256_.MultiplyAdd(result256_1, Vector256.Load(bufferStart + 8), pixels256_1);
109+
110+
bufferStart += 16;
111+
rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)4);
112112
}
113113

114-
result256_0 = Avx.Add(result256_0, result256_1);
114+
result256_0 += result256_1;
115115

116116
if ((this.Length & 3) >= 2)
117117
{
118-
result256_0 = Fma.MultiplyAdd(
119-
Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef),
120-
Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask),
121-
result256_0);
118+
Vector256<float> pixels256_0 = Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef);
119+
result256_0 = Vector256_.MultiplyAdd(result256_0, Vector256.Load(bufferStart), pixels256_0);
122120

123-
bufferStart += 2;
124-
rowStartRef = ref Unsafe.Add(ref rowStartRef, 2);
121+
bufferStart += 8;
122+
rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)2);
125123
}
126124

127-
Vector128<float> result128 = Sse.Add(result256_0.GetLower(), result256_0.GetUpper());
125+
Vector128<float> result128 = result256_0.GetLower() + result256_0.GetUpper();
128126

129127
if ((this.Length & 1) != 0)
130128
{
131-
result128 = Fma.MultiplyAdd(
132-
Unsafe.As<Vector4, Vector128<float>>(ref rowStartRef),
133-
Vector128.Create(*bufferStart),
134-
result128);
129+
Vector128<float> pixels128 = Unsafe.As<Vector4, Vector128<float>>(ref rowStartRef);
130+
result128 = Vector128_.MultiplyAdd(result128, Vector128.Load(bufferStart), pixels128);
135131
}
136132

137-
return *(Vector4*)&result128;
133+
return result128.AsVector4();
138134
}
139135
else
140136
{
@@ -149,7 +145,7 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef)
149145
result += rowStartRef * *bufferStart;
150146

151147
bufferStart++;
152-
rowStartRef = ref Unsafe.Add(ref rowStartRef, 1);
148+
rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)1);
153149
}
154150

155151
return result;
@@ -160,17 +156,32 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef)
160156
/// Copy the contents of <see cref="ResizeKernel"/> altering <see cref="StartIndex"/>
161157
/// to the value <paramref name="left"/>.
162158
/// </summary>
159+
/// <param name="left">The new value for <see cref="StartIndex"/>.</param>
163160
[MethodImpl(InliningOptions.ShortMethod)]
internal ResizeKernel AlterLeftValue(int left)
{
    // Same buffer pointer and length; only the start index differs.
    return new ResizeKernel(left, this.bufferPtr, this.Length);
}
166163

167-
internal void Fill(Span<double> values)
164+
/// <summary>
/// Writes the supplied weights into the kernel buffer.
/// When <see cref="IsHardwareAccelerated"/> is <see langword="true"/>, each weight is stored as a
/// <see cref="Vector4"/> whose four components all equal that weight; otherwise the weights are
/// copied unchanged into <see cref="Values"/>.
/// </summary>
/// <param name="values">The kernel weights. Expected to contain exactly <see cref="Length"/> elements.</param>
internal void FillOrCopyAndExpand(Span<float> values)
{
    DebugGuard.IsTrue(values.Length == this.Length, nameof(values), "ResizeKernel.FillOrCopyAndExpand: values.Length != this.Length!");

    if (IsHardwareAccelerated)
    {
        Vector4* bufferStart = (Vector4*)this.bufferPtr;
        ref float valuesStart = ref MemoryMarshal.GetReference(values);
        ref float valuesEnd = ref Unsafe.Add(ref valuesStart, values.Length);

        while (Unsafe.IsAddressLessThan(ref valuesStart, ref valuesEnd))
        {
            // Broadcast the single weight across all four components of the destination vector.
            *bufferStart = new Vector4(valuesStart);

            bufferStart++;
            valuesStart = ref Unsafe.Add(ref valuesStart, (nuint)1);
        }
    }
    else
    {
        values.CopyTo(this.Values);
    }
}
176187
}

src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.PeriodicKernelMap.cs

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ private sealed class PeriodicKernelMap : ResizeKernelMap
1616

1717
private readonly int cornerInterval;
1818

19+
private readonly int sourcePeriod;
20+
1921
public PeriodicKernelMap(
2022
MemoryAllocator memoryAllocator,
2123
int sourceLength,
@@ -24,7 +26,8 @@ public PeriodicKernelMap(
2426
double scale,
2527
int radius,
2628
int period,
27-
int cornerInterval)
29+
int cornerInterval,
30+
int sourcePeriod)
2831
: base(
2932
memoryAllocator,
3033
sourceLength,
@@ -36,6 +39,7 @@ public PeriodicKernelMap(
3639
{
3740
this.cornerInterval = cornerInterval;
3841
this.period = period;
42+
this.sourcePeriod = sourcePeriod;
3943
}
4044

4145
internal override string Info => base.Info + $"|period:{this.period}|cornerInterval:{this.cornerInterval}";
@@ -54,10 +58,11 @@ protected internal override void Initialize<TResampler>(in TResampler sampler)
5458
int bottomStartDest = this.DestinationLength - this.cornerInterval;
5559
for (int i = startOfFirstRepeatedMosaic; i < bottomStartDest; i++)
5660
{
57-
double center = ((i + .5) * this.ratio) - .5;
58-
int left = (int)TolerantMath.Ceiling(center - this.radius);
5961
ResizeKernel kernel = this.kernels[i - this.period];
60-
this.kernels[i] = kernel.AlterLeftValue(left);
62+
63+
// Shift the kernel start index by the source-side period so the same weights align to the
64+
// next repeated sampling window in the source image.
65+
this.kernels[i] = kernel.AlterLeftValue(kernel.StartIndex + this.sourcePeriod);
6166
}
6267

6368
// Build bottom corner data:

0 commit comments

Comments
 (0)