diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs
index ca14ae4c38..621229ab0b 100644
--- a/src/ImageSharp/Common/Helpers/Numerics.cs
+++ b/src/ImageSharp/Common/Helpers/Numerics.cs
@@ -1097,4 +1097,79 @@ public static nuint Vector512Count<TVector>(this Span<float> span)
     public static nuint Vector512Count<TVector>(int length)
         where TVector : struct
         => (uint)length / (uint)Vector512<TVector>.Count;
+
+    /// <summary>
+    /// Normalizes the values in a given <see cref="Span{Single}"/>.
+    /// </summary>
+    /// <param name="span">The sequence of values to normalize.</param>
+    /// <param name="sum">The sum of the values in <paramref name="span"/>.</param>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static void Normalize(Span<float> span, float sum)
+    {
+        if (Vector512.IsHardwareAccelerated)
+        {
+            ref float startRef = ref MemoryMarshal.GetReference(span);
+            ref float endRef = ref Unsafe.Add(ref startRef, span.Length & ~15);
+            Vector512<float> sum512 = Vector512.Create(sum);
+
+            while (Unsafe.IsAddressLessThan(ref startRef, ref endRef))
+            {
+                Unsafe.As<float, Vector512<float>>(ref startRef) /= sum512;
+                startRef = ref Unsafe.Add(ref startRef, (nuint)16);
+            }
+
+            if ((span.Length & 15) >= 8)
+            {
+                Unsafe.As<float, Vector256<float>>(ref startRef) /= sum512.GetLower();
+                startRef = ref Unsafe.Add(ref startRef, (nuint)8);
+            }
+
+            if ((span.Length & 7) >= 4)
+            {
+                Unsafe.As<float, Vector128<float>>(ref startRef) /= sum512.GetLower().GetLower();
+                startRef = ref Unsafe.Add(ref startRef, (nuint)4);
+            }
+
+            endRef = ref Unsafe.Add(ref startRef, span.Length & 3);
+
+            while (Unsafe.IsAddressLessThan(ref startRef, ref endRef))
+            {
+                startRef /= sum;
+                startRef = ref Unsafe.Add(ref startRef, (nuint)1);
+            }
+        }
+        else if (Vector256.IsHardwareAccelerated)
+        {
+            ref float startRef = ref MemoryMarshal.GetReference(span);
+            ref float endRef = ref Unsafe.Add(ref startRef, span.Length & ~7);
+            Vector256<float> sum256 = Vector256.Create(sum);
+
+            while (Unsafe.IsAddressLessThan(ref startRef, ref endRef))
+            {
+                Unsafe.As<float, Vector256<float>>(ref startRef) /= sum256;
+                startRef = ref Unsafe.Add(ref startRef, (nuint)8);
+            }
+
+            if ((span.Length & 7) >= 4)
+            {
+                Unsafe.As<float, Vector128<float>>(ref startRef) /= sum256.GetLower();
+                startRef = ref Unsafe.Add(ref startRef, (nuint)4);
+            }
+
+            endRef = ref Unsafe.Add(ref startRef, span.Length & 3);
+
+            while (Unsafe.IsAddressLessThan(ref startRef, ref endRef))
+            {
+                startRef /= sum;
+                startRef = ref Unsafe.Add(ref startRef, (nuint)1);
+            }
+        }
+        else
+        {
+            for (int i = 0; i < span.Length; i++)
+            {
+                span[i] /= sum;
+            }
+        }
+    }
 }
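For context when reading the hunk above: Numerics.Normalize is an element-wise divide of every weight by a precomputed sum, with a 512-bit fast path, a 256-bit fast path, and a scalar tail that all produce the same values. A minimal usage sketch (illustrative numbers, assuming internal access to the helper, not part of the patch):

    float[] weights = [1F, 4F, 6F, 4F, 1F];
    Numerics.Normalize(weights, 16F);
    // weights is now [0.0625, 0.25, 0.375, 0.25, 0.0625] and sums to 1.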
diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
index b6dd319f06..07cfe02850 100644
--- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
+++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
@@ -245,6 +245,44 @@ public static Vector128<short> PackSignedSaturate(Vector128<int> left, Vector128<int> right)
         return default;
     }
 
+    /// <summary>
+    /// Performs a multiply-add operation on three vectors, where each element of the resulting vector is the
+    /// product of corresponding elements in <paramref name="a"/> and <paramref name="b"/> added to the
+    /// corresponding element in <paramref name="c"/>.
+    /// If the CPU supports FMA (Fused Multiply-Add) instructions, the operation is performed as a single
+    /// fused operation for better performance and precision.
+    /// </summary>
+    /// <param name="a">The first vector of single-precision floating-point numbers to be multiplied.</param>
+    /// <param name="b">The second vector of single-precision floating-point numbers to be multiplied.</param>
+    /// <param name="c">The vector of single-precision floating-point numbers to be added to the product of
+    /// <paramref name="a"/> and <paramref name="b"/>.</param>
+    /// <returns>
+    /// A <see cref="Vector128{Single}"/> where each element is the result of multiplying the corresponding
+    /// elements of <paramref name="a"/> and <paramref name="b"/>, and then adding the corresponding element
+    /// from <paramref name="c"/>.
+    /// </returns>
+    /// <remarks>
+    /// <para>
+    /// If the FMA (Fused Multiply-Add) instruction set is supported by the CPU, the operation is performed
+    /// using <see cref="Fma.MultiplyAdd"/>. This approach can produce slightly different results compared to
+    /// performing the multiplication and addition separately due to differences in how floating-point rounding
+    /// is handled.
+    /// </para>
+    /// <para>
+    /// If FMA is not supported, the operation is performed as a separate multiplication and addition. This might
+    /// lead to a minor difference in precision compared to the fused operation, particularly in cases where
+    /// numerical accuracy is critical.
+    /// </para>
+    /// </remarks>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector128<float> MultiplyAddEstimate(Vector128<float> a, Vector128<float> b, Vector128<float> c)
+    {
+        if (Fma.IsSupported)
+        {
+            return Fma.MultiplyAdd(a, b, c);
+        }
+
+        return (a * b) + c;
+    }
+
     [DoesNotReturn]
     private static void ThrowUnreachableException() => throw new UnreachableException();
 }
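The contract of the new helper, sketched outside the patch: MultiplyAddEstimate computes (a * b) + c element-wise, fused into one instruction when Fma.IsSupported and as two separately rounded operations otherwise, which is what the Estimate suffix signals. A hypothetical call:

    Vector128<float> a = Vector128.Create(1F, 2F, 3F, 4F);
    Vector128<float> b = Vector128.Create(10F);
    Vector128<float> c = Vector128.Create(0.5F);
    Vector128<float> r = Vector128Utilities.MultiplyAddEstimate(a, b, c);
    // r is <10.5, 20.5, 30.5, 40.5> on either path; only the intermediate rounding may differ.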
diff --git a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs
index 6e8c0d1de4..082e4683b0 100644
--- a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs
+++ b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs
@@ -110,6 +110,44 @@ public static Vector256<int> ConvertToInt32RoundToEven(Vector256<float> vector)
         return Vector256.ConvertToInt32(val_2p23_f32 | sign);
     }
 
+    /// <summary>
+    /// Performs a multiply-add operation on three vectors, where each element of the resulting vector is the
+    /// product of corresponding elements in <paramref name="a"/> and <paramref name="b"/> added to the
+    /// corresponding element in <paramref name="c"/>.
+    /// If the CPU supports FMA (Fused Multiply-Add) instructions, the operation is performed as a single
+    /// fused operation for better performance and precision.
+    /// </summary>
+    /// <param name="a">The first vector of single-precision floating-point numbers to be multiplied.</param>
+    /// <param name="b">The second vector of single-precision floating-point numbers to be multiplied.</param>
+    /// <param name="c">The vector of single-precision floating-point numbers to be added to the product of
+    /// <paramref name="a"/> and <paramref name="b"/>.</param>
+    /// <returns>
+    /// A <see cref="Vector256{Single}"/> where each element is the result of multiplying the corresponding
+    /// elements of <paramref name="a"/> and <paramref name="b"/>, and then adding the corresponding element
+    /// from <paramref name="c"/>.
+    /// </returns>
+    /// <remarks>
+    /// <para>
+    /// If the FMA (Fused Multiply-Add) instruction set is supported by the CPU, the operation is performed
+    /// using <see cref="Fma.MultiplyAdd"/>. This approach can produce slightly different results compared to
+    /// performing the multiplication and addition separately due to differences in how floating-point rounding
+    /// is handled.
+    /// </para>
+    /// <para>
+    /// If FMA is not supported, the operation is performed as a separate multiplication and addition. This might
+    /// lead to a minor difference in precision compared to the fused operation, particularly in cases where
+    /// numerical accuracy is critical.
+    /// </para>
+    /// </remarks>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<float> MultiplyAddEstimate(Vector256<float> a, Vector256<float> b, Vector256<float> c)
+    {
+        if (Fma.IsSupported)
+        {
+            return Fma.MultiplyAdd(a, b, c);
+        }
+
+        return (a * b) + c;
+    }
+
     [DoesNotReturn]
     private static void ThrowUnreachableException() => throw new UnreachableException();
 }
diff --git a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs
index 0165af90ef..bcc3c9fa92 100644
--- a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs
+++ b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs
@@ -3,6 +3,7 @@
 
 using System.Diagnostics;
 using System.Diagnostics.CodeAnalysis;
+using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
@@ -110,6 +111,43 @@ public static Vector512<int> ConvertToInt32RoundToEven(Vector512<float> vector)
         return Vector512.ConvertToInt32(val_2p23_f32 | sign);
     }
 
+    /// <summary>
+    /// Performs a multiply-add operation on three vectors, where each element of the resulting vector is the
+    /// product of corresponding elements in <paramref name="a"/> and <paramref name="b"/> added to the
+    /// corresponding element in <paramref name="c"/>.
+    /// If the CPU supports FMA (Fused Multiply-Add) instructions, the operation is performed as a single
+    /// fused operation for better performance and precision.
+    /// </summary>
+    /// <param name="a">The first vector of single-precision floating-point numbers to be multiplied.</param>
+    /// <param name="b">The second vector of single-precision floating-point numbers to be multiplied.</param>
+    /// <param name="c">The vector of single-precision floating-point numbers to be added to the product of
+    /// <paramref name="a"/> and <paramref name="b"/>.</param>
+    /// <returns>
+    /// A <see cref="Vector512{Single}"/> where each element is the result of multiplying the corresponding
+    /// elements of <paramref name="a"/> and <paramref name="b"/>, and then adding the corresponding element
+    /// from <paramref name="c"/>.
+    /// </returns>
+    /// <remarks>
+    /// <para>
+    /// If the AVX-512 instruction set is supported by the CPU, the operation is performed using
+    /// <see cref="Avx512F.FusedMultiplyAdd"/>. This approach can produce slightly different results compared to
+    /// performing the multiplication and addition separately due to differences in how floating-point rounding
+    /// is handled.
+    /// </para>
+    /// <para>
+    /// If FMA is not supported, the operation is performed as a separate multiplication and addition. This might
+    /// lead to a minor difference in precision compared to the fused operation, particularly in cases where
+    /// numerical accuracy is critical.
+    /// </para>
+    /// </remarks>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector512<float> MultiplyAddEstimate(Vector512<float> a, Vector512<float> b, Vector512<float> c)
+    {
+        if (Avx512F.IsSupported)
+        {
+            return Avx512F.FusedMultiplyAdd(a, b, c);
+        }
+
+        return (a * b) + c;
+    }
+
     [DoesNotReturn]
     private static void ThrowUnreachableException() => throw new UnreachableException();
 }
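The rounding caveat repeated in the remarks of all three helpers is observable without intrinsics, since .NET exposes scalar fused multiply-add as Math.FusedMultiplyAdd. A small demonstration, with values chosen only to force a last-bit difference (not taken from the patch):

    double x = 1.0 + Math.Pow(2, -27);                       // x * x needs more than 53 significand bits
    double product = x * x;                                  // separately rounded: the 2^-54 term is lost
    double residual = Math.FusedMultiplyAdd(x, x, -product); // fused: exact product minus rounded product
    Console.WriteLine(residual);                             // about 5.55E-17 (2^-54), not 0

This single-ulp wiggle is why swapping the separate multiply and add for an FMA can move resize weights slightly, and why the tests at the end of this diff loosen their comparer from 1e-6 to 1e-4 on the vectorized paths.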
diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
index 51a739d35e..6c1f7217a7 100644
--- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
+++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
@@ -5,7 +5,7 @@
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.X86;
+using SixLabors.ImageSharp.Common.Helpers;
 
 namespace SixLabors.ImageSharp.Processing.Processors.Transforms;
 
@@ -14,11 +14,18 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms;
 /// </summary>
 internal readonly unsafe struct ResizeKernel
 {
+    /// <summary>
+    /// The buffer with the convolution factors.
+    /// Note that when <see cref="SupportsVectorization"/> is true, this is 4x the size reported by <see cref="Length"/>.
+    /// </summary>
     private readonly float* bufferPtr;
 
     /// <summary>
     /// Initializes a new instance of the <see cref="ResizeKernel"/> struct.
     /// </summary>
+    /// <param name="startIndex">The starting index for the destination row.</param>
+    /// <param name="bufferPtr">The pointer to the buffer with the convolution factors.</param>
+    /// <param name="length">The length of the kernel.</param>
     [MethodImpl(InliningOptions.ShortMethod)]
     internal ResizeKernel(int startIndex, float* bufferPtr, int length)
     {
@@ -27,6 +34,15 @@ internal ResizeKernel(int startIndex, float* bufferPtr, int length)
         this.Length = length;
     }
 
+    /// <summary>
+    /// Gets a value indicating whether vectorization is supported.
+    /// </summary>
+    public static bool SupportsVectorization
+    {
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        get => Vector256.IsHardwareAccelerated;
+    }
+
     /// <summary>
     /// Gets the start index for the destination row.
     /// </summary>
@@ -53,7 +69,15 @@ public int Length
     public Span<float> Values
     {
         [MethodImpl(InliningOptions.ShortMethod)]
-        get => new(this.bufferPtr, this.Length);
+        get
+        {
+            if (Vector256.IsHardwareAccelerated)
+            {
+                return new(this.bufferPtr, this.Length * 4);
+            }
+
+            return new(this.bufferPtr, this.Length);
+        }
     }
 
     /// <summary>
@@ -68,73 +92,99 @@ public Vector4 Convolve(Span<Vector4> rowSpan)
     [MethodImpl(InliningOptions.ShortMethod)]
     public Vector4 ConvolveCore(ref Vector4 rowStartRef)
     {
-        if (Avx2.IsSupported && Fma.IsSupported)
+        if (SupportsVectorization)
         {
-            float* bufferStart = this.bufferPtr;
-            float* bufferEnd = bufferStart + (this.Length & ~3);
-            Vector256<float> result256_0 = Vector256<float>.Zero;
-            Vector256<float> result256_1 = Vector256<float>.Zero;
-            ReadOnlySpan<byte> maskBytes = new byte[]
-            {
-                0, 0, 0, 0, 0, 0, 0, 0,
-                0, 0, 0, 0, 0, 0, 0, 0,
-                1, 0, 0, 0, 1, 0, 0, 0,
-                1, 0, 0, 0, 1, 0, 0, 0,
-            };
-            Vector256<int> mask = Unsafe.ReadUnaligned<Vector256<int>>(ref MemoryMarshal.GetReference(maskBytes));
-
-            while (bufferStart < bufferEnd)
-            {
-                // It is important to use a single expression here so that the JIT will correctly use vfmadd231ps
-                // for the FMA operation, and execute it directly on the target register and reading directly from
-                // memory for the first parameter. This skips initializing a SIMD register, and an extra copy.
-                // The code below should compile in the following assembly on .NET 5 x64:
-                //
-                // vmovsd xmm2, [rax]               ; load *(double*)bufferStart into xmm2 as [ab, _]
-                // vpermps ymm2, ymm1, ymm2         ; permute as a float YMM register to [a, a, a, a, b, b, b, b]
-                // vfmadd231ps ymm0, ymm2, [r8]     ; result256_0 = FMA(pixels, factors) + result256_0
-                //
-                // For tracking the codegen issue with FMA, see: https://github.com/dotnet/runtime/issues/12212.
-                // Additionally, we're also unrolling two computations per each loop iterations to leverage the
-                // fact that most CPUs have two ports to schedule multiply operations for FMA instructions.
-                result256_0 = Fma.MultiplyAdd(
-                    Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef),
-                    Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask),
-                    result256_0);
-
-                result256_1 = Fma.MultiplyAdd(
-                    Unsafe.As<Vector4, Vector256<float>>(ref Unsafe.Add(ref rowStartRef, 2)),
-                    Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)(bufferStart + 2)).AsSingle(), mask),
-                    result256_1);
-
-                bufferStart += 4;
-                rowStartRef = ref Unsafe.Add(ref rowStartRef, 4);
-            }
-
-            result256_0 = Avx.Add(result256_0, result256_1);
-
-            if ((this.Length & 3) >= 2)
-            {
-                result256_0 = Fma.MultiplyAdd(
-                    Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef),
-                    Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask),
-                    result256_0);
-
-                bufferStart += 2;
-                rowStartRef = ref Unsafe.Add(ref rowStartRef, 2);
-            }
-
-            Vector128<float> result128 = Sse.Add(result256_0.GetLower(), result256_0.GetUpper());
-
-            if ((this.Length & 1) != 0)
-            {
-                result128 = Fma.MultiplyAdd(
-                    Unsafe.As<Vector4, Vector128<float>>(ref rowStartRef),
-                    Vector128.Create(*bufferStart),
-                    result128);
-            }
-
-            return *(Vector4*)&result128;
+            if (Vector512.IsHardwareAccelerated)
+            {
+                float* bufferStart = this.bufferPtr;
+                ref Vector4 rowEndRef = ref Unsafe.Add(ref rowStartRef, this.Length & ~7);
+                Vector512<float> result512_0 = Vector512<float>.Zero;
+                Vector512<float> result512_1 = Vector512<float>.Zero;
+
+                while (Unsafe.IsAddressLessThan(ref rowStartRef, ref rowEndRef))
+                {
+                    Vector512<float> pixels512_0 = Unsafe.As<Vector4, Vector512<float>>(ref rowStartRef);
+                    Vector512<float> pixels512_1 = Unsafe.As<Vector4, Vector512<float>>(ref Unsafe.Add(ref rowStartRef, (nuint)4));
+
+                    result512_0 = Vector512Utilities.MultiplyAddEstimate(Vector512.Load(bufferStart), pixels512_0, result512_0);
+                    result512_1 = Vector512Utilities.MultiplyAddEstimate(Vector512.Load(bufferStart + 16), pixels512_1, result512_1);
+
+                    bufferStart += 32;
+                    rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)8);
+                }
+
+                result512_0 += result512_1;
+
+                if ((this.Length & 7) >= 4)
+                {
+                    Vector512<float> pixels512_0 = Unsafe.As<Vector4, Vector512<float>>(ref rowStartRef);
+                    result512_0 = Vector512Utilities.MultiplyAddEstimate(Vector512.Load(bufferStart), pixels512_0, result512_0);
+
+                    bufferStart += 16;
+                    rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)4);
+                }
+
+                Vector256<float> result256 = result512_0.GetLower() + result512_0.GetUpper();
+
+                if ((this.Length & 3) >= 2)
+                {
+                    Vector256<float> pixels256_0 = Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef);
+                    result256 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart), pixels256_0, result256);
+
+                    bufferStart += 8;
+                    rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)2);
+                }
+
+                Vector128<float> result128 = result256.GetLower() + result256.GetUpper();
+
+                if ((this.Length & 1) != 0)
+                {
+                    Vector128<float> pixels128 = Unsafe.As<Vector4, Vector128<float>>(ref rowStartRef);
+                    result128 = Vector128Utilities.MultiplyAddEstimate(Vector128.Load(bufferStart), pixels128, result128);
+                }
+
+                return *(Vector4*)&result128;
+            }
+            else
+            {
+                float* bufferStart = this.bufferPtr;
+                ref Vector4 rowEndRef = ref Unsafe.Add(ref rowStartRef, this.Length & ~3);
+                Vector256<float> result256_0 = Vector256<float>.Zero;
+                Vector256<float> result256_1 = Vector256<float>.Zero;
+
+                while (Unsafe.IsAddressLessThan(ref rowStartRef, ref rowEndRef))
+                {
+                    Vector256<float> pixels256_0 = Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef);
+                    Vector256<float> pixels256_1 = Unsafe.As<Vector4, Vector256<float>>(ref Unsafe.Add(ref rowStartRef, (nuint)2));
+
+                    result256_0 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart), pixels256_0, result256_0);
+                    result256_1 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart + 8), pixels256_1, result256_1);
+
+                    bufferStart += 16;
+                    rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)4);
+                }
+
+                result256_0 += result256_1;
+
+                if ((this.Length & 3) >= 2)
+                {
+                    Vector256<float> pixels256_0 = Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef);
+                    result256_0 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart), pixels256_0, result256_0);
+
+                    bufferStart += 8;
+                    rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)2);
+                }
+
+                Vector128<float> result128 = result256_0.GetLower() + result256_0.GetUpper();
+
+                if ((this.Length & 1) != 0)
+                {
+                    Vector128<float> pixels128 = Unsafe.As<Vector4, Vector128<float>>(ref rowStartRef);
+                    result128 = Vector128Utilities.MultiplyAddEstimate(Vector128.Load(bufferStart), pixels128, result128);
+                }
+
+                return *(Vector4*)&result128;
+            }
         }
         else
         {
@@ -149,7 +199,7 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef)
             result += rowStartRef * *bufferStart;
 
             bufferStart++;
-            rowStartRef = ref Unsafe.Add(ref rowStartRef, 1);
+            rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)1);
         }
 
         return result;
@@ -160,17 +210,32 @@
     /// Copy the contents of <see cref="ResizeKernel"/> altering <see cref="StartIndex"/>
     /// to the value <paramref name="left"/>.
     /// </summary>
+    /// <param name="left">The new value for <see cref="StartIndex"/>.</param>
     [MethodImpl(InliningOptions.ShortMethod)]
     internal ResizeKernel AlterLeftValue(int left) => new(left, this.bufferPtr, this.Length);
 
-    internal void Fill(Span<double> values)
+    internal void FillOrCopyAndExpand(Span<float> values)
     {
         DebugGuard.IsTrue(values.Length == this.Length, nameof(values), "ResizeKernel.Fill: values.Length != this.Length!");
 
-        for (int i = 0; i < this.Length; i++)
+        if (Vector256.IsHardwareAccelerated)
+        {
+            Vector4* bufferStart = (Vector4*)this.bufferPtr;
+            ref float valuesStart = ref MemoryMarshal.GetReference(values);
+            ref float valuesEnd = ref Unsafe.Add(ref valuesStart, values.Length);
+
+            while (Unsafe.IsAddressLessThan(ref valuesStart, ref valuesEnd))
+            {
+                *bufferStart = new Vector4(valuesStart);
+
+                bufferStart++;
+                valuesStart = ref Unsafe.Add(ref valuesStart, (nuint)1);
+            }
+        }
+        else
         {
-            this.Values[i] = (float)values[i];
+            values.CopyTo(this.Values);
         }
     }
 }
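A note on the layout FillOrCopyAndExpand produces (an illustration, not code from the patch): on the vectorized path each scalar weight is broadcast into a full Vector4 slot, so one wide load pairs consecutive RGBA pixels with their weights and the old PermuteVar8x32 shuffle is no longer needed:

    // Weights as computed by the kernel map:  w0, w1, w2
    // Buffer after FillOrCopyAndExpand (4x):  w0 w0 w0 w0 | w1 w1 w1 w1 | w2 w2 w2 w2
    // One Vector256<float> load of factors:   [w0 w0 w0 w0 w1 w1 w1 w1]
    // Matching load of two Vector4 pixels:    [R0 G0 B0 A0 R1 G1 B1 A1]

This is also why Values reports Length * 4 floats when Vector256.IsHardwareAccelerated, and why ResizeKernelMap below allocates MaxDiameter * 4 columns per buffer row.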
diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.PeriodicKernelMap.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.PeriodicKernelMap.cs
index ee1ada43ad..b39f6de2a5 100644
--- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.PeriodicKernelMap.cs
+++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.PeriodicKernelMap.cs
@@ -54,7 +54,7 @@ protected internal override void Initialize<TResampler>(in TResampler sampler)
         int bottomStartDest = this.DestinationLength - this.cornerInterval;
         for (int i = startOfFirstRepeatedMosaic; i < bottomStartDest; i++)
         {
-            double center = ((i + .5) * this.ratio) - .5;
+            float center = (float)(((i + .5) * this.ratio) - .5);
             int left = (int)TolerantMath.Ceiling(center - this.radius);
             ResizeKernel kernel = this.kernels[i - this.period];
             this.kernels[i] = kernel.AlterLeftValue(left);
diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.cs
index c1907bb520..b52054d553 100644
--- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.cs
+++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.cs
@@ -33,7 +33,7 @@ internal partial class ResizeKernelMap : IDisposable
     private bool isDisposed;
 
     // To avoid both GC allocations, and MemoryAllocator ceremony:
-    private readonly double[] tempValues;
+    private readonly float[] tempValues;
 
     private ResizeKernelMap(
         MemoryAllocator memoryAllocator,
@@ -50,10 +50,19 @@ private ResizeKernelMap(
         this.sourceLength = sourceLength;
         this.DestinationLength = destinationLength;
         this.MaxDiameter = (radius * 2) + 1;
-        this.data = memoryAllocator.Allocate2D<float>(this.MaxDiameter, bufferHeight, preferContiguosImageBuffers: true, AllocationOptions.Clean);
+
+        if (ResizeKernel.SupportsVectorization)
+        {
+            this.data = memoryAllocator.Allocate2D<float>(this.MaxDiameter * 4, bufferHeight, preferContiguosImageBuffers: true);
+        }
+        else
+        {
+            this.data = memoryAllocator.Allocate2D<float>(this.MaxDiameter, bufferHeight, preferContiguosImageBuffers: true);
+        }
+
         this.pinHandle = this.data.DangerousGetSingleMemory().Pin();
         this.kernels = new ResizeKernel[destinationLength];
-        this.tempValues = new double[this.MaxDiameter];
+        this.tempValues = new float[this.MaxDiameter];
     }
 
@@ -155,23 +164,23 @@ public static ResizeKernelMap Calculate<TResampler>(
         bool hasAtLeast2Periods = 2 * (cornerInterval + period) < destinationSize;
 
         ResizeKernelMap result = hasAtLeast2Periods
-                ? new PeriodicKernelMap(
-                    memoryAllocator,
-                    sourceSize,
-                    destinationSize,
-                    ratio,
-                    scale,
-                    radius,
-                    period,
-                    cornerInterval)
-                : new ResizeKernelMap(
-                    memoryAllocator,
-                    sourceSize,
-                    destinationSize,
-                    destinationSize,
-                    ratio,
-                    scale,
-                    radius);
+            ? new PeriodicKernelMap(
+                memoryAllocator,
+                sourceSize,
+                destinationSize,
+                ratio,
+                scale,
+                radius,
+                period,
+                cornerInterval)
+            : new ResizeKernelMap(
+                memoryAllocator,
+                sourceSize,
+                destinationSize,
+                destinationSize,
+                ratio,
+                scale,
+                radius);
 
         result.Initialize(in sampler);
 
@@ -198,7 +207,8 @@ protected internal virtual void Initialize<TResampler>(in TResampler sampler)
     private ResizeKernel BuildKernel<TResampler>(in TResampler sampler, int destRowIndex, int dataRowIndex)
         where TResampler : struct, IResampler
     {
-        double center = ((destRowIndex + .5) * this.ratio) - .5;
+        float center = (float)(((destRowIndex + .5) * this.ratio) - .5);
+        float scale = (float)this.scale;
 
         // Keep inside bounds.
         int left = (int)TolerantMath.Ceiling(center - this.radius);
@@ -214,30 +224,25 @@ private ResizeKernel BuildKernel<TResampler>(in TResampler sampler, int destRowIndex, int dataRowIndex)
         }
 
         ResizeKernel kernel = this.CreateKernel(dataRowIndex, left, right);
-
-        Span<double> kernelValues = this.tempValues.AsSpan(0, kernel.Length);
-        double sum = 0;
+        Span<float> kernelValues = this.tempValues.AsSpan(0, kernel.Length);
+        ref float kernelStart = ref MemoryMarshal.GetReference(kernelValues);
+        float sum = 0;
 
         for (int j = left; j <= right; j++)
        {
-            double value = sampler.GetValue((float)((j - center) / this.scale));
+            float value = sampler.GetValue((j - center) / scale);
             sum += value;
-
-            kernelValues[j - left] = value;
+            kernelStart = value;
+            kernelStart = ref Unsafe.Add(ref kernelStart, 1);
         }
 
         // Normalize, best to do it here rather than in the pixel loop later on.
         if (sum > 0)
         {
-            for (int j = 0; j < kernel.Length; j++)
-            {
-                // weights[w] = weights[w] / sum:
-                ref double kRef = ref kernelValues[j];
-                kRef /= sum;
-            }
+            Numerics.Normalize(kernelValues, sum);
         }
 
-        kernel.Fill(kernelValues);
+        kernel.FillOrCopyAndExpand(kernelValues);
 
         return kernel;
     }
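A worked example of the arithmetic in BuildKernel (hypothetical numbers, assuming a radius-1 triangle sampler): downscaling a 100-pixel row to 25 gives ratio = 4 and scale = 4, so radius = ceil(4 * 1) = 4, and for destination index 1:

    float center = (float)(((1 + .5) * 4.0) - .5); // 5.5
    int left = (int)Math.Ceiling(center - 4.0);    // 2 (the real code uses TolerantMath.Ceiling)
    int right = (int)Math.Floor(center + 4.0);     // 9 (the matching floor, clamped to the source bounds)
    // Weights are sampler.GetValue((j - 5.5F) / 4F) for j = 2..9; Numerics.Normalize then
    // rescales them by their sum so the kernel integrates to 1.

Note that center is still computed in double precision and only then narrowed, so the float migration cannot shift which source pixels a kernel covers.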
diff --git a/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.ReferenceKernelMap.cs b/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.ReferenceKernelMap.cs
index 290a3b37ac..72142cbdc3 100644
--- a/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.ReferenceKernelMap.cs
+++ b/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.ReferenceKernelMap.cs
@@ -16,9 +16,7 @@ internal class ReferenceKernelMap
     private readonly ReferenceKernel[] kernels;
 
     public ReferenceKernelMap(ReferenceKernel[] kernels)
-    {
-        this.kernels = kernels;
-    }
+        => this.kernels = kernels;
 
     public int DestinationSize => this.kernels.Length;
 
@@ -28,22 +26,23 @@ public static ReferenceKernelMap Calculate<TResampler>(in TResampler sampler, int destinationSize, int sourceSize)
         where TResampler : struct, IResampler
     {
         double ratio = (double)sourceSize / destinationSize;
-        double scale = ratio;
+        double scaleD = ratio;
 
-        if (scale < 1F)
+        if (scaleD < 1)
         {
-            scale = 1F;
+            scaleD = 1;
         }
 
         TolerantMath tolerantMath = TolerantMath.Default;
-        double radius = tolerantMath.Ceiling(scale * sampler.Radius);
+        double radius = tolerantMath.Ceiling(scaleD * sampler.Radius);
 
-        var result = new List<ReferenceKernel>();
+        List<ReferenceKernel> result = [];
+        float scale = (float)scaleD;
 
         for (int i = 0; i < destinationSize; i++)
         {
-            double center = ((i + .5) * ratio) - .5;
+            float center = (float)(((i + .5) * ratio) - .5);
 
             // Keep inside bounds.
             int left = (int)tolerantMath.Ceiling(center - radius);
@@ -58,15 +57,14 @@ public static ReferenceKernelMap Calculate<TResampler>(in TResampler sampler, int destinationSize, int sourceSize)
                 right = sourceSize - 1;
             }
 
-            double sum = 0;
+            float sum = 0;
 
-            double[] values = new double[right - left + 1];
+            float[] values = new float[right - left + 1];
 
             for (int j = left; j <= right; j++)
             {
-                double weight = sampler.GetValue((float)((j - center) / scale));
+                float weight = sampler.GetValue((j - center) / scale);
                 sum += weight;
-
                 values[j - left] = weight;
             }
 
@@ -78,16 +76,14 @@ public static ReferenceKernelMap Calculate<TResampler>(in TResampler sampler, int destinationSize, int sourceSize)
                 }
             }
 
-            float[] floatVals = values.Select(v => (float)v).ToArray();
-
-            result.Add(new ReferenceKernel(left, floatVals));
+            result.Add(new ReferenceKernel(left, values));
         }
 
-        return new ReferenceKernelMap(result.ToArray());
+        return new ReferenceKernelMap([.. result]);
     }
 }
 
-internal struct ReferenceKernel
+internal readonly struct ReferenceKernel
 {
     public ReferenceKernel(int left, float[] values)
     {
@@ -102,8 +98,6 @@ public ReferenceKernel(int left, float[] values)
     public int Length => this.Values.Length;
 
     public static implicit operator ReferenceKernel(ResizeKernel orig)
-    {
-        return new ReferenceKernel(orig.StartIndex, orig.Values.ToArray());
-    }
+        => new(orig.StartIndex, orig.Values.ToArray());
 }
diff --git a/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.cs b/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.cs
index c6da46ee2f..6d0de65c42 100644
--- a/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.cs
+++ b/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.cs
@@ -124,7 +124,6 @@ private void VerifyKernelMapContentIsCorrect<TResampler>(TResampler resampler, int srcSize, int destSize)
         this.Output.WriteLine($"Expected KernelMap:\n{PrintKernelMap(referenceMap)}\n");
         this.Output.WriteLine($"Actual KernelMap:\n{PrintKernelMap(kernelMap)}\n");
 #endif
-        var comparer = new ApproximateFloatComparer(1e-6f);
 
         for (int i = 0; i < kernelMap.DestinationLength; i++)
         {
@@ -139,7 +138,28 @@ private void VerifyKernelMapContentIsCorrect<TResampler>(TResampler resampler, int srcSize, int destSize)
                 referenceKernel.Left == kernel.StartIndex,
                 $"referenceKernel.Left != kernel.Left: {referenceKernel.Left} != {kernel.StartIndex}");
             float[] expectedValues = referenceKernel.Values;
-            Span<float> actualValues = kernel.Values;
+            Span<float> actualValues;
+
+            ApproximateFloatComparer comparer;
+            if (ResizeKernel.SupportsVectorization)
+            {
+                comparer = new ApproximateFloatComparer(1e-4f);
+
+                Assert.Equal(expectedValues.Length, kernel.Values.Length / 4);
+
+                actualValues = new float[expectedValues.Length];
+
+                for (int j = 0; j < expectedValues.Length; j++)
+                {
+                    actualValues[j] = kernel.Values[j * 4];
+                }
+            }
+            else
+            {
+                comparer = new ApproximateFloatComparer(1e-6f);
+                actualValues = kernel.Values;
+            }
 
             Assert.Equal(expectedValues.Length, actualValues.Length);