From 5c34191f2c4db1463c6c3f6cb79aa91ac660b1fc Mon Sep 17 00:00:00 2001 From: Marcel Koester Date: Fri, 8 Sep 2023 09:32:04 +0200 Subject: [PATCH 01/12] Updated VelocityEntryPointHandler to accept chunk sizes. --- .../Runtime/Velocity/VelocityEntryPointHandler.cs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Src/ILGPU/Runtime/Velocity/VelocityEntryPointHandler.cs b/Src/ILGPU/Runtime/Velocity/VelocityEntryPointHandler.cs index 32a4ed1f1..c74345643 100644 --- a/Src/ILGPU/Runtime/Velocity/VelocityEntryPointHandler.cs +++ b/Src/ILGPU/Runtime/Velocity/VelocityEntryPointHandler.cs @@ -17,9 +17,17 @@ namespace ILGPU.Runtime.Velocity /// Represents a single velocity kernel processing delegate. /// /// The main group context. + /// The main group dimension. + /// The main grid dimension. + /// The global start index (inclusive). + /// The global end index (exclusive). /// The current parameters. delegate void VelocityEntryPointHandler( VelocityGroupExecutionContext groupContext, + int groupDim, + int gridDim, + long startIndex, + long endIndex, VelocityParameters parameters); /// @@ -35,6 +43,10 @@ static class VelocityEntryPointHandlerHelper public static readonly Type[] EntryPointParameterTypes = new Type[] { typeof(VelocityGroupExecutionContext), + typeof(int), + typeof(int), + typeof(long), + typeof(long), typeof(VelocityParameters), }; } From 62bbe32cbcc65b98f48bd98cff47fbc66441c0bf Mon Sep 17 00:00:00 2001 From: Marcel Koester Date: Fri, 8 Sep 2023 09:32:04 +0200 Subject: [PATCH 02/12] Updated VelocityCodeGenerator to store additional parameter references. --- .../Velocity/VelocityCodeGenerator.cs | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.cs b/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.cs index 18adce758..382d419ba 100644 --- a/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.cs +++ b/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.cs @@ -39,15 +39,30 @@ static class VelocityCodeGenerator /// public const int ExecutionContextIndex = 0; + /// + /// The parameter index of the current global index. + /// + public const int GlobalIndexScalar = 1; + + /// + /// The parameter index of the current group dimension. + /// + public const int GroupDimIndexScalar = 2; + /// + /// + /// The parameter index of the current grid dimension. + /// + public const int GridDimIndexScalar = 3; + /// /// The parameter index of all masks. /// - public const int MaskParameterIndex = 1; + public const int MaskParameterIndex = 4; /// /// The method parameter offset for all parameters. /// - public const int MethodParameterOffset = 2; + public const int MethodParameterOffset = 5; #endregion } From 70d92f0cc8ba7ae413ca08b77cd7b97f39f150e7 Mon Sep 17 00:00:00 2001 From: Marcel Koester Date: Fri, 8 Sep 2023 09:32:05 +0200 Subject: [PATCH 03/12] Updated VelocityCodeGenerator.Values to pass threading information to other methods. --- .../Velocity/VelocityCodeGenerator.Values.cs | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.Values.cs b/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.Values.cs index 50c3e0a2a..22b24cc72 100644 --- a/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.Values.cs +++ b/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.Values.cs @@ -22,12 +22,36 @@ namespace ILGPU.Backends.Velocity { partial class VelocityCodeGenerator { + /// + /// Loads the current global index. + /// + protected abstract void LoadGlobalIndexScalar(); + + /// + /// Loads the current group dimension. + /// + protected abstract void LoadGroupDimScalar(); + + /// + /// Loads the current grid dimension. + /// + protected abstract void LoadGridDimScalar(); + /// public void GenerateCode(MethodCall methodCall) { // Load the execution context Emitter.Emit(OpCodes.Ldarg_0); + // Load the global index + LoadGlobalIndexScalar(); + + // Load the group dimension + LoadGroupDimScalar(); + + // Load the grid dimension + LoadGridDimScalar(); + // Load the current execution mask Emitter.Emit(LocalOperation.Load, GetBlockMask(methodCall.BasicBlock)); From 10f0f9235088280910dc8019d3a003cf6728a533 Mon Sep 17 00:00:00 2001 From: Marcel Koester Date: Fri, 8 Sep 2023 09:32:05 +0200 Subject: [PATCH 04/12] Updated VelocityCodeGenerator.Threads to load thread values from method parameters. --- .../Backends/Velocity/VelocityCodeGenerator.Threads.cs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.Threads.cs b/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.Threads.cs index 58c43e4f4..7f3aa854a 100644 --- a/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.Threads.cs +++ b/Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.Threads.cs @@ -65,8 +65,10 @@ public void GenerateCode(GridIndexValue value) switch (value.Dimension) { case DeviceConstantDimension3D.X: - // Load the first context argument and query the grid index - VelocityTargetSpecializer.GetGridIndex(Emitter); + // Load global index and compute the actual grid index + LoadGlobalIndexScalar(); + LoadGroupDimScalar(); + Emitter.Emit(OpCodes.Div); break; case DeviceConstantDimension3D.Y: case DeviceConstantDimension3D.Z: @@ -100,7 +102,7 @@ public void GenerateCode(GridDimensionValue value) switch (value.Dimension) { case DeviceConstantDimension3D.X: - VelocityTargetSpecializer.GetGridDim(Emitter); + LoadGridDimScalar(); break; case DeviceConstantDimension3D.Y: case DeviceConstantDimension3D.Z: @@ -117,7 +119,7 @@ public void GenerateCode(GroupDimensionValue value) switch (value.Dimension) { case DeviceConstantDimension3D.X: - VelocityTargetSpecializer.GetGroupDim(Emitter); + LoadGroupDimScalar(); break; case DeviceConstantDimension3D.Y: case DeviceConstantDimension3D.Z: From c9349b4e77fa829b96ccaec6730c44abc913c109 Mon Sep 17 00:00:00 2001 From: Marcel Koester Date: Fri, 8 Sep 2023 09:32:05 +0200 Subject: [PATCH 05/12] Improved VelocityBackend to perform additional cleanup specification. --- Src/ILGPU/Backends/Velocity/VelocityBackend.cs | 1 + 1 file changed, 1 insertion(+) diff --git a/Src/ILGPU/Backends/Velocity/VelocityBackend.cs b/Src/ILGPU/Backends/Velocity/VelocityBackend.cs index 5fb22a55d..16fa7163c 100644 --- a/Src/ILGPU/Backends/Velocity/VelocityBackend.cs +++ b/Src/ILGPU/Backends/Velocity/VelocityBackend.cs @@ -96,6 +96,7 @@ public VelocityBackend( // Transform all if and switch branches to make them compatible with // the internal vectorization engine transformerBuilder.Add(new VelocityBlockScheduling()); + transformerBuilder.Add(new DeadCodeElimination()); builder.Add(transformerBuilder.ToTransformer()); }); From 412187e63358a92ff3f026a3fc86d7117b867fdd Mon Sep 17 00:00:00 2001 From: Marcel Koester Date: Fri, 8 Sep 2023 09:32:05 +0200 Subject: [PATCH 06/12] Updated VelocityFunctionGenerator to load thread information from parameters. --- .../Velocity/VelocityFunctionGenerator.cs | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/Src/ILGPU/Backends/Velocity/VelocityFunctionGenerator.cs b/Src/ILGPU/Backends/Velocity/VelocityFunctionGenerator.cs index 96236bc98..83b3686c1 100644 --- a/Src/ILGPU/Backends/Velocity/VelocityFunctionGenerator.cs +++ b/Src/ILGPU/Backends/Velocity/VelocityFunctionGenerator.cs @@ -62,6 +62,30 @@ public VelocityFunctionGenerator( targetMaskCount = Emitter.DeclareLocal(typeof(int)); } + /// + /// Loads the current global index. + /// + protected override void LoadGlobalIndexScalar() => + Emitter.Emit( + ArgumentOperation.Load, + VelocityCodeGenerator.GlobalIndexScalar); + + /// + /// Loads the current group dimension. + /// + protected override void LoadGroupDimScalar() => + Emitter.Emit( + ArgumentOperation.Load, + VelocityCodeGenerator.GroupDimIndexScalar); + + /// + /// Loads the current grid dimension. + /// + protected override void LoadGridDimScalar() => + Emitter.Emit( + ArgumentOperation.Load, + VelocityCodeGenerator.GridDimIndexScalar); + /// /// Generates Velocity code for this function. /// From ed2a38bc54bad44da728243da59232721682ea91 Mon Sep 17 00:00:00 2001 From: Marcel Koester Date: Fri, 8 Sep 2023 09:32:05 +0200 Subject: [PATCH 07/12] Adapted VelocityGenerationModule to generate non-kernel functions with additional parameters. --- Src/ILGPU/Backends/Velocity/VelocityGenerationModule.cs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Src/ILGPU/Backends/Velocity/VelocityGenerationModule.cs b/Src/ILGPU/Backends/Velocity/VelocityGenerationModule.cs index 8c0d687ed..a0f8de927 100644 --- a/Src/ILGPU/Backends/Velocity/VelocityGenerationModule.cs +++ b/Src/ILGPU/Backends/Velocity/VelocityGenerationModule.cs @@ -435,6 +435,9 @@ private void DeclareMethod( // Convert all parameter types parameterTypes[VelocityCodeGenerator.ExecutionContextIndex] = typeof(VelocityGroupExecutionContext); + parameterTypes[VelocityCodeGenerator.GlobalIndexScalar] = typeof(int); + parameterTypes[VelocityCodeGenerator.GroupDimIndexScalar] = typeof(int); + parameterTypes[VelocityCodeGenerator.GridDimIndexScalar] = typeof(int); parameterTypes[VelocityCodeGenerator.MaskParameterIndex] = specializer.WarpType32; for (int i = 0; i < method.NumParameters; ++i) From 304e4bba67359c23bead9a8bf6386363bee52a87 Mon Sep 17 00:00:00 2001 From: Marcel Koester Date: Fri, 8 Sep 2023 09:32:05 +0200 Subject: [PATCH 08/12] Removed thread-value properties from VelocityGroupExecutionContext. --- .../Velocity/VelocityGroupExecutionContext.cs | 49 ++----------------- 1 file changed, 3 insertions(+), 46 deletions(-) diff --git a/Src/ILGPU/Runtime/Velocity/VelocityGroupExecutionContext.cs b/Src/ILGPU/Runtime/Velocity/VelocityGroupExecutionContext.cs index 87421c00d..9de1e2530 100644 --- a/Src/ILGPU/Runtime/Velocity/VelocityGroupExecutionContext.cs +++ b/Src/ILGPU/Runtime/Velocity/VelocityGroupExecutionContext.cs @@ -37,62 +37,19 @@ public VelocityGroupExecutionContext(VelocityAccelerator accelerator) warpSize = accelerator.WarpSize; } - /// - /// Returns the current grid index. - /// - public int GridIdx { get; private set; } - - /// - /// Returns the current group dimension. - /// - public int GroupDim { get; private set; } - - /// - /// Returns the current grid dimension. - /// - public int GridDim { get; private set; } - - /// - /// Returns the user-specific total size. - /// - public int UserSize { get; private set; } - /// /// Returns a view to dynamic shared memory (if any). /// public ArrayView DynamicSharedMemory { get; private set; } - /// - /// Returns the linear group . - /// - public int GroupOffset => GridIdx * GroupDim; - - /// - /// Resets this execution context. - /// - private void Reset() - { - sharedMemoryPool.Reset(); - localMemoryPool.Reset(); - } - /// /// Sets up the current thread grid information for the current thread group. /// - public void SetupThreadGrid( - int gridIdx, - int groupDim, - int gridDim, - int userSize, - int dynamicSharedMemoryLength) + public void SetupThreadGrid(int dynamicSharedMemoryLength) { - GridIdx = gridIdx; - GridDim = gridDim; - GroupDim = groupDim; - UserSize = userSize; - // Reset everything - Reset(); + sharedMemoryPool.Reset(); + localMemoryPool.Reset(); // Allocate dynamic shared memory if (dynamicSharedMemoryLength > 0) From 858c41df5e524420cb40b7b7e708a23bf6ed8999 Mon Sep 17 00:00:00 2001 From: Marcel Koester Date: Fri, 8 Sep 2023 09:32:05 +0200 Subject: [PATCH 09/12] Extended VelocityKernelFunctionGenerator to emit kernel-intrinsic operation loops over processing chunks. --- .../VelocityKernelFunctionGenerator.cs | 133 +++++++++++++----- 1 file changed, 95 insertions(+), 38 deletions(-) diff --git a/Src/ILGPU/Backends/Velocity/VelocityKernelFunctionGenerator.cs b/Src/ILGPU/Backends/Velocity/VelocityKernelFunctionGenerator.cs index d950e2c6e..80e9efacc 100644 --- a/Src/ILGPU/Backends/Velocity/VelocityKernelFunctionGenerator.cs +++ b/Src/ILGPU/Backends/Velocity/VelocityKernelFunctionGenerator.cs @@ -29,12 +29,17 @@ sealed class VelocityKernelFunctionGenerator : { #region Constants - public const int GlobalParametersIndex = 1; + public const int GlobalGroupDimIndex = 1; + public const int GlobalGridDimIndex = 2; + public const int GlobalStartIndex = 3; + public const int GlobalEndIndex = 4; + public const int GlobalParametersIndex = 5; #endregion - private readonly ILLabel exitMarker; + private readonly ILLabel localExitMarker; private readonly ILLocal targetMaskCount; + private readonly ILLocal globalIndex; /// /// Creates a new Velocity kernel generator. @@ -52,11 +57,14 @@ public VelocityKernelFunctionGenerator( ParametersType = args.Module.ParametersType; // Generate an exit marker to jump to when the kernel function returns - exitMarker = Emitter.DeclareLabel(); + localExitMarker = Emitter.DeclareLabel(); // We use this counter to remember the number of active threads that entered // the kernel successfully targetMaskCount = Emitter.DeclareLocal(typeof(int)); + + // Declare our global thread index local + globalIndex = Emitter.DeclareLocal(typeof(int)); } /// @@ -69,6 +77,24 @@ public VelocityKernelFunctionGenerator( /// public Type ParametersType { get; } + /// + /// Loads the current global index. + /// + protected override void LoadGlobalIndexScalar() => + Emitter.Emit(LocalOperation.Load, globalIndex); + + /// + /// Loads the current group dimension. + /// + protected override void LoadGroupDimScalar() => + Emitter.Emit(ArgumentOperation.Load, GlobalGroupDimIndex); + + /// + /// Loads the current grid dimension. + /// + protected override void LoadGridDimScalar() => + Emitter.Emit(ArgumentOperation.Load, GlobalGridDimIndex); + /// /// Generates Velocity code for this kernel. /// @@ -98,55 +124,86 @@ public override void GenerateCode() Alias(Method.Parameters[i], parameterLocal); } - // Bind the current implicitly grouped kernel index (if any) - var offsetVector = Emitter.DeclareLocal(Specializer.WarpType32); - if (EntryPoint.IsImplicitlyGrouped) - Alias(Method.Parameters[0], offsetVector); - - // Store the current global index - VelocityTargetSpecializer.ComputeGlobalBaseIndex(Emitter); - Specializer.ConvertScalarTo32(Emitter, VelocityWarpOperationMode.I); - Specializer.LoadLaneIndexVector32(Emitter); - Specializer.BinaryOperation32( - Emitter, - BinaryArithmeticKind.Add, - VelocityWarpOperationMode.I); - Emitter.Emit(LocalOperation.Store, offsetVector); + // Declare a local index counter and initialize it with the start index + Emitter.Emit(ArgumentOperation.Load, GlobalStartIndex); + Emitter.Emit(LocalOperation.Store, globalIndex); // Setup the current main kernel mask based on the current group size + var baseGroupMask = Emitter.DeclareLocal(Specializer.WarpType32); Specializer.LoadLaneIndexVector32(Emitter); - VelocityTargetSpecializer.GetGroupDim(Emitter); + Emitter.Emit(ArgumentOperation.Load, GlobalGroupDimIndex); Specializer.ConvertScalarTo32(Emitter, VelocityWarpOperationMode.I); Specializer.Compare32( Emitter, CompareKind.LessThan, VelocityWarpOperationMode.I); + Emitter.Emit(LocalOperation.Store, baseGroupMask); - // Adjust the current main kernel mask based on the user grid size - Emitter.Emit(LocalOperation.Load, offsetVector); - VelocityTargetSpecializer.GetUserSize(Emitter); - Specializer.ConvertScalarTo32(Emitter, VelocityWarpOperationMode.I); - Specializer.Compare32( - Emitter, - CompareKind.LessThan, - VelocityWarpOperationMode.I); - Specializer.IntersectMask32(Emitter); + // Build our execution loop header + var iterationHeader = Emitter.DeclareLabel(); + Emitter.MarkLabel(iterationHeader); - var entryPointMask = GetBlockMask(Method.EntryBlock); - Emitter.Emit(OpCodes.Dup); - Emitter.Emit(LocalOperation.Store, entryPointMask); + // Compare our global index against the exclusive end index + var exitMarker = Emitter.DeclareLabel(); + Emitter.Emit(LocalOperation.Load, globalIndex); + Emitter.Emit(ArgumentOperation.Load, GlobalEndIndex); + Emitter.Emit(OpCodes.Bge, exitMarker); - // Determine the target mask count - Specializer.GetNumberOfActiveLanes(Emitter); - Emitter.Emit(LocalOperation.Store, targetMaskCount); + // Build our execution body + { + // Bind the current implicitly grouped kernel index (if any) + var offsetVector = Emitter.DeclareLocal(Specializer.WarpType32); + if (EntryPoint.IsImplicitlyGrouped) + Alias(Method.Parameters[0], offsetVector); + + // Compute the current global index + Emitter.Emit(LocalOperation.Load, globalIndex); + Specializer.ConvertScalarTo32(Emitter, VelocityWarpOperationMode.I); + Specializer.LoadLaneIndexVector32(Emitter); + Specializer.BinaryOperation32( + Emitter, + BinaryArithmeticKind.Add, + VelocityWarpOperationMode.I); + Emitter.Emit(LocalOperation.Store, offsetVector); + + // Adjust the current main kernel mask based on the user grid size + Emitter.Emit(LocalOperation.Load, offsetVector); + Emitter.Emit(ArgumentOperation.Load, GlobalEndIndex); + Specializer.ConvertScalarTo32(Emitter, VelocityWarpOperationMode.I); + Specializer.Compare32( + Emitter, + CompareKind.LessThan, + VelocityWarpOperationMode.I); + Emitter.Emit(LocalOperation.Load, baseGroupMask); + Specializer.IntersectMask32(Emitter); + + var entryPointMask = GetBlockMask(Method.EntryBlock); + Emitter.Emit(OpCodes.Dup); + Emitter.Emit(LocalOperation.Store, entryPointMask); + + // Determine the target mask count + Specializer.GetNumberOfActiveLanes(Emitter); + Emitter.Emit(LocalOperation.Store, targetMaskCount); + + // Emit the actual kernel code + GenerateCodeInternal(); + + // Emit the exit marker + Emitter.MarkLabel(localExitMarker); + } - // Emit the actual kernel code - GenerateCodeInternal(); + // Increase the processing index + Emitter.Emit(LocalOperation.Load, globalIndex); + Emitter.Emit(ArgumentOperation.Load, GlobalGroupDimIndex); + Emitter.Emit(OpCodes.Conv_I8); + Emitter.Emit(OpCodes.Add); + Emitter.Emit(LocalOperation.Store, globalIndex); - // Emit the exit marker - Emitter.MarkLabel(exitMarker); + // Branch to the loop header + Emitter.Emit(OpCodes.Br, iterationHeader); // Return + Emitter.MarkLabel(exitMarker); Emitter.Emit(OpCodes.Ret); } @@ -161,7 +218,7 @@ public override void GenerateCode(ReturnTerminator returnTerminator) GetBlockMask(returnTerminator.BasicBlock)); Specializer.GetNumberOfActiveLanes(Emitter); Emitter.Emit(LocalOperation.Load, targetMaskCount); - Emitter.Emit(OpCodes.Beq, exitMarker); + Emitter.Emit(OpCodes.Beq, localExitMarker); } } } From b2020813d8cfab0549b3ec1bfa59d9383e8a12e1 Mon Sep 17 00:00:00 2001 From: Marcel Koester Date: Fri, 8 Sep 2023 09:32:06 +0200 Subject: [PATCH 10/12] Removed obsolete methods from VelocityTargetSpecializer. --- .../Velocity/VelocityTargetSpecializer.cs | 75 ------------------- 1 file changed, 75 deletions(-) diff --git a/Src/ILGPU/Backends/Velocity/VelocityTargetSpecializer.cs b/Src/ILGPU/Backends/Velocity/VelocityTargetSpecializer.cs index 23fab7e9f..e8cd73e95 100644 --- a/Src/ILGPU/Backends/Velocity/VelocityTargetSpecializer.cs +++ b/Src/ILGPU/Backends/Velocity/VelocityTargetSpecializer.cs @@ -35,14 +35,6 @@ internal static MethodInfo GetMethod(string name) => private static readonly MethodInfo MemoryBarrierMethod = GetMethod(nameof(MemoryBarrier)); - private static readonly MethodInfo GetGridIndexMethod = - GetMethod(nameof(GetGridIndexImpl)); - private static readonly MethodInfo GetGridDimMethod = - GetMethod(nameof(GetGridDimImpl)); - private static readonly MethodInfo GetGroupDimMethod = - GetMethod(nameof(GetGroupDimImpl)); - private static readonly MethodInfo GetUserSizeMethod = - GetMethod(nameof(GetUserSizeImpl)); private static readonly MethodInfo GetDynamicSharedMemoryMethod = GetMethod( @@ -54,8 +46,6 @@ internal static MethodInfo GetMethod(string name) => GetMethod(nameof(GetSharedMemoryFromPoolImpl)); private static readonly MethodInfo GetLocalMemoryFromPoolMethod = GetMethod(nameof(GetLocalMemoryFromPoolImpl)); - private static readonly MethodInfo ComputeGlobalBaseIndexMethod = - GetMethod(nameof(ComputeGlobalBaseIndexImpl)); private static readonly MethodInfo DebuggerBreakMethod = GetMethod(nameof(DebuggerBreakImpl)); @@ -64,36 +54,6 @@ internal static MethodInfo GetMethod(string name) => /// internal static void MemoryBarrier() => Interlocked.MemoryBarrier(); - /// - /// Wrapper around a group extension context. - /// - internal static int GetGridIndexImpl(VelocityGroupExecutionContext context) => - context.GridIdx; - - /// - /// Wrapper around a group extension context. - /// - internal static int GetGridDimImpl(VelocityGroupExecutionContext context) => - context.GridDim; - - /// - /// Wrapper around a group extension context. - /// - internal static int GetGroupDimImpl(VelocityGroupExecutionContext context) => - context.GroupDim; - - /// - /// Wrapper around a group extension context. - /// - internal static int GetUserSizeImpl(VelocityGroupExecutionContext context) => - context.UserSize; - - /// - /// Wrapper around a group extension context. - /// - internal static int ComputeGlobalBaseIndexImpl( - VelocityGroupExecutionContext context) => context.GroupOffset; - /// /// Wrapper around a group extension context. /// @@ -557,34 +517,6 @@ public static void MemoryBarrier(TILEmitter emitter) where TILEmitter : struct, IILEmitter => emitter.EmitCall(MemoryBarrierMethod); - public static void GetGridIndex(TILEmitter emitter) - where TILEmitter : struct, IILEmitter - { - emitter.Emit(OpCodes.Ldarg_0); - emitter.EmitCall(GetGridIndexMethod); - } - - public static void GetGridDim(TILEmitter emitter) - where TILEmitter : struct, IILEmitter - { - emitter.Emit(OpCodes.Ldarg_0); - emitter.EmitCall(GetGridDimMethod); - } - - public static void GetUserSize(TILEmitter emitter) - where TILEmitter : struct, IILEmitter - { - emitter.Emit(OpCodes.Ldarg_0); - emitter.EmitCall(GetUserSizeMethod); - } - - public static void GetGroupDim(TILEmitter emitter) - where TILEmitter : struct, IILEmitter - { - emitter.Emit(OpCodes.Ldarg_0); - emitter.EmitCall(GetGroupDimMethod); - } - public void GetDynamicSharedMemory(TILEmitter emitter) where TILEmitter : struct, IILEmitter { @@ -640,13 +572,6 @@ public void GetUnifiedLocalMemoryFromPool( ConvertScalarTo64(emitter, VelocityWarpOperationMode.U); } - public static void ComputeGlobalBaseIndex(TILEmitter emitter) - where TILEmitter : struct, IILEmitter - { - emitter.Emit(OpCodes.Ldarg_0); - emitter.EmitCall(ComputeGlobalBaseIndexMethod); - } - public static void DebuggerBreak(TILEmitter emitter) where TILEmitter : struct, IILEmitter => emitter.EmitCall(DebuggerBreakMethod); From d55bce0ad35d7bcd204154905efab1784dda6078 Mon Sep 17 00:00:00 2001 From: Marcel Koester Date: Fri, 8 Sep 2023 09:32:06 +0200 Subject: [PATCH 11/12] Adapted VelocityAccelerator to split the processing workload into large chunks. --- .../Runtime/Velocity/VelocityAccelerator.cs | 39 ++++++++++++------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/Src/ILGPU/Runtime/Velocity/VelocityAccelerator.cs b/Src/ILGPU/Runtime/Velocity/VelocityAccelerator.cs index 450b35a73..a9b25410d 100644 --- a/Src/ILGPU/Runtime/Velocity/VelocityAccelerator.cs +++ b/Src/ILGPU/Runtime/Velocity/VelocityAccelerator.cs @@ -87,7 +87,12 @@ public void Initialize() { } /// /// Returns the parent user size. /// - public long UserSize { get; set; } + public int UserSize { get; set; } + + /// + /// Returns the chunk size of each processing thread. + /// + public int ChunkSize { get; set; } /// /// Returns the dynamic shared memory length in bytes. @@ -120,16 +125,17 @@ public void Process( ParallelLoopState? loopState, VelocityGroupExecutionContext intermediateState) { - intermediateState.SetupThreadGrid( - index, - GroupDim, - GridDim, - (int)UserSize, - DynamicSharedMemoryLengthInBytes); + intermediateState.SetupThreadGrid(DynamicSharedMemoryLengthInBytes); // Invoke the actual kernel + int startIndex = index * ChunkSize; + int endIndex = Math.Min(startIndex + ChunkSize, UserSize); EntryPointHandler.AsNotNull().Invoke( intermediateState, + GroupDim, + GridDim, + startIndex, + endIndex, Parameters.AsNotNull()); } @@ -178,11 +184,7 @@ internal VelocityAccelerator(Context context, VelocityDevice device) parallelOptions = new ParallelOptions() { -#if DEBUG - MaxDegreeOfParallelism = 1, -#else MaxDegreeOfParallelism = device.NumMultiprocessors, -#endif }; executionEngine = new ParallelExecutionEngine(this); @@ -247,15 +249,26 @@ internal void Run( // Setup engine properties executionEngine.GroupDim = groupSize; - executionEngine.UserSize = userKernelConfig.Size; + executionEngine.UserSize = userKernelConfig.GetIntSize(); executionEngine.GridDim = gridSize; executionEngine.DynamicSharedMemoryLengthInBytes = runtimeKernelConfig.SharedMemoryConfig.DynamicArraySize; executionEngine.EntryPointHandler = entryPointHandler; executionEngine.Parameters = velocityParameters; + // Compute chunk size + int chunkSize = IntrinsicMath.DivRoundUp(gridSize, NumMultiprocessors); + int paddedChunkSize = IntrinsicMath.DivRoundUp( + chunkSize, + groupSize) * groupSize; + executionEngine.ChunkSize = paddedChunkSize; + + // Compute num threads to launch + int totalSize = gridSize * groupSize; + int numChunks = IntrinsicMath.DivRoundUp(totalSize, paddedChunkSize); + // Launch all threads - executionEngine.ParallelFor(0, gridSize, parallelOptions); + executionEngine.ParallelFor(0, numChunks, parallelOptions); } finally { From 6da5bb707d94684d6f9de1bff763fdd4c8b54b2b Mon Sep 17 00:00:00 2001 From: Marcel Koester Date: Fri, 8 Sep 2023 09:32:06 +0200 Subject: [PATCH 12/12] Updated VelocityDevice to adapt the number of multiprocessors in debug builds. --- Src/ILGPU/Runtime/Velocity/VelocityDevice.cs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Src/ILGPU/Runtime/Velocity/VelocityDevice.cs b/Src/ILGPU/Runtime/Velocity/VelocityDevice.cs index e2793482e..567132954 100644 --- a/Src/ILGPU/Runtime/Velocity/VelocityDevice.cs +++ b/Src/ILGPU/Runtime/Velocity/VelocityDevice.cs @@ -67,7 +67,11 @@ public VelocityDevice(VelocityDeviceType deviceType) .AsNotNullCast(); WarpSize = TargetSpecializer.WarpSize; MaxNumThreadsPerGroup = MaxNumThreadsPerMultiprocessor = WarpSize; +#if DEBUG + NumMultiprocessors = 1; +#else NumMultiprocessors = Environment.ProcessorCount; +#endif MaxGroupSize = new Index3D( MaxNumThreadsPerGroup, 1,