Skip to content

Commit

Permalink
Improved performance of Velocity kernels by refining workload dispatc…
Browse files Browse the repository at this point in the history
…her. (#1085)

* Updated VelocityEntryPointHandler to accept chunk sizes.
* Updated VelocityCodeGenerator to store additional parameter references.
* Updated VelocityCodeGenerator.Values to pass threading information to other methods.
* Updated VelocityCodeGenerator.Threads to load thread values from method parameters.
* Improved VelocityBackend to perform an additional cleanup pass (dead-code elimination).
* Updated VelocityFunctionGenerator to load thread information from parameters.
* Adapted VelocityGenerationModule to generate non-kernel functions with additional parameters.
* Removed thread-value properties from VelocityGroupExecutionContext.
* Extended VelocityKernelFunctionGenerator to emit kernel-intrinsic operation loops over processing chunks.
* Removed obsolete methods from VelocityTargetSpecializer.
* Adapted VelocityAccelerator to split the processing workload into large chunks.
* Updated VelocityDevice to adapt the number of multiprocessors in debug builds.
  • Loading branch information
m4rs-mt authored Oct 10, 2023
2 parents ac71802 + 6da5bb7 commit 0f67f7d
Show file tree
Hide file tree
Showing 12 changed files with 215 additions and 178 deletions.
1 change: 1 addition & 0 deletions Src/ILGPU/Backends/Velocity/VelocityBackend.cs
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ public VelocityBackend(
// Transform all if and switch branches to make them compatible with
// the internal vectorization engine
transformerBuilder.Add(new VelocityBlockScheduling());
transformerBuilder.Add(new DeadCodeElimination());

builder.Add(transformerBuilder.ToTransformer());
});
Expand Down
10 changes: 6 additions & 4 deletions Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.Threads.cs
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,10 @@ public void GenerateCode(GridIndexValue value)
switch (value.Dimension)
{
case DeviceConstantDimension3D.X:
// Load the first context argument and query the grid index
VelocityTargetSpecializer.GetGridIndex(Emitter);
// Load global index and compute the actual grid index
LoadGlobalIndexScalar();
LoadGroupDimScalar();
Emitter.Emit(OpCodes.Div);
break;
case DeviceConstantDimension3D.Y:
case DeviceConstantDimension3D.Z:
Expand Down Expand Up @@ -100,7 +102,7 @@ public void GenerateCode(GridDimensionValue value)
switch (value.Dimension)
{
case DeviceConstantDimension3D.X:
VelocityTargetSpecializer.GetGridDim(Emitter);
LoadGridDimScalar();
break;
case DeviceConstantDimension3D.Y:
case DeviceConstantDimension3D.Z:
Expand All @@ -117,7 +119,7 @@ public void GenerateCode(GroupDimensionValue value)
switch (value.Dimension)
{
case DeviceConstantDimension3D.X:
VelocityTargetSpecializer.GetGroupDim(Emitter);
LoadGroupDimScalar();
break;
case DeviceConstantDimension3D.Y:
case DeviceConstantDimension3D.Z:
Expand Down
24 changes: 24 additions & 0 deletions Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.Values.cs
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,36 @@ namespace ILGPU.Backends.Velocity
{
partial class VelocityCodeGenerator<TILEmitter>
{
/// <summary>
/// Emits code that loads the current global index as a scalar value.
/// </summary>
/// <remarks>
/// The concrete source of the value is defined by the derived generator.
/// </remarks>
protected abstract void LoadGlobalIndexScalar();

/// <summary>
/// Emits code that loads the current group dimension as a scalar value.
/// </summary>
/// <remarks>
/// The concrete source of the value is defined by the derived generator.
/// </remarks>
protected abstract void LoadGroupDimScalar();

/// <summary>
/// Emits code that loads the current grid dimension as a scalar value.
/// </summary>
/// <remarks>
/// The concrete source of the value is defined by the derived generator.
/// </remarks>
protected abstract void LoadGridDimScalar();

/// <inheritdoc/>
public void GenerateCode(MethodCall methodCall)
{
// Load the execution context
Emitter.Emit(OpCodes.Ldarg_0);

// Load the global index
LoadGlobalIndexScalar();

// Load the group dimension
LoadGroupDimScalar();

// Load the grid dimension
LoadGridDimScalar();

// Load the current execution mask
Emitter.Emit(LocalOperation.Load, GetBlockMask(methodCall.BasicBlock));

Expand Down
19 changes: 17 additions & 2 deletions Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -39,15 +39,30 @@ static class VelocityCodeGenerator
/// </summary>
public const int ExecutionContextIndex = 0;

/// <summary>
/// The parameter index of the current global index.
/// </summary>
public const int GlobalIndexScalar = 1;

/// <summary>
/// The parameter index of the current group dimension.
/// </summary>
public const int GroupDimIndexScalar = 2;

/// <summary>
/// The parameter index of the current grid dimension.
/// </summary>
public const int GridDimIndexScalar = 3;

/// <summary>
/// The parameter index of all masks.
/// </summary>
public const int MaskParameterIndex = 1;
public const int MaskParameterIndex = 4;

/// <summary>
/// The method parameter offset for all parameters.
/// </summary>
public const int MethodParameterOffset = 2;
public const int MethodParameterOffset = 5;

#endregion
}
Expand Down
24 changes: 24 additions & 0 deletions Src/ILGPU/Backends/Velocity/VelocityFunctionGenerator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,30 @@ public VelocityFunctionGenerator(
targetMaskCount = Emitter.DeclareLocal(typeof(int));
}

/// <summary>
/// Emits a load of the method argument that carries the current global index.
/// </summary>
protected override void LoadGlobalIndexScalar()
{
    const int argumentIndex = VelocityCodeGenerator.GlobalIndexScalar;
    Emitter.Emit(ArgumentOperation.Load, argumentIndex);
}

/// <summary>
/// Emits a load of the method argument that carries the current group dimension.
/// </summary>
protected override void LoadGroupDimScalar()
{
    const int argumentIndex = VelocityCodeGenerator.GroupDimIndexScalar;
    Emitter.Emit(ArgumentOperation.Load, argumentIndex);
}

/// <summary>
/// Emits a load of the method argument that carries the current grid dimension.
/// </summary>
protected override void LoadGridDimScalar()
{
    const int argumentIndex = VelocityCodeGenerator.GridDimIndexScalar;
    Emitter.Emit(ArgumentOperation.Load, argumentIndex);
}

/// <summary>
/// Generates Velocity code for this function.
/// </summary>
Expand Down
3 changes: 3 additions & 0 deletions Src/ILGPU/Backends/Velocity/VelocityGenerationModule.cs
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,9 @@ private void DeclareMethod(
// Convert all parameter types
parameterTypes[VelocityCodeGenerator.ExecutionContextIndex] =
typeof(VelocityGroupExecutionContext);
parameterTypes[VelocityCodeGenerator.GlobalIndexScalar] = typeof(int);
parameterTypes[VelocityCodeGenerator.GroupDimIndexScalar] = typeof(int);
parameterTypes[VelocityCodeGenerator.GridDimIndexScalar] = typeof(int);
parameterTypes[VelocityCodeGenerator.MaskParameterIndex] =
specializer.WarpType32;
for (int i = 0; i < method.NumParameters; ++i)
Expand Down
133 changes: 95 additions & 38 deletions Src/ILGPU/Backends/Velocity/VelocityKernelFunctionGenerator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,17 @@ sealed class VelocityKernelFunctionGenerator<TILEmitter> :
{
#region Constants

public const int GlobalParametersIndex = 1;
public const int GlobalGroupDimIndex = 1;
public const int GlobalGridDimIndex = 2;
public const int GlobalStartIndex = 3;
public const int GlobalEndIndex = 4;
public const int GlobalParametersIndex = 5;

#endregion

private readonly ILLabel exitMarker;
private readonly ILLabel localExitMarker;
private readonly ILLocal targetMaskCount;
private readonly ILLocal globalIndex;

/// <summary>
/// Creates a new Velocity kernel generator.
Expand All @@ -52,11 +57,14 @@ public VelocityKernelFunctionGenerator(
ParametersType = args.Module.ParametersType;

// Generate an exit marker to jump to when the kernel function returns
exitMarker = Emitter.DeclareLabel();
localExitMarker = Emitter.DeclareLabel();

// We use this counter to remember the number of active threads that entered
// the kernel successfully
targetMaskCount = Emitter.DeclareLocal(typeof(int));

// Declare our global thread index local
globalIndex = Emitter.DeclareLocal(typeof(int));
}

/// <summary>
Expand All @@ -69,6 +77,24 @@ public VelocityKernelFunctionGenerator(
/// </summary>
public Type ParametersType { get; }

/// <summary>
/// Emits a load of the local variable that tracks the current global index.
/// </summary>
protected override void LoadGlobalIndexScalar()
{
    Emitter.Emit(LocalOperation.Load, globalIndex);
}

/// <summary>
/// Emits a load of the kernel argument that carries the current group dimension.
/// </summary>
protected override void LoadGroupDimScalar()
{
    Emitter.Emit(ArgumentOperation.Load, GlobalGroupDimIndex);
}

/// <summary>
/// Emits a load of the kernel argument that carries the current grid dimension.
/// </summary>
protected override void LoadGridDimScalar()
{
    Emitter.Emit(ArgumentOperation.Load, GlobalGridDimIndex);
}

/// <summary>
/// Generates Velocity code for this kernel.
/// </summary>
Expand Down Expand Up @@ -98,55 +124,86 @@ public override void GenerateCode()
Alias(Method.Parameters[i], parameterLocal);
}

// Bind the current implicitly grouped kernel index (if any)
var offsetVector = Emitter.DeclareLocal(Specializer.WarpType32);
if (EntryPoint.IsImplicitlyGrouped)
Alias(Method.Parameters[0], offsetVector);

// Store the current global index
VelocityTargetSpecializer.ComputeGlobalBaseIndex(Emitter);
Specializer.ConvertScalarTo32(Emitter, VelocityWarpOperationMode.I);
Specializer.LoadLaneIndexVector32(Emitter);
Specializer.BinaryOperation32(
Emitter,
BinaryArithmeticKind.Add,
VelocityWarpOperationMode.I);
Emitter.Emit(LocalOperation.Store, offsetVector);
// Declare a local index counter and initialize it with the start index
Emitter.Emit(ArgumentOperation.Load, GlobalStartIndex);
Emitter.Emit(LocalOperation.Store, globalIndex);

// Setup the current main kernel mask based on the current group size
var baseGroupMask = Emitter.DeclareLocal(Specializer.WarpType32);
Specializer.LoadLaneIndexVector32(Emitter);
VelocityTargetSpecializer.GetGroupDim(Emitter);
Emitter.Emit(ArgumentOperation.Load, GlobalGroupDimIndex);
Specializer.ConvertScalarTo32(Emitter, VelocityWarpOperationMode.I);
Specializer.Compare32(
Emitter,
CompareKind.LessThan,
VelocityWarpOperationMode.I);
Emitter.Emit(LocalOperation.Store, baseGroupMask);

// Adjust the current main kernel mask based on the user grid size
Emitter.Emit(LocalOperation.Load, offsetVector);
VelocityTargetSpecializer.GetUserSize(Emitter);
Specializer.ConvertScalarTo32(Emitter, VelocityWarpOperationMode.I);
Specializer.Compare32(
Emitter,
CompareKind.LessThan,
VelocityWarpOperationMode.I);
Specializer.IntersectMask32(Emitter);
// Build our execution loop header
var iterationHeader = Emitter.DeclareLabel();
Emitter.MarkLabel(iterationHeader);

var entryPointMask = GetBlockMask(Method.EntryBlock);
Emitter.Emit(OpCodes.Dup);
Emitter.Emit(LocalOperation.Store, entryPointMask);
// Compare our global index against the exclusive end index
var exitMarker = Emitter.DeclareLabel();
Emitter.Emit(LocalOperation.Load, globalIndex);
Emitter.Emit(ArgumentOperation.Load, GlobalEndIndex);
Emitter.Emit(OpCodes.Bge, exitMarker);

// Determine the target mask count
Specializer.GetNumberOfActiveLanes(Emitter);
Emitter.Emit(LocalOperation.Store, targetMaskCount);
// Build our execution body
{
// Bind the current implicitly grouped kernel index (if any)
var offsetVector = Emitter.DeclareLocal(Specializer.WarpType32);
if (EntryPoint.IsImplicitlyGrouped)
Alias(Method.Parameters[0], offsetVector);

// Compute the current global index
Emitter.Emit(LocalOperation.Load, globalIndex);
Specializer.ConvertScalarTo32(Emitter, VelocityWarpOperationMode.I);
Specializer.LoadLaneIndexVector32(Emitter);
Specializer.BinaryOperation32(
Emitter,
BinaryArithmeticKind.Add,
VelocityWarpOperationMode.I);
Emitter.Emit(LocalOperation.Store, offsetVector);

// Adjust the current main kernel mask based on the user grid size
Emitter.Emit(LocalOperation.Load, offsetVector);
Emitter.Emit(ArgumentOperation.Load, GlobalEndIndex);
Specializer.ConvertScalarTo32(Emitter, VelocityWarpOperationMode.I);
Specializer.Compare32(
Emitter,
CompareKind.LessThan,
VelocityWarpOperationMode.I);
Emitter.Emit(LocalOperation.Load, baseGroupMask);
Specializer.IntersectMask32(Emitter);

var entryPointMask = GetBlockMask(Method.EntryBlock);
Emitter.Emit(OpCodes.Dup);
Emitter.Emit(LocalOperation.Store, entryPointMask);

// Determine the target mask count
Specializer.GetNumberOfActiveLanes(Emitter);
Emitter.Emit(LocalOperation.Store, targetMaskCount);

// Emit the actual kernel code
GenerateCodeInternal();

// Emit the exit marker
Emitter.MarkLabel(localExitMarker);
}

// Emit the actual kernel code
GenerateCodeInternal();
// Increase the processing index by the group dimension. The globalIndex
// local is declared as a 32-bit int and the sum is stored back into it, so
// no widening conversion may be emitted here: an int32 operand combined with
// an int64 operand is not a valid input pair for the IL add instruction.
// NOTE(review): assumes the group dimension argument is a 32-bit int as well
// (it is passed to ConvertScalarTo32 above) - confirm against the entry
// point handler signature.
Emitter.Emit(LocalOperation.Load, globalIndex);
Emitter.Emit(ArgumentOperation.Load, GlobalGroupDimIndex);
Emitter.Emit(OpCodes.Add);
Emitter.Emit(LocalOperation.Store, globalIndex);

// Emit the exit marker
Emitter.MarkLabel(exitMarker);
// Branch to the loop header
Emitter.Emit(OpCodes.Br, iterationHeader);

// Return
Emitter.MarkLabel(exitMarker);
Emitter.Emit(OpCodes.Ret);
}

Expand All @@ -161,7 +218,7 @@ public override void GenerateCode(ReturnTerminator returnTerminator)
GetBlockMask(returnTerminator.BasicBlock));
Specializer.GetNumberOfActiveLanes(Emitter);
Emitter.Emit(LocalOperation.Load, targetMaskCount);
Emitter.Emit(OpCodes.Beq, exitMarker);
Emitter.Emit(OpCodes.Beq, localExitMarker);
}
}
}
Loading

0 comments on commit 0f67f7d

Please sign in to comment.