Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improved performance of Velocity kernels by refining workload dispatcher. #1085

Merged
merged 12 commits into from
Oct 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Src/ILGPU/Backends/Velocity/VelocityBackend.cs
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ public VelocityBackend(
// Transform all if and switch branches to make them compatible with
// the internal vectorization engine
transformerBuilder.Add(new VelocityBlockScheduling());
transformerBuilder.Add(new DeadCodeElimination());

builder.Add(transformerBuilder.ToTransformer());
});
Expand Down
10 changes: 6 additions & 4 deletions Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.Threads.cs
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,10 @@ public void GenerateCode(GridIndexValue value)
switch (value.Dimension)
{
case DeviceConstantDimension3D.X:
// Load the first context argument and query the grid index
VelocityTargetSpecializer.GetGridIndex(Emitter);
// Load global index and compute the actual grid index
LoadGlobalIndexScalar();
LoadGroupDimScalar();
Emitter.Emit(OpCodes.Div);
break;
case DeviceConstantDimension3D.Y:
case DeviceConstantDimension3D.Z:
Expand Down Expand Up @@ -100,7 +102,7 @@ public void GenerateCode(GridDimensionValue value)
switch (value.Dimension)
{
case DeviceConstantDimension3D.X:
VelocityTargetSpecializer.GetGridDim(Emitter);
LoadGridDimScalar();
break;
case DeviceConstantDimension3D.Y:
case DeviceConstantDimension3D.Z:
Expand All @@ -117,7 +119,7 @@ public void GenerateCode(GroupDimensionValue value)
switch (value.Dimension)
{
case DeviceConstantDimension3D.X:
VelocityTargetSpecializer.GetGroupDim(Emitter);
LoadGroupDimScalar();
break;
case DeviceConstantDimension3D.Y:
case DeviceConstantDimension3D.Z:
Expand Down
24 changes: 24 additions & 0 deletions Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.Values.cs
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,36 @@ namespace ILGPU.Backends.Velocity
{
partial class VelocityCodeGenerator<TILEmitter>
{
/// <summary>
/// Emits code that loads the current global index as a scalar value.
/// </summary>
/// <remarks>
/// The X grid index is derived from this value by dividing it by the value
/// loaded via <see cref="LoadGroupDimScalar"/>. The value is also forwarded
/// to callees as part of every method call.
/// </remarks>
protected abstract void LoadGlobalIndexScalar();

/// <summary>
/// Emits code that loads the current group dimension as a scalar value.
/// </summary>
/// <remarks>
/// Used for the X group dimension and forwarded to callees as part of every
/// method call.
/// </remarks>
protected abstract void LoadGroupDimScalar();

/// <summary>
/// Emits code that loads the current grid dimension as a scalar value.
/// </summary>
/// <remarks>
/// Used for the X grid dimension and forwarded to callees as part of every
/// method call.
/// </remarks>
protected abstract void LoadGridDimScalar();

/// <inheritdoc/>
public void GenerateCode(MethodCall methodCall)
{
// Load the execution context
Emitter.Emit(OpCodes.Ldarg_0);

// Load the global index
LoadGlobalIndexScalar();

// Load the group dimension
LoadGroupDimScalar();

// Load the grid dimension
LoadGridDimScalar();

// Load the current execution mask
Emitter.Emit(LocalOperation.Load, GetBlockMask(methodCall.BasicBlock));

Expand Down
19 changes: 17 additions & 2 deletions Src/ILGPU/Backends/Velocity/VelocityCodeGenerator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -39,15 +39,30 @@ static class VelocityCodeGenerator
/// </summary>
public const int ExecutionContextIndex = 0;

/// <summary>
/// The parameter index of the current global index (a scalar <c>int</c>).
/// </summary>
public const int GlobalIndexScalar = 1;

/// <summary>
/// The parameter index of the current group dimension (a scalar <c>int</c>).
/// </summary>
public const int GroupDimIndexScalar = 2;

/// <summary>
/// The parameter index of the current grid dimension (a scalar <c>int</c>).
/// </summary>
public const int GridDimIndexScalar = 3;

/// <summary>
/// The parameter index of all masks.
/// </summary>
public const int MaskParameterIndex = 1;
public const int MaskParameterIndex = 4;

/// <summary>
/// The method parameter offset for all parameters.
/// </summary>
public const int MethodParameterOffset = 2;
public const int MethodParameterOffset = 5;

#endregion
}
Expand Down
24 changes: 24 additions & 0 deletions Src/ILGPU/Backends/Velocity/VelocityFunctionGenerator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,30 @@ public VelocityFunctionGenerator(
targetMaskCount = Emitter.DeclareLocal(typeof(int));
}

/// <summary>
/// Emits a load of the method argument that carries the current global index.
/// </summary>
protected override void LoadGlobalIndexScalar()
{
    const int argumentIndex = VelocityCodeGenerator.GlobalIndexScalar;
    Emitter.Emit(ArgumentOperation.Load, argumentIndex);
}

/// <summary>
/// Emits a load of the method argument that carries the current group
/// dimension.
/// </summary>
protected override void LoadGroupDimScalar()
{
    const int argumentIndex = VelocityCodeGenerator.GroupDimIndexScalar;
    Emitter.Emit(ArgumentOperation.Load, argumentIndex);
}

/// <summary>
/// Emits a load of the method argument that carries the current grid
/// dimension.
/// </summary>
protected override void LoadGridDimScalar()
{
    const int argumentIndex = VelocityCodeGenerator.GridDimIndexScalar;
    Emitter.Emit(ArgumentOperation.Load, argumentIndex);
}

/// <summary>
/// Generates Velocity code for this function.
/// </summary>
Expand Down
3 changes: 3 additions & 0 deletions Src/ILGPU/Backends/Velocity/VelocityGenerationModule.cs
Original file line number Diff line number Diff line change
Expand Up @@ -110,11 +110,11 @@
ImmutableArray.CreateBuilder<FieldInfo>(numParameters);
for (int i = 0; i < numParameters; ++i)
{
var fieldInfo = ILEmitterExtensions.GetFieldInfo(result, i);

Check warning on line 113 in Src/ILGPU/Backends/Velocity/VelocityGenerationModule.cs

View workflow job for this annotation

GitHub Actions / Analyze (Src/ILGPU.sln, net6.0)

Possible null reference argument for parameter 'type' in 'FieldInfo ILEmitterExtensions.GetFieldInfo(Type type, int fieldIndex)'.

Check warning on line 113 in Src/ILGPU/Backends/Velocity/VelocityGenerationModule.cs

View workflow job for this annotation

GitHub Actions / Analyze (Samples/ILGPU.Samples.sln, net6.0)

Possible null reference argument for parameter 'type' in 'FieldInfo ILEmitterExtensions.GetFieldInfo(Type type, int fieldIndex)'.
parameterMapping.Add(fieldInfo);
}
parameterFields = parameterMapping.MoveToImmutable();
constructor = result.GetConstructor(constructorParameterTypes).AsNotNull();

Check warning on line 117 in Src/ILGPU/Backends/Velocity/VelocityGenerationModule.cs

View workflow job for this annotation

GitHub Actions / Analyze (Src/ILGPU.sln, net6.0)

Dereference of a possibly null reference.

Check warning on line 117 in Src/ILGPU/Backends/Velocity/VelocityGenerationModule.cs

View workflow job for this annotation

GitHub Actions / Analyze (Samples/ILGPU.Samples.sln, net6.0)

Dereference of a possibly null reference.
return result;
}

Expand Down Expand Up @@ -435,6 +435,9 @@
// Convert all parameter types
parameterTypes[VelocityCodeGenerator.ExecutionContextIndex] =
typeof(VelocityGroupExecutionContext);
parameterTypes[VelocityCodeGenerator.GlobalIndexScalar] = typeof(int);
parameterTypes[VelocityCodeGenerator.GroupDimIndexScalar] = typeof(int);
parameterTypes[VelocityCodeGenerator.GridDimIndexScalar] = typeof(int);
parameterTypes[VelocityCodeGenerator.MaskParameterIndex] =
specializer.WarpType32;
for (int i = 0; i < method.NumParameters; ++i)
Expand Down
133 changes: 95 additions & 38 deletions Src/ILGPU/Backends/Velocity/VelocityKernelFunctionGenerator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,17 @@ sealed class VelocityKernelFunctionGenerator<TILEmitter> :
{
#region Constants

public const int GlobalParametersIndex = 1;
/// <summary>
/// The argument index of the group dimension.
/// </summary>
public const int GlobalGroupDimIndex = 1;

/// <summary>
/// The argument index of the grid dimension.
/// </summary>
public const int GlobalGridDimIndex = 2;

/// <summary>
/// The argument index of the start index that initializes the global index
/// of the kernel's execution loop.
/// </summary>
public const int GlobalStartIndex = 3;

/// <summary>
/// The argument index of the exclusive end index the global index is
/// compared against.
/// </summary>
public const int GlobalEndIndex = 4;

/// <summary>
/// The argument index of the kernel parameters.
/// NOTE(review): its use is not visible in this excerpt; confirm against the
/// parameter-loading code.
/// </summary>
public const int GlobalParametersIndex = 5;

#endregion

private readonly ILLabel exitMarker;
private readonly ILLabel localExitMarker;
private readonly ILLocal targetMaskCount;
private readonly ILLocal globalIndex;

/// <summary>
/// Creates a new Velocity kernel generator.
Expand All @@ -52,11 +57,14 @@ public VelocityKernelFunctionGenerator(
ParametersType = args.Module.ParametersType;

// Generate an exit marker to jump to when the kernel function returns
exitMarker = Emitter.DeclareLabel();
localExitMarker = Emitter.DeclareLabel();

// We use this counter to remember the number of active threads that entered
// the kernel successfully
targetMaskCount = Emitter.DeclareLocal(typeof(int));

// Declare our global thread index local
globalIndex = Emitter.DeclareLocal(typeof(int));
}

/// <summary>
Expand All @@ -69,6 +77,24 @@ public VelocityKernelFunctionGenerator(
/// </summary>
public Type ParametersType { get; }

/// <summary>
/// Emits a load of the local variable that holds the kernel's current global
/// index.
/// </summary>
protected override void LoadGlobalIndexScalar()
{
    Emitter.Emit(LocalOperation.Load, globalIndex);
}

/// <summary>
/// Emits a load of the kernel argument that carries the group dimension.
/// </summary>
protected override void LoadGroupDimScalar()
{
    const int argumentIndex = GlobalGroupDimIndex;
    Emitter.Emit(ArgumentOperation.Load, argumentIndex);
}

/// <summary>
/// Emits a load of the kernel argument that carries the grid dimension.
/// </summary>
protected override void LoadGridDimScalar()
{
    const int argumentIndex = GlobalGridDimIndex;
    Emitter.Emit(ArgumentOperation.Load, argumentIndex);
}

/// <summary>
/// Generates Velocity code for this kernel.
/// </summary>
Expand Down Expand Up @@ -98,55 +124,86 @@ public override void GenerateCode()
Alias(Method.Parameters[i], parameterLocal);
}

// Bind the current implicitly grouped kernel index (if any)
var offsetVector = Emitter.DeclareLocal(Specializer.WarpType32);
if (EntryPoint.IsImplicitlyGrouped)
Alias(Method.Parameters[0], offsetVector);

// Store the current global index
VelocityTargetSpecializer.ComputeGlobalBaseIndex(Emitter);
Specializer.ConvertScalarTo32(Emitter, VelocityWarpOperationMode.I);
Specializer.LoadLaneIndexVector32(Emitter);
Specializer.BinaryOperation32(
Emitter,
BinaryArithmeticKind.Add,
VelocityWarpOperationMode.I);
Emitter.Emit(LocalOperation.Store, offsetVector);
// Declare a local index counter and initialize it with the start index
Emitter.Emit(ArgumentOperation.Load, GlobalStartIndex);
Emitter.Emit(LocalOperation.Store, globalIndex);

// Setup the current main kernel mask based on the current group size
var baseGroupMask = Emitter.DeclareLocal(Specializer.WarpType32);
Specializer.LoadLaneIndexVector32(Emitter);
VelocityTargetSpecializer.GetGroupDim(Emitter);
Emitter.Emit(ArgumentOperation.Load, GlobalGroupDimIndex);
Specializer.ConvertScalarTo32(Emitter, VelocityWarpOperationMode.I);
Specializer.Compare32(
Emitter,
CompareKind.LessThan,
VelocityWarpOperationMode.I);
Emitter.Emit(LocalOperation.Store, baseGroupMask);

// Adjust the current main kernel mask based on the user grid size
Emitter.Emit(LocalOperation.Load, offsetVector);
VelocityTargetSpecializer.GetUserSize(Emitter);
Specializer.ConvertScalarTo32(Emitter, VelocityWarpOperationMode.I);
Specializer.Compare32(
Emitter,
CompareKind.LessThan,
VelocityWarpOperationMode.I);
Specializer.IntersectMask32(Emitter);
// Build our execution loop header
var iterationHeader = Emitter.DeclareLabel();
Emitter.MarkLabel(iterationHeader);

var entryPointMask = GetBlockMask(Method.EntryBlock);
Emitter.Emit(OpCodes.Dup);
Emitter.Emit(LocalOperation.Store, entryPointMask);
// Compare our global index against the exclusive end index
var exitMarker = Emitter.DeclareLabel();
Emitter.Emit(LocalOperation.Load, globalIndex);
Emitter.Emit(ArgumentOperation.Load, GlobalEndIndex);
Emitter.Emit(OpCodes.Bge, exitMarker);

// Determine the target mask count
Specializer.GetNumberOfActiveLanes(Emitter);
Emitter.Emit(LocalOperation.Store, targetMaskCount);
// Build our execution body
{
// Bind the current implicitly grouped kernel index (if any)
var offsetVector = Emitter.DeclareLocal(Specializer.WarpType32);
if (EntryPoint.IsImplicitlyGrouped)
Alias(Method.Parameters[0], offsetVector);

// Compute the current global index
Emitter.Emit(LocalOperation.Load, globalIndex);
Specializer.ConvertScalarTo32(Emitter, VelocityWarpOperationMode.I);
Specializer.LoadLaneIndexVector32(Emitter);
Specializer.BinaryOperation32(
Emitter,
BinaryArithmeticKind.Add,
VelocityWarpOperationMode.I);
Emitter.Emit(LocalOperation.Store, offsetVector);

// Adjust the current main kernel mask based on the user grid size
Emitter.Emit(LocalOperation.Load, offsetVector);
Emitter.Emit(ArgumentOperation.Load, GlobalEndIndex);
Specializer.ConvertScalarTo32(Emitter, VelocityWarpOperationMode.I);
Specializer.Compare32(
Emitter,
CompareKind.LessThan,
VelocityWarpOperationMode.I);
Emitter.Emit(LocalOperation.Load, baseGroupMask);
Specializer.IntersectMask32(Emitter);

var entryPointMask = GetBlockMask(Method.EntryBlock);
Emitter.Emit(OpCodes.Dup);
Emitter.Emit(LocalOperation.Store, entryPointMask);

// Determine the target mask count
Specializer.GetNumberOfActiveLanes(Emitter);
Emitter.Emit(LocalOperation.Store, targetMaskCount);

// Emit the actual kernel code
GenerateCodeInternal();

// Emit the exit marker
Emitter.MarkLabel(localExitMarker);
}

// Emit the actual kernel code
GenerateCodeInternal();
// Increase the processing index
Emitter.Emit(LocalOperation.Load, globalIndex);
Emitter.Emit(ArgumentOperation.Load, GlobalGroupDimIndex);
Emitter.Emit(OpCodes.Conv_I8);
Emitter.Emit(OpCodes.Add);
Emitter.Emit(LocalOperation.Store, globalIndex);

// Emit the exit marker
Emitter.MarkLabel(exitMarker);
// Branch to the loop header
Emitter.Emit(OpCodes.Br, iterationHeader);

// Return
Emitter.MarkLabel(exitMarker);
Emitter.Emit(OpCodes.Ret);
}

Expand All @@ -161,7 +218,7 @@ public override void GenerateCode(ReturnTerminator returnTerminator)
GetBlockMask(returnTerminator.BasicBlock));
Specializer.GetNumberOfActiveLanes(Emitter);
Emitter.Emit(LocalOperation.Load, targetMaskCount);
Emitter.Emit(OpCodes.Beq, exitMarker);
Emitter.Emit(OpCodes.Beq, localExitMarker);
}
}
}
Loading
Loading