Skip to content

Commit

Permalink
Adapted VelocityAccelerator to split the processing workload into large chunks.
Browse files Browse the repository at this point in the history
  • Loading branch information
m4rs-mt committed Oct 9, 2023
1 parent b202081 commit d55bce0
Showing 1 changed file with 26 additions and 13 deletions.
39 changes: 26 additions & 13 deletions Src/ILGPU/Runtime/Velocity/VelocityAccelerator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,12 @@ public void Initialize() { }
/// <summary>
/// Returns the parent user size.
/// </summary>
public long UserSize { get; set; }
public int UserSize { get; set; }

/// <summary>
/// Returns the chunk size of each processing thread.
/// </summary>
public int ChunkSize { get; set; }

/// <summary>
/// Returns the dynamic shared memory length in bytes.
Expand Down Expand Up @@ -120,16 +125,17 @@ public void Process(
ParallelLoopState? loopState,
VelocityGroupExecutionContext intermediateState)
{
intermediateState.SetupThreadGrid(
index,
GroupDim,
GridDim,
(int)UserSize,
DynamicSharedMemoryLengthInBytes);
intermediateState.SetupThreadGrid(DynamicSharedMemoryLengthInBytes);

// Invoke the actual kernel
int startIndex = index * ChunkSize;
int endIndex = Math.Min(startIndex + ChunkSize, UserSize);
EntryPointHandler.AsNotNull().Invoke(
intermediateState,
GroupDim,
GridDim,
startIndex,
endIndex,
Parameters.AsNotNull());
}

Expand Down Expand Up @@ -178,11 +184,7 @@ internal VelocityAccelerator(Context context, VelocityDevice device)

parallelOptions = new ParallelOptions()
{
#if DEBUG
MaxDegreeOfParallelism = 1,
#else
MaxDegreeOfParallelism = device.NumMultiprocessors,
#endif
};
executionEngine = new ParallelExecutionEngine(this);

Expand Down Expand Up @@ -247,15 +249,26 @@ internal void Run(

// Setup engine properties
executionEngine.GroupDim = groupSize;
executionEngine.UserSize = userKernelConfig.Size;
executionEngine.UserSize = userKernelConfig.GetIntSize();
executionEngine.GridDim = gridSize;
executionEngine.DynamicSharedMemoryLengthInBytes =
runtimeKernelConfig.SharedMemoryConfig.DynamicArraySize;
executionEngine.EntryPointHandler = entryPointHandler;
executionEngine.Parameters = velocityParameters;

// Compute chunk size
int chunkSize = IntrinsicMath.DivRoundUp(gridSize, NumMultiprocessors);
int paddedChunkSize = IntrinsicMath.DivRoundUp(
chunkSize,
groupSize) * groupSize;
executionEngine.ChunkSize = paddedChunkSize;

// Compute the number of chunks to launch (one parallel loop iteration per chunk)
int totalSize = gridSize * groupSize;
int numChunks = IntrinsicMath.DivRoundUp(totalSize, paddedChunkSize);

// Launch all threads
executionEngine.ParallelFor(0, gridSize, parallelOptions);
executionEngine.ParallelFor(0, numChunks, parallelOptions);
}
finally
{
Expand Down

0 comments on commit d55bce0

Please sign in to comment.