Skip to content

Commit

Permalink
Adapted VelocityAccelerator to split the processing workload into large chunks.
Browse files Browse the repository at this point in the history
  • Loading branch information
m4rs-mt committed Sep 27, 2023
1 parent f137d9c commit ea47982
Showing 1 changed file with 21 additions and 13 deletions.
34 changes: 21 additions & 13 deletions Src/ILGPU/Runtime/Velocity/VelocityAccelerator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,12 @@ public void Initialize() { }
/// <summary>
/// Returns the parent user size.
/// </summary>
public long UserSize { get; set; }
public int UserSize { get; set; }

/// <summary>
/// Returns the chunk size of each processing thread.
/// </summary>
/// <remarks>
/// Process uses this value to derive the index range of a single invocation:
/// the range starts at index * ChunkSize and ends at
/// Math.Min(startIndex + ChunkSize, UserSize).
/// </remarks>
public int ChunkSize { get; set; }

/// <summary>
/// Returns the dynamic shared memory length in bytes.
Expand Down Expand Up @@ -120,16 +125,17 @@ public void Process(
ParallelLoopState? loopState,
VelocityGroupExecutionContext intermediateState)
{
intermediateState.SetupThreadGrid(
index,
GroupDim,
GridDim,
(int)UserSize,
DynamicSharedMemoryLengthInBytes);
intermediateState.SetupThreadGrid(DynamicSharedMemoryLengthInBytes);

// Invoke the actual kernel
int startIndex = index * ChunkSize;
int endIndex = Math.Min(startIndex + ChunkSize, UserSize);
EntryPointHandler.AsNotNull().Invoke(
intermediateState,
GroupDim,
GridDim,
startIndex,
endIndex,
Parameters.AsNotNull());
}

Expand Down Expand Up @@ -178,11 +184,7 @@ internal VelocityAccelerator(Context context, VelocityDevice device)

parallelOptions = new ParallelOptions()
{
#if DEBUG
MaxDegreeOfParallelism = 1,
#else
MaxDegreeOfParallelism = device.NumMultiprocessors,
#endif
};
executionEngine = new ParallelExecutionEngine(this);

Expand Down Expand Up @@ -247,15 +249,21 @@ internal void Run(

// Setup engine properties
executionEngine.GroupDim = groupSize;
executionEngine.UserSize = userKernelConfig.Size;
executionEngine.UserSize = userKernelConfig.GetIntSize();
executionEngine.GridDim = gridSize;
executionEngine.DynamicSharedMemoryLengthInBytes =
runtimeKernelConfig.SharedMemoryConfig.DynamicArraySize;
executionEngine.EntryPointHandler = entryPointHandler;
executionEngine.Parameters = velocityParameters;

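// Compute the internal thread configuration: split the gridSize * groupSize
// threads into chunks of DivRoundUp(totalSize, NumMultiprocessors) entries.
// NOTE(review): if totalSize is 0, chunkSize is 0 and the second DivRoundUp
// below would divide by zero - confirm that empty launches are rejected
// before this point. gridSize * groupSize may also overflow int.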
int totalSize = gridSize * groupSize;
int chunkSize = IntrinsicMath.DivRoundUp(totalSize, NumMultiprocessors);
int numChunks = IntrinsicMath.DivRoundUp(totalSize, chunkSize);
executionEngine.ChunkSize = chunkSize;

// Launch all threads
executionEngine.ParallelFor(0, gridSize, parallelOptions);
executionEngine.ParallelFor(0, numChunks, parallelOptions);
}
finally
{
Expand Down

0 comments on commit ea47982

Please sign in to comment.