diff --git a/Src/ILGPU/Runtime/Velocity/VelocityAccelerator.cs b/Src/ILGPU/Runtime/Velocity/VelocityAccelerator.cs
index 450b35a73..a9b25410d 100644
--- a/Src/ILGPU/Runtime/Velocity/VelocityAccelerator.cs
+++ b/Src/ILGPU/Runtime/Velocity/VelocityAccelerator.cs
@@ -87,7 +87,12 @@ public void Initialize() { }
///
/// Returns the parent user size.
///
- public long UserSize { get; set; }
+ public int UserSize { get; set; }
+
+ ///
+ /// Returns the chunk size of each processing thread.
+ ///
+ public int ChunkSize { get; set; }
///
/// Returns the dynamic shared memory length in bytes.
@@ -120,16 +125,17 @@ public void Process(
ParallelLoopState? loopState,
VelocityGroupExecutionContext intermediateState)
{
- intermediateState.SetupThreadGrid(
- index,
- GroupDim,
- GridDim,
- (int)UserSize,
- DynamicSharedMemoryLengthInBytes);
+ intermediateState.SetupThreadGrid(DynamicSharedMemoryLengthInBytes);
// Invoke the actual kernel
+ int startIndex = index * ChunkSize;
+ int endIndex = Math.Min(startIndex + ChunkSize, UserSize);
EntryPointHandler.AsNotNull().Invoke(
intermediateState,
+ GroupDim,
+ GridDim,
+ startIndex,
+ endIndex,
Parameters.AsNotNull());
}
@@ -178,11 +184,7 @@ internal VelocityAccelerator(Context context, VelocityDevice device)
parallelOptions = new ParallelOptions()
{
-#if DEBUG
- MaxDegreeOfParallelism = 1,
-#else
MaxDegreeOfParallelism = device.NumMultiprocessors,
-#endif
};
executionEngine = new ParallelExecutionEngine(this);
@@ -247,15 +249,26 @@ internal void Run(
// Setup engine properties
executionEngine.GroupDim = groupSize;
- executionEngine.UserSize = userKernelConfig.Size;
+ executionEngine.UserSize = userKernelConfig.GetIntSize();
executionEngine.GridDim = gridSize;
executionEngine.DynamicSharedMemoryLengthInBytes =
runtimeKernelConfig.SharedMemoryConfig.DynamicArraySize;
executionEngine.EntryPointHandler = entryPointHandler;
executionEngine.Parameters = velocityParameters;
+ // Compute chunk size
+ int chunkSize = IntrinsicMath.DivRoundUp(gridSize, NumMultiprocessors);
+ int paddedChunkSize = IntrinsicMath.DivRoundUp(
+ chunkSize,
+ groupSize) * groupSize;
+ executionEngine.ChunkSize = paddedChunkSize;
+
+ // Compute num threads to launch
+ int totalSize = gridSize * groupSize;
+ int numChunks = IntrinsicMath.DivRoundUp(totalSize, paddedChunkSize);
+
// Launch all threads
- executionEngine.ParallelFor(0, gridSize, parallelOptions);
+ executionEngine.ParallelFor(0, numChunks, parallelOptions);
}
finally
{