Reduce block size of sumFramesGpu CUDA kernel
This commit reduces the maximum kernel block size so that (nominally)
we can fit at least 3 blocks per multiprocessor (MP) instead of just 2.
This was necessary because it turns out that it is register usage,
rather than thread count or shared memory usage, that is limiting how
many blocks can be launched per MP. Unfortunately, we don't have a way
to programmatically determine the register usage, so I am lowering the
block size as a workaround.

This commit also adds error-checking code so that such kernel launch
errors are reported more transparently in the future.
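For reference, the budgeting that the patch implements gives each block at most one third of the multiprocessor's thread count and shared memory (previously one half), and derives from that budget how many clusters K can be handled per block. A minimal standalone sketch of the same arithmetic; the function name and the readBuffSz/elemBytes/fsLimBytes parameters are illustrative stand-ins for the constants and sizeof() terms used in sumFramesGpu.cu:

    #include <algorithm>
    #include <cuda_runtime.h>

    // Sketch of the per-MP budgeting done in computeFrameSums(): give each
    // block roughly 1/3 of the multiprocessor's threads and shared memory
    // (previously 1/2), then see how many clusters K still fit per block.
    int maxClustersPerBlock(int deviceNo, int D, int maxT_eff,
                            int readBuffSz, int elemBytes, int fsLimBytes)
    {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, deviceNo);
        const int blocksPerMP = 3;   // target blocks per MP (was 2)
        int maxThreads = prop.maxThreadsPerMultiProcessor;
        int maxK_thread = (maxThreads / blocksPerMP) / (D + 1);
        int maxMem = (int) prop.sharedMemPerMultiprocessor;
        int maxK_mem = ((maxMem / blocksPerMP) - maxT_eff * fsLimBytes)
                       / (readBuffSz * elemBytes) - D;
        return std::min(maxK_thread, maxK_mem);
    }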
kqshan committed Feb 7, 2019
1 parent f85a5e3 commit 0cd3671
Showing 1 changed file with 10 additions and 3 deletions.

@MoDT/mex_src/sumFramesGpu.cu
@@ -307,11 +307,11 @@ void computeFrameSums(int D, int N, int K, int T,
     cudaGetDeviceProperties(&prop, deviceNo);
     // Design for the worst-case scenario in terms of # frames per block
     int maxT_eff = 64; // I decided this
-    // Figure out how many clusters we can do and still have 2 blocks per MP
+    // Figure out how many clusters we can do and still have 3 blocks per MP
     int maxThreads = prop.maxThreadsPerMultiProcessor;
-    int maxK_thread = (maxThreads/2) / (D+1);
+    int maxK_thread = (maxThreads/3) / (D+1);
     int maxMem = prop.sharedMemPerMultiprocessor;
-    int maxK_mem = ((maxMem/2) - maxT_eff*sizeof(*d_fsLim))
+    int maxK_mem = ((maxMem/3) - maxT_eff*sizeof(*d_fsLim))
         / (READ_BUFF_SZ*sizeof(numeric_t)) - D;
     int maxK = std::min(maxK_thread, maxK_mem);
     // If we can't do all of the clusters at once, try to spread them evenly
@@ -338,6 +338,13 @@ void computeFrameSums(int D, int N, int K, int T,
     // Launch the kernel
     sumFrames<<<blocksPerGrid, threadsPerBlock, memPerBlock>>>
         (D, N, K, T, d_Y, d_wzu, d_fsLim, d_wzuY, d_sumwzu);
+    // Check for kernel launch errors
+    cudaStat = cudaGetLastError();
+    if (cudaStat != cudaSuccess) {
+        std::cout<<"kernel launch requested 32 x "<<nWarps<<" threads and ";
+        std::cout<<memPerBlock<<" mem per block."<<std::endl;
+        mexErrMsgIdAndTxt(cudaErrId, cudaGetErrorString(cudaStat));
+    }
 
     } else {
         /* For larger problems, we turn to cuBLAS */

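The added check follows standard CUDA practice: cudaGetLastError() immediately after the <<< >>> launch reports configuration failures (for example, a block that requests more registers or shared memory than the device allows), while errors raised during kernel execution only surface at a later synchronizing call. A hedged sketch of the same idea as a reusable helper; checkKernel and its message format are hypothetical and not part of this commit, which reports through mexErrMsgIdAndTxt instead:

    #include <cstdio>
    #include <cuda_runtime.h>

    // Hypothetical helper: report both launch-time and execution-time errors.
    // cudaGetLastError() catches launch/configuration problems; the follow-up
    // cudaDeviceSynchronize() surfaces errors from the kernel's execution.
    static void checkKernel(const char *label)
    {
        cudaError_t err = cudaGetLastError();        // launch errors
        if (err == cudaSuccess)
            err = cudaDeviceSynchronize();           // execution errors
        if (err != cudaSuccess)
            std::fprintf(stderr, "%s: %s\n", label, cudaGetErrorString(err));
    }

    // Usage (illustrative):
    //   sumFrames<<<blocksPerGrid, threadsPerBlock, memPerBlock>>>(/* ... */);
    //   checkKernel("sumFrames");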