From b1e569f931e8fe7635a66b82f44a220316e55b6f Mon Sep 17 00:00:00 2001
From: Kevin Shan
Date: Tue, 25 Apr 2017 16:42:29 -0700
Subject: [PATCH] Add single-precision kernel for sumFramesGpu()

---
 @MoDT/mex_src/sumFramesGpu.cu | 147 ++++++++++++++++------------------
 1 file changed, 71 insertions(+), 76 deletions(-)

diff --git a/@MoDT/mex_src/sumFramesGpu.cu b/@MoDT/mex_src/sumFramesGpu.cu
index 70ee559..2b436bf 100644
--- a/@MoDT/mex_src/sumFramesGpu.cu
+++ b/@MoDT/mex_src/sumFramesGpu.cu
@@ -20,11 +20,13 @@
  *         wzuY(:,:,t) = Y(:,n1:n2) * wzu(n1:n2,:);
  *     end
  *
- * For small problems, (D < 32), this is performed in a custom kernel that
+ * For small problems, (D <= 32), this is performed in a custom kernel that
  * reduces the kernel launch overhead. For larger problems, this calls cuBLAS:
  * GEMM for Y*wzu, and GEMV with a vector of ones for sum(wzu).
  *
- * Kevin Shan, 2016-08-29
+ * Kevin Shan
+ *   2017-04-25  Add single-precision support
+ *   2016-08-29  Initial version
  *============================================================================*/
 
 #ifdef MATLAB_MEX_FILE
@@ -63,9 +65,10 @@
  * wSum      [K x T] sum of the weights in each time frame
  */
 int const READ_BUFF_SZ = 16;
-__global__ void sumFrames( int const D, long const N, int const K, int const T,
-        double const * __restrict__ spikes, double const * __restrict__ weights,
-        long const * __restrict__ spkLims, double *spkSum, double *wSum )
+template <typename numeric_t>
+__global__ void sumFrames( int const D, int const N, int const K, int const T,
+        numeric_t const * __restrict__ spikes, numeric_t const * __restrict__ weights,
+        int const * __restrict__ spkLims, numeric_t *spkSum, numeric_t *wSum )
 {
     // Declare our dynamically-allocated shared memory
     extern __shared__ int S[];
@@ -81,18 +84,18 @@ __global__ void sumFrames( int const D, long const N, int const K, int const T,
     int const frameCount = min(framesPerBlk, T-frameOffset);
     if (frameCount <= 0) return;
     // And the spike range
-    long const spkOffset = spkLims[frameOffset];
+    int const spkOffset = spkLims[frameOffset];
     // Shift the pointers to account for these offsets
-    spikes += D*spkOffset;
-    weights += spkOffset + N*kOffset;
+    spikes += D*static_cast<ptrdiff_t>(spkOffset);
+    weights += spkOffset + static_cast<ptrdiff_t>(N)*kOffset;
     spkLims += frameOffset;
     spkSum += D*(kOffset + K*frameOffset);
     wSum += kOffset + K*frameOffset;
     // Lay out our shared memory
     int offset = 0;
-    double *spikeBuff = reinterpret_cast<double*> (S + offset);
+    numeric_t *spikeBuff = reinterpret_cast<numeric_t*> (S + offset);
     offset += D * READ_BUFF_SZ * sizeof(*spikeBuff)/sizeof(*S);
-    double *weightBuff = reinterpret_cast<double*> (S + offset);
+    numeric_t *weightBuff = reinterpret_cast<numeric_t*> (S + offset);
     offset += kPerBlk * READ_BUFF_SZ * sizeof(*weightBuff)/sizeof(*S);
     int *spkCounts = reinterpret_cast<int*> (S + offset);
     // Copy the spike counts into shared memory
@@ -103,9 +106,9 @@ __global__ void sumFrames( int const D, long const N, int const K, int const T,
 
     // Determine what this thread is responsible for computing
     bool threadHasValidCompute = false;
-    double *spkCompBuff, *wCompBuff, *outputTgt;
+    numeric_t *spkCompBuff, *wCompBuff, *outputTgt;
     int spkCompStride, outputStride;
-    double one = 1;
+    numeric_t one = 1;
     if (tIdx < D*kCount) {
         threadHasValidCompute = true;
         // Thread computes spkSum(d,k,t) = spikes(d,:) * weights(:,k)
@@ -135,7 +138,7 @@ __global__ void sumFrames( int const D, long const N, int const K, int const T,
     for (int frameIdx=0; frameIdx<frameCount; frameIdx++) {
[...]
         [...] >= READ_BUFF_SZ) {
             // Load from spikes
@@ -152,7 +155,7 @@ __global__ void sumFrames( int const D, long const N, int const K, int const T,
         // Compute spikes*weights and sum(weights)
         __syncthreads();
         if (threadHasValidCompute) {
-            double local_acc = 0;
+            numeric_t local_acc = 0;
             for (int ii=0; ii< [...]
[...]
 template <typename numeric_t>
-void computeFrameSums(ptrdiff_t D, ptrdiff_t N, ptrdiff_t K, ptrdiff_t T,
-        numeric_t const *d_Y, numeric_t const *d_wzu, std::vector<ptrdiff_t> &fsLim0,
+void computeFrameSums(int D, int N, int K, int T,
+        numeric_t const *d_Y, numeric_t const *d_wzu, std::vector<int> &fsLim0,
         numeric_t *d_wzuY, numeric_t *d_sumwzu)
 {
     // Validate the fsLim indices and get the max # spikes per frame
     char const * const errId = "MoDT:sumFramesGpu:InvalidInput";
-    ptrdiff_t maxCount = 0;
-    ptrdiff_t last_n2 = 0;
+    int maxCount = 0;
+    int last_n2 = 0;
     for (int t=0; t<T; t++) {
         int n1 = fsLim0[t];
         int n2 = fsLim0[t+T];
         if ([...] (n1>N) || (n2>=N) || (n2-n1 < -1))
             mexErrMsgIdAndTxt(errId, "Invalid frame spike limits");
@@ -293,14 +285,14 @@ void computeFrameSums(ptrdiff_t D, ptrdiff_t N, ptrdiff_t K, ptrdiff_t T,
     // Sum across time frames
     cudaError_t cudaStat;
     char const * const cudaErrId = "MoDT:sumFramesGpu:cudaError";
-    if ((D < 32) && std::is_same<numeric_t,double>::value) {
+    if (D <= 32) {
         /* For small problems, we have a custom kernel that will perform the for
          * loop over time frames, reducing the kernel launch overhead. */
         // Copy the fsLim indices to the GPU
-        ptrdiff_t *d_fsLim;
+        int *d_fsLim;
         cudaStat = cudaMalloc((void**)&d_fsLim, T*2*sizeof(d_fsLim));
-        std::unique_ptr<ptrdiff_t,CudaDeleter> cleanup_fsLim(d_fsLim);
+        std::unique_ptr<int,CudaDeleter> cleanup_fsLim(d_fsLim);
         if (cudaStat != cudaSuccess)
             mexErrMsgIdAndTxt(cudaErrId, "Failed to allocate CUDA memory");
         cudaStat = cudaMemcpyAsync(d_fsLim, fsLim0.data(), T*2*sizeof(d_fsLim),
@@ -313,39 +305,42 @@ void computeFrameSums(ptrdiff_t D, ptrdiff_t N, ptrdiff_t K, ptrdiff_t T,
         cudaGetDevice(&deviceNo);
         cudaDeviceProp prop;
         cudaGetDeviceProperties(&prop, deviceNo);
-        // Figure out how many clusters we can do at once
-        int maxK_thread = prop.maxThreadsPerBlock / (D+1);
-        int maxK_mem = (prop.sharedMemPerBlock/2 - T*sizeof(d_fsLim)) /
-                (READ_BUFF_SZ*sizeof(d_Y)) - D;
+        // Design for the worst-case scenario in terms of # frames per block
+        int maxT_eff = 64; // I decided this
+        // Figure out how many clusters we can do and still have 2 blocks per MP
+        int maxThreads = prop.maxThreadsPerMultiProcessor;
+        int maxK_thread = (maxThreads/2) / (D+1);
+        int maxMem = prop.sharedMemPerMultiprocessor;
+        int maxK_mem = ((maxMem/2) - maxT_eff*sizeof(d_fsLim))
+                / (READ_BUFF_SZ*sizeof(d_Y)) - D;
         int maxK = std::min(maxK_thread, maxK_mem);
+        // If we can't do all of the clusters at once, try to spread them evenly
         int K_eff, grid_x;
-        if (maxK < K) {
-            // If we can't do them all at once, try to spread them evenly
-            grid_x = (K + maxK-1)/maxK;
-            K_eff = (K + grid_x-1)/grid_x;
-        } else {
-            // We can do all the clusters at once
+        if (maxK >= K) {
             grid_x = 1;
             K_eff = K;
+        } else {
+            grid_x = (K + maxK-1)/maxK;
+            K_eff = (K + grid_x-1)/grid_x;
         }
-        // Figure out how many threads per block
+        // This determines the threads per block and the memory usage
        int nWarps = (D*K_eff + K_eff + 31)/32;
         dim3 threadsPerBlock(32, nWarps);
+        int memPerBlock = (D+K_eff)*READ_BUFF_SZ*sizeof(d_Y) +
+                maxT_eff*sizeof(d_fsLim);
         // Figure out how many blocks in the grid
-        int blocksPerDevice = prop.multiProcessorCount * (maxK/K_eff);
-        int grid_y = std::min((int) T, 2*blocksPerDevice);
+        int blocksPerMP = std::min(maxThreads/(64*nWarps), maxMem/memPerBlock);
+        int grid_y = 2 * prop.multiProcessorCount * blocksPerMP;
+        grid_y = std::max(grid_y, (T+maxT_eff-1)/maxT_eff);
+        grid_y = std::min(grid_y, T);
         dim3 blocksPerGrid(grid_x, grid_y);
-        // Figure out how much memory per block
-        int T_eff = (T + grid_y-1)/grid_y;
-        int memPerBlock = (D+K_eff)*READ_BUFF_SZ*sizeof(d_Y) +
-                T_eff*sizeof(d_fsLim);
         // Launch the kernel
         sumFrames<<<blocksPerGrid, threadsPerBlock, memPerBlock>>>
                 (D, N, K, T, d_Y, d_wzu, d_fsLim, d_wzuY, d_sumwzu);
     } else {
-        /* For larger problems (or single-precision), we turn to cuBLAS */
+        /* For larger problems, we turn to cuBLAS */
 
         // Initialize the cuBLAS context
         cublasHandle_t handle;
@@ -372,7 +367,7 @@ void computeFrameSums(ptrdiff_t D, ptrdiff_t N, ptrdiff_t K, ptrdiff_t T,
         // For loop over time frames
         for (int t=0; t<T; t++) {
[...]
-    ptrdiff_t D = static_cast<ptrdiff_t>(dims[0]);
-    ptrdiff_t N = static_cast<ptrdiff_t>(dims[1]);
+    int D = dims[0];
+    int N = dims[1];
     // wzu (weights) = input 1
     mxArray const *mx_wzu = prhs[1];
     if (!mxIsGPUArray(mx_wzu))
@@ -437,21 +432,21 @@ void mexFunction(int nlhs, mxArray *plhs[], int nrhs, mxArray const *prhs[])
             (mxGPUGetClassID(mgpu_wzu)!=numericType))
         mexErrMsgIdAndTxt(errId, "wzu must be a 2-D gpuArray of the same type as Y");
     dims = mxGPUGetDimensions( mgpu_wzu );
-    ptrdiff_t N_wzu = static_cast<ptrdiff_t>(dims[0]);
-    ptrdiff_t K = dims[1];
+    int N_wzu = dims[0];
+    int K = dims[1];
     if (N_wzu != N)
         mexErrMsgIdAndTxt(errId, "wzu must be a [N x K] gpuArray");
     // fsLim (frame spike limits) = input 2
     mxArray const *mx_fsLim = prhs[2];
-    ptrdiff_t T = mxGetM(mx_fsLim);
+    int T = mxGetM(mx_fsLim);
     if (!mxIsDouble(mx_fsLim) || (T==0) || (mxGetN(mx_fsLim)!=2))
         mexErrMsgIdAndTxt(errId, "f_spklim must be a [T x 2] array of doubles");
     double const *fsLim = mxGetPr(mx_fsLim);
     // Copy the fsLim indices to a vector of 0-indexed integers
-    std::vector<ptrdiff_t> fsLim0(T*2);
+    std::vector<int> fsLim0(T*2);
     std::transform(fsLim, fsLim+2*T, fsLim0.begin(),
-            [](double matlabIdx){ return ((ptrdiff_t) matlabIdx)-1; });
+            [](double matlabIdx){ return ((int) matlabIdx)-1; });
     // Allocate memory for the outputs
     std::vector<size_t> dims_wzuY = {(size_t)D, (size_t)K, (size_t)T};
@@ -488,12 +483,12 @@ void mexFunction(int nlhs, mxArray *plhs[], int nrhs, mxArray const *prhs[])
 
 #else
 
 template <typename numeric_t>
-void demo_sumFrames(ptrdiff_t D, ptrdiff_t N, ptrdiff_t K, ptrdiff_t T)
+void demo_sumFrames(int D, int N, int K, int T)
 {
     // Create the data
     thrust::device_vector<numeric_t> Y(D*N,1);
     thrust::device_vector<numeric_t> wzu(N*K,1);
-    std::vector<ptrdiff_t> fsLim(T*2);
+    std::vector<int> fsLim(T*2);
     for (int t=0; t<T; t++) {
[...]
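
The kernel carves one dynamically sized shared-memory arena into typed sub-buffers rather than declaring separate __shared__ arrays; that is what the reinterpret_cast arithmetic in the hunk at old line 81 is doing. The toy program below isolates that layout trick so it can be compiled and inspected on its own. It is a sketch for illustration, not code from this patch: the kernel name and parameter values are made up, while READ_BUFF_SZ and the offset arithmetic mirror the patch.

#include <cstdio>
#include <cuda_runtime.h>

int const READ_BUFF_SZ = 16;

// Toy kernel: carve the single dynamic shared-memory arena S[] into a
// numeric_t spike buffer, a numeric_t weight buffer, and an int array of
// per-frame spike counts, mirroring the layout in the patched sumFrames().
template <typename numeric_t>
__global__ void sharedLayoutDemo(int D, int kPerBlk, int T_eff)
{
    extern __shared__ int S[];
    int offset = 0;
    numeric_t *spikeBuff = reinterpret_cast<numeric_t*>(S + offset);
    offset += D * READ_BUFF_SZ * sizeof(*spikeBuff)/sizeof(*S);
    numeric_t *weightBuff = reinterpret_cast<numeric_t*>(S + offset);
    offset += kPerBlk * READ_BUFF_SZ * sizeof(*weightBuff)/sizeof(*S);
    int *spkCounts = S + offset;
    // Touch each region and report where the int-typed tail begins
    if (threadIdx.x == 0) {
        spikeBuff[0] = 1;
        weightBuff[0] = 2;
        spkCounts[T_eff-1] = 3;
        printf("weights begin at int-offset %d, counts at int-offset %d\n",
                (int)(D*READ_BUFF_SZ*sizeof(numeric_t)/sizeof(int)), offset);
    }
}

int main()
{
    int D = 12, kPerBlk = 4, T_eff = 64;
    // Dynamic shared memory: two numeric_t buffers plus T_eff ints
    size_t smem = (D+kPerBlk)*READ_BUFF_SZ*sizeof(float) + T_eff*sizeof(int);
    sharedLayoutDemo<float><<<1, 32, smem>>>(D, kPerBlk, T_eff);
    cudaDeviceSynchronize();
    return 0;
}

Measuring offsets in units of sizeof(*S) keeps the pointer arithmetic valid for both instantiations: one double occupies two ints in the arena, one float occupies one.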
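The replacement launch heuristic in the hunk at old line 313 is easier to follow when the arithmetic can be run by itself. The sketch below reproduces it on the host under stated assumptions: planSumFramesLaunch and MAX_T_EFF are hypothetical names (the patch calls the latter maxT_eff and fixes it at 64), and sizes here are computed from the element size, whereas the patch writes sizeof(d_Y) and sizeof(d_fsLim), the pointer sizes, which reserve a conservative 8 bytes per element on 64-bit platforms.

#include <algorithm>
#include <cstdio>
#include <cuda_runtime.h>

int const READ_BUFF_SZ = 16;
int const MAX_T_EFF = 64;   // cap on time frames per block, as in the patch

// Hypothetical host-side helper that re-derives the patch's launch plan.
// elemSize is sizeof(float) or sizeof(double).
void planSumFramesLaunch(int D, int K, int T, int elemSize)
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    int maxThreads = prop.maxThreadsPerMultiProcessor;
    int maxMem = (int) prop.sharedMemPerMultiprocessor;
    // Budget half the MP's threads and shared memory so 2 blocks can be
    // resident at once; each cluster needs D+1 threads and one
    // READ_BUFF_SZ-wide buffer row (after D rows for the spike buffer)
    int maxK_thread = (maxThreads/2) / (D+1);
    int maxK_mem = ((maxMem/2) - MAX_T_EFF*(int)sizeof(int))
            / (READ_BUFF_SZ*elemSize) - D;
    int maxK = std::min(maxK_thread, maxK_mem);
    // Spread the K clusters evenly over grid.x if one block can't hold them
    int grid_x = (maxK >= K) ? 1 : (K + maxK-1)/maxK;
    int K_eff = (K + grid_x-1)/grid_x;
    // One thread per (d,k) product plus one per k for sum(w), whole warps
    int nWarps = (D*K_eff + K_eff + 31)/32;
    int memPerBlock = (D+K_eff)*READ_BUFF_SZ*elemSize
            + MAX_T_EFF*(int)sizeof(int);
    // Enough blocks in grid.y for ~2 resident blocks per MP, but at least
    // ceil(T/MAX_T_EFF) so no block sees more than MAX_T_EFF frames
    int blocksPerMP = std::min(maxThreads/(64*nWarps), maxMem/memPerBlock);
    int grid_y = 2 * prop.multiProcessorCount * blocksPerMP;
    grid_y = std::max(grid_y, (T+MAX_T_EFF-1)/MAX_T_EFF);
    grid_y = std::min(grid_y, T);
    printf("grid = (%d,%d), block = (32,%d), dynamic smem = %d bytes\n",
            grid_x, grid_y, nWarps, memPerBlock);
}

int main()
{
    planSumFramesLaunch(12, 25, 1000, (int)sizeof(float));
    planSumFramesLaunch(12, 25, 1000, (int)sizeof(double));
    return 0;
}

The reordering is the substance of the change: the old code derived the per-block frame count T_eff from grid_y and only then knew memPerBlock, while the new code caps frames per block at maxT_eff up front, so the memory footprint is fixed first and grid_y can be chosen purely for occupancy.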
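For D > 32 the patch keeps the cuBLAS path, which the file header describes as GEMM for Y*wzu plus GEMV with a vector of ones for sum(wzu); the loop body itself falls outside the visible hunks. The following self-contained program is one way to realize that recipe in single precision. The variable names and the evenly spaced frame limits (borrowed from the demo_sumFrames test harness) are illustrative, not taken from the patch.

#include <cstdio>
#include <vector>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>

int main()
{
    int D = 4, N = 100, K = 3, T = 5;
    thrust::device_vector<float> Y(D*N, 1.0f);      // spikes  [D x N]
    thrust::device_vector<float> wzu(N*K, 0.5f);    // weights [N x K]
    thrust::device_vector<float> wzuY(D*K*T, 0.0f); // output  [D x K x T]
    thrust::device_vector<float> sumwzu(K*T, 0.0f); // output  [K x T]
    thrust::device_vector<float> ones(N, 1.0f);     // for summing weights
    // Evenly spaced, 0-indexed frame limits [n1,n2]
    std::vector<int> n1(T), n2(T);
    for (int t=0; t<T; t++) { n1[t] = (N*t)/T; n2[t] = (N*(t+1))/T - 1; }

    float const *d_Y    = thrust::raw_pointer_cast(Y.data());
    float const *d_wzu  = thrust::raw_pointer_cast(wzu.data());
    float const *d_ones = thrust::raw_pointer_cast(ones.data());
    float *d_wzuY   = thrust::raw_pointer_cast(wzuY.data());
    float *d_sumwzu = thrust::raw_pointer_cast(sumwzu.data());

    cublasHandle_t handle;
    cublasCreate(&handle);
    float const alpha = 1.0f, beta = 0.0f;
    for (int t=0; t<T; t++) {
        int nSpk = n2[t] - n1[t] + 1;
        if (nSpk <= 0) continue;   // empty frame, outputs stay zero
        // wzuY(:,:,t) = Y(:,n1:n2) * wzu(n1:n2,:)
        cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, D, K, nSpk,
                &alpha, d_Y + (size_t)D*n1[t], D,
                d_wzu + n1[t], N,
                &beta, d_wzuY + (size_t)D*K*t, D);
        // sumwzu(:,t) = wzu(n1:n2,:)' * ones(nSpk,1)
        cublasSgemv(handle, CUBLAS_OP_T, nSpk, K,
                &alpha, d_wzu + n1[t], N,
                d_ones, 1,
                &beta, d_sumwzu + (size_t)K*t, 1);
    }
    cudaDeviceSynchronize();
    float y0 = wzuY[0], s0 = sumwzu[0];
    printf("wzuY(1,1,1) = %g, sumwzu(1,1) = %g\n", y0, s0);
    cublasDestroy(handle);
    return 0;
}

With every spike equal to 1 and every weight equal to 0.5, each 20-spike frame should report 10 for both outputs, which makes it easy to verify the pointer offsets and leading dimensions.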