From e583ef80a2f363ff13559236ba9ddd048b859cae Mon Sep 17 00:00:00 2001
From: Kevin Shan
Date: Wed, 7 Dec 2016 12:38:59 -0800
Subject: [PATCH] Minor changes to GPU MEX routines

weightCovGpu
* Switched from "paged" approach with dedicated load threads to a simpler
  version, to try to make the code more readable (in preparation for future
  single-precision support).
* Reduced round-off error by performing the sum in two stages (three if you
  count the batching).
* Switched to 16-byte (128-bit) loads from A to reduce load latency. Didn't
  seem to make a difference in the benchmark test case, though. Can't do this
  for w because of the alignment requirements. (I tried.)

sumFramesGpu
* Bugfix the frame index validation. We were previously comparing a signed to
  an unsigned integer, and negative values were problematic.
* Improved readability of some parts of the kernel code.
* Reduced round-off error by performing the sum in two stages.
---
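
The "sum in two stages" mentioned for both kernels boils down to the pattern
below. This is a minimal standalone sketch with made-up names (twoStageSum,
partial, total), not the kernel code itself; in the kernels the per-buffer
partial sum is folded into a per-thread running total, and the per-block
batching acts as the third stage.

    // Sketch: sum N doubles in runs of 32, then fold each run into the
    // running total. Short runs keep the addends closer in magnitude,
    // which is where the round-off improvement comes from.
    __host__ __device__ double twoStageSum(double const *x, int N)
    {
        double total = 0.0;
        int n = 0;
        for ( ; n + 32 <= N; n += 32) {
            double partial = 0.0;       // stage 1: sum a 32-element run
            for (int k = 0; k < 32; k++)
                partial += x[n + k];
            total += partial;           // stage 2: fold the run into the total
        }
        for ( ; n < N; n++)             // leftover elements (fewer than 32)
            total += x[n];
        return total;
    }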
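
The 16-byte loads from A are the usual double2 trick, sketched below with a
hypothetical helper (loadPanel16B); the real kernel does the equivalent
inline. It assumes the source pointer is 16-byte aligned and the element
count is even; the alignment requirement is the constraint mentioned above
for w.

    // Sketch: cooperatively copy `count` doubles from global to shared
    // memory as double2 (16-byte) transactions. If `src` is not 16-byte
    // aligned or `count` is odd, plain double loads must be used instead.
    __device__ void loadPanel16B(double const *src, double *dst,
                                 int count, int tid, int nThreads)
    {
        double2 const *src2 = reinterpret_cast<double2 const *>(src);
        double2 *dst2 = reinterpret_cast<double2 *>(dst);
        for (int idx = tid; idx < count/2; idx += nThreads)
            dst2[idx] = src2[idx];      // one 16-byte load and one 16-byte store
    }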
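
The sumFramesGpu bugfix is the usual signed/unsigned comparison trap: with
one signed and one unsigned operand, the signed side is converted to
unsigned, so a negative frame index wraps around instead of comparing as
negative. A minimal host-side illustration (hypothetical names, not the
validation code from the file):

    #include <cstdio>
    int main()
    {
        long frameIdx = -1;             // mathematically, -1 < 100
        unsigned long nFrames = 100;
        // Mixed comparison: frameIdx is converted to unsigned and wraps to a
        // huge value, so the test gives the mathematically wrong answer.
        std::printf("mixed  : %d\n", frameIdx < nFrames);          // prints 0
        // Comparing in a signed type behaves as intended.
        std::printf("signed : %d\n", frameIdx < (long) nFrames);   // prints 1
        return 0;
    }
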
 @MoDT/mex_src/sumFramesGpu.cu |  53 ++++-----
 @MoDT/mex_src/weightCovGpu.cu | 196 ++++++++++++++++------------------
 2 files changed, 122 insertions(+), 127 deletions(-)

diff --git a/@MoDT/mex_src/sumFramesGpu.cu b/@MoDT/mex_src/sumFramesGpu.cu
index e92a2e9..af3792e 100644
--- a/@MoDT/mex_src/sumFramesGpu.cu
+++ b/@MoDT/mex_src/sumFramesGpu.cu
@@ -32,7 +32,7 @@
 #include
 #include "cublas_v2.h"
-#include
+#include
 #include
 #include
 #include
@@ -84,11 +84,11 @@ __global__ void sumFrames( int const D, long const N, int const K, int const T,
     wSum += kOffset + K*frameOffset;
     // Lay out our shared memory
     int offset = 0;
-    double *spikeBuff = (double*) &S[offset];
+    double *spikeBuff = reinterpret_cast<double*> (S + offset);
     offset += D * READ_BUFF_SZ * sizeof(*spikeBuff)/sizeof(*S);
-    double *weightBuff = (double*) &S[offset];
+    double *weightBuff = reinterpret_cast<double*> (S + offset);
     offset += kPerBlk * READ_BUFF_SZ * sizeof(*weightBuff)/sizeof(*S);
-    int *spkCounts = (int*) &S[offset];
+    int *spkCounts = reinterpret_cast<int*> (S + offset);
     // Copy the spike counts into shared memory
     int const tIdx = threadIdx.x + threadIdx.y*blockDim.x;
     int const tPerBlk = blockDim.x * blockDim.y;
@@ -96,7 +96,6 @@ __global__ void sumFrames( int const D, long const N, int const K, int const T,
         spkCounts[ii] = (int) (spkLims[ii+T] - spkLims[ii] + 1);
 
     // Determine what this tread is responsible for computing
-    int const tIdx_rev = tPerBlk-tIdx-1;
     bool threadHasValidCompute = false;
     double *spkCompBuff, *wCompBuff, *outputTgt;
     int spkCompStride, outputStride;
@@ -111,20 +110,20 @@ __global__ void sumFrames( int const D, long const N, int const K, int const T,
         wCompBuff = weightBuff + READ_BUFF_SZ*k;
         outputTgt = spkSum + d + D*k;
         outputStride = D*K;
-    } else if (tIdx_rev < kCount) {
+    } else if (tIdx - D*kCount < kCount) {
         threadHasValidCompute = true;
         // Thread computes wSum(k,t) = sum(weights(:,k))
-        int k = tIdx_rev;
+        int k = tIdx - D*kCount;
         spkCompBuff = &one;
         spkCompStride = 0;
         wCompBuff = weightBuff + READ_BUFF_SZ*k;
         outputTgt = wSum + k;
         outputStride = K;
     }
-
-    // Some useful values for determining what to do when loading data
-    int const spkLoadThreads = D * READ_BUFF_SZ;
-    int const wLoadThreads = READ_BUFF_SZ * kCount;
+    // Determine what to do when loading
+    int const loadOps_spk = D * READ_BUFF_SZ;
+    int const loadOps_w = READ_BUFF_SZ * kCount;
+    int const tIdx_2 = (tIdx + 32*((loadOps_spk+31)/32)) % tPerBlk;
     // Main loop over time frames
     for (int frameIdx=0; frameIdx= READ_BUFF_SZ) {
             // Load from spikes
-            for (int ldIdx=tIdx; ldIdx
-    std::vector<size_t> dims_wzuY = {D, K, T};
+    std::vector<size_t> dims_wzuY = {(size_t)D, (size_t)K, (size_t)T};
     mxGPUArray *mgpu_wzuY = mxGPUCreateGPUArray( 3, dims_wzuY.data(), mxDOUBLE_CLASS, mxREAL, MX_GPU_INITIALIZE_VALUES );
     mxGPUArrayCleanupWrapper cleanup_wzuY(mgpu_wzuY);
     double *d_wzuY = (double *) mxGPUGetData(mgpu_wzuY);
-    std::vector<size_t> dims_sumwzu = {K, T};
+    std::vector<size_t> dims_sumwzu = {(size_t)K, (size_t)T};
     mxGPUArray *mgpu_sumwzu = mxGPUCreateGPUArray( 2, dims_sumwzu.data(), mxDOUBLE_CLASS, mxREAL, MX_GPU_INITIALIZE_VALUES );
     mxGPUArrayCleanupWrapper cleanup_sumwzu(mgpu_sumwzu);
@@ -312,7 +315,7 @@ void mexFunction(int nlhs, mxArray *plhs[], int nrhs, mxArray const *prhs[])
         K_eff = K;
     }
     // Figure out how many threads per block
-    int nWarps = (D*K_eff+31)/32 + (K_eff+31)/32;
+    int nWarps = (D*K_eff + K_eff + 31)/32;
     dim3 threadsPerBlock(32, nWarps);
     // Figure out how many blocks in the grid
     int blocksPerDevice = prop.multiProcessorCount * (maxK/K_eff);
diff --git a/@MoDT/mex_src/weightCovGpu.cu b/@MoDT/mex_src/weightCovGpu.cu
index 92bc282..f4c3577 100644
--- a/@MoDT/mex_src/weightCovGpu.cu
+++ b/@MoDT/mex_src/weightCovGpu.cu
@@ -29,117 +29,102 @@
 #include
 #include
+
 /* CUDA kernel for computing A*diag(w)*A'
 *
 * This expects:
- * - 1-D thread block of size [32*loadWarps + D*(D+1)]
- * - Shared memory to hold (D+1)*64 doubles
+ * - 1-D thread block with at least D*(D+1)/2 threads
+   (it should also have at least 32 threads)
+ * - Shared memory to hold (D+1)*32 doubles
 * - 1-D block grid of any size
 *
- * This kernel uses (32*loadWarps) non-computing threads to load data into
- * shared memory and D*(D+1) compute threads to compute the weighted covariance.
 *
 * Inputs:
 *    D          # rows in A, and dimension of C
 *    N          # columns in A, and entries in w
- *    loadWarps  # warps (32 threads per warp) to dedicate to loading data
 *    A          [D x N] data matrix
 *    w          [N] weight vector
 * Outputs:
 *    C          [D x D x #blocks] matrix to store the output
 */
+int const READ_BUFF_SZ = 32;
 __global__ void calcWeightedCov(
-        int const D, int const N, int const loadWarps,
-        double const * __restrict__ A, double const * __restrict__ w,
-        double *C)
+        int const D, int const N, double const *A, double const *w, double *C)
 {
-    /* Dynamically-allocated shared memory, length = (D+1)*64
-     * This is split into 2 pages of size (32*D+32), laid out as:
-     *     [ A_page0 ; w_page0; A_page1 ; w_page1 ]
-     * In our main loop, we alternate pages so that the next load can occur
-     * while we are still computing on the previously-loaded page. */
+    // Dynamically-allocated shared memory, length = (D+1)*32
     extern __shared__ double S[];
-    double * A_buff = S;
-    int const A_buffSize = 32*D;
-    double * w_buff = S + A_buffSize;
-    int const w_buffSize = 32;
-    int const pageSize = A_buffSize + w_buffSize;
+    double *w_buff = S;                 // [32 x 1] buffer for weights
+    double *A_buff = S + READ_BUFF_SZ;  // [D x 32] buffer for data matrix
     // Figure out what this thread is supposed to be doing
-    int const loadThreads = 32 * loadWarps;
-    int const tIsLoad = (threadIdx.x < loadThreads);
-    int const tIdx = (tIsLoad) ? (threadIdx.x) : (threadIdx.x-loadThreads);
-    int ii, jj;
-    if (tIsLoad) {
-        /* Since we are loading 32 columns at a time, exactly one warp
-         * will be tasked with loading from w.
-         * ii = wIdx if this thread belongs to that warp, ii = -1 otherwise */
-        ii = ((tIdx/32)==((D+1)%loadWarps)) ? (tIdx%32) : (-1);
-    } else {
-        /* Compute threads are organized into a [D x (D+1)] array.
- *            [ 00a 00b 01b 02b ]
- For D=3:     [ 10a 11a 11b 12b ]
-              [ 20a 21a 22a 22b ]
- The entries marked __a will compute the covariance over
- even-numbered columns and __b will compute the odd columns. */
-        jj = tIdx/D;
-        ii = tIdx - D*jj;
+    int const tIdx = threadIdx.x;
+    int const nThreads = blockDim.x;
+    // Loading phase: the first 32 threads will load w, rest will do A
+    int const readOps_w = READ_BUFF_SZ; // Gonna assume nThreads > READ_BUFF_SZ
+    int const tIdx_A = (tIdx - readOps_w + nThreads) % nThreads;
+    int const readOps_A = D * READ_BUFF_SZ * sizeof(double)/sizeof(double2);
+    // Compute phase: this thread will compute C(ii,jj)
+    int ii = -1, jj = -1;
+    if (tIdx < D*(D+1)/2) {
+        ii = tIdx % D;  jj = tIdx / D;
         if (jj > ii) {
-            jj--;
-            // Shift the pointers up by one column so it does the odd columns
-            A_buff += D;
-            w_buff++;
+            ii = D-1 - ii;  jj = D - jj;
         }
     }
+    // And we will store the result in this local register
+    double result_acc = 0;
-    // Main loop: load and compute the covariance
-    int currPage = 0;       // Start on buffer page 0
-    double result_acc = 0;  // Local accumulator for C(ii, jj)
-    for (int colOffset=blockIdx.x*32; colOffset N) {
-            // This page is incomplete; pad it with zeroes
-            int nLeftover = N - colOffset;
-            for (int idx=tIdx; idx= 0)
-                w_page[ii] = (ii= 0)
-                w_page[ii] = w[ii + colOffset];
-        }
+    // Main loop: grid-stride loop over complete buffers
+    int readOffset = READ_BUFF_SZ * blockIdx.x;
+    for ( ; readOffset <= N-READ_BUFF_SZ; readOffset += READ_BUFF_SZ*gridDim.x) {
+        // Load w from memory
+        if (tIdx < readOps_w)
+            w_buff[tIdx] = w[readOffset + tIdx];
+        // Load A from memory, 16 bytes at a time
+        double2 const *src = reinterpret_cast<double2 const*>(A + D*readOffset);
+        double2 *tgt = reinterpret_cast<double2*>(A_buff);
+        for (int idx = tIdx_A; idx < readOps_A; idx += nThreads)
+            tgt[idx] = src[idx];
+        // Wait for load to complete
+        __syncthreads();
+        // Compute
+        if (ii >= 0) {
+            // Compute the result for this buffer
+            double local_acc = 0;
+            for (int kk=0; kk 0) {
+        // Load remaining w
+        if (tIdx < nLeftover)
+            w_buff[tIdx] = w[readOffset + tIdx];
+        // Load remaining A
+        for (int idx = tIdx_A; idx < D*nLeftover; idx += nThreads)
+            A_buff[idx] = A[D*readOffset + idx];
+        // Wait for load to complete.
         __syncthreads();
         // Compute
-        if (!tIsLoad) {
-            for (int kk=0; kk<32; kk+=2)
-                result_acc += A_page[ii+D*kk] * A_page[jj+D*kk] * w_page[kk];
+        if (ii >= 0) {
+            double local_acc = 0;
+            for (int kk=0; kk= 0) {
+        double *C_block = C + D*D*blockIdx.x;  C_block[ii+D*jj] = result_acc;
+        // Write the other half as well
+        if (ii != jj)
+            C_block[jj+D*ii] = result_acc;
     }
 }
@@ -296,24 +281,31 @@ void mexFunction(int nlhs, mxArray *plhs[], int nrhs, mxArray const *prhs[])
     /* When D is small, we can improve GPU utilization by breaking X into
      * batches, computing the weighted covariance of each batch, and then
      * summing across the batches. */
-    int computeThreads = D*(D+1);
-    int sharedMemPerBlock = (D+1)*64*sizeof(*d_A);
-    int loadWarps = 1;
-    int blocksPerMP = 16;
-    /* Some hardcoded optimization (specific to Compute Capability 3.0) to
-     * maximize the number of load threads per block, but only if it doesn't
-     * affect the total # of blocks per multiprocessor. */
-    int computeWarps = (computeThreads+31)/32;
-    int const sharedMemLimit = 48*1024;
-    int const warpLimit = 64;
-    blocksPerMP = min(blocksPerMP, sharedMemLimit/sharedMemPerBlock);
-    blocksPerMP = min(blocksPerMP, warpLimit/(computeWarps+loadWarps));
-    int extraWarps = warpLimit - (computeWarps+loadWarps)*blocksPerMP;
-    loadWarps += extraWarps/blocksPerMP;
-    /* Figure out how many blocks. More blocks can potentially assist in GPU
-     * scheduling, but we get diminishing returns after we are able to
-     * saturate all available multiprocessors. Assumes 8 MPs. */
-    int numBlocks = min( (int) (N/1024)+1, 8*blocksPerMP );
+
+    // Figure out how to maximize the kernel residency
+    int deviceNo;
+    cudaGetDevice(&deviceNo);
+    cudaDeviceProp prop;
+    cudaGetDeviceProperties(&prop, deviceNo);
+    int numMPs = prop.multiProcessorCount;
+    /* I haven't found a way to programatically determine the maximum number
+     * of blocks per multiprocessor or the total shared memory per MP, so
+     * this is going to be hardcoded for Compute Capability 3.0 */
+    int maxWarpsPerMP = 64;
+    int maxSharedMemPerMP = 48*1024;
+    int maxBlocksPerMP = 16;
+    // Determine the maximum number of blocks per multiprocessor (MP)
+    int minWarpsPerBlock = (D*(D+1)/2 + 31)/32;
+    int sharedMemPerBlock = (D+1)*32*sizeof(*d_A);
+    int blocksPerMP = std::min( maxWarpsPerMP/minWarpsPerBlock,
+            maxSharedMemPerMP/sharedMemPerBlock );
+    blocksPerMP = std::min(blocksPerMP, maxBlocksPerMP);
+    // Allocate additional warps if the limit allows. Although these won't
+    // participate in computation, they can help load data.
+    int warpsPerBlock = std::max(minWarpsPerBlock, maxWarpsPerMP/blocksPerMP);
+    // And decide on how many blocks total
+    int numBlocks = std::min( (int)(N/1024)+1, blocksPerMP*numMPs );
+
     // Allocate memory for the outputs
     double *d_CBatched;
     cudaError_t cudaStat = cudaMalloc((void**)&d_CBatched,
@@ -321,9 +313,9 @@ void mexFunction(int nlhs, mxArray *plhs[], int nrhs, mxArray const *prhs[])
     if (cudaStat != cudaSuccess)
         mexErrMsgIdAndTxt(cudaErrId, "Failed to allocate CUDA memory");
     // Launch our CUDA kernel to calculate the weighted covariance
-    int threadsPerBlock = computeThreads + 32*loadWarps;
+    int threadsPerBlock = 32*warpsPerBlock;
     calcWeightedCov<<<numBlocks, threadsPerBlock, sharedMemPerBlock>>>
-            (D, N, loadWarps, d_A, d_w, d_CBatched);
+            (D, N, d_A, d_w, d_CBatched);
     // Sum the batched results
     sumColumns<<<(D*D+31)/32, 32>>>(D*D, numBlocks, d_CBatched, d_C);
     // Free the memory that we'd allocated for the outputs
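
For reference, the batch-then-sum structure that the weightCovGpu host code
sets up (each block accumulates a partial D x D product over its share of
the columns, and sumColumns adds the per-block slices) can be mimicked by a
plain CPU loop. This is a hypothetical standalone example, not part of the
patch:

    #include <cstdio>
    #include <cmath>
    #include <vector>
    int main()
    {
        int const D = 3, N = 10, batch = 4;
        std::vector<double> A(D*N), w(N), C(D*D, 0.0);
        for (int i = 0; i < D*N; i++) A[i] = std::sin(i + 1.0);
        for (int n = 0; n < N; n++)   w[n] = 1.0/(n + 1.0);
        // Each "block" handles a batch of columns and accumulates its own
        // partial C; the partials are then summed at the end, mirroring the
        // calcWeightedCov + sumColumns pair.
        for (int n0 = 0; n0 < N; n0 += batch) {
            std::vector<double> Cpart(D*D, 0.0);
            int n1 = (n0 + batch < N) ? (n0 + batch) : N;
            for (int n = n0; n < n1; n++)
                for (int j = 0; j < D; j++)
                    for (int i = 0; i < D; i++)
                        Cpart[i + D*j] += A[i + D*n] * A[j + D*n] * w[n];
            for (int k = 0; k < D*D; k++)
                C[k] += Cpart[k];       // the "sum the batched results" step
        }
        std::printf("C(0,0) = %g\n", C[0]);
        return 0;
    }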