From 6885ff78d0532f325df5128dc392b24ab55e2149 Mon Sep 17 00:00:00 2001
From: mkrumin <mkrumin@gmail.com>
Date: Wed, 2 Nov 2016 18:47:16 +0000
Subject: [PATCH] speed optimization for svd and clustering (mostly for GPU
 users)

- large matrix multiplications are now done on GPU (usually x2
improvement in speed for 'single' arrays)
- slightly better vectorization and memory management
- saving with '-v6' when possible (not everywhere yet)
---
 cellDetection/fastClustNeuropilCoef.m | 48 ++++++++++++++----------
 registration/reg2P.m                  | 14 ++++---
 signalExtraction/get_regions.m        | 26 ++++++++-----
 svd/get_svdForROI.m                   | 47 ++++++++++++++++-------
 svd/get_svdcomps.m                    | 54 ++++++++++++++++++---------
 svd/gpuBlockSmallXtY.m                | 41 ++++++++++++++++++++
 svd/gpuBlockXY.m                      | 39 +++++++++++++++++++
 svd/gpuBlockXtX.m                     | 54 +++++++++++++++++++++++++++
 svd/gpuBlockXtY.m                     | 38 +++++++++++++++++++
 utils/normc.m                         |  5 ++-
 10 files changed, 300 insertions(+), 66 deletions(-)
 create mode 100644 svd/gpuBlockSmallXtY.m
 create mode 100644 svd/gpuBlockXY.m
 create mode 100644 svd/gpuBlockXtX.m
 create mode 100644 svd/gpuBlockXtY.m

diff --git a/cellDetection/fastClustNeuropilCoef.m b/cellDetection/fastClustNeuropilCoef.m
index 9211821f..f90305cc 100644
--- a/cellDetection/fastClustNeuropilCoef.m
+++ b/cellDetection/fastClustNeuropilCoef.m
@@ -1,5 +1,5 @@
 function [ops, stat, res] = fastClustNeuropilCoef(ops, U, Sv)
-% 
+%
 U =  reshape(U, [], size(U,ndims(U)));
 iplane = ops.iplane;
 U = bsxfun(@times, U, Sv'.^.5)';
@@ -18,7 +18,7 @@
 
 xs = repmat(round(linspace(1, nsqrt, Lx)), Ly, 1);
 ys = repmat(round(linspace(1, nsqrt, Ly))', 1, Lx);
-iclust = xs + (ys-1) * nsqrt; 
+iclust = xs + (ys-1) * nsqrt;
 
 clear xs ys
 
@@ -26,13 +26,13 @@
 
 % xs = repmat(1:Lx, Ly, 1);
 % ys = repmat((1:Ly)', 1, Lx);
-% 
+%
 % randx = rand(1, Nk) * Lx;
 % randy = rand(1, Nk) * Ly;
-% 
+%
 % dx = repmat(xs(:), 1, Nk) - repmat(randx, numel(xs(:)), 1);
 % dy = repmat(ys(:), 1, Nk) - repmat(randy, numel(ys(:)), 1);
-% 
+%
 % dxy = dx.^2 + dy.^2;
 % [~, iclust] = min(dxy, [], 2);
 %%
@@ -61,7 +61,7 @@
 
 ison = true(Nk,1);
 TileFactor = getOr(ops, {'TileFactor'}, 1); % this option can be overwritten by the user
-nTiles = ceil(TileFactor * (Ly+Lx)/2 / (10 * ops.diameter)); % neuropil is modelled as nTiles by nTiles 
+nTiles = ceil(TileFactor * (Ly+Lx)/2 / (10 * ops.diameter)); % neuropil is modelled as nTiles by nTiles
 
 xc = linspace(1, Lx, nTiles);
 yc = linspace(1, Ly, nTiles);
@@ -106,7 +106,7 @@
     StU = Sm' * Uneu';
     Lam = (StS + 1e-4 * eye(nBasis)) \ StU;
     
-    % recompute neuropil pixel contribution 
+    % recompute neuropil pixel contribution
     neuropil = Lam' * S';
     PixL = mean(bsxfun(@times, neuropil, Uneu), 1);
     PixL = bsxfun(@rdivide, PixL, mean(neuropil.^2,1));
@@ -124,18 +124,28 @@
         end
     end
     %
-%     Ff = Fs' * vs;
-%     vs = Finv * max(0, Ff);
-%     [dcell, Ffr] = run_deconvolution2(Ff, f0, kernel);    
-%     vs = Finv * Ffr;
+    %     Ff = Fs' * vs;
+    %     vs = Finv * max(0, Ff);
+    %     [dcell, Ffr] = run_deconvolution2(Ff, f0, kernel);
+    %     vs = Finv * Ffr;
     
     vs = bsxfun(@rdivide, vs, sum(vs.^2,1).^.5 + 1e-8);% normalize activity vectors
+    vs = single(vs);
     
     % recompute pixels' assignments
-    xs          = vs' * Ucell;
+    if ops.useGPU
+        xs          = gpuBlockSmallXtY(vs, Ucell);
+    else
+        xs          = vs' * Ucell;
+    end
+    
     [M, iclust] = max(xs,[],1);
+    %     Uneu        = U - bsxfun(@times, M, vs(:,iclust)); %what's left over for neuropil model
     Uneu        = U - bsxfun(@times, M, vs(:,iclust)); %what's left over for neuropil model
-    err(k)      = sum(sum((Uneu-neuropil).^2)).^.5;
+    %     vs = double(vs);
+    
+    %     err(k)      = sum(sum((Uneu-neuropil).^2)).^.5;
+    err(k)      = norm(Uneu(:)-neuropil(:));
     
     if 1
         %---------------------------------------------%
@@ -177,20 +187,20 @@
     
     
     if (rem(k,10)==1 || k==niter) && ops.ShowCellMap
-%         imagesc(reshape(PixL, Ly, Lx), [0 2])
-%         drawnow
-%         
+        %         imagesc(reshape(PixL, Ly, Lx), [0 2])
+        %         drawnow
+        %
         lam = M;
         for i = 1:Nk
             ix = find(iclust==i);
             nT0 = numel(ix);
             if nT0>0
                 vM = lam(ix);
-%                 vM = vM/sum(vM.^2)^.5;
+                %                 vM = vM/sum(vM.^2)^.5;
                 lam(ix) = vM;
             end
         end
-%         V = max(0, min(10 * reshape(lam, Ly, Lx), 1));
+        %         V = max(0, min(10 * reshape(lam, Ly, Lx), 1));
         V = max(0, min(.5 * reshape(lam, Ly, Lx)/mean(lam(:)), 1));
         H = reshape(r(iclust), Ly, Lx);
         rgb_image = hsv2rgb(cat(3, H, Sat, V));
@@ -199,7 +209,7 @@
         drawnow
         fprintf('residual variance is %2.6f time %2.2f \n', err(k), toc)
     end
-   
+    
 end
 
 lam = M;
diff --git a/registration/reg2P.m b/registration/reg2P.m
index 634e5ad8..2b1bcc07 100644
--- a/registration/reg2P.m
+++ b/registration/reg2P.m
@@ -88,17 +88,19 @@
     end
     
     if ops.showTargetRegistration
-        figure('position', [900 50 900 900])
-        ax = ceil(sqrt(numel(ops1)/2));
+        nRows = floor(sqrt(numel(ops1)));
+        nColumns = ceil(numel(ops1)/nRows);
+        figure('Name', 'Registration Target Frames', ...
+            'Position', [50 50 nColumns*500 nRows*500])
         i0 = 0;
         for i = 1:numPlanes
             for j = 1:size(xFOVs,2)
                 i0 = i0+1;
-                subplot(ax,2*ax,i0)
-                imagesc(ops1{i,j}.mimg)
+                subplot(nRows, nColumns, i0);
+                imagesc(ops1{i,j}.mimg);
                 colormap('gray')
-                title(sprintf('Registration for plane %d, mouse %s, date %s', ...
-                    i, ops.mouse_name, ops.date))
+                axis equal tight
+                title(sprintf('Plane %d, %s_%s', i, ops.mouse_name, ops.date))
             end
         end
         
diff --git a/signalExtraction/get_regions.m b/signalExtraction/get_regions.m
index 7e261f0d..ae5a2836 100644
--- a/signalExtraction/get_regions.m
+++ b/signalExtraction/get_regions.m
@@ -22,9 +22,9 @@
 for k = 1:Nk
     % needs stat, res, neigh
     pixall = stat(k).ipix;
-
-%     minV = clustrules.parent.minPixRelVar * mean(res.M(pixall));
-%     pixall(res.M(pixall)< minV) = [];
+    
+    %     minV = clustrules.parent.minPixRelVar * mean(res.M(pixall));
+    %     pixall(res.M(pixall)< minV) = [];
     
     whclust = 0;
     region = [];
@@ -47,15 +47,23 @@
         
         x0 = xs(pixi); y0 = ys(pixi);
         
-        rs = ((x0 - mean(x0)).^2 + (y0 - mean(y0)).^2).^.5;
-        region(whclust).mrs     = mean(rs);
+        meanX0 = mean(x0);
+        meanY0 = mean(y0);
+        rs = ((x0 - meanX0).^2 + (y0 - meanY0).^2).^.5;
         region(whclust).npix    = numel(pixi);
-        region(whclust).mrs0    = mean(rgridsort(1:region(whclust).npix));
-        region(whclust).med     = [mean(y0) mean(x0)];
+        if region(whclust).npix>1
+            region(whclust).mrs     = mean(rs);
+            region(whclust).mrs0    = mean(rgridsort(1:region(whclust).npix));
+            region(whclust).V       = sum(res.M(pixi));
+        else
+            region(whclust).mrs     = rs;
+            region(whclust).mrs0    = rgridsort(1);
+            region(whclust).V       = res.M(pixi);
+        end
+        region(whclust).med     = [meanY0 meanX0];
         region(whclust).ipix    = pixi;
         region(whclust).lambda  = res.lambda(pixi);
-        region(whclust).V       = sum(res.M(pixi));
-        region(whclust).parent      = k;  
+        region(whclust).parent      = k;
     end
     stat(k).region = region;
     
diff --git a/svd/get_svdForROI.m b/svd/get_svdForROI.m
index 3f13ee7d..f13a4568 100644
--- a/svd/get_svdForROI.m
+++ b/svd/get_svdForROI.m
@@ -15,7 +15,7 @@
 ix = 0;
 fid = fopen(ops.RegFile, 'r');
 
-mov = zeros(Ly, Lx, ops.NavgFramesSVD, 'single');
+mov = zeros(numel(ops.yrange), numel(ops.xrange), ops.NavgFramesSVD, 'single');
 
 while 1
     data = fread(fid,  Ly*Lx*nimgbatch, '*int16');
@@ -30,21 +30,21 @@
     data = bsxfun(@minus, data, mean(data,3));
 %     data = bsxfun(@minus, data, ops.mimg1);
     
-    irange = 1:nt0*floor(size(data,3)/nt0);
-    data = data(:,:, irange);
+    nSlices = nt0*floor(size(data,3)/nt0);
+    if nSlices~=size(data,3)
+        data = data(:,:, 1:nSlices);
+    end
     
     data = reshape(data, Ly, Lx, nt0, []);
-    davg = single(squeeze(mean(data,3)));
+    davg = squeeze(mean(data,3));
       
-    mov(:,:,ix + (1:size(davg,3))) = davg;
+    mov(:,:,ix + (1:size(davg,3))) = davg(ops.yrange, ops.xrange, :);
         
     ix = ix + size(davg,3);    
 end
 fclose(fid);
-%%
-mov(:, :, (ix+1):end) = [];
 
-mov = mov(ops.yrange, ops.xrange, :);
+mov = mov(:, :, 1:ix);
 % mov = mov - repmat(mean(mov,3), 1, 1, size(mov,3));
 %% SVD options
 
@@ -60,8 +60,13 @@
 
 mov             = reshape(mov, [], size(mov,3));
 sdmov           = mean(mov.^2,2).^.5;
-mov             = mov./repmat(sdmov, 1, size(mov,2));
-COV             = mov' * mov/size(mov,1);
+% mov             = mov./repmat(sdmov, 1, size(mov,2));
+mov             = bsxfun(@rdivide, mov, sdmov);
+if ops.useGPU
+    COV             = gpuBlockXtX(mov)/size(mov,1);
+else
+    COV             = mov' * mov/size(mov,1);
+end
 
 ops.nSVDforROI = min(size(COV,1)-2, ops.nSVDforROI);
 
@@ -78,8 +83,13 @@
     Sv              = single(diag(Sv));
 end
 
-U               = normc(mov * V);
+if ops.useGPU
+    U               = normc(gpuBlockXY(mov, V));
+else
+    U               = normc(mov * V);
+end
 U               = single(U);
+
 %%
 fid = fopen(ops.RegFile, 'r');
 
@@ -97,7 +107,11 @@
     data = bsxfun(@minus, data, mean(data,3));
 %     data = bsxfun(@minus, data, ops.mimg1);
     data = data(ops.yrange, ops.xrange, :);
-    Fs(:, ix + (1:size(data,3))) = U' * reshape(data, [], size(data,3));
+    if ops.useGPU
+      Fs(:, ix + (1:size(data,3))) = gpuBlockXtY(U, reshape(data, [], size(data,3)));
+    else
+      Fs(:, ix + (1:size(data,3))) = U' * reshape(data, [], size(data,3));
+    end
     
     ix = ix + size(data,3);
 end
@@ -109,6 +123,11 @@
    mkdir(ops.ResultsSavePath); 
 end
 if getOr(ops, {'writeSVDroi'}, 0)
-    save(sprintf('%s/SVDroi_%s_%s_plane%d.mat', ops.ResultsSavePath, ...
-        ops.mouse_name, ops.date, ops.iplane), 'U', 'Sv', 'V', 'ops');
+    try
+        save(sprintf('%s/SVDroi_%s_%s_plane%d.mat', ops.ResultsSavePath, ...
+            ops.mouse_name, ops.date, ops.iplane), 'U', 'Sv', 'V', 'ops', '-v6');
+    catch
+        save(sprintf('%s/SVDroi_%s_%s_plane%d.mat', ops.ResultsSavePath, ...
+            ops.mouse_name, ops.date, ops.iplane), 'U', 'Sv', 'V', 'ops');
+    end
 end
diff --git a/svd/get_svdcomps.m b/svd/get_svdcomps.m
index bdabf71e..717d0595 100644
--- a/svd/get_svdcomps.m
+++ b/svd/get_svdcomps.m
@@ -20,54 +20,59 @@
 ops.NavgFramesSVD = floor(ntotframes/nt0);
 nimgbatch = nt0 * floor(2000/nt0);
 
+%% load the data 
+
 ix = 0;
 fid = fopen(ops.RegFile, 'r');
-
-mov = zeros(Ly, Lx, ops.NavgFramesSVD, 'single');
+mov = zeros(numel(ops.yrange), numel(ops.xrange), ops.NavgFramesSVD, 'single');
 
 while 1
     data = fread(fid,  Ly*Lx*nimgbatch, '*int16');
     if isempty(data)
         break;
     end
-    data = single(data);
     data = reshape(data, Ly, Lx, []);
     
     % subtract off the mean of this batch
 %     data = data - repmat(ops.mimg1, 1, 1, size(data,3));
     
-    irange = 1:nt0*floor(size(data,3)/nt0);
-    data = data(:,:, irange);
+    nSlices = nt0*floor(size(data,3)/nt0);
+    if nSlices ~= size(data,3)
+        data = data(:,:, 1:nSlices);
+    end
     
     data = reshape(data, Ly, Lx, nt0, []);
-    davg = single(squeeze(mean(data,3)));
+    data = single(data);
+    davg = squeeze(mean(data,3));
     
-    mov(:,:,ix + (1:size(davg,3))) = davg;
+    mov(:,:,ix + (1:size(davg,3))) = davg(ops.yrange, ops.xrange, :);
     
     ix = ix + size(davg,3);
 end
 fclose(fid);
-%%
-mov(:, :, (ix+1):end) = [];
+mov = mov(:, :, 1:ix);
 
-mov = mov(ops.yrange, ops.xrange, :);
 %% SVD options
 
 ops.nSVD = min(ops.nSVD, size(mov,3));
 %
 mov             = reshape(mov, [], size(mov,3));
 % mov             = mov./repmat(mean(mov.^2,2).^.5, 1, size(mov,2));
-COV             = mov' * mov/size(mov,1);
+if ops.useGPU
+    COV             = gpuBlockXtX(mov)/size(mov, 1);
+else
+    COV             = mov' * mov/size(mov,1);
+end
 
 ops.nSVD = min(size(COV,1)-2, ops.nSVD);
 
-
-
 if ops.nSVD<1000 || size(COV,1)>1e4
     [V, Sv]          = eigs(double(COV), ops.nSVD);
 else
     if ops.useGPU
-        [V, Sv]         = svd(gpuArray(double(COV)));
+        gpuCOV = gpuArray(double(COV));
+        [V, Sv]         = svd(gpuCOV);
+        clear gpuCOV;
         V = gather(single(V));
         Sv = gather(single(Sv));
     else
@@ -77,7 +82,12 @@
     Sv              = Sv(1:ops.nSVD, 1:ops.nSVD);
 end
 %%
-U               = normc(mov * V);
+
+if ops.useGPU
+    U               = normc(gpuBlockXY(mov, V));
+else
+    U               = normc(mov*V);
+end
 U               = single(U);
 Sv              = single(diag(Sv));
 
@@ -94,14 +104,18 @@
     if isempty(data)
         break;
     end
-    data = single(data);
     data = reshape(data, Ly, Lx, []);
     
     % subtract off the mean of this batch
     %         data = data - repmat(mean(data,3), 1, 1, size(data,3));
 %     data = data - repmat(ops.mimg1, 1, 1, size(data,3));
     data = data(ops.yrange, ops.xrange, :);
-    Fs(:, ix + (1:size(data,3))) = U' * reshape(data, [], size(data,3));
+    data = single(data);
+    if ops.useGPU
+        Fs(:, ix + (1:size(data,3))) = gpuBlockXtY(U, reshape(data, [], size(data,3)));
+    else
+        Fs(:, ix + (1:size(data,3))) = U' * reshape(data, [], size(data,3));
+    end
     
     ix = ix + size(data,3);
 end
@@ -119,6 +133,12 @@
 end
 
 U = reshape(U, numel(ops.yrange), numel(ops.xrange), []);
+try % this is faster, but is limited to 2GB files
+save(sprintf('%s/SVD_%s_%s_plane%d.mat', ops.ResultsSavePath, ...
+    ops.mouse_name, ops.date, iplane), 'U', 'Sv', 'Vcell', 'ops', '-v6');
+catch % this takes a bit less space, but is significantly slower
 save(sprintf('%s/SVD_%s_%s_plane%d.mat', ops.ResultsSavePath, ...
     ops.mouse_name, ops.date, iplane), 'U', 'Sv', 'Vcell', 'ops');
+end
+
 % keyboard;
\ No newline at end of file
diff --git a/svd/gpuBlockSmallXtY.m b/svd/gpuBlockSmallXtY.m
new file mode 100644
index 00000000..b0e5470c
--- /dev/null
+++ b/svd/gpuBlockSmallXtY.m
@@ -0,0 +1,41 @@
+function out = gpuBlockSmallXtY(X, Y)
+
+% X is small, Y is long (shallow and wide)
+
+g = gpuDevice;
+bytesGPU = g.AvailableMemory;
+infoX = whos('X');
+infoY = whos('Y');
+[hX, wX] = size(X);
+[hY, wY] = size(Y); % hX should be equal to hY
+% the dimensions of 'out' will be wX-by-wY
+bytesPerColumnY = infoY.bytes/wY;
+% theoretically, to keep the tmp, X' and batchY in GPU memory
+% bytesRequired = wX*batchSize + infoX.bytes+batchSize*bytesPerColumnY;
+batchSize = floor((bytesGPU-infoX.bytes)/(bytesPerColumnY+wX));
+batchSize = floor(batchSize/8); % just to be on the safe side
+% also, interestingly with smaller batches the code runs slightly faster
+nBatches = ceil(wY/batchSize);
+
+startIdx = 1:batchSize:wY;
+endIdx = startIdx + batchSize-1;
+endIdx = min(endIdx, wY);
+
+out = zeros([wX, wY], 'like', X);
+out(end) = 1;
+gpuX = gpuArray(X);
+gpuXT = gpuX';
+for iBatch = 1:nBatches
+%     fprintf('Batch %d/%d\n', iBatch, nBatches);
+    batchY = gpuArray(Y(:, startIdx(iBatch):endIdx(iBatch)));
+    tmp = gpuXT*batchY;
+
+% MK - this is an annoying line of code, but it helps Matlab to do memory
+% management on the GPU properly. Otherwise I get OUT OF MEMORY errors
+% maybe it is just my GPU is a bit unstable
+    wait(g);
+    out(:, startIdx(iBatch):endIdx(iBatch)) = gather(tmp);
+end
+
+
+return
diff --git a/svd/gpuBlockXY.m b/svd/gpuBlockXY.m
new file mode 100644
index 00000000..90ad8292
--- /dev/null
+++ b/svd/gpuBlockXY.m
@@ -0,0 +1,39 @@
+function out = gpuBlockXY(X, Y)
+
+% this code is for a very tall X, and small Y
+
+g = gpuDevice;
+bytesGPU = g.AvailableMemory;
+infoX = whos('X');
+infoY = whos('Y');
+[hX, wX] = size(X);
+[hY, wY] = size(Y); % hY should be equal to wX
+% the dimensions of out will be hX-by-wY
+bytesPerRowX = infoX.bytes/hX;
+bytesPerRowY = infoY.bytes/hY;
+% theoretically, to keep the tmp result, X, and Y in GPU memory
+% bytesRequired = bytesPerRowY*batchSize + bytesPerRowX*batchSize + infoY.bytes;
+batchSize = floor((bytesGPU-infoY.bytes)/(bytesPerRowX+bytesPerRowY));
+batchSize = floor(batchSize/8); % just to be on the safe side
+% also, interestingly with smaller batches the code runs slightly faster
+nBatches = ceil(hX/batchSize);
+
+startIdx = 1:batchSize:hX;
+endIdx = startIdx + batchSize-1;
+endIdx = min(endIdx, hX);
+
+out = zeros([hX, wY], 'like', X);
+gpuY = gpuArray(Y);
+for iBatch = 1:nBatches
+%     fprintf('Batch %d/%d\n', iBatch, nBatches);
+    batchX = gpuArray(X(startIdx(iBatch):endIdx(iBatch), :));
+    tmp = batchX*gpuY;
+
+% MK - this is an annoying line of code, but it helps Matlab to do memory
+% management on the GPU properly. Otherwise I get OUT OF MEMORY errors
+% maybe it is just my GPU is a bit unstable
+    wait(g);
+    out(startIdx(iBatch):endIdx(iBatch), :) = gather(tmp);
+end
+
+return
diff --git a/svd/gpuBlockXtX.m b/svd/gpuBlockXtX.m
new file mode 100644
index 00000000..9b43ef81
--- /dev/null
+++ b/svd/gpuBlockXtX.m
@@ -0,0 +1,54 @@
+function out = gpuBlockXtX(A)
+
+g = gpuDevice;
+bytesGPU = g.AvailableMemory;
+infoA = whos('A');
+[h, w] = size(A);
+bytesPerRow = infoA.bytes/h;
+% theoretically, to keep the result and two (A dn A') copies of data
+% bytesRequired = bytesPerRow*w + 2*batchSize*bytesPerRow;
+batchSize = floor((bytesGPU/bytesPerRow-w)/2);
+batchSize = floor(batchSize/8); % just to be on the safe side
+% also, interestingly with smaller batches the code runs slightly faster
+nBatches = ceil(h/batchSize);
+
+startIdx = 1:batchSize:h;
+endIdx = startIdx + batchSize-1;
+endIdx = min(endIdx, h);
+
+out = gpuArray(zeros(w, 'like', A));
+for iBatch = 1:nBatches
+%     fprintf('Batch %d/%d\n', iBatch, nBatches);
+    batchA = gpuArray(A(startIdx(iBatch):endIdx(iBatch), :));
+    out = out + batchA'*batchA;
+    
+    % trying to debug OUT OF MEMORY issues on GPU
+%     batchAT = batchA';
+%     out = out + batchAT*batchA;
+%     clear batchA batchAT;
+
+% MK - this is an annoying line of code, but it helps Matlab to do memory
+% management on the GPU properly. Otherwise I get OUT OF MEMORY errors
+% maybe it is just my GPU is a bit unstable
+    wait(gpuDevice);
+end
+
+out = gather(out);
+
+return
+
+%% this is a non-GPU version
+batchSize = 2^16;
+[h, w] = size(A);
+out = zeros(w, 'like', A);
+nBatches = ceil(h/batchSize);
+startIdx = 1:batchSize:h;
+endIdx = startIdx + batchSize-1;
+endIdx = min(endIdx, h);
+
+for iBatch = 1:nBatches
+    fprintf('Batch %d/%d\n', iBatch, nBatches);
+    batchA = A(startIdx(iBatch):endIdx(iBatch), :);
+    out = batchA'*batchA + out;
+end
+
diff --git a/svd/gpuBlockXtY.m b/svd/gpuBlockXtY.m
new file mode 100644
index 00000000..6db02768
--- /dev/null
+++ b/svd/gpuBlockXtY.m
@@ -0,0 +1,38 @@
+function out = gpuBlockSmallXtY(X, Y)
+
+g = gpuDevice;
+bytesGPU = g.AvailableMemory;
+infoX = whos('X');
+infoY = whos('Y');
+[hX, wX] = size(X);
+[hY, wY] = size(Y); % hY should be equal to hX
+% the dimensions of out will be wX-by-wY
+bytesPerRowX = infoX.bytes/hX;
+bytesPerRowY = infoY.bytes/hY;
+% theoretically, to keep the result and {X', X, and Y} in GPU memory
+% bytesRequired = bytesPerRowX*wY + batchSize*(2*bytesPerRowX+bytesPerRowY);
+batchSize = floor((bytesGPU-bytesPerRowX*wY)/(2*bytesPerRowX+bytesPerRowY));
+batchSize = floor(batchSize/8); % just to be on the safe side
+% also, interestingly with smaller batches the code runs slightly faster
+nBatches = ceil(hX/batchSize);
+
+startIdx = 1:batchSize:hX;
+endIdx = startIdx + batchSize-1;
+endIdx = min(endIdx, hX);
+
+out = gpuArray(zeros([wX, wY], 'like', X));
+for iBatch = 1:nBatches
+%     fprintf('Batch %d/%d\n', iBatch, nBatches);
+    batchX = gpuArray(X(startIdx(iBatch):endIdx(iBatch), :));
+    batchY = gpuArray(Y(startIdx(iBatch):endIdx(iBatch), :));
+    out = out + batchX'*batchY;
+
+% MK - this is an annoying line of code, but it helps Matlab to do memory
+% management on the GPU properly. Otherwise I get OUT OF MEMORY errors
+% maybe it is just my GPU is a bit unstable
+    wait(gpuDevice);
+end
+
+out = gather(out);
+
+return
diff --git a/utils/normc.m b/utils/normc.m
index 9d7d082d..8ea0574e 100644
--- a/utils/normc.m
+++ b/utils/normc.m
@@ -1,3 +1,6 @@
 function v = normc(v)
 
-v = v./repmat(sum(v.^2, 1), [size(v,1),ones(1,ndims(v)-1)]).^.5;
\ No newline at end of file
+% v = v./repmat(sum(v.^2, 1), [size(v,1),ones(1,ndims(v)-1)]).^.5;
+% MK - using bsxfun is faster
+v = bsxfun(@rdivide, v, sum(v.^2, 1).^.5);
+