From 6885ff78d0532f325df5128dc392b24ab55e2149 Mon Sep 17 00:00:00 2001 From: mkrumin Date: Wed, 2 Nov 2016 18:47:16 +0000 Subject: [PATCH] speed optimization for svd and clustering (mostly for GPU users) - large matrix multiplications are now done on GPU (usually x2 improvement in speed for 'single' arrays) - slightly better vectorization and memory management - saving with '-v6' when possible (not everywhere yet) --- cellDetection/fastClustNeuropilCoef.m | 48 ++++++++++++++---------- registration/reg2P.m | 14 ++++--- signalExtraction/get_regions.m | 26 ++++++++----- svd/get_svdForROI.m | 47 ++++++++++++++++------- svd/get_svdcomps.m | 54 ++++++++++++++++++--------- svd/gpuBlockSmallXtY.m | 41 ++++++++++++++++++++ svd/gpuBlockXY.m | 39 +++++++++++++++++++ svd/gpuBlockXtX.m | 54 +++++++++++++++++++++++++++ svd/gpuBlockXtY.m | 38 +++++++++++++++++++ utils/normc.m | 5 ++- 10 files changed, 300 insertions(+), 66 deletions(-) create mode 100644 svd/gpuBlockSmallXtY.m create mode 100644 svd/gpuBlockXY.m create mode 100644 svd/gpuBlockXtX.m create mode 100644 svd/gpuBlockXtY.m diff --git a/cellDetection/fastClustNeuropilCoef.m b/cellDetection/fastClustNeuropilCoef.m index 9211821f..f90305cc 100644 --- a/cellDetection/fastClustNeuropilCoef.m +++ b/cellDetection/fastClustNeuropilCoef.m @@ -1,5 +1,5 @@ function [ops, stat, res] = fastClustNeuropilCoef(ops, U, Sv) -% +% U = reshape(U, [], size(U,ndims(U))); iplane = ops.iplane; U = bsxfun(@times, U, Sv'.^.5)'; @@ -18,7 +18,7 @@ xs = repmat(round(linspace(1, nsqrt, Lx)), Ly, 1); ys = repmat(round(linspace(1, nsqrt, Ly))', 1, Lx); -iclust = xs + (ys-1) * nsqrt; +iclust = xs + (ys-1) * nsqrt; clear xs ys @@ -26,13 +26,13 @@ % xs = repmat(1:Lx, Ly, 1); % ys = repmat((1:Ly)', 1, Lx); -% +% % randx = rand(1, Nk) * Lx; % randy = rand(1, Nk) * Ly; -% +% % dx = repmat(xs(:), 1, Nk) - repmat(randx, numel(xs(:)), 1); % dy = repmat(ys(:), 1, Nk) - repmat(randy, numel(ys(:)), 1); -% +% % dxy = dx.^2 + dy.^2; % [~, iclust] = min(dxy, [], 2); %% @@ -61,7 +61,7 @@ ison = true(Nk,1); TileFactor = getOr(ops, {'TileFactor'}, 1); % this option can be overwritten by the user -nTiles = ceil(TileFactor * (Ly+Lx)/2 / (10 * ops.diameter)); % neuropil is modelled as nTiles by nTiles +nTiles = ceil(TileFactor * (Ly+Lx)/2 / (10 * ops.diameter)); % neuropil is modelled as nTiles by nTiles xc = linspace(1, Lx, nTiles); yc = linspace(1, Ly, nTiles); @@ -106,7 +106,7 @@ StU = Sm' * Uneu'; Lam = (StS + 1e-4 * eye(nBasis)) \ StU; - % recompute neuropil pixel contribution + % recompute neuropil pixel contribution neuropil = Lam' * S'; PixL = mean(bsxfun(@times, neuropil, Uneu), 1); PixL = bsxfun(@rdivide, PixL, mean(neuropil.^2,1)); @@ -124,18 +124,28 @@ end end % -% Ff = Fs' * vs; -% vs = Finv * max(0, Ff); -% [dcell, Ffr] = run_deconvolution2(Ff, f0, kernel); -% vs = Finv * Ffr; + % Ff = Fs' * vs; + % vs = Finv * max(0, Ff); + % [dcell, Ffr] = run_deconvolution2(Ff, f0, kernel); + % vs = Finv * Ffr; vs = bsxfun(@rdivide, vs, sum(vs.^2,1).^.5 + 1e-8);% normalize activity vectors + vs = single(vs); % recompute pixels' assignments - xs = vs' * Ucell; + if ops.useGPU + xs = gpuBlockSmallXtY(vs, Ucell); + else + xs = vs' * Ucell; + end + [M, iclust] = max(xs,[],1); + % Uneu = U - bsxfun(@times, M, vs(:,iclust)); %what's left over for neuropil model Uneu = U - bsxfun(@times, M, vs(:,iclust)); %what's left over for neuropil model - err(k) = sum(sum((Uneu-neuropil).^2)).^.5; + % vs = double(vs); + + % err(k) = sum(sum((Uneu-neuropil).^2)).^.5; + err(k) = norm(Uneu(:)-neuropil(:)); if 1 %---------------------------------------------% @@ -177,20 +187,20 @@ if (rem(k,10)==1 || k==niter) && ops.ShowCellMap -% imagesc(reshape(PixL, Ly, Lx), [0 2]) -% drawnow -% + % imagesc(reshape(PixL, Ly, Lx), [0 2]) + % drawnow + % lam = M; for i = 1:Nk ix = find(iclust==i); nT0 = numel(ix); if nT0>0 vM = lam(ix); -% vM = vM/sum(vM.^2)^.5; + % vM = vM/sum(vM.^2)^.5; lam(ix) = vM; end end -% V = max(0, min(10 * reshape(lam, Ly, Lx), 1)); + % V = max(0, min(10 * reshape(lam, Ly, Lx), 1)); V = max(0, min(.5 * reshape(lam, Ly, Lx)/mean(lam(:)), 1)); H = reshape(r(iclust), Ly, Lx); rgb_image = hsv2rgb(cat(3, H, Sat, V)); @@ -199,7 +209,7 @@ drawnow fprintf('residual variance is %2.6f time %2.2f \n', err(k), toc) end - + end lam = M; diff --git a/registration/reg2P.m b/registration/reg2P.m index 634e5ad8..2b1bcc07 100644 --- a/registration/reg2P.m +++ b/registration/reg2P.m @@ -88,17 +88,19 @@ end if ops.showTargetRegistration - figure('position', [900 50 900 900]) - ax = ceil(sqrt(numel(ops1)/2)); + nRows = floor(sqrt(numel(ops1))); + nColumns = ceil(numel(ops1)/nRows); + figure('Name', 'Registration Target Frames', ... + 'Position', [50 50 nColumns*500 nRows*500]) i0 = 0; for i = 1:numPlanes for j = 1:size(xFOVs,2) i0 = i0+1; - subplot(ax,2*ax,i0) - imagesc(ops1{i,j}.mimg) + subplot(nRows, nColumns, i0); + imagesc(ops1{i,j}.mimg); colormap('gray') - title(sprintf('Registration for plane %d, mouse %s, date %s', ... - i, ops.mouse_name, ops.date)) + axis equal tight + title(sprintf('Plane %d, %s_%s', i, ops.mouse_name, ops.date)) end end diff --git a/signalExtraction/get_regions.m b/signalExtraction/get_regions.m index 7e261f0d..ae5a2836 100644 --- a/signalExtraction/get_regions.m +++ b/signalExtraction/get_regions.m @@ -22,9 +22,9 @@ for k = 1:Nk % needs stat, res, neigh pixall = stat(k).ipix; - -% minV = clustrules.parent.minPixRelVar * mean(res.M(pixall)); -% pixall(res.M(pixall)< minV) = []; + + % minV = clustrules.parent.minPixRelVar * mean(res.M(pixall)); + % pixall(res.M(pixall)< minV) = []; whclust = 0; region = []; @@ -47,15 +47,23 @@ x0 = xs(pixi); y0 = ys(pixi); - rs = ((x0 - mean(x0)).^2 + (y0 - mean(y0)).^2).^.5; - region(whclust).mrs = mean(rs); + meanX0 = mean(x0); + meanY0 = mean(y0); + rs = ((x0 - meanX0).^2 + (y0 - meanY0).^2).^.5; region(whclust).npix = numel(pixi); - region(whclust).mrs0 = mean(rgridsort(1:region(whclust).npix)); - region(whclust).med = [mean(y0) mean(x0)]; + if region(whclust).npix>1 + region(whclust).mrs = mean(rs); + region(whclust).mrs0 = mean(rgridsort(1:region(whclust).npix)); + region(whclust).V = sum(res.M(pixi)); + else + region(whclust).mrs = rs; + region(whclust).mrs0 = rgridsort(1); + region(whclust).V = res.M(pixi); + end + region(whclust).med = [meanY0 meanX0]; region(whclust).ipix = pixi; region(whclust).lambda = res.lambda(pixi); - region(whclust).V = sum(res.M(pixi)); - region(whclust).parent = k; + region(whclust).parent = k; end stat(k).region = region; diff --git a/svd/get_svdForROI.m b/svd/get_svdForROI.m index 3f13ee7d..f13a4568 100644 --- a/svd/get_svdForROI.m +++ b/svd/get_svdForROI.m @@ -15,7 +15,7 @@ ix = 0; fid = fopen(ops.RegFile, 'r'); -mov = zeros(Ly, Lx, ops.NavgFramesSVD, 'single'); +mov = zeros(numel(ops.yrange), numel(ops.xrange), ops.NavgFramesSVD, 'single'); while 1 data = fread(fid, Ly*Lx*nimgbatch, '*int16'); @@ -30,21 +30,21 @@ data = bsxfun(@minus, data, mean(data,3)); % data = bsxfun(@minus, data, ops.mimg1); - irange = 1:nt0*floor(size(data,3)/nt0); - data = data(:,:, irange); + nSlices = nt0*floor(size(data,3)/nt0); + if nSlices~=size(data,3) + data = data(:,:, 1:nSlices); + end data = reshape(data, Ly, Lx, nt0, []); - davg = single(squeeze(mean(data,3))); + davg = squeeze(mean(data,3)); - mov(:,:,ix + (1:size(davg,3))) = davg; + mov(:,:,ix + (1:size(davg,3))) = davg(ops.yrange, ops.xrange, :); ix = ix + size(davg,3); end fclose(fid); -%% -mov(:, :, (ix+1):end) = []; -mov = mov(ops.yrange, ops.xrange, :); +mov = mov(:, :, 1:ix); % mov = mov - repmat(mean(mov,3), 1, 1, size(mov,3)); %% SVD options @@ -60,8 +60,13 @@ mov = reshape(mov, [], size(mov,3)); sdmov = mean(mov.^2,2).^.5; -mov = mov./repmat(sdmov, 1, size(mov,2)); -COV = mov' * mov/size(mov,1); +% mov = mov./repmat(sdmov, 1, size(mov,2)); +mov = bsxfun(@rdivide, mov, sdmov); +if ops.useGPU + COV = gpuBlockXtX(mov)/size(mov,1); +else + COV = mov' * mov/size(mov,1); +end ops.nSVDforROI = min(size(COV,1)-2, ops.nSVDforROI); @@ -78,8 +83,13 @@ Sv = single(diag(Sv)); end -U = normc(mov * V); +if ops.useGPU + U = normc(gpuBlockXY(mov, V)); +else + U = normc(mov * V); +end U = single(U); + %% fid = fopen(ops.RegFile, 'r'); @@ -97,7 +107,11 @@ data = bsxfun(@minus, data, mean(data,3)); % data = bsxfun(@minus, data, ops.mimg1); data = data(ops.yrange, ops.xrange, :); - Fs(:, ix + (1:size(data,3))) = U' * reshape(data, [], size(data,3)); + if ops.useGPU + Fs(:, ix + (1:size(data,3))) = gpuBlockXtY(U, reshape(data, [], size(data,3))); + else + Fs(:, ix + (1:size(data,3))) = U' * reshape(data, [], size(data,3)); + end ix = ix + size(data,3); end @@ -109,6 +123,11 @@ mkdir(ops.ResultsSavePath); end if getOr(ops, {'writeSVDroi'}, 0) - save(sprintf('%s/SVDroi_%s_%s_plane%d.mat', ops.ResultsSavePath, ... - ops.mouse_name, ops.date, ops.iplane), 'U', 'Sv', 'V', 'ops'); + try + save(sprintf('%s/SVDroi_%s_%s_plane%d.mat', ops.ResultsSavePath, ... + ops.mouse_name, ops.date, ops.iplane), 'U', 'Sv', 'V', 'ops', '-v6'); + catch + save(sprintf('%s/SVDroi_%s_%s_plane%d.mat', ops.ResultsSavePath, ... + ops.mouse_name, ops.date, ops.iplane), 'U', 'Sv', 'V', 'ops'); + end end diff --git a/svd/get_svdcomps.m b/svd/get_svdcomps.m index bdabf71e..717d0595 100644 --- a/svd/get_svdcomps.m +++ b/svd/get_svdcomps.m @@ -20,54 +20,59 @@ ops.NavgFramesSVD = floor(ntotframes/nt0); nimgbatch = nt0 * floor(2000/nt0); +%% load the data + ix = 0; fid = fopen(ops.RegFile, 'r'); - -mov = zeros(Ly, Lx, ops.NavgFramesSVD, 'single'); +mov = zeros(numel(ops.yrange), numel(ops.xrange), ops.NavgFramesSVD, 'single'); while 1 data = fread(fid, Ly*Lx*nimgbatch, '*int16'); if isempty(data) break; end - data = single(data); data = reshape(data, Ly, Lx, []); % subtract off the mean of this batch % data = data - repmat(ops.mimg1, 1, 1, size(data,3)); - irange = 1:nt0*floor(size(data,3)/nt0); - data = data(:,:, irange); + nSlices = nt0*floor(size(data,3)/nt0); + if nSlices ~= size(data,3) + data = data(:,:, 1:nSlices); + end data = reshape(data, Ly, Lx, nt0, []); - davg = single(squeeze(mean(data,3))); + data = single(data); + davg = squeeze(mean(data,3)); - mov(:,:,ix + (1:size(davg,3))) = davg; + mov(:,:,ix + (1:size(davg,3))) = davg(ops.yrange, ops.xrange, :); ix = ix + size(davg,3); end fclose(fid); -%% -mov(:, :, (ix+1):end) = []; +mov = mov(:, :, 1:ix); -mov = mov(ops.yrange, ops.xrange, :); %% SVD options ops.nSVD = min(ops.nSVD, size(mov,3)); % mov = reshape(mov, [], size(mov,3)); % mov = mov./repmat(mean(mov.^2,2).^.5, 1, size(mov,2)); -COV = mov' * mov/size(mov,1); +if ops.useGPU + COV = gpuBlockXtX(mov)/size(mov, 1); +else + COV = mov' * mov/size(mov,1); +end ops.nSVD = min(size(COV,1)-2, ops.nSVD); - - if ops.nSVD<1000 || size(COV,1)>1e4 [V, Sv] = eigs(double(COV), ops.nSVD); else if ops.useGPU - [V, Sv] = svd(gpuArray(double(COV))); + gpuCOV = gpuArray(double(COV)); + [V, Sv] = svd(gpuCOV); + clear gpuCOV; V = gather(single(V)); Sv = gather(single(Sv)); else @@ -77,7 +82,12 @@ Sv = Sv(1:ops.nSVD, 1:ops.nSVD); end %% -U = normc(mov * V); + +if ops.useGPU + U = normc(gpuBlockXY(mov, V)); +else + U = normc(mov*V); +end U = single(U); Sv = single(diag(Sv)); @@ -94,14 +104,18 @@ if isempty(data) break; end - data = single(data); data = reshape(data, Ly, Lx, []); % subtract off the mean of this batch % data = data - repmat(mean(data,3), 1, 1, size(data,3)); % data = data - repmat(ops.mimg1, 1, 1, size(data,3)); data = data(ops.yrange, ops.xrange, :); - Fs(:, ix + (1:size(data,3))) = U' * reshape(data, [], size(data,3)); + data = single(data); + if ops.useGPU + Fs(:, ix + (1:size(data,3))) = gpuBlockXtY(U, reshape(data, [], size(data,3))); + else + Fs(:, ix + (1:size(data,3))) = U' * reshape(data, [], size(data,3)); + end ix = ix + size(data,3); end @@ -119,6 +133,12 @@ end U = reshape(U, numel(ops.yrange), numel(ops.xrange), []); +try % this is faster, but is limited to 2GB files +save(sprintf('%s/SVD_%s_%s_plane%d.mat', ops.ResultsSavePath, ... + ops.mouse_name, ops.date, iplane), 'U', 'Sv', 'Vcell', 'ops', '-v6'); +catch % this takes a bit less space, but is significantly slower save(sprintf('%s/SVD_%s_%s_plane%d.mat', ops.ResultsSavePath, ... ops.mouse_name, ops.date, iplane), 'U', 'Sv', 'Vcell', 'ops'); +end + % keyboard; \ No newline at end of file diff --git a/svd/gpuBlockSmallXtY.m b/svd/gpuBlockSmallXtY.m new file mode 100644 index 00000000..b0e5470c --- /dev/null +++ b/svd/gpuBlockSmallXtY.m @@ -0,0 +1,41 @@ +function out = gpuBlockSmallXtY(X, Y) + +% X is small, Y is long (shallow and wide) + +g = gpuDevice; +bytesGPU = g.AvailableMemory; +infoX = whos('X'); +infoY = whos('Y'); +[hX, wX] = size(X); +[hY, wY] = size(Y); % hX should be equal to hY +% the dimensions of 'out' will be wX-by-wY +bytesPerColumnY = infoY.bytes/wY; +% theoretically, to keep the tmp, X' and batchY in GPU memory +% bytesRequired = wX*batchSize + infoX.bytes+batchSize*bytesPerColumnY; +batchSize = floor((bytesGPU-infoX.bytes)/(bytesPerColumnY+wX)); +batchSize = floor(batchSize/8); % just to be on the safe side +% also, interestingly with smaller batches the code runs slightly faster +nBatches = ceil(wY/batchSize); + +startIdx = 1:batchSize:wY; +endIdx = startIdx + batchSize-1; +endIdx = min(endIdx, wY); + +out = zeros([wX, wY], 'like', X); +out(end) = 1; +gpuX = gpuArray(X); +gpuXT = gpuX'; +for iBatch = 1:nBatches +% fprintf('Batch %d/%d\n', iBatch, nBatches); + batchY = gpuArray(Y(:, startIdx(iBatch):endIdx(iBatch))); + tmp = gpuXT*batchY; + +% MK - this is an annoying line of code, but it helps Matlab to do memory +% management on the GPU properly. Otherwise I get OUT OF MEMORY errors +% maybe it is just my GPU is a bit unstable + wait(g); + out(:, startIdx(iBatch):endIdx(iBatch)) = gather(tmp); +end + + +return diff --git a/svd/gpuBlockXY.m b/svd/gpuBlockXY.m new file mode 100644 index 00000000..90ad8292 --- /dev/null +++ b/svd/gpuBlockXY.m @@ -0,0 +1,39 @@ +function out = gpuBlockXY(X, Y) + +% this code is for a very tall X, and small Y + +g = gpuDevice; +bytesGPU = g.AvailableMemory; +infoX = whos('X'); +infoY = whos('Y'); +[hX, wX] = size(X); +[hY, wY] = size(Y); % hY should be equal to wX +% the dimensions of out will be hX-by-wY +bytesPerRowX = infoX.bytes/hX; +bytesPerRowY = infoY.bytes/hY; +% theoretically, to keep the tmp result, X, and Y in GPU memory +% bytesRequired = bytesPerRowY*batchSize + bytesPerRowX*batchSize + infoY.bytes; +batchSize = floor((bytesGPU-infoY.bytes)/(bytesPerRowX+bytesPerRowY)); +batchSize = floor(batchSize/8); % just to be on the safe side +% also, interestingly with smaller batches the code runs slightly faster +nBatches = ceil(hX/batchSize); + +startIdx = 1:batchSize:hX; +endIdx = startIdx + batchSize-1; +endIdx = min(endIdx, hX); + +out = zeros([hX, wY], 'like', X); +gpuY = gpuArray(Y); +for iBatch = 1:nBatches +% fprintf('Batch %d/%d\n', iBatch, nBatches); + batchX = gpuArray(X(startIdx(iBatch):endIdx(iBatch), :)); + tmp = batchX*gpuY; + +% MK - this is an annoying line of code, but it helps Matlab to do memory +% management on the GPU properly. Otherwise I get OUT OF MEMORY errors +% maybe it is just my GPU is a bit unstable + wait(g); + out(startIdx(iBatch):endIdx(iBatch), :) = gather(tmp); +end + +return diff --git a/svd/gpuBlockXtX.m b/svd/gpuBlockXtX.m new file mode 100644 index 00000000..9b43ef81 --- /dev/null +++ b/svd/gpuBlockXtX.m @@ -0,0 +1,54 @@ +function out = gpuBlockXtX(A) + +g = gpuDevice; +bytesGPU = g.AvailableMemory; +infoA = whos('A'); +[h, w] = size(A); +bytesPerRow = infoA.bytes/h; +% theoretically, to keep the result and two (A dn A') copies of data +% bytesRequired = bytesPerRow*w + 2*batchSize*bytesPerRow; +batchSize = floor((bytesGPU/bytesPerRow-w)/2); +batchSize = floor(batchSize/8); % just to be on the safe side +% also, interestingly with smaller batches the code runs slightly faster +nBatches = ceil(h/batchSize); + +startIdx = 1:batchSize:h; +endIdx = startIdx + batchSize-1; +endIdx = min(endIdx, h); + +out = gpuArray(zeros(w, 'like', A)); +for iBatch = 1:nBatches +% fprintf('Batch %d/%d\n', iBatch, nBatches); + batchA = gpuArray(A(startIdx(iBatch):endIdx(iBatch), :)); + out = out + batchA'*batchA; + + % trying to debug OUT OF MEMORY issues on GPU +% batchAT = batchA'; +% out = out + batchAT*batchA; +% clear batchA batchAT; + +% MK - this is an annoying line of code, but it helps Matlab to do memory +% management on the GPU properly. Otherwise I get OUT OF MEMORY errors +% maybe it is just my GPU is a bit unstable + wait(gpuDevice); +end + +out = gather(out); + +return + +%% this is a non-GPU version +batchSize = 2^16; +[h, w] = size(A); +out = zeros(w, 'like', A); +nBatches = ceil(h/batchSize); +startIdx = 1:batchSize:h; +endIdx = startIdx + batchSize-1; +endIdx = min(endIdx, h); + +for iBatch = 1:nBatches + fprintf('Batch %d/%d\n', iBatch, nBatches); + batchA = A(startIdx(iBatch):endIdx(iBatch), :); + out = batchA'*batchA + out; +end + diff --git a/svd/gpuBlockXtY.m b/svd/gpuBlockXtY.m new file mode 100644 index 00000000..6db02768 --- /dev/null +++ b/svd/gpuBlockXtY.m @@ -0,0 +1,38 @@ +function out = gpuBlockSmallXtY(X, Y) + +g = gpuDevice; +bytesGPU = g.AvailableMemory; +infoX = whos('X'); +infoY = whos('Y'); +[hX, wX] = size(X); +[hY, wY] = size(Y); % hY should be equal to hX +% the dimensions of out will be wX-by-wY +bytesPerRowX = infoX.bytes/hX; +bytesPerRowY = infoY.bytes/hY; +% theoretically, to keep the result and {X', X, and Y} in GPU memory +% bytesRequired = bytesPerRowX*wY + batchSize*(2*bytesPerRowX+bytesPerRowY); +batchSize = floor((bytesGPU-bytesPerRowX*wY)/(2*bytesPerRowX+bytesPerRowY)); +batchSize = floor(batchSize/8); % just to be on the safe side +% also, interestingly with smaller batches the code runs slightly faster +nBatches = ceil(hX/batchSize); + +startIdx = 1:batchSize:hX; +endIdx = startIdx + batchSize-1; +endIdx = min(endIdx, hX); + +out = gpuArray(zeros([wX, wY], 'like', X)); +for iBatch = 1:nBatches +% fprintf('Batch %d/%d\n', iBatch, nBatches); + batchX = gpuArray(X(startIdx(iBatch):endIdx(iBatch), :)); + batchY = gpuArray(Y(startIdx(iBatch):endIdx(iBatch), :)); + out = out + batchX'*batchY; + +% MK - this is an annoying line of code, but it helps Matlab to do memory +% management on the GPU properly. Otherwise I get OUT OF MEMORY errors +% maybe it is just my GPU is a bit unstable + wait(gpuDevice); +end + +out = gather(out); + +return diff --git a/utils/normc.m b/utils/normc.m index 9d7d082d..8ea0574e 100644 --- a/utils/normc.m +++ b/utils/normc.m @@ -1,3 +1,6 @@ function v = normc(v) -v = v./repmat(sum(v.^2, 1), [size(v,1),ones(1,ndims(v)-1)]).^.5; \ No newline at end of file +% v = v./repmat(sum(v.^2, 1), [size(v,1),ones(1,ndims(v)-1)]).^.5; +% MK - using bsxfun is faster +v = bsxfun(@rdivide, v, sum(v.^2, 1).^.5); +