-
Notifications
You must be signed in to change notification settings - Fork 0
/
PCAtest.m
300 lines (222 loc) · 7.32 KB
/
PCAtest.m
1
%% load datadata = load('features-deg50-cur-selected-binary-interpol-100.mat');features = abs(data.data.features); %for zernike momentsnames = data.data.filenames; %zernikeagg_ids = data.data.agg_ids; %zernike% features = abs(data.data.means); % for intensity features% names = data.data.names; %for intensity features% agg_ids = zeros(1, length(names)); %for intensity %% maybe subsample from each patient 50 aggregates? (for curc), for bf1 = 10 aggsamples = [];for pat = 1:100 I = regexpcell(names, sprintf('^%d\\-.*DYEcur\\-.*', pat)); if length(I) < 50 fprintf('Patient %d has less than 50 aggregates (%d)\n', pat, length(I)); continue; end samples = [samples, randsample(I, 50)];endfeatures = features(samples, :);names = names(samples);agg_ids = agg_ids(samples);%% classify all patients based on mutation typeTbl = readtable('patients-all-dataset-input-new.csv', 'ReadVariableNames', true, 'delimiter', ',');types = {'Sporadic', 'PSEN1 AD', 'PSEN2 AD', 'London AD', 'E3Q fAD', 'Swedish AD', 'Unknown'};% this cell holds all patient numbers for each mutation type% i.e. patients_forEach_type{3} would be all patients suffering% from 'PSEN2 AD'patients_forEach_type = {};for i=1:length(types) patients_forEach_type{i} = unique(Tbl(strcmp(Tbl.Classification, types{i}), :).PatientNum);end%% filter out features based on mutation% In array 'keep', if an element is 1, it means you want to have that% mutation and if it is 0, it means you don't want. E.g. the following% means that you want all mutations except 'Sporadic' and 'Unknown'keep = [0, 1, 1, 1, 1, 0, 0];J = [];for i=1:length(types) if keep(i) for pat = patients_forEach_type{i}' J = [J, regexpcell(names, sprintf('^%d\\-', pat))]; end endendfeatures = features(J, :);names = names(J);agg_ids = agg_ids(J);%% just consider aggregates of a certain sizewindow = [70, 120];% window = [120, 10000];J = features(:, size(features, 2)) <= window(2) & ... features(:, size(features, 2)) >= window(1);features = features(J, :);names = names(J);agg_ids = agg_ids(J);%% normalize size feature - do not run this part if "size" is not a featurefeatures(:,size(features,2)) = features(:,size(features,2))/max(features(:,size(features,2)))*0.001;%% run the dimensionality reduction algorithm[mappedX, mapping] = compute_mapping(features, 'PCA', 676);% if we want to have the explained variance correctly, we need to do the% complete PCA (dim = size(features, 2)).%% Only for PCA V = var(mappedX);total_variance = sum(V);%% Or run the Spectral Clustering algorith%build similarity graphW = SimGraph(features', 10, 1, 1); % features, number of neighbors, Normal(1) or Mutual(2), sigma[C, L, mappedX] = specclust(W, 6); % W, k clusters %% ploting the eigenvectorseig1 = mapping.M(:, 1);eig2 = mapping.M (:, 2);figure; subplot(1,2,1);bar(eig1);subplot(1,2,2);bar(-eig2);%% plottingfigure;hold on;title('PCA')xlabel(sprintf('Principal Component 1 (%% %.2f)', V(1) / total_variance * 100.));ylabel(sprintf('Principal Component 2 (%% %.2f)', V(2) / total_variance * 100.));for i = 1:length(types) if ~keep(i); continue; end I = []; for j = patients_forEach_type{i}' I = [I, regexpcell(names, sprintf('^%d\\-', j))]; end scatter(mappedX(I, 1), mappedX(I, 2), 15, 'filled' ,'DisplayName', types{i}); hold on;end% [n,c] = hist3([mappedX(I, 1), mappedX(I, 2)], [20 20]);% contour(c{1},c{2}, n, 'k');hold off;legend();% figure;% imshow(n, [0, max(max(n))]);%% render certain images from different parts of PCApts = [ 0.5052, 0.3211; 0.3386, 0.3337; ... 0.5022, 0.01477; 0.6114, -0.02122; ... 0.5673, -0.1209; 0.5374, -0.02887];% need to find the aggregate with the coordinates defined as above.% the way is to get the nearest point to these points defined.examples = [];for i = 1:size(pts, 1) distssq = sum((repmat(pts(i, :), size(features, 1), 1) - mappedX(:, 1:2)) .^ 2, 2); [~, I] = min(distssq); examples = [examples, I]; scatter(mappedX(I, 1), mappedX(I, 2), 100, 'filled'); text(mappedX(I, 1), mappedX(I, 2), sprintf('Image %d', i));enddisp(examples);% reconstruct the images...deg = 50;N = 512; % size of the reconstructed imageind = int32(1);for e = examples disp(e); recon = reconstructFast(deg, N, data.data.features(e, :)); axes('pos', [0.2 * double(mod(ind - 1, 6)), .7 - double(idivide(ind - 1, int32(5))) * 0.3, 0.2, 0.2]); imshow(recon); ind = ind + int32(1);endhold off;%% corr coef between brightness and 1st PC coeffc = corrcoef(features(:,1), mappedX(:,1))%% new ideapatD = zeros(100, 100);for i = 1:100 I_i = regexpcell(names, sprintf('^%d\\-', i)); %returns the indices for j = (i+1):100 I_j = regexpcell(names, sprintf('^%d\\-', j)); patD(i, j) = patientDistance(mappedX(I_i, [1,2]), mappedX(I_j, [1,2])); patD(j, i) = patD(i, j); endendpatD(isnan(patD)) = 0;imagesc(patD); %% ordering patientsorder = [];for mut_type = 2:5 I = patients_forEach_type{mut_type}; disp(size(I)); order = [order; I];endfigure();imagesc(patD(order, order));title('Distance matrix for LLE')%% kmeans idx = kmeans(mappedX(:,[1,2,3]), 4);type_of_each_patient = zeros(104, 1);for i = 2:5 for j = patients_forEach_type{i}' type_of_each_patient(j) = i-1; endendC = zeros(4, 4);for i = 1:length(idx) [~, tok] = regexp(names{i}, '^(\d+)\-.*', 'match', 'tokens', 'once'); pat_num = str2num(tok{1}); C(idx(i), type_of_each_patient(pat_num)) = C(idx(i), type_of_each_patient(pat_num)) + 1;enddisp(C);P = perms([1 2 3 4]);best = 0;best_perm = 0;for i = 1:size(P,1) correctly_classified = C(1, P(i, 1)) + C(2, P(i, 2)) + ... C(3, P(i, 3)) + C(4, P(i, 4)); if correctly_classified > best best = correctly_classified; best_perm = i; endenddisp('Best classification error = ');disp((sum(sum(C)) - best) / sum(sum(C)) * 100.); %% plotting according to patIDf = figure;p = uipanel('Parent',f,'BorderType','none');p.Title = 'PCA';p.TitlePosition = 'centertop';p.FontSize = 12;p.FontWeight = 'bold';colors = colormap(lines(7)); %% Plot type 1 (9 plots) for ii = 1:9 disp(ii); subplot(3,3,ii, 'Parent', p); hold on; for j = ((ii-1)*7+1):(ii*7) I = regexpcell(names, sprintf('^%d\\-', j)); scatter(mappedX(I, 1), mappedX(I, 2), 15, colors(j -(ii-1)*7,:), 'filled', 'DisplayName', sprintf('P%d', j)); end hold off; legend();end%% See some patients aggregates over Embedding plotfigure;subplot(1,2,1);hold on;scatter(mappedX(:,1), mappedX(:, 2), 3);patients = [1 2 3 6 9 10 69 70];for patient = patients I = regexpcell (names, sprintf('^%d\\-', patient)); scatter(mappedX(I,1), mappedX(I, 2), 30, 'filled');endlegend('agg', sprintf('P%d', patients(1)), sprintf('P%d', patients(2)), sprintf('P%d', patients(3)), ...sprintf('P%d', patients(4)), sprintf('P%d', patients(5)), sprintf('P%d', patients(6)), sprintf('P%d', patients(7)), ...sprintf('P%d', patients(8)));subplot(1,2,2);hold on;for i = 1:length(types) if ~keep(i); continue; end J = []; for j = patients_forEach_type{i}' J = [J, regexpcell(names, sprintf('^%d\\-', j))]; end scatter(mappedX(J, 1), mappedX(J, 2), 15, 'filled' ,'DisplayName', types{i}); hold on;endhold off;legend();