-
Notifications
You must be signed in to change notification settings - Fork 0
/
permutation_htest2_np.m
212 lines (184 loc) · 8.14 KB
/
permutation_htest2_np.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
function [stats] = permutation_htest2_np(data,design,niter,pthr,htest)
% PERMUTATION_HTEST2_NP - A "non parametric" two-sample hypotesis test that, instead of
% relying on the test-type standard distribution, uses permutations of group labels to
% estimate the null distribution. The null distribution is computed
% independently for each data point (= row), i.e. we do not assume the same
% distribution for each datapoint. However, we do assume that the data
% points are comparable (e.g. they correspond to the same location
% collected across all subjects)
%
% USAGE:
% stats = bramila_ttest2_np(data,design,niter)
% INPUT:
% data - a matrix where each column is a subject and each row is a
% data-point for example a voxel intensity in fMRI, a node level
% value in a network, etc. NaN values will be ignored.
% design - a row vector containing the numbers 1 and 2 for the two groups
% niter - number of permutations (recommended 5000)
% htest - hypothesis test used to compare populations. The script is
% prepared to run the ttest2, kstest2, and ranksum tests.
%
%
% OUTPUT:
% stats is a struct with the following subfields:
% pvals - p-values for each datapoint; it returns in order the p-values
% for the right tail and for the left tail
% tvals - test statistic values for datapoint, positive tvals mean
% group 1 > group 2
%
% Notes: the null distribution is estimated using the matlab function
% ksdensity by interpolating the permuted data. The distribution is
% estimated over 200 points if niter<=5000, otherwise it is estimated over
% round(200*niter/5000) points, for greater precision.
% Henrique Fernandes 2014
% adapted from: Enrico Glerean 2013
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% INPUT VALIDATION
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Nsubj = size(data,2); % number of subjects
if(size(design,2) ~= Nsubj)
error('Mismatched number of subjects: the number of columns of data variable should match the number of columns of the design variable.')
end
if(size(design,1) ~= 1)
error('The design variable should only contain 1 row')
end
g1 = find(design==1);
g2 = find(design==2);
if((length(g1)+length(g2))~=Nsubj)
error('The design variable should only contain numbers 1 and 2.')
end
if(niter<=0)
disp('The variable niter should be a positive integer, function will continue assuming niter=5000.')
niter=5000;
end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% HYPOTHESIS TESTING (for each row/area)
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% stats.tvals=tt_np(data,g1,g2); % similar to ttest2
NC = size(data,1); % number of comparisons
tvals = zeros(NC,1);
means = zeros(NC,1);
switch htest
case 'ttest'
% - the population means are not equal. (alternative hypothesis)
% - the two groups are derived from normal distributions with unknown and unequal variances.
for t=1:NC
[H,P,CI,STATS] = ttest(data(t,g1)',data(t,g2)',pthr,'both');
tvals(t,:) = STATS.tstat;
diffs(t,:) = mean(data(t,g1))-mean(data(t,g2));
end
case 'ttest2'
% - the population means are not equal. (alternative hypothesis)
% - the two groups are derived from normal distributions with unknown and unequal variances.
for t=1:NC
[H,P,CI,STATS] = ttest2(data(t,g1)',data(t,g2)',pthr,'both','unequal');
tvals(t,:) = STATS.tstat;
diffs(t,:) = mean(data(t,g1))-mean(data(t,g2));
end
case 'kstest'
for t=1:NC
[H,P,STATS]=kstest2(data(t,g1)',data(t,g2)',pthr);
tvals(t,:)=STATS;
diffs(t,:) = mean(data(t,g1))-mean(data(t,g2));
end
case 'ranksum'
for t=1:NC
[P,H,STATS]=ranksum(data(t,g1)',data(t,g2)','alpha',pthr);
tvals(t,:)=STATS.zval;
diffs(t,:) = mean(data(t,g1))-mean(data(t,g2));
end
case 'signrank'
for t=1:NC
[P,H,STATS]=signrank(data(t,g1)',data(t,g2)','alpha',pthr);
tvals(t,:)=STATS.zval;
diffs(t,:) = mean(data(t,g1))-mean(data(t,g2));
end
otherwise
error('\n-------------------------------\n\nHypothesis test %s not recognized. \n\n-------------------------------\n',htest)
end
stats.tvals = tvals;
% tvals(isnan(tvals)) = 0; % or tvals(tvals~=tvals) = 0
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% PERMUTATION TESTING (for each row/area)
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% outptus the pval (from the computed null distribution using permutation
% testing) given the tstat previously calculated.
% each comparison is treated independently
pvals=zeros(NC,2);
for n=1:NC
if median(data(n,g1))~=0 || median(data(n,g2))~=0 % Exclude tests where all (tstat=NaN) or most of the population (median=0) as a null value.
pvals(n,:) = tt_np_pval(data(n,:),g1,g2,niter,tvals(n),pthr);
else
pvals(n,:) = [NaN NaN];
end
end
stats.pvals=pvals;
stats.diffs=diffs;
end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% NESTED FUNCTIONS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function tval=tt_np(data,g1,g2)
% helper function similar to matlab function ttest2.m for the case of
% groups with difference variance
xnans = isnan(data(:,g1));
if any(xnans(:))
nx = sum(~xnans,2);
else
nx = size(data(:,g1),2);
end
ynans = isnan(data(:,g2));
if any(ynans(:))
ny = sum(~ynans,2);
else
ny = size(data(:,g2),2); % a scalar, => a scalar call to tinv
end
difference = nanmean(data(:,g1),2) - nanmean(data(:,g2),2);
s2x = nanvar(data(:,g1),[],2);
s2y = nanvar(data(:,g2),[],2);
s2xbar = s2x ./ nx;
s2ybar = s2y ./ ny;
se = sqrt(s2xbar + s2ybar); % standard error of the estimated coefficient 'difference'.
if(any(se == 0) || any(isnan(se)))
error('Group variance seems to be null or NaN, please check your data')
end
tval = difference ./ se;
end
function pval = tt_np_pval(data,g1,g2,niter,tval,pthr)
outiter=zeros(niter,1);
ND=length(data);
for iter=1:niter
perm=randperm(ND);
% one could add a test to see that they are indeed permuted
temp=data(perm);
[H,P,CI,STATS]=ttest2(temp(:,g1)',temp(:,g2)',pthr,'both','unequal');
outiter(iter)=STATS.tstat;
% outiter(iter)=tt_np(temp,g1,g2); % t-stats for this permutation test
end
NCDF=200;
if(niter>5000)
NCDF=round(200*niter/5000);
end
[fi xi]=ksdensity(outiter,'function','cdf','npoints',NCDF); % estimated cumulative distribution function
% figure;
% subpfd=subplot(1,2,1); hold on; vline(tval,'g','tval');
% subcdf=subplot(1,2,2); hold on; vline(tval,'g','tval');
% [h, fhat, xgrid] = kde(outiter);
% [fi2 xi2]=ksdensity(outiter,'npoints',NCDF); % estimated probability density function
% plot(subpfd,xgrid, fhat, 'linewidth', 1.7, 'color', 'blue');
% plot(subpfd,outiter,zeros(size(outiter,2),1), 'r+');
% set(subpfd,'box','off');
% axis(subpfd,[min(xi2),max(xi2),0,max(fi2)+0.05])
% xlabel(subpfd,'t-stats permuted values')
% ylabel(subpfd,'Probability density function')
% plot(subcdf,xi,fi,'linewidth', 1.7, 'color', 'blue')
% axis(subcdf,[min(xi2),max(xi2),0,1]);
% xlabel(subcdf,'t-stats permuted values')
% ylabel(subcdf,'Cumulative distribution function')
% trick to avoid NaNs, we approximate the domain of the CDF between
% -Inf and Inf using the atanh function and the eps matlab precision
% variable
pval_left=interp1([atanh(-1+eps) xi atanh(1-eps)],[0 fi 1],tval); % G1 > G2
pval_right=1-pval_left; % G1 < G2
pval=[pval_right pval_left];
end