-
Notifications
You must be signed in to change notification settings - Fork 14
/
varbvsprint.m
253 lines (239 loc) · 9.15 KB
/
varbvsprint.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
%--------------------------------------------------------------------------
% varbvsprint.m: Print a summary of a model fitted by varbvs.
%--------------------------------------------------------------------------
%
% DESCRIPTION:
% Print a summary of a Bayesian variable selection model fitted using
% variational approximation methods (see function varbvs).
%
% USAGE:
% varbvsprint(fit)
% varbvsprint(fit, c, nv, nr)
%
% INPUT ARGUMENTS:
% fit Output of function varbvs.
% c Compute c% credible intervals. By default, c = 0.95.
% nv Show detailed statistics on nv variables. By default, nv = 5.
% nr Draw nr samples for Monte Carlo estimates of the credible
% intervals for the coefficients. Default is nr = 1000.
%
% DETAILS:
% varbvsprint generates a four-part summary of the fitted Bayesian
% variable selection model. The first part summarizes the analysis setup,
% including the number of samples used to fit the model, the number of
% covariates, and additional optimization settings. For the linear
% regression model only, the "proportion of variance explained" (PVE) is
% the posterior mean estimate of the proportion of variance in Y
% explained by the variable selection model, along with a c% credible
% interval of the PVE.
%
% The second part summarizes the posterior distribution of the
% hyperparameters by reporting the mean, the c% credible interval, and
% the range of settings provided as input to varbvs (or NA when
% options.sa or options.sigma is not set). These statistics are computed
% by interpreting fit.logw as a vector of log-probabilities. (See
% function varbvs for details, and how to adjust fit.logw to account for
% different priors on the hyperparameters.) Summary statistics for
% logodds are only shown if the prior log-odds is identical for all
% variables.
%
% The third part summarizes the variable selection results. Again, all
% these statistics are computed by averaging over the hyperparameter
% settings according to fit.logw. The number variables included at
% different probability thresholds are given in the "count" row.
%
% Finally, the fourth part gives more detailed statistics about the
% variables that are most likely to be included in the regression model
% (the 'prob.' column gives the posterior inclusion probability). Column
% 'coef' gives the posterior mean estimate of the regression coefficient
% conditioned on being included in the model, and the next column gives
% the c% credible interval. For the logistic regression model, the
% coefficient is the same as the (natural) logarithm of the odds
% ratio. For the linear regression model (family = 'gaussian'), the
% posterior mean estimate of the proportion of variance in Y explained by
% the variable ('PVE') is also provided.
%
% LICENSE: GPL v3
%
% DATE: January 10, 2016
%
% AUTHORS:
% Algorithm was designed by Peter Carbonetto and Matthew Stephens.
% R, MATLAB and C code was written by Peter Carbonetto.
% Depts. of Statistics and Human Genetics, University of Chicago,
% Chicago, IL, USA, and AncestryDNA, San Francisco, CA, USA
%
% REFERENCES:
% P. Carbonetto, M. Stephens (2012). Scalable variational inference
% for Bayesian variable selection in regression, and its accuracy in
% genetic association studies. Bayesian Analysis 7: 73-108.
%
% SEE ALSO:
% varbvs, varbvsprint
%
% EXAMPLES:
% See demo_qtl.m and demo_cc.m for examples.
%
function varbvsprint (fit, c, nv, nr)
% Part of the varbvs package, https://github.com/pcarbo/varbvs
%
% Copyright (C) 2012-2017, Peter Carbonetto
%
% This program is free software: you can redistribute it under the
% terms of the GNU General Public License; either version 3 of the
% License, or (at your option) any later version.
%
% This program is distributed in the hope that it will be useful, but
% WITHOUT ANY WARRANY; without even the implied warranty of
% MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. See the GNU
% General Public License for more details.
%
% Get the number of variables (p) and number of candidate hyperparameter
% settings (ns).
p = numel(fit.labels);
ns = numel(fit.logw);
% Take care of optional inputs.
if nargin < 2
c = 0.95;
end
% Show detailed statistics on nv variables. This cannot be greater than the
% number of variables.
if nargin < 3
nv = 5;
end
nv = min(nv,p);
% Draw this many samples to compute the Monte Carlo estimates of the
% credible intervals for the regression coefficients.
if nargin < 4
nr = 1000;
end
% (1) COMPUTE POSTERIOR STATISTICS
% --------------------------------
% Get the normalized (approximate) probabilities and the posterior
% inclusion probabilities (PIPs).
w = fit.w;
pip = fit.pip;
% (2) SUMMARIZE ANALYSIS SETUP
% ----------------------------
fprintf('Summary of fitted Bayesian variable selection model:\n');
fprintf('family: %-8s',fit.family);
fprintf(' num. hyperparameter settings: %d\n',numel(fit.sa));
fprintf('samples: %-6d',fit.n);
fprintf(' iid variable selection prior: %s\n',tf2yn(fit.prior_same));
fprintf('variables: %-6d',p);
fprintf(' fit prior var. of coefs (sa): %s\n',tf2yn(fit.update_sa));
fprintf('covariates: %-6d ',size(fit.mu_cov,1));
if strcmp(fit.family,'gaussian')
fprintf('fit residual var. (sigma): %s\n',tf2yn(fit.update_sigma));
elseif strcmp(fit.family,'binomial')
fprintf('fit approx. factors (eta): %s\n',tf2yn(fit.optimize_eta));
end
fprintf('maximum log-likelihood lower bound: %0.4f\n',max(fit.logw));
if strcmp(fit.family,'gaussian')
x = sort(fit.model_pve);
a = x(floor((0.5 - c/2)*length(x)));
b = x(ceil((0.5 + c/2)*length(x)));
fprintf('proportion of variance explained: ');
fprintf('%0.3f [%0.3f,%0.3f]\n',mean(x),a,b);
end
% (3) SUMMARIZE RESULTS ON HYPERPARAMETERS
% ----------------------------------------
% Summarize the fitted residual variance parameter (sigma).
fprintf('Hyperparameters: ');
if ns == 1
% Summarize the hyperparameter settings when there is only one
% candidate setting.
if strcmp(fit.family,'gaussian')
fprintf('sigma=%0.3g ',fit.sigma);
end
fprintf('sa=%0.3g ',fit.sa);
if (fit.prior_same)
fprintf('logodds=%+0.2f',fit.logodds);
end
fprintf('\n');
else
fprintf('\n');
fprintf(' estimate Pr>%0.2f candidate values\n',c);
if strcmp(fit.family,'gaussian')
x = fit.sigma(:);
if length(unique(x)) == 1
fprintf('sigma %8.3g NA %0.3g\n',x(1),x(1));
else
x0 = dot(w(:),x);
[a b] = cred(x,x0,w,c);
fprintf('sigma %8.3g %-19s ',x0,sprintf('[%0.3g,%0.3g]',a,b));
if fit.update_sigma
fprintf('NA\n');
else
fprintf('%0.3g--%0.3g\n',min(x),max(x));
end
end
end
% Summarize the fitted prior variance parameter (sa).
x = fit.sa(:);
if length(unique(x)) == 1
fprintf('sa %8.3g NA %0.3g\n',x(1),x(1));
else
x0 = dot(w(:),x);
[a b] = cred(x,x0,w,c);
fprintf('sa %8.3g %-19s ',x0,sprintf('[%0.3g,%0.3g]',a,b));
if fit.update_sa
fprintf('NA\n')
else
fprintf('%0.3g--%0.3g\n',min(x),max(x));
end
end
% Summarize the fitted prior log-odds of inclusion (logodds).
if (fit.prior_same)
x = fit.logodds(:);
if length(unique(x)) == 1
fprintf('logodds %8.2f NA %0.2f\n',x(1),x(1));
else
x0 = dot(w(:),x);
[a b] = cred(x,x0,w,c);
fprintf('logodds %+8.2f %-19s (%+0.2f)--(%+0.2f)\n',x0,...
sprintf('[%+0.2f,%+0.2f]',a,b),min(x),max(x));
end
end
end
% (4) SUMMARIZE VARIABLE SELECTION RESULTS
% ----------------------------------------
% Summarize the number of variables selected at different PIP thresholds.
fprintf('Selected variables:\n');
fprintf('prob. >0.10 >0.25 >0.50 >0.75 >0.90 >0.95\n');
fprintf('count %5d %5d %5d %5d %5d %5d\n',sum(pip > 0.1),sum(pip > 0.25),...
sum(pip > 0.5),sum(pip > 0.75),sum(pip > 0.9),sum(pip > 0.95));
% Give more detailed statistics about the top nv variables by the
% probability that they are included.
[ans vars] = sort(-pip);
vars = vars(1:nv);
vars = vars(:)';
fprintf('Top %d variables by inclusion probability:\n',nv);
fprintf(' index variable prob.');
if strcmp(fit.family,'gaussian')
fprintf(' PVE');
end
if strcmp(fit.family,'binomial')
fprintf(' coef* Pr(coef>%0.2f)\n',c);
else
fprintf(' coef Pr(coef>%0.2f)\n',c);
end
for i = vars
[a b] = varbvscoefcred(fit,i,c,nr);
fprintf('%6d %-10s %0.3f',i,fit.labels{i},pip(i));
if strcmp(fit.family,'gaussian')
fprintf(' %0.3f',dot(w,fit.pve(i,:)));
end
fprintf(' %+7.3f [%+0.3f,%+0.3f]\n',fit.beta(i),a,b);
end
if strcmp(fit.family,'binomial')
fprintf('*See "help varbvs" for interpreting coefficients in ');
fprintf('logistic regression.\n');
end
% ------------------------------------------------------------------
function y = tf2yn (x)
if x
y = 'yes';
else
y = 'no';
end