diff --git a/DESCRIPTION b/DESCRIPTION index eb9a64d..f82700c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: STAAR Type: Package Title: STAAR Procedure for Dynamic Incorporation of Multiple Functional Annotations in Whole-Genome Sequencing Studies -Version: 0.9.5 -Date: 2020-09-14 +Version: 0.9.6 +Date: 2021-08-16 Author: Xihao Li [aut, cre], Zilin Li [aut, cre], Han Chen [aut] Maintainer: Xihao Li , Zilin Li Description: An R package for performing STAAR procedure in whole-genome sequencing studies. @@ -13,6 +13,6 @@ Encoding: UTF-8 LazyData: true Depends: R (>= 3.2.0) LinkingTo: Rcpp, RcppArmadillo -RoxygenNote: 7.1.0 +RoxygenNote: 7.1.1 Suggests: knitr, rmarkdown VignetteBuilder: knitr diff --git a/R/CCT.R b/R/CCT.R index 846cc38..0a01786 100644 --- a/R/CCT.R +++ b/R/CCT.R @@ -5,14 +5,18 @@ #' @param pvals a numeric vector of p-values, where each of the element is #' between 0 to 1, to be combined. #' @param weights a numeric vector of non-negative weights. If \code{NULL}, the -#' equal weights are assumed. +#' equal weights are assumed (default = NULL). #' @return the aggregated p-value combining p-values from the vector \code{pvals}. -#' @examples pvalues <- c(2e-02,4e-04,0.2,0.1,0.8) -#' @examples CCT(pvals=pvalues) +#' @examples pvalues <- c(2e-02, 4e-04, 0.2, 0.1, 0.8) +#' @examples CCT(pvals = pvalues) #' @references Liu, Y., & Xie, J. (2020). Cauchy combination test: a powerful test #' with analytic p-value calculation under arbitrary dependency structures. -#' \emph{Journal of the American Statistical Association 115}(529), 393-402. -#' (\href{https://www.tandfonline.com/doi/full/10.1080/01621459.2018.1554485}{pub}) +#' \emph{Journal of the American Statistical Association}, \emph{115}(529), 393-402. +#' (\href{https://doi.org/10.1080/01621459.2018.1554485}{pub}) +#' @references Liu, Y., et al. (2019). Acat: A fast and powerful p value combination +#' method for rare-variant analysis in sequencing studies. +#' \emph{The American Journal of Human Genetics}, \emph{104}(3), 410-421. +#' (\href{https://doi.org/10.1016/j.ajhg.2019.01.002}{pub}) #' @export CCT <- function(pvals, weights=NULL){ diff --git a/R/Indiv_Score_Test_Region.R b/R/Indiv_Score_Test_Region.R index 97539ae..277c533 100644 --- a/R/Indiv_Score_Test_Region.R +++ b/R/Indiv_Score_Test_Region.R @@ -10,9 +10,9 @@ #' \code{\link{fit_null_glmmkin}} function for related samples. Note that \code{\link{fit_null_glmmkin}} #' is a wrapper of \code{\link{glmmkin}} function from the \code{\link{GMMAT}} package. #' @param rare_maf_cutoff the cutoff of maximum minor allele frequency in -#' defining rare variants (default is 0.01). +#' defining rare variants (default = 0.01). #' @param rv_num_cutoff the cutoff of minimum number of variants of analyzing -#' a given variant-set (default is 2). +#' a given variant-set (default = 2). #' @return a data frame with p rows corresponding to the p genetic variants in the given variant-set #' and three columns: \code{Score} (the score test statistic), \code{SE} (the standard error associated #' with the score test statistic), and \code{pvalue} (the score test p-value). diff --git a/R/Indiv_Score_Test_Region_cond.R b/R/Indiv_Score_Test_Region_cond.R index 8c20d59..3fddef9 100644 --- a/R/Indiv_Score_Test_Region_cond.R +++ b/R/Indiv_Score_Test_Region_cond.R @@ -16,9 +16,14 @@ #' \code{\link{fit_null_glmmkin}} function for related samples. Note that \code{\link{fit_null_glmmkin}} #' is a wrapper of \code{\link{glmmkin}} function from the \code{\link{GMMAT}} package. #' @param rare_maf_cutoff the cutoff of maximum minor allele frequency in -#' defining rare variants (default is 0.01). +#' defining rare variants (default = 0.01). #' @param rv_num_cutoff the cutoff of minimum number of variants of analyzing -#' a given variant-set (default is 2). +#' a given variant-set (default = 2). +#' @param method_cond a character value indicating the method for conditional analysis. +#' \code{optimal} refers to regressing residuals from the null model on \code{genotype_adj} +#' as well as all covariates used in fitting the null model (fully adjusted) and taking the residuals; +#' \code{naive} refers to regressing residuals from the null model on \code{genotype_adj} +#' and taking the residuals (default = \code{optimal}). #' @return a data frame with p rows corresponding to the p genetic variants in the given variant-set #' and three columns: \code{Score_cond} (the conditional score test statistic adjusting for variants #' in \code{genotype_adj}), \code{SE_cond} (the standard error associated with the @@ -26,11 +31,16 @@ #' If a variant in the given variant-set has minor allele frequency = 0 or #' greater than \code{rare_maf_cutoff}, the corresponding row will be \code{NA}. If a variant in #' the given variant-set has standard error equal to 0, the p-value will be set as 1. +#' @references Sofer, T., et al. (2019). A fully adjusted two-stage procedure for rank-normalization +#' in genetic association studies. \emph{Genetic Epidemiology}, \emph{43}(3), 263-275. +#' (\href{https://doi.org/10.1002/gepi.22188}{pub}) #' @export Indiv_Score_Test_Region_cond <- function(genotype,genotype_adj,obj_nullmodel, - rare_maf_cutoff=0.01,rv_num_cutoff=2){ + rare_maf_cutoff=0.01,rv_num_cutoff=2, + method_cond=c("optimal","naive")){ + method_cond <- match.arg(method_cond) # evaluate choices if(class(genotype) != "matrix" && !(!is.null(attr(class(genotype), "package")) && attr(class(genotype), "package") == "Matrix")){ stop("genotype is not a matrix!") } @@ -75,8 +85,13 @@ Indiv_Score_Test_Region_cond <- function(genotype,genotype_adj,obj_nullmodel, residuals.phenotype <- obj_nullmodel$scaled.residuals residuals.phenotype <- residuals.phenotype*sqrt(P_scalar) - residuals.phenotype <- lm(residuals.phenotype~genotype_adj)$residuals - X_adj <- cbind(rep(1,length(residuals.phenotype)),genotype_adj) + if(method_cond == "optimal"){ + residuals.phenotype.fit <- lm(residuals.phenotype~genotype_adj+obj_nullmodel$X-1) + }else{ + residuals.phenotype.fit <- lm(residuals.phenotype~genotype_adj) + } + residuals.phenotype <- residuals.phenotype.fit$residuals + X_adj <- model.matrix(residuals.phenotype.fit) PX_adj <- P%*%X_adj P_cond <- P - X_adj%*%solve(t(X_adj)%*%X_adj)%*%t(PX_adj) - PX_adj%*%solve(t(X_adj)%*%X_adj)%*%t(X_adj) + @@ -91,8 +106,13 @@ Indiv_Score_Test_Region_cond <- function(genotype,genotype_adj,obj_nullmodel, cov <- obj_nullmodel$cov residuals.phenotype <- obj_nullmodel$scaled.residuals - residuals.phenotype <- lm(residuals.phenotype~genotype_adj)$residuals - X_adj <- cbind(rep(1,length(residuals.phenotype)),genotype_adj) + if(method_cond == "optimal"){ + residuals.phenotype.fit <- lm(residuals.phenotype~genotype_adj+obj_nullmodel$X-1) + }else{ + residuals.phenotype.fit <- lm(residuals.phenotype~genotype_adj) + } + residuals.phenotype <- residuals.phenotype.fit$residuals + X_adj <- model.matrix(residuals.phenotype.fit) results[RV_label,] <- do.call(cbind,Indiv_Score_Test_SMMAT_sparse_cond(G,Sigma_i,Sigma_iX,cov,X_adj,residuals.phenotype)) } @@ -107,8 +127,13 @@ Indiv_Score_Test_Region_cond <- function(genotype,genotype_adj,obj_nullmodel, } residuals.phenotype <- obj_nullmodel$y - obj_nullmodel$fitted.values - residuals.phenotype <- lm(residuals.phenotype~genotype_adj)$residuals - X_adj <- cbind(rep(1,length(residuals.phenotype)),genotype_adj) + if(method_cond == "optimal"){ + residuals.phenotype.fit <- lm(residuals.phenotype~genotype_adj+model.matrix(obj_nullmodel)-1) + }else{ + residuals.phenotype.fit <- lm(residuals.phenotype~genotype_adj) + } + residuals.phenotype <- residuals.phenotype.fit$residuals + X_adj <- model.matrix(residuals.phenotype.fit) PX_adj <- P%*%X_adj P_cond <- P - X_adj%*%solve(t(X_adj)%*%X_adj)%*%t(PX_adj) - PX_adj%*%solve(t(X_adj)%*%X_adj)%*%t(X_adj) + diff --git a/R/STAAR.R b/R/STAAR.R index 804a29a..acd80e2 100644 --- a/R/STAAR.R +++ b/R/STAAR.R @@ -22,13 +22,15 @@ #' SKAT(1,25), SKAT(1,1), Burden(1,25), Burden(1,1), ACAT-V(1,25), ACAT-V(1,1) #' and ACAT-O tests (default = NULL). #' @param rare_maf_cutoff the cutoff of maximum minor allele frequency in -#' defining rare variants (default is 0.01). +#' defining rare variants (default = 0.01). #' @param rv_num_cutoff the cutoff of minimum number of variants of analyzing -#' a given variant-set (default is 2). +#' a given variant-set (default = 2). #' @return a list with the following members: #' @return \code{num_variant}: the number of variants with minor allele frequency > 0 and less than #' \code{rare_maf_cutoff} in the given variant-set that are used for performing the #' variant-set using STAAR. +#' @return \code{cMAC}: the cumulative minor allele count of variants with +#' minor allele frequency > 0 and less than \code{rare_maf_cutoff} in the given variant-set. #' @return \code{RV_label}: the boolean vector indicating whether each variant in the given #' variant-set has minor allele frequency > 0 and less than \code{rare_maf_cutoff}. #' @return \code{results_STAAR_O}: the STAAR-O p-value that aggregated SKAT(1,25), @@ -60,14 +62,18 @@ #' including ACAT-V(1,1) p-value weighted by MAF, the ACAT-V(1,1) #' p-values weighted by each annotation, and a STAAR-A(1,1) #' p-value by aggregating these p-values using Cauchy method. -#' @references Li, X., Li, Z. et al. (2020). Dynamic incorporation of multiple +#' @references Li, X., Li, Z., et al. (2020). Dynamic incorporation of multiple #' in silico functional annotations empowers rare variant association analysis of -#' large whole-genome sequencing studies at scale. \emph{Nature Genetics}. -#' (\href{https://www.nature.com/articles/s41588-020-0676-4}{pub}) +#' large whole-genome sequencing studies at scale. \emph{Nature Genetics}, \emph{52}(9), 969-983. +#' (\href{https://doi.org/10.1038/s41588-020-0676-4}{pub}) #' @references Liu, Y., et al. (2019). Acat: A fast and powerful p value combination #' method for rare-variant analysis in sequencing studies. -#' \emph{The American Journal of Humann Genetics 104}(3), 410-421. -#' (\href{https://www.sciencedirect.com/science/article/pii/S0002929719300023}{pub}) +#' \emph{The American Journal of Human Genetics}, \emph{104}(3), 410-421. +#' (\href{https://doi.org/10.1016/j.ajhg.2019.01.002}{pub}) +#' @references Li, Z., Li, X., et al. (2020). Dynamic scan procedure for +#' detecting rare-variant association regions in whole-genome sequencing studies. +#' \emph{The American Journal of Human Genetics}, \emph{104}(5), 802-814. +#' (\href{https://doi.org/10.1016/j.ajhg.2019.03.002}{pub}) #' @export STAAR <- function(genotype,obj_nullmodel,annotation_phred=NULL, @@ -181,6 +187,7 @@ STAAR <- function(genotype,obj_nullmodel,annotation_phred=NULL, } num_variant <- sum(RV_label) #dim(G)[2] + cMAC <- sum(G) num_annotation <- dim(annotation_phred)[2]+1 results_STAAR_O <- CCT(pvalues) results_ACAT_O <- CCT(pvalues[c(1,num_annotation+1,2*num_annotation+1,3*num_annotation+1,4*num_annotation+1,5*num_annotation+1)]) @@ -238,6 +245,7 @@ STAAR <- function(genotype,obj_nullmodel,annotation_phred=NULL, } return(list(num_variant = num_variant, + cMAC = cMAC, RV_label = RV_label, results_STAAR_O = results_STAAR_O, results_ACAT_O = results_ACAT_O, diff --git a/R/STAAR_cond.R b/R/STAAR_cond.R index c848fc9..1678f54 100644 --- a/R/STAAR_cond.R +++ b/R/STAAR_cond.R @@ -28,13 +28,20 @@ #' SKAT(1,25), SKAT(1,1), Burden(1,25), Burden(1,1), ACAT-V(1,25), ACAT-V(1,1) #' and ACAT-O tests (default = NULL). #' @param rare_maf_cutoff the cutoff of maximum minor allele frequency in -#' defining rare variants (default is 0.01). +#' defining rare variants (default = 0.01). #' @param rv_num_cutoff the cutoff of minimum number of variants of analyzing -#' a given variant-set (default is 2). +#' a given variant-set (default = 2). +#' @param method_cond a character value indicating the method for conditional analysis. +#' \code{optimal} refers to regressing residuals from the null model on \code{genotype_adj} +#' as well as all covariates used in fitting the null model (fully adjusted) and taking the residuals; +#' \code{naive} refers to regressing residuals from the null model on \code{genotype_adj} +#' and taking the residuals (default = \code{optimal}). #' @return a list with the following members: #' @return \code{num_variant}: the number of variants with minor allele frequency > 0 and less than #' \code{rare_maf_cutoff} in the given variant-set that are used for performing the #' variant-set using STAAR. +#' @return \code{cMAC}: the cumulative minor allele count of variants with +#' minor allele frequency > 0 and less than \code{rare_maf_cutoff} in the given variant-set. #' @return \code{RV_label}: the boolean vector indicating whether each variant in the given #' variant-set has minor allele frequency > 0 and less than \code{rare_maf_cutoff}. #' @return \code{results_STAAR_O_cond}: the conditional STAAR-O p-value that aggregated conditional @@ -66,19 +73,28 @@ #' including conditional ACAT-V(1,1) p-value weighted by MAF, the conditional ACAT-V(1,1) #' p-values weighted by each annotation, and a conditional STAAR-A(1,1) #' p-value by aggregating these p-values using Cauchy method. -#' @references Li, X., Li, Z. et al. (2020). Dynamic incorporation of multiple +#' @references Li, X., Li, Z., et al. (2020). Dynamic incorporation of multiple #' in silico functional annotations empowers rare variant association analysis of -#' large whole-genome sequencing studies at scale. \emph{Nature Genetics}. -#' (\href{https://www.nature.com/articles/s41588-020-0676-4}{pub}) +#' large whole-genome sequencing studies at scale. \emph{Nature Genetics}, \emph{52}(9), 969-983. +#' (\href{https://doi.org/10.1038/s41588-020-0676-4}{pub}) #' @references Liu, Y., et al. (2019). Acat: A fast and powerful p value combination #' method for rare-variant analysis in sequencing studies. -#' \emph{The American Journal of Humann Genetics 104}(3), 410-421. -#' (\href{https://www.sciencedirect.com/science/article/pii/S0002929719300023}{pub}) +#' \emph{The American Journal of Human Genetics}, \emph{104}(3), 410-421. +#' (\href{https://doi.org/10.1016/j.ajhg.2019.01.002}{pub}) +#' @references Li, Z., Li, X., et al. (2020). Dynamic scan procedure for +#' detecting rare-variant association regions in whole-genome sequencing studies. +#' \emph{The American Journal of Human Genetics}, \emph{104}(5), 802-814. +#' (\href{https://doi.org/10.1016/j.ajhg.2019.03.002}{pub}) +#' @references Sofer, T., et al. (2019). A fully adjusted two-stage procedure for rank-normalization +#' in genetic association studies. \emph{Genetic Epidemiology}, \emph{43}(3), 263-275. +#' (\href{https://doi.org/10.1002/gepi.22188}{pub}) #' @export STAAR_cond <- function(genotype,genotype_adj,obj_nullmodel,annotation_phred=NULL, - rare_maf_cutoff=0.01,rv_num_cutoff=2){ + rare_maf_cutoff=0.01,rv_num_cutoff=2, + method_cond=c("optimal","naive")){ + method_cond <- match.arg(method_cond) # evaluate choices if(class(genotype) != "matrix" && !(!is.null(attr(class(genotype), "package")) && attr(class(genotype), "package") == "Matrix")){ stop("genotype is not a matrix!") } @@ -162,8 +178,13 @@ STAAR_cond <- function(genotype,genotype_adj,obj_nullmodel,annotation_phred=NULL residuals.phenotype <- obj_nullmodel$scaled.residuals residuals.phenotype <- residuals.phenotype*sqrt(P_scalar) - residuals.phenotype <- lm(residuals.phenotype~genotype_adj)$residuals - X_adj <- cbind(rep(1,length(residuals.phenotype)),genotype_adj) + if(method_cond == "optimal"){ + residuals.phenotype.fit <- lm(residuals.phenotype~genotype_adj+obj_nullmodel$X-1) + }else{ + residuals.phenotype.fit <- lm(residuals.phenotype~genotype_adj) + } + residuals.phenotype <- residuals.phenotype.fit$residuals + X_adj <- model.matrix(residuals.phenotype.fit) PX_adj <- P%*%X_adj P_cond <- P - X_adj%*%solve(t(X_adj)%*%X_adj)%*%t(PX_adj) - PX_adj%*%solve(t(X_adj)%*%X_adj)%*%t(X_adj) + @@ -180,8 +201,13 @@ STAAR_cond <- function(genotype,genotype_adj,obj_nullmodel,annotation_phred=NULL cov <- obj_nullmodel$cov residuals.phenotype <- obj_nullmodel$scaled.residuals - residuals.phenotype <- lm(residuals.phenotype~genotype_adj)$residuals - X_adj <- cbind(rep(1,length(residuals.phenotype)),genotype_adj) + if(method_cond == "optimal"){ + residuals.phenotype.fit <- lm(residuals.phenotype~genotype_adj+obj_nullmodel$X-1) + }else{ + residuals.phenotype.fit <- lm(residuals.phenotype~genotype_adj) + } + residuals.phenotype <- residuals.phenotype.fit$residuals + X_adj <- model.matrix(residuals.phenotype.fit) pvalues <- STAAR_O_SMMAT_sparse_cond(G,Sigma_i,Sigma_iX,cov,X_adj,residuals.phenotype, weights_B=w_B,weights_S=w_S,weights_A=w_A, @@ -198,8 +224,13 @@ STAAR_cond <- function(genotype,genotype_adj,obj_nullmodel,annotation_phred=NULL } residuals.phenotype <- obj_nullmodel$y - obj_nullmodel$fitted.values - residuals.phenotype <- lm(residuals.phenotype~genotype_adj)$residuals - X_adj <- cbind(rep(1,length(residuals.phenotype)),genotype_adj) + if(method_cond == "optimal"){ + residuals.phenotype.fit <- lm(residuals.phenotype~genotype_adj+model.matrix(obj_nullmodel)-1) + }else{ + residuals.phenotype.fit <- lm(residuals.phenotype~genotype_adj) + } + residuals.phenotype <- residuals.phenotype.fit$residuals + X_adj <- model.matrix(residuals.phenotype.fit) PX_adj <- P%*%X_adj P_cond <- P - X_adj%*%solve(t(X_adj)%*%X_adj)%*%t(PX_adj) - PX_adj%*%solve(t(X_adj)%*%X_adj)%*%t(X_adj) + @@ -213,6 +244,7 @@ STAAR_cond <- function(genotype,genotype_adj,obj_nullmodel,annotation_phred=NULL } num_variant <- sum(RV_label) #dim(G)[2] + cMAC <- sum(G) num_annotation <- dim(annotation_phred)[2]+1 results_STAAR_O <- CCT(pvalues) results_ACAT_O <- CCT(pvalues[c(1,num_annotation+1,2*num_annotation+1,3*num_annotation+1,4*num_annotation+1,5*num_annotation+1)]) @@ -270,6 +302,7 @@ STAAR_cond <- function(genotype,genotype_adj,obj_nullmodel,annotation_phred=NULL } return(list(num_variant = num_variant, + cMAC = cMAC, RV_label = RV_label, results_STAAR_O_cond = results_STAAR_O, results_ACAT_O_cond = results_ACAT_O, diff --git a/R/fit_null_glmmkin.R b/R/fit_null_glmmkin.R index e178c0b..e09f035 100644 --- a/R/fit_null_glmmkin.R +++ b/R/fit_null_glmmkin.R @@ -50,13 +50,13 @@ #' with additional elements indicating the samples are related (\code{obj_nullmodel$relatedness = TRUE}), #' and whether the \code{kins} matrix is sparse when fitting the null model. See \code{\link{glmmkin}} for more details. #' @references Chen, H., et al. (2016). Control for population structure and relatedness for binary traits -#' in genetic association studies via logistic mixed models. \emph{The American Journal of Humann Genetics 98}(4), 653-666. -#' (\href{https://www.sciencedirect.com/science/article/pii/S000292971600063X}{pub}) +#' in genetic association studies via logistic mixed models. \emph{The American Journal of Human Genetics}, \emph{98}(4), 653-666. +#' (\href{https://doi.org/10.1016/j.ajhg.2016.02.012}{pub}) #' @references Chen, H., et al. (2019). Efficient variant set mixed model association tests for continuous and -#' binary traits in large-scale whole-genome sequencing studies. \emph{The American Journal of Humann Genetics 104}(2), 260-274. -#' (\href{https://www.sciencedirect.com/science/article/pii/S0002929718304658}{pub}) -#' @references Chen, H. (2020). GMMAT: Generalized Linear Mixed Model Association Tests. -#' (\href{https://github.com/hanchenphd/GMMAT/blob/master/inst/doc/GMMAT.pdf}{web}) +#' binary traits in large-scale whole-genome sequencing studies. \emph{The American Journal of Human Genetics}, \emph{104}(2), 260-274. +#' (\href{https://doi.org/10.1016/j.ajhg.2018.12.012}{pub}) +#' @references Chen, H. (2020). GMMAT: Generalized linear Mixed Model Association Tests Version 1.3. +#' (\href{https://cloud.r-project.org/web/packages/GMMAT/vignettes/GMMAT.pdf}{web}) #' @export fit_null_glmmkin <- function(fixed, data = parent.frame(), kins, use_sparse = NULL, @@ -73,7 +73,7 @@ fit_null_glmmkin <- function(fixed, data = parent.frame(), kins, use_sparse = NU obj_nullmodel <- glmmkin(fixed = fixed, data = data, kins = kins, id = id, random.slope = random.slope, groups = groups, family = family, method = method, - method.optim = "AI", maxiter = maxiter, + method.optim = method.optim, maxiter = maxiter, tol = tol, taumin = taumin, taumax = taumax, tauregion = tauregion, verbose = verbose, ...) obj_nullmodel$sparse_kins <- TRUE @@ -89,7 +89,7 @@ fit_null_glmmkin <- function(fixed, data = parent.frame(), kins, use_sparse = NU obj_nullmodel <- glmmkin(fixed = fixed, data = data, kins = kins_sp, id = id, random.slope = random.slope, groups = groups, family = family, method = method, - method.optim = "AI", maxiter = maxiter, + method.optim = method.optim, maxiter = maxiter, tol = tol, taumin = taumin, taumax = taumax, tauregion = tauregion, verbose = verbose, ...) obj_nullmodel$sparse_kins <- TRUE diff --git a/README.md b/README.md index da1acc0..31c7ed8 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ STAAR is an R package for performing variant-Set Test for Association using Anno ## Workflow Overview ![STAAR_workflow](docs/STAAR_workflow.png) ## Prerequisites -R (recommended version >= 3.4.2) +R (recommended version >= 3.5.1) For optimal computational performance, it is recommended to use an R version configured with the Intel Math Kernel Library (or other fast BLAS/LAPACK libraries). See the instructions on building R with Intel MKL. ## Dependencies @@ -25,11 +25,11 @@ Please see the **STAAR** user manual for det ## Data Availability The whole-genome individual functional annotation data assembled from a variety of sources and the computed annotation principal components are available at the [Functional Annotation of Variant - Online Resource (FAVOR)](http://favor.genohub.org) site. ## Version -The current version is 0.9.5 (September 14, 2020). +The current version is 0.9.6 (August 16, 2021). ## Citation If you use **STAAR** for your work, please cite: -Xihao Li*, Zilin Li*, Hufeng Zhou, Sheila M. Gaynor, Yaowu Liu, Han Chen, Ryan Sun, Rounak Dey, Donna K. Arnett, Stella Aslibekyan, Christie M. Ballantyne, Lawrence F. Bielak, John Blangero, Eric Boerwinkle, Donald W. Bowden, Jai G. Broome, Matthew P. Conomos, Adolfo Correa, L. Adrienne Cupples, Joanne E. Curran, Barry I. Freedman, Xiuqing Guo, George Hindy, Marguerite R. Irvin, Sharon L.R. Kardia, Sekar Kathiresan, Alyna T. Khan, Charles L. Kooperberg, Cathy C. Laurie, X. Shirley Liu, Michael C. Mahaney, Ani W. Manichaikul, Lisa W. Martin, Rasika A. Mathias, Stephen T. McGarvey, Braxton D. Mitchell, May E. Montasser, Jill Moore, Alanna C. Morrison, Jeffrey R. O'Connell, Nicholette D. Palmer, Akhil Pampana, Juan M. Peralta, Patricia A. Peyser, Bruce M. Psaty, Susan Redline, Kenneth M. Rice, Stephen S. Rich, Jennifer A. Smith, Hemant K. Tiwari, Michael Y. Tsai, Ramachandran S. Vasan, Fei Fei Wang, Daniel E. Weeks, Zhiping Weng, James G. Wilson, Lisa R. Yanek, NHLBI Trans-Omics for Precision Medicine (TOPMed) Consortium, TOPMed Lipids Working Group, Benjamin M. Neale, Shamil R. Sunyaev, Gonçalo R. Abecasis, Jerome I. Rotter, Cristen J. Willer, Gina M. Peloso, Pradeep Natarajan and Xihong Lin (2020) "Dynamic incorporation of multiple in silico functional annotations empowers rare variant association analysis of large whole-genome sequencing studies at scale". _Nature Genetics_. DOI: 10.1038/s41588-020-0676-4. +Xihao Li*, Zilin Li*, Hufeng Zhou, Sheila M. Gaynor, Yaowu Liu, Han Chen, Ryan Sun, Rounak Dey, Donna K. Arnett, Stella Aslibekyan, Christie M. Ballantyne, Lawrence F. Bielak, John Blangero, Eric Boerwinkle, Donald W. Bowden, Jai G. Broome, Matthew P. Conomos, Adolfo Correa, L. Adrienne Cupples, Joanne E. Curran, Barry I. Freedman, Xiuqing Guo, George Hindy, Marguerite R. Irvin, Sharon L. R. Kardia, Sekar Kathiresan, Alyna T. Khan, Charles L. Kooperberg, Cathy C. Laurie, X. Shirley Liu, Michael C. Mahaney, Ani W. Manichaikul, Lisa W. Martin, Rasika A. Mathias, Stephen T. McGarvey, Braxton D. Mitchell, May E. Montasser, Jill E. Moore, Alanna C. Morrison, Jeffrey R. O'Connell, Nicholette D. Palmer, Akhil Pampana, Juan M. Peralta, Patricia A. Peyser, Bruce M. Psaty, Susan Redline, Kenneth M. Rice, Stephen S. Rich, Jennifer A. Smith, Hemant K. Tiwari, Michael Y. Tsai, Ramachandran S. Vasan, Fei Fei Wang, Daniel E. Weeks, Zhiping Weng, James G. Wilson, Lisa R. Yanek, NHLBI Trans-Omics for Precision Medicine (TOPMed) Consortium, TOPMed Lipids Working Group, Benjamin M. Neale, Shamil R. Sunyaev, Gonçalo R. Abecasis, Jerome I. Rotter, Cristen J. Willer, Gina M. Peloso, Pradeep Natarajan, & Xihong Lin. (2020). Dynamic incorporation of multiple in silico functional annotations empowers rare variant association analysis of large whole-genome sequencing studies at scale. _Nature Genetics_, _52_(9), 969-983. PMID: 32839606. PMCID: PMC7483769. DOI: 10.1038/s41588-020-0676-4. ## License This software is licensed under GPLv3. diff --git a/docs/STAAR_manual.pdf b/docs/STAAR_manual.pdf index ca0fe3e..94856b8 100644 Binary files a/docs/STAAR_manual.pdf and b/docs/STAAR_manual.pdf differ diff --git a/man/CCT.Rd b/man/CCT.Rd index 9e98269..cbd339e 100644 --- a/man/CCT.Rd +++ b/man/CCT.Rd @@ -11,7 +11,7 @@ CCT(pvals, weights = NULL) between 0 to 1, to be combined.} \item{weights}{a numeric vector of non-negative weights. If \code{NULL}, the -equal weights are assumed.} +equal weights are assumed (default = NULL).} } \value{ the aggregated p-value combining p-values from the vector \code{pvals}. @@ -21,12 +21,17 @@ The \code{CCT} function takes in a numeric vector of p-values, a numeric vector of non-negative weights, and return the aggregated p-value using Cauchy method. } \examples{ -pvalues <- c(2e-02,4e-04,0.2,0.1,0.8) -CCT(pvals=pvalues) +pvalues <- c(2e-02, 4e-04, 0.2, 0.1, 0.8) +CCT(pvals = pvalues) } \references{ Liu, Y., & Xie, J. (2020). Cauchy combination test: a powerful test with analytic p-value calculation under arbitrary dependency structures. -\emph{Journal of the American Statistical Association 115}(529), 393-402. -(\href{https://www.tandfonline.com/doi/full/10.1080/01621459.2018.1554485}{pub}) +\emph{Journal of the American Statistical Association}, \emph{115}(529), 393-402. +(\href{https://doi.org/10.1080/01621459.2018.1554485}{pub}) + +Liu, Y., et al. (2019). Acat: A fast and powerful p value combination +method for rare-variant analysis in sequencing studies. +\emph{The American Journal of Human Genetics}, \emph{104}(3), 410-421. +(\href{https://doi.org/10.1016/j.ajhg.2019.01.002}{pub}) } diff --git a/man/Indiv_Score_Test_Region.Rd b/man/Indiv_Score_Test_Region.Rd index 224cbfc..f702562 100644 --- a/man/Indiv_Score_Test_Region.Rd +++ b/man/Indiv_Score_Test_Region.Rd @@ -21,10 +21,10 @@ output from either \code{\link{fit_null_glm}} function for unrelated samples or is a wrapper of \code{\link{glmmkin}} function from the \code{\link{GMMAT}} package.} \item{rare_maf_cutoff}{the cutoff of maximum minor allele frequency in -defining rare variants (default is 0.01).} +defining rare variants (default = 0.01).} \item{rv_num_cutoff}{the cutoff of minimum number of variants of analyzing -a given variant-set (default is 2).} +a given variant-set (default = 2).} } \value{ a data frame with p rows corresponding to the p genetic variants in the given variant-set diff --git a/man/Indiv_Score_Test_Region_cond.Rd b/man/Indiv_Score_Test_Region_cond.Rd index dcadb88..523d51d 100644 --- a/man/Indiv_Score_Test_Region_cond.Rd +++ b/man/Indiv_Score_Test_Region_cond.Rd @@ -9,7 +9,8 @@ Indiv_Score_Test_Region_cond( genotype_adj, obj_nullmodel, rare_maf_cutoff = 0.01, - rv_num_cutoff = 2 + rv_num_cutoff = 2, + method_cond = c("optimal", "naive") ) } \arguments{ @@ -27,10 +28,16 @@ output from either \code{\link{fit_null_glm}} function for unrelated samples or is a wrapper of \code{\link{glmmkin}} function from the \code{\link{GMMAT}} package.} \item{rare_maf_cutoff}{the cutoff of maximum minor allele frequency in -defining rare variants (default is 0.01).} +defining rare variants (default = 0.01).} \item{rv_num_cutoff}{the cutoff of minimum number of variants of analyzing -a given variant-set (default is 2).} +a given variant-set (default = 2).} + +\item{method_cond}{a character value indicating the method for conditional analysis. +\code{optimal} refers to regressing residuals from the null model on \code{genotype_adj} +as well as all covariates used in fitting the null model (fully adjusted) and taking the residuals; +\code{naive} refers to regressing residuals from the null model on \code{genotype_adj} +and taking the residuals (default = \code{optimal}).} } \value{ a data frame with p rows corresponding to the p genetic variants in the given variant-set @@ -48,3 +55,8 @@ the object from fitting the null model to analyze the conditional associations b a quantitative/dichotomous phenotype and all individual variants in a given variant-set by using score test, adjusting for a given list of variants. } +\references{ +Sofer, T., et al. (2019). A fully adjusted two-stage procedure for rank-normalization +in genetic association studies. \emph{Genetic Epidemiology}, \emph{43}(3), 263-275. +(\href{https://doi.org/10.1002/gepi.22188}{pub}) +} diff --git a/man/STAAR.Rd b/man/STAAR.Rd index 0450563..197925a 100644 --- a/man/STAAR.Rd +++ b/man/STAAR.Rd @@ -31,10 +31,10 @@ SKAT(1,25), SKAT(1,1), Burden(1,25), Burden(1,1), ACAT-V(1,25), ACAT-V(1,1) and ACAT-O tests (default = NULL).} \item{rare_maf_cutoff}{the cutoff of maximum minor allele frequency in -defining rare variants (default is 0.01).} +defining rare variants (default = 0.01).} \item{rv_num_cutoff}{the cutoff of minimum number of variants of analyzing -a given variant-set (default is 2).} +a given variant-set (default = 2).} } \value{ a list with the following members: @@ -43,6 +43,9 @@ a list with the following members: \code{rare_maf_cutoff} in the given variant-set that are used for performing the variant-set using STAAR. +\code{cMAC}: the cumulative minor allele count of variants with +minor allele frequency > 0 and less than \code{rare_maf_cutoff} in the given variant-set. + \code{RV_label}: the boolean vector indicating whether each variant in the given variant-set has minor allele frequency > 0 and less than \code{rare_maf_cutoff}. @@ -93,13 +96,18 @@ and ACAT-V(1,1) together with p-values of each test weighted by each annotation using Cauchy method. } \references{ -Li, X., Li, Z. et al. (2020). Dynamic incorporation of multiple +Li, X., Li, Z., et al. (2020). Dynamic incorporation of multiple in silico functional annotations empowers rare variant association analysis of -large whole-genome sequencing studies at scale. \emph{Nature Genetics}. -(\href{https://www.nature.com/articles/s41588-020-0676-4}{pub}) +large whole-genome sequencing studies at scale. \emph{Nature Genetics}, \emph{52}(9), 969-983. +(\href{https://doi.org/10.1038/s41588-020-0676-4}{pub}) Liu, Y., et al. (2019). Acat: A fast and powerful p value combination method for rare-variant analysis in sequencing studies. -\emph{The American Journal of Humann Genetics 104}(3), 410-421. -(\href{https://www.sciencedirect.com/science/article/pii/S0002929719300023}{pub}) +\emph{The American Journal of Human Genetics}, \emph{104}(3), 410-421. +(\href{https://doi.org/10.1016/j.ajhg.2019.01.002}{pub}) + +Li, Z., Li, X., et al. (2020). Dynamic scan procedure for +detecting rare-variant association regions in whole-genome sequencing studies. +\emph{The American Journal of Human Genetics}, \emph{104}(5), 802-814. +(\href{https://doi.org/10.1016/j.ajhg.2019.03.002}{pub}) } diff --git a/man/STAAR_cond.Rd b/man/STAAR_cond.Rd index bea136e..9447a33 100644 --- a/man/STAAR_cond.Rd +++ b/man/STAAR_cond.Rd @@ -10,7 +10,8 @@ STAAR_cond( obj_nullmodel, annotation_phred = NULL, rare_maf_cutoff = 0.01, - rv_num_cutoff = 2 + rv_num_cutoff = 2, + method_cond = c("optimal", "naive") ) } \arguments{ @@ -37,10 +38,16 @@ SKAT(1,25), SKAT(1,1), Burden(1,25), Burden(1,1), ACAT-V(1,25), ACAT-V(1,1) and ACAT-O tests (default = NULL).} \item{rare_maf_cutoff}{the cutoff of maximum minor allele frequency in -defining rare variants (default is 0.01).} +defining rare variants (default = 0.01).} \item{rv_num_cutoff}{the cutoff of minimum number of variants of analyzing -a given variant-set (default is 2).} +a given variant-set (default = 2).} + +\item{method_cond}{a character value indicating the method for conditional analysis. +\code{optimal} refers to regressing residuals from the null model on \code{genotype_adj} +as well as all covariates used in fitting the null model (fully adjusted) and taking the residuals; +\code{naive} refers to regressing residuals from the null model on \code{genotype_adj} +and taking the residuals (default = \code{optimal}).} } \value{ a list with the following members: @@ -49,6 +56,9 @@ a list with the following members: \code{rare_maf_cutoff} in the given variant-set that are used for performing the variant-set using STAAR. +\code{cMAC}: the cumulative minor allele count of variants with +minor allele frequency > 0 and less than \code{rare_maf_cutoff} in the given variant-set. + \code{RV_label}: the boolean vector indicating whether each variant in the given variant-set has minor allele frequency > 0 and less than \code{rare_maf_cutoff}. @@ -101,13 +111,22 @@ together with conditional p-values of each test weighted by each annotation using Cauchy method. } \references{ -Li, X., Li, Z. et al. (2020). Dynamic incorporation of multiple +Li, X., Li, Z., et al. (2020). Dynamic incorporation of multiple in silico functional annotations empowers rare variant association analysis of -large whole-genome sequencing studies at scale. \emph{Nature Genetics}. -(\href{https://www.nature.com/articles/s41588-020-0676-4}{pub}) +large whole-genome sequencing studies at scale. \emph{Nature Genetics}, \emph{52}(9), 969-983. +(\href{https://doi.org/10.1038/s41588-020-0676-4}{pub}) Liu, Y., et al. (2019). Acat: A fast and powerful p value combination method for rare-variant analysis in sequencing studies. -\emph{The American Journal of Humann Genetics 104}(3), 410-421. -(\href{https://www.sciencedirect.com/science/article/pii/S0002929719300023}{pub}) +\emph{The American Journal of Human Genetics}, \emph{104}(3), 410-421. +(\href{https://doi.org/10.1016/j.ajhg.2019.01.002}{pub}) + +Li, Z., Li, X., et al. (2020). Dynamic scan procedure for +detecting rare-variant association regions in whole-genome sequencing studies. +\emph{The American Journal of Human Genetics}, \emph{104}(5), 802-814. +(\href{https://doi.org/10.1016/j.ajhg.2019.03.002}{pub}) + +Sofer, T., et al. (2019). A fully adjusted two-stage procedure for rank-normalization +in genetic association studies. \emph{Genetic Epidemiology}, \emph{43}(3), 263-275. +(\href{https://doi.org/10.1002/gepi.22188}{pub}) } diff --git a/man/fit_null_glmmkin.Rd b/man/fit_null_glmmkin.Rd index 21c97a7..8c8e6a0 100644 --- a/man/fit_null_glmmkin.Rd +++ b/man/fit_null_glmmkin.Rd @@ -99,13 +99,13 @@ variant-set tests in whole genome sequencing data analysis. See \code{\link{glmm } \references{ Chen, H., et al. (2016). Control for population structure and relatedness for binary traits -in genetic association studies via logistic mixed models. \emph{The American Journal of Humann Genetics 98}(4), 653-666. -(\href{https://www.sciencedirect.com/science/article/pii/S000292971600063X}{pub}) +in genetic association studies via logistic mixed models. \emph{The American Journal of Human Genetics}, \emph{98}(4), 653-666. +(\href{https://doi.org/10.1016/j.ajhg.2016.02.012}{pub}) Chen, H., et al. (2019). Efficient variant set mixed model association tests for continuous and -binary traits in large-scale whole-genome sequencing studies. \emph{The American Journal of Humann Genetics 104}(2), 260-274. -(\href{https://www.sciencedirect.com/science/article/pii/S0002929718304658}{pub}) +binary traits in large-scale whole-genome sequencing studies. \emph{The American Journal of Human Genetics}, \emph{104}(2), 260-274. +(\href{https://doi.org/10.1016/j.ajhg.2018.12.012}{pub}) -Chen, H. (2020). GMMAT: Generalized Linear Mixed Model Association Tests. -(\href{https://github.com/hanchenphd/GMMAT/blob/master/inst/doc/GMMAT.pdf}{web}) +Chen, H. (2020). GMMAT: Generalized linear Mixed Model Association Tests Version 1.3. +(\href{https://cloud.r-project.org/web/packages/GMMAT/vignettes/GMMAT.pdf}{web}) }