diff --git a/.Rbuildignore b/.Rbuildignore index 6d2eaaa..fe9d8bd 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1 +1,2 @@ -.github/ \ No newline at end of file +.github/ +^simulated_* \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index 9e2e2a3..92c6a1b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -14,6 +14,7 @@ Suggests: Remotes: jeffersonfparil/simquantgen Imports: + methods, vcfR, parallel, cluster, diff --git a/NAMESPACE b/NAMESPACE index 87c9e8d..11ef448 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -11,5 +11,6 @@ export(fn_filter_phenotype) export(fn_load_genotype) export(fn_load_phenotype) export(fn_merge_genotype_and_phenotype) +export(fn_prediction_performance_metrics) export(fn_simulate_data) export(fn_vcf_to_G) diff --git a/R/load.R b/R/load.R index bbb78e2..0650407 100644 --- a/R/load.R +++ b/R/load.R @@ -1,13 +1,12 @@ suppressWarnings(suppressPackageStartupMessages(library(simquantgen))) suppressWarnings(suppressPackageStartupMessages(library(vcfR))) suppressWarnings(suppressPackageStartupMessages(library(txtplot))) -suppressWarnings(suppressPackageStartupMessages(library(testthat))) ### Create an error class methods::setClass("gpError", representation(code="numeric", message="character"), prototype(code=0, message="Empty error message. Please define.")) ### Create an error chaining method methods::setGeneric("chain", function(x, y){ - methods::standardGeneric("chain") + standardGeneric("chain") }) methods::setMethod(f="chain", signature=c(x="gpError", y="gpError"), @@ -695,10 +694,10 @@ fn_classify_allele_frequencies = function(G, ploidy=2, verbose=FALSE) { #' @param l number of loci (Default=1000) #' @param ploidy ploidy level which can refer to the number of haploid genomes to simulate pools (Default=2) #' @param n_alleles macimum number of alleles per locus (Default=2) -#' @param min_depth minimum depth per locus [Default=5] -#' @param max_depth maximum depth per locus [Default=5000] -#' @param n_pop number of randomly assigned population groupings [Default=1] -#' @param seed randomisation seed for replicability [Default=12345] +#' @param min_depth minimum depth per locus (Default=5) +#' @param max_depth maximum depth per locus (Default=5000) +#' @param n_pop number of randomly assigned population groupings (Default=1) +#' @param seed randomisation seed for replicability (Default=12345) #' @param save_pheno_tsv save the phenotype data as a tab-delimited file? (Default=TRUE) #' @param save_geno_vcf save the genotype data as a vcf file? (Default=TRUE) #' @param save_geno_tsv save the genotype data as a tab-delimited allele frequency table file? (Default=FALSE) @@ -955,16 +954,16 @@ fn_load_genotype = function(fname_geno, ploidy=NULL, retain_minus_one_alleles_pe #' @param maf minimum allele frequency (Default=0.01) #' @param sdev_min minimum allele frequency standard deviation (constant allele frequency across samples is just another intercept) (Default=0.0001) #' @param fname_snp_list name of the file containing the list of expected SNPs including their coordinates and alleles. -#' This is a tab-delimited file with 3 columns: '#CHROM', 'POS', 'REF,ALT', corresponding to([Default=NULL) +#' This is a tab-delimited file with 3 columns: '#CHROM', 'POS', 'REF,ALT', corresponding to (Default=NULL) #' chromosome names (e.g. 'chr1' & 'chrom_1'), #' numeric positions (e.g. 12345 & 100001), and -#' reference-alternative allele strings separated by a comma (e.g. 'A,T' & 'allele_1,allele_alt')([Default=NULL) -#' @param max_sparsity_per_locus maximum mean sparsity per locus, e.g. 0.1 or 0.5([Default=NULL) -#' @param frac_topmost_sparse_loci_to_remove fraction of the top-most sparse loci to remove, e.g. 0.01 or 0.25([Default=NULL) -#' @param n_topmost_sparse_loci_to_remove number of top-most sparse loci to remove, e.g. 100 or 1000([Default=NULL) -#' @param max_sparsity_per_sample maximum mean sparsity per sample, e.g. 0.3 or 0.5([Default=NULL) -#' @param frac_topmost_sparse_samples_to_remove fraction of the top-most sparse samples to remove, e.g. 0.01 or 0.05([Default=NULL) -#' @param n_topmost_sparse_samples_to_remove number of top-most sparse samples to remove, e.g. 5 or 10([Default=NULL) +#' reference-alternative allele strings separated by a comma (e.g. 'A,T' & 'allele_1,allele_alt') (Default=NULL) +#' @param max_sparsity_per_locus maximum mean sparsity per locus, e.g. 0.1 or 0.5 (Default=NULL) +#' @param frac_topmost_sparse_loci_to_remove fraction of the top-most sparse loci to remove, e.g. 0.01 or 0.25 (Default=NULL) +#' @param n_topmost_sparse_loci_to_remove number of top-most sparse loci to remove, e.g. 100 or 1000 (Default=NULL) +#' @param max_sparsity_per_sample maximum mean sparsity per sample, e.g. 0.3 or 0.5 (Default=NULL) +#' @param frac_topmost_sparse_samples_to_remove fraction of the top-most sparse samples to remove, e.g. 0.01 or 0.05 (Default=NULL) +#' @param n_topmost_sparse_samples_to_remove number of top-most sparse samples to remove, e.g. 5 or 10 (Default=NULL) #' @param verbose show genotype filtering messages? (Default=FALSE) #' @returns #' Ok: numeric n samples x p loci-alleles matrix of allele frequencies with non-null row and column names. @@ -1274,7 +1273,7 @@ fn_filter_genotype = function(G, maf=0.01, sdev_min=0.0001, #' Load phenotype data from a text-delimited file #' #' @param fname_pheno filname of the text-delimited phenotype data -#' @param sep column-delimiter in the phenotype file (Default='\t') +#' @param sep column-delimiter in the phenotype file (Default="\t") #' @param header does the phenotype file have a header line? (Default=TRUE) #' @param idx_col_id which column correspond to the sample/entry/pool/genotype names? (Default=1) #' @param idx_col_pop which column correspond to the population groupings? (Default=2) diff --git a/R/metrics.R b/R/metrics.R index f530cd9..8ac537a 100644 --- a/R/metrics.R +++ b/R/metrics.R @@ -1,8 +1,10 @@ +# source("R/load.R") + #' Genomic prediction metrics -#' Add more metrics which we may be interested in. #' NOTES: -#' - the number of non-zero estimated marker effects are extracted within each model, i.e. in `models.R` -#' - the run-time of each genomic prediction and validation on a single set is measured during each cross-validation replicate, i.e in `cross-validation.R` +#' - Add more metrics which we may be interested in. while keeping the function signatures, i.e. input and output consistent +#' - The number of non-zero estimated marker effects are extracted within each model, i.e. in `models.R` +#' - Run time of each genomic prediction and validation on a single set is measured during each cross-validation replicate, i.e in `cross-validation.R` #' #' @param y_true numeric vector observed phenotype values #' @param y_pred numeric vector predicted phenotype values @@ -15,50 +17,91 @@ #' $power_t10: fraction of observed top 10 phenotype values correctly predicted #' $power_b10: fraction of observed bottom 10 phenotype values correctly predicted #' $h2: estimated narrow-sense heritability weighted by Pearson's correlation between observed and predicted phenotype values -#' @example +#' Formula: +#' - h2_predicted = corr(observed, predicted) * h2_true +#' - Caveat: yields NA if the estimated heritability lies beyond 0 and 1 +#' which happens when correlation between observed and predicted phenotypes is negative, and +#' the predicted phenotypes have higher variance than observed phenotypes. +#' This unexpected higher variance in the predictions implies that the genomic prediction model performs very poorly, +#' and therefore heritability estimates from them should not be considered. +#' Assumptions: +#' - correlation between predicted and true trait values relates the variance of the predicted trait values with the true heritability, +#' - predicted trait values are due to purely additive effects, +#' - variance in the predicted trait values is proportional to the additive genetic variance (~Va) +#' - variance in the true trait values represent the total phenotypic variance (Vp) +#' @examples #' y_pred = stats::rnorm(100) -#' y_true = y_pred + rnorm(100) -#' list_metrics = fn_prediction_performance_metrics(y_true=y_true, y_pred=y_pred) -fn_prediction_performance_metrics = function(y_true, y_pred) { +#' y_true = y_pred + stats::rnorm(100) +#' list_metrics = fn_prediction_performance_metrics(y_true=y_true, y_pred=y_pred, verbose=TRUE) +#' @export +fn_prediction_performance_metrics = function(y_true, y_pred, verbose=FALSE) { + ################################################### + ### TEST # y_true = stats::rnorm(100) # y_pred = stats::rnorm(100); # y_pred = y_true + stats::rnorm(n=100, mean=0, stats::sd=0.1) + # # y_pred = rep(NA, length(y_true)) + # verbose = TRUE + ################################################### y_true = as.vector(y_true) y_pred = as.vector(y_pred) - if (length(y_true) != length(y_pred)) { - print("Please make sure the observed and predicted phenotype vectors are the same length.") - return(list(mbe=NA, mae=NA, rmse=NA, r2=NA, corr=NA, , power_t10=NA, power_b10=NA)) + n = length(y_true) + if (n != length(y_pred)) { + error = methods::new("gpError", + code=000, + message=paste0( + "Error in metrics::fn_prediction_performance_metrics(...). ", + "The vector of observed phenotypes has a length of ", n, + " which does not match the length of the vector of predicted phenotypes with ", + length(y_pred), " elements." + )) + return(error) } error = y_true - y_pred - mbe = mean(error) - mae = mean(abs(error)) - rmse = sqrt(mean(error^2)) - r2 = 1 - (sum(error^2) / sum((y_true-mean(y_true))^2)) - corr = suppressWarnings(stats::cor(y_true, y_pred, method="pearson")) - if (is.na(corr)) { - corr = 0.0 + mbe = mean(error, na.rm=TRUE) + mae = mean(abs(error), na.rm=TRUE) + rmse = sqrt(mean(error^2, na.rm=TRUE)) + if (sum(is.na(error)) == n) { + r2 = NA + } else { + r2 = 1 - (sum(error^2, na.rm=TRUE) / sum((y_true-mean(y_true, na.rm=TRUE))^2, na.rm=TRUE)) } + corr = suppressWarnings(stats::cor(y_true, y_pred, method="pearson")) ### Power to select true top and bottom 10% - n = length(y_true) n_top_or_bottom_10 = max(c(1, round(0.1*n))) top10_dec_true = order(y_true, decreasing=TRUE)[1:n_top_or_bottom_10] - top10_dec_pred = order(y_pred, decreasing=TRUE)[1:n_top_or_bottom_10] - power_t10 = sum((top10_dec_true %in% top10_dec_pred)) / n_top_or_bottom_10 bottom10_dec_true = order(y_true, decreasing=FALSE)[1:n_top_or_bottom_10] - bottom10_dec_pred = order(y_pred, decreasing=FALSE)[1:n_top_or_bottom_10] - power_b10 = sum((bottom10_dec_true %in% bottom10_dec_pred)) / n_top_or_bottom_10 + if (sum(is.na(error)) == n) { + power_t10 = NA + power_b10 = NA + } else { + top10_dec_pred = order(y_pred, decreasing=TRUE)[1:n_top_or_bottom_10] + bottom10_dec_pred = order(y_pred, decreasing=FALSE)[1:n_top_or_bottom_10] + power_t10 = sum((top10_dec_true %in% top10_dec_pred)) / n_top_or_bottom_10 + power_b10 = sum((bottom10_dec_true %in% bottom10_dec_pred)) / n_top_or_bottom_10 + } ### Narrow-sense heritability scaled by prediction accuracy in terms of Pearson's correlation - ### Assumptions: - ### - the predicted trait values are due to purely additive effects, - ### - the variance in the predicted trait values is proportional to the additive genetic variance (~Va) - ### - the variance in the true trait values represent the total phenotypic variance (Vp) - ### - the correlation between predicted and true trait values is the factor that relates the variance of the predicted trait values with the true heritability, i.e. h2_predicted = corr(true,pred) * h2_true - h2 = round((stats::var(y_pred) / stats::var(y_true)) / corr, 10) + h2 = round((stats::var(y_pred, na.rm=TRUE) / stats::var(y_true)) / corr, 10) if (!is.na(h2)) { if ((h2 < 0.0) | (h2 > 1.0)) { h2 = NA } - } else { - h2 = 0.0 + } + if (verbose) { + vec_idx = which(!is.na(y_true) & !is.na(y_pred)) + if (length(vec_idx) > 0) { + print("Scatter plot of the observed and predicted phenotypes") + txtplot::txtplot(x=y_true, y=y_pred) + } else { + print("All pairs of phenotype values have missing data.") + } + print(paste0("Mean bias error (mbe) = ", mbe)) + print(paste0("Mean absolute error (mae) = ", mae)) + print(paste0("Root mean square error (rmse) = ", rmse)) + print(paste0("Coefficient of determination (r2) = ", r2)) + print(paste0("Power to identify top 10 (power_t10) = ", mbe)) + print(paste0("Power to identify bottom 10 (power_b10) = ", power_b10)) + print(paste0("Pearson's correlation (corr) = ", corr)) + print(paste0("Narrow-sense heritability estimate (h2) = ", h2)) } ### Output return(list( @@ -71,21 +114,3 @@ fn_prediction_performance_metrics = function(y_true, y_pred) { power_b10=power_b10, h2=h2)) } - -##################################################################### -############################### Tests ############################### -##################################################################### -tests_metrics = function() { - test_that( - "fn_prediction_performance_metrics", { - print("fn_prediction_performance_metrics:") - n = 100 - y1 = 1:n - y2 = y1 + pi - m0 = fn_prediction_performance_metrics(y1, y1) - m1 = fn_prediction_performance_metrics(y1, y2) - expect_equal(m0, list(mbe=0, mae=0, rmse=0, r2=1, corr=1, power_t10=1, power_b10=1, h2=1)) - expect_equal(m1, list(mbe=-pi, mae=pi, rmse=pi, r2=(1-(n*(pi^2)/sum((y1-mean(y1))^2))), corr=1, power_t10=1, power_b10=1, h2=1)) - } - ) -} diff --git a/R/models.R b/R/models.R index 09bd249..ff57fd1 100644 --- a/R/models.R +++ b/R/models.R @@ -1,14 +1,10 @@ -### Load the genomic_selection conda environment: conda activate genomic_selection -suppressPackageStartupMessages(library(MASS)) +# source("R/load.R") +# source("R/metrics.R") suppressWarnings(suppressPackageStartupMessages(library(glmnet))) suppressWarnings(suppressPackageStartupMessages(library(BGLR))) suppressWarnings(suppressPackageStartupMessages(library(sommer))) # suppressWarnings(suppressPackageStartupMessages(library(epistasisfeatures))) -# dirname_functions = dirname(sys.frame(1)$ofile) -# # dirname_functions = "/group/pasture/Jeff/genomic_selection/src" -# source(paste0(dirname_functions, "/metrics.R")) - ### New models can be added by having the same input structure, i.e.: ### 1.) nxp matrix of numeric genotype data without missing values ### 2.) n-sized vector of numeric phenotype data without missing values @@ -16,69 +12,96 @@ suppressWarnings(suppressPackageStartupMessages(library(sommer))) ### 4.) vector of numeric indexes corresponding to the index of the validation set in the genotype matrix and phenotype vector ### 5.) a list of other parameters required by the internal functions -fn_least_squares_moore_penrose = function(G, y, idx_training, idx_validation, other_params=list(covariate=NULL)) { - # n = 100; n_alleles = 2 - # G = simquantgen::fn_simulate_genotypes(n=n, l=500, ploidy=2, n_alleles=n_alleles, min_allele_freq=0.001, n_chr=5, max_pos=135e6, dist_bp_at_50perc_r2=5e6, n_threads=2) - # y = simquantgen::fn_simulate_phenotypes(G, n_alleles=n_alleles, dist_effects="norm", n_effects=10, purely_additive=TRUE, n_networks=1, n_effects_per_network=50, h2=0.75, pheno_reps=1)$Y - # y = scale(y, center=TRUE, scale=TRUE) - # idx_validation = c(1:round(n*0.1)) - # idx_training = !(c(1:n) %in% idx_validation) - # # other_params=list(tol=.Machine$double.eps) - # other_params=list(tol=.Machine$double.eps, covariate=matrix(stats::rnorm(2*n), ncol=2)) - ### Make sure that our genotype and phenotype data are labelled, i.e. have row and column names - if (is.null(rownames(G)) | is.null(colnames(G)) | is.null(rownames(y))) { - print("Error: Genotype and/or phenotype data have no labels. Please include the row names (entry names) and column names (loci names).") - return(-1) +fn_ols = function(list_merged, vec_idx_training, vec_idx_validation, other_params=list(diag_inflate=1e-4)) { + ################################################### + ### TEST + list_sim = fn_simulate_data(n_pop=3, verbose=TRUE) + G = fn_load_genotype(fname_geno=list_sim$fname_geno_vcf) + list_pheno = fn_load_phenotype(fname_pheno=list_sim$fname_pheno_tsv) + COVAR = G %*% t(G); # rownames(COVAR) = NULL + list_merged = fn_merge_genotype_and_phenotype(G=G, list_pheno=list_pheno, COVAR=COVAR, verbose=TRUE) + n = nrow(list_merged$G) + vec_idx_training = sample(c(1:n), floor(n/2)) + vec_idx_validation = c(1:n)[!(c(1:n) %in% vec_idx_training)] + other_params=list(diag_inflate=1e-4) + ################################################### + if (methods::is(list_merged, "gpError")) { + error = chain(list_merged, + methods::new("gpError", + code=000, + message=paste0( + "Error in models::fn_ols(list_merged, vec_idx_training, vec_idx_validation, other_params). ", + "Input data is an error type." + ))) + return(error) } - X_training = G[idx_training, ] - y_training = y[idx_training] - X_validation = G[idx_validation, ] - y_validation = y[idx_validation] + X_training = list_merged$G[vec_idx_training, ] + y_training = list_merged$list_pheno$y[vec_idx_training] + X_validation = list_merged$G[vec_idx_validation, ] + y_validation = list_merged$list_pheno$y[vec_idx_validation] ### Adding covariate to explanatory matrix - C = other_params$covariate - if (is.null(C) == FALSE) { - if (nrow(C) < max(c(idx_training, idx_validation))) { - print("Error: The covariate matrix is incompatible with the genotype data, i.e. there are less rows in the covariate matrix that the indices in idx_training oridx_validation.") - return(-1) + if (!is.null(list_merged$COVAR)) { + X_training = cbind(list_merged$COVAR[vec_idx_training, ], X_training) + X_validation = cbind(list_merged$COVAR[vec_idx_validation, ], X_validation) + } + + if (nrow(X_training) > ncol(X_training)) { + A = t(X_training) %*% X_training + inv_A = tryCatch(solve(A), error=function(x) { + B = A + diag(B) = diag(B) + other_params$diag_inflate + out = tryCatch(solve(B), error=function(e){return(NA)}) + return(out) + }) + diag(A %*% inv_A) + if (is.na(inv_A)) { + error = methods::new("gpError", + code=000, + message=paste0( + "Error in models::fn_ols(list_merged, vec_idx_training, vec_idx_validation, other_params)" + )) } - X_training = cbind(C[idx_training, ], X_training) - X_validation = cbind(C[idx_validation, ], X_validation) } - b = MASS::ginv(t(X_training) %*% X_training, tol=.Machine$double.eps) %*% t(X_training) %*% y_training + + + + A = t(X_training) %*% X_training + inv_A = solve() + b = inv_A %*% t(X_training) %*% y_training y_pred = X_validation %*% b - df_y = merge(data.frame(id=rownames(y)[idx_validation], true=y_validation), data.frame(id=rownames(y_pred), pred=y_pred), by="id") + df_y = merge(data.frame(id=rownames(y)[vec_idx_validation], true=y_validation), data.frame(id=rownames(y_pred), pred=y_pred), by="id") perf = fn_prediction_performance_metrics(y_true=df_y$true, y_pred=df_y$pred) perf$y_pred = y_pred perf$n_non_zero = sum(abs(b) > .Machine$double.eps) return(perf) } -fn_ridge = function(G, y, idx_training, idx_validation, other_params=list(covariate=NULL)) { +fn_ridge = function(G, y, vec_idx_training, vec_idx_validation, other_params=list(covariate=NULL)) { # n = 100; n_alleles = 2 # G = simquantgen::fn_simulate_genotypes(n=n, l=500, ploidy=2, n_alleles=n_alleles, min_allele_freq=0.001, n_chr=5, max_pos=135e6, dist_bp_at_50perc_r2=5e6, n_threads=2) # y = simquantgen::fn_simulate_phenotypes(G, n_alleles=n_alleles, dist_effects="norm", n_effects=10, purely_additive=TRUE, n_networks=1, n_effects_per_network=50, h2=0.75, pheno_reps=1)$Y # y = scale(y, center=TRUE, scale=TRUE) - # idx_validation = c(1:round(n*0.1)) - # idx_training = !(c(1:n) %in% idx_validation) + # vec_idx_validation = c(1:round(n*0.1)) + # vec_idx_training = !(c(1:n) %in% vec_idx_validation) # other_params=list(covariate=NULL) ### Make sure that our genotype and phenotype data are labelled, i.e. have row and column names if (is.null(rownames(G)) | is.null(colnames(G)) | is.null(rownames(y))) { print("Error: Genotype and/or phenotype data have no labels. Please include the row names (entry names) and column names (loci names).") return(-1) } - X_training = G[idx_training, ] - y_training = y[idx_training] - X_validation = G[idx_validation, ] - y_validation = y[idx_validation] + X_training = G[vec_idx_training, ] + y_training = y[vec_idx_training] + X_validation = G[vec_idx_validation, ] + y_validation = y[vec_idx_validation] ### Adding covariate to explanatory matrix C = other_params$covariate if (is.null(C) == FALSE) { - if (nrow(C) < max(c(idx_training, idx_validation))) { - print("Error: The covariate matrix is incompatible with the genotype data, i.e. there are less rows in the covariate matrix that the indices in idx_training oridx_validation.") + if (nrow(C) < max(c(vec_idx_training, vec_idx_validation))) { + print("Error: The covariate matrix is incompatible with the genotype data, i.e. there are less rows in the covariate matrix that the indices in vec_idx_training oridx_validation.") return(-1) } - X_training = cbind(C[idx_training, ], X_training) - X_validation = cbind(C[idx_validation, ], X_validation) + X_training = cbind(C[vec_idx_training, ], X_training) + X_validation = cbind(C[vec_idx_validation, ], X_validation) } sol = glmnet::cv.glmnet(x=X_training, y=y_training, alpha=0) ### Ridge -> alpha = 0.0 ### If the optimisation picks a lambda which does not provide estimates of the effects of the genotype data then we use the next best lambda @@ -92,44 +115,44 @@ fn_ridge = function(G, y, idx_training, idx_validation, other_params=list(covari } n_non_zero = sol$nzero[idx] b_hat = c(sol$glmnet.fit$a0[idx], sol$glmnet.fit$beta[, idx]) - y_pred = cbind(rep(1, length(idx_validation)), X_validation) %*% b_hat + y_pred = cbind(rep(1, length(vec_idx_validation)), X_validation) %*% b_hat } else { y_pred = stats::predict(sol, newx=X_validation, s="lambda.min") } colnames(y_pred) = "pred" - df_y = merge(data.frame(id=rownames(y)[idx_validation], true=y_validation), data.frame(id=rownames(y_pred), pred=y_pred), by="id") + df_y = merge(data.frame(id=rownames(y)[vec_idx_validation], true=y_validation), data.frame(id=rownames(y_pred), pred=y_pred), by="id") perf = fn_prediction_performance_metrics(y_true=df_y$true, y_pred=df_y$pred) perf$y_pred = y_pred perf$n_non_zero = n_non_zero; names(perf$n_non_zero) = NULL return(perf) } -fn_lasso = function(G, y, idx_training, idx_validation, other_params=list(covariate=NULL)) { +fn_lasso = function(G, y, vec_idx_training, vec_idx_validation, other_params=list(covariate=NULL)) { # n = 100; n_alleles = 2 # G = simquantgen::fn_simulate_genotypes(n=n, l=500, ploidy=2, n_alleles=n_alleles, min_allele_freq=0.001, n_chr=5, max_pos=135e6, dist_bp_at_50perc_r2=5e6, n_threads=2) # y = simquantgen::fn_simulate_phenotypes(G, n_alleles=n_alleles, dist_effects="norm", n_effects=10, purely_additive=TRUE, n_networks=1, n_effects_per_network=50, h2=0.75, pheno_reps=1)$Y # y = scale(y, center=TRUE, scale=TRUE) - # idx_validation = c(1:round(n*0.1)) - # idx_training = !(c(1:n) %in% idx_validation) + # vec_idx_validation = c(1:round(n*0.1)) + # vec_idx_training = !(c(1:n) %in% vec_idx_validation) # other_params=list(covariate=NULL) ### Make sure that our genotype and phenotype data are labelled, i.e. have row and column names if (is.null(rownames(G)) | is.null(colnames(G)) | is.null(rownames(y))) { print("Error: Genotype and/or phenotype data have no labels. Please include the row names (entry names) and column names (loci names).") return(-1) } - X_training = G[idx_training, ] - y_training = y[idx_training] - X_validation = G[idx_validation, ] - y_validation = y[idx_validation] + X_training = G[vec_idx_training, ] + y_training = y[vec_idx_training] + X_validation = G[vec_idx_validation, ] + y_validation = y[vec_idx_validation] ### Adding covariate to explanatory matrix C = other_params$covariate if (is.null(C) == FALSE) { - if (nrow(C) < max(c(idx_training, idx_validation))) { - print("Error: The covariate matrix is incompatible with the genotype data, i.e. there are less rows in the covariate matrix that the indices in idx_training oridx_validation.") + if (nrow(C) < max(c(vec_idx_training, vec_idx_validation))) { + print("Error: The covariate matrix is incompatible with the genotype data, i.e. there are less rows in the covariate matrix that the indices in vec_idx_training oridx_validation.") return(-1) } - X_training = cbind(C[idx_training, ], X_training) - X_validation = cbind(C[idx_validation, ], X_validation) + X_training = cbind(C[vec_idx_training, ], X_training) + X_validation = cbind(C[vec_idx_validation, ], X_validation) } sol = glmnet::cv.glmnet(x=X_training, y=y_training, alpha=1) ### Lasso -> alpha = 1.0 ### If the optimisation picks a lambda which does not provide estimates of the effects of the genotype data then we use the next best lambda @@ -143,44 +166,44 @@ fn_lasso = function(G, y, idx_training, idx_validation, other_params=list(covari } n_non_zero = sol$nzero[idx] b_hat = c(sol$glmnet.fit$a0[idx], sol$glmnet.fit$beta[, idx]) - y_pred = cbind(rep(1, length(idx_validation)), X_validation) %*% b_hat + y_pred = cbind(rep(1, length(vec_idx_validation)), X_validation) %*% b_hat } else { y_pred = stats::predict(sol, newx=X_validation, s="lambda.min") } colnames(y_pred) = "pred" - df_y = merge(data.frame(id=rownames(y)[idx_validation], true=y_validation), data.frame(id=rownames(y_pred), pred=y_pred), by="id") + df_y = merge(data.frame(id=rownames(y)[vec_idx_validation], true=y_validation), data.frame(id=rownames(y_pred), pred=y_pred), by="id") perf = fn_prediction_performance_metrics(y_true=df_y$true, y_pred=df_y$pred) perf$y_pred = y_pred perf$n_non_zero = n_non_zero; names(perf$n_non_zero) = NULL return(perf) } -fn_elastic_net = function(G, y, idx_training, idx_validation, other_params=list(covariate=NULL)) { +fn_elastic_net = function(G, y, vec_idx_training, vec_idx_validation, other_params=list(covariate=NULL)) { # n = 100; n_alleles = 2 # G = simquantgen::fn_simulate_genotypes(n=n, l=500, ploidy=2, n_alleles=n_alleles, min_allele_freq=0.001, n_chr=5, max_pos=135e6, dist_bp_at_50perc_r2=5e6, n_threads=2) # y = simquantgen::fn_simulate_phenotypes(G, n_alleles=n_alleles, dist_effects="norm", n_effects=10, purely_additive=TRUE, n_networks=1, n_effects_per_network=50, h2=0.75, pheno_reps=1)$Y # y = scale(y, center=TRUE, scale=TRUE) - # idx_validation = c(1:round(n*0.1)) - # idx_training = !(c(1:n) %in% idx_validation) + # vec_idx_validation = c(1:round(n*0.1)) + # vec_idx_training = !(c(1:n) %in% vec_idx_validation) # other_params=list(covariate=NULL) ### Make sure that our genotype and phenotype data are labelled, i.e. have row and column names if (is.null(rownames(G)) | is.null(colnames(G)) | is.null(rownames(y))) { print("Error: Genotype and/or phenotype data have no labels. Please include the row names (entry names) and column names (loci names).") return(-1) } - X_training = G[idx_training, ] - y_training = y[idx_training] - X_validation = G[idx_validation, ] - y_validation = y[idx_validation] + X_training = G[vec_idx_training, ] + y_training = y[vec_idx_training] + X_validation = G[vec_idx_validation, ] + y_validation = y[vec_idx_validation] ### Adding covariate to explanatory matrix C = other_params$covariate if (is.null(C) == FALSE) { - if (nrow(C) < max(c(idx_training, idx_validation))) { - print("Error: The covariate matrix is incompatible with the genotype data, i.e. there are less rows in the covariate matrix that the indices in idx_training oridx_validation.") + if (nrow(C) < max(c(vec_idx_training, vec_idx_validation))) { + print("Error: The covariate matrix is incompatible with the genotype data, i.e. there are less rows in the covariate matrix that the indices in vec_idx_training oridx_validation.") return(-1) } - X_training = cbind(C[idx_training, ], X_training) - X_validation = cbind(C[idx_validation, ], X_validation) + X_training = cbind(C[vec_idx_training, ], X_training) + X_validation = cbind(C[vec_idx_validation, ], X_validation) } sol = glmnet::cv.glmnet(x=X_training, y=y_training) ### Elastic-net -> alpha is optimised ### If the optimisation picks a lambda which does not provide estimates of the effects of the genotype data then we use the next best lambda @@ -194,25 +217,25 @@ fn_elastic_net = function(G, y, idx_training, idx_validation, other_params=list( } n_non_zero = sol$nzero[idx] b_hat = c(sol$glmnet.fit$a0[idx], sol$glmnet.fit$beta[, idx]) - y_pred = cbind(rep(1, length(idx_validation)), X_validation) %*% b_hat + y_pred = cbind(rep(1, length(vec_idx_validation)), X_validation) %*% b_hat } else { y_pred = stats::predict(sol, newx=X_validation, s="lambda.min") } colnames(y_pred) = "pred" - df_y = merge(data.frame(id=rownames(y)[idx_validation], true=y_validation), data.frame(id=rownames(y_pred), pred=y_pred), by="id") + df_y = merge(data.frame(id=rownames(y)[vec_idx_validation], true=y_validation), data.frame(id=rownames(y_pred), pred=y_pred), by="id") perf = fn_prediction_performance_metrics(y_true=df_y$true, y_pred=df_y$pred) perf$y_pred = y_pred perf$n_non_zero = n_non_zero; names(perf$n_non_zero) = NULL return(perf) } -fn_Bayes_A = function(G, y, idx_training, idx_validation, other_params=list(nIter=12e3, burnIn=2e3, h2=0.5, out_prefix="bglr_bayesA-", covariate=NULL)) { +fn_Bayes_A = function(G, y, vec_idx_training, vec_idx_validation, other_params=list(nIter=12e3, burnIn=2e3, h2=0.5, out_prefix="bglr_bayesA-", covariate=NULL)) { # n = 100; n_alleles = 2 # G = simquantgen::fn_simulate_genotypes(n=n, l=500, ploidy=2, n_alleles=n_alleles, min_allele_freq=0.001, n_chr=5, max_pos=135e6, dist_bp_at_50perc_r2=5e6, n_threads=2) # y = simquantgen::fn_simulate_phenotypes(G, n_alleles=n_alleles, dist_effects="norm", n_effects=10, purely_additive=TRUE, n_networks=1, n_effects_per_network=50, h2=0.75, pheno_reps=1)$Y # y = scale(y, center=TRUE, scale=TRUE) - # idx_validation = c(1:round(n*0.1)) - # idx_training = !(c(1:n) %in% idx_validation) + # vec_idx_validation = c(1:round(n*0.1)) + # vec_idx_training = !(c(1:n) %in% vec_idx_validation) # other_params=list(nIter=1000, # burnIn=100, # h2=0.5, @@ -225,7 +248,7 @@ fn_Bayes_A = function(G, y, idx_training, idx_validation, other_params=list(nIte } ### Set validation set to missing yNA = y - yNA[idx_validation] = NA + yNA[vec_idx_validation] = NA if (is.null(other_params$covariate)==FALSE) { if (nrow(other_params$covariate) != nrow(G)) { print("The covariate and genotype matrices are incompatible, i.e. unequal number of rows.") @@ -238,9 +261,9 @@ fn_Bayes_A = function(G, y, idx_training, idx_validation, other_params=list(nIte } other_params$out_prefix = gsub(" ", "-", paste(other_params$out_prefix, Sys.time(), stats::runif(1), sep="-")) ### attempt at preventing overwrites to the running Gibbs samplers in parallel sol = BGLR::BGLR(y=yNA, ETA=ETA, nIter=other_params$nIter, burnIn=other_params$burnIn, R2=other_params$h2, saveAt=other_params$out_prefix, verbose=FALSE) - y_pred = sol$yHat[idx_validation] - names(y_pred) = rownames(yNA)[idx_validation] - df_y = merge(data.frame(id=rownames(y)[idx_validation], true=y[idx_validation]), data.frame(id=names(y_pred), pred=y_pred), by="id") + y_pred = sol$yHat[vec_idx_validation] + names(y_pred) = rownames(yNA)[vec_idx_validation] + df_y = merge(data.frame(id=rownames(y)[vec_idx_validation], true=y[vec_idx_validation]), data.frame(id=names(y_pred), pred=y_pred), by="id") perf = fn_prediction_performance_metrics(y_true=df_y$true, y_pred=df_y$pred) perf$y_pred = matrix(y_pred, ncol=1) colnames(perf$y_pred) = "pred" @@ -252,13 +275,13 @@ fn_Bayes_A = function(G, y, idx_training, idx_validation, other_params=list(nIte return(perf) } -fn_Bayes_B = function(G, y, idx_training, idx_validation, other_params=list(nIter=12e3, burnIn=2e3, h2=0.5, out_prefix="bglr_bayesB-", covariate=NULL)) { +fn_Bayes_B = function(G, y, vec_idx_training, vec_idx_validation, other_params=list(nIter=12e3, burnIn=2e3, h2=0.5, out_prefix="bglr_bayesB-", covariate=NULL)) { # n = 100; n_alleles = 2 # G = simquantgen::fn_simulate_genotypes(n=n, l=500, ploidy=2, n_alleles=n_alleles, min_allele_freq=0.001, n_chr=5, max_pos=135e6, dist_bp_at_50perc_r2=5e6, n_threads=2) # y = simquantgen::fn_simulate_phenotypes(G, n_alleles=n_alleles, dist_effects="norm", n_effects=10, purely_additive=TRUE, n_networks=1, n_effects_per_network=50, h2=0.75, pheno_reps=1)$Y # y = scale(y, center=TRUE, scale=TRUE) - # idx_validation = c(1:round(n*0.1)) - # idx_training = !(c(1:n) %in% idx_validation) + # vec_idx_validation = c(1:round(n*0.1)) + # vec_idx_training = !(c(1:n) %in% vec_idx_validation) # other_params=list(nIter=1000, # burnIn=100, # h2=0.5, @@ -271,7 +294,7 @@ fn_Bayes_B = function(G, y, idx_training, idx_validation, other_params=list(nIte } ### Set validation set to missing yNA = y - yNA[idx_validation] = NA + yNA[vec_idx_validation] = NA if (is.null(other_params$covariate)==FALSE) { if (nrow(other_params$covariate) != nrow(G)) { print("The covariate and genotype matrices are incompatible, i.e. unequal number of rows.") @@ -284,9 +307,9 @@ fn_Bayes_B = function(G, y, idx_training, idx_validation, other_params=list(nIte } other_params$out_prefix = gsub(" ", "-", paste(other_params$out_prefix, Sys.time(), stats::runif(1), sep="-")) ### attempt at preventing overwrites to the running Gibbs samplers in parallel sol = BGLR::BGLR(y=yNA, ETA=ETA, nIter=other_params$nIter, burnIn=other_params$burnIn, R2=other_params$h2, saveAt=other_params$out_prefix, verbose=FALSE) - y_pred = sol$yHat[idx_validation] - names(y_pred) = rownames(yNA)[idx_validation] - df_y = merge(data.frame(id=rownames(y)[idx_validation], true=y[idx_validation]), data.frame(id=names(y_pred), pred=y_pred), by="id") + y_pred = sol$yHat[vec_idx_validation] + names(y_pred) = rownames(yNA)[vec_idx_validation] + df_y = merge(data.frame(id=rownames(y)[vec_idx_validation], true=y[vec_idx_validation]), data.frame(id=names(y_pred), pred=y_pred), by="id") perf = fn_prediction_performance_metrics(y_true=df_y$true, y_pred=df_y$pred) perf$y_pred = matrix(y_pred, ncol=1) colnames(perf$y_pred) = "pred" @@ -298,13 +321,13 @@ fn_Bayes_B = function(G, y, idx_training, idx_validation, other_params=list(nIte return(perf) } -fn_Bayes_C = function(G, y, idx_training, idx_validation, other_params=list(nIter=12e3, burnIn=2e3, h2=0.5, out_prefix="bglr_bayesC-", covariate=NULL)) { +fn_Bayes_C = function(G, y, vec_idx_training, vec_idx_validation, other_params=list(nIter=12e3, burnIn=2e3, h2=0.5, out_prefix="bglr_bayesC-", covariate=NULL)) { # n = 100; n_alleles = 2 # G = simquantgen::fn_simulate_genotypes(n=n, l=500, ploidy=2, n_alleles=n_alleles, min_allele_freq=0.001, n_chr=5, max_pos=135e6, dist_bp_at_50perc_r2=5e6, n_threads=2) # y = simquantgen::fn_simulate_phenotypes(G, n_alleles=n_alleles, dist_effects="norm", n_effects=10, purely_additive=TRUE, n_networks=1, n_effects_per_network=50, h2=0.75, pheno_reps=1)$Y # y = scale(y, center=TRUE, scale=TRUE) - # idx_validation = c(1:round(n*0.1)) - # idx_training = !(c(1:n) %in% idx_validation) + # vec_idx_validation = c(1:round(n*0.1)) + # vec_idx_training = !(c(1:n) %in% vec_idx_validation) # other_params=list(nIter=1000, # burnIn=100, # h2=0.5, @@ -317,7 +340,7 @@ fn_Bayes_C = function(G, y, idx_training, idx_validation, other_params=list(nIte } ### Set validation set to missing yNA = y - yNA[idx_validation] = NA + yNA[vec_idx_validation] = NA if (is.null(other_params$covariate)==FALSE) { if (nrow(other_params$covariate) != nrow(G)) { print("The covariate and genotype matrices are incompatible, i.e. unequal number of rows.") @@ -330,9 +353,9 @@ fn_Bayes_C = function(G, y, idx_training, idx_validation, other_params=list(nIte } other_params$out_prefix = gsub(" ", "-", paste(other_params$out_prefix, Sys.time(), stats::runif(1), sep="-")) ### attempt at preventing overwrites to the running Gibbs samplers in parallel sol = BGLR::BGLR(y=yNA, ETA=ETA, nIter=other_params$nIter, burnIn=other_params$burnIn, R2=other_params$h2, saveAt=other_params$out_prefix, verbose=FALSE) - y_pred = sol$yHat[idx_validation] - names(y_pred) = rownames(yNA)[idx_validation] - df_y = merge(data.frame(id=rownames(y)[idx_validation], true=y[idx_validation]), data.frame(id=names(y_pred), pred=y_pred), by="id") + y_pred = sol$yHat[vec_idx_validation] + names(y_pred) = rownames(yNA)[vec_idx_validation] + df_y = merge(data.frame(id=rownames(y)[vec_idx_validation], true=y[vec_idx_validation]), data.frame(id=names(y_pred), pred=y_pred), by="id") perf = fn_prediction_performance_metrics(y_true=df_y$true, y_pred=df_y$pred) perf$y_pred = matrix(y_pred, ncol=1) colnames(perf$y_pred) = "pred" @@ -344,13 +367,13 @@ fn_Bayes_C = function(G, y, idx_training, idx_validation, other_params=list(nIte return(perf) } -fn_gBLUP = function(G, y, idx_training, idx_validation, other_params=list(covariate=NULL, mem_mb=1000)) { +fn_gBLUP = function(G, y, vec_idx_training, vec_idx_validation, other_params=list(covariate=NULL, mem_mb=1000)) { # n = 100; n_alleles = 2 # G = simquantgen::fn_simulate_genotypes(n=n, l=500, ploidy=2, n_alleles=n_alleles, min_allele_freq=0.001, n_chr=5, max_pos=135e6, dist_bp_at_50perc_r2=5e6, n_threads=2) # y = simquantgen::fn_simulate_phenotypes(G, n_alleles=n_alleles, dist_effects="norm", n_effects=10, purely_additive=TRUE, n_networks=1, n_effects_per_network=50, h2=0.75, pheno_reps=1)$Y # y = scale(y, center=TRUE, scale=TRUE) - # idx_validation = c(1:round(n*0.1)) - # idx_training = !(c(1:n) %in% idx_validation) + # vec_idx_validation = c(1:round(n*0.1)) + # vec_idx_training = !(c(1:n) %in% vec_idx_validation) # other_params=list(covariate=NULL, mem_mb=1000) ### Make sure that our genotype and phenotype data are labelled, i.e. have row and column names if (is.null(rownames(G)) | is.null(colnames(G)) | is.null(rownames(y))) { @@ -367,8 +390,8 @@ fn_gBLUP = function(G, y, idx_training, idx_validation, other_params=list(covari } ### Define training and validation datasets y_training = y - y_training[idx_validation] = NA - y_validation = y[idx_validation, 1, drop=FALSE] + y_training[vec_idx_validation] = NA + y_validation = y[vec_idx_validation, 1, drop=FALSE] ### Generate the genetic relationship matrix from the genotype data A = sommer::A.mat(G) ### Build the phenotype and id data frame @@ -410,15 +433,15 @@ tests_models = function() { y = simquantgen::fn_simulate_phenotypes(G, n_alleles=n_alleles, dist_effects="norm", n_effects=10, purely_additive=TRUE, n_networks=1, n_effects_per_network=50, h2=0.5, pheno_reps=1, verbose=TRUE)$Y z = fn_simulate_phenotypes(G, n_alleles=n_alleles, dist_effects="norm", n_effects=10, purely_additive=FALSE, n_networks=50, n_effects_per_network=500, h2=0.5, pheno_reps=1, verbose=TRUE)$Y C = matrix(stats::rnorm(3*n), ncol=3) - idx_validation = c(1:round(n*0.1)) - idx_training = !(c(1:n) %in% idx_validation) + vec_idx_validation = c(1:round(n*0.1)) + vec_idx_training = !(c(1:n) %in% vec_idx_validation) test_that( - "fn_least_squares_moore_penrose", { - print("fn_least_squares_moore_penrose:") - perf = fn_least_squares_moore_penrose(G, y, idx_training, idx_validation) + "fn_ols", { + print("fn_ols:") + perf = fn_ols(G, y, vec_idx_training, vec_idx_validation) expect_equal(perf$corr > 0.0, TRUE) expect_equal(perf$n_non_zero == 500, TRUE) - perf = fn_least_squares_moore_penrose(G, y, idx_training, idx_validation, other_params=list(tol=.Machine$double.eps, covariate=C)) + perf = fn_ols(G, y, vec_idx_training, vec_idx_validation, other_params=list(tol=.Machine$double.eps, covariate=C)) expect_equal(perf$corr > 0.0, TRUE) expect_equal(perf$n_non_zero == 503, TRUE) } @@ -427,10 +450,10 @@ tests_models = function() { test_that( "fn_ridge", { print("fn_ridge:") - perf = fn_ridge(G, y, idx_training, idx_validation) + perf = fn_ridge(G, y, vec_idx_training, vec_idx_validation) expect_equal(perf$corr > 0.0, TRUE) expect_equal(perf$n_non_zero == 500, TRUE) - perf = fn_ridge(G, y, idx_training, idx_validation, other_params=list(covariate=C)) + perf = fn_ridge(G, y, vec_idx_training, vec_idx_validation, other_params=list(covariate=C)) expect_equal(perf$corr > 0.0, TRUE) expect_equal(perf$n_non_zero == 503, TRUE) } @@ -439,10 +462,10 @@ tests_models = function() { test_that( "fn_lasso", { print("fn_lasso:") - perf = fn_lasso(G, y, idx_training, idx_validation) + perf = fn_lasso(G, y, vec_idx_training, vec_idx_validation) expect_equal(perf$corr > 0.1, TRUE) expect_equal(perf$n_non_zero < 500, TRUE) - perf = fn_lasso(G, y, idx_training, idx_validation, other_params=list(covariate=C)) + perf = fn_lasso(G, y, vec_idx_training, vec_idx_validation, other_params=list(covariate=C)) expect_equal(perf$corr > 0.1, TRUE) expect_equal(perf$n_non_zero < 503, TRUE) } @@ -451,10 +474,10 @@ tests_models = function() { test_that( "fn_elastic_net", { print("fn_elastic_net:") - perf = fn_elastic_net(G, y, idx_training, idx_validation) + perf = fn_elastic_net(G, y, vec_idx_training, vec_idx_validation) expect_equal(perf$corr > 0.1, TRUE) expect_equal(perf$n_non_zero < 500, TRUE) - perf = fn_elastic_net(G, y, idx_training, idx_validation, other_params=list(covariate=C)) + perf = fn_elastic_net(G, y, vec_idx_training, vec_idx_validation, other_params=list(covariate=C)) expect_equal(perf$corr > 0.1, TRUE) expect_equal(perf$n_non_zero < 503, TRUE) } @@ -463,10 +486,10 @@ tests_models = function() { test_that( "fn_Bayes_A", { print("fn_Bayes_A:") - perf = fn_Bayes_A(G, y, idx_training, idx_validation, other_params=list(nIter=1000, burnIn=100, h2=0.5, out_prefix="bglr_bayesA")) + perf = fn_Bayes_A(G, y, vec_idx_training, vec_idx_validation, other_params=list(nIter=1000, burnIn=100, h2=0.5, out_prefix="bglr_bayesA")) expect_equal(perf$corr > 0.1, TRUE) expect_equal(perf$n_non_zero == 500, TRUE) - perf = fn_Bayes_A(G, y, idx_training, idx_validation, other_params=list(nIter=1000, burnIn=100, h2=0.5, out_prefix="bglr_bayesA", covariate=C)) + perf = fn_Bayes_A(G, y, vec_idx_training, vec_idx_validation, other_params=list(nIter=1000, burnIn=100, h2=0.5, out_prefix="bglr_bayesA", covariate=C)) expect_equal(perf$corr > 0.1, TRUE) expect_equal(perf$n_non_zero == 500, TRUE) ### Excludes the fixed covariate effects } @@ -475,10 +498,10 @@ tests_models = function() { test_that( "fn_Bayes_B", { print("fn_Bayes_B:") - perf = fn_Bayes_B(G, y, idx_training, idx_validation, other_params=list(nIter=1000, burnIn=100, h2=0.5, out_prefix="bglr_bayesB")) + perf = fn_Bayes_B(G, y, vec_idx_training, vec_idx_validation, other_params=list(nIter=1000, burnIn=100, h2=0.5, out_prefix="bglr_bayesB")) expect_equal(perf$corr > 0.1, TRUE) expect_equal(perf$n_non_zero == 500, TRUE) - perf = fn_Bayes_B(G, y, idx_training, idx_validation, other_params=list(nIter=1000, burnIn=100, h2=0.5, out_prefix="bglr_bayesB", covariate=C)) + perf = fn_Bayes_B(G, y, vec_idx_training, vec_idx_validation, other_params=list(nIter=1000, burnIn=100, h2=0.5, out_prefix="bglr_bayesB", covariate=C)) expect_equal(perf$corr > 0.1, TRUE) expect_equal(perf$n_non_zero == 500, TRUE) ### Excludes the fixed covariate effects } @@ -487,10 +510,10 @@ tests_models = function() { test_that( "fn_Bayes_C", { print("fn_Bayes_C:") - perf = fn_Bayes_C(G, y, idx_training, idx_validation, other_params=list(nIter=1000, burnIn=100, h2=0.5, out_prefix="bglr_bayesC")) + perf = fn_Bayes_C(G, y, vec_idx_training, vec_idx_validation, other_params=list(nIter=1000, burnIn=100, h2=0.5, out_prefix="bglr_bayesC")) expect_equal(perf$corr > 0.1, TRUE) expect_equal(perf$n_non_zero == 500, TRUE) - perf = fn_Bayes_C(G, y, idx_training, idx_validation, other_params=list(nIter=1000, burnIn=100, h2=0.5, out_prefix="bglr_bayesC", covariate=C)) + perf = fn_Bayes_C(G, y, vec_idx_training, vec_idx_validation, other_params=list(nIter=1000, burnIn=100, h2=0.5, out_prefix="bglr_bayesC", covariate=C)) expect_equal(perf$corr > 0.1, TRUE) expect_equal(perf$n_non_zero == 500, TRUE) ### Excludes the fixed covariate effects } @@ -499,10 +522,10 @@ tests_models = function() { test_that( "fn_gBLUP", { print("fn_gBLUP:") - perf = fn_gBLUP(G, y, idx_training, idx_validation) + perf = fn_gBLUP(G, y, vec_idx_training, vec_idx_validation) expect_equal(perf$corr > 0.1, TRUE) expect_equal(is.na(perf$n_non_zero), TRUE) - perf = fn_gBLUP(G, y, idx_training, idx_validation, other_params=list(covariate=C)) + perf = fn_gBLUP(G, y, vec_idx_training, vec_idx_validation, other_params=list(covariate=C)) expect_equal(perf$corr > 0.1, TRUE) expect_equal(is.na(perf$n_non_zero), TRUE) ### Excludes the fixed covariate effects } diff --git a/man/fn_simulate_data.Rd b/man/fn_simulate_data.Rd index cf2e90d..015754e 100644 --- a/man/fn_simulate_data.Rd +++ b/man/fn_simulate_data.Rd @@ -31,13 +31,13 @@ fn_simulate_data( \item{n_alleles}{macimum number of alleles per locus (Default=2)} -\item{min_depth}{minimum depth per locus \link{Default=5}} +\item{min_depth}{minimum depth per locus (Default=5)} -\item{max_depth}{maximum depth per locus \link{Default=5000}} +\item{max_depth}{maximum depth per locus (Default=5000)} -\item{n_pop}{number of randomly assigned population groupings \link{Default=1}} +\item{n_pop}{number of randomly assigned population groupings (Default=1)} -\item{seed}{randomisation seed for replicability \link{Default=12345}} +\item{seed}{randomisation seed for replicability (Default=12345)} \item{save_pheno_tsv}{save the phenotype data as a tab-delimited file? (Default=TRUE)} diff --git a/tests/testthat/test-load.R b/tests/testthat/test-load.R index b33126c..bb2ff5c 100644 --- a/tests/testthat/test-load.R +++ b/tests/testthat/test-load.R @@ -1,3 +1,4 @@ +# library(testthat) # source("/group/pasture/Jeff/gp/R/load.R") test_that("fn_G_extract_names", {