working on models + will need to reestimate maximum number of threads…

… for within, and various across population cross-validations
jeffersonfparil · May 31, 2024 · 61ba298 · 61ba298
1 parent 09b7491
commit 61ba298
Show file tree

Hide file tree

Showing 11 changed files with 129 additions and 47 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -1,2 +1 @@
-.github/
-^simulated_*
+.github/
diff --git a/.gitignore b/.gitignore
@@ -46,4 +46,4 @@ docs/
 po/*~
 
 # RStudio Connect folder
-rsconnect/
+rsconnect/
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -17,9 +17,6 @@ Imports:
     methods,
     vcfR,
     parallel,
-    cluster,
-    mgcv,
-    nlme,
     glmnet,
     BGLR,
     sommer,

diff --git a/R/cross_validation.R b/R/cross_validation.R
@@ -472,6 +472,14 @@ fn_within_across_perse_genomic_prediction = function(G, idx_col_y, args, dir_tmp
         print("##################################################")
         print("Across population cross-validation:")
         print("(Leave-one-population-out CV)")
+
+
+
+        ### REDUCE NUMBER OF THREADS HERE ARE DATA SIZE differs from WITHIN POP CV!
+        ### USE MAXIMUM NUMBER OF THREADS ESTIMATION
+
+
+
         list_out = fn_leave_one_population_out_cross_validation(G=G, 
                                                                 y=y,
                                                                 pop=pop,

diff --git a/R/load.R b/R/load.R
@@ -693,7 +693,7 @@ fn_classify_allele_frequencies = function(G, ploidy=2, verbose=FALSE) {
 #' @param n number of samples (Default=100)
 #' @param l number of loci (Default=1000)
 #' @param ploidy ploidy level which can refer to the number of haploid genomes to simulate pools (Default=2)
-#' @param n_alleles macimum number of alleles per locus (Default=2)
+#' @param n_alleles maximum number of alleles per locus (Default=2)
 #' @param min_depth minimum depth per locus (Default=5)
 #' @param max_depth maximum depth per locus (Default=5000)
 #' @param n_pop number of randomly assigned population groupings (Default=1)
@@ -755,10 +755,11 @@ fn_simulate_data = function(n=100, l=1000, ploidy=2, n_alleles=2, min_depth=5, m
     fname_geno_rds = NULL
     fname_pheno_tsv = NULL
     ### Define date-time-random-number identifier so that we minimise the possibility of unintentional over-writing of the output file/s
-    date = gsub("-", "", gsub("[.]", "", gsub(":", "", gsub(" ", "", as.character(Sys.time())))))
+    # date = gsub("-", "", gsub("[.]", "", gsub(":", "", gsub(" ", "", as.character(Sys.time())))))
     ### Save phenotype file
     if (save_pheno_tsv) {
-        fname_pheno_tsv = file.path(getwd(), paste0("simulated_phenotype-", date, ".tsv"))
+        # fname_pheno_tsv = file.path(getwd(), paste0("simulated_phenotype-", date, ".tsv"))
+        fname_pheno_tsv = tempfile(fileext=".tsv")
         utils::write.table(df, file=fname_pheno_tsv, sep="\t", row.names=FALSE, col.names=TRUE, quote=FALSE)
         if (verbose) {
             print("Output phenotype file (tsv):")
@@ -777,7 +778,8 @@ fn_simulate_data = function(n=100, l=1000, ploidy=2, n_alleles=2, min_depth=5, m
             error = chain(vcf, error)
             return(error)
         }
-        fname_geno_vcf = file.path(getwd(), paste0("simulated_genotype-", date, ".vcf.gz"))
+        # fname_geno_vcf = file.path(getwd(), paste0("simulated_genotype-", date, ".vcf.gz"))
+        fname_geno_vcf = tempfile(fileext=".vcf.gz")
         vcfR::write.vcf(vcf, file=fname_geno_vcf)
         if (verbose) {
             print("Output genotype file (sync.gz):")
@@ -787,7 +789,8 @@ fn_simulate_data = function(n=100, l=1000, ploidy=2, n_alleles=2, min_depth=5, m
     if (save_geno_tsv)  {
         list_ids_chr_pos_all = fn_G_extract_names(mat_genotypes=G, verbose=verbose)
         df_geno = data.frame(chr=list_ids_chr_pos_all$vec_chr, pos=list_ids_chr_pos_all$vec_pos, allele=list_ids_chr_pos_all$vec_all, t(G))
-        fname_geno_tsv = file.path(getwd(), paste0("simulated_genotype-", date, ".tsv"))
+        # fname_geno_tsv = file.path(getwd(), paste0("simulated_genotype-", date, ".tsv"))
+        fname_geno_tsv = tempfile(fileext=".tsv")
         utils::write.table(df_geno, file=fname_geno_tsv, sep="\t", row.names=FALSE, col.names=TRUE, quote=FALSE)
         if (verbose) {
             print("Output genotype file (tsv):")
@@ -796,7 +799,8 @@ fn_simulate_data = function(n=100, l=1000, ploidy=2, n_alleles=2, min_depth=5, m
     }
     if (save_geno_rds) {
         ### Convert loci names so that the chromosome, position and allele are tab-delimited
-        fname_geno_rds = file.path(getwd(), paste0("simulated_genotype-", date, ".Rds"))
+        # fname_geno_rds = file.path(getwd(), paste0("simulated_genotype-", date, ".Rds"))
+        fname_geno_rds = tempfile(fileext=".Rds")
         ### Revert to numeric if we have more than 52 alleles (limit of the Latin alphabet)
         if (!non_numeric_Rds) {
             saveRDS(G, file=fname_geno_rds)
@@ -987,7 +991,7 @@ fn_load_genotype = function(fname_geno, ploidy=NULL, retain_minus_one_alleles_pe
 #'  REF_ALT=paste0("allele_1,allele_alt"))
 #' df_snp_list$REF_ALT[1:100] = "allele_2,allele_4"
 #' colnames(df_snp_list) = c("#CHROM", "POS", "REF,ALT")
-#' fname_snp_list = "tmp_snp_list.txt"
+#' fname_snp_list = tempfile(fileext=".snplist")
 #' utils::write.table(df_snp_list, file=fname_snp_list, sep="\t", 
 #'  row.names=FALSE, col.names=TRUE, quote=FALSE)
 #' ### Filter
@@ -1021,7 +1025,7 @@ fn_filter_genotype = function(G, maf=0.01, sdev_min=0.0001,
     # df_snp_list = data.frame(CHROM=mat_loci[,1], POS=as.numeric(mat_loci[,2]), REF_ALT=paste0("allele_1,allele_alt"))
     # df_snp_list$REF_ALT[1:100] = "allele_2,allele_4"
     # colnames(df_snp_list) = c("#CHROM", "POS", "REF,ALT")
-    # fname_snp_list = "tmp_snp_list.txt"
+    # fname_snp_list = tempfile(fileext=".snplist")
     # utils::write.table(df_snp_list, file=fname_snp_list, sep="\t", row.names=FALSE, col.names=TRUE, quote=FALSE)
     # verbose = TRUE
     ###################################################

diff --git a/R/metrics.R b/R/metrics.R
@@ -16,6 +16,8 @@
 #'  $corr: Pearson's product moment correlation
 #'  $power_t10: fraction of observed top 10 phenotype values correctly predicted
 #'  $power_b10: fraction of observed bottom 10 phenotype values correctly predicted
+#'  $var_pred: variance of predicted phenotype values (estimator of additive genetic variance)
+#'  $var_true: variance of observed phenotype values (estimator of total phenotypic variance)
 #'  $h2: estimated narrow-sense heritability weighted by Pearson's correlation between observed and predicted phenotype values
 #'      Formula:
 #'      - h2_predicted = corr(observed, predicted) * h2_true
@@ -80,7 +82,9 @@ fn_prediction_performance_metrics = function(y_true, y_pred, verbose=FALSE) {
         power_b10 = sum((bottom10_dec_true %in% bottom10_dec_pred)) / n_top_or_bottom_10
     }
     ### Narrow-sense heritability scaled by prediction accuracy in terms of Pearson's correlation
-    h2 = round((stats::var(y_pred, na.rm=TRUE) / stats::var(y_true)) / corr, 10)
+    var_pred = stats::var(y_pred, na.rm=TRUE)
+    var_true = stats::var(y_true, na.rm=TRUE)
+    h2 = round((var_pred/var_true) / corr, 10)
     if (!is.na(h2)) {
         if ((h2 < 0.0) | (h2 > 1.0)) {
             h2 = NA
@@ -101,6 +105,8 @@ fn_prediction_performance_metrics = function(y_true, y_pred, verbose=FALSE) {
         print(paste0("Power to identify top 10 (power_t10) = ", mbe))
         print(paste0("Power to identify bottom 10 (power_b10) = ", power_b10))
         print(paste0("Pearson's correlation (corr) = ", corr))
+        print(paste0("Variance of predicted phenotypes (var_pred) = ", var_pred, " (estimator of additive genetic variance)"))
+        print(paste0("Variance of observed phenotypes (var_true) = ", var_true, " (estimator of total phenotypic variance)"))
         print(paste0("Narrow-sense heritability estimate (h2) = ", h2))
     }
     ### Output
@@ -112,5 +118,7 @@ fn_prediction_performance_metrics = function(y_true, y_pred, verbose=FALSE) {
         corr=corr,
         power_t10=power_t10,
         power_b10=power_b10,
+        var_pred=var_pred,
+        var_true=var_true,
         h2=h2))
 }
diff --git a/R/models.R b/R/models.R
@@ -12,29 +12,60 @@ suppressWarnings(suppressPackageStartupMessages(library(sommer)))
 ###     4.) vector of numeric indexes corresponding to the index of the validation set in the genotype matrix and phenotype vector
 ###     5.) a list of other parameters required by the internal functions
 
-fn_ols = function(list_merged, vec_idx_training, vec_idx_validation, other_params=list(diag_inflate=1e-4)) {
+#' Ordinary least squares model
+#'
+#' @param list_merged list of merged genotype matrix, and phenotype vector, as well as an optional covariate matrix
+#'  $G: numeric n samples x p loci-alleles matrix of allele frequencies with non-null row and column names.
+#'      Row names can be any string of characters which identify the sample or entry or pool names.
+#'      Column names need to be tab-delimited, where first element refers to the chromosome or scaffold name, 
+#'      the second should be numeric which refers to the position in the chromosome/scaffold, and 
+#'      subsequent elements are optional which may refer to the allele identifier and other identifiers.
+#'  $list_pheno:
+#'      $y: named vector of numeric phenotype data
+#'      $pop: population or groupings corresponding to each element of y
+#'      $trait_name: name of the trait
+#'  $COVAR: numeric n samples x k covariates matrix with non-null row and column names.
+#' @param vec_idx_training vector of numeric indexes referring to the training set
+#' @param vec_idx_validation vector of numeric indexes referring to the validation set
+fn_ols = function(list_merged, vec_idx_training, vec_idx_validation, other_params=list(diag_inflate=1e-4), verbose=FALSE) {
     ###################################################
     ### TEST
-    list_sim = fn_simulate_data(n_pop=3, verbose=TRUE)
-    G = fn_load_genotype(fname_geno=list_sim$fname_geno_vcf)
-    list_pheno = fn_load_phenotype(fname_pheno=list_sim$fname_pheno_tsv)
-    COVAR = G %*% t(G); # rownames(COVAR) = NULL
-    list_merged = fn_merge_genotype_and_phenotype(G=G, list_pheno=list_pheno, COVAR=COVAR, verbose=TRUE)
-    n = nrow(list_merged$G)
-    vec_idx_training = sample(c(1:n), floor(n/2))
-    vec_idx_validation = c(1:n)[!(c(1:n) %in% vec_idx_training)]
-    other_params=list(diag_inflate=1e-4)
+    # list_sim = fn_simulate_data(n_pop=3, verbose=TRUE)
+    # G = fn_load_genotype(fname_geno=list_sim$fname_geno_vcf)
+    # list_pheno = fn_load_phenotype(fname_pheno=list_sim$fname_pheno_tsv)
+    # COVAR = G %*% t(G); # rownames(COVAR) = NULL
+    # list_merged = fn_merge_genotype_and_phenotype(G=G, list_pheno=list_pheno, COVAR=COVAR, verbose=TRUE)
+    # n = nrow(list_merged$G)
+    # vec_idx_training = sample(c(1:n), floor(n/2))
+    # vec_idx_validation = c(1:n)[!(c(1:n) %in% vec_idx_training)]
+    # other_params=list(diag_inflate=1e-6)
+    # verbose = TRUE
     ###################################################
     if (methods::is(list_merged, "gpError")) {
         error = chain(list_merged, 
             methods::new("gpError",
                 code=000,
                 message=paste0(
-                    "Error in models::fn_ols(list_merged, vec_idx_training, vec_idx_validation, other_params). ",
+                    "Error in models::fn_ols(...). ",
                     "Input data is an error type."
                 )))
         return(error)
     }
+    if (!is.logical(c(vec_idx_training, vec_idx_training)) |
+        (sum(is.na(c(vec_idx_training, vec_idx_training))) > 0) |
+        (min(c(vec_idx_training, vec_idx_training)) < 0) |
+        (max(c(vec_idx_training, vec_idx_training)) > nrow(list_merged$G))) {
+            error = methods::new("gpError",
+                code=000,
+                message=paste0(
+                    "Error in models::fn_ols(...). ",
+                    "Please make sure that the vector of indexes for the training and validation sets are not booleans. ",
+                    "We require the indexes to be positive integers without missing values. ",
+                    "Also, the maximum index (", max(c(vec_idx_training, vec_idx_training)), 
+                    ") should be less than or equal to the total number of samples/entries/pools in the input data: (", 
+                    nrow(list_merged$G), ")."))
+            return(error)
+    }
     X_training = list_merged$G[vec_idx_training, ]
     y_training = list_merged$list_pheno$y[vec_idx_training]
     X_validation = list_merged$G[vec_idx_validation, ]
@@ -44,35 +75,68 @@ fn_ols = function(list_merged, vec_idx_training, vec_idx_validation, other_param
         X_training = cbind(list_merged$COVAR[vec_idx_training, ], X_training)
         X_validation = cbind(list_merged$COVAR[vec_idx_validation, ], X_validation)
     }
-
-    if (nrow(X_training) > ncol(X_training)) {
+    ### Solve the least squares equations
+    n = nrow(X_training)
+    p = ncol(X_training)
+    if (n >= p) {
+        ### Canonical least squares equations
         A = t(X_training) %*% X_training
-        inv_A = tryCatch(solve(A), error=function(x) {
+        A_inv = tryCatch(solve(A), error=function(x) {
+            diag(A) = diag(A) + other_params$diag_inflate
+            x_inv = tryCatch(solve(A), error=function(x) {
+                NA
+            })
+        })
+        if (is.na(A_inv)) {
+            error = methods::new("gpError",
+                code=000,
+                message=paste0(
+                    "Error in models::fn_ols(...). ",
+                    "Failed to compute the inverse of X'X. ",
+                    "Consider increasing the other_params$diag_inflate: ", other_params$diag_inflate, 
+                    " to force the calculation of the inverse of X'X."
+                ))
+            return(error)
+        }
+        b = A_inv %*% t(X_training) %*% y_training
+    } else {
+        ### Non-canonical least squares equations
+        A = X_training %*% t(X_training)
+        A_inv = tryCatch(solve(A), error=function(A) {
             B = A
             diag(B) = diag(B) + other_params$diag_inflate
-            out = tryCatch(solve(B), error=function(e){return(NA)})
-            return(out)
+            B_inv = tryCatch(solve(B), error=function(B) {
+                return(NA)
+            })
+            return(B_inv)
         })
-        diag(A %*% inv_A)
-        if (is.na(inv_A)) {
+        if (is.na(A_inv)) {
             error = methods::new("gpError",
                 code=000,
                 message=paste0(
-                    "Error in models::fn_ols(list_merged, vec_idx_training, vec_idx_validation, other_params)"
+                    "Error in models::fn_ols(...). ",
+                    "Failed to compute the inverse of XX'. ",
+                    "Consider increasing the other_params$diag_inflate: ", other_params$diag_inflate, 
+                    " to force the calculation of the inverse of XX'."
                 ))
+            return(error)
         }
+        b = t(X_training) %*% A_inv %*% y_training
     }
-
-
-
-    A = t(X_training) %*% X_training
-    inv_A = solve()
-    b = inv_A %*% t(X_training) %*% y_training
     y_pred = X_validation %*% b
-    df_y = merge(data.frame(id=rownames(y)[vec_idx_validation], true=y_validation), data.frame(id=rownames(y_pred), pred=y_pred), by="id")
-    perf = fn_prediction_performance_metrics(y_true=df_y$true, y_pred=df_y$pred)
+    df_y = merge(
+        data.frame(id=names(y_validation), true=y_validation), 
+        data.frame(id=rownames(y_pred), pred=y_pred),
+        by="id")
+    perf = fn_prediction_performance_metrics(y_true=df_y$true, y_pred=df_y$pred, verbose=verbose)
     perf$y_pred = y_pred
+    perf$b = b[,1]; names(perf$b) = colnames(X_training)
     perf$n_non_zero = sum(abs(b) > .Machine$double.eps)
+    if (verbose) {
+        print("Allele effects distribution:")
+        txtplot::txtdensity(perf$b[!is.na(perf$b) & !is.infinite(perf$b)])
+        print(paste0("Number of non-zero effects: ", perf$n_non_zero, " (", round(100*p/perf$n_non_zero), "%)"))
+    }
     return(perf)
 }
 

diff --git a/man/fn_filter_genotype.Rd b/man/fn_filter_genotype.Rd
diff --git a/man/fn_simulate_data.Rd b/man/fn_simulate_data.Rd
diff --git a/tests/testthat/test-load.R b/tests/testthat/test-load.R
@@ -146,7 +146,7 @@ test_that("fn_filter_genotype", {
     df_snp_list = data.frame(CHROM=mat_loci[,1], POS=as.numeric(mat_loci[,2]), REF_ALT=paste0("allele_1,allele_alt"))
     df_snp_list$REF_ALT[1:n_sim_missing] = "allele_2,allele_4"
     colnames(df_snp_list) = c("#CHROM", "POS", "REF,ALT")
-    fname_snp_list = "tmp_snp_list.txt"
+    fname_snp_list = tempfile(fileext=".snplist")
     utils::write.table(df_snp_list, file=fname_snp_list, sep="\t", row.names=FALSE, col.names=TRUE, quote=FALSE)
     ### Filter with the alternative alleles
     G_filtered_2 = fn_filter_genotype(G=G, maf=0.05, fname_snp_list=fname_snp_list, verbose=TRUE)

diff --git a/tests/testthat/test-metrics.R b/tests/testthat/test-metrics.R
@@ -10,8 +10,8 @@ test_that("fn_prediction_performance_metrics", {
     list_metrics_2 = fn_prediction_performance_metrics(y_true, y_pred)
     list_metrics_3 = fn_prediction_performance_metrics(y_true, NA)
     list_metrics_4 = fn_prediction_performance_metrics(y_true, rep(NA, length(y_true)))
-    expect_equal(list_metrics_1, list(mbe=0, mae=0, rmse=0, r2=1, corr=1, power_t10=1, power_b10=1, h2=1))
-    expect_equal(list_metrics_2, list(mbe=-pi, mae=pi, rmse=pi, r2=(1-(n*(pi^2)/sum((y_true-mean(y_true))^2))), corr=1, power_t10=1, power_b10=1, h2=1))
+    expect_equal(list_metrics_1, list(mbe=0, mae=0, rmse=0, r2=1, corr=1, power_t10=1, power_b10=1, var_pred=841.666666666666, var_true=841.666666666666, h2=1))
+    expect_equal(list_metrics_2, list(mbe=-pi, mae=pi, rmse=pi, r2=(1-(n*(pi^2)/sum((y_true-mean(y_true))^2))), corr=1, power_t10=1, power_b10=1, var_pred=841.666666666666, var_true=841.666666666666, h2=1))
     expect_equal(methods::is(list_metrics_3, "gpError"), TRUE)
     expect_equal(is.na(list_metrics_4$mbe), TRUE)
     expect_equal(is.na(list_metrics_4$mae), TRUE)
@@ -20,5 +20,7 @@ test_that("fn_prediction_performance_metrics", {
     expect_equal(is.na(list_metrics_4$corr), TRUE)
     expect_equal(is.na(list_metrics_4$power_t10), TRUE)
     expect_equal(is.na(list_metrics_4$power_b10), TRUE)
+    expect_equal(is.na(list_metrics_4$var_pred), TRUE)
+    expect_equal(list_metrics_4$var_true, 841.666666666666)
     expect_equal(is.na(list_metrics_4$h2), TRUE)
 })
-Original file line number
+Diff line change
@@ Expand Up / @@ -17,9 +17,6 @@ Imports: @@
         methods,
         vcfR,
         parallel,
-        cluster,
-        mgcv,
-        nlme,
         glmnet,
         BGLR,
         sommer,
@@ Expand Down @@