From 03c26ebb3b6a4e1639932fb97540367af595fea0 Mon Sep 17 00:00:00 2001 From: jeffersonfparil Date: Thu, 23 May 2024 16:59:16 +1000 Subject: [PATCH] refactoring genotype loading + adding non-numeric genotype data simulation and loading with improvements over previous version which accounts for ploidy instead of assuming diploidy --- R/cross_validation.R | 97 +++++++++----------------- R/load.R | 161 ++++++++++++++++++++++++++++--------------- R/main.R | 25 ++++--- 3 files changed, 155 insertions(+), 128 deletions(-) diff --git a/R/cross_validation.R b/R/cross_validation.R index 714feeb..218b4ae 100644 --- a/R/cross_validation.R +++ b/R/cross_validation.R @@ -424,8 +424,6 @@ fn_within_across_perse_genomic_prediction = function(G, idx_col_y, args, dir_tmp ################################################## print("##################################################") print("Within population cross-validation:") - # df_metrics = NULL - # df_y_pred = NULL vec_fnames_metrics = c() vec_fnames_y_pred = c() for (pop_id in vec_pop) { @@ -454,14 +452,9 @@ fn_within_across_perse_genomic_prediction = function(G, idx_col_y, args, dir_tmp list_out$df_metrics$pop = pop_id vec_fnames_metrics = c(vec_fnames_metrics, list_out$fnames_metrics) vec_fnames_y_pred = c(vec_fnames_y_pred, list_out$fnames_y_pred) - # if (is.null(df_metrics)) { - # df_metrics = list_out$df_metrics - # df_y_pred = list_out$df_y_pred - # } else { - # df_metrics = rbind(df_metrics, list_out$df_metrics) - # df_y_pred = rbind(df_y_pred, list_out$df_y_pred) - # } } + METRICS_WITHIN_POP = list_out$df_metrics + YPRED_WITHIN_POP = list_out$df_y_pred ### Cleanup for (i in 1:length(vec_fnames_metrics)) { unlink(vec_fnames_metrics[i]) @@ -499,25 +492,27 @@ fn_within_across_perse_genomic_prediction = function(G, idx_col_y, args, dir_tmp ################################################################ ### ACROSS POPULATIONS: pairwise population cross-validation ### ################################################################ + vec_pop = sort(unique(pop)) + METRICS_ACROSS_POP_PAIRWISE = NULL + YPRED_ACROSS_POP_PAIRWISE = NULL if ((args$skip_pairwise_cv==TRUE) | (length(unique(pop))==1)) { args$skip_pairwise_cv = TRUE - METRICS_ACROSS_POP_PAIRWISE = NULL - YPRED_ACROSS_POP_PAIRWISE = NULL print("Skipping leave-one-population-out cross-validation as only one population was supplied.") } else { - # df_metrics = NULL - # df_y_pred = NULL + print("##################################################") + print("Across population cross-validation:") + print("(Pairwise population CV)") vec_fnames_metrics = c() vec_fnames_y_pred = c() for (pop_id_1 in vec_pop) { for (pop_id_2 in vec_pop) { # pop_id_1 = vec_pop[1]; pop_id_2 = vec_pop[2] + idx_1 = which(pop == pop_id_1) + idx_2 = which(pop == pop_id_2) if (pop_id_1==pop_id_2) { next } print("-----------------------------------") - idx_1 = which(pop == pop_id_1) - idx_2 = which(pop == pop_id_2) print(paste0("Training population: ", pop_id_1, " (n=", length(idx_1), " x p=", ncol(G), ")")) print(paste0("Validation population: ", pop_id_2, " (n=", length(idx_2), " x p=", ncol(G), ")")) prefix = file.path(dir_tmp, gsub(".vcf.gz$", "", ignore.case=TRUE, gsub(".vcf$", "", ignore.case=TRUE, gsub(".rds$", "", ignore.case=TRUE, basename(args$fname_rds_or_vcf))))) @@ -540,49 +535,25 @@ fn_within_across_perse_genomic_prediction = function(G, idx_col_y, args, dir_tmp } list_out$df_metrics$training_pop = pop_id_1 list_out$df_metrics$validation_pop = pop_id_2 + list_out$df_y_pred$training_pop = pop_id_1 + list_out$df_y_pred$validation_pop = pop_id_2 + if (is.null(METRICS_ACROSS_POP_PAIRWISE) & is.null(YPRED_ACROSS_POP_PAIRWISE)) { + METRICS_ACROSS_POP_PAIRWISE = list_out$df_metrics + YPRED_ACROSS_POP_PAIRWISE = list_out$df_y_pred + } else { + METRICS_ACROSS_POP_PAIRWISE = rbind(METRICS_ACROSS_POP_PAIRWISE, list_out$df_metrics) + YPRED_ACROSS_POP_PAIRWISE = rbind(YPRED_ACROSS_POP_PAIRWISE, list_out$df_y_pred) + } vec_fnames_metrics = c(vec_fnames_metrics, list_out$fnames_metrics) vec_fnames_y_pred = c(vec_fnames_y_pred, list_out$fnames_y_pred) - # if (is.null(df_metrics)) { - # df_metrics = list_out$df_metrics - # df_y_pred = list_out$df_y_pred - # } else { - # df_metrics = rbind(df_metrics, list_out$df_metrics) - # df_y_pred = rbind(df_y_pred, list_out$df_y_pred) - # } } } - METRICS_ACROSS_POP_PAIRWISE = list_out$df_metrics - YPRED_ACROSS_POP_PAIRWISE = list_out$df_y_pred ### Cleanup for (i in 1:length(vec_fnames_metrics)) { unlink(vec_fnames_metrics[i]) unlink(vec_fnames_y_pred[i]) } } - - - - - - - - - - - - - - - - - - - - - - - - ###################################################################### ### PER SE GENOMIC PREDICTION: using the best model per population ### ###################################################################### @@ -592,8 +563,8 @@ fn_within_across_perse_genomic_prediction = function(G, idx_col_y, args, dir_tmp print("(using Pearson's correlation)") ### Only considering within population k-fold cross-validation to identify the best model per population ### Using Pearson's correlation as the genomic prediction accuracy metric - agg_corr = aggregate(corr ~ model + pop, FUN=mean, data=df_metrics) - agg_corr_sd = aggregate(corr ~ model + pop, FUN=sd, data=df_metrics) + agg_corr = aggregate(corr ~ model + pop, FUN=mean, data=METRICS_WITHIN_POP) + agg_corr_sd = aggregate(corr ~ model + pop, FUN=sd, data=METRICS_WITHIN_POP) agg_corr = merge(agg_corr, agg_corr_sd, by=c("model", "pop")) colnames(agg_corr) = c("model", "pop", "corr", "corr_sd") vec_popns = c() @@ -612,8 +583,8 @@ fn_within_across_perse_genomic_prediction = function(G, idx_col_y, args, dir_tmp vec_corr_sd = c(vec_corr_sd, corr_sd) } ### Append overall best model across all populations - agg_overall_mean = aggregate(corr ~ model, FUN=mean, data=df_metrics) - agg_overall_sdev = aggregate(corr ~ model, FUN=sd, data=df_metrics) + agg_overall_mean = aggregate(corr ~ model, FUN=mean, data=METRICS_WITHIN_POP) + agg_overall_sdev = aggregate(corr ~ model, FUN=sd, data=METRICS_WITHIN_POP) agg_overall = merge(agg_overall_mean, agg_overall_sdev, by="model") colnames(agg_overall) = c("model", "corr", "corr_sd") agg_overall = agg_overall[which(agg_overall[,2]==max(agg_overall[,2], na.rm=TRUE))[1], ] @@ -621,8 +592,8 @@ fn_within_across_perse_genomic_prediction = function(G, idx_col_y, args, dir_tmp vec_model = c(vec_model, as.character(agg_overall$model)) vec_corr = c(vec_corr, agg_overall$corr) vec_corr_sd = c(vec_corr_sd, agg_overall$corr_sd) - df_top_models = data.frame(pop=vec_popns, model=vec_model, corr=vec_corr, corr_sd=vec_corr_sd) - print(df_top_models) + SUMMARY = data.frame(pop=vec_popns, model=vec_model, corr=vec_corr, corr_sd=vec_corr_sd) + print(SUMMARY) ### Genomic prediction per se if (length(idx_entries_for_genomic_prediction) == 0) { GENOMIC_PREDICTIONS = NULL @@ -655,15 +626,15 @@ fn_within_across_perse_genomic_prediction = function(G, idx_col_y, args, dir_tmp } idx_training = which(!is.na(y[,1])) idx_validation = which(is.na(y[,1])) - idx_top_model = which(df_top_models$pop == pop)[1] + idx_top_model = which(SUMMARY$pop == pop)[1] if (length(idx_top_model)==0) { ### Use the overall top model if the entries do not belong to any other populations - idx_top_model = which(df_top_models$pop == "overall")[1] + idx_top_model = which(SUMMARY$pop == "overall")[1] } - model = df_top_models$model[idx_top_model] - corr = df_top_models$corr[idx_top_model] - corr_sd = df_top_models$corr_sd[idx_top_model] - corr_pop = df_top_models$pop[idx_top_model] + model = SUMMARY$model[idx_top_model] + corr = SUMMARY$corr[idx_top_model] + corr_sd = SUMMARY$corr_sd[idx_top_model] + corr_pop = SUMMARY$pop[idx_top_model] if (grepl("Bayes", model)==TRUE) { other_params = list(nIter=12e3, burnIn=2e3, h2=0.5, out_prefix=paste0("bglr_", model, "-"), covariate=COVAR) @@ -689,9 +660,9 @@ fn_within_across_perse_genomic_prediction = function(G, idx_col_y, args, dir_tmp ### Output out_per_phenotype = list( TRAIT_NAME = list_y_pop$trait_name, - SUMMARY = df_top_models, - METRICS_WITHIN_POP = df_metrics, - YPRED_WITHIN_POP = df_y_pred, + SUMMARY = SUMMARY, + METRICS_WITHIN_POP = METRICS_WITHIN_POP, + YPRED_WITHIN_POP = YPRED_WITHIN_POP, METRICS_ACROSS_POP_LOPO = METRICS_ACROSS_POP_LOPO, YPRED_ACROSS_POP_LOPO = YPRED_ACROSS_POP_LOPO, METRICS_ACROSS_POP_PAIRWISE = METRICS_ACROSS_POP_PAIRWISE, diff --git a/R/load.R b/R/load.R index fcb0a62..e7682d4 100644 --- a/R/load.R +++ b/R/load.R @@ -3,8 +3,7 @@ suppressWarnings(suppressPackageStartupMessages(library(vcfR))) suppressWarnings(suppressPackageStartupMessages(library(txtplot))) suppressWarnings(suppressPackageStartupMessages(library(testthat))) -# dirname_functions = dirname(sys.frame(1)$ofile) -# dirname_functions = "/group/pasture/Jeff/genomic_selection/src"; source(file.path(dirname_functions, "load.R")) +# source("/group/pasture/Jeff/gp/R/load.R") ### Create an error class setClass("gpError", representation(code="numeric", message="character"), prototype(code=0, message="Empty error message. Please define.")) @@ -39,18 +38,19 @@ setMethod(f="chain", #' @param save_geno_tsv save the genotype data as a tab-delimited allele frequency table file [Default=FALSE] #' @param save_geno_rds save the named genotype matrix as an Rds file [Default=FALSE] #' @param save_pheno_tsv save the phenotype data as a tab-delimited file [Default=FALSE] +#' @param non_numeric_Rds save non-numeric Rds genotype file? Note that this only works for biallelic loci and a maximum ploidy of 8. [Default=FALSE] #' @param verbose show simulation messages? [Default=FALSE] #' @returns #' vcf: simulated genotype data as a vcfR object #' df: simulated phenotype data as a data frame -#' fname_geno_vcf: filename of the simulated genotype data as a vcf file -#' fname_geno_tsv: filename of the simulated genotype data as a tab-delimited allele frequency table file -#' fname_geno_rds: filename of the simulated named genotype matrix as an Rds file +#' fname_genotype_vcf: filename of the simulated genotype data as a vcf file +#' fname_genotype_tsv: filename of the simulated genotype data as a tab-delimited allele frequency table file +#' fname_genotype_rds: filename of the simulated named genotype matrix as an Rds file #' fname_pheno_tsv: filename of the simulated phenotype data as a tab-delimited file #' @examples #' list_sim = fn_simulate_data(verbose=TRUE, save_geno_vcf=TRUE, save_geno_tsv=TRUE, save_geno_rds=TRUE, save_pheno_tsv=TRUE) #' @export -fn_simulate_data = function(n=100, l=1000, ploidy=42, n_alleles=2, depth=100, seed=12345, save_geno_vcf=FALSE, save_geno_tsv=FALSE, save_geno_rds=FALSE, save_pheno_tsv=FALSE, verbose=FALSE) { +fn_simulate_data = function(n=100, l=1000, ploidy=42, n_alleles=2, depth=100, seed=12345, save_geno_vcf=FALSE, save_geno_tsv=FALSE, save_geno_rds=FALSE, save_pheno_tsv=FALSE, non_numeric_Rds=FALSE, verbose=FALSE) { ################################################### ### TEST # n = 100 @@ -64,6 +64,7 @@ fn_simulate_data = function(n=100, l=1000, ploidy=42, n_alleles=2, depth=100, se # save_geno_tsv = TRUE # save_geno_rds = TRUE # save_pheno_tsv = TRUE + # non_numeric_Rds = FALSE # verbose = TRUE ################################################### set.seed(seed) @@ -130,27 +131,51 @@ fn_simulate_data = function(n=100, l=1000, ploidy=42, n_alleles=2, depth=100, se } ### Save into files date = gsub("-", "", gsub("[.]", "", gsub(":", "", gsub(" ", "", as.character(Sys.time()))))) - fname_geno_vcf = NULL - fname_geno_tsv = NULL - fname_geno_rds = NULL + fname_genotype_vcf = NULL + fname_genotype_tsv = NULL + fname_genotype_rds = NULL fname_pheno_tsv = NULL if (save_geno_vcf) { - fname_geno_vcf = file.path(getwd(), paste0("simulated_genotype-", date, ".vcf.gz")) - vcfR::write.vcf(vcf, file=fname_geno_vcf) + fname_genotype_vcf = file.path(getwd(), paste0("simulated_genotype-", date, ".vcf.gz")) + vcfR::write.vcf(vcf, file=fname_genotype_vcf) print("Output genotype file (sync.gz):") - print(fname_geno_vcf) + print(fname_genotype_vcf) } if (save_geno_tsv | save_geno_rds) { df_geno = data.frame(chr=vec_chr, pos=vec_pos, allele=vec_ref, t(G)) - fname_geno_tsv = file.path(getwd(), paste0("simulated_genotype-", date, ".tsv")) - write.table(df_geno, file=fname_geno_tsv, sep="\t", row.names=FALSE, col.names=TRUE, quote=FALSE) + fname_genotype_tsv = file.path(getwd(), paste0("simulated_genotype-", date, ".tsv")) + write.table(df_geno, file=fname_genotype_tsv, sep="\t", row.names=FALSE, col.names=TRUE, quote=FALSE) print("Output genotype file (tsv):") - print(fname_geno_tsv) + print(fname_genotype_tsv) if (save_geno_rds) { - fname_geno_rds = file.path(getwd(), paste0("simulated_genotype-", date, ".Rds")) - saveRDS(G, file=fname_geno_rds) - print("Output genotype file (Rds):") - print(fname_geno_rds) + fname_genotype_rds = file.path(getwd(), paste0("simulated_genotype-", date, ".Rds")) + ### Revert to numeric if we have more than 52 alleles (limit of the Latin alphabet) + if (non_numeric_Rds & ((n_alleles != 2) | (ploidy > 8))) { + print("Reverting numeric genotype.") + print("Non-numeric genotypes only available for biallelic loci and a maximum ploidy of 8.") + non_numeric_Rds = FALSE + } + if (!non_numeric_Rds) { + print("Output genotype file (Rds):") + print(fname_genotype_rds) + saveRDS(G, file=fname_genotype_rds) + } else { + G_non_numeric = matrix("", nrow=nrow(G), ncol=ncol(G)) + if (verbose) {pb = txtProgressBar(min=0, max=n, style=3)} + for (i in 1:nrow(G)) { + # i = 1; j = 1 + for (j in 1:ncol(G)) { + n_ref = 1 + round(G[i, j]*(ploidy-1)) + n_alt = ploidy - n_ref + G_non_numeric[i, j] = paste(c(rep("A", times=n_ref), rep("B", times=n_alt)), collapse="") + } + if (verbose) {setTxtProgressBar(pb, i)} + } + if (verbose) {close(pb)} + print("Output non-numeric genotype file (Rds):") + print(fname_genotype_rds) + saveRDS(G_non_numeric, file=fname_genotype_rds) + } } } if (save_pheno_tsv) { @@ -162,9 +187,9 @@ fn_simulate_data = function(n=100, l=1000, ploidy=42, n_alleles=2, depth=100, se return(list( vcf=vcf, df=df, - fname_geno_vcf=fname_geno_vcf, - fname_geno_tsv=fname_geno_tsv, - fname_geno_rds=fname_geno_rds, + fname_genotype_vcf=fname_genotype_vcf, + fname_genotype_tsv=fname_genotype_tsv, + fname_genotype_rds=fname_genotype_rds, fname_pheno_tsv=fname_pheno_tsv )) } @@ -325,54 +350,78 @@ fn_classify_allele_frequencies = function(mat_genotypes, ploidy, strict_boundari } ### Outputs a $n$ entries x $l$ loci x (n_alleles-1) matrix -fn_load_genotype = function(fname_genotype, retain_minus_one_alleles_per_locus=TRUE) { +fn_load_genotype = function(fname_genotype, retain_minus_one_alleles_per_locus=TRUE, verbose=FALSE) { ################################################### ### TEST - fname_genotype = fn_simulate_data(verbose=TRUE, save_geno_vcf=FALSE)$fname_genotype_vcf - fname_genotype = fn_simulate_data(verbose=TRUE, save_geno_vcf=FALSE)$fname_genotype_tsv - fname_genotype = fn_simulate_data(verbose=TRUE, save_geno_vcf=FALSE)$fname_genotype_rds + fname_genotype = fn_simulate_data(verbose=TRUE, save_geno_rds=TRUE)$fname_genotype_rds + fname_genotype = fn_simulate_data(verbose=TRUE, save_geno_vcf=TRUE)$fname_genotype_vcf + fname_genotype = fn_simulate_data(verbose=TRUE, save_geno_tsv=TRUE)$fname_genotype_tsv + + ### Non-numeric biallelic diploid + fname_genotype = fn_simulate_data(n_alleles=2, ploidy=8, non_numeric_Rds=TRUE, save_geno_rds=TRUE, verbose=TRUE)$fname_genotype_rds + + retain_minus_one_alleles_per_locus = TRUE + verbose = TRUE ################################################### ### Load the genotype matrix (n x p) - G = tryCatch(readRDS(fname_genotype), error=function(e) { - tryCatch(readRDS(fname_genotype), error=function(e) { + ### TryCatch Rds, vcf, then tsv + G = tryCatch({ + ### RDS file + G = readRDS(fname_genotype) + ### If the input genotype matrix is non-numeric, then we assume biallelic loci, e.g. "AA", "AB", and "BB". + ### Convert them into numerics setting the first allele as 0 and the second as 1, such that "AA" = 0.0, "AB" = 1.0, and "BB" = 2.0 + ### Assuming "A" is the reference allele, then what we end up with is the alternative allele counts + if (is.numeric(G)==FALSE) { + G_numeric = matrix(0, nrow=nrow(G), ncol=ncol(G)) + rownames(G_numeric) = rownames(G) + colnames(G_numeric) = colnames(G) + if (verbose) {pb = txtProgressBar(min=0, max=ncol(G), style=3)} + for (j in 1:ncol(G)) { + # j = 1 + alleles = sort(unique(unlist(strsplit(unique(G[, j]), "")))) + if (length(alleles) == 2) { + G_numeric[, j] = unlist(lapply(strsplit(gsub(alleles[2], 1, gsub(alleles[1], 0, G[, j])), ""), FUN=function(x){sum(as.numeric(x)) / length(x)})) + } + if (length(alleles) > 2) { + print(paste0("Error reading genotype file: ", fname_genotype, ".")) + print(paste0(" - Are you certain that your file is biallelic diploid, i.e. it has a maximum of 2 alleles per locus?")) + print(paste0(" - Then explain these alleles: ", paste(paste0("'", alleles, "'"), collapse=","), ".")) + print(paste0(" - Do these look like your genotype data: ", paste(head(G_numeric[, j]), collapse=","), "?")) + print(paste0(" - These too: ", paste(tail(G_numeric[, j]), collapse=","), "?")) + quit() + } + if (verbose) {setTxtProgressBar(pb, j)} + } + if (verbose) {close(pb)} + G = G_numeric + } + }, + error=function(e) { + tryCatch({ + ### VCF file vcf = vcfR::read.vcfR(fname_genotype, verbose=TRUE) mat_genotypes = fn_extract_allele_frequencies(vcf) - if (class(mat_genotypes) == "gpError") { + if (class(mat_genotypes)[1] == "gpError") { error = chain(mat_genotypes, new("gpError", code=108, message="Error in load::fn_load_genotype: error loading the vcf file.")) return(error) } else { return(mat_genotypes) } + }, + error=function(e) { + ### TSV: allele frequency table file + df = read.delim(fname_genotype, sep="\t", header=TRUE) + vec_loci_names = paste(df$chr, df$pos, df$allele, sep="_") + vec_entries = colnames(df)[c(-1:-3)] + mat_genotypes = as.matrix(t(df[, c(-1:-3)])) + rownames(mat_genotypes) = vec_entries + colnames(mat_genotypes) = vec_loci_names + return(mat_genotypes) }) }) - ### If the input genotype matrix is non-numeric, then we assume biallelic loci, e.g. "AA", "AB", and "BB". - ### Convert them into numerics setting the first allele as 0 and the second as 1, such that "AA" = 0.0, "AB" = 1.0, and "BB" = 2.0 - ### Assuming "A" is the reference allele, then what we end up with is the alternative allele counts - if (is.numeric(G)==FALSE) { - G_numeric = matrix(0, nrow=nrow(G), ncol=ncol(G)) - rownames(G_numeric) = rownames(G) - colnames(G_numeric) = colnames(G) - pb = txtProgressBar(min=0, max=ncol(G), style=3) - for (j in 1:ncol(G)) { - # j = 1 - alleles = sort(unique(unlist(strsplit(unique(G[, j]), "")))) - if (length(alleles) == 2) { - G_numeric[, j] = unlist(lapply(strsplit(gsub(alleles[2], 1, gsub(alleles[1], 0, G[, j])), ""), FUN=function(x){as.numeric(x[1]) + as.numeric(x[2])})) - } - if (length(alleles) > 2) { - print(paste0("Error reading genotype file: ", fname_genotype, ".")) - print(paste0(" - Are you certain that your file is biallelic diploid, i.e. it has a maximum of 2 alleles per locus?")) - print(paste0(" - Then explain these alleles: ", paste(paste0("'", alleles, "'"), collapse=","), ".")) - print(paste0(" - Do these look like your genotype data: ", paste(head(G_numeric[, j]), collapse=","), "?")) - print(paste0(" - These too: ", paste(tail(G_numeric[, j]), collapse=","), "?")) - quit() - } - setTxtProgressBar(pb, j) - } - close(pb) - G = G_numeric - } + print(G[1:10,1:10]) + ### Retain a-1 allele/s per locus, i.e. remove duplicates assuming all loci are biallelic if (retain_minus_one_alleles_per_locus==TRUE) { n = nrow(G) diff --git a/R/main.R b/R/main.R index 1cf4be8..0a79fc0 100644 --- a/R/main.R +++ b/R/main.R @@ -14,6 +14,7 @@ parser$add_argument("--phenotype-column-data", dest="idx_col_y", type="character parser$add_argument("--phenotype-missing-strings", dest="na_strings", type="character", default=c("", "-", "NA", "na", "NaN", "missing", "MISSING"), help="How were the missing data in the phenotype file coded? [Default=',-,NA,na,NaN,missing,MISSING'") parser$add_argument("--populations-to-include", dest="vec_pop", type="character", default=c("all"), help="Which populations do you wish to include? [Default='all']") parser$add_argument("--skip-leave-one-population-out-cross-validation", dest="skip_lopo_cv", type="logical", default=FALSE, help="Do you wish to skip leave-one-populatiop-out cross-validataion? [Default=FALSE]") +parser$add_argument("--skip-pairwise-population-cross-validation", dest="skip_pairwise_cv", type="logical", default=FALSE, help="Do you wish to skip pairise-populatiop cross-validataion? [Default=FALSE]") parser$add_argument("--models", dest="models_to_test", type="character", default=c("ridge","lasso","elastic_net","Bayes_A","Bayes_B","Bayes_C", "gBLUP"), help="Genomic prediction models to use [Default='ridge,lasso,elastic_net,Bayes_A,Bayes_B,Bayes_C'") parser$add_argument("--k-folds", dest="k_folds", type="integer", default=10, help="Number of folds to perform in within population k-fold cross-validation [Default=10]") parser$add_argument("--n-reps", dest="n_reps", type="integer", default=10, help="Number of reshuffled replications per fold of within population k-fold cross-validation [Default=3]") @@ -35,7 +36,8 @@ args = parser$parse_args() # idx_col_y="8", # na_strings=c("", "-", "NA", "na", "NaN", "missing", "MISSING"), # vec_pop=c("DB-MS-31-22-001"), -# skip_lopo_cv=TRUE, +# skip_lopo_cv=FALSE, +# skip_pairwise_cv=FALSE, # models_to_test=c("ridge","lasso","elastic_net","Bayes_A","Bayes_B","Bayes_C", "gBLUP"), # k_folds=2, # n_reps=10, @@ -181,18 +183,11 @@ print("##################################################") print("Genomic prediction cross-validations per trait:") out = list() for (idx_col_y in vec_idx_col_y) { + # idx_col_y = vec_idx_col_y[1] gp = fn_within_across_perse_genomic_prediction(G=X, idx_col_y=idx_col_y, args=args, dir_tmp=dir_tmp) trait_name = gp$TRAIT_NAME eval(parse(text=paste0("out$`", trait_name, "` = gp"))) } -### Each item of the out list is: -# list(TRAIT_NAME = character, -# SUMMARY = data.frame, -# METRICS_WITHIN_POP = data.frame, -# YPRED_WITHIN_POP = data.frame, -# METRICS_ACROSS_POP = data.frame, -# YPRED_ACROSS_POP = data.frame, -# GENOMIC_PREDICTIONS = data.frame) ### Output Rds of list of data frames suffix = paste0(gsub("/", "", format(Sys.time(), format="%D%H%M%S")), round(runif(n=1, min=1e6, max=9e6))) if (args$output_file_prefix == "") { @@ -207,6 +202,18 @@ unlink(dir_tmp, recursive=TRUE) print("##################################################") print("Finished successfully!") print("##################################################") +print("Each output list contains:") +print("gp = list(") +print(" TRAIT_NAME = list_y_pop$trait_name,") +print(" SUMMARY = df_top_models,") +print(" METRICS_WITHIN_POP = df_metrics,") +print(" YPRED_WITHIN_POP = df_y_pred,") +print(" METRICS_ACROSS_POP_LOPO = METRICS_ACROSS_POP_LOPO,") +print(" YPRED_ACROSS_POP_LOPO = YPRED_ACROSS_POP_LOPO,") +print(" METRICS_ACROSS_POP_PAIRWISE = METRICS_ACROSS_POP_PAIRWISE,") +print(" YPRED_ACROSS_POP_PAIRWISE = YPRED_ACROSS_POP_PAIRWISE,") +print(" GENOMIC_PREDICTIONS = GENOMIC_PREDICTIONS") +print(")") print("Summary table of the genomic prediction accuracies within populations:") summary_table = do.call(rbind, lapply(out, FUN=function(x) { data.frame(trait=x$TRAIT_NAME, x$SUMMARY)