From 03c26ebb3b6a4e1639932fb97540367af595fea0 Mon Sep 17 00:00:00 2001
From: jeffersonfparil <jeffersonparil@gmail.com>
Date: Thu, 23 May 2024 16:59:16 +1000
Subject: [PATCH] refactoring genotype loading + adding non-numeric genotype
 data simulation and loading with improvements over previous version which
 accounts for ploidy instead of assuming diploidy

---
 R/cross_validation.R |  97 +++++++++-----------------
 R/load.R             | 161 ++++++++++++++++++++++++++++---------------
 R/main.R             |  25 ++++---
 3 files changed, 155 insertions(+), 128 deletions(-)

diff --git a/R/cross_validation.R b/R/cross_validation.R
index 714feeb..218b4ae 100644
--- a/R/cross_validation.R
+++ b/R/cross_validation.R
@@ -424,8 +424,6 @@ fn_within_across_perse_genomic_prediction = function(G, idx_col_y, args, dir_tmp
     ##################################################
     print("##################################################")
     print("Within population cross-validation:")
-    # df_metrics = NULL
-    # df_y_pred = NULL
     vec_fnames_metrics = c()
     vec_fnames_y_pred = c()
     for (pop_id in vec_pop) {
@@ -454,14 +452,9 @@ fn_within_across_perse_genomic_prediction = function(G, idx_col_y, args, dir_tmp
         list_out$df_metrics$pop = pop_id
         vec_fnames_metrics = c(vec_fnames_metrics, list_out$fnames_metrics)
         vec_fnames_y_pred = c(vec_fnames_y_pred, list_out$fnames_y_pred)
-        # if (is.null(df_metrics)) {
-        #     df_metrics = list_out$df_metrics
-        #     df_y_pred = list_out$df_y_pred
-        # } else {
-        #     df_metrics = rbind(df_metrics, list_out$df_metrics)
-        #     df_y_pred = rbind(df_y_pred, list_out$df_y_pred)
-        # }
     }
+    METRICS_WITHIN_POP = list_out$df_metrics
+    YPRED_WITHIN_POP = list_out$df_y_pred
     ### Cleanup
     for (i in 1:length(vec_fnames_metrics)) {
         unlink(vec_fnames_metrics[i])
@@ -499,25 +492,27 @@ fn_within_across_perse_genomic_prediction = function(G, idx_col_y, args, dir_tmp
     ################################################################
     ### ACROSS POPULATIONS: pairwise population cross-validation ###
     ################################################################
+    vec_pop = sort(unique(pop))
+    METRICS_ACROSS_POP_PAIRWISE = NULL
+    YPRED_ACROSS_POP_PAIRWISE = NULL
     if ((args$skip_pairwise_cv==TRUE) | (length(unique(pop))==1)) {
         args$skip_pairwise_cv = TRUE
-        METRICS_ACROSS_POP_PAIRWISE = NULL
-        YPRED_ACROSS_POP_PAIRWISE = NULL
         print("Skipping leave-one-population-out cross-validation as only one population was supplied.")
     } else {
-        # df_metrics = NULL
-        # df_y_pred = NULL
+        print("##################################################")
+        print("Across population cross-validation:")
+        print("(Pairwise population CV)")
         vec_fnames_metrics = c()
         vec_fnames_y_pred = c()
         for (pop_id_1 in vec_pop) {
             for (pop_id_2 in vec_pop) {
                 # pop_id_1 = vec_pop[1]; pop_id_2 = vec_pop[2]
+                idx_1 = which(pop == pop_id_1)
+                idx_2 = which(pop == pop_id_2)
                 if (pop_id_1==pop_id_2) {
                     next
                 }
                 print("-----------------------------------")
-                idx_1 = which(pop == pop_id_1)
-                idx_2 = which(pop == pop_id_2)
                 print(paste0("Training population: ", pop_id_1, " (n=", length(idx_1), " x p=", ncol(G), ")"))
                 print(paste0("Validation population: ", pop_id_2, " (n=", length(idx_2), " x p=", ncol(G), ")"))
                 prefix = file.path(dir_tmp, gsub(".vcf.gz$", "", ignore.case=TRUE, gsub(".vcf$", "", ignore.case=TRUE, gsub(".rds$", "", ignore.case=TRUE, basename(args$fname_rds_or_vcf)))))
@@ -540,49 +535,25 @@ fn_within_across_perse_genomic_prediction = function(G, idx_col_y, args, dir_tmp
                 }
                 list_out$df_metrics$training_pop = pop_id_1
                 list_out$df_metrics$validation_pop = pop_id_2
+                list_out$df_y_pred$training_pop = pop_id_1
+                list_out$df_y_pred$validation_pop = pop_id_2
+                if (is.null(METRICS_ACROSS_POP_PAIRWISE) & is.null(YPRED_ACROSS_POP_PAIRWISE)) {
+                    METRICS_ACROSS_POP_PAIRWISE = list_out$df_metrics
+                    YPRED_ACROSS_POP_PAIRWISE = list_out$df_y_pred
+                } else {
+                    METRICS_ACROSS_POP_PAIRWISE = rbind(METRICS_ACROSS_POP_PAIRWISE, list_out$df_metrics)
+                    YPRED_ACROSS_POP_PAIRWISE = rbind(YPRED_ACROSS_POP_PAIRWISE, list_out$df_y_pred)
+                }
                 vec_fnames_metrics = c(vec_fnames_metrics, list_out$fnames_metrics)
                 vec_fnames_y_pred = c(vec_fnames_y_pred, list_out$fnames_y_pred)
-                # if (is.null(df_metrics)) {
-                #     df_metrics = list_out$df_metrics
-                #     df_y_pred = list_out$df_y_pred
-                # } else {
-                #     df_metrics = rbind(df_metrics, list_out$df_metrics)
-                #     df_y_pred = rbind(df_y_pred, list_out$df_y_pred)
-                # }
             }
         }
-        METRICS_ACROSS_POP_PAIRWISE = list_out$df_metrics
-        YPRED_ACROSS_POP_PAIRWISE = list_out$df_y_pred
         ### Cleanup
         for (i in 1:length(vec_fnames_metrics)) {
             unlink(vec_fnames_metrics[i])
             unlink(vec_fnames_y_pred[i])
         }
     }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
     ######################################################################
     ### PER SE GENOMIC PREDICTION: using the best model per population ###
     ######################################################################
@@ -592,8 +563,8 @@ fn_within_across_perse_genomic_prediction = function(G, idx_col_y, args, dir_tmp
     print("(using Pearson's correlation)")
     ### Only considering within population k-fold cross-validation to identify the best model per population
     ### Using Pearson's correlation as the genomic prediction accuracy metric
-    agg_corr = aggregate(corr ~ model + pop, FUN=mean, data=df_metrics)
-    agg_corr_sd = aggregate(corr ~ model + pop, FUN=sd, data=df_metrics)
+    agg_corr = aggregate(corr ~ model + pop, FUN=mean, data=METRICS_WITHIN_POP)
+    agg_corr_sd = aggregate(corr ~ model + pop, FUN=sd, data=METRICS_WITHIN_POP)
     agg_corr = merge(agg_corr, agg_corr_sd, by=c("model", "pop"))
     colnames(agg_corr) = c("model", "pop", "corr", "corr_sd")
     vec_popns = c()
@@ -612,8 +583,8 @@ fn_within_across_perse_genomic_prediction = function(G, idx_col_y, args, dir_tmp
         vec_corr_sd = c(vec_corr_sd, corr_sd)
     }
     ### Append overall best model across all populations
-    agg_overall_mean = aggregate(corr ~ model, FUN=mean, data=df_metrics)
-    agg_overall_sdev = aggregate(corr ~ model, FUN=sd, data=df_metrics)
+    agg_overall_mean = aggregate(corr ~ model, FUN=mean, data=METRICS_WITHIN_POP)
+    agg_overall_sdev = aggregate(corr ~ model, FUN=sd, data=METRICS_WITHIN_POP)
     agg_overall = merge(agg_overall_mean, agg_overall_sdev, by="model")
     colnames(agg_overall) = c("model", "corr", "corr_sd")
     agg_overall = agg_overall[which(agg_overall[,2]==max(agg_overall[,2], na.rm=TRUE))[1], ]
@@ -621,8 +592,8 @@ fn_within_across_perse_genomic_prediction = function(G, idx_col_y, args, dir_tmp
     vec_model = c(vec_model, as.character(agg_overall$model))
     vec_corr = c(vec_corr, agg_overall$corr)
     vec_corr_sd = c(vec_corr_sd, agg_overall$corr_sd)
-    df_top_models = data.frame(pop=vec_popns, model=vec_model, corr=vec_corr, corr_sd=vec_corr_sd)
-    print(df_top_models)
+    SUMMARY = data.frame(pop=vec_popns, model=vec_model, corr=vec_corr, corr_sd=vec_corr_sd)
+    print(SUMMARY)
     ### Genomic prediction per se
     if (length(idx_entries_for_genomic_prediction) == 0) {
         GENOMIC_PREDICTIONS = NULL
@@ -655,15 +626,15 @@ fn_within_across_perse_genomic_prediction = function(G, idx_col_y, args, dir_tmp
             }
             idx_training = which(!is.na(y[,1]))
             idx_validation = which(is.na(y[,1]))
-            idx_top_model = which(df_top_models$pop == pop)[1]
+            idx_top_model = which(SUMMARY$pop == pop)[1]
             if (length(idx_top_model)==0) {
                 ### Use the overall top model if the entries do not belong to any other populations
-                idx_top_model = which(df_top_models$pop == "overall")[1]
+                idx_top_model = which(SUMMARY$pop == "overall")[1]
             }
-            model = df_top_models$model[idx_top_model]
-            corr = df_top_models$corr[idx_top_model]
-            corr_sd = df_top_models$corr_sd[idx_top_model]
-            corr_pop = df_top_models$pop[idx_top_model]
+            model = SUMMARY$model[idx_top_model]
+            corr = SUMMARY$corr[idx_top_model]
+            corr_sd = SUMMARY$corr_sd[idx_top_model]
+            corr_pop = SUMMARY$pop[idx_top_model]
 
             if (grepl("Bayes", model)==TRUE) {
                 other_params = list(nIter=12e3, burnIn=2e3, h2=0.5, out_prefix=paste0("bglr_", model, "-"), covariate=COVAR)
@@ -689,9 +660,9 @@ fn_within_across_perse_genomic_prediction = function(G, idx_col_y, args, dir_tmp
     ### Output    
     out_per_phenotype = list(
         TRAIT_NAME                  = list_y_pop$trait_name,
-        SUMMARY                     = df_top_models,
-        METRICS_WITHIN_POP          = df_metrics,
-        YPRED_WITHIN_POP            = df_y_pred,
+        SUMMARY                     = SUMMARY,
+        METRICS_WITHIN_POP          = METRICS_WITHIN_POP,
+        YPRED_WITHIN_POP            = YPRED_WITHIN_POP,
         METRICS_ACROSS_POP_LOPO     = METRICS_ACROSS_POP_LOPO,
         YPRED_ACROSS_POP_LOPO       = YPRED_ACROSS_POP_LOPO,
         METRICS_ACROSS_POP_PAIRWISE = METRICS_ACROSS_POP_PAIRWISE,
diff --git a/R/load.R b/R/load.R
index fcb0a62..e7682d4 100644
--- a/R/load.R
+++ b/R/load.R
@@ -3,8 +3,7 @@ suppressWarnings(suppressPackageStartupMessages(library(vcfR)))
 suppressWarnings(suppressPackageStartupMessages(library(txtplot)))
 suppressWarnings(suppressPackageStartupMessages(library(testthat)))
 
-# dirname_functions = dirname(sys.frame(1)$ofile)
-# dirname_functions = "/group/pasture/Jeff/genomic_selection/src"; source(file.path(dirname_functions, "load.R"))
+# source("/group/pasture/Jeff/gp/R/load.R")
 
 ### Create an error class
 setClass("gpError", representation(code="numeric", message="character"), prototype(code=0, message="Empty error message. Please define."))
@@ -39,18 +38,19 @@ setMethod(f="chain",
 #' @param save_geno_tsv save the genotype data as a tab-delimited allele frequency table file [Default=FALSE]
 #' @param save_geno_rds save the named genotype matrix as an Rds file [Default=FALSE]
 #' @param save_pheno_tsv save the phenotype data as a tab-delimited file [Default=FALSE]
+#' @param non_numeric_Rds save non-numeric Rds genotype file? Note that this only works for biallelic loci and a maximum ploidy of 8. [Default=FALSE]
 #' @param verbose show simulation messages? [Default=FALSE]
 #' @returns
 #' vcf: simulated genotype data as a vcfR object
 #' df: simulated phenotype data as a data frame
-#' fname_geno_vcf: filename of the simulated genotype data as a vcf file
-#' fname_geno_tsv: filename of the simulated genotype data as a tab-delimited allele frequency table file
-#' fname_geno_rds: filename of the simulated named genotype matrix as an Rds file
+#' fname_genotype_vcf: filename of the simulated genotype data as a vcf file
+#' fname_genotype_tsv: filename of the simulated genotype data as a tab-delimited allele frequency table file
+#' fname_genotype_rds: filename of the simulated named genotype matrix as an Rds file
 #' fname_pheno_tsv: filename of the simulated phenotype data as a tab-delimited file
 #' @examples
 #' list_sim = fn_simulate_data(verbose=TRUE, save_geno_vcf=TRUE, save_geno_tsv=TRUE, save_geno_rds=TRUE, save_pheno_tsv=TRUE)
 #' @export
-fn_simulate_data = function(n=100, l=1000, ploidy=42, n_alleles=2, depth=100, seed=12345, save_geno_vcf=FALSE, save_geno_tsv=FALSE, save_geno_rds=FALSE, save_pheno_tsv=FALSE, verbose=FALSE) {
+fn_simulate_data = function(n=100, l=1000, ploidy=42, n_alleles=2, depth=100, seed=12345, save_geno_vcf=FALSE, save_geno_tsv=FALSE, save_geno_rds=FALSE, save_pheno_tsv=FALSE, non_numeric_Rds=FALSE, verbose=FALSE) {
     ###################################################
     ### TEST
     # n = 100
@@ -64,6 +64,7 @@ fn_simulate_data = function(n=100, l=1000, ploidy=42, n_alleles=2, depth=100, se
     # save_geno_tsv = TRUE
     # save_geno_rds = TRUE
     # save_pheno_tsv = TRUE
+    # non_numeric_Rds = FALSE
     # verbose = TRUE
     ###################################################
     set.seed(seed)
@@ -130,27 +131,51 @@ fn_simulate_data = function(n=100, l=1000, ploidy=42, n_alleles=2, depth=100, se
     }
     ### Save into files
     date = gsub("-", "", gsub("[.]", "", gsub(":", "", gsub(" ", "", as.character(Sys.time())))))
-    fname_geno_vcf = NULL
-    fname_geno_tsv = NULL
-    fname_geno_rds = NULL
+    fname_genotype_vcf = NULL
+    fname_genotype_tsv = NULL
+    fname_genotype_rds = NULL
     fname_pheno_tsv = NULL
     if (save_geno_vcf) {
-        fname_geno_vcf = file.path(getwd(), paste0("simulated_genotype-", date, ".vcf.gz"))
-        vcfR::write.vcf(vcf, file=fname_geno_vcf)
+        fname_genotype_vcf = file.path(getwd(), paste0("simulated_genotype-", date, ".vcf.gz"))
+        vcfR::write.vcf(vcf, file=fname_genotype_vcf)
         print("Output genotype file (sync.gz):")
-        print(fname_geno_vcf)
+        print(fname_genotype_vcf)
     }
     if (save_geno_tsv | save_geno_rds) {
         df_geno = data.frame(chr=vec_chr, pos=vec_pos, allele=vec_ref, t(G))
-        fname_geno_tsv = file.path(getwd(), paste0("simulated_genotype-", date, ".tsv"))
-        write.table(df_geno, file=fname_geno_tsv, sep="\t", row.names=FALSE, col.names=TRUE, quote=FALSE)
+        fname_genotype_tsv = file.path(getwd(), paste0("simulated_genotype-", date, ".tsv"))
+        write.table(df_geno, file=fname_genotype_tsv, sep="\t", row.names=FALSE, col.names=TRUE, quote=FALSE)
         print("Output genotype file (tsv):")
-        print(fname_geno_tsv)
+        print(fname_genotype_tsv)
         if (save_geno_rds) {
-            fname_geno_rds = file.path(getwd(), paste0("simulated_genotype-", date, ".Rds"))
-            saveRDS(G, file=fname_geno_rds)
-            print("Output genotype file (Rds):")
-            print(fname_geno_rds)
+            fname_genotype_rds = file.path(getwd(), paste0("simulated_genotype-", date, ".Rds"))
+            ### Revert to numeric if we have more than 52 alleles (limit of the Latin alphabet)
+            if (non_numeric_Rds & ((n_alleles != 2) | (ploidy > 8))) {
+                print("Reverting numeric genotype.")
+                print("Non-numeric genotypes only available for biallelic loci and a maximum ploidy of 8.")
+                non_numeric_Rds = FALSE
+            }
+            if (!non_numeric_Rds) {
+                print("Output genotype file (Rds):")
+                print(fname_genotype_rds)
+                saveRDS(G, file=fname_genotype_rds)
+            } else {
+                G_non_numeric = matrix("", nrow=nrow(G), ncol=ncol(G))
+                if (verbose) {pb = txtProgressBar(min=0, max=n, style=3)}
+                for (i in 1:nrow(G)) {
+                    # i = 1; j = 1
+                    for (j in 1:ncol(G)) {
+                        n_ref = 1 + round(G[i, j]*(ploidy-1))
+                        n_alt = ploidy - n_ref
+                        G_non_numeric[i, j] = paste(c(rep("A", times=n_ref), rep("B", times=n_alt)), collapse="")
+                    }
+                    if (verbose) {setTxtProgressBar(pb, i)}
+                }
+                if (verbose) {close(pb)}
+                print("Output non-numeric genotype file (Rds):")
+                print(fname_genotype_rds)
+                saveRDS(G_non_numeric, file=fname_genotype_rds)
+            }
         }
     }
     if (save_pheno_tsv) {
@@ -162,9 +187,9 @@ fn_simulate_data = function(n=100, l=1000, ploidy=42, n_alleles=2, depth=100, se
     return(list(
         vcf=vcf,
         df=df,
-        fname_geno_vcf=fname_geno_vcf,
-        fname_geno_tsv=fname_geno_tsv,
-        fname_geno_rds=fname_geno_rds,
+        fname_genotype_vcf=fname_genotype_vcf,
+        fname_genotype_tsv=fname_genotype_tsv,
+        fname_genotype_rds=fname_genotype_rds,
         fname_pheno_tsv=fname_pheno_tsv
     ))
 }
@@ -325,54 +350,78 @@ fn_classify_allele_frequencies = function(mat_genotypes, ploidy, strict_boundari
 }
 
 ### Outputs a $n$ entries x $l$ loci x (n_alleles-1) matrix
-fn_load_genotype = function(fname_genotype, retain_minus_one_alleles_per_locus=TRUE) {
+fn_load_genotype = function(fname_genotype, retain_minus_one_alleles_per_locus=TRUE, verbose=FALSE) {
     ###################################################
     ### TEST
-    fname_genotype = fn_simulate_data(verbose=TRUE, save_geno_vcf=FALSE)$fname_genotype_vcf
-    fname_genotype = fn_simulate_data(verbose=TRUE, save_geno_vcf=FALSE)$fname_genotype_tsv
-    fname_genotype = fn_simulate_data(verbose=TRUE, save_geno_vcf=FALSE)$fname_genotype_rds
+    fname_genotype = fn_simulate_data(verbose=TRUE, save_geno_rds=TRUE)$fname_genotype_rds
+    fname_genotype = fn_simulate_data(verbose=TRUE, save_geno_vcf=TRUE)$fname_genotype_vcf
+    fname_genotype = fn_simulate_data(verbose=TRUE, save_geno_tsv=TRUE)$fname_genotype_tsv
+
+    ### Non-numeric biallelic diploid
+    fname_genotype = fn_simulate_data(n_alleles=2, ploidy=8, non_numeric_Rds=TRUE, save_geno_rds=TRUE, verbose=TRUE)$fname_genotype_rds
+
+
     retain_minus_one_alleles_per_locus = TRUE
+    verbose = TRUE
     ###################################################
     ### Load the genotype matrix (n x p)
-    G = tryCatch(readRDS(fname_genotype), error=function(e) {
-        tryCatch(readRDS(fname_genotype), error=function(e) {
+    ### TryCatch Rds, vcf, then tsv
+    G = tryCatch({
+        ### RDS file
+        G = readRDS(fname_genotype)
+        ### If the input genotype matrix is non-numeric, then we assume biallelic loci, e.g. "AA", "AB", and "BB".
+        ### Convert them into numerics setting the first allele as 0 and the second as 1, such that "AA" = 0.0, "AB" = 1.0, and "BB" = 2.0
+        ### Assuming "A" is the reference allele, then what we end up with is the alternative allele counts
+        if (is.numeric(G)==FALSE) {
+            G_numeric = matrix(0, nrow=nrow(G), ncol=ncol(G))
+            rownames(G_numeric) = rownames(G)
+            colnames(G_numeric) = colnames(G)
+            if (verbose) {pb = txtProgressBar(min=0, max=ncol(G), style=3)}
+            for (j in 1:ncol(G)) {
+                # j = 1
+                alleles = sort(unique(unlist(strsplit(unique(G[, j]), ""))))
+                if (length(alleles) == 2) {
+                    G_numeric[, j] = unlist(lapply(strsplit(gsub(alleles[2], 1, gsub(alleles[1], 0, G[, j])), ""), FUN=function(x){sum(as.numeric(x)) / length(x)}))
+                }
+                if (length(alleles) > 2) {
+                    print(paste0("Error reading genotype file: ", fname_genotype, "."))
+                    print(paste0("      - Are you certain that your file is biallelic diploid, i.e. it has a maximum of 2 alleles per locus?"))
+                    print(paste0("      - Then explain these alleles: ", paste(paste0("'", alleles, "'"), collapse=","), "."))
+                    print(paste0("      - Do these look like your genotype data: ", paste(head(G_numeric[, j]), collapse=","), "?"))
+                    print(paste0("      - These too: ", paste(tail(G_numeric[, j]), collapse=","), "?"))
+                    quit()
+                }
+                if (verbose) {setTxtProgressBar(pb, j)}
+            }
+            if (verbose) {close(pb)}
+            G = G_numeric
+        }
+    }, 
+    error=function(e) {
+        tryCatch({
+            ### VCF file
             vcf = vcfR::read.vcfR(fname_genotype, verbose=TRUE)
             mat_genotypes = fn_extract_allele_frequencies(vcf)
-            if (class(mat_genotypes) == "gpError") {
+            if (class(mat_genotypes)[1] == "gpError") {
                 error = chain(mat_genotypes, new("gpError", code=108, message="Error in load::fn_load_genotype: error loading the vcf file."))
                 return(error)
             } else {
                 return(mat_genotypes)
             }
+        }, 
+        error=function(e) {
+            ### TSV: allele frequency table file
+            df = read.delim(fname_genotype, sep="\t", header=TRUE)
+            vec_loci_names = paste(df$chr, df$pos, df$allele, sep="_")
+            vec_entries = colnames(df)[c(-1:-3)]
+            mat_genotypes = as.matrix(t(df[, c(-1:-3)]))
+            rownames(mat_genotypes) = vec_entries
+            colnames(mat_genotypes) = vec_loci_names
+            return(mat_genotypes)
         })
     })
-    ### If the input genotype matrix is non-numeric, then we assume biallelic loci, e.g. "AA", "AB", and "BB".
-    ### Convert them into numerics setting the first allele as 0 and the second as 1, such that "AA" = 0.0, "AB" = 1.0, and "BB" = 2.0
-    ### Assuming "A" is the reference allele, then what we end up with is the alternative allele counts
-    if (is.numeric(G)==FALSE) {
-        G_numeric = matrix(0, nrow=nrow(G), ncol=ncol(G))
-        rownames(G_numeric) = rownames(G)
-        colnames(G_numeric) = colnames(G)
-        pb = txtProgressBar(min=0, max=ncol(G), style=3)
-        for (j in 1:ncol(G)) {
-            # j = 1
-            alleles = sort(unique(unlist(strsplit(unique(G[, j]), ""))))
-            if (length(alleles) == 2) {
-                G_numeric[, j] = unlist(lapply(strsplit(gsub(alleles[2], 1, gsub(alleles[1], 0, G[, j])), ""), FUN=function(x){as.numeric(x[1]) + as.numeric(x[2])}))
-            }
-            if (length(alleles) > 2) {
-                print(paste0("Error reading genotype file: ", fname_genotype, "."))
-                print(paste0("      - Are you certain that your file is biallelic diploid, i.e. it has a maximum of 2 alleles per locus?"))
-                print(paste0("      - Then explain these alleles: ", paste(paste0("'", alleles, "'"), collapse=","), "."))
-                print(paste0("      - Do these look like your genotype data: ", paste(head(G_numeric[, j]), collapse=","), "?"))
-                print(paste0("      - These too: ", paste(tail(G_numeric[, j]), collapse=","), "?"))
-                quit()
-            }
-            setTxtProgressBar(pb, j)
-        }
-        close(pb)
-        G = G_numeric
-    }
+    print(G[1:10,1:10])
+    
     ### Retain a-1 allele/s per locus, i.e. remove duplicates assuming all loci are biallelic
     if (retain_minus_one_alleles_per_locus==TRUE) {
         n = nrow(G)
diff --git a/R/main.R b/R/main.R
index 1cf4be8..0a79fc0 100644
--- a/R/main.R
+++ b/R/main.R
@@ -14,6 +14,7 @@ parser$add_argument("--phenotype-column-data", dest="idx_col_y", type="character
 parser$add_argument("--phenotype-missing-strings", dest="na_strings", type="character", default=c("", "-", "NA", "na", "NaN", "missing", "MISSING"), help="How were the missing data in the phenotype file coded? [Default=',-,NA,na,NaN,missing,MISSING'")
 parser$add_argument("--populations-to-include", dest="vec_pop", type="character", default=c("all"), help="Which populations do you wish to include? [Default='all']")
 parser$add_argument("--skip-leave-one-population-out-cross-validation", dest="skip_lopo_cv", type="logical", default=FALSE, help="Do you wish to skip leave-one-populatiop-out cross-validataion? [Default=FALSE]")
+parser$add_argument("--skip-pairwise-population-cross-validation", dest="skip_pairwise_cv", type="logical", default=FALSE, help="Do you wish to skip pairise-populatiop cross-validataion? [Default=FALSE]")
 parser$add_argument("--models", dest="models_to_test", type="character", default=c("ridge","lasso","elastic_net","Bayes_A","Bayes_B","Bayes_C", "gBLUP"), help="Genomic prediction models to use [Default='ridge,lasso,elastic_net,Bayes_A,Bayes_B,Bayes_C'")
 parser$add_argument("--k-folds", dest="k_folds", type="integer", default=10, help="Number of folds to perform in within population k-fold cross-validation [Default=10]")
 parser$add_argument("--n-reps", dest="n_reps", type="integer", default=10, help="Number of reshuffled replications per fold of within population k-fold cross-validation [Default=3]")
@@ -35,7 +36,8 @@ args = parser$parse_args()
 #     idx_col_y="8",
 #     na_strings=c("", "-", "NA", "na", "NaN", "missing", "MISSING"),
 #     vec_pop=c("DB-MS-31-22-001"),
-#     skip_lopo_cv=TRUE,
+#     skip_lopo_cv=FALSE,
+#     skip_pairwise_cv=FALSE,
 #     models_to_test=c("ridge","lasso","elastic_net","Bayes_A","Bayes_B","Bayes_C", "gBLUP"),
 #     k_folds=2,
 #     n_reps=10,
@@ -181,18 +183,11 @@ print("##################################################")
 print("Genomic prediction cross-validations per trait:")
 out = list()
 for (idx_col_y in vec_idx_col_y) {
+    # idx_col_y = vec_idx_col_y[1]
     gp = fn_within_across_perse_genomic_prediction(G=X, idx_col_y=idx_col_y, args=args, dir_tmp=dir_tmp)
     trait_name = gp$TRAIT_NAME
     eval(parse(text=paste0("out$`", trait_name, "` = gp")))
 }
-### Each item of the out list is:
-# list(TRAIT_NAME          = character,
-#      SUMMARY             = data.frame,
-#      METRICS_WITHIN_POP  = data.frame,
-#      YPRED_WITHIN_POP    = data.frame,
-#      METRICS_ACROSS_POP  = data.frame,
-#      YPRED_ACROSS_POP    = data.frame,
-#      GENOMIC_PREDICTIONS = data.frame)
 ### Output Rds of list of data frames
 suffix = paste0(gsub("/", "", format(Sys.time(), format="%D%H%M%S")), round(runif(n=1, min=1e6, max=9e6)))
 if (args$output_file_prefix == "") {
@@ -207,6 +202,18 @@ unlink(dir_tmp, recursive=TRUE)
 print("##################################################")
 print("Finished successfully!")
 print("##################################################")
+print("Each output list contains:")
+print("gp = list(")
+print("    TRAIT_NAME                  = list_y_pop$trait_name,")
+print("    SUMMARY                     = df_top_models,")
+print("    METRICS_WITHIN_POP          = df_metrics,")
+print("    YPRED_WITHIN_POP            = df_y_pred,")
+print("    METRICS_ACROSS_POP_LOPO     = METRICS_ACROSS_POP_LOPO,")
+print("    YPRED_ACROSS_POP_LOPO       = YPRED_ACROSS_POP_LOPO,")
+print("    METRICS_ACROSS_POP_PAIRWISE = METRICS_ACROSS_POP_PAIRWISE,")
+print("    YPRED_ACROSS_POP_PAIRWISE   = YPRED_ACROSS_POP_PAIRWISE,")
+print("    GENOMIC_PREDICTIONS         = GENOMIC_PREDICTIONS")
+print(")")
 print("Summary table of the genomic prediction accuracies within populations:")
 summary_table = do.call(rbind, lapply(out, FUN=function(x) {
     data.frame(trait=x$TRAIT_NAME, x$SUMMARY)