working on a very simplified API

jeffersonfparil · Jun 21, 2024 · 9664c39 · 9664c39
1 parent bc2e79b
commit 9664c39
Show file tree

Hide file tree

Showing 13 changed files with 1,821 additions and 51 deletions.
diff --git a/.gitignore b/.gitignore
@@ -56,6 +56,12 @@ _libs/
 
 # Test files
 inst/exec_Rscript/gp_*
+inst/exec_Rscript/bglr*
+inst/exec_Rscript/*.vcf
+inst/exec_Rscript/*.vcf.gz
+inst/exec_Rscript/*.tsv
+inst/exec_Rscript/*.Rds
+inst/exec_Rscript/GENOMIC_PREDICTIONS_OUTPUT-*
 inst/exec_Rscript/output/
 
 # SQL databases

diff --git a/R/io.R b/R/io.R
@@ -1569,11 +1569,11 @@ fn_filter_genotype = function(G, maf=0.01, sdev_min=0.0001,
         if (verbose & (length(vec_idx_loci_to_remove)==ncol(G))) {
             error = methods::new("gpError",
                 code=000,
-                message=paste0("All loci were filtered out. Please increase the max_sparsity_per_locus from ", max_sparsity_per_locus, 
+                message=paste0("All loci were filtered out. Please consider increasing the max_sparsity_per_locus from ", max_sparsity_per_locus, 
                 ", given that the mean sparsity per locus ranges from ", min(vec_sparsity_per_locus, na.rm=TRUE), 
                 " to ", max(vec_sparsity_per_locus, na.rm=TRUE), 
                 " with a mean of ", mean(vec_sparsity_per_locus, na.rm=TRUE), 
-                " and median ", stats::median(vec_sparsity_per_locus, na.rm=TRUE)))
+                " and a median of ", stats::median(vec_sparsity_per_locus, na.rm=TRUE)))
             return(error)
         }
     }
@@ -1587,7 +1587,7 @@ fn_filter_genotype = function(G, maf=0.01, sdev_min=0.0001,
         if (verbose & (length(vec_idx_loci_to_remove)==ncol(G))) {
             error = methods::new("gpError",
                 code=000,
-                message=paste0("All loci were filtered out. Please decrease the frac_topmost_sparse_loci_to_remove from ", frac_topmost_sparse_loci_to_remove, 
+                message=paste0("All loci were filtered out. Please consider decreasing the frac_topmost_sparse_loci_to_remove from ", frac_topmost_sparse_loci_to_remove, 
                 " to something more reasonable."))
             return(error)
         }
@@ -1601,7 +1601,7 @@ fn_filter_genotype = function(G, maf=0.01, sdev_min=0.0001,
         if (verbose & (length(vec_idx_loci_to_remove)==ncol(G))) {
             error = methods::new("gpError",
                 code=000,
-                message=paste0("All loci were filtered out. Please decrease the n_topmost_sparse_loci_to_remove from ", n_topmost_sparse_loci_to_remove, 
+                message=paste0("All loci were filtered out. Please consider decreasing the n_topmost_sparse_loci_to_remove from ", n_topmost_sparse_loci_to_remove, 
                 " to something more reasonable, if it please you m'lady/m'lord."))
             return(error)
         }
@@ -1626,11 +1626,11 @@ fn_filter_genotype = function(G, maf=0.01, sdev_min=0.0001,
         if (verbose & (length(vec_idx_samples_to_remove)==nrow(G))) {
             error = methods::new("gpError", 
                 code=000,
-                message=paste0("All samples were filtered out. Please increase the max_sparsity_per_sample from ", max_sparsity_per_sample, 
+                message=paste0("All samples were filtered out. Please consider increasing the max_sparsity_per_sample from ", max_sparsity_per_sample, 
                 ", given that the mean sparsity per sample ranges from ", min(vec_sparsity_per_sample, na.rm=TRUE), 
                 " to ", max(vec_sparsity_per_sample, na.rm=TRUE), 
                 " with a mean of ", mean(vec_sparsity_per_sample, na.rm=TRUE), 
-                " and median ", stats::median(vec_sparsity_per_sample, na.rm=TRUE)))
+                " and a median of ", stats::median(vec_sparsity_per_sample, na.rm=TRUE)))
             return(error)
         }
     }
@@ -1644,7 +1644,7 @@ fn_filter_genotype = function(G, maf=0.01, sdev_min=0.0001,
         if (verbose & (length(vec_idx_samples_to_remove)==nrow(G))) {
             error = methods::new("gpError", 
                 code=000,
-                message=paste0("All samples were filtered out. Please decrease the frac_topmost_sparse_samples_to_remove from ", frac_topmost_sparse_samples_to_remove, 
+                message=paste0("All samples were filtered out. Please consider decreasing the frac_topmost_sparse_samples_to_remove from ", frac_topmost_sparse_samples_to_remove, 
                 " to something more reasonable."))
             return(error)
         }
@@ -1658,7 +1658,7 @@ fn_filter_genotype = function(G, maf=0.01, sdev_min=0.0001,
         if (verbose & (length(vec_idx_samples_to_remove)==nrow(G))) {
             error = methods::new("gpError", 
                 code=000,
-                message=paste0("All samples were filtered out. Please decrease the n_topmost_sparse_samples_to_remove from ", n_topmost_sparse_samples_to_remove, 
+                message=paste0("All samples were filtered out. Please consider decreasing the n_topmost_sparse_samples_to_remove from ", n_topmost_sparse_samples_to_remove, 
                 " to something more reasonable."))
             return(error)
         }
@@ -1803,7 +1803,7 @@ fn_save_genotype = function(G, fname, file_type=c("RDS", "TSV")[1], verbose=FALS
 
 #' Load phenotype data from a text-delimited file
 #'
-#' @param fname_pheno filname of the text-delimited phenotype data
+#' @param fname_pheno filename of the text-delimited phenotype data
 #' @param sep column-delimiter in the phenotype file (Default="\\t")
 #' @param header does the phenotype file have a header line? (Default=TRUE)
 #' @param idx_col_id which column correspond to the sample/entry/pool/genotype names? (Default=1)
@@ -1929,9 +1929,9 @@ fn_filter_phenotype = function(list_pheno, remove_outliers=TRUE, remove_NA=FALSE
     # verbose = TRUE
     ###################################################
     if (verbose) {
-        print("############################")
-        print("### Filter genotype data ###")
-        print("############################")
+        print("#############################")
+        print("### Filter phenotype data ###")
+        print("#############################")
     }
     if (methods::is(list_pheno, "gpError")) {
         error = chain(list_pheno,

diff --git a/R/main.R b/R/main.R
@@ -24,7 +24,7 @@
 #'      top most sparse loci will be removed (see ?fn_filter_genotype for details)
 #'  - $geno_n_topmost_sparse_loci_to_remove: number of top most sparse loci to be removed
 #'      (see ?fn_filter_genotype for details)
-#'  - $geno_max_sparsity_per_sample:  maximum sparsity or fraction of missing data per sample
+#'  - $geno_max_sparsity_per_sample: maximum sparsity or fraction of missing data per sample
 #'      (see ?fn_filter_genotype for details)
 #'  - $geno_frac_topmost_sparse_samples_to_remove: fraction of the total number of samples from which 
 #'      the top most sparse samples will be removed (see ?fn_filter_genotype for details)
@@ -340,7 +340,17 @@ gp = function(args) {
         n_topmost_sparse_samples_to_remove=args$geno_n_topmost_sparse_samples_to_remove,
         verbose=args$verbose
     )
-    if (methods::is(G, "gpError")) {return(G)}
+    if (methods::is(G, "gpError")) {
+        error = chain(G, methods::new("gpError",
+            code=000,
+            message=paste0(
+                "All loci were filtered out. ",
+                "Please consider reducing --geno-min-depth (", args$geno_min_depth, ") and/or ",
+                "increasing --geno-max-depth (", args$geno_max_depth, ") and/or ",
+                "impute your input genotype data (", args$fname_geno, ")."
+            )))
+        return(error)
+    }
     gc()
     list_pheno = fn_filter_phenotype(
         list_pheno=list_pheno,
@@ -358,6 +368,21 @@ gp = function(args) {
         verbose=args$verbose
     )
     if (methods::is(list_merged, "gpError")) {return(list_merged)}
+    ### Missing values are not allowed in the genotype data
+    if (sum(rowSums(is.na(list_merged$G))) > 0) {
+        error = methods::new("gpError",
+            code=000,
+            message=paste0(
+                "Genotype data has missing values. ",
+                "Please impute the missing data. ",
+                "You may also consider reducing ", 
+                "--geno-min-depth (", args$geno_min_depth, ") and/or increasing ",
+                "--geno-max-depth (", args$geno_max_depth, "), if possible. ",
+                "You may also set the following to zero: --geno-max-sparsity-per-locus (", args$geno_max_sparsity_per_locus, ") and ",
+                "--geno-max-sparsity-per-sample (", args$geno_max_sparsity_per_sample, ")."
+            ))
+        return(error)
+    }
     ### Clean-up
     rm("G")
     rm("list_pheno")

diff --git a/inst/exec_Rscript/.Rprofile b/inst/exec_Rscript/.Rprofile
diff --git a/inst/exec_Rscript/0-checks_and_submision.sh b/inst/exec_Rscript/0-checks_and_submision.sh
@@ -0,0 +1,103 @@
+#!/bin/bash
+
+# ### Load the conda environment
+# module load Miniconda3/22.11.1-1
+# conda init bash
+# source ~/.bashrc
+# conda activate genomic_selection
+echo '
+if (!require("gp", character.only = TRUE)) {
+    install.packages("devtools")
+    devtools::install_github("jeffersonfparil/gp")
+}' > install_gp.R
+Rscript install_gp.R
+rm install_gp.R
+
+################################################################
+### TOP-LEVEL SLURM ARRAY JOB SUBMISSION SCRIPT
+### Please edit the input variables below to match your dataset:
+################################################################
+### (1) Full path to the location of the executable Rscript gp.R
+DIR=$(dirname $0)
+cd $DIR
+DIR=$(pwd)
+### Input variables (use the absolute path to files to be precise)
+### (2) R matrix object with n rows corresponding to samples, and p columns corresponding to the markers or loci. 
+###     Should have no missing data or else will be imputed via mean value imputation.
+GENOTYPE_DATA_RDS=${DIR}/input/test_geno.Rds
+### (3) Tab-delimited phenotype file where column 1: sample names, column 2: population name, columns 3 and so on refers to the phenotype values of one trait per column.
+###     Headers for the columns should be named appropriately, e.g. ID, POP, TRAIT1, TRAIT2, etc.
+###     Missing values are allowed for samples whose phenotypes will be predicted by the best model identified within the population they belong to.
+###     Missing values may be coded as empty cells, -, NA, na, NaN, missing, and/or MISSING.
+PHENOTYPE_DATA_TSV=${DIR}/input/test_pheno.tsv
+### (4) Number of folds for k-fold cross-validation.
+KFOLDS=5
+### (5) Number of replications of the k-fold cross-validation each representing a random sorting of the samples hence yielding different ways of partitioning the data.
+NREPS=3
+
+### Check if the genotype file exists
+if [ ! -f $GENOTYPE_DATA_RDS ]
+then
+    echo "Error: The genotype file: $GENOTYPE_DATA_RDS does not exist. Are you specifying the full path? Is the name correct?"
+    exit 101
+fi
+### Check if the phenotype file exists
+if [ ! -f $PHENOTYPE_DATA_TSV ]
+then
+    echo "Error: The phenotype file: $PHENOTYPE_DATA_TSV does not exist. Are you specifying the full path? Is the name correct?"
+    exit 102
+fi
+### Check if the genotype file is a valid Rds file
+echo 'args = commandArgs(trailingOnly=TRUE)
+geno = suppressWarnings(tryCatch(readRDS(args[1]), error=function(e){print("Error loading genotype file.")}))
+' > test_geno_rds.R
+if [ $(Rscript test_geno_rds.R $GENOTYPE_DATA_RDS | grep -i "error" | wc -l) -eq 1 ]
+then
+    echo "Error: The genotype file: $GENOTYPE_DATA_RDS is not an Rds file."
+    exit 103
+fi
+rm test_geno_rds.R
+### Check if the phenotype file is formatted according to the required specifications
+echo 'args = commandArgs(trailingOnly=TRUE)
+pheno = suppressWarnings(tryCatch(read.delim(args[1], sep="\t", header=TRUE), error=function(e){print("Error loading phenotype file.")}))
+' > test_pheno_rds.R
+if [ $(Rscript test_pheno_rds.R $PHENOTYPE_DATA_TSV | grep -i "error" | wc -l) -eq 1 ]
+then
+    echo "Error: The phenotype file: $GENOTYPE_DATA_RDS is not formatted according to specifications. It should be tab-delimited and a header line must be present."
+    exit 104
+fi
+rm test_pheno_rds.R
+### Check if the genomic_selection repo folder exists
+if [ ! -d $DIR ]
+then
+    echo "Error: The genotype_selection directory: $DIR does not exist. Are you specifying the full path? Is the name correct?"
+    exit 105
+fi
+### Check if the genomic_selection repo belongs to the user
+if [ ! -w $DIR ]
+then
+    echo "Error: You do not have permission to write in the genotype_selection directory: $DIR. Did you clone the genomic_selection repository into a directory you have write-access to?"
+    exit 106
+fi
+### Check if the genomic_selection repo has contains the slurm array job submission script
+if [ ! -f ${DIR}/01_gp_slurm_job.sh ]
+then
+    echo "Error: The genotype_selection directory: $DIR does not contain the script: 01_gp_slurm_job.sh. Are you sure this is the genomic_selection repo directory?"
+    exit 107
+fi
+### Initialise the output directory which will contain all the output Rds files across populations and traits
+if [ ! -d ${DIR}/output ]
+then
+    mkdir ${DIR}/output
+fi
+### Submit an array of jobs equivalent to the number of traits in the phenotype file
+cd $DIR/
+N_TRAITS=$(echo $(head -n1 $PHENOTYPE_DATA_TSV | awk '{print NF}') - 2 | bc)
+N_POPS=$(tail -n+2 $PHENOTYPE_DATA_TSV | cut -f2 - | sort | uniq | wc -l)
+sbatch --array 1-$(echo "${N_TRAITS} * ${N_POPS}" | bc) \
+    1-gp_slurm_job.sh \
+        ${GENOTYPE_DATA_RDS} \
+        ${PHENOTYPE_DATA_TSV} \
+        ${KFOLDS} \
+        ${NREPS} \
+        ${DIR}
diff --git a/inst/exec_Rscript/1-gp_slurm_job.sh b/inst/exec_Rscript/1-gp_slurm_job.sh
@@ -0,0 +1,120 @@
+#!/bin/bash
+#SBATCH --job-name="GS"
+#SBATCH --account="dbiopast1"   ### EDIT ME: Pick the appropriate account name, e.g. dbiopast1 or dbiopast2
+#SBATCH --ntasks=1              ### LEAVE ME:Request a single task as we will be submitting this as an array job where each job corresponds to a trait
+#SBATCH --cpus-per-task=16      ### EDIT ME: Parallelisation across replications, folds and models (more cpu means faster execution time but probably longer time to wait for the Slurm scheduler to find resources to allocate to the job)
+#SBATCH --mem=100G              ### EDIT ME: Proportional to the input data (will need to test the appropriate memory required, hint use `seff ${JOBID}`)
+#SBATCH --time=1-0:0:00         ### EDIT ME: Proportional to the input data, number of folds, replications, and models to be used
+###################################################################################################
+### Edit the Slurm settings above to match your requirements. 
+###################################################################################################
+
+###################################################################################################
+### The variables below will be exported from `00_gs_slurm_job_wrapper.sh`:
+###################################################################################################
+### Input variables (use the absolute path to files to be precise)
+### (1) R matrix object with n rows corresponding to samples, and p columns corresponding to the markers or loci. 
+###     Should have no missing data or else will be imputed via mean value imputation.
+# GENOTYPE_DATA_RDS='/group/pasture/Jeff/genomic_selection/tests/grape.rds'
+GENOTYPE_DATA_RDS=$1
+### (2) Tab-delimited phenotype file where column 1: sample names, column 2: population name, columns 3 and so on refers to the phenotype values of one trait per column.
+###     Headers for the columns should be named appropriately, e.g. ID, POP, TRAIT1, TRAIT2, etc.
+###     Missing values are allowed for samples whose phenotypes will be predicted by the best model identified within the population they belong to.
+###     Missing values may be coded as empty cells, -, NA, na, NaN, missing, and/or MISSING.
+# PHENOTYPE_DATA_TSV='/group/pasture/Jeff/genomic_selection/tests/grape_pheno.txt'
+PHENOTYPE_DATA_TSV=$2
+### (3) Number of folds for k-fold cross-validation.
+# KFOLDS=5
+KFOLDS=$3
+### (4) Number of replications of the k-fold cross-validation each representing a random sorting of the samples hence yielding different ways of partitioning the data.
+# NREPS=3
+NREPS=$4
+### (5) Full path to the location of the executable Rscript gp.R
+# DIR='/group/pasture/Jeff/gp/inst/exec_Rscript'
+DIR=$5
+###################################################################################################
+### Edit the code below, if and only if you have read the documentation or familiar with `src/*.R`:
+###################################################################################################
+### Define the trait and population to include
+N_POPS=$(tail -n+2 $PHENOTYPE_DATA_TSV | cut -f2 - | sort | uniq | wc -l)
+TRAIT_IDX=$(echo "((${SLURM_ARRAY_TASK_ID}-1) / ${N_POPS}) + 1" | bc)
+POP_IDX=$(echo "${SLURM_ARRAY_TASK_ID} % ${N_POPS}" | bc)
+if [ "${POP_IDX}" -eq 0 ]
+then
+    POP_IDX=${N_POPS}
+fi
+COLUMN_ID=$(echo 2 + ${TRAIT_IDX} | bc)
+TRAIT=$(head -n1 $PHENOTYPE_DATA_TSV | cut -f${COLUMN_ID})
+POP=$(tail -n+2 $PHENOTYPE_DATA_TSV | cut -f2 - | sort | uniq | head -n${POP_IDX} | tail -n1)
+### Skip leave-one-population-out cross-validation if there is only one population
+if [ "${N_POPS}" -eq 1 ]
+then
+    BOOL_ACROSS=FALSE
+else
+    if [ "${POP_IDX}" -eq 1 ]
+    then
+        BOOL_ACROSS=TRUE
+    else
+        BOOL_ACROSS=FALSE
+    fi
+fi
+### Output directories
+DIR_OUT_MAIN=${DIR}/output
+DIR_OUT=${DIR_OUT_MAIN}/output-${TRAIT}-${POP}
+mkdir $DIR_OUT
+### Log messages
+echo JOB_${SLURM_ARRAY_TASK_ID}-TRAIT_${TRAIT}-POP_${POP} > ${DIR_OUT}/job_info-${TRAIT}-${POP}.log
+echo "==========================================
+-------------------------------------------
+	    Job Info
+-------------------------------------------
+SLURM_JOB_ID = $SLURM_JOB_ID
+SLURM_JOB_NAME = $SLURM_JOB_NAME
+SLURM_JOB_NODELIST = $SLURM_JOB_NODELIST
+SLURM_SUBMIT_HOST = $SLURM_SUBMIT_HOST
+SLURM_SUBMIT_DIR = $SLURM_SUBMIT_DIR
+SLURM_NTASKS = $SLURM_NTASKS
+SLURM_ARRAY_TASK_ID = $SLURM_ARRAY_TASK_ID
+SLURM_MEM_PER_NODE = $(echo "$SLURM_MEM_PER_NODE / (2^10)" | bc) GB
+SLURM_CPUS_PER_TASK = $SLURM_CPUS_PER_TASK
+-------------------------------------------
+	    Variables
+-------------------------------------------
+GENOTYPE_DATA_RDS	: $GENOTYPE_DATA_RDS
+PHENOTYPE_DATA_TSV	: $PHENOTYPE_DATA_TSV
+KFOLDS		        : $KFOLDS
+NREPS		        : $NREPS
+TRAIT		        : $TRAIT
+POPULATION          : $POP
+-------------------------------------------
+	    Output directory
+-------------------------------------------
+${DIR_OUT}
+==========================================" >> ${DIR_OUT}/job_info-${TRAIT}-${POP}.log
+
+### Load the conda environment
+module load Miniconda3/22.11.1-1
+conda init bash
+source ~/.bashrc
+conda activate genomic_selection
+
+### Run within and across population replicated k-fold cross-validation and prediction of missing phenotypes
+time \
+Rscript ${DIR}/gp.R \
+    --fname-geno $GENOTYPE_DATA_RDS \
+    --fname-pheno $PHENOTYPE_DATA_TSV \
+    --population $POP \
+    --dir-output $DIR_OUT \
+    --pheno-idx-col-y $COLUMN_ID \
+    --bool-within TRUE \
+    --bool-across $BOOL_ACROSS \
+    --n-folds $KFOLDS \
+    --n-reps $NREPS \
+    --bool-parallel TRUE \
+    --max-mem-Gb $(echo "$SLURM_MEM_PER_NODE / (2^10)" | bc) \
+    --n-threads $SLURM_CPUS_PER_TASK \
+    --verbose TRUE >> ${DIR_OUT}/job_info-${TRAIT}-${POP}.log
+### Clean-up
+mv ${DIR_OUT}/GENOMIC_PREDICTIONS_OUTPUT-*.Rds ${DIR_OUT_MAIN}
+mv ${DIR_OUT}/job_info-${TRAIT}-${POP}.log ${DIR_OUT_MAIN}
+rm -R ${DIR_OUT}