Merge pull request #3 from jeffersonfparil/dev

Dev
jeffersonfparil · Jun 8, 2024 · 0feae0f · 0feae0f
2 parents bf9b492 + e062028
commit 0feae0f
Show file tree

Hide file tree

Showing 8 changed files with 509 additions and 6 deletions.
diff --git a/R/cross_validation.R b/R/cross_validation.R
@@ -424,7 +424,6 @@ fn_cross_validation_preparation = function(list_merged, cv_type=1, n_folds=10, n
             n_folds=n_folds,
             n_reps=n_reps, 
             memory_requested_Gb=max_mem_Gb,
-            memory_multiplier=40,
             verbose=verbose)
     } else if (cv_type == 2) {
         ############################################
@@ -455,7 +454,6 @@ fn_cross_validation_preparation = function(list_merged, cv_type=1, n_folds=10, n
             n_folds=n_folds,
             n_reps=1, 
             memory_requested_Gb=max_mem_Gb,
-            memory_multiplier=40,
             verbose=verbose)
     } else if (cv_type == 3) {
         #################################################
@@ -486,7 +484,6 @@ fn_cross_validation_preparation = function(list_merged, cv_type=1, n_folds=10, n
             n_folds=n_folds,
             n_reps=1, 
             memory_requested_Gb=max_mem_Gb,
-            memory_multiplier=40,
             verbose=verbose)
     } else {
         error = methods::new("gpError",

diff --git a/R/io.R b/R/io.R
@@ -2276,7 +2276,7 @@ fn_subset_merged_genotype_and_phenotype = function(list_merged, vec_idx, verbose
 #' @param n_folds number of cross-validation folds (Default=10)
 #' @param n_reps number of cross-validation replication (Default=10)
 #' @param memory_requested_Gb memory requested or available for use (Default=400)
-#' @param memory_multiplier estimated memory usage multiplier (Default=40)
+#' @param memory_multiplier estimated memory usage multiplier (Default=50)
 #' @param verbose show memory usage estimation messages? (Default=FALSE)
 #' @returns
 #'  - Ok:
@@ -2288,15 +2288,16 @@ fn_subset_merged_genotype_and_phenotype = function(list_merged, vec_idx, verbose
 #' list_mem = fn_estimate_memory_footprint(X=rnorm(10000), verbose=TRUE)
 #' @export
 fn_estimate_memory_footprint = function(X, n_models=7, n_folds=10, n_reps=10, 
-    memory_requested_Gb=400, memory_multiplier=40, verbose=FALSE) {
+    memory_requested_Gb=400, memory_multiplier=50, verbose=FALSE)
+{
     ###################################################
     ### TEST
     # X = matrix(0.0, nrow=492, ncol=455255)
     # n_models = 7
     # n_reps = 10
     # n_folds = 10
     # memory_requested_Gb = 400
-    # memory_multiplier = 40
+    # memory_multiplier = 50
     # verbose = TRUE
     ###################################################
     if ((as.numeric(utils::object.size(X)) == 0) | (n_models <= 0) | (n_folds <= 0) | (n_reps <= 0) | (memory_requested_Gb <= 0) | (memory_multiplier <= 0)) {

diff --git a/exec/00_gp_slurm_job_wrapper-LUCERNE.sh b/exec/00_gp_slurm_job_wrapper-LUCERNE.sh
@@ -0,0 +1,95 @@
+#!/bin/bash
+
+### Load the conda environment
+module load Miniconda3/22.11.1-1
+conda init bash
+source ~/.bashrc
+conda activate genomic_selection
+
+################################################################
+### TOP-LEVEL SLURM ARRAY JOB SUBMISSION SCRIPT
+### Please edit the input variables below to match your dataset:
+################################################################
+
+### Input variables (use the absolute path to files to be precise)
+### (1) R matrix object with n rows corresponding to samples, and p columns corresponding to the markers or loci. 
+###     Should have no missing data or else will be imputed via mean value imputation.
+GENOTYPE_DATA_RDS='/group/pasture/Jeff/lucerne/workdir/FINAL-IMPUTED-noTrailingAllele-filteredSNPlist.Rds'
+### (2) Tab-delimited phenotype file where column 1: sample names, column 2: population name, columns 3 and so on refers to the phenotype values of one trait per column.
+###     Headers for the columns should be named appropriately, e.g. ID, POP, TRAIT1, TRAIT2, etc.
+###     Missing values are allowed for samples whose phenotypes will be predicted by the best model identified within the population they belong to.
+###     Missing values may be coded as empty cells, -, NA, na, NaN, missing, and/or MISSING.
+PHENOTYPE_DATA_TSV='/group/pasture/Jeff/lucerne/workdir/Lucerne_PhenomicsDB_2024-05-27-BiomassPredicted.tsv'
+### (3) Number of folds for k-fold cross-validation.
+KFOLDS=5
+### (4) Number of replications of the k-fold cross-validation each representing a random sorting of the samples hence yielding different ways of partitioning the data.
+NREPS=5
+### (5) Full path to the location of the executable Rscript gp.R
+DIR='/group/pasture/Jeff/gp/exec'
+
+### Check if the genotype file exists
+if [ ! -f $GENOTYPE_DATA_RDS ]
+then
+    echo "Error: The genotype file: $GENOTYPE_DATA_RDS does not exist. Are you specifying the full path? Is the name correct?"
+    exit 101
+fi
+### Check if the phenotype file exists
+if [ ! -f $PHENOTYPE_DATA_TSV ]
+then
+    echo "Error: The phenotype file: $PHENOTYPE_DATA_TSV does not exist. Are you specifying the full path? Is the name correct?"
+    exit 102
+fi
+### Check if the genotype file is a valid Rds file
+echo 'args = commandArgs(trailingOnly=TRUE)
+geno = suppressWarnings(tryCatch(readRDS(args[1]), error=function(e){print("Error loading genotype file.")}))
+' > test_geno_rds.R
+if [ $(Rscript test_geno_rds.R $GENOTYPE_DATA_RDS | grep -i "error" | wc -l) -eq 1 ]
+then
+    echo "Error: The genotype file: $GENOTYPE_DATA_RDS is not an Rds file."
+    exit 103
+fi
+rm test_geno_rds.R
+### Check if the phenotype file is formatted according to the required specifications
+echo 'args = commandArgs(trailingOnly=TRUE)
+pheno = suppressWarnings(tryCatch(read.delim(args[1], sep="\t", header=TRUE), error=function(e){print("Error loading phenotype file.")}))
+' > test_pheno_rds.R
+if [ $(Rscript test_pheno_rds.R $PHENOTYPE_DATA_TSV | grep -i "error" | wc -l) -eq 1 ]
+then
+    echo "Error: The phenotype file: $GENOTYPE_DATA_RDS is not formatted according to specifications. It should be tab-delimited and a header line must be present."
+    exit 104
+fi
+rm test_pheno_rds.R
+### Check if the genomic_selection repo folder exists
+if [ ! -d $DIR ]
+then
+    echo "Error: The genotype_selection directory: $DIR does not exist. Are you specifying the full path? Is the name correct?"
+    exit 105
+fi
+### Check if the genomic_selection repo belongs to the user
+if [ ! -w $DIR ]
+then
+    echo "Error: You do not have permission to write in the genotype_selection directory: $DIR. Did you clone the genomic_selection repository into a directory you have write-access to?"
+    exit 106
+fi
+### Check if the genomic_selection repo has contains the slurm array job submission script
+if [ ! -f ${DIR}/01_gp_slurm_job.sh ]
+then
+    echo "Error: The genotype_selection directory: $DIR does not contain the script: 01_gp_slurm_job.sh. Are you sure this is the genomic_selection repo directory?"
+    exit 107
+fi
+### Initialise the output directory which will contain all the output Rds files across populations and traits
+if [ ! -d ${DIR}/output ]
+then
+    mkdir ${DIR}/output
+fi
+### Submit an array of jobs equivalent to the number of traits in the phenotype file
+cd $DIR/
+N_TRAITS=$(echo $(head -n1 $PHENOTYPE_DATA_TSV | awk '{print NF}') - 2 | bc)
+N_POPS=$(tail -n+2 $PHENOTYPE_DATA_TSV | cut -f2 - | sort | uniq | wc -l)
+sbatch --array 1-$(echo "${N_TRAITS} * ${N_POPS}" | bc) \
+    01_gp_slurm_job-LUCERNE.sh \
+        ${GENOTYPE_DATA_RDS} \
+        ${PHENOTYPE_DATA_TSV} \
+        ${KFOLDS} \
+        ${NREPS} \
+        ${DIR}
diff --git a/exec/00_gp_slurm_job_wrapper-RYEGRASS.sh b/exec/00_gp_slurm_job_wrapper-RYEGRASS.sh
@@ -0,0 +1,95 @@
+#!/bin/bash
+
+### Load the conda environment
+module load Miniconda3/22.11.1-1
+conda init bash
+source ~/.bashrc
+conda activate genomic_selection
+
+################################################################
+### TOP-LEVEL SLURM ARRAY JOB SUBMISSION SCRIPT
+### Please edit the input variables below to match your dataset:
+################################################################
+
+### Input variables (use the absolute path to files to be precise)
+### (1) R matrix object with n rows corresponding to samples, and p columns corresponding to the markers or loci. 
+###     Should have no missing data or else will be imputed via mean value imputation.
+GENOTYPE_DATA_RDS='/group/pasture/Jeff/ryegrass/workdir/STR_NUE_WUE_HS-1717536141.3435302.3200855812-IMPUTED.Rds'
+### (2) Tab-delimited phenotype file where column 1: sample names, column 2: population name, columns 3 and so on refers to the phenotype values of one trait per column.
+###     Headers for the columns should be named appropriately, e.g. ID, POP, TRAIT1, TRAIT2, etc.
+###     Missing values are allowed for samples whose phenotypes will be predicted by the best model identified within the population they belong to.
+###     Missing values may be coded as empty cells, -, NA, na, NaN, missing, and/or MISSING.
+PHENOTYPE_DATA_TSV='/group/pasture/Jeff/ryegrass/workdir/STR_NUE_WUE_PhenomicsDB_2024-05-20-BiomassPredictedAndGroudtruth.tsv'
+### (3) Number of folds for k-fold cross-validation.
+KFOLDS=5
+### (4) Number of replications of the k-fold cross-validation each representing a random sorting of the samples hence yielding different ways of partitioning the data.
+NREPS=5
+### (5) Full path to the location of the executable Rscript gp.R
+DIR='/group/pasture/Jeff/gp/exec'
+
+### Check if the genotype file exists
+if [ ! -f $GENOTYPE_DATA_RDS ]
+then
+    echo "Error: The genotype file: $GENOTYPE_DATA_RDS does not exist. Are you specifying the full path? Is the name correct?"
+    exit 101
+fi
+### Check if the phenotype file exists
+if [ ! -f $PHENOTYPE_DATA_TSV ]
+then
+    echo "Error: The phenotype file: $PHENOTYPE_DATA_TSV does not exist. Are you specifying the full path? Is the name correct?"
+    exit 102
+fi
+### Check if the genotype file is a valid Rds file
+echo 'args = commandArgs(trailingOnly=TRUE)
+geno = suppressWarnings(tryCatch(readRDS(args[1]), error=function(e){print("Error loading genotype file.")}))
+' > test_geno_rds.R
+if [ $(Rscript test_geno_rds.R $GENOTYPE_DATA_RDS | grep -i "error" | wc -l) -eq 1 ]
+then
+    echo "Error: The genotype file: $GENOTYPE_DATA_RDS is not an Rds file."
+    exit 103
+fi
+rm test_geno_rds.R
+### Check if the phenotype file is formatted according to the required specifications
+echo 'args = commandArgs(trailingOnly=TRUE)
+pheno = suppressWarnings(tryCatch(read.delim(args[1], sep="\t", header=TRUE), error=function(e){print("Error loading phenotype file.")}))
+' > test_pheno_rds.R
+if [ $(Rscript test_pheno_rds.R $PHENOTYPE_DATA_TSV | grep -i "error" | wc -l) -eq 1 ]
+then
+    echo "Error: The phenotype file: $GENOTYPE_DATA_RDS is not formatted according to specifications. It should be tab-delimited and a header line must be present."
+    exit 104
+fi
+rm test_pheno_rds.R
+### Check if the genomic_selection repo folder exists
+if [ ! -d $DIR ]
+then
+    echo "Error: The genotype_selection directory: $DIR does not exist. Are you specifying the full path? Is the name correct?"
+    exit 105
+fi
+### Check if the genomic_selection repo belongs to the user
+if [ ! -w $DIR ]
+then
+    echo "Error: You do not have permission to write in the genotype_selection directory: $DIR. Did you clone the genomic_selection repository into a directory you have write-access to?"
+    exit 106
+fi
+### Check if the genomic_selection repo has contains the slurm array job submission script
+if [ ! -f ${DIR}/01_gp_slurm_job.sh ]
+then
+    echo "Error: The genotype_selection directory: $DIR does not contain the script: 01_gp_slurm_job.sh. Are you sure this is the genomic_selection repo directory?"
+    exit 107
+fi
+### Initialise the output directory which will contain all the output Rds files across populations and traits
+if [ ! -d ${DIR}/output ]
+then
+    mkdir ${DIR}/output
+fi
+### Submit an array of jobs equivalent to the number of traits in the phenotype file
+cd $DIR/
+N_TRAITS=$(echo $(head -n1 $PHENOTYPE_DATA_TSV | awk '{print NF}') - 2 | bc)
+N_POPS=$(tail -n+2 $PHENOTYPE_DATA_TSV | cut -f2 - | sort | uniq | wc -l)
+sbatch --array 1-$(echo "${N_TRAITS} * ${N_POPS}" | bc) \
+    01_gp_slurm_job-RYEGRASS.sh \
+        ${GENOTYPE_DATA_RDS} \
+        ${PHENOTYPE_DATA_TSV} \
+        ${KFOLDS} \
+        ${NREPS} \
+        ${DIR}
diff --git a/exec/01_gp_slurm_job-LUCERNE.sh b/exec/01_gp_slurm_job-LUCERNE.sh
@@ -0,0 +1,120 @@
+#!/bin/bash
+#SBATCH --job-name="GS"
+#SBATCH --account="dbiopast2"   ### EDIT ME: Pick the appropriate account name, e.g. dbiopast1 or dbiopast2
+#SBATCH --ntasks=1              ### LEAVE ME:Request a single task as we will be submitting this as an array job where each job corresponds to a trait
+#SBATCH --cpus-per-task=32      ### EDIT ME: Parallelisation across replications, folds and models (more cpu means faster execution time but probably longer time to wait for the Slurm scheduler to find resources to allocate to the job)
+#SBATCH --mem=400G              ### EDIT ME: Proportional to the input data (will need to test the appropriate memory required, hint use `seff ${JOBID}`)
+#SBATCH --time=7-0:0:00         ### EDIT ME: Proportional to the input data, number of folds, replications, and models to be used
+###################################################################################################
+### Edit the Slurm settings above to match your requirements. 
+###################################################################################################
+
+###################################################################################################
+### The variables below will be exported from `00_gs_slurm_job_wrapper.sh`:
+###################################################################################################
+### Input variables (use the absolute path to files to be precise)
+### (1) R matrix object with n rows corresponding to samples, and p columns corresponding to the markers or loci. 
+###     Should have no missing data or else will be imputed via mean value imputation.
+# GENOTYPE_DATA_RDS='/group/pasture/Jeff/genomic_selection/tests/grape.rds'
+GENOTYPE_DATA_RDS=$1
+### (2) Tab-delimited phenotype file where column 1: sample names, column 2: population name, columns 3 and so on refers to the phenotype values of one trait per column.
+###     Headers for the columns should be named appropriately, e.g. ID, POP, TRAIT1, TRAIT2, etc.
+###     Missing values are allowed for samples whose phenotypes will be predicted by the best model identified within the population they belong to.
+###     Missing values may be coded as empty cells, -, NA, na, NaN, missing, and/or MISSING.
+# PHENOTYPE_DATA_TSV='/group/pasture/Jeff/genomic_selection/tests/grape_pheno.txt'
+PHENOTYPE_DATA_TSV=$2
+### (3) Number of folds for k-fold cross-validation.
+# KFOLDS=5
+KFOLDS=$3
+### (4) Number of replications of the k-fold cross-validation each representing a random sorting of the samples hence yielding different ways of partitioning the data.
+# NREPS=3
+NREPS=$4
+### (5) Full path to the location of the executable Rscript gp.R
+# DIR='/group/pasture/Jeff/gp/exec'
+DIR=$5
+###################################################################################################
+### Edit the code below, if and only if you have read the documentation or familiar with `src/*.R`:
+###################################################################################################
+### Define the trait and population to include
+N_POPS=$(tail -n+2 $PHENOTYPE_DATA_TSV | cut -f2 - | sort | uniq | wc -l)
+TRAIT_IDX=$(echo "((${SLURM_ARRAY_TASK_ID}-1) / ${N_POPS}) + 1" | bc)
+POP_IDX=$(echo "${SLURM_ARRAY_TASK_ID} % ${N_POPS}" | bc)
+if [ "${POP_IDX}" -eq 0 ]
+then
+    POP_IDX=${N_POPS}
+fi
+COLUMN_ID=$(echo 2 + ${TRAIT_IDX} | bc)
+TRAIT=$(head -n1 $PHENOTYPE_DATA_TSV | cut -f${COLUMN_ID})
+POP=$(tail -n+2 $PHENOTYPE_DATA_TSV | cut -f2 - | sort | uniq | head -n${POP_IDX} | tail -n1)
+### Skip leave-one-population-out cross-validation if there is only one population
+if [ "${N_POPS}" -eq 1 ]
+then
+    BOOL_ACROSS=FALSE
+else
+    if [ "${POP_IDX}" -eq 1 ]
+    then
+        BOOL_ACROSS=TRUE
+    else
+        BOOL_ACROSS=FALSE
+    fi
+fi
+### Output directories
+DIR_OUT_MAIN=${DIR}/output
+DIR_OUT=${DIR_OUT_MAIN}/output-${TRAIT}-${POP}
+mkdir $DIR_OUT
+### Log messages
+echo JOB_${SLURM_ARRAY_TASK_ID}-TRAIT_${TRAIT}-POP_${POP} > ${DIR_OUT}/job_info-${TRAIT}-${POP}.log
+echo "==========================================
+-------------------------------------------
+	    Job Info
+-------------------------------------------
+SLURM_JOB_ID = $SLURM_JOB_ID
+SLURM_JOB_NAME = $SLURM_JOB_NAME
+SLURM_JOB_NODELIST = $SLURM_JOB_NODELIST
+SLURM_SUBMIT_HOST = $SLURM_SUBMIT_HOST
+SLURM_SUBMIT_DIR = $SLURM_SUBMIT_DIR
+SLURM_NTASKS = $SLURM_NTASKS
+SLURM_ARRAY_TASK_ID = $SLURM_ARRAY_TASK_ID
+SLURM_MEM_PER_NODE = $(echo "$SLURM_MEM_PER_NODE / (2^10)" | bc) GB
+SLURM_CPUS_PER_TASK = $SLURM_CPUS_PER_TASK
+-------------------------------------------
+	    Variables
+-------------------------------------------
+GENOTYPE_DATA_RDS	: $GENOTYPE_DATA_RDS
+PHENOTYPE_DATA_TSV	: $PHENOTYPE_DATA_TSV
+KFOLDS		        : $KFOLDS
+NREPS		        : $NREPS
+TRAIT		        : $TRAIT
+POPULATION          : $POP
+-------------------------------------------
+	    Output directory
+-------------------------------------------
+${DIR_OUT}
+==========================================" >> ${DIR_OUT}/job_info-${TRAIT}-${POP}.log
+
+### Load the conda environment
+module load Miniconda3/22.11.1-1
+conda init bash
+source ~/.bashrc
+conda activate genomic_selection
+
+### Run within and across population replicated k-fold cross-validation and prediction of missing phenotypes
+time \
+Rscript ${DIR}/gp.R \
+    --fname-geno $GENOTYPE_DATA_RDS \
+    --fname-pheno $PHENOTYPE_DATA_TSV \
+    --population $POP \
+    --dir-output $DIR_OUT \
+    --pheno-idx-col-y $COLUMN_ID \
+    --bool-within TRUE \
+    --bool-across $BOOL_ACROSS \
+    --n-folds $KFOLDS \
+    --n-reps $NREPS \
+    --bool-parallel TRUE \
+    --max-mem-Gb $(echo "$SLURM_MEM_PER_NODE / (2^10)" | bc) \
+    --n-threads $SLURM_CPUS_PER_TASK \
+    --verbose TRUE >> ${DIR_OUT}/job_info-${TRAIT}-${POP}.log
+### Clean-up
+mv ${DIR_OUT}/GENOMIC_PREDICTIONS_OUTPUT-*.Rds ${DIR_OUT_MAIN}
+mv ${DIR_OUT}/job_info-${TRAIT}-${POP}.log ${DIR_OUT_MAIN}
+rm -R ${DIR_OUT}