Skip to content

Commit

Permalink
Merge pull request #3 from jeffersonfparil/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
jeffersonfparil authored Jun 8, 2024
2 parents bf9b492 + e062028 commit 0feae0f
Show file tree
Hide file tree
Showing 8 changed files with 509 additions and 6 deletions.
3 changes: 0 additions & 3 deletions R/cross_validation.R
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,6 @@ fn_cross_validation_preparation = function(list_merged, cv_type=1, n_folds=10, n
n_folds=n_folds,
n_reps=n_reps,
memory_requested_Gb=max_mem_Gb,
memory_multiplier=40,
verbose=verbose)
} else if (cv_type == 2) {
############################################
Expand Down Expand Up @@ -455,7 +454,6 @@ fn_cross_validation_preparation = function(list_merged, cv_type=1, n_folds=10, n
n_folds=n_folds,
n_reps=1,
memory_requested_Gb=max_mem_Gb,
memory_multiplier=40,
verbose=verbose)
} else if (cv_type == 3) {
#################################################
Expand Down Expand Up @@ -486,7 +484,6 @@ fn_cross_validation_preparation = function(list_merged, cv_type=1, n_folds=10, n
n_folds=n_folds,
n_reps=1,
memory_requested_Gb=max_mem_Gb,
memory_multiplier=40,
verbose=verbose)
} else {
error = methods::new("gpError",
Expand Down
7 changes: 4 additions & 3 deletions R/io.R
Original file line number Diff line number Diff line change
Expand Up @@ -2276,7 +2276,7 @@ fn_subset_merged_genotype_and_phenotype = function(list_merged, vec_idx, verbose
#' @param n_folds number of cross-validation folds (Default=10)
#' @param n_reps number of cross-validation replication (Default=10)
#' @param memory_requested_Gb memory requested or available for use (Default=400)
#' @param memory_multiplier estimated memory usage multiplier (Default=40)
#' @param memory_multiplier estimated memory usage multiplier (Default=50)
#' @param verbose show memory usage estimation messages? (Default=FALSE)
#' @returns
#' - Ok:
Expand All @@ -2288,15 +2288,16 @@ fn_subset_merged_genotype_and_phenotype = function(list_merged, vec_idx, verbose
#' list_mem = fn_estimate_memory_footprint(X=rnorm(10000), verbose=TRUE)
#' @export
fn_estimate_memory_footprint = function(X, n_models=7, n_folds=10, n_reps=10,
memory_requested_Gb=400, memory_multiplier=40, verbose=FALSE) {
memory_requested_Gb=400, memory_multiplier=50, verbose=FALSE)
{
###################################################
### TEST
# X = matrix(0.0, nrow=492, ncol=455255)
# n_models = 7
# n_reps = 10
# n_folds = 10
# memory_requested_Gb = 400
# memory_multiplier = 40
# memory_multiplier = 50
# verbose = TRUE
###################################################
if ((as.numeric(utils::object.size(X)) == 0) | (n_models <= 0) | (n_folds <= 0) | (n_reps <= 0) | (memory_requested_Gb <= 0) | (memory_multiplier <= 0)) {
Expand Down
95 changes: 95 additions & 0 deletions exec/00_gp_slurm_job_wrapper-LUCERNE.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#!/bin/bash

### Load the conda environment
module load Miniconda3/22.11.1-1
conda init bash
source ~/.bashrc
conda activate genomic_selection

################################################################
### TOP-LEVEL SLURM ARRAY JOB SUBMISSION SCRIPT
### Please edit the input variables below to match your dataset:
################################################################

### Input variables (use the absolute path to files to be precise)
### (1) R matrix object with n rows corresponding to samples, and p columns corresponding to the markers or loci.
### Should have no missing data or else will be imputed via mean value imputation.
GENOTYPE_DATA_RDS='/group/pasture/Jeff/lucerne/workdir/FINAL-IMPUTED-noTrailingAllele-filteredSNPlist.Rds'
### (2) Tab-delimited phenotype file where column 1: sample names, column 2: population name, columns 3 and so on refers to the phenotype values of one trait per column.
### Headers for the columns should be named appropriately, e.g. ID, POP, TRAIT1, TRAIT2, etc.
### Missing values are allowed for samples whose phenotypes will be predicted by the best model identified within the population they belong to.
### Missing values may be coded as empty cells, -, NA, na, NaN, missing, and/or MISSING.
PHENOTYPE_DATA_TSV='/group/pasture/Jeff/lucerne/workdir/Lucerne_PhenomicsDB_2024-05-27-BiomassPredicted.tsv'
### (3) Number of folds for k-fold cross-validation.
KFOLDS=5
### (4) Number of replications of the k-fold cross-validation each representing a random sorting of the samples hence yielding different ways of partitioning the data.
NREPS=5
### (5) Full path to the location of the executable Rscript gp.R
DIR='/group/pasture/Jeff/gp/exec'

### Check if the genotype file exists
if [ ! -f $GENOTYPE_DATA_RDS ]
then
echo "Error: The genotype file: $GENOTYPE_DATA_RDS does not exist. Are you specifying the full path? Is the name correct?"
exit 101
fi
### Check if the phenotype file exists
if [ ! -f $PHENOTYPE_DATA_TSV ]
then
echo "Error: The phenotype file: $PHENOTYPE_DATA_TSV does not exist. Are you specifying the full path? Is the name correct?"
exit 102
fi
### Check if the genotype file is a valid Rds file
echo 'args = commandArgs(trailingOnly=TRUE)
geno = suppressWarnings(tryCatch(readRDS(args[1]), error=function(e){print("Error loading genotype file.")}))
' > test_geno_rds.R
if [ $(Rscript test_geno_rds.R $GENOTYPE_DATA_RDS | grep -i "error" | wc -l) -eq 1 ]
then
echo "Error: The genotype file: $GENOTYPE_DATA_RDS is not an Rds file."
exit 103
fi
rm test_geno_rds.R
### Check if the phenotype file is formatted according to the required specifications
echo 'args = commandArgs(trailingOnly=TRUE)
pheno = suppressWarnings(tryCatch(read.delim(args[1], sep="\t", header=TRUE), error=function(e){print("Error loading phenotype file.")}))
' > test_pheno_rds.R
if [ $(Rscript test_pheno_rds.R $PHENOTYPE_DATA_TSV | grep -i "error" | wc -l) -eq 1 ]
then
echo "Error: The phenotype file: $GENOTYPE_DATA_RDS is not formatted according to specifications. It should be tab-delimited and a header line must be present."
exit 104
fi
rm test_pheno_rds.R
### Check if the genomic_selection repo folder exists
if [ ! -d $DIR ]
then
echo "Error: The genotype_selection directory: $DIR does not exist. Are you specifying the full path? Is the name correct?"
exit 105
fi
### Check if the genomic_selection repo belongs to the user
if [ ! -w $DIR ]
then
echo "Error: You do not have permission to write in the genotype_selection directory: $DIR. Did you clone the genomic_selection repository into a directory you have write-access to?"
exit 106
fi
### Check if the genomic_selection repo has contains the slurm array job submission script
if [ ! -f ${DIR}/01_gp_slurm_job.sh ]
then
echo "Error: The genotype_selection directory: $DIR does not contain the script: 01_gp_slurm_job.sh. Are you sure this is the genomic_selection repo directory?"
exit 107
fi
### Initialise the output directory which will contain all the output Rds files across populations and traits
if [ ! -d ${DIR}/output ]
then
mkdir ${DIR}/output
fi
### Submit an array of jobs equivalent to the number of traits in the phenotype file
cd $DIR/
N_TRAITS=$(echo $(head -n1 $PHENOTYPE_DATA_TSV | awk '{print NF}') - 2 | bc)
N_POPS=$(tail -n+2 $PHENOTYPE_DATA_TSV | cut -f2 - | sort | uniq | wc -l)
sbatch --array 1-$(echo "${N_TRAITS} * ${N_POPS}" | bc) \
01_gp_slurm_job-LUCERNE.sh \
${GENOTYPE_DATA_RDS} \
${PHENOTYPE_DATA_TSV} \
${KFOLDS} \
${NREPS} \
${DIR}
95 changes: 95 additions & 0 deletions exec/00_gp_slurm_job_wrapper-RYEGRASS.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#!/bin/bash

### Load the conda environment
module load Miniconda3/22.11.1-1
conda init bash
source ~/.bashrc
conda activate genomic_selection

################################################################
### TOP-LEVEL SLURM ARRAY JOB SUBMISSION SCRIPT
### Please edit the input variables below to match your dataset:
################################################################

### Input variables (use the absolute path to files to be precise)
### (1) R matrix object with n rows corresponding to samples, and p columns corresponding to the markers or loci.
### Should have no missing data or else will be imputed via mean value imputation.
GENOTYPE_DATA_RDS='/group/pasture/Jeff/ryegrass/workdir/STR_NUE_WUE_HS-1717536141.3435302.3200855812-IMPUTED.Rds'
### (2) Tab-delimited phenotype file where column 1: sample names, column 2: population name, columns 3 and so on refers to the phenotype values of one trait per column.
### Headers for the columns should be named appropriately, e.g. ID, POP, TRAIT1, TRAIT2, etc.
### Missing values are allowed for samples whose phenotypes will be predicted by the best model identified within the population they belong to.
### Missing values may be coded as empty cells, -, NA, na, NaN, missing, and/or MISSING.
PHENOTYPE_DATA_TSV='/group/pasture/Jeff/ryegrass/workdir/STR_NUE_WUE_PhenomicsDB_2024-05-20-BiomassPredictedAndGroudtruth.tsv'
### (3) Number of folds for k-fold cross-validation.
KFOLDS=5
### (4) Number of replications of the k-fold cross-validation each representing a random sorting of the samples hence yielding different ways of partitioning the data.
NREPS=5
### (5) Full path to the location of the executable Rscript gp.R
DIR='/group/pasture/Jeff/gp/exec'

### Check if the genotype file exists
if [ ! -f $GENOTYPE_DATA_RDS ]
then
echo "Error: The genotype file: $GENOTYPE_DATA_RDS does not exist. Are you specifying the full path? Is the name correct?"
exit 101
fi
### Check if the phenotype file exists
if [ ! -f $PHENOTYPE_DATA_TSV ]
then
echo "Error: The phenotype file: $PHENOTYPE_DATA_TSV does not exist. Are you specifying the full path? Is the name correct?"
exit 102
fi
### Check if the genotype file is a valid Rds file
echo 'args = commandArgs(trailingOnly=TRUE)
geno = suppressWarnings(tryCatch(readRDS(args[1]), error=function(e){print("Error loading genotype file.")}))
' > test_geno_rds.R
if [ $(Rscript test_geno_rds.R $GENOTYPE_DATA_RDS | grep -i "error" | wc -l) -eq 1 ]
then
echo "Error: The genotype file: $GENOTYPE_DATA_RDS is not an Rds file."
exit 103
fi
rm test_geno_rds.R
### Check if the phenotype file is formatted according to the required specifications
echo 'args = commandArgs(trailingOnly=TRUE)
pheno = suppressWarnings(tryCatch(read.delim(args[1], sep="\t", header=TRUE), error=function(e){print("Error loading phenotype file.")}))
' > test_pheno_rds.R
if [ $(Rscript test_pheno_rds.R $PHENOTYPE_DATA_TSV | grep -i "error" | wc -l) -eq 1 ]
then
echo "Error: The phenotype file: $GENOTYPE_DATA_RDS is not formatted according to specifications. It should be tab-delimited and a header line must be present."
exit 104
fi
rm test_pheno_rds.R
### Check if the genomic_selection repo folder exists
if [ ! -d $DIR ]
then
echo "Error: The genotype_selection directory: $DIR does not exist. Are you specifying the full path? Is the name correct?"
exit 105
fi
### Check if the genomic_selection repo belongs to the user
if [ ! -w $DIR ]
then
echo "Error: You do not have permission to write in the genotype_selection directory: $DIR. Did you clone the genomic_selection repository into a directory you have write-access to?"
exit 106
fi
### Check if the genomic_selection repo has contains the slurm array job submission script
if [ ! -f ${DIR}/01_gp_slurm_job.sh ]
then
echo "Error: The genotype_selection directory: $DIR does not contain the script: 01_gp_slurm_job.sh. Are you sure this is the genomic_selection repo directory?"
exit 107
fi
### Initialise the output directory which will contain all the output Rds files across populations and traits
if [ ! -d ${DIR}/output ]
then
mkdir ${DIR}/output
fi
### Submit an array of jobs equivalent to the number of traits in the phenotype file
cd $DIR/
N_TRAITS=$(echo $(head -n1 $PHENOTYPE_DATA_TSV | awk '{print NF}') - 2 | bc)
N_POPS=$(tail -n+2 $PHENOTYPE_DATA_TSV | cut -f2 - | sort | uniq | wc -l)
sbatch --array 1-$(echo "${N_TRAITS} * ${N_POPS}" | bc) \
01_gp_slurm_job-RYEGRASS.sh \
${GENOTYPE_DATA_RDS} \
${PHENOTYPE_DATA_TSV} \
${KFOLDS} \
${NREPS} \
${DIR}
120 changes: 120 additions & 0 deletions exec/01_gp_slurm_job-LUCERNE.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
#!/bin/bash
#SBATCH --job-name="GS"
#SBATCH --account="dbiopast2" ### EDIT ME: Pick the appropriate account name, e.g. dbiopast1 or dbiopast2
#SBATCH --ntasks=1 ### LEAVE ME:Request a single task as we will be submitting this as an array job where each job corresponds to a trait
#SBATCH --cpus-per-task=32 ### EDIT ME: Parallelisation across replications, folds and models (more cpu means faster execution time but probably longer time to wait for the Slurm scheduler to find resources to allocate to the job)
#SBATCH --mem=400G ### EDIT ME: Proportional to the input data (will need to test the appropriate memory required, hint use `seff ${JOBID}`)
#SBATCH --time=7-0:0:00 ### EDIT ME: Proportional to the input data, number of folds, replications, and models to be used
###################################################################################################
### Edit the Slurm settings above to match your requirements.
###################################################################################################

###################################################################################################
### The variables below will be exported from `00_gs_slurm_job_wrapper.sh`:
###################################################################################################
### Input variables (use the absolute path to files to be precise)
### (1) R matrix object with n rows corresponding to samples, and p columns corresponding to the markers or loci.
### Should have no missing data or else will be imputed via mean value imputation.
# GENOTYPE_DATA_RDS='/group/pasture/Jeff/genomic_selection/tests/grape.rds'
GENOTYPE_DATA_RDS=$1
### (2) Tab-delimited phenotype file where column 1: sample names, column 2: population name, columns 3 and so on refers to the phenotype values of one trait per column.
### Headers for the columns should be named appropriately, e.g. ID, POP, TRAIT1, TRAIT2, etc.
### Missing values are allowed for samples whose phenotypes will be predicted by the best model identified within the population they belong to.
### Missing values may be coded as empty cells, -, NA, na, NaN, missing, and/or MISSING.
# PHENOTYPE_DATA_TSV='/group/pasture/Jeff/genomic_selection/tests/grape_pheno.txt'
PHENOTYPE_DATA_TSV=$2
### (3) Number of folds for k-fold cross-validation.
# KFOLDS=5
KFOLDS=$3
### (4) Number of replications of the k-fold cross-validation each representing a random sorting of the samples hence yielding different ways of partitioning the data.
# NREPS=3
NREPS=$4
### (5) Full path to the location of the executable Rscript gp.R
# DIR='/group/pasture/Jeff/gp/exec'
DIR=$5
###################################################################################################
### Edit the code below, if and only if you have read the documentation or familiar with `src/*.R`:
###################################################################################################
### Define the trait and population to include
N_POPS=$(tail -n+2 $PHENOTYPE_DATA_TSV | cut -f2 - | sort | uniq | wc -l)
TRAIT_IDX=$(echo "((${SLURM_ARRAY_TASK_ID}-1) / ${N_POPS}) + 1" | bc)
POP_IDX=$(echo "${SLURM_ARRAY_TASK_ID} % ${N_POPS}" | bc)
if [ "${POP_IDX}" -eq 0 ]
then
POP_IDX=${N_POPS}
fi
COLUMN_ID=$(echo 2 + ${TRAIT_IDX} | bc)
TRAIT=$(head -n1 $PHENOTYPE_DATA_TSV | cut -f${COLUMN_ID})
POP=$(tail -n+2 $PHENOTYPE_DATA_TSV | cut -f2 - | sort | uniq | head -n${POP_IDX} | tail -n1)
### Skip leave-one-population-out cross-validation if there is only one population
if [ "${N_POPS}" -eq 1 ]
then
BOOL_ACROSS=FALSE
else
if [ "${POP_IDX}" -eq 1 ]
then
BOOL_ACROSS=TRUE
else
BOOL_ACROSS=FALSE
fi
fi
### Output directories
DIR_OUT_MAIN=${DIR}/output
DIR_OUT=${DIR_OUT_MAIN}/output-${TRAIT}-${POP}
mkdir $DIR_OUT
### Log messages
echo JOB_${SLURM_ARRAY_TASK_ID}-TRAIT_${TRAIT}-POP_${POP} > ${DIR_OUT}/job_info-${TRAIT}-${POP}.log
echo "==========================================
-------------------------------------------
Job Info
-------------------------------------------
SLURM_JOB_ID = $SLURM_JOB_ID
SLURM_JOB_NAME = $SLURM_JOB_NAME
SLURM_JOB_NODELIST = $SLURM_JOB_NODELIST
SLURM_SUBMIT_HOST = $SLURM_SUBMIT_HOST
SLURM_SUBMIT_DIR = $SLURM_SUBMIT_DIR
SLURM_NTASKS = $SLURM_NTASKS
SLURM_ARRAY_TASK_ID = $SLURM_ARRAY_TASK_ID
SLURM_MEM_PER_NODE = $(echo "$SLURM_MEM_PER_NODE / (2^10)" | bc) GB
SLURM_CPUS_PER_TASK = $SLURM_CPUS_PER_TASK
-------------------------------------------
Variables
-------------------------------------------
GENOTYPE_DATA_RDS : $GENOTYPE_DATA_RDS
PHENOTYPE_DATA_TSV : $PHENOTYPE_DATA_TSV
KFOLDS : $KFOLDS
NREPS : $NREPS
TRAIT : $TRAIT
POPULATION : $POP
-------------------------------------------
Output directory
-------------------------------------------
${DIR_OUT}
==========================================" >> ${DIR_OUT}/job_info-${TRAIT}-${POP}.log

### Load the conda environment
module load Miniconda3/22.11.1-1
conda init bash
source ~/.bashrc
conda activate genomic_selection

### Run within and across population replicated k-fold cross-validation and prediction of missing phenotypes
time \
Rscript ${DIR}/gp.R \
--fname-geno $GENOTYPE_DATA_RDS \
--fname-pheno $PHENOTYPE_DATA_TSV \
--population $POP \
--dir-output $DIR_OUT \
--pheno-idx-col-y $COLUMN_ID \
--bool-within TRUE \
--bool-across $BOOL_ACROSS \
--n-folds $KFOLDS \
--n-reps $NREPS \
--bool-parallel TRUE \
--max-mem-Gb $(echo "$SLURM_MEM_PER_NODE / (2^10)" | bc) \
--n-threads $SLURM_CPUS_PER_TASK \
--verbose TRUE >> ${DIR_OUT}/job_info-${TRAIT}-${POP}.log
### Clean-up
mv ${DIR_OUT}/GENOMIC_PREDICTIONS_OUTPUT-*.Rds ${DIR_OUT_MAIN}
mv ${DIR_OUT}/job_info-${TRAIT}-${POP}.log ${DIR_OUT_MAIN}
rm -R ${DIR_OUT}
Loading

0 comments on commit 0feae0f

Please sign in to comment.