Skip to content

Commit

Permalink
increasing default memory usage multiplier from 40 to 50 to fix OOM e…
Browse files Browse the repository at this point in the history
…rror in test data runs
  • Loading branch information
jeffersonfparil committed Jun 8, 2024
1 parent 7924ce4 commit e062028
Show file tree
Hide file tree
Showing 6 changed files with 505 additions and 0 deletions.
95 changes: 95 additions & 0 deletions exec/00_gp_slurm_job_wrapper-LUCERNE.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#!/bin/bash

### Load the conda environment
module load Miniconda3/22.11.1-1
conda init bash
source ~/.bashrc
conda activate genomic_selection

################################################################
### TOP-LEVEL SLURM ARRAY JOB SUBMISSION SCRIPT
### Please edit the input variables below to match your dataset:
################################################################

### Input variables (use the absolute path to files to be precise)
### (1) R matrix object with n rows corresponding to samples, and p columns corresponding to the markers or loci.
### Should have no missing data or else will be imputed via mean value imputation.
GENOTYPE_DATA_RDS='/group/pasture/Jeff/lucerne/workdir/FINAL-IMPUTED-noTrailingAllele-filteredSNPlist.Rds'
### (2) Tab-delimited phenotype file where column 1: sample names, column 2: population name, columns 3 and so on refers to the phenotype values of one trait per column.
### Headers for the columns should be named appropriately, e.g. ID, POP, TRAIT1, TRAIT2, etc.
### Missing values are allowed for samples whose phenotypes will be predicted by the best model identified within the population they belong to.
### Missing values may be coded as empty cells, -, NA, na, NaN, missing, and/or MISSING.
PHENOTYPE_DATA_TSV='/group/pasture/Jeff/lucerne/workdir/Lucerne_PhenomicsDB_2024-05-27-BiomassPredicted.tsv'
### (3) Number of folds for k-fold cross-validation.
KFOLDS=5
### (4) Number of replications of the k-fold cross-validation each representing a random sorting of the samples hence yielding different ways of partitioning the data.
NREPS=5
### (5) Full path to the location of the executable Rscript gp.R
DIR='/group/pasture/Jeff/gp/exec'

### Check if the genotype file exists
if [ ! -f $GENOTYPE_DATA_RDS ]
then
echo "Error: The genotype file: $GENOTYPE_DATA_RDS does not exist. Are you specifying the full path? Is the name correct?"
exit 101
fi
### Check if the phenotype file exists
if [ ! -f $PHENOTYPE_DATA_TSV ]
then
echo "Error: The phenotype file: $PHENOTYPE_DATA_TSV does not exist. Are you specifying the full path? Is the name correct?"
exit 102
fi
### Check if the genotype file is a valid Rds file
echo 'args = commandArgs(trailingOnly=TRUE)
geno = suppressWarnings(tryCatch(readRDS(args[1]), error=function(e){print("Error loading genotype file.")}))
' > test_geno_rds.R
if [ $(Rscript test_geno_rds.R $GENOTYPE_DATA_RDS | grep -i "error" | wc -l) -eq 1 ]
then
echo "Error: The genotype file: $GENOTYPE_DATA_RDS is not an Rds file."
exit 103
fi
rm test_geno_rds.R
### Check if the phenotype file is formatted according to the required specifications
echo 'args = commandArgs(trailingOnly=TRUE)
pheno = suppressWarnings(tryCatch(read.delim(args[1], sep="\t", header=TRUE), error=function(e){print("Error loading phenotype file.")}))
' > test_pheno_rds.R
if [ $(Rscript test_pheno_rds.R $PHENOTYPE_DATA_TSV | grep -i "error" | wc -l) -eq 1 ]
then
echo "Error: The phenotype file: $GENOTYPE_DATA_RDS is not formatted according to specifications. It should be tab-delimited and a header line must be present."
exit 104
fi
rm test_pheno_rds.R
### Check if the genomic_selection repo folder exists
if [ ! -d $DIR ]
then
echo "Error: The genotype_selection directory: $DIR does not exist. Are you specifying the full path? Is the name correct?"
exit 105
fi
### Check if the genomic_selection repo belongs to the user
if [ ! -w $DIR ]
then
echo "Error: You do not have permission to write in the genotype_selection directory: $DIR. Did you clone the genomic_selection repository into a directory you have write-access to?"
exit 106
fi
### Check if the genomic_selection repo has contains the slurm array job submission script
if [ ! -f ${DIR}/01_gp_slurm_job.sh ]
then
echo "Error: The genotype_selection directory: $DIR does not contain the script: 01_gp_slurm_job.sh. Are you sure this is the genomic_selection repo directory?"
exit 107
fi
### Initialise the output directory which will contain all the output Rds files across populations and traits
if [ ! -d ${DIR}/output ]
then
mkdir ${DIR}/output
fi
### Submit an array of jobs equivalent to the number of traits in the phenotype file
cd $DIR/
N_TRAITS=$(echo $(head -n1 $PHENOTYPE_DATA_TSV | awk '{print NF}') - 2 | bc)
N_POPS=$(tail -n+2 $PHENOTYPE_DATA_TSV | cut -f2 - | sort | uniq | wc -l)
sbatch --array 1-$(echo "${N_TRAITS} * ${N_POPS}" | bc) \
01_gp_slurm_job-LUCERNE.sh \
${GENOTYPE_DATA_RDS} \
${PHENOTYPE_DATA_TSV} \
${KFOLDS} \
${NREPS} \
${DIR}
95 changes: 95 additions & 0 deletions exec/00_gp_slurm_job_wrapper-RYEGRASS.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#!/bin/bash

### Load the conda environment
module load Miniconda3/22.11.1-1
conda init bash
source ~/.bashrc
conda activate genomic_selection

################################################################
### TOP-LEVEL SLURM ARRAY JOB SUBMISSION SCRIPT
### Please edit the input variables below to match your dataset:
################################################################

### Input variables (use the absolute path to files to be precise)
### (1) R matrix object with n rows corresponding to samples, and p columns corresponding to the markers or loci.
### Should have no missing data or else will be imputed via mean value imputation.
GENOTYPE_DATA_RDS='/group/pasture/Jeff/ryegrass/workdir/STR_NUE_WUE_HS-1717536141.3435302.3200855812-IMPUTED.Rds'
### (2) Tab-delimited phenotype file where column 1: sample names, column 2: population name, columns 3 and so on refers to the phenotype values of one trait per column.
### Headers for the columns should be named appropriately, e.g. ID, POP, TRAIT1, TRAIT2, etc.
### Missing values are allowed for samples whose phenotypes will be predicted by the best model identified within the population they belong to.
### Missing values may be coded as empty cells, -, NA, na, NaN, missing, and/or MISSING.
PHENOTYPE_DATA_TSV='/group/pasture/Jeff/ryegrass/workdir/STR_NUE_WUE_PhenomicsDB_2024-05-20-BiomassPredictedAndGroudtruth.tsv'
### (3) Number of folds for k-fold cross-validation.
KFOLDS=5
### (4) Number of replications of the k-fold cross-validation each representing a random sorting of the samples hence yielding different ways of partitioning the data.
NREPS=5
### (5) Full path to the location of the executable Rscript gp.R
DIR='/group/pasture/Jeff/gp/exec'

### Check if the genotype file exists
if [ ! -f $GENOTYPE_DATA_RDS ]
then
echo "Error: The genotype file: $GENOTYPE_DATA_RDS does not exist. Are you specifying the full path? Is the name correct?"
exit 101
fi
### Check if the phenotype file exists
if [ ! -f $PHENOTYPE_DATA_TSV ]
then
echo "Error: The phenotype file: $PHENOTYPE_DATA_TSV does not exist. Are you specifying the full path? Is the name correct?"
exit 102
fi
### Check if the genotype file is a valid Rds file
echo 'args = commandArgs(trailingOnly=TRUE)
geno = suppressWarnings(tryCatch(readRDS(args[1]), error=function(e){print("Error loading genotype file.")}))
' > test_geno_rds.R
if [ $(Rscript test_geno_rds.R $GENOTYPE_DATA_RDS | grep -i "error" | wc -l) -eq 1 ]
then
echo "Error: The genotype file: $GENOTYPE_DATA_RDS is not an Rds file."
exit 103
fi
rm test_geno_rds.R
### Check if the phenotype file is formatted according to the required specifications
echo 'args = commandArgs(trailingOnly=TRUE)
pheno = suppressWarnings(tryCatch(read.delim(args[1], sep="\t", header=TRUE), error=function(e){print("Error loading phenotype file.")}))
' > test_pheno_rds.R
if [ $(Rscript test_pheno_rds.R $PHENOTYPE_DATA_TSV | grep -i "error" | wc -l) -eq 1 ]
then
echo "Error: The phenotype file: $GENOTYPE_DATA_RDS is not formatted according to specifications. It should be tab-delimited and a header line must be present."
exit 104
fi
rm test_pheno_rds.R
### Check if the genomic_selection repo folder exists
if [ ! -d $DIR ]
then
echo "Error: The genotype_selection directory: $DIR does not exist. Are you specifying the full path? Is the name correct?"
exit 105
fi
### Check if the genomic_selection repo belongs to the user
if [ ! -w $DIR ]
then
echo "Error: You do not have permission to write in the genotype_selection directory: $DIR. Did you clone the genomic_selection repository into a directory you have write-access to?"
exit 106
fi
### Check if the genomic_selection repo has contains the slurm array job submission script
if [ ! -f ${DIR}/01_gp_slurm_job.sh ]
then
echo "Error: The genotype_selection directory: $DIR does not contain the script: 01_gp_slurm_job.sh. Are you sure this is the genomic_selection repo directory?"
exit 107
fi
### Initialise the output directory which will contain all the output Rds files across populations and traits
if [ ! -d ${DIR}/output ]
then
mkdir ${DIR}/output
fi
### Submit an array of jobs equivalent to the number of traits in the phenotype file
cd $DIR/
N_TRAITS=$(echo $(head -n1 $PHENOTYPE_DATA_TSV | awk '{print NF}') - 2 | bc)
N_POPS=$(tail -n+2 $PHENOTYPE_DATA_TSV | cut -f2 - | sort | uniq | wc -l)
sbatch --array 1-$(echo "${N_TRAITS} * ${N_POPS}" | bc) \
01_gp_slurm_job-RYEGRASS.sh \
${GENOTYPE_DATA_RDS} \
${PHENOTYPE_DATA_TSV} \
${KFOLDS} \
${NREPS} \
${DIR}
120 changes: 120 additions & 0 deletions exec/01_gp_slurm_job-LUCERNE.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
#!/bin/bash
#SBATCH --job-name="GS"
#SBATCH --account="dbiopast2" ### EDIT ME: Pick the appropriate account name, e.g. dbiopast1 or dbiopast2
#SBATCH --ntasks=1 ### LEAVE ME:Request a single task as we will be submitting this as an array job where each job corresponds to a trait
#SBATCH --cpus-per-task=32 ### EDIT ME: Parallelisation across replications, folds and models (more cpu means faster execution time but probably longer time to wait for the Slurm scheduler to find resources to allocate to the job)
#SBATCH --mem=400G ### EDIT ME: Proportional to the input data (will need to test the appropriate memory required, hint use `seff ${JOBID}`)
#SBATCH --time=7-0:0:00 ### EDIT ME: Proportional to the input data, number of folds, replications, and models to be used
###################################################################################################
### Edit the Slurm settings above to match your requirements.
###################################################################################################

###################################################################################################
### The variables below will be exported from `00_gs_slurm_job_wrapper.sh`:
###################################################################################################
### Input variables (use the absolute path to files to be precise)
### (1) R matrix object with n rows corresponding to samples, and p columns corresponding to the markers or loci.
### Should have no missing data or else will be imputed via mean value imputation.
# GENOTYPE_DATA_RDS='/group/pasture/Jeff/genomic_selection/tests/grape.rds'
GENOTYPE_DATA_RDS=$1
### (2) Tab-delimited phenotype file where column 1: sample names, column 2: population name, columns 3 and so on refers to the phenotype values of one trait per column.
### Headers for the columns should be named appropriately, e.g. ID, POP, TRAIT1, TRAIT2, etc.
### Missing values are allowed for samples whose phenotypes will be predicted by the best model identified within the population they belong to.
### Missing values may be coded as empty cells, -, NA, na, NaN, missing, and/or MISSING.
# PHENOTYPE_DATA_TSV='/group/pasture/Jeff/genomic_selection/tests/grape_pheno.txt'
PHENOTYPE_DATA_TSV=$2
### (3) Number of folds for k-fold cross-validation.
# KFOLDS=5
KFOLDS=$3
### (4) Number of replications of the k-fold cross-validation each representing a random sorting of the samples hence yielding different ways of partitioning the data.
# NREPS=3
NREPS=$4
### (5) Full path to the location of the executable Rscript gp.R
# DIR='/group/pasture/Jeff/gp/exec'
DIR=$5
###################################################################################################
### Edit the code below, if and only if you have read the documentation or familiar with `src/*.R`:
###################################################################################################
### Define the trait and population to include
N_POPS=$(tail -n+2 $PHENOTYPE_DATA_TSV | cut -f2 - | sort | uniq | wc -l)
TRAIT_IDX=$(echo "((${SLURM_ARRAY_TASK_ID}-1) / ${N_POPS}) + 1" | bc)
POP_IDX=$(echo "${SLURM_ARRAY_TASK_ID} % ${N_POPS}" | bc)
if [ "${POP_IDX}" -eq 0 ]
then
POP_IDX=${N_POPS}
fi
COLUMN_ID=$(echo 2 + ${TRAIT_IDX} | bc)
TRAIT=$(head -n1 $PHENOTYPE_DATA_TSV | cut -f${COLUMN_ID})
POP=$(tail -n+2 $PHENOTYPE_DATA_TSV | cut -f2 - | sort | uniq | head -n${POP_IDX} | tail -n1)
### Skip leave-one-population-out cross-validation if there is only one population
if [ "${N_POPS}" -eq 1 ]
then
BOOL_ACROSS=FALSE
else
if [ "${POP_IDX}" -eq 1 ]
then
BOOL_ACROSS=TRUE
else
BOOL_ACROSS=FALSE
fi
fi
### Output directories
DIR_OUT_MAIN=${DIR}/output
DIR_OUT=${DIR_OUT_MAIN}/output-${TRAIT}-${POP}
mkdir $DIR_OUT
### Log messages
echo JOB_${SLURM_ARRAY_TASK_ID}-TRAIT_${TRAIT}-POP_${POP} > ${DIR_OUT}/job_info-${TRAIT}-${POP}.log
echo "==========================================
-------------------------------------------
Job Info
-------------------------------------------
SLURM_JOB_ID = $SLURM_JOB_ID
SLURM_JOB_NAME = $SLURM_JOB_NAME
SLURM_JOB_NODELIST = $SLURM_JOB_NODELIST
SLURM_SUBMIT_HOST = $SLURM_SUBMIT_HOST
SLURM_SUBMIT_DIR = $SLURM_SUBMIT_DIR
SLURM_NTASKS = $SLURM_NTASKS
SLURM_ARRAY_TASK_ID = $SLURM_ARRAY_TASK_ID
SLURM_MEM_PER_NODE = $(echo "$SLURM_MEM_PER_NODE / (2^10)" | bc) GB
SLURM_CPUS_PER_TASK = $SLURM_CPUS_PER_TASK
-------------------------------------------
Variables
-------------------------------------------
GENOTYPE_DATA_RDS : $GENOTYPE_DATA_RDS
PHENOTYPE_DATA_TSV : $PHENOTYPE_DATA_TSV
KFOLDS : $KFOLDS
NREPS : $NREPS
TRAIT : $TRAIT
POPULATION : $POP
-------------------------------------------
Output directory
-------------------------------------------
${DIR_OUT}
==========================================" >> ${DIR_OUT}/job_info-${TRAIT}-${POP}.log

### Load the conda environment
module load Miniconda3/22.11.1-1
conda init bash
source ~/.bashrc
conda activate genomic_selection

### Run within and across population replicated k-fold cross-validation and prediction of missing phenotypes
time \
Rscript ${DIR}/gp.R \
--fname-geno $GENOTYPE_DATA_RDS \
--fname-pheno $PHENOTYPE_DATA_TSV \
--population $POP \
--dir-output $DIR_OUT \
--pheno-idx-col-y $COLUMN_ID \
--bool-within TRUE \
--bool-across $BOOL_ACROSS \
--n-folds $KFOLDS \
--n-reps $NREPS \
--bool-parallel TRUE \
--max-mem-Gb $(echo "$SLURM_MEM_PER_NODE / (2^10)" | bc) \
--n-threads $SLURM_CPUS_PER_TASK \
--verbose TRUE >> ${DIR_OUT}/job_info-${TRAIT}-${POP}.log
### Clean-up
mv ${DIR_OUT}/GENOMIC_PREDICTIONS_OUTPUT-*.Rds ${DIR_OUT_MAIN}
mv ${DIR_OUT}/job_info-${TRAIT}-${POP}.log ${DIR_OUT_MAIN}
rm -R ${DIR_OUT}
Loading

0 comments on commit e062028

Please sign in to comment.