-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
increasing default memory usage multiplier from 40 to 50 to fix OOM error in test data runs
- Loading branch information
1 parent
7924ce4
commit e062028
Showing
6 changed files
with
505 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
#!/bin/bash | ||
|
||
### Load the conda environment | ||
### `conda init bash` followed by sourcing ~/.bashrc is what makes `conda activate` usable in this non-interactive shell. | ||
module load Miniconda3/22.11.1-1 | ||
conda init bash | ||
source ~/.bashrc | ||
conda activate genomic_selection | ||
|
||
################################################################ | ||
### TOP-LEVEL SLURM ARRAY JOB SUBMISSION SCRIPT | ||
### Please edit the input variables below to match your dataset: | ||
################################################################ | ||
|
||
### Input variables (use the absolute path to files to be precise) | ||
### (1) R matrix object with n rows corresponding to samples, and p columns corresponding to the markers or loci. | ||
### Should have no missing data or else will be imputed via mean value imputation. | ||
GENOTYPE_DATA_RDS='/group/pasture/Jeff/lucerne/workdir/FINAL-IMPUTED-noTrailingAllele-filteredSNPlist.Rds' | ||
### (2) Tab-delimited phenotype file where column 1: sample names, column 2: population name, columns 3 and so on refers to the phenotype values of one trait per column. | ||
### Headers for the columns should be named appropriately, e.g. ID, POP, TRAIT1, TRAIT2, etc. | ||
### Missing values are allowed for samples whose phenotypes will be predicted by the best model identified within the population they belong to. | ||
### Missing values may be coded as empty cells, -, NA, na, NaN, missing, and/or MISSING. | ||
PHENOTYPE_DATA_TSV='/group/pasture/Jeff/lucerne/workdir/Lucerne_PhenomicsDB_2024-05-27-BiomassPredicted.tsv' | ||
### (3) Number of folds for k-fold cross-validation. | ||
KFOLDS=5 | ||
### (4) Number of replications of the k-fold cross-validation each representing a random sorting of the samples hence yielding different ways of partitioning the data. | ||
NREPS=5 | ||
### (5) Full path to the location of the executable Rscript gp.R | ||
DIR='/group/pasture/Jeff/gp/exec' | ||
### (6) Name of the Slurm array job script inside ${DIR} that is submitted below (the same name is used for the existence check and for sbatch). | ||
SLURM_SCRIPT='01_gp_slurm_job-LUCERNE.sh' | ||
|
||
### Check if the genotype file exists | ||
if [ ! -f "$GENOTYPE_DATA_RDS" ] | ||
then | ||
echo "Error: The genotype file: $GENOTYPE_DATA_RDS does not exist. Are you specifying the full path? Is the name correct?" >&2 | ||
exit 101 | ||
fi | ||
### Check if the phenotype file exists | ||
if [ ! -f "$PHENOTYPE_DATA_TSV" ] | ||
then | ||
echo "Error: The phenotype file: $PHENOTYPE_DATA_TSV does not exist. Are you specifying the full path? Is the name correct?" >&2 | ||
exit 102 | ||
fi | ||
### Check if the genotype file is a valid Rds file | ||
### The R expression prints an "Error ..." line when readRDS() fails; it is passed with -e so that no temporary script is left behind in the working directory. | ||
if Rscript -e 'args = commandArgs(trailingOnly=TRUE); geno = suppressWarnings(tryCatch(readRDS(args[1]), error=function(e){print("Error loading genotype file.")}))' "$GENOTYPE_DATA_RDS" | grep -qi "error" | ||
then | ||
echo "Error: The genotype file: $GENOTYPE_DATA_RDS is not an Rds file." >&2 | ||
exit 103 | ||
fi | ||
### Check if the phenotype file is formatted according to the required specifications | ||
if Rscript -e 'args = commandArgs(trailingOnly=TRUE); pheno = suppressWarnings(tryCatch(read.delim(args[1], sep="\t", header=TRUE), error=function(e){print("Error loading phenotype file.")}))' "$PHENOTYPE_DATA_TSV" | grep -qi "error" | ||
then | ||
echo "Error: The phenotype file: $PHENOTYPE_DATA_TSV is not formatted according to specifications. It should be tab-delimited and a header line must be present." >&2 | ||
exit 104 | ||
fi | ||
### Check if the genomic_selection repo folder exists | ||
if [ ! -d "$DIR" ] | ||
then | ||
echo "Error: The genotype_selection directory: $DIR does not exist. Are you specifying the full path? Is the name correct?" >&2 | ||
exit 105 | ||
fi | ||
### Check if the genomic_selection repo belongs to the user | ||
if [ ! -w "$DIR" ] | ||
then | ||
echo "Error: You do not have permission to write in the genotype_selection directory: $DIR. Did you clone the genomic_selection repository into a directory you have write-access to?" >&2 | ||
exit 106 | ||
fi | ||
### Check if the genomic_selection repo contains the slurm array job submission script that will actually be submitted | ||
if [ ! -f "${DIR}/${SLURM_SCRIPT}" ] | ||
then | ||
echo "Error: The genotype_selection directory: $DIR does not contain the script: ${SLURM_SCRIPT}. Are you sure this is the genomic_selection repo directory?" >&2 | ||
exit 107 | ||
fi | ||
### Initialise the output directory which will contain all the output Rds files across populations and traits | ||
mkdir -p "${DIR}/output" | ||
### Submit an array of jobs equivalent to the number of traits multiplied by the number of populations in the phenotype file | ||
cd "$DIR" || exit 105 | ||
### Count header fields on tabs only, so that a column name containing a space is not counted as two traits | ||
N_TRAITS=$(( $(head -n1 "$PHENOTYPE_DATA_TSV" | awk -F'\t' '{print NF}') - 2 )) | ||
N_POPS=$(tail -n+2 "$PHENOTYPE_DATA_TSV" | cut -f2 | sort -u | wc -l) | ||
if [ "$N_TRAITS" -lt 1 ] || [ "$N_POPS" -lt 1 ] | ||
then | ||
echo "Error: The phenotype file: $PHENOTYPE_DATA_TSV must contain at least one trait column (column 3 onwards) and at least one sample row." >&2 | ||
exit 108 | ||
fi | ||
sbatch --array "1-$(( N_TRAITS * N_POPS ))" \ | ||
"${SLURM_SCRIPT}" \ | ||
"${GENOTYPE_DATA_RDS}" \ | ||
"${PHENOTYPE_DATA_TSV}" \ | ||
"${KFOLDS}" \ | ||
"${NREPS}" \ | ||
"${DIR}" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
#!/bin/bash | ||
|
||
### Load the conda environment | ||
### `conda init bash` followed by sourcing ~/.bashrc is what makes `conda activate` usable in this non-interactive shell. | ||
module load Miniconda3/22.11.1-1 | ||
conda init bash | ||
source ~/.bashrc | ||
conda activate genomic_selection | ||
|
||
################################################################ | ||
### TOP-LEVEL SLURM ARRAY JOB SUBMISSION SCRIPT | ||
### Please edit the input variables below to match your dataset: | ||
################################################################ | ||
|
||
### Input variables (use the absolute path to files to be precise) | ||
### (1) R matrix object with n rows corresponding to samples, and p columns corresponding to the markers or loci. | ||
### Should have no missing data or else will be imputed via mean value imputation. | ||
GENOTYPE_DATA_RDS='/group/pasture/Jeff/ryegrass/workdir/STR_NUE_WUE_HS-1717536141.3435302.3200855812-IMPUTED.Rds' | ||
### (2) Tab-delimited phenotype file where column 1: sample names, column 2: population name, columns 3 and so on refers to the phenotype values of one trait per column. | ||
### Headers for the columns should be named appropriately, e.g. ID, POP, TRAIT1, TRAIT2, etc. | ||
### Missing values are allowed for samples whose phenotypes will be predicted by the best model identified within the population they belong to. | ||
### Missing values may be coded as empty cells, -, NA, na, NaN, missing, and/or MISSING. | ||
PHENOTYPE_DATA_TSV='/group/pasture/Jeff/ryegrass/workdir/STR_NUE_WUE_PhenomicsDB_2024-05-20-BiomassPredictedAndGroudtruth.tsv' | ||
### (3) Number of folds for k-fold cross-validation. | ||
KFOLDS=5 | ||
### (4) Number of replications of the k-fold cross-validation each representing a random sorting of the samples hence yielding different ways of partitioning the data. | ||
NREPS=5 | ||
### (5) Full path to the location of the executable Rscript gp.R | ||
DIR='/group/pasture/Jeff/gp/exec' | ||
### (6) Name of the Slurm array job script inside ${DIR} that is submitted below (the same name is used for the existence check and for sbatch). | ||
SLURM_SCRIPT='01_gp_slurm_job-RYEGRASS.sh' | ||
|
||
### Check if the genotype file exists | ||
if [ ! -f "$GENOTYPE_DATA_RDS" ] | ||
then | ||
echo "Error: The genotype file: $GENOTYPE_DATA_RDS does not exist. Are you specifying the full path? Is the name correct?" >&2 | ||
exit 101 | ||
fi | ||
### Check if the phenotype file exists | ||
if [ ! -f "$PHENOTYPE_DATA_TSV" ] | ||
then | ||
echo "Error: The phenotype file: $PHENOTYPE_DATA_TSV does not exist. Are you specifying the full path? Is the name correct?" >&2 | ||
exit 102 | ||
fi | ||
### Check if the genotype file is a valid Rds file | ||
### The R expression prints an "Error ..." line when readRDS() fails; it is passed with -e so that no temporary script is left behind in the working directory. | ||
if Rscript -e 'args = commandArgs(trailingOnly=TRUE); geno = suppressWarnings(tryCatch(readRDS(args[1]), error=function(e){print("Error loading genotype file.")}))' "$GENOTYPE_DATA_RDS" | grep -qi "error" | ||
then | ||
echo "Error: The genotype file: $GENOTYPE_DATA_RDS is not an Rds file." >&2 | ||
exit 103 | ||
fi | ||
### Check if the phenotype file is formatted according to the required specifications | ||
if Rscript -e 'args = commandArgs(trailingOnly=TRUE); pheno = suppressWarnings(tryCatch(read.delim(args[1], sep="\t", header=TRUE), error=function(e){print("Error loading phenotype file.")}))' "$PHENOTYPE_DATA_TSV" | grep -qi "error" | ||
then | ||
echo "Error: The phenotype file: $PHENOTYPE_DATA_TSV is not formatted according to specifications. It should be tab-delimited and a header line must be present." >&2 | ||
exit 104 | ||
fi | ||
### Check if the genomic_selection repo folder exists | ||
if [ ! -d "$DIR" ] | ||
then | ||
echo "Error: The genotype_selection directory: $DIR does not exist. Are you specifying the full path? Is the name correct?" >&2 | ||
exit 105 | ||
fi | ||
### Check if the genomic_selection repo belongs to the user | ||
if [ ! -w "$DIR" ] | ||
then | ||
echo "Error: You do not have permission to write in the genotype_selection directory: $DIR. Did you clone the genomic_selection repository into a directory you have write-access to?" >&2 | ||
exit 106 | ||
fi | ||
### Check if the genomic_selection repo contains the slurm array job submission script that will actually be submitted | ||
if [ ! -f "${DIR}/${SLURM_SCRIPT}" ] | ||
then | ||
echo "Error: The genotype_selection directory: $DIR does not contain the script: ${SLURM_SCRIPT}. Are you sure this is the genomic_selection repo directory?" >&2 | ||
exit 107 | ||
fi | ||
### Initialise the output directory which will contain all the output Rds files across populations and traits | ||
mkdir -p "${DIR}/output" | ||
### Submit an array of jobs equivalent to the number of traits multiplied by the number of populations in the phenotype file | ||
cd "$DIR" || exit 105 | ||
### Count header fields on tabs only, so that a column name containing a space is not counted as two traits | ||
N_TRAITS=$(( $(head -n1 "$PHENOTYPE_DATA_TSV" | awk -F'\t' '{print NF}') - 2 )) | ||
N_POPS=$(tail -n+2 "$PHENOTYPE_DATA_TSV" | cut -f2 | sort -u | wc -l) | ||
if [ "$N_TRAITS" -lt 1 ] || [ "$N_POPS" -lt 1 ] | ||
then | ||
echo "Error: The phenotype file: $PHENOTYPE_DATA_TSV must contain at least one trait column (column 3 onwards) and at least one sample row." >&2 | ||
exit 108 | ||
fi | ||
sbatch --array "1-$(( N_TRAITS * N_POPS ))" \ | ||
"${SLURM_SCRIPT}" \ | ||
"${GENOTYPE_DATA_RDS}" \ | ||
"${PHENOTYPE_DATA_TSV}" \ | ||
"${KFOLDS}" \ | ||
"${NREPS}" \ | ||
"${DIR}" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
#!/bin/bash | ||
#SBATCH --job-name="GS" | ||
#SBATCH --account="dbiopast2" ### EDIT ME: Pick the appropriate account name, e.g. dbiopast1 or dbiopast2 | ||
#SBATCH --ntasks=1 ### LEAVE ME:Request a single task as we will be submitting this as an array job where each job corresponds to a trait | ||
#SBATCH --cpus-per-task=32 ### EDIT ME: Parallelisation across replications, folds and models (more cpu means faster execution time but probably longer time to wait for the Slurm scheduler to find resources to allocate to the job) | ||
#SBATCH --mem=400G ### EDIT ME: Proportional to the input data (will need to test the appropriate memory required, hint use `seff ${JOBID}`) | ||
#SBATCH --time=7-0:0:00 ### EDIT ME: Proportional to the input data, number of folds, replications, and models to be used | ||
################################################################################################### | ||
### Edit the Slurm settings above to match your requirements. | ||
################################################################################################### | ||
|
||
################################################################################################### | ||
### The variables below are received as positional arguments ($1 to $5) from the top-level submission script: | ||
################################################################################################### | ||
### Input variables (use the absolute path to files to be precise) | ||
### (1) R matrix object with n rows corresponding to samples, and p columns corresponding to the markers or loci. | ||
### Should have no missing data or else will be imputed via mean value imputation. | ||
# GENOTYPE_DATA_RDS='/group/pasture/Jeff/genomic_selection/tests/grape.rds' | ||
GENOTYPE_DATA_RDS=$1 | ||
### (2) Tab-delimited phenotype file where column 1: sample names, column 2: population name, columns 3 and so on refers to the phenotype values of one trait per column. | ||
### Headers for the columns should be named appropriately, e.g. ID, POP, TRAIT1, TRAIT2, etc. | ||
### Missing values are allowed for samples whose phenotypes will be predicted by the best model identified within the population they belong to. | ||
### Missing values may be coded as empty cells, -, NA, na, NaN, missing, and/or MISSING. | ||
# PHENOTYPE_DATA_TSV='/group/pasture/Jeff/genomic_selection/tests/grape_pheno.txt' | ||
PHENOTYPE_DATA_TSV=$2 | ||
### (3) Number of folds for k-fold cross-validation. | ||
# KFOLDS=5 | ||
KFOLDS=$3 | ||
### (4) Number of replications of the k-fold cross-validation each representing a random sorting of the samples hence yielding different ways of partitioning the data. | ||
# NREPS=3 | ||
NREPS=$4 | ||
### (5) Full path to the location of the executable Rscript gp.R | ||
# DIR='/group/pasture/Jeff/gp/exec' | ||
DIR=$5 | ||
################################################################################################### | ||
### Edit the code below, if and only if you have read the documentation or familiar with `src/*.R`: | ||
################################################################################################### | ||
### Abort early when not running as a Slurm array task, because the trait/population indexes are derived from the task ID | ||
: "${SLURM_ARRAY_TASK_ID:?Error: SLURM_ARRAY_TASK_ID is not set. Submit this script with sbatch --array.}" | ||
### Define the trait and population to include | ||
### Task IDs run from 1 to N_TRAITS * N_POPS: consecutive blocks of N_POPS tasks share a trait, and the position within a block selects the population (sorted unique values of column 2). | ||
N_POPS=$(tail -n+2 "$PHENOTYPE_DATA_TSV" | cut -f2 | sort -u | wc -l) | ||
if [ "$N_POPS" -lt 1 ] | ||
then | ||
echo "Error: No populations found in column 2 of the phenotype file: $PHENOTYPE_DATA_TSV" >&2 | ||
exit 1 | ||
fi | ||
TRAIT_IDX=$(( (SLURM_ARRAY_TASK_ID - 1) / N_POPS + 1 )) | ||
POP_IDX=$(( (SLURM_ARRAY_TASK_ID - 1) % N_POPS + 1 )) | ||
COLUMN_ID=$(( TRAIT_IDX + 2 )) | ||
TRAIT=$(head -n1 "$PHENOTYPE_DATA_TSV" | cut -f"${COLUMN_ID}") | ||
POP=$(tail -n+2 "$PHENOTYPE_DATA_TSV" | cut -f2 | sort -u | head -n"${POP_IDX}" | tail -n1) | ||
### Skip leave-one-population-out cross-validation if there is only one population | ||
### With several populations, only the task handling the first population of a trait requests the across-population run. | ||
if [ "$N_POPS" -gt 1 ] && [ "$POP_IDX" -eq 1 ] | ||
then | ||
BOOL_ACROSS=TRUE | ||
else | ||
BOOL_ACROSS=FALSE | ||
fi | ||
### Output directories | ||
DIR_OUT_MAIN="${DIR}/output" | ||
DIR_OUT="${DIR_OUT_MAIN}/output-${TRAIT}-${POP}" | ||
### -p so that a re-run after an aborted job (which leaves the directory behind) does not fail here | ||
if ! mkdir -p "$DIR_OUT" | ||
then | ||
echo "Error: Cannot create the output directory: $DIR_OUT" >&2 | ||
exit 1 | ||
fi | ||
LOG_FILE="${DIR_OUT}/job_info-${TRAIT}-${POP}.log" | ||
### Memory allocated to the job in GB (integer division of SLURM_MEM_PER_NODE by 2^10) | ||
MAX_MEM_GB=$(( SLURM_MEM_PER_NODE / 1024 )) | ||
### Log messages | ||
echo "JOB_${SLURM_ARRAY_TASK_ID}-TRAIT_${TRAIT}-POP_${POP}" > "$LOG_FILE" | ||
echo "========================================== | ||
------------------------------------------- | ||
Job Info | ||
------------------------------------------- | ||
SLURM_JOB_ID = $SLURM_JOB_ID | ||
SLURM_JOB_NAME = $SLURM_JOB_NAME | ||
SLURM_JOB_NODELIST = $SLURM_JOB_NODELIST | ||
SLURM_SUBMIT_HOST = $SLURM_SUBMIT_HOST | ||
SLURM_SUBMIT_DIR = $SLURM_SUBMIT_DIR | ||
SLURM_NTASKS = $SLURM_NTASKS | ||
SLURM_ARRAY_TASK_ID = $SLURM_ARRAY_TASK_ID | ||
SLURM_MEM_PER_NODE = ${MAX_MEM_GB} GB | ||
SLURM_CPUS_PER_TASK = $SLURM_CPUS_PER_TASK | ||
------------------------------------------- | ||
Variables | ||
------------------------------------------- | ||
GENOTYPE_DATA_RDS : $GENOTYPE_DATA_RDS | ||
PHENOTYPE_DATA_TSV : $PHENOTYPE_DATA_TSV | ||
KFOLDS : $KFOLDS | ||
NREPS : $NREPS | ||
TRAIT : $TRAIT | ||
POPULATION : $POP | ||
------------------------------------------- | ||
Output directory | ||
------------------------------------------- | ||
${DIR_OUT} | ||
==========================================" >> "$LOG_FILE" | ||
|
||
### Load the conda environment | ||
module load Miniconda3/22.11.1-1 | ||
conda init bash | ||
source ~/.bashrc | ||
conda activate genomic_selection | ||
|
||
### Run within and across population replicated k-fold cross-validation and prediction of missing phenotypes | ||
time \ | ||
Rscript "${DIR}/gp.R" \ | ||
--fname-geno "$GENOTYPE_DATA_RDS" \ | ||
--fname-pheno "$PHENOTYPE_DATA_TSV" \ | ||
--population "$POP" \ | ||
--dir-output "$DIR_OUT" \ | ||
--pheno-idx-col-y "$COLUMN_ID" \ | ||
--bool-within TRUE \ | ||
--bool-across "$BOOL_ACROSS" \ | ||
--n-folds "$KFOLDS" \ | ||
--n-reps "$NREPS" \ | ||
--bool-parallel TRUE \ | ||
--max-mem-Gb "$MAX_MEM_GB" \ | ||
--n-threads "$SLURM_CPUS_PER_TASK" \ | ||
--verbose TRUE >> "$LOG_FILE" | ||
### Remember whether gp.R succeeded so that the job does not finish with the exit status of the clean-up commands | ||
RSCRIPT_STATUS=$? | ||
### Clean-up | ||
mv "${DIR_OUT}"/GENOMIC_PREDICTIONS_OUTPUT-*.Rds "${DIR_OUT_MAIN}" | ||
mv "$LOG_FILE" "${DIR_OUT_MAIN}" | ||
### ${DIR_OUT:?} aborts instead of running rm on an empty path | ||
rm -R "${DIR_OUT:?}" | ||
exit "$RSCRIPT_STATUS" |
Oops, something went wrong.