Skip to content

Commit

Permalink
working on a very simplified API
Browse files Browse the repository at this point in the history
  • Loading branch information
jeffersonfparil committed Jun 21, 2024
1 parent bc2e79b commit 9664c39
Show file tree
Hide file tree
Showing 13 changed files with 1,821 additions and 51 deletions.
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,12 @@ _libs/

# Test files
inst/exec_Rscript/gp_*
inst/exec_Rscript/bglr*
inst/exec_Rscript/*.vcf
inst/exec_Rscript/*.vcf.gz
inst/exec_Rscript/*.tsv
inst/exec_Rscript/*.Rds
inst/exec_Rscript/GENOMIC_PREDICTIONS_OUTPUT-*
inst/exec_Rscript/output/

# SQL databases
Expand Down
24 changes: 12 additions & 12 deletions R/io.R
Original file line number Diff line number Diff line change
Expand Up @@ -1569,11 +1569,11 @@ fn_filter_genotype = function(G, maf=0.01, sdev_min=0.0001,
if (verbose & (length(vec_idx_loci_to_remove)==ncol(G))) {
error = methods::new("gpError",
code=000,
message=paste0("All loci were filtered out. Please increase the max_sparsity_per_locus from ", max_sparsity_per_locus,
message=paste0("All loci were filtered out. Please consider increasing the max_sparsity_per_locus from ", max_sparsity_per_locus,
", given that the mean sparsity per locus ranges from ", min(vec_sparsity_per_locus, na.rm=TRUE),
" to ", max(vec_sparsity_per_locus, na.rm=TRUE),
" with a mean of ", mean(vec_sparsity_per_locus, na.rm=TRUE),
" and median ", stats::median(vec_sparsity_per_locus, na.rm=TRUE)))
" and a median of ", stats::median(vec_sparsity_per_locus, na.rm=TRUE)))
return(error)
}
}
Expand All @@ -1587,7 +1587,7 @@ fn_filter_genotype = function(G, maf=0.01, sdev_min=0.0001,
if (verbose & (length(vec_idx_loci_to_remove)==ncol(G))) {
error = methods::new("gpError",
code=000,
message=paste0("All loci were filtered out. Please decrease the frac_topmost_sparse_loci_to_remove from ", frac_topmost_sparse_loci_to_remove,
message=paste0("All loci were filtered out. Please consider decreasing the frac_topmost_sparse_loci_to_remove from ", frac_topmost_sparse_loci_to_remove,
" to something more reasonable."))
return(error)
}
Expand All @@ -1601,7 +1601,7 @@ fn_filter_genotype = function(G, maf=0.01, sdev_min=0.0001,
if (verbose & (length(vec_idx_loci_to_remove)==ncol(G))) {
error = methods::new("gpError",
code=000,
message=paste0("All loci were filtered out. Please decrease the n_topmost_sparse_loci_to_remove from ", n_topmost_sparse_loci_to_remove,
message=paste0("All loci were filtered out. Please consider decreasing the n_topmost_sparse_loci_to_remove from ", n_topmost_sparse_loci_to_remove,
" to something more reasonable, if it please you m'lady/m'lord."))
return(error)
}
Expand All @@ -1626,11 +1626,11 @@ fn_filter_genotype = function(G, maf=0.01, sdev_min=0.0001,
if (verbose & (length(vec_idx_samples_to_remove)==nrow(G))) {
error = methods::new("gpError",
code=000,
message=paste0("All samples were filtered out. Please increase the max_sparsity_per_sample from ", max_sparsity_per_sample,
message=paste0("All samples were filtered out. Please consider increasing the max_sparsity_per_sample from ", max_sparsity_per_sample,
", given that the mean sparsity per sample ranges from ", min(vec_sparsity_per_sample, na.rm=TRUE),
" to ", max(vec_sparsity_per_sample, na.rm=TRUE),
" with a mean of ", mean(vec_sparsity_per_sample, na.rm=TRUE),
" and median ", stats::median(vec_sparsity_per_sample, na.rm=TRUE)))
" and a median of ", stats::median(vec_sparsity_per_sample, na.rm=TRUE)))
return(error)
}
}
Expand All @@ -1644,7 +1644,7 @@ fn_filter_genotype = function(G, maf=0.01, sdev_min=0.0001,
if (verbose & (length(vec_idx_samples_to_remove)==nrow(G))) {
error = methods::new("gpError",
code=000,
message=paste0("All samples were filtered out. Please decrease the frac_topmost_sparse_samples_to_remove from ", frac_topmost_sparse_samples_to_remove,
message=paste0("All samples were filtered out. Please consider decreasing the frac_topmost_sparse_samples_to_remove from ", frac_topmost_sparse_samples_to_remove,
" to something more reasonable."))
return(error)
}
Expand All @@ -1658,7 +1658,7 @@ fn_filter_genotype = function(G, maf=0.01, sdev_min=0.0001,
if (verbose & (length(vec_idx_samples_to_remove)==nrow(G))) {
error = methods::new("gpError",
code=000,
message=paste0("All samples were filtered out. Please decrease the n_topmost_sparse_samples_to_remove from ", n_topmost_sparse_samples_to_remove,
message=paste0("All samples were filtered out. Please consider decreasing the n_topmost_sparse_samples_to_remove from ", n_topmost_sparse_samples_to_remove,
" to something more reasonable."))
return(error)
}
Expand Down Expand Up @@ -1803,7 +1803,7 @@ fn_save_genotype = function(G, fname, file_type=c("RDS", "TSV")[1], verbose=FALS

#' Load phenotype data from a text-delimited file
#'
#' @param fname_pheno filname of the text-delimited phenotype data
#' @param fname_pheno filename of the text-delimited phenotype data
#' @param sep column-delimiter in the phenotype file (Default="\\t")
#' @param header does the phenotype file have a header line? (Default=TRUE)
#' @param idx_col_id which column correspond to the sample/entry/pool/genotype names? (Default=1)
Expand Down Expand Up @@ -1929,9 +1929,9 @@ fn_filter_phenotype = function(list_pheno, remove_outliers=TRUE, remove_NA=FALSE
# verbose = TRUE
###################################################
if (verbose) {
print("############################")
print("### Filter genotype data ###")
print("############################")
print("#############################")
print("### Filter phenotype data ###")
print("#############################")
}
if (methods::is(list_pheno, "gpError")) {
error = chain(list_pheno,
Expand Down
29 changes: 27 additions & 2 deletions R/main.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
#' top most sparse loci will be removed (see ?fn_filter_genotype for details)
#' - $geno_n_topmost_sparse_loci_to_remove: number of top most sparse loci to be removed
#' (see ?fn_filter_genotype for details)
#' - $geno_max_sparsity_per_sample: maximum sparsity or fraction of missing data per sample
#' - $geno_max_sparsity_per_sample: maximum sparsity or fraction of missing data per sample
#' (see ?fn_filter_genotype for details)
#' - $geno_frac_topmost_sparse_samples_to_remove: fraction of the total number of samples from which
#' the top most sparse samples will be removed (see ?fn_filter_genotype for details)
Expand Down Expand Up @@ -340,7 +340,17 @@ gp = function(args) {
n_topmost_sparse_samples_to_remove=args$geno_n_topmost_sparse_samples_to_remove,
verbose=args$verbose
)
if (methods::is(G, "gpError")) {return(G)}
if (methods::is(G, "gpError")) {
error = chain(G, methods::new("gpError",
code=000,
message=paste0(
"All loci were filtered out. ",
"Please consider reducing --geno-min-depth (", args$geno_min_depth, ") and/or ",
"increasing --geno-max-depth (", args$geno_max_depth, ") and/or ",
"impute your input genotype data (", args$fname_geno, ")."
)))
return(error)
}
gc()
list_pheno = fn_filter_phenotype(
list_pheno=list_pheno,
Expand All @@ -358,6 +368,21 @@ gp = function(args) {
verbose=args$verbose
)
if (methods::is(list_merged, "gpError")) {return(list_merged)}
### Missing values are not allowed in the genotype data
if (sum(rowSums(is.na(list_merged$G))) > 0) {
error = methods::new("gpError",
code=000,
message=paste0(
"Genotype data has missing values. ",
"Please impute the missing data. ",
"You may also consider reducing ",
"--geno-min-depth (", args$geno_min_depth, ") and/or increasing ",
"--geno-max-depth (", args$geno_max_depth, "), if possible. ",
"You may also set the following to zero: --geno-max-sparsity-per-locus (", args$geno_max_sparsity_per_locus, ") and ",
"--geno-max-sparsity-per-sample (", args$geno_max_sparsity_per_sample, ")."
))
return(error)
}
### Clean-up
rm("G")
rm("list_pheno")
Expand Down
Empty file added inst/exec_Rscript/.Rprofile
Empty file.
103 changes: 103 additions & 0 deletions inst/exec_Rscript/0-checks_and_submision.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
#!/bin/bash

# ### Load the conda environment
# module load Miniconda3/22.11.1-1
# conda init bash
# source ~/.bashrc
# conda activate genomic_selection
echo '
if (!require("gp", character.only = TRUE)) {
install.packages("devtools")
devtools::install_github("jeffersonfparil/gp")
}' > install_gp.R
Rscript install_gp.R
rm install_gp.R

################################################################
### TOP-LEVEL SLURM ARRAY JOB SUBMISSION SCRIPT
### Please edit the input variables below to match your dataset:
################################################################
### (1) Full path to the location of the executable Rscript gp.R
DIR=$(dirname $0)
cd $DIR
DIR=$(pwd)
### Input variables (use the absolute path to files to be precise)
### (2) R matrix object with n rows corresponding to samples, and p columns corresponding to the markers or loci.
### Should have no missing data or else will be imputed via mean value imputation.
GENOTYPE_DATA_RDS=${DIR}/input/test_geno.Rds
### (3) Tab-delimited phenotype file where column 1: sample names, column 2: population name, columns 3 and so on refers to the phenotype values of one trait per column.
### Headers for the columns should be named appropriately, e.g. ID, POP, TRAIT1, TRAIT2, etc.
### Missing values are allowed for samples whose phenotypes will be predicted by the best model identified within the population they belong to.
### Missing values may be coded as empty cells, -, NA, na, NaN, missing, and/or MISSING.
PHENOTYPE_DATA_TSV=${DIR}/input/test_pheno.tsv
### (4) Number of folds for k-fold cross-validation.
KFOLDS=5
### (5) Number of replications of the k-fold cross-validation each representing a random sorting of the samples hence yielding different ways of partitioning the data.
NREPS=3

### Check if the genotype file exists
if [ ! -f $GENOTYPE_DATA_RDS ]
then
echo "Error: The genotype file: $GENOTYPE_DATA_RDS does not exist. Are you specifying the full path? Is the name correct?"
exit 101
fi
### Check if the phenotype file exists
if [ ! -f $PHENOTYPE_DATA_TSV ]
then
echo "Error: The phenotype file: $PHENOTYPE_DATA_TSV does not exist. Are you specifying the full path? Is the name correct?"
exit 102
fi
### Check if the genotype file is a valid Rds file
echo 'args = commandArgs(trailingOnly=TRUE)
geno = suppressWarnings(tryCatch(readRDS(args[1]), error=function(e){print("Error loading genotype file.")}))
' > test_geno_rds.R
if [ $(Rscript test_geno_rds.R $GENOTYPE_DATA_RDS | grep -i "error" | wc -l) -eq 1 ]
then
echo "Error: The genotype file: $GENOTYPE_DATA_RDS is not an Rds file."
exit 103
fi
rm test_geno_rds.R
### Check if the phenotype file is formatted according to the required specifications
echo 'args = commandArgs(trailingOnly=TRUE)
pheno = suppressWarnings(tryCatch(read.delim(args[1], sep="\t", header=TRUE), error=function(e){print("Error loading phenotype file.")}))
' > test_pheno_rds.R
if [ $(Rscript test_pheno_rds.R $PHENOTYPE_DATA_TSV | grep -i "error" | wc -l) -eq 1 ]
then
echo "Error: The phenotype file: $GENOTYPE_DATA_RDS is not formatted according to specifications. It should be tab-delimited and a header line must be present."
exit 104
fi
rm test_pheno_rds.R
### Check if the genomic_selection repo folder exists
if [ ! -d $DIR ]
then
echo "Error: The genotype_selection directory: $DIR does not exist. Are you specifying the full path? Is the name correct?"
exit 105
fi
### Check if the genomic_selection repo belongs to the user
if [ ! -w $DIR ]
then
echo "Error: You do not have permission to write in the genotype_selection directory: $DIR. Did you clone the genomic_selection repository into a directory you have write-access to?"
exit 106
fi
### Check if the genomic_selection repo has contains the slurm array job submission script
if [ ! -f ${DIR}/01_gp_slurm_job.sh ]
then
echo "Error: The genotype_selection directory: $DIR does not contain the script: 01_gp_slurm_job.sh. Are you sure this is the genomic_selection repo directory?"
exit 107
fi
### Initialise the output directory which will contain all the output Rds files across populations and traits
if [ ! -d ${DIR}/output ]
then
mkdir ${DIR}/output
fi
### Submit an array of jobs equivalent to the number of traits in the phenotype file
cd $DIR/
N_TRAITS=$(echo $(head -n1 $PHENOTYPE_DATA_TSV | awk '{print NF}') - 2 | bc)
N_POPS=$(tail -n+2 $PHENOTYPE_DATA_TSV | cut -f2 - | sort | uniq | wc -l)
sbatch --array 1-$(echo "${N_TRAITS} * ${N_POPS}" | bc) \
1-gp_slurm_job.sh \
${GENOTYPE_DATA_RDS} \
${PHENOTYPE_DATA_TSV} \
${KFOLDS} \
${NREPS} \
${DIR}
120 changes: 120 additions & 0 deletions inst/exec_Rscript/1-gp_slurm_job.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
#!/bin/bash
#SBATCH --job-name="GS"
#SBATCH --account="dbiopast1" ### EDIT ME: Pick the appropriate account name, e.g. dbiopast1 or dbiopast2
#SBATCH --ntasks=1 ### LEAVE ME:Request a single task as we will be submitting this as an array job where each job corresponds to a trait
#SBATCH --cpus-per-task=16 ### EDIT ME: Parallelisation across replications, folds and models (more cpu means faster execution time but probably longer time to wait for the Slurm scheduler to find resources to allocate to the job)
#SBATCH --mem=100G ### EDIT ME: Proportional to the input data (will need to test the appropriate memory required, hint use `seff ${JOBID}`)
#SBATCH --time=1-0:0:00 ### EDIT ME: Proportional to the input data, number of folds, replications, and models to be used
###################################################################################################
### Edit the Slurm settings above to match your requirements.
###################################################################################################

###################################################################################################
### The variables below will be exported from `00_gs_slurm_job_wrapper.sh`:
###################################################################################################
### Input variables (use the absolute path to files to be precise)
### (1) R matrix object with n rows corresponding to samples, and p columns corresponding to the markers or loci.
### Should have no missing data or else will be imputed via mean value imputation.
# GENOTYPE_DATA_RDS='/group/pasture/Jeff/genomic_selection/tests/grape.rds'
GENOTYPE_DATA_RDS=$1
### (2) Tab-delimited phenotype file where column 1: sample names, column 2: population name, columns 3 and so on refers to the phenotype values of one trait per column.
### Headers for the columns should be named appropriately, e.g. ID, POP, TRAIT1, TRAIT2, etc.
### Missing values are allowed for samples whose phenotypes will be predicted by the best model identified within the population they belong to.
### Missing values may be coded as empty cells, -, NA, na, NaN, missing, and/or MISSING.
# PHENOTYPE_DATA_TSV='/group/pasture/Jeff/genomic_selection/tests/grape_pheno.txt'
PHENOTYPE_DATA_TSV=$2
### (3) Number of folds for k-fold cross-validation.
# KFOLDS=5
KFOLDS=$3
### (4) Number of replications of the k-fold cross-validation each representing a random sorting of the samples hence yielding different ways of partitioning the data.
# NREPS=3
NREPS=$4
### (5) Full path to the location of the executable Rscript gp.R
# DIR='/group/pasture/Jeff/gp/inst/exec_Rscript'
DIR=$5
###################################################################################################
### Edit the code below, if and only if you have read the documentation or familiar with `src/*.R`:
###################################################################################################
### Define the trait and population to include
N_POPS=$(tail -n+2 $PHENOTYPE_DATA_TSV | cut -f2 - | sort | uniq | wc -l)
TRAIT_IDX=$(echo "((${SLURM_ARRAY_TASK_ID}-1) / ${N_POPS}) + 1" | bc)
POP_IDX=$(echo "${SLURM_ARRAY_TASK_ID} % ${N_POPS}" | bc)
if [ "${POP_IDX}" -eq 0 ]
then
POP_IDX=${N_POPS}
fi
COLUMN_ID=$(echo 2 + ${TRAIT_IDX} | bc)
TRAIT=$(head -n1 $PHENOTYPE_DATA_TSV | cut -f${COLUMN_ID})
POP=$(tail -n+2 $PHENOTYPE_DATA_TSV | cut -f2 - | sort | uniq | head -n${POP_IDX} | tail -n1)
### Skip leave-one-population-out cross-validation if there is only one population
if [ "${N_POPS}" -eq 1 ]
then
BOOL_ACROSS=FALSE
else
if [ "${POP_IDX}" -eq 1 ]
then
BOOL_ACROSS=TRUE
else
BOOL_ACROSS=FALSE
fi
fi
### Output directories
DIR_OUT_MAIN=${DIR}/output
DIR_OUT=${DIR_OUT_MAIN}/output-${TRAIT}-${POP}
mkdir $DIR_OUT
### Log messages
echo JOB_${SLURM_ARRAY_TASK_ID}-TRAIT_${TRAIT}-POP_${POP} > ${DIR_OUT}/job_info-${TRAIT}-${POP}.log
echo "==========================================
-------------------------------------------
Job Info
-------------------------------------------
SLURM_JOB_ID = $SLURM_JOB_ID
SLURM_JOB_NAME = $SLURM_JOB_NAME
SLURM_JOB_NODELIST = $SLURM_JOB_NODELIST
SLURM_SUBMIT_HOST = $SLURM_SUBMIT_HOST
SLURM_SUBMIT_DIR = $SLURM_SUBMIT_DIR
SLURM_NTASKS = $SLURM_NTASKS
SLURM_ARRAY_TASK_ID = $SLURM_ARRAY_TASK_ID
SLURM_MEM_PER_NODE = $(echo "$SLURM_MEM_PER_NODE / (2^10)" | bc) GB
SLURM_CPUS_PER_TASK = $SLURM_CPUS_PER_TASK
-------------------------------------------
Variables
-------------------------------------------
GENOTYPE_DATA_RDS : $GENOTYPE_DATA_RDS
PHENOTYPE_DATA_TSV : $PHENOTYPE_DATA_TSV
KFOLDS : $KFOLDS
NREPS : $NREPS
TRAIT : $TRAIT
POPULATION : $POP
-------------------------------------------
Output directory
-------------------------------------------
${DIR_OUT}
==========================================" >> ${DIR_OUT}/job_info-${TRAIT}-${POP}.log

### Load the conda environment
module load Miniconda3/22.11.1-1
conda init bash
source ~/.bashrc
conda activate genomic_selection

### Run within and across population replicated k-fold cross-validation and prediction of missing phenotypes
time \
Rscript ${DIR}/gp.R \
--fname-geno $GENOTYPE_DATA_RDS \
--fname-pheno $PHENOTYPE_DATA_TSV \
--population $POP \
--dir-output $DIR_OUT \
--pheno-idx-col-y $COLUMN_ID \
--bool-within TRUE \
--bool-across $BOOL_ACROSS \
--n-folds $KFOLDS \
--n-reps $NREPS \
--bool-parallel TRUE \
--max-mem-Gb $(echo "$SLURM_MEM_PER_NODE / (2^10)" | bc) \
--n-threads $SLURM_CPUS_PER_TASK \
--verbose TRUE >> ${DIR_OUT}/job_info-${TRAIT}-${POP}.log
### Clean-up
mv ${DIR_OUT}/GENOMIC_PREDICTIONS_OUTPUT-*.Rds ${DIR_OUT_MAIN}
mv ${DIR_OUT}/job_info-${TRAIT}-${POP}.log ${DIR_OUT_MAIN}
rm -R ${DIR_OUT}
Loading

0 comments on commit 9664c39

Please sign in to comment.