From 4c80d8f8af2f3479d8d392e2b08ec02ac4b07a37 Mon Sep 17 00:00:00 2001 From: jeffersonfparil Date: Sat, 22 Jun 2024 15:06:47 +1000 Subject: [PATCH] drafting a set of straigtforward one- or two-step genomic prediction pipeline --- inst/exec_Rscript/0-checks_and_submision.sh | 53 ++++++++------ inst/exec_Rscript/gp.R | 80 --------------------- 2 files changed, 33 insertions(+), 100 deletions(-) diff --git a/inst/exec_Rscript/0-checks_and_submision.sh b/inst/exec_Rscript/0-checks_and_submision.sh index d72899a..23d3f4e 100644 --- a/inst/exec_Rscript/0-checks_and_submision.sh +++ b/inst/exec_Rscript/0-checks_and_submision.sh @@ -1,34 +1,47 @@ #!/bin/bash -# ### Load the conda environment -# module load Miniconda3/22.11.1-1 -# conda init bash -# source ~/.bashrc -# conda activate genomic_selection -echo ' -if (!require("gp", character.only = TRUE)) { - install.packages("devtools") - devtools::install_github("jeffersonfparil/gp") -}' > install_gp.R -Rscript install_gp.R -rm install_gp.R +### Full path to the location of the executable Rscript `gp.R`` which should co-locate with this script: `0-checks_and_submission.sh`` as well as `1-gp_slurm_job.sh`. +DIR=$(dirname $0) +cd $DIR +DIR=$(pwd) + +################################## +### Load the conda environment ### +################################## +module load Miniconda3/22.11.1-1 +conda init bash +source ~/.bashrc + +if [ $(conda env list | grep "^genomic_selection " | wc -l) -gt 0 ] +then + conda activate genomic_selection +else + conda env create -f ${DIR}/../../conda.yml +fi + +####################################### +### Install gp if not installed yet ### +####################################### +Rscript -e 'if (!require("gp", character.only = TRUE)) {install.packages("devtools", repos="https://cloud.r-project.org"); devtools::install_github("jeffersonfparil/gp")}' ################################################################ ### TOP-LEVEL SLURM ARRAY JOB SUBMISSION SCRIPT ### Please edit the input variables below to match your dataset: ################################################################ -### (1) Full path to the location of the executable Rscript gp.R -DIR=$(dirname $0) -cd $DIR -DIR=$(pwd) +# ### (1) Full path to the location of the executable Rscript `gp.R`` which should co-locate with this script: `0-checks_and_submission.sh`` as well as `1-gp_slurm_job.sh`. +# DIR=$(dirname $0) +# cd $DIR +# DIR=$(pwd) ### Input variables (use the absolute path to files to be precise) ### (2) R matrix object with n rows corresponding to samples, and p columns corresponding to the markers or loci. -### Should have no missing data or else will be imputed via mean value imputation. +### - The genotype data can be coded as any numeric range of values, e.g. (0,1,2), (-1,0,1), and (0.00,0.25,0.50,0.75,1.00) or as biallelic characters, e.g. for diploids: "AA", "AB", "BB", and for tetraploids: "AAAA", "AAAB", "AABB", "ABBB", and "BBBB".. It is recommended that this data should be filtered and imputed beforehand. +### - The rows are expected to have names of the samples corresponding to the names in the phenotype file. +### - The columns are expected to contain the loci names but does need to follow a specific format: chromosome name and position separated by a tab character (`\t`) and an optional allele identifier, e.g. `chr-1\t12345\tallele_A` GENOTYPE_DATA_RDS=${DIR}/input/test_geno.Rds ### (3) Tab-delimited phenotype file where column 1: sample names, column 2: population name, columns 3 and so on refers to the phenotype values of one trait per column. -### Headers for the columns should be named appropriately, e.g. ID, POP, TRAIT1, TRAIT2, etc. -### Missing values are allowed for samples whose phenotypes will be predicted by the best model identified within the population they belong to. -### Missing values may be coded as empty cells, -, NA, na, NaN, missing, and/or MISSING. +### - Headers for the columns should be named appropriately, e.g. ID, POP, TRAIT1, TRAIT2, etc. +### - Missing values are allowed for samples whose phenotypes will be predicted by the best model identified within the population they belong to. +### - Missing values may be coded as empty cells, -, NA, na, NaN, missing, and/or MISSING. PHENOTYPE_DATA_TSV=${DIR}/input/test_pheno.tsv ### (4) Number of folds for k-fold cross-validation. KFOLDS=5 diff --git a/inst/exec_Rscript/gp.R b/inst/exec_Rscript/gp.R index ed894ba..ff1ac82 100644 --- a/inst/exec_Rscript/gp.R +++ b/inst/exec_Rscript/gp.R @@ -82,83 +82,3 @@ print(" |_ (oo)|_______") print(" (__)| )|/|") print(" ||----w |") print(" || ||") - -### TEST ON LUCERNE -# args = list( -# fname_geno='/group/pasture/Jeff/lucerne/workdir/FINAL-IMPUTED-noTrailingAllele-filteredSNPlist.Rds', -# fname_pheno='/group/pasture/Jeff/lucerne/workdir/Lucerne_PhenomicsDB_2024-05-27-BiomassPredicted.tsv', -# population="DB-MS-31-22-001", -# fname_covar=NULL, -# dir_output="outdir/lucerne", -# geno_fname_snp_list=NULL, -# geno_ploidy=NULL, -# geno_bool_force_biallelic=TRUE, -# geno_bool_retain_minus_one_alleles_per_locus=FALSE, -# geno_min_depth=0, -# geno_max_depth=.Machine$integer.max, -# geno_maf=0.01, -# geno_sdev_min=0.0001, -# geno_max_n_alleles=NULL, -# geno_max_sparsity_per_locus=NULL, -# geno_frac_topmost_sparse_loci_to_remove=NULL, -# geno_n_topmost_sparse_loci_to_remove=NULL, -# geno_max_sparsity_per_sample=NULL, -# geno_frac_topmost_sparse_samples_to_remove=NULL, -# geno_n_topmost_sparse_samples_to_remove=NULL, -# pheno_sep="\t", -# pheno_header=TRUE, -# pheno_idx_col_id=1, -# pheno_idx_col_pop=2, -# pheno_idx_col_y=3, -# pheno_na_strings=c("", "-", "NA", "na", "NaN", "missing", "MISSING"), -# pheno_bool_remove_NA=FALSE, -# bool_within=TRUE, -# bool_across=TRUE, -# n_folds=5, -# n_reps=1, -# vec_models_to_test=c("ridge","lasso","elastic_net","Bayes_A","Bayes_B","Bayes_C","gBLUP"), -# bool_parallel=TRUE, -# max_mem_Gb=360, -# n_threads=32, -# verbose=TRUE -# ) - -### TEST ON GRAPE -# args = list( -# fname_geno='grape.Rds', -# fname_pheno='grape_pheno.txt', -# population="g_1", -# fname_covar=NULL, -# dir_output="outdir", -# geno_fname_snp_list=NULL, -# geno_ploidy=NULL, -# geno_bool_force_biallelic=TRUE, -# geno_bool_retain_minus_one_alleles_per_locus=FALSE, -# geno_min_depth=0, -# geno_max_depth=.Machine$integer.max, -# geno_maf=0.01, -# geno_sdev_min=0.0001, -# geno_max_n_alleles=NULL, -# geno_max_sparsity_per_locus=NULL, -# geno_frac_topmost_sparse_loci_to_remove=NULL, -# geno_n_topmost_sparse_loci_to_remove=NULL, -# geno_max_sparsity_per_sample=NULL, -# geno_frac_topmost_sparse_samples_to_remove=NULL, -# geno_n_topmost_sparse_samples_to_remove=NULL, -# pheno_sep="\t", -# pheno_header=TRUE, -# pheno_idx_col_id=1, -# pheno_idx_col_pop=2, -# pheno_idx_col_y=4, -# pheno_na_strings=c("", "-", "NA", "na", "NaN", "missing", "MISSING"), -# pheno_bool_remove_NA=FALSE, -# bool_within=TRUE, -# bool_across=TRUE, -# n_folds=5, -# n_reps=2, -# vec_models_to_test=c("ridge","lasso","elastic_net","Bayes_A","Bayes_B","Bayes_C","gBLUP"), -# bool_parallel=TRUE, -# max_mem_Gb=60, -# n_threads=32, -# verbose=TRUE -# )