drafting a set of straigtforward one- or two-step genomic prediction …

…pipeline
jeffersonfparil · Jun 22, 2024 · 4c80d8f · 4c80d8f
1 parent 9664c39
commit 4c80d8f
Show file tree

Hide file tree

Showing 2 changed files with 33 additions and 100 deletions.
diff --git a/inst/exec_Rscript/0-checks_and_submision.sh b/inst/exec_Rscript/0-checks_and_submision.sh
@@ -1,34 +1,47 @@
 #!/bin/bash
 
-# ### Load the conda environment
-# module load Miniconda3/22.11.1-1
-# conda init bash
-# source ~/.bashrc
-# conda activate genomic_selection
-echo '
-if (!require("gp", character.only = TRUE)) {
-    install.packages("devtools")
-    devtools::install_github("jeffersonfparil/gp")
-}' > install_gp.R
-Rscript install_gp.R
-rm install_gp.R
+### Full path to the location of the executable Rscript `gp.R`` which should co-locate with this script: `0-checks_and_submission.sh`` as well as `1-gp_slurm_job.sh`.
+DIR=$(dirname $0)
+cd $DIR
+DIR=$(pwd)
+
+##################################
+### Load the conda environment ###
+##################################
+module load Miniconda3/22.11.1-1
+conda init bash
+source ~/.bashrc
+
+if [ $(conda env list | grep "^genomic_selection " | wc -l) -gt 0 ]
+then
+    conda activate genomic_selection
+else
+    conda env create -f ${DIR}/../../conda.yml
+fi
+
+#######################################
+### Install gp if not installed yet ###
+#######################################
+Rscript -e 'if (!require("gp", character.only = TRUE)) {install.packages("devtools", repos="https://cloud.r-project.org"); devtools::install_github("jeffersonfparil/gp")}'
 
 ################################################################
 ### TOP-LEVEL SLURM ARRAY JOB SUBMISSION SCRIPT
 ### Please edit the input variables below to match your dataset:
 ################################################################
-### (1) Full path to the location of the executable Rscript gp.R
-DIR=$(dirname $0)
-cd $DIR
-DIR=$(pwd)
+# ### (1) Full path to the location of the executable Rscript `gp.R`` which should co-locate with this script: `0-checks_and_submission.sh`` as well as `1-gp_slurm_job.sh`.
+# DIR=$(dirname $0)
+# cd $DIR
+# DIR=$(pwd)
 ### Input variables (use the absolute path to files to be precise)
 ### (2) R matrix object with n rows corresponding to samples, and p columns corresponding to the markers or loci. 
-###     Should have no missing data or else will be imputed via mean value imputation.
+###     - The genotype data can be coded as any numeric range of values, e.g. (0,1,2), (-1,0,1), and (0.00,0.25,0.50,0.75,1.00) or as biallelic characters, e.g. for diploids: "AA", "AB", "BB", and for tetraploids: "AAAA", "AAAB", "AABB", "ABBB", and "BBBB".. It is recommended that this data should be filtered and imputed beforehand.
+###     - The rows are expected to have names of the samples corresponding to the names in the phenotype file.
+###     - The columns are expected to contain the loci names but does need to follow a specific format: chromosome name and position separated by a tab character (`\t`) and an optional allele identifier, e.g. `chr-1\t12345\tallele_A`
 GENOTYPE_DATA_RDS=${DIR}/input/test_geno.Rds
 ### (3) Tab-delimited phenotype file where column 1: sample names, column 2: population name, columns 3 and so on refers to the phenotype values of one trait per column.
-###     Headers for the columns should be named appropriately, e.g. ID, POP, TRAIT1, TRAIT2, etc.
-###     Missing values are allowed for samples whose phenotypes will be predicted by the best model identified within the population they belong to.
-###     Missing values may be coded as empty cells, -, NA, na, NaN, missing, and/or MISSING.
+###     - Headers for the columns should be named appropriately, e.g. ID, POP, TRAIT1, TRAIT2, etc.
+###     - Missing values are allowed for samples whose phenotypes will be predicted by the best model identified within the population they belong to.
+###     - Missing values may be coded as empty cells, -, NA, na, NaN, missing, and/or MISSING.
 PHENOTYPE_DATA_TSV=${DIR}/input/test_pheno.tsv
 ### (4) Number of folds for k-fold cross-validation.
 KFOLDS=5

diff --git a/inst/exec_Rscript/gp.R b/inst/exec_Rscript/gp.R
@@ -82,83 +82,3 @@ print("         |_ (oo)|_______")
 print("            (__)|       )|/|")
 print("                ||----w |")
 print("                ||     ||")
-
-### TEST ON LUCERNE
-# args = list(
-#     fname_geno='/group/pasture/Jeff/lucerne/workdir/FINAL-IMPUTED-noTrailingAllele-filteredSNPlist.Rds',
-#     fname_pheno='/group/pasture/Jeff/lucerne/workdir/Lucerne_PhenomicsDB_2024-05-27-BiomassPredicted.tsv',
-#     population="DB-MS-31-22-001",
-#     fname_covar=NULL,
-#     dir_output="outdir/lucerne",
-#     geno_fname_snp_list=NULL,
-#     geno_ploidy=NULL,
-#     geno_bool_force_biallelic=TRUE,
-#     geno_bool_retain_minus_one_alleles_per_locus=FALSE,
-#     geno_min_depth=0,
-#     geno_max_depth=.Machine$integer.max,
-#     geno_maf=0.01,
-#     geno_sdev_min=0.0001,
-#     geno_max_n_alleles=NULL,
-#     geno_max_sparsity_per_locus=NULL,
-#     geno_frac_topmost_sparse_loci_to_remove=NULL,
-#     geno_n_topmost_sparse_loci_to_remove=NULL,
-#     geno_max_sparsity_per_sample=NULL,
-#     geno_frac_topmost_sparse_samples_to_remove=NULL,
-#     geno_n_topmost_sparse_samples_to_remove=NULL,
-#     pheno_sep="\t",
-#     pheno_header=TRUE,
-#     pheno_idx_col_id=1,
-#     pheno_idx_col_pop=2,
-#     pheno_idx_col_y=3,
-#     pheno_na_strings=c("", "-", "NA", "na", "NaN", "missing", "MISSING"),
-#     pheno_bool_remove_NA=FALSE,
-#     bool_within=TRUE,
-#     bool_across=TRUE,
-#     n_folds=5,
-#     n_reps=1,
-#     vec_models_to_test=c("ridge","lasso","elastic_net","Bayes_A","Bayes_B","Bayes_C","gBLUP"),
-#     bool_parallel=TRUE,
-#     max_mem_Gb=360,
-#     n_threads=32,
-#     verbose=TRUE
-# )
-
-### TEST ON GRAPE
-# args = list(
-#     fname_geno='grape.Rds',
-#     fname_pheno='grape_pheno.txt',
-#     population="g_1",
-#     fname_covar=NULL,
-#     dir_output="outdir",
-#     geno_fname_snp_list=NULL,
-#     geno_ploidy=NULL,
-#     geno_bool_force_biallelic=TRUE,
-#     geno_bool_retain_minus_one_alleles_per_locus=FALSE,
-#     geno_min_depth=0,
-#     geno_max_depth=.Machine$integer.max,
-#     geno_maf=0.01,
-#     geno_sdev_min=0.0001,
-#     geno_max_n_alleles=NULL,
-#     geno_max_sparsity_per_locus=NULL,
-#     geno_frac_topmost_sparse_loci_to_remove=NULL,
-#     geno_n_topmost_sparse_loci_to_remove=NULL,
-#     geno_max_sparsity_per_sample=NULL,
-#     geno_frac_topmost_sparse_samples_to_remove=NULL,
-#     geno_n_topmost_sparse_samples_to_remove=NULL,
-#     pheno_sep="\t",
-#     pheno_header=TRUE,
-#     pheno_idx_col_id=1,
-#     pheno_idx_col_pop=2,
-#     pheno_idx_col_y=4,
-#     pheno_na_strings=c("", "-", "NA", "na", "NaN", "missing", "MISSING"),
-#     pheno_bool_remove_NA=FALSE,
-#     bool_within=TRUE,
-#     bool_across=TRUE,
-#     n_folds=5,
-#     n_reps=2,
-#     vec_models_to_test=c("ridge","lasso","elastic_net","Bayes_A","Bayes_B","Bayes_C","gBLUP"),
-#     bool_parallel=TRUE,
-#     max_mem_Gb=60,
-#     n_threads=32,
-#     verbose=TRUE
-# )