Skip to content

Commit

Permalink
drafting a set of straightforward one- or two-step genomic prediction …
Browse files Browse the repository at this point in the history
…pipeline
  • Loading branch information
jeffersonfparil committed Jun 22, 2024
1 parent 9664c39 commit 4c80d8f
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 100 deletions.
53 changes: 33 additions & 20 deletions inst/exec_Rscript/0-checks_and_submision.sh
Original file line number Diff line number Diff line change
@@ -1,34 +1,47 @@
#!/bin/bash

# ### Load the conda environment
# module load Miniconda3/22.11.1-1
# conda init bash
# source ~/.bashrc
# conda activate genomic_selection
echo '
if (!require("gp", character.only = TRUE)) {
install.packages("devtools")
devtools::install_github("jeffersonfparil/gp")
}' > install_gp.R
Rscript install_gp.R
rm install_gp.R
### Full path to the location of the executable Rscript `gp.R`, which should be co-located with this script (`0-checks_and_submission.sh`) as well as `1-gp_slurm_job.sh`.
# Resolve the absolute path of the directory holding this script.
# Quoting keeps paths with spaces or glob characters intact, and the script
# stops if the directory cannot be entered instead of silently continuing
# with DIR set to the caller's working directory.
DIR=$(dirname "$0")
cd "$DIR" || exit 1
DIR=$(pwd)

##################################
### Load the conda environment ###
##################################
module load Miniconda3/22.11.1-1
conda init bash
source ~/.bashrc

# Make sure the `genomic_selection` conda environment exists, then activate it.
# Previously a freshly created environment was never activated, so the rest of
# the script ran outside of it on the first run.
if ! conda env list | grep -q "^genomic_selection "
then
    # Build the environment from the repository's conda.yml (two levels above
    # this script); stop if the build fails, since nothing below can work.
    conda env create -f "${DIR}/../../conda.yml" || exit 1
fi
conda activate genomic_selection

#######################################
### Install gp if not installed yet ###
#######################################
Rscript -e 'if (!require("gp", character.only = TRUE)) {install.packages("devtools", repos="https://cloud.r-project.org"); devtools::install_github("jeffersonfparil/gp")}'

################################################################
### TOP-LEVEL SLURM ARRAY JOB SUBMISSION SCRIPT
### Please edit the input variables below to match your dataset:
################################################################
### (1) Full path to the location of the executable Rscript gp.R
DIR=$(dirname $0)
cd $DIR
DIR=$(pwd)
# ### (1) Full path to the location of the executable Rscript `gp.R`, which should be co-located with this script (`0-checks_and_submission.sh`) as well as `1-gp_slurm_job.sh`.
# DIR=$(dirname $0)
# cd $DIR
# DIR=$(pwd)
### Input variables (use the absolute path to files to be precise)
### (2) R matrix object with n rows corresponding to samples, and p columns corresponding to the markers or loci.
### Should have no missing data or else will be imputed via mean value imputation.
### - The genotype data can be coded as any numeric range of values, e.g. (0,1,2), (-1,0,1), and (0.00,0.25,0.50,0.75,1.00) or as biallelic characters, e.g. for diploids: "AA", "AB", "BB", and for tetraploids: "AAAA", "AAAB", "AABB", "ABBB", and "BBBB". It is recommended that this data should be filtered and imputed beforehand.
### - The rows are expected to have names of the samples corresponding to the names in the phenotype file.
### - The columns are expected to contain the loci names, and these do need to follow a specific format: chromosome name and position separated by a tab character (`\t`) and an optional allele identifier, e.g. `chr-1\t12345\tallele_A`
GENOTYPE_DATA_RDS=${DIR}/input/test_geno.Rds
### (3) Tab-delimited phenotype file where column 1: sample names, column 2: population name, columns 3 and so on refer to the phenotype values of one trait per column.
### Headers for the columns should be named appropriately, e.g. ID, POP, TRAIT1, TRAIT2, etc.
### Missing values are allowed for samples whose phenotypes will be predicted by the best model identified within the population they belong to.
### Missing values may be coded as empty cells, -, NA, na, NaN, missing, and/or MISSING.
### - Headers for the columns should be named appropriately, e.g. ID, POP, TRAIT1, TRAIT2, etc.
### - Missing values are allowed for samples whose phenotypes will be predicted by the best model identified within the population they belong to.
### - Missing values may be coded as empty cells, -, NA, na, NaN, missing, and/or MISSING.
PHENOTYPE_DATA_TSV=${DIR}/input/test_pheno.tsv
### (4) Number of folds for k-fold cross-validation.
KFOLDS=5
Expand Down
80 changes: 0 additions & 80 deletions inst/exec_Rscript/gp.R
Original file line number Diff line number Diff line change
Expand Up @@ -82,83 +82,3 @@ print(" |_ (oo)|_______")
print(" (__)| )|/|")
print(" ||----w |")
print(" || ||")

### TEST ON LUCERNE
# args = list(
# fname_geno='/group/pasture/Jeff/lucerne/workdir/FINAL-IMPUTED-noTrailingAllele-filteredSNPlist.Rds',
# fname_pheno='/group/pasture/Jeff/lucerne/workdir/Lucerne_PhenomicsDB_2024-05-27-BiomassPredicted.tsv',
# population="DB-MS-31-22-001",
# fname_covar=NULL,
# dir_output="outdir/lucerne",
# geno_fname_snp_list=NULL,
# geno_ploidy=NULL,
# geno_bool_force_biallelic=TRUE,
# geno_bool_retain_minus_one_alleles_per_locus=FALSE,
# geno_min_depth=0,
# geno_max_depth=.Machine$integer.max,
# geno_maf=0.01,
# geno_sdev_min=0.0001,
# geno_max_n_alleles=NULL,
# geno_max_sparsity_per_locus=NULL,
# geno_frac_topmost_sparse_loci_to_remove=NULL,
# geno_n_topmost_sparse_loci_to_remove=NULL,
# geno_max_sparsity_per_sample=NULL,
# geno_frac_topmost_sparse_samples_to_remove=NULL,
# geno_n_topmost_sparse_samples_to_remove=NULL,
# pheno_sep="\t",
# pheno_header=TRUE,
# pheno_idx_col_id=1,
# pheno_idx_col_pop=2,
# pheno_idx_col_y=3,
# pheno_na_strings=c("", "-", "NA", "na", "NaN", "missing", "MISSING"),
# pheno_bool_remove_NA=FALSE,
# bool_within=TRUE,
# bool_across=TRUE,
# n_folds=5,
# n_reps=1,
# vec_models_to_test=c("ridge","lasso","elastic_net","Bayes_A","Bayes_B","Bayes_C","gBLUP"),
# bool_parallel=TRUE,
# max_mem_Gb=360,
# n_threads=32,
# verbose=TRUE
# )

### TEST ON GRAPE
# args = list(
# fname_geno='grape.Rds',
# fname_pheno='grape_pheno.txt',
# population="g_1",
# fname_covar=NULL,
# dir_output="outdir",
# geno_fname_snp_list=NULL,
# geno_ploidy=NULL,
# geno_bool_force_biallelic=TRUE,
# geno_bool_retain_minus_one_alleles_per_locus=FALSE,
# geno_min_depth=0,
# geno_max_depth=.Machine$integer.max,
# geno_maf=0.01,
# geno_sdev_min=0.0001,
# geno_max_n_alleles=NULL,
# geno_max_sparsity_per_locus=NULL,
# geno_frac_topmost_sparse_loci_to_remove=NULL,
# geno_n_topmost_sparse_loci_to_remove=NULL,
# geno_max_sparsity_per_sample=NULL,
# geno_frac_topmost_sparse_samples_to_remove=NULL,
# geno_n_topmost_sparse_samples_to_remove=NULL,
# pheno_sep="\t",
# pheno_header=TRUE,
# pheno_idx_col_id=1,
# pheno_idx_col_pop=2,
# pheno_idx_col_y=4,
# pheno_na_strings=c("", "-", "NA", "na", "NaN", "missing", "MISSING"),
# pheno_bool_remove_NA=FALSE,
# bool_within=TRUE,
# bool_across=TRUE,
# n_folds=5,
# n_reps=2,
# vec_models_to_test=c("ridge","lasso","elastic_net","Bayes_A","Bayes_B","Bayes_C","gBLUP"),
# bool_parallel=TRUE,
# max_mem_Gb=60,
# n_threads=32,
# verbose=TRUE
# )

0 comments on commit 4c80d8f

Please sign in to comment.