Skip to content

Commit

Permalink
modularising R shiny app
Browse files Browse the repository at this point in the history
  • Loading branch information
jeffersonfparil committed Jun 19, 2024
1 parent eff6ffd commit 6e9eb1f
Show file tree
Hide file tree
Showing 22 changed files with 409 additions and 2,124 deletions.
2 changes: 2 additions & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
.github/
.vscode/
exec/
inst/
_libs/
shell.nix
.*sqlite
9 changes: 4 additions & 5 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,7 @@ rsconnect/
_libs/

# Test files
exec/slurm-*.out
exec/output/
exec/tests/outdir/
exec/tests/output/
exec/tests/logfile
exec/

# SQL databases
*.sqlite
124 changes: 96 additions & 28 deletions R/io.R
Original file line number Diff line number Diff line change
Expand Up @@ -588,16 +588,20 @@ fn_G_to_vcf = function(G, min_depth=100, max_depth=1000, verbose=FALSE) {
#' @param retain_minus_one_alleles_per_locus omit the alternative or trailing allele per locus? (Default=TRUE)
#' @param verbose show vcf to allele frequency genotype matrix conversion messages? (Default=FALSE)
#' @returns
#' - Ok: named n samples x p biallelic loci matrix.
#' Row names can be any string of characters.
#' Column names need to be tab-delimited, where first element refers to the chromosome or scaffold name,
#' the second should be numeric which refers to the position in the chromosome/scaffold, and
#' subsequent elements are optional which may refer to the allele identifier and other identifiers.
#' - Ok:
#' + G: named n samples x p biallelic loci matrix of allele frequencies.
#' Row names can be any string of characters.
#' Column names need to be tab-delimited, where first element refers to the chromosome or scaffold name,
#' the second should be numeric which refers to the position in the chromosome/scaffold, and
#' subsequent elements are optional which may refer to the allele identifier and other identifiers.
#' + D: named n samples x p biallelic loci matrix of sequencing depth per locus with the same
#' row and column naming conventions as G but only includes the names of the reference alleles per locus.
#' - Err: gpError
#' @examples
#' G = simquantgen::fn_simulate_genotypes(verbose=TRUE)
#' vcf = fn_G_to_vcf(G=G, verbose=TRUE)
#' G_back = fn_vcf_to_G(vcf=vcf, verbose=TRUE)
#' list_G_D = fn_vcf_to_G(vcf=vcf, verbose=TRUE)
#' G_back = list_G_D$G
#' @export
fn_vcf_to_G = function(vcf, min_depth=0, max_depth=.Machine$integer.max, force_biallelic=TRUE, retain_minus_one_alleles_per_locus=TRUE, verbose=FALSE) {
###################################################
Expand All @@ -606,6 +610,7 @@ fn_vcf_to_G = function(vcf, min_depth=0, max_depth=.Machine$integer.max, force_b
# vcf = fn_G_to_vcf(G=G, verbose=TRUE)
# min_depth = 10
# max_depth = 500
# force_biallelic = TRUE
# retain_minus_one_alleles_per_locus = TRUE
# verbose = TRUE
###################################################
Expand Down Expand Up @@ -682,7 +687,8 @@ fn_vcf_to_G = function(vcf, min_depth=0, max_depth=.Machine$integer.max, force_b
mat_depth = vcfR::extract.gt(vcf, element="DP", as.numeric=TRUE)
if (verbose) {
print("Distribution of allele depths:")
txtplot::txtdensity(mat_depth[!is.na(mat_depth)])
vec_sample_depths = sample(mat_depth, size=min(1e4, c(prod(dim(mat_depth)))))
txtplot::txtdensity(vec_sample_depths[!is.na(vec_sample_depths)])
}
mat_idx = (mat_depth < min_depth) | ((mat_depth > max_depth))
if (sum(mat_idx, na.rm=TRUE) > 0) {
Expand All @@ -691,12 +697,13 @@ fn_vcf_to_G = function(vcf, min_depth=0, max_depth=.Machine$integer.max, force_b
print(vcf)
}
vcf@gt[, -1][mat_idx] = NA
mat_depth = vcfR::extract.gt(vcf, element="DP", as.numeric=TRUE)
if (verbose) {
print(paste0("After defining missing data by minimum and maximum depths:"))
print(vcf)
print("Distribution of allele depths after defining missing data by minimum and maximum depths:")
mat_depth = vcfR::extract.gt(vcf, element="DP", as.numeric=TRUE)
txtplot::txtdensity(mat_depth[!is.na(mat_depth)])
vec_sample_depths = sample(mat_depth, size=min(1e4, c(prod(dim(mat_depth)))))
txtplot::txtdensity(vec_sample_depths[!is.na(vec_sample_depths)])
}
}
### Extract genotype data where the AD field takes precedence over the GT field
Expand Down Expand Up @@ -759,8 +766,12 @@ fn_vcf_to_G = function(vcf, min_depth=0, max_depth=.Machine$integer.max, force_b
print(paste0(c("q_min=", "q_max="), round(range(G, na.rm=TRUE), 4)))
txtplot::txtdensity(G[!is.na(G)])
}
### Output n x p matrix of allele frequencies
return(t(G))
### Output n x p matrix of allele frequencies and matrix of depths
rownames(mat_depth) = vec_loci_ref_names
colnames(mat_depth) = vec_pool_names
return(list(
G=t(G),
D=t(mat_depth)))
}

#' Classify or bin allele frequencies into genotype classes.
Expand Down Expand Up @@ -1089,7 +1100,8 @@ fn_load_genotype = function(fname_geno, ploidy=NULL, force_biallelic=TRUE, retai
### VCF file ###
################
vcf = vcfR::read.vcfR(fname_geno, verbose=TRUE)
G = fn_vcf_to_G(vcf, force_biallelic=force_biallelic, retain_minus_one_alleles_per_locus=retain_minus_one_alleles_per_locus, min_depth=min_depth, max_depth=max_depth, verbose=verbose)
list_G_D = fn_vcf_to_G(vcf, force_biallelic=force_biallelic, retain_minus_one_alleles_per_locus=retain_minus_one_alleles_per_locus, min_depth=min_depth, max_depth=max_depth, verbose=verbose)
G = list_G_D$G
if (methods::is(G, "gpError")) {
error = chain(G, methods::new("gpError",
code=000,
Expand All @@ -1100,6 +1112,7 @@ fn_load_genotype = function(fname_geno, ploidy=NULL, force_biallelic=TRUE, retai
} else {
if (verbose) {print("Genotype loaded from a VCF file.")}
rm("vcf")
rm("list_G_D")
gc()
return(G)
}
Expand Down Expand Up @@ -1861,6 +1874,16 @@ fn_load_phenotype = function(fname_pheno, sep="\t", header=TRUE,
"These too: ", paste(utils::tail(y), collapse=", "), "?"))
return(error)
}
### Emit an error if there is no phenotypic variance
if (stats::var(y, na.rm=TRUE) < .Machine$double.eps) {
error = methods::new("gpError",
code=000,
message=paste0(
"Error in io::fn_load_phenotype(...). ",
"No variance in phenotype data. ",
"We require variance because without it, we are lost in the dark without even a match to guide us out."))
return(error)
}
names(y) = entry
if (verbose) {
print("Distribution of phenotype data:")
Expand All @@ -1880,6 +1903,8 @@ fn_load_phenotype = function(fname_pheno, sep="\t", header=TRUE,
#' (1) $y: a named vector of numeric phenotype data,
#' (2) $pop: population or groupings corresponding to each element of y, and
#' (3) $trait_name: name of the trait.
#' @param remove_outliers remove missing data in the y vector?
#' If true, this removes the corresponding element/s in the pop vector. (Default=TRUE)
#' @param remove_NA remove missing data in the y vector?
#' If true, this removes the corresponding element/s in the pop vector. (Default=FALSE)
#' @param verbose show phenotype filtering messages? (Default=FALSE)
Expand All @@ -1896,7 +1921,7 @@ fn_load_phenotype = function(fname_pheno, sep="\t", header=TRUE,
#' list_pheno$y[2] = NA
#' list_pheno_filtered = fn_filter_phenotype(list_pheno, remove_NA=TRUE, verbose=TRUE)
#' @export
fn_filter_phenotype = function(list_pheno, remove_NA=FALSE, verbose=FALSE) {
fn_filter_phenotype = function(list_pheno, remove_outliers=TRUE, remove_NA=FALSE, verbose=FALSE) {
###################################################
### TEST
# list_sim = fn_simulate_data(n_pop=3, verbose=TRUE)
Expand Down Expand Up @@ -1933,21 +1958,23 @@ fn_filter_phenotype = function(list_pheno, remove_NA=FALSE, verbose=FALSE) {
return(error)
}
### Identify outliers with graphics::boxplot, i.e. values beyond -2.698 standard deviations (definition of R::graphics::boxplot whiskers)
b = graphics::boxplot(list_pheno$y, plot=FALSE)
vec_idx = which(list_pheno$y %in% b$out)
if (length(vec_idx) == 0) {
if (verbose) {"The phenotype data do not have any outliers."}
} else {
if (verbose) {
print("Before removing outlier/s:")
print(paste0("n=", n))
txtplot::txtdensity(list_pheno$y[!is.na(list_pheno$y) & !is.infinite(list_pheno$y)])
}
list_pheno$y[vec_idx] = NA
if (verbose) {
print("After removing outlier/s:")
print(paste0("n=", length(list_pheno$y)))
txtplot::txtdensity(list_pheno$y[!is.na(list_pheno$y) & !is.infinite(list_pheno$y)])
if (remove_outliers) {
b = graphics::boxplot(list_pheno$y, plot=FALSE)
vec_idx = which(list_pheno$y %in% b$out)
if (length(vec_idx) == 0) {
if (verbose) {"The phenotype data do not have any outliers."}
} else {
if (verbose) {
print("Before removing outlier/s:")
print(paste0("n=", n))
txtplot::txtdensity(list_pheno$y[!is.na(list_pheno$y) & !is.infinite(list_pheno$y)])
}
list_pheno$y[vec_idx] = NA
if (verbose) {
print("After removing outlier/s:")
print(paste0("n=", length(list_pheno$y)))
txtplot::txtdensity(list_pheno$y[!is.na(list_pheno$y) & !is.infinite(list_pheno$y)])
}
}
}
### Removing missing data
Expand All @@ -1970,6 +1997,16 @@ fn_filter_phenotype = function(list_pheno, remove_NA=FALSE, verbose=FALSE) {
}
}
}
### Emit an error if there is no phenotypic variance after filtering
if (stats::var(list_pheno$y, na.rm=TRUE) < .Machine$double.eps) {
error = methods::new("gpError",
code=000,
message=paste0(
"Error in io::fn_filter_phenotype(...). ",
"No variance in phenotype data after filtering. ",
"Consider transforming your phenotype data."))
return(error)
}
return(list_pheno)
}

Expand Down Expand Up @@ -2342,3 +2379,34 @@ fn_estimate_memory_footprint = function(X, n_models=7, n_folds=10, n_reps=10,
size_total=size_total,
n_threads=n_threads))
}




### Getting frustrated with how the data are stored and labelled.
### Let's use a proper database for direct I/O in and out of R:
fn_load_into_database = function() {

mydb = DBI::dbConnect(RSQLite::SQLite(), "test.sqlite")
DBI::dbDisconnect(mydb)

mydb = DBI::dbConnect(RSQLite::SQLite(), "")
DBI::dbWriteTable(mydb, "mtcars", mtcars)
DBI::dbWriteTable(mydb, "iris", iris)
DBI::dbListTables(mydb)

DBI::dbGetQuery(mydb, 'SELECT * FROM mtcars LIMIT 5')
DBI::dbGetQuery(mydb, 'SELECT * FROM iris WHERE "Sepal.Length" < 4.6')
DBI::dbGetQuery(mydb, 'SELECT * FROM iris WHERE "Sepal.Length" < :x', params=list(x=4.6))

rs = DBI::dbSendQuery(mydb, 'SELECT * FROM mtcars')
while (!DBI::dbHasCompleted(rs)) {
df = DBI::dbFetch(rs, n=10)
print(nrow(df))
}

rs = DBI::dbSendQuery(mydb, 'SELECT * FROM iris WHERE "Sepal.Length" = :x')
DBI::dbBind(rs, params=list(x=seq(4, 4.4, by=0.1)))
nrow(DBI::dbFetch(rs))

}
5 changes: 5 additions & 0 deletions R/main.R
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
#' (see ?fn_load_phenotype for details)
#' - $pheno_na_strings: strings of characters corresponding to missing data in the phenotype file
#' (see ?fn_load_phenotype for details)
#' - $pheno_bool_remove_outliers: remove outliers from the phenotype file?
#' - $pheno_bool_remove_NA: remove samples missing phenotype data in the phenotype file?
#' (see ?fn_load_phenotype for details)
#' - $bool_within: perform within population k-fold cross-validation?
Expand Down Expand Up @@ -236,6 +237,7 @@
#' pheno_idx_col_pop=2,
#' pheno_idx_col_y=3,
#' pheno_na_strings=c("", "-", "NA", "na", "NaN", "missing", "MISSING"),
#' pheno_bool_remove_outliers=TRUE,
#' pheno_bool_remove_NA=FALSE,
#' bool_within=TRUE,
#' bool_across=TRUE,
Expand Down Expand Up @@ -285,6 +287,7 @@ gp = function(args) {
# pheno_idx_col_pop=2,
# pheno_idx_col_y=3,
# pheno_na_strings=c("", "-", "NA", "na", "NaN", "missing", "MISSING"),
# pheno_bool_remove_outliers=FALSE,
# pheno_bool_remove_NA=FALSE,
# bool_within=TRUE,
# bool_across=TRUE,
Expand Down Expand Up @@ -341,6 +344,7 @@ gp = function(args) {
gc()
list_pheno = fn_filter_phenotype(
list_pheno=list_pheno,
remove_outliers=args$pheno_bool_remove_outliers,
remove_NA=args$pheno_bool_remove_NA,
verbose=args$verbose
)
Expand Down Expand Up @@ -458,6 +462,7 @@ gp = function(args) {
gc()
list_pheno = fn_filter_phenotype(
list_pheno=list_merged$list_pheno,
remove_outliers=args$pheno_bool_remove_outliers,
remove_NA=args$pheno_bool_remove_NA,
verbose=args$verbose
)
Expand Down
95 changes: 0 additions & 95 deletions exec/00_gp_slurm_job_wrapper-LUCERNE.sh

This file was deleted.

Loading

0 comments on commit 6e9eb1f

Please sign in to comment.