Skip to content

Commit

Permalink
working on models + will need to reestimate maximum number of threads…
Browse files Browse the repository at this point in the history
… for within, and various across population cross-validations
  • Loading branch information
jeffersonfparil committed May 31, 2024
1 parent 09b7491 commit 61ba298
Show file tree
Hide file tree
Showing 11 changed files with 129 additions and 47 deletions.
3 changes: 1 addition & 2 deletions .Rbuildignore
Original file line number Diff line number Diff line change
@@ -1,2 +1 @@
.github/
^simulated_*
.github/
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -46,4 +46,4 @@ docs/
po/*~

# RStudio Connect folder
rsconnect/
rsconnect/
3 changes: 0 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,6 @@ Imports:
methods,
vcfR,
parallel,
cluster,
mgcv,
nlme,
glmnet,
BGLR,
sommer,
Expand Down
8 changes: 8 additions & 0 deletions R/cross_validation.R
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,14 @@ fn_within_across_perse_genomic_prediction = function(G, idx_col_y, args, dir_tmp
print("##################################################")
print("Across population cross-validation:")
print("(Leave-one-population-out CV)")



### REDUCE NUMBER OF THREADS HERE ARE DATA SIZE differs from WITHIN POP CV!
### USE MAXIMUM NUMBER OF THREADS ESTIMATION



list_out = fn_leave_one_population_out_cross_validation(G=G,
y=y,
pop=pop,
Expand Down
20 changes: 12 additions & 8 deletions R/load.R
Original file line number Diff line number Diff line change
Expand Up @@ -693,7 +693,7 @@ fn_classify_allele_frequencies = function(G, ploidy=2, verbose=FALSE) {
#' @param n number of samples (Default=100)
#' @param l number of loci (Default=1000)
#' @param ploidy ploidy level which can refer to the number of haploid genomes to simulate pools (Default=2)
#' @param n_alleles macimum number of alleles per locus (Default=2)
#' @param n_alleles maximum number of alleles per locus (Default=2)
#' @param min_depth minimum depth per locus (Default=5)
#' @param max_depth maximum depth per locus (Default=5000)
#' @param n_pop number of randomly assigned population groupings (Default=1)
Expand Down Expand Up @@ -755,10 +755,11 @@ fn_simulate_data = function(n=100, l=1000, ploidy=2, n_alleles=2, min_depth=5, m
fname_geno_rds = NULL
fname_pheno_tsv = NULL
### Define date-time-random-number identifier so that we minimise the possibility of unintentional over-writing of the output file/s
date = gsub("-", "", gsub("[.]", "", gsub(":", "", gsub(" ", "", as.character(Sys.time())))))
# date = gsub("-", "", gsub("[.]", "", gsub(":", "", gsub(" ", "", as.character(Sys.time())))))
### Save phenotype file
if (save_pheno_tsv) {
fname_pheno_tsv = file.path(getwd(), paste0("simulated_phenotype-", date, ".tsv"))
# fname_pheno_tsv = file.path(getwd(), paste0("simulated_phenotype-", date, ".tsv"))
fname_pheno_tsv = tempfile(fileext=".tsv")
utils::write.table(df, file=fname_pheno_tsv, sep="\t", row.names=FALSE, col.names=TRUE, quote=FALSE)
if (verbose) {
print("Output phenotype file (tsv):")
Expand All @@ -777,7 +778,8 @@ fn_simulate_data = function(n=100, l=1000, ploidy=2, n_alleles=2, min_depth=5, m
error = chain(vcf, error)
return(error)
}
fname_geno_vcf = file.path(getwd(), paste0("simulated_genotype-", date, ".vcf.gz"))
# fname_geno_vcf = file.path(getwd(), paste0("simulated_genotype-", date, ".vcf.gz"))
fname_geno_vcf = tempfile(fileext=".vcf.gz")
vcfR::write.vcf(vcf, file=fname_geno_vcf)
if (verbose) {
print("Output genotype file (sync.gz):")
Expand All @@ -787,7 +789,8 @@ fn_simulate_data = function(n=100, l=1000, ploidy=2, n_alleles=2, min_depth=5, m
if (save_geno_tsv) {
list_ids_chr_pos_all = fn_G_extract_names(mat_genotypes=G, verbose=verbose)
df_geno = data.frame(chr=list_ids_chr_pos_all$vec_chr, pos=list_ids_chr_pos_all$vec_pos, allele=list_ids_chr_pos_all$vec_all, t(G))
fname_geno_tsv = file.path(getwd(), paste0("simulated_genotype-", date, ".tsv"))
# fname_geno_tsv = file.path(getwd(), paste0("simulated_genotype-", date, ".tsv"))
fname_geno_tsv = tempfile(fileext=".tsv")
utils::write.table(df_geno, file=fname_geno_tsv, sep="\t", row.names=FALSE, col.names=TRUE, quote=FALSE)
if (verbose) {
print("Output genotype file (tsv):")
Expand All @@ -796,7 +799,8 @@ fn_simulate_data = function(n=100, l=1000, ploidy=2, n_alleles=2, min_depth=5, m
}
if (save_geno_rds) {
### Convert loci names so that the chromosome, position and allele are tab-delimited
fname_geno_rds = file.path(getwd(), paste0("simulated_genotype-", date, ".Rds"))
# fname_geno_rds = file.path(getwd(), paste0("simulated_genotype-", date, ".Rds"))
fname_geno_rds = tempfile(fileext=".Rds")
### Revert to numeric if we have more than 52 alleles (limit of the Latin alphabet)
if (!non_numeric_Rds) {
saveRDS(G, file=fname_geno_rds)
Expand Down Expand Up @@ -987,7 +991,7 @@ fn_load_genotype = function(fname_geno, ploidy=NULL, retain_minus_one_alleles_pe
#' REF_ALT=paste0("allele_1,allele_alt"))
#' df_snp_list$REF_ALT[1:100] = "allele_2,allele_4"
#' colnames(df_snp_list) = c("#CHROM", "POS", "REF,ALT")
#' fname_snp_list = "tmp_snp_list.txt"
#' fname_snp_list = tempfile(fileext=".snplist")
#' utils::write.table(df_snp_list, file=fname_snp_list, sep="\t",
#' row.names=FALSE, col.names=TRUE, quote=FALSE)
#' ### Filter
Expand Down Expand Up @@ -1021,7 +1025,7 @@ fn_filter_genotype = function(G, maf=0.01, sdev_min=0.0001,
# df_snp_list = data.frame(CHROM=mat_loci[,1], POS=as.numeric(mat_loci[,2]), REF_ALT=paste0("allele_1,allele_alt"))
# df_snp_list$REF_ALT[1:100] = "allele_2,allele_4"
# colnames(df_snp_list) = c("#CHROM", "POS", "REF,ALT")
# fname_snp_list = "tmp_snp_list.txt"
# fname_snp_list = tempfile(fileext=".snplist")
# utils::write.table(df_snp_list, file=fname_snp_list, sep="\t", row.names=FALSE, col.names=TRUE, quote=FALSE)
# verbose = TRUE
###################################################
Expand Down
10 changes: 9 additions & 1 deletion R/metrics.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
#' $corr: Pearson's product moment correlation
#' $power_t10: fraction of observed top 10 phenotype values correctly predicted
#' $power_b10: fraction of observed bottom 10 phenotype values correctly predicted
#' $var_pred: variance of predicted phenotype values (estimator of additive genetic variance)
#' $var_true: variance of observed phenotype values (estimator of total phenotypic variance)
#' $h2: estimated narrow-sense heritability weighted by Pearson's correlation between observed and predicted phenotype values
#' Formula:
#' - h2_predicted = corr(observed, predicted) * h2_true
Expand Down Expand Up @@ -80,7 +82,9 @@ fn_prediction_performance_metrics = function(y_true, y_pred, verbose=FALSE) {
power_b10 = sum((bottom10_dec_true %in% bottom10_dec_pred)) / n_top_or_bottom_10
}
### Narrow-sense heritability scaled by prediction accuracy in terms of Pearson's correlation
h2 = round((stats::var(y_pred, na.rm=TRUE) / stats::var(y_true)) / corr, 10)
var_pred = stats::var(y_pred, na.rm=TRUE)
var_true = stats::var(y_true, na.rm=TRUE)
h2 = round((var_pred/var_true) / corr, 10)
if (!is.na(h2)) {
if ((h2 < 0.0) | (h2 > 1.0)) {
h2 = NA
Expand All @@ -101,6 +105,8 @@ fn_prediction_performance_metrics = function(y_true, y_pred, verbose=FALSE) {
print(paste0("Power to identify top 10 (power_t10) = ", mbe))
print(paste0("Power to identify bottom 10 (power_b10) = ", power_b10))
print(paste0("Pearson's correlation (corr) = ", corr))
print(paste0("Variance of predicted phenotypes (var_pred) = ", var_pred, " (estimator of additive genetic variance)"))
print(paste0("Variance of observed phenotypes (var_true) = ", var_true, " (estimator of total phenotypic variance)"))
print(paste0("Narrow-sense heritability estimate (h2) = ", h2))
}
### Output
Expand All @@ -112,5 +118,7 @@ fn_prediction_performance_metrics = function(y_true, y_pred, verbose=FALSE) {
corr=corr,
power_t10=power_t10,
power_b10=power_b10,
var_pred=var_pred,
var_true=var_true,
h2=h2))
}
118 changes: 91 additions & 27 deletions R/models.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,29 +12,60 @@ suppressWarnings(suppressPackageStartupMessages(library(sommer)))
### 4.) vector of numeric indexes corresponding to the index of the validation set in the genotype matrix and phenotype vector
### 5.) a list of other parameters required by the internal functions

fn_ols = function(list_merged, vec_idx_training, vec_idx_validation, other_params=list(diag_inflate=1e-4)) {
#' Ordinary least squares model
#'
#' @param list_merged list of merged genotype matrix, and phenotype vector, as well as an optional covariate matrix
#' $G: numeric n samples x p loci-alleles matrix of allele frequencies with non-null row and column names.
#' Row names can be any string of characters which identify the sample or entry or pool names.
#' Column names need to be tab-delimited, where first element refers to the chromosome or scaffold name,
#' the second should be numeric which refers to the position in the chromosome/scaffold, and
#' subsequent elements are optional which may refer to the allele identifier and other identifiers.
#' $list_pheno:
#' $y: named vector of numeric phenotype data
#' $pop: population or groupings corresponding to each element of y
#' $trait_name: name of the trait
#' $COVAR: numeric n samples x k covariates matrix with non-null row and column names.
#' @param vec_idx_training vector of numeric indexes referring to the training set
#' @param vec_idx_validation vector of numeric indexes referring to the validation set
fn_ols = function(list_merged, vec_idx_training, vec_idx_validation, other_params=list(diag_inflate=1e-4), verbose=FALSE) {
###################################################
### TEST
list_sim = fn_simulate_data(n_pop=3, verbose=TRUE)
G = fn_load_genotype(fname_geno=list_sim$fname_geno_vcf)
list_pheno = fn_load_phenotype(fname_pheno=list_sim$fname_pheno_tsv)
COVAR = G %*% t(G); # rownames(COVAR) = NULL
list_merged = fn_merge_genotype_and_phenotype(G=G, list_pheno=list_pheno, COVAR=COVAR, verbose=TRUE)
n = nrow(list_merged$G)
vec_idx_training = sample(c(1:n), floor(n/2))
vec_idx_validation = c(1:n)[!(c(1:n) %in% vec_idx_training)]
other_params=list(diag_inflate=1e-4)
# list_sim = fn_simulate_data(n_pop=3, verbose=TRUE)
# G = fn_load_genotype(fname_geno=list_sim$fname_geno_vcf)
# list_pheno = fn_load_phenotype(fname_pheno=list_sim$fname_pheno_tsv)
# COVAR = G %*% t(G); # rownames(COVAR) = NULL
# list_merged = fn_merge_genotype_and_phenotype(G=G, list_pheno=list_pheno, COVAR=COVAR, verbose=TRUE)
# n = nrow(list_merged$G)
# vec_idx_training = sample(c(1:n), floor(n/2))
# vec_idx_validation = c(1:n)[!(c(1:n) %in% vec_idx_training)]
# other_params=list(diag_inflate=1e-6)
# verbose = TRUE
###################################################
if (methods::is(list_merged, "gpError")) {
error = chain(list_merged,
methods::new("gpError",
code=000,
message=paste0(
"Error in models::fn_ols(list_merged, vec_idx_training, vec_idx_validation, other_params). ",
"Error in models::fn_ols(...). ",
"Input data is an error type."
)))
return(error)
}
if (!is.logical(c(vec_idx_training, vec_idx_training)) |
(sum(is.na(c(vec_idx_training, vec_idx_training))) > 0) |
(min(c(vec_idx_training, vec_idx_training)) < 0) |
(max(c(vec_idx_training, vec_idx_training)) > nrow(list_merged$G))) {
error = methods::new("gpError",
code=000,
message=paste0(
"Error in models::fn_ols(...). ",
"Please make sure that the vector of indexes for the training and validation sets are not booleans. ",
"We require the indexes to be positive integers without missing values. ",
"Also, the maximum index (", max(c(vec_idx_training, vec_idx_training)),
") should be less than or equal to the total number of samples/entries/pools in the input data: (",
nrow(list_merged$G), ")."))
return(error)
}
X_training = list_merged$G[vec_idx_training, ]
y_training = list_merged$list_pheno$y[vec_idx_training]
X_validation = list_merged$G[vec_idx_validation, ]
Expand All @@ -44,35 +75,68 @@ fn_ols = function(list_merged, vec_idx_training, vec_idx_validation, other_param
X_training = cbind(list_merged$COVAR[vec_idx_training, ], X_training)
X_validation = cbind(list_merged$COVAR[vec_idx_validation, ], X_validation)
}

if (nrow(X_training) > ncol(X_training)) {
### Solve the least squares equations
n = nrow(X_training)
p = ncol(X_training)
if (n >= p) {
### Canonical least squares equations
A = t(X_training) %*% X_training
inv_A = tryCatch(solve(A), error=function(x) {
A_inv = tryCatch(solve(A), error=function(x) {
diag(A) = diag(A) + other_params$diag_inflate
x_inv = tryCatch(solve(A), error=function(x) {
NA
})
})
if (is.na(A_inv)) {
error = methods::new("gpError",
code=000,
message=paste0(
"Error in models::fn_ols(...). ",
"Failed to compute the inverse of X'X. ",
"Consider increasing the other_params$diag_inflate: ", other_params$diag_inflate,
" to force the calculation of the inverse of X'X."
))
return(error)
}
b = A_inv %*% t(X_training) %*% y_training
} else {
### Non-canonical least squares equations
A = X_training %*% t(X_training)
A_inv = tryCatch(solve(A), error=function(A) {
B = A
diag(B) = diag(B) + other_params$diag_inflate
out = tryCatch(solve(B), error=function(e){return(NA)})
return(out)
B_inv = tryCatch(solve(B), error=function(B) {
return(NA)
})
return(B_inv)
})
diag(A %*% inv_A)
if (is.na(inv_A)) {
if (is.na(A_inv)) {
error = methods::new("gpError",
code=000,
message=paste0(
"Error in models::fn_ols(list_merged, vec_idx_training, vec_idx_validation, other_params)"
"Error in models::fn_ols(...). ",
"Failed to compute the inverse of XX'. ",
"Consider increasing the other_params$diag_inflate: ", other_params$diag_inflate,
" to force the calculation of the inverse of XX'."
))
return(error)
}
b = t(X_training) %*% A_inv %*% y_training
}



A = t(X_training) %*% X_training
inv_A = solve()
b = inv_A %*% t(X_training) %*% y_training
y_pred = X_validation %*% b
df_y = merge(data.frame(id=rownames(y)[vec_idx_validation], true=y_validation), data.frame(id=rownames(y_pred), pred=y_pred), by="id")
perf = fn_prediction_performance_metrics(y_true=df_y$true, y_pred=df_y$pred)
df_y = merge(
data.frame(id=names(y_validation), true=y_validation),
data.frame(id=rownames(y_pred), pred=y_pred),
by="id")
perf = fn_prediction_performance_metrics(y_true=df_y$true, y_pred=df_y$pred, verbose=verbose)
perf$y_pred = y_pred
perf$b = b[,1]; names(perf$b) = colnames(X_training)
perf$n_non_zero = sum(abs(b) > .Machine$double.eps)
if (verbose) {
print("Allele effects distribution:")
txtplot::txtdensity(perf$b[!is.na(perf$b) & !is.infinite(perf$b)])
print(paste0("Number of non-zero effects: ", perf$n_non_zero, " (", round(100*p/perf$n_non_zero), "%)"))
}
return(perf)
}

Expand Down
2 changes: 1 addition & 1 deletion man/fn_filter_genotype.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/fn_simulate_data.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion tests/testthat/test-load.R
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ test_that("fn_filter_genotype", {
df_snp_list = data.frame(CHROM=mat_loci[,1], POS=as.numeric(mat_loci[,2]), REF_ALT=paste0("allele_1,allele_alt"))
df_snp_list$REF_ALT[1:n_sim_missing] = "allele_2,allele_4"
colnames(df_snp_list) = c("#CHROM", "POS", "REF,ALT")
fname_snp_list = "tmp_snp_list.txt"
fname_snp_list = tempfile(fileext=".snplist")
utils::write.table(df_snp_list, file=fname_snp_list, sep="\t", row.names=FALSE, col.names=TRUE, quote=FALSE)
### Filter with the alternative alleles
G_filtered_2 = fn_filter_genotype(G=G, maf=0.05, fname_snp_list=fname_snp_list, verbose=TRUE)
Expand Down
6 changes: 4 additions & 2 deletions tests/testthat/test-metrics.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ test_that("fn_prediction_performance_metrics", {
list_metrics_2 = fn_prediction_performance_metrics(y_true, y_pred)
list_metrics_3 = fn_prediction_performance_metrics(y_true, NA)
list_metrics_4 = fn_prediction_performance_metrics(y_true, rep(NA, length(y_true)))
expect_equal(list_metrics_1, list(mbe=0, mae=0, rmse=0, r2=1, corr=1, power_t10=1, power_b10=1, h2=1))
expect_equal(list_metrics_2, list(mbe=-pi, mae=pi, rmse=pi, r2=(1-(n*(pi^2)/sum((y_true-mean(y_true))^2))), corr=1, power_t10=1, power_b10=1, h2=1))
expect_equal(list_metrics_1, list(mbe=0, mae=0, rmse=0, r2=1, corr=1, power_t10=1, power_b10=1, var_pred=841.666666666666, var_true=841.666666666666, h2=1))
expect_equal(list_metrics_2, list(mbe=-pi, mae=pi, rmse=pi, r2=(1-(n*(pi^2)/sum((y_true-mean(y_true))^2))), corr=1, power_t10=1, power_b10=1, var_pred=841.666666666666, var_true=841.666666666666, h2=1))
expect_equal(methods::is(list_metrics_3, "gpError"), TRUE)
expect_equal(is.na(list_metrics_4$mbe), TRUE)
expect_equal(is.na(list_metrics_4$mae), TRUE)
Expand All @@ -20,5 +20,7 @@ test_that("fn_prediction_performance_metrics", {
expect_equal(is.na(list_metrics_4$corr), TRUE)
expect_equal(is.na(list_metrics_4$power_t10), TRUE)
expect_equal(is.na(list_metrics_4$power_b10), TRUE)
expect_equal(is.na(list_metrics_4$var_pred), TRUE)
expect_equal(list_metrics_4$var_true, 841.666666666666)
expect_equal(is.na(list_metrics_4$h2), TRUE)
})

0 comments on commit 61ba298

Please sign in to comment.