Skip to content

Commit

Permalink
drafted metrics.R + working on models.R now
Browse files Browse the repository at this point in the history
  • Loading branch information
jeffersonfparil committed May 31, 2024
1 parent 5dc51ff commit 1f6575e
Show file tree
Hide file tree
Showing 8 changed files with 236 additions and 185 deletions.
3 changes: 2 additions & 1 deletion .Rbuildignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
.github/
.github/
^simulated_*
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ Suggests:
Remotes:
jeffersonfparil/simquantgen
Imports:
methods,
vcfR,
parallel,
cluster,
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,6 @@ export(fn_filter_phenotype)
export(fn_load_genotype)
export(fn_load_phenotype)
export(fn_merge_genotype_and_phenotype)
export(fn_prediction_performance_metrics)
export(fn_simulate_data)
export(fn_vcf_to_G)
29 changes: 14 additions & 15 deletions R/load.R
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
suppressWarnings(suppressPackageStartupMessages(library(simquantgen)))
suppressWarnings(suppressPackageStartupMessages(library(vcfR)))
suppressWarnings(suppressPackageStartupMessages(library(txtplot)))
suppressWarnings(suppressPackageStartupMessages(library(testthat)))

### Create an error class
methods::setClass("gpError", representation(code="numeric", message="character"), prototype(code=0, message="Empty error message. Please define."))
### Create an error chaining method
methods::setGeneric("chain", function(x, y){
methods::standardGeneric("chain")
standardGeneric("chain")
})
methods::setMethod(f="chain",
signature=c(x="gpError", y="gpError"),
Expand Down Expand Up @@ -695,10 +694,10 @@ fn_classify_allele_frequencies = function(G, ploidy=2, verbose=FALSE) {
#' @param l number of loci (Default=1000)
#' @param ploidy ploidy level which can refer to the number of haploid genomes to simulate pools (Default=2)
#' @param n_alleles macimum number of alleles per locus (Default=2)
#' @param min_depth minimum depth per locus [Default=5]
#' @param max_depth maximum depth per locus [Default=5000]
#' @param n_pop number of randomly assigned population groupings [Default=1]
#' @param seed randomisation seed for replicability [Default=12345]
#' @param min_depth minimum depth per locus (Default=5)
#' @param max_depth maximum depth per locus (Default=5000)
#' @param n_pop number of randomly assigned population groupings (Default=1)
#' @param seed randomisation seed for replicability (Default=12345)
#' @param save_pheno_tsv save the phenotype data as a tab-delimited file? (Default=TRUE)
#' @param save_geno_vcf save the genotype data as a vcf file? (Default=TRUE)
#' @param save_geno_tsv save the genotype data as a tab-delimited allele frequency table file? (Default=FALSE)
Expand Down Expand Up @@ -955,16 +954,16 @@ fn_load_genotype = function(fname_geno, ploidy=NULL, retain_minus_one_alleles_pe
#' @param maf minimum allele frequency (Default=0.01)
#' @param sdev_min minimum allele frequency standard deviation (constant allele frequency across samples is just another intercept) (Default=0.0001)
#' @param fname_snp_list name of the file containing the list of expected SNPs including their coordinates and alleles.
#' This is a tab-delimited file with 3 columns: '#CHROM', 'POS', 'REF,ALT', corresponding to([Default=NULL)
#' This is a tab-delimited file with 3 columns: '#CHROM', 'POS', 'REF,ALT', corresponding to (Default=NULL)
#' chromosome names (e.g. 'chr1' & 'chrom_1'),
#' numeric positions (e.g. 12345 & 100001), and
#' reference-alternative allele strings separated by a comma (e.g. 'A,T' & 'allele_1,allele_alt')([Default=NULL)
#' @param max_sparsity_per_locus maximum mean sparsity per locus, e.g. 0.1 or 0.5([Default=NULL)
#' @param frac_topmost_sparse_loci_to_remove fraction of the top-most sparse loci to remove, e.g. 0.01 or 0.25([Default=NULL)
#' @param n_topmost_sparse_loci_to_remove number of top-most sparse loci to remove, e.g. 100 or 1000([Default=NULL)
#' @param max_sparsity_per_sample maximum mean sparsity per sample, e.g. 0.3 or 0.5([Default=NULL)
#' @param frac_topmost_sparse_samples_to_remove fraction of the top-most sparse samples to remove, e.g. 0.01 or 0.05([Default=NULL)
#' @param n_topmost_sparse_samples_to_remove number of top-most sparse samples to remove, e.g. 5 or 10([Default=NULL)
#' reference-alternative allele strings separated by a comma (e.g. 'A,T' & 'allele_1,allele_alt') (Default=NULL)
#' @param max_sparsity_per_locus maximum mean sparsity per locus, e.g. 0.1 or 0.5 (Default=NULL)
#' @param frac_topmost_sparse_loci_to_remove fraction of the top-most sparse loci to remove, e.g. 0.01 or 0.25 (Default=NULL)
#' @param n_topmost_sparse_loci_to_remove number of top-most sparse loci to remove, e.g. 100 or 1000 (Default=NULL)
#' @param max_sparsity_per_sample maximum mean sparsity per sample, e.g. 0.3 or 0.5 (Default=NULL)
#' @param frac_topmost_sparse_samples_to_remove fraction of the top-most sparse samples to remove, e.g. 0.01 or 0.05 (Default=NULL)
#' @param n_topmost_sparse_samples_to_remove number of top-most sparse samples to remove, e.g. 5 or 10 (Default=NULL)
#' @param verbose show genotype filtering messages? (Default=FALSE)
#' @returns
#' Ok: numeric n samples x p loci-alleles matrix of allele frequencies with non-null row and column names.
Expand Down Expand Up @@ -1274,7 +1273,7 @@ fn_filter_genotype = function(G, maf=0.01, sdev_min=0.0001,
#' Load phenotype data from a text-delimited file
#'
#' @param fname_pheno filname of the text-delimited phenotype data
#' @param sep column-delimiter in the phenotype file (Default='\t')
#' @param sep column-delimiter in the phenotype file (Default="\t")
#' @param header does the phenotype file have a header line? (Default=TRUE)
#' @param idx_col_id which column correspond to the sample/entry/pool/genotype names? (Default=1)
#' @param idx_col_pop which column correspond to the population groupings? (Default=2)
Expand Down
121 changes: 73 additions & 48 deletions R/metrics.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
# source("R/load.R")

#' Genomic prediction metrics
#' Add more metrics which we may be interested in.
#' NOTES:
#' - the number of non-zero estimated marker effects are extracted within each model, i.e. in `models.R`
#' - the run-time of each genomic prediction and validation on a single set is measured during each cross-validation replicate, i.e in `cross-validation.R`
#' - Add more metrics which we may be interested in. while keeping the function signatures, i.e. input and output consistent
#' - The number of non-zero estimated marker effects are extracted within each model, i.e. in `models.R`
#' - Run time of each genomic prediction and validation on a single set is measured during each cross-validation replicate, i.e in `cross-validation.R`
#'
#' @param y_true numeric vector observed phenotype values
#' @param y_pred numeric vector predicted phenotype values
Expand All @@ -15,50 +17,91 @@
#' $power_t10: fraction of observed top 10 phenotype values correctly predicted
#' $power_b10: fraction of observed bottom 10 phenotype values correctly predicted
#' $h2: estimated narrow-sense heritability weighted by Pearson's correlation between observed and predicted phenotype values
#' @example
#' Formula:
#' - h2_predicted = corr(observed, predicted) * h2_true
#' - Caveat: yields NA if the estimated heritability lies beyond 0 and 1
#' which happens when correlation between observed and predicted phenotypes is negative, and
#' the predicted phenotypes have higher variance than observed phenotypes.
#' This unexpected higher variance in the predictions implies that the genomic prediction model performs very poorly,
#' and therefore heritability estimates from them should not be considered.
#' Assumptions:
#' - correlation between predicted and true trait values relates the variance of the predicted trait values with the true heritability,
#' - predicted trait values are due to purely additive effects,
#' - variance in the predicted trait values is proportional to the additive genetic variance (~Va)
#' - variance in the true trait values represent the total phenotypic variance (Vp)
#' @examples
#' y_pred = stats::rnorm(100)
#' y_true = y_pred + rnorm(100)
#' list_metrics = fn_prediction_performance_metrics(y_true=y_true, y_pred=y_pred)
fn_prediction_performance_metrics = function(y_true, y_pred) {
#' y_true = y_pred + stats::rnorm(100)
#' list_metrics = fn_prediction_performance_metrics(y_true=y_true, y_pred=y_pred, verbose=TRUE)
#' @export
fn_prediction_performance_metrics = function(y_true, y_pred, verbose=FALSE) {
###################################################
### TEST
# y_true = stats::rnorm(100)
# y_pred = stats::rnorm(100); # y_pred = y_true + stats::rnorm(n=100, mean=0, stats::sd=0.1)
# # y_pred = rep(NA, length(y_true))
# verbose = TRUE
###################################################
y_true = as.vector(y_true)
y_pred = as.vector(y_pred)
if (length(y_true) != length(y_pred)) {
print("Please make sure the observed and predicted phenotype vectors are the same length.")
return(list(mbe=NA, mae=NA, rmse=NA, r2=NA, corr=NA, , power_t10=NA, power_b10=NA))
n = length(y_true)
if (n != length(y_pred)) {
error = methods::new("gpError",
code=000,
message=paste0(
"Error in metrics::fn_prediction_performance_metrics(...). ",
"The vector of observed phenotypes has a length of ", n,
" which does not match the length of the vector of predicted phenotypes with ",
length(y_pred), " elements."
))
return(error)
}
error = y_true - y_pred
mbe = mean(error)
mae = mean(abs(error))
rmse = sqrt(mean(error^2))
r2 = 1 - (sum(error^2) / sum((y_true-mean(y_true))^2))
corr = suppressWarnings(stats::cor(y_true, y_pred, method="pearson"))
if (is.na(corr)) {
corr = 0.0
mbe = mean(error, na.rm=TRUE)
mae = mean(abs(error), na.rm=TRUE)
rmse = sqrt(mean(error^2, na.rm=TRUE))
if (sum(is.na(error)) == n) {
r2 = NA
} else {
r2 = 1 - (sum(error^2, na.rm=TRUE) / sum((y_true-mean(y_true, na.rm=TRUE))^2, na.rm=TRUE))
}
corr = suppressWarnings(stats::cor(y_true, y_pred, method="pearson"))
### Power to select true top and bottom 10%
n = length(y_true)
n_top_or_bottom_10 = max(c(1, round(0.1*n)))
top10_dec_true = order(y_true, decreasing=TRUE)[1:n_top_or_bottom_10]
top10_dec_pred = order(y_pred, decreasing=TRUE)[1:n_top_or_bottom_10]
power_t10 = sum((top10_dec_true %in% top10_dec_pred)) / n_top_or_bottom_10
bottom10_dec_true = order(y_true, decreasing=FALSE)[1:n_top_or_bottom_10]
bottom10_dec_pred = order(y_pred, decreasing=FALSE)[1:n_top_or_bottom_10]
power_b10 = sum((bottom10_dec_true %in% bottom10_dec_pred)) / n_top_or_bottom_10
if (sum(is.na(error)) == n) {
power_t10 = NA
power_b10 = NA
} else {
top10_dec_pred = order(y_pred, decreasing=TRUE)[1:n_top_or_bottom_10]
bottom10_dec_pred = order(y_pred, decreasing=FALSE)[1:n_top_or_bottom_10]
power_t10 = sum((top10_dec_true %in% top10_dec_pred)) / n_top_or_bottom_10
power_b10 = sum((bottom10_dec_true %in% bottom10_dec_pred)) / n_top_or_bottom_10
}
### Narrow-sense heritability scaled by prediction accuracy in terms of Pearson's correlation
### Assumptions:
### - the predicted trait values are due to purely additive effects,
### - the variance in the predicted trait values is proportional to the additive genetic variance (~Va)
### - the variance in the true trait values represent the total phenotypic variance (Vp)
### - the correlation between predicted and true trait values is the factor that relates the variance of the predicted trait values with the true heritability, i.e. h2_predicted = corr(true,pred) * h2_true
h2 = round((stats::var(y_pred) / stats::var(y_true)) / corr, 10)
h2 = round((stats::var(y_pred, na.rm=TRUE) / stats::var(y_true)) / corr, 10)
if (!is.na(h2)) {
if ((h2 < 0.0) | (h2 > 1.0)) {
h2 = NA
}
} else {
h2 = 0.0
}
if (verbose) {
vec_idx = which(!is.na(y_true) & !is.na(y_pred))
if (length(vec_idx) > 0) {
print("Scatter plot of the observed and predicted phenotypes")
txtplot::txtplot(x=y_true, y=y_pred)
} else {
print("All pairs of phenotype values have missing data.")
}
print(paste0("Mean bias error (mbe) = ", mbe))
print(paste0("Mean absolute error (mae) = ", mae))
print(paste0("Root mean square error (rmse) = ", rmse))
print(paste0("Coefficient of determination (r2) = ", r2))
print(paste0("Power to identify top 10 (power_t10) = ", mbe))
print(paste0("Power to identify bottom 10 (power_b10) = ", power_b10))
print(paste0("Pearson's correlation (corr) = ", corr))
print(paste0("Narrow-sense heritability estimate (h2) = ", h2))
}
### Output
return(list(
Expand All @@ -71,21 +114,3 @@ fn_prediction_performance_metrics = function(y_true, y_pred) {
power_b10=power_b10,
h2=h2))
}

#####################################################################
############################### Tests ###############################
#####################################################################
tests_metrics = function() {
test_that(
"fn_prediction_performance_metrics", {
print("fn_prediction_performance_metrics:")
n = 100
y1 = 1:n
y2 = y1 + pi
m0 = fn_prediction_performance_metrics(y1, y1)
m1 = fn_prediction_performance_metrics(y1, y2)
expect_equal(m0, list(mbe=0, mae=0, rmse=0, r2=1, corr=1, power_t10=1, power_b10=1, h2=1))
expect_equal(m1, list(mbe=-pi, mae=pi, rmse=pi, r2=(1-(n*(pi^2)/sum((y1-mean(y1))^2))), corr=1, power_t10=1, power_b10=1, h2=1))
}
)
}
Loading

0 comments on commit 1f6575e

Please sign in to comment.