Skip to content

Commit

Permalink
Excluding missing phenotypes in the cross-validation and only adding them for genomic prediction per se, i.e. in the last step
Browse files Browse the repository at this point in the history
  • Loading branch information
jeffersonfparil committed Dec 13, 2024
1 parent cb0c908 commit 5f38085
Show file tree
Hide file tree
Showing 3 changed files with 130 additions and 138 deletions.
86 changes: 39 additions & 47 deletions R/cross_validation.R
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,7 @@ fn_cv_1 = function(i, list_merged, df_params, mat_idx_shuffle, vec_set_partition
"Error in cross_validation::fn_cv_1(...). ",
"Input data (list_merged) is an error type."
)))
cat(error$message)
return(error)
cat(error@message; return(error)
}
if ((i < 1) | (i > nrow(df_params))) {
error = methods::new("gpError",
Expand All @@ -145,8 +144,7 @@ fn_cv_1 = function(i, list_merged, df_params, mat_idx_shuffle, vec_set_partition
"Error in cross_validation::fn_cv_1(...). ",
"The index (i) of df_params is beyond the number of rows in df_params (may also be less than 1)."
))
cat(error$message)
return(error)
cat(error@message; return(error)
}
if (sum((colnames(df_params) == c("rep", "fold", "model"))) != 3) {
error = methods::new("gpError",
Expand All @@ -156,8 +154,7 @@ fn_cv_1 = function(i, list_merged, df_params, mat_idx_shuffle, vec_set_partition
"The data frame of parameters is incorrect. We are expecting the following columns in order: 'rep', 'fold', and 'model'.",
"The supplied data frame has the following columns or fields: ", paste(colnames(df_params), collapse=", ")
))
cat(error$message)
return(error)
cat(error@message; return(error)
}
if (nrow(mat_idx_shuffle) != nrow(list_merged$G)) {
error = methods::new("gpError",
Expand All @@ -168,8 +165,7 @@ fn_cv_1 = function(i, list_merged, df_params, mat_idx_shuffle, vec_set_partition
"does not match the number of samples in the input genotype and phenotype (and covariate) data (",
nrow(list_merged$G) , " rows)."
))
cat(error$message)
return(error)
cat(error@message; return(error)
}
if (ncol(mat_idx_shuffle) != max(df_params$rep)) {
error = methods::new("gpError",
Expand All @@ -179,8 +175,7 @@ fn_cv_1 = function(i, list_merged, df_params, mat_idx_shuffle, vec_set_partition
"The number of columns in the shuffling matrix (mat_idx_shuffle; ", ncol(mat_idx_shuffle), " columns) ",
"does not match the replications requested (", max(df_params$rep) , " replications)."
))
cat(error$message)
return(error)
cat(error@message; return(error)
}
if (length(vec_set_partition_groupings) != nrow(list_merged$G)) {
error = methods::new("gpError",
Expand All @@ -191,8 +186,7 @@ fn_cv_1 = function(i, list_merged, df_params, mat_idx_shuffle, vec_set_partition
length(vec_set_partition_groupings), " elements) does not match the number of samples in ",
"the input genotype and phenotype (and covariate) data (", nrow(list_merged$G) , " rows)."
))
cat(error$message)
return(error)
cat(error@message; return(error)
}
if (sum(range(vec_set_partition_groupings) == range(df_params$fold)) != 2) {
error = methods::new("gpError",
Expand All @@ -204,8 +198,7 @@ fn_cv_1 = function(i, list_merged, df_params, mat_idx_shuffle, vec_set_partition
"does not match the number of folds requested (fold ", min(df_params$fold), " to fold ",
max(df_params$fold), ")."
))
cat(error$message)
return(error)
cat(error@message; return(error)
}
### Define prefix of intermediate output files
if ((prefix_tmp == "") | is.na(prefix_tmp) | is.null(prefix_tmp)) {
Expand Down Expand Up @@ -240,8 +233,7 @@ fn_cv_1 = function(i, list_merged, df_params, mat_idx_shuffle, vec_set_partition
"Error in cross_validation::fn_cv_1(...). ",
"Unable to fit the model, ", model, " and/or assess genomic prediction accuracy."
)))
cat(error$message)
return(error)
cat(error@message; return(error)
}
### One-liner data frame of the prediction performance metrics
df_metrics = data.frame(
Expand Down Expand Up @@ -388,7 +380,7 @@ fn_cross_validation_preparation = function(list_merged, cv_type=1, n_folds=10, n
"Error in cross_validation::fn_cross_validation_preparation(...). ",
"Input data (list_merged) is an error type."
)))
return(error)
cat(error@message; return(error)
}
if (cv_type == 1) {
###############################
Expand All @@ -408,7 +400,7 @@ fn_cross_validation_preparation = function(list_merged, cv_type=1, n_folds=10, n
"Error in cross_validation::fn_cross_validation_preparation(...). ",
"The size of the data set is too small, n= ", n, "."
))
return(error)
cat(error@message; return(error)
}
vec_set_partition_groupings = rep(1:n_folds, each=set_size)
if (length(vec_set_partition_groupings) < n) {
Expand Down Expand Up @@ -449,7 +441,7 @@ fn_cross_validation_preparation = function(list_merged, cv_type=1, n_folds=10, n
"Cannot perform pairwise-population cross-validation (cv_type=2) ",
"because the number of populations (", n_folds, " populations) in the data set is not equal to 2."
))
return(error)
cat(error@message; return(error)
}
### No shuffling needed as cross-validation is not replicated
mat_idx_shuffle = matrix(1:n, ncol=1)
Expand Down Expand Up @@ -479,7 +471,7 @@ fn_cross_validation_preparation = function(list_merged, cv_type=1, n_folds=10, n
"Cannot perform leave-one-population-out cross-validation (cv_type=3) ",
"because there is only one population in the data set."
))
return(error)
cat(error@message; return(error)
}
### No shuffling needed as cross-validation is not replicated
mat_idx_shuffle = matrix(1:n, ncol=1)
Expand All @@ -504,7 +496,7 @@ fn_cross_validation_preparation = function(list_merged, cv_type=1, n_folds=10, n
" --> '2' for pairwise-population cross-validation, e.g. training on population A and validation on population B. ",
" --> '3' for leave-one-population-out cross-validation, e.g. training on populations 1 to 9 and validation on population 10."
))
return(error)
cat(error@message; return(error)
}
### Memory allocation error handling
if (methods::is(list_mem, "gpError")) {
Expand All @@ -515,7 +507,7 @@ fn_cross_validation_preparation = function(list_merged, cv_type=1, n_folds=10, n
"Failed to estimate memory allocation requirements for parallel computations ",
"and the maximum number of threads which can be used to avoid out-of-memory (OOM) error."
)))
return(error)
cat(error@message; return(error)
}
### Print the full list of cross-validation sets, replications and models combinations
if (verbose) {
Expand Down Expand Up @@ -653,7 +645,7 @@ fn_cross_validation_within_population = function(list_merged, n_folds=10, n_reps
"Error in cross_validation::fn_cross_validation_within_population(...). ",
"Input data (list_merged) is an error type."
)))
return(error)
cat(error@message; return(error)
}
### Define the output directory
if (!is.null(dir_output)) {
Expand All @@ -673,7 +665,7 @@ fn_cross_validation_within_population = function(list_merged, n_folds=10, n_reps
"Unable to create the output directory: ", dir_output, ". ",
"Please check your permissions to write into that directory."
))
return(error)
cat(error@message; return(error)
}
### Determine the number of populations
vec_populations = sort(unique(list_merged$list_pheno$pop))
Expand All @@ -691,7 +683,7 @@ fn_cross_validation_within_population = function(list_merged, n_folds=10, n_reps
"Error in cross_validation::fn_cross_validation_within_population(...). ",
"Failed to subset the data set."
)))
return(error)
cat(error@message; return(error)
}
### Define the cross-validation parameters as well as the maximum number of threads we can safely use in parallel
list_cv_params = fn_cross_validation_preparation(
Expand All @@ -709,7 +701,7 @@ fn_cross_validation_within_population = function(list_merged, n_folds=10, n_reps
"Error in cross_validation::fn_cross_validation_within_population(...). ",
"Failed to define the cross-validation parameters."
)))
return(error)
cat(error@message; return(error)
}
if (list_cv_params$list_mem$n_threads <= 1) {
if (verbose) {
Expand Down Expand Up @@ -741,7 +733,7 @@ fn_cross_validation_within_population = function(list_merged, n_folds=10, n_reps
"Please check re-run cross_validation::fn_cross_validation_within_population(...) with ",
"bool_parallel=FALSE to identify the error."
)))
return(error)
cat(error@message; return(error)
}
}
} else {
Expand All @@ -768,7 +760,7 @@ fn_cross_validation_within_population = function(list_merged, n_folds=10, n_reps
"fold: ", list_cv_params$df_params$fold[i], ", and ",
"model: ", list_cv_params$df_params$model[i], "."
)))
return(error)
cat(error@message; return(error)
}
eval(parse(text=paste0("list_list_perf$`", i, "` = list_perf")))
}
Expand All @@ -788,7 +780,7 @@ fn_cross_validation_within_population = function(list_merged, n_folds=10, n_reps
"fold: ", list_cv_params$df_params$fold[i], ", and ",
"model: ", list_cv_params$df_params$model[i], "."
)))
return(error)
cat(error@message; return(error)
}
if (is.null(df_metrics) & is.null(df_y_validation)) {
df_metrics = list_perf$df_metrics
Expand Down Expand Up @@ -976,7 +968,7 @@ fn_cross_validation_across_populations_bulk = function(list_merged, n_folds=10,
"Error in cross_validation::fn_cross_validation_across_populations_bulk(...). ",
"Input data (list_merged) is an error type."
)))
return(error)
cat(error@message; return(error)
}
### Define the output directory
if (!is.null(dir_output)) {
Expand All @@ -996,7 +988,7 @@ fn_cross_validation_across_populations_bulk = function(list_merged, n_folds=10,
"Unable to create the output directory: ", dir_output, ". ",
"Please check your permissions to write into that directory."
))
return(error)
cat(error@message; return(error)
}
### Check if we have more than 1 population
vec_populations = sort(unique(list_merged$list_pheno$pop))
Expand All @@ -1008,7 +1000,7 @@ fn_cross_validation_across_populations_bulk = function(list_merged, n_folds=10,
"Cannot perform bulked across populations cross-validation ",
"because there is only 1 population in the data set."
))
return(error)
cat(error@message; return(error)
}
### Define the cross-validation parameters as well as the maximum number of threads we can safely use in parallel
list_cv_params = fn_cross_validation_preparation(
Expand All @@ -1026,7 +1018,7 @@ fn_cross_validation_across_populations_bulk = function(list_merged, n_folds=10,
"Error in cross_validation::fn_cross_validation_within_population(...). ",
"Failed to define the cross-validation parameters."
)))
return(error)
cat(error@message; return(error)
}
if (list_cv_params$list_mem$n_threads <= 1) {
if (verbose) {
Expand Down Expand Up @@ -1059,7 +1051,7 @@ fn_cross_validation_across_populations_bulk = function(list_merged, n_folds=10,
"Please check re-run cross_validation::fn_cross_validation_across_populations_bulk(...) with ",
"bool_parallel=FALSE to identify the error."
)))
return(error)
cat(error@message; return(error)
}
}
} else {
Expand Down Expand Up @@ -1087,7 +1079,7 @@ fn_cross_validation_across_populations_bulk = function(list_merged, n_folds=10,
"fold: ", list_cv_params$df_params$fold[i], ", and ",
"model: ", list_cv_params$df_params$model[i], "."
)))
return(error)
cat(error@message; return(error)
}
eval(parse(text=paste0("list_list_perf$`", i, "` = list_perf")))
}
Expand Down Expand Up @@ -1241,7 +1233,7 @@ fn_cross_validation_across_populations_pairwise = function(list_merged,
"Error in cross_validation::fn_cross_validation_across_populations_pairwise(...). ",
"Input data (list_merged) is an error type."
)))
return(error)
cat(error@message; return(error)
}
### Define the output directory
if (!is.null(dir_output)) {
Expand All @@ -1261,7 +1253,7 @@ fn_cross_validation_across_populations_pairwise = function(list_merged,
"Unable to create the output directory: ", dir_output, ". ",
"Please check your permissions to write into that directory."
))
return(error)
cat(error@message; return(error)
}
### Determine the number of populations
vec_populations = sort(unique(list_merged$list_pheno$pop))
Expand All @@ -1273,7 +1265,7 @@ fn_cross_validation_across_populations_pairwise = function(list_merged,
"Cannot perform pairwise-population cross-validation ",
"because there is only 1 population in the data set."
))
return(error)
cat(error@message; return(error)
}
### Instantiate the vector of Rds filenames containing the temporary output data per population
vec_fname_across_pairwise_Rds = c()
Expand All @@ -1296,7 +1288,7 @@ fn_cross_validation_across_populations_pairwise = function(list_merged,
"Error in cross_validation::fn_cross_validation_across_populations_pairwise(...). ",
"Failed to subset the data set."
)))
return(error)
cat(error@message; return(error)
}
### Define the cross-validation parameters as well as the maximum number of threads we can safely use in parallel
list_cv_params = fn_cross_validation_preparation(
Expand All @@ -1314,7 +1306,7 @@ fn_cross_validation_across_populations_pairwise = function(list_merged,
"Error in cross_validation::fn_cross_validation_within_population(...). ",
"Failed to define the cross-validation parameters."
)))
return(error)
cat(error@message; return(error)
}
if (list_cv_params$list_mem$n_threads <= 1) {
if (verbose) {
Expand Down Expand Up @@ -1346,7 +1338,7 @@ fn_cross_validation_across_populations_pairwise = function(list_merged,
"Please check re-run cross_validation::fn_cross_validation_across_populations_pairwise(...) with ",
"bool_parallel=FALSE to identify the error."
)))
return(error)
cat(error@message; return(error)
}
}
} else {
Expand Down Expand Up @@ -1374,7 +1366,7 @@ fn_cross_validation_across_populations_pairwise = function(list_merged,
"fold: ", list_cv_params$df_params$fold[i], ", and ",
"model: ", list_cv_params$df_params$model[i], "."
)))
return(error)
cat(error@message; return(error)
}
eval(parse(text=paste0("list_list_perf$`", i, "` = list_perf")))
}
Expand Down Expand Up @@ -1558,7 +1550,7 @@ fn_cross_validation_across_populations_lopo = function(list_merged,
"Error in cross_validation::fn_cross_validation_across_populations_lopo(...). ",
"Input data (list_merged) is an error type."
)))
return(error)
cat(error@message; return(error)
}
### Define the output directory
if (!is.null(dir_output)) {
Expand All @@ -1578,7 +1570,7 @@ fn_cross_validation_across_populations_lopo = function(list_merged,
"Unable to create the output directory: ", dir_output, ". ",
"Please check your permissions to write into that directory."
))
return(error)
cat(error@message; return(error)
}
### Define the cross-validation parameters as well as the maximum number of threads we can safely use in parallel
list_cv_params = fn_cross_validation_preparation(
Expand All @@ -1596,7 +1588,7 @@ fn_cross_validation_across_populations_lopo = function(list_merged,
"Error in cross_validation::fn_cross_validation_across_populations_lopo(...). ",
"Failed to instantiate the cross-validation parameters."
)))
return(error)
cat(error@message; return(error)
}
if (list_cv_params$list_mem$n_threads <= 1) {
if (verbose) {
Expand Down Expand Up @@ -1628,7 +1620,7 @@ fn_cross_validation_across_populations_lopo = function(list_merged,
"Please check re-run cross_validation::fn_cross_validation_across_populations_lopo(...) with ",
"bool_parallel=FALSE to identify the error."
)))
return(error)
cat(error@message; return(error)
}
}
} else {
Expand All @@ -1655,7 +1647,7 @@ fn_cross_validation_across_populations_lopo = function(list_merged,
"fold: ", list_cv_params$df_params$fold[i], ", and ",
"model: ", list_cv_params$df_params$model[i], "."
)))
return(error)
cat(error@message; return(error)
}
eval(parse(text=paste0("list_list_perf$`", i, "` = list_perf")))
}
Expand Down
Loading

0 comments on commit 5f38085

Please sign in to comment.