emitting additive genetic effects and/or BLUPs for each model and pop…

…ulation using the entire dataset
jeffersonfparil · Jun 14, 2024 · 7c4980f · 7c4980f
1 parent 23a8b41
commit 7c4980f
Show file tree

Hide file tree

Showing 4 changed files with 100 additions and 25 deletions.
diff --git a/R/main.R b/R/main.R
@@ -511,7 +511,6 @@ gp = function(args) {
         vec_idx_validation = which(is.na(list_merged$list_pheno$y))
         vec_idx_training = which(!is.na(list_merged$list_pheno$y))
         if (length(vec_idx_validation)==0) {
-            ADDITIVE_GENETIC_EFFECTS = NA
             GENOMIC_PREDICTIONS = NA
         } else {
             ### Find the best model in the args$population
@@ -529,13 +528,40 @@ gp = function(args) {
                 other_params = list(n_folds=10)
             }
             perf = eval(parse(text=paste0("fn_", model, "(list_merged=list_merged, vec_idx_training=vec_idx_training, vec_idx_validation=vec_idx_validation, other_params=other_params, verbose=args$verbose)")))
-            ADDITIVE_GENETIC_EFFECTS = list(b=perf$vec_effects, model=model)
             GENOMIC_PREDICTIONS = data.frame(perf$df_y_validation, model=model)
         }
     } else {
-        ADDITIVE_GENETIC_EFFECTS = NA
         GENOMIC_PREDICTIONS = NA
     }
+    ###############################################################################
+    ### EXTRACT WITHIN POPULATION ADDITIVE GENETIC EFFECTS OF EACH MODEL TESTED ###
+    ###############################################################################
+    ### Extract the additive genetic effects from penalised and Bayesian regression models, and sample BLUPs from gBLUP
+    if (args$bool_within) {
+        vec_idx_training = which(!is.na(list_merged$list_pheno$y))
+        ADDITIVE_GENETIC_EFFECTS = NULL
+        for (model in args$vec_models_to_test) {
+            # model = args$vec_models_to_test[1]
+            if ((grepl("Bayes", model)==TRUE) | (grepl("gBLUP", model)==TRUE)) {
+                if (is.null(args$dir_output)) {
+                    args$dir_output = dirname(tempfile())
+                }
+                vec_idx_validation = c()
+                other_params = list(nIter=12e3, burnIn=2e3, out_prefix=file.path(args$dir_output, paste0("bglr_", model)))
+            } else {
+                vec_idx_validation = vec_idx_training
+                other_params = list(n_folds=10)
+            }
+            perf = eval(parse(text=paste0("fn_", model, "(list_merged=list_merged, vec_idx_training=vec_idx_training, vec_idx_validation=vec_idx_validation, other_params=other_params, verbose=FALSE)")))
+            if (is.null(ADDITIVE_GENETIC_EFFECTS)) {
+                ADDITIVE_GENETIC_EFFECTS = eval(parse(text=paste0("list(`", model, "`=list(b=perf$vec_effects))")))
+            } else {
+                eval(parse(text=paste0("ADDITIVE_GENETIC_EFFECTS$`", model, "`=list(b=perf$vec_effects)")))
+            }
+        }
+    } else {
+        ADDITIVE_GENETIC_EFFECTS = NA
+    }
     ### Clean-up
     rm("list_merged")
     gc()

diff --git a/R/models.R b/R/models.R
@@ -315,7 +315,7 @@ fn_ridge = function(list_merged, vec_idx_training, vec_idx_validation, other_par
         txtplot::txtplot(b_hat[!is.na(b_hat) & !is.infinite(b_hat)])
         print(paste0("Number of non-zero effects: ", n_non_zero, " (", round(100*n_non_zero/p), "%)"))
     }
-    ### Evalute prediction performance
+    ### Evaluate prediction performance
     df_y_validation = merge(
         data.frame(id=names(y_validation), pop=list_merged$list_pheno$pop[vec_idx_validation], y_true=y_validation), 
         data.frame(id=rownames(y_pred), y_pred=y_pred),
@@ -459,7 +459,7 @@ fn_lasso = function(list_merged, vec_idx_training, vec_idx_validation, other_par
         txtplot::txtplot(b_hat[!is.na(b_hat) & !is.infinite(b_hat)])
         print(paste0("Number of non-zero effects: ", n_non_zero, " (", round(100*n_non_zero/p), "%)"))
     }
-    ### Evalute prediction performance
+    ### Evaluate prediction performance
     df_y_validation = merge(
         data.frame(id=names(y_validation), pop=list_merged$list_pheno$pop[vec_idx_validation], y_true=y_validation), 
         data.frame(id=rownames(y_pred), y_pred=y_pred),
@@ -603,7 +603,7 @@ fn_elastic_net = function(list_merged, vec_idx_training, vec_idx_validation, oth
         txtplot::txtplot(b_hat[!is.na(b_hat) & !is.infinite(b_hat)])
         print(paste0("Number of non-zero effects: ", n_non_zero, " (", round(100*n_non_zero/p), "%)"))
     }
-    ### Evalute prediction performance
+    ### Evaluate prediction performance
     df_y_validation = merge(
         data.frame(id=names(y_validation), pop=list_merged$list_pheno$pop[vec_idx_validation], y_true=y_validation), 
         data.frame(id=rownames(y_pred), y_pred=y_pred),
@@ -744,7 +744,7 @@ fn_Bayes_A = function(list_merged, vec_idx_training, vec_idx_validation,
         txtplot::txtplot(b_hat[!is.na(b_hat) & !is.infinite(b_hat)])
         print(paste0("Number of non-zero effects: ", n_non_zero, " (", round(100*n_non_zero/p), "%)"))
     }
-    ### Evalute prediction performance
+    ### Evaluate prediction performance
     df_y_validation = merge(
         data.frame(id=names(list_merged$list_pheno$y[vec_idx_validation]), pop=list_merged$list_pheno$pop[vec_idx_validation], y_true=list_merged$list_pheno$y[vec_idx_validation]), 
         data.frame(id=names(yNA)[vec_idx_validation], y_pred=sol$yHat[vec_idx_validation]),
@@ -891,7 +891,7 @@ fn_Bayes_B = function(list_merged, vec_idx_training, vec_idx_validation,
         txtplot::txtplot(b_hat[!is.na(b_hat) & !is.infinite(b_hat)])
         print(paste0("Number of non-zero effects: ", n_non_zero, " (", round(100*n_non_zero/p), "%)"))
     }
-    ### Evalute prediction performance
+    ### Evaluate prediction performance
     df_y_validation = merge(
         data.frame(id=names(list_merged$list_pheno$y[vec_idx_validation]), pop=list_merged$list_pheno$pop[vec_idx_validation], y_true=list_merged$list_pheno$y[vec_idx_validation]), 
         data.frame(id=names(yNA)[vec_idx_validation], y_pred=sol$yHat[vec_idx_validation]),
@@ -1038,7 +1038,7 @@ fn_Bayes_C = function(list_merged, vec_idx_training, vec_idx_validation,
         txtplot::txtplot(b_hat[!is.na(b_hat) & !is.infinite(b_hat)])
         print(paste0("Number of non-zero effects: ", n_non_zero, " (", round(100*n_non_zero/p), "%)"))
     }
-    ### Evalute prediction performance
+    ### Evaluate prediction performance
     df_y_validation = merge(
         data.frame(id=names(list_merged$list_pheno$y[vec_idx_validation]), pop=list_merged$list_pheno$pop[vec_idx_validation], y_true=list_merged$list_pheno$y[vec_idx_validation]), 
         data.frame(id=names(yNA)[vec_idx_validation], y_pred=sol$yHat[vec_idx_validation]),
@@ -1190,7 +1190,7 @@ fn_gBLUP = function(list_merged, vec_idx_training, vec_idx_validation, other_par
         txtplot::txtdensity(u_hat[!is.na(u_hat) & !is.infinite(u_hat)])
         print(paste0("Number of non-zero effects: ", n_non_zero, " (", round(100*n_non_zero/p), "%)"))
     }
-    ### Evalute prediction performance
+    ### Evaluate prediction performance
     df_y_validation = merge(
         data.frame(id=names(list_merged$list_pheno$y[vec_idx_validation]), pop=list_merged$list_pheno$pop[vec_idx_validation], y_true=list_merged$list_pheno$y[vec_idx_validation]), 
         data.frame(id=names(mod$U$`u:id`$y), y_pred=unlist(mod$U)),

diff --git a/exec/tests/test.sh b/exec/tests/test.sh
@@ -40,27 +40,52 @@ grep -A10 "Finished after" output/job_info-*.log
 # vec_correlations = c()
 # for (fname in vec_fnames_Rds) {
 #     # fname = vec_fnames_Rds[1]
-#     list_output = readRDS(fname, check.names=FALSE)
+#     list_output = readRDS(fname)
 #     vec_traits = c(vec_traits, list_output$TRAIT_NAME)
 #     vec_populations = c(vec_populations, unique(list_output$METRICS_WITHIN_POP$pop_training))
 #     vec_training_size = c(vec_training_size, mean(list_output$METRICS_WITHIN_POP$n_training))
 #     vec_validation_size = c(vec_validation_size, mean(list_output$METRICS_WITHIN_POP$n_validation, na.rm=TRUE))
 #     vec_n_folds = c(vec_n_folds, max(list_output$METRICS_WITHIN_POP$fold, na.rm=TRUE))
 #     vec_n_reps = c(vec_n_reps, max(list_output$METRICS_WITHIN_POP$rep, na.rm=TRUE))
-#     vec_correlations = c(vec_correlations, mean(list_output$METRICS_WITHIN_POP$corr, na.rm=TRUE))
+#     vec_idx = which(list_output$METRICS_WITHIN_POP$model == "Bayes_A")
+#     vec_correlations = c(vec_correlations, mean(list_output$METRICS_WITHIN_POP$corr[vec_idx], na.rm=TRUE))
 # }
 # df = data.frame(
 #     trait=vec_traits,
 #     population=vec_populations,
-#     training_siz=vec_training_size,
-#     validation_siz=vec_validation_size,
+#     training_size=vec_training_size,
+#     validation_size=vec_validation_size,
 #     n_fold=vec_n_folds,
 #     n_rep=vec_n_reps,
 #     correlation=vec_correlations)
 # df = df[order(df$trait), ]
 # vec_idx_sort_by_trait_name_length = unlist(lapply(df$trait, FUN=function(x){length(unlist(strsplit(x, "")))}))
 # df = df[order(vec_idx_sort_by_trait_name_length, decreasing=FALSE), ]
-# write.table(df, file="SUMMARY_WITHIN_POP_CORRELATIONS.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE)
+# write.table(df, file="SUMMARY_WITHIN_POP_CORRELATIONS-Bayes_A.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE)
+
+# ### Extract lucerne additive effects for Bayes B in traits: 
+# cd output/
+# R
+# vec_fnames_Rds = list.files(path=".", pattern="*.Rds")
+# vec_requested_dates = c(
+#     "biomass_SUM_2024a_20240213",
+#     "biomass_AUT_2024_20240312",
+#     "biomass_LSP_2023_20231023")
+
+# for (date in vec_requested_dates) {
+#     # date = vec_requested_dates[1]
+#     vec_idx = which(grepl(date, vec_fnames_Rds))
+#     for (idx in vec_idx) {
+#         # idx = vec_idx[1]
+#         list_output = readRDS(vec_fnames_Rds[idx])
+#         if (!is.na(list_output$ADDITIVE_GENETIC_EFFECTS[1])[1]) {
+#             df = data.frame(loc=names(list_output$ADDITIVE_GENETIC_EFFECTS$b), b=list_output$ADDITIVE_GENETIC_EFFECTS$b)
+#             trait_pop = paste(rev(rev(unlist(strsplit(vec_fnames_Rds[idx], "-"))[-1])[-1]), collapse="-")
+#             fname_out = paste0("MARKER_EFFECTS-", list_output$ADDITIVE_GENETIC_EFFECTS$model, "-", trait_pop, ".tsv")
+#             write.table(df, file=fname_out, sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE)
+#         }
+#     }
+# }
 
 ########################################
 ### COMPLETE SET OF INPUT PARAMETERS ###

diff --git a/app.R → inst/plot_gs_gp/app.R b/app.R → inst/plot_gs_gp/app.R
@@ -41,7 +41,8 @@ ui <- page_fillable(
         shinyWidgets::pickerInput(inputId="model", label="Filter by model:", choices="", multiple=TRUE, options=list(`live-search`=TRUE, `actions-box`=TRUE)),
         shinyWidgets::pickerInput(inputId="metric", label="Use the genomic prediction accuracy metric:", choices="", multiple=FALSE, options=list(`live-search`=TRUE, `actions-box`=TRUE)),
         shinyWidgets::pickerInput(inputId="group_by", label="Group by:", choices=c("trait", "pop_training", "model"), selected=c("trait", "pop", "model"), multiple=TRUE, options=list(`live-search`=TRUE, `actions-box`=TRUE)),
-        shinyWidgets::pickerInput(inputId="sort_by", label="Sort by:", choices=c("increasing_mean", "decreasing_mean", "increasing_median", "decreasing_median", "alphabetical"), selected="increasing_mean", multiple=FALSE, options=list(`live-search`=TRUE, `actions-box`=TRUE))
+        shinyWidgets::pickerInput(inputId="sort_by", label="Sort by:", choices=c("increasing_mean", "decreasing_mean", "increasing_median", "decreasing_median", "alphabetical"), selected="increasing_mean", multiple=FALSE, options=list(`live-search`=TRUE, `actions-box`=TRUE)),
+        shinyWidgets::materialSwitch(inputId="within_box_with_labels", label="Mean-labelled boxplot", value=FALSE, status="primary", right=FALSE)
       ),
       mainPanel(
         width=750,
@@ -484,14 +485,37 @@ server <- function(input, output, session) {
       df$grouping = factor(df$grouping, levels=as.character(df_agg$grouping[order(df_agg$metric, decreasing=TRUE)]))
     }
     ### Plot
-    p = plot_ly(data=df,
-      y=~metric,
-      x=~grouping,
-      type='violin',
-      box=list(visible=TRUE),
-      meanline = list(visible=TRUE),
-      split=~grouping
-    )
+    if (!input$within_box_with_labels) {
+      p = plot_ly(data=df,
+        y=~metric,
+        x=~grouping,
+        type='violin',
+        box=list(visible=TRUE),
+        meanline = list(visible=TRUE),
+        split=~grouping
+      )
+    } else {
+      list_bp = boxplot(metric ~ grouping, data=df, plot=FALSE)
+      n_groups = ncol(list_bp$stats)
+      df_agg = aggregate(metric ~ grouping, data=df, FUN=mean, na.rm=TRUE)
+      colnames(df_agg)[2] = "metric_mean"
+      df = merge(merge(df, df_agg, sort=FALSE, by="grouping"),
+        data.frame(grouping=list_bp$names, pos_x=seq(0, n_groups+1, length=n_groups+1)[1:n_groups], pos_y=1.05*min(df$metric, na.rm=TRUE)),
+        sort=FALSE, by="grouping")
+      p = plot_ly(data=df,
+        y=~metric,
+        x=~grouping,
+        type='box',
+        boxmean=TRUE,
+        split=~grouping
+      )
+      p = p %>% add_annotations(
+        x=~grouping,
+        y=~pos_y, 
+        text=~round(metric_mean, 4),
+        showarrow=FALSE
+      )
+    }
     p = p %>% layout(
       title=title,
       yaxis=list(title=input$metric),
@@ -668,7 +692,7 @@ server <- function(input, output, session) {
       p = plotly::plot_ly(data=df, x=~x, y=~y, type="scatter", mode='markers', text=~ID)
     } else {
       ### Prepare pop_validation x model matrix of GP metric data
-      vec_lopo_pop = sort(unique(df_metrics_across_pop$pop_validation))
+      vec_lopo_pop = sort(input$lopo_pop)
       n = length(vec_lopo_pop)
       m = length(input$lopo_model)
       M = matrix(NA, nrow=n, ncol=m)