Merge pull request #27 from afsc-assessments/diagnostics

Diagnostics
afsc-assessments · Aug 21, 2024 · b1883c8 · b1883c8
2 parents 45359af + ae52670
commit b1883c8
Show file tree

Hide file tree

Showing 33 changed files with 4,249 additions and 264 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -41,7 +41,7 @@ BugReports: https://github.com/JaneSullivan-NOAA/rema/issues
 License: GPL-3
 LazyData: true
 VignetteBuilder: knitr
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.1
 Encoding: UTF-8
 Suggests: 
     testthat (>= 3.0.0),

diff --git a/NAMESPACE b/NAMESPACE
@@ -14,6 +14,7 @@ export(read_rep)
 export(tidy_extra_cv)
 export(tidy_rema)
 import(ggplot2)
+import(grid)
 importFrom(magrittr,"%>%")
 useDynLib(rema)
 useDynLib(rema, .registration = TRUE)
diff --git a/R/get_osa_residuals.R b/R/get_osa_residuals.R
@@ -1,11 +1,10 @@
 #' Get one-step-head (OSA)
 #'
-#' Takes list output from \code{\link{tidy_rema}} and returns a list of
-#' \code{ggplot2} objects to be plotted or saved. This feature is
-#' experimental, and OSA residuals are still in Beta mode in
-#' \code{\link[TMB:oneStepPredict]{TMB::oneStepPredict}}. The default "cdf"
-#' method sometimes results in NA values for residuals, especially when
-#' observation errors are small.
+#' Takes the rema model output from \code{\link{fit_rema}} and returns OSA
+#' residuals calculated using
+#' \code{\link[TMB:oneStepPredict]{TMB::oneStepPredict}} with accompanying
+#' residual analysis plots. IMPORTANT: OSA residuals do not work for users
+#' implementing the Tweedie distribution.
 #'
 #' @param rema_model list out output from \code{\link{fit_rema}}, which includes
 #'   model results but also inputs. Of note to OSA residual calculations is the
@@ -14,28 +13,32 @@
 #'   associated with them.
 #' @param options list of options for calculating OSA residuals, passed to
 #'   \code{\link[TMB:oneStepPredict]{TMB::oneStepPredict}}. Default:
-#'   \code{options = list(method = "cdf", parallel = TRUE)}.
+#'   \code{options = list(method = "fullGaussian", parallel = TRUE)}.
+#'   Alternative methods include "cdf", "oneStepGeneric",
+#'   "oneStepGaussianOffMode", and "oneStepGaussian".
 #'
 #' @return a list of tidied data.frames containing the biomass and CPUE survey
-#'   residuals with accompanying data, as well as ggplot2 plots to visual
-#'   results from the OSA residual analysis.
+#'   residuals with accompanying data, as well as a QQ-plot, histogram
+#'   of residuals, and plots of residuals~year and residuals~fitted values by
+#'   strata for the biomass and CPUE survey.
 #'
 #' @import ggplot2
+#' @import grid
 #' @export
 #' @seealso \code{\link{tidy_rema}}
 #' @examples
 #' \dontrun{
 #' # placeholder for example
 #' }
 get_osa_residuals <- function(rema_model,
-                              options = list(method = "cdf",
-                                             # "oneStepGaussianOffMode", "fullGaussian", "oneStepGeneric",
-                                             # "oneStepGaussian", "cdf"
+                              options = list(method = "fullGaussian",
+                                             # "cdf","oneStepGeneric","oneStepGaussianOffMode","oneStepGaussian"),
                                              parallel = TRUE)) {
 
-  # rema_model = m
-  # options = list(method = "cdf", parallel = TRUE)
+  # rema_model = m1
+  # options = list(method = "fullGaussian", parallel = TRUE)
 
+  print("**WARNING: OSA residuals do not currently work for the Tweedie distribution.**")
 
   p_resids <- function(dat) {
 
@@ -50,35 +53,59 @@ get_osa_residuals <- function(rema_model,
       labs(x = 'Year', y = 'Residual')
   }
 
-  p_qq <- function(dat) {
-    ggplot(data = dat, aes(sample = residual)) +
+  p_qq <- function(dat, facet_strata = TRUE) {
+    # dat = osa_resids
+    sdnr <- sd(dat$residual)
+    sdnr <- paste0('SDNR = ', formatC(sdnr, format = "f", digits = 2))
+    p <- ggplot(data = dat, aes(sample = residual, col = strata)) +
       stat_qq() +
-      stat_qq_line() +
-      facet_wrap(~strata) +
-      labs(x = 'Theoretical quantiles', y = 'Sample quantiles')
-  }
+      # stat_qq_line() + # puts through IQR
+      geom_abline(slope = 1, intercept = 0) +
+      labs(x = 'Theoretical quantiles', y = 'Sample quantiles', col = 'Stratum') +
+      ggplot2::scale_colour_viridis_d(direction = 1)
+
+    if(isTRUE(facet_strata)) {p + facet_wrap(~strata)
+    } else {p + annotation_custom(grid::grobTree(grid::textGrob(sdnr, x=0.1,  y=0.95, hjust=0,
+                                              gp=grid::gpar(col="black", fontsize=10))))}
 
-  p_hist <- function(dat) {
-    ggplot(data = dat, aes(x = residual)) +
-      # geom_histogram() +
-      geom_histogram(aes(y = ..density..), colour = "black", fill = "white")+
-      geom_density(alpha = 0.2, fill = "#FF6666") +
-      facet_wrap(~strata, scales = 'free') +
-      labs(x = 'Residual', y = 'Density')
   }
 
+  # p_hist <- function(dat) {
+  #   ggplot(data = dat, aes(x = residual)) +
+  #     geom_histogram(aes(y = after_stat(density)), colour = "black", fill = "white") +
+  #     geom_density(fill = fill_alpha("#21918c", 0.6)) +
+  #     labs(x = 'Residual', y = 'Density')
+  # }
+
   p_fitted <- function(dat) {
 
     ybound <- max(abs(dat$residual), na.rm = TRUE) * 1.1
 
     ggplot(data = dat, aes(x = log_pred, y = residual)) +
       geom_hline(yintercept = 0, colour = "grey", size = 1) +
+      geom_segment(aes(x = log_pred, xend = log_pred, y = 0, yend = residual)) +
       geom_point() +
       expand_limits(y = c(-ybound, ybound)) +
       facet_wrap(~strata, scales = 'free_x') +
       labs(x = 'Fitted values (log-scale)', y = 'Residual')
   }
 
+  # p_acf <- function(dat) {
+  #   # dat = osa_resids
+  #   # dat = biomass_resids
+  #
+  #   biomacf <- acf(x = dat$residual, na.action = na.pass, plot = FALSE)
+  #   acfci <- qnorm((1 + 0.95)/2)/sqrt(biomacf$n.used)
+  #   biomacf <- data.frame(Lag = 1:(nrow(biomacf$acf)), ACF = biomacf$acf, lci = -acfci, uci = acfci)
+  #   ggplot(biomacf, aes(x = Lag, y = ACF)) +
+  #     geom_hline(yintercept = 0, colour = "grey", size = 1) +
+  #     geom_hline(yintercept = biomacf$uci, colour = "blue", linetype = 2) +
+  #     geom_hline(yintercept = biomacf$lci, colour = "blue", linetype = 2) +
+  #     geom_segment(aes(x = Lag, xend = Lag, y = 0, yend = ACF)) +
+  #     geom_point()
+  #
+  # }
+
   if(is.null(rema_model$err)) {
 
     if(rema_model$is_sdrep) { # only do OSA residuals if sdrep ran
@@ -104,10 +131,12 @@ get_osa_residuals <- function(rema_model,
                            dplyr::filter(survey == 'Biomass survey') %>%
                            dplyr::select(year, strata, residual))
 
+      p1_qq <- p_qq(dat = osa_resids, facet_strata = FALSE)
+      # p2_acf <- p_acf(dat = osa_resids)
+      # p3_hist <- p_hist(dat = osa_resids)
       p1_biomass <- p_resids(dat = biomass_resids)
-      p2_biomass <- p_qq(dat = biomass_resids)
-      p3_biomass <- p_hist(dat = biomass_resids)
-      p4_biomass <- p_fitted(dat = biomass_resids)
+      p2_biomass <- p_fitted(dat = biomass_resids)
+      p3_biomass <- p_qq(dat = biomass_resids, facet_strata = TRUE)
 
       biomass_resids <- biomass_resids %>%
         dplyr::select(model_name, variable, strata, year, log_pred, log_obs, residual)
@@ -121,9 +150,8 @@ get_osa_residuals <- function(rema_model,
                              dplyr::select(year, strata, residual))
 
         p1_cpue <- p_resids(dat = cpue_resids)
-        p2_cpue <- p_qq(dat = cpue_resids)
-        p3_cpue <- p_hist(dat = cpue_resids)
-        p4_cpue <- p_fitted(dat = cpue_resids)
+        p2_cpue <- p_fitted(dat = cpue_resids)
+        p3_cpue <- p_qq(dat = cpue_resids, facet_strata = TRUE)
 
         cpue_resids <- cpue_resids %>%
           dplyr::select(model_name, variable, strata, year, log_pred, log_obs, residual)
@@ -135,15 +163,15 @@ get_osa_residuals <- function(rema_model,
       out$residuals <- list(biomass = biomass_resids,
                             cpue = cpue_resids)
 
-      out$plots <- list(biomass_resids = p1_biomass,
-                        biomass_qqplot = p2_biomass,
-                        biomass_hist = p3_biomass,
-                        biomass_fitted = p4_biomass,
+      out$plots <- list(qq = p1_qq,
+                        # acf = p2_acf,
+                        # histo = p3_hist,
+                        biomass_resids = p1_biomass,
+                        biomass_fitted = p2_biomass,
+                        biomass_qq = p3_biomass,
                         cpue_resids = p1_cpue,
-                        cpue_qqplot = p2_cpue,
-                        cpue_hist = p3_cpue,
-                        cpue_fitted = p4_cpue)
-      warning("OSA residuals are experimental and underlying methods in TMB::oneStepPredict are still in Beta mode. The default 'cdf' method sometimes results in NA values for residuals, especially when observation errors are small.")
+                        cpue_fitted = p2_cpue,
+                        cpue_qq = p3_cpue)
       return(out)
       }
 

diff --git a/R/prepare_rema_input.R b/R/prepare_rema_input.R
@@ -200,14 +200,16 @@
 #'     }
 #'
 #' \code{extra_biomass_cv} allows the user to specify options for estimating an
-#' additional CV parameter ("tau" in the source code) for the biomass survey
-#' observations. If \code{extra_biomass_cv = NULL} (default), no extra CV is
-#' estimated. The user can modify the default \code{extra_biomass_cv} options
-#' using the following list of entries:
+#' additional CV parameter (\code{log_tau_biomass} in the source code, estimated in
+#' log-space) for the biomass survey observations. If \code{extra_biomass_cv =
+#' NULL} (default), no extra CV is estimated. The user can modify the default
+#' \code{extra_biomass_cv} options using the following list of entries:
 #' \describe{
 #'     \item{$assumption}{A string identifying what assumption is used for the
 #'     biomass survey observations. Options include "none" (default in which no
-#'     extra CV is estimated) or "extra_cv". If \code{extra_biomass_cv} is not
+#'     extra CV is estimated) or "extra_cv". If assumption = "extra_cv", by
+#'     default only one extra CV will be estimated, regardless of how many
+#'     biomass strata are defined. If \code{extra_biomass_cv} is not
 #'     NULL, user must define appropriate assumption.}
 #'     \item{$pointer_extra_biomass_cv}{An index to customize the assignment of
 #'     extra CV parameters to individual biomass survey strata. Vector with
@@ -216,59 +218,49 @@
 #'     three biomass survey strata and user wanted to estimate an extra CV per
 #'     stratum, they would specify \code{pointer_extra_biomass_cv = c(1, 2, 3)}.
 #'     By default, only one additional parameter is estimated, regardless of how
-#'     many strata area defined (i.e. \code{pointer_extra_biomass_cv = c(1, 1, 1)}).}
+#'     many strata are defined (i.e. \code{pointer_extra_biomass_cv = c(1, 1, 1)}).}
 #'     \item{$initial_pars}{A vector of initial values for the extra biomass
-#'     \code{logit_tau}. The default initial value for each logit_tau is
-#'     -Inf (0 in real space).}
+#'     \code{log_tau_biomass}. The default initial value for each log_tau_biomass is
+#'     log(1e-7) (approximately 0 on the arithmetic scale).}
 #'     \item{$fix_pars}{Option to fix extra biomass CV parameters, where the
 #'     user specifies the index value of the parameter they would like to fix at
 #'     the initial value. For example, if there are three biomass survey strata
 #'     defined in \code{pointer_extra_biomass_cv}, and the user wants to fix the
-#'     \code{logit_tau} for the second stratum but estimate the \code{logit_tau}
+#'     \code{log_tau_biomass} for the second stratum but estimate the \code{log_tau_biomass}
 #'     for the first and third strata they would specify \code{fix_pars =
 #'     c(2)}.}
-#'     \item{$upper_bound}{A vector of user-specified values for the upper bound
-#'     in real space of the estimated extra biomass CV, where the default upper
-#'     bound is 1.5. For example, if there are three biomass survey strata
-#'     defined in \code{pointer_extra_biomass_cv}, and the user wants to
-#'     increase the upper bound of the extra biomass CV from 1.5 to 2, they
-#'     would specify \code{upper_bound = c(2.0, 2.0, 2.0)}.}
 #' }
 #'
 #' \code{extra_cpue_cv} allows the user to specify options for estimating an
-#' additional CV parameter ("tau" in the source code) for the CPUE survey
-#' observations. If \code{extra_cpue_cv = NULL} (default), no extra CV is
-#' estimated. The user can modify the default \code{extra_cpue_cv} options
-#' using the following list of entries:
+#' additional CV parameter (\code{log_tau_cpue} in the source code, estimated in
+#' log-space) for the cpue survey observations. If \code{extra_cpue_cv =
+#' NULL} (default), no extra CV is estimated. The user can modify the default
+#' \code{extra_cpue_cv} options using the following list of entries:
 #' \describe{
 #'     \item{$assumption}{A string identifying what assumption is used for the
-#'     CPUE survey observations. Options include "none" (default in which no
-#'     extra CV is estimated) or "extra_cv".If \code{extra_cpue_cv} is not
+#'     cpue survey observations. Options include "none" (default in which no
+#'     extra CV is estimated) or "extra_cv". If assumption = "extra_cv", by
+#'     default only one extra CV will be estimated, regardless of how many
+#'     cpue strata are defined. If \code{extra_cpue_cv} is not
 #'     NULL, user must define appropriate assumption.}
 #'     \item{$pointer_extra_cpue_cv}{An index to customize the assignment of
-#'     extra CV parameters to individual CPUE survey strata. Vector with length
-#'     = number of CPUE strata, starting with an index of 1 and ending with the
-#'     number of unique extra CV parameters estimated. If there are three CPUE
-#'     survey strata and user wanted to estimate an extra CV per stratum, they
-#'     would specify \code{pointer_extra_cpue_cv = c(1, 2, 3)}. By default, only
-#'     one additional parameter is estimated, regardless of how many strata area
-#'     defined (i.e. \code{pointer_extra_cpue_cv = c(1, 1, 1)}).}
+#'     extra CV parameters to individual cpue survey strata. Vector with
+#'     length = number of cpue strata, starting with an index of 1 and ending
+#'     with the number of unique extra CV parameters estimated. If there are
+#'     three cpue survey strata and user wanted to estimate an extra CV per
+#'     stratum, they would specify \code{pointer_extra_cpue_cv = c(1, 2, 3)}.
+#'     By default, only one additional parameter is estimated, regardless of how
+#'     many strata are defined (i.e. \code{pointer_extra_cpue_cv = c(1, 1, 1)}).}
 #'     \item{$initial_pars}{A vector of initial values for the extra cpue
-#'     \code{logit_tau}. The default initial value for each logit_tau is
-#'     -Inf (0 in real space).}
-#'     \item{$fix_pars}{Option to fix extra CPUE CV parameters, where the user
-#'     specifies the index value of the parameter they would like to fix at the
-#'     initial value. For example, if there are three CPUE survey strata defined
-#'     in \code{pointer_extra_cpue_cv}, and the user wants to fix the
-#'     \code{logit_tau} for the second stratum but estimate the \code{logit_tau}
+#'     \code{log_tau_cpue}. The default initial value for each log_tau_cpue is
+#'     log(1e-7) (approximately 0 on the arithmetic scale).}
+#'     \item{$fix_pars}{Option to fix extra cpue CV parameters, where the
+#'     user specifies the index value of the parameter they would like to fix at
+#'     the initial value. For example, if there are three cpue survey strata
+#'     defined in \code{pointer_extra_cpue_cv}, and the user wants to fix the
+#'     \code{log_tau_cpue} for the second stratum but estimate the \code{log_tau_cpue}
 #'     for the first and third strata they would specify \code{fix_pars =
 #'     c(2)}.}
-#'     \item{$upper_bound}{A vector of user-specified values for the upper bound
-#'     in real space of the estimated extra CPUE CV, where the default upper
-#'     bound is 1.5. For example, if there are three CPUE survey strata defined
-#'     in \code{pointer_extra_cpue_cv}, and the user wants to increase the upper
-#'     bound of the extra cpue CV from 1.5 to 2, they would specify
-#'     \code{upper_bound = c(2.0, 2.0, 2.0)}.}
 #' }
 #'
 #' @param model_name name of stock or other identifier for REMA model

diff --git a/R/set_defaults.R b/R/set_defaults.R
@@ -38,15 +38,15 @@ set_defaults <- function(input, admb_re = NULL) {
   data$pmu_log_q <- NA
   data$psig_log_q <- NA
 
-  data$extra_biomass_cv <- 0
-  data$extra_cpue_cv <- 0
+  # data$extra_biomass_cv <- 0
+  # data$extra_cpue_cv <- 0
 
-  data$tau_biomass_upper <- rep(1.5, length(unique(data$pointer_extra_biomass_cv)))
-  data$tau_cpue_upper <- rep(1.5, length(unique(data$pointer_extra_cpue_cv)))
+  # data$tau_biomass_upper <- rep(1.5, length(unique(data$pointer_extra_biomass_cv)))
+  # data$tau_cpue_upper <- rep(1.5, length(unique(data$pointer_extra_cpue_cv)))
 
   # process error and scaling parameters
-  par$log_PE <- rep(1, length(unique(data$pointer_PE_biomass)))
-  par$log_q <- rep(1, length(unique(data$pointer_q_cpue)))
+  par$log_PE <- rep(0, length(unique(data$pointer_PE_biomass)))
+  par$log_q <- rep(0, length(unique(data$pointer_q_cpue)))
 
   # tweedie parameters starting vals
   # tweedie_p_init = 1.6 # based on Dave Warton (ref?)
@@ -61,9 +61,15 @@ set_defaults <- function(input, admb_re = NULL) {
     par$logit_tweedie_p <- rep(0.4054651, 2)
   }
 
-  # extra CV for biomass or CPUE survey (-Inf = 0 in logit space)
-  par$logit_tau_biomass <- rep(-Inf, length(unique(data$pointer_extra_biomass_cv)))
-  par$logit_tau_cpue <- rep(-Inf, length(unique(data$pointer_extra_cpue_cv)))
+  # # extra CV for biomass or CPUE survey (-Inf = 0 in logit space)
+  # par$logit_tau_biomass <- rep(-Inf, length(unique(data$pointer_extra_biomass_cv)))
+  # par$logit_tau_cpue <- rep(-Inf, length(unique(data$pointer_extra_cpue_cv)))
+
+  # extra CV for biomass or CPUE survey (originally this was logit transformed
+  # but corrected to a log transformation in June 2024). by default the extra
+  # cvs are fixed at approximately 0
+  par$log_tau_biomass <- rep(log(1e-7), length(unique(data$pointer_extra_biomass_cv)))
+  par$log_tau_cpue <- rep(log(1e-7), length(unique(data$pointer_extra_cpue_cv)))
 
   # if admb_re is provided using read_admb_re(), use log biomass predictions as
   # initial values for the model. if not, use linear interpolation to initiate
@@ -74,11 +80,11 @@ set_defaults <- function(input, admb_re = NULL) {
   if(!is.null(admb_re)) {
     par$log_biomass_pred <- admb_re$init_log_biomass_pred
   } else {
-  tmp <- data$biomass_obs
-  tmp[tmp < 0.01] <- NA # just for starting values
-  par$log_biomass_pred <- log(apply(X = tmp,
-                                    MARGIN = 2,
-                                    FUN = zoo::na.approx, maxgap = 100, rule = 2))
+    tmp <- data$biomass_obs
+    tmp[tmp < 0.01] <- NA # just for starting values
+    par$log_biomass_pred <- log(apply(X = tmp,
+                                      MARGIN = 2,
+                                      FUN = zoo::na.approx, maxgap = 100, rule = 2))
   }
 
   # set map
@@ -93,9 +99,9 @@ set_defaults <- function(input, admb_re = NULL) {
 
   map$logit_tweedie_p <- fill_vals(par$logit_tweedie_p, NA)
 
-  map$logit_tau_biomass <- fill_vals(par$logit_tau_biomass, NA)
+  map$log_tau_biomass <- fill_vals(par$log_tau_biomass, NA)
 
-  map$logit_tau_cpue <- fill_vals(par$logit_tau_cpue, NA)
+  map$log_tau_cpue <- fill_vals(par$log_tau_cpue, NA)
 
   map$log_biomass_pred <- as.factor(1:length(map$log_biomass_pred))