Skip to content

Commit

Permalink
Merge pull request #27 from afsc-assessments/diagnostics
Browse files Browse the repository at this point in the history
Diagnostics
  • Loading branch information
JaneSullivan-NOAA authored Aug 21, 2024
2 parents 45359af + ae52670 commit b1883c8
Show file tree
Hide file tree
Showing 33 changed files with 4,249 additions and 264 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ BugReports: https://github.com/JaneSullivan-NOAA/rema/issues
License: GPL-3
LazyData: true
VignetteBuilder: knitr
RoxygenNote: 7.2.3
RoxygenNote: 7.3.1
Encoding: UTF-8
Suggests:
testthat (>= 3.0.0),
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ export(read_rep)
export(tidy_extra_cv)
export(tidy_rema)
import(ggplot2)
import(grid)
importFrom(magrittr,"%>%")
useDynLib(rema)
useDynLib(rema, .registration = TRUE)
110 changes: 69 additions & 41 deletions R/get_osa_residuals.R
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
#' Get one-step-head (OSA)
#'
#' Takes list output from \code{\link{tidy_rema}} and returns a list of
#' \code{ggplot2} objects to be plotted or saved. This feature is
#' experimental, and OSA residuals are still in Beta mode in
#' \code{\link[TMB:oneStepPredict]{TMB::oneStepPredict}}. The default "cdf"
#' method sometimes results in NA values for residuals, especially when
#' observation errors are small.
#' Takes the rema model output from \code{\link{fit_rema}} and returns OSA
#' residuals calculated using
#' \code{\link[TMB:oneStepPredict]{TMB::oneStepPredict}} with accompanying
#' residual analysis plots. IMPORTANT: OSA residuals do not work for users
#' implementing the Tweedie distribution.
#'
#' @param rema_model list out output from \code{\link{fit_rema}}, which includes
#' model results but also inputs. Of note to OSA residual calculations is the
Expand All @@ -14,28 +13,32 @@
#' associated with them.
#' @param options list of options for calculating OSA residuals, passed to
#' \code{\link[TMB:oneStepPredict]{TMB::oneStepPredict}}. Default:
#' \code{options = list(method = "cdf", parallel = TRUE)}.
#' \code{options = list(method = "fullGaussian", parallel = TRUE)}.
#' Alternative methods include "cdf", "oneStepGeneric",
#' "oneStepGaussianOffMode", and "oneStepGaussian".
#'
#' @return a list of tidied data.frames containing the biomass and CPUE survey
#' residuals with accompanying data, as well as ggplot2 plots to visual
#' results from the OSA residual analysis.
#' residuals with accompanying data, as well as a QQ-plot, histogram
#' of residuals, and plots of residuals~year and residuals~fitted values by
#' strata for the biomass and CPUE survey.
#'
#' @import ggplot2
#' @import grid
#' @export
#' @seealso \code{\link{tidy_rema}}
#' @examples
#' \dontrun{
#' # placeholder for example
#' }
get_osa_residuals <- function(rema_model,
options = list(method = "cdf",
# "oneStepGaussianOffMode", "fullGaussian", "oneStepGeneric",
# "oneStepGaussian", "cdf"
options = list(method = "fullGaussian",
# "cdf","oneStepGeneric","oneStepGaussianOffMode","oneStepGaussian"),
parallel = TRUE)) {

# rema_model = m
# options = list(method = "cdf", parallel = TRUE)
# rema_model = m1
# options = list(method = "fullGaussian", parallel = TRUE)

print("**WARNING: OSA residuals do not currently work for the Tweedie distribution.**")

p_resids <- function(dat) {

Expand All @@ -50,35 +53,59 @@ get_osa_residuals <- function(rema_model,
labs(x = 'Year', y = 'Residual')
}

p_qq <- function(dat) {
ggplot(data = dat, aes(sample = residual)) +
p_qq <- function(dat, facet_strata = TRUE) {
# dat = osa_resids
sdnr <- sd(dat$residual)
sdnr <- paste0('SDNR = ', formatC(sdnr, format = "f", digits = 2))
p <- ggplot(data = dat, aes(sample = residual, col = strata)) +
stat_qq() +
stat_qq_line() +
facet_wrap(~strata) +
labs(x = 'Theoretical quantiles', y = 'Sample quantiles')
}
# stat_qq_line() + # puts through IQR
geom_abline(slope = 1, intercept = 0) +
labs(x = 'Theoretical quantiles', y = 'Sample quantiles', col = 'Stratum') +
ggplot2::scale_colour_viridis_d(direction = 1)

if(isTRUE(facet_strata)) {p + facet_wrap(~strata)
} else {p + annotation_custom(grid::grobTree(grid::textGrob(sdnr, x=0.1, y=0.95, hjust=0,
gp=grid::gpar(col="black", fontsize=10))))}

p_hist <- function(dat) {
ggplot(data = dat, aes(x = residual)) +
# geom_histogram() +
geom_histogram(aes(y = ..density..), colour = "black", fill = "white")+
geom_density(alpha = 0.2, fill = "#FF6666") +
facet_wrap(~strata, scales = 'free') +
labs(x = 'Residual', y = 'Density')
}

# p_hist <- function(dat) {
# ggplot(data = dat, aes(x = residual)) +
# geom_histogram(aes(y = after_stat(density)), colour = "black", fill = "white") +
# geom_density(fill = fill_alpha("#21918c", 0.6)) +
# labs(x = 'Residual', y = 'Density')
# }

p_fitted <- function(dat) {

ybound <- max(abs(dat$residual), na.rm = TRUE) * 1.1

ggplot(data = dat, aes(x = log_pred, y = residual)) +
geom_hline(yintercept = 0, colour = "grey", size = 1) +
geom_segment(aes(x = log_pred, xend = log_pred, y = 0, yend = residual)) +
geom_point() +
expand_limits(y = c(-ybound, ybound)) +
facet_wrap(~strata, scales = 'free_x') +
labs(x = 'Fitted values (log-scale)', y = 'Residual')
}

# p_acf <- function(dat) {
# # dat = osa_resids
# # dat = biomass_resids
#
# biomacf <- acf(x = dat$residual, na.action = na.pass, plot = FALSE)
# acfci <- qnorm((1 + 0.95)/2)/sqrt(biomacf$n.used)
# biomacf <- data.frame(Lag = 1:(nrow(biomacf$acf)), ACF = biomacf$acf, lci = -acfci, uci = acfci)
# ggplot(biomacf, aes(x = Lag, y = ACF)) +
# geom_hline(yintercept = 0, colour = "grey", size = 1) +
# geom_hline(yintercept = biomacf$uci, colour = "blue", linetype = 2) +
# geom_hline(yintercept = biomacf$lci, colour = "blue", linetype = 2) +
# geom_segment(aes(x = Lag, xend = Lag, y = 0, yend = ACF)) +
# geom_point()
#
# }

if(is.null(rema_model$err)) {

if(rema_model$is_sdrep) { # only do OSA residuals if sdrep ran
Expand All @@ -104,10 +131,12 @@ get_osa_residuals <- function(rema_model,
dplyr::filter(survey == 'Biomass survey') %>%
dplyr::select(year, strata, residual))

p1_qq <- p_qq(dat = osa_resids, facet_strata = FALSE)
# p2_acf <- p_acf(dat = osa_resids)
# p3_hist <- p_hist(dat = osa_resids)
p1_biomass <- p_resids(dat = biomass_resids)
p2_biomass <- p_qq(dat = biomass_resids)
p3_biomass <- p_hist(dat = biomass_resids)
p4_biomass <- p_fitted(dat = biomass_resids)
p2_biomass <- p_fitted(dat = biomass_resids)
p3_biomass <- p_qq(dat = biomass_resids, facet_strata = TRUE)

biomass_resids <- biomass_resids %>%
dplyr::select(model_name, variable, strata, year, log_pred, log_obs, residual)
Expand All @@ -121,9 +150,8 @@ get_osa_residuals <- function(rema_model,
dplyr::select(year, strata, residual))

p1_cpue <- p_resids(dat = cpue_resids)
p2_cpue <- p_qq(dat = cpue_resids)
p3_cpue <- p_hist(dat = cpue_resids)
p4_cpue <- p_fitted(dat = cpue_resids)
p2_cpue <- p_fitted(dat = cpue_resids)
p3_cpue <- p_qq(dat = cpue_resids, facet_strata = TRUE)

cpue_resids <- cpue_resids %>%
dplyr::select(model_name, variable, strata, year, log_pred, log_obs, residual)
Expand All @@ -135,15 +163,15 @@ get_osa_residuals <- function(rema_model,
out$residuals <- list(biomass = biomass_resids,
cpue = cpue_resids)

out$plots <- list(biomass_resids = p1_biomass,
biomass_qqplot = p2_biomass,
biomass_hist = p3_biomass,
biomass_fitted = p4_biomass,
out$plots <- list(qq = p1_qq,
# acf = p2_acf,
# histo = p3_hist,
biomass_resids = p1_biomass,
biomass_fitted = p2_biomass,
biomass_qq = p3_biomass,
cpue_resids = p1_cpue,
cpue_qqplot = p2_cpue,
cpue_hist = p3_cpue,
cpue_fitted = p4_cpue)
warning("OSA residuals are experimental and underlying methods in TMB::oneStepPredict are still in Beta mode. The default 'cdf' method sometimes results in NA values for residuals, especially when observation errors are small.")
cpue_fitted = p2_cpue,
cpue_qq = p3_cpue)
return(out)
}

Expand Down
74 changes: 33 additions & 41 deletions R/prepare_rema_input.R
Original file line number Diff line number Diff line change
Expand Up @@ -200,14 +200,16 @@
#' }
#'
#' \code{extra_biomass_cv} allows the user to specify options for estimating an
#' additional CV parameter ("tau" in the source code) for the biomass survey
#' observations. If \code{extra_biomass_cv = NULL} (default), no extra CV is
#' estimated. The user can modify the default \code{extra_biomass_cv} options
#' using the following list of entries:
#' additional CV parameter (\code{log_tau_biomass} in the source code, estimated in
#' log-space) for the biomass survey observations. If \code{extra_biomass_cv =
#' NULL} (default), no extra CV is estimated. The user can modify the default
#' \code{extra_biomass_cv} options using the following list of entries:
#' \describe{
#' \item{$assumption}{A string identifying what assumption is used for the
#' biomass survey observations. Options include "none" (default in which no
#' extra CV is estimated) or "extra_cv". If \code{extra_biomass_cv} is not
#' extra CV is estimated) or "extra_cv". If assumption = "extra_cv", by
#' default only one extra CV will be estimated, regardless of how many
#' biomass strata are defined. If \code{extra_biomass_cv} is not
#' NULL, user must define appropriate assumption.}
#' \item{$pointer_extra_biomass_cv}{An index to customize the assignment of
#' extra CV parameters to individual biomass survey strata. Vector with
Expand All @@ -216,59 +218,49 @@
#' three biomass survey strata and user wanted to estimate an extra CV per
#' stratum, they would specify \code{pointer_extra_biomass_cv = c(1, 2, 3)}.
#' By default, only one additional parameter is estimated, regardless of how
#' many strata area defined (i.e. \code{pointer_extra_biomass_cv = c(1, 1, 1)}).}
#' many strata are defined (i.e. \code{pointer_extra_biomass_cv = c(1, 1, 1)}).}
#' \item{$initial_pars}{A vector of initial values for the extra biomass
#' \code{logit_tau}. The default initial value for each logit_tau is
#' -Inf (0 in real space).}
#' \code{log_tau_biomass}. The default initial value for each log_tau_biomass is
#' log(1e-7) (approximately 0 on the arithmetic scale).}
#' \item{$fix_pars}{Option to fix extra biomass CV parameters, where the
#' user specifies the index value of the parameter they would like to fix at
#' the initial value. For example, if there are three biomass survey strata
#' defined in \code{pointer_extra_biomass_cv}, and the user wants to fix the
#' \code{logit_tau} for the second stratum but estimate the \code{logit_tau}
#' \code{log_tau_biomass} for the second stratum but estimate the \code{log_tau_biomass}
#' for the first and third strata they would specify \code{fix_pars =
#' c(2)}.}
#' \item{$upper_bound}{A vector of user-specified values for the upper bound
#' in real space of the estimated extra biomass CV, where the default upper
#' bound is 1.5. For example, if there are three biomass survey strata
#' defined in \code{pointer_extra_biomass_cv}, and the user wants to
#' increase the upper bound of the extra biomass CV from 1.5 to 2, they
#' would specify \code{upper_bound = c(2.0, 2.0, 2.0)}.}
#' }
#'
#' \code{extra_cpue_cv} allows the user to specify options for estimating an
#' additional CV parameter ("tau" in the source code) for the CPUE survey
#' observations. If \code{extra_cpue_cv = NULL} (default), no extra CV is
#' estimated. The user can modify the default \code{extra_cpue_cv} options
#' using the following list of entries:
#' additional CV parameter (\code{log_tau_cpue} in the source code, estimated in
#' log-space) for the cpue survey observations. If \code{extra_cpue_cv =
#' NULL} (default), no extra CV is estimated. The user can modify the default
#' \code{extra_cpue_cv} options using the following list of entries:
#' \describe{
#' \item{$assumption}{A string identifying what assumption is used for the
#' CPUE survey observations. Options include "none" (default in which no
#' extra CV is estimated) or "extra_cv".If \code{extra_cpue_cv} is not
#' cpue survey observations. Options include "none" (default in which no
#' extra CV is estimated) or "extra_cv". If assumption = "extra_cv", by
#' default only one extra CV will be estimated, regardless of how many
#' cpue strata are defined. If \code{extra_cpue_cv} is not
#' NULL, user must define appropriate assumption.}
#' \item{$pointer_extra_cpue_cv}{An index to customize the assignment of
#' extra CV parameters to individual CPUE survey strata. Vector with length
#' = number of CPUE strata, starting with an index of 1 and ending with the
#' number of unique extra CV parameters estimated. If there are three CPUE
#' survey strata and user wanted to estimate an extra CV per stratum, they
#' would specify \code{pointer_extra_cpue_cv = c(1, 2, 3)}. By default, only
#' one additional parameter is estimated, regardless of how many strata area
#' defined (i.e. \code{pointer_extra_cpue_cv = c(1, 1, 1)}).}
#' extra CV parameters to individual cpue survey strata. Vector with
#' length = number of cpue strata, starting with an index of 1 and ending
#' with the number of unique extra CV parameters estimated. If there are
#' three cpue survey strata and user wanted to estimate an extra CV per
#' stratum, they would specify \code{pointer_extra_cpue_cv = c(1, 2, 3)}.
#' By default, only one additional parameter is estimated, regardless of how
#' many strata are defined (i.e. \code{pointer_extra_cpue_cv = c(1, 1, 1)}).}
#' \item{$initial_pars}{A vector of initial values for the extra cpue
#' \code{logit_tau}. The default initial value for each logit_tau is
#' -Inf (0 in real space).}
#' \item{$fix_pars}{Option to fix extra CPUE CV parameters, where the user
#' specifies the index value of the parameter they would like to fix at the
#' initial value. For example, if there are three CPUE survey strata defined
#' in \code{pointer_extra_cpue_cv}, and the user wants to fix the
#' \code{logit_tau} for the second stratum but estimate the \code{logit_tau}
#' \code{log_tau_cpue}. The default initial value for each log_tau_cpue is
#' log(1e-7) (approximately 0 on the arithmetic scale).}
#' \item{$fix_pars}{Option to fix extra cpue CV parameters, where the
#' user specifies the index value of the parameter they would like to fix at
#' the initial value. For example, if there are three cpue survey strata
#' defined in \code{pointer_extra_cpue_cv}, and the user wants to fix the
#' \code{log_tau_cpue} for the second stratum but estimate the \code{log_tau_cpue}
#' for the first and third strata they would specify \code{fix_pars =
#' c(2)}.}
#' \item{$upper_bound}{A vector of user-specified values for the upper bound
#' in real space of the estimated extra CPUE CV, where the default upper
#' bound is 1.5. For example, if there are three CPUE survey strata defined
#' in \code{pointer_extra_cpue_cv}, and the user wants to increase the upper
#' bound of the extra cpue CV from 1.5 to 2, they would specify
#' \code{upper_bound = c(2.0, 2.0, 2.0)}.}
#' }
#'
#' @param model_name name of stock or other identifier for REMA model
Expand Down
38 changes: 22 additions & 16 deletions R/set_defaults.R
Original file line number Diff line number Diff line change
Expand Up @@ -38,15 +38,15 @@ set_defaults <- function(input, admb_re = NULL) {
data$pmu_log_q <- NA
data$psig_log_q <- NA

data$extra_biomass_cv <- 0
data$extra_cpue_cv <- 0
# data$extra_biomass_cv <- 0
# data$extra_cpue_cv <- 0

data$tau_biomass_upper <- rep(1.5, length(unique(data$pointer_extra_biomass_cv)))
data$tau_cpue_upper <- rep(1.5, length(unique(data$pointer_extra_cpue_cv)))
# data$tau_biomass_upper <- rep(1.5, length(unique(data$pointer_extra_biomass_cv)))
# data$tau_cpue_upper <- rep(1.5, length(unique(data$pointer_extra_cpue_cv)))

# process error and scaling parameters
par$log_PE <- rep(1, length(unique(data$pointer_PE_biomass)))
par$log_q <- rep(1, length(unique(data$pointer_q_cpue)))
par$log_PE <- rep(0, length(unique(data$pointer_PE_biomass)))
par$log_q <- rep(0, length(unique(data$pointer_q_cpue)))

# tweedie parameters starting vals
# tweedie_p_init = 1.6 # based on Dave Warton (ref?)
Expand All @@ -61,9 +61,15 @@ set_defaults <- function(input, admb_re = NULL) {
par$logit_tweedie_p <- rep(0.4054651, 2)
}

# extra CV for biomass or CPUE survey (-Inf = 0 in logit space)
par$logit_tau_biomass <- rep(-Inf, length(unique(data$pointer_extra_biomass_cv)))
par$logit_tau_cpue <- rep(-Inf, length(unique(data$pointer_extra_cpue_cv)))
# # extra CV for biomass or CPUE survey (-Inf = 0 in logit space)
# par$logit_tau_biomass <- rep(-Inf, length(unique(data$pointer_extra_biomass_cv)))
# par$logit_tau_cpue <- rep(-Inf, length(unique(data$pointer_extra_cpue_cv)))

# extra CV for biomass or CPUE survey (originally this was logit transformed
# but corrected to a log transformation in June 2024). by default the extra
# cvs are fixed at approximately 0
par$log_tau_biomass <- rep(log(1e-7), length(unique(data$pointer_extra_biomass_cv)))
par$log_tau_cpue <- rep(log(1e-7), length(unique(data$pointer_extra_cpue_cv)))

# if admb_re is provided using read_admb_re(), use log biomass predictions as
# initial values for the model. if not, use linear interpolation to initiate
Expand All @@ -74,11 +80,11 @@ set_defaults <- function(input, admb_re = NULL) {
if(!is.null(admb_re)) {
par$log_biomass_pred <- admb_re$init_log_biomass_pred
} else {
tmp <- data$biomass_obs
tmp[tmp < 0.01] <- NA # just for starting values
par$log_biomass_pred <- log(apply(X = tmp,
MARGIN = 2,
FUN = zoo::na.approx, maxgap = 100, rule = 2))
tmp <- data$biomass_obs
tmp[tmp < 0.01] <- NA # just for starting values
par$log_biomass_pred <- log(apply(X = tmp,
MARGIN = 2,
FUN = zoo::na.approx, maxgap = 100, rule = 2))
}

# set map
Expand All @@ -93,9 +99,9 @@ set_defaults <- function(input, admb_re = NULL) {

map$logit_tweedie_p <- fill_vals(par$logit_tweedie_p, NA)

map$logit_tau_biomass <- fill_vals(par$logit_tau_biomass, NA)
map$log_tau_biomass <- fill_vals(par$log_tau_biomass, NA)

map$logit_tau_cpue <- fill_vals(par$logit_tau_cpue, NA)
map$log_tau_cpue <- fill_vals(par$log_tau_cpue, NA)

map$log_biomass_pred <- as.factor(1:length(map$log_biomass_pred))

Expand Down
Loading

0 comments on commit b1883c8

Please sign in to comment.