diff --git a/DESCRIPTION b/DESCRIPTION index 47e26ce..7546bdd 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: rloadest Type: Package Title: River Load Estimation -Version: 0.4.0 -Date: 2015-02-27 +Version: 0.4.2 +Date: 2015-07-20 Author: Dave Lorenz, Rob Runkel, Laura De Cicco Maintainer: Dave Lorenz Description: Collection of functions to make constituent load estimations @@ -10,13 +10,17 @@ Description: Collection of functions to make constituent load estimations License: CC0 Depends: R (>= 3.0.0), + smwrBase, smwrGraphs, smwrStats, smwrQW Imports: - stats + lubridate, + stats, + segmented Suggests: - smwrBase, - dataRetrieval + dataRetrieval, + EGRET, + survival LazyLoad: yes LazyData: yes diff --git a/NAMESPACE b/NAMESPACE index 6493bd1..f5447c9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,6 @@ -# Generated by roxygen2 (4.0.2): do not edit by hand +# Generated by roxygen2 (4.1.0): do not edit by hand +S3method(censoring,Surv) S3method(coef,loadReg) S3method(fitted,loadReg) S3method(logLik,loadReg) @@ -25,8 +26,14 @@ export(nashSutcliffe) export(predConc) export(predLoad) export(resampleUVdata) +export(seg) +export(segLoadReg) +export(segment) export(selBestModel) export(selBestSubset) +import(lubridate) +import(segmented) +import(smwrBase) import(smwrGraphs) import(smwrQW) import(smwrStats) diff --git a/R/Models.R b/R/Models.R new file mode 100644 index 0000000..d320b0c --- /dev/null +++ b/R/Models.R @@ -0,0 +1,19 @@ +#' Models + +#' A dataset that describes the equivalent formula for each of the +#'predefined model number. +#' @name Models +#' @docType data +#' @usage Models +#' @format Data frame with 9 rows and 2 columns\cr +#'\tabular{lll}{ +#'Name \tab Type \tab Description\cr +#'Number \tab integer \tab The predefined model number\cr +#'Formula \tab factor \tab The equivalent formula for the predefined model number\cr +#'} +#' @keywords datasets +#' @seealso \code{\link{model}} +#' @examples +#' data(Models) +#' print(Models) +NULL diff --git a/R/censoring.Surv.R b/R/censoring.Surv.R new file mode 100644 index 0000000..726f52e --- /dev/null +++ b/R/censoring.Surv.R @@ -0,0 +1,32 @@ +#' Describe Censoring +#' +#' Gets the type of censoring ("none," "left," "multiple") for an object. +#' +#' +#' @param x the object to get the type of censoring. For an object of class +#'"Surv," the type must be "interval." +#' @return A character string "none," "left," or "multiple" describing the type +#'of censoring present in the object. +#' @note This function is mostly used within other functions to determine the +#''best' technique to use for analysis. +#' @keywords censored attribute +#' @examples +#'\dontrun{ +#'library(survival) +#'censoring(Surv(2.3, 2.3, type="interval2")) +#'} +#' @export +#' @method censoring Surv +censoring.Surv <- function(x) { + if(attr(x, "type") != "interval") + stop("Only interval type censoring supported") + # Strip NAs from x + x <- x[!is.na(x)] + if(all(x[, 3] == 1)) { + return("none") + } else if(any(x[, 3] %in% c(0,3))) { + return("multiple") + } + return("left") +} + diff --git a/R/loadReg.R b/R/loadReg.R index 40f147d..1fa9110 100644 --- a/R/loadReg.R +++ b/R/loadReg.R @@ -9,8 +9,14 @@ #'For un- or left-censored data, AMLE is used unless weights are specified in #'the model, then MLE is used, through a call to \code{survreg}. For any other #'censored data, MLE is used.\cr -#'See \code{\link{loadConvFactor}} for details about valid values for -#'\code{flow.units}, \code{con.units} and \code{load.units}. 
+#' +#'Typically, \code{loadReg} expects the response variable to have units of +#'concentration, mass per volume. For these models, see \code{\link{loadConvFactor}} +#'for details about valid values for \code{flow.units}, \code{conc.units} and +#'\code{load.units}. For some applications, like bed load estimation, the response +#'variable can have units of flux, mass per time. For these models, \code{conc.units} +#'can be expressed as any valid \code{load.units} per day. The rate must be expressed +#'in terms of "/d," "/dy," or "/day." #' #' @param formula a formula describing the regression model. See \bold{Details}. #' @param data the data to search for the variables in \code{formula}. @@ -44,14 +50,22 @@ #' flow = "FLOW", dates = "DATES", conc.units="mg/L", #' station="Illinois River at Marseilles, Ill.") #'print(app1.lr) -#' +#' @import smwrBase #' @import smwrQW #' @export loadReg <- function(formula, data, subset, na.action, flow, dates, flow.units="cfs", conc.units="", load.units="kg", time.step="day", station="") { - ## Coding history: - ## 2013May31 DLLorenz Original Coding + ## Function to check if conc.units are in terms of loads + ## returns logical with attribute of loads if TRUE + ckLoad <- function(units) { + # check if rate is /d, /dy, or /day (actually any /d) + retval <- grepl("/d", units) + if(retval) { # strip off rate + attr(retval, "load") <- sub("/d.*", "", units) + } + return(retval) + } ## ## Trap model number specification PredMod <- terms(formula, "model", data = data) @@ -199,41 +213,49 @@ loadReg <- function(formula, data, subset, na.action, flow, dates, } Tdys <- min(Tdifs) if(Tdys < 7) - warning("The minimum spacing between daily loads is ", Tdys, + warning("The minimum spacing between daily loads is ", signif(Tdys, 2L), " days. The time between observations should be at least ", " 7 days to avoid autocorrelation issues.") } else { # Need unit checks too Tdys <- difftime(Time, shiftData(Time), units="days") Tdys <- min(Tdys, na.rm=TRUE) if(Tdys < 7) - warning("The minimum spacing between daily loads is ", Tdys, + warning("The minimum spacing between observed loads is ", signif(Tdys, 2L), " days. 
The time between observations should be at least ", " 7 days to avoid autocorrelation issues.") } - ## OK, construct the concentration fit - if(class(Y) == "lcens") { - cfit <- censReg_AMLE.fit(Y, X, "lognormal") - fit1 <- censReg_AMLE.fit(Y, X[,1L, drop=FALSE], "lognormal") - } else { - cfit <- censReg_MLE.fit(Y, X, rep(1, length(Y)), "lognormal") - fit1 <- censReg_MLE.fit(Y, X[,1L, drop=FALSE], - rep(1, length(Y)), "lognormal") - } - cfit$LLR1 <- fit1$LLR - cfit$call <- call - cfit$terms <- Terms - cfit$na.action <- saved.na.action - cfit$na.message <- na.message - cfit$xlevels <- xlevels - class(cfit) <- "censReg" - ## Not the load model fit - ## Check on concentration units + ## Checks on concentration units if(conc.units == "") { warning("Concentration units assumed to be mg/L") conc.units <- "mg/L" } - CF <- loadConvFactor(flow.units, conc.units, load.units) - Y <- Y * CF * Flow + load.only <- ckLoad(conc.units) + if(load.only) { + cfit <- NULL + ## Prepare for load model fit + CF <- loadUnitConv(attr(load.only, "load"), load.units) + Y <- Y * CF + } else { + ## OK, construct the concentration fit + if(class(Y) == "lcens") { + cfit <- censReg_AMLE.fit(Y, X, "lognormal") + fit1 <- censReg_AMLE.fit(Y, X[,1L, drop=FALSE], "lognormal") + } else { + cfit <- censReg_MLE.fit(Y, X, rep(1, length(Y)), "lognormal") + fit1 <- censReg_MLE.fit(Y, X[,1L, drop=FALSE], + rep(1, length(Y)), "lognormal") + } + cfit$LLR1 <- fit1$LLR + cfit$call <- call + cfit$terms <- Terms + cfit$na.action <- saved.na.action + cfit$na.message <- na.message + cfit$xlevels <- xlevels + class(cfit) <- "censReg" + ## Now prepare the load model fit + CF <- loadConvFactor(flow.units, conc.units, load.units) + Y <- Y * CF * Flow + } if(class(Y) == "lcens") { lfit <- censReg_AMLE.fit(Y, X, "lognormal") fit1 <- censReg_AMLE.fit(Y, X[,1L, drop=FALSE], "lognormal") diff --git a/R/plot.loadReg.R b/R/plot.loadReg.R index 6ad6223..a3343ad 100644 --- a/R/plot.loadReg.R +++ b/R/plot.loadReg.R @@ -147,7 +147,7 @@ plot.loadReg <- function(x, which='All', set.up=TRUE, span=1.0, ...) { if(doPlot[4L]) corGram(dectime(x$Time), Resids) ## Add details of call on regression model to next plots - Mod <- format(x$lfit$call$formula) + Mod <- paste0(as.character(x$lfit$call$formula[2L]), " ~ model(", x$model.no, ")") ## 3rd plot, S-L RSE <- rmse(x$lfit) # now the resid std. 
error if(doPlot[3L]) { diff --git a/R/predConc.R b/R/predConc.R index af9fe4e..fa7f182 100644 --- a/R/predConc.R +++ b/R/predConc.R @@ -39,17 +39,14 @@ #' station="Illinois River at Marseilles, Ill.") #'predConc(app1.lr, app1.calib) #' @useDynLib rloadest estlday +#' @import lubridate #' @export predConc <- function(fit, newdata, by="day", allow.incomplete=FALSE, conf.int=0.95) { - ## Coding history: - ## 2013Jul18 DLLorenz Original Coding from predLoad - ## 2013Dec05 DLLorenz Bug fix - ## 2014Sep23 DLLorenz Missing check on newdata ## ## By options and other preliminary code - if(any(is.na(newdata))) - stop("newdata contains missing values, remove before prediction") + if(is.null(fit$cfit)) + stop("model cannot predict concentration values") if(nrow(newdata) > 176000L) stop("newdata has too many rows, the size limit is 176000") ByOpt <- c("unit", "day") @@ -97,6 +94,8 @@ predConc <- function(fit, newdata, by="day", } else { model.inp <- setXLDat(newdata, flow, dates, Qadj, Tadj, model.no) } + if(any(is.na(model.inp))) + stop("Prediction data contains missing values, remove before prediction") ## Construct the structure for aggregating the data ## Deal with byn == 0 directly in the last estimation section checkDays <- list() # Create empty list for day checking diff --git a/R/predLoad.R b/R/predLoad.R index a933f0a..d743203 100644 --- a/R/predLoad.R +++ b/R/predLoad.R @@ -64,8 +64,6 @@ predLoad <- function(fit, newdata, load.units=fit$load.units, by="total", ## 2014Sep23 DLLorenz Missing check on newdata ## ## By options and other preliminary code - if(any(is.na(newdata))) - stop("newdata contains missing values, remove before prediction") if(nrow(newdata) > 176000L) stop("newdata has too many rows, the size limit is 176000") ByOpt <- c("unit", "day", "month", "water year", "calendar year", "total") @@ -106,12 +104,9 @@ predLoad <- function(fit, newdata, load.units=fit$load.units, by="total", newdata <- aggregate(newdata, list(time.step=gps), mean) attr(newdata[[dates]], "tzone") <- ntz } else { # Must be instantaneous - gps <- format(newdata[[dates]]) - difdays <- range(newdata[[dates]]) - difdays <- difftime(ceiling_date(difdays[2L], unit="day"), - floor_date(difdays[1L], unit="day"), - units="days") - gps.nday <- round(nrow(newdata)/as.numeric(difdays), 6L) # should get to near integer + gps <- format(newdata[[dates]], format="%Y-%m-%d") + numdays <- length(unique(gps)) # the number of days in the dataset + gps.nday <- round(nrow(newdata)/as.numeric(numdays), 6L) # should get to near integer if((gps.nday %% 1) != 0 && !(by %in% c("unit", "day"))) stop("newdata must have the same number of observations each day") gps.nday <- as.integer(gps.nday) @@ -124,6 +119,8 @@ predLoad <- function(fit, newdata, load.units=fit$load.units, by="total", } else { model.inp <- setXLDat(newdata, flow, dates, Qadj, Tadj, model.no) } + if(any(is.na(model.inp))) + stop("Prediction data contains missing values, remove before prediction") ## Construct the structure for aggregating the data ## Deal with byn == 0 directly in the last estimation section checkDays <- list() # Create empty list for day checking diff --git a/R/print.loadReg.R b/R/print.loadReg.R index 09515b9..f98bcf0 100644 --- a/R/print.loadReg.R +++ b/R/print.loadReg.R @@ -8,18 +8,28 @@ #' @param load.only print only the load model and not concentration model results. #' @param \dots further arguments passed to or from other methods. #' @return The object \code{x} is returned invisibly. 
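To make the new flux-unit response path in loadReg concrete, a minimal sketch follows; the bedload data frame and its column names are hypothetical and the model form is illustrative only:

```R
# Hypothetical bed-load example: the response FLUX is already a mass rate
# (tons per day), so conc.units is a load unit per day ("/d", "/dy", or "/day")
library(rloadest)
bed.lr <- loadReg(FLUX ~ model(1), data = bedload,
                  flow = "FLOW", dates = "DATES",
                  conc.units = "tons/day", load.units = "tons",
                  station = "Hypothetical bed-load site")
# No concentration model is fit for flux responses (cfit is NULL), so
# predConc() stops with "model cannot predict concentration values";
# use predLoad() instead.
```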
-#' @note The printed output includes the call, the coefficent table, the -#'estimated residual standard error, the log-likelihood of the model and -#'null model with the attained p-value, and the computational method. -#' NEED more details about what is printed and the difference in output due to arguments +#' @note The printed output replicates the output described in Runkel (2004) and +#'includes a short section summarizing the data, the load model and coefficients, +#'regression statistics, and comparison of observed and estimated loads. If +#'\code{load.only} is set to \code{FALSE}, then similar output is generated for the +#'concentration model. If \code{brief} is \code{FALSE}, then additional descriptions +#'of selected sections of the output are produced. +#' +#'If the estimation method is "MLE," then the estimated loads used in the comparison +#'to observed loads are approximate because they are estimated using MLE, rather than +#'AMLE, which is used for \code{predLoad} and \code{predConc}. The bias is very small +#'when the residual variance is less than 0.5, but can be large when the residual +#'variance is greater than 1. +#' #' @seealso \code{\link{loadReg}} #' @keywords utilities -#' +#' @references +#'Runkel, R.L., Crawford, C.G., and Cohn, T.A., 2004, Load estimator (LOADEST): +#'A FORTRAN program for estimating constituent loads in streams and rivers: +#'U.S. Geological Survey Techniques and Methods book 4, chap. A5, 69 p. #' @export #' @method print loadReg print.loadReg <- function(x, digits=4, brief=TRUE, load.only=brief, ...) { - ## Coding history: - ## 2013May31 DLLorenz Original Coding ## ## Explanations of model terms Explan <- c(lnQ="ln(Q) - center of ln(Q)", @@ -66,7 +76,8 @@ print.loadReg <- function(x, digits=4, brief=TRUE, load.only=brief, ...) { " Constituent Output File Part Ia: Calibration (Load Regression)\n", "----------------------------------------------------------------------\n\n", sep="") - Ncen <- sum(x$lfit$CENSFLAG) + # CENSFLAG is logical for AMLE, integer for MLE, this protects + Ncen <- sum(x$lfit$CENSFLAG != 0) cat(" Number of Observations: ", x$lfit$NOBSC, "\n", "Number of Uncensored Observations: ", x$lfit$NOBSC - Ncen, "\n", @@ -105,7 +116,7 @@ print.loadReg <- function(x, digits=4, brief=TRUE, load.only=brief, ...) { RSQ <- signif(x$lfit$RSQ, digits) RSQtxt <- "R-squared: " } else { - RSQ <- 100*signif(1 - exp(x$lfit$LLR1 - x$lfit$LLR)^(2/x$lfit$NOBSC), digits) + RSQ <- 100*signif(1 - exp((x$lfit$LLR1 - x$lfit$LLR)*2/x$lfit$NOBSC), digits) RSQtxt <- "Generalized R-squared: " } G2 <- signif(2*(x$lfit$LLR - x$lfit$LLR1), digits) @@ -117,20 +128,33 @@ print.loadReg <- function(x, digits=4, brief=TRUE, load.only=brief, ...) { pval <- format(round(pval, 4), scientific=5) ## Compute the PPCC Res <- residuals(x$lfit, type="working", suppress.na.action=TRUE) - ppcc <- censPPCC.test(as.lcens(Res, censor.codes=x$lfit$CENSFLAG)) - cat("\n", x$method, " Regression Statistics\n", - "Residual variance: ", - signif(x$lfit$PARAML[x$lfit$NPAR + 1L], digits), "\n", - RSQtxt, RSQ, " percent\n", - "G-squared: ", G2, " on ", x$lfit$NPAR - 1L, - " degrees of freedom\n", - "P-value: ", pval, "\n", - "Prob. Plot Corr. Coeff. 
(PPCC):\n", - " r = ", round(ppcc$statistic, digits), "\n", - " p-value = ", round(ppcc$p.value, digits), "\n", - "Serial Correlation of Residuals: ", - round(x$lfit$SCORR, digits), "\n\n", - sep="") + if(x$method == "AMLE") { + ppcc <- censPPCC.test(as.lcens(Res, censor.codes=x$lfit$CENSFLAG)) + cat("\n", x$method, " Regression Statistics\n", + "Residual variance: ", + signif(x$lfit$PARAML[x$lfit$NPAR + 1L], digits), "\n", + RSQtxt, RSQ, " percent\n", + "G-squared: ", G2, " on ", x$lfit$NPAR - 1L, + " degrees of freedom\n", + "P-value: ", pval, "\n", + "Prob. Plot Corr. Coeff. (PPCC):\n", + " r = ", round(ppcc$statistic, digits), "\n", + " p-value = ", round(ppcc$p.value, digits), "\n", + "Serial Correlation of Residuals: ", + round(x$lfit$SCORR, digits), "\n\n", + sep="") + } else { # ppcc test cannot be computed for interval or right-censored + cat("\n", x$method, " Regression Statistics\n", + "Residual variance: ", + signif(x$lfit$PARAML[x$lfit$NPAR + 1L], digits), "\n", + RSQtxt, RSQ, " percent\n", + "G-squared: ", G2, " on ", x$lfit$NPAR - 1L, + " degrees of freedom\n", + "P-value: ", pval, "\n", + "Prob. Plot Corr. Coeff. (PPCC) cannot be computed for method MLE\n", + "Serial Correlation of Residuals not computed for method MLE\n", + sep="") + } if(!brief && x$lfit$NPAR > 2L) { # Print the correlation matrixes cat("Correlation Between Explanatory Variables\n", "-----------------------------------------\n\n", sep="") @@ -154,8 +178,13 @@ print.loadReg <- function(x, digits=4, brief=TRUE, load.only=brief, ...) { colnames(vifs) <- "VIF" print(vifs, digits=digits) } - cat("\nComparison of Observed and Estimated Loads\n", - "------------------------------------------\n", sep="") + if(x$method == "AMLE") { + cat("\nComparison of Observed and Estimated Loads\n", + "------------------------------------------\n", sep="") + } else { # method is MLE + cat("\nComparison of Observed and Approximate Estimated Loads\n", + "------------------------------------------------------\n", sep="") + } if(!brief) cat(" The summary statistics and bias diagnostics presented below are based\n", "on a comparison of observed and estimated loads for all dates/times within\n", @@ -251,7 +280,7 @@ print.loadReg <- function(x, digits=4, brief=TRUE, load.only=brief, ...) { sep="") } ## Concentration output - if(!load.only) { + if(!load.only && !is.null(x$cfit)) { if(!brief) cat("--------------------------------------------------------------------------\n", " Constituent Output File Part Ia: Calibration (Concentration Regression)\n", @@ -295,22 +324,40 @@ print.loadReg <- function(x, digits=4, brief=TRUE, load.only=brief, ...) { pval <- format(round(pval, 4), scientific=5) ## Compute the PPCC Res <- residuals(x$cfit, type="response") - ppcc <- censPPCC.test(as.lcens(Res, censor.codes=x$cfit$CENSFLAG)) - cat("\n", x$method, " Regression Statistics\n", - "Residual variance: ", - signif(x$cfit$PARAML[x$cfit$NPAR + 1L], digits), "\n", - RSQtxt, RSQ, " percent\n", - "G-squared: ", G2, " on ", x$cfit$NPAR - 1L, - " degrees of freedom\n", - "P-value: ", pval, "\n", - "Prob. Plot Corr. Coeff. 
(PPCC):\n", - " r = ", round(ppcc$statistic, digits), "\n", - " p-value = ", round(ppcc$p.value, digits), "\n", - "Serial Correlation of Residuals: ", - round(x$cfit$SCORR, digits), "\n\n", - sep="") - cat("\nComparison of Observed and Estimated Concentrations\n", - "---------------------------------------------------\n", sep="") + if(x$method == "AMLE") { + ppcc <- censPPCC.test(as.lcens(Res, censor.codes=x$cfit$CENSFLAG)) + cat("\n", x$method, " Regression Statistics\n", + "Residual variance: ", + signif(x$cfit$PARAML[x$cfit$NPAR + 1L], digits), "\n", + RSQtxt, RSQ, " percent\n", + "G-squared: ", G2, " on ", x$cfit$NPAR - 1L, + " degrees of freedom\n", + "P-value: ", pval, "\n", + "Prob. Plot Corr. Coeff. (PPCC):\n", + " r = ", round(ppcc$statistic, digits), "\n", + " p-value = ", round(ppcc$p.value, digits), "\n", + "Serial Correlation of Residuals: ", + round(x$cfit$SCORR, digits), "\n\n", + sep="") + } else { + cat("\n", x$method, " Regression Statistics\n", + "Residual variance: ", + signif(x$cfit$PARAML[x$cfit$NPAR + 1L], digits), "\n", + RSQtxt, RSQ, " percent\n", + "G-squared: ", G2, " on ", x$cfit$NPAR - 1L, + " degrees of freedom\n", + "P-value: ", pval, "\n", + "Prob. Plot Corr. Coeff. (PPCC) cannot be computed for method MLE\n", + "Serial Correlation of Residuals not be computed for method MLE\n", + sep="") + } + if(x$method == "AMLE") { + cat("\nComparison of Observed and Estimated Concentrations\n", + "---------------------------------------------------\n", sep="") + } else { + cat("\nComparison of Observed and Approximate Estimated Concentrations\n", + "---------------------------------------------------------------\n", sep="") + } if(!brief) cat(" The summary statistics and bias diagnostics presented below are based\n", "on a comparison of observed and estimated concentrations for all dates/times\n", @@ -405,6 +452,6 @@ print.loadReg <- function(x, digits=4, brief=TRUE, load.only=brief, ...) { " 7, 8, 9, and 17 of the LOADEST documentation (Runkel et al., 2004).\n\n", sep="") } - } # End of load.only section + } # End of not load.only section invisible(x) } diff --git a/R/resampleUVdata.R b/R/resampleUVdata.R index 7b2445f..2225da7 100644 --- a/R/resampleUVdata.R +++ b/R/resampleUVdata.R @@ -17,12 +17,16 @@ #' @param max.diff a character string indicating the maximum difference in time to #'sucessfully resample the unit-value data. The default is "2 hours" see #'\code{\link[smwrBase]{mergeNearest}} for details. +#' @param missing.days a characer string indicating what action should be taken for days +#'not present in \code{UVdata}. Must be either "exclude" to remove those days from the output, +#'or "include" to retain them. Can be abbreviated. If \code{missing.days} is "include," +#'then partial days within \code{max.diff} will be included in the output data frame. #' @return A data frame like \code{UVdata} but having a uniform time step. 
#' #' @export resampleUVdata <- function(UVdata, time.step=15, start.date="", end.date="", - max.diff="2 hours") { - # Verify that one column of class POSIXt is in UVdata + max.diff="2 hours", missing.days="exclude") { + # Verify that one column of class POSIXt is in UVdata, and other checks DateTime <- names(UVdata)[sapply(UVdata, function(x) inherits(x, "POSIXt"))] if(length(DateTime) > 1L) stop("Multiple datetime columns in UVdata") @@ -31,8 +35,9 @@ resampleUVdata <- function(UVdata, time.step=15, start.date="", end.date="", # Verify that time step divides an hour if((60 %% time.step) != 0) stop("Invalid time step: ", as.character(time.step)) + missing.days <- match.arg(missing.days, c("exclude", "include")) # OK make a difftime and get timezone info - ts.dt <- as.difftime(time.step, unit="mins") + ts.dt <- as.difftime(time.step, units="mins") tz <- attr(UVdata[[DateTime]], "tzone") # Set start/end times; requires lubridate if(start.date == "") { @@ -47,6 +52,10 @@ } end.date <- end.date - ts.dt ## OK, we are ready to go. + # If necessary retain only days in UVdata + if(missing.days == "exclude") { + days <- unique(as.Date(as.POSIXlt(UVdata[[DateTime]]))) + } # Create the target sequence, force that same number of observations per day targseq <- seq(start.date, end.date, by=ts.dt) targdts <- as.Date(as.POSIXlt(targseq)) @@ -68,5 +77,9 @@ retval <- mergeNearest(retval, DateTime, right=UVdata, dates.right=DateTime, max.diff=max.diff) names(retval)[1L] <- DateTime + # Remove missing days if requested + if(missing.days == "exclude") { + retval <- retval[as.Date(as.POSIXlt(retval[[1L]])) %in% days, ] + } return(retval) } \ No newline at end of file diff --git a/R/rloadest-package.R b/R/rloadest-package.R index 3ecfc89..b5fa18b 100644 --- a/R/rloadest-package.R +++ b/R/rloadest-package.R @@ -1,11 +1,11 @@ #' LOADEST functions for R #' #' \tabular{ll}{ -#' Package: \tab LOADEST\cr +#' Package: \tab rloadest\cr #' Type: \tab Package\cr -#' Version: \tab 0.4.0\cr -#' Date: \tab 2015-02-27\cr -#' License: \tab File\cr +#' Version: \tab 0.4.2\cr +#' Date: \tab 2015-07-20\cr +#' License: \tab CC0\cr #' LazyLoad: \tab yes\cr #' } #' @@ -13,6 +13,8 @@ #'estimating constituent loads in streams and rivers. Some subtle differences #'between the output for LOADEST and rloadest include: #' +#'The least absolute deviation (LAD) method is not supported in rloadest. +#' #'LOADEST uses centered time when computing the sine and cosine terms in model #'numbers 4, 6, 7, 8, and 9, but the functions in rloadest use the actual decimal #'time so that the seasonality can more easily be assessed by the user. @@ -24,7 +26,7 @@ #'the most users of R would recognize from other linear model output rather then #'the printed output from LOADEST. #' -#'Furhtermore, the model building capability in the rloadest functions make easier +#'Furthermore, the model building capability in the rloadest functions makes it easier #'to explore other forms of rating-curve models than LOADEST. #' #' @name rloadest-package diff --git a/R/seg.R b/R/seg.R new file mode 100644 index 0000000..acbe0fa --- /dev/null +++ b/R/seg.R @@ -0,0 +1,18 @@ +#' Load Model +#' +#' Support function for building a segmented rating curve load model. Required +#'in the formula in \code{segLoadReg} to define the segmented model. +#' +#' @param x the data to segment. 
Missing values are permitted and result +#'in corresponding missing values in the output. +#' @param N the number of breaks in the segmented model. +#' +#' @return The data in \code{x} with attributes to build the segmented model. +#' +#' @export +seg <- function(x, N) { + if(missing(N)) + stop("The number of segment breaks must be specified") + attr(x, "N") <- N + return(x) +} diff --git a/R/segLoadReg.R b/R/segLoadReg.R new file mode 100644 index 0000000..2900484 --- /dev/null +++ b/R/segLoadReg.R @@ -0,0 +1,155 @@ +#' Load Estimation +#' +#' Build a segmented rating-curve (regression) model for river load estimation. +#' +#' The left-hand side of the formula can be any numeric variable, just +#'as with \code{lm} or a variable of class "lcens," "mcens," "qw," or "Surv." +#'The response variable must be a column in \code{data}; it cannot be constructed using +#'\code{as.lcens}, \code{as.mcens}, or \code{Surv}. The initial segmented model is +#'based on uncensored data---simple substitution of 1/2 the reporting limit is used +#'for left-censored values and multiply censored values cause the analysis to fail. +#' +#'The first term of the right-hand side must be defined by the \code{seg} function +#'with the number of breaks. The first term may be followed by any number of +#'additional terms. The final model will place the segmented term in the last position +#'and \code{seg} will be replaced by the proper call to \code{segment}. +#' +#' @param formula a formula describing the regression model. See \bold{Details}. +#' @param data the data to search for the variables in \code{formula}. +#' @param subset an expression to select a subset of the data. +#' @param na.action what to do with missing values. +#' @param flow character string indicating the name of the +#'flow column. +#' @param dates character string indicating the name of the +#'date column. +#' @param flow.units character string describing the flow unit. +#' @param conc.units character string describing the concentration +#'unit. +#' @param load.units character string describing the load unit. +#' @param time.step character string describing the time step of +#'the calibration data. +#' @param station character string description of the station. +#' @param trace if logical, then if \code{TRUE} print a short summary of the +#'segmented fit. Otherwise, a character string giving the name under which the +#'segmented model is saved. +#' +#' @return An object of class "loadReg." +#' @seealso \code{\link{censReg}}, \code{\link{seg}}, \code{\link{segment}} +#' @references will need some. 
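The @examples block that follows shows a plain loadReg fit; an untested sketch of a segmented call, assuming one breakpoint in log(FLOW) is wanted for the same example data, might look like:

```R
# seg() must be the first term on the right-hand side; segLoadReg() locates
# the break with segmented() and then refits via loadReg() and segment()
app1.seg <- segLoadReg(Phosphorus ~ seg(log(FLOW), 1),
                       data = app1.calib, flow = "FLOW", dates = "DATES",
                       conc.units = "mg/L",
                       station = "Illinois River at Marseilles, Ill.")
```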
+#' @keywords regression censored loads +#' @examples +#'# From application 1 in the vignettes +#'data(app1.calib) +#'app1.lr <- loadReg(Phosphorus ~ model(1), data = app1.calib, +#' flow = "FLOW", dates = "DATES", conc.units="mg/L", +#' station="Illinois River at Marseilles, Ill.") +#'print(app1.lr) +#' @import segmented +#' @export +segLoadReg <- function(formula, data, subset, na.action, flow, dates, + flow.units="cfs", conc.units="", load.units="kg", + time.step="day", station="", trace=TRUE) { + ## Coding history: + ## 2014Jun02 DLLorenz Original Coding + ## + ## Trap model number specification + PredMod <- terms(formula, "seg", data = data) + indSeg <- attr(PredMod, "specials")$seg + if(is.null(indSeg)) { + stop("You must include the seg function to indicate which variable is segmented") + } else if(indSeg != 2) + stop("The segmented term must be the first one on the right-hand side of the formula") + time.step <- match.arg(time.step, + c("instantaneous", "1 hour", "2 hours", + "3 hours", "4 hours", "6 hours", + "8 hours", "12 hours", "day")) + call <- match.call() + m <- match.call(expand.dots = FALSE) + ## remove components not needed for model.frame + m$flow <- m$dates <- m$flow.units <- m$conc.units <- NULL + m$load.units <- m$time.step <- m$station <- m$trace <- NULL + mkeep <- m + m[[1L]] <- as.name("model.frame") + # Extract the column to segment + m <- eval(m, parent.frame()) + cm1 <- censoring(m[[1L]]) + if(inherits(m[[1L]], "qw")) { + m[[1L]] <- as.numeric(m[[1L]]) # Will fail on multiple censoring + } else if(inherits(m[[1L]], "mcens")) { + if(cm1 == "none") { + m[[1L]] <- m[[1L]]@.data[, 1L] + } else if(cm1 == "left") { + m[[1L]] <- ifelse(m[[1L]]@censor.codes == -1L, m[[1L]]@.data[, 1L]/2, m[[1L]]@.data[, 1L]) + warning("Simple substitution used for left-censored values") + } else { + stop("multiple censoring not supported") + } + } else if(inherits(m[[1L]], "lcens")) { + if(cm1 == "none") { + m[[1L]] <- m[[1L]]@.data[, 1L] + } else { + m[[1L]] <- ifelse(m[[1L]]@censor.codes, m[[1L]]@.data[, 1L]/2, m[[1L]]@.data[, 1L]) + warning("Simple substitution used for left-censored values") + } + } else if(inherits(m[[1L]], "Surv")) { + if(cm1 == "none") { + m[[1L]] <- m[[1L]][, 1L] # Surv objects are matrices, not S4 objects + } else if(cm1 == "left") { + m[[1L]] <- ifelse(m[[1L]][, 3L] == 2, m[[1L]][, 1L]/2, m[[1L]][, 1L]) + warning("Simple substitution used for left-censored values") + } else { + stop("multiple censoring not supported") + } + } # Anything else must be purely numeric + segVar <- m[[2L]] + segTrm <- names(m)[2L] + segNam <- make.names(names(m)[2L]) + segNam <- gsub(".", "", segNam, fixed=TRUE) + segDat <- cbind(segVar, m) + names(segDat)[1L] <- segNam + # transform response + respTrm <- names(m)[1L] + segDat[[respTrm]] <- log(segDat[[respTrm]]) + # Construct lm model to find breaks + m <- mkeep + m$data <- segDat + m[[1L]] <- as.name("lm") + segN <- attr(segVar, "N") + segBrk <- quantile(segVar, probs=(seq(segN)/(segN+1))) + # Modify formula with single name rather than what was called + form <- format(m$formula) + form <- gsub(segTrm, segNam, form, fixed=TRUE) + m[[2]] <- as.formula(form) # this should be the term in the formula + m <- eval(m, parent.frame()) # The linear regression model + lmAIC <- AIC(m) + # OK, try segmented + m <- segmented(m, seg.Z=as.formula(paste("~", segNam)), + psi=segBrk) + # Deal with the segmented model + if(is.logical(trace)) { + if(trace) { + cat("Segmented regression results:\n AIC:\n") + print(c(lm=lmAIC, sm=AIC(m)), digits=6) + cat("Breakpoints (psi):\n") + psi <- 
m$psi + row.names(psi) <- substring(row.names(psi), 1, 4) + print(signif(psi, 4)) + } + } else { + assign(trace, m, pos=1L) + cat("Segmented model saved as ", trace, "\n", sep="") + } + # Construct new call using segmented and psi replacing seg and N + segBrk <- signif(m$psi[, 2L], 4) + segBrk <- paste(segBrk, collapse=",") + call[[1L]] <- as.name("loadReg") + form <- as.formula(call$formula) + segLstCom <- gregexpr(",", segTrm)[[1L]] + segLstCom <- segLstCom[length(segLstCom)] - 1 + newTrm <- paste("segment", substring(segTrm, 4, segLstCom), + ", c(", segBrk, "))", sep="") + form <- update(form, as.formula(paste("~. - ", segTrm, " + ", newTrm, sep=""))) + call$formula <- form + call$trace <- NULL + return(eval(call)) +} diff --git a/R/segment.R b/R/segment.R new file mode 100644 index 0000000..254cd4f --- /dev/null +++ b/R/segment.R @@ -0,0 +1,26 @@ +#' Segmented Model +#' +#' Computes the basis for a segmented model. Used primarily in a linear +#'regression or load model. +#' +#' @param x the data to segment. Missing values are permitted and result +#'in corresponding missing values in the output. +#' @param psi a numeric vector of the breakpoints. +#' +#' @return A matrix containing a column named X of the data in \code{x} and +#'paired U and P columns for each of the breaks in \code{psi} that form the +#'basis for that segment. +#' +#' @export +segment <- function(x, psi) { + # Set up matrix for segmented regression + N <- length(psi) + u <- matrix(rep(x, N), ncol=N) + # Sweep out the values of psi, and set min to 0 + u <- sweep(u, 2L, psi) + u <- apply(u, 2, function(x) pmax(0,x)) + colnames(u) <- paste("U", seq(N), sep="") + p <- sign(u) + colnames(p) <- paste("P", seq(N), sep="") + return(cbind(X=x, u, p)) +} \ No newline at end of file diff --git a/R/selBestModel.R b/R/selBestModel.R index 845b843..90b5954 100644 --- a/R/selBestModel.R +++ b/R/selBestModel.R @@ -19,6 +19,8 @@ #' @param time.step character string describing the time step of #'the calibration data. #' @param station character string description of the station. +#' @param criterion the criterion to use for model selection, must be +#'one of "AIC," "SPCC," or "AICc." #' #' @return An object of class "loadReg." 
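Looking back at the segment function above, a small worked example of the basis it returns (the values follow directly from the code):

```R
segment(1:6, psi = 3)
#      X U1 P1
# [1,] 1  0  0
# [2,] 2  0  0
# [3,] 3  0  0
# [4,] 4  1  1
# [5,] 5  2  1
# [6,] 6  3  1
# X is the original data, U1 is the excess over the breakpoint,
# pmax(0, x - 3), and P1 = sign(U1) flags values past the break.
```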
#' @seealso \code{\link{censReg}} @@ -35,20 +37,22 @@ #'@export selBestModel <- function(constituent, data, subset, na.action, flow, dates, flow.units="cfs", conc.units="", load.units="kg", - time.step="day", station="") { - ## Coding history: - ## 2013May31 DLLorenz Original Coding + time.step="day", station="", + criterion=c("AIC", "SPCC", "AICc")) { + ## + criterion <- match.arg(criterion) ## ## Modify to make call to loadReg cl <- match.call() cl[[1L]] <- as.name("loadReg") names(cl)[2L] <- "formula" cl[[2L]] <- as.formula(paste(constituent, " ~ model(1)", sep="")) + cl$criterion <- NULL retval <- model <- eval(cl) model.eval <- model$model.eval # Append AICc model.eval$AICc <- AICc(model) - AICbest <- model.eval$AIC + AICbest <- model.eval[[criterion]] ## Supress multiple warnings warn <- options("warn") options(warn=-1L) @@ -58,8 +62,8 @@ selBestModel <- function(constituent, data, subset, na.action, flow, dates, model.tmp <- model$model.eval model.tmp$AICc <- AICc(model) model.eval <- rbind(model.eval, model.tmp) - if(model$model.eval$AIC < AICbest) { - AICbest <- model$model.eval$AIC + if(model.tmp[[criterion]] < AICbest) { + AICbest <- model.tmp[[criterion]] retval <- model } } diff --git a/R/selBestSubset.R b/R/selBestSubset.R index a78abd4..d94c483 100644 --- a/R/selBestSubset.R +++ b/R/selBestSubset.R @@ -22,8 +22,8 @@ #' @param time.step character string describing the time step of #'the calibration data. #' @param station character string description of the station. -#' @param criterion the criterion to use for subset selection, must be either "AIC" -#'or "SPPC." +#' @param criterion the criterion to use for subset selection, must be +#'one of "AIC," "SPCC," or "AICc." #' #' @seealso \code{\link{censReg}}, \code{\link[stats]{step}} #' @return An object of class "loadReg." @@ -37,7 +37,8 @@ #' @export selBestSubset <- function(formula, min.formula=~1, data, subset, na.action, flow, dates, flow.units = "cfs", conc.units = "", load.units = "kg", - time.step = "day", station = "", criterion=c("AIC", "SPPC")) { + time.step = "day", station = "", + criterion=c("AIC", "SPCC", "AICc")) { # First extract the call to construct a data frame for the model criterion <- match.arg(criterion) m <- call <- match.call() @@ -89,13 +90,22 @@ selBestSubset <- function(formula, min.formula=~1, data, subset, na.action, flow m$load.units <- m$time.step <- m$station <- m$criterion <- NULL m$data <- df m$dist <- "lognormal" - k <- if(criterion == "AIC") 2 else log(nrow(df)) + if(criterion == "AIC") { + k <- 2 + correct <- FALSE + } else if(criterion == "SPCC") { + k <- log(nrow(df)) + correct <- FALSE + } else { # Must be AICc + k <- 2 + correct <- TRUE + } m <- eval(m) ## Warn if fewer N than 10*potential params if(m$NOBSC < 10L * (m$NPAR - 1L)) warning("Selected model may be over fit: only ", m$NOBSC, " observations for ", m$NPAR, " possible parameters.") - best <- step(m, scope=min.formula, direction="both", trace=0, k=k) + best <- step(m, scope=min.formula, direction="both", trace=0, k=k, correct=correct) # retain the step wise model selections model.eval <- best$anova names(model.eval)[[6L]] <- criterion # Fix this to represent what was really done diff --git a/README.md b/README.md index a8257ec..546c57a 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,12 @@ This software is provided "AS IS." 
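A minimal usage sketch, reusing the example that ships with the loadReg help:

```R
library(rloadest)
data(app1.calib)
# Fit predefined rating-curve model 1 for phosphorus
app1.lr <- loadReg(Phosphorus ~ model(1), data = app1.calib,
                   flow = "FLOW", dates = "DATES", conc.units = "mg/L",
                   station = "Illinois River at Marseilles, Ill.")
print(app1.lr)
```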
Installation ---------- - install.packages(c("rloadest"), - repos=c("http://usgs-r.github.com","http://cran.us.r-project.org"), - dependencies=TRUE) +```R +install.packages("rloadest", + repos=c("http://owi.usgs.gov/R", + "http://cran.us.r-project.org"), + dependencies = TRUE) +``` + +Linux: [![travis](https://travis-ci.org/USGS-R/rloadest.svg?branch=master)](https://travis-ci.org/USGS-R/rloadest) + diff --git a/data/Models.RData b/data/Models.RData new file mode 100644 index 0000000..6eaecea Binary files /dev/null and b/data/Models.RData differ diff --git a/inst/doc/IncorporatingHysteresis.R b/inst/doc/IncorporatingHysteresis.R new file mode 100644 index 0000000..8994b91 --- /dev/null +++ b/inst/doc/IncorporatingHysteresis.R @@ -0,0 +1,120 @@ +### R code from vignette source 'IncorporatingHysteresis.Rnw' + +################################################### +### code chunk number 1: IncorporatingHysteresis.Rnw:21-32 +################################################### +# Load the necessary packages and the data +library(rloadest) +library(dataRetrieval) +# Get the QW data +Boyer <- "06609500" +BoyerQW <- importNWISqw(Boyer, "00631", + begin.date="2003-10-01", end.date="2012-09-30") +# Now the Daily values data +BoyerQ <- readNWISdv(Boyer, "00060", startDate="2003-10-01", + endDate="2012-09-30") +BoyerQ <- renameNWISColumns(BoyerQ) + + +################################################### +### code chunk number 2: IncorporatingHysteresis.Rnw:40-53 +################################################### +# Compute the hysteresis metrics. +BoyerQ <- transform(BoyerQ, + dQ1 = hysteresis(log(Flow), 1), + dQ3 = hysteresis(log(Flow), 3)) +# Rename the date column in QW so that the data can be merged +names(BoyerQW)[2] <- "Date" +# Merge the data +BoyerData <- mergeQ(BoyerQW, FLOW=c("Flow", "dQ1", "dQ3"), + DATES="Date", Qdata=BoyerQ, Plot=F) +# Create the initial model +Boyer.lr <- selBestModel("NO2PlusNO3.N", BoyerData, + flow="Flow", dates="Date", station=Boyer) +print(Boyer.lr) + + +################################################### +### code chunk number 3: IncorporatingHysteresis.Rnw:58-72 +################################################### +# residuals and hysteresis +setSweave("graph01", 6, 8) +AA.lo <- setLayout(num.rows=2) +setGraph(1, AA.lo) +AA.pl <- xyPlot(BoyerData$dQ1, residuals(Boyer.lr), + ytitle="Residuals", xtitle="1-day lag hysteresis", + xaxis.range=c(-1,3)) +addSLR(AA.pl) +setGraph(2, AA.lo) +AA.pl <- xyPlot(BoyerData$dQ3, residuals(Boyer.lr), + ytitle="Residuals", xtitle="3-day lag hysteresis", + xaxis.range=c(-1,3)) +addSLR(AA.pl) +dev.off() + + +################################################### +### code chunk number 4: IncorporatingHysteresis.Rnw:85-90 +################################################### +# Construct the model +Boyer.lr <- loadReg(NO2PlusNO3.N ~ quadratic(log(Flow)) + + quadratic(dectime(Date)) + fourier(Date) + dQ1, data=BoyerData, + flow="Flow", dates="Date", station=Boyer) +print(Boyer.lr) + + +################################################### +### code chunk number 5: IncorporatingHysteresis.Rnw:95-99 +################################################### +# Plot the overall fit, choose "fourier(Date)cos(k=1)" +setSweave("graph02", 6, 6) +plot(Boyer.lr, which="fourier(Date)cos(k=1)", set.up=FALSE) +dev.off() + + +################################################### +### code chunk number 6: IncorporatingHysteresis.Rnw:108-113 +################################################### +# Construct the revised model +Boyer.lr <- loadReg(NO2PlusNO3.N ~ 
quadratic(log(Flow)) + + dectime(Date) + fourier(Date, 2) + dQ1, data=BoyerData, + flow="Flow", dates="Date", station=Boyer) +print(Boyer.lr) + + +################################################### +### code chunk number 7: IncorporatingHysteresis.Rnw:118-122 +################################################### +# Plot the partial residual graph for fourier(Date, 2)cos(k=1). +setSweave("graph03", 6, 6) +plot(Boyer.lr, which = "fourier(Date, 2)cos(k=1)", set.up=FALSE) +dev.off() + + +################################################### +### code chunk number 8: IncorporatingHysteresis.Rnw:131-135 +################################################### +# Plot the correlogram, plot number 4. +setSweave("graph04", 6, 6) +plot(Boyer.lr, which = 4, set.up=FALSE) +dev.off() + + +################################################### +### code chunk number 9: IncorporatingHysteresis.Rnw:144-148 +################################################### +# Plot the S-L graph. +setSweave("graph05", 6, 6) +plot(Boyer.lr, which = 3, set.up=FALSE) +dev.off() + + +################################################### +### code chunk number 10: IncorporatingHysteresis.Rnw:157-161 +################################################### +# Plot the residual Q-normal graph. +setSweave("graph06", 6, 6) +plot(Boyer.lr, which = 5, set.up=FALSE) +dev.off() + + diff --git a/inst/doc/IncorporatingHysteresis.Rnw b/inst/doc/IncorporatingHysteresis.Rnw new file mode 100644 index 0000000..657d864 --- /dev/null +++ b/inst/doc/IncorporatingHysteresis.Rnw @@ -0,0 +1,183 @@ +\documentclass{article} +\parskip 6pt +%\VignetteIndexEntry{Incorporating Hysteresis} +%\VignetteDepends{rloadest} + +\begin{document} +\SweaveOpts{concordance=TRUE} +\raggedright + +\title{Incorporating Hysteresis in a Load Model} + +\author{Dave Lorenz} + +\maketitle + +This example illustrates how to set up and incorporate hysteresis into a load model. Hysteresis occurs when concentrations (and thereby loads) are different on rising and falling limbs of an event hydrograph for the same magnitude of streamflow. + +This example extends the analysis of Garrett (2012) of nitrate plus nitrite loads in the Boyer River at Logan, Iowa, USGS gaging station 06609500. The original time frame for the analysis was for water years 2004 through 2008. This example extends the analysis through water year 2012. Loads will not be estimated from this model---it demonstrates only the building and evaluation phases. + + +<<>>= +# Load the necessary packages and the data +library(rloadest) +library(dataRetrieval) +# Get the QW data +Boyer <- "06609500" +BoyerQW <- importNWISqw(Boyer, "00631", + begin.date="2003-10-01", end.date="2012-09-30") +# Now the Daily values data +BoyerQ <- readNWISdv(Boyer, "00060", startDate="2003-10-01", + endDate="2012-09-30") +BoyerQ <- renameNWISColumns(BoyerQ) +@ + +\eject +\section{Compute the Hysteresis Variables and Merge the Data} + +There is no direct diagnostic test to determine if hysteresis is important. The best way to assess any hysteresis is to compute the hysteresis metric and plot residuals against that metric. The first section of code will replicate that effort, beginning with the ``best'' predefined model rather than simply replicating the model used by Garrett (2012). Garrett used a 1-day lag. This example will use that and add a 3-day lag metric. The function that computes the metric is \texttt{hysteresis} in \textbf{smwrBase}. + +<<>>= +# Compute the hysteresis metrics. 
+BoyerQ <- transform(BoyerQ, + dQ1 = hysteresis(log(Flow), 1), + dQ3 = hysteresis(log(Flow), 3)) +# Rename the date column in QW so that the data can be merged +names(BoyerQW)[2] <- "Date" +# Merge the data +BoyerData <- mergeQ(BoyerQW, FLOW=c("Flow", "dQ1", "dQ3"), + DATES="Date", Qdata=BoyerQ, Plot=F) +# Create the initial model +Boyer.lr <- selBestModel("NO2PlusNO3.N", BoyerData, + flow="Flow", dates="Date", station=Boyer) +print(Boyer.lr) +@ + +The model selected was number 9, which includes second-order flow and time terms and the first-order seasonal terms. Reviewing the table of model evaluation criteria, model number 2 had a very small value for AIC and the second smallest for SPPC. Model number 2 would have been the equivalent starting model in Garrett (2012), including only the second-order flow terms. The printed results indicate good bias statistics, but the PPCC p-value is much less than 0.05. The next section of code illustrates plotting the residuals and hysteresis metrics. The 1-day lag appears to fit better than the 3-day lag. + +<<>>= +# residuals and hysteresis +setSweave("graph01", 6, 8) +AA.lo <- setLayout(num.rows=2) +setGraph(1, AA.lo) +AA.pl <- xyPlot(BoyerData$dQ1, residuals(Boyer.lr), + ytitle="Residuals", xtitle="1-day lag hysteresis", + xaxis.range=c(-1,3)) +addSLR(AA.pl) +setGraph(2, AA.lo) +AA.pl <- xyPlot(BoyerData$dQ3, residuals(Boyer.lr), + ytitle="Residuals", xtitle="3-day lag hysteresis", + xaxis.range=c(-1,3)) +addSLR(AA.pl) +dev.off() +@ +\includegraphics{graph01.pdf} +\paragraph{} + +\textbf{Figure 1.} The residuals versus hysteresis metrics. + +\eject +\section{Construct and Validate the Hysteresis Model} + +The model... + + +<<>>= +# Construct the model +Boyer.lr <- loadReg(NO2PlusNO3.N ~ quadratic(log(Flow)) + + quadratic(dectime(Date)) + fourier(Date) + dQ1, data=BoyerData, + flow="Flow", dates="Date", station=Boyer) +print(Boyer.lr) +@ + +The printed output shows an improved model, but the PPCC test still indicates lack of normality. A review of the linearity of the explanatory variables indicates the need for second-order seasonal terms (figure 2). The sine term is also nonlinear, but not shown. The second-order linear time term will be dropped because it is not significant at the 0.05 level. + +<<>>= +# Plot the overall fit, choose "fourier(Date)cos(k=1)" +setSweave("graph02", 6, 6) +plot(Boyer.lr, which="fourier(Date)cos(k=1)", set.up=FALSE) +dev.off() +@ +\includegraphics{graph02.pdf} +\paragraph{} + +\textbf{Figure 2.} The diagnostic plot for the linearity of a seasonal term. + +Construct the revised model that drops the second-order time term and adds the second-order seasonal terms. The diagnostic plots follow. + +<<>>= +# Construct the revised model +Boyer.lr <- loadReg(NO2PlusNO3.N ~ quadratic(log(Flow)) + + dectime(Date) + fourier(Date, 2) + dQ1, data=BoyerData, + flow="Flow", dates="Date", station=Boyer) +print(Boyer.lr) +@ + +A complete review of the partial residual graphs is not included in this example. Only the partial residual for \texttt{fourier(Date)cos(k=1)} is shown to demonstrate that the linearity has improved. No partial residual plot indicates a serious lack of linearity. + +<<>>= +# Plot the partial residual graph for fourier(Date, 2)cos(k=1). +setSweave("graph03", 6, 6) +plot(Boyer.lr, which = "fourier(Date, 2)cos(k=1)", set.up=FALSE) +dev.off() +@ +\includegraphics{graph03.pdf} +\paragraph{} + +\textbf{Figure 3.} The partial residual for \texttt{fourier(Date, 2)cos(k=1)} graph. + +The correlogram indicates a much better seasonal fit and no long-term lack of fit. 
+ +<<>>= +# Plot the correlogram, plot number 4. +setSweave("graph04", 6, 6) +plot(Boyer.lr, which = 4, set.up=FALSE) +dev.off() +@ +\includegraphics{graph04.pdf} +\paragraph{} + +\textbf{Figure 4.} The correlogram for the revised model. + +For this model, the S-L plot is shown. It shows a slight decrease in heteroscedasticity as the fitted values increase. The decrease is small, likely driven by the 3 largest fitted values; there is very little visual decrease in scatter as the fitted values increase. + +<<>>= +# Plot the S-L graph. +setSweave("graph05", 6, 6) +plot(Boyer.lr, which = 3, set.up=FALSE) +dev.off() +@ +\includegraphics{graph05.pdf} +\paragraph{} + +\textbf{Figure 5.} The S-L graph for the revised model. + +The Q-normal graph shows a non-normal distribution, but without distinctive skewness or kurtosis. + +<<>>= +# Plot the residual Q-normal graph. +setSweave("graph06", 6, 6) +plot(Boyer.lr, which = 5, set.up=FALSE) +dev.off() +@ +\includegraphics{graph06.pdf} +\paragraph{} + +\textbf{Figure 6.} The residual Q-normal graph. + +The non-normality of the residuals poses at least three potential problems: (1) lack of confidence in the confidence limits for regression parameter estimates, (2) bias in the back-transformation corrections for the mean load, and (3) lack of confidence in the confidence intervals of the load estimate. For the first potential problem, the confidence intervals of the parameter estimates and their significance are not critical to load estimation and the non-normality is not so large that this is a major concern (Greene, 2012). Several factors must be considered to address the second potential problem---the magnitude of the bias correction and the measured bias. The actual bias correction factor (BCF) used is the AMLE BCF, which cannot be directly computed, but can be estimated by using the MLE BCF. The MLE BCF for this model is \texttt{exp(0.06958/2) = 1.0354}. For non-normal data, Duan's smearing BCF is often used (Helsel and Hirsch, 2002); the value for that BCF is \texttt{mean(exp(residuals(Boyer.lr))) = 1.0302}. The BCFs are very similar, which suggests that the bias from the back transform could be small. That is confirmed by the very small value for the percent bias in the printed report (0.2405). There is no way to address the third potential problem other than by using bootstrapping methods. Any reported confidence intervals on loads or fluxes should be treated as approximate. + +\begin{thebibliography}{9} + +\bibitem{JG} +Garrett, J.D., 2012, Concentrations, loads, and yields of select constituents from major tributaries of the Mississippi and Missouri Rivers in Iowa, water years 2004--2008: U.S. Geological Survey Scientific Investigations Report 2012--5240, 61 p. + +\bibitem{WG} +Greene, W.H., 2012, Econometric analysis, seventh edition: Upper Saddle River, New Jersey, Prentice Hall, 1198 p. + +\bibitem{HH} +Helsel, D.R., and Hirsch, R.M., 2002, Statistical methods in water resources: U.S. Geological Survey Techniques of Water-Resources Investigations, book 4, chap. A3, 522 p. 
+ +\end{thebibliography} + +\end{document} diff --git a/inst/doc/IncorporatingHysteresis.pdf b/inst/doc/IncorporatingHysteresis.pdf new file mode 100644 index 0000000..8cd49db Binary files /dev/null and b/inst/doc/IncorporatingHysteresis.pdf differ diff --git a/inst/doc/InstantaneousTimeStep.R b/inst/doc/InstantaneousTimeStep.R new file mode 100644 index 0000000..37d1fb7 --- /dev/null +++ b/inst/doc/InstantaneousTimeStep.R @@ -0,0 +1,178 @@ +### R code from vignette source 'InstantaneousTimeStep.Rnw' + +################################################### +### code chunk number 1: InstantaneousTimeStep.Rnw:21-45 +################################################### +# Load the necessary packages and the data +library(rloadest) +library(dataRetrieval) +# What unit values are available? +subset(whatNWISdata("04027000", "uv"), + select=c("parm_cd", "srsname", "begin_date", "end_date")) +# Get the QW data +BadQW <- importNWISqw("04027000", "00940", + begin.date="2011-04-01", end.date="2014-09-30") +# Merge data and time and set timezone (2 steps) +BadQW <- transform(BadQW, dateTime=sample_dt + as.timeDay(sample_tm)) +BadQW <- transform(BadQW, dateTime=setTZ(dateTime, tzone_cd)) +# Now the Unit values data +BadUV <- readNWISuv("04027000", c("00060", "00095", "00300", "63680"), + startDate="2011-04-01", endDate="2014-09-30", tz="America/Chicago") +BadUV <- renameNWISColumns(BadUV) +names(BadUV) +# Strip _Inst off column names +names(BadUV) <- sub("_Inst", "", names(BadUV)) +# Merge the data +BadData <- mergeNearest(BadQW, "dateTime", right=BadUV, dates.right="dateTime", + max.diff="4 hours") +# Rename the left-hand dateTime column +names(BadData)[9] <- "dateTime" + + +################################################### +### code chunk number 2: InstantaneousTimeStep.Rnw:53-55 +################################################### +# Print the number of missing values in each column +sapply(BadData, function(col) sum(is.na(col))) + + +################################################### +### code chunk number 3: InstantaneousTimeStep.Rnw:60-67 +################################################### +# Create and print the candidate model. +BadChloride.lr <- selBestSubset(Chloride ~ log(Flow) + fourier(dateTime) + + log(SpecCond) + log(DO) + log(Turb), data=BadData, + flow="Flow", dates="dateTime", time.step="instantaneous", + station="Bad River near Odanah", criterion="SPCC") + +print(BadChloride.lr) + + +################################################### +### code chunk number 4: InstantaneousTimeStep.Rnw:74-78 +################################################### +# Plot the overall fit, choose plot number 2. +setSweave("graph01", 6, 6) +plot(BadChloride.lr, which = 2, set.up=FALSE) +dev.off() + + +################################################### +### code chunk number 5: InstantaneousTimeStep.Rnw:87-91 +################################################### +# Plot the residual Q-normal graph. +setSweave("graph02", 6, 6) +plot(BadChloride.lr, which = 5, set.up=FALSE) +dev.off() + + +################################################### +### code chunk number 6: InstantaneousTimeStep.Rnw:100-104 +################################################### +# Plot the partial residual graph for log(Turb). +setSweave("graph03", 6, 6) +plot(BadChloride.lr, which = "log(Turb)", set.up=FALSE) +dev.off() + + +################################################### +### code chunk number 7: InstantaneousTimeStep.Rnw:113-120 +################################################### +# Create and print the revised model. 
+BadChloride.lr <- loadReg(Chloride ~ log(Flow) + fourier(dateTime) + + log(SpecCond) + Turb, data=BadData, + flow="Flow", dates="dateTime", time.step="instantaneous", + station="Bad River near Odanah") + +print(BadChloride.lr, load.only=FALSE) + + +################################################### +### code chunk number 8: InstantaneousTimeStep.Rnw:128-132 +################################################### +# Plot the overall fit, choose plot number 2. +setSweave("graph04", 6, 6) +plot(BadChloride.lr, which = 2, set.up=FALSE) +dev.off() + + +################################################### +### code chunk number 9: InstantaneousTimeStep.Rnw:141-145 +################################################### +# Plot the S-L graph. +setSweave("graph05", 6, 6) +plot(BadChloride.lr, which = 3, set.up=FALSE) +dev.off() + + +################################################### +### code chunk number 10: InstantaneousTimeStep.Rnw:154-158 +################################################### +# Plot the residual Q-normal graph. +setSweave("graph06", 6, 6) +plot(BadChloride.lr, which = 5, set.up=FALSE) +dev.off() + + +################################################### +### code chunk number 11: InstantaneousTimeStep.Rnw:168-172 +################################################### +# Plot the partial residual graph for Turb. +setSweave("graph07", 6, 6) +plot(BadChloride.lr, which = "Turb", set.up=FALSE) +dev.off() + + +################################################### +### code chunk number 12: InstantaneousTimeStep.Rnw:186-202 +################################################### +# Extract one day from the UV data +Bad063014 <- subset(BadUV, as.Date(as.POSIXlt(dateTime)) == "2014-06-30") +# Remove the unnecessary surrogates from the data set. +# This reduces the likelihood of missing values in the dataset +Bad063014 <- Bad063014[, c("dateTime", "Flow", "SpecCond", "Turb")] +# Simple check +any(is.na(Bad063014)) +# Estimate concentrations +Bad063014.est <- predConc(BadChloride.lr, Bad063014, by="unit") +# Display the first and last few rows. +head(Bad063014.est) +tail(Bad063014.est) +# The daily mean concentration can also be easily estimated +predConc(BadChloride.lr, Bad063014, by="day") +# Compare to the mean of the unit values: +with(Bad063014.est, mean(Conc)) + + +################################################### +### code chunk number 13: InstantaneousTimeStep.Rnw:214-241 +################################################### +# Extract one month from the UV data, done in two steps +Bad0714 <- subset(BadUV, as.Date(as.POSIXlt(dateTime)) >= "2014-07-01") +Bad0714 <- subset(Bad0714, as.Date(as.POSIXlt(dateTime)) <= "2014-07-31") +# Remove the unnecessary surrogates from the data set. +# This reduces the likelihood of missing values in the dataset +Bad0714 <- Bad0714[, c("dateTime", "Flow", "SpecCond", "Turb")] +# Simple check on each column, how many in each column? +sapply(Bad0714, function(x) sum(is.na(x))) +# Fix each column, using the defaults of fillMissing +Bad0714$SpecCond <- fillMissing(Bad0714$SpecCond) +Bad0714$Turb <- fillMissing(Bad0714$Turb) +# Verify filled values +sapply(Bad0714, function(x) sum(is.na(x))) +# Estimate daily loads +Bad0714.day <- predLoad(BadChloride.lr, Bad0714, by="day") +# Display the first and last few rows. 
+head(Bad0714.day) +tail(Bad0714.day) +# And the month +Bad0714.mon <- predLoad(BadChloride.lr, Bad0714, by="month") +Bad0714.mon +# Compare to the results using the approximate standard error: +# For long periods, the processing time for the exact standard error can be +# very large and it may be desirable to use the approximation. +predLoad(BadChloride.lr, Bad0714, by="month", seopt="app") +# Compare to the mean of the daily values: +with(Bad0714.day, mean(Flux)) + + diff --git a/inst/doc/InstantaneousTimeStep.Rnw b/inst/doc/InstantaneousTimeStep.Rnw new file mode 100644 index 0000000..370f1cf --- /dev/null +++ b/inst/doc/InstantaneousTimeStep.Rnw @@ -0,0 +1,245 @@ +\documentclass{article} +\parskip 6pt +%\VignetteIndexEntry{Instantaneous Time-Step Model} +%\VignetteDepends{rloadest} + +\begin{document} +\SweaveOpts{concordance=TRUE} +\raggedright + +\title{Instantaneous Time-Step Model} + +\author{Dave Lorenz} + +\maketitle + +This example illustrates how to set up and use an instantaneous time-step model. These models are typically used when there is additional explanatory variable information such as surrogate unit values, like specific conductance. The intent is often to model both the concentration or flux at any time and the load over a period of time. + +This example uses data from the Bad River near Odanah, Wisc., USGS gaging station 04027000. The example will build a model of chloride. + + +<<>>= +# Load the necessary packages and the data +library(rloadest) +library(dataRetrieval) +# What unit values are available? +subset(whatNWISdata("04027000", "uv"), + select=c("parm_cd", "srsname", "begin_date", "end_date")) +# Get the QW data +BadQW <- importNWISqw("04027000", "00940", + begin.date="2011-04-01", end.date="2014-09-30") +# Merge data and time and set timezone (2 steps) +BadQW <- transform(BadQW, dateTime=sample_dt + as.timeDay(sample_tm)) +BadQW <- transform(BadQW, dateTime=setTZ(dateTime, tzone_cd)) +# Now the Unit values data +BadUV <- readNWISuv("04027000", c("00060", "00095", "00300", "63680"), + startDate="2011-04-01", endDate="2014-09-30", tz="America/Chicago") +BadUV <- renameNWISColumns(BadUV) +names(BadUV) +# Strip _Inst off column names +names(BadUV) <- sub("_Inst", "", names(BadUV)) +# Merge the data +BadData <- mergeNearest(BadQW, "dateTime", right=BadUV, dates.right="dateTime", + max.diff="4 hours") +# Rename the left-hand dateTime column +names(BadData)[9] <- "dateTime" +@ + +\eject +\section{Build the Instantaneous Time-Step Model} + +The first step in building the model is to determine which of the surrogates are most appropriate to include in the model. There can be many factors that contribute to deciding which explanatory variables to include in the model. From previous experience the user may decide to include or exclude specific surrogates and flow or seasonal terms. For this example, temperature (parameter code 00010) and pH (parameter code 00400) were excluded as they typically have very little influence on chloride concentration. Other factors include the availability of surrogate values. The output in the code below indicates that NTU\_Turb has fewer observations (more missing values) than Turb, and will not be included in the candidate explanatory variables. + +<<>>= +# Print the number of missing values in each column +sapply(BadData, function(col) sum(is.na(col))) +@ + +This example will include the other surrogates and flow and seasonal terms in the candidate model. 
+The code below demonstrates the use of \texttt{selBestSubset} to select the initial candidate model.
+
+<<echo=TRUE>>=
+# Create and print the candidate model.
+BadChloride.lr <- selBestSubset(Chloride ~ log(Flow) + fourier(dateTime) +
+                       log(SpecCond) + log(DO) + log(Turb), data=BadData,
+                     flow="Flow", dates="dateTime", time.step="instantaneous",
+                     station="Bad River near Odanah", criterion="SPCC")
+
+print(BadChloride.lr)
+@
+
+Only log(DO) was dropped from the model. The printed report indicates some potential problems with the regression---the PPCC test indicates that the residuals are not normally distributed, and several variance inflation factors are relatively large, greater than 10. But the bias diagnostics show very little bias in the comparison of the estimated to observed values.
+
+A few selected graphs will help in understanding the issues identified in the printed report and suggest an alternative model. Figure 1 shows the residuals versus fitted graph, which indicates some very large residuals at larger fitted values. It also suggests some heteroscedasticity in the residual pattern.
+
+<<echo=TRUE>>=
+# Plot the overall fit, choose plot number 2.
+setSweave("graph01", 6, 6)
+plot(BadChloride.lr, which = 2, set.up=FALSE)
+dev.off()
+@
+\includegraphics{graph01.pdf}
+\paragraph{}
+
+\textbf{Figure 1.} The residuals versus fitted graph.
+
+The S-L plot is not shown. The residual Q-normal graph indicates the reason for the very low p-value from the PPCC test---the large residual values seen in figure 1 skew the distribution.
+
+<<echo=TRUE>>=
+# Plot the residual Q-normal graph.
+setSweave("graph02", 6, 6)
+plot(BadChloride.lr, which = 5, set.up=FALSE)
+dev.off()
+@
+\includegraphics{graph02.pdf}
+\paragraph{}
+
+\textbf{Figure 2.} The residual Q-normal graph.
+
+A complete review of the partial residual graphs is not included in this example. Only the partial residual for \texttt{log(Turb)} is shown. The graph indicates the lack of fit, especially for the largest values of turbidity. This suggests that the log transform is not appropriate.
+
+<<echo=TRUE>>=
+# Plot the partial residual graph for log(Turb).
+setSweave("graph03", 6, 6)
+plot(BadChloride.lr, which = "log(Turb)", set.up=FALSE)
+dev.off()
+@
+\includegraphics{graph03.pdf}
+\paragraph{}
+
+\textbf{Figure 3.} The partial residual for \texttt{log(Turb)} graph.
+
+Build the revised model, excluding \texttt{log(DO)}, which was dropped in the subset selection procedure, and changing \texttt{log(Turb)} to \texttt{Turb}.
+
+<<echo=TRUE>>=
+# Create and print the revised model.
+BadChloride.lr <- loadReg(Chloride ~ log(Flow) + fourier(dateTime) +
+                       log(SpecCond) + Turb, data=BadData,
+                     flow="Flow", dates="dateTime", time.step="instantaneous",
+                     station="Bad River near Odanah")
+
+print(BadChloride.lr, load.only=FALSE)
+@
+
+The report for the revised model indicates less severe problems than the first candidate model---the p-value for the PPCC test is greater than 0.05, the variance inflation factors are lower although \texttt{log(Flow)} is still greater than 10, and the bias diagnostics from the observed and estimated loads and concentrations are still good.
+
+A review of selected diagnostic plots indicates a much better overall fit. Figure 4 shows the residuals versus fitted graph, which indicates a less severe problem of large residuals at larger fitted values. It also suggests some heteroscedasticity in the residual pattern, as with the first candidate model.
+
+
+<<echo=TRUE>>=
+# Plot the overall fit, choose plot number 2.
+setSweave("graph04", 6, 6)
+plot(BadChloride.lr, which = 2, set.up=FALSE)
+dev.off()
+@
+\includegraphics{graph04.pdf}
+\paragraph{}
+
+\textbf{Figure 4.} The residuals versus fitted graph for the revised model.
+
+For this model, the S-L plot is shown. It shows an increase in heteroscedasticity as the fitted values increase. That heteroscedasticity can introduce bias into the estimated values because the bias correction factor will be a bit too small for the larger values and too large for the smaller values. The potential bias for this model is expected to be small because the residual variance is small, 0.03476 natural log units; therefore the bias correction is very small, less than 2 percent, and the potential change to the bias correction is very small, much less than 1/2 percent.
+
+<<echo=TRUE>>=
+# Plot the S-L graph.
+setSweave("graph05", 6, 6)
+plot(BadChloride.lr, which = 3, set.up=FALSE)
+dev.off()
+@
+\includegraphics{graph05.pdf}
+\paragraph{}
+
+\textbf{Figure 5.} The S-L graph for the revised model.
+
+The residual Q-normal graph shows much better agreement with the normal distribution than the original candidate model---the effect of the lowest residuals is much less.
+
+<<echo=TRUE>>=
+# Plot the residual Q-normal graph.
+setSweave("graph06", 6, 6)
+plot(BadChloride.lr, which = 5, set.up=FALSE)
+dev.off()
+@
+\includegraphics{graph06.pdf}
+\paragraph{}
+
+\textbf{Figure 6.} The residual Q-normal graph for the revised model.
+
+
+A complete review of the partial residual graphs is not included in this example. Only the partial residual for \texttt{Turb} is shown, to compare to the original model. In this case, the untransformed variable appears to fit reasonably well.
+
+<<echo=TRUE>>=
+# Plot the partial residual graph for Turb.
+setSweave("graph07", 6, 6)
+plot(BadChloride.lr, which = "Turb", set.up=FALSE)
+dev.off()
+@
+\includegraphics{graph07.pdf}
+\paragraph{}
+
+\textbf{Figure 7.} The partial residual for \texttt{Turb} graph for the revised model.
+
+\eject
+\section{Instantaneous Concentrations}
+
+Estimating instantaneous concentrations or loads from the model is relatively straightforward. The \texttt{predConc} and \texttt{predLoad} functions will estimate concentrations or loads (actually fluxes) for any time, given explanatory variables with no missing values. This example will focus on a single day, June 30, 2014.
+
+
+
+<<echo=TRUE>>=
+# Extract one day from the UV data
+Bad063014 <- subset(BadUV, as.Date(as.POSIXlt(dateTime)) == "2014-06-30")
+# Remove the unnecessary surrogates from the data set.
+# This reduces the likelihood of missing values in the dataset
+Bad063014 <- Bad063014[, c("dateTime", "Flow", "SpecCond", "Turb")]
+# Simple check
+any(is.na(Bad063014))
+# Estimate concentrations
+Bad063014.est <- predConc(BadChloride.lr, Bad063014, by="unit")
+# Display the first and last few rows.
+head(Bad063014.est)
+tail(Bad063014.est)
+# The daily mean concentration can also be easily estimated
+predConc(BadChloride.lr, Bad063014, by="day")
+# Compare to the mean of the unit values:
+with(Bad063014.est, mean(Conc))
+@
+
+\eject
+\section{Aggregate Loads}
+
+Estimating concentrations or loads by day assumes, but does not require, a consistent number of unit values per day. Both \texttt{predLoad} and \texttt{predConc} assume that an inconsistent number of unit values per day is due to missing values and return missing values for the estimates for days that do not have the average number of observations per day.
+An inconsistent number of observations per day can result from deleted bad values, maintenance, or a change in the frequency of sampling. The data can be resampled to a uniform number per day using the \texttt{resampleUVdata} function, or the check can be suppressed by setting the \texttt{allow.incomplete} argument to \texttt{TRUE}.
+
+Estimating loads for periods longer than one day requires a consistent number of unit values in each day. The consistent number per day is required to keep track of the within-day and between-day variances. The \texttt{resampleUVdata} function can be used to force a consistent number of unit values per day. It is not required for this example, but it is useful when the unit values are not consistent or when there is a change to or from daylight saving time.
+
+Just as with estimating instantaneous values, missing values are not permitted. Missing values can occur with surrogates due to short-term malfunctions, calibration, or long-term malfunctions. Missing values from short-term malfunctions, generally spikes in the data that are removed during processing, or that occur during calibrations can easily be interpolated using the \texttt{fillMissing} function in \textbf{smwrBase}, as illustrated in this example. Longer-term missing values are much more difficult to fix. They require the careful balancing of need, the development of alternate regression models, and possible caveats on the interpretation of loads.
+
+<<echo=TRUE>>=
+# Extract one month from the UV data, done in two steps
+Bad0714 <- subset(BadUV, as.Date(as.POSIXlt(dateTime)) >= "2014-07-01")
+Bad0714 <- subset(Bad0714, as.Date(as.POSIXlt(dateTime)) <= "2014-07-31")
+# Remove the unnecessary surrogates from the data set.
+# This reduces the likelihood of missing values in the dataset
+Bad0714 <- Bad0714[, c("dateTime", "Flow", "SpecCond", "Turb")]
+# Simple check on each column, how many in each column?
+sapply(Bad0714, function(x) sum(is.na(x)))
+# Fix each column, using the defaults of fillMissing
+Bad0714$SpecCond <- fillMissing(Bad0714$SpecCond)
+Bad0714$Turb <- fillMissing(Bad0714$Turb)
+# Verify filled values
+sapply(Bad0714, function(x) sum(is.na(x)))
+# Estimate daily loads
+Bad0714.day <- predLoad(BadChloride.lr, Bad0714, by="day")
+# Display the first and last few rows.
+head(Bad0714.day)
+tail(Bad0714.day)
+# And the month
+Bad0714.mon <- predLoad(BadChloride.lr, Bad0714, by="month")
+Bad0714.mon
+# Compare to the results using the approximate standard error:
+# For long periods, the processing time for the exact standard error can be
+# very long and it may be desirable to use the approximation.
+predLoad(BadChloride.lr, Bad0714, by="month", seopt="app")
+# Compare to the mean of the daily values:
+with(Bad0714.day, mean(Flux))
+@
+
+
+\end{document}
diff --git a/inst/doc/InstantaneousTimeStep.pdf b/inst/doc/InstantaneousTimeStep.pdf
index aac8449..9e219ff 100644
Binary files a/inst/doc/InstantaneousTimeStep.pdf and b/inst/doc/InstantaneousTimeStep.pdf differ
diff --git a/inst/doc/UsingEGRETData.R b/inst/doc/UsingEGRETData.R
new file mode 100644
index 0000000..9480c53
--- /dev/null
+++ b/inst/doc/UsingEGRETData.R
@@ -0,0 +1,107 @@
+### R code from vignette source 'UsingEGRETData.Rnw'
+
+###################################################
+### code chunk number 1: UsingEGRETData.Rnw:23-30
+###################################################
+# Load the necessary packages and the data
+library(survival) # required for Surv
+library(rloadest)
+library(EGRET)
+# Get the QW and daily flow data
+Chop.QW <- Choptank_eList$Sample
+Chop.Q <- Choptank_eList$Daily
+
+
+###################################################
+### code chunk number 2: UsingEGRETData.Rnw:39-43
+###################################################
+# Compute the 7-parameter model.
+Chop.lr <- loadReg(Surv(ConcLow, ConcHigh, type="interval2") ~ model(9),
+  data=Chop.QW, flow="Q", dates="Date", conc.units="mg/L",
+  flow.units="cms", station="Choptank")
+
+
+###################################################
+### code chunk number 3: UsingEGRETData.Rnw:48-52
+###################################################
+# Plot the overall fit
+setSweave("graph01", 6, 6)
+plot(Chop.lr, which=1, set.up=FALSE)
+dev.off()
+
+
+###################################################
+### code chunk number 4: UsingEGRETData.Rnw:60-79
+###################################################
+# Plot the explanatory variable fits
+setSweave("graph02", 6, 9)
+AA.lo <- setLayout(num.rows=3, num.cols=2)
+# Flow and flow squared
+setGraph(1, AA.lo)
+plot(Chop.lr, which="lnQ", set.up=FALSE)
+setGraph(2, AA.lo)
+plot(Chop.lr, which="lnQ2", set.up=FALSE)
+# Time and time squared
+setGraph(3, AA.lo)
+plot(Chop.lr, which="DECTIME", set.up=FALSE)
+setGraph(4, AA.lo)
+plot(Chop.lr, which="DECTIME2", set.up=FALSE)
+# Seasonality
+setGraph(5, AA.lo)
+plot(Chop.lr, which="sin.DECTIME", set.up=FALSE)
+setGraph(6, AA.lo)
+plot(Chop.lr, which="cos.DECTIME", set.up=FALSE)
+dev.off()
+
+
+###################################################
+### code chunk number 5: UsingEGRETData.Rnw:88-93
+###################################################
+# Plot concentration and flow
+setSweave("graph03", 6, 6)
+# Use the average concentration (only one censored value)
+with(Chop.QW, xyPlot(Q, ConcAve, yaxis.log=TRUE, xaxis.log=TRUE))
+dev.off()
+
+
+###################################################
+### code chunk number 6: UsingEGRETData.Rnw:105-119
+###################################################
+# Compute the breakpoint--the seg term must be the first term on
+# the right-hand side.
+Chop.lr <- segLoadReg(ConcAve ~ seg(LogQ, 1) + DecYear +
+  fourier(DecYear, 2),
+  data=Chop.QW, flow="Q", dates="Date", conc.units="mg/L",
+  flow.units="cms", station="Choptank")
+# From the printed output, the breakpoint is 1.994 in natural log units,
+# about 7.4 cms
+# Compute and print the final model
+Chop.lr <- loadReg(Surv(ConcLow, ConcHigh, type="interval2") ~
+  segment(LogQ, 1.994) + DecYear + fourier(DecYear, 2),
+  data=Chop.QW, flow="Q", dates="Date", conc.units="mg/L",
+  flow.units="cms", station="Choptank")
+print(Chop.lr)
+
+
+###################################################
+### code chunk number 7: UsingEGRETData.Rnw:126-143
+###################################################
+# Plot the explanatory variable fits
+setSweave("graph04", 6, 9)
+AA.lo <- setLayout(num.rows=3, num.cols=2)
+# Segmented flow
+setGraph(1, AA.lo)
+plot(Chop.lr, which="segment(LogQ, 1.994)X", set.up=FALSE)
+setGraph(2, AA.lo)
+plot(Chop.lr, which="segment(LogQ, 1.994)U1", set.up=FALSE)
+# Time
+setGraph(3, AA.lo)
+plot(Chop.lr, which="DecYear", set.up=FALSE)
+# Seasonality
+setGraph(5, AA.lo)
+plot(Chop.lr, which="fourier(DecYear, 2)sin(k=2)", set.up=FALSE)
+setGraph(6, AA.lo)
+plot(Chop.lr, which="fourier(DecYear, 2)cos(k=2)", set.up=FALSE)
+dev.off()
+
+
diff --git a/inst/doc/UsingEGRETData.Rnw b/inst/doc/UsingEGRETData.Rnw
new file mode 100644
index 0000000..dd5c1d7
--- /dev/null
+++ b/inst/doc/UsingEGRETData.Rnw
@@ -0,0 +1,163 @@
+\documentclass{article}
+\parskip 6pt
+%\VignetteIndexEntry{Using EGRET Data}
+%\VignetteDepends{rloadest}
+%\VignetteDepends{EGRET}
+%\VignetteDepends{survival}
+
+\begin{document}
+\SweaveOpts{concordance=TRUE}
+\raggedright
+
+\title{Using EGRET Data in an rloadest Model}
+
+\author{Dave Lorenz}
+
+\maketitle
+
+This example illustrates how to set up and use data retrieved and processed for an EGRET analysis (Hirsch and De Cicco, 2015) in an rloadest load model. EGRET includes the statistical algorithm Weighted Regressions on Time, Discharge, and Season (WRTDS), which can compute loads and concentrations. WRTDS uses locally weighted regression on linear time, linear flow (discharge), and first-order sine and cosine terms to model constituent concentrations and fluxes over time and through the range of flow.
+
+This example uses the processed data supplied in the EGRET package, but any data retrieved and processed by the \textit{readNWISDaily}, \textit{readNWISSample}, \textit{readNWISInfo}, and \textit{mergeReport} functions in EGRET can be used. The supplied data are nitrate plus nitrite data collected in the Choptank River near Greensboro, Maryland (USGS site identifier 01491000).
+
+
+<<echo=TRUE>>=
+# Load the necessary packages and the data
+library(survival) # required for Surv
+library(rloadest)
+library(EGRET)
+# Get the QW and daily flow data
+Chop.QW <- Choptank_eList$Sample
+Chop.Q <- Choptank_eList$Daily
+@
+
+\eject
+\section{Compute the Initial rloadest Model}
+
+The 7-parameter model (model number 9) is a typical model for relatively long-term records, longer than about 7 years, and can be a good starting point for building a good model. The water-quality data in the Sample dataset for EGRET are stored in four columns---the minimum value, maximum value, an indicator of censoring, and the average value.
+That format can be converted to a valid response variable for \textit{loadReg} using either \textit{as.mcens} or \textit{Surv}; \textit{Surv} is preferred because if the data are uncensored or left-censored, then the ``AMLE'' method is used rather than the ``MLE'' method,
+which is always used with a response variable of class ``mcens.''
+
+<<echo=TRUE>>=
+# Compute the 7-parameter model.
+Chop.lr <- loadReg(Surv(ConcLow, ConcHigh, type="interval2") ~ model(9),
+  data=Chop.QW, flow="Q", dates="Date", conc.units="mg/L",
+  flow.units="cms", station="Choptank")
+@
+
+One of the first steps in assessing the fit is to look at the diagnostic plots for the linearity of the overall fit and each explanatory variable. The overall fit (figure 1) looks linear, but there are three low outliers and a tendency toward larger scatter at larger predicted values.
+
+<<echo=TRUE>>=
+# Plot the overall fit
+setSweave("graph01", 6, 6)
+plot(Chop.lr, which=1, set.up=FALSE)
+dev.off()
+@
+\includegraphics{graph01.pdf}
+\paragraph{}
+
+\textbf{Figure 1.} The overall fit.
+
+The linearity of the explanatory variables is shown in figure 2. The partial residual plots for flow (lnQ and lnQ2) show nonlinearity in the second order (lnQ2). The partial residual plots for time (DECTIME and DECTIME2) show no nonlinearity, but the second-order term (DECTIME2) shows no trend and can therefore be removed from the model. The partial residual plots for seasonality (sin.DECTIME and cos.DECTIME) show nonlinearity in both terms, suggesting the need for higher order seasonal terms.
+<<echo=TRUE>>=
+# Plot the explanatory variable fits
+setSweave("graph02", 6, 9)
+AA.lo <- setLayout(num.rows=3, num.cols=2)
+# Flow and flow squared
+setGraph(1, AA.lo)
+plot(Chop.lr, which="lnQ", set.up=FALSE)
+setGraph(2, AA.lo)
+plot(Chop.lr, which="lnQ2", set.up=FALSE)
+# Time and time squared
+setGraph(3, AA.lo)
+plot(Chop.lr, which="DECTIME", set.up=FALSE)
+setGraph(4, AA.lo)
+plot(Chop.lr, which="DECTIME2", set.up=FALSE)
+# Seasonality
+setGraph(5, AA.lo)
+plot(Chop.lr, which="sin.DECTIME", set.up=FALSE)
+setGraph(6, AA.lo)
+plot(Chop.lr, which="cos.DECTIME", set.up=FALSE)
+dev.off()
+@
+\includegraphics{graph02.pdf}
+\paragraph{}
+
+\textbf{Figure 2.} The linearity of the explanatory variables.
+
+Figure 3 shows the relation between concentration and flow. The relation is not quadratic, but it appears that there is a distinct change at about 10 cubic meters per second. That relation can be modeled using piecewise linear, or segmented, terms.
+
+<<echo=TRUE>>=
+# Plot concentration and flow
+setSweave("graph03", 6, 6)
+# Use the average concentration (only one censored value)
+with(Chop.QW, xyPlot(Q, ConcAve, yaxis.log=TRUE, xaxis.log=TRUE))
+dev.off()
+@
+\includegraphics{graph03.pdf}
+\paragraph{}
+
+\textbf{Figure 3.} The relation between concentration and flow.
+
+\eject
+\section{Construct the Modified rloadest Model}
+
+The \textit{segLoadReg} function can be used to build a piecewise linear model. It relies on the segmented package, which cannot model censored data, to identify the breakpoints. For the first step, censored values will be approximated by simple substitution; for the final model, the censored values are restored. One other quirk of \textit{segLoadReg} is that the response term must be a variable; it cannot be constructed using \textit{Surv} or any other function. Therefore, the breakpoint for this model will be identified using ConcAve, but the final model will be built using the censoring information.
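+
+As a rough sketch of what the simple substitution amounts to (not run; this assumes the usual EGRET Sample convention that ConcLow is missing for left-censored values and ConcHigh is the reporting limit):
+
+<<echo=TRUE, eval=FALSE>>=
+# Sketch only: reconstruct the simple-substitution response.
+# For censored values (ConcLow is NA), use one-half the reporting
+# limit (ConcHigh); otherwise use the midpoint of ConcLow and ConcHigh.
+ConcSub <- with(Chop.QW, ifelse(is.na(ConcLow),
+  ConcHigh/2, (ConcLow + ConcHigh)/2))
+# ConcSub should match the ConcAve column used below
+@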
+
+<<echo=TRUE>>=
+# Compute the breakpoint--the seg term must be the first term on
+# the right-hand side.
+Chop.lr <- segLoadReg(ConcAve ~ seg(LogQ, 1) + DecYear +
+  fourier(DecYear, 2),
+  data=Chop.QW, flow="Q", dates="Date", conc.units="mg/L",
+  flow.units="cms", station="Choptank")
+# From the printed output, the breakpoint is 1.994 in natural log units,
+# about 7.4 cms
+# Compute and print the final model
+Chop.lr <- loadReg(Surv(ConcLow, ConcHigh, type="interval2") ~
+  segment(LogQ, 1.994) + DecYear + fourier(DecYear, 2),
+  data=Chop.QW, flow="Q", dates="Date", conc.units="mg/L",
+  flow.units="cms", station="Choptank")
+print(Chop.lr)
+@
+
+This segmented model has three variables---with names ending in X, U1, and P1. The coefficient for the variable ending in X is the slope for values of the variable less than the breakpoint, the coefficient for the variable ending in U1 is the change in slope above the breakpoint, and the coefficient for the variable ending in P1 should always be close to 0.
+
+No partial residual plot indicates any nonlinearity. Figure 4 shows five selected partial residual plots.
+
+<<echo=TRUE>>=
+# Plot the explanatory variable fits
+setSweave("graph04", 6, 9)
+AA.lo <- setLayout(num.rows=3, num.cols=2)
+# Segmented flow
+setGraph(1, AA.lo)
+plot(Chop.lr, which="segment(LogQ, 1.994)X", set.up=FALSE)
+setGraph(2, AA.lo)
+plot(Chop.lr, which="segment(LogQ, 1.994)U1", set.up=FALSE)
+# Time
+setGraph(3, AA.lo)
+plot(Chop.lr, which="DecYear", set.up=FALSE)
+# Seasonality
+setGraph(5, AA.lo)
+plot(Chop.lr, which="fourier(DecYear, 2)sin(k=2)", set.up=FALSE)
+setGraph(6, AA.lo)
+plot(Chop.lr, which="fourier(DecYear, 2)cos(k=2)", set.up=FALSE)
+dev.off()
+@
+\includegraphics{graph04.pdf}
+\paragraph{}
+
+\textbf{Figure 4.} Partial residual plots.
+
+\eject
+\section{Further Considerations}
+
+To be continued.
+
+
+\begin{thebibliography}{9}
+
+\bibitem{HD}
+Hirsch, R.M. and De Cicco, L.A., 2015, User guide to Exploration and Graphics for RivEr Trends (EGRET) and dataRetrieval R for hydrologic data (version 2.0, February 2015): U.S. Geological Survey Techniques and Methods book 4, chap. A10, 93 p. Available at http://dx.doi.org/10.3133/tm4A10.
+
+\end{thebibliography}
+
+\end{document}
diff --git a/inst/doc/UsingEGRETData.pdf b/inst/doc/UsingEGRETData.pdf
new file mode 100644
index 0000000..a23c6ef
Binary files /dev/null and b/inst/doc/UsingEGRETData.pdf differ
diff --git a/inst/doc/app1.R b/inst/doc/app1.R
new file mode 100644
index 0000000..24b2a73
--- /dev/null
+++ b/inst/doc/app1.R
@@ -0,0 +1,150 @@
+### R code from vignette source 'app1.Rnw'
+
+###################################################
+### code chunk number 1: app1.Rnw:23-27
+###################################################
+# Load the rloadest package and the data
+library(rloadest)
+data(app1.calib)
+head(app1.calib)
+
+
+###################################################
+### code chunk number 2: app1.Rnw:37-41
+###################################################
+# Create the load model.
+app1.lr <- loadReg(Phosphorus ~ model(1), data = app1.calib, flow = "FLOW", + dates = "DATES", conc.units="mg/L", + station="Illinois River at Marseilles, Ill.") + + +################################################### +### code chunk number 3: app1.Rnw:49-50 +################################################### +print(app1.lr, brief=FALSE, load.only=FALSE) + + +################################################### +### code chunk number 4: app1.Rnw:78-80 +################################################### +predLoad(app1.lr, newdata = app1.calib, load.units="tons", by="total", + print=TRUE) + + +################################################### +### code chunk number 5: app1.Rnw:89-93 +################################################### +# setSweave is required for the vignette. +setSweave("app1_01", 5, 5) +plot(app1.lr, which=1, set.up=FALSE) +graphics.off() + + +################################################### +### code chunk number 6: app1.Rnw:95-97 +################################################### +cat("\\includegraphics{app1_01.pdf}\n") +cat("\\paragraph{}\n") + + +################################################### +### code chunk number 7: app1.Rnw:104-108 +################################################### +# setSweave is required for the vignette. +setSweave("app1_02", 5, 5) +plot(app1.lr, which=2, set.up=FALSE) +graphics.off() + + +################################################### +### code chunk number 8: app1.Rnw:110-112 +################################################### +cat("\\includegraphics{app1_02.pdf}\n") +cat("\\paragraph{}\n") + + +################################################### +### code chunk number 9: app1.Rnw:119-123 +################################################### +# setSweave is required for the vignette. +setSweave("app1_03", 5, 5) +plot(app1.lr, which=3, set.up=FALSE) +graphics.off() + + +################################################### +### code chunk number 10: app1.Rnw:125-127 +################################################### +cat("\\includegraphics{app1_03.pdf}\n") +cat("\\paragraph{}\n") + + +################################################### +### code chunk number 11: app1.Rnw:134-138 +################################################### +# setSweave is required for the vignette. +setSweave("app1_04", 5, 5) +plot(app1.lr, which=4, set.up=FALSE) +graphics.off() + + +################################################### +### code chunk number 12: app1.Rnw:140-142 +################################################### +cat("\\includegraphics{app1_04.pdf}\n") +cat("\\paragraph{}\n") + + +################################################### +### code chunk number 13: app1.Rnw:149-153 +################################################### +# setSweave is required for the vignette. +setSweave("app1_05", 5, 5) +plot(app1.lr, which=5, set.up=FALSE) +graphics.off() + + +################################################### +### code chunk number 14: app1.Rnw:155-157 +################################################### +cat("\\includegraphics{app1_05.pdf}\n") +cat("\\paragraph{}\n") + + +################################################### +### code chunk number 15: app1.Rnw:164-168 +################################################### +# setSweave is required for the vignette. 
+setSweave("app1_06", 5, 5)
+plot(app1.lr, which=6, set.up=FALSE)
+graphics.off()
+
+
+###################################################
+### code chunk number 16: app1.Rnw:170-172
+###################################################
+cat("\\includegraphics{app1_06.pdf}\n")
+cat("\\paragraph{}\n")
+
+
+###################################################
+### code chunk number 17: app1.Rnw:181-190
+###################################################
+# Create and print the revised load model.
+app1.lr7 <- loadReg(Phosphorus ~ model(7), data = app1.calib, flow = "FLOW",
+  dates = "DATES", conc.units="mg/L",
+  station="Illinois River at Marseilles, Ill.")
+print(app1.lr7)
+# setSweave is required for the vignette.
+setSweave("app1_07", 5, 5)
+plot(app1.lr7, which=4, set.up=FALSE)
+graphics.off()
+
+
+###################################################
+### code chunk number 18: app1.Rnw:192-194
+###################################################
+cat("\\includegraphics{app1_07.pdf}\n")
+cat("\\paragraph{}\n")
+
+
diff --git a/inst/doc/app1.Rnw b/inst/doc/app1.Rnw
new file mode 100644
index 0000000..53a49f9
--- /dev/null
+++ b/inst/doc/app1.Rnw
@@ -0,0 +1,201 @@
+\documentclass{article}
+\parskip 3pt
+%\VignetteIndexEntry{Analysis of an Uncensored Constituent using a Predefined Model}
+%\VignetteDepends{rloadest}
+
+\begin{document}
+\SweaveOpts{concordance=TRUE}
+\raggedright
+\parindent 30 pt
+
+\title{Application 1: Analysis of an Uncensored Constituent using a Predefined Model}
+
+\author{Dave Lorenz}
+
+\maketitle
+
+This example illustrates the format of the input datasets and the format of the calls to build a predefined rating-curve model and to make estimates in rloadest. A predefined model that describes the linear relation between the log of constituent load and log streamflow is built and used for load estimation.
+
+The data used in this application were collected from the Illinois River at Marseilles, Illinois (p. 257, Helsel and Hirsch, 2002). Ninety-six water-quality samples for total phosphorus collected between November 1974 and April 1985 are used in conjunction with observed streamflow data to build the regression model, and the calibration data are used to estimate loads, as was done in Helsel and Hirsch (2002) and Runkel and others (2004).
+
+Part 2 illustrates the diagnostic graphs that can be used to improve the model and offers a step-by-step approach to building a calibrated model.
+
+<<echo=TRUE>>=
+# Load the rloadest package and the data
+library(rloadest)
+data(app1.calib)
+head(app1.calib)
+@
+
+\eject
+\section{Build the Model}
+
+The \texttt{loadReg} function is used to build the rating-curve model for constituent load estimation. The basic form of the call to \texttt{loadReg} is similar to the call to \texttt{lm} in that it requires a formula and data source. The response variable in the formula is the constituent concentration, which is converted to load per day (flux) based on the units of concentration and the units of flow. The \texttt{conc.units}, \texttt{flow.units}, and \texttt{load.units} arguments to \texttt{loadReg} define the conversion. For these data, the concentration units (\texttt{conc.units}) are "mg/L", the flow units are "cfs" (the default), and the load units for the model are "kg" (also the default). If \texttt{conc.units} is not set, they are assumed to be "mg/L" and a warning is issued. Two additional pieces of information are required for \texttt{loadReg}---the names of the flow column and the dates column.
+A final option, the station identifier, can also be specified.
+
+Predefined models can easily be constructed using the \texttt{model} function in the formula. For the call to \texttt{loadReg}, only the model number is needed---\texttt{loadReg} automatically constructs the required input. This example uses model number 1. The model numbers match the terms in Runkel and others (2004), but the order is different---decimal time terms precede seasonal time terms.
+
+<<echo=TRUE>>=
+# Create the load model.
+app1.lr <- loadReg(Phosphorus ~ model(1), data = app1.calib, flow = "FLOW",
+  dates = "DATES", conc.units="mg/L",
+  station="Illinois River at Marseilles, Ill.")
+@
+
+\eject
+\section{Print the Model Report}
+
+An abbreviated form of the model report can be printed simply by typing the name of the model object (\texttt{app1.lr} in this case) in the R Console window. A more complete form that closely matches the output from LOADEST can be obtained by using the \texttt{print} function as shown below.
+
+<<echo=TRUE>>=
+print(app1.lr, brief=FALSE, load.only=FALSE)
+@
+
+Aside from the cosmetic differences, there will be some differences in the actual numeric output. Major differences are listed below.
+
+LOADEST prints a modified form of AIC and SPCC, whereas the AIC and SPCC computed by this version are consistent with AIC and BIC computed for the same model using different methods, like simple linear regression (using \texttt{lm}) in this case of no censoring.
+
+The format for the model output matches the general format for regression model output in \texttt{R} rather than the format in LOADEST. It is expected that users of rloadest will be familiar with the general format for regression model output in \texttt{R}.
+
+This version prints G-squared, which is the test statistic for the overall model fit, and its attained p-value.
+
+Finally, the summary statistics of loads and concentrations are based on the units defined in the call to \texttt{loadReg} rather than the specified output in LOADEST.
+
+\eject
+\section{Estimate Loads}
+
+Unlike LOADEST, rloadest requires the user to build the rating-curve model before estimating loads. For this application, we will follow the general layout of LOADEST and estimate loads directly from the model created earlier in this application.
+
+The \texttt{predLoad} function is used to estimate loads. It estimates loads in units per day, which is referred to as flux in rloadest. The arguments for \texttt{predLoad} are \texttt{fit}, the model output from \texttt{loadReg}; \texttt{newdata}, the estimation dataset; \texttt{load.units}, the load units for the estimates, which are taken from the model output if not specified; \texttt{by}, a character string indicating how to aggregate the load estimates; \texttt{seopt}, how to compute the standard error of the load; \texttt{allow.incomplete}, a logical value that indicates whether or not to allow incomplete periods to be estimated; and \texttt{print}, indicating whether to print a summary.
+
+Unlike the \texttt{predict} function in base \texttt{R}, \texttt{newdata} is required. The columns in \texttt{newdata} must match the column names in the calibration dataset. For predefined models, the column names for dates and flow must match.
+
+The \texttt{by} argument must be "unit," "day," "month," "water year," "calendar year," "total," or the name of a grouping column in \texttt{newdata}. The "unit" option is not available in version 0.1.
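+
+As a sketch of the grouping-column option (not run; the \texttt{Month} column is hypothetical, not part of \texttt{app1.calib}, and is added only for illustration):
+
+<<echo=TRUE, eval=FALSE>>=
+# Sketch only: aggregate loads by a user-defined grouping column.
+# Month is a hypothetical column added for this illustration.
+app1.calib$Month <- months(app1.calib$DATES)
+predLoad(app1.lr, newdata = app1.calib, by = "Month")
+@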
+
+The argument \texttt{allow.incomplete} is not fully implemented in version 0.1.
+
+Application 1 in LOADEST uses the identical data for estimation as was used for calibration. This application will use the same dataset. The call in the \texttt{R} code below simply prints the results and discards the data frame that is calculated.
+
+<<echo=TRUE>>=
+predLoad(app1.lr, newdata = app1.calib, load.units="tons", by="total",
+  print=TRUE)
+@
+
+\eject
+\section{Part 2, Diagnostic Plots}
+
+The rloadest package contains a \texttt{plot} function that creates diagnostic plots of the load model. Most often the user will just enter \texttt{plot(app1.lr)} (for this example) in the R Console window to generate the full suite of plots, but this example application will generate each plot individually. In general, the user will not need to set up a graphics device, but for this vignette the graphics device must be set up for each graph.
+
+Figure 1 is related to figure 7 in Runkel and others (2004) because there is only a single explanatory variable. Figure 1 shows the AMLE regression line as a dashed line, and the solid line is a LOWESS smooth curve. The LOWESS curve agrees very well with the regression line.
+<<echo=TRUE>>=
+# setSweave is required for the vignette.
+setSweave("app1_01", 5, 5)
+plot(app1.lr, which=1, set.up=FALSE)
+graphics.off()
+@
+<<results=tex, echo=FALSE>>=
+cat("\\includegraphics{app1_01.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 1.} The rating-curve regression model.
+
+\eject
+Figure 2 is the same as figure 8 in Runkel and others (2004). The horizontal dashed line is at zero and the solid line is the LOWESS smooth. The LOWESS smooth is very close to the zero line and indicates no lack of fit.
+<<echo=TRUE>>=
+# setSweave is required for the vignette.
+setSweave("app1_02", 5, 5)
+plot(app1.lr, which=2, set.up=FALSE)
+graphics.off()
+@
+<<results=tex, echo=FALSE>>=
+cat("\\includegraphics{app1_02.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 2.} The residuals versus fit for the regression model.
+
+\eject
+Figure 3 is a scale-location (S-L) graph that is useful for assessing heteroscedasticity of the residuals. The horizontal dashed line is the expected value of the square root of the absolute value of the residuals and the solid line is the LOWESS smooth. Even though there is a small slope in the LOWESS line, it is not enough to cause concern for unequal variance in the estimates.
+<<echo=TRUE>>=
+# setSweave is required for the vignette.
+setSweave("app1_03", 5, 5)
+plot(app1.lr, which=3, set.up=FALSE)
+graphics.off()
+@
+<<results=tex, echo=FALSE>>=
+cat("\\includegraphics{app1_03.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 3.} The scale-location graph for the regression model.
+
+\eject
+The correlogram in figure 4 is an adaptation of the correlogram from time-series analysis, which deals with regular samples. The horizontal dashed line is the zero value and the solid line is a kernel smooth rather than a LOWESS line. The kernel smooth gives a better fit in this case. The solid line should be very close to the horizontal line. In this case, because the solid line is consistently above the horizontal line for more than 1 year, we have concern for a lack of fit over time---a linear time term should be added to the model. There is also a slight regular pattern with a higher line at 0 and 1 and a low line at about 0.5. This might suggest a seasonal lack of fit.
+<<echo=TRUE>>=
+# setSweave is required for the vignette.
+setSweave("app1_04", 5, 5)
+plot(app1.lr, which=4, set.up=FALSE)
+graphics.off()
+@
+<<results=tex, echo=FALSE>>=
+cat("\\includegraphics{app1_04.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 4.} The correlogram from the regression model.
+
+\eject
+Figure 5 is the same as figure 9 in Runkel and others (2004), except that figure 5 shows the standardized residuals, which are assumed to have a standard deviation of 1. The solid line is the theoretical fit of a mean of 0 and standard deviation of 1. The visual appearance of figure 5 confirms the results of the PPCC test in the printed output---the residuals are reasonably normal in distribution.
+<<echo=TRUE>>=
+# setSweave is required for the vignette.
+setSweave("app1_05", 5, 5)
+plot(app1.lr, which=5, set.up=FALSE)
+graphics.off()
+@
+<<results=tex, echo=FALSE>>=
+cat("\\includegraphics{app1_05.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 5.} The Q-normal plot of the residuals.
+
+\eject
+Figure 6 is an extended box plot---a truncated box plot, at the 5th and 95th percentiles, that shows the individual values larger than the 95th percentile and smaller than the 5th percentile. The box plots in figure 6 show the distributions of the actual and estimated loads. The appearance of these box plots agrees with what is expected---the range of the estimated values is a little smaller than the range of the actual values, because of "regression to the mean," and the location is similar---the median and quartiles match reasonably well. This figure confirms the bias diagnostics section of the printed report.
+<<echo=TRUE>>=
+# setSweave is required for the vignette.
+setSweave("app1_06", 5, 5)
+plot(app1.lr, which=6, set.up=FALSE)
+graphics.off()
+@
+<<results=tex, echo=FALSE>>=
+cat("\\includegraphics{app1_06.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 6.} Box plot comparing estimated and observed values.
+
+\eject
+\section{Add Linear and Seasonal Time to the Model}
+
+The correlogram (fig. 4) suggested that at least one reason for the relatively large value of the serial correlation (0.2303) is a lack of fit over time and possibly a seasonal component. Model number 7 includes linear flow, linear time, and seasonal time. Note the brevity of the brief version (default) of the report. The correlogram, fig. 7, does not show a regular pattern that might indicate any lack of fit over time. The remaining diagnostic plots are not displayed in this vignette.
+<<echo=TRUE>>=
+# Create and print the revised load model.
+app1.lr7 <- loadReg(Phosphorus ~ model(7), data = app1.calib, flow = "FLOW",
+  dates = "DATES", conc.units="mg/L",
+  station="Illinois River at Marseilles, Ill.")
+print(app1.lr7)
+# setSweave is required for the vignette.
+setSweave("app1_07", 5, 5)
+plot(app1.lr7, which=4, set.up=FALSE)
+graphics.off()
+@
+<<results=tex, echo=FALSE>>=
+cat("\\includegraphics{app1_07.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 7.} The correlogram from the revised regression model.
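+
+The revised model can be used for estimation in the same way as the original model; a minimal sketch (not run), repeating the earlier total-load call:
+
+<<echo=TRUE, eval=FALSE>>=
+# Sketch only: total load from the revised model, for comparison with
+# the estimate from model number 1 earlier in this application.
+predLoad(app1.lr7, newdata = app1.calib, load.units="tons", by="total",
+  print=TRUE)
+@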
+ + + +\end{document} diff --git a/inst/doc/app1.pdf b/inst/doc/app1.pdf index 443acd0..2d871d5 100644 Binary files a/inst/doc/app1.pdf and b/inst/doc/app1.pdf differ diff --git a/inst/doc/app2.R b/inst/doc/app2.R new file mode 100644 index 0000000..2e32aad --- /dev/null +++ b/inst/doc/app2.R @@ -0,0 +1,214 @@ +### R code from vignette source 'app2.Rnw' + +################################################### +### code chunk number 1: app2.Rnw:29-38 +################################################### +# Load the rloadest package and the data +library(rloadest) +data(app2.calib) +head(app2.calib) +# Plot the seasonal pattern of Atrazine +# setSweave is required for the vignette. +setSweave("app2_01", 5, 5) +with(app2.calib, seasonPlot(DATES, Atrazine, yaxis.log=TRUE)) +graphics.off() + + +################################################### +### code chunk number 2: app2.Rnw:40-42 +################################################### +cat("\\includegraphics{app2_01.pdf}\n") +cat("\\paragraph{}\n") + + +################################################### +### code chunk number 3: app2.Rnw:51-54 +################################################### +# Add Period to the calibration data. +app2.calib <- transform(app2.calib, Period=seasons(DATES, + breaks=c("Apr", "Jul"))) + + +################################################### +### code chunk number 4: app2.Rnw:61-69 +################################################### +# Create and print the load model. +app2.lr <- loadReg(Atrazine ~ Period*center(log(FLOW)), data = app2.calib, + flow = "FLOW", dates = "DATES", conc.units="ug/L", + load.units="pounds", + station="St.Joseph River near Newville, Ind.") +# Warnings are not printed in the vignette +warnings() +app2.lr + + +################################################### +### code chunk number 5: app2.Rnw:77-81 +################################################### +# Load the estimation data and add Period +data(app2.est) +app2.est <- transform(app2.est, Period=seasons(DATES, + breaks=c("Apr", "Jul"))) + + +################################################### +### code chunk number 6: app2.Rnw:94-96 +################################################### +predLoad(app2.lr, newdata = app2.est, by="total", + print=TRUE) + + +################################################### +### code chunk number 7: app2.Rnw:100-102 +################################################### +app2.est <- transform(app2.est, Season=seasons(DATES, + breaks=c("Jan", "Apr", "Jul", "Oct"))) + + +################################################### +### code chunk number 8: app2.Rnw:106-108 +################################################### +app2.seas <- predLoad(app2.lr, newdata = app2.est, by="Season") +app2.seas + + +################################################### +### code chunk number 9: app2.Rnw:114-118 +################################################### +# setSweave is required for the vignette. +setSweave("app2_02", 5, 5) +plot(app2.lr, which=1, set.up=FALSE) +graphics.off() + + +################################################### +### code chunk number 10: app2.Rnw:120-122 +################################################### +cat("\\includegraphics{app2_02.pdf}\n") +cat("\\paragraph{}\n") + + +################################################### +### code chunk number 11: app2.Rnw:129-133 +################################################### +# setSweave is required for the vignette. 
+setSweave("app2_03", 5, 5) +plot(app2.lr, which=3, set.up=FALSE) +graphics.off() + + +################################################### +### code chunk number 12: app2.Rnw:135-137 +################################################### +cat("\\includegraphics{app2_03.pdf}\n") +cat("\\paragraph{}\n") + + +################################################### +### code chunk number 13: app2.Rnw:144-148 +################################################### +# setSweave is required for the vignette. +setSweave("app2_04", 5, 5) +plot(app2.lr, which=4, set.up=FALSE) +graphics.off() + + +################################################### +### code chunk number 14: app2.Rnw:150-152 +################################################### +cat("\\includegraphics{app2_04.pdf}\n") +cat("\\paragraph{}\n") + + +################################################### +### code chunk number 15: app2.Rnw:159-163 +################################################### +# setSweave is required for the vignette. +setSweave("app2_05", 5, 5) +plot(app2.lr, which=5, set.up=FALSE) +graphics.off() + + +################################################### +### code chunk number 16: app2.Rnw:165-167 +################################################### +cat("\\includegraphics{app2_05.pdf}\n") +cat("\\paragraph{}\n") + + +################################################### +### code chunk number 17: app2.Rnw:181-185 +################################################### +# Create the limited regression model. +app2.lm <- lm(log(Atrazine) ~ center(log(FLOW)), data = app2.calib) +app2.sp <- seasonalPeak(dectime(app2.calib$DATES), residuals(app2.lm)) +app2.sp + + +################################################### +### code chunk number 18: app2.Rnw:190-197 +################################################### +# Show the plot for this example +setSweave("app2_06", 5, 5) +confirm(app2.sp, plot.only=TRUE) +graphics.off() +# Confirm the seasonalPeak analysis for a single peak. +app2.sp <- confirm(app2.sp, all=TRUE) +app2.sp + + +################################################### +### code chunk number 19: app2.Rnw:199-201 +################################################### +cat("\\includegraphics{app2_06.pdf}\n") +cat("\\paragraph{}\n") + + +################################################### +### code chunk number 20: app2.Rnw:208-213 +################################################### +# Add Dectime. +app2.calib <- transform(app2.calib, Dectime=dectime(DATES)) +# Find the best model +selBestWave(log(Atrazine) ~ center(log(FLOW)), data = app2.calib, + "Dectime", app2.sp, exhaustive=TRUE) + + +################################################### +### code chunk number 21: app2.Rnw:218-226 +################################################### +# Create and print the seasonal-wave load model. 
+app2.lrsw <- loadReg(Atrazine ~ center(log(FLOW)) +
+  seasonalWave(Dectime, 0.45, 2, 3),
+  data = app2.calib, flow = "FLOW",
+  dates = "DATES", conc.units="ug/L",
+  load.units="pounds",
+  station="St.Joseph River near Newville, Ind.")
+app2.lrsw
+
+
+###################################################
+### code chunk number 22: app2.Rnw:231-237
+###################################################
+# Use colorPlot to show the relation by season
+setSweave("app2_07", 5, 5)
+AA.pl <- with(app2.calib, colorPlot(FLOW, Atrazine, color=Period,
+  yaxis.log=TRUE, xaxis.log=TRUE))
+addExplanation(AA.pl, "ul", title="Period")
+graphics.off()
+
+
+###################################################
+### code chunk number 23: app2.Rnw:239-241
+###################################################
+cat("\\includegraphics{app2_07.pdf}\n")
+cat("\\paragraph{}\n")
+
+
+###################################################
+### code chunk number 24: app2.Rnw:248-250
+###################################################
+subset(app2.calib, Period=="Season Ending Jul" & Atrazine < .7)
+subset(app2.calib, Period=="Season Ending Apr" & Atrazine > .7)
+
+
diff --git a/inst/doc/app2.Rnw b/inst/doc/app2.Rnw
new file mode 100644
index 0000000..f0f50cc
--- /dev/null
+++ b/inst/doc/app2.Rnw
@@ -0,0 +1,253 @@
+\documentclass{article}
+\parskip 3pt
+%\VignetteIndexEntry{Analysis of an Uncensored Constituent using a Seasonal Model}
+%\VignetteDepends{rloadest}
+
+\begin{document}
+\SweaveOpts{concordance=TRUE}
+\raggedright
+\parindent 30 pt
+
+\title{Application 2: Analysis of an Uncensored Constituent using a Seasonal Model}
+
+\author{Dave Lorenz}
+
+\maketitle
+
+This example illustrates the construction of a rating-curve load model from a user-defined formula rather than a predefined rating-curve model. It also demonstrates how to make load estimates for data outside of the calibration data.
+
+Many constituents are subject to seasonal loading patterns that are driven by external factors such as the length of the growing season, air temperature, and anthropogenic activities. Some of the predefined models implement sine and cosine terms to reflect the seasonality of constituent load. These models are applicable to constituents that change continuously over the seasonal cycle, but are not appropriate for other constituents that are subject to an abrupt seasonal change. Such is the case for pesticides, where springtime application results in a distinct change in the loading pattern for a discrete period of time (fig. 1). When such changes are evident in the data, it often is beneficial to use categorical explanatory variables, often called dummy variables in older literature (Judge and others, 1988; Helsel and Hirsch, 2002), to handle the abrupt change.
+
+In this example, data from the St. Joseph River near Newville, Ind., are used with a user-defined periodic seasonal model to estimate atrazine loads. Observations of atrazine load show two distinct relations with regard to streamflow; for a given streamflow, atrazine loads are elevated during the 3-month period following pesticide application, May through July, and dramatically lower thereafter. This two-tiered loading response is modeled using the categorical explanatory variable $per$,
+\begin{equation}
+\log(Load_i)=\alpha_0+\alpha_1 per_i+\alpha_2 lnQ_i+\alpha_3 per_i lnQ_i+\epsilon_i,
+\end{equation}
+where $lnQ_i$ is the centered log of streamflow and $per_i$ is a categorical variable that indicates the seasonal period for observation $i$.
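+Written out for the two levels of $per_i$, the model is
+\begin{equation}
+\log(Load_i)=(\alpha_0+\alpha_1)+(\alpha_2+\alpha_3)lnQ_i+\epsilon_i \quad (per_i=1),
+\end{equation}
+for the May through July period, and
+\begin{equation}
+\log(Load_i)=\alpha_0+\alpha_2 lnQ_i+\epsilon_i \quad (per_i=0),
+\end{equation}
+for the August through April period.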
+As these equations show, the regression line for estimation during the May through July time period has an intercept equal to $\alpha_0+\alpha_1$ and a slope of $\alpha_2+\alpha_3$. For the other time period (August through April), $per_i$ equals 0 and the regression equation corresponds to the simple model used in Application 1.
+
+Part 2 illustrates the diagnostic graphs that can be used to verify the model assumptions or improve the model. A seasonal-wave approach to modeling seasonality is introduced, along with other graphs to help better understand the model.
+
+<<echo=TRUE>>=
+# Load the rloadest package and the data
+library(rloadest)
+data(app2.calib)
+head(app2.calib)
+# Plot the seasonal pattern of Atrazine
+# setSweave is required for the vignette.
+setSweave("app2_01", 5, 5)
+with(app2.calib, seasonPlot(DATES, Atrazine, yaxis.log=TRUE))
+graphics.off()
+@
+<<results=tex, echo=FALSE>>=
+cat("\\includegraphics{app2_01.pdf}\n")
+cat("\\paragraph{}\n")
+@
+\textbf{Figure 1.} The seasonal pattern of atrazine concentrations.
+
+\eject
+\section{Build the Model}
+
+The first step is to create the categorical explanatory variable in the calibration data set. That is easily done using the \texttt{seasons} function, which is in the smwrBase package. The order of the \texttt{breaks} argument in \texttt{seasons} must be sequential through the year, but can overlap the end of the year.
+
+<<echo=TRUE>>=
+# Add Period to the calibration data.
+app2.calib <- transform(app2.calib, Period=seasons(DATES,
+  breaks=c("Apr", "Jul")))
+@
+
+The \texttt{loadReg} function is used to build the rating-curve model for constituent load estimation. The basic form of the call to \texttt{loadReg} is similar to the call to \texttt{lm} in that it requires a formula and data source. The response variable in the formula is the constituent concentration, which is converted to load per day (flux) based on the units of concentration and the units of flow. The \texttt{conc.units}, \texttt{flow.units}, and \texttt{load.units} arguments to \texttt{loadReg} define the conversion. For these data, the concentration units (\texttt{conc.units}) are "ug/L", the flow units are "cfs" (the default), and the load units for the model are "pounds." Two additional pieces of information are required for \texttt{loadReg}---the names of the flow column and the dates column. A final option, the station identifier, can also be specified.
+
+User-defined models can be constructed using the usual rules for constructing regression models in \texttt{R}. For this example, we'll take advantage of the * operator to add both the period and centered log flow terms and the interaction term. Details of the differences between the printed output from LOADEST and rloadest were described in application 1, so the short form is printed in this example.
+
+<<echo=TRUE>>=
+# Create and print the load model.
+app2.lr <- loadReg(Atrazine ~ Period*center(log(FLOW)), data = app2.calib,
+  flow = "FLOW", dates = "DATES", conc.units="ug/L",
+  load.units="pounds",
+  station="St.Joseph River near Newville, Ind.")
+# Warnings are not printed in the vignette
+warnings()
+app2.lr
+@
+
+\eject
+\section{Estimate Loads}
+
+Unlike LOADEST, rloadest requires the user to build the rating-curve model before estimating loads and will only estimate loads for one type of aggregation at a time---for example, total load or loads by season. Before we can estimate loads, we need to get the data and add the same seasonal definition that was used to build the load model.
+
+<<echo=TRUE>>=
+# Load the estimation data and add Period
+data(app2.est)
+app2.est <- transform(app2.est, Period=seasons(DATES,
+  breaks=c("Apr", "Jul")))
+@
+
+The \texttt{predLoad} function is used to estimate loads. It estimates loads in units per day, which is referred to as flux in rloadest. The arguments for \texttt{predLoad} are \texttt{fit}, the model output from \texttt{loadReg}; \texttt{newdata}, the estimation dataset; \texttt{load.units}, the load units for the estimates, which are taken from the model output if not specified; \texttt{by}, a character string indicating how to aggregate the load estimates; \texttt{seopt}, how to compute the standard error of the load; \texttt{allow.incomplete}, a logical value that indicates whether or not to allow incomplete periods to be estimated; and \texttt{print}, indicating whether to print a summary.
+
+Unlike the \texttt{predict} function in base \texttt{R}, \texttt{newdata} is required. The columns in \texttt{newdata} must match the column names in the calibration dataset. For predefined models, the column names for dates and flow must match.
+
+The \texttt{by} argument must be "unit," "day," "month," "water year," "calendar year," "total," or the name of a grouping column in \texttt{newdata}. The "unit" option is not available in version 0.1.
+
+The argument \texttt{seopt} must be "exact" in version 0.1. Any other value will not give correct values. The argument \texttt{allow.incomplete} is not fully implemented in version 0.1.
+
+The total load estimate for Application 2 that matches that in LOADEST can be made using the call in the \texttt{R} code below.
+
+<<echo=TRUE>>=
+predLoad(app2.lr, newdata = app2.est, by="total",
+  print=TRUE)
+@
+
+To create the matching seasonal loads in LOADEST, a column that defines the seasons must be added to the estimation data set. That can also be accomplished using the \texttt{seasons} function as shown below. The order of the seasons will be different from LOADEST, but the season names will make sense.
+<<echo=TRUE>>=
+app2.est <- transform(app2.est, Season=seasons(DATES,
+  breaks=c("Jan", "Apr", "Jul", "Oct")))
+@
+The seasonal load estimates for Application 2 that match those in LOADEST can be made using the call in the \texttt{R} code below. In this case, we'll save the output dataset and print it.
+
+<<echo=TRUE>>=
+app2.seas <- predLoad(app2.lr, newdata = app2.est, by="Season")
+app2.seas
+@
+\eject
+\section{Diagnostic Plots}
+
+Figure 2 shows the AMLE 1:1 line as a dashed line, and the solid line is a LOWESS smooth curve. The LOWESS curve indicates departure from the regression line for larger fitted values. Figure 2 is related to figure 11 in Runkel and others (2004), but collapses the two regression lines into a single line. The five largest values show exactly the same pattern between the two figures, being above the regression line, which helps to understand the -14.06 Bp statistic.
+<<echo=TRUE>>=
+# setSweave is required for the vignette.
+setSweave("app2_02", 5, 5)
+plot(app2.lr, which=1, set.up=FALSE)
+graphics.off()
+@
+<<results=tex, echo=FALSE>>=
+cat("\\includegraphics{app2_02.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 2.} The rating-curve regression model.
+
+\eject
+Figure 3 is a scale-location (S-L) graph that is useful for assessing heteroscedasticity of the residuals. The horizontal dashed line is the expected value of the square root of the absolute value of the residuals and the solid line is the LOWESS smooth.
+In this case, only one of the seven largest residuals is below the expected value line and most of the other, smaller residuals are below the line, which suggests an increasing variance as the estimated load increases.
+<<echo=TRUE>>=
+# setSweave is required for the vignette.
+setSweave("app2_03", 5, 5)
+plot(app2.lr, which=3, set.up=FALSE)
+graphics.off()
+@
+<<results=tex, echo=FALSE>>=
+cat("\\includegraphics{app2_03.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 3.} The scale-location graph for the regression model.
+
+\eject
+The correlogram in figure 4 is an adaptation of the correlogram from time-series analysis, which deals with regular samples. The horizontal dashed line is the zero value and the solid line is a kernel smooth rather than a LOWESS line. The kernel smooth gives a better fit in this case. The solid line should be very close to the horizontal line. In this case, there is a suggestion of a seasonal lack of fit. Note that because the time frame of the calibration period is only 20 months long, the smoothed line is not very reliable beyond about 10 months (about 0.8 units on the X-axis).
+<<echo=TRUE>>=
+# setSweave is required for the vignette.
+setSweave("app2_04", 5, 5)
+plot(app2.lr, which=4, set.up=FALSE)
+graphics.off()
+@
+<<results=tex, echo=FALSE>>=
+cat("\\includegraphics{app2_04.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 4.} The correlogram from the regression model.
+
+\eject
+Figure 5 shows the Q-normal plot of the residuals. The visual appearance of figure 5 confirms the results of the PPCC test in the printed output---the residuals are not normally distributed but are left-skewed.
+<<echo=TRUE>>=
+# setSweave is required for the vignette.
+setSweave("app2_05", 5, 5)
+plot(app2.lr, which=5, set.up=FALSE)
+graphics.off()
+@
+<<results=tex, echo=FALSE>>=
+cat("\\includegraphics{app2_05.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 5.} The Q-normal plot of the residuals.
+
+\eject
+\section{Part 2, Building a Seasonal-wave Load Model}
+
+All of the diagnostic plots in the previous section indicated a cause for concern about the validity of the periodic regression model. Vecchia and others (2008) describe a seasonal-wave function that often works well for pesticide models.
+
+The smwrStats package contains the tools necessary to construct a seasonal-wave model. Building a good regression model is a multi-step process that requires identifying the timing of the peak concentration and the other parameters of the seasonal-wave model.
+
+The first step in constructing the seasonal-wave model is to identify the peak and potential values for the other parameters of the model. That involves building a regression model without any seasonal terms and using the \texttt{seasonalPeak} function on the residuals to construct a first guess at those parameters. In this case, because there are no censored values, we can use \texttt{lm} instead of \texttt{censReg}. Note that it does not matter whether we use load or concentration because the residuals are the same.
+
+<<echo=TRUE>>=
+# Create the limited regression model.
+app2.lm <- lm(log(Atrazine) ~ center(log(FLOW)), data = app2.calib)
+app2.sp <- seasonalPeak(dectime(app2.calib$DATES), residuals(app2.lm))
+app2.sp
+@
+
+The next step in constructing the seasonal-wave model is to confirm the peak. This step requires the \texttt{confirm} function, which is interactive and cannot be demonstrated in a vignette. In this case, we can accept the default selection and estimated parameters. The user should step through the interactive process.
+
+<>=
+# Show the plot for this example
+setSweave("app2_06", 5, 5)
+confirm(app2.sp, plot.only=TRUE)
+graphics.off()
+# Confirm the seasonalPeak analysis for a single peak.
+app2.sp <- confirm(app2.sp, all=TRUE)
+app2.sp
+@
+<>=
+cat("\\includegraphics{app2_06.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 6.} The seasonal peak graph.
+
+The \texttt{selBestWave} function can be used to select the "best" parameters for the seasonal-wave model. It requires a column in decimal time format. The following code adds the column \texttt{Dectime} and executes \texttt{selBestWave}. The results from \texttt{selBestWave} are simply printed, but could be saved. Even though the timing of the peak is pretty clear from the graph, we'll take advantage of the exhaustive search to find the "best" peak too.
+
+<>=
+# Add Dectime.
+app2.calib <- transform(app2.calib, Dectime=dectime(DATES))
+# Find the best model
+selBestWave(log(Atrazine) ~ center(log(FLOW)), data = app2.calib,
+    "Dectime", app2.sp, exhaustive=TRUE)
+@
+
+The "best" model has the timing of the peak at about 0.45 instead of 0.419 (a bit later in the year), a pesticide loading period of 2 months, and a decay rate indicated by a half-life of 3 months (the second slowest decay rate among the default choices). We are now ready to build and evaluate the seasonal-wave load model.
+
+<>=
+# Create and print the seasonal-wave load model.
+app2.lrsw <- loadReg(Atrazine ~ center(log(FLOW)) +
+    seasonalWave(Dectime, 0.45, 2, 3),
+    data = app2.calib, flow = "FLOW",
+    dates = "DATES", conc.units="ug/L",
+    load.units="pounds",
+    station="St.Joseph River near Newville, Ind.")
+app2.lrsw
+@
+
+The Bias Diagnostics indicate a much poorer fit for this model than for the periodic seasonal model originally fit. The seasonal-wave model should be rejected. The periodic seasonal model could be used, but the load estimates are poor. A better understanding of why the periodic seasonal model is better than the seasonal-wave model can be gained by graphing the relation between flow and concentration (fig. 7).
+
+<>=
+# Use colorPlot to show the relation by season
+setSweave("app2_07", 5, 5)
+AA.pl <- with(app2.calib, colorPlot(FLOW, Atrazine, color=Period,
+    yaxis.log=TRUE, xaxis.log=TRUE))
+addExplanation(AA.pl, "ul", title="Period")
+graphics.off()
+@
+<>=
+cat("\\includegraphics{app2_07.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 7.} The relation between atrazine concentration and streamflow by period.
+
+Figure 7 is related to figure 11 in Runkel and others (2004), but shows concentration instead of load. Four values stand out as lying outside of their respective seasons---three in the "Season Ending Jul" have concentrations less than about 0.7 and one in the "Season Ending Apr" has a concentration greater than about 0.7. They are shown below. These atypical observations could contribute to the poor performance of the load models.
+
+<>=
+subset(app2.calib, Period=="Season Ending Jul" & Atrazine < .7)
+subset(app2.calib, Period=="Season Ending Apr" & Atrazine > .7)
+@
+
+\end{document}
diff --git a/inst/doc/app2.pdf b/inst/doc/app2.pdf
index d4b2e48..5bd73b2 100644
Binary files a/inst/doc/app2.pdf and b/inst/doc/app2.pdf differ
diff --git a/inst/doc/app3.R b/inst/doc/app3.R
new file mode 100644
index 0000000..2ee60d2
--- /dev/null
+++ b/inst/doc/app3.R
@@ -0,0 +1,265 @@
+### R code from vignette source 'app3.Rnw'
+
+###################################################
+### code chunk number 1: app3.Rnw:39-47
+###################################################
+# Load the rloadest package, which requires the smwrQW and
+# other packages that contain the necessary functions
+library(dataRetrieval)
+library(rloadest)
+app3.qw <- importNWISqw("01646580", params="00660",
+    begin.date="2001-10-01", end.date="2010-09-30")
+app3.flow <- renameNWISColumns(readNWISdv("01646502", "00060",
+    startDate="2001-10-01", endDate="2010-09-30"))
+
+
+###################################################
+### code chunk number 2: app3.Rnw:52-62
+###################################################
+# There are duplicated samples in this dataset. Print them
+subset(app3.qw, sample_dt %in%
+    app3.qw[duplicated(app3.qw$sample_dt), "sample_dt"])
+# Remove the duplicates
+app3.qw <- subset(app3.qw, !duplicated(sample_dt))
+# Now change the date column name and merge
+names(app3.qw)[2] <- "Date"
+# Suppress the plot in this merge
+app3.calib <- mergeQ(app3.qw, FLOW="Flow", DATES="Date",
+    Qdata=app3.flow, Plot=FALSE)
+
+
+###################################################
+### code chunk number 3: app3.Rnw:70-75
+###################################################
+# Create and print the load model.
+app3.lr <- loadReg(OrthoPhosphate.PO4 ~ model(9), data = app3.calib,
+    flow = "Flow", dates = "Date",
+    station="Potomac River at Chain Bridge, at Washington, DC")
+print(app3.lr)
+
+
+###################################################
+### code chunk number 4: app3.Rnw:84-88
+###################################################
+# setSweave is required for the vignette.
+setSweave("app3_01", 5, 5)
+plot(app3.lr, which=1, set.up=FALSE)
+graphics.off()
+
+
+###################################################
+### code chunk number 5: app3.Rnw:90-92
+###################################################
+cat("\\includegraphics{app3_01.pdf}\n")
+cat("\\paragraph{}\n")
+
+
+###################################################
+### code chunk number 6: app3.Rnw:99-103
+###################################################
+# setSweave is required for the vignette.
+setSweave("app3_02", 5, 5)
+plot(app3.lr, which=3, set.up=FALSE)
+graphics.off()
+
+
+###################################################
+### code chunk number 7: app3.Rnw:105-107
+###################################################
+cat("\\includegraphics{app3_02.pdf}\n")
+cat("\\paragraph{}\n")
+
+
+###################################################
+### code chunk number 8: app3.Rnw:114-118
+###################################################
+# setSweave is required for the vignette.
+setSweave("app3_03", 5, 5) +plot(app3.lr, which=4, set.up=FALSE) +graphics.off() + + +################################################### +### code chunk number 9: app3.Rnw:120-122 +################################################### +cat("\\includegraphics{app3_03.pdf}\n") +cat("\\paragraph{}\n") + + +################################################### +### code chunk number 10: app3.Rnw:129-133 +################################################### +# setSweave is required for the vignette. +setSweave("app3_04", 5, 5) +plot(app3.lr, which=5, set.up=FALSE) +graphics.off() + + +################################################### +### code chunk number 11: app3.Rnw:135-137 +################################################### +cat("\\includegraphics{app3_04.pdf}\n") +cat("\\paragraph{}\n") + + +################################################### +### code chunk number 12: app3.Rnw:144-148 +################################################### +# setSweave is required for the vignette. +setSweave("app3_05", 5, 5) +plot(app3.lr, which="DECTIME", set.up=FALSE) +graphics.off() + + +################################################### +### code chunk number 13: app3.Rnw:150-152 +################################################### +cat("\\includegraphics{app3_05.pdf}\n") +cat("\\paragraph{}\n") + + +################################################### +### code chunk number 14: app3.Rnw:161-168 +################################################### +# setSweave is required for the vignette. +setSweave("app3_06", 5, 5) +timePlot(app3.calib$Date, residuals(app3.lr), + Plot=list(what="points"), ytitle="Residuals") +refLine(horizontal=0) +refLine(vertical=as.Date("2001-10-01") + years(0:9)) +graphics.off() + + +################################################### +### code chunk number 15: app3.Rnw:170-172 +################################################### +cat("\\includegraphics{app3_06.pdf}\n") +cat("\\paragraph{}\n") + + +################################################### +### code chunk number 16: app3.Rnw:179-181 +################################################### +mean(app3.flow$Flow) +with(app3.flow, tapply(Flow, waterYear(Date), mean)) + + +################################################### +### code chunk number 17: app3.Rnw:195-201 +################################################### +app3.anom <- renameNWISColumns(readNWISdv("01646502", "00060", + startDate="1999-10-01", endDate="2010-09-30")) +app3.anom <- cbind(app3.anom, anomalies(log(app3.anom$Flow), + a1yr=365)) +# The head would show missing values for a1yr and HFV +tail(app3.anom) + + +################################################### +### code chunk number 18: app3.Rnw:206-210 +################################################### +# Supress the plot in this merge and overwrite app3.calib +app3.calib <- mergeQ(app3.qw, FLOW=c("Flow", "a1yr", "HFV"), + DATES="Date", + Qdata=app3.anom, Plot=FALSE) + + +################################################### +### code chunk number 19: app3.Rnw:215-221 +################################################### +app3.lra <- loadReg(OrthoPhosphate.PO4 ~ a1yr + HFV + dectime(Date) + + fourier(Date), + data = app3.calib, + flow = "Flow", dates = "Date", + station="Potomac River at Chain Bridge, at Washington, DC") +print(app3.lra) + + +################################################### +### code chunk number 20: app3.Rnw:229-233 +################################################### +# setSweave is required for the vignette. 
+setSweave("app3_07", 5, 5) +plot(app3.lra, which=1, set.up=FALSE) +graphics.off() + + +################################################### +### code chunk number 21: app3.Rnw:235-237 +################################################### +cat("\\includegraphics{app3_07.pdf}\n") +cat("\\paragraph{}\n") + + +################################################### +### code chunk number 22: app3.Rnw:244-248 +################################################### +# setSweave is required for the vignette. +setSweave("app3_08", 5, 5) +plot(app3.lra, which=3, set.up=FALSE) +graphics.off() + + +################################################### +### code chunk number 23: app3.Rnw:250-252 +################################################### +cat("\\includegraphics{app3_08.pdf}\n") +cat("\\paragraph{}\n") + + +################################################### +### code chunk number 24: app3.Rnw:259-263 +################################################### +# setSweave is required for the vignette. +setSweave("app3_09", 5, 5) +plot(app3.lra, which=4, set.up=FALSE) +graphics.off() + + +################################################### +### code chunk number 25: app3.Rnw:265-267 +################################################### +cat("\\includegraphics{app3_09.pdf}\n") +cat("\\paragraph{}\n") + + +################################################### +### code chunk number 26: app3.Rnw:274-278 +################################################### +# setSweave is required for the vignette. +setSweave("app3_10", 5, 5) +plot(app3.lra, which=5, set.up=FALSE) +graphics.off() + + +################################################### +### code chunk number 27: app3.Rnw:280-282 +################################################### +cat("\\includegraphics{app3_10.pdf}\n") +cat("\\paragraph{}\n") + + +################################################### +### code chunk number 28: app3.Rnw:289-293 +################################################### +# setSweave is required for the vignette. +setSweave("app3_11", 5, 5) +plot(app3.lra, which="dectime(Date)", set.up=FALSE) +graphics.off() + + +################################################### +### code chunk number 29: app3.Rnw:295-297 +################################################### +cat("\\includegraphics{app3_11.pdf}\n") +cat("\\paragraph{}\n") + + +################################################### +### code chunk number 30: app3.Rnw:307-310 +################################################### +app3.est <- subset(app3.anom, Date > as.Date("2001-09-30")) +predLoad(app3.lra, newdata = app3.est, by="water year", + print=TRUE) + + diff --git a/inst/doc/app3.Rnw b/inst/doc/app3.Rnw new file mode 100644 index 0000000..56a4740 --- /dev/null +++ b/inst/doc/app3.Rnw @@ -0,0 +1,313 @@ +\documentclass{article} +\parskip 3pt +\parindent 30pt +\usepackage[margin=1.25in]{geometry} +\usepackage{amsmath} + +%\VignetteIndexEntry{Analysis of an Censored Constituent using a Seasonal Model} +%\VignetteDepends{rloadest} +%\VignetteDepends{dataRetrieval} + +\begin{document} +\SweaveOpts{concordance=TRUE} +\raggedright +\parindent 30 pt + +\title{Application 3: Analysis of an Censored Constituent using a Seasonal Model} + +\author{Dave Lorenz} + +\maketitle + +This application illustrates the "7-parameter model," a predefined model that has been shown to perform well for loading analyses of constituents in large (> 100 square miles) watersheds (Cohn and others, 1992). 
+In addition, the application illustrates the use of LOADEST when portions of the calibration data set are subject to censoring.
+
+As in the previous example, a constituent with a seasonal loading pattern is considered here. In this case, constituent concentrations are assumed to vary in a continuous manner, as opposed to the abrupt changes considered in Application 2. Several of the predefined models (models 4 and 6--9; Section 3.2.2, table 7) use a first-order Fourier series (sine and cosine terms) to consider seasonality. In this application, the 7-parameter model (model 9) is developed for nutrient loading on the Potomac River. The 7-parameter model is given by:
+\begin{equation}
+\log(Load_i)=\alpha_0+\alpha_1{lnQ_i}+\alpha_2{lnQ_i^2}+\alpha_3{cT_i}+\alpha_4{cT_i^2}+\alpha_5 \sin(2\pi{dT_i})+\alpha_6 \cos(2\pi{dT_i})+\epsilon_i,
+\end{equation}
+where $lnQ_i$ is the centered log of flow, $cT_i$ is centered decimal time, and $dT_i$ is the decimal time for observation $i$. Within the model, explanatory variables one and two account for the dependence on flow, explanatory variables three and four account for the time trend, and explanatory variables five and six are a first-order Fourier series to account for seasonal variability.
+
+The load regression model for orthophosphate data collected near USGS gaging station 01646580 on the Potomac River uses equation 1. The retrieved dataset includes 237 observations of concentrations collected from 2002 to 2010; many of the observations are below the laboratory detection limit, resulting in a censored data set. The flow data will be from USGS gaging station 01646502, located just upstream from the water-quality gage.
+
+\eject
+\section{Retrieve and Build the Datasets}
+
+Instead of relying on a packaged dataset, this example will retrieve data from NWISweb. You must be connected to the Internet in order to replicate the results in this example.
+
+The first step is to retrieve the water-quality and flow data. The water-quality data are retrieved using the \texttt{importNWISqw} function, which requires the station identifier and the parameter code. It also accepts beginning and ending dates. The flow data are retrieved using the \texttt{readNWISdv} function, which requires only the station identifier and also accepts beginning and ending dates as well as other arguments not used here. The \texttt{renameNWISColumns} function simply renames the flow column so that it is more readable by humans.
+
+<>=
+# Load the rloadest package, which requires the smwrQW and
+# other packages that contain the necessary functions
+library(dataRetrieval)
+library(rloadest)
+app3.qw <- importNWISqw("01646580", params="00660",
+    begin.date="2001-10-01", end.date="2010-09-30")
+app3.flow <- renameNWISColumns(readNWISdv("01646502", "00060",
+    startDate="2001-10-01", endDate="2010-09-30"))
+@
+
+The second step is to merge the flow data with the water-quality data to produce a calibration dataset. The function \texttt{mergeQ} extracts the flow data from the flow dataset and merges the daily flow with the sample date in the water-quality dataset. For this analysis, we assume that a sample on any given day represents a valid estimate of the mean daily concentration. It requires that the names of the dates column match between the two datasets; the column \texttt{sample\_dt} in the water-quality data set is renamed to \texttt{Date} to match the date column in the flow dataset. A further requirement of \texttt{mergeQ} is that there are no replicate samples taken on the same day. In general, the concentration values within a day agree very well, so for this example we simply delete the duplicated days. For other cases, it may be better to compute a mean-daily concentration.
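+If averaging were preferred, a minimal sketch is shown below. The names are from this example, but the approach assumes uncensored numeric concentrations, so it would need modification for censored data of class "qw."
+<>=
+# Hypothetical sketch, not run: average concentrations on duplicated days
+# (assumes uncensored numeric values rather than class "qw")
+# app3.qw <- aggregate(OrthoPhosphate.PO4 ~ sample_dt, data=app3.qw, FUN=mean)
+@
+For this application, the duplicates are simply removed, as shown next.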
+
+<>=
+# There are duplicated samples in this dataset. Print them
+subset(app3.qw, sample_dt %in%
+    app3.qw[duplicated(app3.qw$sample_dt), "sample_dt"])
+# Remove the duplicates
+app3.qw <- subset(app3.qw, !duplicated(sample_dt))
+# Now change the date column name and merge
+names(app3.qw)[2] <- "Date"
+# Suppress the plot in this merge
+app3.calib <- mergeQ(app3.qw, FLOW="Flow", DATES="Date",
+    Qdata=app3.flow, Plot=FALSE)
+@
+
+\eject
+\section{Build the Model}
+
+The \texttt{loadReg} function is used to build the rating-curve model for constituent load estimation. The basic form of the call to \texttt{loadReg} is similar to the call to \texttt{lm} in that it requires a formula and data source. The response variable in the formula is the constituent concentration, which is converted to load per day (flux) based on the units of concentration and the units of flow. The \texttt{conc.units}, \texttt{flow.units}, and \texttt{load.units} arguments to \texttt{loadReg} define the conversion. For these data, the concentration units (\texttt{conc.units}) are "mg/L" (as orthophosphate) and are known within the column so do not need to be specified, the flow units are "cfs" (the default), and the load units for the model are "kilograms." Two additional pieces of information are required for \texttt{loadReg}---the names of the flow column and the dates column. A final option, the station identifier, can also be specified.
+
+<>=
+# Create and print the load model.
+app3.lr <- loadReg(OrthoPhosphate.PO4 ~ model(9), data = app3.calib,
+    flow = "Flow", dates = "Date",
+    station="Potomac River at Chain Bridge, at Washington, DC")
+print(app3.lr)
+@
+
+A few details from the printed report deserve mention--the second order flow and decimal time terms have p-values that are greater than 0.05 and may not be necessary; the p-value of the PPCC test is less than 0.05, which suggests a lack of normality; the serial correlation of the residuals is 0.3576, which is quite large; and Bp is relatively large at 19.08.
+
+\eject
+\section{Diagnostic Plots}
+
+Figure 1 shows the AMLE 1:1 line as a dashed line and the solid line is a LOWESS smooth curve. The LOWESS curve indicates a good fit.
+<>=
+# setSweave is required for the vignette.
+setSweave("app3_01", 5, 5)
+plot(app3.lr, which=1, set.up=FALSE)
+graphics.off()
+@
+<>=
+cat("\\includegraphics{app3_01.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 1.} The rating-curve regression model.
+
+\eject
+Figure 2 is a scale-location (S-L) graph, which is useful for assessing heteroscedasticity of the residuals. The horizontal dashed line is the expected value of the square root of the absolute value of the residuals and the solid line is the LOWESS smooth. In this case, only 1 of the seven largest residuals is above the expected value line, which suggests decreasing variance as the estimated load increases.
+<>=
+# setSweave is required for the vignette.
+setSweave("app3_02", 5, 5)
+plot(app3.lr, which=3, set.up=FALSE)
+graphics.off()
+@
+<>=
+cat("\\includegraphics{app3_02.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 2.} The scale-location graph for the regression model.
+
+\eject
+The correlogram in figure 3 is an adaptation of the correlogram from time-series analysis, which deals with regular samples.
+The horizontal dashed line is the zero value and the solid line is a kernel smooth rather than a LOWESS line. The kernel smooth gives a better fit in this case. The solid line should be very close to the horizontal line. In this case, there is a suggestion of a long-term lack of fit because the solid line is above the horizontal line for a 1-year lag.
+<>=
+# setSweave is required for the vignette.
+setSweave("app3_03", 5, 5)
+plot(app3.lr, which=4, set.up=FALSE)
+graphics.off()
+@
+<>=
+cat("\\includegraphics{app3_03.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 3.} The correlogram from the regression model.
+
+\eject
+Figure 4 shows the q-normal plot of the residuals. The visual appearance of figure 4 confirms the results of the PPCC test in the printed output---the largest residuals trail off the line.
+<>=
+# setSweave is required for the vignette.
+setSweave("app3_04", 5, 5)
+plot(app3.lr, which=5, set.up=FALSE)
+graphics.off()
+@
+<>=
+cat("\\includegraphics{app3_04.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 4.} The Q-normal plot of the residuals.
+
+\eject
+Figure 5 shows the partial residual plot for decimal time (DECTIME). This one was selected because of the long-term lack of fit over time suggested by figure 3. The dashed line is the linear fit and the solid line is the LOWESS smooth. In this case, the LOWESS smooth does follow the fitted line, but there is a distinct pattern in the left part of the graph--most of the residuals are above the line up to a DECTIME value of about -3 and then most residuals are below the line to about -1, after which the residuals are fairly well behaved.
+<>=
+# setSweave is required for the vignette.
+setSweave("app3_05", 5, 5)
+plot(app3.lr, which="DECTIME", set.up=FALSE)
+graphics.off()
+@
+<>=
+cat("\\includegraphics{app3_05.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 5.} The partial residual plot for decimal time.
+
+\eject
+\section{Further Diagnostics}
+
+Figure 5 suggested a distinct pattern in the residuals in the early part of the record. Figure 6 replots the residuals on a date axis, so that it is easier to relate them to the date. The second call to \texttt{refLine} adds vertical lines at the water years.
+<>=
+# setSweave is required for the vignette.
+setSweave("app3_06", 5, 5)
+timePlot(app3.calib$Date, residuals(app3.lr),
+    Plot=list(what="points"), ytitle="Residuals")
+refLine(horizontal=0)
+refLine(vertical=as.Date("2001-10-01") + years(0:9))
+graphics.off()
+@
+<>=
+cat("\\includegraphics{app3_06.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 6.} Model residuals by date.
+
+The residuals for water-year 2002 are mostly greater than 0; those for water-year 2003 are trending down and those for water-year 2004 are mostly less than 0. This raises the question of whether more persistent flow patterns affect the relation between flow and concentration. The code immediately below computes the average (first line) and water-year average (second line) flow. The pattern of water-year average flows closely matches the pattern of the residuals.
+
+<>=
+mean(app3.flow$Flow)
+with(app3.flow, tapply(Flow, waterYear(Date), mean))
+@
+
+\eject
+\section{Modeling Flow Anomalies}
+
+Vecchia and others (2008) describe an approach for breaking down stream flow into what they call anomalies--long- to intermediate-term deviations from average flow and the residual high-frequency variation or daily residuals. That approach can be very useful in cases such as this where there is a strong relation between flow and concentration, but relatively persistent patterns of flow are not captured.
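+Conceptually, with a single 1-year anomaly the decomposition re-expresses the log of flow for each day $t$ as the sum of its long-term mean, the anomaly, and the high-frequency variation. A sketch of that identity, in the notation of equation 2 below and assuming the components are defined as in Vecchia and others (2008), is
+\[
+\log(Q_t) = \overline{\log(Q)} + A1yr_t + HFV_t,
+\]
+where $\overline{\log(Q)}$ is the mean of the log of flow over the record used to compute the anomalies.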
+
+The first step in modeling flow anomalies requires retrieving data for a longer period of time. We'll retrieve data from two years prior to the start of the sampling record that we are working with. Two additional years were selected because the 1-year anomaly would be missing for the entire first year of record, and one more year establishes the flow pattern going into 2001. Then we'll construct a single 1-year anomaly, which seems to make sense from figure 6. This 6-parameter anomaly model is given by:
+\begin{equation}
+\log(Load_i)=\alpha_0+\alpha_1{A1yr_i}+\alpha_2{HFV_i}+\alpha_3{dT_i}+\alpha_4 \sin(2\pi{dT_i})+\alpha_5 \cos(2\pi{dT_i})+\epsilon_i,
+\end{equation}
+where $A1yr_i$ is the 1-year anomaly of the log of flow, $HFV_i$ is the remaining high-frequency variation in the log of flow, and $dT_i$ is the decimal time for observation $i$.
+
+<>=
+app3.anom <- renameNWISColumns(readNWISdv("01646502", "00060",
+    startDate="1999-10-01", endDate="2010-09-30"))
+app3.anom <- cbind(app3.anom, anomalies(log(app3.anom$Flow),
+    a1yr=365))
+# The head would show missing values for a1yr and HFV
+tail(app3.anom)
+@
+
+The next step is to merge the flow and anomaly data with the water-quality data.
+
+<>=
+# Suppress the plot in this merge and overwrite app3.calib
+app3.calib <- mergeQ(app3.qw, FLOW=c("Flow", "a1yr", "HFV"),
+    DATES="Date",
+    Qdata=app3.anom, Plot=FALSE)
+@
+
+The final step is to construct the model. Note that flow is not a necessary part of the model because it is represented by the anomalies, linear time is represented by \texttt{dectime(Date)}, and the seasonal components by \texttt{fourier(Date)}. Note also that decimal time is not centered, but could be by using the \texttt{center} function.
+
+<>=
+app3.lra <- loadReg(OrthoPhosphate.PO4 ~ a1yr + HFV + dectime(Date) +
+    fourier(Date),
+    data = app3.calib,
+    flow = "Flow", dates = "Date",
+    station="Potomac River at Chain Bridge, at Washington, DC")
+print(app3.lra)
+@
+
+The residual variance is much smaller than that of the original model: 0.4021 rather than 0.5126. The PPCC p-value is still less than 0.05, but much closer to 0.05. The serial correlation of the residuals is much smaller than the original: 0.1914 rather than 0.3576. But the Bp statistic is a bit larger: 22.63 percent rather than 19.08. In spite of the larger Bp statistic, the Nash-Sutcliffe statistic (E) is larger: 0.6723 rather than 0.5439. All of this suggests a better model. Review some of the diagnostic plots.
+
+\eject
+
+Figure 7 shows the AMLE 1:1 line as a dashed line and the solid line is a LOWESS smooth curve. The LOWESS curve indicates a good fit.
+<>=
+# setSweave is required for the vignette.
+setSweave("app3_07", 5, 5)
+plot(app3.lra, which=1, set.up=FALSE)
+graphics.off()
+@
+<>=
+cat("\\includegraphics{app3_07.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 7.} The revised rating-curve regression model.
+
+\eject
+Figure 8 shows the S-L graph, which indicates some decrease in variance for larger fitted values than for smaller.
+<>=
+# setSweave is required for the vignette.
+setSweave("app3_08", 5, 5)
+plot(app3.lra, which=3, set.up=FALSE)
+graphics.off()
+@
+<>=
+cat("\\includegraphics{app3_08.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 8.} The scale-location graph for the revised regression model.
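+A rough numerical companion to the S-L graph is sketched below (an ad hoc check, not part of the LOADEST output; the names \texttt{res.a} and \texttt{fit.a} are used only here). It compares the mean square-root absolute residual below and above the median fitted value; a smaller value in the upper half is consistent with the decrease in variance seen in figure 8.
+<>=
+# Sketch: compare residual spread in the lower and upper halves of the fit
+res.a <- residuals(app3.lra)
+fit.a <- fitted(app3.lra)
+tapply(sqrt(abs(res.a)), fit.a > median(fit.a), mean)
+@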
+
+\eject
+The correlogram in figure 9 shows more variability than one would like, but no distinct long-term or seasonal patterns.
+<>=
+# setSweave is required for the vignette.
+setSweave("app3_09", 5, 5)
+plot(app3.lra, which=4, set.up=FALSE)
+graphics.off()
+@
+<>=
+cat("\\includegraphics{app3_09.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 9.} The correlogram from the revised regression model.
+
+\eject
+Figure 10 shows the q-normal plot of the residuals. The largest residuals trail off the line for this analysis but not quite as much as in the original model.
+<>=
+# setSweave is required for the vignette.
+setSweave("app3_10", 5, 5)
+plot(app3.lra, which=5, set.up=FALSE)
+graphics.off()
+@
+<>=
+cat("\\includegraphics{app3_10.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 10.} The Q-normal plot of the residuals from the revised model.
+
+\eject
+Figure 11 shows the partial residual plot for decimal time. This one was selected because of the long-term lack of fit over time suggested by figure 3. The dashed line is the linear fit and the solid line is the LOWESS smooth. In this case, the LOWESS smooth follows the fitted line reasonably well, and the distinct early-record pattern seen in figure 5 is much reduced.
+<>=
+# setSweave is required for the vignette.
+setSweave("app3_11", 5, 5)
+plot(app3.lra, which="dectime(Date)", set.up=FALSE)
+graphics.off()
+@
+<>=
+cat("\\includegraphics{app3_11.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 11.} The partial residual plot for decimal time.
+
+\eject
+\section{Load Estimates}
+
+Because we used anomalies in the regression model, we must be very careful to use the same anomalies in the estimation data. The data that were retrieved to compute the anomalies include dates outside of the calibration period, so they must be subset to the calibration period. We'll compute load estimates for the water years 2002 through 2010.
+
+<>=
+app3.est <- subset(app3.anom, Date > as.Date("2001-09-30"))
+predLoad(app3.lra, newdata = app3.est, by="water year",
+    print=TRUE)
+@
+
+\end{document}
diff --git a/inst/doc/app3.pdf b/inst/doc/app3.pdf
index fc66724..09b306f 100644
Binary files a/inst/doc/app3.pdf and b/inst/doc/app3.pdf differ
diff --git a/inst/doc/app4.R b/inst/doc/app4.R
new file mode 100644
index 0000000..da723ba
--- /dev/null
+++ b/inst/doc/app4.R
@@ -0,0 +1,214 @@
+### R code from vignette source 'app4.Rnw'
+
+###################################################
+### code chunk number 1: app4.Rnw:23-27
+###################################################
+# Load the rloadest package and the data
+library(rloadest)
+data(app4.calib)
+head(app4.calib)
+
+
+###################################################
+### code chunk number 2: app4.Rnw:32-35
+###################################################
+# Convert Buty and Alach to class "qw"
+app4.calib <- convert2qw(app4.calib, scheme="partial")
+head(app4.calib)
+
+
+###################################################
+### code chunk number 3: app4.Rnw:43-51
+###################################################
+# Create the "best" load model.
+app4.lr <- selBestModel("Buty", data = app4.calib, flow = "FLOW", + dates = "DATES", conc.units="ug/L", + station="White River at Hazleton, Ind.") +# Print the warning in the vignette +warnings() +# Print the results +app4.lr + + +################################################### +### code chunk number 4: app4.Rnw:64-68 +################################################### +# setSweave is required for the vignette. +setSweave("app4_01", 5, 5) +plot(app4.lr, which=1, set.up=FALSE) +graphics.off() + + +################################################### +### code chunk number 5: app4.Rnw:70-72 +################################################### +cat("\\includegraphics{app4_01.pdf}\n") +cat("\\paragraph{}\n") + + +################################################### +### code chunk number 6: app4.Rnw:80-84 +################################################### +# setSweave is required for the vignette. +setSweave("app4_02", 5, 5) +plot(app4.lr, which=4, set.up=FALSE) +graphics.off() + + +################################################### +### code chunk number 7: app4.Rnw:86-88 +################################################### +cat("\\includegraphics{app4_02.pdf}\n") +cat("\\paragraph{}\n") + + +################################################### +### code chunk number 8: app4.Rnw:96-100 +################################################### +# setSweave is required for the vignette. +setSweave("app4_03", 5, 5) +plot(app4.lr, which=5, set.up=FALSE) +graphics.off() + + +################################################### +### code chunk number 9: app4.Rnw:102-104 +################################################### +cat("\\includegraphics{app4_03.pdf}\n") +cat("\\paragraph{}\n") + + +################################################### +### code chunk number 10: app4.Rnw:112-116 +################################################### +# setSweave is required for the vignette. +setSweave("app4_04", 5, 5) +plot(app4.lr, which=6, set.up=FALSE) +graphics.off() + + +################################################### +### code chunk number 11: app4.Rnw:118-120 +################################################### +cat("\\includegraphics{app4_04.pdf}\n") +cat("\\paragraph{}\n") + + +################################################### +### code chunk number 12: app4.Rnw:134-139 +################################################### +# Create the limited regression model. +app4.cr <- censReg(Buty ~ center(log(FLOW)) + dectime(DATES), + data = app4.calib, dist="lognormal") +app4.sp <- seasonalPeak(dectime(app4.calib$DATES), residuals(app4.cr)) +app4.sp + + +################################################### +### code chunk number 13: app4.Rnw:144-151 +################################################### +# Show the plot for this example +setSweave("app4_05", 5, 5) +confirm(app4.sp, plot.only=TRUE) +graphics.off() +# Confirm the seasonalPeak analysis for a single peak. +app4.sp <- confirm(app4.sp, all=TRUE) +app4.sp + + +################################################### +### code chunk number 14: app4.Rnw:153-155 +################################################### +cat("\\includegraphics{app4_05.pdf}\n") +cat("\\paragraph{}\n") + + +################################################### +### code chunk number 15: app4.Rnw:162-169 +################################################### +# Add Dectime. 
+app4.calib <- transform(app4.calib, Dectime=dectime(DATES))
+# Find the best model
+selBestWave(Buty ~ center(log(FLOW)) + dectime(DATES),
+    data = app4.calib,
+    "Dectime", app4.sp, exhaustive=TRUE, Regression=censReg,
+    dist="lognormal")
+
+
+###################################################
+### code chunk number 16: app4.Rnw:174-182
+###################################################
+# Create and print the seasonal-wave load model.
+# Note that we can use Dectime directly in this model
+app4.lrsw <- loadReg(Buty ~ center(log(FLOW)) + Dectime +
+    seasonalWave(Dectime, 0.393, 1, 1),
+    data = app4.calib, flow = "FLOW",
+    dates = "DATES", conc.units="ug/L",
+    station="White River at Hazleton, Ind.")
+app4.lrsw
+
+
+###################################################
+### code chunk number 17: app4.Rnw:190-194
+###################################################
+# setSweave is required for the vignette.
+setSweave("app4_06", 5, 5)
+plot(app4.lrsw, which=1, set.up=FALSE)
+graphics.off()
+
+
+###################################################
+### code chunk number 18: app4.Rnw:196-198
+###################################################
+cat("\\includegraphics{app4_06.pdf}\n")
+cat("\\paragraph{}\n")
+
+
+###################################################
+### code chunk number 19: app4.Rnw:206-210
+###################################################
+# setSweave is required for the vignette.
+setSweave("app4_07", 5, 5)
+plot(app4.lrsw, which=4, set.up=FALSE)
+graphics.off()
+
+
+###################################################
+### code chunk number 20: app4.Rnw:212-214
+###################################################
+cat("\\includegraphics{app4_07.pdf}\n")
+cat("\\paragraph{}\n")
+
+
+###################################################
+### code chunk number 21: app4.Rnw:222-226
+###################################################
+# setSweave is required for the vignette.
+setSweave("app4_08", 5, 5)
+plot(app4.lrsw, which=5, set.up=FALSE)
+graphics.off()
+
+
+###################################################
+### code chunk number 22: app4.Rnw:228-230
+###################################################
+cat("\\includegraphics{app4_08.pdf}\n")
+cat("\\paragraph{}\n")
+
+
+###################################################
+### code chunk number 23: app4.Rnw:238-242
+###################################################
+# setSweave is required for the vignette.
+setSweave("app4_09", 5, 5)
+plot(app4.lrsw, which=6, set.up=FALSE)
+graphics.off()
+
+
+###################################################
+### code chunk number 24: app4.Rnw:244-246
+###################################################
+cat("\\includegraphics{app4_09.pdf}\n")
+cat("\\paragraph{}\n")
+
+
diff --git a/inst/doc/app4.Rnw b/inst/doc/app4.Rnw
new file mode 100644
index 0000000..4a62b51
--- /dev/null
+++ b/inst/doc/app4.Rnw
@@ -0,0 +1,251 @@
+\documentclass{article}
+\parskip 3pt
+%\VignetteIndexEntry{Automated Model Selection}
+%\VignetteDepends{rloadest}
+
+\begin{document}
+\SweaveOpts{concordance=TRUE}
+\raggedright
+\parindent 30 pt
+
+\title{Application 4: Automated Model Selection}
+
+\author{Dave Lorenz}
+
+\maketitle
+
+This example illustrates the automated model selection function. The example calibration data set includes 4 constituents: uncensored, Atra (atrazine); slightly censored, Alach (alachlor); highly censored, Buty (butylate); and data having missing observations, SuspSed (suspended sediment).
+
+The data used in this application were collected from the White River at Hazleton, Indiana. Forty-five water-quality samples collected between October 1992 and September 1994 are used in conjunction with observed streamflow data to build the regression model.
+
+Part 2 illustrates the seasonal-wave approach to modeling the seasonal pattern of pesticides. The user may wish to explore the seasonal-wave analysis for atrazine and alachlor, both of which indicate a strong periodic seasonal lack of fit when using only the sine and cosine seasonal terms.
+
+<>=
+# Load the rloadest package and the data
+library(rloadest)
+data(app4.calib)
+head(app4.calib)
+@
+
+The censored data in \texttt{app4.calib} are stored in 2 columns---the recorded value in the column identified by the constituent abbreviation and the remark code, "<" indicating a less-than value in the corresponding column with the .rmk suffix. In order to take advantage of the ability of the \texttt{loadReg} function to automatically recognize censored data, the censored data in the example dataset should be converted to type "water-quality." This can be done using the \texttt{convert2qw} function in the \texttt{smwrQW} package, which is required by \texttt{rloadest}. The naming convention in \texttt{app4.calib} is consistent with the "partial" scheme for \texttt{convert2qw}. The conversion is accomplished in the R code below. \texttt{app4.calib} is overwritten with the new data.
+
+<>=
+# Convert Buty and Alach to class "qw"
+app4.calib <- convert2qw(app4.calib, scheme="partial")
+head(app4.calib)
+@
+
+\eject
+\section{Build the Model}
+
+The \texttt{selBestModel} function can be used to select the predefined model with the smallest AIC value. It also records the SPPC value, also known as BIC, which has a greater penalty for additional terms and will generally choose a more parsimonious model. The requirements for \texttt{selBestModel} are similar to \texttt{loadReg}, except that the name of the constituent is required instead of the formula.
+
+<>=
+# Create the "best" load model.
+app4.lr <- selBestModel("Buty", data = app4.calib, flow = "FLOW",
+    dates = "DATES", conc.units="ug/L",
+    station="White River at Hazleton, Ind.")
+# Print the warning in the vignette
+warnings()
+# Print the results
+app4.lr
+@
+
+The model with the lowest AIC value is 7, which includes linear flow, linear time, and the seasonal time terms. Model 7 also has the smallest SPPC value, but model 4 is only slightly larger (it requires 2 decimal digits to see the difference recorded in the \texttt{model.eval} component in the output object).
+
+
+\eject
+\section{Diagnostic Plots}
+
+The rloadest package contains a \texttt{plot} function that creates diagnostic plots of the load model. Most often the user will just enter \texttt{plot(app4.lr)} (for this example) in the R Console window to generate the full suite of plots, but this example application will generate each plot individually. And, in general, the user will not need to set up a graphics device. But for this vignette, the graphics device must be set up for each graph.
+
+Figure 1 shows the response versus the fitted values; censored observations are shown by open circles. It is perhaps a bit disconcerting that there are no censored observations below the dashed regression line for fitted values less than about -3, but that is not inconceivable. That discrepancy in the location of censored values also drives the appearance of greater scatter in larger fitted values; plots 2 and 3 are not shown.
+
+<>=
+# setSweave is required for the vignette.
+setSweave("app4_01", 5, 5)
+plot(app4.lr, which=1, set.up=FALSE)
+graphics.off()
+@
+<>=
+cat("\\includegraphics{app4_01.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 1.} The rating-curve regression model.
+
+\eject
+Figure 2 is the correlogram. It suggests a seasonal lack of fit, although the shape of the smooth line is not conclusive.
+
+<>=
+# setSweave is required for the vignette.
+setSweave("app4_02", 5, 5)
+plot(app4.lr, which=4, set.up=FALSE)
+graphics.off()
+@
+<>=
+cat("\\includegraphics{app4_02.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 2.} The correlogram.
+
+\eject
+Figure 3 is a q-normal plot and shows the standardized residuals, which are assumed to have a standard deviation of 1. The solid line is the theoretical fit of mean of 0 and standard deviation of 1. The visual appearance of figure 3 confirms the results of the PPCC test in the printed output---the residuals show deviation from the normal distribution. The open circles are censored observations and the plotted value is the expected value returned by the \texttt{residuals} function using the argument \texttt{type} set to "working."
+
+<>=
+# setSweave is required for the vignette.
+setSweave("app4_03", 5, 5)
+plot(app4.lr, which=5, set.up=FALSE)
+graphics.off()
+@
+<>=
+cat("\\includegraphics{app4_03.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 3.} The Q-normal plot of the residuals.
+
+\eject
+Figure 4 is an extended box plot---a truncated box plot at the 5 and 95 percentiles that shows the individual values larger than the 95th percentile and smaller than the 5th percentile. The box plots in figure 4 show the distributions of the actual and estimated loads. The appearance of these box plots helps to understand the printed bias diagnostics, which show a general overestimate. The upper and lower ends of both boxes are similar, but the box plot of the estimates shows a median value which is much larger than the median value for the observed values. Therefore, the middle range is overestimated.
+
+<>=
+# setSweave is required for the vignette.
+setSweave("app4_04", 5, 5)
+plot(app4.lr, which=6, set.up=FALSE)
+graphics.off()
+@
+<>=
+cat("\\includegraphics{app4_04.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 4.} Box plot comparing estimated and observed values.
+
+\eject
+\section{Part 2, Building a Seasonal-wave Load Model}
+
+All of the diagnostic plots in the previous section indicated a cause for concern about the validity of the periodic regression model. Vecchia and others (2008) describe a seasonal-wave function that often works well for pesticide models.
+
+The smwrStats package contains the tools necessary to construct a seasonal-wave model. Building a good regression model is a multi-step process that requires identifying the timing of the peak concentration and the other parameters of the seasonal-wave model.
+
+The first step in constructing the seasonal-wave model is to identify the peak and potential values for the other parameters of the model. That involves building a regression model without any seasonal terms and using the \texttt{seasonalPeak} function on the residuals to construct a first guess at those parameters. In this case, because there are censored values, we must use \texttt{censReg}. Note that it does not matter whether we use load or concentration because the residuals are the same.
+
+<>=
+# Create the limited regression model.
+app4.cr <- censReg(Buty ~ center(log(FLOW)) + dectime(DATES),
+    data = app4.calib, dist="lognormal")
+app4.sp <- seasonalPeak(dectime(app4.calib$DATES), residuals(app4.cr))
+app4.sp
+@
+
+The next step in constructing the seasonal-wave model is to confirm the peak. This step requires the \texttt{confirm} function, which is interactive and cannot be demonstrated in a vignette. In this case, we can accept the default selection and estimated parameters. The user should step through the interactive process.
+
+<>=
+# Show the plot for this example
+setSweave("app4_05", 5, 5)
+confirm(app4.sp, plot.only=TRUE)
+graphics.off()
+# Confirm the seasonalPeak analysis for a single peak.
+app4.sp <- confirm(app4.sp, all=TRUE)
+app4.sp
+@
+<>=
+cat("\\includegraphics{app4_05.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 5.} The seasonal peak graph.
+
+The \texttt{selBestWave} function can be used to select the "best" parameters for the seasonal-wave model. It requires a column in decimal time format. The following code adds the column \texttt{Dectime} and executes \texttt{selBestWave}. The results from \texttt{selBestWave} are simply printed, but could be saved. Even though the timing of the peak is pretty clear from the graph, we'll take advantage of the exhaustive search to find the "best" peak too.
+
+<>=
+# Add Dectime.
+app4.calib <- transform(app4.calib, Dectime=dectime(DATES))
+# Find the best model
+selBestWave(Buty ~ center(log(FLOW)) + dectime(DATES),
+    data = app4.calib,
+    "Dectime", app4.sp, exhaustive=TRUE, Regression=censReg,
+    dist="lognormal")
+@
+
+The "best" model has the timing of the peak at about 0.393, a pesticide loading period of 1 month, and a decay rate indicated by a half-life of 1 month (the fastest decay rate among the default choices). We are now ready to build and evaluate the seasonal-wave load model.
+
+<>=
+# Create and print the seasonal-wave load model.
+# Note that we can use Dectime directly in this model
+app4.lrsw <- loadReg(Buty ~ center(log(FLOW)) + Dectime +
+    seasonalWave(Dectime, 0.393, 1, 1),
+    data = app4.calib, flow = "FLOW",
+    dates = "DATES", conc.units="ug/L",
+    station="White River at Hazleton, Ind.")
+app4.lrsw
+@
+
+The Bias Diagnostics indicate a much better fit for this model than for the seasonal model originally fit. The diagnostic plots confirm the improvement in the fit.
+
+\eject
+Figure 6 shows the response versus the fitted values; censored observations are shown by open circles. It is still a bit disconcerting that there is only one censored observation below the dashed regression line for fitted values less than about -3. That discrepancy in the location of censored values also drives the appearance of greater scatter in larger fitted values; plots 2 and 3 are not shown.
+
+<>=
+# setSweave is required for the vignette.
+setSweave("app4_06", 5, 5)
+plot(app4.lrsw, which=1, set.up=FALSE)
+graphics.off()
+@
+<>=
+cat("\\includegraphics{app4_06.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 6.} The rating-curve regression model using a seasonal wave.
+
+\eject
+Figure 7 is the correlogram. It does not suggest a seasonal lack of fit, but the smooth line is not as flat as one would like.
+
+<>=
+# setSweave is required for the vignette.
+setSweave("app4_07", 5, 5)
+plot(app4.lrsw, which=4, set.up=FALSE)
+graphics.off()
+@
+<>=
+cat("\\includegraphics{app4_07.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 7.} The correlogram for the model using a seasonal wave.
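+As a rough numerical companion to the correlogram (an ad hoc check, not part of the printed output; \texttt{res.sw} is a name used only here), the lag-1 correlation of the residuals in calibration order can be computed directly. Sampling is irregular, so treat it only as an indication.
+<>=
+# Sketch: lag-1 autocorrelation of the residuals in calibration (time) order
+res.sw <- residuals(app4.lrsw)
+cor(res.sw[-1], res.sw[-length(res.sw)])
+@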
+
+\eject
+Figure 8 is a q-normal plot and shows the standardized residuals. The visual appearance of figure 8 confirms the results of the PPCC test in the printed output---the residuals show little deviation from the normal distribution, but there is one outlying observation. The open circles are censored observations and the plotted value is the expected value returned by the \texttt{residuals} function using the argument \texttt{type} set to "working."
+
+<>=
+# setSweave is required for the vignette.
+setSweave("app4_08", 5, 5)
+plot(app4.lrsw, which=5, set.up=FALSE)
+graphics.off()
+@
+<>=
+cat("\\includegraphics{app4_08.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 8.} The Q-normal plot of the residuals for the model using a seasonal wave.
+
+\eject
+Figure 9 is an extended box plot---a truncated box plot at the 5 and 95 percentiles that shows the individual values larger than the 95th percentile and smaller than the 5th percentile. The box plots in figure 9 show the distributions of the actual and estimated loads. The appearance of these box plots helps to understand the printed bias diagnostics, which show a small overestimate. The upper and lower ends of the estimated values extend beyond the observed values and the box plot of the estimates shows a median value which is a bit larger than the median value for the observed values. Therefore, the middle and upper ranges are slightly overestimated. But the overall overestimate is less than 10 percent.
+
+<>=
+# setSweave is required for the vignette.
+setSweave("app4_09", 5, 5)
+plot(app4.lrsw, which=6, set.up=FALSE)
+graphics.off()
+@
+<>=
+cat("\\includegraphics{app4_09.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 9.} Box plot comparing estimated and observed values for the model using a seasonal wave.
+
+\end{document}
diff --git a/inst/doc/app4.pdf b/inst/doc/app4.pdf
index f1d6ba5..021e42d 100644
Binary files a/inst/doc/app4.pdf and b/inst/doc/app4.pdf differ
diff --git a/inst/doc/app5.R b/inst/doc/app5.R
new file mode 100644
index 0000000..5f0a607
--- /dev/null
+++ b/inst/doc/app5.R
@@ -0,0 +1,251 @@
+### R code from vignette source 'app5.Rnw'
+
+###################################################
+### code chunk number 1: app5.Rnw:24-28
+###################################################
+# Load the rloadest package and the data
+library(rloadest)
+data(app5.calib)
+head(app5.calib)
+
+
+###################################################
+### code chunk number 2: app5.Rnw:38-43
+###################################################
+# Create and print the load model.
+app5.lr <- loadReg(Alkalinity ~ log(FLOW) + log(SC), data = app5.calib,
+    flow = "FLOW", dates = "DATES", conc.units="mg/L",
+    station="Arkansas River at Halstead, Ks.")
+app5.lr
+
+
+###################################################
+### code chunk number 3: app5.Rnw:53-57
+###################################################
+# setSweave is required for the vignette.
+setSweave("app5_01", 5, 5)
+plot(app5.lr, which=1, set.up=FALSE)
+graphics.off()
+
+
+###################################################
+### code chunk number 4: app5.Rnw:59-61
+###################################################
+cat("\\includegraphics{app5_01.pdf}\n")
+cat("\\paragraph{}\n")
+
+
+###################################################
+### code chunk number 5: app5.Rnw:69-73
+###################################################
+# setSweave is required for the vignette.
+setSweave("app5_02", 5, 5) +plot(app5.lr, which=3, set.up=FALSE) +graphics.off() + + +################################################### +### code chunk number 6: app5.Rnw:75-77 +################################################### +cat("\\includegraphics{app5_02.pdf}\n") +cat("\\paragraph{}\n") + + +################################################### +### code chunk number 7: app5.Rnw:85-89 +################################################### +# setSweave is required for the vignette. +setSweave("app5_03", 5, 5) +plot(app5.lr, which=4, set.up=FALSE) +graphics.off() + + +################################################### +### code chunk number 8: app5.Rnw:91-93 +################################################### +cat("\\includegraphics{app5_03.pdf}\n") +cat("\\paragraph{}\n") + + +################################################### +### code chunk number 9: app5.Rnw:101-105 +################################################### +# setSweave is required for the vignette. +setSweave("app5_04", 5, 5) +plot(app5.lr, which=5, set.up=FALSE) +graphics.off() + + +################################################### +### code chunk number 10: app5.Rnw:107-109 +################################################### +cat("\\includegraphics{app5_04.pdf}\n") +cat("\\paragraph{}\n") + + +################################################### +### code chunk number 11: app5.Rnw:118-122 +################################################### +# setSweave is required for the vignette. +setSweave("app5_05", 5, 5) +plot(app5.lr, which="log(FLOW)", span=0.5, set.up=FALSE) +graphics.off() + + +################################################### +### code chunk number 12: app5.Rnw:124-126 +################################################### +cat("\\includegraphics{app5_05.pdf}\n") +cat("\\paragraph{}\n") + + +################################################### +### code chunk number 13: app5.Rnw:136-146 +################################################### +# Create the revised load model and plot the correlogram. +app5.lrR1 <- loadReg(Alkalinity ~ quadratic(log(FLOW)) + log(SC), + data = app5.calib, + flow = "FLOW", dates = "DATES", conc.units="mg/L", + station="Arkansas River at Halstead, Ks.") + +# setSweave is required for the vignette. +setSweave("app5_06", 5, 5) +plot(app5.lrR1, which=4, set.up=FALSE) +graphics.off() + + +################################################### +### code chunk number 14: app5.Rnw:148-150 +################################################### +cat("\\includegraphics{app5_06.pdf}\n") +cat("\\paragraph{}\n") + + +################################################### +### code chunk number 15: app5.Rnw:162-170 +################################################### +# setSweave is required for the vignette. +setSweave("app5_07", 5, 5) +AA.pl <- with(app5.calib, xyPlot(FLOW, Alkalinity, yaxis.log=TRUE, + xaxis.log=TRUE)) +with(subset(app5.calib, DATES > "1998-01-01" & DATES < "1998-05-01"), + addXY(FLOW, Alkalinity,Plot=list(what="points", color="red"), + current=AA.pl)) +graphics.off() + + +################################################### +### code chunk number 16: app5.Rnw:172-174 +################################################### +cat("\\includegraphics{app5_07.pdf}\n") +cat("\\paragraph{}\n") + + +################################################### +### code chunk number 17: app5.Rnw:184-195 +################################################### +# Create, print the revised load model and plot the correlogram. 
+app5.lrR2 <- loadReg(Alkalinity ~ quadratic(log(FLOW)) + log(SC) +
+    fourier(DATES),
+    data = app5.calib, subset=DATES < "1998-01-01",
+    flow = "FLOW", dates = "DATES", conc.units="mg/L",
+    station="Arkansas River at Halstead, Ks.")
+app5.lrR2
+# setSweave is required for the vignette.
+setSweave("app5_08", 5, 5)
+plot(app5.lrR2, which=4, set.up=FALSE)
+graphics.off()
+
+
+###################################################
+### code chunk number 18: app5.Rnw:197-199
+###################################################
+cat("\\includegraphics{app5_08.pdf}\n")
+cat("\\paragraph{}\n")
+
+
+###################################################
+### code chunk number 19: app5.Rnw:208-212
+###################################################
+# setSweave is required for the vignette.
+setSweave("app5_09", 5, 5)
+plot(app5.lrR2, which=1, set.up=FALSE)
+graphics.off()
+
+
+###################################################
+### code chunk number 20: app5.Rnw:214-216
+###################################################
+cat("\\includegraphics{app5_09.pdf}\n")
+cat("\\paragraph{}\n")
+
+
+###################################################
+### code chunk number 21: app5.Rnw:224-228
+###################################################
+# setSweave is required for the vignette.
+setSweave("app5_10", 5, 5)
+plot(app5.lrR2, which=3, set.up=FALSE)
+graphics.off()
+
+
+###################################################
+### code chunk number 22: app5.Rnw:230-232
+###################################################
+cat("\\includegraphics{app5_10.pdf}\n")
+cat("\\paragraph{}\n")
+
+
+###################################################
+### code chunk number 23: app5.Rnw:240-244
+###################################################
+# setSweave is required for the vignette.
+setSweave("app5_11", 5, 5)
+plot(app5.lrR2, which=5, set.up=FALSE)
+graphics.off()
+
+
+###################################################
+### code chunk number 24: app5.Rnw:246-248
+###################################################
+cat("\\includegraphics{app5_11.pdf}\n")
+cat("\\paragraph{}\n")
+
+
+###################################################
+### code chunk number 25: app5.Rnw:260-285
+###################################################
+# Get the estimation data
+data(app5.est)
+# Predict daily loads for 1999
+app5.ld <- predLoad(app5.lrR2, app5.est, by="day",
+    load.units="pounds")
+# Get the 1999 sample data and merge to get daily flows and compute
+# load, note that the units must match what was selected for
+# estimation! The mergeQ function is in the smwrBase package;
+# the default names for dates and flow agree with the current datasets.
+data(app5.1999)
+app5.1999 <- mergeQ(app5.1999, Qdata=app5.est, Plot=FALSE)
+app5.1999$Load <- with(app5.1999, c2load(Alkalinity, FLOW,
+  conc.units="mg/L",
+  load.units="pounds"))
+# Create the graph
+setSweave("app5_12", 5, 5)
+AA.pl <- with(app5.ld, timePlot(Date, Flux,
+  Plot=list(name="Daily load estimate", what="overlaid",
+  size=0.03),
+  ytitle="Alkalinity Load, in pounds per day"))
+AA.pl <- with(app5.1999, addXY(DATES, Load,
+  Plot=list(name="Observed load", what="points"),
+  current=AA.pl))
+addExplanation(AA.pl, where="ur", title="")
+graphics.off()
+
+
+###################################################
+### code chunk number 26: app5.Rnw:287-289
+###################################################
+cat("\\includegraphics{app5_12.pdf}\n")
+cat("\\paragraph{}\n")
+
+
diff --git a/inst/doc/app5.Rnw b/inst/doc/app5.Rnw
new file mode 100644
index 0000000..9801146
--- /dev/null
+++ b/inst/doc/app5.Rnw
@@ -0,0 +1,296 @@
+\documentclass{article}
+\parskip 3pt
+%\VignetteIndexEntry{User-Defined Model with an Additional Variable}
+%\VignetteDepends{rloadest}
+
+\begin{document}
+\SweaveOpts{concordance=TRUE}
+\raggedright
+\parindent 30 pt
+
+\title{Application 5: User-Defined Model with an Additional Variable}
+
+\author{Dave Lorenz}
+
+\maketitle
+
+This example illustrates the development of a user-defined model that includes an extra explanatory variable that is not related to flow or time. In this case, the extra explanatory variable is specific conductance.
+
+The advent of water-quality monitors has facilitated the inclusion of surrogate variables such as specific conductance, pH, water temperature, dissolved oxygen, and turbidity that can be useful in the formulation of rating-curve models for load estimation. Following the work of Christiansen and others (2000), this example uses 103 observations of streamflow and specific conductance collected from 1995 to 1998 to build a regression model for alkalinity in the Little Arkansas River, Kansas.
+
+Part 2 delves into the diagnostic plots and builds a "better" model. It then reproduces figure 22 from Runkel and others (2004).
+
+
+<<echo=TRUE>>=
+# Load the rloadest package and the data
+library(rloadest)
+data(app5.calib)
+head(app5.calib)
+@
+
+\eject
+\section{Build the Model}
+
+The \texttt{loadReg} function is used to build the rating-curve model for constituent load estimation. The basic form of the call to \texttt{loadReg} is similar to the call to \texttt{lm} in that it requires a formula and data source. The response variable in the formula is the constituent concentration, which is converted to load per day (flux) based on the units of concentration and the units of flow. The \texttt{conc.units}, \texttt{flow.units}, and \texttt{load.units} arguments to \texttt{loadReg} define the conversion. For these data, the concentration units (\texttt{conc.units}) are "mg/L", the flow units are "cfs" (the default), and the load units for the model are "kg" (also the default). If \texttt{conc.units} is not set, it is assumed to be "mg/L" and a warning is issued. Two additional pieces of information are required for \texttt{loadReg}---the names of the flow column and the dates column. A final option, the station identifier, can also be specified.
+
+Centered log flow is not necessary for any load model. It is used throughout LOADEST, but is optional for the construction of the rating-curve model. This example application does not use centered log flow in the initial model formulation.
+
+<<echo=TRUE>>=
+# Create and print the load model.
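+# (Note: log(SC) brings specific conductance into the rating curve as a
+# surrogate explanatory variable, in addition to the usual log flow term.)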
+app5.lr <- loadReg(Alkalinity ~ log(FLOW) + log(SC), data = app5.calib,
+  flow = "FLOW", dates = "DATES", conc.units="mg/L",
+  station="Arkansas River at Halstead, Ks.")
+app5.lr
+@
+
+\eject
+\section{Diagnostic Plots}
+
+The rloadest package contains a \texttt{plot} function that creates diagnostic plots of the load model. Most often the user will just enter \texttt{plot(app5.lr)} (for this example) in the R Console window to generate the full suite of plots, but this example application will generate each plot individually. In general, the user will not need to set up a graphics device, but for this vignette, the graphics device must be set up for each graph.
+
+Figure 1 shows the response versus the fitted values. The LOWESS curve agrees very well with the regression line and the scatter is very small. There does appear to be a small amount of curvature in the fit, suggested by the response values at the ends lying above the fit and a bump below the line between fitted values of 11 and 12. Subsequent diagnostic plots can be used to assess whether the nonlinearity is a concern.
+
+<<echo=TRUE>>=
+# setSweave is required for the vignette.
+setSweave("app5_01", 5, 5)
+plot(app5.lr, which=1, set.up=FALSE)
+graphics.off()
+@
+<<results=tex, echo=FALSE>>=
+cat("\\includegraphics{app5_01.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 1.} The rating-curve regression model.
+
+\eject
+Figure 2 is a scale-location (S-L) graph, which is useful for assessing heteroscedasticity of the residuals. The horizontal dashed line is the expected value of the square root of the absolute value of the residuals, and the solid line is the LOWESS smooth. The slope of the LOWESS line indicates some cause for concern about unequal variance in the estimates.
+
+<<echo=TRUE>>=
+# setSweave is required for the vignette.
+setSweave("app5_02", 5, 5)
+plot(app5.lr, which=3, set.up=FALSE)
+graphics.off()
+@
+<<results=tex, echo=FALSE>>=
+cat("\\includegraphics{app5_02.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 2.} The scale-location graph for the regression model.
+
+\eject
+The correlogram in figure 3 is an adaptation of the correlogram from time-series analysis, which deals with regularly spaced samples. The horizontal dashed line is the zero value and the solid line is a kernel smooth rather than a LOWESS line. The kernel smooth gives a better fit in this case. The solid line should be very close to the horizontal line. In this case, there is a slight regular pattern with a higher line at 0 and 1 and a low line at about 0.5. This might suggest a seasonal lack of fit.
+
+<<echo=TRUE>>=
+# setSweave is required for the vignette.
+setSweave("app5_03", 5, 5)
+plot(app5.lr, which=4, set.up=FALSE)
+graphics.off()
+@
+<<results=tex, echo=FALSE>>=
+cat("\\includegraphics{app5_03.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 3.} The correlogram from the regression model.
+
+\eject
+Figure 4 is a q-normal plot that shows the standardized residuals, which are assumed to have a standard deviation of 1. The solid line is the theoretical fit of a mean of 0 and a standard deviation of 1. The visual appearance of figure 4 confirms the results of the PPCC test in the printed output---the residuals are reasonably normal in distribution.
+
+<<echo=TRUE>>=
+# setSweave is required for the vignette.
+setSweave("app5_04", 5, 5)
+plot(app5.lr, which=5, set.up=FALSE)
+graphics.off()
+@
+<<results=tex, echo=FALSE>>=
+cat("\\includegraphics{app5_04.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 4.} The Q-normal plot of the residuals.
+
+\eject
+Figure 5 is one of the partial-residual plots showing the relation to the log of flow.
+For this example, the span is reduced from the default of 1.0 to 0.5 to emphasize the curvature in the relation. The smooth line better represents the curvature of the relation, and the second-order polynomial test for linearity clearly indicates the strength of the curvature because the p-value is much less than 0.05.
+
+<<echo=TRUE>>=
+# setSweave is required for the vignette.
+setSweave("app5_05", 5, 5)
+plot(app5.lr, which="log(FLOW)", span=0.5, set.up=FALSE)
+graphics.off()
+@
+<<results=tex, echo=FALSE>>=
+cat("\\includegraphics{app5_05.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 5.} Partial residual plot for the log flow.
+
+\eject
+\section{Part 2 Revise the Model}
+
+Figure 5 clearly indicated that the relation with log flow was not linear. The most straightforward option for these data is to use a second-order polynomial. The correlogram (fig. 3) suggested that seasonality might be a concern, and adding quadratic terms can account for some seasonality when the largest or smallest values are not being estimated correctly, so the revised model will include only quadratic flow and specific conductance. The residuals will be examined to explain and resolve the structure of the correlogram.
+
+<<echo=TRUE>>=
+# Create the revised load model and plot the correlogram.
+app5.lrR1 <- loadReg(Alkalinity ~ quadratic(log(FLOW)) + log(SC),
+  data = app5.calib,
+  flow = "FLOW", dates = "DATES", conc.units="mg/L",
+  station="Arkansas River at Halstead, Ks.")
+
+# setSweave is required for the vignette.
+setSweave("app5_06", 5, 5)
+plot(app5.lrR1, which=4, set.up=FALSE)
+graphics.off()
+@
+<<results=tex, echo=FALSE>>=
+cat("\\includegraphics{app5_06.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 6.} The correlogram from the revised regression model.
+
+\eject
+Normally, the correlogram shown in figure 6 would suggest that seasonal terms should be added and that would be the logical next step. However, for these data, that would not show a substantial difference in the correlogram and the sine and cosine terms would not significantly improve the model.
+
+To improve the model, one must understand the setting and the hydrologic conditions. There is a low-head dam at this site, which can affect the relation between flow, specific conductance and alkalinity. Flows were much higher in January through April of 1998, when groundwater would normally be expected to dominate, and the presence of the low-head dam could affect those relations during periods of abnormal flow within a season.
+
+Figure 7 shows the relation between alkalinity and flow with the data for winter and early spring 1998 highlighted in red. The red points are all to the right of the cloud of other points, suggesting a change in the structural relation between alkalinity and flow for that time period. For the purposes of estimating loads for 1999, we will simply subset the data to exclude all of 1998.
+
+<<echo=TRUE>>=
+# setSweave is required for the vignette.
+setSweave("app5_07", 5, 5)
+AA.pl <- with(app5.calib, xyPlot(FLOW, Alkalinity, yaxis.log=TRUE,
+  xaxis.log=TRUE))
+with(subset(app5.calib, DATES > "1998-01-01" & DATES < "1998-05-01"),
+  addXY(FLOW, Alkalinity, Plot=list(what="points", color="red"),
+  current=AA.pl))
+graphics.off()
+@
+<<results=tex, echo=FALSE>>=
+cat("\\includegraphics{app5_07.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 7.} The relation between alkalinity and flow highlighting winter and early spring 1998 (in red).
+
+\eject
+The correlogram in figure 6 does indicate seasonality, so the second revision of the load model, which also excludes the 1998 data, will include the sine and cosine terms (using the function \texttt{fourier}). The correlogram for the second revised model (fig. 8) is much improved.
+
+The report for the new model does indicate a poor goodness-of-fit for the normality of the residuals, and the variance inflation factors for linear flow and SC are relatively large, greater than 5. The normality of the residuals is assessed later in this section. For this model, the relatively large variance inflation factors should not be a concern because the correlation structure is similar between flow and specific conductance in the calibration and estimation data, but a bit stronger in the calibration (\Sexpr{round(with(subset(app5.calib, DATES < "1998-01-01"), cor(log(FLOW), log(SC))), 3)}) than in the estimation data (\Sexpr{round(with(app5.est, cor(log(FLOW), log(SC))), 3)}).
+
+<<echo=TRUE>>=
+# Create, print the revised load model and plot the correlogram.
+app5.lrR2 <- loadReg(Alkalinity ~ quadratic(log(FLOW)) + log(SC) +
+    fourier(DATES),
+  data = app5.calib, subset=DATES < "1998-01-01",
+  flow = "FLOW", dates = "DATES", conc.units="mg/L",
+  station="Arkansas River at Halstead, Ks.")
+app5.lrR2
+# setSweave is required for the vignette.
+setSweave("app5_08", 5, 5)
+plot(app5.lrR2, which=4, set.up=FALSE)
+graphics.off()
+@
+<<results=tex, echo=FALSE>>=
+cat("\\includegraphics{app5_08.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 8.} The correlogram from the second revised regression model.
+
+\eject
+
+Figure 9 shows the response versus the fitted values for the second revised model. The curvature is reduced compared with figure 1.
+
+<<echo=TRUE>>=
+# setSweave is required for the vignette.
+setSweave("app5_09", 5, 5)
+plot(app5.lrR2, which=1, set.up=FALSE)
+graphics.off()
+@
+<<results=tex, echo=FALSE>>=
+cat("\\includegraphics{app5_09.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 9.} The rating-curve regression model.
+
+\eject
+Figure 10 is the scale-location (S-L) graph of the second revised model. There is still a bit of heteroscedasticity, with lower scatter at the smaller fitted values but nearly constant scatter at the larger values, where most of the load is transported.
+
+<<echo=TRUE>>=
+# setSweave is required for the vignette.
+setSweave("app5_10", 5, 5)
+plot(app5.lrR2, which=3, set.up=FALSE)
+graphics.off()
+@
+<<results=tex, echo=FALSE>>=
+cat("\\includegraphics{app5_10.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 10.} The scale-location graph for the regression model.
+
+\eject
+Figure 11 is the q-normal plot for the second revised model. In this case, there are a couple of outliers, one high and one low, which contribute to the small p-value recorded in the PPCC test.
+
+<<echo=TRUE>>=
+# setSweave is required for the vignette.
+setSweave("app5_11", 5, 5)
+plot(app5.lrR2, which=5, set.up=FALSE)
+graphics.off()
+@
+<<results=tex, echo=FALSE>>=
+cat("\\includegraphics{app5_11.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 11.} The Q-normal plot of the residuals.
+
+The complete view of the diagnostic plots suggests that there is some nonlinearity in the sine and cosine terms. Most often that would suggest adding additional, second-order, sine and cosine terms, or using a different seasonal term such as the seasonal wave, which was shown in applications 2 and 4. For this particular example, adding terms does not substantially improve the model, so we stop at the first-order terms.
+
+\eject
+\section{Predict Daily Loads for 1999}
+
+Figure 22 in Runkel and others (2004) requires daily load estimates and the calibration load data.
+The \texttt{c2load} function can be used to calculate the daily loads in the 1999 dataset (app5.1999). Note that the daily estimation data set, \texttt{app5.est}, has 7 missing days and so has only 358 observations instead of 365 for 1999.
+
+<<echo=TRUE>>=
+# Get the estimation data
+data(app5.est)
+# Predict daily loads for 1999
+app5.ld <- predLoad(app5.lrR2, app5.est, by="day",
+  load.units="pounds")
+# Get the 1999 sample data and merge to get daily flows and compute
+# load. Note that the units must match what was selected for
+# estimation! The mergeQ function is in the smwrBase package;
+# the default names for dates and flow agree with the current datasets.
+data(app5.1999)
+app5.1999 <- mergeQ(app5.1999, Qdata=app5.est, Plot=FALSE)
+app5.1999$Load <- with(app5.1999, c2load(Alkalinity, FLOW,
+  conc.units="mg/L",
+  load.units="pounds"))
+# Create the graph
+setSweave("app5_12", 5, 5)
+AA.pl <- with(app5.ld, timePlot(Date, Flux,
+  Plot=list(name="Daily load estimate", what="overlaid",
+  size=0.03),
+  ytitle="Alkalinity Load, in pounds per day"))
+AA.pl <- with(app5.1999, addXY(DATES, Load,
+  Plot=list(name="Observed load", what="points"),
+  current=AA.pl))
+addExplanation(AA.pl, where="ur", title="")
+graphics.off()
+@
+<<results=tex, echo=FALSE>>=
+cat("\\includegraphics{app5_12.pdf}\n")
+cat("\\paragraph{}\n")
+@
+
+\textbf{Figure 12.} Daily load estimates for 1999.
+
+The estimated loads for 1999 are extrapolated beyond the record used for calibration and as such are reasonable checks for the rating-curve model. In general, the largest loads shown in figure 12 are larger than those shown in figure 22 in Runkel and others (2004). The reason for that general observation can mostly be attributed to including the second-order flow term in the second revised model, which in this case increases the estimates of the largest loads because the largest loads were underestimated in the original model. The observed loads agree fairly well with the estimated loads, but two deserve additional explanation---the observed loads in July and early August agree better with these estimated loads than with those in figure 22 of Runkel and others (2004).
+
+\end{document}
diff --git a/inst/doc/app5.pdf b/inst/doc/app5.pdf
index 6bc8672..a6a66a8 100644
Binary files a/inst/doc/app5.pdf and b/inst/doc/app5.pdf differ
diff --git a/inst/doc/app6.R b/inst/doc/app6.R
new file mode 100644
index 0000000..64cd8eb
--- /dev/null
+++ b/inst/doc/app6.R
@@ -0,0 +1,85 @@
+### R code from vignette source 'app6.Rnw'
+
+###################################################
+### code chunk number 1: app6.Rnw:21-25
+###################################################
+# Load the rloadest package and the data
+library(rloadest)
+data(app5.calib)
+head(app5.calib)
+
+
+###################################################
+### code chunk number 2: app6.Rnw:33-40
+###################################################
+# Create and print the load model with concentration.
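+# (This is the same formula as the second revised model from application 5.)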
+app6.lr <- loadReg(Alkalinity ~ quadratic(log(FLOW)) + log(SC) +
+    fourier(DATES),
+  data = app5.calib, subset=DATES < "1998-01-01",
+  flow = "FLOW", dates = "DATES", conc.units="mg/L",
+  station="Arkansas River at Halstead, Ks.")
+print(app6.lr, load.only=FALSE)
+
+
+###################################################
+### code chunk number 3: app6.Rnw:45-50
+###################################################
+# Get the estimation data
+data(app5.est)
+# Predict daily concentrations
+app6.cd <- predConc(app6.lr, app5.est, by="day")
+head(app6.cd)
+
+
+###################################################
+### code chunk number 4: app6.Rnw:61-67
+###################################################
+# Create synthetic flow values
+app6.calib <- transform(app5.calib, Sflow=1/c2load(1, 1,
+  conc.units="mg/L"))
+app6.est <- transform(app5.est, Sflow=1/c2load(1, 1,
+  conc.units="mg/L"))
+head(app6.calib)
+
+
+###################################################
+### code chunk number 5: app6.Rnw:72-79
+###################################################
+# Create and print the load model.
+app6.lrTWM <- loadReg(Alkalinity ~ quadratic(log(FLOW)) + log(SC) +
+    fourier(DATES),
+  data = app6.calib, subset=DATES < "1998-01-01",
+  flow = "Sflow", dates = "DATES", conc.units="mg/L",
+  station="Arkansas River at Halstead, Ks.")
+print(app6.lrTWM)
+
+
+###################################################
+### code chunk number 6: app6.Rnw:84-89
+###################################################
+# Predict monthly TWM concentrations using the \texttt{predLoad} function.
+app6.TWM <- predLoad(app6.lrTWM, app6.est, by="month")
+# Change the name of the Flux column to Conc
+names(app6.TWM)[3] <- "Conc"
+app6.TWM
+
+
+###################################################
+### code chunk number 7: app6.Rnw:94-95
+###################################################
+with(app6.cd, tapply(Conc, month(Date, label=TRUE), mean))
+
+
+###################################################
+### code chunk number 8: app6.Rnw:104-112
+###################################################
+# Compute the monthly fluxes.
+app6.FWM <- predLoad(app6.lr, app6.est, by="month")
+# Compute the mean flows
+app6.FWM$Flow <- as.vector(with(app6.est, tapply(FLOW, month(DATES), mean)))
+# Compute the FWM concentration
+app6.FWM <- transform(app6.FWM, FWMC=Flux/Flow/
+  c2load(1, 1, conc.units="mg/L"))
+app6.FWM
+
+
diff --git a/inst/doc/app6.Rnw b/inst/doc/app6.Rnw
new file mode 100644
index 0000000..ef88dda
--- /dev/null
+++ b/inst/doc/app6.Rnw
@@ -0,0 +1,116 @@
+\documentclass{article}
+\parskip 6pt
+%\VignetteIndexEntry{Regression Model for Concentration}
+%\VignetteDepends{rloadest}
+
+\begin{document}
+\SweaveOpts{concordance=TRUE}
+\raggedright
+
+\title{Application 6: Regression Model for Concentration}
+
+\author{Dave Lorenz}
+
+\maketitle
+
+This example illustrates how to use the regression model to estimate concentration rather than load. Concentrations can be estimated for daily or shorter time frames directly from the load regression output. Time-weighted mean concentrations for longer time periods can be computed using the method described for application 6 in Runkel and others (2004), or by simply computing the mean of the daily values if confidence intervals are not needed. Flow-weighted mean concentrations can be computed by estimating the flux for each period of time, dividing by the mean flow, and dividing by the correct conversion factor; the confidence interval can be computed in the same manner.
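+
+As a minimal sketch of that flow-weighted arithmetic (not evaluated here;
+\texttt{Flux} and \texttt{Flow} stand for a hypothetical period flux and mean
+flow, and the conversion factor comes from \texttt{c2load}):
+
+<<eval=FALSE>>=
+# FWM concentration = flux / mean flow / concentration-to-load factor
+FWMC <- Flux / Flow / c2load(1, 1, conc.units="mg/L")
+@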
+
+This example uses the second revised model from example application 5. The user is directed to that example vignette for details on constructing that model. Part 2 illustrates how to estimate time-weighted mean (TWM) monthly concentrations. Part 3 illustrates how to estimate flow-weighted mean (FWM) concentrations.
+
+
+<<echo=TRUE>>=
+# Load the rloadest package and the data
+library(rloadest)
+data(app5.calib)
+head(app5.calib)
+@
+
+\eject
+\section{Predict Daily Concentrations for 1999}
+
+The concentration model is computed using \texttt{loadReg}. The default print does not print the concentration model, but it can be printed by setting \texttt{load.only} to \texttt{FALSE}.
+
+<<echo=TRUE>>=
+# Create and print the load model with concentration.
+app6.lr <- loadReg(Alkalinity ~ quadratic(log(FLOW)) + log(SC) +
+    fourier(DATES),
+  data = app5.calib, subset=DATES < "1998-01-01",
+  flow = "FLOW", dates = "DATES", conc.units="mg/L",
+  station="Arkansas River at Halstead, Ks.")
+print(app6.lr, load.only=FALSE)
+@
+
+The \texttt{rloadest} package contains the function \texttt{predConc} that will estimate concentrations for daily or shorter time periods, depending on the time step. Note that the daily estimation data set, \texttt{app5.est}, has 7 missing days and so has only 358 observations instead of 365 for 1999.
+
+<<echo=TRUE>>=
+# Get the estimation data
+data(app5.est)
+# Predict daily concentrations
+app6.cd <- predConc(app6.lr, app5.est, by="day")
+head(app6.cd)
+@
+
+\eject
+\section{Part 2 Predict Monthly Time-weighted Mean Concentrations for 1999}
+
+The estimation of TWM concentrations requires the user to "trick" the regression model by substituting a constant synthetic value for the flow column so that the \texttt{predLoad} function actually estimates concentrations instead of loads. The details of the development of the method are described in Runkel and others (2004). This example application only describes the implementation using \texttt{rloadest} functions.
+
+The first step is to create a synthetic flow column in both the calibration and the estimation data sets. The value of the data is the conversion factor from load to concentration, or the reciprocal of the conversion factor from concentration to load, which can be obtained using the \texttt{c2load} function.
+
+
+<<echo=TRUE>>=
+# Create synthetic flow values
+app6.calib <- transform(app5.calib, Sflow=1/c2load(1, 1,
+  conc.units="mg/L"))
+app6.est <- transform(app5.est, Sflow=1/c2load(1, 1,
+  conc.units="mg/L"))
+head(app6.calib)
+@
+
+The next step is to construct the calibrated model. This example will use the previously calibrated model, so diagnostic plots will not be created. The only difference between this model and the previous model is that the flow column is defined as the synthetic flow column. The printed report for this load model should agree with the printed report for the concentration model in the previous section.
+
+<<echo=TRUE>>=
+# Create and print the load model.
+app6.lrTWM <- loadReg(Alkalinity ~ quadratic(log(FLOW)) + log(SC) +
+    fourier(DATES),
+  data = app6.calib, subset=DATES < "1998-01-01",
+  flow = "Sflow", dates = "DATES", conc.units="mg/L",
+  station="Arkansas River at Halstead, Ks.")
+print(app6.lrTWM)
+@
+
+The user can now estimate TWM concentrations for periods longer than 1 day. Remember that the estimation dataset has 7 missing days, so no estimates will be made for the months containing those days.
+
+<<echo=TRUE>>=
+# Predict monthly TWM concentrations using the \texttt{predLoad} function.
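+# Because the flow column is the constant 1/c2load(1, 1, conc.units="mg/L"),
+# the Flux column that predLoad returns is actually the TWM concentration.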
+app6.TWM <- predLoad(app6.lrTWM, app6.est, by="month")
+# Change the name of the Flux column to Conc
+names(app6.TWM)[3] <- "Conc"
+app6.TWM
+@
+
+A quick check of the mean daily concentrations verifies the computations. Note that there is no protection against incomplete months when the TWM concentrations are computed manually, so values are obtained for June, July and September.
+
+<<echo=TRUE>>=
+with(app6.cd, tapply(Conc, month(Date, label=TRUE), mean))
+@
+
+\eject
+\section{Part 3 Predict Monthly Flow-weighted Mean Concentrations for 1999}
+
+The estimation of FWM concentrations is a two-step process. The first step is to build the rating-curve model for loads and estimate the flux desired for each period. The second step is to compute the mean flow for each period, divide the flux by the flow, and convert to concentration units. The load model for this example (app6.lr) was created in the first part of this vignette.
+
+
+<<echo=TRUE>>=
+# Compute the monthly fluxes.
+app6.FWM <- predLoad(app6.lr, app6.est, by="month")
+# Compute the mean flows
+app6.FWM$Flow <- as.vector(with(app6.est, tapply(FLOW, month(DATES), mean)))
+# Compute the FWM concentration
+app6.FWM <- transform(app6.FWM, FWMC=Flux/Flow/
+  c2load(1, 1, conc.units="mg/L"))
+app6.FWM
+@
+
+For these data, the FWM concentration is less than the TWM concentration. In general, the FWM concentration will be less than the TWM concentration when the concentration and flow are negatively correlated.
+\end{document}
diff --git a/inst/doc/app6.pdf b/inst/doc/app6.pdf
index b78bc0f..c489a66 100644
Binary files a/inst/doc/app6.pdf and b/inst/doc/app6.pdf differ
diff --git a/man/AICc.Rd b/man/AICc.Rd
index 3fb865d..211dd6d 100644
--- a/man/AICc.Rd
+++ b/man/AICc.Rd
@@ -1,4 +1,5 @@
-% Generated by roxygen2 (4.0.2): do not edit by hand
+% Generated by roxygen2 (4.1.0): do not edit by hand
+% Please edit documentation in R/AICc.R
\name{AICc}
\alias{AICc}
\title{Akaike's An Information Criterion with Correction}
diff --git a/man/Atrazine.Rd b/man/Atrazine.Rd
deleted file mode 100644
index 47021f2..0000000
--- a/man/Atrazine.Rd
+++ /dev/null
@@ -1,12 +0,0 @@
-% Generated by roxygen2 (4.0.2): do not edit by hand
-\docType{data}
-\name{Atrazine}
-\alias{Atrazine}
-\title{Example Atrazine data included in LOADEST package}
-\description{
-Example data representing atrazine
-}
-\keyword{data}
-\keyword{quality}
-\keyword{water}
-
diff --git a/man/LOADEST-package.Rd b/man/LOADEST-package.Rd
deleted file mode 100644
index f8d67a4..0000000
--- a/man/LOADEST-package.Rd
+++ /dev/null
@@ -1,40 +0,0 @@
-% Generated by roxygen2 (4.0.2): do not edit by hand
-\docType{package}
-\name{LOADEST-package}
-\alias{LOADEST-package}
-\title{LOADEST functions for R}
-\description{
-\tabular{ll}{
-Package: \tab LOADEST\cr
-Type: \tab Package\cr
-Version: \tab 0.4.0\cr
-Date: \tab 2015-02-27\cr
-License: \tab File\cr
-LazyLoad: \tab yes\cr
-}
-}
-\details{
-This package is intended to replicate and extend the LOADEST program for
-estimating constituent loads in streams and rivers. Some subtle differences
-between the output for LOADEST and rloadest include:
-
-LOADEST uses centered time when computing the sine and cosine terms in model
-numbers 4, 6, 7, 8, and 9, but the functions in rloadest use the actual decimal
-time so that the seasonality can more easily be assessed by the user.
-
-The order of the terms in the predefined models is different between LOADEST
-and the rloadest functions.
-
-The printed output of the model descriptions from rloadest matches the format
-the most users of R would recognize from other linear model output rather then
-the printed output from LOADEST.
-
-Furhtermore, the model building capability in the rloadest functions make easier
-to explore other forms of rating-curve models than LOADEST.
-}
-\author{
-Dave Lorenz \email{lorenz@usgs.gov}
-}
-\keyword{estimation}
-\keyword{load}
-
diff --git a/man/Models.Rd b/man/Models.Rd
new file mode 100644
index 0000000..40bf33a
--- /dev/null
+++ b/man/Models.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2 (4.1.0): do not edit by hand
+% Please edit documentation in R/Models.R
+\docType{data}
+\name{Models}
+\alias{Models}
+\title{Models
+A dataset that describes the equivalent formula for each of the
+predefined model numbers.}
+\format{Data frame with 9 rows and 2 columns\cr
+\tabular{lll}{
+Name \tab Type \tab Description\cr
+Number \tab integer \tab The predefined model number\cr
+Formula \tab factor \tab The equivalent formula for the predefined model number\cr
+}}
+\usage{
+Models
+}
+\description{
+Models
+A dataset that describes the equivalent formula for each of the
+predefined model numbers.
+}
+\examples{
+data(Models)
+print(Models)
+}
+\seealso{
+\code{\link{model}}
+}
+\keyword{datasets}
+
diff --git a/man/c2load.Rd b/man/c2load.Rd
index b96b1d0..5ce9db4 100644
--- a/man/c2load.Rd
+++ b/man/c2load.Rd
@@ -1,4 +1,5 @@
-% Generated by roxygen2 (4.0.2): do not edit by hand
+% Generated by roxygen2 (4.1.0): do not edit by hand
+% Please edit documentation in R/c2load.R
\name{c2load}
\alias{c2load}
\title{Loads}
diff --git a/man/censoring.Surv.Rd b/man/censoring.Surv.Rd
new file mode 100644
index 0000000..48bf935
--- /dev/null
+++ b/man/censoring.Surv.Rd
@@ -0,0 +1,32 @@
+% Generated by roxygen2 (4.1.0): do not edit by hand
+% Please edit documentation in R/censoring.Surv.R
+\name{censoring.Surv}
+\alias{censoring.Surv}
+\title{Describe Censoring}
+\usage{
+\method{censoring}{Surv}(x)
+}
+\arguments{
+\item{x}{the object to get the type of censoring. For an object of class
+"Surv," the type must be "interval."}
+}
+\value{
+A character string "none," "left," or "multiple" describing the type
+of censoring present in the object.
+}
+\description{
+Gets the type of censoring ("none," "left," "multiple") for an object.
+}
+\note{
+This function is mostly used within other functions to determine the
+'best' technique to use for analysis.
+}
+\examples{
+\dontrun{
+library(survival)
+censoring(Surv(2.3, 2.3, type="interval2"))
+}
+}
+\keyword{attribute}
+\keyword{censored}
+
diff --git a/man/center.Rd b/man/center.Rd
index f8fa093..e097c0e 100644
--- a/man/center.Rd
+++ b/man/center.Rd
@@ -1,4 +1,5 @@
-% Generated by roxygen2 (4.0.2): do not edit by hand
+% Generated by roxygen2 (4.1.0): do not edit by hand
+% Please edit documentation in R/center.R
\name{center}
\alias{center}
\title{Centered Linear Terms}
diff --git a/man/coef.loadReg.Rd b/man/coef.loadReg.Rd
index 8a40d8d..8738881 100644
--- a/man/coef.loadReg.Rd
+++ b/man/coef.loadReg.Rd
@@ -1,4 +1,5 @@
-% Generated by roxygen2 (4.0.2): do not edit by hand
+% Generated by roxygen2 (4.1.0): do not edit by hand
+% Please edit documentation in R/coef.loadReg.R
\name{coef.loadReg}
\alias{coef.loadReg}
\title{Extract Model Coefficients}
diff --git a/man/dailyAg.Rd b/man/dailyAg.Rd
index 8d999a5..e7886c6 100644
--- a/man/dailyAg.Rd
+++ b/man/dailyAg.Rd
@@ -1,4 +1,5 @@
-% Generated by roxygen2 (4.0.2): do not edit by hand
+% Generated by roxygen2 (4.1.0): do not edit by hand
+% Please edit documentation in R/dailyAg.R
\name{dailyAg}
\alias{dailyAg}
\title{Daily Mean}
diff --git a/man/fitted.loadReg.Rd b/man/fitted.loadReg.Rd
index 2926544..dd8472c 100644
--- a/man/fitted.loadReg.Rd
+++ b/man/fitted.loadReg.Rd
@@ -1,4 +1,5 @@
-% Generated by roxygen2 (4.0.2): do not edit by hand
+% Generated by roxygen2 (4.1.0): do not edit by hand
+% Please edit documentation in R/fitted.loadReg.R
\name{fitted.loadReg}
\alias{fitted.loadReg}
\title{Extract Model Fitted Values}
diff --git a/man/loadConvFactor.Rd b/man/loadConvFactor.Rd
index eb2e635..2056670 100644
--- a/man/loadConvFactor.Rd
+++ b/man/loadConvFactor.Rd
@@ -1,4 +1,5 @@
-% Generated by roxygen2 (4.0.2): do not edit by hand
+% Generated by roxygen2 (4.1.0): do not edit by hand
+% Please edit documentation in R/loadConvFactor.R
\name{loadConvFactor}
\alias{loadConvFactor}
\title{Unit Conversion}
diff --git a/man/loadReg.Rd b/man/loadReg.Rd
index 928f893..19b54f5 100644
--- a/man/loadReg.Rd
+++ b/man/loadReg.Rd
@@ -1,4 +1,5 @@
-% Generated by roxygen2 (4.0.2): do not edit by hand
+% Generated by roxygen2 (4.1.0): do not edit by hand
+% Please edit documentation in R/loadReg.R
\name{loadReg}
\alias{loadReg}
\title{Load Estimation}
@@ -48,8 +49,14 @@ variables constructed using \code{Surv} of \code{type} "right,", "interval," or
For un- or left-censored data, AMLE is used unless weights are specified in
the model, then MLE is used, through a call to \code{survreg}. For any other
censored data, MLE is used.\cr
-See \code{\link{loadConvFactor}} for details about valid values for
-\code{flow.units}, \code{con.units} and \code{load.units}.
+
+Typically, \code{loadReg} expects the response variable to have units of
+concentration, mass per volume. For these models, see \code{\link{loadConvFactor}}
+for details about valid values for \code{flow.units}, \code{conc.units} and
+\code{load.units}. For some applications, like bed load estimation, the response
+variable can have units of flux, mass per time. For these models, \code{conc.units}
+can be expressed as any valid \code{load.units} per day. The rate must be expressed
+in terms of "/d," "/dy," or "/day."
} \examples{ # From application 1 in the vignettes diff --git a/man/loadReport.Rd b/man/loadReport.Rd index d4634af..f72545e 100644 --- a/man/loadReport.Rd +++ b/man/loadReport.Rd @@ -1,4 +1,5 @@ -% Generated by roxygen2 (4.0.2): do not edit by hand +% Generated by roxygen2 (4.1.0): do not edit by hand +% Please edit documentation in R/loadReport.R \name{loadReport} \alias{loadReport} \title{Create Load Report} diff --git a/man/loadStats.Rd b/man/loadStats.Rd index bccd699..72e4637 100644 --- a/man/loadStats.Rd +++ b/man/loadStats.Rd @@ -1,4 +1,5 @@ -% Generated by roxygen2 (4.0.2): do not edit by hand +% Generated by roxygen2 (4.1.0): do not edit by hand +% Please edit documentation in R/loadStats.R \name{loadStats} \alias{loadStats} \title{Summary Statistics} diff --git a/man/loadUnitConv.Rd b/man/loadUnitConv.Rd index 6cb2b73..fe82796 100644 --- a/man/loadUnitConv.Rd +++ b/man/loadUnitConv.Rd @@ -1,4 +1,5 @@ -% Generated by roxygen2 (4.0.2): do not edit by hand +% Generated by roxygen2 (4.1.0): do not edit by hand +% Please edit documentation in R/loadUnitConv.R \name{loadUnitConv} \alias{loadUnitConv} \title{Unit Conversion} diff --git a/man/loadestQadj.Rd b/man/loadestQadj.Rd index d77296f..96a6c4e 100644 --- a/man/loadestQadj.Rd +++ b/man/loadestQadj.Rd @@ -1,4 +1,5 @@ -% Generated by roxygen2 (4.0.2): do not edit by hand +% Generated by roxygen2 (4.1.0): do not edit by hand +% Please edit documentation in R/loadestQadj.R \name{loadestQadj} \alias{loadestQadj} \title{Center Flow} diff --git a/man/loadestTadj.Rd b/man/loadestTadj.Rd index b4339a7..6e6f4aa 100644 --- a/man/loadestTadj.Rd +++ b/man/loadestTadj.Rd @@ -1,4 +1,5 @@ -% Generated by roxygen2 (4.0.2): do not edit by hand +% Generated by roxygen2 (4.1.0): do not edit by hand +% Please edit documentation in R/loadestTadj.R \name{loadestTadj} \alias{loadestTadj} \title{Center Time} diff --git a/man/logLik.loadReg.Rd b/man/logLik.loadReg.Rd index 8388471..1972053 100644 --- a/man/logLik.loadReg.Rd +++ b/man/logLik.loadReg.Rd @@ -1,4 +1,5 @@ -% Generated by roxygen2 (4.0.2): do not edit by hand +% Generated by roxygen2 (4.1.0): do not edit by hand +% Please edit documentation in R/logLik.loadReg.R \name{logLik.loadReg} \alias{logLik.loadReg} \title{Extract Log-Likelihood} diff --git a/man/makepredictcall.center.Rd b/man/makepredictcall.center.Rd index 83d43ea..b6f7fc5 100644 --- a/man/makepredictcall.center.Rd +++ b/man/makepredictcall.center.Rd @@ -1,4 +1,5 @@ -% Generated by roxygen2 (4.0.2): do not edit by hand +% Generated by roxygen2 (4.1.0): do not edit by hand +% Please edit documentation in R/makepredictcall.center.R \name{makepredictcall.center} \alias{makepredictcall.center} \title{Utility Function for Safe Prediction} diff --git a/man/mean.Rd b/man/mean.Rd index 316261c..3c120cd 100644 --- a/man/mean.Rd +++ b/man/mean.Rd @@ -1,4 +1,5 @@ -% Generated by roxygen2 (4.0.2): do not edit by hand +% Generated by roxygen2 (4.1.0): do not edit by hand +% Please edit documentation in R/mean.R \name{mean.factor} \alias{mean.character} \alias{mean.factor} diff --git a/man/model.Rd b/man/model.Rd index 98bcf69..d7dd962 100644 --- a/man/model.Rd +++ b/man/model.Rd @@ -1,4 +1,5 @@ -% Generated by roxygen2 (4.0.2): do not edit by hand +% Generated by roxygen2 (4.1.0): do not edit by hand +% Please edit documentation in R/model.R \name{model} \alias{model} \title{Load Model} diff --git a/man/nashSutcliffe.Rd b/man/nashSutcliffe.Rd index dbb134d..fb65a6d 100644 --- a/man/nashSutcliffe.Rd +++ b/man/nashSutcliffe.Rd @@ -1,4 +1,5 @@ -% 
Generated by roxygen2 (4.0.2): do not edit by hand
+% Generated by roxygen2 (4.1.0): do not edit by hand
+% Please edit documentation in R/nashSutcliffe.R
\name{nashSutcliffe}
\alias{nashSutcliffe}
\title{Nash Sutcliffe}
diff --git a/man/plot.loadReg.Rd b/man/plot.loadReg.Rd
index 6a6d73a..4bfe432 100644
--- a/man/plot.loadReg.Rd
+++ b/man/plot.loadReg.Rd
@@ -1,4 +1,5 @@
-% Generated by roxygen2 (4.0.2): do not edit by hand
+% Generated by roxygen2 (4.1.0): do not edit by hand
+% Please edit documentation in R/plot.loadReg.R
\name{plot.loadReg}
\alias{plot.loadReg}
\title{Diagnostic Plot}
diff --git a/man/predConc.Rd b/man/predConc.Rd
index 73daa7a..7d5f50e 100644
--- a/man/predConc.Rd
+++ b/man/predConc.Rd
@@ -1,4 +1,5 @@
-% Generated by roxygen2 (4.0.2): do not edit by hand
+% Generated by roxygen2 (4.1.0): do not edit by hand
+% Please edit documentation in R/predConc.R
\name{predConc}
\alias{predConc}
\title{Predict Concentrations}
@@ -38,7 +39,9 @@ The time frame specified by \code{by} must be either "unit" or
If \code{allow.incomplete} is \code{TRUE}, then concentrations
will be computed based on all nonmissing values, otherwise missing values
\code{NAs} will be returned. For this application, missing values
-includes \code{NAs} and incomplete days.
+includes \code{NAs} and incomplete days. For prediction by "day" when
+there is a variable number of unit values per day, \code{allow.incomplete}
+must be set to \code{TRUE}.

The term confidence interval is used here as in the original
documentation for LOADEST, but the values that are reported are
diff --git a/man/predLoad.Rd b/man/predLoad.Rd
index d09a6dd..1c46fd1 100644
--- a/man/predLoad.Rd
+++ b/man/predLoad.Rd
@@ -1,4 +1,5 @@
-% Generated by roxygen2 (4.0.2): do not edit by hand
+% Generated by roxygen2 (4.1.0): do not edit by hand
+% Please edit documentation in R/predLoad.R
\name{predLoad}
\alias{predLoad}
\title{Predict Loads}
@@ -55,7 +56,9 @@ computed based on all nonmissing values, otherwise missing values
\code{NAs} will be returned. For this application, missing values
includes \code{NAs} and gaps in the record, except for \code{by}
set to "total" or user defined groups where missing values only
-includes \code{NAs}.
+includes \code{NAs}. For prediction by "day" when there is a variable
+number of unit values per day, \code{allow.incomplete} must be
+set to \code{TRUE}.

The term confidence interval is used here as in the original
documentation for LOADEST, but the values that are reported are
diff --git a/man/print.loadReg.Rd b/man/print.loadReg.Rd
index 053f924..ce133e7 100644
--- a/man/print.loadReg.Rd
+++ b/man/print.loadReg.Rd
@@ -1,4 +1,5 @@
-% Generated by roxygen2 (4.0.2): do not edit by hand
+% Generated by roxygen2 (4.1.0): do not edit by hand
+% Please edit documentation in R/print.loadReg.R
\name{print.loadReg}
\alias{print.loadReg}
\title{Print Results}
@@ -24,10 +25,23 @@ The object \code{x} is returned invisibly.
Print the results of an load rating-curve regression.
}
\note{
-The printed output includes the call, the coefficent table, the
-estimated residual standard error, the log-likelihood of the model and
-null model with the attained p-value, and the computational method.
-NEED more details about what is printed and the difference in output due to arguments
+The printed output replicates the output described in Runkel and others (2004) and
+includes a short section summarizing the data, the load model and coefficients,
+regression statistics, and comparison of observed and estimated loads.
+If \code{load.only} is set to \code{FALSE}, then similar output is generated for the
+concentration model. If \code{brief} is \code{FALSE}, then additional descriptions
+of selected sections of the output are produced.
+
+If the estimation method is "MLE," then the estimated loads used in the comparison
+to observed loads are approximate because they are estimated using MLE, rather than
+AMLE, which is used for \code{predLoad} and \code{predConc}. The bias is very small
+when the residual variance is less than 0.5, but can be large when the residual
+variance is greater than 1.
+}
+\references{
+Runkel, R.L., Crawford, C.G., and Cohn, T.A., 2004, Load estimator (LOADEST):
+A FORTRAN program for estimating constituent loads in streams and rivers:
+U.S. Geological Survey Techniques and Methods book 4, chap. A5, 69 p.
}
\seealso{
\code{\link{loadReg}}
diff --git a/man/resampleUVdata.Rd b/man/resampleUVdata.Rd
index faa66d0..7e44fee 100644
--- a/man/resampleUVdata.Rd
+++ b/man/resampleUVdata.Rd
@@ -1,10 +1,11 @@
-% Generated by roxygen2 (4.0.2): do not edit by hand
+% Generated by roxygen2 (4.1.0): do not edit by hand
+% Please edit documentation in R/resampleUVdata.R
\name{resampleUVdata}
\alias{resampleUVdata}
\title{Resample Unit-Value Data}
\usage{
resampleUVdata(UVdata, time.step = 15, start.date = "", end.date = "",
-  max.diff = "2 hours")
+  max.diff = "2 hours", missing.days = "exclude")
}
\arguments{
\item{UVdata}{the dataset containing the unit-values data. Must have one column that
@@ -22,7 +23,12 @@ default value ("") indicates use the last day in \code{UVdata}.}
\item{max.diff}{a character string indicating the maximum difference in time to
sucessfully resample the unit-value data. The default is "2 hours" see
-\code{\link{mergeNearest}} for details.}
+\code{\link[smwrBase]{mergeNearest}} for details.}
+
+\item{missing.days}{a character string indicating what action should be taken for days
+not present in \code{UVdata}. Must be either "exclude" to remove those days from the output,
+or "include" to retain them. Can be abbreviated. If \code{missing.days} is "include,"
+then partial days within \code{max.diff} will be included in the output data frame.}
}
\value{
A data frame like \code{UVdata} but having a uniform time step.
diff --git a/man/residuals.loadReg.Rd b/man/residuals.loadReg.Rd
index adf161e..dca4ab1 100644
--- a/man/residuals.loadReg.Rd
+++ b/man/residuals.loadReg.Rd
@@ -1,4 +1,5 @@
-% Generated by roxygen2 (4.0.2): do not edit by hand
+% Generated by roxygen2 (4.1.0): do not edit by hand
+% Please edit documentation in R/residuals.loadReg.R
\name{residuals.loadReg}
\alias{residuals.loadReg}
\title{Extract Model Residuals}
diff --git a/man/rloadest-package.Rd b/man/rloadest-package.Rd
index 894ad0c..3d5f5f8 100644
--- a/man/rloadest-package.Rd
+++ b/man/rloadest-package.Rd
@@ -1,38 +1,42 @@
+% Generated by roxygen2 (4.1.0): do not edit by hand
+% Please edit documentation in R/rloadest-package.R
\docType{package}
\name{rloadest-package}
\alias{rloadest-package}
-\title{Load Estimation Functions}
+\title{LOADEST functions for R}
\description{
-  \tabular{ll}{ Package: \tab rloadest\cr Type: \tab
-  Package\cr Version: \tab 0.2\cr Date: \tab 2013-12-19\cr
-  License: \tab File LICENSE\cr LazyLoad: \tab yes\cr }
+\tabular{ll}{
+Package: \tab rloadest\cr
+Type: \tab Package\cr
+Version: \tab 0.4.2\cr
+Date: \tab 2015-07-20\cr
+License: \tab CC0\cr
+LazyLoad: \tab yes\cr
+}
}
\details{
-  This package is intended to replicate and extend the
-  LOADEST program for estimating constituent loads in
-  streams and rivers. Some subtle differences between the
-  output for LOADEST and rloadest include:
+This package is intended to replicate and extend the LOADEST program for
+estimating constituent loads in streams and rivers. Some subtle differences
+between the output for LOADEST and rloadest include:
+
+The least absolute deviation (LAD) method is not supported in rloadest.

-  LOADEST uses centered time when computing the sine and
-  cosine terms in model numbers 4, 6, 7, 8, and 9, but the
-  functions in the rloadest package use the actual decimal time so that
-  the seasonality can more easily be assessed by the user.
+LOADEST uses centered time when computing the sine and cosine terms in model
+numbers 4, 6, 7, 8, and 9, but the functions in rloadest use the actual decimal
+time so that the seasonality can more easily be assessed by the user.

-  The order of the terms in the predefined models is
-  different between LOADEST and the rloadest package functions.
+The order of the terms in the predefined models is different between LOADEST
+and the rloadest functions.

-  The printed output of the model descriptions from
-  rloadest package functions matches the format the most users of R would
-  recognize from other linear model output rather then the
-  printed output from LOADEST.
+The printed output of the model descriptions from rloadest matches the format
+that most users of R would recognize from other linear model output rather than
+the printed output from LOADEST.

-  Furhtermore, the model building capability in the
-  rloadest package functions make easier to explore other forms of
-  rating-curve models than LOADEST.
+Furthermore, the model-building capability in the rloadest functions makes it
+easier to explore other forms of rating-curve models than LOADEST does.
}
\author{
-  Dave Lorenz \email{lorenz@usgs.gov}, Laura De Cicco \email{ldecicco@usgs.gov},
-  Rob Runkel \email{runkel@usgs.gov}
+Dave Lorenz \email{lorenz@usgs.gov}
}
\keyword{estimation}
\keyword{load}
diff --git a/man/rmse.loadReg.Rd b/man/rmse.loadReg.Rd
index 4927da2..359376e 100644
--- a/man/rmse.loadReg.Rd
+++ b/man/rmse.loadReg.Rd
@@ -1,4 +1,5 @@
-% Generated by roxygen2 (4.0.2): do not edit by hand
+% Generated by roxygen2 (4.1.0): do not edit by hand
+% Please edit documentation in R/rmse.loadReg.R
\name{rmse.loadReg}
\alias{rmse.loadReg}
\title{Root-Mean-Squared Error}
diff --git a/man/seg.Rd b/man/seg.Rd
new file mode 100644
index 0000000..bfcfffe
--- /dev/null
+++ b/man/seg.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2 (4.1.0): do not edit by hand
+% Please edit documentation in R/seg.R
+\name{seg}
+\alias{seg}
+\title{Load Model}
+\usage{
+seg(x, N)
+}
+\arguments{
+\item{x}{the data to segment. Missing values are permitted and result
+in corresponding missing values in the output.}
+
+\item{N}{the number of breaks in the segmented model.}
+}
+\value{
+The data in \code{x} with attributes to build the segmented model.
+}
+\description{
+Support function for building a segmented rating-curve load model. Required
+in the formula in \code{segLoadReg} to define the segmented model.
+}
+
diff --git a/man/segLoadReg.Rd b/man/segLoadReg.Rd
new file mode 100644
index 0000000..249d1dd
--- /dev/null
+++ b/man/segLoadReg.Rd
@@ -0,0 +1,78 @@
+% Generated by roxygen2 (4.1.0): do not edit by hand
+% Please edit documentation in R/segLoadReg.R
+\name{segLoadReg}
+\alias{segLoadReg}
+\title{Load Estimation}
+\usage{
+segLoadReg(formula, data, subset, na.action, flow, dates, flow.units = "cfs",
+  conc.units = "", load.units = "kg", time.step = "day", station = "",
+  trace = TRUE)
+}
+\arguments{
+\item{formula}{a formula describing the regression model. See \bold{Details}.}
+
+\item{data}{the data to search for the variables in \code{formula}.}
+
+\item{subset}{an expression to select a subset of the data.}
+
+\item{na.action}{what to do with missing values.}
+
+\item{flow}{character string indicating the name of the
+flow column.}
+
+\item{dates}{character string indicating the name of the
+date column.}
+
+\item{flow.units}{character string describing the flow unit.}
+
+\item{conc.units}{character string describing the concentration
+unit.}
+
+\item{load.units}{character string describing the load unit.}
+
+\item{time.step}{character string describing the time step of
+the calibration data.}
+
+\item{station}{character string description of the station.}
+
+\item{trace}{if logical and \code{TRUE}, print a short summary of the
+segmented fit; if a character string, the segmented model is saved
+with that object name.}
+}
+\value{
+An object of class "loadReg."
+}
+\description{
+Build a segmented rating-curve (regression) model for river load estimation.
+}
+\details{
+The left-hand side of the formula can be any numeric variable, just
+as with \code{lm} or a variable of class "lcens," "mcens," "qw," or "Surv."
+The response variable must be a column in \code{data}; it cannot be constructed using
+\code{as.lcens}, \code{as.mcens}, or \code{Surv}. The initial segmented model is
+based on uncensored data---simple substitution of 1/2 the reporting limit is used
+for left-censored values, and multiply censored values cause the analysis to fail.
+
+The first term of the right-hand side must be defined by the \code{seg} function
+with the number of segments.
+The first term may be followed by any number of
+additional terms. The final model will place the segmented term in the last position
+and \code{seg} will be replaced by the proper call to \code{segment}.
+}
+\examples{
+# From application 1 in the vignettes
+data(app1.calib)
+app1.lr <- loadReg(Phosphorus ~ model(1), data = app1.calib,
+  flow = "FLOW", dates = "DATES", conc.units="mg/L",
+  station="Illinois River at Marseilles, Ill.")
+print(app1.lr)
+}
+\references{
+will need some.
+}
+\seealso{
+\code{\link{censReg}}, \code{\link{seg}}, \code{\link{segment}}
+}
+\keyword{censored}
+\keyword{loads}
+\keyword{regression}
+
diff --git a/man/segment.Rd b/man/segment.Rd
new file mode 100644
index 0000000..3ad197f
--- /dev/null
+++ b/man/segment.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2 (4.1.0): do not edit by hand
+% Please edit documentation in R/segment.R
+\name{segment}
+\alias{segment}
+\title{Segmented Model}
+\usage{
+segment(x, psi)
+}
+\arguments{
+\item{x}{the data to segment. Missing values are permitted and result
+in corresponding missing values in the output.}
+
+\item{psi}{a numeric vector of the breakpoints.}
+}
+\value{
+A matrix containing a column named X of the data in \code{x} and
+paired U and P columns for each of the breaks in \code{psi} that form the
+basis for that segment.
+}
+\description{
+Computes the basis for a segmented model. Used primarily in a linear
+regression or load model.
+}
+
diff --git a/man/selBestModel.Rd b/man/selBestModel.Rd
index 9698849..845f73d 100644
--- a/man/selBestModel.Rd
+++ b/man/selBestModel.Rd
@@ -1,11 +1,12 @@
-% Generated by roxygen2 (4.0.2): do not edit by hand
+% Generated by roxygen2 (4.1.0): do not edit by hand
+% Please edit documentation in R/selBestModel.R
\name{selBestModel}
\alias{selBestModel}
\title{Load Estimation}
\usage{
selBestModel(constituent, data, subset, na.action, flow, dates,
  flow.units = "cfs", conc.units = "", load.units = "kg",
-  time.step = "day", station = "")
+  time.step = "day", station = "", criterion = c("AIC", "SPCC", "AICc"))
}
\arguments{
\item{constituent}{a character string giving the name of the response
@@ -34,6 +35,9 @@ unit.}
the calibration data.}

\item{station}{character string description of the station.}
+
+\item{criterion}{the criterion to use for subset selection, must be
+one of "AIC," "SPCC," or "AICc."}
}
\value{
An object of class "loadReg."
diff --git a/man/selBestSubset.Rd b/man/selBestSubset.Rd
index 973b7e9..d53e78a 100644
--- a/man/selBestSubset.Rd
+++ b/man/selBestSubset.Rd
@@ -1,11 +1,12 @@
-% Generated by roxygen2 (4.0.2): do not edit by hand
+% Generated by roxygen2 (4.1.0): do not edit by hand
+% Please edit documentation in R/selBestSubset.R
\name{selBestSubset}
\alias{selBestSubset}
\title{Load Estimation}
\usage{
selBestSubset(formula, min.formula = ~1, data, subset, na.action, flow,
  dates, flow.units = "cfs", conc.units = "", load.units = "kg",
-  time.step = "day", station = "", criterion = c("AIC", "SPPC"))
+  time.step = "day", station = "", criterion = c("AIC", "SPCC", "AICc"))
}
\arguments{
\item{formula}{a formula describing the regression model. See \code{\link{loadReg}}
@@ -39,8 +40,8 @@ the calibration data.}

\item{station}{character string description of the station.}

-\item{criterion}{the criterion to use for subset selection, must be either "AIC"
-or "SPPC."}
+\item{criterion}{the criterion to use for subset selection, must be
+one of "AIC," "SPCC," or "AICc."}
}
\value{
An object of class "loadReg."
diff --git a/man/setXLDat.Rd b/man/setXLDat.Rd
index 4a67a54..2d22445 100644
--- a/man/setXLDat.Rd
+++ b/man/setXLDat.Rd
@@ -1,4 +1,5 @@
-% Generated by roxygen2 (4.0.2): do not edit by hand
+% Generated by roxygen2 (4.1.0): do not edit by hand
+% Please edit documentation in R/setXLDat.R
\name{setXLDat}
\alias{setXLDat}
\title{Model Matrix}
diff --git a/man/vif.loadReg.Rd b/man/vif.loadReg.Rd
index e84b970..b328095 100644
--- a/man/vif.loadReg.Rd
+++ b/man/vif.loadReg.Rd
@@ -1,4 +1,5 @@
-% Generated by roxygen2 (4.0.2): do not edit by hand
+% Generated by roxygen2 (4.1.0): do not edit by hand
+% Please edit documentation in R/vif.loadReg.R
\name{vif.loadReg}
\alias{vif.loadReg}
\title{Variance Inflation Factors}
diff --git a/vignettes/IncorporatingHysteresis.Rnw b/vignettes/IncorporatingHysteresis.Rnw
new file mode 100644
index 0000000..657d864
--- /dev/null
+++ b/vignettes/IncorporatingHysteresis.Rnw
@@ -0,0 +1,183 @@
+\documentclass{article}
+\parskip 6pt
+%\VignetteIndexEntry{Incorporating Hysteresis}
+%\VignetteDepends{rloadest}
+
+\begin{document}
+\SweaveOpts{concordance=TRUE}
+\raggedright
+
+\title{Incorporating Hysteresis in a Load Model}
+
+\author{Dave Lorenz}
+
+\maketitle
+
+This example illustrates how to set up and incorporate hysteresis into a load model. Hysteresis occurs when concentrations (and thereby loads) are different on rising and falling limbs of an event hydrograph for the same magnitude of streamflow.
+
+This example extends the analysis of Garrett (2012) of nitrate plus nitrite loads in the Boyer River at Logan, Iowa, USGS gaging station 06609500. The original time frame for the analysis was for water years 2004 through 2008. This example extends the analysis through water year 2012. Loads will not be estimated from this model---it demonstrates only the building and evaluation phases.
+
+
+<<echo=TRUE>>=
+# Load the necessary packages and the data
+library(rloadest)
+library(dataRetrieval)
+# Get the QW data
+Boyer <- "06609500"
+BoyerQW <- importNWISqw(Boyer, "00631",
+  begin.date="2003-10-01", end.date="2012-09-30")
+# Now the Daily values data
+BoyerQ <- readNWISdv(Boyer, "00060", startDate="2003-10-01",
+  endDate="2012-09-30")
+BoyerQ <- renameNWISColumns(BoyerQ)
+@
+
+\eject
+\section{Compute the Hysteresis Variables and Merge the Data}
+
+There is no direct diagnostic test to determine if hysteresis is important. The best way to assess any hysteresis is to compute the hysteresis metric and plot residuals against that metric. The first section of code will replicate that effort, beginning with the ``best'' predefined model rather than simply replicating the model used by Garrett (2012). Garrett used a 1-day lag. This example will use that and add a 3-day lag metric. The function that computes the metric is \texttt{hysteresis} in \textbf{smwrBase}.
+
+<<echo=TRUE>>=
+# Compute the hysteresis metrics.
+BoyerQ <- transform(BoyerQ,
+  dQ1 = hysteresis(log(Flow), 1),
+  dQ3 = hysteresis(log(Flow), 3))
+# Rename the date column in QW so that the data can be merged
+names(BoyerQW)[2] <- "Date"
+# Merge the data
+BoyerData <- mergeQ(BoyerQW, FLOW=c("Flow", "dQ1", "dQ3"),
+  DATES="Date", Qdata=BoyerQ, Plot=FALSE)
+# Create the initial model
+Boyer.lr <- selBestModel("NO2PlusNO3.N", BoyerData,
+  flow="Flow", dates="Date", station=Boyer)
+print(Boyer.lr)
+@
+
+The model selected was number 9, which includes second-order flow and time terms and the first-order seasonal terms.
+Reviewing the table of model evaluation criteria, model number 2 had a very small value for AIC and the second smallest for SPCC. Model number 2 would have been the equivalent starting model in Garrett (2012), including only the second-order flow terms. The printed results indicate good bias statistics, but the PPCC p-value is much less than 0.05. The next section of code illustrates plotting the residuals against the hysteresis metrics. The 1-day lag appears to fit better than the 3-day lag.
+
+<<echo=TRUE>>=
+# residuals and hysteresis
+setSweave("graph01", 6, 8)
+AA.lo <- setLayout(num.rows=2)
+setGraph(1, AA.lo)
+AA.pl <- xyPlot(BoyerData$dQ1, residuals(Boyer.lr),
+  ytitle="Residuals", xtitle="1-day lag hysteresis",
+  xaxis.range=c(-1,3))
+addSLR(AA.pl)
+setGraph(2, AA.lo)
+AA.pl <- xyPlot(BoyerData$dQ3, residuals(Boyer.lr),
+  ytitle="Residuals", xtitle="3-day lag hysteresis",
+  xaxis.range=c(-1,3))
+addSLR(AA.pl)
+dev.off()
+@
+\includegraphics{graph01.pdf}
+\paragraph{}
+
+\textbf{Figure 1.} The residuals versus hysteresis metrics.
+
+\eject
+\section{Construct and Validate the Hysteresis Model}
+
+The model retains the second-order flow and time terms and the first-order seasonal terms from model number 9 and adds the 1-day lag hysteresis metric (\texttt{dQ1}).
+
+
+<<echo=TRUE>>=
+# Construct the model
+Boyer.lr <- loadReg(NO2PlusNO3.N ~ quadratic(log(Flow)) +
+  quadratic(dectime(Date)) + fourier(Date) + dQ1, data=BoyerData,
+  flow="Flow", dates="Date", station=Boyer)
+print(Boyer.lr)
+@
+
+The printed output shows an improved model, but the PPCC test still indicates lack of normality. A review of the linearity of the explanatory variables indicates the need for second-order seasonal terms (figure 2). The sine term is also nonlinear, but not shown. The second-order time term will be dropped because it is not significant at the 0.05 level.
+
+<<echo=TRUE>>=
+# Plot the partial residual, choose "fourier(Date)cos(k=1)"
+setSweave("graph02", 6, 6)
+plot(Boyer.lr, which="fourier(Date)cos(k=1)", set.up=FALSE)
+dev.off()
+@
+\includegraphics{graph02.pdf}
+\paragraph{}
+
+\textbf{Figure 2.} The diagnostic plot for the linearity of a seasonal term.
+
+Construct the revised model that drops the second-order time term and adds the second-order seasonal terms. The diagnostic plots follow.
+
+<<echo=TRUE>>=
+# Construct the revised model
+Boyer.lr <- loadReg(NO2PlusNO3.N ~ quadratic(log(Flow)) +
+  dectime(Date) + fourier(Date, 2) + dQ1, data=BoyerData,
+  flow="Flow", dates="Date", station=Boyer)
+print(Boyer.lr)
+@
+
+A complete review of the partial residual graphs is not included in this example. Only the partial residual for \texttt{fourier(Date, 2)cos(k=1)} is shown, to demonstrate that the linearity has improved. No partial residual plot indicates a serious lack of linearity.
+
+<<echo=TRUE>>=
+# Plot the partial residual for fourier(Date, 2)cos(k=1).
+setSweave("graph03", 6, 6)
+plot(Boyer.lr, which = "fourier(Date, 2)cos(k=1)", set.up=FALSE)
+dev.off()
+@
+\includegraphics{graph03.pdf}
+\paragraph{}
+
+\textbf{Figure 3.} The partial residual for \texttt{fourier(Date, 2)cos(k=1)} graph.
+
+The correlogram indicates a much better seasonal fit and no long-term lack of fit.
+
+<<echo=TRUE>>=
+# Plot the correlogram, plot number 4.
+setSweave("graph04", 6, 6)
+plot(Boyer.lr, which = 4, set.up=FALSE)
+dev.off()
+@
+\includegraphics{graph04.pdf}
+\paragraph{}
+
+\textbf{Figure 4.} The correlogram for the revised model.
+
+For this model, the S-L plot is shown. It shows a slight decrease in heteroscedasticity as the fitted values increase, but the decrease is small and likely driven by the 3 largest fitted values; otherwise, there is very little visual change in scatter as the fitted values increase.
+
+<<echo=TRUE>>=
+# Plot the S-L graph.
+setSweave("graph05", 6, 6)
+plot(Boyer.lr, which = 3, set.up=FALSE)
+dev.off()
+@
+\includegraphics{graph05.pdf}
+\paragraph{}
+
+\textbf{Figure 5.} The S-L graph for the revised model.
+
+The Q-normal graph shows a non-normal distribution, but without distinct skewness or kurtosis.
+
+<<>>=
+# Plot the residual Q-normal graph.
+setSweave("graph06", 6, 6)
+plot(Boyer.lr, which = 5, set.up=FALSE)
+dev.off()
+@
+\includegraphics{graph06.pdf}
+\paragraph{}
+
+\textbf{Figure 6.} The residual Q-normal graph.
+
+The non-normality of the residuals poses at least three potential problems: (1) lack of confidence in the confidence limits for regression parameter estimates, (2) bias in the back-transformation corrections for the mean load, and (3) lack of confidence in the confidence intervals of the load estimate. For the first potential problem, the confidence intervals of the parameter estimates and their significance are not critical to load estimation, and the non-normality is not so large that this is a major concern (Greene, 2012). Two factors must be considered to address the second potential problem---the magnitude of the bias correction and the measured bias. The bias-correction factor (BCF) actually used is the AMLE BCF, which cannot be computed directly but can be estimated by using the MLE BCF. The MLE BCF for this model is \texttt{exp(0.06958/2) = 1.0354}. For non-normal data, Duan's smoothing BCF is often used (Helsel and Hirsch, 2002); the value for that BCF is \texttt{mean(exp(residuals(Boyer.lr))) = 1.0302}. The BCFs are very similar, which suggests that the bias from the back transform could be small. That is confirmed by the very small value for the percent bias in the printed report (0.2405). There is no way to address the third potential problem other than by using bootstrapping methods; any reported confidence intervals on loads or fluxes should be treated as approximate.
+
+\begin{thebibliography}{9}
+
+\bibitem{JG}
+Garrett, J.D., 2012, Concentrations, loads, and yields of select constituents from major tributaries of the Mississippi and Missouri Rivers in Iowa, water years 2004--2008: U.S. Geological Survey Scientific Investigations Report 2012--5240, 61 p.
+
+\bibitem{WG}
+Greene, W.H., 2012, Econometric analysis, seventh edition: Upper Saddle River, New Jersey, Prentice Hall, 1198 p.
+
+\bibitem{HH}
+Helsel, D.R., and Hirsch, R.M., 2002, Statistical methods in water resources: U.S. Geological Survey Techniques of Water-Resources Investigations, book 4, chap. A3, 522 p.
+
+\end{thebibliography}
+
+\end{document}
diff --git a/vignettes/InstantaneousTimeStep.Rnw b/vignettes/InstantaneousTimeStep.Rnw
index a156c7a..370f1cf 100644
--- a/vignettes/InstantaneousTimeStep.Rnw
+++ b/vignettes/InstantaneousTimeStep.Rnw
@@ -62,7 +62,7 @@ This example will include the other surrogates and flow and seasonal terms in th
 BadChloride.lr <- selBestSubset(Chloride ~ log(Flow) + fourier(dateTime) +
   log(SpecCond) + log(DO) + log(Turb), data=BadData,
   flow="Flow", dates="dateTime", time.step="instantaneous",
-  station="Bad River near Odanah", criterion="SPPC")
+  station="Bad River near Odanah", criterion="SPCC")
 print(BadChloride.lr)
 @
diff --git a/vignettes/UsingEGRETData.Rnw b/vignettes/UsingEGRETData.Rnw
new file mode 100644
index 0000000..dd5c1d7
--- /dev/null
+++ b/vignettes/UsingEGRETData.Rnw
@@ -0,0 +1,163 @@
+\documentclass{article}
+\parskip 6pt
+%\VignetteIndexEntry{Using EGRET Data}
+%\VignetteDepends{rloadest}
+%\VignetteDepends{EGRET}
+%\VignetteDepends{survival}
+
+\begin{document}
+\SweaveOpts{concordance=TRUE}
+\raggedright
+
+\title{Using EGRET Data in an rloadest Model}
+
+\author{Dave Lorenz}
+
+\maketitle
+
+This example illustrates how to set up and use data retrieved and processed for EGRET (Hirsch and De Cicco, 2015) in an rloadest load model. EGRET includes the statistical algorithm Weighted Regressions on Time, Discharge, and Season (WRTDS), which can compute loads and concentrations. WRTDS uses locally weighted regression on linear time, linear flow (discharge), and the first-order sine and cosine terms to model constituent concentrations and fluxes over time and through the range of flow.
+
+This example uses the processed data supplied in the EGRET package, but any data retrieved and processed by the \textit{readNWISDaily}, \textit{readNWISSample}, \textit{readNWISInfo}, and \textit{mergeReport} functions in EGRET can be used. The supplied data are nitrate plus nitrite data collected in the Choptank River near Greensboro, Maryland (USGS site identifier 01491000).
+
+<<>>=
+# Load the necessary packages and the data
+library(survival) # required for Surv
+library(rloadest)
+library(EGRET)
+# Get the QW and daily flow data
+Chop.QW <- Choptank_eList$Sample
+Chop.Q <- Choptank_eList$Daily
+@
+
+\eject
+\section{Compute the Initial rloadest Model}
+
+The 7-parameter model (model number 9) is a typical model for relatively long-term records, longer than about 7 years, and can be a good starting point for model building. The water-quality data in the Sample dataset for EGRET are stored in four columns---the minimum value, maximum value, an indicator of censoring, and the average value. That format can be converted to a valid response variable for \textit{loadReg} using either \textit{as.mcens} or \textit{Surv}; \textit{Surv} is preferred because if the data are uncensored or left-censored, then the ``AMLE'' method is used rather than the ``MLE'' method, which is always used with a response variable of class ``mcens'' (a sketch of the \textit{as.mcens} alternative follows the next code chunk).
+
+<<>>=
+# Compute the 7-parameter model.
+Chop.lr <- loadReg(Surv(ConcLow, ConcHigh, type="interval2") ~ model(9),
+  data=Chop.QW, flow="Q", dates="Date", conc.units="mg/L",
+  flow.units="cms", station="Choptank")
+@
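+As noted above, \textit{as.mcens} could also be used to construct the response variable. A minimal sketch, assuming the \texttt{as.mcens(lower, upper)} form from \textbf{smwrQW} (not run):
+
+<<eval=FALSE>>=
+# Sketch (assumption): an equivalent response of class "mcens";
+# note that this forces the MLE method even for left-censored data.
+Chop.QW$Conc <- with(Chop.QW, as.mcens(ConcLow, ConcHigh))
+Chop.mle <- loadReg(Conc ~ model(9), data=Chop.QW,
+  flow="Q", dates="Date", conc.units="mg/L",
+  flow.units="cms", station="Choptank")
+@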
+One of the first steps in assessing the fit is to look at the diagnostic plots for the linearity of the overall fit and each explanatory variable. The overall fit (figure 1) looks linear, but there are three low outliers and a tendency toward larger scatter at larger predicted values.
+
+<<>>=
+# Plot the overall fit
+setSweave("graph01", 6, 6)
+plot(Chop.lr, which=1, set.up=FALSE)
+dev.off()
+@
+\includegraphics{graph01.pdf}
+\paragraph{}
+
+\textbf{Figure 1.} The overall fit.
+
+The linearity of the explanatory variables is shown in figure 2. The partial-residual plots for flow (lnQ and lnQ2) show nonlinearity in the second-order term (lnQ2). The partial-residual plots for time (DECTIME and DECTIME2) show no nonlinearity, but the second-order term (DECTIME2) shows no trend and can therefore be removed from the model. The partial-residual plots for seasonality (sin.DECTIME and cos.DECTIME) show nonlinearity in both terms, suggesting the need for higher-order seasonal terms.
+
+<<>>=
+# Plot the explanatory variable fits
+setSweave("graph02", 6, 9)
+AA.lo <- setLayout(num.rows=3, num.cols=2)
+# Flow and flow squared
+setGraph(1, AA.lo)
+plot(Chop.lr, which="lnQ", set.up=FALSE)
+setGraph(2, AA.lo)
+plot(Chop.lr, which="lnQ2", set.up=FALSE)
+# Time and time squared
+setGraph(3, AA.lo)
+plot(Chop.lr, which="DECTIME", set.up=FALSE)
+setGraph(4, AA.lo)
+plot(Chop.lr, which="DECTIME2", set.up=FALSE)
+# Seasonality
+setGraph(5, AA.lo)
+plot(Chop.lr, which="sin.DECTIME", set.up=FALSE)
+setGraph(6, AA.lo)
+plot(Chop.lr, which="cos.DECTIME", set.up=FALSE)
+dev.off()
+@
+\includegraphics{graph02.pdf}
+\paragraph{}
+
+\textbf{Figure 2.} The linearity of the explanatory variables.
+
+Figure 3 shows the relation between concentration and flow. The relation is not quadratic, but it appears that there is a distinct change at about 10 cubic meters per second. That relation can be modeled using piecewise-linear, or segmented, terms.
+
+<<>>=
+# Plot concentration against flow
+setSweave("graph03", 6, 6)
+# Use the average concentration (only one censored value)
+with(Chop.QW, xyPlot(Q, ConcAve, yaxis.log=TRUE, xaxis.log=TRUE))
+dev.off()
+@
+\includegraphics{graph03.pdf}
+\paragraph{}
+
+\textbf{Figure 3.} The relation between concentration and flow.
+
+\eject
+\section{Construct the Modified rloadest Model}
+
+The \textit{segLoadReg} function can be used to build a piecewise-linear model. It relies on the segmented package, which cannot model censored data, to identify the breakpoints. For the first step, censored values will be approximated by simple substitution; for the final model, the censored values are restored. One other quirk of \textit{segLoadReg} is that the response term must be a variable; it cannot be constructed using \textit{Surv} or any other function. Therefore, the breakpoint for this model will be identified using \texttt{ConcAve}, but the final model will be built using the censoring information.
+
+<<>>=
+# Compute the breakpoint--the seg term must be the first term on
+# the right-hand side.
+Chop.lr <- segLoadReg(ConcAve ~ seg(LogQ, 1) + DecYear +
+    fourier(DecYear, 2),
+  data=Chop.QW, flow="Q", dates="Date", conc.units="mg/L",
+  flow.units="cms", station="Choptank")
+# From the printed output, the breakpoint is 1.994 in natural log units,
+# about 7.4 cms
+# Compute and print the final model
+Chop.lr <- loadReg(Surv(ConcLow, ConcHigh, type="interval2") ~
+    segment(LogQ, 1.994) + DecYear + fourier(DecYear, 2),
+  data=Chop.QW, flow="Q", dates="Date", conc.units="mg/L",
+  flow.units="cms", station="Choptank")
+print(Chop.lr)
+@
+
+This segmented model has three variables, with names ending in X, U1, and P1; one quick way to inspect them is sketched below.
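+A minimal sketch of that inspection, assuming \texttt{segment} returns a matrix whose columns carry those suffixes (not run):
+
+<<eval=FALSE>>=
+# Sketch (assumption): evaluate the segmented basis directly to see
+# the three columns built from log flow and the breakpoint.
+head(segment(Chop.QW$LogQ, 1.994))
+@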
+The coefficient for the variable ending in X is the slope below the breakpoint, the coefficient for the variable ending in U1 is the change in slope above the breakpoint, and the coefficient for the variable ending in P1 should always be close to 0.
+
+No partial-residual plot indicates any nonlinearity. Figure 4 shows 5 selected partial-residual plots.
+
+<<>>=
+# Plot the explanatory variable fits
+setSweave("graph04", 6, 9)
+AA.lo <- setLayout(num.rows=3, num.cols=2)
+# Segmented flow
+setGraph(1, AA.lo)
+plot(Chop.lr, which="segment(LogQ, 1.994)X", set.up=FALSE)
+setGraph(2, AA.lo)
+plot(Chop.lr, which="segment(LogQ, 1.994)U1", set.up=FALSE)
+# Time
+setGraph(3, AA.lo)
+plot(Chop.lr, which="DecYear", set.up=FALSE)
+# Seasonality
+setGraph(5, AA.lo)
+plot(Chop.lr, which="fourier(DecYear, 2)sin(k=2)", set.up=FALSE)
+setGraph(6, AA.lo)
+plot(Chop.lr, which="fourier(DecYear, 2)cos(k=2)", set.up=FALSE)
+dev.off()
+@
+\includegraphics{graph04.pdf}
+\paragraph{}
+
+\textbf{Figure 4.} Selected partial-residual plots.
+
+\eject
+\section{Further Considerations}
+
+To be continued.
+
+\begin{thebibliography}{9}
+
+\bibitem{HD}
+Hirsch, R.M., and De Cicco, L.A., 2015, User guide to Exploration and Graphics for RivEr Trends (EGRET) and dataRetrieval: R packages for hydrologic data (version 2.0, February 2015): U.S. Geological Survey Techniques and Methods, book 4, chap. A10, 93 p. Available at http://dx.doi.org/10.3133/tm4A10.
+
+\end{thebibliography}
+
+\end{document}
diff --git a/vignettes/app4.Rnw b/vignettes/app4.Rnw
index 30fb5a2..4a62b51 100644
--- a/vignettes/app4.Rnw
+++ b/vignettes/app4.Rnw
@@ -27,7 +27,7 @@ data(app4.calib)
 head(app4.calib)
 @
-The censored data in \texttt{app4.calib} are stored in 2 columns---the recorded value in the column identified by the constituent abbreviation and the remark code, "<" indicating a less-than value in the corresponding column with the .rmk suffix. In order to take advantage of the of \texttt{loadReg} function to automatically recognize censored data, the censored data in the example dataset should be converted to type "water-quality." This can be done using the \texttt{convert2qw} function in the \texttt{USGSwsQW} package, which is required by \texttt{rloadest}. The naming convention in \texttt{app.calib} is consistent with the "partial" scheme for \texttt{convert2qw}. The conversion is accomplished in the R code below. The \texttt{app4.calib} is overwritten with the new data.
+The censored data in \texttt{app4.calib} are stored in 2 columns---the recorded value in the column identified by the constituent abbreviation and the remark code, "<" indicating a less-than value in the corresponding column with the .rmk suffix. In order to take advantage of the ability of the \texttt{loadReg} function to automatically recognize censored data, the censored data in the example dataset should be converted to type "water-quality." This can be done using the \texttt{convert2qw} function in the \texttt{smwrQW} package, which is required by \texttt{rloadest}. The naming convention in \texttt{app.calib} is consistent with the "partial" scheme for \texttt{convert2qw}. The conversion is accomplished in the R code below. The \texttt{app4.calib} dataset is overwritten with the new data.
 
 <<>>=
 # Convert Buty and Alach to class "qw"
@@ -127,14 +127,14 @@ All of the diagnostic plots in the previous section indicated a cause for concern about the validity of the periodic regression model.
 Vecchia and others (2008) describe a seasonal-wave function that often works well for pesticide models.
-The USGSwsStats package contains the tools necessary to construct a seasonal-wave model. Building a good regression model is a multi-step process, required identifying the timing of the peak concentration and the other parameters of the seasonal-wave model.
+The smwrStats package contains the tools necessary to construct a seasonal-wave model. Building a good regression model is a multi-step process that requires identifying the timing of the peak concentration and the other parameters of the seasonal-wave model.
 
 The first step in constructing the seasonal-wave model is to identify the peak and potential values for the other parameters of the model. That involves building a regression model without any seasonal terms, and using the \texttt{seasonalPeak} function on the residuals to construct a first guess at those parameters. In this case, because there are censored values, we must use \texttt{censReg}. Note that it does not matter whether we use load or concentration because the residuals are the same.
 
 <<>>=
 # Create the limited regression model.
-app4.cr <- censReg(log(Buty) ~ center(log(FLOW)) + dectime(DATES),
-  data = app4.calib)
+app4.cr <- censReg(Buty ~ center(log(FLOW)) + dectime(DATES),
+  data = app4.calib, dist="lognormal")
 app4.sp <- seasonalPeak(dectime(app4.calib$DATES), residuals(app4.cr))
 app4.sp
 @
@@ -163,9 +163,10 @@ The \texttt{selBestWave} function can be used to select the "best" parameters fo
 # Add Dectime.
 app4.calib <- transform(app4.calib, Dectime=dectime(DATES))
 # Find the best model
-selBestWave(log(Buty) ~ center(log(FLOW)) + dectime(DATES),
-  data = app4.calib,
-  "Dectime", app4.sp, exhaustive=TRUE, Regression=censReg)
+selBestWave(Buty ~ center(log(FLOW)) + dectime(DATES),
+  data = app4.calib,
+  "Dectime", app4.sp, exhaustive=TRUE, Regression=censReg,
+  dist="lognormal")
 @
 
The "best" model has the timing of the peak at about 0.393, a pesticide loading period of 1 month, and a decay rate indicated by a half-life of 1 month (the fastest decay rate among the default choices). We are now ready to build and evaluate the seasonal-wave load model.
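A minimal sketch of that next step, assuming the \texttt{seasonalWave(x, cmax, loading, hlife)} term from \textbf{smwrBase} and the parameters identified above (not run):

<<eval=FALSE>>=
# Sketch (assumption): carry the selected seasonal-wave parameters
# (peak near 0.393, 1-month loading period, 1-month half-life) into
# a load model.
app4.lr <- loadReg(Buty ~ center(log(FLOW)) +
    seasonalWave(Dectime, 0.393, 1, 1),
  data = app4.calib, flow = "FLOW", dates = "DATES")
@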