diff --git a/R/predConc.R b/R/predConc.R index c6cabaa..af9fe4e 100644 --- a/R/predConc.R +++ b/R/predConc.R @@ -9,7 +9,9 @@ #' If \code{allow.incomplete} is \code{TRUE}, then concentrations will be #'computed based on all nonmissing values, otherwise missing values #'\code{NAs} will be returned. For this application, missing values -#'includes \code{NAs} and incomplete days. +#'includes \code{NAs} and incomplete days. For prediction by "day" when +#'there are variable number of unit values per day, \code{allow.incomplete} +#'must be set to \code{TRUE}. #' #' The term confidence interval is used here as in the original #'documentation for LOADEST, but the values that are reported are @@ -211,6 +213,9 @@ predConc <- function(fit, newdata, by="day", Kdy <- as.integer(KDate) KDate <- unique(KDate) Kdy <- Kdy - Kdy[1L] + 1L # make relative to first day (Index) + if(length(unique(table(Kdy))) > 1L && !allow.incomplete) { + warning("Variable observations per day, either set the allow.incomplete argument to TRUE or use the resampleUVdata function to construct a uniform series") + } KinAll <- unique(Kdy) ## Make it daily flow, Flow0 indicates a partial 0 flow Flow0 <- tapply(Flow, Kdy, min) diff --git a/R/predLoad.R b/R/predLoad.R index 521e2ae..a933f0a 100644 --- a/R/predLoad.R +++ b/R/predLoad.R @@ -14,7 +14,9 @@ #'\code{NAs} will be returned. For this application, missing values #'includes \code{NAs} and gaps in the record, except for \code{by} #'set to "total" or user defined groups where missing values only -#'includes \code{NAs}. +#'includes \code{NAs}. For prediction by "day" when there are variable +#'number of unit values per day, \code{allow.incomplete} must be +#'set to \code{TRUE}. #' #' The term confidence interval is used here as in the original #'documentation for LOADEST, but the values that are reported are @@ -52,7 +54,6 @@ #' @useDynLib rloadest estlday #' @useDynLib rloadest estltot #' @export - predLoad <- function(fit, newdata, load.units=fit$load.units, by="total", seopt="exact", allow.incomplete=FALSE, conf.int=0.95, print=FALSE) { @@ -239,6 +240,9 @@ predLoad <- function(fit, newdata, load.units=fit$load.units, by="total", Kdy <- as.integer(KDate) KDate <- unique(KDate) Kdy <- Kdy - Kdy[1L] + 1L # make relative to first day (Index) + if(length(unique(table(Kdy))) > 1L && !allow.incomplete) { + warning("Variable observations per day, either set the allow.incomplete argument to TRUE or use the resampleUVdata function to construct a uniform series") + } KinAll <- unique(Kdy) ## Make it daily flow, Flow0 indicates a partial 0 flow Flow0 <- tapply(Flow, Kdy, min) @@ -470,14 +474,19 @@ predLoad <- function(fit, newdata, load.units=fit$load.units, by="total", cat("Streamflow Summary Statistics\n", "-------------------------------------------\n\n", sep="") Qsum <- rbind(Cal.=fit$Sum.flow, Est.=summary(Flow)) - if(diff(range(Qsum[, ncol(Qsum)])) > 0) + if(Qsum[2L, ncol(Qsum)] > Qsum[1L, ncol(Qsum)]) { cat("WARNING: The maximum estimation data set steamflow exceeds the maximum\n", "calibration data set streamflow. Load estimates require extrapolation.\n\n", sep="") - else + } else if(Qsum[2L, 1L] < Qsum[1L, 1L]) { + cat("WARNING: The minimum estimation data set steamflow exceeds the minimum\n", + "calibration data set streamflow. Load estimates require extrapolation.\n\n", + sep="") + } else { cat("The maximum estimation data set streamflow does not exceed the maximum\n", "calibration data set streamflow. No extrapolation is required.\n\n", sep="") + } if(!(by %in% c("day", "unit"))) { cat("\n-------------------------------------------------------------\n", "Constituent Output File Part IIb: Estimation (Load Estimates)\n", diff --git a/R/print.loadReg.R b/R/print.loadReg.R index 7ec3f21..09515b9 100644 --- a/R/print.loadReg.R +++ b/R/print.loadReg.R @@ -116,7 +116,7 @@ print.loadReg <- function(x, digits=4, brief=TRUE, load.only=brief, ...) { else pval <- format(round(pval, 4), scientific=5) ## Compute the PPCC - Res <- residuals(x$lfit, type="working") + Res <- residuals(x$lfit, type="working", suppress.na.action=TRUE) ppcc <- censPPCC.test(as.lcens(Res, censor.codes=x$lfit$CENSFLAG)) cat("\n", x$method, " Regression Statistics\n", "Residual variance: ", diff --git a/R/rloadest-package.R b/R/rloadest-package.R index 090cf77..3ecfc89 100644 --- a/R/rloadest-package.R +++ b/R/rloadest-package.R @@ -27,17 +27,11 @@ #'Furhtermore, the model building capability in the rloadest functions make easier #'to explore other forms of rating-curve models than LOADEST. #' -#' @name LOADEST-package +#' @name rloadest-package #' @docType package #' @author Dave Lorenz \email{lorenz@@usgs.gov} #' @keywords load estimation NULL - -#' Example Atrazine data included in LOADEST package -#' -#' Example data representing atrazine -#' -#' @name Atrazine -#' @docType data -#' @keywords water quality data -NULL +.onAttach <- function(libname, pkgname) { + packageStartupMessage("Although this software program has been used by the U.S. Geological Survey (USGS), no warranty, expressed or implied, is made by the USGS or the U.S. Government as to the accuracy and functioning of the program and related program material nor shall the fact of distribution constitute any such warranty, and no responsibility is assumed by the USGS in connection therewith.") +} diff --git a/demo/00Index b/demo/00Index index 08b1fbe..478293b 100644 --- a/demo/00Index +++ b/demo/00Index @@ -1 +1,2 @@ BatchLoad A script for batch data retrieval and load estimation. +ChangeLoadGraph Create a surface plot showing the change in loading between two years. diff --git a/demo/ChangeLoadGraph.r b/demo/ChangeLoadGraph.r new file mode 100644 index 0000000..ffce5ed --- /dev/null +++ b/demo/ChangeLoadGraph.r @@ -0,0 +1,56 @@ +# Use app1 (model7) from the vignette +library(rloadest) +data(app1.calib) +# construct the model, see the vignette for app1 for details +app1.lr7 <- loadReg(Phosphorus ~ model(7), data = app1.calib, flow = "FLOW", + dates = "DATES", conc.units="mg/L", + station="Illinois River at Marseilles, Ill.") +# construct the grids +FLOW <- seq(3000, 60000, length.out=21) +DATES75 <- seq(as.Date("1975-01-01"), as.Date("1976-01-01"), length.out=24) +Tmp75 <- expand.grid(DATES=DATES75, FLOW=FLOW) +Tmp75$C <- predConc(app1.lr7, Tmp75)$Conc +Tmp75$L <- predLoad(app1.lr7, Tmp75, by="day")$Flux +Z.conc75 <- matrix(Tmp75$C, nrow=24) +Z.load75 <- matrix(Tmp75$L, nrow=24) + +DATES84 <- seq(as.Date("1984-01-01"), as.Date("1985-01-01"), length.out=24) +Tmp84 <- expand.grid(DATES=DATES84, FLOW=FLOW) +Tmp84$C <- predConc(app1.lr7, Tmp84)$Conc +Tmp84$L <- predLoad(app1.lr7, Tmp84, by="day")$Flux +Z.conc84 <- matrix(Tmp84$C, nrow=24) +Z.load84 <- matrix(Tmp84$L, nrow=24) + +# The range in concentration should be from .24 to .92 +AA.lev <- seq(.24, .92, by=.02) +# the maximum load is 84,000+, so set range to 100,000 +# Construct the 1975 load/concentration surface +# The shape of the surface indicates the potential loading given +# The date and flow. The color of the surface indicates the +# potential concentration given the date and flow. +preSurface(DATES75, FLOW, Z.load75, zaxis.range = c(0, 100000), + batch="I") -> AA75.pre +# I selected +# Construct the 1984 load/concentration surface +preSurface(DATES84, FLOW, Z.load84, zaxis.range = c(0, 100000), + batch="I") -> AA84.pre +# I selected +# Proceed: +#set the page to landscape +setPDF("land", basename="Change_Load") +setLayout(width=c(4.5,4.5), height=5.5, explanation=list(right=1.2)) -> AA.lo +setGraph(1, AA.lo) +surfacePlot(pre=AA75.pre, z.color=Z.conc75, + Surface=list(name="Concentration", levels=AA.lev), + xtitle="1975", ytitle="Streamflow", ztitle="Load") -> AA75.pl +addCaption("Change in Phosphorus Loading in the Illinois River at Marseilles, Ill. from 1975 to 1984") +# Construct the 1984 load/concentration surface +setGraph(2, AA.lo) +surfacePlot(pre=AA84.pre, z.color=Z.conc84, + Surface=list(name="Concentration", levels=AA.lev), + xtitle="1984", ytitle="Streamflow", ztitle="Load") -> AA84.pl +# The +setGraph("explanation", AA.lo) +addExplanation(AA84.pl) +dev.off() + diff --git a/inst/doc/InstantaneousTimeStep.pdf b/inst/doc/InstantaneousTimeStep.pdf new file mode 100644 index 0000000..aac8449 Binary files /dev/null and b/inst/doc/InstantaneousTimeStep.pdf differ diff --git a/vignettes/InstantaneousTimeStep.Rnw b/vignettes/InstantaneousTimeStep.Rnw new file mode 100644 index 0000000..a156c7a --- /dev/null +++ b/vignettes/InstantaneousTimeStep.Rnw @@ -0,0 +1,245 @@ +\documentclass{article} +\parskip 6pt +%\VignetteIndexEntry{Instantaneous Time-Step Model} +%\VignetteDepends{rloadest} + +\begin{document} +\SweaveOpts{concordance=TRUE} +\raggedright + +\title{Instantaneous Time-Step Model} + +\author{Dave Lorenz} + +\maketitle + +This example illustrates how to set up and use a instantaneous time-step model. These models are typically used when there is additional explanatory variable information such as surrogate unit values, like specific conductance. The intent is often to model both the concentration or flux at any time and the load over a period of time. + +This example uses data from the Bad River near Odanah, Wisc., USGS gaging station 04027000. The example will build a model of chloride. + + +<>= +# Load the necessary packages and the data +library(rloadest) +library(dataRetrieval) +# What unit values are available? +subset(whatNWISdata("04027000", "uv"), + select=c("parm_cd", "srsname", "begin_date", "end_date")) +# Get the QW data +BadQW <- importNWISqw("04027000", "00940", + begin.date="2011-04-01", end.date="2014-09-30") +# Merge data and time and set timezone (2 steps) +BadQW <- transform(BadQW, dateTime=sample_dt + as.timeDay(sample_tm)) +BadQW <- transform(BadQW, dateTime=setTZ(dateTime, tzone_cd)) +# Now the Unit values data +BadUV <- readNWISuv("04027000", c("00060", "00095", "00300", "63680"), + startDate="2011-04-01", endDate="2014-09-30", tz="America/Chicago") +BadUV <- renameNWISColumns(BadUV) +names(BadUV) +# Strip _Inst off column names +names(BadUV) <- sub("_Inst", "", names(BadUV)) +# Merge the data +BadData <- mergeNearest(BadQW, "dateTime", right=BadUV, dates.right="dateTime", + max.diff="4 hours") +# Rename the left-hand dateTime column +names(BadData)[9] <- "dateTime" +@ + +\eject +\section{Build the Instantaneous Time-Step Model} + +The first step in building the model is to determine which of the surrogates are most appropriate to include in the model. There can be many factors that contribute to deciding which explanatory variables to include in the model. From previous experience the user may decide to include or exclude specific surrogates and flow or seasonal terms. For this example, temperature (parameter code 00010) and pH (parameter code 000400) were excluded as they typically have very little influence on nitrate or nitrate concentration. Other factors include the availability of surrogate values. The output in the code below indicates that NTU\_Turb has few observations (more missing values) that Turb, and will not be included in the candidate explanatory variables. + +<>= +# Print the number of missing values in each column +sapply(BadData, function(col) sum(is.na(col))) +@ + +This example will include the other surrogates and flow and seasonal terms in the candidate model. The code below demonstrates the use of \texttt{selBestSubset} to select the initial candidate model. + +<>= +# Create the and print the candidate model. +BadChloride.lr <- selBestSubset(Chloride ~ log(Flow) + fourier(dateTime) + + log(SpecCond) + log(DO) + log(Turb), data=BadData, + flow="Flow", dates="dateTime", time.step="instantaneous", + station="Bad River near Odanah", criterion="SPPC") + +print(BadChloride.lr) +@ + +Only log(DO) was dropped from the model. The printed report indicates some potential problems with the regression---the PPCC test indicates the residuals are not normally distributed and several variance inflation factors are relatively large, greater than 10. But the bias diagnostics show very little bias in the comparison of the estimated to observed values. + +A few selected graphs will help understand the issues identified in the printed report and suggest an alternative model. Figure 1 shows the residuals versus fitted graph, which indicates some very large residuals at larger fitted values. It also suggests some heteroscedasticity in the residual pattern. + +<>= +# Plot the overall fit, choose plot number 2. +setSweave("graph01", 6, 6) +plot(BadChloride.lr, which = 2, set.up=FALSE) +dev.off() +@ +\includegraphics{graph01.pdf} +\paragraph{} + +\textbf{Figure 1.} The residuals versus fitted graph. + +The S-L plot is not shown. The residual Q-normal graph indicates the reason for the very low p-value indicated by the PPCC test---the large residual values indicated in figure 1 skew the distribution. + +<>= +# Plot the residual Q-normal graph. +setSweave("graph02", 6, 6) +plot(BadChloride.lr, which = 5, set.up=FALSE) +dev.off() +@ +\includegraphics{graph02.pdf} +\paragraph{} + +\textbf{Figure 2.} The residual Q-normal graph. + +A complete review of the partial residual graphs is not included in this example. Only the partial residual for \texttt{log(Turb)} is shown. The graph indicates the lack of fit, especially for the largest values of Turbidity. This suggests that the log transform is not appropriate. + +<>= +# Plot the residual Q-normal graph. +setSweave("graph03", 6, 6) +plot(BadChloride.lr, which = "log(Turb)", set.up=FALSE) +dev.off() +@ +\includegraphics{graph03.pdf} +\paragraph{} + +\textbf{Figure 3.} The partial residual for \texttt{log(Turb)} graph. + +Build the model excluding \texttt{log(DO)} that was dropped in the subset selection procedure and changing \texttt{log(Turb)} to \texttt{Turb}. + +<>= +# Create the and print the revised model. +BadChloride.lr <- loadReg(Chloride ~ log(Flow) + fourier(dateTime) + + log(SpecCond) + Turb, data=BadData, + flow="Flow", dates="dateTime", time.step="instantaneous", + station="Bad River near Odanah") + +print(BadChloride.lr, load.only=FALSE) +@ + +The report for the revised model indicates less severe problems than from the first candidate model---the p-value for the PPCC test is greater than 0.05, the variance inflation inflation factors are lower although \texttt{log(Flow)} is still greater than 10, and the bias diagnostics from the observed and estimated loads and concentrations are still good. + +A review of selected diagnostic plots indicates a much better overall fit. Figure 4 shows the residuals versus fitted graph, which indicates a less severe problem of large residuals at larger fitted values. It also suggests some heteroscedasticity in the residual pattern as with the first candidate model. + + +<>= +# Plot the overall fit, choose plot number 2. +setSweave("graph04", 6, 6) +plot(BadChloride.lr, which = 2, set.up=FALSE) +dev.off() +@ +\includegraphics{graph04.pdf} +\paragraph{} + +\textbf{Figure 4.} The residuals versus fitted graph for the revised model. + +For this model, the S-L plot is shown. It shows an increase in heteroscedasticity as the fitted values increase. That heteroscedasticity can introduce bias into the estimated values as the bias correction factor will be a bit too small for the larger values and too large for the smaller values. The potential bias for this model is expected to be small because the residual variance is small, 0.03476 natural log units, therefore the bias correction is very small, less than 2 percent, and the potential change to the bias correction very small, much less than 1/2 percent. + +<>= +# Plot the S-L grpah. +setSweave("graph05", 6, 6) +plot(BadChloride.lr, which = 3, set.up=FALSE) +dev.off() +@ +\includegraphics{graph05.pdf} +\paragraph{} + +\textbf{Figure 5.} The S-L graph for the revised model. + +The residual Q-normal graph shows much better agreement to the normal distribution than the original candidate model---the effect of the lowest residuals is much less. + +<>= +# Plot the residual Q-normal graph. +setSweave("graph06", 6, 6) +plot(BadChloride.lr, which = 5, set.up=FALSE) +dev.off() +@ +\includegraphics{graph06.pdf} +\paragraph{} + +\textbf{Figure 6.} The residual Q-normal graph for the revised model. + + +A complete review of the partial residual graphs is not included in this example. Only the partial residual for \texttt{Turb} is shown to compare to the original model. In this case, the untransformed variable appears to fit reasonably well. + +<>= +# Plot the residual Q-normal graph. +setSweave("graph07", 6, 6) +plot(BadChloride.lr, which = "Turb", set.up=FALSE) +dev.off() +@ +\includegraphics{graph07.pdf} +\paragraph{} + +\textbf{Figure 7.} The partial residual for \texttt{Turb} graph for the revised model. + +\eject +\section{Instantaneous Concentrations} + +Estimating the instantaneous concentrations or loads from the model is relatively straight forward. The \texttt{predConc} and \texttt{predLoad} functions will estimate concentrations or loads, actually fluxes, for any time, given explanatory variables with no missing values. This example will focus one a single day, June 30, 2014. + + + +<>= +# Extract one day from the UV data +Bad063014 <- subset(BadUV, as.Date(as.POSIXlt(dateTime)) == "2014-06-30") +# Remove the unecessary surrogates from the data set. +# This reduces the likelihood of missing values in the dataset +Bad063014 <- Bad063014[, c("dateTime", "Flow", "SpecCond", "Turb")] +# Simple check +any(is.na(Bad063014)) +# Estimate concetrations +Bad063014.est <- predConc(BadChloride.lr, Bad063014, by="unit") +# Display the first and last few rows. +head(Bad063014.est) +tail(Bad063014.est) +# The daily mean concentration can also be easily estimated +predConc(BadChloride.lr, Bad063014, by="day") +# Compare to the mean of the unit values: +with(Bad063014.est, mean(Conc)) +@ + +\eject +\section{Aggregate Loads} + +Estimating concentrations or loads by day assumes, but does not require consistent number of unit values per day. Both \texttt{predLoad} and \texttt{predConc} assume that inconsistent number of unit values per day are due to missing values and return missing values for the estimates for days that do not have the average number of observations per day. Inconsistent number of observations per day can be the result of deleted bad values, maintenance, or a change in frequency of sampling. The data can be resampled to a uniform number per day using the \texttt{resampleUVdata} function or the check can be suppressed by setting the \texttt{allow.incomplete} argument to \texttt{TRUE}. + +Estimating loads for periods longer than one day requires consistent number of unit values in each day. The consistent number per day is required to be able to keep track of within-day and between day variances. The \texttt{resampleUVdata} function can be used to force a consistent number of unit values per day. It is not required for this example, but useful when the unit values are not consistent or when there is a change to or from daylight savings time. + +Just as with estimating instantaneous values, missing values are not permitted. Missing values can occur with surrogates due to short-term malfunctions, calibration, or long-term malfunctions. Missing values from short-term malfunctions, generally spikes in the data that are removed during processing, or that occur during calibrations can easily be interpolated using the \texttt{fillMissing} function in \textbf(smwrBase) and are illustrated in this example. Longer-term missing values are much more difficult to fix. They require the careful balancing of need, developing alternate regression models and possible caveats of the interpretation of loads. + +<>= +# Extract one month from the UV data, done in two steps +Bad0714 <- subset(BadUV, as.Date(as.POSIXlt(dateTime)) >= "2014-07-01") +Bad0714 <- subset(Bad0714, as.Date(as.POSIXlt(dateTime)) <= "2014-07-31") +# Remove the unecessary surrogates from the data set. +# This reduces the likelihood of missing values in the dataset +Bad0714 <- Bad0714[, c("dateTime", "Flow", "SpecCond", "Turb")] +# Simple check on each column, how many in each column? +sapply(Bad0714, function(x) sum(is.na(x))) +# Fix each column, using the defaults of fillMissing +Bad0714$SpecCond <- fillMissing(Bad0714$SpecCond) +Bad0714$Turb <- fillMissing(Bad0714$Turb) +# Verify filled values +sapply(Bad0714, function(x) sum(is.na(x))) +# Estimate daily loads +Bad0714.day <- predLoad(BadChloride.lr, Bad0714, by="day") +# Display the first and last few rows. +head(Bad0714.day) +tail(Bad0714.day) +# And the month +Bad0714.mon <- predLoad(BadChloride.lr, Bad0714, by="month") +Bad0714.mon +# Compare to the results using the approximate standard error: +# For long periods, the processing time to the exact seopt can be very large +# and may be desireable to use the approximation. +predLoad(BadChloride.lr, Bad0714, by="month", seopt="app") +# Compare to the mean of the daily values: +with(Bad0714.day, mean(Flux)) +@ + + +\end{document} diff --git a/vignettes/app6.Rnw b/vignettes/app6.Rnw index 9336dfb..ef88dda 100644 --- a/vignettes/app6.Rnw +++ b/vignettes/app6.Rnw @@ -1,12 +1,11 @@ \documentclass{article} -\parskip 3pt +\parskip 6pt %\VignetteIndexEntry{Regression Model for Concentration} %\VignetteDepends{rloadest} \begin{document} \SweaveOpts{concordance=TRUE} \raggedright -\parindent 30 pt \title{Application 6: Regression Model for Concentration}