version 0.4.0

jobstdavid · Dec 27, 2023 · a0d35c1 · a0d35c1
1 parent 5626d38
commit a0d35c1
Show file tree

Hide file tree

Showing 87 changed files with 1,949 additions and 2,248 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/.Rhistory b/.Rhistory
diff --git a/.github/.DS_Store b/.github/.DS_Store
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: eppverification
 Type: Package
 Title: Verification Tools for the Statistical Postprocessing of Ensemble Forecasts 
-Version: 0.3.0
+Version: 0.4.0
 Authors@R: c(
     person(given = "David",
            family = "Jobst",
@@ -14,7 +14,6 @@ URL: https://github.com/jobstdavid/eppverification
 BugReports: https://github.com/jobstdavid/eppverification/issues
 Imports: 
     ggplot2,
-    fields,
     vegan,
     pcaPP,
     Rfast,

diff --git a/NAMESPACE b/NAMESPACE
@@ -2,20 +2,20 @@
 
 export(ae)
 export(bh.test)
+export(bias)
 export(bs)
 export(cov.plot)
 export(cpi)
 export(cpi.plot)
 export(crps)
+export(dispersion)
 export(dm.test)
 export(ds)
 export(dss)
 export(ee)
 export(ent)
 export(es)
-export(line.plot)
 export(logs)
-export(m.pit)
 export(ment)
 export(mri)
 export(mrnk)
@@ -28,7 +28,6 @@ export(rnk)
 export(score.plot)
 export(se)
 export(ss)
-export(var.pit)
 export(vr.hist)
 export(vs)
 importFrom(Rcpp,sourceCpp)
@@ -38,7 +37,6 @@ importFrom(Rfast,rowMedians)
 importFrom(Rfast,rowVars)
 importFrom(Rfast,rowmeans)
 importFrom(Rfast,rowsums)
-importFrom(fields,rdist)
 importFrom(ggplot2,aes)
 importFrom(ggplot2,after_stat)
 importFrom(ggplot2,element_text)
@@ -55,7 +53,6 @@ importFrom(ggplot2,ggtitle)
 importFrom(ggplot2,labs)
 importFrom(ggplot2,scale_color_manual)
 importFrom(ggplot2,scale_fill_gradient)
-importFrom(ggplot2,scale_linetype_manual)
 importFrom(ggplot2,scale_x_continuous)
 importFrom(ggplot2,scale_x_discrete)
 importFrom(ggplot2,scale_y_continuous)
@@ -69,8 +66,9 @@ importFrom(stats,acf)
 importFrom(stats,complete.cases)
 importFrom(stats,median)
 importFrom(stats,na.omit)
-importFrom(stats,pt)
+importFrom(stats,pnorm)
 importFrom(stats,setNames)
 importFrom(stats,var)
+importFrom(utils,tail)
 importFrom(vegan,spantree)
 useDynLib(eppverification, .registration = TRUE)
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,17 @@
+# eppverification 0.4.0 (December 27, 2023)
+
+**Note**: Most of the following changes serve as preparation for the next version with groundbreaking innovations!
+
+- change `na.rm` and `mean` argument in nearly all functions to `na.action` and `aggregate`. This allows a more flexible handling of missing values and score aggregation. 
+- expand functionality of `bh.test`.
+- change test statistic in `dm.test` back to the one in version 0.2.0. 
+- change argument name `interval.range` to `nominal.coverage` in the functions `cpi`, `cpi.plot` and `cov.plot`.  
+- change argument name `ri` to `reliability` and `ent` to `entropy` in the functions `vr.hist` and `mvr.hist`.  
+- changed name `var.pit` and `m.pit` to `dispersion` and `bias`, respectively.
+- add weight matrix symmetry check for `vs`.
+- remove function `line.plot`.
+
+
 # eppverification 0.3.0 (September 30, 2023)
 
 - univariate and multivariate score calculation is speeded-up using C++ code by the R-packages `Rfast`, `Rcpp`, `RcppArmadillo`.

diff --git a/R/.Rhistory b/R/.Rhistory
@@ -0,0 +1,172 @@
+library(boostCopula)
+library(VineCopula)
+library(tictoc)
+logit <- function(x, inv = FALSE) {
+if (!inv) {
+log(x/(1-x))
+} else {
+exp(x)/(1+exp(x))
+}
+}
+link_fun <- function(family, x) {
+if (family %in% 1:2) {
+sin(pi/2*tanh(x))
+} else if (family %in% c(3,13)) {
+2*logit(x, inv = T)/(1-logit(x, inv = T))
+} else if (family %in% c(23, 33)) {
+-2*logit(x, inv = T)/(1-logit(x, inv = T))
+} else if (family %in% c(4,14)) {
+1/(1-logit(x, inv = T))
+} else if (family %in% c(24, 34)) {
+-1/(1-logit(x, inv = T))
+}
+}
+# copula families
+fams <- c(1, 2, 3, 13, 23, 33, 4, 14, 24, 34)
+n <- 100 # number of simulation repetitions
+N <- 60 # number of samples for each simulation
+p <- 100 # number of variables
+d <- 5
+coef_mat <- array(0, dim = c(p+1, d, d), dimnames = list(c("intercept", paste("x", sep = "", 1:p)), c(), c()))
+positives_lin <- array(NA, dim = c(2, d, d), dimnames = list(c("true", "false"), c(), c()))
+par_mae_lin <- array(NA, dim = c(d, d))
+fams_lin <- array(0, dim = c(d, d))
+# define 5-dimensional R-vine tree structure matrix
+Matrix <- c(5, 0, 0, 0, 0,
+2, 2, 0, 0, 0,
+3, 3, 3, 0, 0,
+1, 4, 4, 4, 0,
+4, 1, 1, 1, 1)
+Matrix <- matrix(Matrix, 5, 5, byrow = T)
+# define R-vine pair-copula family matrix
+family <- c(0, 0, 0, 0, 0,
+1, 0, 0, 0, 0,
+24, 2, 0, 0, 0,
+4, 33, 13, 0, 0,
+23, 34, 14, 3, 0)
+family <- matrix(family, 5, 5, byrow = T)
+# define R-vine pair-copula parameter matrix
+par <- c(0, 0, 0, 0, 0,
+0.5, 0, 0, 0, 0,
+-2, 0.5, 0, 0, 0,
+2, -2, 2, 0, 0,
+-2, -2, 2, 2, 0)
+par <- matrix(par, 5, 5, byrow = T)
+# define second R-vine pair-copula parameter matrix
+par2 <- c(0, 0, 0, 0, 0,
+0, 0, 0, 0, 0,
+0, 4, 0, 0, 0,
+0, 0, 0, 0, 0,
+0, 0, 0, 0, 0)
+par2 <- matrix(par2, 5, 5, byrow = T)
+boostRVC <- RVineMatrix(Matrix = Matrix,
+family = family,
+par = par, par2 = par2,
+names = c("U1", "U2", "U3", "U4", "U5"))
+# set seed for reproducubility
+set.seed(5)
+x <- sapply(1:p, function(l) runif(N, -1, 1))
+colnames(x) <- paste0("x", 1:p)
+u <- matrix(runif(N*d), ncol = d, dimnames = list(c(), paste0("U", 1:d, sep = "")))
+# simulate linear model
+U <- boostRVineSimStudy(boostRVC, u, x, mode = "linear")
+library(boostCopula)
+library(VineCopula)
+library(tictoc)
+logit <- function(x, inv = FALSE) {
+if (!inv) {
+log(x/(1-x))
+} else {
+exp(x)/(1+exp(x))
+}
+}
+link_fun <- function(family, x) {
+if (family %in% 1:2) {
+sin(pi/2*tanh(x))
+} else if (family %in% c(3,13)) {
+2*logit(x, inv = T)/(1-logit(x, inv = T))
+} else if (family %in% c(23, 33)) {
+-2*logit(x, inv = T)/(1-logit(x, inv = T))
+} else if (family %in% c(4,14)) {
+1/(1-logit(x, inv = T))
+} else if (family %in% c(24, 34)) {
+-1/(1-logit(x, inv = T))
+}
+}
+# copula families
+fams <- c(1, 2, 3, 13, 23, 33, 4, 14, 24, 34)
+n <- 100 # number of simulation repetitions
+N <- 6000 # number of samples for each simulation
+p <- 100 # number of variables
+d <- 5
+coef_mat <- array(0, dim = c(p+1, d, d), dimnames = list(c("intercept", paste("x", sep = "", 1:p)), c(), c()))
+positives_lin <- array(NA, dim = c(2, d, d), dimnames = list(c("true", "false"), c(), c()))
+par_mae_lin <- array(NA, dim = c(d, d))
+fams_lin <- array(0, dim = c(d, d))
+# define 5-dimensional R-vine tree structure matrix
+Matrix <- c(5, 0, 0, 0, 0,
+2, 2, 0, 0, 0,
+3, 3, 3, 0, 0,
+1, 4, 4, 4, 0,
+4, 1, 1, 1, 1)
+Matrix <- matrix(Matrix, 5, 5, byrow = T)
+# define R-vine pair-copula family matrix
+family <- c(0, 0, 0, 0, 0,
+1, 0, 0, 0, 0,
+24, 2, 0, 0, 0,
+4, 33, 13, 0, 0,
+23, 34, 14, 3, 0)
+family <- matrix(family, 5, 5, byrow = T)
+# define R-vine pair-copula parameter matrix
+par <- c(0, 0, 0, 0, 0,
+0.5, 0, 0, 0, 0,
+-2, 0.5, 0, 0, 0,
+2, -2, 2, 0, 0,
+-2, -2, 2, 2, 0)
+par <- matrix(par, 5, 5, byrow = T)
+# define second R-vine pair-copula parameter matrix
+par2 <- c(0, 0, 0, 0, 0,
+0, 0, 0, 0, 0,
+0, 4, 0, 0, 0,
+0, 0, 0, 0, 0,
+0, 0, 0, 0, 0)
+par2 <- matrix(par2, 5, 5, byrow = T)
+boostRVC <- RVineMatrix(Matrix = Matrix,
+family = family,
+par = par, par2 = par2,
+names = c("U1", "U2", "U3", "U4", "U5"))
+# set seed for reproducubility
+set.seed(5)
+x <- sapply(1:p, function(l) runif(N, -1, 1))
+colnames(x) <- paste0("x", 1:p)
+u <- matrix(runif(N*d), ncol = d, dimnames = list(c(), paste0("U", 1:d, sep = "")))
+# simulate linear model
+U <- boostRVineSimStudy(boostRVC, u, x, mode = "linear")
+tic()
+object <- boostRVineCopSelect(U = U,
+X = x,
+boostRVM = boostRVineMatrix(Matrix = Matrix,
+family = family,
+formula = matrix("~.", d, d, byrow = T)),
+selectioncrit = "loglik",
+familyset = NA,
+par = FALSE,
+type = "glm",
+control = boost_control(mstop = 500),
+cval = cval_control(cval = F),
+deselect = deselect_control(deselect = T),
+center = TRUE,
+stabilization = "none",
+na.action = na.omit,
+cores = 10)
+toc()
+253.249/60
+ll_true <- boostRVineTrueLL(U = U,
+X = x,
+boostRVM = boostRVineMatrix(Matrix = Matrix,
+family = family,
+formula = matrix("~.", d, d, byrow = T)),
+cores = 10)
+sum(unlist(ll_true), na.rm = T)
+object$loglik
+100/17
diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -9,10 +9,6 @@ crps_sml_cpp <- function(y, x) {
     .Call(`_eppverification_crps_sml_cpp`, y, x)
 }
 
-crps_mc_cpp <- function(y, x) {
-    .Call(`_eppverification_crps_mc_cpp`, y, x)
-}
-
 euclnorm_cpp <- function(x) {
     .Call(`_eppverification_euclnorm_cpp`, x)
 }
@@ -33,3 +29,7 @@ ds_cpp <- function(x) {
     .Call(`_eppverification_ds_cpp`, x)
 }
 
+rdist_cpp <- function(x) {
+    .Call(`_eppverification_rdist_cpp`, x)
+}
+
diff --git a/R/ae.R b/R/ae.R
@@ -1,46 +1,48 @@
 #' Absolute Error
 #'
-#' This function calculates the Absolute Error (AE) given observations of a one-dimensional variable and ensemble forecasts/samples of a predictive distribution.
+#' This function calculates the Absolute Error (AE) given observations of a univariate variable and samples of a predictive distribution.
 #'
 #' @param y vector of observations
-#' @param x matrix of ensemble forecasts/samples of a predictive distribution or vector of medians of ensemble forecasts/a predictive distribution (depending on \code{y}; see details)
-#' @param mean logical; if \code{TRUE} the mean of the AE values is calculated for output; if \code{FALSE} the single AE values are used as output; default: \code{FALSE}
-#' @param na.rm logical; if \code{TRUE} NA are removed after the computation; if \code{FALSE} NA are used in the computation; default: \code{FALSE}
+#' @param x matrix of samples of a predictive distribution or vector containing the medians of a predictive distribution (depending on \code{y}; see details)
+#' @param na.action function to handle the NA's. Default: \code{na.omit}.
+#' @param aggregate logical or function for aggregating the single scores, e.g. \code{sum}, \code{mean}, \code{weighted.mean}, ....
+#' Default: \code{FALSE}, i.e. no aggregation function.
+#' @param ... further arguments passed to the \code{aggregate} function.
 #'
 #' @details
-#' For a vector \code{y} of length n, \code{x} can be given as matrix of ensemble forecasts/samples of a predictive distribution
+#' For a vector \code{y} of length n, \code{x} can be given as matrix of samples of a predictive distribution
 #' with n rows, where the i-th entry of \code{y} belongs to the i-th row
-#' of \code{x}. The columns of \code{x} represent the samples of a predictive distribution
-#' or ensemble forecasts. The row-wise medians are determined by its sample version.
+#' of \code{x}. The columns of \code{x} represent the samples of a predictive distribution.
+#' The row-wise medians are determined by its sample version.
 #'
-#' If the medians of ensemble forecasts or of a predictive distribution are directly
-#' available, \code{x} can be given as vector of medians, where
+#' If the medians of a predictive distribution are directly
+#' available, \code{x} can be given as vector containing the medians, where
 #' the i-th entry of \code{y} belongs to the i-th entry of \code{x}.
 #'
-#' In addition, with this function, the Mean Absolute Error (MAE) can be calculated.
+#' In addition, with this function, the Mean Absolute Error (MAE) can be calculated by specifying \code{aggregate = mean}.
 #'
 #' A lower AE indicates a better forecast.
 #'
 #' @return
 #' Vector of score value(s).
 #'
 #' @examples
-#' #simulated data
+#' # simulated data
 #' n <- 30
 #' m <- 50
 #' y <- rnorm(n)
 #' x1 <- matrix(rnorm(n*m), ncol = m)
 #' x2 <- apply(x1, 1, median)
 #'
-#' #ae calculation
-#' ae(y = y, x = x1, mean = FALSE)
-#' ae(y = y, x = x1, mean = TRUE)
+#' # ae calculation
+#' ae(y = y, x = x1)
+#' ae(y = y, x = x1, aggregate = mean)
 #'
-#' ae(y = y, x = x2, mean = FALSE)
-#' (mae <- ae(y = y, x = x2, mean = TRUE))
+#' ae(y = y, x = x2)
+#' (mae <- ae(y = y, x = x2, aggregate = mean))
 #'
 #' @references
-#' Vannitsem, S., Wilks, D. and Messner, J. (2018). Statistical Postprocessing of Ensemble Forecasts. Elsevier. 164.
+#' Gneiting, T. (2011). Making and Evaluating Point Forecasts. Journal of the American Statistical Association, 106(494), 746-762.
 #'
 #' @author David Jobst
 #'
@@ -50,7 +52,7 @@
 #' @importFrom Rfast rowMedians
 #'
 #' @export
-ae <- function(y, x, mean = FALSE, na.rm = FALSE) {
+ae <- function(y, x, na.action = na.omit, aggregate = FALSE, ...) {
   if (!is.vector(y)) {
     stop("'y' should be a vector!")
   }
@@ -71,13 +73,11 @@ ae <- function(y, x, mean = FALSE, na.rm = FALSE) {
     stop("'x' should be a vector or matrix!")
   }
 
-  if (na.rm == TRUE) {
-    ae.value <- as.vector(na.omit(ae.value))
+  ae.value <- as.vector(na.action(ae.value))
+  if (!isFALSE(aggregate)) {
+    ae.value <- do.call(aggregate, list(ae.value, ...))
   }
 
-  if (mean == TRUE) {
-    ae.value <- mean(ae.value)
-  }
   return(as.numeric(ae.value))
 
 }