add trace, messages. some structural changes and bugFix in summary

ncn-foreigners · Aug 26, 2023 · a48e396 · Kertoo · Aug 26, 2023 · a48e396
1 parent 25ea616
commit a48e396
Show file tree

Hide file tree

Showing 23 changed files with 221 additions and 107 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -10,7 +10,11 @@ Authors@R:
       person(given = "Maciej",
              family = "Beręsewicz",
              role = c("aut", "ctb"),
-             email = "maciej.beresewicz@ue.poznan.pl"))
+             email = "maciej.beresewicz@ue.poznan.pl"),
+      person(given = "Piotr",
+             family = "Chlebicki",
+             role = c("aut", "ctb"),
+             email = "piochl@st.amu.edu.pl"))
 Description: An R package for statistical inference with non-probability samples when auxiliary information from external sources like probability samples or population totals/means are available.
 License: file LICENSE
 Encoding: UTF-8
@@ -19,5 +23,5 @@ RdMacros: mathjaxr
 RoxygenNote: 7.2.3
 Suggests: tinytest, covr
 Depends: survey
-Imports: maxLik, stats, Matrix, MASS, ncvreg, betareg, RANN, sampling, Rcpp, RcppArmadillo, mathjaxr, nleqslv, doParallel, foreach, parallel
+Imports: maxLik, stats, Matrix, MASS, ncvreg, betareg, RANN, sampling, Rcpp, RcppArmadillo, mathjaxr, nleqslv, doParallel, foreach, parallel, HelpersMG
 LinkingTo: RcppArmadillo, Rcpp
diff --git a/NAMESPACE b/NAMESPACE
@@ -31,6 +31,7 @@ export(probit)
 import(Rcpp)
 import(RcppArmadillo)
 import(mathjaxr)
+importFrom(HelpersMG,ExtractAIC.glm)
 importFrom(MASS,ginv)
 importFrom(Matrix,Diagonal)
 importFrom(Matrix,Matrix)

diff --git a/R/EstimationMethods.R b/R/EstimationMethods.R
@@ -163,19 +163,19 @@ gee <- function(...) {
   }
 
   make_t <- function(X, ps, psd, b, y_rand, y_nons, h, N, method_selection, weights) {
-    if (h == "1") {
+    if (h == 1) {
       t <- X %*% t(as.matrix(b)) + y_rand - 1/N * sum(weights * y_nons)
-    } else if (h == "2") {
+    } else if (h == 2) {
       t <- as.vector(ps) * X %*% t(as.matrix(b)) + y_rand - 1/N * sum(weights * y_nons)
     }
     t
   }
 
   make_var_nonprob <- function(ps, psd, y, y_pred, h_n, X, b, N, h, method_selection, weights, pop_totals) {
-    if (!is.null(pop_totals)) h <- "1" # perhaps to remove, just check if appropriate var is calculated
-    if (h == "2") {
+    if (!is.null(pop_totals)) h <- 1 # perhaps to remove, just check if appropriate var is calculated
+    if (h == 2) {
       var_nonprob <- 1/N^2 * sum((1 - ps) * ((weights*(y - y_pred - h_n)/ps) - b %*% t(X))^2)
-    } else if (h == "1") {
+    } else if (h == 1) {
       var_nonprob <- 1/N^2 * sum((1 - ps) * ((weights*(y - y_pred - h_n) - b %*% t(X))/ps)^2)
     }
     as.numeric(var_nonprob)
@@ -242,7 +242,8 @@ gee <- function(...) {
 
 }
 
-mm <- function(X, y, weights, weights_rand, R, n_nons, n_rand, method_selection, family, boot = FALSE) { # TODO
+# bias correction
+mm <- function(X, y, weights, weights_rand, R, n_nons, n_rand, method_selection, family, boot = FALSE) {
 
   method <- get_method(method_selection)
   inv_link <- method$make_link_inv
@@ -324,6 +325,7 @@ mm <- function(X, y, weights, weights_rand, R, n_nons, n_rand, method_selection,
                       grad = multiroot$fvec[1:(p)], # TODO
                       hess = hess, # TODO
                       ps_nons = ps_nons,
+                      est_ps_rand = est_ps_rand,
                       variance_covariance = vcov_selection,
                       df_residual = df_residual,
                       log_likelihood = "NULL")

diff --git a/R/OutcomeMethods.R b/R/OutcomeMethods.R
@@ -1,7 +1,7 @@
 #' @importFrom stats predict.glm
 #' @importFrom stats glm.fit
 #' @importFrom stats summary.glm
-glm <- function(outcome,
+glm_nonprobsvy <- function(outcome,
                 data,
                 weights,
                 family_outcome,
@@ -39,13 +39,13 @@ glm <- function(outcome,
       y_nons_pred <- model_out$glm$fitted.values
     } else {
       model <- stats::glm.fit(x = X_nons,
-                               y = y_nons,
-                               weights = weights,
-                               family = get_method(family_outcome),
-                               control = list(control$epsilon,
+                              y = y_nons,
+                              weights = weights,
+                              family = get_method(family_outcome),
+                              control = list(control$epsilon,
                                               control$maxit,
                                               control$trace),
-                               intercept = FALSE)
+                              intercept = FALSE)
       model_summ <- stats::summary.glm(model)
       parameters <- model_summ$coefficients
       model_nons_coefs <- model$coefficients
@@ -69,7 +69,7 @@ glm <- function(outcome,
 }
 
 
-nn <- function(outcome,
+nn_nonprobsvy <- function(outcome,
                data,
                weights,
                family_outcome,
@@ -81,8 +81,7 @@ nn <- function(outcome,
                n_rand,
                vars_selection,
                pop_totals,
-               model_frame = NULL) {
-
+               model_frame = NULL) { # TODO consider add data standardization before modelling
   model_nons <- nonprobMI_nn(data = X_nons,
                              query = X_nons,
                              k = control$k,

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -1,7 +1,7 @@
 # Generated by using Rcpp::compileAttributes() -> do not edit by hand
 # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 
-cv_nonprobsvy_rcpp <- function(X, R, weights_X, method_selection, h, maxit, eps, lambda_min, nlambda, nfolds, penalty, a, pop_totals, lambda = -1) {
-    .Call('_nonprobsvy_cv_nonprobsvy_rcpp', PACKAGE = 'nonprobsvy', X, R, weights_X, method_selection, h, maxit, eps, lambda_min, nlambda, nfolds, penalty, a, pop_totals, lambda)
+cv_nonprobsvy_rcpp <- function(X, R, weights_X, method_selection, h, maxit, eps, lambda_min, nlambda, nfolds, penalty, a, pop_totals, verbose, lambda = -1) {
+    .Call('_nonprobsvy_cv_nonprobsvy_rcpp', PACKAGE = 'nonprobsvy', X, R, weights_X, method_selection, h, maxit, eps, lambda_min, nlambda, nfolds, penalty, a, pop_totals, verbose, lambda)
 }
 
diff --git a/R/bootstraps.R b/R/bootstraps.R
@@ -15,6 +15,7 @@ bootMI <- function(X_rand,
                    method,
                    control,
                    pop_totals,
+                   verbose,
                    ...
                    ){ # TODO add methods instead of conditional loops
 
@@ -65,6 +66,10 @@ bootMI <- function(X_rand,
         #mu_hat_boot <- mu_hatMI(ystrap_rand, weights_rand_strap_svy, N_strap)
         mu_hat_boot <- weighted.mean(x = y_strap_rand, w = weights_rand_strap_svy)
         mu_hats[k] <- mu_hat_boot
+        if (verbose) {
+          info <- paste("iteration ", k, "/", num_boot, ", estimated mean = ", mu_hat_boot, sep = "")
+          print(info)
+        }
         k <- k + 1
       }
     } else if (method == "nn") {
@@ -95,6 +100,10 @@ bootMI <- function(X_rand,
 
         mu_hat_boot <- weighted.mean(x = y_rand_strap, w = weights_rand_strap)
         mu_hats[k] <- mu_hat_boot
+        if (verbose) {
+          info <- paste("iteration ", k, "/", num_boot, ", estimated mean = ", mu_hat_boot, sep = "")
+          print(info)
+        }
         k <- k + 1
       }
     }
@@ -119,6 +128,10 @@ bootMI <- function(X_rand,
         #mu_hat_boot <- mu_hatMI(ystrap_rand, weights_rand_strap_svy, N_strap)
         mu_hat_boot <-  as.vector(y_strap_rand/N)
         mu_hats[k] <- mu_hat_boot
+        if (verbose) {
+          info <- paste("iteration ", k, "/", num_boot, ", estimated mean = ", mu_hat_boot, sep = "")
+          print(info)
+        }
         k <- k + 1
       }
     } else if (method == "nn") {
@@ -135,6 +148,10 @@ bootMI <- function(X_rand,
                                    searchtype = control$searchtype)
         mu_hat_boot <- mean(y_strap[model_rand$nn.idx])
         mu_hats[k] <- mu_hat_boot
+        if (verbose) {
+          info <- paste("iteration ", k, "/", num_boot, ", estimated mean = ", mu_hat_boot, sep = "")
+          print(info)
+        }
         k <- k + 1
       }
     }
@@ -162,6 +179,7 @@ bootIPW <- function(X_rand,
                     pop_size = NULL,
                     pop_totals = NULL,
                     control_selection,
+                    verbose,
                     ...){
   mu_hats <- vector(mode = "numeric", length = num_boot)
   if (!is.null(weights_rand)) N <- sum(weights_rand)
@@ -204,6 +222,10 @@ bootIPW <- function(X_rand,
                                weights_nons = weights_nons,
                                N = N_est_nons) # IPW estimator
       mu_hats[k] <- mu_hat_boot
+      if (verbose) {
+        info <- paste("iteration ", k, "/", num_boot, ", estimated mean = ", mu_hat_boot, sep = "")
+        print(info)
+      }
 
     } else {
       strap <- sample.int(replace = TRUE, n = n_nons)
@@ -231,6 +253,10 @@ bootIPW <- function(X_rand,
                                weights_nons = weights_nons,
                                N = N_est_nons) # IPW estimator
       mu_hats[k] <- mu_hat_boot
+      if (verbose) {
+        info <- paste("iteration ", k, "/", num_boot, ", estimated mean = ", mu_hat_boot, sep = "")
+        print(info)
+      }
     }
     k <- k + 1
   }
@@ -259,6 +285,8 @@ bootDR <- function(SelectionModel,
                    pop_size,
                    pop_totals,
                    pop_means,
+                   bias_correction,
+                   verbose,
                    ...) {
 
   mu_hats <- vector(mode = "numeric", length = num_boot)
@@ -276,7 +304,7 @@ bootDR <- function(SelectionModel,
     family <- family()
   }
 
-  if (est_method == "mm") {
+  if (bias_correction == TRUE) {
     X <- rbind(SelectionModel$X_rand, SelectionModel$X_nons)
     p <- ncol(X)
     y_rand <- vector(mode = "numeric", length = n_rand)
@@ -345,6 +373,10 @@ bootDR <- function(SelectionModel,
                                   N_nons = N_est_nons,
                                   N_rand = N_est_rand)
           mu_hats[k] <- mu_hat_boot
+          if (verbose) {
+            info <- paste("iteration ", k, "/", num_boot, ", estimated mean = ", mu_hat_boot, sep = "")
+            print(info)
+          }
           k <- k + 1
         }
       } else { # TODO
@@ -388,6 +420,10 @@ bootDR <- function(SelectionModel,
 
           mu_hat_boot <- 1/N_est * sum(weights_nons_strap * (weights_strap * (OutcomeModel$y[strap] - y_nons_pred))) + 1/pop_size * y_rand_pred
           mu_hats[k] <- mu_hat_boot
+          if (verbose) {
+            info <- paste("iteration ", k, "/", num_boot, ", estimated mean = ", mu_hat_boot, sep = "")
+            print(info)
+          }
           k <- k + 1
         }
       }
@@ -407,7 +443,8 @@ bootDR_sel <- function(X,
                        n_rand,
                        num_boot,
                        par0,
-                       psel) { # TODO function to test
+                       psel,
+                       verbose) { # TODO function to test
   mu_hats <- vector(mode = "numeric", length = num_boot)
   k <- 1
   loc_nons <- which(R == 1)
@@ -447,6 +484,10 @@ bootDR_sel <- function(X,
                             N_nons = N_nons,
                             N_rand = N_rand) #DR estimator
     mu_hats[k] <- mu_hat_boot
+    if (verbose) {
+      info <- paste("iteration ", k, "/", num_boot, ", estimated mean = ", mu_hat_boot, sep = "")
+      print(info)
+    }
     k <- k + 1
   }
   boot_var <- 1/num_boot * sum((mu_hats - mu_hat)^2)
@@ -473,6 +514,7 @@ bootMI_multicore <- function(X_rand,
                              control,
                              pop_totals,
                              cores,
+                             verbose,
                              ...) {
   #mu_hats <- vector(mode = "numeric", length = num_boot)
   n_nons <- nrow(X_nons)
@@ -522,7 +564,6 @@ bootMI_multicore <- function(X_rand,
           beta <- model_strap$coefficients
           eta <- X_rand %*% beta
           y_strap_rand <- family_nonprobsvy$mu(eta)
-
           weighted.mean(x = y_strap_rand, w = weights_rand_strap_svy)
         })
     } else if (method == "nn") {
@@ -640,6 +681,7 @@ bootIPW_multicore <- function(X_rand,
                               pop_totals = NULL,
                               control_selection,
                               cores,
+                              verbose,
                               ...) {
 
   if (!is.null(weights_rand)) N <- sum(weights_rand)
@@ -763,7 +805,9 @@ bootDR_multicore <- function(SelectionModel,
                              pop_size,
                              pop_totals,
                              pop_means,
+                             bias_correction,
                              cores,
+                             verbose,
                              ...) {
 
   # mu_hats <- vector(mode = "numeric", length = num_boot)
@@ -781,7 +825,7 @@ bootDR_multicore <- function(SelectionModel,
     family <- family()
   }
 
-  if (est_method == "mm") {
+  if (bias_correction == TRUE) {
     X <- rbind(SelectionModel$X_rand, SelectionModel$X_nons)
     p <- ncol(X)
     y_rand <- vector(mode = "numeric", length = n_rand)
@@ -930,7 +974,8 @@ bootDR_sel_multicore <- function(X,
                                  num_boot,
                                  par0,
                                  psel,
-                                 cores) { # TODO function to test
+                                 cores,
+                                 verbose) { # TODO function to test
   mu_hats <- vector(mode = "numeric", length = num_boot)
   loc_nons <- which(R == 1)
   loc_rand <- which(R == 0)
@@ -953,6 +998,7 @@ bootDR_sel_multicore <- function(X,
     X_strap <- rbind(X_rand[strap_rand, ], X_nons[strap_nons, ])
     y_strap <- c(y_rand[strap_rand], y_nons[strap_nons])
 
+    source("R/EstimationMethods.R")
     model_strap <- mm(X = X_strap,
                       y = y_strap,
                       weights = prior_weights[loc_nons][strap_nons],

diff --git a/R/cloglogModel.R b/R/cloglogModel.R
@@ -159,7 +159,7 @@ cloglog <- function(...) {
         v_2 <- v_2 + v_2i
       }
       v_2 <- 1/N^2 * v_2
-    } else if (est_method == "gee" && h == "1") {
+    } else if (est_method == "gee" && h == 1) {
       if (is.null(N)) {
         N <- sum(1/ps)
         v11 <- 1/N^2 * sum(((1 - ps)/ps^2 * weights * (y - mu)^2)) # TODO
@@ -175,7 +175,7 @@ cloglog <- function(...) {
         v_2i <- (1 - ps[i])/ps[i] * X[i,] %*% t(X[i,])
         v_2 <- v_2 + v_2i
       }
-    } else if (est_method == "gee" && h == "2") {
+    } else if (est_method == "gee" && h == 2) {
       if (is.null(N)) {
         N <- sum(1/ps)
         v11 <- 1/N^2 * sum(((1 - ps)/ps^2 * weights * (y - mu)^2)) # TODO
@@ -210,7 +210,7 @@ cloglog <- function(...) {
       if (est_method == "mle") {
         svydesign$prob <- as.vector(1/log(1 - eps) * svydesign$prob)
       } else if (est_method == "gee") {
-        if (h == "2") svydesign$prob <- as.vector(1/eps * svydesign$prob)
+        if (h == 2) svydesign$prob <- as.vector(1/eps * svydesign$prob)
       }
       if (is.null(postStrata)) {
         cov <- 1/N^2 * svyrecvar(X/svydesign$prob, svydesign$cluster, stratas = svydesign$strata, fpcs = svydesign$fpc)