diff --git a/DESCRIPTION b/DESCRIPTION index 7655cbc..f21de57 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: cat2cat Title: Handling an Inconsistently Coded Categorical Variable in a Longitudinal Dataset -Version: 0.4.6.9008 +Version: 0.4.6.9009 Authors@R: person("Maciej", "Nasinski", email = "nasinski.maciej@gmail.com", role = c("aut", "cre")) Maintainer: Maciej Nasinski Description: diff --git a/NEWS.md b/NEWS.md index 1933fe9..e4bf39b 100755 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,4 @@ -# cat2cat 0.4.6.9008 +# cat2cat 0.4.6.9009 * New `cat2cat_ml_run` function to check the ml models performance before `cat2cat` with ml option is run. Now, the ml models are more transparent. * Add tests for cat2cat related journal (softwarex) paper. diff --git a/R/cat2cat.R b/R/cat2cat.R index a7fefdb..8961fbc 100644 --- a/R/cat2cat.R +++ b/R/cat2cat.R @@ -24,9 +24,9 @@ #' `data`, `cat_var`, `method`, `features` and optional `args`. #' @details #' data args -#' \itemize{ +#' \describe{ #' \item{"old"}{ data.frame older time point in a panel} -#' \item{"new"} { data.frame more recent time point in a panel} +#' \item{"new"}{ data.frame more recent time point in a panel} #' \item{"time_var"}{ character(1) name of the time variable.} #' \item{"cat_var"}{ character(1) name of the categorical variable.} #' \item{"cat_var_old"}{ @@ -51,7 +51,7 @@ #' } #' } #' mappings args -#' \itemize{ +#' \describe{ #' \item{"trans"}{ data.frame with 2 columns - mapping (transition) table - #' all categories for cat_var in old and new datasets have to be included. #' First column contains an old encoding and second a new one. @@ -70,7 +70,7 @@ #' } #' } #' Optional ml args -#' \itemize{ +#' \describe{ #' \item{"data"}{ data.frame - dataset with features and the `cat_var`.} #' \item{"cat_var"}{ character(1) - the dependent variable name.} #' \item{"method"}{ @@ -295,7 +295,7 @@ cat2cat <- function(data = list( } #' Validate if the trans table contains all proper mappings -#' @param cats_target vector of unique target period categories +#' @param u_cats_target vector of unique target period categories #' @param mapp transition (mapping) table process with `get_mappings`, #' the "to_base" direction is taken. #' @keywords internal diff --git a/R/cat2cat_ml.R b/R/cat2cat_ml.R index 3a93389..38c39f9 100644 --- a/R/cat2cat_ml.R +++ b/R/cat2cat_ml.R @@ -235,6 +235,9 @@ cat2cat_ml_run <- function(mappings, ml, ...) { target_name <- "old" } + mapps <- get_mappings(mappings$trans) + mapp <- mapps[[paste0("to_", base_name)]] + cat_var <- ml$data[[ml$cat_var]] cat_var_vals <- unlist(mappings$trans[, base_name]) if (sum(cat_var %in% cat_var_vals) / length(cat_var) < elargs$min_match) { @@ -246,10 +249,6 @@ cat2cat_ml_run <- function(mappings, ml, ...) { ) } - mapps <- get_mappings(mappings$trans) - mapp <- mapps[[paste0("to_", base_name)]] - - nobs <- nrow(ml$data) features <- unique(ml$features) methods <- unique(ml$method) @@ -259,14 +258,13 @@ cat2cat_ml_run <- function(mappings, ml, ...) { ) res <- list() - for (cat in unique(names(mapp))) { + for (cat in names(mapp)) { try( { matched_cat <- mapp[[match(cat, names(mapp))]] - g_name <- paste(matched_cat, collapse = "&") - - res[[g_name]] <- list(ncat = length(matched_cat), naive = 1 / length(matched_cat), - acc = stats::setNames(rep(NA_real_, length(methods)), methods), freq = NA_real_) + cat_nam <- if (cat == "") " " else cat + res[[cat_nam]] <- list(naive = NA_real_, + acc = stats::setNames(rep(NA_real_, length(methods)), methods), freq = NA_real_) data_small_g <- do.call(rbind, train_g[matched_cat]) @@ -275,26 +273,24 @@ cat2cat_ml_run <- function(mappings, ml, ...) { next } + res[[cat_nam]][["naive"]] <- 1 / length(matched_cat) + index_tt <- sample(c(0, 1), nrow(data_small_g), prob = c(1 - elargs$test_prop, elargs$test_prop), replace = TRUE) data_test_small <- data_small_g[index_tt == 1, ] data_train_small <- data_small_g[index_tt == 0, ] + cc <- complete.cases(data_test_small[, features]) - gcounts <- table(data_train_small[[ml$cat_var]]) - gfreq <- names(gcounts)[which.max(gcounts)] - - res[[g_name]][["freq"]] <- mean(gfreq == data_test_small[[ml$cat_var]]) - - if (isTRUE(nrow(data_test_small) == 0 || nrow(data_train_small) < 5)) { + if (isTRUE(nrow(data_test_small[cc, ]) == 0 || nrow(data_train_small) < 5)) { next } - cc <- complete.cases(data_test_small[, features]) + gcounts <- table(data_train_small[[ml$cat_var]]) + gfreq <- names(gcounts)[which.max(gcounts)] + res[[cat_nam]][["freq"]] <- mean(gfreq == data_test_small[[ml$cat_var]]) for (m in methods) { - ml_name <- paste0("wei_", m, "_c2c") - if (m == "knn") { group_prediction <- suppressWarnings( caret::knn3( @@ -332,7 +328,7 @@ cat2cat_ml_run <- function(mappings, ml, ...) { as.matrix(data_test_small[cc, features, drop = FALSE]) )$class } - res[[g_name]][["acc"]][m] <- mean(pred == data_test_small[[ml$cat_var]]) + res[[cat_nam]][["acc"]][m] <- mean(pred == data_test_small[[ml$cat_var]]) } }, silent = TRUE @@ -361,7 +357,7 @@ print.cat2cat_ml_run <- function(x, ...) { acc <- mean(vapply(x, function(i) i$acc[m], numeric(1)), na.rm = T) ml_message <- c( ml_message, - sprintf("Average (groups) accurecy for %s ml models: %f", m, acc) + sprintf("Average (groups) accuracy for %s ml models: %f", m, acc) ) howaccn <- mean(vapply(x, function(i) i$naive < mean(i$acc[m], na.rm = TRUE), numeric(1)), na.rm = T) how_ml_message_n <- c( @@ -391,7 +387,7 @@ print.cat2cat_ml_run <- function(x, ...) { "Selected prediction stats:", "", sprintf("Average naive (equal probabilities) guess: %f", acc_naive), - sprintf("Average (groups) accurecy for most frequent category solution: %f", acc_freq), + sprintf("Average (groups) accuracy for most frequent category solution: %f", acc_freq), ml_message, "", na_message, diff --git a/R/cat2cat_utils.R b/R/cat2cat_utils.R index 519fee9..d50bc51 100644 --- a/R/cat2cat_utils.R +++ b/R/cat2cat_utils.R @@ -16,12 +16,12 @@ #' number of rows #' @details #' method - specify a method to reduce number of replications -#' \itemize{ +#' \describe{ #' \item{"nonzero"}{ remove nonzero probabilities} -#' \item{"highest"} { +#' \item{"highest"}{ #' leave only highest probabilities for each subject- accepting ties #' } -#' \item{"highest1"} { +#' \item{"highest1"}{ #' leave only highest probabilities for each subject - #' not accepting ties so always one is returned #' } diff --git a/README.md b/README.md index d941d28..49a5298 100644 --- a/README.md +++ b/README.md @@ -129,13 +129,3 @@ all.equal(nrow(ff), sum(final_data_back$wei_freq_c2c)) ``` **More complex examples are presented in the "Get Started" vignette.** - -## Graph - -The graphs present how the `cat2cat::cat2cat` function works, in this case under a panel dataset without the unique identifiers and only two periods. - -![Backward Mapping](./man/figures/back_nom.png) - -![Forward Mapping](./man/figures/for_nom.png) - - diff --git a/man/cat2cat.Rd b/man/cat2cat.Rd index 0f92482..cf9afa4 100644 --- a/man/cat2cat.Rd +++ b/man/cat2cat.Rd @@ -53,9 +53,9 @@ of replications. } \details{ data args -\itemize{ +\describe{ \item{"old"}{ data.frame older time point in a panel} - \item{"new"} { data.frame more recent time point in a panel} + \item{"new"}{ data.frame more recent time point in a panel} \item{"time_var"}{ character(1) name of the time variable.} \item{"cat_var"}{ character(1) name of the categorical variable.} \item{"cat_var_old"}{ @@ -80,7 +80,7 @@ data args } } mappings args -\itemize{ +\describe{ \item{"trans"}{ data.frame with 2 columns - mapping (transition) table - all categories for cat_var in old and new datasets have to be included. First column contains an old encoding and second a new one. @@ -99,7 +99,7 @@ mappings args } } Optional ml args -\itemize{ +\describe{ \item{"data"}{ data.frame - dataset with features and the `cat_var`.} \item{"cat_var"}{ character(1) - the dependent variable name.} \item{"method"}{ diff --git a/man/prune_c2c.Rd b/man/prune_c2c.Rd index d0e1fb1..7cf521e 100644 --- a/man/prune_c2c.Rd +++ b/man/prune_c2c.Rd @@ -37,12 +37,12 @@ created in the cat2cat procedure. } \details{ method - specify a method to reduce number of replications -\itemize{ +\describe{ \item{"nonzero"}{ remove nonzero probabilities} - \item{"highest"} { + \item{"highest"}{ leave only highest probabilities for each subject- accepting ties } - \item{"highest1"} { + \item{"highest1"}{ leave only highest probabilities for each subject - not accepting ties so always one is returned } diff --git a/man/validate_cover_cats.Rd b/man/validate_cover_cats.Rd index 843a9c2..ae54fb4 100644 --- a/man/validate_cover_cats.Rd +++ b/man/validate_cover_cats.Rd @@ -7,10 +7,10 @@ validate_cover_cats(u_cats_target, mapp) } \arguments{ +\item{u_cats_target}{vector of unique target period categories} + \item{mapp}{transition (mapping) table process with `get_mappings`, the "to_base" direction is taken.} - -\item{cats_target}{vector of unique target period categories} } \description{ Validate if the trans table contains all proper mappings diff --git a/tests/testthat/test-cat2cat_ml.R b/tests/testthat/test-cat2cat_ml.R index da55f79..4db1000 100644 --- a/tests/testthat/test-cat2cat_ml.R +++ b/tests/testthat/test-cat2cat_ml.R @@ -32,7 +32,7 @@ testthat::test_that("cat2cat_ml_run", { testthat::expect_equal(res, res2) testthat::expect_s3_class(res, c("cat2cat_ml_run", "list")) testthat::expect_output(print(res), "Selected prediction stats:") - testthat::expect_output(print(res), "Percent of failed knn ml models: 32") + testthat::expect_output(print(res), "Percent of failed knn ml models:") }) testthat::test_that("cat2cat_ml_run wrong direction", {