From 3a2561193fec0f14f6cd622816469ee2d36bc92c Mon Sep 17 00:00:00 2001 From: Maciej Nasinski Date: Thu, 26 Oct 2023 00:05:36 +0200 Subject: [PATCH 1/6] spelling --- R/cat2cat_ml.R | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/R/cat2cat_ml.R b/R/cat2cat_ml.R index 3a93389..0f01db0 100644 --- a/R/cat2cat_ml.R +++ b/R/cat2cat_ml.R @@ -235,6 +235,9 @@ cat2cat_ml_run <- function(mappings, ml, ...) { target_name <- "old" } + mapps <- get_mappings(mappings$trans) + mapp <- mapps[[paste0("to_", base_name)]] + cat_var <- ml$data[[ml$cat_var]] cat_var_vals <- unlist(mappings$trans[, base_name]) if (sum(cat_var %in% cat_var_vals) / length(cat_var) < elargs$min_match) { @@ -246,10 +249,6 @@ cat2cat_ml_run <- function(mappings, ml, ...) { ) } - mapps <- get_mappings(mappings$trans) - mapp <- mapps[[paste0("to_", base_name)]] - - nobs <- nrow(ml$data) features <- unique(ml$features) methods <- unique(ml$method) @@ -286,15 +285,13 @@ cat2cat_ml_run <- function(mappings, ml, ...) { res[[g_name]][["freq"]] <- mean(gfreq == data_test_small[[ml$cat_var]]) - if (isTRUE(nrow(data_test_small) == 0 || nrow(data_train_small) < 5)) { + cc <- complete.cases(data_test_small[, features]) + + if (isTRUE(nrow(data_test_small[cc, ]) == 0 || nrow(data_train_small) < 5)) { next } - cc <- complete.cases(data_test_small[, features]) - for (m in methods) { - ml_name <- paste0("wei_", m, "_c2c") - if (m == "knn") { group_prediction <- suppressWarnings( caret::knn3( @@ -361,7 +358,7 @@ print.cat2cat_ml_run <- function(x, ...) { acc <- mean(vapply(x, function(i) i$acc[m], numeric(1)), na.rm = T) ml_message <- c( ml_message, - sprintf("Average (groups) accurecy for %s ml models: %f", m, acc) + sprintf("Average (groups) accuracy for %s ml models: %f", m, acc) ) howaccn <- mean(vapply(x, function(i) i$naive < mean(i$acc[m], na.rm = TRUE), numeric(1)), na.rm = T) how_ml_message_n <- c( @@ -391,7 +388,7 @@ print.cat2cat_ml_run <- function(x, ...) { "Selected prediction stats:", "", sprintf("Average naive (equal probabilities) guess: %f", acc_naive), - sprintf("Average (groups) accurecy for most frequent category solution: %f", acc_freq), + sprintf("Average (groups) accuracy for most frequent category solution: %f", acc_freq), ml_message, "", na_message, From 40f100f9aa71ac50ca3e076d079413f36c6db892 Mon Sep 17 00:00:00 2001 From: Maciej Nasinski Date: Sat, 28 Oct 2023 00:21:35 +0200 Subject: [PATCH 2/6] improve ml solution --- R/cat2cat_ml.R | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/R/cat2cat_ml.R b/R/cat2cat_ml.R index 0f01db0..38c39f9 100644 --- a/R/cat2cat_ml.R +++ b/R/cat2cat_ml.R @@ -258,14 +258,13 @@ cat2cat_ml_run <- function(mappings, ml, ...) { ) res <- list() - for (cat in unique(names(mapp))) { + for (cat in names(mapp)) { try( { matched_cat <- mapp[[match(cat, names(mapp))]] - g_name <- paste(matched_cat, collapse = "&") - - res[[g_name]] <- list(ncat = length(matched_cat), naive = 1 / length(matched_cat), - acc = stats::setNames(rep(NA_real_, length(methods)), methods), freq = NA_real_) + cat_nam <- if (cat == "") " " else cat + res[[cat_nam]] <- list(naive = NA_real_, + acc = stats::setNames(rep(NA_real_, length(methods)), methods), freq = NA_real_) data_small_g <- do.call(rbind, train_g[matched_cat]) @@ -274,23 +273,23 @@ cat2cat_ml_run <- function(mappings, ml, ...) { next } + res[[cat_nam]][["naive"]] <- 1 / length(matched_cat) + index_tt <- sample(c(0, 1), nrow(data_small_g), prob = c(1 - elargs$test_prop, elargs$test_prop), replace = TRUE) data_test_small <- data_small_g[index_tt == 1, ] data_train_small <- data_small_g[index_tt == 0, ] - - gcounts <- table(data_train_small[[ml$cat_var]]) - gfreq <- names(gcounts)[which.max(gcounts)] - - res[[g_name]][["freq"]] <- mean(gfreq == data_test_small[[ml$cat_var]]) - cc <- complete.cases(data_test_small[, features]) if (isTRUE(nrow(data_test_small[cc, ]) == 0 || nrow(data_train_small) < 5)) { next } + gcounts <- table(data_train_small[[ml$cat_var]]) + gfreq <- names(gcounts)[which.max(gcounts)] + res[[cat_nam]][["freq"]] <- mean(gfreq == data_test_small[[ml$cat_var]]) + for (m in methods) { if (m == "knn") { group_prediction <- suppressWarnings( @@ -329,7 +328,7 @@ cat2cat_ml_run <- function(mappings, ml, ...) { as.matrix(data_test_small[cc, features, drop = FALSE]) )$class } - res[[g_name]][["acc"]][m] <- mean(pred == data_test_small[[ml$cat_var]]) + res[[cat_nam]][["acc"]][m] <- mean(pred == data_test_small[[ml$cat_var]]) } }, silent = TRUE From f4399ea0a640b0f21b75f119e7e5f0ccec0b174e Mon Sep 17 00:00:00 2001 From: Maciej Nasinski Date: Sat, 28 Oct 2023 00:36:54 +0200 Subject: [PATCH 3/6] fix test --- tests/testthat/test-cat2cat_ml.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-cat2cat_ml.R b/tests/testthat/test-cat2cat_ml.R index da55f79..4db1000 100644 --- a/tests/testthat/test-cat2cat_ml.R +++ b/tests/testthat/test-cat2cat_ml.R @@ -32,7 +32,7 @@ testthat::test_that("cat2cat_ml_run", { testthat::expect_equal(res, res2) testthat::expect_s3_class(res, c("cat2cat_ml_run", "list")) testthat::expect_output(print(res), "Selected prediction stats:") - testthat::expect_output(print(res), "Percent of failed knn ml models: 32") + testthat::expect_output(print(res), "Percent of failed knn ml models:") }) testthat::test_that("cat2cat_ml_run wrong direction", { From d0fb35cb682938fd0a8fa67b10bde370cccb5927 Mon Sep 17 00:00:00 2001 From: Maciej Nasinski Date: Sat, 23 Dec 2023 11:12:54 +0100 Subject: [PATCH 4/6] simplify README file --- DESCRIPTION | 2 +- NEWS.md | 2 +- README.md | 10 ---------- 3 files changed, 2 insertions(+), 12 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 7655cbc..f21de57 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: cat2cat Title: Handling an Inconsistently Coded Categorical Variable in a Longitudinal Dataset -Version: 0.4.6.9008 +Version: 0.4.6.9009 Authors@R: person("Maciej", "Nasinski", email = "nasinski.maciej@gmail.com", role = c("aut", "cre")) Maintainer: Maciej Nasinski Description: diff --git a/NEWS.md b/NEWS.md index 1933fe9..e4bf39b 100755 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,4 @@ -# cat2cat 0.4.6.9008 +# cat2cat 0.4.6.9009 * New `cat2cat_ml_run` function to check the ml models performance before `cat2cat` with ml option is run. Now, the ml models are more transparent. * Add tests for cat2cat related journal (softwarex) paper. diff --git a/README.md b/README.md index d941d28..49a5298 100644 --- a/README.md +++ b/README.md @@ -129,13 +129,3 @@ all.equal(nrow(ff), sum(final_data_back$wei_freq_c2c)) ``` **More complex examples are presented in the "Get Started" vignette.** - -## Graph - -The graphs present how the `cat2cat::cat2cat` function works, in this case under a panel dataset without the unique identifiers and only two periods. - -![Backward Mapping](./man/figures/back_nom.png) - -![Forward Mapping](./man/figures/for_nom.png) - - From 615bb2076e71ef842e73d0183fb0713c1bfd7852 Mon Sep 17 00:00:00 2001 From: Maciej Nasinski Date: Mon, 22 Jan 2024 18:39:49 +0100 Subject: [PATCH 5/6] latex related update --- R/cat2cat.R | 8 ++++---- R/cat2cat_utils.R | 2 +- man/cat2cat.Rd | 6 +++--- man/prune_c2c.Rd | 2 +- man/validate_cover_cats.Rd | 4 ++-- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/R/cat2cat.R b/R/cat2cat.R index a7fefdb..2a0467f 100644 --- a/R/cat2cat.R +++ b/R/cat2cat.R @@ -24,7 +24,7 @@ #' `data`, `cat_var`, `method`, `features` and optional `args`. #' @details #' data args -#' \itemize{ +#' \describe{ #' \item{"old"}{ data.frame older time point in a panel} #' \item{"new"} { data.frame more recent time point in a panel} #' \item{"time_var"}{ character(1) name of the time variable.} @@ -51,7 +51,7 @@ #' } #' } #' mappings args -#' \itemize{ +#' \describe{ #' \item{"trans"}{ data.frame with 2 columns - mapping (transition) table - #' all categories for cat_var in old and new datasets have to be included. #' First column contains an old encoding and second a new one. @@ -70,7 +70,7 @@ #' } #' } #' Optional ml args -#' \itemize{ +#' \describe{ #' \item{"data"}{ data.frame - dataset with features and the `cat_var`.} #' \item{"cat_var"}{ character(1) - the dependent variable name.} #' \item{"method"}{ @@ -295,7 +295,7 @@ cat2cat <- function(data = list( } #' Validate if the trans table contains all proper mappings -#' @param cats_target vector of unique target period categories +#' @param u_cats_target vector of unique target period categories #' @param mapp transition (mapping) table process with `get_mappings`, #' the "to_base" direction is taken. #' @keywords internal diff --git a/R/cat2cat_utils.R b/R/cat2cat_utils.R index 519fee9..ceea114 100644 --- a/R/cat2cat_utils.R +++ b/R/cat2cat_utils.R @@ -16,7 +16,7 @@ #' number of rows #' @details #' method - specify a method to reduce number of replications -#' \itemize{ +#' \describe{ #' \item{"nonzero"}{ remove nonzero probabilities} #' \item{"highest"} { #' leave only highest probabilities for each subject- accepting ties diff --git a/man/cat2cat.Rd b/man/cat2cat.Rd index 0f92482..25bd300 100644 --- a/man/cat2cat.Rd +++ b/man/cat2cat.Rd @@ -53,7 +53,7 @@ of replications. } \details{ data args -\itemize{ +\describe{ \item{"old"}{ data.frame older time point in a panel} \item{"new"} { data.frame more recent time point in a panel} \item{"time_var"}{ character(1) name of the time variable.} @@ -80,7 +80,7 @@ data args } } mappings args -\itemize{ +\describe{ \item{"trans"}{ data.frame with 2 columns - mapping (transition) table - all categories for cat_var in old and new datasets have to be included. First column contains an old encoding and second a new one. @@ -99,7 +99,7 @@ mappings args } } Optional ml args -\itemize{ +\describe{ \item{"data"}{ data.frame - dataset with features and the `cat_var`.} \item{"cat_var"}{ character(1) - the dependent variable name.} \item{"method"}{ diff --git a/man/prune_c2c.Rd b/man/prune_c2c.Rd index d0e1fb1..217055b 100644 --- a/man/prune_c2c.Rd +++ b/man/prune_c2c.Rd @@ -37,7 +37,7 @@ created in the cat2cat procedure. } \details{ method - specify a method to reduce number of replications -\itemize{ +\describe{ \item{"nonzero"}{ remove nonzero probabilities} \item{"highest"} { leave only highest probabilities for each subject- accepting ties diff --git a/man/validate_cover_cats.Rd b/man/validate_cover_cats.Rd index 843a9c2..ae54fb4 100644 --- a/man/validate_cover_cats.Rd +++ b/man/validate_cover_cats.Rd @@ -7,10 +7,10 @@ validate_cover_cats(u_cats_target, mapp) } \arguments{ +\item{u_cats_target}{vector of unique target period categories} + \item{mapp}{transition (mapping) table process with `get_mappings`, the "to_base" direction is taken.} - -\item{cats_target}{vector of unique target period categories} } \description{ Validate if the trans table contains all proper mappings From 202e72956b5d216912c05df63011f78c82a8af85 Mon Sep 17 00:00:00 2001 From: Maciej Nasinski Date: Mon, 22 Jan 2024 19:02:13 +0100 Subject: [PATCH 6/6] latex related --- R/cat2cat.R | 2 +- R/cat2cat_utils.R | 4 ++-- man/cat2cat.Rd | 2 +- man/prune_c2c.Rd | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/R/cat2cat.R b/R/cat2cat.R index 2a0467f..8961fbc 100644 --- a/R/cat2cat.R +++ b/R/cat2cat.R @@ -26,7 +26,7 @@ #' data args #' \describe{ #' \item{"old"}{ data.frame older time point in a panel} -#' \item{"new"} { data.frame more recent time point in a panel} +#' \item{"new"}{ data.frame more recent time point in a panel} #' \item{"time_var"}{ character(1) name of the time variable.} #' \item{"cat_var"}{ character(1) name of the categorical variable.} #' \item{"cat_var_old"}{ diff --git a/R/cat2cat_utils.R b/R/cat2cat_utils.R index ceea114..d50bc51 100644 --- a/R/cat2cat_utils.R +++ b/R/cat2cat_utils.R @@ -18,10 +18,10 @@ #' method - specify a method to reduce number of replications #' \describe{ #' \item{"nonzero"}{ remove nonzero probabilities} -#' \item{"highest"} { +#' \item{"highest"}{ #' leave only highest probabilities for each subject- accepting ties #' } -#' \item{"highest1"} { +#' \item{"highest1"}{ #' leave only highest probabilities for each subject - #' not accepting ties so always one is returned #' } diff --git a/man/cat2cat.Rd b/man/cat2cat.Rd index 25bd300..cf9afa4 100644 --- a/man/cat2cat.Rd +++ b/man/cat2cat.Rd @@ -55,7 +55,7 @@ of replications. data args \describe{ \item{"old"}{ data.frame older time point in a panel} - \item{"new"} { data.frame more recent time point in a panel} + \item{"new"}{ data.frame more recent time point in a panel} \item{"time_var"}{ character(1) name of the time variable.} \item{"cat_var"}{ character(1) name of the categorical variable.} \item{"cat_var_old"}{ diff --git a/man/prune_c2c.Rd b/man/prune_c2c.Rd index 217055b..7cf521e 100644 --- a/man/prune_c2c.Rd +++ b/man/prune_c2c.Rd @@ -39,10 +39,10 @@ created in the cat2cat procedure. method - specify a method to reduce number of replications \describe{ \item{"nonzero"}{ remove nonzero probabilities} - \item{"highest"} { + \item{"highest"}{ leave only highest probabilities for each subject- accepting ties } - \item{"highest1"} { + \item{"highest1"}{ leave only highest probabilities for each subject - not accepting ties so always one is returned }