From 3a2561193fec0f14f6cd622816469ee2d36bc92c Mon Sep 17 00:00:00 2001
From: Maciej Nasinski <nasinski.maciej@gmail.com>
Date: Thu, 26 Oct 2023 00:05:36 +0200
Subject: [PATCH 1/6] spelling

---
 R/cat2cat_ml.R | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/R/cat2cat_ml.R b/R/cat2cat_ml.R
index 3a93389..0f01db0 100644
--- a/R/cat2cat_ml.R
+++ b/R/cat2cat_ml.R
@@ -235,6 +235,9 @@ cat2cat_ml_run <- function(mappings, ml, ...) {
     target_name <- "old"
   }
 
+  mapps <- get_mappings(mappings$trans)
+  mapp <- mapps[[paste0("to_", base_name)]]
+
   cat_var <- ml$data[[ml$cat_var]]
   cat_var_vals <- unlist(mappings$trans[, base_name])
   if (sum(cat_var %in% cat_var_vals) / length(cat_var) < elargs$min_match) {
@@ -246,10 +249,6 @@ cat2cat_ml_run <- function(mappings, ml, ...) {
     )
   }
 
-  mapps <- get_mappings(mappings$trans)
-  mapp <- mapps[[paste0("to_", base_name)]]
-
-  nobs <- nrow(ml$data)
   features <- unique(ml$features)
   methods <- unique(ml$method)
 
@@ -286,15 +285,13 @@ cat2cat_ml_run <- function(mappings, ml, ...) {
 
         res[[g_name]][["freq"]] <- mean(gfreq == data_test_small[[ml$cat_var]])
 
-        if (isTRUE(nrow(data_test_small) == 0 || nrow(data_train_small) < 5)) {
+        cc <- complete.cases(data_test_small[, features])
+
+        if (isTRUE(nrow(data_test_small[cc, ]) == 0 || nrow(data_train_small) < 5)) {
           next
         }
 
-        cc <- complete.cases(data_test_small[, features])
-
         for (m in methods) {
-          ml_name <- paste0("wei_", m, "_c2c")
-
           if (m == "knn") {
             group_prediction <- suppressWarnings(
               caret::knn3(
@@ -361,7 +358,7 @@ print.cat2cat_ml_run <- function(x, ...) {
     acc <- mean(vapply(x, function(i) i$acc[m], numeric(1)), na.rm = T)
     ml_message <- c(
       ml_message,
-      sprintf("Average (groups) accurecy for %s ml models: %f", m, acc)
+      sprintf("Average (groups) accuracy for %s ml models: %f", m, acc)
     )
     howaccn <- mean(vapply(x, function(i) i$naive < mean(i$acc[m], na.rm = TRUE), numeric(1)), na.rm = T)
     how_ml_message_n <- c(
@@ -391,7 +388,7 @@ print.cat2cat_ml_run <- function(x, ...) {
         "Selected prediction stats:",
         "",
         sprintf("Average naive (equal probabilities) guess: %f", acc_naive),
-        sprintf("Average (groups) accurecy for most frequent category solution: %f", acc_freq),
+        sprintf("Average (groups) accuracy for most frequent category solution: %f", acc_freq),
         ml_message,
         "",
         na_message,

From 40f100f9aa71ac50ca3e076d079413f36c6db892 Mon Sep 17 00:00:00 2001
From: Maciej Nasinski <nasinski.maciej@gmail.com>
Date: Sat, 28 Oct 2023 00:21:35 +0200
Subject: [PATCH 2/6] improve ml solution

---
 R/cat2cat_ml.R | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/R/cat2cat_ml.R b/R/cat2cat_ml.R
index 0f01db0..38c39f9 100644
--- a/R/cat2cat_ml.R
+++ b/R/cat2cat_ml.R
@@ -258,14 +258,13 @@ cat2cat_ml_run <- function(mappings, ml, ...) {
   )
 
   res <- list()
-  for (cat in unique(names(mapp))) {
+  for (cat in names(mapp)) {
     try(
       {
         matched_cat <- mapp[[match(cat, names(mapp))]]
-        g_name <- paste(matched_cat, collapse = "&")
-
-        res[[g_name]] <- list(ncat = length(matched_cat), naive = 1 / length(matched_cat),
-                              acc = stats::setNames(rep(NA_real_, length(methods)), methods), freq = NA_real_)
+        cat_nam <- if (cat == "") " " else cat
+        res[[cat_nam]] <- list(naive = NA_real_,
+                               acc = stats::setNames(rep(NA_real_, length(methods)), methods), freq = NA_real_)
 
         data_small_g <- do.call(rbind, train_g[matched_cat])
 
@@ -274,23 +273,23 @@ cat2cat_ml_run <- function(mappings, ml, ...) {
           next
         }
 
+        res[[cat_nam]][["naive"]] <- 1 / length(matched_cat)
+
         index_tt <- sample(c(0, 1),
                            nrow(data_small_g),
                            prob = c(1 - elargs$test_prop, elargs$test_prop), replace = TRUE)
         data_test_small <- data_small_g[index_tt == 1, ]
         data_train_small <- data_small_g[index_tt == 0, ]
-
-        gcounts <- table(data_train_small[[ml$cat_var]])
-        gfreq <- names(gcounts)[which.max(gcounts)]
-
-        res[[g_name]][["freq"]] <- mean(gfreq == data_test_small[[ml$cat_var]])
-
         cc <- complete.cases(data_test_small[, features])
 
         if (isTRUE(nrow(data_test_small[cc, ]) == 0 || nrow(data_train_small) < 5)) {
           next
         }
 
+        gcounts <- table(data_train_small[[ml$cat_var]])
+        gfreq <- names(gcounts)[which.max(gcounts)]
+        res[[cat_nam]][["freq"]] <- mean(gfreq == data_test_small[[ml$cat_var]])
+
         for (m in methods) {
           if (m == "knn") {
             group_prediction <- suppressWarnings(
@@ -329,7 +328,7 @@ cat2cat_ml_run <- function(mappings, ml, ...) {
               as.matrix(data_test_small[cc, features, drop = FALSE])
             )$class
           }
-          res[[g_name]][["acc"]][m] <- mean(pred == data_test_small[[ml$cat_var]])
+          res[[cat_nam]][["acc"]][m] <- mean(pred == data_test_small[[ml$cat_var]])
         }
       },
       silent = TRUE

From f4399ea0a640b0f21b75f119e7e5f0ccec0b174e Mon Sep 17 00:00:00 2001
From: Maciej Nasinski <nasinski.maciej@gmail.com>
Date: Sat, 28 Oct 2023 00:36:54 +0200
Subject: [PATCH 3/6] fix test

---
 tests/testthat/test-cat2cat_ml.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/testthat/test-cat2cat_ml.R b/tests/testthat/test-cat2cat_ml.R
index da55f79..4db1000 100644
--- a/tests/testthat/test-cat2cat_ml.R
+++ b/tests/testthat/test-cat2cat_ml.R
@@ -32,7 +32,7 @@ testthat::test_that("cat2cat_ml_run", {
   testthat::expect_equal(res, res2)
   testthat::expect_s3_class(res, c("cat2cat_ml_run", "list"))
   testthat::expect_output(print(res), "Selected prediction stats:")
-  testthat::expect_output(print(res), "Percent of failed knn ml models: 32")
+  testthat::expect_output(print(res), "Percent of failed knn ml models:")
 })
 
 testthat::test_that("cat2cat_ml_run wrong direction", {

From d0fb35cb682938fd0a8fa67b10bde370cccb5927 Mon Sep 17 00:00:00 2001
From: Maciej Nasinski <nasinski.maciej@gmail.com>
Date: Sat, 23 Dec 2023 11:12:54 +0100
Subject: [PATCH 4/6] simplify README file

---
 DESCRIPTION |  2 +-
 NEWS.md     |  2 +-
 README.md   | 10 ----------
 3 files changed, 2 insertions(+), 12 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 7655cbc..f21de57 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: cat2cat
 Title: Handling an Inconsistently Coded Categorical Variable in a Longitudinal Dataset
-Version: 0.4.6.9008
+Version: 0.4.6.9009
 Authors@R: person("Maciej", "Nasinski", email = "nasinski.maciej@gmail.com", role = c("aut", "cre"))
 Maintainer: Maciej Nasinski <nasinski.maciej@gmail.com>
 Description: 
diff --git a/NEWS.md b/NEWS.md
index 1933fe9..e4bf39b 100755
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,4 +1,4 @@
-# cat2cat 0.4.6.9008
+# cat2cat 0.4.6.9009
 
 * New `cat2cat_ml_run` function to check the ml models performance before `cat2cat` with ml option is run. Now, the ml models are more transparent.
 * Add tests for cat2cat related journal (softwarex) paper.
diff --git a/README.md b/README.md
index d941d28..49a5298 100644
--- a/README.md
+++ b/README.md
@@ -129,13 +129,3 @@ all.equal(nrow(ff), sum(final_data_back$wei_freq_c2c))
 ```
 
 **More complex examples are presented in the "Get Started" vignette.**
-
-## Graph
-
-The graphs present how the `cat2cat::cat2cat` function works, in this case under a panel dataset without the unique identifiers and only two periods.
-
-![Backward Mapping](./man/figures/back_nom.png)
-
-![Forward Mapping](./man/figures/for_nom.png)
-
-

From 615bb2076e71ef842e73d0183fb0713c1bfd7852 Mon Sep 17 00:00:00 2001
From: Maciej Nasinski <nasinski.maciej@gmail.com>
Date: Mon, 22 Jan 2024 18:39:49 +0100
Subject: [PATCH 5/6] latex related update

---
 R/cat2cat.R                | 8 ++++----
 R/cat2cat_utils.R          | 2 +-
 man/cat2cat.Rd             | 6 +++---
 man/prune_c2c.Rd           | 2 +-
 man/validate_cover_cats.Rd | 4 ++--
 5 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/R/cat2cat.R b/R/cat2cat.R
index a7fefdb..2a0467f 100644
--- a/R/cat2cat.R
+++ b/R/cat2cat.R
@@ -24,7 +24,7 @@
 #' `data`, `cat_var`, `method`, `features` and optional `args`.
 #' @details
 #' data args
-#' \itemize{
+#' \describe{
 #'  \item{"old"}{ data.frame older time point in a panel}
 #'  \item{"new"} { data.frame more recent time point in a panel}
 #'  \item{"time_var"}{ character(1) name of the time variable.}
@@ -51,7 +51,7 @@
 #'  }
 #' }
 #' mappings args
-#' \itemize{
+#' \describe{
 #'  \item{"trans"}{ data.frame with 2 columns - mapping (transition) table -
 #'   all categories for cat_var in old and new datasets have to be included.
 #'   First column contains an old encoding and second a new one.
@@ -70,7 +70,7 @@
 #'  }
 #' }
 #' Optional ml args
-#' \itemize{
+#' \describe{
 #'  \item{"data"}{ data.frame - dataset with features and the `cat_var`.}
 #'  \item{"cat_var"}{ character(1) - the dependent variable name.}
 #'  \item{"method"}{
@@ -295,7 +295,7 @@ cat2cat <- function(data = list(
 }
 
 #' Validate if the trans table contains all proper mappings
-#' @param cats_target vector of unique target period categories
+#' @param u_cats_target vector of unique target period categories
 #' @param mapp transition (mapping) table process with `get_mappings`,
 #'  the "to_base" direction is taken.
 #' @keywords internal
diff --git a/R/cat2cat_utils.R b/R/cat2cat_utils.R
index 519fee9..ceea114 100644
--- a/R/cat2cat_utils.R
+++ b/R/cat2cat_utils.R
@@ -16,7 +16,7 @@
 #' number of rows
 #' @details
 #' method - specify a method to reduce number of replications
-#' \itemize{
+#' \describe{
 #'  \item{"nonzero"}{ remove nonzero probabilities}
 #'  \item{"highest"} {
 #'  leave only highest probabilities for each subject- accepting ties
diff --git a/man/cat2cat.Rd b/man/cat2cat.Rd
index 0f92482..25bd300 100644
--- a/man/cat2cat.Rd
+++ b/man/cat2cat.Rd
@@ -53,7 +53,7 @@ of replications.
 }
 \details{
 data args
-\itemize{
+\describe{
  \item{"old"}{ data.frame older time point in a panel}
  \item{"new"} { data.frame more recent time point in a panel}
  \item{"time_var"}{ character(1) name of the time variable.}
@@ -80,7 +80,7 @@ data args
  }
 }
 mappings args
-\itemize{
+\describe{
  \item{"trans"}{ data.frame with 2 columns - mapping (transition) table -
   all categories for cat_var in old and new datasets have to be included.
   First column contains an old encoding and second a new one.
@@ -99,7 +99,7 @@ mappings args
  }
 }
 Optional ml args
-\itemize{
+\describe{
  \item{"data"}{ data.frame - dataset with features and the `cat_var`.}
  \item{"cat_var"}{ character(1) - the dependent variable name.}
  \item{"method"}{
diff --git a/man/prune_c2c.Rd b/man/prune_c2c.Rd
index d0e1fb1..217055b 100644
--- a/man/prune_c2c.Rd
+++ b/man/prune_c2c.Rd
@@ -37,7 +37,7 @@ created in the cat2cat procedure.
 }
 \details{
 method - specify a method to reduce number of replications
-\itemize{
+\describe{
  \item{"nonzero"}{ remove nonzero probabilities}
  \item{"highest"} {
  leave only highest probabilities for each subject- accepting ties
diff --git a/man/validate_cover_cats.Rd b/man/validate_cover_cats.Rd
index 843a9c2..ae54fb4 100644
--- a/man/validate_cover_cats.Rd
+++ b/man/validate_cover_cats.Rd
@@ -7,10 +7,10 @@
 validate_cover_cats(u_cats_target, mapp)
 }
 \arguments{
+\item{u_cats_target}{vector of unique target period categories}
+
 \item{mapp}{transition (mapping) table process with `get_mappings`,
 the "to_base" direction is taken.}
-
-\item{cats_target}{vector of unique target period categories}
 }
 \description{
 Validate if the trans table contains all proper mappings

From 202e72956b5d216912c05df63011f78c82a8af85 Mon Sep 17 00:00:00 2001
From: Maciej Nasinski <nasinski.maciej@gmail.com>
Date: Mon, 22 Jan 2024 19:02:13 +0100
Subject: [PATCH 6/6] latex related

---
 R/cat2cat.R       | 2 +-
 R/cat2cat_utils.R | 4 ++--
 man/cat2cat.Rd    | 2 +-
 man/prune_c2c.Rd  | 4 ++--
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/R/cat2cat.R b/R/cat2cat.R
index 2a0467f..8961fbc 100644
--- a/R/cat2cat.R
+++ b/R/cat2cat.R
@@ -26,7 +26,7 @@
 #' data args
 #' \describe{
 #'  \item{"old"}{ data.frame older time point in a panel}
-#'  \item{"new"} { data.frame more recent time point in a panel}
+#'  \item{"new"}{ data.frame more recent time point in a panel}
 #'  \item{"time_var"}{ character(1) name of the time variable.}
 #'  \item{"cat_var"}{ character(1) name of the categorical variable.}
 #'  \item{"cat_var_old"}{
diff --git a/R/cat2cat_utils.R b/R/cat2cat_utils.R
index ceea114..d50bc51 100644
--- a/R/cat2cat_utils.R
+++ b/R/cat2cat_utils.R
@@ -18,10 +18,10 @@
 #' method - specify a method to reduce number of replications
 #' \describe{
 #'  \item{"nonzero"}{ remove nonzero probabilities}
-#'  \item{"highest"} {
+#'  \item{"highest"}{
 #'  leave only highest probabilities for each subject- accepting ties
 #'  }
-#'  \item{"highest1"} {
+#'  \item{"highest1"}{
 #'  leave only highest probabilities for each subject -
 #'  not accepting ties so always one is returned
 #'  }
diff --git a/man/cat2cat.Rd b/man/cat2cat.Rd
index 25bd300..cf9afa4 100644
--- a/man/cat2cat.Rd
+++ b/man/cat2cat.Rd
@@ -55,7 +55,7 @@ of replications.
 data args
 \describe{
  \item{"old"}{ data.frame older time point in a panel}
- \item{"new"} { data.frame more recent time point in a panel}
+ \item{"new"}{ data.frame more recent time point in a panel}
  \item{"time_var"}{ character(1) name of the time variable.}
  \item{"cat_var"}{ character(1) name of the categorical variable.}
  \item{"cat_var_old"}{
diff --git a/man/prune_c2c.Rd b/man/prune_c2c.Rd
index 217055b..7cf521e 100644
--- a/man/prune_c2c.Rd
+++ b/man/prune_c2c.Rd
@@ -39,10 +39,10 @@ created in the cat2cat procedure.
 method - specify a method to reduce number of replications
 \describe{
  \item{"nonzero"}{ remove nonzero probabilities}
- \item{"highest"} {
+ \item{"highest"}{
  leave only highest probabilities for each subject- accepting ties
  }
- \item{"highest1"} {
+ \item{"highest1"}{
  leave only highest probabilities for each subject -
  not accepting ties so always one is returned
  }