From ac10a715fb388f46721e44f059f34c80ee30d398 Mon Sep 17 00:00:00 2001 From: emcfalls Date: Thu, 7 Dec 2023 14:48:28 -0500 Subject: [PATCH 01/12] updated util_corr_fit outcome --- R/util_corr_fit.R | 95 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 71 insertions(+), 24 deletions(-) diff --git a/R/util_corr_fit.R b/R/util_corr_fit.R index 5d0232a..dd0d954 100644 --- a/R/util_corr_fit.R +++ b/R/util_corr_fit.R @@ -31,9 +31,10 @@ util_corr_fit <- function(postsynth, data) { synthetic_data <- dplyr::select_if(synthetic_data, is.numeric) data <- dplyr::select_if(data, is.numeric) - # reorder data names + # reorder data names (this appears to check if the variables are the same) data <- dplyr::select(data, names(synthetic_data)) + # helper function to find a correlation matrix with the upper tri set to zeros lower_triangle <- function(x) { @@ -49,48 +50,94 @@ util_corr_fit <- function(postsynth, data) { return(correlation_matrix) } - # find the lower triangle of the original data linear correlation matrix - original_lt <- lower_triangle(data) + # find the lower triangle of the original data linear correlation matrix and return a df + original_lt <- data.frame(lower_triangle(data)) - # find the lower triangle of the synthetic data linear correlation matrix - synthetic_lt <- lower_triangle(synthetic_data) + # adding variable 2 column to the original df + original_lt$var2 <- colnames(original_lt) - # compare names - if (any(rownames(original_lt) != rownames(synthetic_lt))) { - stop("ERROR: rownames are not identical") - } + # restructuring the correlation matrix so the cols are var1, var2, original + original_lt <- original_lt %>% + pivot_longer(cols = !var2, names_to = "var1", values_to = "original") %>% + filter(!is.na(original)) %>% + arrange(var1) %>% + select(var1, var2, original) - if (any(colnames(original_lt) != colnames(synthetic_lt))) { - stop("ERROR: colnames are not identical") - } + # find the lower triangle of the synthetic data linear correlation matrix and return a df + synthetic_lt <- data.frame(lower_triangle(synthetic_data)) - # find the difference between the matrices - difference_lt <- synthetic_lt - original_lt + # adding variable 2 column to the synthetic df + synthetic_lt$var2 <- colnames(synthetic_lt) - # find the length of the nonzero values in the matrices - n <- choose(ncol(difference_lt), 2) + # restructuring the correlation matrix so the cols are var1, var2, synthetic + synthetic_lt <- synthetic_lt %>% + pivot_longer(cols = !var2, names_to = "var1", values_to = "synthetic") %>% + filter(!is.na(synthetic)) %>% + arrange(var1) %>% + select(var1, var2, synthetic) + # find the difference between the original correlations and the synthetic + correlation_data <- original_lt %>% + left_join(synthetic_lt, by = c("var1","var2")) %>% + mutate(difference = original - synthetic, + proportion_difference = .data$difference / .data$original) + + # find the length of the nonzero values in the matrices + n <- choose(ncol(correlation_data), 2) + # calculate the correlation fit and divide by n - correlation_fit <- sqrt(sum(difference_lt ^ 2, na.rm = TRUE)) / n + correlation_fit <- sqrt(sum(correlation_data$difference ^ 2, na.rm = TRUE)) / n - difference_vec <- as.numeric(difference_lt)[!is.na(difference_lt)] + difference_vec <- as.numeric(correlation_data$difference) # mean absolute error correlation_difference_mae <- difference_vec %>% abs() %>% mean() - + # root mean square error - correlation_difference_rmse <- - difference_vec ^ 2%>% + correlation_difference_rmse <- 
difference_vec ^ 2 %>% mean() %>% sqrt() + + + # compare names + # if (any(rownames(original_lt) != rownames(synthetic_lt))) { + # stop("ERROR: rownames are not identical") + # } + # + # if (any(colnames(original_lt) != colnames(synthetic_lt))) { + # stop("ERROR: colnames are not identical") + # } + # + # # find the difference between the matrices + # difference_lt <- synthetic_lt - original_lt + # + # # find the length of the nonzero values in the matrices + # n <- choose(ncol(difference_lt), 2) + + # calculate the correlation fit and divide by n + # correlation_fit <- sqrt(sum(difference_lt ^ 2, na.rm = TRUE)) / n + # + # difference_vec <- as.numeric(difference_lt)[!is.na(difference_lt)] + # + # # mean absolute error + # correlation_difference_mae <- difference_vec %>% + # abs() %>% + # mean() + # + # # root mean square error + # correlation_difference_rmse <- + # difference_vec ^ 2%>% + # mean() %>% + # sqrt() return( list( - correlation_original = original_lt, - correlation_synthetic = synthetic_lt, - correlation_difference = difference_lt, + correlation_data = correlation_data, + # correlation_original = original_lt, + # correlation_synthetic = synthetic_lt + # correlation_difference = difference_lt, correlation_fit = correlation_fit, correlation_difference_mae = correlation_difference_mae, correlation_difference_rmse = correlation_difference_rmse From ecf5cd2b0bf4dd3fd4e8b30d5c9045906af61fc9 Mon Sep 17 00:00:00 2001 From: emcfalls Date: Thu, 21 Dec 2023 15:26:47 -0500 Subject: [PATCH 02/12] added group_by param --- R/util_corr_fit.R | 14 +++++++++++--- man/util_corr_fit.Rd | 4 +++- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/R/util_corr_fit.R b/R/util_corr_fit.R index dd0d954..338da9e 100644 --- a/R/util_corr_fit.R +++ b/R/util_corr_fit.R @@ -2,6 +2,7 @@ #' #' @param postsynth A postsynth object from tidysynthesis or a tibble #' @param data an original (observed) data set. +#' @param group_by #' #' @return A `list` of fit metrics: #' - `correlation_original`: correlation matrix of the original data. 
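As an aside, a minimal standalone sketch of the rlang pattern that later commits in this series (PATCH 05/06 and PATCH 11) settle on for the new group_by argument: capture the unquoted argument with rlang::enquo() and branch on rlang::quo_is_null(), rather than relying on missing() or is.null(). The helper name demo_group_split() and the mtcars call are illustrative only and are not part of the package.

library(dplyr)
library(rlang)

demo_group_split <- function(data, group_by = NULL) {

  # capture the unquoted grouping argument without evaluating it
  group_quo <- rlang::enquo(group_by)

  # TRUE both when the caller omits group_by and when group_by = NULL is
  # passed explicitly, unlike missing()
  if (rlang::quo_is_null(group_quo)) {
    return("no grouping variable supplied")
  }

  # split the data into one subset per level of the grouping variable(s)
  data %>%
    dplyr::group_split(dplyr::across({{ group_by }}))
}

demo_group_split(mtcars)                  # "no grouping variable supplied"
demo_group_split(mtcars, group_by = cyl)  # list of three data subsets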
@@ -16,7 +17,7 @@ #' #' @export -util_corr_fit <- function(postsynth, data) { +util_corr_fit <- function(postsynth, data, group_by = NULL) { if (is_postsynth(postsynth)) { @@ -28,12 +29,19 @@ util_corr_fit <- function(postsynth, data) { } - synthetic_data <- dplyr::select_if(synthetic_data, is.numeric) - data <- dplyr::select_if(data, is.numeric) + synthetic_data <- dplyr::select_if(synthetic_data, is.numeric, {{ group_by }}) + data <- dplyr::select_if(data, is.numeric, {{ group_by }}) # reorder data names (this appears to check if the variables are the same) data <- dplyr::select(data, names(synthetic_data)) + if (!is.null({{ group_by }})){ + for(level in levels(data %>% select({{ group_by }}))){ + data_sub <- data %>% filter(group_by == level) + view(data_sub) + } + #df <- util_corr_fit(postsynth, data, group_by == NULL) + } # helper function to find a correlation matrix with the upper tri set to zeros lower_triangle <- function(x) { diff --git a/man/util_corr_fit.Rd b/man/util_corr_fit.Rd index 860a090..57126a9 100644 --- a/man/util_corr_fit.Rd +++ b/man/util_corr_fit.Rd @@ -4,12 +4,14 @@ \alias{util_corr_fit} \title{Calculate the correlation fit metric of a confidential data set.} \usage{ -util_corr_fit(postsynth, data) +util_corr_fit(postsynth, data, group_by = NULL) } \arguments{ \item{postsynth}{A postsynth object from tidysynthesis or a tibble} \item{data}{an original (observed) data set.} + +\item{group_by}{} } \value{ A \code{list} of fit metrics: From ea65fe424130274bb5b3534bae328b3590924ea9 Mon Sep 17 00:00:00 2001 From: emcfalls Date: Thu, 21 Dec 2023 17:22:00 -0500 Subject: [PATCH 03/12] added way to group by a variable for the data table --- R/util_corr_fit.R | 41 ++++++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/R/util_corr_fit.R b/R/util_corr_fit.R index 338da9e..c71da6a 100644 --- a/R/util_corr_fit.R +++ b/R/util_corr_fit.R @@ -2,7 +2,8 @@ #' #' @param postsynth A postsynth object from tidysynthesis or a tibble #' @param data an original (observed) data set. -#' @param group_by +#' @param group_var +#' @param level #' #' @return A `list` of fit metrics: #' - `correlation_original`: correlation matrix of the original data. 
@@ -17,7 +18,10 @@ #' #' @export -util_corr_fit <- function(postsynth, data, group_by = NULL) { +util_corr_fit <- function(postsynth, + data, + group_var = NULL, + level = NULL) { if (is_postsynth(postsynth)) { @@ -26,22 +30,32 @@ util_corr_fit <- function(postsynth, data, group_by = NULL) { } else { synthetic_data <- postsynth - } - synthetic_data <- dplyr::select_if(synthetic_data, is.numeric, {{ group_by }}) - data <- dplyr::select_if(data, is.numeric, {{ group_by }}) + + synthetic_data <- dplyr::select(synthetic_data, where(is.numeric), {{ group_var }}) + data <- dplyr::select(data, where(is.numeric), {{ group_var }}) # reorder data names (this appears to check if the variables are the same) data <- dplyr::select(data, names(synthetic_data)) - if (!is.null({{ group_by }})){ - for(level in levels(data %>% select({{ group_by }}))){ - data_sub <- data %>% filter(group_by == level) - view(data_sub) + # issue: if group_var = NULL is passed into the function, this runs + if(!missing(group_var)){ + + levels <- data %>% dplyr::distinct({{ group_var }}) %>% pull() + + correlation_data <- data.frame() + + for(level in levels) { + data_sub <- data %>% dplyr::filter({{ group_var }} == level) + + df <- util_corr_fit(postsynth = synthetic_data, data = data_sub, level = level)$correlation_data + + correlation_data <- dplyr::bind_rows(correlation_data, df) } - #df <- util_corr_fit(postsynth, data, group_by == NULL) - } + + return(correlation_data) + } # helper function to find a correlation matrix with the upper tri set to zeros lower_triangle <- function(x) { @@ -90,6 +104,11 @@ util_corr_fit <- function(postsynth, data, group_by = NULL) { mutate(difference = original - synthetic, proportion_difference = .data$difference / .data$original) + # add level (if level is not null) + if(!is.null(level)){ + correlation_data <- cbind(level, correlation_data) + } + # find the length of the nonzero values in the matrices n <- choose(ncol(correlation_data), 2) From 39cf538a8e52e7ffb68f66932053e72b077379a1 Mon Sep 17 00:00:00 2001 From: emcfalls Date: Fri, 29 Dec 2023 14:06:16 -0500 Subject: [PATCH 04/12] extending group_by function to include all outputs --- R/util_corr_fit.R | 105 +++++++++++++++++-------------------------- man/util_corr_fit.Rd | 6 ++- 2 files changed, 46 insertions(+), 65 deletions(-) diff --git a/R/util_corr_fit.R b/R/util_corr_fit.R index c71da6a..af6093b 100644 --- a/R/util_corr_fit.R +++ b/R/util_corr_fit.R @@ -2,8 +2,7 @@ #' #' @param postsynth A postsynth object from tidysynthesis or a tibble #' @param data an original (observed) data set. -#' @param group_var -#' @param level +#' @param group_by #' #' @return A `list` of fit metrics: #' - `correlation_original`: correlation matrix of the original data. 
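For orientation, a hypothetical call under the grouped interface documented here, using toy data shaped like the fixtures in tests/testthat/test-util_corr_fit.R. The conf and synth data frames are placeholders, the example assumes the in-development package is loaded (for example with devtools::load_all()), and later commits (PATCH 11/12) reorganize the scalar metrics into a single metrics tibble.

conf <- data.frame(a = c(1, 2, 3, 4, 5),
                   b = c(1, 2, 3, 4, 5),
                   c = c(1, 2, 3, 4, 5),
                   d = c("red", "red", "yellow", "yellow", "yellow"))

synth <- data.frame(a = c(1, 2, 3, 4, 5),
                    b = c(5, 4, 3, 2, 1),
                    c = c(1, 2, 3, 4, 5),
                    d = c("red", "red", "yellow", "yellow", "yellow"))

# with group_by, every element of the result gains one entry (or one set of
# rows) per level of d
res <- util_corr_fit(postsynth = synth, data = conf, group_by = d)

res$correlation_data             # var1, var2, original, synthetic,
                                 # difference, proportion_difference by group
res$correlation_fit              # one value per group ("red", "yellow")
res$correlation_difference_mae
res$correlation_difference_rmse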
@@ -20,8 +19,7 @@ util_corr_fit <- function(postsynth, data, - group_var = NULL, - level = NULL) { + group_by = NULL) { if (is_postsynth(postsynth)) { @@ -33,28 +31,48 @@ util_corr_fit <- function(postsynth, } - synthetic_data <- dplyr::select(synthetic_data, where(is.numeric), {{ group_var }}) - data <- dplyr::select(data, where(is.numeric), {{ group_var }}) + synthetic_data <- dplyr::select(synthetic_data, where(is.numeric), {{ group_by }}) + data <- dplyr::select(data, where(is.numeric), {{ group_by }}) # reorder data names (this appears to check if the variables are the same) data <- dplyr::select(data, names(synthetic_data)) - # issue: if group_var = NULL is passed into the function, this runs - if(!missing(group_var)){ - - levels <- data %>% dplyr::distinct({{ group_var }}) %>% pull() + # issue: if group_by = NULL is passed into the function, this runs + if(!missing(group_by)){ + + levels <- data %>% dplyr::distinct({{ group_by }}) %>% dplyr::pull() correlation_data <- data.frame() + correlation_fit = c() + correlation_difference_mae = c() + correlation_difference_rmse = c() for(level in levels) { - data_sub <- data %>% dplyr::filter({{ group_var }} == level) + data_sub <- data %>% dplyr::filter({{ group_by }} == level) + + # get the results for the subgroup/level + result <- util_corr_fit(postsynth = synthetic_data, data = data_sub) - df <- util_corr_fit(postsynth = synthetic_data, data = data_sub, level = level)$correlation_data + df <- result$correlation_data + fit <- result$correlation_fit + mae <- result$correlation_difference_mae + rmse <- result$correlation_difference_rmse - correlation_data <- dplyr::bind_rows(correlation_data, df) + # add the results to a growing list of results for each subgroup/level + correlation_data <- dplyr::bind_rows(correlation_data, cbind(level, df)) + correlation_fit = c(correlation_fit, fit) + correlation_difference_mae = c(correlation_difference_mae, mae) + correlation_difference_rmse = c(correlation_difference_rmse, rmse) } - return(correlation_data) + return( + list( + correlation_data = correlation_data, + correlation_fit = correlation_fit, + correlation_difference_mae = correlation_difference_mae, + correlation_difference_rmse = correlation_difference_rmse + ) + ) } # helper function to find a correlation matrix with the upper tri set to zeros @@ -80,10 +98,10 @@ util_corr_fit <- function(postsynth, # restructuring the correlation matrix so the cols are var1, var2, original original_lt <- original_lt %>% - pivot_longer(cols = !var2, names_to = "var1", values_to = "original") %>% - filter(!is.na(original)) %>% - arrange(var1) %>% - select(var1, var2, original) + tidyr::pivot_longer(cols = !var2, names_to = "var1", values_to = "original") %>% + dplyr::filter(!is.na(original)) %>% + dplyr::arrange(var1) %>% + dplyr::select(var1, var2, original) # find the lower triangle of the synthetic data linear correlation matrix and return a df synthetic_lt <- data.frame(lower_triangle(synthetic_data)) @@ -93,22 +111,17 @@ util_corr_fit <- function(postsynth, # restructuring the correlation matrix so the cols are var1, var2, synthetic synthetic_lt <- synthetic_lt %>% - pivot_longer(cols = !var2, names_to = "var1", values_to = "synthetic") %>% - filter(!is.na(synthetic)) %>% - arrange(var1) %>% - select(var1, var2, synthetic) + tidyr::pivot_longer(cols = !var2, names_to = "var1", values_to = "synthetic") %>% + dplyr::filter(!is.na(synthetic)) %>% + dplyr::arrange(var1) %>% + dplyr::select(var1, var2, synthetic) # find the difference between the original 
correlations and the synthetic correlation_data <- original_lt %>% - left_join(synthetic_lt, by = c("var1","var2")) %>% - mutate(difference = original - synthetic, + dplyr::left_join(synthetic_lt, by = c("var1","var2")) %>% + dplyr::mutate(difference = original - synthetic, proportion_difference = .data$difference / .data$original) - # add level (if level is not null) - if(!is.null(level)){ - correlation_data <- cbind(level, correlation_data) - } - # find the length of the nonzero values in the matrices n <- choose(ncol(correlation_data), 2) @@ -126,45 +139,11 @@ util_corr_fit <- function(postsynth, correlation_difference_rmse <- difference_vec ^ 2 %>% mean() %>% sqrt() - - # compare names - # if (any(rownames(original_lt) != rownames(synthetic_lt))) { - # stop("ERROR: rownames are not identical") - # } - # - # if (any(colnames(original_lt) != colnames(synthetic_lt))) { - # stop("ERROR: colnames are not identical") - # } - # - # # find the difference between the matrices - # difference_lt <- synthetic_lt - original_lt - # - # # find the length of the nonzero values in the matrices - # n <- choose(ncol(difference_lt), 2) - - # calculate the correlation fit and divide by n - # correlation_fit <- sqrt(sum(difference_lt ^ 2, na.rm = TRUE)) / n - # - # difference_vec <- as.numeric(difference_lt)[!is.na(difference_lt)] - # - # # mean absolute error - # correlation_difference_mae <- difference_vec %>% - # abs() %>% - # mean() - # - # # root mean square error - # correlation_difference_rmse <- - # difference_vec ^ 2%>% - # mean() %>% - # sqrt() return( list( correlation_data = correlation_data, - # correlation_original = original_lt, - # correlation_synthetic = synthetic_lt - # correlation_difference = difference_lt, correlation_fit = correlation_fit, correlation_difference_mae = correlation_difference_mae, correlation_difference_rmse = correlation_difference_rmse diff --git a/man/util_corr_fit.Rd b/man/util_corr_fit.Rd index 57126a9..5be7e4e 100644 --- a/man/util_corr_fit.Rd +++ b/man/util_corr_fit.Rd @@ -4,14 +4,16 @@ \alias{util_corr_fit} \title{Calculate the correlation fit metric of a confidential data set.} \usage{ -util_corr_fit(postsynth, data, group_by = NULL) +util_corr_fit(postsynth, data, group_var = NULL, level = NULL) } \arguments{ \item{postsynth}{A postsynth object from tidysynthesis or a tibble} \item{data}{an original (observed) data set.} -\item{group_by}{} +\item{group_var}{} + +\item{level}{} } \value{ A \code{list} of fit metrics: From 543d063e04827292f44f33b2f6f23de2e2082b4d Mon Sep 17 00:00:00 2001 From: emcfalls Date: Fri, 29 Dec 2023 14:19:24 -0500 Subject: [PATCH 05/12] fixed is.null() issue --- R/util_corr_fit.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/util_corr_fit.R b/R/util_corr_fit.R index af6093b..bb46a67 100644 --- a/R/util_corr_fit.R +++ b/R/util_corr_fit.R @@ -37,8 +37,7 @@ util_corr_fit <- function(postsynth, # reorder data names (this appears to check if the variables are the same) data <- dplyr::select(data, names(synthetic_data)) - # issue: if group_by = NULL is passed into the function, this runs - if(!missing(group_by)){ + if(!rlang::quo_is_null(enquo(group_by))){ levels <- data %>% dplyr::distinct({{ group_by }}) %>% dplyr::pull() From 731cb64eeab772cfb677616bb950e1618775deb0 Mon Sep 17 00:00:00 2001 From: emcfalls Date: Mon, 1 Jan 2024 19:20:12 -0500 Subject: [PATCH 06/12] changed param to group_by --- R/util_corr_fit.R | 4 ++-- man/util_corr_fit.Rd | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git 
a/R/util_corr_fit.R b/R/util_corr_fit.R index bb46a67..5701592 100644 --- a/R/util_corr_fit.R +++ b/R/util_corr_fit.R @@ -37,7 +37,7 @@ util_corr_fit <- function(postsynth, # reorder data names (this appears to check if the variables are the same) data <- dplyr::select(data, names(synthetic_data)) - if(!rlang::quo_is_null(enquo(group_by))){ + if(!rlang::quo_is_null(rlang::enquo(group_by))){ levels <- data %>% dplyr::distinct({{ group_by }}) %>% dplyr::pull() @@ -115,7 +115,7 @@ util_corr_fit <- function(postsynth, dplyr::arrange(var1) %>% dplyr::select(var1, var2, synthetic) - # find the difference between the original correlations and the synthetic + # combining the data and finding the difference between the original and synthetic correlations correlation_data <- original_lt %>% dplyr::left_join(synthetic_lt, by = c("var1","var2")) %>% dplyr::mutate(difference = original - synthetic, diff --git a/man/util_corr_fit.Rd b/man/util_corr_fit.Rd index 5be7e4e..57126a9 100644 --- a/man/util_corr_fit.Rd +++ b/man/util_corr_fit.Rd @@ -4,16 +4,14 @@ \alias{util_corr_fit} \title{Calculate the correlation fit metric of a confidential data set.} \usage{ -util_corr_fit(postsynth, data, group_var = NULL, level = NULL) +util_corr_fit(postsynth, data, group_by = NULL) } \arguments{ \item{postsynth}{A postsynth object from tidysynthesis or a tibble} \item{data}{an original (observed) data set.} -\item{group_var}{} - -\item{level}{} +\item{group_by}{} } \value{ A \code{list} of fit metrics: From e8a292fa2e8a5eb668b00f31d1547addc6a6ff16 Mon Sep 17 00:00:00 2001 From: emcfalls Date: Fri, 5 Jan 2024 16:57:18 -0500 Subject: [PATCH 07/12] allowing the function to group by multiple variables, issue with lower_triangle function --- R/util_corr_fit.R | 105 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 83 insertions(+), 22 deletions(-) diff --git a/R/util_corr_fit.R b/R/util_corr_fit.R index 5701592..588003c 100644 --- a/R/util_corr_fit.R +++ b/R/util_corr_fit.R @@ -36,33 +36,55 @@ util_corr_fit <- function(postsynth, # reorder data names (this appears to check if the variables are the same) data <- dplyr::select(data, names(synthetic_data)) - - if(!rlang::quo_is_null(rlang::enquo(group_by))){ + + # check if a group_by variable was passed + if (!rlang::quo_is_null(rlang::enquo(group_by))) { - levels <- data %>% dplyr::distinct({{ group_by }}) %>% dplyr::pull() + # get all of the level combinations from each group by variable + levels <- data %>% + dplyr::select({{ group_by }}) %>% + expand.grid() %>% + unique() + # initialize return values correlation_data <- data.frame() correlation_fit = c() correlation_difference_mae = c() correlation_difference_rmse = c() - - for(level in levels) { - data_sub <- data %>% dplyr::filter({{ group_by }} == level) - - # get the results for the subgroup/level - result <- util_corr_fit(postsynth = synthetic_data, data = data_sub) - - df <- result$correlation_data - fit <- result$correlation_fit - mae <- result$correlation_difference_mae - rmse <- result$correlation_difference_rmse - - # add the results to a growing list of results for each subgroup/level - correlation_data <- dplyr::bind_rows(correlation_data, cbind(level, df)) - correlation_fit = c(correlation_fit, fit) - correlation_difference_mae = c(correlation_difference_mae, mae) - correlation_difference_rmse = c(correlation_difference_rmse, rmse) - } + + # get a subset of the data + for (level in 1:nrow(levels)) { + + data_sub <- data + + for (i in 1:length(colnames(levels))){ + col <- colnames(levels)[i] + 
value <- levels[level,i] + + data_sub <- data_sub %>% filter(!!rlang::sym(col) == value) + } + + + # get the results for the subgroup/level + result <- util_corr_fit(postsynth = synthetic_data, data = data_sub) + + df <- result$correlation_data + fit <- result$correlation_fit + mae <- result$correlation_difference_mae + rmse <- result$correlation_difference_rmse + + #view(data_sub) + + # add the results to a growing list of results for each subgroup/level + correlation_data <- dplyr::bind_rows(correlation_data, cbind("level_add_later", df)) + correlation_fit = c(correlation_fit, fit) + correlation_difference_mae = c(correlation_difference_mae, mae) + correlation_difference_rmse = c(correlation_difference_rmse, rmse) + + #view(correlation_data) + + stop() + } return( list( @@ -70,8 +92,43 @@ util_corr_fit <- function(postsynth, correlation_fit = correlation_fit, correlation_difference_mae = correlation_difference_mae, correlation_difference_rmse = correlation_difference_rmse - ) + ) ) + + + #levels <- data %>% dplyr::distinct({{ group_by }}) %>% dplyr::pull() + + # correlation_data <- data.frame() + # correlation_fit = c() + # correlation_difference_mae = c() + # correlation_difference_rmse = c() + # + # for(level in levels) { + # data_sub <- data %>% dplyr::filter({{ group_by }} == level) + # + # # get the results for the subgroup/level + # result <- util_corr_fit(postsynth = synthetic_data, data = data_sub) + # + # df <- result$correlation_data + # fit <- result$correlation_fit + # mae <- result$correlation_difference_mae + # rmse <- result$correlation_difference_rmse + # + # # add the results to a growing list of results for each subgroup/level + # correlation_data <- dplyr::bind_rows(correlation_data, cbind(level, df)) + # correlation_fit = c(correlation_fit, fit) + # correlation_difference_mae = c(correlation_difference_mae, mae) + # correlation_difference_rmse = c(correlation_difference_rmse, rmse) + # } + # + # return( + # list( + # correlation_data = correlation_data, + # correlation_fit = correlation_fit, + # correlation_difference_mae = correlation_difference_mae, + # correlation_difference_rmse = correlation_difference_rmse + # ) + # ) } # helper function to find a correlation matrix with the upper tri set to zeros @@ -86,9 +143,13 @@ util_corr_fit <- function(postsynth, # set the values in the upper triangle to zero to avoid double counting correlation_matrix[upper.tri(correlation_matrix, diag = TRUE)] <- NA + # view(correlation_matrix) + return(correlation_matrix) } + # view(lower_triangle(data)) + # find the lower triangle of the original data linear correlation matrix and return a df original_lt <- data.frame(lower_triangle(data)) From 6423b3a1673981e5f0a1e10f1ea6757933a8bdd4 Mon Sep 17 00:00:00 2001 From: emcfalls Date: Tue, 9 Jan 2024 09:11:22 -0500 Subject: [PATCH 08/12] finished adding group_by param --- R/util_corr_fit.R | 100 +++++++++++++++------------------------------- 1 file changed, 32 insertions(+), 68 deletions(-) diff --git a/R/util_corr_fit.R b/R/util_corr_fit.R index 588003c..4f69125 100644 --- a/R/util_corr_fit.R +++ b/R/util_corr_fit.R @@ -2,16 +2,18 @@ #' #' @param postsynth A postsynth object from tidysynthesis or a tibble #' @param data an original (observed) data set. -#' @param group_by +#' @param group_by The unquoted name of a (or multiple) grouping variable(s) #' #' @return A `list` of fit metrics: -#' - `correlation_original`: correlation matrix of the original data. -#' - `correlation_synthetic`: correlation matrix of the synthetic data. 
-#' - `correlation_difference`: difference between `correlation_synthetic` and -#' `correlation_original`. +#' - `correlation_data`: A `tibble` of the correlations among the +#' numeric variables for the actual and synthetic data #' - `correlation_fit`: square root of the sum of squared differences between -#' `correlation_synthetic` and `correlation_original`, divided by the number of +#' the synthetic and original data, divided by the number of #' cells in the correlation matrix. +#' - `correlation_difference_mae`: the mean of the absolute correlation +#' differences between the actual and synthetic data +#' - `correlation_difference_rmse`: the root mean of the squared correlation +#' differences between the actual and synthetic data #' #' @family utility functions #' @@ -41,10 +43,9 @@ util_corr_fit <- function(postsynth, if (!rlang::quo_is_null(rlang::enquo(group_by))) { # get all of the level combinations from each group by variable - levels <- data %>% - dplyr::select({{ group_by }}) %>% - expand.grid() %>% - unique() + data_levels <- data %>% dplyr::select({{ group_by }}) + syn_levels <- synthetic_data %>% dplyr::select({{ group_by }}) + combined_levels <- rbind(data_levels, syn_levels) %>% unique() # initialize return values correlation_data <- data.frame() @@ -53,37 +54,35 @@ util_corr_fit <- function(postsynth, correlation_difference_rmse = c() # get a subset of the data - for (level in 1:nrow(levels)) { + for (i in 1:nrow(combined_levels)) { data_sub <- data + syn_sub <- synthetic_data + data_levels <- data.frame() + current_level <- combined_levels[i,] - for (i in 1:length(colnames(levels))){ - col <- colnames(levels)[i] - value <- levels[level,i] + for (j in 1:length(colnames(combined_levels))){ + col <- colnames(combined_levels)[j] + value <- combined_levels[i,j] + data_levels <- combined_levels[i,j] data_sub <- data_sub %>% filter(!!rlang::sym(col) == value) + syn_sub <- syn_sub %>% filter(!!rlang::sym(col) == value) } - # get the results for the subgroup/level - result <- util_corr_fit(postsynth = synthetic_data, data = data_sub) + result <- util_corr_fit(postsynth = syn_sub, data = data_sub) df <- result$correlation_data fit <- result$correlation_fit mae <- result$correlation_difference_mae rmse <- result$correlation_difference_rmse - #view(data_sub) - # add the results to a growing list of results for each subgroup/level - correlation_data <- dplyr::bind_rows(correlation_data, cbind("level_add_later", df)) + correlation_data <- dplyr::bind_rows(correlation_data, cbind(current_level, df, row.names = NULL)) correlation_fit = c(correlation_fit, fit) correlation_difference_mae = c(correlation_difference_mae, mae) correlation_difference_rmse = c(correlation_difference_rmse, rmse) - - #view(correlation_data) - - stop() } return( @@ -94,41 +93,6 @@ util_corr_fit <- function(postsynth, correlation_difference_rmse = correlation_difference_rmse ) ) - - - #levels <- data %>% dplyr::distinct({{ group_by }}) %>% dplyr::pull() - - # correlation_data <- data.frame() - # correlation_fit = c() - # correlation_difference_mae = c() - # correlation_difference_rmse = c() - # - # for(level in levels) { - # data_sub <- data %>% dplyr::filter({{ group_by }} == level) - # - # # get the results for the subgroup/level - # result <- util_corr_fit(postsynth = synthetic_data, data = data_sub) - # - # df <- result$correlation_data - # fit <- result$correlation_fit - # mae <- result$correlation_difference_mae - # rmse <- result$correlation_difference_rmse - # - # # add the results to a growing list of 
results for each subgroup/level - # correlation_data <- dplyr::bind_rows(correlation_data, cbind(level, df)) - # correlation_fit = c(correlation_fit, fit) - # correlation_difference_mae = c(correlation_difference_mae, mae) - # correlation_difference_rmse = c(correlation_difference_rmse, rmse) - # } - # - # return( - # list( - # correlation_data = correlation_data, - # correlation_fit = correlation_fit, - # correlation_difference_mae = correlation_difference_mae, - # correlation_difference_rmse = correlation_difference_rmse - # ) - # ) } # helper function to find a correlation matrix with the upper tri set to zeros @@ -140,16 +104,13 @@ util_corr_fit <- function(postsynth, dplyr::select_if(is.numeric) %>% stats::cor() - # set the values in the upper triangle to zero to avoid double counting + # set NA values in the lower triangle to "", set the values in the upper triangle to zero to avoid double counting + correlation_matrix[is.na(correlation_matrix[lower.tri(correlation_matrix, diag = FALSE)])] <- "" correlation_matrix[upper.tri(correlation_matrix, diag = TRUE)] <- NA - - # view(correlation_matrix) - + return(correlation_matrix) } - # view(lower_triangle(data)) - # find the lower triangle of the original data linear correlation matrix and return a df original_lt <- data.frame(lower_triangle(data)) @@ -161,7 +122,8 @@ util_corr_fit <- function(postsynth, tidyr::pivot_longer(cols = !var2, names_to = "var1", values_to = "original") %>% dplyr::filter(!is.na(original)) %>% dplyr::arrange(var1) %>% - dplyr::select(var1, var2, original) + dplyr::select(var1, var2, original) %>% + dplyr::mutate(original = case_when(original == "" ~ NA, .default = original)) # find the lower triangle of the synthetic data linear correlation matrix and return a df synthetic_lt <- data.frame(lower_triangle(synthetic_data)) @@ -174,16 +136,18 @@ util_corr_fit <- function(postsynth, tidyr::pivot_longer(cols = !var2, names_to = "var1", values_to = "synthetic") %>% dplyr::filter(!is.na(synthetic)) %>% dplyr::arrange(var1) %>% - dplyr::select(var1, var2, synthetic) + dplyr::select(var1, var2, synthetic) %>% + dplyr::mutate(synthetic = case_when(synthetic == "" ~ NA, .default = synthetic)) # combining the data and finding the difference between the original and synthetic correlations correlation_data <- original_lt %>% dplyr::left_join(synthetic_lt, by = c("var1","var2")) %>% - dplyr::mutate(difference = original - synthetic, - proportion_difference = .data$difference / .data$original) + dplyr::mutate(difference = as.numeric(.data$original) - as.numeric(.data$synthetic), + proportion_difference = as.numeric(.data$difference) / as.numeric(.data$original)) # find the length of the nonzero values in the matrices n <- choose(ncol(correlation_data), 2) + print(n) # calculate the correlation fit and divide by n correlation_fit <- sqrt(sum(correlation_data$difference ^ 2, na.rm = TRUE)) / n From b2cccd6db3997f73518a6f289413e561b6ab6cf2 Mon Sep 17 00:00:00 2001 From: emcfalls Date: Wed, 10 Jan 2024 16:29:11 -0500 Subject: [PATCH 09/12] cleaned up function slightly, added tests for util_corr_fit() --- R/util_corr_fit.R | 30 ++--- tests/testthat/test-util_corr_fit.R | 199 +++++++++++++++++++++++----- 2 files changed, 183 insertions(+), 46 deletions(-) diff --git a/R/util_corr_fit.R b/R/util_corr_fit.R index 4f69125..d76a595 100644 --- a/R/util_corr_fit.R +++ b/R/util_corr_fit.R @@ -31,10 +31,9 @@ util_corr_fit <- function(postsynth, synthetic_data <- postsynth } - - synthetic_data <- dplyr::select(synthetic_data, where(is.numeric), 
{{ group_by }}) - data <- dplyr::select(data, where(is.numeric), {{ group_by }}) + synthetic_data <- dplyr::select(synthetic_data, dplyr::where(is.numeric), {{ group_by }}) + data <- dplyr::select(data, dplyr::where(is.numeric), {{ group_by }}) # reorder data names (this appears to check if the variables are the same) data <- dplyr::select(data, names(synthetic_data)) @@ -43,9 +42,9 @@ util_corr_fit <- function(postsynth, if (!rlang::quo_is_null(rlang::enquo(group_by))) { # get all of the level combinations from each group by variable - data_levels <- data %>% dplyr::select({{ group_by }}) - syn_levels <- synthetic_data %>% dplyr::select({{ group_by }}) - combined_levels <- rbind(data_levels, syn_levels) %>% unique() + combined_levels <- rbind(data, synthetic_data) %>% + dplyr::select({{ group_by }}) %>% + unique() # initialize return values correlation_data <- data.frame() @@ -58,16 +57,16 @@ util_corr_fit <- function(postsynth, data_sub <- data syn_sub <- synthetic_data - data_levels <- data.frame() - current_level <- combined_levels[i,] + + current_level <- data.frame(combined_levels[i,]) + colnames(current_level) <- colnames(combined_levels) for (j in 1:length(colnames(combined_levels))){ col <- colnames(combined_levels)[j] value <- combined_levels[i,j] - - data_levels <- combined_levels[i,j] - data_sub <- data_sub %>% filter(!!rlang::sym(col) == value) - syn_sub <- syn_sub %>% filter(!!rlang::sym(col) == value) + + data_sub <- data_sub %>% dplyr::filter(!!rlang::sym(col) == value) + syn_sub <- syn_sub %>% dplyr::filter(!!rlang::sym(col) == value) } # get the results for the subgroup/level @@ -123,7 +122,7 @@ util_corr_fit <- function(postsynth, dplyr::filter(!is.na(original)) %>% dplyr::arrange(var1) %>% dplyr::select(var1, var2, original) %>% - dplyr::mutate(original = case_when(original == "" ~ NA, .default = original)) + dplyr::mutate(original = dplyr::case_when(original == "" ~ NA, .default = original)) # find the lower triangle of the synthetic data linear correlation matrix and return a df synthetic_lt <- data.frame(lower_triangle(synthetic_data)) @@ -137,7 +136,7 @@ util_corr_fit <- function(postsynth, dplyr::filter(!is.na(synthetic)) %>% dplyr::arrange(var1) %>% dplyr::select(var1, var2, synthetic) %>% - dplyr::mutate(synthetic = case_when(synthetic == "" ~ NA, .default = synthetic)) + dplyr::mutate(synthetic = dplyr::case_when(synthetic == "" ~ NA, .default = synthetic)) # combining the data and finding the difference between the original and synthetic correlations correlation_data <- original_lt %>% @@ -146,8 +145,7 @@ util_corr_fit <- function(postsynth, proportion_difference = as.numeric(.data$difference) / as.numeric(.data$original)) # find the length of the nonzero values in the matrices - n <- choose(ncol(correlation_data), 2) - print(n) + n <- nrow(dplyr::filter(correlation_data, !is.na(difference))) # calculate the correlation fit and divide by n correlation_fit <- sqrt(sum(correlation_data$difference ^ 2, na.rm = TRUE)) / n diff --git a/tests/testthat/test-util_corr_fit.R b/tests/testthat/test-util_corr_fit.R index 4ff40c8..fbcc380 100644 --- a/tests/testthat/test-util_corr_fit.R +++ b/tests/testthat/test-util_corr_fit.R @@ -1,50 +1,189 @@ # confidential data -df <- data.frame(a = c(1, 2, 3), - b = c(1, 2, 3), - c = c(1, 2, 3), - RECID = c("a", "b", "c")) - -# difference matrix for tests -diff_matrix <- matrix( - c(NA, NA, NA, - -2, NA, NA, - 0, -2, NA), - ncol = 3, - byrow = TRUE -) - -rownames(diff_matrix) <- c("a", "c", "b") -colnames(diff_matrix) <- c("a", "c", 
"b") +df <- data.frame(a = c(1, 2, 3, 4, 5), + b = c(1, 2, 3, 4, 5), + c = c(1, 2, 3, 4, 5), + d = c("red", "red", "yellow", "yellow", "yellow"), + e = c("blue", "blue", "green", "green", "green")) + +syn <- list(synthetic_data = data.frame(a = c(1, 2, 3, 4, 5), + b = c(5, 4, 3, 2, 1), + c = c(1, 2, 3, 4, 5), + d = c("red", "red", "yellow", "yellow", "yellow"), + e = c("blue", "blue", "green", "green", "green"))) %>% + structure(class = "postsynth") # test with postsynth test_that("util_corr_fit is correct with postsynth ", { - syn <- list(synthetic_data = data.frame(a = c(1, 2, 3), - c = c(3, 2, 1), - b = c(1, 2, 3), - RECID = c("a", "b", "c"))) %>% - structure(class = "postsynth") - corr <- util_corr_fit(postsynth = syn, data = df) + corr_data <- corr$correlation_data + + # check dimensions + expect_equal(ncol(corr_data), 6) + expect_equal(nrow(corr_data), 3) - expect_equal(corr$correlation_difference, diff_matrix) + expect_equal(corr_data$difference, c(2, 0, 2)) expect_equal(corr$correlation_fit, sqrt(sum(c(0, -2, -2) ^ 2)) / 3) expect_equal(corr$correlation_difference_mae, mean(abs(c(0, -2, -2)))) expect_equal(corr$correlation_difference_rmse, sqrt(mean(c(0, -2, -2) ^ 2))) + }) # test with data test_that("util_corr_fit is correct with postsynth ", { - syn <- data.frame(a = c(1, 2, 3), - c = c(3, 2, 1), - b = c(1, 2, 3), - RECID = c("a", "b", "c")) + syn_data <- data.frame(a = c(1, 2, 3, 4, 5), + b = c(5, 4, 3, 2, 1), + c = c(1, 2, 3, 4, 5), + d = c("red", "red", "yellow", "yellow", "yellow"), + e = c("blue", "blue", "green", "green", "green"), + RECID = c("a", "b", "c", "d", "e")) - corr <- util_corr_fit(postsynth = syn, data = df) - - expect_equal(corr$correlation_difference, diff_matrix) + corr <- util_corr_fit(postsynth = syn_data, data = df) + corr_data <- corr$correlation_data + + # check dimensions + expect_equal(ncol(corr_data), 6) + expect_equal(nrow(corr_data), 3) + + expect_equal(corr_data$difference, c(2, 0, 2)) expect_equal(corr$correlation_fit, sqrt(sum(c(0, -2, -2) ^ 2)) / 3) expect_equal(corr$correlation_difference_mae, mean(abs(c(0, -2, -2)))) expect_equal(corr$correlation_difference_rmse, sqrt(mean(c(0, -2, -2) ^ 2))) + +}) + + + +test_that("util_corr_fit is correct when groupped by one variable ", { + + df <- data.frame(a = c(1, 2, 3, 4, 5), + b = c(1, 2, 3, 4, 5), + c = c(1, 2, 3, 4, 5), + d = c("red", "red", "yellow", "yellow", "yellow"), + e = c("blue", "blue", "green", "green", "green")) + + syn <- list(synthetic_data = data.frame(a = c(1, 2, 3, 4, 5), + b = c(5, 4, 3, 2, 1), + c = c(1, 2, 3, 4, 5), + d = c("red", "red", "yellow", "yellow", "yellow"), + e = c("blue", "blue", "green", "green", "green"))) %>% + structure(class = "postsynth") + + corr <- util_corr_fit(postsynth = syn, data = df, group_by = d) + corr_data <- corr$correlation_data + + # check dimensions + expect_equal(ncol(corr_data), 7) + expect_equal(nrow(corr_data), 6) + + expect_equal(corr_data$difference, c(2, 0, 2, 2, 0, 2)) + expect_equal(corr$correlation_fit, rep(sqrt(sum(c(0, -2, -2) ^ 2)) / 3, 2)) + expect_equal(corr$correlation_difference_mae, rep(mean(abs(c(0, -2, -2))), 2)) + expect_equal(corr$correlation_difference_rmse, rep(sqrt(mean(c(0, -2, -2) ^ 2)), 2)) + +}) + + +test_that("util_corr_fit is correct when groupped by more than one variable ", { + + corr <- util_corr_fit(postsynth = syn, data = df, group_by = c(d, e)) + corr_data <- corr$correlation_data + + # check dimensions + expect_equal(ncol(corr_data), 8) + expect_equal(nrow(corr_data), 6) + + expect_equal(corr_data$difference, 
c(2, 0, 2, 2, 0, 2)) + expect_equal(corr$correlation_fit, rep(sqrt(sum(c(0, -2, -2) ^ 2)) / 3, 2)) + expect_equal(corr$correlation_difference_mae, rep(mean(abs(c(0, -2, -2))), 2)) + expect_equal(corr$correlation_difference_rmse, rep(sqrt(mean(c(0, -2, -2) ^ 2)), 2)) + +}) + + +test_that("util_corr_fit returns NAs when only one observation is in a grouping ", { + + df <- data.frame(a = c(1, 2, 3, 4, 5), + b = c(1, 2, 3, 4, 5), + c = c(1, 2, 3, 4, 5), + d = c("red", "red", "yellow", "yellow", "red"), + e = c("blue", "blue", "green", "green", "green"), + RECID = c("a", "b", "c", "d", "e")) + + syn <- data.frame(a = c(1, 2, 3, 4, 5), + b = c(5, 4, 3, 2, 1), + c = c(1, 2, 3, 4, 5), + d = c("red", "red", "yellow", "yellow", "red"), + e = c("blue", "blue", "green", "green", "green"), + RECID = c("a", "b", "c", "d", "e")) + + corr <- util_corr_fit(postsynth = syn, data = df, group_by = c(d, e)) + corr_data <- corr$correlation_data + + expect_equal(corr_data$difference, c(2, 0, 2, 2, 0, 2, NA, NA, NA)) + expect_equal(corr$correlation_fit, c(rep(sqrt(sum(c(0, -2, -2) ^ 2)) / 3, 2), NaN)) + expect_equal(corr$correlation_difference_mae, c(rep(mean(abs(c(0, -2, -2))), 2), NA)) + expect_equal(corr$correlation_difference_rmse, c(rep(sqrt(mean(c(0, -2, -2) ^ 2)), 2), NA)) + }) + + + +test_that("util_corr_fit returns NAs when a grouping in the actual data is not in the synthetic data", { + + df <- data.frame(a = c(1, 2, 3, 4, 5), + b = c(1, 2, 3, 4, 5), + c = c(1, 2, 3, 4, 5), + d = c("red", "red", "yellow", "yellow", "red"), + e = c("blue", "blue", "green", "green", "green"), + RECID = c("a", "b", "c", "d", "e")) + + syn <- data.frame(a = c(1, 2, 3, 4, 5), + b = c(5, 4, 3, 2, 1), + c = c(1, 2, 3, 4, 5), + d = c("red", "red", "yellow", "yellow", "yellow"), + e = c("blue", "blue", "green", "green", "green"), + RECID = c("a", "b", "c", "d", "e")) + + corr <- util_corr_fit(postsynth = syn, data = df, group_by = c(d, e)) + corr_data <- corr$correlation_data + + expect_equal(corr_data$difference, c(2, 0, 2, 2, 0, 2, NA, NA, NA)) + expect_equal(corr$correlation_fit, c(rep(sqrt(sum(c(0, -2, -2) ^ 2)) / 3, 2), NaN)) + expect_equal(corr$correlation_difference_mae, c(rep(mean(abs(c(0, -2, -2))), 2), NA)) + expect_equal(corr$correlation_difference_rmse, c(rep(sqrt(mean(c(0, -2, -2) ^ 2)), 2), NA)) + +}) + + +test_that("util_corr_fit returns NAs when a grouping in the synthetic data is not in the actual data", { + + df <- data.frame(a = c(1, 2, 3, 4, 5), + b = c(1, 2, 3, 4, 5), + c = c(1, 2, 3, 4, 5), + d = c("red", "red", "yellow", "yellow", "yellow"), + e = c("blue", "blue", "green", "green", "green"), + RECID = c("a", "b", "c", "d", "e")) + + syn <- data.frame(a = c(1, 2, 3, 4, 5), + b = c(5, 4, 3, 2, 1), + c = c(1, 2, 3, 4, 5), + d = c("red", "red", "yellow", "yellow", "red"), + e = c("blue", "blue", "green", "green", "green"), + RECID = c("a", "b", "c", "d", "e")) + + corr <- util_corr_fit(postsynth = syn, data = df, group_by = c(d, e)) + corr_data <- corr$correlation_data + + expect_equal(corr_data$difference, c(2, 0, 2, 2, 0, 2, NA, NA, NA)) + expect_equal(corr$correlation_fit, c(rep(sqrt(sum(c(0, -2, -2) ^ 2)) / 3, 2), NaN)) + expect_equal(corr$correlation_difference_mae, c(rep(mean(abs(c(0, -2, -2))), 2), NA)) + expect_equal(corr$correlation_difference_rmse, c(rep(sqrt(mean(c(0, -2, -2) ^ 2)), 2), NA)) + +}) + + + + + From 7a57123078ffe67e1a601ab280aac449f3381a13 Mon Sep 17 00:00:00 2001 From: emcfalls Date: Tue, 16 Jan 2024 10:07:22 -0500 Subject: [PATCH 10/12] finished formatting code and testing 
util_corr_fit --- R/util_corr_fit.R | 40 ++++++++++++++--------------- man/util_corr_fit.Rd | 14 +++++----- tests/testthat/test-util_corr_fit.R | 1 - 3 files changed, 28 insertions(+), 27 deletions(-) diff --git a/R/util_corr_fit.R b/R/util_corr_fit.R index d76a595..0b0626d 100644 --- a/R/util_corr_fit.R +++ b/R/util_corr_fit.R @@ -65,8 +65,8 @@ util_corr_fit <- function(postsynth, col <- colnames(combined_levels)[j] value <- combined_levels[i,j] - data_sub <- data_sub %>% dplyr::filter(!!rlang::sym(col) == value) - syn_sub <- syn_sub %>% dplyr::filter(!!rlang::sym(col) == value) + data_sub <- dplyr::filter(data_sub, !!rlang::sym(col) == value) + syn_sub <- dplyr::filter(syn_sub, !!rlang::sym(col) == value) } # get the results for the subgroup/level @@ -78,7 +78,9 @@ util_corr_fit <- function(postsynth, rmse <- result$correlation_difference_rmse # add the results to a growing list of results for each subgroup/level - correlation_data <- dplyr::bind_rows(correlation_data, cbind(current_level, df, row.names = NULL)) + correlation_data <- dplyr::bind_rows(correlation_data, + cbind(current_level, df, + row.names = NULL)) correlation_fit = c(correlation_fit, fit) correlation_difference_mae = c(correlation_difference_mae, mae) correlation_difference_rmse = c(correlation_difference_rmse, rmse) @@ -110,41 +112,39 @@ util_corr_fit <- function(postsynth, return(correlation_matrix) } - # find the lower triangle of the original data linear correlation matrix and return a df + # find the lower triangle of the linear correlation matrices and add a var column original_lt <- data.frame(lower_triangle(data)) - - # adding variable 2 column to the original df original_lt$var2 <- colnames(original_lt) - # restructuring the correlation matrix so the cols are var1, var2, original + synthetic_lt <- data.frame(lower_triangle(synthetic_data)) + synthetic_lt$var2 <- colnames(synthetic_lt) + + # restructure the correlation matrix so the cols are var1, var2, original/synthetic original_lt <- original_lt %>% tidyr::pivot_longer(cols = !var2, names_to = "var1", values_to = "original") %>% dplyr::filter(!is.na(original)) %>% dplyr::arrange(var1) %>% dplyr::select(var1, var2, original) %>% - dplyr::mutate(original = dplyr::case_when(original == "" ~ NA, .default = original)) - - # find the lower triangle of the synthetic data linear correlation matrix and return a df - synthetic_lt <- data.frame(lower_triangle(synthetic_data)) - - # adding variable 2 column to the synthetic df - synthetic_lt$var2 <- colnames(synthetic_lt) + dplyr::mutate(original = dplyr::case_when(.data$original == "" ~ NA, + .default = .data$original)) - # restructuring the correlation matrix so the cols are var1, var2, synthetic synthetic_lt <- synthetic_lt %>% tidyr::pivot_longer(cols = !var2, names_to = "var1", values_to = "synthetic") %>% dplyr::filter(!is.na(synthetic)) %>% dplyr::arrange(var1) %>% dplyr::select(var1, var2, synthetic) %>% - dplyr::mutate(synthetic = dplyr::case_when(synthetic == "" ~ NA, .default = synthetic)) + dplyr::mutate(synthetic = dplyr::case_when(.data$synthetic == "" ~ NA, + .default = .data$synthetic)) - # combining the data and finding the difference between the original and synthetic correlations + # combine the data and find the difference between the original and synthetic correlations correlation_data <- original_lt %>% dplyr::left_join(synthetic_lt, by = c("var1","var2")) %>% - dplyr::mutate(difference = as.numeric(.data$original) - as.numeric(.data$synthetic), - proportion_difference = as.numeric(.data$difference) 
/ as.numeric(.data$original)) + dplyr::mutate(original = as.numeric(.data$original), + synthetic = as.numeric(.data$synthetic), + difference = .data$original - .data$synthetic, + proportion_difference = .data$difference / .data$original) - # find the length of the nonzero values in the matrices + # find the number of values in the lower triangle n <- nrow(dplyr::filter(correlation_data, !is.na(difference))) # calculate the correlation fit and divide by n diff --git a/man/util_corr_fit.Rd b/man/util_corr_fit.Rd index 57126a9..721e8a4 100644 --- a/man/util_corr_fit.Rd +++ b/man/util_corr_fit.Rd @@ -11,18 +11,20 @@ util_corr_fit(postsynth, data, group_by = NULL) \item{data}{an original (observed) data set.} -\item{group_by}{} +\item{group_by}{The unquoted name of a (or multiple) grouping variable(s)} } \value{ A \code{list} of fit metrics: \itemize{ -\item \code{correlation_original}: correlation matrix of the original data. -\item \code{correlation_synthetic}: correlation matrix of the synthetic data. -\item \code{correlation_difference}: difference between \code{correlation_synthetic} and -\code{correlation_original}. +\item \code{correlation_data}: A \code{tibble} of the correlations among the +numeric variables for the actual and synthetic data \item \code{correlation_fit}: square root of the sum of squared differences between -\code{correlation_synthetic} and \code{correlation_original}, divided by the number of +the synthetic and original data, divided by the number of cells in the correlation matrix. +\item \code{correlation_difference_mae}: the mean of the absolute correlation +differences between the actual and synthetic data +\item \code{correlation_difference_rmse}: the root mean of the squared correlation +differences between the actual and synthetic data } } \description{ diff --git a/tests/testthat/test-util_corr_fit.R b/tests/testthat/test-util_corr_fit.R index fbcc380..8d254ee 100644 --- a/tests/testthat/test-util_corr_fit.R +++ b/tests/testthat/test-util_corr_fit.R @@ -129,7 +129,6 @@ test_that("util_corr_fit returns NAs when only one observation is in a grouping }) - test_that("util_corr_fit returns NAs when a grouping in the actual data is not in the synthetic data", { df <- data.frame(a = c(1, 2, 3, 4, 5), From 8309fc468c6e67a82829b1739480d6c2e9da8635 Mon Sep 17 00:00:00 2001 From: emcfalls Date: Fri, 10 May 2024 12:18:32 -0400 Subject: [PATCH 11/12] updated code for util_corr_fit -> new code does not account for instances where the group_by variable(s) are different for the synthetic and actual data --- R/util_corr_fit.R | 107 +++++++++++++++++++--------------------------- 1 file changed, 44 insertions(+), 63 deletions(-) diff --git a/R/util_corr_fit.R b/R/util_corr_fit.R index 0b0626d..2228210 100644 --- a/R/util_corr_fit.R +++ b/R/util_corr_fit.R @@ -23,6 +23,8 @@ util_corr_fit <- function(postsynth, data, group_by = NULL) { + + if (is_postsynth(postsynth)) { synthetic_data <- postsynth$synthetic_data @@ -31,71 +33,46 @@ util_corr_fit <- function(postsynth, synthetic_data <- postsynth } - - synthetic_data <- dplyr::select(synthetic_data, dplyr::where(is.numeric), {{ group_by }}) - data <- dplyr::select(data, dplyr::where(is.numeric), {{ group_by }}) - + # reorder data names (this appears to check if the variables are the same) + # issue when the groups in the synthetic data do not match the groups in the og data, and vice versa + # thinking about filling in all of groupings for each dataset first then running everything else data <- dplyr::select(data, 
names(synthetic_data)) + + synthetic_data <- dplyr::select(synthetic_data, dplyr::where(is.numeric), {{ group_by }}) |> + dplyr::arrange(dplyr::across({{ group_by }})) |> + dplyr::group_split(dplyr::across({{ group_by }})) + + data <- dplyr::select(data, dplyr::where(is.numeric), {{ group_by }}) |> + dplyr::arrange(dplyr::across({{ group_by }})) |> + dplyr::group_split(dplyr::across({{ group_by }})) + + groups <- lapply(data, function(x) dplyr::select(x, {{ group_by }}) |> + slice(1)) - # check if a group_by variable was passed - if (!rlang::quo_is_null(rlang::enquo(group_by))) { - - # get all of the level combinations from each group by variable - combined_levels <- rbind(data, synthetic_data) %>% - dplyr::select({{ group_by }}) %>% - unique() + results <- purrr::pmap( + .l = list(synthetic_data, data, groups), + .f = get_correlations + ) + + metrics <- dplyr::bind_cols( + correlation_fit = map_dbl(results, "correlation_fit"), + correlation_difference_mae = map_dbl(results, "correlation_difference_mae"), + correlation_difference_rmse = map_dbl(results, "correlation_difference_rmse"), + bind_rows(groups) + ) - # initialize return values - correlation_data <- data.frame() - correlation_fit = c() - correlation_difference_mae = c() - correlation_difference_rmse = c() + corr_data <- dplyr::bind_rows(map_dfr(results, "correlation_data")) - # get a subset of the data - for (i in 1:nrow(combined_levels)) { - - data_sub <- data - syn_sub <- synthetic_data - - current_level <- data.frame(combined_levels[i,]) - colnames(current_level) <- colnames(combined_levels) - - for (j in 1:length(colnames(combined_levels))){ - col <- colnames(combined_levels)[j] - value <- combined_levels[i,j] + return(list( + corr_data, + metrics + )) +} - data_sub <- dplyr::filter(data_sub, !!rlang::sym(col) == value) - syn_sub <- dplyr::filter(syn_sub, !!rlang::sym(col) == value) - } - - # get the results for the subgroup/level - result <- util_corr_fit(postsynth = syn_sub, data = data_sub) - - df <- result$correlation_data - fit <- result$correlation_fit - mae <- result$correlation_difference_mae - rmse <- result$correlation_difference_rmse - - # add the results to a growing list of results for each subgroup/level - correlation_data <- dplyr::bind_rows(correlation_data, - cbind(current_level, df, - row.names = NULL)) - correlation_fit = c(correlation_fit, fit) - correlation_difference_mae = c(correlation_difference_mae, mae) - correlation_difference_rmse = c(correlation_difference_rmse, rmse) - } - - return( - list( - correlation_data = correlation_data, - correlation_fit = correlation_fit, - correlation_difference_mae = correlation_difference_mae, - correlation_difference_rmse = correlation_difference_rmse - ) - ) - } - +get_correlations <- function(synthetic_data, + data, + groups) { # helper function to find a correlation matrix with the upper tri set to zeros lower_triangle <- function(x) { @@ -108,10 +85,11 @@ util_corr_fit <- function(postsynth, # set NA values in the lower triangle to "", set the values in the upper triangle to zero to avoid double counting correlation_matrix[is.na(correlation_matrix[lower.tri(correlation_matrix, diag = FALSE)])] <- "" correlation_matrix[upper.tri(correlation_matrix, diag = TRUE)] <- NA - + return(correlation_matrix) } + # find the lower triangle of the linear correlation matrices and add a var column original_lt <- data.frame(lower_triangle(data)) original_lt$var2 <- colnames(original_lt) @@ -144,9 +122,11 @@ util_corr_fit <- function(postsynth, difference = .data$original - 
.data$synthetic, proportion_difference = .data$difference / .data$original) + correlation_data <- bind_cols(correlation_data, groups) + # find the number of values in the lower triangle n <- nrow(dplyr::filter(correlation_data, !is.na(difference))) - + # calculate the correlation fit and divide by n correlation_fit <- sqrt(sum(correlation_data$difference ^ 2, na.rm = TRUE)) / n @@ -156,7 +136,7 @@ util_corr_fit <- function(postsynth, correlation_difference_mae <- difference_vec %>% abs() %>% mean() - + # root mean square error correlation_difference_rmse <- difference_vec ^ 2 %>% mean() %>% @@ -172,4 +152,5 @@ util_corr_fit <- function(postsynth, ) ) -} \ No newline at end of file +} + From e42d840864b88220da70dbacf8c7b2bf3c39256b Mon Sep 17 00:00:00 2001 From: emcfalls Date: Fri, 10 May 2024 16:36:46 -0400 Subject: [PATCH 12/12] updated util_corr_fit code and tests --- R/util_corr_fit.R | 50 +++++++++++---------- tests/testthat/test-util_corr_fit.R | 67 ++++++++++++++++++++--------- 2 files changed, 73 insertions(+), 44 deletions(-) diff --git a/R/util_corr_fit.R b/R/util_corr_fit.R index 2228210..2161387 100644 --- a/R/util_corr_fit.R +++ b/R/util_corr_fit.R @@ -7,13 +7,12 @@ #' @return A `list` of fit metrics: #' - `correlation_data`: A `tibble` of the correlations among the #' numeric variables for the actual and synthetic data -#' - `correlation_fit`: square root of the sum of squared differences between -#' the synthetic and original data, divided by the number of -#' cells in the correlation matrix. -#' - `correlation_difference_mae`: the mean of the absolute correlation -#' differences between the actual and synthetic data -#' - `correlation_difference_rmse`: the root mean of the squared correlation -#' differences between the actual and synthetic data +#' - `metrics`: Correlation metrics including the correlation fit, square root +#' of the sum of squared differences between the synthetic and original data, +#' divided by the number of cells in the correlation matrix, mean squared +#' error, the mean of the absolute correlation differences between the actual +#' and synthetic data, and the root mean squared error, the root mean of the +#' squared correlation differences between the actual and synthetic data #' #' @family utility functions #' @@ -35,20 +34,23 @@ util_corr_fit <- function(postsynth, } # reorder data names (this appears to check if the variables are the same) - # issue when the groups in the synthetic data do not match the groups in the og data, and vice versa - # thinking about filling in all of groupings for each dataset first then running everything else data <- dplyr::select(data, names(synthetic_data)) - synthetic_data <- dplyr::select(synthetic_data, dplyr::where(is.numeric), {{ group_by }}) |> - dplyr::arrange(dplyr::across({{ group_by }})) |> + #TODO: Need to update code so if the synthetic data and actual data do not have the same groupings, the function still runs correctly. + # I think we could add all of the groupings to each dataset, the run the rest of the code. 
+ + synthetic_data <- dplyr::select(synthetic_data, dplyr::where(is.numeric), {{ group_by }}) %>% + dplyr::arrange(dplyr::across({{ group_by }})) %>% dplyr::group_split(dplyr::across({{ group_by }})) - data <- dplyr::select(data, dplyr::where(is.numeric), {{ group_by }}) |> - dplyr::arrange(dplyr::across({{ group_by }})) |> + data <- dplyr::select(data, dplyr::where(is.numeric), {{ group_by }}) %>% + dplyr::arrange(dplyr::across({{ group_by }})) %>% dplyr::group_split(dplyr::across({{ group_by }})) - groups <- lapply(data, function(x) dplyr::select(x, {{ group_by }}) |> - slice(1)) + groups <- lapply(data, function(x) dplyr::select(x, {{ group_by }}) %>% + dplyr::slice(1)) + n <- lapply(data, function(x) dplyr::select(x, {{ group_by }}) %>% + dplyr::count()) results <- purrr::pmap( .l = list(synthetic_data, data, groups), @@ -56,18 +58,20 @@ util_corr_fit <- function(postsynth, ) metrics <- dplyr::bind_cols( - correlation_fit = map_dbl(results, "correlation_fit"), - correlation_difference_mae = map_dbl(results, "correlation_difference_mae"), - correlation_difference_rmse = map_dbl(results, "correlation_difference_rmse"), - bind_rows(groups) + n = unlist(n), + correlation_fit = purrr::map_dbl(results, "correlation_fit"), + correlation_difference_mae = purrr::map_dbl(results, "correlation_difference_mae"), + correlation_difference_rmse =purrr:: map_dbl(results, "correlation_difference_rmse"), + dplyr::bind_rows(groups) ) - corr_data <- dplyr::bind_rows(map_dfr(results, "correlation_data")) + corr_data <- dplyr::bind_rows(purrr::map_dfr(results, "correlation_data")) return(list( - corr_data, - metrics - )) + correlation_data = corr_data, + metrics = metrics + ) + ) } get_correlations <- function(synthetic_data, diff --git a/tests/testthat/test-util_corr_fit.R b/tests/testthat/test-util_corr_fit.R index 8d254ee..86ec3a4 100644 --- a/tests/testthat/test-util_corr_fit.R +++ b/tests/testthat/test-util_corr_fit.R @@ -1,3 +1,5 @@ +library(tidyverse) + # confidential data df <- data.frame(a = c(1, 2, 3, 4, 5), b = c(1, 2, 3, 4, 5), @@ -17,15 +19,20 @@ test_that("util_corr_fit is correct with postsynth ", { corr <- util_corr_fit(postsynth = syn, data = df) corr_data <- corr$correlation_data + metrics <- corr$metrics # check dimensions expect_equal(ncol(corr_data), 6) expect_equal(nrow(corr_data), 3) + expect_equal(ncol(metrics), 4) + expect_equal(nrow(metrics), 1) expect_equal(corr_data$difference, c(2, 0, 2)) - expect_equal(corr$correlation_fit, sqrt(sum(c(0, -2, -2) ^ 2)) / 3) - expect_equal(corr$correlation_difference_mae, mean(abs(c(0, -2, -2)))) - expect_equal(corr$correlation_difference_rmse, sqrt(mean(c(0, -2, -2) ^ 2))) + expect_equal(metrics$correlation_fit, sqrt(sum(c(0, -2, -2) ^ 2)) / 3) + expect_equal(metrics$correlation_difference_mae, mean(abs(c(0, -2, -2)))) + expect_equal(metrics$correlation_difference_rmse, sqrt(mean(c(0, -2, -2) ^ 2))) + # throws an error, may be due to how the n column was added to the data + expect_equal(metrics$n, 5) }) @@ -36,21 +43,23 @@ test_that("util_corr_fit is correct with postsynth ", { b = c(5, 4, 3, 2, 1), c = c(1, 2, 3, 4, 5), d = c("red", "red", "yellow", "yellow", "yellow"), - e = c("blue", "blue", "green", "green", "green"), - RECID = c("a", "b", "c", "d", "e")) + e = c("blue", "blue", "green", "green", "green")) corr <- util_corr_fit(postsynth = syn_data, data = df) corr_data <- corr$correlation_data + metrics <- corr$metrics # check dimensions expect_equal(ncol(corr_data), 6) expect_equal(nrow(corr_data), 3) + expect_equal(ncol(metrics), 4) + 
expect_equal(nrow(metrics), 1) expect_equal(corr_data$difference, c(2, 0, 2)) - expect_equal(corr$correlation_fit, sqrt(sum(c(0, -2, -2) ^ 2)) / 3) - expect_equal(corr$correlation_difference_mae, mean(abs(c(0, -2, -2)))) - expect_equal(corr$correlation_difference_rmse, sqrt(mean(c(0, -2, -2) ^ 2))) - + expect_equal(metrics$correlation_fit, sqrt(sum(c(0, -2, -2) ^ 2)) / 3) + expect_equal(metrics$correlation_difference_mae, mean(abs(c(0, -2, -2)))) + expect_equal(metrics$correlation_difference_rmse, sqrt(mean(c(0, -2, -2) ^ 2))) + expect_equal(metrics$n, 5) }) @@ -71,16 +80,20 @@ test_that("util_corr_fit is correct when groupped by one variable ", { structure(class = "postsynth") corr <- util_corr_fit(postsynth = syn, data = df, group_by = d) - corr_data <- corr$correlation_data + corr_data <- corr$correlation_data + metrics <- corr$metrics # check dimensions expect_equal(ncol(corr_data), 7) expect_equal(nrow(corr_data), 6) + expect_equal(ncol(metrics), 5) + expect_equal(nrow(metrics), 2) expect_equal(corr_data$difference, c(2, 0, 2, 2, 0, 2)) - expect_equal(corr$correlation_fit, rep(sqrt(sum(c(0, -2, -2) ^ 2)) / 3, 2)) - expect_equal(corr$correlation_difference_mae, rep(mean(abs(c(0, -2, -2))), 2)) - expect_equal(corr$correlation_difference_rmse, rep(sqrt(mean(c(0, -2, -2) ^ 2)), 2)) + expect_equal(metrics$correlation_fit, rep(sqrt(sum(c(0, -2, -2) ^ 2)) / 3, 2)) + expect_equal(metrics$correlation_difference_mae, rep(mean(abs(c(0, -2, -2))), 2)) + expect_equal(metrics$correlation_difference_rmse, rep(sqrt(mean(c(0, -2, -2) ^ 2)), 2)) + expect_equal(metrics$n, c(2, 3)) }) @@ -89,15 +102,19 @@ test_that("util_corr_fit is correct when groupped by more than one variable ", { corr <- util_corr_fit(postsynth = syn, data = df, group_by = c(d, e)) corr_data <- corr$correlation_data + metrics <- corr$metrics # check dimensions expect_equal(ncol(corr_data), 8) expect_equal(nrow(corr_data), 6) + expect_equal(ncol(metrics), 6) + expect_equal(nrow(metrics), 2) expect_equal(corr_data$difference, c(2, 0, 2, 2, 0, 2)) - expect_equal(corr$correlation_fit, rep(sqrt(sum(c(0, -2, -2) ^ 2)) / 3, 2)) - expect_equal(corr$correlation_difference_mae, rep(mean(abs(c(0, -2, -2))), 2)) - expect_equal(corr$correlation_difference_rmse, rep(sqrt(mean(c(0, -2, -2) ^ 2)), 2)) + expect_equal(metrics$correlation_fit, rep(sqrt(sum(c(0, -2, -2) ^ 2)) / 3, 2)) + expect_equal(metrics$correlation_difference_mae, rep(mean(abs(c(0, -2, -2))), 2)) + expect_equal(metrics$correlation_difference_rmse, rep(sqrt(mean(c(0, -2, -2) ^ 2)), 2)) + expect_equal(metrics$n, c(2, 3)) }) @@ -119,15 +136,23 @@ test_that("util_corr_fit returns NAs when only one observation is in a grouping RECID = c("a", "b", "c", "d", "e")) corr <- util_corr_fit(postsynth = syn, data = df, group_by = c(d, e)) - corr_data <- corr$correlation_data + corr_data <- corr$correlation_data + metrics <- corr$metrics - expect_equal(corr_data$difference, c(2, 0, 2, 2, 0, 2, NA, NA, NA)) - expect_equal(corr$correlation_fit, c(rep(sqrt(sum(c(0, -2, -2) ^ 2)) / 3, 2), NaN)) - expect_equal(corr$correlation_difference_mae, c(rep(mean(abs(c(0, -2, -2))), 2), NA)) - expect_equal(corr$correlation_difference_rmse, c(rep(sqrt(mean(c(0, -2, -2) ^ 2)), 2), NA)) + expect_equal(ncol(corr_data), 8) + expect_equal(nrow(corr_data), 9) + expect_equal(ncol(metrics), 6) + expect_equal(nrow(metrics), 3) + + expect_equal(corr_data$difference, c(2, 0, 2, NA, NA, NA, 2, 0, 2)) + expect_equal(metrics$correlation_fit, c(sqrt(sum(c(0, -2, -2) ^ 2)) / 3, NaN, sqrt(sum(c(0, -2, -2) ^ 2)) / 3)) + 
expect_equal(metrics$correlation_difference_mae, c(mean(abs(c(0, -2, -2))), NA, mean(abs(c(0, -2, -2))))) + expect_equal(metrics$correlation_difference_rmse, c(sqrt(mean(c(0, -2, -2) ^ 2)), NA, sqrt(mean(c(0, -2, -2) ^ 2)))) + expect_equal(metrics$n, c(2, 1, 2)) }) +# NOTE: These tests will fail test_that("util_corr_fit returns NAs when a grouping in the actual data is not in the synthetic data", {