From ac10a715fb388f46721e44f059f34c80ee30d398 Mon Sep 17 00:00:00 2001 From: emcfalls Date: Thu, 7 Dec 2023 14:48:28 -0500 Subject: [PATCH 01/12] updated util_corr_fit outcome --- R/util_corr_fit.R | 95 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 71 insertions(+), 24 deletions(-) diff --git a/R/util_corr_fit.R b/R/util_corr_fit.R index 5d0232a..dd0d954 100644 --- a/R/util_corr_fit.R +++ b/R/util_corr_fit.R @@ -31,9 +31,10 @@ util_corr_fit <- function(postsynth, data) { synthetic_data <- dplyr::select_if(synthetic_data, is.numeric) data <- dplyr::select_if(data, is.numeric) - # reorder data names + # reorder data names (this appears to check if the variables are the same) data <- dplyr::select(data, names(synthetic_data)) + # helper function to find a correlation matrix with the upper tri set to zeros lower_triangle <- function(x) { @@ -49,48 +50,94 @@ util_corr_fit <- function(postsynth, data) { return(correlation_matrix) } - # find the lower triangle of the original data linear correlation matrix - original_lt <- lower_triangle(data) + # find the lower triangle of the original data linear correlation matrix and return a df + original_lt <- data.frame(lower_triangle(data)) - # find the lower triangle of the synthetic data linear correlation matrix - synthetic_lt <- lower_triangle(synthetic_data) + # adding variable 2 column to the original df + original_lt$var2 <- colnames(original_lt) - # compare names - if (any(rownames(original_lt) != rownames(synthetic_lt))) { - stop("ERROR: rownames are not identical") - } + # restructuring the correlation matrix so the cols are var1, var2, original + original_lt <- original_lt %>% + pivot_longer(cols = !var2, names_to = "var1", values_to = "original") %>% + filter(!is.na(original)) %>% + arrange(var1) %>% + select(var1, var2, original) - if (any(colnames(original_lt) != colnames(synthetic_lt))) { - stop("ERROR: colnames are not identical") - } + # find the lower triangle of the synthetic data linear correlation matrix and return a df + synthetic_lt <- data.frame(lower_triangle(synthetic_data)) - # find the difference between the matrices - difference_lt <- synthetic_lt - original_lt + # adding variable 2 column to the synthetic df + synthetic_lt$var2 <- colnames(synthetic_lt) - # find the length of the nonzero values in the matrices - n <- choose(ncol(difference_lt), 2) + # restructuring the correlation matrix so the cols are var1, var2, synthetic + synthetic_lt <- synthetic_lt %>% + pivot_longer(cols = !var2, names_to = "var1", values_to = "synthetic") %>% + filter(!is.na(synthetic)) %>% + arrange(var1) %>% + select(var1, var2, synthetic) + # find the difference between the original correlations and the synthetic + correlation_data <- original_lt %>% + left_join(synthetic_lt, by = c("var1","var2")) %>% + mutate(difference = original - synthetic, + proportion_difference = .data$difference / .data$original) + + # find the length of the nonzero values in the matrices + n <- choose(ncol(correlation_data), 2) + # calculate the correlation fit and divide by n - correlation_fit <- sqrt(sum(difference_lt ^ 2, na.rm = TRUE)) / n + correlation_fit <- sqrt(sum(correlation_data$difference ^ 2, na.rm = TRUE)) / n - difference_vec <- as.numeric(difference_lt)[!is.na(difference_lt)] + difference_vec <- as.numeric(correlation_data$difference) # mean absolute error correlation_difference_mae <- difference_vec %>% abs() %>% mean() - + # root mean square error - correlation_difference_rmse <- - difference_vec ^ 2%>% + correlation_difference_rmse <- 
difference_vec ^ 2 %>% mean() %>% sqrt() + + + # compare names + # if (any(rownames(original_lt) != rownames(synthetic_lt))) { + # stop("ERROR: rownames are not identical") + # } + # + # if (any(colnames(original_lt) != colnames(synthetic_lt))) { + # stop("ERROR: colnames are not identical") + # } + # + # # find the difference between the matrices + # difference_lt <- synthetic_lt - original_lt + # + # # find the length of the nonzero values in the matrices + # n <- choose(ncol(difference_lt), 2) + + # calculate the correlation fit and divide by n + # correlation_fit <- sqrt(sum(difference_lt ^ 2, na.rm = TRUE)) / n + # + # difference_vec <- as.numeric(difference_lt)[!is.na(difference_lt)] + # + # # mean absolute error + # correlation_difference_mae <- difference_vec %>% + # abs() %>% + # mean() + # + # # root mean square error + # correlation_difference_rmse <- + # difference_vec ^ 2%>% + # mean() %>% + # sqrt() return( list( - correlation_original = original_lt, - correlation_synthetic = synthetic_lt, - correlation_difference = difference_lt, + correlation_data = correlation_data, + # correlation_original = original_lt, + # correlation_synthetic = synthetic_lt + # correlation_difference = difference_lt, correlation_fit = correlation_fit, correlation_difference_mae = correlation_difference_mae, correlation_difference_rmse = correlation_difference_rmse From ecf5cd2b0bf4dd3fd4e8b30d5c9045906af61fc9 Mon Sep 17 00:00:00 2001 From: emcfalls Date: Thu, 21 Dec 2023 15:26:47 -0500 Subject: [PATCH 02/12] added group_by param --- R/util_corr_fit.R | 14 +++++++++++--- man/util_corr_fit.Rd | 4 +++- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/R/util_corr_fit.R b/R/util_corr_fit.R index dd0d954..338da9e 100644 --- a/R/util_corr_fit.R +++ b/R/util_corr_fit.R @@ -2,6 +2,7 @@ #' #' @param postsynth A postsynth object from tidysynthesis or a tibble #' @param data an original (observed) data set. +#' @param group_by #' #' @return A `list` of fit metrics: #' - `correlation_original`: correlation matrix of the original data. 
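As an aside, a minimal standalone sketch of the rlang pattern that later commits in this series (PATCH 05/06 and PATCH 11) settle on for the new group_by argument: capture the unquoted argument with rlang::enquo() and branch on rlang::quo_is_null(), rather than relying on missing() or is.null(). The helper name demo_group_split() and the mtcars call are illustrative only and are not part of the package.

library(dplyr)
library(rlang)

demo_group_split <- function(data, group_by = NULL) {

  # capture the unquoted grouping argument without evaluating it
  group_quo <- rlang::enquo(group_by)

  # TRUE both when the caller omits group_by and when group_by = NULL is
  # passed explicitly, unlike missing()
  if (rlang::quo_is_null(group_quo)) {
    return("no grouping variable supplied")
  }

  # split the data into one subset per level of the grouping variable(s)
  data %>%
    dplyr::group_split(dplyr::across({{ group_by }}))
}

demo_group_split(mtcars)                  # "no grouping variable supplied"
demo_group_split(mtcars, group_by = cyl)  # list of three data subsets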
@@ -16,7 +17,7 @@ #' #' @export -util_corr_fit <- function(postsynth, data) { +util_corr_fit <- function(postsynth, data, group_by = NULL) { if (is_postsynth(postsynth)) { @@ -28,12 +29,19 @@ util_corr_fit <- function(postsynth, data) { } - synthetic_data <- dplyr::select_if(synthetic_data, is.numeric) - data <- dplyr::select_if(data, is.numeric) + synthetic_data <- dplyr::select_if(synthetic_data, is.numeric, {{ group_by }}) + data <- dplyr::select_if(data, is.numeric, {{ group_by }}) # reorder data names (this appears to check if the variables are the same) data <- dplyr::select(data, names(synthetic_data)) + if (!is.null({{ group_by }})){ + for(level in levels(data %>% select({{ group_by }}))){ + data_sub <- data %>% filter(group_by == level) + view(data_sub) + } + #df <- util_corr_fit(postsynth, data, group_by == NULL) + } # helper function to find a correlation matrix with the upper tri set to zeros lower_triangle <- function(x) { diff --git a/man/util_corr_fit.Rd b/man/util_corr_fit.Rd index 860a090..57126a9 100644 --- a/man/util_corr_fit.Rd +++ b/man/util_corr_fit.Rd @@ -4,12 +4,14 @@ \alias{util_corr_fit} \title{Calculate the correlation fit metric of a confidential data set.} \usage{ -util_corr_fit(postsynth, data) +util_corr_fit(postsynth, data, group_by = NULL) } \arguments{ \item{postsynth}{A postsynth object from tidysynthesis or a tibble} \item{data}{an original (observed) data set.} + +\item{group_by}{} } \value{ A \code{list} of fit metrics: From ea65fe424130274bb5b3534bae328b3590924ea9 Mon Sep 17 00:00:00 2001 From: emcfalls Date: Thu, 21 Dec 2023 17:22:00 -0500 Subject: [PATCH 03/12] added way to group by a variable for the data table --- R/util_corr_fit.R | 41 ++++++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/R/util_corr_fit.R b/R/util_corr_fit.R index 338da9e..c71da6a 100644 --- a/R/util_corr_fit.R +++ b/R/util_corr_fit.R @@ -2,7 +2,8 @@ #' #' @param postsynth A postsynth object from tidysynthesis or a tibble #' @param data an original (observed) data set. -#' @param group_by +#' @param group_var +#' @param level #' #' @return A `list` of fit metrics: #' - `correlation_original`: correlation matrix of the original data. 
@@ -17,7 +18,10 @@ #' #' @export -util_corr_fit <- function(postsynth, data, group_by = NULL) { +util_corr_fit <- function(postsynth, + data, + group_var = NULL, + level = NULL) { if (is_postsynth(postsynth)) { @@ -26,22 +30,32 @@ util_corr_fit <- function(postsynth, data, group_by = NULL) { } else { synthetic_data <- postsynth - } - synthetic_data <- dplyr::select_if(synthetic_data, is.numeric, {{ group_by }}) - data <- dplyr::select_if(data, is.numeric, {{ group_by }}) + + synthetic_data <- dplyr::select(synthetic_data, where(is.numeric), {{ group_var }}) + data <- dplyr::select(data, where(is.numeric), {{ group_var }}) # reorder data names (this appears to check if the variables are the same) data <- dplyr::select(data, names(synthetic_data)) - if (!is.null({{ group_by }})){ - for(level in levels(data %>% select({{ group_by }}))){ - data_sub <- data %>% filter(group_by == level) - view(data_sub) + # issue: if group_var = NULL is passed into the function, this runs + if(!missing(group_var)){ + + levels <- data %>% dplyr::distinct({{ group_var }}) %>% pull() + + correlation_data <- data.frame() + + for(level in levels) { + data_sub <- data %>% dplyr::filter({{ group_var }} == level) + + df <- util_corr_fit(postsynth = synthetic_data, data = data_sub, level = level)$correlation_data + + correlation_data <- dplyr::bind_rows(correlation_data, df) } - #df <- util_corr_fit(postsynth, data, group_by == NULL) - } + + return(correlation_data) + } # helper function to find a correlation matrix with the upper tri set to zeros lower_triangle <- function(x) { @@ -90,6 +104,11 @@ util_corr_fit <- function(postsynth, data, group_by = NULL) { mutate(difference = original - synthetic, proportion_difference = .data$difference / .data$original) + # add level (if level is not null) + if(!is.null(level)){ + correlation_data <- cbind(level, correlation_data) + } + # find the length of the nonzero values in the matrices n <- choose(ncol(correlation_data), 2) From 39cf538a8e52e7ffb68f66932053e72b077379a1 Mon Sep 17 00:00:00 2001 From: emcfalls Date: Fri, 29 Dec 2023 14:06:16 -0500 Subject: [PATCH 04/12] extending group_by function to include all outputs --- R/util_corr_fit.R | 105 +++++++++++++++++-------------------------- man/util_corr_fit.Rd | 6 ++- 2 files changed, 46 insertions(+), 65 deletions(-) diff --git a/R/util_corr_fit.R b/R/util_corr_fit.R index c71da6a..af6093b 100644 --- a/R/util_corr_fit.R +++ b/R/util_corr_fit.R @@ -2,8 +2,7 @@ #' #' @param postsynth A postsynth object from tidysynthesis or a tibble #' @param data an original (observed) data set. -#' @param group_var -#' @param level +#' @param group_by #' #' @return A `list` of fit metrics: #' - `correlation_original`: correlation matrix of the original data. 
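For orientation, a hypothetical call under the grouped interface documented here, using toy data shaped like the fixtures in tests/testthat/test-util_corr_fit.R. The conf and synth data frames are placeholders, the example assumes the in-development package is loaded (for example with devtools::load_all()), and later commits (PATCH 11/12) reorganize the scalar metrics into a single metrics tibble.

conf <- data.frame(a = c(1, 2, 3, 4, 5),
                   b = c(1, 2, 3, 4, 5),
                   c = c(1, 2, 3, 4, 5),
                   d = c("red", "red", "yellow", "yellow", "yellow"))

synth <- data.frame(a = c(1, 2, 3, 4, 5),
                    b = c(5, 4, 3, 2, 1),
                    c = c(1, 2, 3, 4, 5),
                    d = c("red", "red", "yellow", "yellow", "yellow"))

# with group_by, every element of the result gains one entry (or one set of
# rows) per level of d
res <- util_corr_fit(postsynth = synth, data = conf, group_by = d)

res$correlation_data             # var1, var2, original, synthetic,
                                 # difference, proportion_difference by group
res$correlation_fit              # one value per group ("red", "yellow")
res$correlation_difference_mae
res$correlation_difference_rmse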
@@ -20,8 +19,7 @@ util_corr_fit <- function(postsynth, data, - group_var = NULL, - level = NULL) { + group_by = NULL) { if (is_postsynth(postsynth)) { @@ -33,28 +31,48 @@ util_corr_fit <- function(postsynth, } - synthetic_data <- dplyr::select(synthetic_data, where(is.numeric), {{ group_var }}) - data <- dplyr::select(data, where(is.numeric), {{ group_var }}) + synthetic_data <- dplyr::select(synthetic_data, where(is.numeric), {{ group_by }}) + data <- dplyr::select(data, where(is.numeric), {{ group_by }}) # reorder data names (this appears to check if the variables are the same) data <- dplyr::select(data, names(synthetic_data)) - # issue: if group_var = NULL is passed into the function, this runs - if(!missing(group_var)){ - - levels <- data %>% dplyr::distinct({{ group_var }}) %>% pull() + # issue: if group_by = NULL is passed into the function, this runs + if(!missing(group_by)){ + + levels <- data %>% dplyr::distinct({{ group_by }}) %>% dplyr::pull() correlation_data <- data.frame() + correlation_fit = c() + correlation_difference_mae = c() + correlation_difference_rmse = c() for(level in levels) { - data_sub <- data %>% dplyr::filter({{ group_var }} == level) + data_sub <- data %>% dplyr::filter({{ group_by }} == level) + + # get the results for the subgroup/level + result <- util_corr_fit(postsynth = synthetic_data, data = data_sub) - df <- util_corr_fit(postsynth = synthetic_data, data = data_sub, level = level)$correlation_data + df <- result$correlation_data + fit <- result$correlation_fit + mae <- result$correlation_difference_mae + rmse <- result$correlation_difference_rmse - correlation_data <- dplyr::bind_rows(correlation_data, df) + # add the results to a growing list of results for each subgroup/level + correlation_data <- dplyr::bind_rows(correlation_data, cbind(level, df)) + correlation_fit = c(correlation_fit, fit) + correlation_difference_mae = c(correlation_difference_mae, mae) + correlation_difference_rmse = c(correlation_difference_rmse, rmse) } - return(correlation_data) + return( + list( + correlation_data = correlation_data, + correlation_fit = correlation_fit, + correlation_difference_mae = correlation_difference_mae, + correlation_difference_rmse = correlation_difference_rmse + ) + ) } # helper function to find a correlation matrix with the upper tri set to zeros @@ -80,10 +98,10 @@ util_corr_fit <- function(postsynth, # restructuring the correlation matrix so the cols are var1, var2, original original_lt <- original_lt %>% - pivot_longer(cols = !var2, names_to = "var1", values_to = "original") %>% - filter(!is.na(original)) %>% - arrange(var1) %>% - select(var1, var2, original) + tidyr::pivot_longer(cols = !var2, names_to = "var1", values_to = "original") %>% + dplyr::filter(!is.na(original)) %>% + dplyr::arrange(var1) %>% + dplyr::select(var1, var2, original) # find the lower triangle of the synthetic data linear correlation matrix and return a df synthetic_lt <- data.frame(lower_triangle(synthetic_data)) @@ -93,22 +111,17 @@ util_corr_fit <- function(postsynth, # restructuring the correlation matrix so the cols are var1, var2, synthetic synthetic_lt <- synthetic_lt %>% - pivot_longer(cols = !var2, names_to = "var1", values_to = "synthetic") %>% - filter(!is.na(synthetic)) %>% - arrange(var1) %>% - select(var1, var2, synthetic) + tidyr::pivot_longer(cols = !var2, names_to = "var1", values_to = "synthetic") %>% + dplyr::filter(!is.na(synthetic)) %>% + dplyr::arrange(var1) %>% + dplyr::select(var1, var2, synthetic) # find the difference between the original 
correlations and the synthetic correlation_data <- original_lt %>% - left_join(synthetic_lt, by = c("var1","var2")) %>% - mutate(difference = original - synthetic, + dplyr::left_join(synthetic_lt, by = c("var1","var2")) %>% + dplyr::mutate(difference = original - synthetic, proportion_difference = .data$difference / .data$original) - # add level (if level is not null) - if(!is.null(level)){ - correlation_data <- cbind(level, correlation_data) - } - # find the length of the nonzero values in the matrices n <- choose(ncol(correlation_data), 2) @@ -126,45 +139,11 @@ util_corr_fit <- function(postsynth, correlation_difference_rmse <- difference_vec ^ 2 %>% mean() %>% sqrt() - - # compare names - # if (any(rownames(original_lt) != rownames(synthetic_lt))) { - # stop("ERROR: rownames are not identical") - # } - # - # if (any(colnames(original_lt) != colnames(synthetic_lt))) { - # stop("ERROR: colnames are not identical") - # } - # - # # find the difference between the matrices - # difference_lt <- synthetic_lt - original_lt - # - # # find the length of the nonzero values in the matrices - # n <- choose(ncol(difference_lt), 2) - - # calculate the correlation fit and divide by n - # correlation_fit <- sqrt(sum(difference_lt ^ 2, na.rm = TRUE)) / n - # - # difference_vec <- as.numeric(difference_lt)[!is.na(difference_lt)] - # - # # mean absolute error - # correlation_difference_mae <- difference_vec %>% - # abs() %>% - # mean() - # - # # root mean square error - # correlation_difference_rmse <- - # difference_vec ^ 2%>% - # mean() %>% - # sqrt() return( list( correlation_data = correlation_data, - # correlation_original = original_lt, - # correlation_synthetic = synthetic_lt - # correlation_difference = difference_lt, correlation_fit = correlation_fit, correlation_difference_mae = correlation_difference_mae, correlation_difference_rmse = correlation_difference_rmse diff --git a/man/util_corr_fit.Rd b/man/util_corr_fit.Rd index 57126a9..5be7e4e 100644 --- a/man/util_corr_fit.Rd +++ b/man/util_corr_fit.Rd @@ -4,14 +4,16 @@ \alias{util_corr_fit} \title{Calculate the correlation fit metric of a confidential data set.} \usage{ -util_corr_fit(postsynth, data, group_by = NULL) +util_corr_fit(postsynth, data, group_var = NULL, level = NULL) } \arguments{ \item{postsynth}{A postsynth object from tidysynthesis or a tibble} \item{data}{an original (observed) data set.} -\item{group_by}{} +\item{group_var}{} + +\item{level}{} } \value{ A \code{list} of fit metrics: From 543d063e04827292f44f33b2f6f23de2e2082b4d Mon Sep 17 00:00:00 2001 From: emcfalls Date: Fri, 29 Dec 2023 14:19:24 -0500 Subject: [PATCH 05/12] fixed is.null() issue --- R/util_corr_fit.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/util_corr_fit.R b/R/util_corr_fit.R index af6093b..bb46a67 100644 --- a/R/util_corr_fit.R +++ b/R/util_corr_fit.R @@ -37,8 +37,7 @@ util_corr_fit <- function(postsynth, # reorder data names (this appears to check if the variables are the same) data <- dplyr::select(data, names(synthetic_data)) - # issue: if group_by = NULL is passed into the function, this runs - if(!missing(group_by)){ + if(!rlang::quo_is_null(enquo(group_by))){ levels <- data %>% dplyr::distinct({{ group_by }}) %>% dplyr::pull() From 731cb64eeab772cfb677616bb950e1618775deb0 Mon Sep 17 00:00:00 2001 From: emcfalls Date: Mon, 1 Jan 2024 19:20:12 -0500 Subject: [PATCH 06/12] changed param to group_by --- R/util_corr_fit.R | 4 ++-- man/util_corr_fit.Rd | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git 
a/R/util_corr_fit.R b/R/util_corr_fit.R index bb46a67..5701592 100644 --- a/R/util_corr_fit.R +++ b/R/util_corr_fit.R @@ -37,7 +37,7 @@ util_corr_fit <- function(postsynth, # reorder data names (this appears to check if the variables are the same) data <- dplyr::select(data, names(synthetic_data)) - if(!rlang::quo_is_null(enquo(group_by))){ + if(!rlang::quo_is_null(rlang::enquo(group_by))){ levels <- data %>% dplyr::distinct({{ group_by }}) %>% dplyr::pull() @@ -115,7 +115,7 @@ util_corr_fit <- function(postsynth, dplyr::arrange(var1) %>% dplyr::select(var1, var2, synthetic) - # find the difference between the original correlations and the synthetic + # combining the data and finding the difference between the original and synthetic correlations correlation_data <- original_lt %>% dplyr::left_join(synthetic_lt, by = c("var1","var2")) %>% dplyr::mutate(difference = original - synthetic, diff --git a/man/util_corr_fit.Rd b/man/util_corr_fit.Rd index 5be7e4e..57126a9 100644 --- a/man/util_corr_fit.Rd +++ b/man/util_corr_fit.Rd @@ -4,16 +4,14 @@ \alias{util_corr_fit} \title{Calculate the correlation fit metric of a confidential data set.} \usage{ -util_corr_fit(postsynth, data, group_var = NULL, level = NULL) +util_corr_fit(postsynth, data, group_by = NULL) } \arguments{ \item{postsynth}{A postsynth object from tidysynthesis or a tibble} \item{data}{an original (observed) data set.} -\item{group_var}{} - -\item{level}{} +\item{group_by}{} } \value{ A \code{list} of fit metrics: From e8a292fa2e8a5eb668b00f31d1547addc6a6ff16 Mon Sep 17 00:00:00 2001 From: emcfalls Date: Fri, 5 Jan 2024 16:57:18 -0500 Subject: [PATCH 07/12] allowing the function to group by multiple variables, issue with lower_triangle function --- R/util_corr_fit.R | 105 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 83 insertions(+), 22 deletions(-) diff --git a/R/util_corr_fit.R b/R/util_corr_fit.R index 5701592..588003c 100644 --- a/R/util_corr_fit.R +++ b/R/util_corr_fit.R @@ -36,33 +36,55 @@ util_corr_fit <- function(postsynth, # reorder data names (this appears to check if the variables are the same) data <- dplyr::select(data, names(synthetic_data)) - - if(!rlang::quo_is_null(rlang::enquo(group_by))){ + + # check if a group_by variable was passed + if (!rlang::quo_is_null(rlang::enquo(group_by))) { - levels <- data %>% dplyr::distinct({{ group_by }}) %>% dplyr::pull() + # get all of the level combinations from each group by variable + levels <- data %>% + dplyr::select({{ group_by }}) %>% + expand.grid() %>% + unique() + # initialize return values correlation_data <- data.frame() correlation_fit = c() correlation_difference_mae = c() correlation_difference_rmse = c() - - for(level in levels) { - data_sub <- data %>% dplyr::filter({{ group_by }} == level) - - # get the results for the subgroup/level - result <- util_corr_fit(postsynth = synthetic_data, data = data_sub) - - df <- result$correlation_data - fit <- result$correlation_fit - mae <- result$correlation_difference_mae - rmse <- result$correlation_difference_rmse - - # add the results to a growing list of results for each subgroup/level - correlation_data <- dplyr::bind_rows(correlation_data, cbind(level, df)) - correlation_fit = c(correlation_fit, fit) - correlation_difference_mae = c(correlation_difference_mae, mae) - correlation_difference_rmse = c(correlation_difference_rmse, rmse) - } + + # get a subset of the data + for (level in 1:nrow(levels)) { + + data_sub <- data + + for (i in 1:length(colnames(levels))){ + col <- colnames(levels)[i] + 
value <- levels[level,i] + + data_sub <- data_sub %>% filter(!!rlang::sym(col) == value) + } + + + # get the results for the subgroup/level + result <- util_corr_fit(postsynth = synthetic_data, data = data_sub) + + df <- result$correlation_data + fit <- result$correlation_fit + mae <- result$correlation_difference_mae + rmse <- result$correlation_difference_rmse + + #view(data_sub) + + # add the results to a growing list of results for each subgroup/level + correlation_data <- dplyr::bind_rows(correlation_data, cbind("level_add_later", df)) + correlation_fit = c(correlation_fit, fit) + correlation_difference_mae = c(correlation_difference_mae, mae) + correlation_difference_rmse = c(correlation_difference_rmse, rmse) + + #view(correlation_data) + + stop() + } return( list( @@ -70,8 +92,43 @@ util_corr_fit <- function(postsynth, correlation_fit = correlation_fit, correlation_difference_mae = correlation_difference_mae, correlation_difference_rmse = correlation_difference_rmse - ) + ) ) + + + #levels <- data %>% dplyr::distinct({{ group_by }}) %>% dplyr::pull() + + # correlation_data <- data.frame() + # correlation_fit = c() + # correlation_difference_mae = c() + # correlation_difference_rmse = c() + # + # for(level in levels) { + # data_sub <- data %>% dplyr::filter({{ group_by }} == level) + # + # # get the results for the subgroup/level + # result <- util_corr_fit(postsynth = synthetic_data, data = data_sub) + # + # df <- result$correlation_data + # fit <- result$correlation_fit + # mae <- result$correlation_difference_mae + # rmse <- result$correlation_difference_rmse + # + # # add the results to a growing list of results for each subgroup/level + # correlation_data <- dplyr::bind_rows(correlation_data, cbind(level, df)) + # correlation_fit = c(correlation_fit, fit) + # correlation_difference_mae = c(correlation_difference_mae, mae) + # correlation_difference_rmse = c(correlation_difference_rmse, rmse) + # } + # + # return( + # list( + # correlation_data = correlation_data, + # correlation_fit = correlation_fit, + # correlation_difference_mae = correlation_difference_mae, + # correlation_difference_rmse = correlation_difference_rmse + # ) + # ) } # helper function to find a correlation matrix with the upper tri set to zeros @@ -86,9 +143,13 @@ util_corr_fit <- function(postsynth, # set the values in the upper triangle to zero to avoid double counting correlation_matrix[upper.tri(correlation_matrix, diag = TRUE)] <- NA + # view(correlation_matrix) + return(correlation_matrix) } + # view(lower_triangle(data)) + # find the lower triangle of the original data linear correlation matrix and return a df original_lt <- data.frame(lower_triangle(data)) From 6423b3a1673981e5f0a1e10f1ea6757933a8bdd4 Mon Sep 17 00:00:00 2001 From: emcfalls Date: Tue, 9 Jan 2024 09:11:22 -0500 Subject: [PATCH 08/12] finished adding group_by param --- R/util_corr_fit.R | 100 +++++++++++++++------------------------------- 1 file changed, 32 insertions(+), 68 deletions(-) diff --git a/R/util_corr_fit.R b/R/util_corr_fit.R index 588003c..4f69125 100644 --- a/R/util_corr_fit.R +++ b/R/util_corr_fit.R @@ -2,16 +2,18 @@ #' #' @param postsynth A postsynth object from tidysynthesis or a tibble #' @param data an original (observed) data set. -#' @param group_by +#' @param group_by The unquoted name of a (or multiple) grouping variable(s) #' #' @return A `list` of fit metrics: -#' - `correlation_original`: correlation matrix of the original data. -#' - `correlation_synthetic`: correlation matrix of the synthetic data. 
-#' - `correlation_difference`: difference between `correlation_synthetic` and -#' `correlation_original`. +#' - `correlation_data`: A `tibble` of the correlations among the +#' numeric variables for the actual and synthetic data #' - `correlation_fit`: square root of the sum of squared differences between -#' `correlation_synthetic` and `correlation_original`, divided by the number of +#' the synthetic and original data, divided by the number of #' cells in the correlation matrix. +#' - `correlation_difference_mae`: the mean of the absolute correlation +#' differences between the actual and synthetic data +#' - `correlation_difference_rmse`: the root mean of the squared correlation +#' differences between the actual and synthetic data #' #' @family utility functions #' @@ -41,10 +43,9 @@ util_corr_fit <- function(postsynth, if (!rlang::quo_is_null(rlang::enquo(group_by))) { # get all of the level combinations from each group by variable - levels <- data %>% - dplyr::select({{ group_by }}) %>% - expand.grid() %>% - unique() + data_levels <- data %>% dplyr::select({{ group_by }}) + syn_levels <- synthetic_data %>% dplyr::select({{ group_by }}) + combined_levels <- rbind(data_levels, syn_levels) %>% unique() # initialize return values correlation_data <- data.frame() @@ -53,37 +54,35 @@ util_corr_fit <- function(postsynth, correlation_difference_rmse = c() # get a subset of the data - for (level in 1:nrow(levels)) { + for (i in 1:nrow(combined_levels)) { data_sub <- data + syn_sub <- synthetic_data + data_levels <- data.frame() + current_level <- combined_levels[i,] - for (i in 1:length(colnames(levels))){ - col <- colnames(levels)[i] - value <- levels[level,i] + for (j in 1:length(colnames(combined_levels))){ + col <- colnames(combined_levels)[j] + value <- combined_levels[i,j] + data_levels <- combined_levels[i,j] data_sub <- data_sub %>% filter(!!rlang::sym(col) == value) + syn_sub <- syn_sub %>% filter(!!rlang::sym(col) == value) } - # get the results for the subgroup/level - result <- util_corr_fit(postsynth = synthetic_data, data = data_sub) + result <- util_corr_fit(postsynth = syn_sub, data = data_sub) df <- result$correlation_data fit <- result$correlation_fit mae <- result$correlation_difference_mae rmse <- result$correlation_difference_rmse - #view(data_sub) - # add the results to a growing list of results for each subgroup/level - correlation_data <- dplyr::bind_rows(correlation_data, cbind("level_add_later", df)) + correlation_data <- dplyr::bind_rows(correlation_data, cbind(current_level, df, row.names = NULL)) correlation_fit = c(correlation_fit, fit) correlation_difference_mae = c(correlation_difference_mae, mae) correlation_difference_rmse = c(correlation_difference_rmse, rmse) - - #view(correlation_data) - - stop() } return( @@ -94,41 +93,6 @@ util_corr_fit <- function(postsynth, correlation_difference_rmse = correlation_difference_rmse ) ) - - - #levels <- data %>% dplyr::distinct({{ group_by }}) %>% dplyr::pull() - - # correlation_data <- data.frame() - # correlation_fit = c() - # correlation_difference_mae = c() - # correlation_difference_rmse = c() - # - # for(level in levels) { - # data_sub <- data %>% dplyr::filter({{ group_by }} == level) - # - # # get the results for the subgroup/level - # result <- util_corr_fit(postsynth = synthetic_data, data = data_sub) - # - # df <- result$correlation_data - # fit <- result$correlation_fit - # mae <- result$correlation_difference_mae - # rmse <- result$correlation_difference_rmse - # - # # add the results to a growing list of 
results for each subgroup/level - # correlation_data <- dplyr::bind_rows(correlation_data, cbind(level, df)) - # correlation_fit = c(correlation_fit, fit) - # correlation_difference_mae = c(correlation_difference_mae, mae) - # correlation_difference_rmse = c(correlation_difference_rmse, rmse) - # } - # - # return( - # list( - # correlation_data = correlation_data, - # correlation_fit = correlation_fit, - # correlation_difference_mae = correlation_difference_mae, - # correlation_difference_rmse = correlation_difference_rmse - # ) - # ) } # helper function to find a correlation matrix with the upper tri set to zeros @@ -140,16 +104,13 @@ util_corr_fit <- function(postsynth, dplyr::select_if(is.numeric) %>% stats::cor() - # set the values in the upper triangle to zero to avoid double counting + # set NA values in the lower triangle to "", set the values in the upper triangle to zero to avoid double counting + correlation_matrix[is.na(correlation_matrix[lower.tri(correlation_matrix, diag = FALSE)])] <- "" correlation_matrix[upper.tri(correlation_matrix, diag = TRUE)] <- NA - - # view(correlation_matrix) - + return(correlation_matrix) } - # view(lower_triangle(data)) - # find the lower triangle of the original data linear correlation matrix and return a df original_lt <- data.frame(lower_triangle(data)) @@ -161,7 +122,8 @@ util_corr_fit <- function(postsynth, tidyr::pivot_longer(cols = !var2, names_to = "var1", values_to = "original") %>% dplyr::filter(!is.na(original)) %>% dplyr::arrange(var1) %>% - dplyr::select(var1, var2, original) + dplyr::select(var1, var2, original) %>% + dplyr::mutate(original = case_when(original == "" ~ NA, .default = original)) # find the lower triangle of the synthetic data linear correlation matrix and return a df synthetic_lt <- data.frame(lower_triangle(synthetic_data)) @@ -174,16 +136,18 @@ util_corr_fit <- function(postsynth, tidyr::pivot_longer(cols = !var2, names_to = "var1", values_to = "synthetic") %>% dplyr::filter(!is.na(synthetic)) %>% dplyr::arrange(var1) %>% - dplyr::select(var1, var2, synthetic) + dplyr::select(var1, var2, synthetic) %>% + dplyr::mutate(synthetic = case_when(synthetic == "" ~ NA, .default = synthetic)) # combining the data and finding the difference between the original and synthetic correlations correlation_data <- original_lt %>% dplyr::left_join(synthetic_lt, by = c("var1","var2")) %>% - dplyr::mutate(difference = original - synthetic, - proportion_difference = .data$difference / .data$original) + dplyr::mutate(difference = as.numeric(.data$original) - as.numeric(.data$synthetic), + proportion_difference = as.numeric(.data$difference) / as.numeric(.data$original)) # find the length of the nonzero values in the matrices n <- choose(ncol(correlation_data), 2) + print(n) # calculate the correlation fit and divide by n correlation_fit <- sqrt(sum(correlation_data$difference ^ 2, na.rm = TRUE)) / n From b2cccd6db3997f73518a6f289413e561b6ab6cf2 Mon Sep 17 00:00:00 2001 From: emcfalls Date: Wed, 10 Jan 2024 16:29:11 -0500 Subject: [PATCH 09/12] cleaned up function slightly, added tests for util_corr_fit() --- R/util_corr_fit.R | 30 ++--- tests/testthat/test-util_corr_fit.R | 199 +++++++++++++++++++++++----- 2 files changed, 183 insertions(+), 46 deletions(-) diff --git a/R/util_corr_fit.R b/R/util_corr_fit.R index 4f69125..d76a595 100644 --- a/R/util_corr_fit.R +++ b/R/util_corr_fit.R @@ -31,10 +31,9 @@ util_corr_fit <- function(postsynth, synthetic_data <- postsynth } - - synthetic_data <- dplyr::select(synthetic_data, where(is.numeric), 
{{ group_by }}) - data <- dplyr::select(data, where(is.numeric), {{ group_by }}) + synthetic_data <- dplyr::select(synthetic_data, dplyr::where(is.numeric), {{ group_by }}) + data <- dplyr::select(data, dplyr::where(is.numeric), {{ group_by }}) # reorder data names (this appears to check if the variables are the same) data <- dplyr::select(data, names(synthetic_data)) @@ -43,9 +42,9 @@ util_corr_fit <- function(postsynth, if (!rlang::quo_is_null(rlang::enquo(group_by))) { # get all of the level combinations from each group by variable - data_levels <- data %>% dplyr::select({{ group_by }}) - syn_levels <- synthetic_data %>% dplyr::select({{ group_by }}) - combined_levels <- rbind(data_levels, syn_levels) %>% unique() + combined_levels <- rbind(data, synthetic_data) %>% + dplyr::select({{ group_by }}) %>% + unique() # initialize return values correlation_data <- data.frame() @@ -58,16 +57,16 @@ util_corr_fit <- function(postsynth, data_sub <- data syn_sub <- synthetic_data - data_levels <- data.frame() - current_level <- combined_levels[i,] + + current_level <- data.frame(combined_levels[i,]) + colnames(current_level) <- colnames(combined_levels) for (j in 1:length(colnames(combined_levels))){ col <- colnames(combined_levels)[j] value <- combined_levels[i,j] - - data_levels <- combined_levels[i,j] - data_sub <- data_sub %>% filter(!!rlang::sym(col) == value) - syn_sub <- syn_sub %>% filter(!!rlang::sym(col) == value) + + data_sub <- data_sub %>% dplyr::filter(!!rlang::sym(col) == value) + syn_sub <- syn_sub %>% dplyr::filter(!!rlang::sym(col) == value) } # get the results for the subgroup/level @@ -123,7 +122,7 @@ util_corr_fit <- function(postsynth, dplyr::filter(!is.na(original)) %>% dplyr::arrange(var1) %>% dplyr::select(var1, var2, original) %>% - dplyr::mutate(original = case_when(original == "" ~ NA, .default = original)) + dplyr::mutate(original = dplyr::case_when(original == "" ~ NA, .default = original)) # find the lower triangle of the synthetic data linear correlation matrix and return a df synthetic_lt <- data.frame(lower_triangle(synthetic_data)) @@ -137,7 +136,7 @@ util_corr_fit <- function(postsynth, dplyr::filter(!is.na(synthetic)) %>% dplyr::arrange(var1) %>% dplyr::select(var1, var2, synthetic) %>% - dplyr::mutate(synthetic = case_when(synthetic == "" ~ NA, .default = synthetic)) + dplyr::mutate(synthetic = dplyr::case_when(synthetic == "" ~ NA, .default = synthetic)) # combining the data and finding the difference between the original and synthetic correlations correlation_data <- original_lt %>% @@ -146,8 +145,7 @@ util_corr_fit <- function(postsynth, proportion_difference = as.numeric(.data$difference) / as.numeric(.data$original)) # find the length of the nonzero values in the matrices - n <- choose(ncol(correlation_data), 2) - print(n) + n <- nrow(dplyr::filter(correlation_data, !is.na(difference))) # calculate the correlation fit and divide by n correlation_fit <- sqrt(sum(correlation_data$difference ^ 2, na.rm = TRUE)) / n diff --git a/tests/testthat/test-util_corr_fit.R b/tests/testthat/test-util_corr_fit.R index 4ff40c8..fbcc380 100644 --- a/tests/testthat/test-util_corr_fit.R +++ b/tests/testthat/test-util_corr_fit.R @@ -1,50 +1,189 @@ # confidential data -df <- data.frame(a = c(1, 2, 3), - b = c(1, 2, 3), - c = c(1, 2, 3), - RECID = c("a", "b", "c")) - -# difference matrix for tests -diff_matrix <- matrix( - c(NA, NA, NA, - -2, NA, NA, - 0, -2, NA), - ncol = 3, - byrow = TRUE -) - -rownames(diff_matrix) <- c("a", "c", "b") -colnames(diff_matrix) <- c("a", "c", 
"b") +df <- data.frame(a = c(1, 2, 3, 4, 5), + b = c(1, 2, 3, 4, 5), + c = c(1, 2, 3, 4, 5), + d = c("red", "red", "yellow", "yellow", "yellow"), + e = c("blue", "blue", "green", "green", "green")) + +syn <- list(synthetic_data = data.frame(a = c(1, 2, 3, 4, 5), + b = c(5, 4, 3, 2, 1), + c = c(1, 2, 3, 4, 5), + d = c("red", "red", "yellow", "yellow", "yellow"), + e = c("blue", "blue", "green", "green", "green"))) %>% + structure(class = "postsynth") # test with postsynth test_that("util_corr_fit is correct with postsynth ", { - syn <- list(synthetic_data = data.frame(a = c(1, 2, 3), - c = c(3, 2, 1), - b = c(1, 2, 3), - RECID = c("a", "b", "c"))) %>% - structure(class = "postsynth") - corr <- util_corr_fit(postsynth = syn, data = df) + corr_data <- corr$correlation_data + + # check dimensions + expect_equal(ncol(corr_data), 6) + expect_equal(nrow(corr_data), 3) - expect_equal(corr$correlation_difference, diff_matrix) + expect_equal(corr_data$difference, c(2, 0, 2)) expect_equal(corr$correlation_fit, sqrt(sum(c(0, -2, -2) ^ 2)) / 3) expect_equal(corr$correlation_difference_mae, mean(abs(c(0, -2, -2)))) expect_equal(corr$correlation_difference_rmse, sqrt(mean(c(0, -2, -2) ^ 2))) + }) # test with data test_that("util_corr_fit is correct with postsynth ", { - syn <- data.frame(a = c(1, 2, 3), - c = c(3, 2, 1), - b = c(1, 2, 3), - RECID = c("a", "b", "c")) + syn_data <- data.frame(a = c(1, 2, 3, 4, 5), + b = c(5, 4, 3, 2, 1), + c = c(1, 2, 3, 4, 5), + d = c("red", "red", "yellow", "yellow", "yellow"), + e = c("blue", "blue", "green", "green", "green"), + RECID = c("a", "b", "c", "d", "e")) - corr <- util_corr_fit(postsynth = syn, data = df) - - expect_equal(corr$correlation_difference, diff_matrix) + corr <- util_corr_fit(postsynth = syn_data, data = df) + corr_data <- corr$correlation_data + + # check dimensions + expect_equal(ncol(corr_data), 6) + expect_equal(nrow(corr_data), 3) + + expect_equal(corr_data$difference, c(2, 0, 2)) expect_equal(corr$correlation_fit, sqrt(sum(c(0, -2, -2) ^ 2)) / 3) expect_equal(corr$correlation_difference_mae, mean(abs(c(0, -2, -2)))) expect_equal(corr$correlation_difference_rmse, sqrt(mean(c(0, -2, -2) ^ 2))) + +}) + + + +test_that("util_corr_fit is correct when groupped by one variable ", { + + df <- data.frame(a = c(1, 2, 3, 4, 5), + b = c(1, 2, 3, 4, 5), + c = c(1, 2, 3, 4, 5), + d = c("red", "red", "yellow", "yellow", "yellow"), + e = c("blue", "blue", "green", "green", "green")) + + syn <- list(synthetic_data = data.frame(a = c(1, 2, 3, 4, 5), + b = c(5, 4, 3, 2, 1), + c = c(1, 2, 3, 4, 5), + d = c("red", "red", "yellow", "yellow", "yellow"), + e = c("blue", "blue", "green", "green", "green"))) %>% + structure(class = "postsynth") + + corr <- util_corr_fit(postsynth = syn, data = df, group_by = d) + corr_data <- corr$correlation_data + + # check dimensions + expect_equal(ncol(corr_data), 7) + expect_equal(nrow(corr_data), 6) + + expect_equal(corr_data$difference, c(2, 0, 2, 2, 0, 2)) + expect_equal(corr$correlation_fit, rep(sqrt(sum(c(0, -2, -2) ^ 2)) / 3, 2)) + expect_equal(corr$correlation_difference_mae, rep(mean(abs(c(0, -2, -2))), 2)) + expect_equal(corr$correlation_difference_rmse, rep(sqrt(mean(c(0, -2, -2) ^ 2)), 2)) + +}) + + +test_that("util_corr_fit is correct when groupped by more than one variable ", { + + corr <- util_corr_fit(postsynth = syn, data = df, group_by = c(d, e)) + corr_data <- corr$correlation_data + + # check dimensions + expect_equal(ncol(corr_data), 8) + expect_equal(nrow(corr_data), 6) + + expect_equal(corr_data$difference, 
c(2, 0, 2, 2, 0, 2)) + expect_equal(corr$correlation_fit, rep(sqrt(sum(c(0, -2, -2) ^ 2)) / 3, 2)) + expect_equal(corr$correlation_difference_mae, rep(mean(abs(c(0, -2, -2))), 2)) + expect_equal(corr$correlation_difference_rmse, rep(sqrt(mean(c(0, -2, -2) ^ 2)), 2)) + +}) + + +test_that("util_corr_fit returns NAs when only one observation is in a grouping ", { + + df <- data.frame(a = c(1, 2, 3, 4, 5), + b = c(1, 2, 3, 4, 5), + c = c(1, 2, 3, 4, 5), + d = c("red", "red", "yellow", "yellow", "red"), + e = c("blue", "blue", "green", "green", "green"), + RECID = c("a", "b", "c", "d", "e")) + + syn <- data.frame(a = c(1, 2, 3, 4, 5), + b = c(5, 4, 3, 2, 1), + c = c(1, 2, 3, 4, 5), + d = c("red", "red", "yellow", "yellow", "red"), + e = c("blue", "blue", "green", "green", "green"), + RECID = c("a", "b", "c", "d", "e")) + + corr <- util_corr_fit(postsynth = syn, data = df, group_by = c(d, e)) + corr_data <- corr$correlation_data + + expect_equal(corr_data$difference, c(2, 0, 2, 2, 0, 2, NA, NA, NA)) + expect_equal(corr$correlation_fit, c(rep(sqrt(sum(c(0, -2, -2) ^ 2)) / 3, 2), NaN)) + expect_equal(corr$correlation_difference_mae, c(rep(mean(abs(c(0, -2, -2))), 2), NA)) + expect_equal(corr$correlation_difference_rmse, c(rep(sqrt(mean(c(0, -2, -2) ^ 2)), 2), NA)) + }) + + + +test_that("util_corr_fit returns NAs when a grouping in the actual data is not in the synthetic data", { + + df <- data.frame(a = c(1, 2, 3, 4, 5), + b = c(1, 2, 3, 4, 5), + c = c(1, 2, 3, 4, 5), + d = c("red", "red", "yellow", "yellow", "red"), + e = c("blue", "blue", "green", "green", "green"), + RECID = c("a", "b", "c", "d", "e")) + + syn <- data.frame(a = c(1, 2, 3, 4, 5), + b = c(5, 4, 3, 2, 1), + c = c(1, 2, 3, 4, 5), + d = c("red", "red", "yellow", "yellow", "yellow"), + e = c("blue", "blue", "green", "green", "green"), + RECID = c("a", "b", "c", "d", "e")) + + corr <- util_corr_fit(postsynth = syn, data = df, group_by = c(d, e)) + corr_data <- corr$correlation_data + + expect_equal(corr_data$difference, c(2, 0, 2, 2, 0, 2, NA, NA, NA)) + expect_equal(corr$correlation_fit, c(rep(sqrt(sum(c(0, -2, -2) ^ 2)) / 3, 2), NaN)) + expect_equal(corr$correlation_difference_mae, c(rep(mean(abs(c(0, -2, -2))), 2), NA)) + expect_equal(corr$correlation_difference_rmse, c(rep(sqrt(mean(c(0, -2, -2) ^ 2)), 2), NA)) + +}) + + +test_that("util_corr_fit returns NAs when a grouping in the synthetic data is not in the actual data", { + + df <- data.frame(a = c(1, 2, 3, 4, 5), + b = c(1, 2, 3, 4, 5), + c = c(1, 2, 3, 4, 5), + d = c("red", "red", "yellow", "yellow", "yellow"), + e = c("blue", "blue", "green", "green", "green"), + RECID = c("a", "b", "c", "d", "e")) + + syn <- data.frame(a = c(1, 2, 3, 4, 5), + b = c(5, 4, 3, 2, 1), + c = c(1, 2, 3, 4, 5), + d = c("red", "red", "yellow", "yellow", "red"), + e = c("blue", "blue", "green", "green", "green"), + RECID = c("a", "b", "c", "d", "e")) + + corr <- util_corr_fit(postsynth = syn, data = df, group_by = c(d, e)) + corr_data <- corr$correlation_data + + expect_equal(corr_data$difference, c(2, 0, 2, 2, 0, 2, NA, NA, NA)) + expect_equal(corr$correlation_fit, c(rep(sqrt(sum(c(0, -2, -2) ^ 2)) / 3, 2), NaN)) + expect_equal(corr$correlation_difference_mae, c(rep(mean(abs(c(0, -2, -2))), 2), NA)) + expect_equal(corr$correlation_difference_rmse, c(rep(sqrt(mean(c(0, -2, -2) ^ 2)), 2), NA)) + +}) + + + + + From 7a57123078ffe67e1a601ab280aac449f3381a13 Mon Sep 17 00:00:00 2001 From: emcfalls Date: Tue, 16 Jan 2024 10:07:22 -0500 Subject: [PATCH 10/12] finished formatting code and testing 
util_corr_fit --- R/util_corr_fit.R | 40 ++++++++++++++--------------- man/util_corr_fit.Rd | 14 +++++----- tests/testthat/test-util_corr_fit.R | 1 - 3 files changed, 28 insertions(+), 27 deletions(-) diff --git a/R/util_corr_fit.R b/R/util_corr_fit.R index d76a595..0b0626d 100644 --- a/R/util_corr_fit.R +++ b/R/util_corr_fit.R @@ -65,8 +65,8 @@ util_corr_fit <- function(postsynth, col <- colnames(combined_levels)[j] value <- combined_levels[i,j] - data_sub <- data_sub %>% dplyr::filter(!!rlang::sym(col) == value) - syn_sub <- syn_sub %>% dplyr::filter(!!rlang::sym(col) == value) + data_sub <- dplyr::filter(data_sub, !!rlang::sym(col) == value) + syn_sub <- dplyr::filter(syn_sub, !!rlang::sym(col) == value) } # get the results for the subgroup/level @@ -78,7 +78,9 @@ util_corr_fit <- function(postsynth, rmse <- result$correlation_difference_rmse # add the results to a growing list of results for each subgroup/level - correlation_data <- dplyr::bind_rows(correlation_data, cbind(current_level, df, row.names = NULL)) + correlation_data <- dplyr::bind_rows(correlation_data, + cbind(current_level, df, + row.names = NULL)) correlation_fit = c(correlation_fit, fit) correlation_difference_mae = c(correlation_difference_mae, mae) correlation_difference_rmse = c(correlation_difference_rmse, rmse) @@ -110,41 +112,39 @@ util_corr_fit <- function(postsynth, return(correlation_matrix) } - # find the lower triangle of the original data linear correlation matrix and return a df + # find the lower triangle of the linear correlation matrices and add a var column original_lt <- data.frame(lower_triangle(data)) - - # adding variable 2 column to the original df original_lt$var2 <- colnames(original_lt) - # restructuring the correlation matrix so the cols are var1, var2, original + synthetic_lt <- data.frame(lower_triangle(synthetic_data)) + synthetic_lt$var2 <- colnames(synthetic_lt) + + # restructure the correlation matrix so the cols are var1, var2, original/synthetic original_lt <- original_lt %>% tidyr::pivot_longer(cols = !var2, names_to = "var1", values_to = "original") %>% dplyr::filter(!is.na(original)) %>% dplyr::arrange(var1) %>% dplyr::select(var1, var2, original) %>% - dplyr::mutate(original = dplyr::case_when(original == "" ~ NA, .default = original)) - - # find the lower triangle of the synthetic data linear correlation matrix and return a df - synthetic_lt <- data.frame(lower_triangle(synthetic_data)) - - # adding variable 2 column to the synthetic df - synthetic_lt$var2 <- colnames(synthetic_lt) + dplyr::mutate(original = dplyr::case_when(.data$original == "" ~ NA, + .default = .data$original)) - # restructuring the correlation matrix so the cols are var1, var2, synthetic synthetic_lt <- synthetic_lt %>% tidyr::pivot_longer(cols = !var2, names_to = "var1", values_to = "synthetic") %>% dplyr::filter(!is.na(synthetic)) %>% dplyr::arrange(var1) %>% dplyr::select(var1, var2, synthetic) %>% - dplyr::mutate(synthetic = dplyr::case_when(synthetic == "" ~ NA, .default = synthetic)) + dplyr::mutate(synthetic = dplyr::case_when(.data$synthetic == "" ~ NA, + .default = .data$synthetic)) - # combining the data and finding the difference between the original and synthetic correlations + # combine the data and find the difference between the original and synthetic correlations correlation_data <- original_lt %>% dplyr::left_join(synthetic_lt, by = c("var1","var2")) %>% - dplyr::mutate(difference = as.numeric(.data$original) - as.numeric(.data$synthetic), - proportion_difference = as.numeric(.data$difference) 
/ as.numeric(.data$original)) + dplyr::mutate(original = as.numeric(.data$original), + synthetic = as.numeric(.data$synthetic), + difference = .data$original - .data$synthetic, + proportion_difference = .data$difference / .data$original) - # find the length of the nonzero values in the matrices + # find the number of values in the lower triangle n <- nrow(dplyr::filter(correlation_data, !is.na(difference))) # calculate the correlation fit and divide by n diff --git a/man/util_corr_fit.Rd b/man/util_corr_fit.Rd index 57126a9..721e8a4 100644 --- a/man/util_corr_fit.Rd +++ b/man/util_corr_fit.Rd @@ -11,18 +11,20 @@ util_corr_fit(postsynth, data, group_by = NULL) \item{data}{an original (observed) data set.} -\item{group_by}{} +\item{group_by}{The unquoted name of a (or multiple) grouping variable(s)} } \value{ A \code{list} of fit metrics: \itemize{ -\item \code{correlation_original}: correlation matrix of the original data. -\item \code{correlation_synthetic}: correlation matrix of the synthetic data. -\item \code{correlation_difference}: difference between \code{correlation_synthetic} and -\code{correlation_original}. +\item \code{correlation_data}: A \code{tibble} of the correlations among the +numeric variables for the actual and synthetic data \item \code{correlation_fit}: square root of the sum of squared differences between -\code{correlation_synthetic} and \code{correlation_original}, divided by the number of +the synthetic and original data, divided by the number of cells in the correlation matrix. +\item \code{correlation_difference_mae}: the mean of the absolute correlation +differences between the actual and synthetic data +\item \code{correlation_difference_rmse}: the root mean of the squared correlation +differences between the actual and synthetic data } } \description{ diff --git a/tests/testthat/test-util_corr_fit.R b/tests/testthat/test-util_corr_fit.R index fbcc380..8d254ee 100644 --- a/tests/testthat/test-util_corr_fit.R +++ b/tests/testthat/test-util_corr_fit.R @@ -129,7 +129,6 @@ test_that("util_corr_fit returns NAs when only one observation is in a grouping }) - test_that("util_corr_fit returns NAs when a grouping in the actual data is not in the synthetic data", { df <- data.frame(a = c(1, 2, 3, 4, 5), From 8309fc468c6e67a82829b1739480d6c2e9da8635 Mon Sep 17 00:00:00 2001 From: emcfalls Date: Fri, 10 May 2024 12:18:32 -0400 Subject: [PATCH 11/12] updated code for util_corr_fit -> new code does not account for instances where the group_by variable(s) are different for the synthetic and actual data --- R/util_corr_fit.R | 107 +++++++++++++++++++--------------------------- 1 file changed, 44 insertions(+), 63 deletions(-) diff --git a/R/util_corr_fit.R b/R/util_corr_fit.R index 0b0626d..2228210 100644 --- a/R/util_corr_fit.R +++ b/R/util_corr_fit.R @@ -23,6 +23,8 @@ util_corr_fit <- function(postsynth, data, group_by = NULL) { + + if (is_postsynth(postsynth)) { synthetic_data <- postsynth$synthetic_data @@ -31,71 +33,46 @@ util_corr_fit <- function(postsynth, synthetic_data <- postsynth } - - synthetic_data <- dplyr::select(synthetic_data, dplyr::where(is.numeric), {{ group_by }}) - data <- dplyr::select(data, dplyr::where(is.numeric), {{ group_by }}) - + # reorder data names (this appears to check if the variables are the same) + # issue when the groups in the synthetic data do not match the groups in the og data, and vice versa + # thinking about filling in all of groupings for each dataset first then running everything else data <- dplyr::select(data, 
names(synthetic_data)) + + synthetic_data <- dplyr::select(synthetic_data, dplyr::where(is.numeric), {{ group_by }}) |> + dplyr::arrange(dplyr::across({{ group_by }})) |> + dplyr::group_split(dplyr::across({{ group_by }})) + + data <- dplyr::select(data, dplyr::where(is.numeric), {{ group_by }}) |> + dplyr::arrange(dplyr::across({{ group_by }})) |> + dplyr::group_split(dplyr::across({{ group_by }})) + + groups <- lapply(data, function(x) dplyr::select(x, {{ group_by }}) |> + slice(1)) - # check if a group_by variable was passed - if (!rlang::quo_is_null(rlang::enquo(group_by))) { - - # get all of the level combinations from each group by variable - combined_levels <- rbind(data, synthetic_data) %>% - dplyr::select({{ group_by }}) %>% - unique() + results <- purrr::pmap( + .l = list(synthetic_data, data, groups), + .f = get_correlations + ) + + metrics <- dplyr::bind_cols( + correlation_fit = map_dbl(results, "correlation_fit"), + correlation_difference_mae = map_dbl(results, "correlation_difference_mae"), + correlation_difference_rmse = map_dbl(results, "correlation_difference_rmse"), + bind_rows(groups) + ) - # initialize return values - correlation_data <- data.frame() - correlation_fit = c() - correlation_difference_mae = c() - correlation_difference_rmse = c() + corr_data <- dplyr::bind_rows(map_dfr(results, "correlation_data")) - # get a subset of the data - for (i in 1:nrow(combined_levels)) { - - data_sub <- data - syn_sub <- synthetic_data - - current_level <- data.frame(combined_levels[i,]) - colnames(current_level) <- colnames(combined_levels) - - for (j in 1:length(colnames(combined_levels))){ - col <- colnames(combined_levels)[j] - value <- combined_levels[i,j] + return(list( + corr_data, + metrics + )) +} - data_sub <- dplyr::filter(data_sub, !!rlang::sym(col) == value) - syn_sub <- dplyr::filter(syn_sub, !!rlang::sym(col) == value) - } - - # get the results for the subgroup/level - result <- util_corr_fit(postsynth = syn_sub, data = data_sub) - - df <- result$correlation_data - fit <- result$correlation_fit - mae <- result$correlation_difference_mae - rmse <- result$correlation_difference_rmse - - # add the results to a growing list of results for each subgroup/level - correlation_data <- dplyr::bind_rows(correlation_data, - cbind(current_level, df, - row.names = NULL)) - correlation_fit = c(correlation_fit, fit) - correlation_difference_mae = c(correlation_difference_mae, mae) - correlation_difference_rmse = c(correlation_difference_rmse, rmse) - } - - return( - list( - correlation_data = correlation_data, - correlation_fit = correlation_fit, - correlation_difference_mae = correlation_difference_mae, - correlation_difference_rmse = correlation_difference_rmse - ) - ) - } - +get_correlations <- function(synthetic_data, + data, + groups) { # helper function to find a correlation matrix with the upper tri set to zeros lower_triangle <- function(x) { @@ -108,10 +85,11 @@ util_corr_fit <- function(postsynth, # set NA values in the lower triangle to "", set the values in the upper triangle to zero to avoid double counting correlation_matrix[is.na(correlation_matrix[lower.tri(correlation_matrix, diag = FALSE)])] <- "" correlation_matrix[upper.tri(correlation_matrix, diag = TRUE)] <- NA - + return(correlation_matrix) } + # find the lower triangle of the linear correlation matrices and add a var column original_lt <- data.frame(lower_triangle(data)) original_lt$var2 <- colnames(original_lt) @@ -144,9 +122,11 @@ util_corr_fit <- function(postsynth, difference = .data$original - 
.data$synthetic, proportion_difference = .data$difference / .data$original) + correlation_data <- bind_cols(correlation_data, groups) + # find the number of values in the lower triangle n <- nrow(dplyr::filter(correlation_data, !is.na(difference))) - + # calculate the correlation fit and divide by n correlation_fit <- sqrt(sum(correlation_data$difference ^ 2, na.rm = TRUE)) / n @@ -156,7 +136,7 @@ util_corr_fit <- function(postsynth, correlation_difference_mae <- difference_vec %>% abs() %>% mean() - + # root mean square error correlation_difference_rmse <- difference_vec ^ 2 %>% mean() %>% @@ -172,4 +152,5 @@ util_corr_fit <- function(postsynth, ) ) -} \ No newline at end of file +} + From e42d840864b88220da70dbacf8c7b2bf3c39256b Mon Sep 17 00:00:00 2001 From: emcfalls Date: Fri, 10 May 2024 16:36:46 -0400 Subject: [PATCH 12/12] updated util_corr_fit code and tests --- R/util_corr_fit.R | 50 +++++++++++---------- tests/testthat/test-util_corr_fit.R | 67 ++++++++++++++++++++--------- 2 files changed, 73 insertions(+), 44 deletions(-) diff --git a/R/util_corr_fit.R b/R/util_corr_fit.R index 2228210..2161387 100644 --- a/R/util_corr_fit.R +++ b/R/util_corr_fit.R @@ -7,13 +7,12 @@ #' @return A `list` of fit metrics: #' - `correlation_data`: A `tibble` of the correlations among the #' numeric variables for the actual and synthetic data -#' - `correlation_fit`: square root of the sum of squared differences between -#' the synthetic and original data, divided by the number of -#' cells in the correlation matrix. -#' - `correlation_difference_mae`: the mean of the absolute correlation -#' differences between the actual and synthetic data -#' - `correlation_difference_rmse`: the root mean of the squared correlation -#' differences between the actual and synthetic data +#' - `metrics`: Correlation metrics including the correlation fit, square root +#' of the sum of squared differences between the synthetic and original data, +#' divided by the number of cells in the correlation matrix, mean squared +#' error, the mean of the absolute correlation differences between the actual +#' and synthetic data, and the root mean squared error, the root mean of the +#' squared correlation differences between the actual and synthetic data #' #' @family utility functions #' @@ -35,20 +34,23 @@ util_corr_fit <- function(postsynth, } # reorder data names (this appears to check if the variables are the same) - # issue when the groups in the synthetic data do not match the groups in the og data, and vice versa - # thinking about filling in all of groupings for each dataset first then running everything else data <- dplyr::select(data, names(synthetic_data)) - synthetic_data <- dplyr::select(synthetic_data, dplyr::where(is.numeric), {{ group_by }}) |> - dplyr::arrange(dplyr::across({{ group_by }})) |> + #TODO: Need to update code so if the synthetic data and actual data do not have the same groupings, the function still runs correctly. + # I think we could add all of the groupings to each dataset, the run the rest of the code. 
+ + synthetic_data <- dplyr::select(synthetic_data, dplyr::where(is.numeric), {{ group_by }}) %>% + dplyr::arrange(dplyr::across({{ group_by }})) %>% dplyr::group_split(dplyr::across({{ group_by }})) - data <- dplyr::select(data, dplyr::where(is.numeric), {{ group_by }}) |> - dplyr::arrange(dplyr::across({{ group_by }})) |> + data <- dplyr::select(data, dplyr::where(is.numeric), {{ group_by }}) %>% + dplyr::arrange(dplyr::across({{ group_by }})) %>% dplyr::group_split(dplyr::across({{ group_by }})) - groups <- lapply(data, function(x) dplyr::select(x, {{ group_by }}) |> - slice(1)) + groups <- lapply(data, function(x) dplyr::select(x, {{ group_by }}) %>% + dplyr::slice(1)) + n <- lapply(data, function(x) dplyr::select(x, {{ group_by }}) %>% + dplyr::count()) results <- purrr::pmap( .l = list(synthetic_data, data, groups), @@ -56,18 +58,20 @@ util_corr_fit <- function(postsynth, ) metrics <- dplyr::bind_cols( - correlation_fit = map_dbl(results, "correlation_fit"), - correlation_difference_mae = map_dbl(results, "correlation_difference_mae"), - correlation_difference_rmse = map_dbl(results, "correlation_difference_rmse"), - bind_rows(groups) + n = unlist(n), + correlation_fit = purrr::map_dbl(results, "correlation_fit"), + correlation_difference_mae = purrr::map_dbl(results, "correlation_difference_mae"), + correlation_difference_rmse =purrr:: map_dbl(results, "correlation_difference_rmse"), + dplyr::bind_rows(groups) ) - corr_data <- dplyr::bind_rows(map_dfr(results, "correlation_data")) + corr_data <- dplyr::bind_rows(purrr::map_dfr(results, "correlation_data")) return(list( - corr_data, - metrics - )) + correlation_data = corr_data, + metrics = metrics + ) + ) } get_correlations <- function(synthetic_data, diff --git a/tests/testthat/test-util_corr_fit.R b/tests/testthat/test-util_corr_fit.R index 8d254ee..86ec3a4 100644 --- a/tests/testthat/test-util_corr_fit.R +++ b/tests/testthat/test-util_corr_fit.R @@ -1,3 +1,5 @@ +library(tidyverse) + # confidential data df <- data.frame(a = c(1, 2, 3, 4, 5), b = c(1, 2, 3, 4, 5), @@ -17,15 +19,20 @@ test_that("util_corr_fit is correct with postsynth ", { corr <- util_corr_fit(postsynth = syn, data = df) corr_data <- corr$correlation_data + metrics <- corr$metrics # check dimensions expect_equal(ncol(corr_data), 6) expect_equal(nrow(corr_data), 3) + expect_equal(ncol(metrics), 4) + expect_equal(nrow(metrics), 1) expect_equal(corr_data$difference, c(2, 0, 2)) - expect_equal(corr$correlation_fit, sqrt(sum(c(0, -2, -2) ^ 2)) / 3) - expect_equal(corr$correlation_difference_mae, mean(abs(c(0, -2, -2)))) - expect_equal(corr$correlation_difference_rmse, sqrt(mean(c(0, -2, -2) ^ 2))) + expect_equal(metrics$correlation_fit, sqrt(sum(c(0, -2, -2) ^ 2)) / 3) + expect_equal(metrics$correlation_difference_mae, mean(abs(c(0, -2, -2)))) + expect_equal(metrics$correlation_difference_rmse, sqrt(mean(c(0, -2, -2) ^ 2))) + # throws an error, may be due to how the n column was added to the data + expect_equal(metrics$n, 5) }) @@ -36,21 +43,23 @@ test_that("util_corr_fit is correct with postsynth ", { b = c(5, 4, 3, 2, 1), c = c(1, 2, 3, 4, 5), d = c("red", "red", "yellow", "yellow", "yellow"), - e = c("blue", "blue", "green", "green", "green"), - RECID = c("a", "b", "c", "d", "e")) + e = c("blue", "blue", "green", "green", "green")) corr <- util_corr_fit(postsynth = syn_data, data = df) corr_data <- corr$correlation_data + metrics <- corr$metrics # check dimensions expect_equal(ncol(corr_data), 6) expect_equal(nrow(corr_data), 3) + expect_equal(ncol(metrics), 4) + 
expect_equal(nrow(metrics), 1) expect_equal(corr_data$difference, c(2, 0, 2)) - expect_equal(corr$correlation_fit, sqrt(sum(c(0, -2, -2) ^ 2)) / 3) - expect_equal(corr$correlation_difference_mae, mean(abs(c(0, -2, -2)))) - expect_equal(corr$correlation_difference_rmse, sqrt(mean(c(0, -2, -2) ^ 2))) - + expect_equal(metrics$correlation_fit, sqrt(sum(c(0, -2, -2) ^ 2)) / 3) + expect_equal(metrics$correlation_difference_mae, mean(abs(c(0, -2, -2)))) + expect_equal(metrics$correlation_difference_rmse, sqrt(mean(c(0, -2, -2) ^ 2))) + expect_equal(metrics$n, 5) }) @@ -71,16 +80,20 @@ test_that("util_corr_fit is correct when groupped by one variable ", { structure(class = "postsynth") corr <- util_corr_fit(postsynth = syn, data = df, group_by = d) - corr_data <- corr$correlation_data + corr_data <- corr$correlation_data + metrics <- corr$metrics # check dimensions expect_equal(ncol(corr_data), 7) expect_equal(nrow(corr_data), 6) + expect_equal(ncol(metrics), 5) + expect_equal(nrow(metrics), 2) expect_equal(corr_data$difference, c(2, 0, 2, 2, 0, 2)) - expect_equal(corr$correlation_fit, rep(sqrt(sum(c(0, -2, -2) ^ 2)) / 3, 2)) - expect_equal(corr$correlation_difference_mae, rep(mean(abs(c(0, -2, -2))), 2)) - expect_equal(corr$correlation_difference_rmse, rep(sqrt(mean(c(0, -2, -2) ^ 2)), 2)) + expect_equal(metrics$correlation_fit, rep(sqrt(sum(c(0, -2, -2) ^ 2)) / 3, 2)) + expect_equal(metrics$correlation_difference_mae, rep(mean(abs(c(0, -2, -2))), 2)) + expect_equal(metrics$correlation_difference_rmse, rep(sqrt(mean(c(0, -2, -2) ^ 2)), 2)) + expect_equal(metrics$n, c(2, 3)) }) @@ -89,15 +102,19 @@ test_that("util_corr_fit is correct when groupped by more than one variable ", { corr <- util_corr_fit(postsynth = syn, data = df, group_by = c(d, e)) corr_data <- corr$correlation_data + metrics <- corr$metrics # check dimensions expect_equal(ncol(corr_data), 8) expect_equal(nrow(corr_data), 6) + expect_equal(ncol(metrics), 6) + expect_equal(nrow(metrics), 2) expect_equal(corr_data$difference, c(2, 0, 2, 2, 0, 2)) - expect_equal(corr$correlation_fit, rep(sqrt(sum(c(0, -2, -2) ^ 2)) / 3, 2)) - expect_equal(corr$correlation_difference_mae, rep(mean(abs(c(0, -2, -2))), 2)) - expect_equal(corr$correlation_difference_rmse, rep(sqrt(mean(c(0, -2, -2) ^ 2)), 2)) + expect_equal(metrics$correlation_fit, rep(sqrt(sum(c(0, -2, -2) ^ 2)) / 3, 2)) + expect_equal(metrics$correlation_difference_mae, rep(mean(abs(c(0, -2, -2))), 2)) + expect_equal(metrics$correlation_difference_rmse, rep(sqrt(mean(c(0, -2, -2) ^ 2)), 2)) + expect_equal(metrics$n, c(2, 3)) }) @@ -119,15 +136,23 @@ test_that("util_corr_fit returns NAs when only one observation is in a grouping RECID = c("a", "b", "c", "d", "e")) corr <- util_corr_fit(postsynth = syn, data = df, group_by = c(d, e)) - corr_data <- corr$correlation_data + corr_data <- corr$correlation_data + metrics <- corr$metrics - expect_equal(corr_data$difference, c(2, 0, 2, 2, 0, 2, NA, NA, NA)) - expect_equal(corr$correlation_fit, c(rep(sqrt(sum(c(0, -2, -2) ^ 2)) / 3, 2), NaN)) - expect_equal(corr$correlation_difference_mae, c(rep(mean(abs(c(0, -2, -2))), 2), NA)) - expect_equal(corr$correlation_difference_rmse, c(rep(sqrt(mean(c(0, -2, -2) ^ 2)), 2), NA)) + expect_equal(ncol(corr_data), 8) + expect_equal(nrow(corr_data), 9) + expect_equal(ncol(metrics), 6) + expect_equal(nrow(metrics), 3) + + expect_equal(corr_data$difference, c(2, 0, 2, NA, NA, NA, 2, 0, 2)) + expect_equal(metrics$correlation_fit, c(sqrt(sum(c(0, -2, -2) ^ 2)) / 3, NaN, sqrt(sum(c(0, -2, -2) ^ 2)) / 3)) + 
expect_equal(metrics$correlation_difference_mae, c(mean(abs(c(0, -2, -2))), NA, mean(abs(c(0, -2, -2))))) + expect_equal(metrics$correlation_difference_rmse, c(sqrt(mean(c(0, -2, -2) ^ 2)), NA, sqrt(mean(c(0, -2, -2) ^ 2)))) + expect_equal(metrics$n, c(2, 1, 2)) }) +# NOTE: These tests will fail test_that("util_corr_fit returns NAs when a grouping in the actual data is not in the synthetic data", {