diff --git a/R/util_corr_fit.R b/R/util_corr_fit.R index 5d0232a..2161387 100644 --- a/R/util_corr_fit.R +++ b/R/util_corr_fit.R @@ -2,21 +2,27 @@ #' #' @param postsynth A postsynth object from tidysynthesis or a tibble #' @param data an original (observed) data set. +#' @param group_by The unquoted name of a (or multiple) grouping variable(s) #' #' @return A `list` of fit metrics: -#' - `correlation_original`: correlation matrix of the original data. -#' - `correlation_synthetic`: correlation matrix of the synthetic data. -#' - `correlation_difference`: difference between `correlation_synthetic` and -#' `correlation_original`. -#' - `correlation_fit`: square root of the sum of squared differences between -#' `correlation_synthetic` and `correlation_original`, divided by the number of -#' cells in the correlation matrix. +#' - `correlation_data`: A `tibble` of the correlations among the +#' numeric variables for the actual and synthetic data +#' - `metrics`: Correlation metrics including the correlation fit, square root +#' of the sum of squared differences between the synthetic and original data, +#' divided by the number of cells in the correlation matrix, mean squared +#' error, the mean of the absolute correlation differences between the actual +#' and synthetic data, and the root mean squared error, the root mean of the +#' squared correlation differences between the actual and synthetic data #' #' @family utility functions #' #' @export -util_corr_fit <- function(postsynth, data) { +util_corr_fit <- function(postsynth, + data, + group_by = NULL) { + + if (is_postsynth(postsynth)) { @@ -25,15 +31,52 @@ util_corr_fit <- function(postsynth, data) { } else { synthetic_data <- postsynth - } - synthetic_data <- dplyr::select_if(synthetic_data, is.numeric) - data <- dplyr::select_if(data, is.numeric) - - # reorder data names + # reorder data names (this appears to check if the variables are the same) data <- dplyr::select(data, names(synthetic_data)) + #TODO: Need to update code so if the synthetic data and actual data do not have the same groupings, the function still runs correctly. + # I think we could add all of the groupings to each dataset, the run the rest of the code. + + synthetic_data <- dplyr::select(synthetic_data, dplyr::where(is.numeric), {{ group_by }}) %>% + dplyr::arrange(dplyr::across({{ group_by }})) %>% + dplyr::group_split(dplyr::across({{ group_by }})) + + data <- dplyr::select(data, dplyr::where(is.numeric), {{ group_by }}) %>% + dplyr::arrange(dplyr::across({{ group_by }})) %>% + dplyr::group_split(dplyr::across({{ group_by }})) + + groups <- lapply(data, function(x) dplyr::select(x, {{ group_by }}) %>% + dplyr::slice(1)) + n <- lapply(data, function(x) dplyr::select(x, {{ group_by }}) %>% + dplyr::count()) + + results <- purrr::pmap( + .l = list(synthetic_data, data, groups), + .f = get_correlations + ) + + metrics <- dplyr::bind_cols( + n = unlist(n), + correlation_fit = purrr::map_dbl(results, "correlation_fit"), + correlation_difference_mae = purrr::map_dbl(results, "correlation_difference_mae"), + correlation_difference_rmse =purrr:: map_dbl(results, "correlation_difference_rmse"), + dplyr::bind_rows(groups) + ) + + corr_data <- dplyr::bind_rows(purrr::map_dfr(results, "correlation_data")) + + return(list( + correlation_data = corr_data, + metrics = metrics + ) + ) +} + +get_correlations <- function(synthetic_data, + data, + groups) { # helper function to find a correlation matrix with the upper tri set to zeros lower_triangle <- function(x) { @@ -43,37 +86,55 @@ util_corr_fit <- function(postsynth, data) { dplyr::select_if(is.numeric) %>% stats::cor() - # set the values in the upper triangle to zero to avoid double counting + # set NA values in the lower triangle to "", set the values in the upper triangle to zero to avoid double counting + correlation_matrix[is.na(correlation_matrix[lower.tri(correlation_matrix, diag = FALSE)])] <- "" correlation_matrix[upper.tri(correlation_matrix, diag = TRUE)] <- NA return(correlation_matrix) } - # find the lower triangle of the original data linear correlation matrix - original_lt <- lower_triangle(data) - # find the lower triangle of the synthetic data linear correlation matrix - synthetic_lt <- lower_triangle(synthetic_data) + # find the lower triangle of the linear correlation matrices and add a var column + original_lt <- data.frame(lower_triangle(data)) + original_lt$var2 <- colnames(original_lt) - # compare names - if (any(rownames(original_lt) != rownames(synthetic_lt))) { - stop("ERROR: rownames are not identical") - } + synthetic_lt <- data.frame(lower_triangle(synthetic_data)) + synthetic_lt$var2 <- colnames(synthetic_lt) - if (any(colnames(original_lt) != colnames(synthetic_lt))) { - stop("ERROR: colnames are not identical") - } + # restructure the correlation matrix so the cols are var1, var2, original/synthetic + original_lt <- original_lt %>% + tidyr::pivot_longer(cols = !var2, names_to = "var1", values_to = "original") %>% + dplyr::filter(!is.na(original)) %>% + dplyr::arrange(var1) %>% + dplyr::select(var1, var2, original) %>% + dplyr::mutate(original = dplyr::case_when(.data$original == "" ~ NA, + .default = .data$original)) + + synthetic_lt <- synthetic_lt %>% + tidyr::pivot_longer(cols = !var2, names_to = "var1", values_to = "synthetic") %>% + dplyr::filter(!is.na(synthetic)) %>% + dplyr::arrange(var1) %>% + dplyr::select(var1, var2, synthetic) %>% + dplyr::mutate(synthetic = dplyr::case_when(.data$synthetic == "" ~ NA, + .default = .data$synthetic)) + + # combine the data and find the difference between the original and synthetic correlations + correlation_data <- original_lt %>% + dplyr::left_join(synthetic_lt, by = c("var1","var2")) %>% + dplyr::mutate(original = as.numeric(.data$original), + synthetic = as.numeric(.data$synthetic), + difference = .data$original - .data$synthetic, + proportion_difference = .data$difference / .data$original) - # find the difference between the matrices - difference_lt <- synthetic_lt - original_lt + correlation_data <- bind_cols(correlation_data, groups) - # find the length of the nonzero values in the matrices - n <- choose(ncol(difference_lt), 2) + # find the number of values in the lower triangle + n <- nrow(dplyr::filter(correlation_data, !is.na(difference))) # calculate the correlation fit and divide by n - correlation_fit <- sqrt(sum(difference_lt ^ 2, na.rm = TRUE)) / n + correlation_fit <- sqrt(sum(correlation_data$difference ^ 2, na.rm = TRUE)) / n - difference_vec <- as.numeric(difference_lt)[!is.na(difference_lt)] + difference_vec <- as.numeric(correlation_data$difference) # mean absolute error correlation_difference_mae <- difference_vec %>% @@ -81,20 +142,19 @@ util_corr_fit <- function(postsynth, data) { mean() # root mean square error - correlation_difference_rmse <- - difference_vec ^ 2%>% + correlation_difference_rmse <- difference_vec ^ 2 %>% mean() %>% sqrt() + return( list( - correlation_original = original_lt, - correlation_synthetic = synthetic_lt, - correlation_difference = difference_lt, + correlation_data = correlation_data, correlation_fit = correlation_fit, correlation_difference_mae = correlation_difference_mae, correlation_difference_rmse = correlation_difference_rmse ) ) -} \ No newline at end of file +} + diff --git a/man/util_corr_fit.Rd b/man/util_corr_fit.Rd index 860a090..721e8a4 100644 --- a/man/util_corr_fit.Rd +++ b/man/util_corr_fit.Rd @@ -4,23 +4,27 @@ \alias{util_corr_fit} \title{Calculate the correlation fit metric of a confidential data set.} \usage{ -util_corr_fit(postsynth, data) +util_corr_fit(postsynth, data, group_by = NULL) } \arguments{ \item{postsynth}{A postsynth object from tidysynthesis or a tibble} \item{data}{an original (observed) data set.} + +\item{group_by}{The unquoted name of a (or multiple) grouping variable(s)} } \value{ A \code{list} of fit metrics: \itemize{ -\item \code{correlation_original}: correlation matrix of the original data. -\item \code{correlation_synthetic}: correlation matrix of the synthetic data. -\item \code{correlation_difference}: difference between \code{correlation_synthetic} and -\code{correlation_original}. +\item \code{correlation_data}: A \code{tibble} of the correlations among the +numeric variables for the actual and synthetic data \item \code{correlation_fit}: square root of the sum of squared differences between -\code{correlation_synthetic} and \code{correlation_original}, divided by the number of +the synthetic and original data, divided by the number of cells in the correlation matrix. +\item \code{correlation_difference_mae}: the mean of the absolute correlation +differences between the actual and synthetic data +\item \code{correlation_difference_rmse}: the root mean of the squared correlation +differences between the actual and synthetic data } } \description{ diff --git a/tests/testthat/test-util_corr_fit.R b/tests/testthat/test-util_corr_fit.R index 4ff40c8..86ec3a4 100644 --- a/tests/testthat/test-util_corr_fit.R +++ b/tests/testthat/test-util_corr_fit.R @@ -1,50 +1,213 @@ +library(tidyverse) + # confidential data -df <- data.frame(a = c(1, 2, 3), - b = c(1, 2, 3), - c = c(1, 2, 3), - RECID = c("a", "b", "c")) - -# difference matrix for tests -diff_matrix <- matrix( - c(NA, NA, NA, - -2, NA, NA, - 0, -2, NA), - ncol = 3, - byrow = TRUE -) - -rownames(diff_matrix) <- c("a", "c", "b") -colnames(diff_matrix) <- c("a", "c", "b") +df <- data.frame(a = c(1, 2, 3, 4, 5), + b = c(1, 2, 3, 4, 5), + c = c(1, 2, 3, 4, 5), + d = c("red", "red", "yellow", "yellow", "yellow"), + e = c("blue", "blue", "green", "green", "green")) + +syn <- list(synthetic_data = data.frame(a = c(1, 2, 3, 4, 5), + b = c(5, 4, 3, 2, 1), + c = c(1, 2, 3, 4, 5), + d = c("red", "red", "yellow", "yellow", "yellow"), + e = c("blue", "blue", "green", "green", "green"))) %>% + structure(class = "postsynth") # test with postsynth test_that("util_corr_fit is correct with postsynth ", { - syn <- list(synthetic_data = data.frame(a = c(1, 2, 3), - c = c(3, 2, 1), - b = c(1, 2, 3), - RECID = c("a", "b", "c"))) %>% - structure(class = "postsynth") - corr <- util_corr_fit(postsynth = syn, data = df) + corr_data <- corr$correlation_data + metrics <- corr$metrics + + # check dimensions + expect_equal(ncol(corr_data), 6) + expect_equal(nrow(corr_data), 3) + expect_equal(ncol(metrics), 4) + expect_equal(nrow(metrics), 1) + + expect_equal(corr_data$difference, c(2, 0, 2)) + expect_equal(metrics$correlation_fit, sqrt(sum(c(0, -2, -2) ^ 2)) / 3) + expect_equal(metrics$correlation_difference_mae, mean(abs(c(0, -2, -2)))) + expect_equal(metrics$correlation_difference_rmse, sqrt(mean(c(0, -2, -2) ^ 2))) + # throws an error, may be due to how the n column was added to the data + expect_equal(metrics$n, 5) - expect_equal(corr$correlation_difference, diff_matrix) - expect_equal(corr$correlation_fit, sqrt(sum(c(0, -2, -2) ^ 2)) / 3) - expect_equal(corr$correlation_difference_mae, mean(abs(c(0, -2, -2)))) - expect_equal(corr$correlation_difference_rmse, sqrt(mean(c(0, -2, -2) ^ 2))) }) # test with data test_that("util_corr_fit is correct with postsynth ", { - syn <- data.frame(a = c(1, 2, 3), - c = c(3, 2, 1), - b = c(1, 2, 3), - RECID = c("a", "b", "c")) + syn_data <- data.frame(a = c(1, 2, 3, 4, 5), + b = c(5, 4, 3, 2, 1), + c = c(1, 2, 3, 4, 5), + d = c("red", "red", "yellow", "yellow", "yellow"), + e = c("blue", "blue", "green", "green", "green")) - corr <- util_corr_fit(postsynth = syn, data = df) + corr <- util_corr_fit(postsynth = syn_data, data = df) + corr_data <- corr$correlation_data + metrics <- corr$metrics + + # check dimensions + expect_equal(ncol(corr_data), 6) + expect_equal(nrow(corr_data), 3) + expect_equal(ncol(metrics), 4) + expect_equal(nrow(metrics), 1) + + expect_equal(corr_data$difference, c(2, 0, 2)) + expect_equal(metrics$correlation_fit, sqrt(sum(c(0, -2, -2) ^ 2)) / 3) + expect_equal(metrics$correlation_difference_mae, mean(abs(c(0, -2, -2)))) + expect_equal(metrics$correlation_difference_rmse, sqrt(mean(c(0, -2, -2) ^ 2))) + expect_equal(metrics$n, 5) +}) + + + +test_that("util_corr_fit is correct when groupped by one variable ", { + + df <- data.frame(a = c(1, 2, 3, 4, 5), + b = c(1, 2, 3, 4, 5), + c = c(1, 2, 3, 4, 5), + d = c("red", "red", "yellow", "yellow", "yellow"), + e = c("blue", "blue", "green", "green", "green")) + + syn <- list(synthetic_data = data.frame(a = c(1, 2, 3, 4, 5), + b = c(5, 4, 3, 2, 1), + c = c(1, 2, 3, 4, 5), + d = c("red", "red", "yellow", "yellow", "yellow"), + e = c("blue", "blue", "green", "green", "green"))) %>% + structure(class = "postsynth") + + corr <- util_corr_fit(postsynth = syn, data = df, group_by = d) + corr_data <- corr$correlation_data + metrics <- corr$metrics + + # check dimensions + expect_equal(ncol(corr_data), 7) + expect_equal(nrow(corr_data), 6) + expect_equal(ncol(metrics), 5) + expect_equal(nrow(metrics), 2) + + expect_equal(corr_data$difference, c(2, 0, 2, 2, 0, 2)) + expect_equal(metrics$correlation_fit, rep(sqrt(sum(c(0, -2, -2) ^ 2)) / 3, 2)) + expect_equal(metrics$correlation_difference_mae, rep(mean(abs(c(0, -2, -2))), 2)) + expect_equal(metrics$correlation_difference_rmse, rep(sqrt(mean(c(0, -2, -2) ^ 2)), 2)) + expect_equal(metrics$n, c(2, 3)) + +}) + + +test_that("util_corr_fit is correct when groupped by more than one variable ", { + + corr <- util_corr_fit(postsynth = syn, data = df, group_by = c(d, e)) + corr_data <- corr$correlation_data + metrics <- corr$metrics + + # check dimensions + expect_equal(ncol(corr_data), 8) + expect_equal(nrow(corr_data), 6) + expect_equal(ncol(metrics), 6) + expect_equal(nrow(metrics), 2) + + expect_equal(corr_data$difference, c(2, 0, 2, 2, 0, 2)) + expect_equal(metrics$correlation_fit, rep(sqrt(sum(c(0, -2, -2) ^ 2)) / 3, 2)) + expect_equal(metrics$correlation_difference_mae, rep(mean(abs(c(0, -2, -2))), 2)) + expect_equal(metrics$correlation_difference_rmse, rep(sqrt(mean(c(0, -2, -2) ^ 2)), 2)) + expect_equal(metrics$n, c(2, 3)) + +}) - expect_equal(corr$correlation_difference, diff_matrix) - expect_equal(corr$correlation_fit, sqrt(sum(c(0, -2, -2) ^ 2)) / 3) - expect_equal(corr$correlation_difference_mae, mean(abs(c(0, -2, -2)))) - expect_equal(corr$correlation_difference_rmse, sqrt(mean(c(0, -2, -2) ^ 2))) + +test_that("util_corr_fit returns NAs when only one observation is in a grouping ", { + + df <- data.frame(a = c(1, 2, 3, 4, 5), + b = c(1, 2, 3, 4, 5), + c = c(1, 2, 3, 4, 5), + d = c("red", "red", "yellow", "yellow", "red"), + e = c("blue", "blue", "green", "green", "green"), + RECID = c("a", "b", "c", "d", "e")) + + syn <- data.frame(a = c(1, 2, 3, 4, 5), + b = c(5, 4, 3, 2, 1), + c = c(1, 2, 3, 4, 5), + d = c("red", "red", "yellow", "yellow", "red"), + e = c("blue", "blue", "green", "green", "green"), + RECID = c("a", "b", "c", "d", "e")) + + corr <- util_corr_fit(postsynth = syn, data = df, group_by = c(d, e)) + corr_data <- corr$correlation_data + metrics <- corr$metrics + + expect_equal(ncol(corr_data), 8) + expect_equal(nrow(corr_data), 9) + expect_equal(ncol(metrics), 6) + expect_equal(nrow(metrics), 3) + + expect_equal(corr_data$difference, c(2, 0, 2, NA, NA, NA, 2, 0, 2)) + expect_equal(metrics$correlation_fit, c(sqrt(sum(c(0, -2, -2) ^ 2)) / 3, NaN, sqrt(sum(c(0, -2, -2) ^ 2)) / 3)) + expect_equal(metrics$correlation_difference_mae, c(mean(abs(c(0, -2, -2))), NA, mean(abs(c(0, -2, -2))))) + expect_equal(metrics$correlation_difference_rmse, c(sqrt(mean(c(0, -2, -2) ^ 2)), NA, sqrt(mean(c(0, -2, -2) ^ 2)))) + expect_equal(metrics$n, c(2, 1, 2)) + +}) + +# NOTE: These tests will fail + +test_that("util_corr_fit returns NAs when a grouping in the actual data is not in the synthetic data", { + + df <- data.frame(a = c(1, 2, 3, 4, 5), + b = c(1, 2, 3, 4, 5), + c = c(1, 2, 3, 4, 5), + d = c("red", "red", "yellow", "yellow", "red"), + e = c("blue", "blue", "green", "green", "green"), + RECID = c("a", "b", "c", "d", "e")) + + syn <- data.frame(a = c(1, 2, 3, 4, 5), + b = c(5, 4, 3, 2, 1), + c = c(1, 2, 3, 4, 5), + d = c("red", "red", "yellow", "yellow", "yellow"), + e = c("blue", "blue", "green", "green", "green"), + RECID = c("a", "b", "c", "d", "e")) + + corr <- util_corr_fit(postsynth = syn, data = df, group_by = c(d, e)) + corr_data <- corr$correlation_data + + expect_equal(corr_data$difference, c(2, 0, 2, 2, 0, 2, NA, NA, NA)) + expect_equal(corr$correlation_fit, c(rep(sqrt(sum(c(0, -2, -2) ^ 2)) / 3, 2), NaN)) + expect_equal(corr$correlation_difference_mae, c(rep(mean(abs(c(0, -2, -2))), 2), NA)) + expect_equal(corr$correlation_difference_rmse, c(rep(sqrt(mean(c(0, -2, -2) ^ 2)), 2), NA)) + +}) + + +test_that("util_corr_fit returns NAs when a grouping in the synthetic data is not in the actual data", { + + df <- data.frame(a = c(1, 2, 3, 4, 5), + b = c(1, 2, 3, 4, 5), + c = c(1, 2, 3, 4, 5), + d = c("red", "red", "yellow", "yellow", "yellow"), + e = c("blue", "blue", "green", "green", "green"), + RECID = c("a", "b", "c", "d", "e")) + + syn <- data.frame(a = c(1, 2, 3, 4, 5), + b = c(5, 4, 3, 2, 1), + c = c(1, 2, 3, 4, 5), + d = c("red", "red", "yellow", "yellow", "red"), + e = c("blue", "blue", "green", "green", "green"), + RECID = c("a", "b", "c", "d", "e")) + + corr <- util_corr_fit(postsynth = syn, data = df, group_by = c(d, e)) + corr_data <- corr$correlation_data + + expect_equal(corr_data$difference, c(2, 0, 2, 2, 0, 2, NA, NA, NA)) + expect_equal(corr$correlation_fit, c(rep(sqrt(sum(c(0, -2, -2) ^ 2)) / 3, 2), NaN)) + expect_equal(corr$correlation_difference_mae, c(rep(mean(abs(c(0, -2, -2))), 2), NA)) + expect_equal(corr$correlation_difference_rmse, c(rep(sqrt(mean(c(0, -2, -2) ^ 2)), 2), NA)) + }) + + + + +