From 365116522decc4814218fde7c9497bd59b1b32f2 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Mon, 5 Aug 2024 14:57:34 +0000 Subject: [PATCH 01/16] Add Ingest Script --- analyses/Ingest_script.qmd | 74 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 analyses/Ingest_script.qmd diff --git a/analyses/Ingest_script.qmd b/analyses/Ingest_script.qmd new file mode 100644 index 00000000..ad9108b4 --- /dev/null +++ b/analyses/Ingest_script.qmd @@ -0,0 +1,74 @@ +model_fetch_run_subset <- function(run_id, year, paths, append_run_id = FALSE) { + tictoc::tic(paste0("Fetched run: ", run_id)) + + s3_objs <- grep("s3://", unlist(analyses_paths$output), value = TRUE) + bucket <- strsplit(s3_objs[1], "/")[[1]][3] + + data_list <- list() + + for (analyses_path in analyses_paths$output) { + is_directory <- endsWith(analyses_path$s3, "/") + if (is_directory) { + partitioned_by_run <- endsWith(analyses_path$s3, + paste0("run_id=", run_id, "/")) + if (partitioned_by_run) { + dir_path <- analyses_path$s3 + } else { + dir_path <- paste0(analyses_path$s3, "year=", + year, "/run_id=", run_id, "/") + } + + message("Now fetching: ", dir_path) + objs_prefix <- sub(paste0("s3://", bucket, "/"), "", dir_path) + objs <- aws.s3::get_bucket_df(bucket, objs_prefix) + objs <- dplyr::filter(objs, Size > 0) + + if (nrow(objs) > 0) { + combined_data <- NULL + for (key in objs$Key) { + message("Now fetching: ", key) + local_temp_path <- file.path(tempdir(), basename(key)) + aws.s3::save_object(key, bucket = bucket, file = local_temp_path) + + # Read the Parquet file and append it to combined_data + temp_data <- arrow::read_parquet(local_temp_path) + if (is.null(combined_data)) { + combined_data <- temp_data + } else { + combined_data <- dplyr::bind_rows(combined_data, temp_data) + } + } + + output_key <- analyses_path$key + if (append_run_id) { + output_key <- paste0(output_key, "_", run_id) + } + + data_list[[output_key]] <- combined_data + } else { + warning(analyses_path$key, " does not exist for this run") + } + } else { + message("Now fetching: ", analyses_path$s3) + if (aws.s3::object_exists(analyses_path$s3, bucket = bucket)) { + local_temp_path <- file.path(tempdir(), basename(analyses_path$s3)) + aws.s3::save_object(analyses_path$s3, bucket = bucket, + file = local_temp_path) + + temp_data <- arrow::read_parquet(local_temp_path) + + output_key <- analyses_path$key + if (append_run_id) { + output_key <- paste0(output_key, "_", run_id) + } + + data_list[[output_key]] <- temp_data + } else { + warning(analyses_path$key, " does not exist for this run") + } + } + } + + tictoc::toc() + return(data_list) +} From 261947454a4e91db586fe506c20580a78f2b3603 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Mon, 5 Aug 2024 15:19:10 +0000 Subject: [PATCH 02/16] Add data ingest and transformation --- Data_ingest_and_transformation.R | 199 +++++++++++++++++++++++++++++++ 1 file changed, 199 insertions(+) create mode 100644 Data_ingest_and_transformation.R diff --git a/Data_ingest_and_transformation.R b/Data_ingest_and_transformation.R new file mode 100644 index 00000000..60359511 --- /dev/null +++ b/Data_ingest_and_transformation.R @@ -0,0 +1,199 @@ +base_paths <- model_file_dict(params$run_id, params$run_id_year) +comparison_paths <- model_file_dict( + params$comparison_run_id, + params$comparison_run_id_year +) +run_id <- params$run_id +comparison_run_id <- params$comparison_run_id + + +analyses_paths <- list( + output = list( + list( + s3 = base_paths$output$assessment_card$s3, + key = "assessment_card" + ), + list( + s3 = base_paths$output$assessment_pin$s3, + key = "assessment_pin" + ), + list( + s3 = base_paths$output$metadata$s3, + key = "metadata" + ), + list( + s3 = base_paths$output$performance_test$s3, + key = "performance_test" + ), + list( + s3 = base_paths$output$shap$s3, + key = "shap" + ) + ) +) + +source("analyses/Ingest_script.qmd") + +data_new <- model_fetch_run_subset( + params$run_id, + params$run_id_year, analyses_paths, TRUE +) + +list2env(data_new, envir = .GlobalEnv) + +rm(data_new) + +comparison_paths <- list( + output = list( + list( + s3 = base_paths$output$assessment_card$s3, + key = "assessment_card" + ), + list( + s3 = base_paths$output$assessment_pin$s3, + key = "assessment_pin" + ), + list( + s3 = base_paths$output$metadata$s3, + key = "metadata" + ), + list( + s3 = base_paths$output$performance_test$s3, + key = "performance_test" + ), + list( + s3 = base_paths$output$shap$s3, + key = "shap" + ) + ) +) + +data_comparison <- model_fetch_run_subset( + params$comparison_run_id, + params$comparison_run_id_year, + comparison_paths, TRUE +) + +list2env(data_comparison, envir = .GlobalEnv) + +rm(data_comparison) + +all_vars <- ls() + +# Loop through the variables and rename those that match the patterns +for (var_name in all_vars) { + # Check if the variable is a dataframe and ends with _run_id + if (exists(var_name) && is.data.frame(get(var_name)) && + grepl(paste0("_", params$run_id, "$"), var_name)) { + new_name <- sub(paste0("_", params$run_id, "$"), "_new", var_name) + assign(new_name, get(var_name), envir = .GlobalEnv) + rm(list = var_name, envir = .GlobalEnv) + } + + # Check if the variable is a dataframe and ends with _comp_run_id + if (exists(var_name) && is.data.frame(get(var_name)) && + grepl(paste0("_", params$comp_run_id, "$"), var_name)) { + new_name <- sub(paste0("_", params$comp_run_id, "$"), "_comparison", var_name) + assign(new_name, get(var_name), envir = .GlobalEnv) + rm(list = var_name, envir = .GlobalEnv) + } +} + +target_feature_value <- params$added_feature +target_feature_shap <- params$added_feature_shap +nbhd <- ccao::nbhd_shp + + + +# Selecting and joining relevant data +card_individual <- shap_new %>% + select( + meta_pin, meta_card_num, pred_card_shap_baseline_fmv, + {{ target_feature_value }} + ) %>% + rename(!!target_feature_shap := !!target_feature_value) %>% + inner_join(assessment_card_new, by = c("meta_pin", "meta_card_num")) %>% + mutate( + relative_shap = + round(!!sym({{ target_feature_shap }}) / pred_card_initial_fmv, 2) + ) + + + +# Summarizing data by neighborhood code +card_nbhd <- card_individual %>% + group_by(meta_nbhd_code) %>% + summarize( + avg_target_feature_shap = + mean(!!sym({{ target_feature_shap }}), na.rm = TRUE), + avg_target_feature_shap_abs = + mean(abs(!!sym({{ target_feature_shap }})), na.rm = TRUE) + ) %>% + ungroup() %>% + inner_join( + nbhd, + by = c("meta_nbhd_code" = "town_nbhd") + ) %>% + st_as_sf() + +pin_individual <- assessment_pin_new %>% + select(meta_pin, pred_pin_final_fmv, pred_pin_initial_fmv) %>% + rename( + pred_pin_final_fmv_new = pred_pin_final_fmv, + pred_pin_initial_fmv_new = pred_pin_initial_fmv + ) %>% + inner_join( + assessment_pin_comparison %>% + select(meta_pin, pred_pin_final_fmv, pred_pin_initial_fmv), + by = "meta_pin" + ) %>% + rename( + pred_pin_final_fmv_comp = pred_pin_final_fmv, + pred_pin_initial_fmv_comp = pred_pin_initial_fmv + ) %>% + mutate( + diff_pred_pin_final_fmv = round((( + pred_pin_final_fmv_new - + pred_pin_final_fmv_comp) / + pred_pin_final_fmv_comp), 4), + pred_pin_final_fmv_new = dollar(pred_pin_final_fmv_new), + pred_pin_final_fmv_comp = dollar(pred_pin_final_fmv_comp), + diff_pred_pin_initial_fmv = round((( + pred_pin_initial_fmv_new - + pred_pin_initial_fmv_comp) / + pred_pin_initial_fmv_comp), 4), + pred_pin_initial_fmv_new = dollar(pred_pin_initial_fmv_new), + pred_pin_initial_fmv_comp = dollar(pred_pin_initial_fmv_comp) + ) %>% + inner_join( + assessment_data %>% + distinct(meta_pin, .keep_all = TRUE) %>% + select( + meta_pin, meta_nbhd_code, loc_longitude, + loc_latitude, meta_township_name, {{ target_feature_value }} + ), + by = "meta_pin" + ) + +pin_nbhd <- pin_individual_new %>% + group_by(meta_nbhd_code) %>% + summarize( + !!paste0({{ target_feature_value }}, "_neighborhood_mean") := + mean(!!sym({{ target_feature_value }}), na.rm = TRUE), + !!paste0({{ target_feature_value }}, "_neighborhood_median") := + median(!!sym({{ target_feature_value }}), na.rm = TRUE), + !!paste0({{ target_feature_value }}, "_neighborhood_90th") := + quantile(!!sym({{ target_feature_value }}), 0.9, na.rm = TRUE) + ) %>% + ungroup() %>% + inner_join( + nbhd, + by = c("meta_nbhd_code" = "town_nbhd") + ) %>% + st_as_sf() + +leaflet_data <- card_individual %>% + select(meta_pin, {{ target_feature_shap }}) %>% + right_join(pin_individual, + by = c("meta_pin" = "meta_pin") + ) From 9cbef722118fe228b28ba0b065227d539e695e28 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Mon, 5 Aug 2024 15:19:31 +0000 Subject: [PATCH 03/16] Renove ingest script --- analyses/Ingest_script.qmd | 74 -------------------------------------- 1 file changed, 74 deletions(-) delete mode 100644 analyses/Ingest_script.qmd diff --git a/analyses/Ingest_script.qmd b/analyses/Ingest_script.qmd deleted file mode 100644 index ad9108b4..00000000 --- a/analyses/Ingest_script.qmd +++ /dev/null @@ -1,74 +0,0 @@ -model_fetch_run_subset <- function(run_id, year, paths, append_run_id = FALSE) { - tictoc::tic(paste0("Fetched run: ", run_id)) - - s3_objs <- grep("s3://", unlist(analyses_paths$output), value = TRUE) - bucket <- strsplit(s3_objs[1], "/")[[1]][3] - - data_list <- list() - - for (analyses_path in analyses_paths$output) { - is_directory <- endsWith(analyses_path$s3, "/") - if (is_directory) { - partitioned_by_run <- endsWith(analyses_path$s3, - paste0("run_id=", run_id, "/")) - if (partitioned_by_run) { - dir_path <- analyses_path$s3 - } else { - dir_path <- paste0(analyses_path$s3, "year=", - year, "/run_id=", run_id, "/") - } - - message("Now fetching: ", dir_path) - objs_prefix <- sub(paste0("s3://", bucket, "/"), "", dir_path) - objs <- aws.s3::get_bucket_df(bucket, objs_prefix) - objs <- dplyr::filter(objs, Size > 0) - - if (nrow(objs) > 0) { - combined_data <- NULL - for (key in objs$Key) { - message("Now fetching: ", key) - local_temp_path <- file.path(tempdir(), basename(key)) - aws.s3::save_object(key, bucket = bucket, file = local_temp_path) - - # Read the Parquet file and append it to combined_data - temp_data <- arrow::read_parquet(local_temp_path) - if (is.null(combined_data)) { - combined_data <- temp_data - } else { - combined_data <- dplyr::bind_rows(combined_data, temp_data) - } - } - - output_key <- analyses_path$key - if (append_run_id) { - output_key <- paste0(output_key, "_", run_id) - } - - data_list[[output_key]] <- combined_data - } else { - warning(analyses_path$key, " does not exist for this run") - } - } else { - message("Now fetching: ", analyses_path$s3) - if (aws.s3::object_exists(analyses_path$s3, bucket = bucket)) { - local_temp_path <- file.path(tempdir(), basename(analyses_path$s3)) - aws.s3::save_object(analyses_path$s3, bucket = bucket, - file = local_temp_path) - - temp_data <- arrow::read_parquet(local_temp_path) - - output_key <- analyses_path$key - if (append_run_id) { - output_key <- paste0(output_key, "_", run_id) - } - - data_list[[output_key]] <- temp_data - } else { - warning(analyses_path$key, " does not exist for this run") - } - } - } - - tictoc::toc() - return(data_list) -} From 882c611701bb7bb2cb3c9630109be90c9be270b5 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Mon, 5 Aug 2024 17:43:07 +0000 Subject: [PATCH 04/16] Rename and relocate file --- .../Data_transformation.R | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename Data_ingest_and_transformation.R => analyses/Data_transformation.R (100%) diff --git a/Data_ingest_and_transformation.R b/analyses/Data_transformation.R similarity index 100% rename from Data_ingest_and_transformation.R rename to analyses/Data_transformation.R From 248f0f73d9293f797345f858371006fc3ab588bb Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Mon, 5 Aug 2024 18:19:14 +0000 Subject: [PATCH 05/16] Add changes to leaflet data --- analyses/Data_transformation.R | 119 ++++----------------------------- 1 file changed, 12 insertions(+), 107 deletions(-) diff --git a/analyses/Data_transformation.R b/analyses/Data_transformation.R index 60359511..7e743858 100644 --- a/analyses/Data_transformation.R +++ b/analyses/Data_transformation.R @@ -1,104 +1,3 @@ -base_paths <- model_file_dict(params$run_id, params$run_id_year) -comparison_paths <- model_file_dict( - params$comparison_run_id, - params$comparison_run_id_year -) -run_id <- params$run_id -comparison_run_id <- params$comparison_run_id - - -analyses_paths <- list( - output = list( - list( - s3 = base_paths$output$assessment_card$s3, - key = "assessment_card" - ), - list( - s3 = base_paths$output$assessment_pin$s3, - key = "assessment_pin" - ), - list( - s3 = base_paths$output$metadata$s3, - key = "metadata" - ), - list( - s3 = base_paths$output$performance_test$s3, - key = "performance_test" - ), - list( - s3 = base_paths$output$shap$s3, - key = "shap" - ) - ) -) - -source("analyses/Ingest_script.qmd") - -data_new <- model_fetch_run_subset( - params$run_id, - params$run_id_year, analyses_paths, TRUE -) - -list2env(data_new, envir = .GlobalEnv) - -rm(data_new) - -comparison_paths <- list( - output = list( - list( - s3 = base_paths$output$assessment_card$s3, - key = "assessment_card" - ), - list( - s3 = base_paths$output$assessment_pin$s3, - key = "assessment_pin" - ), - list( - s3 = base_paths$output$metadata$s3, - key = "metadata" - ), - list( - s3 = base_paths$output$performance_test$s3, - key = "performance_test" - ), - list( - s3 = base_paths$output$shap$s3, - key = "shap" - ) - ) -) - -data_comparison <- model_fetch_run_subset( - params$comparison_run_id, - params$comparison_run_id_year, - comparison_paths, TRUE -) - -list2env(data_comparison, envir = .GlobalEnv) - -rm(data_comparison) - -all_vars <- ls() - -# Loop through the variables and rename those that match the patterns -for (var_name in all_vars) { - # Check if the variable is a dataframe and ends with _run_id - if (exists(var_name) && is.data.frame(get(var_name)) && - grepl(paste0("_", params$run_id, "$"), var_name)) { - new_name <- sub(paste0("_", params$run_id, "$"), "_new", var_name) - assign(new_name, get(var_name), envir = .GlobalEnv) - rm(list = var_name, envir = .GlobalEnv) - } - - # Check if the variable is a dataframe and ends with _comp_run_id - if (exists(var_name) && is.data.frame(get(var_name)) && - grepl(paste0("_", params$comp_run_id, "$"), var_name)) { - new_name <- sub(paste0("_", params$comp_run_id, "$"), "_comparison", var_name) - assign(new_name, get(var_name), envir = .GlobalEnv) - rm(list = var_name, envir = .GlobalEnv) - } -} - target_feature_value <- params$added_feature target_feature_shap <- params$added_feature_shap nbhd <- ccao::nbhd_shp @@ -155,13 +54,13 @@ pin_individual <- assessment_pin_new %>% diff_pred_pin_final_fmv = round((( pred_pin_final_fmv_new - pred_pin_final_fmv_comp) / - pred_pin_final_fmv_comp), 4), + pred_pin_final_fmv_comp), 4), pred_pin_final_fmv_new = dollar(pred_pin_final_fmv_new), pred_pin_final_fmv_comp = dollar(pred_pin_final_fmv_comp), diff_pred_pin_initial_fmv = round((( pred_pin_initial_fmv_new - pred_pin_initial_fmv_comp) / - pred_pin_initial_fmv_comp), 4), + pred_pin_initial_fmv_comp), 4), pred_pin_initial_fmv_new = dollar(pred_pin_initial_fmv_new), pred_pin_initial_fmv_comp = dollar(pred_pin_initial_fmv_comp) ) %>% @@ -175,7 +74,7 @@ pin_individual <- assessment_pin_new %>% by = "meta_pin" ) -pin_nbhd <- pin_individual_new %>% +pin_nbhd <- pin_individual %>% group_by(meta_nbhd_code) %>% summarize( !!paste0({{ target_feature_value }}, "_neighborhood_mean") := @@ -194,6 +93,12 @@ pin_nbhd <- pin_individual_new %>% leaflet_data <- card_individual %>% select(meta_pin, {{ target_feature_shap }}) %>% - right_join(pin_individual, - by = c("meta_pin" = "meta_pin") - ) + right_join(pin_individual, by = c("meta_pin" = "meta_pin")) %>% + group_by(meta_pin) %>% + mutate(variable_index = row_number()) %>% + pivot_wider( + names_from = variable_index, + values_from = {{ target_feature_shap }}, + names_prefix = paste0(deparse(substitute(target_feature_shap)), "_") + ) %>% + ungroup() From 38730fc4ecce30c110d1676065ff4a9b8faddc54 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Mon, 5 Aug 2024 18:46:21 +0000 Subject: [PATCH 06/16] Rename variables --- analyses/Data_transformation.R | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/analyses/Data_transformation.R b/analyses/Data_transformation.R index 7e743858..615ede0f 100644 --- a/analyses/Data_transformation.R +++ b/analyses/Data_transformation.R @@ -4,7 +4,7 @@ nbhd <- ccao::nbhd_shp -# Selecting and joining relevant data +# Create a individual card level dataset card_individual <- shap_new %>% select( meta_pin, meta_card_num, pred_card_shap_baseline_fmv, @@ -23,9 +23,11 @@ card_individual <- shap_new %>% card_nbhd <- card_individual %>% group_by(meta_nbhd_code) %>% summarize( - avg_target_feature_shap = + !!paste0({{ target_feature_shap }}, "_mean") := mean(!!sym({{ target_feature_shap }}), na.rm = TRUE), - avg_target_feature_shap_abs = + !!paste0({{ target_feature_shap }}, "_90th") := + quantile(!!sym({{ target_feature_shap }}), probs = 0.9, na.rm = TRUE), + !!paste0({{ target_feature_shap }}, "_mean_abs") := mean(abs(!!sym({{ target_feature_shap }})), na.rm = TRUE) ) %>% ungroup() %>% @@ -35,6 +37,7 @@ card_nbhd <- card_individual %>% ) %>% st_as_sf() +## Create a pin level dataset pin_individual <- assessment_pin_new %>% select(meta_pin, pred_pin_final_fmv, pred_pin_initial_fmv) %>% rename( @@ -74,6 +77,7 @@ pin_individual <- assessment_pin_new %>% by = "meta_pin" ) +# Aggregate to neighborhood level pin_nbhd <- pin_individual %>% group_by(meta_nbhd_code) %>% summarize( @@ -91,6 +95,7 @@ pin_nbhd <- pin_individual %>% ) %>% st_as_sf() +# Pivot wider for leaflet maps to allow multiple shap values leaflet_data <- card_individual %>% select(meta_pin, {{ target_feature_shap }}) %>% right_join(pin_individual, by = c("meta_pin" = "meta_pin")) %>% From 345bd55ccc9b64097f5ec64ea5c20be1df18a1de Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Mon, 5 Aug 2024 20:25:03 +0000 Subject: [PATCH 07/16] Add relative_shap to leaflet --- analyses/Data_transformation.R | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/analyses/Data_transformation.R b/analyses/Data_transformation.R index 615ede0f..7b99f777 100644 --- a/analyses/Data_transformation.R +++ b/analyses/Data_transformation.R @@ -2,8 +2,6 @@ target_feature_value <- params$added_feature target_feature_shap <- params$added_feature_shap nbhd <- ccao::nbhd_shp - - # Create a individual card level dataset card_individual <- shap_new %>% select( @@ -97,7 +95,7 @@ pin_nbhd <- pin_individual %>% # Pivot wider for leaflet maps to allow multiple shap values leaflet_data <- card_individual %>% - select(meta_pin, {{ target_feature_shap }}) %>% + select(meta_pin, relative_shap, {{ target_feature_shap }}) %>% right_join(pin_individual, by = c("meta_pin" = "meta_pin")) %>% group_by(meta_pin) %>% mutate(variable_index = row_number()) %>% From 66becd38ddcc9bd95098c3790b93b2b11458d7fb Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Mon, 5 Aug 2024 21:55:48 +0000 Subject: [PATCH 08/16] Improve join --- analyses/Data_transformation.R | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/analyses/Data_transformation.R b/analyses/Data_transformation.R index 7b99f777..b4c23cf5 100644 --- a/analyses/Data_transformation.R +++ b/analyses/Data_transformation.R @@ -8,14 +8,17 @@ card_individual <- shap_new %>% meta_pin, meta_card_num, pred_card_shap_baseline_fmv, {{ target_feature_value }} ) %>% - rename(!!target_feature_shap := !!target_feature_value) %>% - inner_join(assessment_card_new, by = c("meta_pin", "meta_card_num")) %>% + rename(!!target_feature_shap := {{ target_feature_value }}) %>% + inner_join( + assessment_card_new %>% + select(meta_pin, meta_card_num, pred_card_initial_fmv), + by = c("meta_pin", "meta_card_num") + ) %>% mutate( - relative_shap = - round(!!sym({{ target_feature_shap }}) / pred_card_initial_fmv, 2) + relative_shap = round({{ target_feature_shap }} / pred_card_initial_fmv, 2) ) - +assessment_card_new$pred_card_initial_fmv # Summarizing data by neighborhood code card_nbhd <- card_individual %>% From 3ed6984af912d3a21aee416d91f5c9404cf4acc1 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Mon, 5 Aug 2024 21:56:27 +0000 Subject: [PATCH 09/16] Remove extra text --- analyses/Data_transformation.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/analyses/Data_transformation.R b/analyses/Data_transformation.R index b4c23cf5..e541d13f 100644 --- a/analyses/Data_transformation.R +++ b/analyses/Data_transformation.R @@ -18,8 +18,6 @@ card_individual <- shap_new %>% relative_shap = round({{ target_feature_shap }} / pred_card_initial_fmv, 2) ) -assessment_card_new$pred_card_initial_fmv - # Summarizing data by neighborhood code card_nbhd <- card_individual %>% group_by(meta_nbhd_code) %>% From 4deb9f53df7aa6e91886f4a292f91642798f5d78 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Tue, 6 Aug 2024 02:45:25 +0000 Subject: [PATCH 10/16] Add relative shap --- analyses/Data_transformation.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/analyses/Data_transformation.R b/analyses/Data_transformation.R index e541d13f..23fce9d4 100644 --- a/analyses/Data_transformation.R +++ b/analyses/Data_transformation.R @@ -8,14 +8,14 @@ card_individual <- shap_new %>% meta_pin, meta_card_num, pred_card_shap_baseline_fmv, {{ target_feature_value }} ) %>% - rename(!!target_feature_shap := {{ target_feature_value }}) %>% + rename(!!sym(target_feature_shap) := !!sym(target_feature_value)) %>% inner_join( assessment_card_new %>% - select(meta_pin, meta_card_num, pred_card_initial_fmv), + select(meta_pin, meta_nbhd_code, meta_card_num, pred_card_initial_fmv, {{ target_feature_value }}), by = c("meta_pin", "meta_card_num") ) %>% mutate( - relative_shap = round({{ target_feature_shap }} / pred_card_initial_fmv, 2) + relative_shap = round(!!sym(target_feature_shap) / pred_card_initial_fmv, 2) ) # Summarizing data by neighborhood code From 90aad667691f8d7381d5f91f50700c9c6d138f24 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Tue, 6 Aug 2024 13:01:05 +0000 Subject: [PATCH 11/16] lintr --- analyses/Data_transformation.R | 47 +++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/analyses/Data_transformation.R b/analyses/Data_transformation.R index 23fce9d4..a613eddc 100644 --- a/analyses/Data_transformation.R +++ b/analyses/Data_transformation.R @@ -11,11 +11,11 @@ card_individual <- shap_new %>% rename(!!sym(target_feature_shap) := !!sym(target_feature_value)) %>% inner_join( assessment_card_new %>% - select(meta_pin, meta_nbhd_code, meta_card_num, pred_card_initial_fmv, {{ target_feature_value }}), + select(meta_pin, meta_nbhd_code, + meta_card_num, + pred_card_initial_fmv, + {{ target_feature_value }}), by = c("meta_pin", "meta_card_num") - ) %>% - mutate( - relative_shap = round(!!sym(target_feature_shap) / pred_card_initial_fmv, 2) ) # Summarizing data by neighborhood code @@ -53,16 +53,14 @@ pin_individual <- assessment_pin_new %>% pred_pin_initial_fmv_comp = pred_pin_initial_fmv ) %>% mutate( - diff_pred_pin_final_fmv = round((( - pred_pin_final_fmv_new - - pred_pin_final_fmv_comp) / - pred_pin_final_fmv_comp), 4), + diff_pred_pin_final_fmv = + round(((pred_pin_final_fmv_new - pred_pin_final_fmv_comp) / + pred_pin_final_fmv_comp), 4), pred_pin_final_fmv_new = dollar(pred_pin_final_fmv_new), pred_pin_final_fmv_comp = dollar(pred_pin_final_fmv_comp), - diff_pred_pin_initial_fmv = round((( - pred_pin_initial_fmv_new - - pred_pin_initial_fmv_comp) / - pred_pin_initial_fmv_comp), 4), + diff_pred_pin_initial_fmv = + round(((pred_pin_initial_fmv_new - pred_pin_initial_fmv_comp) / + pred_pin_initial_fmv_comp), 4), pred_pin_initial_fmv_new = dollar(pred_pin_initial_fmv_new), pred_pin_initial_fmv_comp = dollar(pred_pin_initial_fmv_comp) ) %>% @@ -97,12 +95,25 @@ pin_nbhd <- pin_individual %>% # Pivot wider for leaflet maps to allow multiple shap values leaflet_data <- card_individual %>% select(meta_pin, relative_shap, {{ target_feature_shap }}) %>% - right_join(pin_individual, by = c("meta_pin" = "meta_pin")) %>% group_by(meta_pin) %>% - mutate(variable_index = row_number()) %>% + mutate( + shap_total = sum(!!sym({{ target_feature_shap }})), + variable_index = row_number(), + name_col = paste0(deparse(substitute( + target_feature_shap)), "_", + variable_index) + ) %>% pivot_wider( - names_from = variable_index, - values_from = {{ target_feature_shap }}, - names_prefix = paste0(deparse(substitute(target_feature_shap)), "_") + id_cols = c("meta_pin", "shap_total"), + names_from = name_col, + values_from = !!sym({{ target_feature_shap }}) + ) %>% + ungroup() %>% + right_join(pin_individual, by = c("meta_pin" = "meta_pin")) %>% + mutate( + pred_pin_initial_fmv_new_numeric = + as.numeric(gsub("[$,]", "", pred_pin_initial_fmv_new)), + relative_shap = + round(as.numeric(shap_total) / pred_pin_initial_fmv_new_numeric, 2) ) %>% - ungroup() + distinct(meta_pin, .keep_all = TRUE) From 9cfce310cf4663511db4689b0f42676665778bb3 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Tue, 6 Aug 2024 17:55:19 +0000 Subject: [PATCH 12/16] lintr --- analyses/Data_transformation.R | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/analyses/Data_transformation.R b/analyses/Data_transformation.R index a613eddc..999e0713 100644 --- a/analyses/Data_transformation.R +++ b/analyses/Data_transformation.R @@ -11,10 +11,12 @@ card_individual <- shap_new %>% rename(!!sym(target_feature_shap) := !!sym(target_feature_value)) %>% inner_join( assessment_card_new %>% - select(meta_pin, meta_nbhd_code, - meta_card_num, - pred_card_initial_fmv, - {{ target_feature_value }}), + select( + meta_pin, meta_nbhd_code, + meta_card_num, + pred_card_initial_fmv, + {{ target_feature_value }} + ), by = c("meta_pin", "meta_card_num") ) @@ -55,12 +57,12 @@ pin_individual <- assessment_pin_new %>% mutate( diff_pred_pin_final_fmv = round(((pred_pin_final_fmv_new - pred_pin_final_fmv_comp) / - pred_pin_final_fmv_comp), 4), + pred_pin_final_fmv_comp), 4), pred_pin_final_fmv_new = dollar(pred_pin_final_fmv_new), pred_pin_final_fmv_comp = dollar(pred_pin_final_fmv_comp), diff_pred_pin_initial_fmv = round(((pred_pin_initial_fmv_new - pred_pin_initial_fmv_comp) / - pred_pin_initial_fmv_comp), 4), + pred_pin_initial_fmv_comp), 4), pred_pin_initial_fmv_new = dollar(pred_pin_initial_fmv_new), pred_pin_initial_fmv_comp = dollar(pred_pin_initial_fmv_comp) ) %>% @@ -99,9 +101,12 @@ leaflet_data <- card_individual %>% mutate( shap_total = sum(!!sym({{ target_feature_shap }})), variable_index = row_number(), - name_col = paste0(deparse(substitute( - target_feature_shap)), "_", - variable_index) + name_col = paste0( + deparse(substitute( + target_feature_shap + )), "_", + variable_index + ) ) %>% pivot_wider( id_cols = c("meta_pin", "shap_total"), From 35faf5f6f19355570a70d5736d2e7facb9aeddcb Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Wed, 7 Aug 2024 15:54:35 +0000 Subject: [PATCH 13/16] Include handling for categorical --- analyses/Data_transformation.R | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/analyses/Data_transformation.R b/analyses/Data_transformation.R index 999e0713..68956dc8 100644 --- a/analyses/Data_transformation.R +++ b/analyses/Data_transformation.R @@ -1,5 +1,6 @@ target_feature_value <- params$added_feature target_feature_shap <- params$added_feature_shap +type <- params$type nbhd <- ccao::nbhd_shp # Create a individual card level dataset @@ -79,15 +80,27 @@ pin_individual <- assessment_pin_new %>% # Aggregate to neighborhood level pin_nbhd <- pin_individual %>% group_by(meta_nbhd_code) %>% - summarize( - !!paste0({{ target_feature_value }}, "_neighborhood_mean") := - mean(!!sym({{ target_feature_value }}), na.rm = TRUE), - !!paste0({{ target_feature_value }}, "_neighborhood_median") := - median(!!sym({{ target_feature_value }}), na.rm = TRUE), - !!paste0({{ target_feature_value }}, "_neighborhood_90th") := - quantile(!!sym({{ target_feature_value }}), 0.9, na.rm = TRUE) - ) %>% - ungroup() %>% + if (type == "continuous") { + summarize( + !!paste0({{ target_feature_value }}, "_neighborhood_mean") := + mean(!!sym({{ target_feature_value }}), na.rm = TRUE), + !!paste0({{ target_feature_value }}, "_neighborhood_median") := + median(!!sym({{ target_feature_value }}), na.rm = TRUE), + !!paste0({{ target_feature_value }}, "_neighborhood_90th") := + quantile(!!sym({{ target_feature_value }}), 0.9, na.rm = TRUE) + ) + } else { + summarize( + !!paste0({{ target_feature_value }}, "_most_common_value") := + names(sort(table(!!sym({{ target_feature_value }})), decreasing = TRUE)[1]), + !!paste0({{ target_feature_value }}, "_top5_common_values_percent") := { + freq <- sort(table(!!sym({{ target_feature_value }})), decreasing = TRUE) + top5 <- head(freq, 5) + sum(top5) / sum(freq) * 100 + } + ) + } +ungroup() %>% inner_join( nbhd, by = c("meta_nbhd_code" = "town_nbhd") From e2f26d2ec263b47603d9c1edf42094485d0dcb55 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Wed, 7 Aug 2024 17:09:38 +0000 Subject: [PATCH 14/16] Add logic for categorical --- analyses/Data_transformation.R | 57 ++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 27 deletions(-) diff --git a/analyses/Data_transformation.R b/analyses/Data_transformation.R index 68956dc8..073933c2 100644 --- a/analyses/Data_transformation.R +++ b/analyses/Data_transformation.R @@ -68,7 +68,7 @@ pin_individual <- assessment_pin_new %>% pred_pin_initial_fmv_comp = dollar(pred_pin_initial_fmv_comp) ) %>% inner_join( - assessment_data %>% + assessment_data_new %>% distinct(meta_pin, .keep_all = TRUE) %>% select( meta_pin, meta_nbhd_code, loc_longitude, @@ -78,34 +78,37 @@ pin_individual <- assessment_pin_new %>% ) # Aggregate to neighborhood level -pin_nbhd <- pin_individual %>% - group_by(meta_nbhd_code) %>% - if (type == "continuous") { - summarize( - !!paste0({{ target_feature_value }}, "_neighborhood_mean") := - mean(!!sym({{ target_feature_value }}), na.rm = TRUE), - !!paste0({{ target_feature_value }}, "_neighborhood_median") := - median(!!sym({{ target_feature_value }}), na.rm = TRUE), - !!paste0({{ target_feature_value }}, "_neighborhood_90th") := - quantile(!!sym({{ target_feature_value }}), 0.9, na.rm = TRUE) - ) - } else { +if (type == "continuous") { + pin_nbhd <- pin_individual %>% + group_by(meta_nbhd_code) %>% summarize( - !!paste0({{ target_feature_value }}, "_most_common_value") := - names(sort(table(!!sym({{ target_feature_value }})), decreasing = TRUE)[1]), - !!paste0({{ target_feature_value }}, "_top5_common_values_percent") := { - freq <- sort(table(!!sym({{ target_feature_value }})), decreasing = TRUE) - top5 <- head(freq, 5) - sum(top5) / sum(freq) * 100 - } + !!paste0(target_feature_value, "_neighborhood_mean") := + mean(!!sym(target_feature_value), na.rm = TRUE), + !!paste0(target_feature_value, "_neighborhood_median") := + median(!!sym(target_feature_value), na.rm = TRUE), + !!paste0(target_feature_value, "_neighborhood_90th") := + quantile(!!sym(target_feature_value), 0.9, na.rm = TRUE) ) - } -ungroup() %>% - inner_join( - nbhd, - by = c("meta_nbhd_code" = "town_nbhd") - ) %>% - st_as_sf() +} else { + { + pin_nbhd <- pin_individual %>% + group_by(meta_nbhd_code, !!sym({{ target_feature_value }})) %>% + count() %>% + ungroup() %>% + group_by(meta_nbhd_code) %>% + mutate(percentage = n / sum(n) * 100) %>% + select(meta_nbhd_code, !!sym({{ target_feature_value }}), percentage) %>% + arrange(meta_nbhd_code, desc(percentage)) %>% # Arrange to have the highest percentage first + mutate(rank = row_number()) %>% + pivot_wider( + names_from = rank, + values_from = c(!!sym({{ target_feature_value }}), percentage), + names_glue = "{.value}_{rank}" + ) + } %>% + inner_join(nbhd, by = c("meta_nbhd_code" = "town_nbhd")) %>% + st_as_sf() +} # Pivot wider for leaflet maps to allow multiple shap values leaflet_data <- card_individual %>% From 5e990ec10dc1715c50b6ec8792e105bc92ef3ad7 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Wed, 7 Aug 2024 17:28:20 +0000 Subject: [PATCH 15/16] Add spatial joins to if_else --- analyses/Data_transformation.R | 35 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/analyses/Data_transformation.R b/analyses/Data_transformation.R index 073933c2..d9836510 100644 --- a/analyses/Data_transformation.R +++ b/analyses/Data_transformation.R @@ -88,26 +88,25 @@ if (type == "continuous") { median(!!sym(target_feature_value), na.rm = TRUE), !!paste0(target_feature_value, "_neighborhood_90th") := quantile(!!sym(target_feature_value), 0.9, na.rm = TRUE) - ) -} else { - { - pin_nbhd <- pin_individual %>% - group_by(meta_nbhd_code, !!sym({{ target_feature_value }})) %>% - count() %>% - ungroup() %>% - group_by(meta_nbhd_code) %>% - mutate(percentage = n / sum(n) * 100) %>% - select(meta_nbhd_code, !!sym({{ target_feature_value }}), percentage) %>% - arrange(meta_nbhd_code, desc(percentage)) %>% # Arrange to have the highest percentage first - mutate(rank = row_number()) %>% - pivot_wider( - names_from = rank, - values_from = c(!!sym({{ target_feature_value }}), percentage), - names_glue = "{.value}_{rank}" - ) - } %>% + ) %>% inner_join(nbhd, by = c("meta_nbhd_code" = "town_nbhd")) %>% st_as_sf() +} else {{ pin_nbhd <- pin_individual %>% + group_by(meta_nbhd_code, !!sym({{ target_feature_value }})) %>% + count() %>% + ungroup() %>% + group_by(meta_nbhd_code) %>% + mutate(percentage = n / sum(n) * 100) %>% + select(meta_nbhd_code, !!sym({{ target_feature_value }}), percentage) %>% + arrange(meta_nbhd_code, desc(percentage)) %>% # Arrange to have the highest percentage first + mutate(rank = row_number()) %>% + pivot_wider( + names_from = rank, + values_from = c(!!sym({{ target_feature_value }}), percentage), + names_glue = "{.value}_{rank}" + ) %>% + inner_join(nbhd, by = c("meta_nbhd_code" = "town_nbhd")) %>% + st_as_sf() } } # Pivot wider for leaflet maps to allow multiple shap values From 76d37179bd8c492cb5fe01c703ecceebdc6f7f9d Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Thu, 8 Aug 2024 01:53:53 +0000 Subject: [PATCH 16/16] Correct syntax --- analyses/Data_transformation.R | 38 ++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/analyses/Data_transformation.R b/analyses/Data_transformation.R index d9836510..ece20ba4 100644 --- a/analyses/Data_transformation.R +++ b/analyses/Data_transformation.R @@ -91,22 +91,28 @@ if (type == "continuous") { ) %>% inner_join(nbhd, by = c("meta_nbhd_code" = "town_nbhd")) %>% st_as_sf() -} else {{ pin_nbhd <- pin_individual %>% - group_by(meta_nbhd_code, !!sym({{ target_feature_value }})) %>% - count() %>% - ungroup() %>% - group_by(meta_nbhd_code) %>% - mutate(percentage = n / sum(n) * 100) %>% - select(meta_nbhd_code, !!sym({{ target_feature_value }}), percentage) %>% - arrange(meta_nbhd_code, desc(percentage)) %>% # Arrange to have the highest percentage first - mutate(rank = row_number()) %>% - pivot_wider( - names_from = rank, - values_from = c(!!sym({{ target_feature_value }}), percentage), - names_glue = "{.value}_{rank}" - ) %>% - inner_join(nbhd, by = c("meta_nbhd_code" = "town_nbhd")) %>% - st_as_sf() } +} else { + pin_nbhd <- pin_individual %>% + group_by(meta_nbhd_code, !!sym(target_feature_value)) %>% + count() %>% + ungroup() %>% + group_by(meta_nbhd_code) %>% + mutate( + percentage = n / sum(n) * 100 + ) %>% + arrange(meta_nbhd_code, desc(n)) %>% + mutate( + plurality_factor = first(!!sym(target_feature_value)) + ) %>% + ungroup() %>% + select(meta_nbhd_code, !!sym(target_feature_value), percentage, plurality_factor) %>% + pivot_wider( + names_from = !!sym(target_feature_value), + values_from = percentage, + names_prefix = "percentage_" + ) %>% + inner_join(nbhd, by = c("meta_nbhd_code" = "town_nbhd")) %>% + st_as_sf() } # Pivot wider for leaflet maps to allow multiple shap values