From 6a7ba1e70a57a7bd0094dc8fd8f5a41e3aaec174 Mon Sep 17 00:00:00 2001 From: Damonamajor Date: Thu, 18 Jul 2024 21:42:11 +0000 Subject: [PATCH] Revert join --- analyses/new-feature-template.qmd | 274 +++++++++++++++--------------- 1 file changed, 139 insertions(+), 135 deletions(-) diff --git a/analyses/new-feature-template.qmd b/analyses/new-feature-template.qmd index 64ba42d8..b869634c 100644 --- a/analyses/new-feature-template.qmd +++ b/analyses/new-feature-template.qmd @@ -63,142 +63,142 @@ paths <- model_file_dict(model_params$run_id, model_params$year) ``` ```{r download_new_data} -analyses_paths <- list( - output = list( - list( - s3 = paste0("s3://ccao-model-results-us-east-1/performance/year=", params$run_id_year, "/stage=assessment/", params$run_id, ".parquet"), - key = "performance" - ), - list( - s3 = paste0("s3://ccao-model-results-us-east-1/metadata/year=", params$run_id_year, "/", params$run_id, ".parquet"), - key = "metadata" - ), - list( - s3 = paste0("s3://ccao-model-results-us-east-1/shap/year=", params$run_id_year, "/run_id=", params$run_id, "/"), - key = "shap" - ), - list( - s3 = paste0("s3://ccao-model-results-us-east-1/assessment_card/year=", params$run_id_year, "/run_id=", params$run_id, "/"), - key = "assessment_card" - ), - list( - s3 = paste0("s3://ccao-model-results-us-east-1/assessment_pin/year=", params$run_id_year, "/run_id=", params$run_id, "/"), - key = "assessment_pin" - ) - ) -) - -fetch_analyses <- function(run_id, year, analyses_paths) { - tictoc::tic(paste0("Fetched run: ", run_id)) - - s3_objs <- grep("s3://", unlist(analyses_paths$output), value = TRUE) - bucket <- strsplit(s3_objs[1], "/")[[1]][3] - - data_list <- list() - - for (analyses_path in analyses_paths$output) { - is_directory <- endsWith(analyses_path$s3, "/") - if (is_directory) { - partitioned_by_run <- endsWith(analyses_path$s3, paste0("run_id=", run_id, "/")) - if (partitioned_by_run) { - dir_path <- analyses_path$s3 - } else { - dir_path <- paste0(analyses_path$s3, "year=", year, "/run_id=", run_id, "/") - } - - message("Now fetching: ", dir_path) - objs_prefix <- sub(paste0("s3://", bucket, "/"), "", dir_path) - objs <- aws.s3::get_bucket_df(bucket, objs_prefix) - objs <- dplyr::filter(objs, Size > 0) - - if (nrow(objs) > 0) { - combined_data <- NULL - for (key in objs$Key) { - message("Now fetching: ", key) - local_temp_path <- file.path(tempdir(), basename(key)) - aws.s3::save_object(key, bucket = bucket, file = local_temp_path) - - # Read the Parquet file and append it to combined_data - temp_data <- arrow::read_parquet(local_temp_path) - if (is.null(combined_data)) { - combined_data <- temp_data - } else { - combined_data <- dplyr::bind_rows(combined_data, temp_data) - } - } - data_list[[analyses_path$key]] <- combined_data - } else { - warning(analyses_path$key, " does not exist for this run") - } - } else { - message("Now fetching: ", analyses_path$s3) - if (aws.s3::object_exists(analyses_path$s3, bucket = bucket)) { - local_temp_path <- file.path(tempdir(), basename(analyses_path$s3)) - aws.s3::save_object(analyses_path$s3, bucket = bucket, file = local_temp_path) - data_list[[analyses_path$key]] <- arrow::read_parquet(local_temp_path) - } else { - warning(analyses_path$key, " does not exist for this run") - } - } - } - - tictoc::toc() - return(data_list) -} - -data <- fetch_analyses(params$run_id, params$run_id_year, analyses_paths) - -performance <- data$performance -metadata <- data$metadata -shap <- data$shap -assessment_card <- data$assessment_card %>% - select(meta_pin, loc_longitude, loc_latitude, meta_card_num, pred_card_initial_fmv, meta_nbhd_code, meta_township_code, !!sym(params$added_feature)) - -assessment_pin <- data$assessment_pin - -lockfile_assessment <- metadata$dvc_md5_assessment_data - -# Define S3 paths for assessment and training data -s3_path_assessment <- paste0( - "s3://ccao-data-dvc-us-east-1/files/md5/", - substr(lockfile_assessment, 1, 2), "/", - substr(lockfile_assessment, 3, nchar(lockfile_assessment)) -) - - -assessment_data <- s3read_using(FUN = read_parquet, object = s3_path_assessment) +# analyses_paths <- list( +# output = list( +# list( +# s3 = paste0("s3://ccao-model-results-us-east-1/performance/year=", params$run_id_year, "/stage=assessment/", params$run_id, ".parquet"), +# key = "performance" +# ), +# list( +# s3 = paste0("s3://ccao-model-results-us-east-1/metadata/year=", params$run_id_year, "/", params$run_id, ".parquet"), +# key = "metadata" +# ), +# list( +# s3 = paste0("s3://ccao-model-results-us-east-1/shap/year=", params$run_id_year, "/run_id=", params$run_id, "/"), +# key = "shap" +# ), +# list( +# s3 = paste0("s3://ccao-model-results-us-east-1/assessment_card/year=", params$run_id_year, "/run_id=", params$run_id, "/"), +# key = "assessment_card" +# ), +# list( +# s3 = paste0("s3://ccao-model-results-us-east-1/assessment_pin/year=", params$run_id_year, "/run_id=", params$run_id, "/"), +# key = "assessment_pin" +# ) +# ) +# ) +# +# fetch_analyses <- function(run_id, year, analyses_paths) { +# tictoc::tic(paste0("Fetched run: ", run_id)) +# +# s3_objs <- grep("s3://", unlist(analyses_paths$output), value = TRUE) +# bucket <- strsplit(s3_objs[1], "/")[[1]][3] +# +# data_list <- list() +# +# for (analyses_path in analyses_paths$output) { +# is_directory <- endsWith(analyses_path$s3, "/") +# if (is_directory) { +# partitioned_by_run <- endsWith(analyses_path$s3, paste0("run_id=", run_id, "/")) +# if (partitioned_by_run) { +# dir_path <- analyses_path$s3 +# } else { +# dir_path <- paste0(analyses_path$s3, "year=", year, "/run_id=", run_id, "/") +# } +# +# message("Now fetching: ", dir_path) +# objs_prefix <- sub(paste0("s3://", bucket, "/"), "", dir_path) +# objs <- aws.s3::get_bucket_df(bucket, objs_prefix) +# objs <- dplyr::filter(objs, Size > 0) +# +# if (nrow(objs) > 0) { +# combined_data <- NULL +# for (key in objs$Key) { +# message("Now fetching: ", key) +# local_temp_path <- file.path(tempdir(), basename(key)) +# aws.s3::save_object(key, bucket = bucket, file = local_temp_path) +# +# # Read the Parquet file and append it to combined_data +# temp_data <- arrow::read_parquet(local_temp_path) +# if (is.null(combined_data)) { +# combined_data <- temp_data +# } else { +# combined_data <- dplyr::bind_rows(combined_data, temp_data) +# } +# } +# data_list[[analyses_path$key]] <- combined_data +# } else { +# warning(analyses_path$key, " does not exist for this run") +# } +# } else { +# message("Now fetching: ", analyses_path$s3) +# if (aws.s3::object_exists(analyses_path$s3, bucket = bucket)) { +# local_temp_path <- file.path(tempdir(), basename(analyses_path$s3)) +# aws.s3::save_object(analyses_path$s3, bucket = bucket, file = local_temp_path) +# data_list[[analyses_path$key]] <- arrow::read_parquet(local_temp_path) +# } else { +# warning(analyses_path$key, " does not exist for this run") +# } +# } +# } +# +# tictoc::toc() +# return(data_list) +# } +# +# data <- fetch_analyses(params$run_id, params$run_id_year, analyses_paths) +# +# performance <- data$performance +# metadata <- data$metadata +# shap <- data$shap +# assessment_card <- data$assessment_card %>% +# select(meta_pin, meta_card_num, pred_card_initial_fmv, meta_township_code, !!sym(params$added_feature)) +# +# assessment_pin <- data$assessment_pin +# +# lockfile_assessment <- metadata$dvc_md5_assessment_data +# +# # Define S3 paths for assessment and training data +# s3_path_assessment <- paste0( +# "s3://ccao-data-dvc-us-east-1/files/md5/", +# substr(lockfile_assessment, 1, 2), "/", +# substr(lockfile_assessment, 3, nchar(lockfile_assessment)) +# ) +# +# +# assessment_data <- s3read_using(FUN = read_parquet, object = s3_path_assessment) ``` ```{r download_comparison_data} -analyses_paths <- list( - output = list( - list( - s3 = paste0("s3://ccao-model-results-us-east-1/performance/year=", params$comparison_run_id_year, "/stage=assessment/", params$comparison_run_id, ".parquet"), - key = "performance" - ), - list( - s3 = paste0("s3://ccao-model-results-us-east-1/assessment_pin/year=", params$comparison_run_id_year, "/run_id=", params$comparison_run_id, "/"), - key = "assessment_pin" - ), - list( - s3 = paste0("s3://ccao-model-results-us-east-1/shap/year=", params$comparison_run_id_year, "/run_id=", params$comparison_run_id, "/"), - key = "shap" - ), - list( - s3 = paste0("s3://ccao-model-results-us-east-1/metadata/year=", params$comparison_run_id_year, "/", params$comparison_run_id, ".parquet"), - key = "metadata" - ) - ) -) - -data_comparison <- fetch_analyses(params$comparison_run_id, params$comparison_run_id_year, analyses_paths) - -metadata_comparison <- data_comparison$metadata -model_performance_assessment_comparison <- data_comparison$performance -shap_comparison <- data_comparison$shap - -assessment_pin_comparison <- data_comparison$assessment_pin %>% - select(meta_pin, pred_pin_final_fmv, sale_ratio_study_price, meta_nbhd_code, meta_triad_code, pred_pin_initial_fmv, meta_township_code) +# analyses_paths <- list( +# output = list( +# list( +# s3 = paste0("s3://ccao-model-results-us-east-1/performance/year=", params$comparison_run_id_year, "/stage=assessment/", params$comparison_run_id, ".parquet"), +# key = "performance" +# ), +# list( +# s3 = paste0("s3://ccao-model-results-us-east-1/assessment_pin/year=", params$comparison_run_id_year, "/run_id=", params$comparison_run_id, "/"), +# key = "assessment_pin" +# ), +# list( +# s3 = paste0("s3://ccao-model-results-us-east-1/shap/year=", params$comparison_run_id_year, "/run_id=", params$comparison_run_id, "/"), +# key = "shap" +# ), +# list( +# s3 = paste0("s3://ccao-model-results-us-east-1/metadata/year=", params$comparison_run_id_year, "/", params$comparison_run_id, ".parquet"), +# key = "metadata" +# ) +# ) +# ) +# +# data_comparison <- fetch_analyses(params$comparison_run_id, params$comparison_run_id_year, analyses_paths) +# +# metadata_comparison <- data_comparison$metadata +# model_performance_assessment_comparison <- data_comparison$performance +# shap_comparison <- data_comparison$shap +# +# assessment_pin_comparison <- data_comparison$assessment_pin %>% +# select(meta_pin, pred_pin_final_fmv, sale_ratio_study_price, meta_nbhd_code, meta_triad_code, pred_pin_initial_fmv, meta_township_code) ``` ```{r} @@ -238,8 +238,9 @@ assessment_data_small <- assessment_data %>% working_data_card <- shap %>% select(meta_pin, meta_card_num, pred_card_shap_baseline_fmv, !!sym(params$added_feature)) %>% rename(!!params$added_feature_shap := !!sym(params$added_feature)) %>% - inner_join(assessment_data_small, by = c("meta_pin", "meta_card_num")) %>% inner_join(assessment_card, by = c("meta_pin", "meta_card_num")) %>% + rename(added_feature_card = !!sym(params$added_feature)) %>% + inner_join(assessment_data_small, by = c("meta_pin", "meta_card_num")) %>% group_by(meta_nbhd_code) %>% mutate( !!paste0(params$added_feature, "_shap_neighborhood_mean") := mean(abs(!!sym(params$added_feature_shap)), na.rm = TRUE), @@ -262,7 +263,7 @@ working_data_card <- shap %>% pred_pin_initial_fmv_comp = pred_pin_initial_fmv ) %>% mutate( - shap_relative = percent((!!sym(params$added_feature_shap) / pred_pin_initial_fmv_new), accuracy = 0.01), + shap_relative = percent((!!sym(params$added_feature_shap) / pred_card_initial_fmv), accuracy = 0.01), diff_pred_pin_final_fmv = round(pred_pin_final_fmv_new - pred_pin_final_fmv_comp, 2), pred_pin_final_fmv_new = scales::dollar(pred_pin_final_fmv_new), pred_pin_final_fmv_comparison = scales::dollar(pred_pin_final_fmv_comp), @@ -771,6 +772,9 @@ The primary metric that the CCAO Data team uses to assess the importance of a fe ## Absolute Value Rank of SHAP Scores +```{r} +shap_predictors <- unlist(metadata$model_predictor_all_name) +``` The following table produces the median absolute SHAP value by township, and creates a grouped table. In total, there are `r length(shap_predictors)` indicators in the model. Thus, if the median SHAP is ranked 1, it is the most important feature in a township, while if it is ranked `r length(shap_predictors)`, it is the least important feature in a township. The median value (without absolute) is also included to better contextualize the impact.