Skip to content

Commit

Permalink
Revert join
Browse files Browse the repository at this point in the history
  • Loading branch information
Damonamajor committed Jul 18, 2024
1 parent 5fd6b0e commit 6a7ba1e
Showing 1 changed file with 139 additions and 135 deletions.
274 changes: 139 additions & 135 deletions analyses/new-feature-template.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -63,142 +63,142 @@ paths <- model_file_dict(model_params$run_id, model_params$year)
```

```{r download_new_data}
# S3 locations of the model outputs for the run under evaluation
# (`params$run_id`, `params$run_id_year`). URIs ending in "/" are prefixes that
# are listed and read file by file; the others name a single Parquet object.
# `key` is the name the output is stored under in the list returned by
# `fetch_analyses()`.
analyses_paths <- list(
  output = list(
    list(
      s3 = paste0(
        "s3://ccao-model-results-us-east-1/performance/year=",
        params$run_id_year,
        "/stage=assessment/",
        params$run_id,
        ".parquet"
      ),
      key = "performance"
    ),
    list(
      s3 = paste0(
        "s3://ccao-model-results-us-east-1/metadata/year=",
        params$run_id_year,
        "/",
        params$run_id,
        ".parquet"
      ),
      key = "metadata"
    ),
    list(
      s3 = paste0(
        "s3://ccao-model-results-us-east-1/shap/year=",
        params$run_id_year,
        "/run_id=",
        params$run_id,
        "/"
      ),
      key = "shap"
    ),
    list(
      s3 = paste0(
        "s3://ccao-model-results-us-east-1/assessment_card/year=",
        params$run_id_year,
        "/run_id=",
        params$run_id,
        "/"
      ),
      key = "assessment_card"
    ),
    list(
      s3 = paste0(
        "s3://ccao-model-results-us-east-1/assessment_pin/year=",
        params$run_id_year,
        "/run_id=",
        params$run_id,
        "/"
      ),
      key = "assessment_pin"
    )
  )
)
# Download the model outputs described by `analyses_paths` for a single run.
#
# Arguments:
#   run_id         Run identifier. Used in the timer label and to build or
#                  recognise the `run_id=<run_id>/` partition of a prefix.
#   year           Year partition value. Only used for directory-style paths
#                  that do not already end in `run_id=<run_id>/`.
#   analyses_paths List with an `output` element: a list of entries, each with
#                  an `s3` URI and a `key` naming the result.
#
# Returns a named list with one data frame per entry that was found, named by
# the entry's `key`. Entries with no objects on S3 are skipped with a warning.
fetch_analyses <- function(run_id, year, analyses_paths) {
  tictoc::tic(paste0("Fetched run: ", run_id))
  # Stop the timer on every exit path so a failed download does not leave a
  # dangling entry on the tictoc stack
  on.exit(tictoc::toc(), add = TRUE)

  # The bucket is taken from the first S3 URI and reused for every entry, so
  # all entries are expected to live in the same bucket
  s3_objs <- grep("s3://", unlist(analyses_paths$output), value = TRUE)
  bucket <- strsplit(s3_objs[1], "/")[[1]][3]

  # Save one S3 object into the session temp dir and read it back as Parquet
  read_s3_parquet <- function(object) {
    local_temp_path <- file.path(tempdir(), basename(object))
    aws.s3::save_object(object, bucket = bucket, file = local_temp_path)
    arrow::read_parquet(local_temp_path)
  }

  data_list <- list()

  for (analyses_path in analyses_paths$output) {
    # A trailing slash marks a prefix holding several files rather than a
    # single object
    is_directory <- endsWith(analyses_path$s3, "/")

    if (is_directory) {
      partitioned_by_run <- endsWith(
        analyses_path$s3,
        paste0("run_id=", run_id, "/")
      )
      dir_path <- if (partitioned_by_run) {
        analyses_path$s3
      } else {
        paste0(analyses_path$s3, "year=", year, "/run_id=", run_id, "/")
      }

      message("Now fetching: ", dir_path)
      # Strip the literal "s3://<bucket>/" head to get the key prefix; fixed
      # matching keeps characters such as "." in the bucket name from being
      # interpreted as regex syntax
      objs_prefix <- sub(
        paste0("s3://", bucket, "/"),
        "",
        dir_path,
        fixed = TRUE
      )
      objs <- aws.s3::get_bucket_df(bucket, objs_prefix)
      # Drop zero-byte listing entries
      objs <- dplyr::filter(objs, Size > 0)

      if (nrow(objs) > 0) {
        # Read every part first and bind once, instead of growing the data
        # frame inside the loop (which re-copies all earlier rows each time)
        parts <- lapply(objs$Key, function(key) {
          message("Now fetching: ", key)
          read_s3_parquet(key)
        })
        data_list[[analyses_path$key]] <- dplyr::bind_rows(parts)
      } else {
        warning(analyses_path$key, " does not exist for this run")
      }
    } else {
      message("Now fetching: ", analyses_path$s3)
      if (aws.s3::object_exists(analyses_path$s3, bucket = bucket)) {
        data_list[[analyses_path$key]] <- read_s3_parquet(analyses_path$s3)
      } else {
        warning(analyses_path$key, " does not exist for this run")
      }
    }
  }

  data_list
}
# Fetch every output for the run under evaluation and unpack the pieces
data <- fetch_analyses(params$run_id, params$run_id_year, analyses_paths)

performance <- data[["performance"]]
metadata <- data[["metadata"]]
shap <- data[["shap"]]

# Restrict the card-level output to identifiers, location, the initial card
# prediction and the feature named by `params$added_feature`
assessment_card <- data[["assessment_card"]] %>%
  select(
    meta_pin,
    loc_longitude,
    loc_latitude,
    meta_card_num,
    pred_card_initial_fmv,
    meta_nbhd_code,
    meta_township_code,
    !!sym(params$added_feature)
  )

assessment_pin <- data[["assessment_pin"]]

# The run metadata carries the md5 of the assessment data. The object path is
# built as files/md5/<first two hash characters>/<remaining characters>
lockfile_assessment <- metadata$dvc_md5_assessment_data
s3_path_assessment <- paste0(
  "s3://ccao-data-dvc-us-east-1/files/md5/",
  substr(lockfile_assessment, 1, 2),
  "/",
  substr(lockfile_assessment, 3, nchar(lockfile_assessment))
)

assessment_data <- s3read_using(
  FUN = read_parquet,
  object = s3_path_assessment
)
# analyses_paths <- list(
# output = list(
# list(
# s3 = paste0("s3://ccao-model-results-us-east-1/performance/year=", params$run_id_year, "/stage=assessment/", params$run_id, ".parquet"),
# key = "performance"
# ),
# list(
# s3 = paste0("s3://ccao-model-results-us-east-1/metadata/year=", params$run_id_year, "/", params$run_id, ".parquet"),
# key = "metadata"
# ),
# list(
# s3 = paste0("s3://ccao-model-results-us-east-1/shap/year=", params$run_id_year, "/run_id=", params$run_id, "/"),
# key = "shap"
# ),
# list(
# s3 = paste0("s3://ccao-model-results-us-east-1/assessment_card/year=", params$run_id_year, "/run_id=", params$run_id, "/"),
# key = "assessment_card"
# ),
# list(
# s3 = paste0("s3://ccao-model-results-us-east-1/assessment_pin/year=", params$run_id_year, "/run_id=", params$run_id, "/"),
# key = "assessment_pin"
# )
# )
# )
#
# fetch_analyses <- function(run_id, year, analyses_paths) {
# tictoc::tic(paste0("Fetched run: ", run_id))
#
# s3_objs <- grep("s3://", unlist(analyses_paths$output), value = TRUE)
# bucket <- strsplit(s3_objs[1], "/")[[1]][3]
#
# data_list <- list()
#
# for (analyses_path in analyses_paths$output) {
# is_directory <- endsWith(analyses_path$s3, "/")
# if (is_directory) {
# partitioned_by_run <- endsWith(analyses_path$s3, paste0("run_id=", run_id, "/"))
# if (partitioned_by_run) {
# dir_path <- analyses_path$s3
# } else {
# dir_path <- paste0(analyses_path$s3, "year=", year, "/run_id=", run_id, "/")
# }
#
# message("Now fetching: ", dir_path)
# objs_prefix <- sub(paste0("s3://", bucket, "/"), "", dir_path)
# objs <- aws.s3::get_bucket_df(bucket, objs_prefix)
# objs <- dplyr::filter(objs, Size > 0)
#
# if (nrow(objs) > 0) {
# combined_data <- NULL
# for (key in objs$Key) {
# message("Now fetching: ", key)
# local_temp_path <- file.path(tempdir(), basename(key))
# aws.s3::save_object(key, bucket = bucket, file = local_temp_path)
#
# # Read the Parquet file and append it to combined_data
# temp_data <- arrow::read_parquet(local_temp_path)
# if (is.null(combined_data)) {
# combined_data <- temp_data
# } else {
# combined_data <- dplyr::bind_rows(combined_data, temp_data)
# }
# }
# data_list[[analyses_path$key]] <- combined_data
# } else {
# warning(analyses_path$key, " does not exist for this run")
# }
# } else {
# message("Now fetching: ", analyses_path$s3)
# if (aws.s3::object_exists(analyses_path$s3, bucket = bucket)) {
# local_temp_path <- file.path(tempdir(), basename(analyses_path$s3))
# aws.s3::save_object(analyses_path$s3, bucket = bucket, file = local_temp_path)
# data_list[[analyses_path$key]] <- arrow::read_parquet(local_temp_path)
# } else {
# warning(analyses_path$key, " does not exist for this run")
# }
# }
# }
#
# tictoc::toc()
# return(data_list)
# }
#
# data <- fetch_analyses(params$run_id, params$run_id_year, analyses_paths)
#
# performance <- data$performance
# metadata <- data$metadata
# shap <- data$shap
# assessment_card <- data$assessment_card %>%
# select(meta_pin, meta_card_num, pred_card_initial_fmv, meta_township_code, !!sym(params$added_feature))
#
# assessment_pin <- data$assessment_pin
#
# lockfile_assessment <- metadata$dvc_md5_assessment_data
#
# # Define S3 paths for assessment and training data
# s3_path_assessment <- paste0(
# "s3://ccao-data-dvc-us-east-1/files/md5/",
# substr(lockfile_assessment, 1, 2), "/",
# substr(lockfile_assessment, 3, nchar(lockfile_assessment))
# )
#
#
# assessment_data <- s3read_using(FUN = read_parquet, object = s3_path_assessment)
```

```{r download_comparison_data}
# S3 locations of the outputs for the comparison run
# (`params$comparison_run_id`, `params$comparison_run_id_year`). This
# overwrites the `analyses_paths` defined for the run under evaluation.
analyses_paths <- list(
  output = list(
    list(
      s3 = paste0(
        "s3://ccao-model-results-us-east-1/performance/year=",
        params$comparison_run_id_year,
        "/stage=assessment/",
        params$comparison_run_id,
        ".parquet"
      ),
      key = "performance"
    ),
    list(
      s3 = paste0(
        "s3://ccao-model-results-us-east-1/assessment_pin/year=",
        params$comparison_run_id_year,
        "/run_id=",
        params$comparison_run_id,
        "/"
      ),
      key = "assessment_pin"
    ),
    list(
      s3 = paste0(
        "s3://ccao-model-results-us-east-1/shap/year=",
        params$comparison_run_id_year,
        "/run_id=",
        params$comparison_run_id,
        "/"
      ),
      key = "shap"
    ),
    list(
      s3 = paste0(
        "s3://ccao-model-results-us-east-1/metadata/year=",
        params$comparison_run_id_year,
        "/",
        params$comparison_run_id,
        ".parquet"
      ),
      key = "metadata"
    )
  )
)

# Fetch the comparison run and unpack its pieces under `*_comparison` names
data_comparison <- fetch_analyses(
  params$comparison_run_id,
  params$comparison_run_id_year,
  analyses_paths
)

metadata_comparison <- data_comparison[["metadata"]]
model_performance_assessment_comparison <- data_comparison[["performance"]]
shap_comparison <- data_comparison[["shap"]]

# Restrict the PIN-level output to identifiers, geography codes, the sale
# price and the initial/final predictions
assessment_pin_comparison <- data_comparison[["assessment_pin"]] %>%
  select(
    meta_pin,
    pred_pin_final_fmv,
    sale_ratio_study_price,
    meta_nbhd_code,
    meta_triad_code,
    pred_pin_initial_fmv,
    meta_township_code
  )
# analyses_paths <- list(
# output = list(
# list(
# s3 = paste0("s3://ccao-model-results-us-east-1/performance/year=", params$comparison_run_id_year, "/stage=assessment/", params$comparison_run_id, ".parquet"),
# key = "performance"
# ),
# list(
# s3 = paste0("s3://ccao-model-results-us-east-1/assessment_pin/year=", params$comparison_run_id_year, "/run_id=", params$comparison_run_id, "/"),
# key = "assessment_pin"
# ),
# list(
# s3 = paste0("s3://ccao-model-results-us-east-1/shap/year=", params$comparison_run_id_year, "/run_id=", params$comparison_run_id, "/"),
# key = "shap"
# ),
# list(
# s3 = paste0("s3://ccao-model-results-us-east-1/metadata/year=", params$comparison_run_id_year, "/", params$comparison_run_id, ".parquet"),
# key = "metadata"
# )
# )
# )
#
# data_comparison <- fetch_analyses(params$comparison_run_id, params$comparison_run_id_year, analyses_paths)
#
# metadata_comparison <- data_comparison$metadata
# model_performance_assessment_comparison <- data_comparison$performance
# shap_comparison <- data_comparison$shap
#
# assessment_pin_comparison <- data_comparison$assessment_pin %>%
# select(meta_pin, pred_pin_final_fmv, sale_ratio_study_price, meta_nbhd_code, meta_triad_code, pred_pin_initial_fmv, meta_township_code)
```

```{r}
Expand Down Expand Up @@ -238,8 +238,9 @@ assessment_data_small <- assessment_data %>%
working_data_card <- shap %>%
select(meta_pin, meta_card_num, pred_card_shap_baseline_fmv, !!sym(params$added_feature)) %>%
rename(!!params$added_feature_shap := !!sym(params$added_feature)) %>%
inner_join(assessment_data_small, by = c("meta_pin", "meta_card_num")) %>%
inner_join(assessment_card, by = c("meta_pin", "meta_card_num")) %>%
rename(added_feature_card = !!sym(params$added_feature)) %>%
inner_join(assessment_data_small, by = c("meta_pin", "meta_card_num")) %>%
group_by(meta_nbhd_code) %>%
mutate(
!!paste0(params$added_feature, "_shap_neighborhood_mean") := mean(abs(!!sym(params$added_feature_shap)), na.rm = TRUE),
Expand All @@ -262,7 +263,7 @@ working_data_card <- shap %>%
pred_pin_initial_fmv_comp = pred_pin_initial_fmv
) %>%
mutate(
shap_relative = percent((!!sym(params$added_feature_shap) / pred_pin_initial_fmv_new), accuracy = 0.01),
shap_relative = percent((!!sym(params$added_feature_shap) / pred_card_initial_fmv), accuracy = 0.01),
diff_pred_pin_final_fmv = round(pred_pin_final_fmv_new - pred_pin_final_fmv_comp, 2),
pred_pin_final_fmv_new = scales::dollar(pred_pin_final_fmv_new),
pred_pin_final_fmv_comparison = scales::dollar(pred_pin_final_fmv_comp),
Expand Down Expand Up @@ -771,6 +772,9 @@ The primary metric that the CCAO Data team uses to assess the importance of a fe

## Absolute Value Rank of SHAP Scores

```{r}
# Flatten the `model_predictor_all_name` field of the run metadata into a
# plain vector; its length is quoted in the paragraph below as the number of
# features in the model.
# NOTE(review): presumably a list-column of predictor names - confirm.
shap_predictors <- unlist(metadata$model_predictor_all_name)
```

The following table reports the median absolute SHAP value by township, presented as a grouped table. In total, there are `r length(shap_predictors)` predictors in the model. Thus, a median absolute SHAP ranked 1 means the feature is the most important one in that township, while a rank of `r length(shap_predictors)` means it is the least important. The median of the raw (signed) SHAP values is also included to better contextualize the impact.

Expand Down

0 comments on commit 6a7ba1e

Please sign in to comment.