-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Create a feature selection evaluation template (#250)
* Push of old docs * Draft * Chugging * Keep pushing * Include temp files * Remove old file * Remove old file * Move things around * Work on tabsets * Continued improvements * Finalizing version * lintr * Push old analyses * Move to correct folder * Finalize * Typo * Leaflett function * Download directly rather than save * Cosmetic changes * Quick text changes * Fix shap download * Fix join * lintr * Update model run and FMV tests * Add quarto.yml * Changes before stashing * Changes before stashing * push dvclock * Add some more charts * Add categorical charts * Improve naming of columns * lintr * Add simple fmv change * lintr * Add correlation plot, Refactor to _pin / _card * Add shap relationship plots * Revert to old join * Revert join * unhash * Add location fix * Quick edits * lintr * Improve borders * Add low order scheme * Include neighborhood level pluralities * Add temp pin join * Add some text * Remove hashes * Renaming stuff * Hashing to switch brnches * gitignore * Create data processing file * Add data processing * rename file * Create analyses paths * add comps * Violin plot * Add unique components to code_to_review * Better defining of keys * Add append ingest scripts, Add link to source * Add ingest script to external * Push all files * Update download files * lintr * Remove old files * Remove old files * Remove old files * Stage leaflet changes * Add neighborhood maps * Add SHAP plots * Add analysis helper file * Add ingest script * Add ratio stats * Add data transformations * Add correlations * Add summary stats * Integrate into single doc * Add percentage option to leaflet maps * lintr * Apply suggestions from code review Co-authored-by: Dan Snow <31494343+dfsnow@users.noreply.github.com> * Remove analysis * Add png to gitignore * Move readme * Revert dvc lock * lintr * Minor editing * Correct # values * Add categorical variables * lint * lintr * remove extra header * Simplify ingest * Add quarto documentation * Rename helpers * Revert dvc * Add 
temp readme.md * Change text * Remove text * Remove old readme --------- Co-authored-by: Dan Snow <31494343+dfsnow@users.noreply.github.com>
- Loading branch information
1 parent
dd0dcc3
commit ce04cd2
Showing
7 changed files
with
1,382 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
# This identifies the root directory for quarto |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
The new-feature-template is a feature selection tool that shows whether a new feature adds value to the model. | ||
|
||
To complete this report, execute the following steps: | ||
|
||
Workflow: | ||
|
||
1\. Identify whether there is a model run that uses all features except the new feature you plan to add. | ||
|
||
2\. If there is a comparable run, skip to step 5. | ||
|
||
3\. Update the params with the added variable for the new feature. | ||
|
||
4\. Run `dvc unfreeze ingest` in the terminal. | ||
|
||
5\. Run the ingest stage with `dvc repro -f ingest` in the terminal. | ||
|
||
6\. Run the model through GitHub Actions with SHAP values, upload to S3, and cross-validation enabled. | ||
|
||
7\. Create a new folder with the new feature and an ascending numeric value. | ||
|
||
8\. Update the params of new-feature-template with the correct run_ids for the comparison run and the new run. | ||
|
||
9\. Run the report and review the results. | ||
|
||
10\. Write a summary of the results in the README.md file of the new feature folder. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
--- | ||
title: "aggregated_maps_categorical" | ||
format: html | ||
--- | ||
|
||
```{r} | ||
# Collect the category suffix of every `percentage_<suffix>` column in
# `pin_nbhd`. The selection is case-sensitive and only the leading
# "percentage_" is stripped, so that paste0("percentage_", suffix) always
# reproduces the original column name.
suffixes <- pin_nbhd %>%
  as_tibble() %>%
  select(starts_with("percentage_", ignore.case = FALSE)) %>%
  names() %>%
  sub("^percentage_", "", .)
``` | ||
|
||
```{r} | ||
# Build one map per category suffix, filling each polygon of `pin_nbhd` by
# the value of its `percentage_<suffix>` column.
maps <- lapply(suffixes, function(suffix) {
  fill_column <- paste0("percentage_", suffix)
  legend_title <- paste0("Value: ", suffix)
  plot_title <- paste0("Map for ", suffix)

  ggplot(pin_nbhd) +
    geom_sf(aes(fill = !!sym(fill_column))) +
    scale_fill_viridis_c(option = "viridis", name = legend_title) +
    theme_void() +
    coord_sf(xlim = c(-88.4, -87.52398), ylim = c(41.5, 42.2)) +
    ggtitle(plot_title)
})
``` | ||
|
||
## Categorical Maps | ||
|
||
```{r, results = 'asis'} | ||
# Emit a markdown heading followed by the plot for each map. The chunk runs
# with results = 'asis', so the cat() output is rendered as markdown.
for (map_index in seq_along(maps)) {
  current_suffix <- suffixes[map_index]
  cat("### Map for ", current_suffix, "\n\n")
  print(maps[[map_index]])
  cat("\n\n")
}
``` | ||
|
||
## Plurality in Each Neighborhood | ||
|
||
```{r} | ||
# Map of `pin_nbhd` with each polygon filled by its `plurality_factor` value,
# treated as a discrete category.
ggplot(pin_nbhd) +
  geom_sf(aes(fill = as.factor(plurality_factor))) +
  scale_fill_viridis_d(option = "viridis") +
  labs(fill = "Variable") +
  theme_void() +
  coord_sf(xlim = c(-88.4, -87.52398), ylim = c(41.5, 42.2)) +
  ggtitle("Plurality Variable")
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
--- | ||
title: "add_descriptive_stats_categorical" | ||
format: html | ||
--- | ||
|
||
```{r} | ||
# Build an interactive table of category shares for a categorical feature.
#
# Arguments:
#   data        Data frame to summarize.
#   group_var   Name (string) of a grouping column, or NULL to treat all of
#               `data` as a single group.
#   feature_var Name (string) of the categorical column to tabulate.
#
# Returns a DT datatable with one row per group, one column per category
# holding that category's share of the group as a percent string, and a
# `mode` column holding the group's most frequent category.
create_category_percentages <- function(data, group_var, feature_var) {
  has_group <- !is.null(group_var)
  outer_groups <- if (has_group) list(sym(group_var)) else list()
  count_groups <- c(outer_groups, list(sym(feature_var)))

  # Most frequent value of x; ties go to the value that appears first
  mode_function <- function(x) {
    ux <- unique(x)
    ux[which.max(tabulate(match(x, ux)))]
  }

  category_percentages <- data %>%
    group_by(!!!count_groups) %>%
    count() %>%
    group_by(!!!outer_groups) %>%
    mutate(percentage = scales::percent(n / sum(n), accuracy = 0.01)) %>%
    ungroup() %>%
    select(!!!count_groups, percentage) %>%
    pivot_wider(
      names_from = !!sym(feature_var),
      values_from = percentage,
      # Use the same accuracy as the computed shares so that categories absent
      # from a group read "0.00%" rather than "0%"
      values_fill = list(percentage = scales::percent(0, accuracy = 0.01))
    )

  # Calculate the mode for each group (or for the whole data if ungrouped)
  mode_column <- data %>%
    group_by(!!!outer_groups) %>%
    summarize(mode_value = mode_function(!!sym(feature_var))) %>%
    ungroup()

  category_percentages <- if (has_group) {
    left_join(category_percentages, mode_column, by = group_var)
  } else {
    # Without a grouping column there is a single mode for the whole data;
    # attach it directly instead of joining with `by = character(0)`, which
    # newer dplyr versions deprecate as an implicit cross join
    mutate(category_percentages, mode_value = mode_column$mode_value)
  }

  category_percentages <- category_percentages %>%
    mutate(mode = as.character(mode_value)) %>%
    select(-mode_value)

  datatable(
    category_percentages,
    options = list(
      scrollY = "300px",
      scrollX = TRUE,
      paging = FALSE,
      searching = TRUE
    ),
    rownames = FALSE
  )
}
``` | ||
|
||
## Descriptive Stats for Categorical Variables | ||
|
||
::: panel-tabset | ||
|
||
### Descriptive Stats for the County | ||
```{r} | ||
# No grouping column: category shares across all of `pin_individual`
create_category_percentages(
  data = pin_individual,
  group_var = NULL,
  feature_var = params$added_feature
)
``` | ||
|
||
### Descriptive Stats for the Township | ||
```{r} | ||
# Category shares within each value of `meta_township_name`
create_category_percentages(
  data = pin_individual,
  group_var = "meta_township_name",
  feature_var = params$added_feature
)
``` | ||
|
||
### Descriptive Stats for the Neighborhood | ||
```{r} | ||
# Category shares within each value of `meta_nbhd_code`
create_category_percentages(
  data = pin_individual,
  group_var = "meta_nbhd_code",
  feature_var = params$added_feature
)
``` | ||
|
||
### Histogram of the Target Feature | ||
```{r} | ||
# Bar chart of the percentage of rows of `pin_individual` in each category of
# the column named by `target_feature_value` (a string). The name is turned
# into a symbol with sym(); the embrace operator `{{ }}` is only meaningful
# for function arguments and is not needed here.
pin_individual %>%
  count(!!sym(target_feature_value)) %>%
  mutate(percentage = n / sum(n) * 100) %>%
  ggplot(aes(x = !!sym(target_feature_value), y = percentage)) +
  # geom_col() draws bar heights straight from `percentage`
  geom_col(fill = "blue", color = "black", alpha = 0.7) +
  labs(
    x = target_feature_value,
    y = "Percentage"
  ) +
  theme_minimal()
``` | ||
|
||
::: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
#' Download a subset of model run outputs from S3 as data frames
#'
#' Iterates over `analyses_paths$output`. Entries whose `s3` value ends in "/"
#' are treated as directories: every non-empty object under the directory is
#' downloaded, read as parquet, and row-bound. All other entries are fetched
#' as a single parquet file.
#'
#' @param run_id Run identifier used to build the `run_id=` part of directory
#'   paths and, optionally, to suffix the output names.
#' @param year Year used to build the `year=` part of directory paths.
#' @param analyses_paths List with an `output` element; each entry is read via
#'   its `s3` (object or directory URI) and `key` (output name) fields.
#' @param append_run_id If `TRUE`, outputs are named `<key>_<run_id>` instead
#'   of `<key>`.
#' @return A named list of data frames. Outputs that are not found are
#'   skipped with a warning.
model_fetch_run_subset <- function(
    run_id, year, analyses_paths, append_run_id = FALSE) {
  all_values <- unlist(analyses_paths$output)
  s3_uris <- grep("s3://", all_values, value = TRUE)
  # "s3://bucket/..." splits into c("s3:", "", "bucket", ...)
  bucket <- strsplit(s3_uris[1], "/")[[1]][3]

  # Name under which an entry's data is stored in the result
  output_name <- function(entry) {
    if (append_run_id) paste0(entry$key, "_", run_id) else entry$key
  }

  # Save one S3 object into the session temp dir and read it as parquet
  fetch_parquet <- function(object) {
    local_file <- file.path(tempdir(), basename(object))
    aws.s3::save_object(object, bucket = bucket, file = local_file)
    arrow::read_parquet(local_file)
  }

  fetched <- list()

  for (entry in analyses_paths$output) {
    # Single-file entry
    if (!endsWith(entry$s3, "/")) {
      message("Now fetching file: ", entry$s3)
      if (aws.s3::object_exists(entry$s3, bucket = bucket)) {
        fetched[[output_name(entry)]] <- fetch_parquet(entry$s3)
      } else {
        warning(entry$key, " does not exist for this run")
      }
      next
    }

    # Directory entry: append the year/run_id parts unless the path already
    # ends with this run's `run_id=` segment
    run_segment <- paste0("run_id=", run_id, "/")
    dir_path <- if (endsWith(entry$s3, run_segment)) {
      entry$s3
    } else {
      paste0(entry$s3, "year=", year, "/", run_segment)
    }

    message("Now fetching directory: ", dir_path)
    key_prefix <- sub(paste0("s3://", bucket, "/"), "", dir_path)
    listing <- aws.s3::get_bucket_df(bucket, key_prefix)
    listing <- dplyr::filter(listing, Size > 0)

    combined <- purrr::map_dfr(listing$Key, function(key) {
      message("Now fetching file: ", key)
      fetch_parquet(key)
    })

    if (nrow(listing) > 0) {
      fetched[[output_name(entry)]] <- combined
    } else {
      warning(entry$key, " does not exist for this run")
    }
  }

  return(fetched)
}
|
||
|
||
#' Rename a data frame variable by swapping the suffix of its name
#'
#' If `var_name` is bound to a data frame in `envir` and the name ends in
#' `_<suffix>`, the object is re-bound under a name in which that trailing
#' `_<suffix>` is replaced by `new_suffix`, and the old binding is removed.
#' In every other case nothing is changed.
#'
#' @param var_name Name of the variable to rename, as a string.
#' @param suffix Trailing suffix to match, without its leading underscore.
#'   It is inserted into a regular expression as-is.
#' @param new_suffix Replacement text. It replaces the underscore as well, so
#'   include a leading "_" to keep one in the new name.
#' @param envir Environment in which the variable is looked up, assigned, and
#'   removed. Defaults to the global environment.
#' @return `NULL`, invisibly; called for its side effect on `envir`.
rename_var <- function(var_name, suffix, new_suffix, envir = .GlobalEnv) {
  suffix_pattern <- paste0("_", suffix, "$")

  # Look the variable up in exactly the environment that is modified below.
  # Without `envir`/`inherits = FALSE` the lookup would also see this
  # function's own arguments and objects on the search path.
  if (!exists(var_name, envir = envir, inherits = FALSE)) {
    return(invisible(NULL))
  }
  value <- get(var_name, envir = envir, inherits = FALSE)
  if (!is.data.frame(value) || !grepl(suffix_pattern, var_name)) {
    return(invisible(NULL))
  }

  new_name <- sub(suffix_pattern, new_suffix, var_name)
  # If the name is unchanged, assigning and then removing would delete the
  # object, so leave it alone
  if (!identical(new_name, var_name)) {
    assign(new_name, value, envir = envir)
    rm(list = var_name, envir = envir)
  }
  invisible(NULL)
}
|
||
#' Turn raw feature names stored in a column into readable labels
#'
#' Strips one leading feature-group prefix (`meta_`, `prox_`, `other_`,
#' `loc_`, `char_`, `acs5`/`acs5_`, `acs_`, `ccao_`), replaces the remaining
#' underscores with spaces, and converts the result to title case.
#'
#' @param df A data frame.
#' @param column_name Name (string) of the column of `df` to clean.
#' @return `df` with `column_name` replaced by the cleaned character labels.
clean_column_values <- function(df, column_name) {
  # `acs5` is matched with an optional underscore so that "acs5_x" does not
  # leave a stray leading underscore (rendered as a leading space)
  prefix_pattern <-
    "^meta_|^prox_|^other_|^loc_|^char_|^acs5_?|^acs_|^ccao_"

  labels <- gsub(prefix_pattern, "", df[[column_name]])
  labels <- gsub("_", " ", labels)
  df[[column_name]] <- stringr::str_to_title(labels)
  df
}
|
||
#' Read a DVC-tracked parquet file directly from the DVC S3 bucket
#'
#' Builds the object path from the md5 hash: the first two characters form a
#' directory and the remaining characters form the object name.
#'
#' @param dvc_md5_assessment_data md5 hash of the file, as a single string.
#' @return The contents of the parquet file as returned by
#'   `arrow::read_parquet()`.
s3_data_download <- function(dvc_md5_assessment_data) {
  # An empty, missing, or too-short hash would otherwise silently build a
  # path that points at nothing
  is_valid_hash <- is.character(dvc_md5_assessment_data) &&
    length(dvc_md5_assessment_data) == 1L &&
    !is.na(dvc_md5_assessment_data) &&
    nchar(dvc_md5_assessment_data) > 2L
  if (!is_valid_hash) {
    stop(
      "`dvc_md5_assessment_data` must be a single md5 string ",
      "longer than two characters",
      call. = FALSE
    )
  }

  # Define the S3 path for assessment data
  s3_path <- paste0(
    "s3://ccao-data-dvc-us-east-1/files/md5/",
    substr(dvc_md5_assessment_data, 1, 2), "/",
    substr(dvc_md5_assessment_data, 3, nchar(dvc_md5_assessment_data))
  )

  # Read and return the parquet data; namespaced so that it works without
  # `arrow` being attached
  arrow::read_parquet(s3_path)
}
Oops, something went wrong.