Create a feature selection evaluation template (#250)
* Push of old docs

* Draft

* Chugging

* Keep pushing

* Include temp files

* Remove old file

* Remove old file

* Move things around

* Work on tabsets

* Continued improvements

* Finalizing version

* lintr

* Push old analyses

* Move to correct folder

* Finalize

* Typo

* Leaflet function

* Download directly rather than save

* Cosmetic changes

* Quick text changes

* Fix shap download

* Fix join

* lintr

* Update model run and FMV tests

* Add quarto.yml

* Changes before stashing

* Changes before stashing

* push dvclock

* Add some more charts

* Add categorical charts

* Improve naming of columns

* lintr

* Add simple fmv change

* lintr

* Add correlation plot,
Refactor to _pin / _card

* Add shap relationship plots

* Revert to old join

* Revert join

* unhash

* Add location fix

* Quick edits

* lintr

* Improve borders

* Add low order scheme

* Include neighborhood level pluralities

* Add temp pin join

* Add some text

* Remove hashes

* Renaming stuff

* Hashing to switch branches

* gitignore

* Create data processing file

* Add data processing

* rename file

* Create analyses paths

* add comps

* Violin plot

* Add unique components to code_to_review

* Better defining of keys

* Add append ingest scripts,
Add link to source

* Add ingest script to external

* Push all files

* Update download files

* lintr

* Remove old files

* Remove old files

* Remove old files

* Stage leaflet changes

* Add neighborhood maps

* Add SHAP plots

* Add analysis helper file

* Add ingest script

* Add ratio stats

* Add data transformations

* Add correlations

* Add summary stats

* Integrate into single doc

* Add percentage option to leaflet maps

* lintr

* Apply suggestions from code review

Co-authored-by: Dan Snow <31494343+dfsnow@users.noreply.github.com>

* Remove analysis

* Add png to gitignore

* Move readme

* Revert dvc lock

* lintr

* Minor editing

* Correct # values

* Add categorical variables

* lint

* lintr

* remove extra header

* Simplify ingest

* Add quarto documentation

* Rename helpers

* Revert dvc

* Add temp readme.md

* Change text

* Remove text

* Remove old readme

---------

Co-authored-by: Dan Snow <31494343+dfsnow@users.noreply.github.com>
Damonamajor and dfsnow authored Sep 5, 2024
1 parent dd0dcc3 commit ce04cd2
Showing 7 changed files with 1,382 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -23,9 +23,13 @@ cache/
*.xlsm
*.html
*.rmarkdown
*.quarto
*.png

# Ignore scratch documents
scratch*.*

# Python files
__pycache__

/.quarto/
1 change: 1 addition & 0 deletions _quarto.yml
@@ -0,0 +1 @@
# This identifies the root directory for Quarto
25 changes: 25 additions & 0 deletions analyses/README.md
@@ -0,0 +1,25 @@
The new-feature-template is a feature selection tool that provides insight into whether a new feature adds value to the model.

To complete this report, execute the following workflow:

1. Identify whether there is a model run that uses all features except the new feature you plan to add.

2. If there is a comparable run, skip to step 5.

3. Update the params with the added variable for the new feature.

4. Run `dvc unfreeze ingest` in the terminal.

5. Run the ingest stage with `dvc repro -f ingest` in the terminal.

6. Run the model through GitHub Actions with SHAP values, upload to S3, and cross-validation enabled.

7. Create a new folder named after the new feature, suffixed with an ascending numeric value.

8. Update the params of new-feature-template with the correct `run_id`s for the comparison run and the new run (a hypothetical sketch of this params block follows this list).

9. Run the report and review the results.

10. Write a summary of the results in the README.md file of the new feature folder.
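
A minimal sketch of what the step 8 params update might look like in the template's YAML header. The `added_feature` key appears in the report's chunks (as `params$added_feature`); the run ID key names below are hypothetical and should match whatever the template actually reads:

```yaml
params:
  # Run that includes every feature except the new one (hypothetical key name)
  comparison_run_id: "2024-01-01-example-comparison"
  # Run that adds the new feature (hypothetical key name)
  new_run_id: "2024-01-02-example-new"
  # Column name of the feature under evaluation
  added_feature: "example_feature_name"
```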
50 changes: 50 additions & 0 deletions analyses/aggregated_maps_categorical.qmd
@@ -0,0 +1,50 @@
---
title: "aggregated_maps_categorical"
format: html
---

```{r}
# Extract the category suffixes from the percentage_* columns of the
# neighborhood-level sf object
suffixes <- pin_nbhd %>%
  as_tibble() %>%
  select(starts_with("percentage_")) %>%
  names() %>%
  gsub("percentage_", "", .)
```

```{r}
# Build one choropleth per category, filled by its percentage_* share
maps <- lapply(suffixes, function(suffix) {
  pin_nbhd %>%
    ggplot() +
    geom_sf(aes(fill = !!sym(paste0("percentage_", suffix)))) +
    scale_fill_viridis_c(option = "viridis", name = paste0("Value: ", suffix)) +
    theme_void() +
    coord_sf(xlim = c(-88.4, -87.52398), ylim = c(41.5, 42.2)) +
    ggtitle(paste0("Map for ", suffix))
})
```

## Categorical Maps

```{r, results = 'asis'}
for (i in seq_along(maps)) {
  cat("### Map for ", suffixes[i], "\n\n")
  print(maps[[i]])
  cat("\n\n")
}
```
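
The `results = 'asis'` chunk option makes Quarto treat the `cat()` output as raw Markdown, so each map is rendered under its own `###` heading rather than as printed text.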

## Plurality in Each Neighborhood

```{r}
pin_nbhd %>%
  ggplot() +
  geom_sf(aes(fill = as.factor(plurality_factor))) +
  scale_fill_viridis_d(option = "viridis") +
  labs(fill = "Variable") +
  theme_void() +
  coord_sf(xlim = c(-88.4, -87.52398), ylim = c(41.5, 42.2)) +
  ggtitle("Plurality Variable")
```
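
`plurality_factor` is assumed to be computed upstream in the data processing step; a minimal sketch of one way it could be derived from the `percentage_*` columns, reusing `pin_nbhd` and `suffixes` from the chunks above:

```{r}
# For each neighborhood, label it with the category whose percentage_*
# share is largest; ties resolve to the first column encountered
pin_nbhd <- pin_nbhd %>%
  rowwise() %>%
  mutate(
    plurality_factor = suffixes[which.max(c_across(starts_with("percentage_")))]
  ) %>%
  ungroup()
```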
84 changes: 84 additions & 0 deletions analyses/descriptive_stats_categorical.qmd
@@ -0,0 +1,84 @@
---
title: "add_descriptive_stats_categorical"
format: html
---

```{r}
create_category_percentages <- function(data, group_var, feature_var) {
  # Group by the geography column (if any) plus the feature itself
  group_vars <- if (!is.null(group_var)) {
    list(sym(group_var), sym(feature_var))
  } else {
    list(sym(feature_var))
  }
  mode_function <- function(x) {
    ux <- unique(x)
    ux[which.max(tabulate(match(x, ux)))]
  }
  # Share of each feature category within each group, spread to one
  # column per category
  category_percentages <- data %>%
    group_by(!!!group_vars) %>%
    count() %>%
    group_by(!!!if (!is.null(group_var)) list(sym(group_var)) else list()) %>%
    mutate(percentage = scales::percent(n / sum(n), accuracy = 0.01)) %>%
    select(!!!group_vars, percentage) %>%
    pivot_wider(
      names_from = !!sym(feature_var),
      values_from = percentage,
      values_fill = list(percentage = scales::percent(0))
    )
  # Calculate the mode for each group (if group_var is present) and add it
  # as a new column
  mode_column <- data %>%
    group_by(!!!if (!is.null(group_var)) list(sym(group_var)) else list()) %>%
    summarize(mode_value = mode_function(!!sym(feature_var))) %>%
    ungroup()
  # Join the mode column to the category_percentages data
  category_percentages <- category_percentages %>%
    left_join(mode_column, by = if (!is.null(group_var)) group_var else character(0)) %>%
    mutate(mode = as.character(mode_value)) %>%
    select(-mode_value)
  # Render as a scrollable, searchable DT table
  datatable(category_percentages,
    options = list(
      scrollY = "300px",
      scrollX = TRUE,
      paging = FALSE,
      searching = TRUE
    ),
    rownames = FALSE
  )
}
```
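
`mode_function` finds the most frequent level by tabulating matches against the vector's unique values; when two levels tie, `which.max()` returns the one that appears first in the data.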

## Descriptive Stats for Categorical Variables

::: panel-tabset

### Descriptive Stats for the County
```{r}
create_category_percentages(pin_individual, NULL, params$added_feature)
```

### Descriptive Stats for the Township
```{r}
create_category_percentages(pin_individual, "meta_township_name", params$added_feature)
```

### Descriptive Stats for the Neighborhood
```{r}
create_category_percentages(pin_individual, "meta_nbhd_code", params$added_feature)
```

### Histogram of the Target Feature
```{r}
pin_individual %>%
  count(!!sym(target_feature_value)) %>%
  mutate(percentage = n / sum(n) * 100) %>%
  ggplot(aes(x = !!sym(target_feature_value), y = percentage)) +
  geom_col(fill = "blue", color = "black", alpha = 0.7) +
  labs(
    x = target_feature_value,
    y = "Percentage"
  ) +
  theme_minimal()
```

:::
95 changes: 95 additions & 0 deletions analyses/helpers.R
@@ -0,0 +1,95 @@
# Download model outputs for a single run from S3, returning them as a
# named list of data frames
model_fetch_run_subset <- function(
    run_id, year, analyses_paths, append_run_id = FALSE) {
  s3_objs <- grep("s3://", unlist(analyses_paths$output), value = TRUE)
  bucket <- strsplit(s3_objs[1], "/")[[1]][3]

  data_list <- list()

  for (analyses_path in analyses_paths$output) {
    is_directory <- endsWith(analyses_path$s3, "/")
    if (is_directory) {
      partitioned_by_run <- endsWith(
        analyses_path$s3,
        paste0("run_id=", run_id, "/")
      )
      dir_path <- if (partitioned_by_run) {
        analyses_path$s3
      } else {
        paste0(analyses_path$s3, "year=", year, "/run_id=", run_id, "/")
      }

      message("Now fetching directory: ", dir_path)
      objs_prefix <- sub(paste0("s3://", bucket, "/"), "", dir_path)
      objs <- aws.s3::get_bucket_df(bucket, objs_prefix)
      objs <- dplyr::filter(objs, Size > 0)

      # Download each non-empty object to a temp file and row-bind the
      # resulting parquet data
      combined_data <- purrr::map_dfr(objs$Key, function(key) {
        message("Now fetching file: ", key)
        local_temp_path <- file.path(tempdir(), basename(key))
        aws.s3::save_object(key, bucket = bucket, file = local_temp_path)
        arrow::read_parquet(local_temp_path)
      })

      if (nrow(objs) > 0) {
        data_key <- if (append_run_id) {
          paste0(analyses_path$key, "_", run_id)
        } else {
          analyses_path$key
        }
        data_list[[data_key]] <- combined_data
      } else {
        warning(analyses_path$key, " does not exist for this run")
      }
    } else {
      message("Now fetching file: ", analyses_path$s3)
      if (aws.s3::object_exists(analyses_path$s3, bucket = bucket)) {
        local_temp_path <- file.path(tempdir(), basename(analyses_path$s3))
        aws.s3::save_object(
          analyses_path$s3,
          bucket = bucket, file = local_temp_path
        )
        data_key <- if (append_run_id) {
          paste0(analyses_path$key, "_", run_id)
        } else {
          analyses_path$key
        }
        data_list[[data_key]] <- arrow::read_parquet(local_temp_path)
      } else {
        warning(analyses_path$key, " does not exist for this run")
      }
    }
  }

  return(data_list)
}


# If a data frame named var_name ends with _<suffix>, rebind it in the
# global environment under the new suffix and remove the old binding
rename_var <- function(var_name, suffix, new_suffix) {
  if (exists(var_name) && is.data.frame(get(var_name))) {
    if (grepl(paste0("_", suffix, "$"), var_name)) {
      new_name <- sub(paste0("_", suffix, "$"), new_suffix, var_name)
      assign(new_name, get(var_name), envir = .GlobalEnv)
      rm(list = var_name, envir = .GlobalEnv)
    }
  }
}

# Strip feature-name prefixes from a column's values and title-case them
# for display
clean_column_values <- function(df, column_name) {
  df[[column_name]] <- df[[column_name]] %>%
    gsub("^meta_|^prox_|^other_|^loc_|^char_|^acs5|^acs_|^ccao_", "", .) %>%
    gsub("_", " ", .) %>%
    stringr::str_to_title()
  return(df)
}

s3_data_download <- function(dvc_md5_assessment_data) {
  # Define the S3 path for assessment data, following DVC's
  # content-addressed layout: files/md5/<first 2 md5 chars>/<rest>
  s3_path <- paste0(
    "s3://ccao-data-dvc-us-east-1/files/md5/",
    substr(dvc_md5_assessment_data, 1, 2), "/",
    substr(dvc_md5_assessment_data, 3, nchar(dvc_md5_assessment_data))
  )

  # Read and return the parquet data directly from S3
  arrow::read_parquet(s3_path)
}
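
A minimal usage sketch for `model_fetch_run_subset`, assuming an `analyses_paths` object shaped like the one the function iterates over; the bucket, keys, and run ID below are hypothetical:

```r
source("analyses/helpers.R")

# Hypothetical paths object: each output entry pairs an S3 location with
# the key under which its data is stored in the returned list
analyses_paths <- list(
  output = list(
    list(s3 = "s3://example-bucket/model/shap/", key = "shap"),
    list(s3 = "s3://example-bucket/model/assessment/", key = "assessment")
  )
)

# Fetch every output for a single run, suffixing list keys with the run ID
data_list <- model_fetch_run_subset(
  run_id = "abc123",
  year = "2024",
  analyses_paths = analyses_paths,
  append_run_id = TRUE
)
shap_df <- data_list[["shap_abc123"]]
```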