Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add data transformation to new_feature_template #252

Closed
wants to merge 16 commits into from
145 changes: 145 additions & 0 deletions analyses/Data_transformation.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
target_feature_value <- params$added_feature
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are used throughout all the other docs, without being explicitly mentioned in the others.

target_feature_shap <- params$added_feature_shap
type <- params$type
nbhd <- ccao::nbhd_shp

# Create a individual card level dataset
card_individual <- shap_new %>%
select(
meta_pin, meta_card_num, pred_card_shap_baseline_fmv,
{{ target_feature_value }}
) %>%
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Renames the value to append _shap, allowing the feature value to be merged without creating .x, .y.

rename(!!sym(target_feature_shap) := !!sym(target_feature_value)) %>%
inner_join(
assessment_card_new %>%
select(
meta_pin, meta_nbhd_code,
meta_card_num,
pred_card_initial_fmv,
{{ target_feature_value }}
),
by = c("meta_pin", "meta_card_num")
)

# Summarizing data by neighborhood code
card_nbhd <- card_individual %>%
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are mostly used for the ggplots for neighborhood mapping.

group_by(meta_nbhd_code) %>%
summarize(
!!paste0({{ target_feature_shap }}, "_mean") :=
mean(!!sym({{ target_feature_shap }}), na.rm = TRUE),
!!paste0({{ target_feature_shap }}, "_90th") :=
quantile(!!sym({{ target_feature_shap }}), probs = 0.9, na.rm = TRUE),
!!paste0({{ target_feature_shap }}, "_mean_abs") :=
mean(abs(!!sym({{ target_feature_shap }})), na.rm = TRUE)
) %>%
ungroup() %>%
inner_join(
nbhd,
by = c("meta_nbhd_code" = "town_nbhd")
) %>%
st_as_sf()

## Create a pin level dataset
pin_individual <- assessment_pin_new %>%
select(meta_pin, pred_pin_final_fmv, pred_pin_initial_fmv) %>%
rename(
pred_pin_final_fmv_new = pred_pin_final_fmv,
pred_pin_initial_fmv_new = pred_pin_initial_fmv
) %>%
inner_join(
assessment_pin_comparison %>%
select(meta_pin, pred_pin_final_fmv, pred_pin_initial_fmv),
by = "meta_pin"
) %>%
rename(
pred_pin_final_fmv_comp = pred_pin_final_fmv,
pred_pin_initial_fmv_comp = pred_pin_initial_fmv
) %>%
mutate(
diff_pred_pin_final_fmv =
round(((pred_pin_final_fmv_new - pred_pin_final_fmv_comp) /
pred_pin_final_fmv_comp), 4),

Check warning on line 61 in analyses/Data_transformation.R

View workflow job for this annotation

GitHub Actions / pre-commit

file=/home/runner/work/model-res-avm/model-res-avm/analyses/Data_transformation.R,line=61,col=8,[indentation_linter] Indentation should be 15 spaces but is 8 spaces.
pred_pin_final_fmv_new = dollar(pred_pin_final_fmv_new),
pred_pin_final_fmv_comp = dollar(pred_pin_final_fmv_comp),
diff_pred_pin_initial_fmv =
round(((pred_pin_initial_fmv_new - pred_pin_initial_fmv_comp) /
pred_pin_initial_fmv_comp), 4),

Check warning on line 66 in analyses/Data_transformation.R

View workflow job for this annotation

GitHub Actions / pre-commit

file=/home/runner/work/model-res-avm/model-res-avm/analyses/Data_transformation.R,line=66,col=8,[indentation_linter] Indentation should be 15 spaces but is 8 spaces.
pred_pin_initial_fmv_new = dollar(pred_pin_initial_fmv_new),
pred_pin_initial_fmv_comp = dollar(pred_pin_initial_fmv_comp)
) %>%
inner_join(
assessment_data_new %>%
distinct(meta_pin, .keep_all = TRUE) %>%
select(
meta_pin, meta_nbhd_code, loc_longitude,
loc_latitude, meta_township_name, {{ target_feature_value }}
),
by = "meta_pin"
)

# Aggregate to neighborhood level
if (type == "continuous") {
pin_nbhd <- pin_individual %>%
group_by(meta_nbhd_code) %>%
summarize(
!!paste0(target_feature_value, "_neighborhood_mean") :=
mean(!!sym(target_feature_value), na.rm = TRUE),
!!paste0(target_feature_value, "_neighborhood_median") :=
median(!!sym(target_feature_value), na.rm = TRUE),
!!paste0(target_feature_value, "_neighborhood_90th") :=
quantile(!!sym(target_feature_value), 0.9, na.rm = TRUE)
) %>%
inner_join(nbhd, by = c("meta_nbhd_code" = "town_nbhd")) %>%
st_as_sf()
} else {
pin_nbhd <- pin_individual %>%
group_by(meta_nbhd_code, !!sym(target_feature_value)) %>%
count() %>%
ungroup() %>%
group_by(meta_nbhd_code) %>%
mutate(
percentage = n / sum(n) * 100
) %>%
arrange(meta_nbhd_code, desc(n)) %>%
mutate(
plurality_factor = first(!!sym(target_feature_value))
) %>%
ungroup() %>%
select(meta_nbhd_code, !!sym(target_feature_value), percentage, plurality_factor) %>%

Check warning on line 108 in analyses/Data_transformation.R

View workflow job for this annotation

GitHub Actions / pre-commit

file=/home/runner/work/model-res-avm/model-res-avm/analyses/Data_transformation.R,line=108,col=81,[line_length_linter] Lines should not be more than 80 characters. This line is 89 characters.
pivot_wider(
names_from = !!sym(target_feature_value),
values_from = percentage,
names_prefix = "percentage_"
) %>%
inner_join(nbhd, by = c("meta_nbhd_code" = "town_nbhd")) %>%
st_as_sf()
}

# Pivot wider for leaflet maps to allow multiple shap values
leaflet_data <- card_individual %>%
select(meta_pin, relative_shap, {{ target_feature_shap }}) %>%
group_by(meta_pin) %>%
mutate(
shap_total = sum(!!sym({{ target_feature_shap }})),
variable_index = row_number(),
name_col = paste0(
deparse(substitute(
target_feature_shap
)), "_",
variable_index
)
) %>%
pivot_wider(
id_cols = c("meta_pin", "shap_total"),
names_from = name_col,
values_from = !!sym({{ target_feature_shap }})
) %>%
ungroup() %>%
right_join(pin_individual, by = c("meta_pin" = "meta_pin")) %>%
mutate(
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removes the dollar::scales before dividing by it.

pred_pin_initial_fmv_new_numeric =
as.numeric(gsub("[$,]", "", pred_pin_initial_fmv_new)),
relative_shap =
round(as.numeric(shap_total) / pred_pin_initial_fmv_new_numeric, 2)
) %>%
distinct(meta_pin, .keep_all = TRUE)
Loading