-
Notifications
You must be signed in to change notification settings - Fork 2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Include imputed strata in assessment card/pin #55
base: master
Are you sure you want to change the base?
Changes from all commits
a603001
223aa02
fb520c6
80fd36f
b692b22
6edaaf3
3e1a54b
3e4ab9b
45accc7
84593d4
0837cd2
8ae8d84
aeaa387
d0d8135
0bf170c
75d8d4e
f22d226
92710bf
cfe990b
3eed3df
21ca2ca
77d6cee
2d9628f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -43,20 +43,50 @@ lgbm_final_full_recipe <- readRDS(paths$output$workflow_recipe$local) | |
# FMV per unit | ||
assessment_data_pred <- read_parquet(paths$input$assessment$local) %>% | ||
as_tibble() %>% | ||
# Bake the data first and extract meta strata columns | ||
{ | ||
baked_data <- bake(lgbm_final_full_recipe, new_data = ., all_predictors()) | ||
mutate( | ||
., | ||
pred_card_initial_fmv = as.numeric(predict( | ||
lgbm_final_full_fit, | ||
new_data = baked_data | ||
)$.pred), | ||
# Some strata are imputed during the baking process | ||
# so we extract all values. | ||
temp_strata_1 = baked_data$meta_strata_1, | ||
temp_strata_2 = baked_data$meta_strata_2 | ||
) | ||
} | ||
|
||
# The trained model encodes categorical values as base-0 integers. | ||
# However, here we want to recover the original (unencoded) values | ||
# of our strata variables. To do so, we create a mapping of the | ||
# encoded to unencoded values and use the to recover both the original | ||
# strata values and those imputed by step_impute_knn (in R/recipes.R) | ||
mapping_1 <- assessment_data_pred %>% | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this can be minorly optimized by use deframe from the tibble package, but that's not part of the existing renv. |
||
filter(!is.na(meta_strata_1)) %>% | ||
distinct(temp_strata_1, meta_strata_1) | ||
|
||
mapping_2 <- assessment_data_pred %>% | ||
filter(!is.na(meta_strata_2)) %>% | ||
distinct(temp_strata_2, meta_strata_2) | ||
|
||
strata_mapping_1 <- setNames(mapping_1$meta_strata_1, mapping_1$temp_strata_1) | ||
strata_mapping_2 <- setNames(mapping_2$meta_strata_2, mapping_2$temp_strata_2) | ||
|
||
# Apply mappings | ||
assessment_data_pred <- assessment_data_pred %>% | ||
mutate( | ||
pred_card_initial_fmv = predict( | ||
lgbm_final_full_fit, | ||
new_data = bake( | ||
lgbm_final_full_recipe, | ||
new_data = ., | ||
all_predictors() | ||
) | ||
)$.pred | ||
) | ||
|
||
|
||
|
||
|
||
# Binary variable to identify condos which have imputed strata | ||
meta_strata_is_imputed = ifelse(is.na(meta_strata_1), 1, 0), | ||
# Use mappings to replace meta_strata_1 and meta_strata_2 directly | ||
# Unname removes the previously encoded information for clarity | ||
meta_strata_1 = unname(strata_mapping_1[as.character(temp_strata_1)]), | ||
meta_strata_2 = unname(strata_mapping_2[as.character(temp_strata_2)]) | ||
) %>% | ||
# Remove duplicated columns | ||
select(-temp_strata_1, -temp_strata_2) | ||
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
# 3. Post-Modeling Adjustments ------------------------------------------------- | ||
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
|
@@ -154,7 +184,8 @@ assessment_data_merged %>% | |
select( | ||
meta_year, meta_pin, meta_class, meta_card_num, meta_lline_num, | ||
meta_modeling_group, ends_with("_num_sale"), pred_card_initial_fmv, | ||
all_of(params$model$predictor$all), township_code | ||
all_of(params$model$predictor$all), | ||
meta_strata_is_imputed, township_code | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. suggestion: Let's change the prefix of this variable to |
||
) %>% | ||
mutate( | ||
ccao_n_years_exe_homeowner = as.integer(ccao_n_years_exe_homeowner) | ||
|
@@ -268,7 +299,8 @@ assessment_data_pin <- assessment_data_merged %>% | |
meta_year, meta_pin, meta_pin10, meta_triad_code, meta_township_code, | ||
meta_nbhd_code, meta_tax_code, meta_class, meta_tieback_key_pin, | ||
meta_tieback_proration_rate, meta_cdu, meta_modeling_group, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there any documentation that needs to be updated with new values in pin / card output files? Strata was never in the pin output file to begin with. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nope! This should just get automatically added to the equivalent tables in Athena once it's crawled. |
||
meta_pin_num_landlines, char_yrblt, | ||
meta_pin_num_landlines, char_yrblt, meta_strata_1, meta_strata_2, | ||
meta_strata_is_imputed, | ||
|
||
# Keep overall building square footage | ||
char_total_bldg_sf = char_building_sf, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
assessment_data_pred <- read_parquet(paths$input$assessment$local) %>% | ||
as_tibble() %>% | ||
# Bake the data first and extract meta strata columns | ||
{ | ||
baked_data <- bake(lgbm_final_full_recipe, new_data = ., all_predictors()) | ||
mutate( | ||
., | ||
pred_card_initial_fmv = as.numeric(predict( | ||
lgbm_final_full_fit, | ||
new_data = baked_data | ||
)$.pred), | ||
# Some strata are imputed during the baking process | ||
# so we extract all values. | ||
temp_strata_1 = baked_data$meta_strata_1, | ||
temp_strata_2 = baked_data$meta_strata_2 | ||
) | ||
} | ||
|
||
# For the lightgbm model, values are recoded to a 0 based scale. | ||
# This means that these values are a 1:1 match with values of a | ||
# different scale. Because of this, we map values to our original | ||
# calculations for continuity. | ||
strata_mapping_1 <- assessment_data_pred %>% | ||
filter(!is.na(meta_strata_1)) %>% | ||
distinct(temp_strata_1, meta_strata_1) %>% | ||
with(setNames(meta_strata_1, temp_strata_1)) | ||
|
||
strata_mapping_2 <- assessment_data_pred %>% | ||
filter(!is.na(meta_strata_2)) %>% | ||
distinct(temp_strata_2, meta_strata_2) %>% | ||
with(setNames(meta_strata_2, temp_strata_2)) | ||
|
||
# Apply mappings | ||
assessment_data_pred <- assessment_data_pred %>% | ||
mutate( | ||
# Binary variable to identify condos which have imputed strata | ||
flag_strata_is_imputed = ifelse(is.na(meta_strata_1), 1, 0), | ||
# Use mappings to replace meta_strata_1 and meta_strata_2 directly | ||
# Unname removes the previously encoded information for clarity | ||
meta_strata_1 = unname(strata_mapping_1[as.character(temp_strata_1)]), | ||
meta_strata_2 = unname(strata_mapping_2[as.character(temp_strata_2)]) | ||
) %>% | ||
# Remove duplicated columns | ||
select(-temp_strata_1, -temp_strata_2) | ||
|
||
|
||
assessment_data_pred_old <- read_parquet(paths$input$assessment$local) %>% | ||
as_tibble() %>% | ||
mutate( | ||
pred_card_initial_fmv = predict( | ||
lgbm_final_full_fit, | ||
new_data = bake( | ||
lgbm_final_full_recipe, | ||
new_data = ., | ||
all_predictors() | ||
) | ||
)$.pred | ||
) | ||
|
||
# Perform the comparison | ||
comparison_result <- assessment_data_pred %>% | ||
inner_join(assessment_data_pred_old, by = "meta_pin", suffix = c("_new", "_old")) %>% | ||
mutate( | ||
match_pred_card_initial_fmv = pred_card_initial_fmv_new == pred_card_initial_fmv_old, | ||
match_meta_strata_1 = ifelse( | ||
!is.na(meta_strata_1_new) & !is.na(meta_strata_1_old), | ||
meta_strata_1_new == meta_strata_1_old, | ||
NA | ||
), | ||
match_meta_strata_2 = ifelse( | ||
!is.na(meta_strata_2_new) & !is.na(meta_strata_2_old), | ||
meta_strata_2_new == meta_strata_2_old, | ||
NA | ||
) | ||
) %>% | ||
select( | ||
meta_pin, meta_strata_1_new, meta_strata_1_old, | ||
meta_strata_2_new, meta_strata_2_old, | ||
pred_card_initial_fmv_new, pred_card_initial_fmv_old, | ||
match_pred_card_initial_fmv, match_meta_strata_1, match_meta_strata_2 | ||
) | ||
|
||
# Print the result | ||
print(comparison_result) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
issue (blocking): This is sort of correct. The imputation will be deterministic between model runs if a seed is set somewhere in this file. However, it won't be deterministic if you run the same prediction twice in a single session (unless you set the seed again).
I would add
set.seed(params$input$strata$seed)
somewhere at the top of this file to ensure that prediction is always using the same seed. Then run the stage twice (run once, restart, run again) and check that the results are the same.