ccao-data · Damonamajor · Nov 13, 2024 · Nov 13, 2024 · Nov 13, 2024 · Nov 13, 2024
@@ -31,7 +31,10 @@ model_main_recipe <- function(data, pred_vars, cat_vars,
     # Remove any variables not an outcome var or in the pred_vars vector
     step_rm(-all_outcomes(), -all_predictors(), -has_role("ID")) %>%
     # Impute missing values using KNN. Specific to condo model, usually used to
-    # impute missing condo building strata
+    # impute missing condo building strata. Internally, step_impute_knn calls
+    # sample() when estimating a node value, so it is not deterministic:
+    # different runs of the model will have different imputed values, and thus
+    # different FMVs.
     step_impute_knn(
       all_of(knn_vars),
       neighbors = tune(),

@@ -43,20 +43,50 @@ lgbm_final_full_recipe <- readRDS(paths$output$workflow_recipe$local)
 # FMV per unit
 assessment_data_pred <- read_parquet(paths$input$assessment$local) %>%
   as_tibble() %>%
+  # Bake once up front so the same baked data feeds predict() and the strata
+  {
+    baked_data <- bake(lgbm_final_full_recipe, new_data = ., all_predictors())
+    mutate(
+      .,
+      pred_card_initial_fmv = as.numeric(predict(
+        lgbm_final_full_fit,
+        new_data = baked_data
+      )$.pred),
+      # Baking may impute strata that are missing in the raw data, so keep
+      # the baked strata for every row in temporary columns.
+      temp_strata_1 = baked_data$meta_strata_1,
+      temp_strata_2 = baked_data$meta_strata_2
+    )
+  }
+
+# The trained model encodes categorical values as base-0 integers.
+# However, here we want to recover the original (unencoded) values
+# of our strata variables. To do so, we create a mapping of the
+# encoded to unencoded values and use it to recover both the original
+# strata values and those imputed by step_impute_knn (in R/recipes.R)
+# Distinct (encoded, original) strata pairs among rows whose original value
+# is known, and the named lookup vector (names = encoded) built from them
+mapping_1 <- assessment_data_pred %>%
+  distinct(temp_strata_1, meta_strata_1) %>%
+  filter(!is.na(meta_strata_1))
+strata_mapping_1 <- with(mapping_1, setNames(meta_strata_1, temp_strata_1))
+
+mapping_2 <- assessment_data_pred %>%
+  distinct(temp_strata_2, meta_strata_2) %>%
+  filter(!is.na(meta_strata_2))
+strata_mapping_2 <- with(mapping_2, setNames(meta_strata_2, temp_strata_2))
+
+# Apply the mappings to turn encoded strata back into original values
+assessment_data_pred <- assessment_data_pred %>%
   mutate(
-    pred_card_initial_fmv = predict(
-      lgbm_final_full_fit,
-      new_data = bake(
-        lgbm_final_full_recipe,
-        new_data = .,
-        all_predictors()
-      )
-    )$.pred
-  )
-
-
-
-
+    # 1/0 flag: 1 when the original strata was missing (so it was imputed)
+    meta_strata_is_imputed = ifelse(is.na(meta_strata_1), 1, 0),
+    # Look up each encoded strata by name to overwrite meta_strata_1 and
+    # meta_strata_2; unname() drops the lookup keys from the result
+    meta_strata_1 = unname(strata_mapping_1[as.character(temp_strata_1)]),
+    meta_strata_2 = unname(strata_mapping_2[as.character(temp_strata_2)])
+  ) %>%
+  # Drop the temporary encoded columns now that they are mapped back
+  select(-temp_strata_1, -temp_strata_2)
 #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 # 3. Post-Modeling Adjustments -------------------------------------------------
 #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
@@ -154,7 +184,8 @@ assessment_data_merged %>%
   select(
     meta_year, meta_pin, meta_class, meta_card_num, meta_lline_num,
     meta_modeling_group, ends_with("_num_sale"), pred_card_initial_fmv,
-    all_of(params$model$predictor$all), township_code
+    all_of(params$model$predictor$all),
+    meta_strata_is_imputed, township_code
   ) %>%
   mutate(
     ccao_n_years_exe_homeowner = as.integer(ccao_n_years_exe_homeowner)
@@ -268,7 +299,8 @@ assessment_data_pin <- assessment_data_merged %>%
     meta_year, meta_pin, meta_pin10, meta_triad_code, meta_township_code,
     meta_nbhd_code, meta_tax_code, meta_class, meta_tieback_key_pin,
     meta_tieback_proration_rate, meta_cdu, meta_modeling_group,
-    meta_pin_num_landlines, char_yrblt,
+    meta_pin_num_landlines, char_yrblt, meta_strata_1, meta_strata_2,
+    meta_strata_is_imputed,
 
     # Keep overall building square footage
     char_total_bldg_sf = char_building_sf,

@@ -0,0 +1,85 @@
+# Score the assessment data and keep the baked strata columns next to the
+# raw ones. The recipe is baked once so the same baked frame feeds both the
+# model and the temp_strata_* columns.
+assessment_data_pred <- local({
+  assessment_raw <- as_tibble(read_parquet(paths$input$assessment$local))
+  assessment_baked <- bake(
+    lgbm_final_full_recipe,
+    new_data = assessment_raw,
+    all_predictors()
+  )
+  card_fmv <- predict(lgbm_final_full_fit, new_data = assessment_baked)$.pred
+
+  assessment_raw %>%
+    mutate(
+      pred_card_initial_fmv = as.numeric(card_fmv),
+      # Keep the baked strata for every row; baking may have filled in
+      # strata that were missing in the raw data
+      temp_strata_1 = assessment_baked$meta_strata_1,
+      temp_strata_2 = assessment_baked$meta_strata_2
+    )
+})
+
+# For the lightgbm model, the baked strata are recoded to a 0-based scale,
+# while the raw data holds the original values. Build one named lookup
+# vector per strata column (names = encoded value, values = original value)
+# from the rows where the original value is known, so encoded values can be
+# mapped back to the original scale for continuity.
+strata_mapping_1 <- assessment_data_pred %>%
+  distinct(temp_strata_1, meta_strata_1) %>%
+  filter(!is.na(meta_strata_1)) %>%
+  {
+    setNames(.$meta_strata_1, .$temp_strata_1)
+  }
+
+strata_mapping_2 <- assessment_data_pred %>%
+  distinct(temp_strata_2, meta_strata_2) %>%
+  filter(!is.na(meta_strata_2)) %>%
+  {
+    setNames(.$meta_strata_2, .$temp_strata_2)
+  }
+
+# Translate the encoded strata back to their original values
+assessment_data_pred <- assessment_data_pred %>%
+  mutate(
+    # 1 when the original strata was missing (so the final value comes from
+    # imputation), 0 otherwise
+    flag_strata_is_imputed = as.numeric(is.na(meta_strata_1)),
+    # Look each encoded value up by name in its mapping; unname() drops the
+    # lookup keys so only the original strata values remain
+    meta_strata_1 = unname(strata_mapping_1[as.character(temp_strata_1)]),
+    meta_strata_2 = unname(strata_mapping_2[as.character(temp_strata_2)])
+  ) %>%
+  # The temporary encoded columns are no longer needed
+  select(-c(temp_strata_1, temp_strata_2))
+
+
+# Baseline for the comparison: the same input scored with the previous
+# approach, which only adds the prediction column
+assessment_data_pred_old <- local({
+  old_input <- as_tibble(read_parquet(paths$input$assessment$local))
+  old_baked <- bake(
+    lgbm_final_full_recipe,
+    new_data = old_input,
+    all_predictors()
+  )
+
+  mutate(
+    old_input,
+    pred_card_initial_fmv = predict(
+      lgbm_final_full_fit,
+      new_data = old_baked
+    )$.pred
+  )
+})
+
+# Compare the new pipeline against the baseline, PIN by PIN.
+# NOTE(review): the join keys on meta_pin only, which assumes one row per
+# PIN in the assessment data -- confirm.
+comparison_result <- inner_join(
+  assessment_data_pred,
+  assessment_data_pred_old,
+  by = "meta_pin",
+  suffix = c("_new", "_old")
+) %>%
+  mutate(
+    # Exact equality on purpose: this checks whether two runs reproduce the
+    # very same prediction
+    match_pred_card_initial_fmv =
+      pred_card_initial_fmv_new == pred_card_initial_fmv_old,
+    # `==` already yields NA when either side is NA, so no explicit NA guard
+    # is needed for the strata comparisons
+    match_meta_strata_1 = meta_strata_1_new == meta_strata_1_old,
+    match_meta_strata_2 = meta_strata_2_new == meta_strata_2_old
+  ) %>%
+  select(
+    meta_pin,
+    meta_strata_1_new, meta_strata_1_old,
+    meta_strata_2_new, meta_strata_2_old,
+    pred_card_initial_fmv_new, pred_card_initial_fmv_old,
+    match_pred_card_initial_fmv, match_meta_strata_1, match_meta_strata_2
+  )
+
+# Show the side-by-side result
+print(comparison_result)
+