From b16a3e64b5d0af8f0a357ff7e89311aa47961670 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Tue, 21 Nov 2023 19:58:09 +0000 Subject: [PATCH 01/27] Generate and upload model performance report in finalize pipeline step --- dvc.yaml | 8 +++-- misc/file_dict.csv | 3 +- pipeline/05-finalize.R | 54 ++++++++++++++++++++++++----- reports/performance/performance.qmd | 19 ++++++++++ 4 files changed, 72 insertions(+), 12 deletions(-) create mode 100644 reports/performance/performance.qmd diff --git a/dvc.yaml b/dvc.yaml index 861d8a6e..346c3879 100755 --- a/dvc.yaml +++ b/dvc.yaml @@ -124,9 +124,9 @@ stages: finalize: cmd: Rscript pipeline/05-finalize.R desc: > - Save run timings, upload pipeline run results to S3, and send an SNS - notification. Will also clean some of the generated outputs prior to - upload and attach a unique run ID + Save run timings, generate a report, upload pipeline run results to S3, + and send an SNS notification. Will also clean some of the generated + outputs prior to upload and attach a unique run ID deps: - output/parameter_final/model_parameter_final.parquet - output/parameter_range/model_parameter_range.parquet @@ -160,6 +160,8 @@ stages: cache: false - output/metadata/model_metadata.parquet: cache: false + - reports/performance/performance.html: + cache: false export: cmd: Rscript pipeline/06-export.R diff --git a/misc/file_dict.csv b/misc/file_dict.csv index 5e1b0f8d..eab4ba16 100644 --- a/misc/file_dict.csv +++ b/misc/file_dict.csv @@ -19,6 +19,7 @@ output,performance_quantile_test,3,evaluate,ccao-model-results-us-east-1,output/ output,performance_quantile_assessment,3,evaluate,ccao-model-results-us-east-1,output/performance_quantile/model_performance_quantile_assessment.parquet,performance_quantile/year={year}/stage=assessment/{run_id}.parquet,performance_quantile,geography [by class] by quantile,"year, run_id, stage, geography_type, geography_id, by_class, class, quantile",No,Test + assessment,Performance metrics by quantile within class and geography,Assessment set uses the prior year sales to compare to the assessed value output,shap,4,interpret,ccao-model-results-us-east-1,output/shap/model_shap.parquet,shap/,shap,card,"year, run_id, township_code, meta_pin, meta_card_num",No,Yes,SHAP values for each feature for each card in the assessment data,NOTE: Each run adds new partitions to S3 which must be added via a Glue crawler output,feature_importance,4,interpret,ccao-model-results-us-east-1,output/feature_importance/model_feature_importance.parquet,feature_importance/year={year}/{run_id}.parquet,feature_importance,predictor,"year, run_id, model_predictor_all_name",No,Yes,"Feature importance values (gain, cover, and frequency) for the run", +output,report,5,finalize,ccao-model-results-us-east-1,reports/performance/performance.html,report/year={year}/{run_id}.html,,model run,,No,Yes,Rendered Quarto doc with model performance statistics output,metadata,5,finalize,ccao-model-results-us-east-1,output/metadata/model_metadata.parquet,metadata/year={year}/{run_id}.parquet,metadata,model run,"year, run_id",Yes,Yes,"Information about each run, including parameters, run ID, git info, etc.", intermediate,timing,,all,,output/intermediate/timing/,,,model stage,"year, msg",Yes,Yes,Parquet files for each stage containing the stage time elapsed,Converted into a one-row data frame in the finalize stage -output,timing,,all,ccao-model-results-us-east-1,output/timing/model_timing.parquet,timing/year={year}/{run_id}.parquet,timing,model run,"year, run_id",Yes,Yes,Finalized time 
elapsed for each stage of the run,"Each row represents one run, while columns represent the stages" \ No newline at end of file +output,timing,,all,ccao-model-results-us-east-1,output/timing/model_timing.parquet,timing/year={year}/{run_id}.parquet,timing,model run,"year, run_id",Yes,Yes,Finalized time elapsed for each stage of the run,"Each row represents one run, while columns represent the stages" diff --git a/pipeline/05-finalize.R b/pipeline/05-finalize.R index ee03e3d9..eb001546 100644 --- a/pipeline/05-finalize.R +++ b/pipeline/05-finalize.R @@ -12,10 +12,12 @@ suppressPackageStartupMessages({ library(aws.ec2metadata) library(ccao) library(dplyr) + library(glue) library(here) library(lubridate) library(paws.application.integration) library(purrr) + library(quarto) library(tidyr) library(tune) library(yaml) @@ -203,7 +205,23 @@ tictoc::tic.clearlog() #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -# 4. Upload -------------------------------------------------------------------- +# 4. Generate performance report ----------------------------------------------- +#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +message("Generating performance report") + +here("../reports/performance/performance.qmd") %>% + quarto_render( + execute_params = list( + run_id = run_id, + year = params$assessment$year + ) + ) + + + + +#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +# 5. Upload -------------------------------------------------------------------- #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - message("Uploading run artifacts") @@ -216,7 +234,7 @@ if (params$toggle$upload_to_s3) { ) - ## 4.1. Train ---------------------------------------------------------------- + ## 5.1. Train ---------------------------------------------------------------- # Upload lightgbm fit aws.s3::put_object( @@ -301,7 +319,7 @@ if (params$toggle$upload_to_s3) { } - # 4.2. Assess ---------------------------------------------------------------- + # 5.2. Assess ---------------------------------------------------------------- message("Uploading final assessment results") # Upload PIN and card-level values for full runs. These outputs are very @@ -329,7 +347,7 @@ if (params$toggle$upload_to_s3) { } - # 4.3. Evaluate -------------------------------------------------------------- + # 5.3. Evaluate -------------------------------------------------------------- # Upload test set performance message("Uploading test set evaluation") @@ -356,7 +374,7 @@ if (params$toggle$upload_to_s3) { } - # 4.4. Interpret ------------------------------------------------------------- + # 5.4. Interpret ------------------------------------------------------------- # Upload SHAP values if a full run. SHAP values are one row per card and one # column per feature, so the output is very large. Therefore, we partition @@ -384,8 +402,8 @@ if (params$toggle$upload_to_s3) { } - # 4.5. Finalize -------------------------------------------------------------- - message("Uploading run metadata and timings") + # 5.5. 
Finalize -------------------------------------------------------------- + message("Uploading run metadata, timings, and performance report") # Upload metadata aws.s3::put_object( @@ -398,13 +416,19 @@ if (params$toggle$upload_to_s3) { paths$output$timing$local, paths$output$timing$s3 ) + + # Upload performance report + aws.s3::put_object( + paths$output$report$local, + paths$output$report$s3 + ) } #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -# 5. Wrap-Up ------------------------------------------------------------------- +# 6. Wrap-Up ------------------------------------------------------------------- #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # This will run a Glue crawler to update schemas and send an email to any SNS @@ -450,12 +474,26 @@ if (params$toggle$upload_to_s3) { .[!grepl("=", .)] %>% paste0(collapse = "\n") + # Get a link to the uploaded Quarto report + report_path_parts <- strsplit(paths$output$report$s3[1], "/")[[1]] + report_bucket <- report_path_parts[3] + report_path <- report_path_parts[4:length(report_path_parts)] %>% + paste(collapse = "/") + # Use direct link to the console instead of to the object so that we don't + # have to bother with signed URLs + report_url <- paste0( + "https://s3.console.aws.amazon.com/s3/object/", + "{report_bucket}/{report_path}?region=us-east-1&tab=overview" + ) %>% + glue::glue() + # Publish to SNS pipeline_sns$publish( Subject = paste("Model Run Complete:", run_id), Message = paste0( "Model run: ", run_id, " complete\n", "Finished in: ", pipeline_sns_total_time, "\n\n", + "Report link: ", report_url, "\n\n", pipeline_sns_results ), TopicArn = Sys.getenv("AWS_SNS_ARN_MODEL_STATUS") diff --git a/reports/performance/performance.qmd b/reports/performance/performance.qmd new file mode 100644 index 00000000..ba60ae90 --- /dev/null +++ b/reports/performance/performance.qmd @@ -0,0 +1,19 @@ +--- +title: "Model performance: `r params$run_id`" +execute: + echo: false + warning: false +format: + html: + embed-resources: true + toc: true + toc_float: true + fig-align: center + fontsize: 12pt +editor: source +params: + run_id: '2023-03-14-clever-damani' + year: '2023' +--- + +This document is a stub. 
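For reference, the report added in this patch can also be rendered on its own, using the same `quarto_render()` call that the finalize stage runs. The sketch below is illustrative only: it assumes the working directory is the repository root and that the reporting dependencies (plus the `quarto` package) are installed; the `run_id` and `year` values are simply the stub's default parameters from `performance.qmd`, not a real pipeline run.

```r
# Minimal sketch: render the performance report outside the pipeline.
# Assumes the working directory is the repository root and that the
# report's R dependencies are available.
library(quarto)

quarto_render(
  input = "reports/performance/performance.qmd",
  execute_params = list(
    run_id = "2023-03-14-clever-damani", # stub default; substitute a real run ID
    year = "2023"
  )
)

# Inside the document, chunks and inline R read these values as
# params$run_id and params$year (as in the title of performance.qmd).
```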
From 331b241c9681eb0e764dfd19619c3da2778e521a Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Tue, 21 Nov 2023 20:00:20 +0000 Subject: [PATCH 02/27] Include .html files in model_get_s3_artifacts_for_run --- R/helpers.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/helpers.R b/R/helpers.R index 6bfd069b..f32c833e 100644 --- a/R/helpers.R +++ b/R/helpers.R @@ -36,7 +36,9 @@ model_get_s3_artifacts_for_run <- function(run_id, year) { bucket <- strsplit(s3_objs[1], "/")[[1]][3] # First get anything partitioned only by year - s3_objs_limited <- grep(".parquet$|.zip$|.rds$", s3_objs, value = TRUE) %>% + s3_objs_limited <- grep( + ".parquet$|.zip$|.rds|.html$", s3_objs, value = TRUE + ) %>% unname() # Next get the prefix of anything partitioned by year and run_id From 0144f28aac7cd24980194498662953ec2df8580e Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Wed, 22 Nov 2023 20:12:04 +0000 Subject: [PATCH 03/27] Refactor repo to support reports/renv.lock lockfile --- DESCRIPTION | 1 + Dockerfile | 2 + renv/activate.R | 68 +- renv/profiles/reporting/renv/.gitignore | 7 + reports/performance/performance.qmd | 8 + reports/renv.lock | 1206 +++++++++++++++++++++++ 6 files changed, 1268 insertions(+), 24 deletions(-) create mode 100644 DESCRIPTION create mode 100644 renv/profiles/reporting/renv/.gitignore create mode 100644 reports/renv.lock diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 00000000..ff0211c9 --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1 @@ +Config/renv/profiles/reporting/dependencies: leaflet, plotly, sf diff --git a/Dockerfile b/Dockerfile index 86180185..22121379 100644 --- a/Dockerfile +++ b/Dockerfile @@ -26,11 +26,13 @@ RUN pipenv install --system --deploy # Copy R bootstrap files into the image COPY renv.lock . +COPY reports/renv.lock reports/renv.lock COPY .Rprofile . 
COPY renv/ renv/ # Install R dependencies RUN Rscript -e 'renv::restore()' +RUN Rscript -e 'renv::restore(lockfile = "reports/renv.lock")' # Copy the directory into the container ADD ./ model-res-avm/ diff --git a/renv/activate.R b/renv/activate.R index cc742fc9..bf5dff7c 100644 --- a/renv/activate.R +++ b/renv/activate.R @@ -8,6 +8,21 @@ local({ # the project directory project <- getwd() + # use start-up diagnostics if enabled + diagnostics <- Sys.getenv("RENV_STARTUP_DIAGNOSTICS", unset = "FALSE") + if (diagnostics) { + start <- Sys.time() + profile <- tempfile("renv-startup-", fileext = ".Rprof") + utils::Rprof(profile) + on.exit({ + utils::Rprof(NULL) + elapsed <- signif(difftime(Sys.time(), start, units = "auto"), digits = 2L) + writeLines(sprintf("- renv took %s to run the autoloader.", format(elapsed))) + writeLines(sprintf("- Profile: %s", profile)) + print(utils::summaryRprof(profile)) + }, add = TRUE) + } + # figure out whether the autoloader is enabled enabled <- local({ @@ -504,7 +519,7 @@ local({ # open the bundle for reading # We use gzcon for everything because (from ?gzcon) - # > Reading from a connection which does not supply a ‘gzip’ magic + # > Reading from a connection which does not supply a 'gzip' magic # > header is equivalent to reading from the original connection conn <- gzcon(file(bundle, open = "rb", raw = TRUE)) on.exit(close(conn)) @@ -767,10 +782,12 @@ local({ renv_bootstrap_validate_version <- function(version, description = NULL) { # resolve description file - description <- description %||% { - path <- getNamespaceInfo("renv", "path") - packageDescription("renv", lib.loc = dirname(path)) - } + # + # avoid passing lib.loc to `packageDescription()` below, since R will + # use the loaded version of the package by default anyhow. 
note that + # this function should only be called after 'renv' is loaded + # https://github.com/rstudio/renv/issues/1625 + description <- description %||% packageDescription("renv") # check whether requested version 'version' matches loaded version of renv sha <- attr(version, "sha", exact = TRUE) @@ -841,7 +858,7 @@ local({ hooks <- getHook("renv::autoload") for (hook in hooks) if (is.function(hook)) - tryCatch(hook(), error = warning) + tryCatch(hook(), error = warnify) # load the project renv::load(project) @@ -982,10 +999,15 @@ local({ } - renv_bootstrap_version_friendly <- function(version, sha = NULL) { + renv_bootstrap_version_friendly <- function(version, shafmt = NULL, sha = NULL) { sha <- sha %||% attr(version, "sha", exact = TRUE) - parts <- c(version, sprintf("[sha: %s]", substring(sha, 1L, 7L))) - paste(parts, collapse = " ") + parts <- c(version, sprintf(shafmt %||% " [sha: %s]", substring(sha, 1L, 7L))) + paste(parts, collapse = "") + } + + renv_bootstrap_exec <- function(project, libpath, version) { + if (!renv_bootstrap_load(project, libpath, version)) + renv_bootstrap_run(version, libpath) } renv_bootstrap_run <- function(version, libpath) { @@ -1017,6 +1039,14 @@ local({ commandArgs()[[1]] == "RStudio" } + # Used to work around buglet in RStudio if hook uses readline + renv_bootstrap_flush_console <- function() { + tryCatch({ + tools <- as.environment("tools:rstudio") + tools$.rs.api.sendToConsole("", echo = FALSE, focus = FALSE) + }, error = function(cnd) {}) + } + renv_json_read <- function(file = NULL, text = NULL) { jlerr <- NULL @@ -1155,25 +1185,15 @@ local({ # construct full libpath libpath <- file.path(root, prefix) - # attempt to load - if (renv_bootstrap_load(project, libpath, version)) - return(TRUE) - if (renv_bootstrap_in_rstudio()) { + # RStudio only updates console once .Rprofile is finished, so + # instead run code on sessionInit setHook("rstudio.sessionInit", function(...) { - renv_bootstrap_run(version, libpath) - - # Work around buglet in RStudio if hook uses readline - tryCatch( - { - tools <- as.environment("tools:rstudio") - tools$.rs.api.sendToConsole("", echo = FALSE, focus = FALSE) - }, - error = function(cnd) {} - ) + renv_bootstrap_exec(project, libpath, version) + renv_bootstrap_flush_console() }) } else { - renv_bootstrap_run(version, libpath) + renv_bootstrap_exec(project, libpath, version) } invisible() diff --git a/renv/profiles/reporting/renv/.gitignore b/renv/profiles/reporting/renv/.gitignore new file mode 100644 index 00000000..0ec0cbba --- /dev/null +++ b/renv/profiles/reporting/renv/.gitignore @@ -0,0 +1,7 @@ +library/ +local/ +cellar/ +lock/ +python/ +sandbox/ +staging/ diff --git a/reports/performance/performance.qmd b/reports/performance/performance.qmd index ba60ae90..d2cad7a3 100644 --- a/reports/performance/performance.qmd +++ b/reports/performance/performance.qmd @@ -17,3 +17,11 @@ params: --- This document is a stub. 
+ +```{r setup} +library(plotly) +library(leaflet) +library(sf) + +print("All packages loaded!") +``` diff --git a/reports/renv.lock b/reports/renv.lock new file mode 100644 index 00000000..39b73dc9 --- /dev/null +++ b/reports/renv.lock @@ -0,0 +1,1206 @@ +{ + "R": { + "Version": "4.2.2", + "Repositories": [ + { + "Name": "CRAN", + "URL": "https://cran.rstudio.com" + } + ] + }, + "Packages": { + "DBI": { + "Package": "DBI", + "Version": "1.1.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "methods" + ], + "Hash": "b2866e62bab9378c3cc9476a1954226b" + }, + "KernSmooth": { + "Package": "KernSmooth", + "Version": "2.23-22", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "stats" + ], + "Hash": "2fecebc3047322fa5930f74fae5de70f" + }, + "MASS": { + "Package": "MASS", + "Version": "7.3-60", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "grDevices", + "graphics", + "methods", + "stats", + "utils" + ], + "Hash": "a56a6365b3fa73293ea8d084be0d9bb0" + }, + "Matrix": { + "Package": "Matrix", + "Version": "1.6-0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "grDevices", + "graphics", + "grid", + "lattice", + "methods", + "stats", + "utils" + ], + "Hash": "31262fd18481fab05c5e7258dac163ca" + }, + "R6": { + "Package": "R6", + "Version": "2.5.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "470851b6d5d0ac559e9d01bb352b4021" + }, + "RColorBrewer": { + "Package": "RColorBrewer", + "Version": "1.1-3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "45f0398006e83a5b10b72a90663d8d8c" + }, + "Rcpp": { + "Package": "Rcpp", + "Version": "1.0.11", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "methods", + "utils" + ], + "Hash": "ae6cbbe1492f4de79c45fce06f967ce8" + }, + "askpass": { + "Package": "askpass", + "Version": "1.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "sys" + ], + "Hash": "e8a22846fff485f0be3770c2da758713" + }, + "base64enc": { + "Package": "base64enc", + "Version": "0.1-3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "543776ae6848fde2f48ff3816d0628bc" + }, + "bslib": { + "Package": "bslib", + "Version": "0.5.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "base64enc", + "cachem", + "grDevices", + "htmltools", + "jquerylib", + "jsonlite", + "memoise", + "mime", + "rlang", + "sass" + ], + "Hash": "283015ddfbb9d7bf15ea9f0b5698f0d9" + }, + "cachem": { + "Package": "cachem", + "Version": "1.0.8", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "fastmap", + "rlang" + ], + "Hash": "c35768291560ce302c0a6589f92e837d" + }, + "class": { + "Package": "class", + "Version": "7.3-22", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "MASS", + "R", + "stats", + "utils" + ], + "Hash": "f91f6b29f38b8c280f2b9477787d4bb2" + }, + "classInt": { + "Package": "classInt", + "Version": "0.4-10", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "KernSmooth", + "R", + "class", + "e1071", + "grDevices", + "graphics", + "stats" + ], + "Hash": "f5a40793b1ae463a7ffb3902a95bf864" + }, + "cli": { + "Package": "cli", + "Version": "3.6.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "utils" + ], + "Hash": "89e6d8219950eac806ae0c489052048a" + }, + "colorspace": { + "Package": 
"colorspace", + "Version": "2.1-0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "grDevices", + "graphics", + "methods", + "stats" + ], + "Hash": "f20c47fd52fae58b4e377c37bb8c335b" + }, + "cpp11": { + "Package": "cpp11", + "Version": "0.4.4", + "Source": "Repository", + "Repository": "RSPM", + "Hash": "3f7d8664d7324406cd10cd650ad85e5f" + }, + "crosstalk": { + "Package": "crosstalk", + "Version": "1.2.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R6", + "htmltools", + "jsonlite", + "lazyeval" + ], + "Hash": "6aa54f69598c32177e920eb3402e8293" + }, + "curl": { + "Package": "curl", + "Version": "5.0.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "2118af9cb164c8d2dddc7b89eaf732d9" + }, + "data.table": { + "Package": "data.table", + "Version": "1.14.8", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "methods" + ], + "Hash": "b4c06e554f33344e044ccd7fdca750a9" + }, + "digest": { + "Package": "digest", + "Version": "0.6.33", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "utils" + ], + "Hash": "b18a9cf3c003977b0cc49d5e76ebe48d" + }, + "dplyr": { + "Package": "dplyr", + "Version": "1.1.2", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "R6", + "cli", + "generics", + "glue", + "lifecycle", + "magrittr", + "methods", + "pillar", + "rlang", + "tibble", + "tidyselect", + "utils", + "vctrs" + ], + "Hash": "dea6970ff715ca541c387de363ff405e" + }, + "e1071": { + "Package": "e1071", + "Version": "1.7-13", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "class", + "grDevices", + "graphics", + "methods", + "proxy", + "stats", + "utils" + ], + "Hash": "1046cb48d06cb40c2900d8878f03a0fe" + }, + "ellipsis": { + "Package": "ellipsis", + "Version": "0.3.2", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "rlang" + ], + "Hash": "bb0eec2fe32e88d9e2836c2f73ea2077" + }, + "evaluate": { + "Package": "evaluate", + "Version": "0.21", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "methods" + ], + "Hash": "d59f3b464e8da1aef82dc04b588b8dfb" + }, + "fansi": { + "Package": "fansi", + "Version": "1.0.4", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "grDevices", + "utils" + ], + "Hash": "1d9e7ad3c8312a192dea7d3db0274fde" + }, + "farver": { + "Package": "farver", + "Version": "2.1.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "8106d78941f34855c440ddb946b8f7a5" + }, + "fastmap": { + "Package": "fastmap", + "Version": "1.1.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "f7736a18de97dea803bde0a2daaafb27" + }, + "fontawesome": { + "Package": "fontawesome", + "Version": "0.5.2", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "htmltools", + "rlang" + ], + "Hash": "c2efdd5f0bcd1ea861c2d4e2a883a67d" + }, + "fs": { + "Package": "fs", + "Version": "1.6.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "methods" + ], + "Hash": "47b5f30c720c23999b913a1a635cf0bb" + }, + "generics": { + "Package": "generics", + "Version": "0.1.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "methods" + ], + "Hash": "15e9634c0fcd294799e9b2e929ed1b86" + }, + "ggplot2": { + "Package": "ggplot2", + "Version": "3.4.2", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "MASS", + "R", + 
"cli", + "glue", + "grDevices", + "grid", + "gtable", + "isoband", + "lifecycle", + "mgcv", + "rlang", + "scales", + "stats", + "tibble", + "vctrs", + "withr" + ], + "Hash": "3a147ee02e85a8941aad9909f1b43b7b" + }, + "glue": { + "Package": "glue", + "Version": "1.6.2", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "methods" + ], + "Hash": "4f2596dfb05dac67b9dc558e5c6fba2e" + }, + "gridExtra": { + "Package": "gridExtra", + "Version": "2.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "grDevices", + "graphics", + "grid", + "gtable", + "utils" + ], + "Hash": "7d7f283939f563670a697165b2cf5560" + }, + "gtable": { + "Package": "gtable", + "Version": "0.3.3", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "cli", + "glue", + "grid", + "lifecycle", + "rlang" + ], + "Hash": "b44addadb528a0d227794121c00572a0" + }, + "highr": { + "Package": "highr", + "Version": "0.10", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "xfun" + ], + "Hash": "06230136b2d2b9ba5805e1963fa6e890" + }, + "htmltools": { + "Package": "htmltools", + "Version": "0.5.6", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "base64enc", + "digest", + "ellipsis", + "fastmap", + "grDevices", + "rlang", + "utils" + ], + "Hash": "a2326a66919a3311f7fbb1e3bf568283" + }, + "htmlwidgets": { + "Package": "htmlwidgets", + "Version": "1.6.2", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "grDevices", + "htmltools", + "jsonlite", + "knitr", + "rmarkdown", + "yaml" + ], + "Hash": "a865aa85bcb2697f47505bfd70422471" + }, + "httr": { + "Package": "httr", + "Version": "1.4.6", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "R6", + "curl", + "jsonlite", + "mime", + "openssl" + ], + "Hash": "7e5e3cbd2a7bc07880c94e22348fb661" + }, + "isoband": { + "Package": "isoband", + "Version": "0.2.7", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "grid", + "utils" + ], + "Hash": "0080607b4a1a7b28979aecef976d8bc2" + }, + "jquerylib": { + "Package": "jquerylib", + "Version": "0.1.4", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "htmltools" + ], + "Hash": "5aab57a3bd297eee1c1d862735972182" + }, + "jsonlite": { + "Package": "jsonlite", + "Version": "1.8.7", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "methods" + ], + "Hash": "266a20443ca13c65688b2116d5220f76" + }, + "knitr": { + "Package": "knitr", + "Version": "1.43", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "evaluate", + "highr", + "methods", + "tools", + "xfun", + "yaml" + ], + "Hash": "9775eb076713f627c07ce41d8199d8f6" + }, + "labeling": { + "Package": "labeling", + "Version": "0.4.2", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "graphics", + "stats" + ], + "Hash": "3d5108641f47470611a32d0bdf357a72" + }, + "later": { + "Package": "later", + "Version": "1.3.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "Rcpp", + "rlang" + ], + "Hash": "40401c9cf2bc2259dfe83311c9384710" + }, + "lattice": { + "Package": "lattice", + "Version": "0.21-8", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "grDevices", + "graphics", + "grid", + "stats", + "utils" + ], + "Hash": "0b8a6d63c8770f02a8b5635f3c431e6b" + }, + "lazyeval": { + "Package": "lazyeval", + "Version": "0.2.2", + "Source": "Repository", + "Repository": "CRAN", + 
"Requirements": [ + "R" + ], + "Hash": "d908914ae53b04d4c0c0fd72ecc35370" + }, + "leaflet": { + "Package": "leaflet", + "Version": "2.2.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "RColorBrewer", + "crosstalk", + "htmltools", + "htmlwidgets", + "jquerylib", + "leaflet.providers", + "magrittr", + "methods", + "png", + "raster", + "scales", + "sp", + "stats", + "viridis", + "xfun" + ], + "Hash": "6e09cb2c9dc2e5a1e71a413e60c3834e" + }, + "leaflet.providers": { + "Package": "leaflet.providers", + "Version": "2.0.0", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "htmltools" + ], + "Hash": "c0b81ad9d5d932772f7a457ac398cf36" + }, + "lifecycle": { + "Package": "lifecycle", + "Version": "1.0.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cli", + "glue", + "rlang" + ], + "Hash": "001cecbeac1cff9301bdc3775ee46a86" + }, + "magrittr": { + "Package": "magrittr", + "Version": "2.0.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "7ce2733a9826b3aeb1775d56fd305472" + }, + "memoise": { + "Package": "memoise", + "Version": "2.0.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "cachem", + "rlang" + ], + "Hash": "e2817ccf4a065c5d9d7f2cfbe7c1d78c" + }, + "mgcv": { + "Package": "mgcv", + "Version": "1.9-0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "Matrix", + "R", + "graphics", + "methods", + "nlme", + "splines", + "stats", + "utils" + ], + "Hash": "086028ca0460d0c368028d3bda58f31b" + }, + "mime": { + "Package": "mime", + "Version": "0.12", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "tools" + ], + "Hash": "18e9c28c1d3ca1560ce30658b22ce104" + }, + "munsell": { + "Package": "munsell", + "Version": "0.5.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "colorspace", + "methods" + ], + "Hash": "6dfe8bf774944bd5595785e3229d8771" + }, + "nlme": { + "Package": "nlme", + "Version": "3.1-162", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "graphics", + "lattice", + "stats", + "utils" + ], + "Hash": "0984ce8da8da9ead8643c5cbbb60f83e" + }, + "openssl": { + "Package": "openssl", + "Version": "2.0.6", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "askpass" + ], + "Hash": "0f7cd2962e3044bb940cca4f4b5cecbe" + }, + "pillar": { + "Package": "pillar", + "Version": "1.9.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "cli", + "fansi", + "glue", + "lifecycle", + "rlang", + "utf8", + "utils", + "vctrs" + ], + "Hash": "15da5a8412f317beeee6175fbc76f4bb" + }, + "pkgconfig": { + "Package": "pkgconfig", + "Version": "2.0.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "utils" + ], + "Hash": "01f28d4278f15c76cddbea05899c5d6f" + }, + "plotly": { + "Package": "plotly", + "Version": "4.10.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "RColorBrewer", + "base64enc", + "crosstalk", + "data.table", + "digest", + "dplyr", + "ggplot2", + "htmltools", + "htmlwidgets", + "httr", + "jsonlite", + "lazyeval", + "magrittr", + "promises", + "purrr", + "rlang", + "scales", + "tibble", + "tidyr", + "tools", + "vctrs", + "viridisLite" + ], + "Hash": "56914cc61df53f2d0283d5498680867e" + }, + "png": { + "Package": "png", + "Version": "0.1-8", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": 
"bd54ba8a0a5faded999a7aab6e46b374" + }, + "promises": { + "Package": "promises", + "Version": "1.2.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R6", + "Rcpp", + "fastmap", + "later", + "magrittr", + "rlang", + "stats" + ], + "Hash": "0d8a15c9d000970ada1ab21405387dee" + }, + "proxy": { + "Package": "proxy", + "Version": "0.4-27", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "stats", + "utils" + ], + "Hash": "e0ef355c12942cf7a6b91a6cfaea8b3e" + }, + "purrr": { + "Package": "purrr", + "Version": "1.0.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cli", + "lifecycle", + "magrittr", + "rlang", + "vctrs" + ], + "Hash": "d71c815267c640f17ddbf7f16144b4bb" + }, + "rappdirs": { + "Package": "rappdirs", + "Version": "0.3.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "5e3c5dc0b071b21fa128676560dbe94d" + }, + "raster": { + "Package": "raster", + "Version": "3.6-26", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "Rcpp", + "methods", + "sp", + "terra" + ], + "Hash": "7d6eda494f34a644420ac1bfd2a8023a" + }, + "renv": { + "Package": "renv", + "Version": "1.0.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "utils" + ], + "Hash": "c321cd99d56443dbffd1c9e673c0c1a2" + }, + "rlang": { + "Package": "rlang", + "Version": "1.1.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "utils" + ], + "Hash": "a85c767b55f0bf9b7ad16c6d7baee5bb" + }, + "rmarkdown": { + "Package": "rmarkdown", + "Version": "2.25", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "bslib", + "evaluate", + "fontawesome", + "htmltools", + "jquerylib", + "jsonlite", + "knitr", + "methods", + "stringr", + "tinytex", + "tools", + "utils", + "xfun", + "yaml" + ], + "Hash": "d65e35823c817f09f4de424fcdfa812a" + }, + "s2": { + "Package": "s2", + "Version": "1.1.4", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "Rcpp", + "wk" + ], + "Hash": "f1cbe03bb3346f8e817518ffa20f9f5a" + }, + "sass": { + "Package": "sass", + "Version": "0.4.7", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R6", + "fs", + "htmltools", + "rappdirs", + "rlang" + ], + "Hash": "6bd4d33b50ff927191ec9acbf52fd056" + }, + "scales": { + "Package": "scales", + "Version": "1.2.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "R6", + "RColorBrewer", + "farver", + "labeling", + "lifecycle", + "munsell", + "rlang", + "viridisLite" + ], + "Hash": "906cb23d2f1c5680b8ce439b44c6fa63" + }, + "sf": { + "Package": "sf", + "Version": "1.0-14", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "DBI", + "R", + "Rcpp", + "classInt", + "grDevices", + "graphics", + "grid", + "magrittr", + "methods", + "s2", + "stats", + "tools", + "units", + "utils" + ], + "Hash": "e2111252a76984ca50bf8d6314348681" + }, + "sp": { + "Package": "sp", + "Version": "2.1-1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "grDevices", + "graphics", + "grid", + "lattice", + "methods", + "stats", + "utils" + ], + "Hash": "e9090fe4ff468d366aa6a76a9b3ec078" + }, + "stringi": { + "Package": "stringi", + "Version": "1.7.12", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "stats", + "tools", + "utils" + ], + "Hash": "ca8bd84263c77310739d2cf64d84d7c9" + }, + "stringr": { + "Package": "stringr", 
+ "Version": "1.5.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cli", + "glue", + "lifecycle", + "magrittr", + "rlang", + "stringi", + "vctrs" + ], + "Hash": "671a4d384ae9d32fc47a14e98bfa3dc8" + }, + "sys": { + "Package": "sys", + "Version": "3.4.2", + "Source": "Repository", + "Repository": "RSPM", + "Hash": "3a1be13d68d47a8cd0bfd74739ca1555" + }, + "terra": { + "Package": "terra", + "Version": "1.7-55", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "Rcpp", + "methods" + ], + "Hash": "c011cc748506148c793428eb8ec101f9" + }, + "tibble": { + "Package": "tibble", + "Version": "3.2.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "fansi", + "lifecycle", + "magrittr", + "methods", + "pillar", + "pkgconfig", + "rlang", + "utils", + "vctrs" + ], + "Hash": "a84e2cc86d07289b3b6f5069df7a004c" + }, + "tidyr": { + "Package": "tidyr", + "Version": "1.3.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cli", + "cpp11", + "dplyr", + "glue", + "lifecycle", + "magrittr", + "purrr", + "rlang", + "stringr", + "tibble", + "tidyselect", + "utils", + "vctrs" + ], + "Hash": "e47debdc7ce599b070c8e78e8ac0cfcf" + }, + "tidyselect": { + "Package": "tidyselect", + "Version": "1.2.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cli", + "glue", + "lifecycle", + "rlang", + "vctrs", + "withr" + ], + "Hash": "79540e5fcd9e0435af547d885f184fd5" + }, + "tinytex": { + "Package": "tinytex", + "Version": "0.47", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "xfun" + ], + "Hash": "8d4ccb733843e513c1c1cdd66a759f0d" + }, + "units": { + "Package": "units", + "Version": "0.8-4", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "Rcpp" + ], + "Hash": "e0fbcea25008a7540c83c2c294135de0" + }, + "utf8": { + "Package": "utf8", + "Version": "1.2.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "1fe17157424bb09c48a8b3b550c753bc" + }, + "vctrs": { + "Package": "vctrs", + "Version": "0.6.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cli", + "glue", + "lifecycle", + "rlang" + ], + "Hash": "d0ef2856b83dc33ea6e255caf6229ee2" + }, + "viridis": { + "Package": "viridis", + "Version": "0.6.4", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "ggplot2", + "gridExtra", + "viridisLite" + ], + "Hash": "80cd127bc8c9d3d9f0904ead9a9102f1" + }, + "viridisLite": { + "Package": "viridisLite", + "Version": "0.4.2", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "c826c7c4241b6fc89ff55aaea3fa7491" + }, + "withr": { + "Package": "withr", + "Version": "2.5.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "grDevices", + "graphics", + "stats" + ], + "Hash": "c0e49a9760983e81e55cdd9be92e7182" + }, + "wk": { + "Package": "wk", + "Version": "0.9.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "f58cfa8d9c3a78a309d455a647dee853" + }, + "xfun": { + "Package": "xfun", + "Version": "0.39", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "stats", + "tools" + ], + "Hash": "8f56e9acb54fb525e66464d57ab58bcb" + }, + "yaml": { + "Package": "yaml", + "Version": "2.3.7", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "0d0056cc5383fbc240ccd0cb584bf436" + } + } +} From 
af287d2feeb43bc52bdc73951a74af568e1734d8 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 24 Nov 2023 17:19:13 +0000 Subject: [PATCH 04/27] Remove unnecessary changes to renv/activate.R --- renv/activate.R | 68 +++++++++++++++++-------------------------------- 1 file changed, 24 insertions(+), 44 deletions(-) diff --git a/renv/activate.R b/renv/activate.R index bf5dff7c..cc742fc9 100644 --- a/renv/activate.R +++ b/renv/activate.R @@ -8,21 +8,6 @@ local({ # the project directory project <- getwd() - # use start-up diagnostics if enabled - diagnostics <- Sys.getenv("RENV_STARTUP_DIAGNOSTICS", unset = "FALSE") - if (diagnostics) { - start <- Sys.time() - profile <- tempfile("renv-startup-", fileext = ".Rprof") - utils::Rprof(profile) - on.exit({ - utils::Rprof(NULL) - elapsed <- signif(difftime(Sys.time(), start, units = "auto"), digits = 2L) - writeLines(sprintf("- renv took %s to run the autoloader.", format(elapsed))) - writeLines(sprintf("- Profile: %s", profile)) - print(utils::summaryRprof(profile)) - }, add = TRUE) - } - # figure out whether the autoloader is enabled enabled <- local({ @@ -519,7 +504,7 @@ local({ # open the bundle for reading # We use gzcon for everything because (from ?gzcon) - # > Reading from a connection which does not supply a 'gzip' magic + # > Reading from a connection which does not supply a ‘gzip’ magic # > header is equivalent to reading from the original connection conn <- gzcon(file(bundle, open = "rb", raw = TRUE)) on.exit(close(conn)) @@ -782,12 +767,10 @@ local({ renv_bootstrap_validate_version <- function(version, description = NULL) { # resolve description file - # - # avoid passing lib.loc to `packageDescription()` below, since R will - # use the loaded version of the package by default anyhow. note that - # this function should only be called after 'renv' is loaded - # https://github.com/rstudio/renv/issues/1625 - description <- description %||% packageDescription("renv") + description <- description %||% { + path <- getNamespaceInfo("renv", "path") + packageDescription("renv", lib.loc = dirname(path)) + } # check whether requested version 'version' matches loaded version of renv sha <- attr(version, "sha", exact = TRUE) @@ -858,7 +841,7 @@ local({ hooks <- getHook("renv::autoload") for (hook in hooks) if (is.function(hook)) - tryCatch(hook(), error = warnify) + tryCatch(hook(), error = warning) # load the project renv::load(project) @@ -999,15 +982,10 @@ local({ } - renv_bootstrap_version_friendly <- function(version, shafmt = NULL, sha = NULL) { + renv_bootstrap_version_friendly <- function(version, sha = NULL) { sha <- sha %||% attr(version, "sha", exact = TRUE) - parts <- c(version, sprintf(shafmt %||% " [sha: %s]", substring(sha, 1L, 7L))) - paste(parts, collapse = "") - } - - renv_bootstrap_exec <- function(project, libpath, version) { - if (!renv_bootstrap_load(project, libpath, version)) - renv_bootstrap_run(version, libpath) + parts <- c(version, sprintf("[sha: %s]", substring(sha, 1L, 7L))) + paste(parts, collapse = " ") } renv_bootstrap_run <- function(version, libpath) { @@ -1039,14 +1017,6 @@ local({ commandArgs()[[1]] == "RStudio" } - # Used to work around buglet in RStudio if hook uses readline - renv_bootstrap_flush_console <- function() { - tryCatch({ - tools <- as.environment("tools:rstudio") - tools$.rs.api.sendToConsole("", echo = FALSE, focus = FALSE) - }, error = function(cnd) {}) - } - renv_json_read <- function(file = NULL, text = NULL) { jlerr <- NULL @@ -1185,15 +1155,25 @@ local({ # construct full libpath libpath <- 
file.path(root, prefix) + # attempt to load + if (renv_bootstrap_load(project, libpath, version)) + return(TRUE) + if (renv_bootstrap_in_rstudio()) { - # RStudio only updates console once .Rprofile is finished, so - # instead run code on sessionInit setHook("rstudio.sessionInit", function(...) { - renv_bootstrap_exec(project, libpath, version) - renv_bootstrap_flush_console() + renv_bootstrap_run(version, libpath) + + # Work around buglet in RStudio if hook uses readline + tryCatch( + { + tools <- as.environment("tools:rstudio") + tools$.rs.api.sendToConsole("", echo = FALSE, focus = FALSE) + }, + error = function(cnd) {} + ) }) } else { - renv_bootstrap_exec(project, libpath, version) + renv_bootstrap_run(version, libpath) } invisible() From b616399706742ab5c8a0d36e46145fa67124d9bc Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 24 Nov 2023 17:19:29 +0000 Subject: [PATCH 05/27] Fix missing column in performance report row of misc/file_dict.csv --- misc/file_dict.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/misc/file_dict.csv b/misc/file_dict.csv index eab4ba16..6a9c905e 100644 --- a/misc/file_dict.csv +++ b/misc/file_dict.csv @@ -19,7 +19,7 @@ output,performance_quantile_test,3,evaluate,ccao-model-results-us-east-1,output/ output,performance_quantile_assessment,3,evaluate,ccao-model-results-us-east-1,output/performance_quantile/model_performance_quantile_assessment.parquet,performance_quantile/year={year}/stage=assessment/{run_id}.parquet,performance_quantile,geography [by class] by quantile,"year, run_id, stage, geography_type, geography_id, by_class, class, quantile",No,Test + assessment,Performance metrics by quantile within class and geography,Assessment set uses the prior year sales to compare to the assessed value output,shap,4,interpret,ccao-model-results-us-east-1,output/shap/model_shap.parquet,shap/,shap,card,"year, run_id, township_code, meta_pin, meta_card_num",No,Yes,SHAP values for each feature for each card in the assessment data,NOTE: Each run adds new partitions to S3 which must be added via a Glue crawler output,feature_importance,4,interpret,ccao-model-results-us-east-1,output/feature_importance/model_feature_importance.parquet,feature_importance/year={year}/{run_id}.parquet,feature_importance,predictor,"year, run_id, model_predictor_all_name",No,Yes,"Feature importance values (gain, cover, and frequency) for the run", -output,report,5,finalize,ccao-model-results-us-east-1,reports/performance/performance.html,report/year={year}/{run_id}.html,,model run,,No,Yes,Rendered Quarto doc with model performance statistics +output,report,5,finalize,ccao-model-results-us-east-1,reports/performance/performance.html,report/year={year}/{run_id}.html,,model run,,No,Yes,Rendered Quarto doc with model performance statistics, output,metadata,5,finalize,ccao-model-results-us-east-1,output/metadata/model_metadata.parquet,metadata/year={year}/{run_id}.parquet,metadata,model run,"year, run_id",Yes,Yes,"Information about each run, including parameters, run ID, git info, etc.", intermediate,timing,,all,,output/intermediate/timing/,,,model stage,"year, msg",Yes,Yes,Parquet files for each stage containing the stage time elapsed,Converted into a one-row data frame in the finalize stage output,timing,,all,ccao-model-results-us-east-1,output/timing/model_timing.parquet,timing/year={year}/{run_id}.parquet,timing,model run,"year, run_id",Yes,Yes,Finalized time elapsed for each stage of the run,"Each row represents one run, while columns represent the stages" From 
25f2850ca873a26b88e4dbfeaa30d72fcaccc25b Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 24 Nov 2023 17:21:11 +0000 Subject: [PATCH 06/27] Update README with instructions on updating R dependencies --- README.Rmd | 22 +++++ README.md | 244 ++++++++++++++++++++++++++++++++--------------------- 2 files changed, 170 insertions(+), 96 deletions(-) diff --git a/README.Rmd b/README.Rmd index 07055be8..a6ac164b 100644 --- a/README.Rmd +++ b/README.Rmd @@ -609,12 +609,15 @@ The code in this repository is written primarily in [R](https://www.r-project.or If you're on Windows, you'll also need to install [Rtools](https://cran.r-project.org/bin/windows/Rtools/) in order to build the necessary packages. You may also want to (optionally) install [DVC](https://dvc.org/doc/install) to pull data and run pipelines. +We also publish a Docker image containing model code and all of the dependencies necessary to run it. If you're comfortable using Docker, you can skip the installation steps below and instead pull the image from `ghcr.io/ccao-data/model-res-avm:master` to run the model. + ## Installation 1. Clone this repository using git, or simply download it using the button at the top of the page. 2. Set your working directory to the local folder containing this repository's files, either using R's `setwd()` command or (preferably) using RStudio's [projects](https://support.posit.co/hc/en-us/articles/200526207-Using-Projects). 3. Install `renv`, R's package manager, by running `install.packages("renv")`. 4. Install all R package dependencies using `renv` by running `renv::restore()`. This step may take awhile. Linux users will likely need to install dependencies (via apt, yum, etc.) to build from source. + 1. The `finalize` step of the model pipeline requires some additional dependencies for generating a model performance report. These dependencies must be installed in addition to the core dependnecies installed in step 4. If you would like to run this step, make sure to install its additional dependencies by running `renv::restore(lockfile = "reports/renv.lock")`. For installation issues, particularly related to package installation and dependencies, see [Troubleshooting](#troubleshooting). @@ -743,6 +746,25 @@ Both [Tidymodels](https://tune.tidymodels.org/articles/extras/optimizations.html * The number of threads is set via the [num_threads](https://lightgbm.readthedocs.io/en/latest/Parameters.html#num_threads) parameter, which is passed to the model using the `set_args()` function from `parsnip`. By default, `num_threads` is equal to the full number of physical cores available. More (or faster) cores will decrease total training time. * This repository uses the CPU version of LightGBM included with the [LightGBM R package](https://lightgbm.readthedocs.io/en/latest/R/index.html). If you'd like to use the GPU version you'll need to [build it yourself](https://lightgbm.readthedocs.io/en/latest/R/index.html#installing-a-gpu-enabled-build) or wait for the [upcoming CUDA release](https://github.com/microsoft/LightGBM/issues/5153). +## Updating R dependencies + +There are two lockfiles that we use with renv to manage R dependencies: + +1. **`renv.lock`** is the canonical list of dependencies that are used by the **core model pipeline**. Any dependencies that are required to run the model itself should be defined in this lockfile. +2. **`reports/renv.lock`** is the canonical list of dependencies that are used to **generate a model performance report** in the `finalize` step of the pipeline. 
Any dependencies that are required to generate that report or others like it should be defined in this lockfile. + +Our goal in maintaining multiple lockfiles is to keep the list of dependencies that are required to run the model as short as possibile. This choice adds overhead to the process of updating R dependencies, but incurs the benefit of a more maintainable model over the long term. + +The process for **updating core model pipeline dependencies** is straightforward: Running `renv::install("")` and `renv::snapshot()` will ensure that the dependency gets added or updated in `renv.lock`, as long is it is imported somewhere in the model pipeline via a `library()` call. + +The process for updating **model report dependencies** is more complex, since it requires the use of a separate `reporting` profile: + +1. Run `Sys.setenv(RENV_PROFILE = "reporting")` to set the renv profile to `reporting` +2. Make sure that the dependency is defined in the `DESCRIPTION` file under the `Config/renv/profiles/reporting/dependencies` key +3. Run `renv::install("")` to add or update the dependency as necessary +4. Run `renv::snapshot(lockfile = "reports/renv.lock", type = "explicit")` to update the reporting lockfile with the dependencies defined in the `DESCRIPTION` file +5. Run `Sys.unsetenv("RENV_PROFILE")` to switch the renv profile back to the default + ## Troubleshooting The dependencies for this repository are numerous and not all of them may install correctly. Here are some common install issues (as seen in the R console) as well as their respective resolutions: diff --git a/README.md b/README.md index bec76f7e..a5bd23af 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,7 @@ Table of Contents - [Output](#output) - [Getting Data](#getting-data) - [System Requirements](#system-requirements) + - [Updating R dependencies](#updating-r-dependencies) - [Troubleshooting](#troubleshooting) - [License](#license) - [Contributing](#contributing) @@ -331,102 +332,102 @@ districts](https://gitlab.com/ccao-data-science---modeling/models/ccao_res_avm/- and many others. The features in the table below are the ones that made the cut. They’re the right combination of easy to understand and impute, powerfully predictive, and well-behaved. Most of them are in use in the -model as of 2023-10-05. - -| Feature Name | Category | Type | Possible Values | Notes | -|:------------------------------------------------------------------------|:---------------|:------------|:-----------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Percent Population Age, Under 19 Years Old | ACS5 | numeric | | Percent of the population 17 years or younger. ACS variable (B01001_003E + B01001_004E + B01001_005E + B01001_006E + B01001_007E + B01001_027E + B01001_028E + B01001_029E + B01001_030E + B01001_031E) / B01001_001E | -| Percent Population Age, Over 65 Years Old | ACS5 | numeric | | Percent of the population 65 years or older. ACS variable (B01001_020E + B01001_021E + B01001_022E + B01001_023E + B01001_024E + B01001_025E + B01001_044E + B01001_045E + B01001_046E + B01001_046E + B01001_048E + B01001_049E) / B01001_001E | -| Median Population Age | ACS5 | numeric | | Median age for whole population. 
ACS variable B01002_001E | -| Percent Population Mobility, In Same House 1 Year Ago | ACS5 | numeric | | Percent of people (older than 1 year) who have not moved in the past 12 months. ACS variable B07003_004E / B07003_001E | -| Percent Population Mobility, Moved From Other State in Past Year | ACS5 | numeric | | Percent of people (older than 1 year) who moved from another state in the past 12 months. ACS variable B07003_013E / B07003_001E | -| Percent Households Family, Married | ACS5 | numeric | | Percent of households that are family, married (married). ACS variable B11001_003E / B11001_001E | -| Percent Households Nonfamily, Living Alone | ACS5 | numeric | | Percent of households that are non-family, alone (single). ACS variable B11001_008E / B11001_001E | -| Percent Population Education, High School Degree | ACS5 | numeric | | Percent of people older than 25 who attained a high school degree. ACS variable (B15002_011E + B15002_028E) / B15002_001E | -| Percent Population Education, Bachelor Degree | ACS5 | numeric | | Percent of people older than 25 who attained a bachelor degree. ACS variable (B15002_015E + B15002_032E) / B15002_001E | -| Percent Population Education, Graduate Degree | ACS5 | numeric | | Percent of people older than 25 who attained a graduate degree. ACS variable (B15002_016E + B15002_017E + B15002_018E + B15002_033E + B15002_034E + B15002_035E) / B15002_001E | -| Percent Population Income, Below Poverty Level | ACS5 | numeric | | Percent of people below poverty level. ACS variable B17001_002E / B17001_001E | -| Median Income, Household in Past Year | ACS5 | numeric | | Median income per household in the past 12 months. ACS variable B19013_001E | -| Median Income, Per Capita in Past Year | ACS5 | numeric | | Median income per capita in the past 12 months. ACS variable B19301_001E | -| Percent Population Income, Received SNAP in Past Year | ACS5 | numeric | | Percent of households that received SNAP in the past 12 months. ACS variable B22003_002E / B22003_001E | -| Percent Population Employment, Unemployed | ACS5 | numeric | | Percent of people 16 years and older unemployed. ACS variable B23025_005E / B23025_003E | -| Median Occupied Household, Total, Year Built | ACS5 | numeric | | Median year built for all occupied housing units. ACS variable B25037_001E | -| Median Occupied Household, Renter, Gross Rent | ACS5 | numeric | | Median gross rent for only renter-occupied units. ACS variable B25064_001E | -| Percent Occupied Households, Owner | ACS5 | numeric | | Percent of households that are owner-occupied. ACS variable B25003_002E / B25003_001E | -| Percent Occupied Households, Total, One or More Selected Conditions | ACS5 | numeric | | Selected conditions, including: incomplete plumbing or kitchens, overcrowding, 30% or more of the household income spent on rent or monthly owner costs ACS variable (B25123_003E + B25123_004E + B25123_005E + B25123_006E + B25123_009E + B25123_010E + B25123_011E + B25123_012E) / B25123_001E | -| Percent Population Mobility, Moved From Within Same County in Past Year | ACS5 | numeric | | Percent of people (older than 1 year) who moved in county in the past 12 months. ACS variable B07003_007E / B07003_001E | -| Year Built | Characteristic | numeric | | | -| Central Air Conditioning | Characteristic | categorical | Central A/C, No Central A/C | | -| Apartments | Characteristic | categorical | Two, Three, Four, Five, Six, None | Number of apartments for class 211 and 212 properties. 
CAUTION: Note the numerically encoded values DO NOT correspond to the number of apartments i.e. code 1 means 2 apartments, code 6 means 0 apartments | -| Attic Finish | Characteristic | categorical | Living Area, Partial, None | | -| Attic Type | Characteristic | categorical | Full, Partial, None | | -| Bedrooms | Characteristic | numeric | | Number of bedrooms in the building | -| Building Square Feet | Characteristic | numeric | | Square footage of the building, as measured from the exterior | -| Basement Type | Characteristic | categorical | Full, Slab, Partial, Crawl | | -| Basement Finish | Characteristic | categorical | Formal Rec Room, Apartment, Unfinished | | -| Exterior Wall Material | Characteristic | categorical | Frame, Masonry, Frame + Masonry, Stucco | | -| Full Baths | Characteristic | numeric | | Number of full bathrooms, defined as having a bath or shower. If this value is missing, the default value is set to 1 | -| Fireplaces | Characteristic | numeric | | Number of fireplaces, counted as the number of flues one can see from the outside of the building | -| Garage 1 Area Included | Characteristic | categorical | Yes, No | | -| Garage 1 Attached | Characteristic | categorical | Yes, No | | -| Garage 1 Ext. Wall Material | Characteristic | categorical | Frame, Masonry, Frame + Masonry, Stucco | | -| Garage 1 Size | Characteristic | categorical | 1 cars, 1.5 cars, 2 cars, 2.5 cars, 3 cars, 3.5 cars, 0 cars, 4 cars | | -| Half Baths | Characteristic | numeric | | Number of half baths, defined as bathrooms without a shower or bathtub | -| Land Square Feet | Characteristic | numeric | | Square footage of the land (not just the building) of the property. A single PIN can have multiple “land lines,” meaning it can be associated with more than one 200-class land lot | -| Central Heating | Characteristic | categorical | Warm Air Furnace, Hot Water Steam, Electric Heater, None | | -| Number of Commercial Units | Characteristic | numeric | | Number of commercial units. The vast majority are for properties with class 212 | -| Porch | Characteristic | categorical | None, Frame Enclosed, Masonry Enclosed | | -| Roof Material | Characteristic | categorical | Shingle + Asphalt, Tar + Gravel, Slate, Shake, Tile, Other | | -| Rooms | Characteristic | numeric | | Number of total rooms in the building (excluding baths). Not to be confused with bedrooms | -| Cathedral Ceiling | Characteristic | categorical | Yes, No | Deprecated. Field has not been updated recently enough to be useful for modeling | -| Design Plan | Characteristic | categorical | Architect, Stock Plan | | -| Type of Residence | Characteristic | categorical | 1 Story, 2 Story, 3 Story +, Split Level, 1.5 Story, Missing | | -| Recent Renovation | Characteristic | logical | | Indicates whether or not a property was renovated within the last 3 years. Renovation is indicated by the char_renovation characteristic flipping from “NO” to “YES” | -| Longitude | Location | numeric | | PIN location derived from the centroid of the largest polygon associated with the PIN | -| Latitude | Location | numeric | | PIN location derived from the centroid of the largest polygon associated with the PIN | -| Municipality Name | Location | character | | Municipality name for a given PIN. Taken from Cook County GIS shapefiles | -| FEMA Special Flood Hazard Area | Location | logical | | Indicator for a PIN within a FEMA Special Flood Hazard Area. 
Taken from FEMA site for 2021 only | -| First Street Factor | Location | numeric | | First Street flood factor (risk score) for a given PIN, scores 1 - 10. Provided to the CCAO by firststreet.org | -| First Street Risk Direction | Location | numeric | | First Street risk direction for a given PIN. Positive scores indicate increasing future flood risk, negative scores the opposite. Provided to the CCAO by firststreet.org | -| School Elementary District GEOID | Location | character | | Elementary school district ID for a given PIN. For CPS, elementary school attendance boundaries are used. Taken from Cook County GIS shapefiles | -| School Secondary District GEOID | Location | character | | Secondary school district ID for a given PIN. For CPS, secondary school attendance boundaries are used. Taken from Cook County GIS shapefiles | -| CMAP Walkability Score (No Transit) | Location | numeric | | CMAP walkability score for a given PIN, excluding transit walkability. Taken from CMAP’s ON TO 2050 walkability layer | -| CMAP Walkability Total Score | Location | numeric | | CMAP walkability score for a given PIN, including transit walkability. Taken from CMAP’s ON TO 2050 walkability layer | -| Airport Noise DNL | Location | numeric | | Airport noise calculated via kriging noise monitor data from CDA. See for more information | -| Township Code | Meta | character | | Numeric code identifying the Cook County township of a given PIN | -| Neighborhood Code | Meta | character | | Assessor neighborhood. First 2 digits are township code, last 3 digits are neighborhood code | -| Tieback Proration Rate | Meta | numeric | | Proration rate for a given PIN. Some buildings sit across multiple PINs. This number is intended to capture the split in building value | -| Property Group | Meta | categorical | Non-Livable Space, Single-Family, Multi-Family, Condominium, Bed & Breakfast | | -| Property Tax Bill Aggregate Rate | Other | numeric | | Tax bill rate for the taxing district containing a given PIN. Idea is to capture any downward pressure on price from higher tax burdens | -| School District (Elementary) GreatSchools Rating | Other | numeric | | Average Great Schools rating of elementary schools within the district of a given PIN. For CPS, which is a unified school district, the average of schools within attendance boundary is used | -| School District (Secondary) GreatSchools Rating | Other | numeric | | Average Great Schools rating of secondary schools within the district of a given PIN. For CPS, which is a unified school district, the average of schools within attendance boundary is used | -| Number of PINs in Half Mile | Proximity | numeric | | Number of PINs within half mile of a given PIN. Condo buildings are counted as a single PIN | -| Number of Bus Stops in Half Mile | Proximity | numeric | | Number of bus stops (CTA or PACE) within half mile of a given PIN. Taken from GTFS feeds retrieved from transitfeeds.com | -| Number of Foreclosures Per 1000 PINs (Past 5 Years) | Proximity | numeric | | Number of PIN-level foreclosure in the past 5 years, per 1000 PINs, within half mile of a given PIN. Taken from Illinois Public Records | -| Number of Schools in Half Mile | Proximity | numeric | | Number of schools within half mile of a given PIN. 
This includes preschools, small private schools, universities, etc | -| Number of Schools with Rating in Half Mile | Proximity | numeric | | Number of schools with Great Schools ratings within half mile of a given PIN | -| Average School Rating in Half Mile | Proximity | numeric | | Average Great Schools rating for all schools (with a rating) within half mile of a given PIN. Public schools must be within the same district as the PIN to be considered in the average | -| Nearest Bike Trail Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest bike trail (linestring). Taken from Cook County GIS shapefiles | -| Nearest Cemetery Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest cemetery (polygon). Taken from Cook County GIS shapefiles | -| Nearest CTA Route Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest CTA tracks. Taken from CTA GTFS feeds retrieved via transitfeeds.com | -| Nearest CTA Stop Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest CTA stop. Taken from CTA GTFS feeds retrieved via transitfeeds.com | -| Nearest Hospital Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest hospital (polygon). Taken from Cook County GIS shapefiles | -| Lake Michigan Distance (Feet) | Proximity | numeric | | Distance in feet to the Lake Michigan coastline. Taken from TIGER/Line coastlines file and filtered to Cook County only | -| Nearest Major Road Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest major road/highway. Pulled via OpenStreetMap, key=highway, value=motorway,primary,trunk | -| Nearest Metra Route Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest Metra tracks. Taken from Metra GTFS feeds retrieved via transitfeeds.com | -| Nearest Metra Stop Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest Metra stop. Taken from Metra GTFS feeds retrieved via transitfeeds.com | -| Nearest Park Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest park. Pull via OpenStreetMap, key=leisure, value=park | -| Nearest Railroad Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest rail (not including CTA). Taken from Cook County GIS shapefiles | -| Nearest Water Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest water. As identified by Cook County hydrology files | -| Nearest Golf Course Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest golf course (polygon). Taken from Cook County GIS shapefiles and OpenStreetMap | -| Sale Year | Time | numeric | | Sale year calculated as the number of years since 0 B.C.E | -| Sale Day | Time | numeric | | Sale day calculated as the number of days since January 1st, 1997 | -| Sale Quarter of Year | Time | character | | Character encoding of quarter of year (Q1 - Q4) | -| Sale Month of Year | Time | character | | Character encoding of month of year (Jan - Dec) | -| Sale Day of Year | Time | numeric | | Numeric encoding of day of year (1 - 365) | -| Sale Day of Month | Time | numeric | | Numeric encoding of day of month (1 - 31) | -| Sale Day of Week | Time | numeric | | Numeric encoding of day of week (1 - 7) | -| Sale After COVID-19 | Time | logical | | Indicator for whether sale occurred after COVID-19 was widely publicized (around March 15, 2020) | +model as of 2023-11-24. 
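Most of the ACS5 entries in the table below are simple shares: a numerator built from one or more raw ACS estimate variables divided by a denominator variable. The short R sketch below is not part of the patch; it only illustrates that pattern for the "Percent Population Education, High School Degree" feature, and the data frame, its values, and the output column name are hypothetical.

```r
library(dplyr)

# Hypothetical tract-level ACS estimates, one column per ACS variable code
# (values are made up for illustration)
acs <- tibble::tibble(
  geoid       = c("17031010100", "17031010201"),
  B15002_001E = c(2800, 3400), # population 25 years and older
  B15002_011E = c(700, 900),   # male, high school graduate
  B15002_028E = c(750, 950)    # female, high school graduate
)

# Share of people older than 25 who attained a high school degree,
# following the formula given in the table
acs <- acs %>%
  mutate(
    percent_education_high_school = (B15002_011E + B15002_028E) / B15002_001E
  )
```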
+ +| Feature Name | Category | Type | Possible Values | Notes | +|:------------------------------------------------------------------------|:---------------|:------------|:-----------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Percent Population Age, Under 19 Years Old | ACS5 | numeric | | Percent of the people 17 years or younger. ACS variable (`B01001_003E` + `B01001_004E` + `B01001_005E` + `B01001_006E` + `B01001_007E` + `B01001_027E` + `B01001_028E` + `B01001_029E` + `B01001_030E` + `B01001_031E`) / `B01001_001E` | +| Percent Population Age, Over 65 Years Old | ACS5 | numeric | | Percent of the people 65 years or older. ACS variable (`B01001_020E` + `B01001_021E` + `B01001_022E` + `B01001_023E` + `B01001_024E` + `B01001_025E` + `B01001_044E` + `B01001_045E` + `B01001_046E` + `B01001_046E` + `B01001_048E` + `B01001_049E`) / `B01001_001E` | +| Median Population Age | ACS5 | numeric | | Median age for whole population. ACS variable `B01002_001E` | +| Percent Population Mobility, In Same House 1 Year Ago | ACS5 | numeric | | Percent of people (older than 1 year) who have not moved in the past 12 months. ACS variable `B07003_004E` / `B07003_001E` | +| Percent Population Mobility, Moved From Other State in Past Year | ACS5 | numeric | | Percent of people (older than 1 year) who moved from another state in the past 12 months. ACS variable `B07003_013E` / `B07003_001E` | +| Percent Households Family, Married | ACS5 | numeric | | Percent of households that are family, married. ACS variable `B11001_003E` / `B11001_001E` | +| Percent Households Nonfamily, Living Alone | ACS5 | numeric | | Percent of households that are non-family, alone (single). ACS variable `B11001_008E` / `B11001_001E` | +| Percent Population Education, High School Degree | ACS5 | numeric | | Percent of people older than 25 who attained a high school degree. ACS variable (`B15002_011E` + `B15002_028E`) / `B15002_001E` | +| Percent Population Education, Bachelor Degree | ACS5 | numeric | | Percent of people older than 25 who attained a bachelor’s degree. ACS variable (`B15002_015E` + `B15002_032E`) / `B15002_001E` | +| Percent Population Education, Graduate Degree | ACS5 | numeric | | Percent of people older than 25 who attained a graduate degree. ACS variable (`B15002_016E` + `B15002_017E` + `B15002_018E` + `B15002_033E` + `B15002_034E` + `B15002_035E`) / `B15002_001E` | +| Percent Population Income, Below Poverty Level | ACS5 | numeric | | Percent of people above the poverty level in the last 12 months. ACS variable `B17001_003E` / `B17001_001E` | +| Median Income, Household in Past Year | ACS5 | numeric | | Median income per household in the past 12 months. ACS variable `B19013_001E` | +| Median Income, Per Capita in Past Year | ACS5 | numeric | | Median income per capita in the past 12 months. ACS variable `B19301_001E` | +| Percent Population Income, Received SNAP in Past Year | ACS5 | numeric | | Percent of households that received SNAP in the past 12 months. 
ACS variable `B22003_002E` / `B22003_001E` | +| Percent Population Employment, Unemployed | ACS5 | numeric | | Percent of people 16 years and older unemployed. ACS variable `B23025_005E` / `B23025_003E` | +| Median Occupied Household, Total, Year Built | ACS5 | numeric | | Median year built for all occupied households. ACS variable `B25037_001E` | +| Median Occupied Household, Renter, Gross Rent | ACS5 | numeric | | Median gross rent for only renter-occupied units. ACS variable `B25064_001E` | +| Percent Occupied Households, Owner | ACS5 | numeric | | Percent of households that are owner-occupied. ACS variable `B25003_002E` / `B25003_001E` | +| Percent Occupied Households, Total, One or More Selected Conditions | ACS5 | numeric | | Percent of occupied households with selected conditions. Conditions include: incomplete plumbing or kitchens, overcrowding, 30% or more of the household income spent on rent or monthly owner costs. ACS variable (`B25123_003E` + `B25123_004E` + `B25123_005E` + `B25123_006E` + `B25123_009E` + `B25123_010E` + `B25123_011E` + `B25123_012E`) / `B25123_001E` | +| Percent Population Mobility, Moved From Within Same County in Past Year | ACS5 | numeric | | Percent of people (older than 1 year) who moved in county in the past 12 months. ACS variable `B07003_007E` / `B07003_001E` | +| Year Built | Characteristic | numeric | | Year the property was constructed | +| Central Air Conditioning | Characteristic | categorical | Central A/C, No Central A/C | Indicator for central air. Possible values for this variable are: - `1` = Central A/C (`YES`) - `2` = No central A/C (`NO`) | +| Apartments | Characteristic | categorical | Two, Three, Four, Five, Six, None | Number of apartments for class 211 and 212 properties. CAUTION: Note the numerically encoded values DO NOT correspond to the number of apartments i.e. code 1 means 2 apartments, code 6 means 0 apartments. Possible values for this variable are: - `1` = Two apartments (`TWO`) - `2` = Three apartments (`THREE`) - `3` = Four apartments (`FOUR`) - `4` = Five apartments (`FIVE`) - `5` = Six apartments (`SIX`) - `6` = No apartments (`NONE`) | +| Attic Finish | Characteristic | categorical | Living Area, Partial, None | Attic finish. Possible values for this variable are: - `1` = Living area (`LAR`) - `2` = Partial (`PT`) - `3` = None (`UNF`) | +| Attic Type | Characteristic | categorical | Full, Partial, None | Attic type. Possible values for this variable are: - `1` = Full (`FL`) - `2` = Partial (`PT`) - `3` = None (`NO`) | +| Bedrooms | Characteristic | numeric | | Number of bedrooms in the building. | +| Building Square Feet | Characteristic | numeric | | Square footage of the building, as measured from the exterior. | +| Basement Type | Characteristic | categorical | Full, Slab, Partial, Crawl | Basement type. Possible values for this variable are: - `1` = Full (`FL`) - `2` = Slab (`SL`) - `3` = Partial (`PT`) - `4` = Crawl (`CR`) | +| Basement Finish | Characteristic | categorical | Formal Rec Room, Apartment, Unfinished | Basement finish. Possible values for this variable are: - `1` = Finished / formal rec room (`REC`) - `2` = Apartment (`APT`) - `3` = Unfinished (`UNF`) | +| Exterior Wall Material | Characteristic | categorical | Frame, Masonry, Frame + Masonry, Stucco | Exterior wall construction. Possible values for this variable are: - `1` = Frame (`FRAM`) - `2` = Masonry (`MASR`) - `3` = Frame + masonry (`FRMA`) - `4` = Stucco (`STUC`) | +| Full Baths | Characteristic | numeric | | Number of full bathrooms. 
Defined as bathrooms with a bath or shower. If this value is missing, the default value is set to 1 | +| Fireplaces | Characteristic | numeric | | Number of fireplaces. Counted as the number of flues one can see from the outside of the building | +| Garage 1 Area Included | Characteristic | categorical | Yes, No | Indicator for garage area inclusion. Is the garage physically included within the building area? If yes, the garage area is subtracted from the building square feet calculation by the field agent. Possible values for this variable are: - `1` = Yes (`YES`) - `2` = No (`NO`) | +| Garage 1 Attached | Characteristic | categorical | Yes, No | Indicator for garage attached. Is the garage physically attached to the main building? Possible values for this variable are: - `1` = Yes (`YES`) - `2` = No (`NO`) | +| Garage 1 Ext. Wall Material | Characteristic | categorical | Frame, Masonry, Frame + Masonry, Stucco | Garage exterior wall construction. Possible values for this variable are: - `1` = Frame (`FRAM`) - `2` = Masonry (`MASR`) - `3` = Frame + masonry (`FRMA`) - `4` = Stucco (`STUC`) | +| Garage 1 Size | Characteristic | categorical | 1 cars, 1.5 cars, 2 cars, 2.5 cars, 3 cars, 3.5 cars, 0 cars, 4 cars | Garage size (number of cars). Possible values for this variable are: - `1` = 1 car (`1CAR`) - `2` = 1.5 cars (`1.5CAR`) - `3` = 2 cars (`2CAR`) - `4` = 2.5 cars (`2.5CAR`) - `5` = 3 cars (`3CAR`) - `6` = 3.5 cars (`3.5CAR`) - `7` = 0 cars (`0CAR`) - `8` = 4 cars (`4CAR`) | +| Half Baths | Characteristic | numeric | | Number of half baths. Defined as bathrooms without a shower or bathtub | +| Land Square Feet | Characteristic | numeric | | Square footage of the land (not just the building) of the property. A single PIN can have multiple “land lines”, meaning it can be associated with more than one 200-class land lot | +| Central Heating | Characteristic | categorical | Warm Air Furnace, Hot Water Steam, Electric Heater, None | Interior heating type. Possible values for this variable are: - `1` = Central air / furnace (`FURN`) - `2` = Steam / radiator (`STM`) - `3` = Electric (`ELEC`) - `4` = None (`NONE`) | +| Number of Commercial Units | Characteristic | numeric | | Number of commercial units. The vast majority are for properties with class 212 | +| Porch | Characteristic | categorical | None, Frame Enclosed, Masonry Enclosed | Porch type. Possible values for this variable are: - `0` = None (`NONE`) - `1` = Frame enclosed (`FRAM`) - `2` = Masonry enclosed (`MSRY`) | +| Roof Material | Characteristic | categorical | Shingle + Asphalt, Tar + Gravel, Slate, Shake, Tile, Other | Roof material / construction. Possible values for this variable are: - `1` = Shingle + asphalt (`SHAS`) - `2` = Tar + gravel (`TRGR`) - `3` = Slate (`SLTE`) - `4` = Shake (`SHKE`) - `5` = Tile (`TILE`) - `6` = Other (`OTHR`) | +| Rooms | Characteristic | numeric | | Number of total rooms in the building (excluding baths). Not to be confused with bedrooms | +| Cathedral Ceiling | Characteristic | categorical | Yes, No | Deprecated. Field has not been updated recently enough to be useful for modeling | +| Design Plan | Characteristic | categorical | Architect, Stock Plan | Design plan. Whether the property was designed by an architect or from a stock plan. Possible values for this variable are: - `1` = Architect (`ARCT`) - `2` = Stock plan (`STCK`) | +| Type of Residence | Characteristic | categorical | 1 Story, 2 Story, 3 Story +, Split Level, 1.5 Story, Missing | Type of residence. 
Used to indicate stories as well as other information about the design of the property. Also used to determine the property class. Possible values for this variable are: - `1` = 1 story (`1STRY`) - `2` = 2 story (`2STRY`) - `3` = 3 story or more (`3STRY+`) - `4` = Split level (`SPLT`) - `5` = 1.5 story (`1.5STRY`) - `9.9` = Missing (`MSSNG`) | +| Recent Renovation | Characteristic | logical | | Indicates whether or not a property was renovated within the last 3 years. Renovation is indicated by the `char_renovation` characteristic flipping from `NO` to `YES` | +| Longitude | Location | numeric | | X coordinate in degrees (global longitude). Point location derived from the centroid of the largest polygon associated with the geometry. Units are degrees, taken from the WGS84 projection (EPSG 4326) | +| Latitude | Location | numeric | | Y coordinate in degrees (global latitude). Point location derived from the centroid of the largest polygon associated with the geometry. Units are degrees, taken from the WGS84 projection (EPSG 4326) | +| Municipality Name | Location | character | | | +| FEMA Special Flood Hazard Area | Location | logical | | FEMA Special Flood Hazard Area, derived from spatial intersection with FEMA floodplain maps. Taken from FEMA site for 2021 only | +| First Street Factor | Location | numeric | | First Street flood factor The flood factor is a risk score, where 10 is the highest risk and 1 is the lowest risk. Pulled from 2019 First Street extract provided to the CCAO | +| First Street Risk Direction | Location | numeric | | First Street risk direction. Positive scores indicate increasing risk of flood, negative scores indicate decreasing risk of flood, 0 indicates no movement of risk. Pulled from 2019 First Street extract provided to the CCAO | +| School Elementary District GEOID | Location | character | | School district (elementary) GEOID. Derived from Cook County and City of Chicago shapefiles. Chicago Public Schools are associated with attendance areas where suburban schools are associated with districts | +| School Secondary District GEOID | Location | character | | School district (secondary) GEOID. Derived from Cook County and City of Chicago shapefiles. Chicago Public Schools are associated with attendance areas where suburban schools are associated with districts | +| CMAP Walkability Score (No Transit) | Location | numeric | | CMAP walkability score for a given PIN, excluding transit walkability. Taken from CMAP’s ON TO 2050 walkability layer | +| CMAP Walkability Total Score | Location | numeric | | CMAP walkability score for a given PIN, including transit walkability. Taken from CMAP’s ON TO 2050 walkability layer | +| Airport Noise DNL | Location | numeric | | O’Hare and Midway noise, measured as DNL. DNL measures the total cumulative sound exposure over a 24-hour period. Here DNL is imputed using physical models or a kriging surface based on noise data from monitors around each airport. Noise monitor data retrieved from the Chicago Department of Aviation | +| Township Code | Meta | character | | Cook County township code. See `township_name` for more information. Note that township codes that start with 7 are City triad townships | +| Neighborhood Code | Meta | character | | Assessor neighborhood code. First 2 digits are township code, last 3 digits are neighborhood code. Neighborhood boundaries are coincident with townships. Geographic neighborhoods intended to represent relatively homogeneous housing sub-markets. 
They were created a long time ago for internal use by the various property tax offices. The Assessor now uses them as units of work and analysis. For example, land rates are usually delimited by neighborhood | +| Tieback Proration Rate | Meta | numeric | | Proration rate applied to the PIN. PINs are occasionally prorated when not all of their value is contained within their boundaries. For example, a building that lies equally across two PINs would be prorated to 50%. In this case, the *land* value of the PIN is not prorated, but the building value is. | +| Property Group | Meta | categorical | Non-Livable Space, Single-Family, Multi-Family, Condominium, Bed & Breakfast | | +| Property Tax Bill Aggregate Rate | Other | numeric | | Tax bill rate for the taxing district containing a given PIN. For modeling, the idea is to capture any downward pressure on price from higher tax burdens | +| School District (Elementary) GreatSchools Rating | Other | numeric | | Average GreatSchools rating of elementary schools within the district of a given PIN. For CPS, which is a unified school district, the average of schools within attendance boundary is used | +| School District (Secondary) GreatSchools Rating | Other | numeric | | Average GreatSchools rating of secondary schools within the district of a given PIN. For CPS, which is a unified school district, the average of schools within attendance boundary is used | +| Number of PINs in Half Mile | Proximity | numeric | | Number of PINs within half mile | +| Number of Bus Stops in Half Mile | Proximity | numeric | | Number of bus stops within half mile. Includes CTA and PACE bus stops. Stop locations sourced from agency GTFS feeds | +| Number of Foreclosures Per 1000 PINs (Past 5 Years) | Proximity | numeric | | Number of foreclosures per 1000 PINs, within half mile (past 5 years). Normalized version of the half mile foreclosure count to account for PIN density. Sourced from Illinois Public Record (IPR). Note that this data is reported on a long lag | +| Number of Schools in Half Mile | Proximity | numeric | | Number of schools (any kind) within half mile. School locations sourced from [GreatSchools](https://www.greatschools.org/) | +| Number of Schools with Rating in Half Mile | Proximity | numeric | | Number of schools (any kind) within half mile. Includes only schools that have a GreatSchools rating. School locations and ratings sourced from [GreatSchools](https://www.greatschools.org/) | +| Average School Rating in Half Mile | Proximity | numeric | | Average school rating of schools within half mile. Schools of any type (elementary, secondary, etc.) are included. School ratings sourced from [GreatSchools](https://www.greatschools.org/) | +| Nearest Bike Trail Distance (Feet) | Proximity | numeric | | Nearest bike trail distance (feet). Bike trail data sourced from Cook County GIS | +| Nearest Cemetery Distance (Feet) | Proximity | numeric | | Nearest cemetery distance (feet). Cemetery data sourced from Cook County GIS | +| Nearest CTA Route Distance (Feet) | Proximity | numeric | | Nearest CTA route distance (feet). Routes include any active CTA tracks. Route data sourced from CTA GTFS feeds | +| Nearest CTA Stop Distance (Feet) | Proximity | numeric | | Nearest CTA stop distance (feet). Stops include any active CTA stops for trains only. Stop data sourced from CTA GTFS feeds | +| Nearest Hospital Distance (Feet) | Proximity | numeric | | Nearest hospital distance (feet). 
Hospital locations sourced from Cook County GIS | +| Lake Michigan Distance (Feet) | Proximity | numeric | | Distance to Lake Michigan shoreline (feet). Shoreline sourced from Census hydrography files | +| Nearest Major Road Distance (Feet) | Proximity | numeric | | Nearest major road distance (feet). Major road locations sourced from OpenStreetMap (OSM). Major roads include any OSM ways tagged with `highway/motorway`, `highway/trunk`, or `highway/primary` | +| Nearest Metra Route Distance (Feet) | Proximity | numeric | | Nearest Metra route distance (feet). Routes include any active Metra tracks. Route data sourced from Metra GTFS feeds | +| Nearest Metra Stop Distance (Feet) | Proximity | numeric | | Nearest Metra stop distance (feet). Stops include any active Metra stops. Stop data sourced from Metra GTFS feeds | +| Nearest Park Distance (Feet) | Proximity | numeric | | Nearest park distance (feet). Park locations sourced from OpenStreetMap using the tag `leisure/park` | +| Nearest Railroad Distance (Feet) | Proximity | numeric | | Nearest railroad distance (feet). Railroad locations sourced from Cook County GIS. Inclusive of any rail (CTA, Metra, non-passenger freight, etc.) | +| Nearest Water Distance (Feet) | Proximity | numeric | | Nearest water distance (feet). Water locations are inclusive of *any* body of water. Sourced from Census hydrology files | +| Nearest Golf Course Distance (Feet) | Proximity | numeric | | Nearest golf course distance (feet). Golf course data sourced from Cook County GIS and OpenStreetMap | +| Sale Year | Time | numeric | | Sale year calculated as the number of years since 0 B.C.E | +| Sale Day | Time | numeric | | Sale day calculated as the number of days since January 1st, 1997 | +| Sale Quarter of Year | Time | character | | Character encoding of quarter of year (Q1 - Q4) | +| Sale Month of Year | Time | character | | Character encoding of month of year (Jan - Dec) | +| Sale Day of Year | Time | numeric | | Numeric encoding of day of year (1 - 365) | +| Sale Day of Month | Time | numeric | | Numeric encoding of day of month (1 - 31) | +| Sale Day of Week | Time | numeric | | Numeric encoding of day of week (1 - 7) | +| Sale After COVID-19 | Time | logical | | Indicator for whether sale occurred after COVID-19 was widely publicized (around March 15, 2020) | #### Data Sources @@ -1021,6 +1022,11 @@ If you’re on Windows, you’ll also need to install build the necessary packages. You may also want to (optionally) install [DVC](https://dvc.org/doc/install) to pull data and run pipelines. +We also publish a Docker image containing model code and all of the +dependencies necessary to run it. If you’re comfortable using Docker, +you can skip the installation steps below and instead pull the image +from `ghcr.io/ccao-data/model-res-avm:master` to run the model. + ## Installation 1. Clone this repository using git, or simply download it using the @@ -1035,6 +1041,12 @@ build the necessary packages. You may also want to (optionally) install `renv::restore()`. This step may take awhile. Linux users will likely need to install dependencies (via apt, yum, etc.) to build from source. + 1. The `finalize` step of the model pipeline requires some + additional dependencies for generating a model performance + report. These dependencies must be installed in addition to the + core dependnecies installed in step 4. If you would like to run + this step, make sure to install its additional dependencies by + running `renv::restore(lockfile = "reports/renv.lock")`. 
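As a quick, non-authoritative sketch of the installation steps above, a fresh clone of the repository could be bootstrapped from an R session roughly as follows. The lockfile paths are the ones referenced at this point in the patch series; `prompt = FALSE` is used here only to keep the restores non-interactive.

```r
# Minimal bootstrap sketch, assuming the working directory is the repo root
install.packages("renv")

# Core model pipeline dependencies
renv::restore(lockfile = "renv.lock", prompt = FALSE)

# Extra dependencies needed only to render the performance report in the
# finalize step (skip this if you never run that step)
renv::restore(lockfile = "reports/renv.lock", prompt = FALSE)
```

Note that later patches in this series move the reporting lockfile to `renv/profiles/reporting/renv.lock`, so the exact path depends on which commit you are working from.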
For installation issues, particularly related to package installation
and dependencies, see [Troubleshooting](#troubleshooting).

@@ -1235,6 +1247,46 @@ sped up using the parallel processing built-in to LightGBM. Note that:
   or wait for the [upcoming CUDA
   release](https://github.com/microsoft/LightGBM/issues/5153).
 
+## Updating R dependencies
+
+There are two lockfiles that we use with renv to manage R dependencies:
+
+1. **`renv.lock`** is the canonical list of dependencies that are used
+   by the **core model pipeline**. Any dependencies that are required
+   to run the model itself should be defined in this lockfile.
+2. **`reports/renv.lock`** is the canonical list of dependencies that
+   are used to **generate a model performance report** in the
+   `finalize` step of the pipeline. Any dependencies that are required
+   to generate that report or others like it should be defined in this
+   lockfile.
+
+Our goal in maintaining multiple lockfiles is to keep the list of
+dependencies that are required to run the model as short as possible.
+This choice adds overhead to the process of updating R dependencies, but
+incurs the benefit of a more maintainable model over the long term.
+
+The process for **updating core model pipeline dependencies** is
+straightforward: Running `renv::install("")` and
+`renv::snapshot()` will ensure that the dependency gets added or updated
+in `renv.lock`, as long as it is imported somewhere in the model
+pipeline via a `library()` call.
+
+The process for updating **model report dependencies** is more complex,
+since it requires the use of a separate `reporting` profile:
+
+1. Run `Sys.setenv(RENV_PROFILE = "reporting")` to set the renv profile
+   to `reporting`
+2. Make sure that the dependency is defined in the `DESCRIPTION` file
+   under the `Config/renv/profiles/reporting/dependencies` key
+3. Run `renv::install("")` to add or update the
+   dependency as necessary
+4. Run
+   `renv::snapshot(lockfile = "reports/renv.lock", type = "explicit")`
+   to update the reporting lockfile with the dependencies defined in
+   the `DESCRIPTION` file
+5. Run `Sys.unsetenv("RENV_PROFILE")` to switch the renv profile back
+   to the default
+
 ## Troubleshooting
 
 The dependencies for this repository are numerous and not all of them

From 9f1c1caf5bd6f1f525e10de34df2335bb65678a0 Mon Sep 17 00:00:00 2001
From: Jean Cochrane
Date: Fri, 24 Nov 2023 17:34:04 +0000
Subject: [PATCH 07/27] Add quarto to DESCRIPTION dependencies

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index ff0211c9..b41839a2 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1 +1 @@
-Config/renv/profiles/reporting/dependencies: leaflet, plotly, sf
+Config/renv/profiles/reporting/dependencies: quarto, leaflet, plotly, sf

From aedd58ba65608924a4a47957b6ea21342a3a49bd Mon Sep 17 00:00:00 2001
From: Jean Cochrane
Date: Fri, 24 Nov 2023 17:34:27 +0000
Subject: [PATCH 08/27] Move reports/renv.lock -> renv/profiles/reporting/renv.lock

---
 Dockerfile                                |  4 +-
 README.Rmd                                |  6 +-
 README.md                                 | 19 ++---
 .../profiles/reporting}/renv.lock         | 84 ++++++++++++++++++-
 4 files changed, 97 insertions(+), 16 deletions(-)
 rename {reports => renv/profiles/reporting}/renv.lock (93%)

diff --git a/Dockerfile b/Dockerfile
index 22121379..21726026 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -26,13 +26,13 @@ RUN pipenv install --system --deploy
 
 # Copy R bootstrap files into the image
 COPY renv.lock .
-COPY reports/renv.lock reports/renv.lock +COPY renv/profiles/reporting/renv.lock reporting-renv.lock COPY .Rprofile . COPY renv/ renv/ # Install R dependencies RUN Rscript -e 'renv::restore()' -RUN Rscript -e 'renv::restore(lockfile = "reports/renv.lock")' +RUN Rscript -e 'renv::restore(lockfile = "reporting-renv.lock")' # Copy the directory into the container ADD ./ model-res-avm/ diff --git a/README.Rmd b/README.Rmd index a6ac164b..669837f5 100644 --- a/README.Rmd +++ b/README.Rmd @@ -617,7 +617,7 @@ We also publish a Docker image containing model code and all of the dependencies 2. Set your working directory to the local folder containing this repository's files, either using R's `setwd()` command or (preferably) using RStudio's [projects](https://support.posit.co/hc/en-us/articles/200526207-Using-Projects). 3. Install `renv`, R's package manager, by running `install.packages("renv")`. 4. Install all R package dependencies using `renv` by running `renv::restore()`. This step may take awhile. Linux users will likely need to install dependencies (via apt, yum, etc.) to build from source. - 1. The `finalize` step of the model pipeline requires some additional dependencies for generating a model performance report. These dependencies must be installed in addition to the core dependnecies installed in step 4. If you would like to run this step, make sure to install its additional dependencies by running `renv::restore(lockfile = "reports/renv.lock")`. + 1. The `finalize` step of the model pipeline requires some additional dependencies for generating a model performance report. These dependencies must be installed in addition to the core dependnecies installed in step 4. If you would like to run this step, make sure to install its additional dependencies by running `renv::restore(lockfile = "renv/profiles/reporting/renv.lock")`. For installation issues, particularly related to package installation and dependencies, see [Troubleshooting](#troubleshooting). @@ -751,7 +751,7 @@ Both [Tidymodels](https://tune.tidymodels.org/articles/extras/optimizations.html There are two lockfiles that we use with renv to manage R dependencies: 1. **`renv.lock`** is the canonical list of dependencies that are used by the **core model pipeline**. Any dependencies that are required to run the model itself should be defined in this lockfile. -2. **`reports/renv.lock`** is the canonical list of dependencies that are used to **generate a model performance report** in the `finalize` step of the pipeline. Any dependencies that are required to generate that report or others like it should be defined in this lockfile. +2. **`renv/profiles/reporting/renv.lock`** is the canonical list of dependencies that are used to **generate a model performance report** in the `finalize` step of the pipeline. Any dependencies that are required to generate that report or others like it should be defined in this lockfile. Our goal in maintaining multiple lockfiles is to keep the list of dependencies that are required to run the model as short as possibile. This choice adds overhead to the process of updating R dependencies, but incurs the benefit of a more maintainable model over the long term. @@ -762,7 +762,7 @@ The process for updating **model report dependencies** is more complex, since it 1. Run `Sys.setenv(RENV_PROFILE = "reporting")` to set the renv profile to `reporting` 2. Make sure that the dependency is defined in the `DESCRIPTION` file under the `Config/renv/profiles/reporting/dependencies` key 3. 
Run `renv::install("")` to add or update the dependency as necessary -4. Run `renv::snapshot(lockfile = "reports/renv.lock", type = "explicit")` to update the reporting lockfile with the dependencies defined in the `DESCRIPTION` file +4. Run `renv::snapshot(type = "explicit")` to update the reporting lockfile with the dependencies defined in the `DESCRIPTION` file 5. Run `Sys.unsetenv("RENV_PROFILE")` to switch the renv profile back to the default ## Troubleshooting diff --git a/README.md b/README.md index a5bd23af..4cfa90c7 100644 --- a/README.md +++ b/README.md @@ -1046,7 +1046,8 @@ from `ghcr.io/ccao-data/model-res-avm:master` to run the model. report. These dependencies must be installed in addition to the core dependnecies installed in step 4. If you would like to run this step, make sure to install its additional dependencies by - running `renv::restore(lockfile = "reports/renv.lock")`. + running + `renv::restore(lockfile = "renv/profiles/reporting/renv.lock")`. For installation issues, particularly related to package installation and dependencies, see [Troubleshooting](#troubleshooting). @@ -1254,11 +1255,11 @@ There are two lockfiles that we use with renv to manage R dependencies: 1. **`renv.lock`** is the canonical list of dependencies that are used by the **core model pipeline**. Any dependencies that are required to run the model itself should be defined in this lockfile. -2. **`reports/renv.lock`** is the canonical list of dependencies that - are used to **generate a model performance report** in the - `finalize` step of the pipeline. Any dependencies that are required - to generate that report or others like it should be defined in this - lockfile. +2. **`renv/profiles/reporting/renv.lock`** is the canonical list of + dependencies that are used to **generate a model performance + report** in the `finalize` step of the pipeline. Any dependencies + that are required to generate that report or others like it should + be defined in this lockfile. Our goal in maintaining multiple lockfiles is to keep the list of dependencies that are required to run the model as short as possibile. @@ -1280,10 +1281,8 @@ since it requires the use of a separate `reporting` profile: under the `Config/renv/profiles/reporting/dependencies` key 3. Run `renv::install("")` to add or update the dependency as necessary -4. Run - `renv::snapshot(lockfile = "reports/renv.lock", type = "explicit")` - to update the reporting lockfile with the dependencies defined in - the `DESCRIPTION` file +4. Run `renv::snapshot(type = "explicit")` to update the reporting + lockfile with the dependencies defined in the `DESCRIPTION` file 5. 
Run `Sys.unsetenv("RENV_PROFILE")` to switch the renv profile back to the default diff --git a/reports/renv.lock b/renv/profiles/reporting/renv.lock similarity index 93% rename from reports/renv.lock rename to renv/profiles/reporting/renv.lock index 39b73dc9..2b58417f 100644 --- a/reports/renv.lock +++ b/renv/profiles/reporting/renv.lock @@ -4,7 +4,7 @@ "Repositories": [ { "Name": "CRAN", - "URL": "https://cran.rstudio.com" + "URL": "https://packagemanager.posit.co/cran/latest" } ] }, @@ -721,6 +721,18 @@ ], "Hash": "0f7cd2962e3044bb940cca4f4b5cecbe" }, + "packrat": { + "Package": "packrat", + "Version": "0.9.2", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "tools", + "utils" + ], + "Hash": "55ddd2d4a1959535f18393478b0c14a6" + }, "pillar": { "Package": "pillar", "Version": "1.9.0", @@ -790,6 +802,19 @@ ], "Hash": "bd54ba8a0a5faded999a7aab6e46b374" }, + "processx": { + "Package": "processx", + "Version": "3.8.2", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "R6", + "ps", + "utils" + ], + "Hash": "3efbd8ac1be0296a46c55387aeace0f3" + }, "promises": { "Package": "promises", "Version": "1.2.1", @@ -818,6 +843,17 @@ ], "Hash": "e0ef355c12942cf7a6b91a6cfaea8b3e" }, + "ps": { + "Package": "ps", + "Version": "1.7.5", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "utils" + ], + "Hash": "709d852d33178db54b17c722e5b1e594" + }, "purrr": { "Package": "purrr", "Version": "1.0.1", @@ -833,6 +869,23 @@ ], "Hash": "d71c815267c640f17ddbf7f16144b4bb" }, + "quarto": { + "Package": "quarto", + "Version": "1.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "jsonlite", + "later", + "processx", + "rmarkdown", + "rsconnect", + "rstudioapi", + "utils", + "yaml" + ], + "Hash": "79e1cff980960b566ddc4ddb1a49a13d" + }, "rappdirs": { "Package": "rappdirs", "Version": "0.3.3", @@ -902,6 +955,35 @@ ], "Hash": "d65e35823c817f09f4de424fcdfa812a" }, + "rsconnect": { + "Package": "rsconnect", + "Version": "1.1.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cli", + "curl", + "digest", + "jsonlite", + "lifecycle", + "openssl", + "packrat", + "renv", + "rlang", + "rstudioapi", + "tools", + "yaml" + ], + "Hash": "672fc66985074d17c86b6335105143b8" + }, + "rstudioapi": { + "Package": "rstudioapi", + "Version": "0.15.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "5564500e25cffad9e22244ced1379887" + }, "s2": { "Package": "s2", "Version": "1.1.4", From 7ae4f6b85a591a3fdc4f344814f6a1d946e95fbc Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 24 Nov 2023 17:43:28 +0000 Subject: [PATCH 09/27] Properly style R/helpers.R --- R/helpers.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/helpers.R b/R/helpers.R index f32c833e..db76d059 100644 --- a/R/helpers.R +++ b/R/helpers.R @@ -37,7 +37,8 @@ model_get_s3_artifacts_for_run <- function(run_id, year) { # First get anything partitioned only by year s3_objs_limited <- grep( - ".parquet$|.zip$|.rds|.html$", s3_objs, value = TRUE + ".parquet$|.zip$|.rds|.html$", s3_objs, + value = TRUE ) %>% unname() From fd6538b08e3a2dd68a79332f22a53ae9842b5a60 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 24 Nov 2023 21:08:23 +0000 Subject: [PATCH 10/27] Install Quarto in Dockerfile --- Dockerfile | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 21726026..549c5736 100644 --- a/Dockerfile +++ b/Dockerfile @@ -8,7 +8,13 @@ ENV 
RENV_PATHS_LIBRARY renv/library RUN apt-get update && apt-get install --no-install-recommends -y \ libcurl4-openssl-dev libssl-dev libxml2-dev libgit2-dev git \ libudunits2-dev python3-dev python3-pip libgdal-dev libgeos-dev \ - libproj-dev libfontconfig1-dev libharfbuzz-dev libfribidi-dev pandoc + libproj-dev libfontconfig1-dev libharfbuzz-dev libfribidi-dev pandoc \ + curl gdebi-core + +# Install Quarto +RUN curl -o quarto-linux-amd64.deb -L \ + https://github.com/quarto-dev/quarto-cli/releases/download/v1.3.450/quarto-1.3.450-linux-amd64.deb +RUN gdebi -n quarto-linux-amd64.deb # Install pipenv for Python dependencies RUN pip install pipenv From 7b39d2fa0144f9ed6b503e42ce6a05d8cff1c0c5 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Mon, 27 Nov 2023 23:27:21 +0000 Subject: [PATCH 11/27] Use the correct path to performance.qmd in 05-finalize.R step --- pipeline/05-finalize.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline/05-finalize.R b/pipeline/05-finalize.R index eb001546..2966722e 100644 --- a/pipeline/05-finalize.R +++ b/pipeline/05-finalize.R @@ -209,7 +209,7 @@ tictoc::tic.clearlog() #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - message("Generating performance report") -here("../reports/performance/performance.qmd") %>% +here("reports/performance/performance.qmd") %>% quarto_render( execute_params = list( run_id = run_id, From 65948ddcbce6d71929ce8746b9f64a303bea874e Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Wed, 29 Nov 2023 10:36:53 -0600 Subject: [PATCH 12/27] Move performance.qmd to the top level of the `reports/` subdir --- dvc.yaml | 2 +- misc/file_dict.csv | 2 +- pipeline/05-finalize.R | 2 +- reports/{performance => }/performance.qmd | 0 4 files changed, 3 insertions(+), 3 deletions(-) rename reports/{performance => }/performance.qmd (100%) diff --git a/dvc.yaml b/dvc.yaml index 346c3879..f4db9ab2 100755 --- a/dvc.yaml +++ b/dvc.yaml @@ -160,7 +160,7 @@ stages: cache: false - output/metadata/model_metadata.parquet: cache: false - - reports/performance/performance.html: + - reports/performance.html: cache: false export: diff --git a/misc/file_dict.csv b/misc/file_dict.csv index 6a9c905e..7100bb08 100644 --- a/misc/file_dict.csv +++ b/misc/file_dict.csv @@ -19,7 +19,7 @@ output,performance_quantile_test,3,evaluate,ccao-model-results-us-east-1,output/ output,performance_quantile_assessment,3,evaluate,ccao-model-results-us-east-1,output/performance_quantile/model_performance_quantile_assessment.parquet,performance_quantile/year={year}/stage=assessment/{run_id}.parquet,performance_quantile,geography [by class] by quantile,"year, run_id, stage, geography_type, geography_id, by_class, class, quantile",No,Test + assessment,Performance metrics by quantile within class and geography,Assessment set uses the prior year sales to compare to the assessed value output,shap,4,interpret,ccao-model-results-us-east-1,output/shap/model_shap.parquet,shap/,shap,card,"year, run_id, township_code, meta_pin, meta_card_num",No,Yes,SHAP values for each feature for each card in the assessment data,NOTE: Each run adds new partitions to S3 which must be added via a Glue crawler output,feature_importance,4,interpret,ccao-model-results-us-east-1,output/feature_importance/model_feature_importance.parquet,feature_importance/year={year}/{run_id}.parquet,feature_importance,predictor,"year, run_id, model_predictor_all_name",No,Yes,"Feature importance values (gain, cover, and frequency) for the run", 
-output,report,5,finalize,ccao-model-results-us-east-1,reports/performance/performance.html,report/year={year}/{run_id}.html,,model run,,No,Yes,Rendered Quarto doc with model performance statistics, +output,report,5,finalize,ccao-model-results-us-east-1,reports/performance.html,report/year={year}/{run_id}.html,,model run,,No,Yes,Rendered Quarto doc with model performance statistics, output,metadata,5,finalize,ccao-model-results-us-east-1,output/metadata/model_metadata.parquet,metadata/year={year}/{run_id}.parquet,metadata,model run,"year, run_id",Yes,Yes,"Information about each run, including parameters, run ID, git info, etc.", intermediate,timing,,all,,output/intermediate/timing/,,,model stage,"year, msg",Yes,Yes,Parquet files for each stage containing the stage time elapsed,Converted into a one-row data frame in the finalize stage output,timing,,all,ccao-model-results-us-east-1,output/timing/model_timing.parquet,timing/year={year}/{run_id}.parquet,timing,model run,"year, run_id",Yes,Yes,Finalized time elapsed for each stage of the run,"Each row represents one run, while columns represent the stages" diff --git a/pipeline/05-finalize.R b/pipeline/05-finalize.R index 2966722e..91e15b5b 100644 --- a/pipeline/05-finalize.R +++ b/pipeline/05-finalize.R @@ -209,7 +209,7 @@ tictoc::tic.clearlog() #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - message("Generating performance report") -here("reports/performance/performance.qmd") %>% +here("reports", "performance.qmd") %>% quarto_render( execute_params = list( run_id = run_id, diff --git a/reports/performance/performance.qmd b/reports/performance.qmd similarity index 100% rename from reports/performance/performance.qmd rename to reports/performance.qmd From dab451cdb011135b1a7a96c957dc1f77b8347348 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Wed, 29 Nov 2023 12:22:17 -0600 Subject: [PATCH 13/27] Factor out report generation into 05-report.R pipeline stage --- README.Rmd | 31 +++++++++---- README.md | 56 +++++++++++++++-------- dvc.yaml | 22 ++++++--- pipeline/05-report.R | 52 +++++++++++++++++++++ pipeline/{05-finalize.R => 06-finalize.R} | 31 +++---------- pipeline/{06-export.R => 08-export.R} | 0 reports/performance.qmd | 3 +- 7 files changed, 133 insertions(+), 62 deletions(-) create mode 100644 pipeline/05-report.R rename pipeline/{05-finalize.R => 06-finalize.R} (94%) rename pipeline/{06-export.R => 08-export.R} (100%) diff --git a/README.Rmd b/README.Rmd index 669837f5..1a488aca 100644 --- a/README.Rmd +++ b/README.Rmd @@ -58,6 +58,7 @@ graph LR assess("Assess") evaluate("Evaluate") interpret("Interpret") + report("Report") finalize("Finalize") export("Export") @@ -66,7 +67,8 @@ graph LR train --> interpret assess --> evaluate evaluate --> finalize - interpret --> finalize + interpret --> report + report --> finalize finalize --> aws finalize --> export aws --> ingest @@ -87,9 +89,11 @@ All inputs and outputs are stored on AWS S3 using a unique run identifier. Each 4. **Interpret**: Calculate SHAP values for all the estimated values from the assess stage. These are the _per feature_ contribution to the predicted value for an _individual observation_ (usually a single PIN). Also calculate the aggregate feature importance for the entire model. The primary output of this stage is a data frame of the contributions of each feature for each property. -5. **Finalize**: Add metadata and then upload all output objects to AWS (S3). All model outputs for every model run are stored in perpetuity in S3. 
Each run's performance can be visualized using the CCAO's internal Tableau dashboards. +5. **Report**: Render a Quarto document containing a model performance report to `reports/performance.html`. -6. **Export**: Export assessed values to Desk Review spreadsheets for Valuations, as well as a delimited text format for upload to the system of record (iasWorld). NOTE: This stage is only run when a final model is selected. It is not run automatically or as part of the main pipeline. +6. **Finalize**: Add metadata and then upload all output objects to AWS (S3). All model outputs for every model run are stored in perpetuity in S3. Each run's performance can be visualized using the CCAO's internal Tableau dashboards. + +7. **Export**: Export assessed values to Desk Review spreadsheets for Valuations, as well as a delimited text format for upload to the system of record (iasWorld). NOTE: This stage is only run when a final model is selected. It is not run automatically or as part of the main pipeline. ## Choices Made @@ -464,7 +468,7 @@ This repository represents a significant departure from the old [residential mod ### [`assessment-year-2022`](https://github.com/ccao-data/model-res-avm/tree/2022-assessment-year) -* Moved previously separate processes into this repository and improved their integration with the overall modeling process. For example, the [etl_res_data](https://gitlab.com/ccao-data-science---modeling/processes/etl_res_data) process was moved to [pipeline/00-ingest.R](pipeline/00-ingest.R), while the process to [finalize model values](https://gitlab.com/ccao-data-science---modeling/processes/finalize_model_values) was moved to [pipeline/06-export.R](pipeline/06-export.R). +* Moved previously separate processes into this repository and improved their integration with the overall modeling process. For example, the [etl_res_data](https://gitlab.com/ccao-data-science---modeling/processes/etl_res_data) process was moved to [pipeline/00-ingest.R](pipeline/00-ingest.R), while the process to [finalize model values](https://gitlab.com/ccao-data-science---modeling/processes/finalize_model_values) was moved to [pipeline/06-export.R](pipeline/07-export.R). * Added [DVC](https://dvc.org/) support/integration. This repository uses DVC in 2 ways: 1. All input data in [`input/`](input/) is versioned, tracked, and stored using DVC. Previous input data sets are stored in perpetuity on S3. 2. [DVC pipelines](https://dvc.org/doc/user-guide/project-structure/pipelines-files) are used to sequentially run R pipeline scripts and track/cache inputs and outputs. @@ -487,6 +491,13 @@ This repository represents a significant departure from the old [residential mod * Dropped explicit spatial lag generation in the ingest stage. * Lots of other bugfixes and minor improvements. +### Upcoming + +* Infrastructure improvements + * Added [`build-and-run-model`](https://github.com/ccao-data/model-res-avm/actions/workflows/build-and-run-model.yaml) workflow to run the model using GitHub Actions and AWS Batch. + * Added [`delete-model-run`](https://github.com/ccao-data/model-res-avm/actions/workflows/delete-model-runs.yaml) workflow to delete test run artifacts in S3 using GitHub Actions. + * Added [pipeline/05-report.R](pipeline/05-report.R) step to render a performance report using Quarto. + # Ongoing Issues The CCAO faces a number of ongoing issues which make modeling difficult. Some of these issues are in the process of being solved; others are less tractable. 
We list them here for the sake of transparency and to provide a sense of the challenges we face. @@ -609,7 +620,7 @@ The code in this repository is written primarily in [R](https://www.r-project.or If you're on Windows, you'll also need to install [Rtools](https://cran.r-project.org/bin/windows/Rtools/) in order to build the necessary packages. You may also want to (optionally) install [DVC](https://dvc.org/doc/install) to pull data and run pipelines. -We also publish a Docker image containing model code and all of the dependencies necessary to run it. If you're comfortable using Docker, you can skip the installation steps below and instead pull the image from `ghcr.io/ccao-data/model-res-avm:master` to run the model. +We also publish a Docker image containing model code and all of the dependencies necessary to run it. If you're comfortable using Docker, you can skip the installation steps below and instead pull the image from `ghcr.io/ccao-data/model-res-avm:master` to run the latest version of the model. ## Installation @@ -617,7 +628,7 @@ We also publish a Docker image containing model code and all of the dependencies 2. Set your working directory to the local folder containing this repository's files, either using R's `setwd()` command or (preferably) using RStudio's [projects](https://support.posit.co/hc/en-us/articles/200526207-Using-Projects). 3. Install `renv`, R's package manager, by running `install.packages("renv")`. 4. Install all R package dependencies using `renv` by running `renv::restore()`. This step may take awhile. Linux users will likely need to install dependencies (via apt, yum, etc.) to build from source. - 1. The `finalize` step of the model pipeline requires some additional dependencies for generating a model performance report. These dependencies must be installed in addition to the core dependnecies installed in step 4. If you would like to run this step, make sure to install its additional dependencies by running `renv::restore(lockfile = "renv/profiles/reporting/renv.lock")`. +5. The `report` step of the model pipeline requires some additional dependencies for generating a model performance report. Install these additional dependencies by running `renv::restore(lockfile = "renv/profiles/reporting/renv.lock")`. These dependencies must be installed in addition to the core dependencies installed in step 4. For installation issues, particularly related to package installation and dependencies, see [Troubleshooting](#troubleshooting). @@ -628,8 +639,8 @@ For installation issues, particularly related to package installation and depend To use this repository, simply open the [pipeline/](./pipeline) directory and run the R scripts in order. Non-CCAO users can skip the following stages: * [`pipeline/00-ingest.R`](pipeline/00-ingest.R) - Requires access to CCAO internal AWS services to pull data. See [Getting Data](#getting-data) if you are a member of the public. -* [`pipeline/05-finalize.R`](pipeline/05-finalize.R) - Requires access to CCAO internal AWS services to upload model results. -* [`pipeline/06-export.R`](pipeline/06-export.R) - Only required for CCAO internal processes. +* [`pipeline/06-finalize.R`](pipeline/06-finalize.R) - Requires access to CCAO internal AWS services to upload model results. +* [`pipeline/07-export.R`](pipeline/07-export.R) - Only required for CCAO internal processes. #### Using DVC @@ -670,7 +681,7 @@ Each R script has a set of associated parameters (tracked via `dvc.yaml`). 
DVC w ## Output -The full model pipeline produces a large number of outputs. A full list of these outputs and their purpose can be found in [`misc/file_dict.csv`](misc/file_dict.csv). For public users, all outputs are saved in the [`output/`](output/) directory, where they can be further used/examined after a model run. For CCAO employees, all outputs are uploaded to S3 via the [finalize stage](pipeline/05-finalize.R). Uploaded Parquet files are converted into the following Athena tables: +The full model pipeline produces a large number of outputs. A full list of these outputs and their purpose can be found in [`misc/file_dict.csv`](misc/file_dict.csv). For public users, all outputs are saved in the [`output/`](output/) directory, where they can be further used/examined after a model run. For CCAO employees, all outputs are uploaded to S3 via the [finalize stage](pipeline/06-finalize.R). Uploaded Parquet files are converted into the following Athena tables: #### Athena Tables @@ -751,7 +762,7 @@ Both [Tidymodels](https://tune.tidymodels.org/articles/extras/optimizations.html There are two lockfiles that we use with renv to manage R dependencies: 1. **`renv.lock`** is the canonical list of dependencies that are used by the **core model pipeline**. Any dependencies that are required to run the model itself should be defined in this lockfile. -2. **`renv/profiles/reporting/renv.lock`** is the canonical list of dependencies that are used to **generate a model performance report** in the `finalize` step of the pipeline. Any dependencies that are required to generate that report or others like it should be defined in this lockfile. +2. **`renv/profiles/reporting/renv.lock`** is the canonical list of dependencies that are used to **generate a model performance report** in the `report` step of the pipeline. Any dependencies that are required to generate that report or others like it should be defined in this lockfile. Our goal in maintaining multiple lockfiles is to keep the list of dependencies that are required to run the model as short as possibile. This choice adds overhead to the process of updating R dependencies, but incurs the benefit of a more maintainable model over the long term. diff --git a/README.md b/README.md index 4cfa90c7..4af3fc63 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,7 @@ Table of Contents - [`assessment-year-2021`](#assessment-year-2021) - [`assessment-year-2022`](#assessment-year-2022) - [`assessment-year-2023`](#assessment-year-2023) + - [Upcoming](#upcoming) - [Ongoing Issues](#ongoing-issues) - [Data Quality and Integrity](#data-quality-and-integrity) - [Heterogeneity and Extremes](#heterogeneity-and-extremes) @@ -100,6 +101,7 @@ graph LR assess("Assess") evaluate("Evaluate") interpret("Interpret") + report("Report") finalize("Finalize") export("Export") @@ -108,7 +110,8 @@ graph LR train --> interpret assess --> evaluate evaluate --> finalize - interpret --> finalize + interpret --> report + report --> finalize finalize --> aws finalize --> export aws --> ingest @@ -157,12 +160,15 @@ stand-alone script) or as part of the overall pipeline (with entire model. The primary output of this stage is a data frame of the contributions of each feature for each property. -5. **Finalize**: Add metadata and then upload all output objects to AWS +5. **Report**: Render a Quarto document containing a model performance + report to `reports/performance.html`. + +6. **Finalize**: Add metadata and then upload all output objects to AWS (S3). 
All model outputs for every model run are stored in perpetuity in S3. Each run’s performance can be visualized using the CCAO’s internal Tableau dashboards. -6. **Export**: Export assessed values to Desk Review spreadsheets for +7. **Export**: Export assessed values to Desk Review spreadsheets for Valuations, as well as a delimited text format for upload to the system of record (iasWorld). NOTE: This stage is only run when a final model is selected. It is not run automatically or as part of @@ -332,7 +338,7 @@ districts](https://gitlab.com/ccao-data-science---modeling/models/ccao_res_avm/- and many others. The features in the table below are the ones that made the cut. They’re the right combination of easy to understand and impute, powerfully predictive, and well-behaved. Most of them are in use in the -model as of 2023-11-24. +model as of 2023-11-29. | Feature Name | Category | Type | Possible Values | Notes | |:------------------------------------------------------------------------|:---------------|:------------|:-----------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| @@ -709,7 +715,7 @@ the following major changes to the residential modeling codebase: process was moved to [pipeline/00-ingest.R](pipeline/00-ingest.R), while the process to [finalize model values](https://gitlab.com/ccao-data-science---modeling/processes/finalize_model_values) - was moved to [pipeline/06-export.R](pipeline/06-export.R). + was moved to [pipeline/06-export.R](pipeline/07-export.R). - Added [DVC](https://dvc.org/) support/integration. This repository uses DVC in 2 ways: 1. All input data in [`input/`](input/) is versioned, tracked, and @@ -760,6 +766,18 @@ the following major changes to the residential modeling codebase: - Dropped explicit spatial lag generation in the ingest stage. - Lots of other bugfixes and minor improvements. +### Upcoming + +- Infrastructure improvements + - Added + [`build-and-run-model`](https://github.com/ccao-data/model-res-avm/actions/workflows/build-and-run-model.yaml) + workflow to run the model using GitHub Actions and AWS Batch. + - Added + [`delete-model-run`](https://github.com/ccao-data/model-res-avm/actions/workflows/delete-model-runs.yaml) + workflow to delete test run artifacts in S3 using GitHub Actions. + - Added [pipeline/05-report.R](pipeline/05-report.R) step to render a + performance report using Quarto. + # Ongoing Issues The CCAO faces a number of ongoing issues which make modeling difficult. @@ -1025,7 +1043,8 @@ build the necessary packages. You may also want to (optionally) install We also publish a Docker image containing model code and all of the dependencies necessary to run it. If you’re comfortable using Docker, you can skip the installation steps below and instead pull the image -from `ghcr.io/ccao-data/model-res-avm:master` to run the model. +from `ghcr.io/ccao-data/model-res-avm:master` to run the latest version +of the model. ## Installation @@ -1041,13 +1060,12 @@ from `ghcr.io/ccao-data/model-res-avm:master` to run the model. `renv::restore()`. This step may take awhile. 
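For a fresh clone, the core and reporting restores described in this list can be run back to back. The sketch below is illustrative only; it assumes the lockfile locations shown in this patch (`renv.lock` at the repository root and `renv/profiles/reporting/renv.lock` for the reporting profile) and adds an optional check that the `quarto` package is available before the report is rendered.

```r
# Illustrative setup for a fresh clone; paths assume the layout in this patch
install.packages("renv")

# Core model pipeline dependencies (renv.lock at the repository root)
renv::restore()

# Additional reporting dependencies used to render the performance report
renv::restore(lockfile = "renv/profiles/reporting/renv.lock")

# Optional sanity check: report rendering needs the quarto package
if (!requireNamespace("quarto", quietly = TRUE)) {
  warning("quarto is not installed; the performance report will not render")
}
```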
Linux users will likely need to install dependencies (via apt, yum, etc.) to build from source. - 1. The `finalize` step of the model pipeline requires some - additional dependencies for generating a model performance - report. These dependencies must be installed in addition to the - core dependnecies installed in step 4. If you would like to run - this step, make sure to install its additional dependencies by - running - `renv::restore(lockfile = "renv/profiles/reporting/renv.lock")`. +5. The `report` step of the model pipeline requires some additional + dependencies for generating a model performance report. Install + these additional dependencies by running + `renv::restore(lockfile = "renv/profiles/reporting/renv.lock")`. + These dependencies must be installed in addition to the core + dependencies installed in step 4. For installation issues, particularly related to package installation and dependencies, see [Troubleshooting](#troubleshooting). @@ -1063,9 +1081,9 @@ following stages: - [`pipeline/00-ingest.R`](pipeline/00-ingest.R) - Requires access to CCAO internal AWS services to pull data. See [Getting Data](#getting-data) if you are a member of the public. -- [`pipeline/05-finalize.R`](pipeline/05-finalize.R) - Requires access +- [`pipeline/06-finalize.R`](pipeline/06-finalize.R) - Requires access to CCAO internal AWS services to upload model results. -- [`pipeline/06-export.R`](pipeline/06-export.R) - Only required for +- [`pipeline/07-export.R`](pipeline/07-export.R) - Only required for CCAO internal processes. #### Using DVC @@ -1129,7 +1147,7 @@ of these outputs and their purpose can be found in outputs are saved in the [`output/`](output/) directory, where they can be further used/examined after a model run. For CCAO employees, all outputs are uploaded to S3 via the [finalize -stage](pipeline/05-finalize.R). Uploaded Parquet files are converted +stage](pipeline/06-finalize.R). Uploaded Parquet files are converted into the following Athena tables: #### Athena Tables @@ -1257,9 +1275,9 @@ There are two lockfiles that we use with renv to manage R dependencies: to run the model itself should be defined in this lockfile. 2. **`renv/profiles/reporting/renv.lock`** is the canonical list of dependencies that are used to **generate a model performance - report** in the `finalize` step of the pipeline. Any dependencies - that are required to generate that report or others like it should - be defined in this lockfile. + report** in the `report` step of the pipeline. Any dependencies that + are required to generate that report or others like it should be + defined in this lockfile. Our goal in maintaining multiple lockfiles is to keep the list of dependencies that are required to run the model as short as possibile. diff --git a/dvc.yaml b/dvc.yaml index f4db9ab2..f57d61d1 100755 --- a/dvc.yaml +++ b/dvc.yaml @@ -121,12 +121,21 @@ stages: - output/intermediate/timing/model_timing_interpret.parquet: cache: false + report: + cmd: Rscript pipeline/05-report.R + desc: Render a performance report using Quarto. + deps: + - output/shap/model_shap.parquet + outs: + - reports/performance.html: + cache: false + finalize: - cmd: Rscript pipeline/05-finalize.R + cmd: Rscript pipeline/06-finalize.R desc: > - Save run timings, generate a report, upload pipeline run results to S3, - and send an SNS notification. Will also clean some of the generated - outputs prior to upload and attach a unique run ID + Save run timings, upload pipeline run results to S3, and send an SNS + notification. 
Will also clean some of the generated outputs prior to + upload and attach a unique run ID deps: - output/parameter_final/model_parameter_final.parquet - output/parameter_range/model_parameter_range.parquet @@ -146,6 +155,7 @@ stages: - output/intermediate/timing/model_timing_assess.parquet - output/intermediate/timing/model_timing_evaluate.parquet - output/intermediate/timing/model_timing_interpret.parquet + - reports/performance.html params: - run_note - run_type @@ -160,11 +170,9 @@ stages: cache: false - output/metadata/model_metadata.parquet: cache: false - - reports/performance.html: - cache: false export: - cmd: Rscript pipeline/06-export.R + cmd: Rscript pipeline/07-export.R desc: > Generate Desk Review spreadsheets and iasWorld upload CSVs from a finished run. NOT automatically run since it is typically only run once. Manually diff --git a/pipeline/05-report.R b/pipeline/05-report.R new file mode 100644 index 00000000..23def3bb --- /dev/null +++ b/pipeline/05-report.R @@ -0,0 +1,52 @@ +#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +# 1. Setup --------------------------------------------------------------------- +#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +# Wrap this script in a `try` block so that the pipeline continues execution +# even if report generation fails +tryCatch( + { + # Load libraries and scripts + suppressPackageStartupMessages({ + library(here) + library(magrittr) + library(quarto) + library(yaml) + }) + + # Load the parameters file containing the run settings + params <- read_yaml("params.yaml") + + + #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + # 2. Generate performance report ------------------------------------------- + #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + message("Generating performance report") + + here("reports", "performance.qmd") %>% + quarto_render( + execute_params = list( + year = params$assessment$year + ) + ) + }, + + + #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + # 3. Error handling ---------------------------------------------------------- + #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + error = function(c) { + # Print the error message + print("Error in report generation:") + print(conditionMessage(c)) + + # Save an empty report so that this pipeline step produces the required + # output + print("Saving an empty report file in order to continue execution") + sink(here("reports", "performance.html")) + cat("Error in report generation:") + cat() + cat(conditionMessage(c)) + sink() + } +) diff --git a/pipeline/05-finalize.R b/pipeline/06-finalize.R similarity index 94% rename from pipeline/05-finalize.R rename to pipeline/06-finalize.R index 91e15b5b..83593d63 100644 --- a/pipeline/05-finalize.R +++ b/pipeline/06-finalize.R @@ -17,7 +17,6 @@ suppressPackageStartupMessages({ library(lubridate) library(paws.application.integration) library(purrr) - library(quarto) library(tidyr) library(tune) library(yaml) @@ -205,23 +204,7 @@ tictoc::tic.clearlog() #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -# 4. 
Generate performance report ----------------------------------------------- -#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -message("Generating performance report") - -here("reports", "performance.qmd") %>% - quarto_render( - execute_params = list( - run_id = run_id, - year = params$assessment$year - ) - ) - - - - -#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -# 5. Upload -------------------------------------------------------------------- +# 4. Upload -------------------------------------------------------------------- #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - message("Uploading run artifacts") @@ -234,7 +217,7 @@ if (params$toggle$upload_to_s3) { ) - ## 5.1. Train ---------------------------------------------------------------- + ## 4.1. Train ---------------------------------------------------------------- # Upload lightgbm fit aws.s3::put_object( @@ -319,7 +302,7 @@ if (params$toggle$upload_to_s3) { } - # 5.2. Assess ---------------------------------------------------------------- + # 4.2. Assess ---------------------------------------------------------------- message("Uploading final assessment results") # Upload PIN and card-level values for full runs. These outputs are very @@ -347,7 +330,7 @@ if (params$toggle$upload_to_s3) { } - # 5.3. Evaluate -------------------------------------------------------------- + # 4.3. Evaluate -------------------------------------------------------------- # Upload test set performance message("Uploading test set evaluation") @@ -374,7 +357,7 @@ if (params$toggle$upload_to_s3) { } - # 5.4. Interpret ------------------------------------------------------------- + # 4.4. Interpret ------------------------------------------------------------- # Upload SHAP values if a full run. SHAP values are one row per card and one # column per feature, so the output is very large. Therefore, we partition @@ -402,7 +385,7 @@ if (params$toggle$upload_to_s3) { } - # 5.5. Finalize -------------------------------------------------------------- + # 4.5. Finalize -------------------------------------------------------------- message("Uploading run metadata, timings, and performance report") # Upload metadata @@ -428,7 +411,7 @@ if (params$toggle$upload_to_s3) { #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -# 6. Wrap-Up ------------------------------------------------------------------- +# 5. 
Wrap-Up ------------------------------------------------------------------- #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # This will run a Glue crawler to update schemas and send an email to any SNS diff --git a/pipeline/06-export.R b/pipeline/08-export.R similarity index 100% rename from pipeline/06-export.R rename to pipeline/08-export.R diff --git a/reports/performance.qmd b/reports/performance.qmd index d2cad7a3..3e5cbe75 100644 --- a/reports/performance.qmd +++ b/reports/performance.qmd @@ -1,5 +1,5 @@ --- -title: "Model performance: `r params$run_id`" +title: "Model performance for `r params$year`" execute: echo: false warning: false @@ -12,7 +12,6 @@ format: fontsize: 12pt editor: source params: - run_id: '2023-03-14-clever-damani' year: '2023' --- From d41f6b934283f9d2062c749564bb013189d4bd64 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Wed, 29 Nov 2023 16:21:28 -0600 Subject: [PATCH 14/27] Presign the Quarto report URL in 05-finalize.R --- pipeline/06-finalize.R | 22 +++++++++++++--------- renv.lock | 23 +++++++++++++++++------ 2 files changed, 30 insertions(+), 15 deletions(-) diff --git a/pipeline/06-finalize.R b/pipeline/06-finalize.R index 83593d63..5dd82af7 100644 --- a/pipeline/06-finalize.R +++ b/pipeline/06-finalize.R @@ -16,6 +16,7 @@ suppressPackageStartupMessages({ library(here) library(lubridate) library(paws.application.integration) + library(paws.storage) library(purrr) library(tidyr) library(tune) @@ -457,18 +458,21 @@ if (params$toggle$upload_to_s3) { .[!grepl("=", .)] %>% paste0(collapse = "\n") - # Get a link to the uploaded Quarto report + # Extract the path to the generated Quarto report so we can send a + # link to SNS topic consumers report_path_parts <- strsplit(paths$output$report$s3[1], "/")[[1]] report_bucket <- report_path_parts[3] - report_path <- report_path_parts[4:length(report_path_parts)] %>% + report_key <- report_path_parts[4:length(report_path_parts)] %>% paste(collapse = "/") - # Use direct link to the console instead of to the object so that we don't - # have to bother with signed URLs - report_url <- paste0( - "https://s3.console.aws.amazon.com/s3/object/", - "{report_bucket}/{report_path}?region=us-east-1&tab=overview" - ) %>% - glue::glue() + + # Presign the URL to the generated report so that consumers of the + # SNS email notification can click the link to download the file + report_url <- paws.storage::s3()$generate_presigned_url( + client_method = "get_object", + params = list(Bucket = report_bucket, Key = report_key), + expires_in = 3600 + ) + glue::glue("Report link: {report_url}") %>% message() # Publish to SNS pipeline_sns$publish( diff --git a/renv.lock b/renv.lock index 089b0da2..41242fc5 100644 --- a/renv.lock +++ b/renv.lock @@ -1215,27 +1215,27 @@ }, "paws.analytics": { "Package": "paws.analytics", - "Version": "0.3.0", + "Version": "0.4.0", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "paws.common" ], - "Hash": "f3a27b0314926120e8c333db7645351a" + "Hash": "0690ad4abdce9d81f1df66a4dcc98d6e" }, "paws.application.integration": { "Package": "paws.application.integration", - "Version": "0.3.1", + "Version": "0.4.0", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "paws.common" ], - "Hash": "bd24295846d31fbfba786d9560b34c6d" + "Hash": "31228082fcbb37d21007efa46f890d63" }, "paws.common": { "Package": "paws.common", - "Version": "0.5.8", + "Version": "0.6.4", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -1246,10 +1246,21 @@ "httr", 
"jsonlite", "methods", + "stats", "utils", "xml2" ], - "Hash": "35039630a878ed95aa7780ebe8f7e0bf" + "Hash": "fcc0e7509d0c9da0874b5d3a7d8ea904" + }, + "paws.storage": { + "Package": "paws.storage", + "Version": "0.4.0", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "paws.common" + ], + "Hash": "8aea3bee1ea38991f23aea27583b93ef" }, "pillar": { "Package": "pillar", From 626d34d440bcbe2f878021721421e48a429a274b Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Wed, 29 Nov 2023 16:24:50 -0600 Subject: [PATCH 15/27] Temporarily adjust Dockerfile CMD to test paws --- Dockerfile | 2 +- R/test_paws.R | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) create mode 100644 R/test_paws.R diff --git a/Dockerfile b/Dockerfile index 549c5736..b7062884 100644 --- a/Dockerfile +++ b/Dockerfile @@ -50,4 +50,4 @@ RUN mv renv model-res-avm/ # Set the working directory to the app dir WORKDIR model-res-avm/ -CMD dvc pull && dvc repro +CMD Rscript R/test_paws.R diff --git a/R/test_paws.R b/R/test_paws.R new file mode 100644 index 00000000..d74eb453 --- /dev/null +++ b/R/test_paws.R @@ -0,0 +1,14 @@ +library(glue) +library(magrittr) +library(paws.storage) + +s3 <- paws.storage::s3(config(signature_version = "v4", region = "us-east-1")) +report_url <- s3$generate_presigned_url( + client_method = "get_object", + params = list( + Bucket = "ccao-data-public-us-east-1", + Key = "reporting/old_ward.parquet" + ), + expires_in = 3600 +) +glue::glue("{report_url}") %>% message() From 1eba9caa4f9178eac589c29a219d7f2aee13fc41 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Thu, 30 Nov 2023 10:39:28 -0600 Subject: [PATCH 16/27] Revert "Temporarily adjust Dockerfile CMD to test paws" This reverts commit 626d34d440bcbe2f878021721421e48a429a274b. --- Dockerfile | 2 +- R/test_paws.R | 14 -------------- 2 files changed, 1 insertion(+), 15 deletions(-) delete mode 100644 R/test_paws.R diff --git a/Dockerfile b/Dockerfile index b7062884..549c5736 100644 --- a/Dockerfile +++ b/Dockerfile @@ -50,4 +50,4 @@ RUN mv renv model-res-avm/ # Set the working directory to the app dir WORKDIR model-res-avm/ -CMD Rscript R/test_paws.R +CMD dvc pull && dvc repro diff --git a/R/test_paws.R b/R/test_paws.R deleted file mode 100644 index d74eb453..00000000 --- a/R/test_paws.R +++ /dev/null @@ -1,14 +0,0 @@ -library(glue) -library(magrittr) -library(paws.storage) - -s3 <- paws.storage::s3(config(signature_version = "v4", region = "us-east-1")) -report_url <- s3$generate_presigned_url( - client_method = "get_object", - params = list( - Bucket = "ccao-data-public-us-east-1", - Key = "reporting/old_ward.parquet" - ), - expires_in = 3600 -) -glue::glue("{report_url}") %>% message() From dde61a69e3a9102a88555cceecfb693d45d66361 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Thu, 30 Nov 2023 10:39:44 -0600 Subject: [PATCH 17/27] Revert "Presign the Quarto report URL in 05-finalize.R" This reverts commit d41f6b934283f9d2062c749564bb013189d4bd64. 
--- pipeline/06-finalize.R | 22 +++++++++------------- renv.lock | 23 ++++++----------------- 2 files changed, 15 insertions(+), 30 deletions(-) diff --git a/pipeline/06-finalize.R b/pipeline/06-finalize.R index 5dd82af7..83593d63 100644 --- a/pipeline/06-finalize.R +++ b/pipeline/06-finalize.R @@ -16,7 +16,6 @@ suppressPackageStartupMessages({ library(here) library(lubridate) library(paws.application.integration) - library(paws.storage) library(purrr) library(tidyr) library(tune) @@ -458,21 +457,18 @@ if (params$toggle$upload_to_s3) { .[!grepl("=", .)] %>% paste0(collapse = "\n") - # Extract the path to the generated Quarto report so we can send a - # link to SNS topic consumers + # Get a link to the uploaded Quarto report report_path_parts <- strsplit(paths$output$report$s3[1], "/")[[1]] report_bucket <- report_path_parts[3] - report_key <- report_path_parts[4:length(report_path_parts)] %>% + report_path <- report_path_parts[4:length(report_path_parts)] %>% paste(collapse = "/") - - # Presign the URL to the generated report so that consumers of the - # SNS email notification can click the link to download the file - report_url <- paws.storage::s3()$generate_presigned_url( - client_method = "get_object", - params = list(Bucket = report_bucket, Key = report_key), - expires_in = 3600 - ) - glue::glue("Report link: {report_url}") %>% message() + # Use direct link to the console instead of to the object so that we don't + # have to bother with signed URLs + report_url <- paste0( + "https://s3.console.aws.amazon.com/s3/object/", + "{report_bucket}/{report_path}?region=us-east-1&tab=overview" + ) %>% + glue::glue() # Publish to SNS pipeline_sns$publish( diff --git a/renv.lock b/renv.lock index 41242fc5..089b0da2 100644 --- a/renv.lock +++ b/renv.lock @@ -1215,27 +1215,27 @@ }, "paws.analytics": { "Package": "paws.analytics", - "Version": "0.4.0", + "Version": "0.3.0", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "paws.common" ], - "Hash": "0690ad4abdce9d81f1df66a4dcc98d6e" + "Hash": "f3a27b0314926120e8c333db7645351a" }, "paws.application.integration": { "Package": "paws.application.integration", - "Version": "0.4.0", + "Version": "0.3.1", "Source": "Repository", "Repository": "RSPM", "Requirements": [ "paws.common" ], - "Hash": "31228082fcbb37d21007efa46f890d63" + "Hash": "bd24295846d31fbfba786d9560b34c6d" }, "paws.common": { "Package": "paws.common", - "Version": "0.6.4", + "Version": "0.5.8", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -1246,21 +1246,10 @@ "httr", "jsonlite", "methods", - "stats", "utils", "xml2" ], - "Hash": "fcc0e7509d0c9da0874b5d3a7d8ea904" - }, - "paws.storage": { - "Package": "paws.storage", - "Version": "0.4.0", - "Source": "Repository", - "Repository": "RSPM", - "Requirements": [ - "paws.common" - ], - "Hash": "8aea3bee1ea38991f23aea27583b93ef" + "Hash": "35039630a878ed95aa7780ebe8f7e0bf" }, "pillar": { "Package": "pillar", From 24cf22336093f115e5e574742e9ef97b92a37e14 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Thu, 30 Nov 2023 11:26:34 -0600 Subject: [PATCH 18/27] Factor S3/SNS operations out into new 06-upload.R stage --- README.Rmd | 18 +- dvc.yaml | 26 +-- pipeline/05-finalize.R | 235 ++++++++++++++++++++++++ pipeline/05-report.R | 52 ------ pipeline/{06-finalize.R => 06-upload.R} | 194 ++----------------- pipeline/{08-export.R => 07-export.R} | 0 pipeline/{07-api.R => 08-api.R} | 0 reports/performance.qmd | 3 +- 8 files changed, 273 insertions(+), 255 deletions(-) create mode 100644 pipeline/05-finalize.R delete 
mode 100644 pipeline/05-report.R rename pipeline/{06-finalize.R => 06-upload.R} (60%) rename pipeline/{08-export.R => 07-export.R} (100%) rename pipeline/{07-api.R => 08-api.R} (100%) diff --git a/README.Rmd b/README.Rmd index 1a488aca..4305a4a1 100644 --- a/README.Rmd +++ b/README.Rmd @@ -58,8 +58,8 @@ graph LR assess("Assess") evaluate("Evaluate") interpret("Interpret") - report("Report") finalize("Finalize") + upload("Upload") export("Export") ingest --> train @@ -67,10 +67,10 @@ graph LR train --> interpret assess --> evaluate evaluate --> finalize - interpret --> report - report --> finalize - finalize --> aws + interpret --> finalize + finalize --> upload finalize --> export + upload --> aws aws --> ingest aws --> export ``` @@ -89,9 +89,9 @@ All inputs and outputs are stored on AWS S3 using a unique run identifier. Each 4. **Interpret**: Calculate SHAP values for all the estimated values from the assess stage. These are the _per feature_ contribution to the predicted value for an _individual observation_ (usually a single PIN). Also calculate the aggregate feature importance for the entire model. The primary output of this stage is a data frame of the contributions of each feature for each property. -5. **Report**: Render a Quarto document containing a model performance report to `reports/performance.html`. +5. **Finalize**: Save run timings and metadata and render a Quarto document containing a model performance report to `reports/performance.html`. -6. **Finalize**: Add metadata and then upload all output objects to AWS (S3). All model outputs for every model run are stored in perpetuity in S3. Each run's performance can be visualized using the CCAO's internal Tableau dashboards. +6. **Upload**: Upload all output objects to AWS (S3). All model outputs for every model run are stored in perpetuity in S3. Each run's performance can be visualized using the CCAO's internal Tableau dashboards. NOTE: This stage is only run internally, since it requires access to the CCAO Data AWS account. 7. **Export**: Export assessed values to Desk Review spreadsheets for Valuations, as well as a delimited text format for upload to the system of record (iasWorld). NOTE: This stage is only run when a final model is selected. It is not run automatically or as part of the main pipeline. @@ -496,7 +496,7 @@ This repository represents a significant departure from the old [residential mod * Infrastructure improvements * Added [`build-and-run-model`](https://github.com/ccao-data/model-res-avm/actions/workflows/build-and-run-model.yaml) workflow to run the model using GitHub Actions and AWS Batch. * Added [`delete-model-run`](https://github.com/ccao-data/model-res-avm/actions/workflows/delete-model-runs.yaml) workflow to delete test run artifacts in S3 using GitHub Actions. - * Added [pipeline/05-report.R](pipeline/05-report.R) step to render a performance report using Quarto. + * Updated [pipeline/05-finalize](pipeline/05-finalize.R) step to render a performance report using Quarto and factored S3/SNS operations out into [pipeline/06-upload.R]. # Ongoing Issues @@ -639,7 +639,7 @@ For installation issues, particularly related to package installation and depend To use this repository, simply open the [pipeline/](./pipeline) directory and run the R scripts in order. Non-CCAO users can skip the following stages: * [`pipeline/00-ingest.R`](pipeline/00-ingest.R) - Requires access to CCAO internal AWS services to pull data. See [Getting Data](#getting-data) if you are a member of the public. 
-* [`pipeline/06-finalize.R`](pipeline/06-finalize.R) - Requires access to CCAO internal AWS services to upload model results. +* [`pipeline/06-upload.R`](pipeline/06-upload.R) - Requires access to CCAO internal AWS services to upload model results. * [`pipeline/07-export.R`](pipeline/07-export.R) - Only required for CCAO internal processes. #### Using DVC @@ -681,7 +681,7 @@ Each R script has a set of associated parameters (tracked via `dvc.yaml`). DVC w ## Output -The full model pipeline produces a large number of outputs. A full list of these outputs and their purpose can be found in [`misc/file_dict.csv`](misc/file_dict.csv). For public users, all outputs are saved in the [`output/`](output/) directory, where they can be further used/examined after a model run. For CCAO employees, all outputs are uploaded to S3 via the [finalize stage](pipeline/06-finalize.R). Uploaded Parquet files are converted into the following Athena tables: +The full model pipeline produces a large number of outputs. A full list of these outputs and their purpose can be found in [`misc/file_dict.csv`](misc/file_dict.csv). For public users, all outputs are saved in the [`output/`](output/) directory, where they can be further used/examined after a model run. For CCAO employees, all outputs are uploaded to S3 via the [upload stage](pipeline/06-upload). Uploaded Parquet files are converted into the following Athena tables: #### Athena Tables diff --git a/dvc.yaml b/dvc.yaml index f57d61d1..66cbe270 100755 --- a/dvc.yaml +++ b/dvc.yaml @@ -121,21 +121,11 @@ stages: - output/intermediate/timing/model_timing_interpret.parquet: cache: false - report: - cmd: Rscript pipeline/05-report.R - desc: Render a performance report using Quarto. - deps: - - output/shap/model_shap.parquet - outs: - - reports/performance.html: - cache: false - finalize: - cmd: Rscript pipeline/06-finalize.R + cmd: Rscript pipeline/05-finalize.R desc: > - Save run timings, upload pipeline run results to S3, and send an SNS - notification. Will also clean some of the generated outputs prior to - upload and attach a unique run ID + Save run timings and run metadata to disk and render a performance report + using Quarto. deps: - output/parameter_final/model_parameter_final.parquet - output/parameter_range/model_parameter_range.parquet @@ -171,6 +161,16 @@ stages: - output/metadata/model_metadata.parquet: cache: false + upload: + cmd: Rscript pipeline/06-upload.R + desc: > + Upload performance stats and report to S3, trigger Glue crawlers, and + publish to a model run SNS topic. Will also clean some of the generated + outputs prior to upload and attach a unique run ID. This step requires + access to the CCAO Data AWS account, and so is assumed to be internal-only + deps: + - output/metadata/model_metadata.parquet + export: cmd: Rscript pipeline/07-export.R desc: > diff --git a/pipeline/05-finalize.R b/pipeline/05-finalize.R new file mode 100644 index 00000000..ffeba8aa --- /dev/null +++ b/pipeline/05-finalize.R @@ -0,0 +1,235 @@ +#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +# 1. 
Setup --------------------------------------------------------------------- +#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +# Load libraries and scripts +suppressPackageStartupMessages({ + library(arrow) + library(ccao) + library(dplyr) + library(here) + library(lubridate) + library(purrr) + library(tidyr) + library(tune) + library(yaml) +}) +source(here("R", "helpers.R")) + +# Initialize a dictionary of file paths. See misc/file_dict.csv for details +paths <- model_file_dict() + +# Load the parameters file containing the run settings +params <- read_yaml("params.yaml") + +# Override CV toggle, SHAP toggle, and run_type, used for CI or limited runs +cv_enable <- as.logical( + Sys.getenv("CV_ENABLE_OVERRIDE", unset = params$toggle$cv_enable) +) +shap_enable <- as.logical( + Sys.getenv("SHAP_ENABLE_OVERRIDE", unset = params$toggle$shap_enable) +) +run_type <- as.character( + Sys.getenv("RUN_TYPE_OVERRIDE", unset = params$run_type) +) + + + + +#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +# 2. Save Metadata ------------------------------------------------------------- +#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +message("Saving run metadata") + +## 2.1. Run Info --------------------------------------------------------------- + +# Generate a random identifier for this run. This will serve as the primary key/ +# identifier for in perpetuity. See ?ccao_generate_id for details +run_id <- ccao::ccao_generate_id() + +# Get the current timestamp for when the run ended +run_end_timestamp <- lubridate::now() + +# Get the commit of the current reference +git_commit <- git2r::revparse_single(git2r::repository(), "HEAD") + +# For full runs, use the run note included in params.yaml, otherwise use the +# commit message +if (run_type == "full") { + run_note <- params$run_note +} else { + run_note <- gsub("\n", "", git_commit$message) +} + + +## 2.2. DVC Hashes ------------------------------------------------------------- + +# Read the MD5 hash of each input dataset. These are created by DVC and used to +# version and share the input data +dvc_md5_df <- bind_rows(read_yaml("dvc.lock")$stages$ingest$outs) %>% + mutate(path = paste0("dvc_md5_", gsub("input/|.parquet", "", path))) %>% + select(path, md5) %>% + pivot_wider(names_from = path, values_from = md5) + + +## 2.3. Parameters ------------------------------------------------------------- + +# Save most parameters from params.yaml to a metadata file, along with +# run info, git stuff, etc. 
+metadata <- tibble::tibble( + run_id = run_id, + run_end_timestamp = run_end_timestamp, + run_type = run_type, + run_note = run_note, + git_sha_short = substr(git_commit$sha, 1, 8), + git_sha_long = git_commit$sha, + git_message = gsub("\n", "", git_commit$message), + git_author = git_commit$author$name, + git_email = git_commit$author$email, + assessment_year = params$assessment$year, + assessment_date = params$assessment$date, + assessment_triad = params$assessment$triad, + assessment_group = params$assessment$group, + assessment_data_year = params$assessment$data_year, + input_min_sale_year = params$input$min_sale_year, + input_max_sale_year = params$input$max_sale_year, + input_complex_match_exact = list(params$input$complex$match_exact), + input_complex_match_fuzzy_name = list( + names(params$input$complex$match_fuzzy) + ), + input_complex_match_fuzzy_value = list( + as.numeric(params$input$complex$match_fuzzy) + ), + input_sale_validation_stat_groups = list( + params$input$sale_validation$stat_groups + ), + input_sale_validation_iso_forest = list( + params$input$sale_validation$iso_forest + ), + input_sale_validation_dev_bounds = list( + params$input$sale_validation$dev_bounds + ), + ratio_study_far_year = params$ratio_study$far_year, + ratio_study_far_stage = params$ratio_study$far_stage, + ratio_study_far_column = params$ratio_study$far_column, + ratio_study_near_year = params$ratio_study$near_year, + ratio_study_near_stage = params$ratio_study$near_stage, + ratio_study_near_column = params$ratio_study$near_column, + ratio_study_num_quantile = list(params$ratio_study$num_quantile), + shap_enable = shap_enable, + cv_enable = cv_enable, + cv_num_folds = params$cv$num_folds, + cv_initial_set = params$cv$initial_set, + cv_max_iterations = params$cv$max_iterations, + cv_no_improve = params$cv$no_improve, + cv_split_prop = params$cv$split_prop, + cv_best_metric = params$cv$best_metric, + pv_multicard_yoy_cap = params$pv$multicard_yoy_cap, + pv_land_pct_of_total_cap = params$pv$land_pct_of_total_cap, + pv_round_break = list(params$pv$round_break), + pv_round_to_nearest = list(params$pv$round_to_nearest), + pv_round_type = params$pv$round_type, + model_predictor_id_count = length(params$model$predictor$id), + model_predictor_id_name = list(params$model$predictor$id), + model_predictor_all_count = length(params$model$predictor$all), + model_predictor_all_name = list(params$model$predictor$all), + model_predictor_categorical_count = + length(params$model$predictor$categorical), + model_predictor_categorical_name = list(params$model$predictor$categorical) +) %>% + bind_cols(dvc_md5_df) %>% + relocate( + starts_with("dvc_id_"), + .after = "input_complex_match_fuzzy_value" + ) %>% + arrow::write_parquet(paths$output$metadata$local) + + + + +#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +# 3. 
Save Timings -------------------------------------------------------------- +#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +message("Saving run timings") + +# Filter ensure we only get timing files for stages that actually ran +if (run_type == "full") { + timings <- list.files( + paste0(paths$intermediate$timing, "/"), + full.names = TRUE + ) +} else { + timings <- list.files( + paste0(paths$intermediate$timing, "/"), + pattern = "train|evaluate", + full.names = TRUE + ) +} + +# Convert the intermediate timing logs to a wide data frame, then save to file +timings_df <- purrr::map_dfr(timings, read_parquet) %>% + mutate( + run_id = run_id, + run_end_timestamp = run_end_timestamp, + elapsed = round(toc - tic, 2), + stage = paste0(tolower(stringr::word(msg, 1)), "_sec_elapsed"), + order = recode( + msg, + "Train" = "01", "Assess" = "02", + "Evaluate" = "03", "Interpret" = "04" + ) + ) %>% + arrange(order) %>% + select(-c(tic:toc, msg)) %>% + tidyr::pivot_wider( + id_cols = c(run_id, run_end_timestamp), + names_from = stage, + values_from = elapsed + ) %>% + mutate(overall_sec_elapsed = rowSums(across(ends_with("_sec_elapsed")))) %>% + mutate(across(ends_with("_sec_elapsed"), function(x) round(x, 2))) %>% + write_parquet(paths$output$timing$local) + +# Clear any remaining logs from tictoc +tictoc::tic.clearlog() + + + + +#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +# 4. Generate performance report ----------------------------------------------- +#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +# Wrap this block in an error handler so that the pipeline continues execution +# even if report generation fails. This is important because the report file is +# defined separately, so this script can't be sure that it is error-free, and +# +tryCatch( + { + suppressPackageStartupMessages({ + library(quarto) + }) + + message("Generating performance report") + + here("reports", "performance.qmd") %>% + quarto_render( + execute_params = list( + run_id = run_id, + year = params$assessment$year + ) + ) + }, + error = function(func) { + message("Encountered error during report generation:") + message(conditionMessage(func)) + + # Save an empty report so that this pipeline step produces the required + # output even in cases of failure + message("Saving an empty report file in order to continue execution") + sink(here("reports", "performance.html")) + cat("Encountered error in report generation:\n\n") + cat(conditionMessage(func)) + sink() + } +) diff --git a/pipeline/05-report.R b/pipeline/05-report.R deleted file mode 100644 index 23def3bb..00000000 --- a/pipeline/05-report.R +++ /dev/null @@ -1,52 +0,0 @@ -#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -# 1. Setup --------------------------------------------------------------------- -#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -# Wrap this script in a `try` block so that the pipeline continues execution -# even if report generation fails -tryCatch( - { - # Load libraries and scripts - suppressPackageStartupMessages({ - library(here) - library(magrittr) - library(quarto) - library(yaml) - }) - - # Load the parameters file containing the run settings - params <- read_yaml("params.yaml") - - - #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 2. 
Generate performance report ------------------------------------------- - #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - message("Generating performance report") - - here("reports", "performance.qmd") %>% - quarto_render( - execute_params = list( - year = params$assessment$year - ) - ) - }, - - - #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 3. Error handling ---------------------------------------------------------- - #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - error = function(c) { - # Print the error message - print("Error in report generation:") - print(conditionMessage(c)) - - # Save an empty report so that this pipeline step produces the required - # output - print("Saving an empty report file in order to continue execution") - sink(here("reports", "performance.html")) - cat("Error in report generation:") - cat() - cat(conditionMessage(c)) - sink() - } -) diff --git a/pipeline/06-finalize.R b/pipeline/06-upload.R similarity index 60% rename from pipeline/06-finalize.R rename to pipeline/06-upload.R index 83593d63..319b10f6 100644 --- a/pipeline/06-finalize.R +++ b/pipeline/06-upload.R @@ -10,15 +10,14 @@ suppressPackageStartupMessages({ library(arrow) library(aws.s3) library(aws.ec2metadata) - library(ccao) library(dplyr) library(glue) library(here) + library(knitr) library(lubridate) + library(paws.analytics) library(paws.application.integration) - library(purrr) library(tidyr) - library(tune) library(yaml) }) source(here("R", "helpers.R")) @@ -29,182 +28,17 @@ paths <- model_file_dict() # Load the parameters file containing the run settings params <- read_yaml("params.yaml") -# Override CV toggle, SHAP toggle, and run_type, used for CI or limited runs -cv_enable <- as.logical( - Sys.getenv("CV_ENABLE_OVERRIDE", unset = params$toggle$cv_enable) -) -shap_enable <- as.logical( - Sys.getenv("SHAP_ENABLE_OVERRIDE", unset = params$toggle$shap_enable) -) -run_type <- as.character( - Sys.getenv("RUN_TYPE_OVERRIDE", unset = params$run_type) -) +# Load CV toggle, SHAP toggle, and run_type as defined in the `finalize` step +metadata <- read_parquet(paths$output$metadata$local) +cv_enable <- metadata$cv_enable +shap_enable <- metadata$shap_enable +run_type <- metadata$run_type #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -# 2. Save Metadata ------------------------------------------------------------- -#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -message("Saving run metadata") - -## 2.1. Run Info --------------------------------------------------------------- - -# Generate a random identifier for this run. This will serve as the primary key/ -# identifier for in perpetuity. See ?ccao_generate_id for details -run_id <- ccao::ccao_generate_id() - -# Get the current timestamp for when the run ended -run_end_timestamp <- lubridate::now() - -# Get the commit of the current reference -git_commit <- git2r::revparse_single(git2r::repository(), "HEAD") - -# For full runs, use the run note included in params.yaml, otherwise use the -# commit message -if (run_type == "full") { - run_note <- params$run_note -} else { - run_note <- gsub("\n", "", git_commit$message) -} - - -## 2.2. DVC Hashes ------------------------------------------------------------- - -# Read the MD5 hash of each input dataset. 
These are created by DVC and used to -# version and share the input data -dvc_md5_df <- bind_rows(read_yaml("dvc.lock")$stages$ingest$outs) %>% - mutate(path = paste0("dvc_md5_", gsub("input/|.parquet", "", path))) %>% - select(path, md5) %>% - pivot_wider(names_from = path, values_from = md5) - - -## 2.3. Parameters ------------------------------------------------------------- - -# Save most parameters from params.yaml to a metadata file, along with -# run info, git stuff, etc. -metadata <- tibble::tibble( - run_id = run_id, - run_end_timestamp = run_end_timestamp, - run_type = run_type, - run_note = run_note, - git_sha_short = substr(git_commit$sha, 1, 8), - git_sha_long = git_commit$sha, - git_message = gsub("\n", "", git_commit$message), - git_author = git_commit$author$name, - git_email = git_commit$author$email, - assessment_year = params$assessment$year, - assessment_date = params$assessment$date, - assessment_triad = params$assessment$triad, - assessment_group = params$assessment$group, - assessment_data_year = params$assessment$data_year, - input_min_sale_year = params$input$min_sale_year, - input_max_sale_year = params$input$max_sale_year, - input_complex_match_exact = list(params$input$complex$match_exact), - input_complex_match_fuzzy_name = list( - names(params$input$complex$match_fuzzy) - ), - input_complex_match_fuzzy_value = list( - as.numeric(params$input$complex$match_fuzzy) - ), - input_sale_validation_stat_groups = list( - params$input$sale_validation$stat_groups - ), - input_sale_validation_iso_forest = list( - params$input$sale_validation$iso_forest - ), - input_sale_validation_dev_bounds = list( - params$input$sale_validation$dev_bounds - ), - ratio_study_far_year = params$ratio_study$far_year, - ratio_study_far_stage = params$ratio_study$far_stage, - ratio_study_far_column = params$ratio_study$far_column, - ratio_study_near_year = params$ratio_study$near_year, - ratio_study_near_stage = params$ratio_study$near_stage, - ratio_study_near_column = params$ratio_study$near_column, - ratio_study_num_quantile = list(params$ratio_study$num_quantile), - shap_enable = shap_enable, - cv_enable = cv_enable, - cv_num_folds = params$cv$num_folds, - cv_initial_set = params$cv$initial_set, - cv_max_iterations = params$cv$max_iterations, - cv_no_improve = params$cv$no_improve, - cv_split_prop = params$cv$split_prop, - cv_best_metric = params$cv$best_metric, - pv_multicard_yoy_cap = params$pv$multicard_yoy_cap, - pv_land_pct_of_total_cap = params$pv$land_pct_of_total_cap, - pv_round_break = list(params$pv$round_break), - pv_round_to_nearest = list(params$pv$round_to_nearest), - pv_round_type = params$pv$round_type, - model_predictor_id_count = length(params$model$predictor$id), - model_predictor_id_name = list(params$model$predictor$id), - model_predictor_all_count = length(params$model$predictor$all), - model_predictor_all_name = list(params$model$predictor$all), - model_predictor_categorical_count = - length(params$model$predictor$categorical), - model_predictor_categorical_name = list(params$model$predictor$categorical) -) %>% - bind_cols(dvc_md5_df) %>% - relocate( - starts_with("dvc_id_"), - .after = "input_complex_match_fuzzy_value" - ) %>% - arrow::write_parquet(paths$output$metadata$local) - - - - -#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -# 3. 
Save Timings -------------------------------------------------------------- -#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -message("Saving run timings") - -# Filter ensure we only get timing files for stages that actually ran -if (run_type == "full") { - timings <- list.files( - paste0(paths$intermediate$timing, "/"), - full.names = TRUE - ) -} else { - timings <- list.files( - paste0(paths$intermediate$timing, "/"), - pattern = "train|evaluate", - full.names = TRUE - ) -} - -# Convert the intermediate timing logs to a wide data frame, then save to file -timings_df <- purrr::map_dfr(timings, read_parquet) %>% - mutate( - run_id = run_id, - run_end_timestamp = run_end_timestamp, - elapsed = round(toc - tic, 2), - stage = paste0(tolower(stringr::word(msg, 1)), "_sec_elapsed"), - order = recode( - msg, - "Train" = "01", "Assess" = "02", - "Evaluate" = "03", "Interpret" = "04" - ) - ) %>% - arrange(order) %>% - select(-c(tic:toc, msg)) %>% - tidyr::pivot_wider( - id_cols = c(run_id, run_end_timestamp), - names_from = stage, - values_from = elapsed - ) %>% - mutate(overall_sec_elapsed = rowSums(across(ends_with("_sec_elapsed")))) %>% - mutate(across(ends_with("_sec_elapsed"), function(x) round(x, 2))) %>% - write_parquet(paths$output$timing$local) - -# Clear any remaining logs from tictoc -tictoc::tic.clearlog() - - - - -#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -# 4. Upload -------------------------------------------------------------------- +# 2. Upload -------------------------------------------------------------------- #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - message("Uploading run artifacts") @@ -217,7 +51,7 @@ if (params$toggle$upload_to_s3) { ) - ## 4.1. Train ---------------------------------------------------------------- + ## 2.1. Train ---------------------------------------------------------------- # Upload lightgbm fit aws.s3::put_object( @@ -302,7 +136,7 @@ if (params$toggle$upload_to_s3) { } - # 4.2. Assess ---------------------------------------------------------------- + # 2.2. Assess ---------------------------------------------------------------- message("Uploading final assessment results") # Upload PIN and card-level values for full runs. These outputs are very @@ -330,7 +164,7 @@ if (params$toggle$upload_to_s3) { } - # 4.3. Evaluate -------------------------------------------------------------- + # 2.3. Evaluate -------------------------------------------------------------- # Upload test set performance message("Uploading test set evaluation") @@ -357,7 +191,7 @@ if (params$toggle$upload_to_s3) { } - # 4.4. Interpret ------------------------------------------------------------- + # 2.4. Interpret ------------------------------------------------------------- # Upload SHAP values if a full run. SHAP values are one row per card and one # column per feature, so the output is very large. Therefore, we partition @@ -385,7 +219,7 @@ if (params$toggle$upload_to_s3) { } - # 4.5. Finalize -------------------------------------------------------------- + # 2.5. Finalize -------------------------------------------------------------- message("Uploading run metadata, timings, and performance report") # Upload metadata @@ -411,7 +245,7 @@ if (params$toggle$upload_to_s3) { #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -# 5. Wrap-Up ------------------------------------------------------------------- +# 3. 
Crawl and notify ---------------------------------------------------------- #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # This will run a Glue crawler to update schemas and send an email to any SNS diff --git a/pipeline/08-export.R b/pipeline/07-export.R similarity index 100% rename from pipeline/08-export.R rename to pipeline/07-export.R diff --git a/pipeline/07-api.R b/pipeline/08-api.R similarity index 100% rename from pipeline/07-api.R rename to pipeline/08-api.R diff --git a/reports/performance.qmd b/reports/performance.qmd index 3e5cbe75..88d2f510 100644 --- a/reports/performance.qmd +++ b/reports/performance.qmd @@ -1,5 +1,5 @@ --- -title: "Model performance for `r params$year`" +title: "Model performance for `r params$run_id`" execute: echo: false warning: false @@ -12,6 +12,7 @@ format: fontsize: 12pt editor: source params: + run_id: 2023-03-14-clever-damani year: '2023' --- From 6772f45feb5cae5c2da498d2b6530d2b2dd57218 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Thu, 30 Nov 2023 21:34:51 +0000 Subject: [PATCH 19/27] Fix typo in README.Rmd and regenerate README --- README.Rmd | 4 ++-- README.md | 51 ++++++++++++++++++++++++++++----------------------- 2 files changed, 30 insertions(+), 25 deletions(-) diff --git a/README.Rmd b/README.Rmd index 4305a4a1..c92d0255 100644 --- a/README.Rmd +++ b/README.Rmd @@ -628,7 +628,7 @@ We also publish a Docker image containing model code and all of the dependencies 2. Set your working directory to the local folder containing this repository's files, either using R's `setwd()` command or (preferably) using RStudio's [projects](https://support.posit.co/hc/en-us/articles/200526207-Using-Projects). 3. Install `renv`, R's package manager, by running `install.packages("renv")`. 4. Install all R package dependencies using `renv` by running `renv::restore()`. This step may take awhile. Linux users will likely need to install dependencies (via apt, yum, etc.) to build from source. -5. The `report` step of the model pipeline requires some additional dependencies for generating a model performance report. Install these additional dependencies by running `renv::restore(lockfile = "renv/profiles/reporting/renv.lock")`. These dependencies must be installed in addition to the core dependencies installed in step 4. +5. The `finalize` step of the model pipeline requires some additional dependencies for generating a model performance report. Install these additional dependencies by running `renv::restore(lockfile = "renv/profiles/reporting/renv.lock")`. These dependencies must be installed in addition to the core dependencies installed in step 4. If dependencies are not installed, the report will fail to generate and the pipeline stage will print the error message to the report file at `reports/performance.html`; the pipeline will continue to execute in spite of the failure. For installation issues, particularly related to package installation and dependencies, see [Troubleshooting](#troubleshooting). @@ -762,7 +762,7 @@ Both [Tidymodels](https://tune.tidymodels.org/articles/extras/optimizations.html There are two lockfiles that we use with renv to manage R dependencies: 1. **`renv.lock`** is the canonical list of dependencies that are used by the **core model pipeline**. Any dependencies that are required to run the model itself should be defined in this lockfile. -2. 
**`renv/profiles/reporting/renv.lock`** is the canonical list of dependencies that are used to **generate a model performance report** in the `report` step of the pipeline. Any dependencies that are required to generate that report or others like it should be defined in this lockfile. +2. **`renv/profiles/reporting/renv.lock`** is the canonical list of dependencies that are used to **generate a model performance report** in the `finalize` step of the pipeline. Any dependencies that are required to generate that report or others like it should be defined in this lockfile. Our goal in maintaining multiple lockfiles is to keep the list of dependencies that are required to run the model as short as possibile. This choice adds overhead to the process of updating R dependencies, but incurs the benefit of a more maintainable model over the long term. diff --git a/README.md b/README.md index 4af3fc63..36bee5a9 100644 --- a/README.md +++ b/README.md @@ -101,8 +101,8 @@ graph LR assess("Assess") evaluate("Evaluate") interpret("Interpret") - report("Report") finalize("Finalize") + upload("Upload") export("Export") ingest --> train @@ -110,10 +110,10 @@ graph LR train --> interpret assess --> evaluate evaluate --> finalize - interpret --> report - report --> finalize - finalize --> aws + interpret --> finalize + finalize --> upload finalize --> export + upload --> aws aws --> ingest aws --> export ``` @@ -160,13 +160,15 @@ stand-alone script) or as part of the overall pipeline (with entire model. The primary output of this stage is a data frame of the contributions of each feature for each property. -5. **Report**: Render a Quarto document containing a model performance - report to `reports/performance.html`. +5. **Finalize**: Save run timings and metadata and render a Quarto + document containing a model performance report to + `reports/performance.html`. -6. **Finalize**: Add metadata and then upload all output objects to AWS - (S3). All model outputs for every model run are stored in perpetuity - in S3. Each run’s performance can be visualized using the CCAO’s - internal Tableau dashboards. +6. **Upload**: Upload all output objects to AWS (S3). All model outputs + for every model run are stored in perpetuity in S3. Each run’s + performance can be visualized using the CCAO’s internal Tableau + dashboards. NOTE: This stage is only run internally, since it + requires access to the CCAO Data AWS account. 7. **Export**: Export assessed values to Desk Review spreadsheets for Valuations, as well as a delimited text format for upload to the @@ -338,7 +340,7 @@ districts](https://gitlab.com/ccao-data-science---modeling/models/ccao_res_avm/- and many others. The features in the table below are the ones that made the cut. They’re the right combination of easy to understand and impute, powerfully predictive, and well-behaved. Most of them are in use in the -model as of 2023-11-29. +model as of 2023-11-30. 
| Feature Name | Category | Type | Possible Values | Notes | |:------------------------------------------------------------------------|:---------------|:------------|:-----------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| @@ -775,8 +777,9 @@ the following major changes to the residential modeling codebase: - Added [`delete-model-run`](https://github.com/ccao-data/model-res-avm/actions/workflows/delete-model-runs.yaml) workflow to delete test run artifacts in S3 using GitHub Actions. - - Added [pipeline/05-report.R](pipeline/05-report.R) step to render a - performance report using Quarto. + - Updated [pipeline/05-finalize](pipeline/05-finalize.R) step to + render a performance report using Quarto and factored S3/SNS + operations out into \[pipeline/06-upload.R\]. # Ongoing Issues @@ -1060,12 +1063,15 @@ of the model. `renv::restore()`. This step may take awhile. Linux users will likely need to install dependencies (via apt, yum, etc.) to build from source. -5. The `report` step of the model pipeline requires some additional +5. The `finalize` step of the model pipeline requires some additional dependencies for generating a model performance report. Install these additional dependencies by running `renv::restore(lockfile = "renv/profiles/reporting/renv.lock")`. These dependencies must be installed in addition to the core - dependencies installed in step 4. + dependencies installed in step 4. If dependencies are not installed, + the report will fail to generate and the pipeline stage will print + the error message to the report file at `reports/performance.html`; + the pipeline will continue to execute in spite of the failure. For installation issues, particularly related to package installation and dependencies, see [Troubleshooting](#troubleshooting). @@ -1081,8 +1087,8 @@ following stages: - [`pipeline/00-ingest.R`](pipeline/00-ingest.R) - Requires access to CCAO internal AWS services to pull data. See [Getting Data](#getting-data) if you are a member of the public. -- [`pipeline/06-finalize.R`](pipeline/06-finalize.R) - Requires access - to CCAO internal AWS services to upload model results. +- [`pipeline/06-upload.R`](pipeline/06-upload.R) - Requires access to + CCAO internal AWS services to upload model results. - [`pipeline/07-export.R`](pipeline/07-export.R) - Only required for CCAO internal processes. @@ -1146,9 +1152,8 @@ of these outputs and their purpose can be found in [`misc/file_dict.csv`](misc/file_dict.csv). For public users, all outputs are saved in the [`output/`](output/) directory, where they can be further used/examined after a model run. For CCAO employees, all -outputs are uploaded to S3 via the [finalize -stage](pipeline/06-finalize.R). Uploaded Parquet files are converted -into the following Athena tables: +outputs are uploaded to S3 via the [upload stage](pipeline/06-upload). 
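For public users working without AWS access, the same information that feeds these tables can be inspected locally once a run finishes. A minimal sketch, assuming the default output paths tracked in `dvc.yaml`:

```r
# Sketch: inspect a finished local run. Paths follow the default output layout
# tracked in dvc.yaml; available columns vary with the run type.
library(arrow)
library(dplyr)

metadata <- read_parquet("output/metadata/model_metadata.parquet")
timing <- read_parquet("output/timing/model_timing.parquet")

# Run identifiers and settings recorded by the finalize stage
metadata %>%
  select(run_id, run_type, run_note, run_end_timestamp) %>%
  glimpse()

# Per-stage elapsed time in seconds
glimpse(timing)
```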
+Uploaded Parquet files are converted into the following Athena tables: #### Athena Tables @@ -1275,9 +1280,9 @@ There are two lockfiles that we use with renv to manage R dependencies: to run the model itself should be defined in this lockfile. 2. **`renv/profiles/reporting/renv.lock`** is the canonical list of dependencies that are used to **generate a model performance - report** in the `report` step of the pipeline. Any dependencies that - are required to generate that report or others like it should be - defined in this lockfile. + report** in the `finalize` step of the pipeline. Any dependencies + that are required to generate that report or others like it should + be defined in this lockfile. Our goal in maintaining multiple lockfiles is to keep the list of dependencies that are required to run the model as short as possibile. From ec1c35f91373bcaf9ab36c2326cace433a962280 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Thu, 30 Nov 2023 15:35:36 -0600 Subject: [PATCH 20/27] Fix mixed up deps/outputs between finalize and upload stages --- dvc.yaml | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/dvc.yaml b/dvc.yaml index 66cbe270..30ae1dff 100755 --- a/dvc.yaml +++ b/dvc.yaml @@ -127,25 +127,10 @@ stages: Save run timings and run metadata to disk and render a performance report using Quarto. deps: - - output/parameter_final/model_parameter_final.parquet - - output/parameter_range/model_parameter_range.parquet - - output/parameter_search/model_parameter_search.parquet - - output/workflow/fit/model_workflow_fit.zip - - output/workflow/recipe/model_workflow_recipe.rds - - output/test_card/model_test_card.parquet - - output/assessment_card/model_assessment_card.parquet - - output/assessment_pin/model_assessment_pin.parquet - - output/performance/model_performance_test.parquet - - output/performance_quantile/model_performance_quantile_test.parquet - - output/performance/model_performance_assessment.parquet - - output/performance_quantile/model_performance_quantile_assessment.parquet - - output/shap/model_shap.parquet - - output/feature_importance/model_feature_importance.parquet - output/intermediate/timing/model_timing_train.parquet - output/intermediate/timing/model_timing_assess.parquet - output/intermediate/timing/model_timing_evaluate.parquet - output/intermediate/timing/model_timing_interpret.parquet - - reports/performance.html params: - run_note - run_type @@ -160,6 +145,8 @@ stages: cache: false - output/metadata/model_metadata.parquet: cache: false + - reports/performance.html: + cache: false upload: cmd: Rscript pipeline/06-upload.R @@ -169,7 +156,23 @@ stages: outputs prior to upload and attach a unique run ID. 
This step requires access to the CCAO Data AWS account, and so is assumed to be internal-only deps: + - output/parameter_final/model_parameter_final.parquet + - output/parameter_range/model_parameter_range.parquet + - output/parameter_search/model_parameter_search.parquet + - output/workflow/fit/model_workflow_fit.zip + - output/workflow/recipe/model_workflow_recipe.rds + - output/test_card/model_test_card.parquet + - output/assessment_card/model_assessment_card.parquet + - output/assessment_pin/model_assessment_pin.parquet + - output/performance/model_performance_test.parquet + - output/performance_quantile/model_performance_quantile_test.parquet + - output/performance/model_performance_assessment.parquet + - output/performance_quantile/model_performance_quantile_assessment.parquet + - output/shap/model_shap.parquet + - output/feature_importance/model_feature_importance.parquet - output/metadata/model_metadata.parquet + - output/timing/model_timing.parquet + - reports/performance.html export: cmd: Rscript pipeline/07-export.R From ae750d5837a7a903eb095b5732e2a2161d2a1519 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 1 Dec 2023 15:57:40 +0000 Subject: [PATCH 21/27] Add missing run_id variable to upload pipeline stage --- pipeline/06-upload.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pipeline/06-upload.R b/pipeline/06-upload.R index 319b10f6..c8db021e 100644 --- a/pipeline/06-upload.R +++ b/pipeline/06-upload.R @@ -28,10 +28,11 @@ paths <- model_file_dict() # Load the parameters file containing the run settings params <- read_yaml("params.yaml") -# Load CV toggle, SHAP toggle, and run_type as defined in the `finalize` step +# Load various overridden parameters as defined in the `finalize` step metadata <- read_parquet(paths$output$metadata$local) cv_enable <- metadata$cv_enable shap_enable <- metadata$shap_enable +run_id <- metadata$run_id run_type <- metadata$run_type From 31408249b37673df55240335605c925d061719a9 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 1 Dec 2023 17:38:06 +0000 Subject: [PATCH 22/27] Partition Quarto performance report S3 uploads by year --- misc/file_dict.csv | 50 +++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/misc/file_dict.csv b/misc/file_dict.csv index 7100bb08..6826ce75 100644 --- a/misc/file_dict.csv +++ b/misc/file_dict.csv @@ -1,25 +1,25 @@ -type,name,stage_number,stage_name,s3_bucket,path_local,path_s3,athena_table,observation_unit,primary_key,run_type_limited,run_type_full,description,notes -input,training,0,ingest,ccao-data-dvc-us-east-1,input/training_data.parquet,,,pin,"year, meta_pin",No,No,Sales data used to train the model,"Excludes any PINs with multiple cards, therefore unit of observation is PIN" -input,assessment,0,ingest,ccao-data-dvc-us-east-1,input/assessment_data.parquet,,,card,"year, meta_pin, meta_card_num",No,No,Universe of properties that need assessed values,"Card-level, rather than PIN-level" -input,complex_id,0,ingest,ccao-data-dvc-us-east-1,input/complex_id_data.parquet,,,pin,"year, meta_pin",No,No,Programmatically created townhome complex IDs,"Purpose is to assign townhomes in the same ""complex"" the same value. Only run as-needed i.e. 
as little as possible" -input,land_site_rate,0,ingest,ccao-data-dvc-us-east-1,input/land_site_rate_data.parquet,,,pin,"year, meta_pin",No,No,Site-specific flat land values provided by Valuations,Applies to class 210 and 295 properties only -input,land_nbhd_rate,0,ingest,ccao-data-dvc-us-east-1,input/land_nbhd_rate_data.parquet,,,nbhd,"year, meta_nbhd",No,No,Neighborhood-level land rates (per sqft) provided by Valuations,Applies to all properties (except those with a PIN-specific flat value) -output,parameter_search,1,train,ccao-model-results-us-east-1,output/parameter_search/model_parameter_search.parquet,parameter_search/year={year}/{run_id}.parquet,parameter_search,model cv fold,"year, run_id, configuration, fold_id",No,If CV enabled,Tidymodels tuning output from cross-validation,Each row is the result from one fold assessment from one iteration -output,parameter_raw,1,train,ccao-model-results-us-east-1,output/parameter_search/model_parameter_search.parquet,parameter_raw/year={year}/{run_id}.parquet,,model cv fold,"year, run_id, configuration, fold_id",No,If CV enabled,"Raw, nested Tidymodels tuning output",Not useful in Athena but needed to make models reproducible -output,parameter_final,1,train,ccao-model-results-us-east-1,output/parameter_final/model_parameter_final.parquet,parameter_final/year={year}/{run_id}.parquet,parameter_final,model run,"year, run_id",Yes,Yes,Chosen set of hyperparameters for each run,"As chosen by tune::select_best() if using CV, otherwise the default set of hyperparameters specified in params.yaml (model.hyperparameter.default)" -output,parameter_range,1,train,ccao-model-results-us-east-1,output/parameter_range/model_parameter_range.parquet,parameter_range/year={year}/{run_id}.parquet,parameter_range,parameter,"year, run_id, parameter_name",No,If CV enabled,Range of hyperparameters searched during CV tuning,As specified in params.yaml (model.hyperparameter.range) -output,test_card,1,train,ccao-model-results-us-east-1,output/test_card/model_test_card.parquet,test_card/year={year}/{run_id}.parquet,test_card,card,"year, meta_pin, meta_card_num",Yes,Yes,Test set predictions at the card level,Only includes the minimal variables necessary to perform evaluation -output,workflow_fit,1,train,ccao-model-results-us-east-1,output/workflow/fit/model_workflow_fit.zip,workflow/fit/year={year}/{run_id}.zip,,model run,,Yes,Yes,Trained LightGBM model object + Tidymodels specification,Can be loaded with lightsnip::lgbm_load() to produce predictions using new data -output,workflow_recipe,1,train,ccao-model-results-us-east-1,output/workflow/recipe/model_workflow_recipe.rds,workflow/recipe/year={year}/{run_id}.rds,,model run,,Yes,Yes,Trained Tidymodels recipe object used for data preprocessing,Can be used to prepare new data in the same way as the original model training -output,assessment_card,2,assess,ccao-model-results-us-east-1,output/assessment_card/model_assessment_card.parquet,assessment_card/,assessment_card,card,"year, run_id, township_code, meta_pin, meta_card_num",No,Yes,Assessment results at the card level AKA raw model output,Also includes card-level characteristics. Multi-card PINs will have more than one row. 
NOTE: Each run adds new partitions to S3 which must be added via a Glue crawler -output,assessment_pin,2,assess,ccao-model-results-us-east-1,output/assessment_pin/model_assessment_pin.parquet,assessment_pin/,assessment_pin,pin,"year, run_id, township_code, meta_pin",No,Yes,Assessment results at the PIN level AKA aggregated and cleaned,"Aggregation depends on PIN specifics, see assess script for details. Includes PIN-level stats like YoY % changes, land, sales, etc. NOTE: Each run adds new partitions to S3 which must be added via a Glue crawler" -output,performance_test,3,evaluate,ccao-model-results-us-east-1,output/performance/model_performance_test.parquet,performance/year={year}/stage=test/{run_id}.parquet,performance,geography [by class],"year, run_id, stage, geography_type, geography_id, by_class, class",Test only,Test + assessment,Peformance metrics (optionally) broken out by class for different levels of geography,Test set includes the most recent 10% of sales -output,performance_assessment,3,evaluate,ccao-model-results-us-east-1,output/performance/model_performance_assessment.parquet,performance/year={year}/stage=assessment/{run_id}.parquet,performance,geography [by class],"year, run_id, stage, geography_type, geography_id, by_class, class",No,Test + assessment,Peformance metrics (optionally) broken out by class for different levels of geography,Assessment set uses the prior year sales to compare to the assessed value -output,performance_quantile_test,3,evaluate,ccao-model-results-us-east-1,output/performance_quantile/model_performance_quantile_test.parquet,performance_quantile/year={year}/stage=test/{run_id}.parquet,performance_quantile,geography [by class] by quantile,"year, run_id, stage, geography_type, geography_id, by_class, class, quantile",Test only,Test + assessment,Performance metrics by quantile within class and geography,Test set includes the most recent 10% of sales -output,performance_quantile_assessment,3,evaluate,ccao-model-results-us-east-1,output/performance_quantile/model_performance_quantile_assessment.parquet,performance_quantile/year={year}/stage=assessment/{run_id}.parquet,performance_quantile,geography [by class] by quantile,"year, run_id, stage, geography_type, geography_id, by_class, class, quantile",No,Test + assessment,Performance metrics by quantile within class and geography,Assessment set uses the prior year sales to compare to the assessed value -output,shap,4,interpret,ccao-model-results-us-east-1,output/shap/model_shap.parquet,shap/,shap,card,"year, run_id, township_code, meta_pin, meta_card_num",No,Yes,SHAP values for each feature for each card in the assessment data,NOTE: Each run adds new partitions to S3 which must be added via a Glue crawler -output,feature_importance,4,interpret,ccao-model-results-us-east-1,output/feature_importance/model_feature_importance.parquet,feature_importance/year={year}/{run_id}.parquet,feature_importance,predictor,"year, run_id, model_predictor_all_name",No,Yes,"Feature importance values (gain, cover, and frequency) for the run", -output,report,5,finalize,ccao-model-results-us-east-1,reports/performance.html,report/year={year}/{run_id}.html,,model run,,No,Yes,Rendered Quarto doc with model performance statistics, -output,metadata,5,finalize,ccao-model-results-us-east-1,output/metadata/model_metadata.parquet,metadata/year={year}/{run_id}.parquet,metadata,model run,"year, run_id",Yes,Yes,"Information about each run, including parameters, run ID, git info, etc.", 
-intermediate,timing,,all,,output/intermediate/timing/,,,model stage,"year, msg",Yes,Yes,Parquet files for each stage containing the stage time elapsed,Converted into a one-row data frame in the finalize stage -output,timing,,all,ccao-model-results-us-east-1,output/timing/model_timing.parquet,timing/year={year}/{run_id}.parquet,timing,model run,"year, run_id",Yes,Yes,Finalized time elapsed for each stage of the run,"Each row represents one run, while columns represent the stages" +type,name,stage_number,stage_name,s3_bucket,path_local,path_s3,athena_table,observation_unit,primary_key,run_type_limited,run_type_full,description,notes +input,training,0,ingest,ccao-data-dvc-us-east-1,input/training_data.parquet,,,pin,"year, meta_pin",No,No,Sales data used to train the model,"Excludes any PINs with multiple cards, therefore unit of observation is PIN" +input,assessment,0,ingest,ccao-data-dvc-us-east-1,input/assessment_data.parquet,,,card,"year, meta_pin, meta_card_num",No,No,Universe of properties that need assessed values,"Card-level, rather than PIN-level" +input,complex_id,0,ingest,ccao-data-dvc-us-east-1,input/complex_id_data.parquet,,,pin,"year, meta_pin",No,No,Programmatically created townhome complex IDs,"Purpose is to assign townhomes in the same ""complex"" the same value. Only run as-needed i.e. as little as possible" +input,land_site_rate,0,ingest,ccao-data-dvc-us-east-1,input/land_site_rate_data.parquet,,,pin,"year, meta_pin",No,No,Site-specific flat land values provided by Valuations,Applies to class 210 and 295 properties only +input,land_nbhd_rate,0,ingest,ccao-data-dvc-us-east-1,input/land_nbhd_rate_data.parquet,,,nbhd,"year, meta_nbhd",No,No,Neighborhood-level land rates (per sqft) provided by Valuations,Applies to all properties (except those with a PIN-specific flat value) +output,parameter_search,1,train,ccao-model-results-us-east-1,output/parameter_search/model_parameter_search.parquet,parameter_search/year={year}/{run_id}.parquet,parameter_search,model cv fold,"year, run_id, configuration, fold_id",No,If CV enabled,Tidymodels tuning output from cross-validation,Each row is the result from one fold assessment from one iteration +output,parameter_raw,1,train,ccao-model-results-us-east-1,output/parameter_search/model_parameter_search.parquet,parameter_raw/year={year}/{run_id}.parquet,,model cv fold,"year, run_id, configuration, fold_id",No,If CV enabled,"Raw, nested Tidymodels tuning output",Not useful in Athena but needed to make models reproducible +output,parameter_final,1,train,ccao-model-results-us-east-1,output/parameter_final/model_parameter_final.parquet,parameter_final/year={year}/{run_id}.parquet,parameter_final,model run,"year, run_id",Yes,Yes,Chosen set of hyperparameters for each run,"As chosen by tune::select_best() if using CV, otherwise the default set of hyperparameters specified in params.yaml (model.hyperparameter.default)" +output,parameter_range,1,train,ccao-model-results-us-east-1,output/parameter_range/model_parameter_range.parquet,parameter_range/year={year}/{run_id}.parquet,parameter_range,parameter,"year, run_id, parameter_name",No,If CV enabled,Range of hyperparameters searched during CV tuning,As specified in params.yaml (model.hyperparameter.range) +output,test_card,1,train,ccao-model-results-us-east-1,output/test_card/model_test_card.parquet,test_card/year={year}/{run_id}.parquet,test_card,card,"year, meta_pin, meta_card_num",Yes,Yes,Test set predictions at the card level,Only includes the minimal variables necessary to perform evaluation 
+output,workflow_fit,1,train,ccao-model-results-us-east-1,output/workflow/fit/model_workflow_fit.zip,workflow/fit/year={year}/{run_id}.zip,,model run,,Yes,Yes,Trained LightGBM model object + Tidymodels specification,Can be loaded with lightsnip::lgbm_load() to produce predictions using new data +output,workflow_recipe,1,train,ccao-model-results-us-east-1,output/workflow/recipe/model_workflow_recipe.rds,workflow/recipe/year={year}/{run_id}.rds,,model run,,Yes,Yes,Trained Tidymodels recipe object used for data preprocessing,Can be used to prepare new data in the same way as the original model training +output,assessment_card,2,assess,ccao-model-results-us-east-1,output/assessment_card/model_assessment_card.parquet,assessment_card/,assessment_card,card,"year, run_id, township_code, meta_pin, meta_card_num",No,Yes,Assessment results at the card level AKA raw model output,Also includes card-level characteristics. Multi-card PINs will have more than one row. NOTE: Each run adds new partitions to S3 which must be added via a Glue crawler +output,assessment_pin,2,assess,ccao-model-results-us-east-1,output/assessment_pin/model_assessment_pin.parquet,assessment_pin/,assessment_pin,pin,"year, run_id, township_code, meta_pin",No,Yes,Assessment results at the PIN level AKA aggregated and cleaned,"Aggregation depends on PIN specifics, see assess script for details. Includes PIN-level stats like YoY % changes, land, sales, etc. NOTE: Each run adds new partitions to S3 which must be added via a Glue crawler" +output,performance_test,3,evaluate,ccao-model-results-us-east-1,output/performance/model_performance_test.parquet,performance/year={year}/stage=test/{run_id}.parquet,performance,geography [by class],"year, run_id, stage, geography_type, geography_id, by_class, class",Test only,Test + assessment,Peformance metrics (optionally) broken out by class for different levels of geography,Test set includes the most recent 10% of sales +output,performance_assessment,3,evaluate,ccao-model-results-us-east-1,output/performance/model_performance_assessment.parquet,performance/year={year}/stage=assessment/{run_id}.parquet,performance,geography [by class],"year, run_id, stage, geography_type, geography_id, by_class, class",No,Test + assessment,Peformance metrics (optionally) broken out by class for different levels of geography,Assessment set uses the prior year sales to compare to the assessed value +output,performance_quantile_test,3,evaluate,ccao-model-results-us-east-1,output/performance_quantile/model_performance_quantile_test.parquet,performance_quantile/year={year}/stage=test/{run_id}.parquet,performance_quantile,geography [by class] by quantile,"year, run_id, stage, geography_type, geography_id, by_class, class, quantile",Test only,Test + assessment,Performance metrics by quantile within class and geography,Test set includes the most recent 10% of sales +output,performance_quantile_assessment,3,evaluate,ccao-model-results-us-east-1,output/performance_quantile/model_performance_quantile_assessment.parquet,performance_quantile/year={year}/stage=assessment/{run_id}.parquet,performance_quantile,geography [by class] by quantile,"year, run_id, stage, geography_type, geography_id, by_class, class, quantile",No,Test + assessment,Performance metrics by quantile within class and geography,Assessment set uses the prior year sales to compare to the assessed value +output,shap,4,interpret,ccao-model-results-us-east-1,output/shap/model_shap.parquet,shap/,shap,card,"year, run_id, township_code, meta_pin, 
meta_card_num",No,Yes,SHAP values for each feature for each card in the assessment data,NOTE: Each run adds new partitions to S3 which must be added via a Glue crawler +output,feature_importance,4,interpret,ccao-model-results-us-east-1,output/feature_importance/model_feature_importance.parquet,feature_importance/year={year}/{run_id}.parquet,feature_importance,predictor,"year, run_id, model_predictor_all_name",No,Yes,"Feature importance values (gain, cover, and frequency) for the run", +output,report,5,finalize,ccao-model-results-us-east-1,reports/performance.html,report/year={year}/report_type=performance/{run_id}.html,,model run,,No,Yes,Rendered Quarto doc with model performance statistics, +output,metadata,5,finalize,ccao-model-results-us-east-1,output/metadata/model_metadata.parquet,metadata/year={year}/{run_id}.parquet,metadata,model run,"year, run_id",Yes,Yes,"Information about each run, including parameters, run ID, git info, etc.", +intermediate,timing,,all,,output/intermediate/timing/,,,model stage,"year, msg",Yes,Yes,Parquet files for each stage containing the stage time elapsed,Converted into a one-row data frame in the finalize stage +output,timing,,all,ccao-model-results-us-east-1,output/timing/model_timing.parquet,timing/year={year}/{run_id}.parquet,timing,model run,"year, run_id",Yes,Yes,Finalized time elapsed for each stage of the run,"Each row represents one run, while columns represent the stages" From 25c8d914efe3b57c1bc1c98cb8ef3d8e425f1544 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 1 Dec 2023 13:28:12 -0600 Subject: [PATCH 23/27] Strip everything after the first period in README feature table notes --- README.Rmd | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/README.Rmd b/README.Rmd index c92d0255..e60d34a6 100644 --- a/README.Rmd +++ b/README.Rmd @@ -254,7 +254,10 @@ dbt_manifest <- fromJSON( get_column_description <- function(colname, dag_nodes, hardcoded_descriptions) { # Retrieve the description for a column `colname` either from a set of # dbt DAG nodes (`dag_nodes`) or a set of hardcoded descriptions - # (`hardcoded_descriptions`) + # (`hardcoded_descriptions`). 
Column descriptions that come from dbt DAG nodes + # will be truncated starting from the first period to reflect the fact that + # we use periods in our dbt documentation to separate high-level column + # summaries from their detailed notes # # Prefer the hardcoded descriptions, if they exist if (colname %in% hardcoded_descriptions$column) { @@ -271,7 +274,11 @@ get_column_description <- function(colname, dag_nodes, hardcoded_descriptions) { if (column_name == colname) { description <- node$columns[[column_name]]$description if (!is.null(description) && trimws(description) != "") { - return(gsub("\n", " ", description)) + # Strip everything after the first period, since we use the first + # period as a delimiter separating a column's high-level summary from + # its detailed notes in our dbt docs + summary_description <- strplit(description, ".", fixed = TRUE)[[1]][1] + return(gsub("\n", " ", summary_description)) } } } From 7458babd12d1612b6f900c001672524ef63bf7c2 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 1 Dec 2023 13:28:34 -0600 Subject: [PATCH 24/27] Clean up some typos in README --- README.Rmd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.Rmd b/README.Rmd index e60d34a6..7dfc1ed1 100644 --- a/README.Rmd +++ b/README.Rmd @@ -475,7 +475,7 @@ This repository represents a significant departure from the old [residential mod ### [`assessment-year-2022`](https://github.com/ccao-data/model-res-avm/tree/2022-assessment-year) -* Moved previously separate processes into this repository and improved their integration with the overall modeling process. For example, the [etl_res_data](https://gitlab.com/ccao-data-science---modeling/processes/etl_res_data) process was moved to [pipeline/00-ingest.R](pipeline/00-ingest.R), while the process to [finalize model values](https://gitlab.com/ccao-data-science---modeling/processes/finalize_model_values) was moved to [pipeline/06-export.R](pipeline/07-export.R). +* Moved previously separate processes into this repository and improved their integration with the overall modeling process. For example, the [etl_res_data](https://gitlab.com/ccao-data-science---modeling/processes/etl_res_data) process was moved to [pipeline/00-ingest.R](pipeline/00-ingest.R), while the process to [finalize model values](https://gitlab.com/ccao-data-science---modeling/processes/finalize_model_values) was moved to [pipeline/07-export.R](pipeline/07-export.R). * Added [DVC](https://dvc.org/) support/integration. This repository uses DVC in 2 ways: 1. All input data in [`input/`](input/) is versioned, tracked, and stored using DVC. Previous input data sets are stored in perpetuity on S3. 2. [DVC pipelines](https://dvc.org/doc/user-guide/project-structure/pipelines-files) are used to sequentially run R pipeline scripts and track/cache inputs and outputs. @@ -503,7 +503,7 @@ This repository represents a significant departure from the old [residential mod * Infrastructure improvements * Added [`build-and-run-model`](https://github.com/ccao-data/model-res-avm/actions/workflows/build-and-run-model.yaml) workflow to run the model using GitHub Actions and AWS Batch. * Added [`delete-model-run`](https://github.com/ccao-data/model-res-avm/actions/workflows/delete-model-runs.yaml) workflow to delete test run artifacts in S3 using GitHub Actions. - * Updated [pipeline/05-finalize](pipeline/05-finalize.R) step to render a performance report using Quarto and factored S3/SNS operations out into [pipeline/06-upload.R]. 
+ * Updated [pipeline/05-finalize](pipeline/05-finalize.R) step to render a performance report using Quarto and factored S3/SNS operations out into [pipeline/06-upload.R](pipeline/06-upload.R). # Ongoing Issues From 5da06da94cc809241668f672151066676786109e Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 1 Dec 2023 13:28:49 -0600 Subject: [PATCH 25/27] Generalize `Updating R dependencies` section of the README --- README.Rmd | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.Rmd b/README.Rmd index 7dfc1ed1..ca1b5b43 100644 --- a/README.Rmd +++ b/README.Rmd @@ -766,7 +766,7 @@ Both [Tidymodels](https://tune.tidymodels.org/articles/extras/optimizations.html ## Updating R dependencies -There are two lockfiles that we use with renv to manage R dependencies: +We use multiple renv lockfiles in order to manage R dependencies: 1. **`renv.lock`** is the canonical list of dependencies that are used by the **core model pipeline**. Any dependencies that are required to run the model itself should be defined in this lockfile. 2. **`renv/profiles/reporting/renv.lock`** is the canonical list of dependencies that are used to **generate a model performance report** in the `finalize` step of the pipeline. Any dependencies that are required to generate that report or others like it should be defined in this lockfile. @@ -775,13 +775,13 @@ Our goal in maintaining multiple lockfiles is to keep the list of dependencies t The process for **updating core model pipeline dependencies** is straightforward: Running `renv::install("<package>")` and `renv::snapshot()` will ensure that the dependency gets added or updated in `renv.lock`, as long as it is imported somewhere in the model pipeline via a `library()` call. -The process for updating **model report dependencies** is more complex, since it requires the use of a separate `reporting` profile: +The process for updating **dependencies for other lockfiles** is more complex, since it requires the use of a separate profile when running renv commands. Determine the name of the profile you'd like to update (`<profile>` in the code that follows) and run the following commands: -1. Run `Sys.setenv(RENV_PROFILE = "reporting")` to set the renv profile to `reporting` -2. Make sure that the dependency is defined in the `DESCRIPTION` file under the `Config/renv/profiles/reporting/dependencies` key +1. Run `renv::activate(profile = "<profile>")` to set the renv profile to `<profile>` +2. Make sure that the dependency is defined in the `DESCRIPTION` file under the `Config/renv/profiles/<profile>/dependencies` key 3. Run `renv::install("<package>")` to add or update the dependency as necessary 4. Run `renv::snapshot(type = "explicit")` to update the reporting lockfile with the dependencies defined in the `DESCRIPTION` file -5. Run `Sys.unsetenv("RENV_PROFILE")` to switch the renv profile back to the default +5.
Run `renv::activate()` if you would like to switch back to the default renv profile ## Troubleshooting From 31dc99dc849082f004645627271dcbc8e3590003 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 1 Dec 2023 13:29:33 -0600 Subject: [PATCH 26/27] Generate tictoc timings for finalize pipeline stage --- dvc.yaml | 2 + pipeline/05-finalize.R | 101 +++++++++++++++++++++++------------------ 2 files changed, 59 insertions(+), 44 deletions(-) diff --git a/dvc.yaml b/dvc.yaml index 30ae1dff..2af54dc4 100755 --- a/dvc.yaml +++ b/dvc.yaml @@ -141,6 +141,8 @@ stages: - pv - ratio_study outs: + - output/intermediate/timing/model_timing_finalize.parquet: + cache: false - output/timing/model_timing.parquet: cache: false - output/metadata/model_metadata.parquet: diff --git a/pipeline/05-finalize.R b/pipeline/05-finalize.R index ffeba8aa..b9bd5ddf 100644 --- a/pipeline/05-finalize.R +++ b/pipeline/05-finalize.R @@ -2,6 +2,10 @@ # 1. Setup --------------------------------------------------------------------- #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +# Start the stage timer and clear logs from prior stage +tictoc::tic.clearlog() +tictoc::tic("Finalize") + # Load libraries and scripts suppressPackageStartupMessages({ library(arrow) @@ -11,6 +15,7 @@ suppressPackageStartupMessages({ library(lubridate) library(purrr) library(tidyr) + library(tictoc) library(tune) library(yaml) }) @@ -148,10 +153,58 @@ metadata <- tibble::tibble( #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -# 3. Save Timings -------------------------------------------------------------- +# 3. Generate performance report ----------------------------------------------- +#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +# Wrap this block in an error handler so that the pipeline continues execution +# even if report generation fails. This is important because the report file is +# defined separately, so this script can't be sure that it is error-free +tryCatch( + { + suppressPackageStartupMessages({ + library(quarto) + }) + + message("Generating performance report") + + here("reports", "performance.qmd") %>% + quarto_render( + execute_params = list( + run_id = run_id, + year = params$assessment$year + ) + ) + }, + error = function(func) { + message("Encountered error during report generation:") + message(conditionMessage(func)) + + # Save an empty report so that this pipeline step produces the required + # output even in cases of failure + message("Saving an empty report file in order to continue execution") + sink(here("reports", "performance.html")) + cat("Encountered error in report generation:\n\n") + cat(conditionMessage(func)) + sink() + } +) + + + + +#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +# 4. 
Save Timings -------------------------------------------------------------- #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - message("Saving run timings") +# End the stage timer and write the time elapsed to a temporary file +tictoc::toc(log = TRUE) +bind_rows(tictoc::tic.log(format = FALSE)) %>% + arrow::write_parquet(gsub("//*", "/", file.path( + paths$intermediate$timing$local, + "model_timing_finalize.parquet" + ))) + # Filter ensure we only get timing files for stages that actually ran if (run_type == "full") { timings <- list.files( @@ -161,7 +214,7 @@ if (run_type == "full") { } else { timings <- list.files( paste0(paths$intermediate$timing, "/"), - pattern = "train|evaluate", + pattern = "train|evaluate|finalize", full.names = TRUE ) } @@ -176,7 +229,8 @@ timings_df <- purrr::map_dfr(timings, read_parquet) %>% order = recode( msg, "Train" = "01", "Assess" = "02", - "Evaluate" = "03", "Interpret" = "04" + "Evaluate" = "03", "Interpret" = "04", + "Finalize" = "05" ) ) %>% arrange(order) %>% @@ -192,44 +246,3 @@ timings_df <- purrr::map_dfr(timings, read_parquet) %>% # Clear any remaining logs from tictoc tictoc::tic.clearlog() - - - - -#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -# 4. Generate performance report ----------------------------------------------- -#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -# Wrap this block in an error handler so that the pipeline continues execution -# even if report generation fails. This is important because the report file is -# defined separately, so this script can't be sure that it is error-free, and -# -tryCatch( - { - suppressPackageStartupMessages({ - library(quarto) - }) - - message("Generating performance report") - - here("reports", "performance.qmd") %>% - quarto_render( - execute_params = list( - run_id = run_id, - year = params$assessment$year - ) - ) - }, - error = function(func) { - message("Encountered error during report generation:") - message(conditionMessage(func)) - - # Save an empty report so that this pipeline step produces the required - # output even in cases of failure - message("Saving an empty report file in order to continue execution") - sink(here("reports", "performance.html")) - cat("Encountered error in report generation:\n\n") - cat(conditionMessage(func)) - sink() - } -) From 18cfbceac5173919d4580bde07215b9822961134 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 1 Dec 2023 19:31:56 +0000 Subject: [PATCH 27/27] Rerender README.md --- README.Rmd | 2 +- README.md | 215 +++++++++++++++++++++++++++-------------------------- 2 files changed, 110 insertions(+), 107 deletions(-) diff --git a/README.Rmd b/README.Rmd index ca1b5b43..71abcadf 100644 --- a/README.Rmd +++ b/README.Rmd @@ -277,7 +277,7 @@ get_column_description <- function(colname, dag_nodes, hardcoded_descriptions) { # Strip everything after the first period, since we use the first # period as a delimiter separating a column's high-level summary from # its detailed notes in our dbt docs - summary_description <- strplit(description, ".", fixed = TRUE)[[1]][1] + summary_description <- strsplit(description, ".", fixed = TRUE)[[1]][1] return(gsub("\n", " ", summary_description)) } } diff --git a/README.md b/README.md index 36bee5a9..64522247 100644 --- a/README.md +++ b/README.md @@ -340,102 +340,102 @@ districts](https://gitlab.com/ccao-data-science---modeling/models/ccao_res_avm/- and many others. 
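As a quick illustration of the description truncation introduced in the patch above (a minimal sketch, not part of any diff): splitting on the first period reduces a dbt column description to its one-sentence summary. The sample string is the `Median Population Age` note from the old feature table, and the result matches the shortened note that appears in the rerendered table below.

```r
# Keep only the text before the first period of a dbt column description,
# mirroring the truncation added to get_column_description()
description <- "Median age for whole population. ACS variable `B01002_001E`"
summary_description <- strsplit(description, ".", fixed = TRUE)[[1]][1]
gsub("\n", " ", summary_description)
#> [1] "Median age for whole population"
```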
The features in the table below are the ones that made the cut. They’re the right combination of easy to understand and impute, powerfully predictive, and well-behaved. Most of them are in use in the -model as of 2023-11-30. - -| Feature Name | Category | Type | Possible Values | Notes | -|:------------------------------------------------------------------------|:---------------|:------------|:-----------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Percent Population Age, Under 19 Years Old | ACS5 | numeric | | Percent of the people 17 years or younger. ACS variable (`B01001_003E` + `B01001_004E` + `B01001_005E` + `B01001_006E` + `B01001_007E` + `B01001_027E` + `B01001_028E` + `B01001_029E` + `B01001_030E` + `B01001_031E`) / `B01001_001E` | -| Percent Population Age, Over 65 Years Old | ACS5 | numeric | | Percent of the people 65 years or older. ACS variable (`B01001_020E` + `B01001_021E` + `B01001_022E` + `B01001_023E` + `B01001_024E` + `B01001_025E` + `B01001_044E` + `B01001_045E` + `B01001_046E` + `B01001_046E` + `B01001_048E` + `B01001_049E`) / `B01001_001E` | -| Median Population Age | ACS5 | numeric | | Median age for whole population. ACS variable `B01002_001E` | -| Percent Population Mobility, In Same House 1 Year Ago | ACS5 | numeric | | Percent of people (older than 1 year) who have not moved in the past 12 months. ACS variable `B07003_004E` / `B07003_001E` | -| Percent Population Mobility, Moved From Other State in Past Year | ACS5 | numeric | | Percent of people (older than 1 year) who moved from another state in the past 12 months. ACS variable `B07003_013E` / `B07003_001E` | -| Percent Households Family, Married | ACS5 | numeric | | Percent of households that are family, married. ACS variable `B11001_003E` / `B11001_001E` | -| Percent Households Nonfamily, Living Alone | ACS5 | numeric | | Percent of households that are non-family, alone (single). ACS variable `B11001_008E` / `B11001_001E` | -| Percent Population Education, High School Degree | ACS5 | numeric | | Percent of people older than 25 who attained a high school degree. ACS variable (`B15002_011E` + `B15002_028E`) / `B15002_001E` | -| Percent Population Education, Bachelor Degree | ACS5 | numeric | | Percent of people older than 25 who attained a bachelor’s degree. ACS variable (`B15002_015E` + `B15002_032E`) / `B15002_001E` | -| Percent Population Education, Graduate Degree | ACS5 | numeric | | Percent of people older than 25 who attained a graduate degree. ACS variable (`B15002_016E` + `B15002_017E` + `B15002_018E` + `B15002_033E` + `B15002_034E` + `B15002_035E`) / `B15002_001E` | -| Percent Population Income, Below Poverty Level | ACS5 | numeric | | Percent of people above the poverty level in the last 12 months. ACS variable `B17001_003E` / `B17001_001E` | -| Median Income, Household in Past Year | ACS5 | numeric | | Median income per household in the past 12 months. ACS variable `B19013_001E` | -| Median Income, Per Capita in Past Year | ACS5 | numeric | | Median income per capita in the past 12 months. 
ACS variable `B19301_001E` | -| Percent Population Income, Received SNAP in Past Year | ACS5 | numeric | | Percent of households that received SNAP in the past 12 months. ACS variable `B22003_002E` / `B22003_001E` | -| Percent Population Employment, Unemployed | ACS5 | numeric | | Percent of people 16 years and older unemployed. ACS variable `B23025_005E` / `B23025_003E` | -| Median Occupied Household, Total, Year Built | ACS5 | numeric | | Median year built for all occupied households. ACS variable `B25037_001E` | -| Median Occupied Household, Renter, Gross Rent | ACS5 | numeric | | Median gross rent for only renter-occupied units. ACS variable `B25064_001E` | -| Percent Occupied Households, Owner | ACS5 | numeric | | Percent of households that are owner-occupied. ACS variable `B25003_002E` / `B25003_001E` | -| Percent Occupied Households, Total, One or More Selected Conditions | ACS5 | numeric | | Percent of occupied households with selected conditions. Conditions include: incomplete plumbing or kitchens, overcrowding, 30% or more of the household income spent on rent or monthly owner costs. ACS variable (`B25123_003E` + `B25123_004E` + `B25123_005E` + `B25123_006E` + `B25123_009E` + `B25123_010E` + `B25123_011E` + `B25123_012E`) / `B25123_001E` | -| Percent Population Mobility, Moved From Within Same County in Past Year | ACS5 | numeric | | Percent of people (older than 1 year) who moved in county in the past 12 months. ACS variable `B07003_007E` / `B07003_001E` | -| Year Built | Characteristic | numeric | | Year the property was constructed | -| Central Air Conditioning | Characteristic | categorical | Central A/C, No Central A/C | Indicator for central air. Possible values for this variable are: - `1` = Central A/C (`YES`) - `2` = No central A/C (`NO`) | -| Apartments | Characteristic | categorical | Two, Three, Four, Five, Six, None | Number of apartments for class 211 and 212 properties. CAUTION: Note the numerically encoded values DO NOT correspond to the number of apartments i.e. code 1 means 2 apartments, code 6 means 0 apartments. Possible values for this variable are: - `1` = Two apartments (`TWO`) - `2` = Three apartments (`THREE`) - `3` = Four apartments (`FOUR`) - `4` = Five apartments (`FIVE`) - `5` = Six apartments (`SIX`) - `6` = No apartments (`NONE`) | -| Attic Finish | Characteristic | categorical | Living Area, Partial, None | Attic finish. Possible values for this variable are: - `1` = Living area (`LAR`) - `2` = Partial (`PT`) - `3` = None (`UNF`) | -| Attic Type | Characteristic | categorical | Full, Partial, None | Attic type. Possible values for this variable are: - `1` = Full (`FL`) - `2` = Partial (`PT`) - `3` = None (`NO`) | -| Bedrooms | Characteristic | numeric | | Number of bedrooms in the building. | -| Building Square Feet | Characteristic | numeric | | Square footage of the building, as measured from the exterior. | -| Basement Type | Characteristic | categorical | Full, Slab, Partial, Crawl | Basement type. Possible values for this variable are: - `1` = Full (`FL`) - `2` = Slab (`SL`) - `3` = Partial (`PT`) - `4` = Crawl (`CR`) | -| Basement Finish | Characteristic | categorical | Formal Rec Room, Apartment, Unfinished | Basement finish. Possible values for this variable are: - `1` = Finished / formal rec room (`REC`) - `2` = Apartment (`APT`) - `3` = Unfinished (`UNF`) | -| Exterior Wall Material | Characteristic | categorical | Frame, Masonry, Frame + Masonry, Stucco | Exterior wall construction. 
Possible values for this variable are: - `1` = Frame (`FRAM`) - `2` = Masonry (`MASR`) - `3` = Frame + masonry (`FRMA`) - `4` = Stucco (`STUC`) | -| Full Baths | Characteristic | numeric | | Number of full bathrooms. Defined as bathrooms with a bath or shower. If this value is missing, the default value is set to 1 | -| Fireplaces | Characteristic | numeric | | Number of fireplaces. Counted as the number of flues one can see from the outside of the building | -| Garage 1 Area Included | Characteristic | categorical | Yes, No | Indicator for garage area inclusion. Is the garage physically included within the building area? If yes, the garage area is subtracted from the building square feet calculation by the field agent. Possible values for this variable are: - `1` = Yes (`YES`) - `2` = No (`NO`) | -| Garage 1 Attached | Characteristic | categorical | Yes, No | Indicator for garage attached. Is the garage physically attached to the main building? Possible values for this variable are: - `1` = Yes (`YES`) - `2` = No (`NO`) | -| Garage 1 Ext. Wall Material | Characteristic | categorical | Frame, Masonry, Frame + Masonry, Stucco | Garage exterior wall construction. Possible values for this variable are: - `1` = Frame (`FRAM`) - `2` = Masonry (`MASR`) - `3` = Frame + masonry (`FRMA`) - `4` = Stucco (`STUC`) | -| Garage 1 Size | Characteristic | categorical | 1 cars, 1.5 cars, 2 cars, 2.5 cars, 3 cars, 3.5 cars, 0 cars, 4 cars | Garage size (number of cars). Possible values for this variable are: - `1` = 1 car (`1CAR`) - `2` = 1.5 cars (`1.5CAR`) - `3` = 2 cars (`2CAR`) - `4` = 2.5 cars (`2.5CAR`) - `5` = 3 cars (`3CAR`) - `6` = 3.5 cars (`3.5CAR`) - `7` = 0 cars (`0CAR`) - `8` = 4 cars (`4CAR`) | -| Half Baths | Characteristic | numeric | | Number of half baths. Defined as bathrooms without a shower or bathtub | -| Land Square Feet | Characteristic | numeric | | Square footage of the land (not just the building) of the property. A single PIN can have multiple “land lines”, meaning it can be associated with more than one 200-class land lot | -| Central Heating | Characteristic | categorical | Warm Air Furnace, Hot Water Steam, Electric Heater, None | Interior heating type. Possible values for this variable are: - `1` = Central air / furnace (`FURN`) - `2` = Steam / radiator (`STM`) - `3` = Electric (`ELEC`) - `4` = None (`NONE`) | -| Number of Commercial Units | Characteristic | numeric | | Number of commercial units. The vast majority are for properties with class 212 | -| Porch | Characteristic | categorical | None, Frame Enclosed, Masonry Enclosed | Porch type. Possible values for this variable are: - `0` = None (`NONE`) - `1` = Frame enclosed (`FRAM`) - `2` = Masonry enclosed (`MSRY`) | -| Roof Material | Characteristic | categorical | Shingle + Asphalt, Tar + Gravel, Slate, Shake, Tile, Other | Roof material / construction. Possible values for this variable are: - `1` = Shingle + asphalt (`SHAS`) - `2` = Tar + gravel (`TRGR`) - `3` = Slate (`SLTE`) - `4` = Shake (`SHKE`) - `5` = Tile (`TILE`) - `6` = Other (`OTHR`) | -| Rooms | Characteristic | numeric | | Number of total rooms in the building (excluding baths). Not to be confused with bedrooms | -| Cathedral Ceiling | Characteristic | categorical | Yes, No | Deprecated. Field has not been updated recently enough to be useful for modeling | -| Design Plan | Characteristic | categorical | Architect, Stock Plan | Design plan. Whether the property was designed by an architect or from a stock plan. 
Possible values for this variable are: - `1` = Architect (`ARCT`) - `2` = Stock plan (`STCK`) | -| Type of Residence | Characteristic | categorical | 1 Story, 2 Story, 3 Story +, Split Level, 1.5 Story, Missing | Type of residence. Used to indicate stories as well as other information about the design of the property. Also used to determine the property class. Possible values for this variable are: - `1` = 1 story (`1STRY`) - `2` = 2 story (`2STRY`) - `3` = 3 story or more (`3STRY+`) - `4` = Split level (`SPLT`) - `5` = 1.5 story (`1.5STRY`) - `9.9` = Missing (`MSSNG`) | -| Recent Renovation | Characteristic | logical | | Indicates whether or not a property was renovated within the last 3 years. Renovation is indicated by the `char_renovation` characteristic flipping from `NO` to `YES` | -| Longitude | Location | numeric | | X coordinate in degrees (global longitude). Point location derived from the centroid of the largest polygon associated with the geometry. Units are degrees, taken from the WGS84 projection (EPSG 4326) | -| Latitude | Location | numeric | | Y coordinate in degrees (global latitude). Point location derived from the centroid of the largest polygon associated with the geometry. Units are degrees, taken from the WGS84 projection (EPSG 4326) | -| Municipality Name | Location | character | | | -| FEMA Special Flood Hazard Area | Location | logical | | FEMA Special Flood Hazard Area, derived from spatial intersection with FEMA floodplain maps. Taken from FEMA site for 2021 only | -| First Street Factor | Location | numeric | | First Street flood factor The flood factor is a risk score, where 10 is the highest risk and 1 is the lowest risk. Pulled from 2019 First Street extract provided to the CCAO | -| First Street Risk Direction | Location | numeric | | First Street risk direction. Positive scores indicate increasing risk of flood, negative scores indicate decreasing risk of flood, 0 indicates no movement of risk. Pulled from 2019 First Street extract provided to the CCAO | -| School Elementary District GEOID | Location | character | | School district (elementary) GEOID. Derived from Cook County and City of Chicago shapefiles. Chicago Public Schools are associated with attendance areas where suburban schools are associated with districts | -| School Secondary District GEOID | Location | character | | School district (secondary) GEOID. Derived from Cook County and City of Chicago shapefiles. Chicago Public Schools are associated with attendance areas where suburban schools are associated with districts | -| CMAP Walkability Score (No Transit) | Location | numeric | | CMAP walkability score for a given PIN, excluding transit walkability. Taken from CMAP’s ON TO 2050 walkability layer | -| CMAP Walkability Total Score | Location | numeric | | CMAP walkability score for a given PIN, including transit walkability. Taken from CMAP’s ON TO 2050 walkability layer | -| Airport Noise DNL | Location | numeric | | O’Hare and Midway noise, measured as DNL. DNL measures the total cumulative sound exposure over a 24-hour period. Here DNL is imputed using physical models or a kriging surface based on noise data from monitors around each airport. Noise monitor data retrieved from the Chicago Department of Aviation | -| Township Code | Meta | character | | Cook County township code. See `township_name` for more information. Note that township codes that start with 7 are City triad townships | -| Neighborhood Code | Meta | character | | Assessor neighborhood code. 
First 2 digits are township code, last 3 digits are neighborhood code. Neighborhood boundaries are coincident with townships. Geographic neighborhoods intended to represent relatively homogeneous housing sub-markets. They were created a long time ago for internal use by the various property tax offices. The Assessor now uses them as units of work and analysis. For example, land rates are usually delimited by neighborhood | -| Tieback Proration Rate | Meta | numeric | | Proration rate applied to the PIN. PINs are occasionally prorated when not all of their value is contained within their boundaries. For example, a building that lies equally across two PINs would be prorated to 50%. In this case, the *land* value of the PIN is not prorated, but the building value is. | -| Property Group | Meta | categorical | Non-Livable Space, Single-Family, Multi-Family, Condominium, Bed & Breakfast | | -| Property Tax Bill Aggregate Rate | Other | numeric | | Tax bill rate for the taxing district containing a given PIN. For modeling, the idea is to capture any downward pressure on price from higher tax burdens | -| School District (Elementary) GreatSchools Rating | Other | numeric | | Average GreatSchools rating of elementary schools within the district of a given PIN. For CPS, which is a unified school district, the average of schools within attendance boundary is used | -| School District (Secondary) GreatSchools Rating | Other | numeric | | Average GreatSchools rating of secondary schools within the district of a given PIN. For CPS, which is a unified school district, the average of schools within attendance boundary is used | -| Number of PINs in Half Mile | Proximity | numeric | | Number of PINs within half mile | -| Number of Bus Stops in Half Mile | Proximity | numeric | | Number of bus stops within half mile. Includes CTA and PACE bus stops. Stop locations sourced from agency GTFS feeds | -| Number of Foreclosures Per 1000 PINs (Past 5 Years) | Proximity | numeric | | Number of foreclosures per 1000 PINs, within half mile (past 5 years). Normalized version of the half mile foreclosure count to account for PIN density. Sourced from Illinois Public Record (IPR). Note that this data is reported on a long lag | -| Number of Schools in Half Mile | Proximity | numeric | | Number of schools (any kind) within half mile. School locations sourced from [GreatSchools](https://www.greatschools.org/) | -| Number of Schools with Rating in Half Mile | Proximity | numeric | | Number of schools (any kind) within half mile. Includes only schools that have a GreatSchools rating. School locations and ratings sourced from [GreatSchools](https://www.greatschools.org/) | -| Average School Rating in Half Mile | Proximity | numeric | | Average school rating of schools within half mile. Schools of any type (elementary, secondary, etc.) are included. School ratings sourced from [GreatSchools](https://www.greatschools.org/) | -| Nearest Bike Trail Distance (Feet) | Proximity | numeric | | Nearest bike trail distance (feet). Bike trail data sourced from Cook County GIS | -| Nearest Cemetery Distance (Feet) | Proximity | numeric | | Nearest cemetery distance (feet). Cemetery data sourced from Cook County GIS | -| Nearest CTA Route Distance (Feet) | Proximity | numeric | | Nearest CTA route distance (feet). Routes include any active CTA tracks. Route data sourced from CTA GTFS feeds | -| Nearest CTA Stop Distance (Feet) | Proximity | numeric | | Nearest CTA stop distance (feet). 
Stops include any active CTA stops for trains only. Stop data sourced from CTA GTFS feeds | -| Nearest Hospital Distance (Feet) | Proximity | numeric | | Nearest hospital distance (feet). Hospital locations sourced from Cook County GIS | -| Lake Michigan Distance (Feet) | Proximity | numeric | | Distance to Lake Michigan shoreline (feet). Shoreline sourced from Census hydrography files | -| Nearest Major Road Distance (Feet) | Proximity | numeric | | Nearest major road distance (feet). Major road locations sourced from OpenStreetMap (OSM). Major roads include any OSM ways tagged with `highway/motorway`, `highway/trunk`, or `highway/primary` | -| Nearest Metra Route Distance (Feet) | Proximity | numeric | | Nearest Metra route distance (feet). Routes include any active Metra tracks. Route data sourced from Metra GTFS feeds | -| Nearest Metra Stop Distance (Feet) | Proximity | numeric | | Nearest Metra stop distance (feet). Stops include any active Metra stops. Stop data sourced from Metra GTFS feeds | -| Nearest Park Distance (Feet) | Proximity | numeric | | Nearest park distance (feet). Park locations sourced from OpenStreetMap using the tag `leisure/park` | -| Nearest Railroad Distance (Feet) | Proximity | numeric | | Nearest railroad distance (feet). Railroad locations sourced from Cook County GIS. Inclusive of any rail (CTA, Metra, non-passenger freight, etc.) | -| Nearest Water Distance (Feet) | Proximity | numeric | | Nearest water distance (feet). Water locations are inclusive of *any* body of water. Sourced from Census hydrology files | -| Nearest Golf Course Distance (Feet) | Proximity | numeric | | Nearest golf course distance (feet). Golf course data sourced from Cook County GIS and OpenStreetMap | -| Sale Year | Time | numeric | | Sale year calculated as the number of years since 0 B.C.E | -| Sale Day | Time | numeric | | Sale day calculated as the number of days since January 1st, 1997 | -| Sale Quarter of Year | Time | character | | Character encoding of quarter of year (Q1 - Q4) | -| Sale Month of Year | Time | character | | Character encoding of month of year (Jan - Dec) | -| Sale Day of Year | Time | numeric | | Numeric encoding of day of year (1 - 365) | -| Sale Day of Month | Time | numeric | | Numeric encoding of day of month (1 - 31) | -| Sale Day of Week | Time | numeric | | Numeric encoding of day of week (1 - 7) | -| Sale After COVID-19 | Time | logical | | Indicator for whether sale occurred after COVID-19 was widely publicized (around March 15, 2020) | +model as of 2023-12-01. 
+
+| Feature Name | Category | Type | Possible Values | Notes |
+|:---|:---|:---|:---|:---|
+| Percent Population Age, Under 19 Years Old | ACS5 | numeric | | Percent of people 17 years or younger |
+| Percent Population Age, Over 65 Years Old | ACS5 | numeric | | Percent of people 65 years or older |
+| Median Population Age | ACS5 | numeric | | Median age for whole population |
+| Percent Population Mobility, In Same House 1 Year Ago | ACS5 | numeric | | Percent of people (older than 1 year) who have not moved in the past 12 months |
+| Percent Population Mobility, Moved From Other State in Past Year | ACS5 | numeric | | Percent of people (older than 1 year) who moved from another state in the past 12 months |
+| Percent Households Family, Married | ACS5 | numeric | | Percent of households that are family, married |
+| Percent Households Nonfamily, Living Alone | ACS5 | numeric | | Percent of households that are non-family, alone (single) |
+| Percent Population Education, High School Degree | ACS5 | numeric | | Percent of people older than 25 who attained a high school degree |
+| Percent Population Education, Bachelor Degree | ACS5 | numeric | | Percent of people older than 25 who attained a bachelor’s degree |
+| Percent Population Education, Graduate Degree | ACS5 | numeric | | Percent of people older than 25 who attained a graduate degree |
+| Percent Population Income, Below Poverty Level | ACS5 | numeric | | Percent of people below the poverty level in the last 12 months |
+| Median Income, Household in Past Year | ACS5 | numeric | | Median income per household in the past 12 months |
+| Median Income, Per Capita in Past Year | ACS5 | numeric | | Median income per capita in the past 12 months |
+| Percent Population Income, Received SNAP in Past Year | ACS5 | numeric | | Percent of households that received SNAP in the past 12 months |
+| Percent Population Employment, Unemployed | ACS5 | numeric | | Percent of people 16 years and older who are unemployed |
+| Median Occupied Household, Total, Year Built | ACS5 | numeric | | Median year built for all occupied households |
+| Median Occupied Household, Renter, Gross Rent | ACS5 | numeric | | Median gross rent for only renter-occupied units |
+| Percent Occupied Households, Owner | ACS5 | numeric | | Percent of households that are owner-occupied |
+| Percent Occupied Households, Total, One or More Selected Conditions | ACS5 | numeric | | Percent of occupied households with one or more selected conditions |
+| Percent Population Mobility, Moved From Within Same County in Past Year | ACS5 | numeric | | Percent of people (older than 1 year) who moved within the same county in the past 12 months |
+| Year Built | Characteristic | numeric | | Year the property was constructed |
+| Central Air Conditioning | Characteristic | categorical | Central A/C, No Central A/C | Indicator for central air |
+| Apartments | Characteristic | categorical | Two, Three, Four, Five, Six, None | Number of apartments for class 211 and 212 properties |
+| Attic Finish | Characteristic | categorical | Living Area, Partial, None | Attic finish |
+| Attic Type | Characteristic | categorical | Full, Partial, None | Attic type |
+| Bedrooms | Characteristic | numeric | | Number of bedrooms in the building |
+| Building Square Feet | Characteristic | numeric | | Square footage of the building, as measured from the exterior |
+| Basement Type | Characteristic | categorical | Full, Slab, Partial, Crawl | Basement type |
+| Basement Finish | Characteristic | categorical | Formal Rec Room, Apartment, Unfinished | Basement finish |
+| Exterior Wall Material | Characteristic | categorical | Frame, Masonry, Frame + Masonry, Stucco | Exterior wall construction |
+| Full Baths | Characteristic | numeric | | Number of full bathrooms |
+| Fireplaces | Characteristic | numeric | | Number of fireplaces |
+| Garage 1 Area Included | Characteristic | categorical | Yes, No | Indicator for garage area inclusion |
+| Garage 1 Attached | Characteristic | categorical | Yes, No | Indicator for attached garage |
+| Garage 1 Ext. Wall Material | Characteristic | categorical | Frame, Masonry, Frame + Masonry, Stucco | Garage exterior wall construction |
+| Garage 1 Size | Characteristic | categorical | 1 cars, 1.5 cars, 2 cars, 2.5 cars, 3 cars, 3.5 cars, 0 cars, 4 cars | Garage size (number of cars) |
+| Half Baths | Characteristic | numeric | | Number of half baths |
+| Land Square Feet | Characteristic | numeric | | Square footage of the land (not just the building) of the property |
+| Central Heating | Characteristic | categorical | Warm Air Furnace, Hot Water Steam, Electric Heater, None | Interior heating type |
+| Number of Commercial Units | Characteristic | numeric | | Number of commercial units |
+| Porch | Characteristic | categorical | None, Frame Enclosed, Masonry Enclosed | Porch type |
+| Roof Material | Characteristic | categorical | Shingle + Asphalt, Tar + Gravel, Slate, Shake, Tile, Other | Roof material / construction |
+| Rooms | Characteristic | numeric | | Number of total rooms in the building (excluding baths) |
+| Cathedral Ceiling | Characteristic | categorical | Yes, No | Deprecated |
+| Design Plan | Characteristic | categorical | Architect, Stock Plan | Design plan |
+| Type of Residence | Characteristic | categorical | 1 Story, 2 Story, 3 Story +, Split Level, 1.5 Story, Missing | Type of residence |
+| Recent Renovation | Characteristic | logical | | Indicates whether or not a property was renovated within the last 3 years |
+| Longitude | Location | numeric | | X coordinate in degrees (global longitude) |
+| Latitude | Location | numeric | | Y coordinate in degrees (global latitude) |
+| Municipality Name | Location | character | | |
+| FEMA Special Flood Hazard Area | Location | logical | | FEMA Special Flood Hazard Area, derived from spatial intersection with FEMA floodplain maps |
+| First Street Factor | Location | numeric | | First Street flood factor. The flood factor is a risk score, where 10 is the highest risk and 1 is the lowest risk |
+| First Street Risk Direction | Location | numeric | | First Street risk direction |
+| School Elementary District GEOID | Location | character | | School district (elementary) GEOID |
+| School Secondary District GEOID | Location | character | | School district (secondary) GEOID |
+| CMAP Walkability Score (No Transit) | Location | numeric | | CMAP walkability score for a given PIN, excluding transit walkability |
+| CMAP Walkability Total Score | Location | numeric | | CMAP walkability score for a given PIN, including transit walkability |
+| Airport Noise DNL | Location | numeric | | O’Hare and Midway noise, measured as DNL |
+| Township Code | Meta | character | | Cook County township code |
+| Neighborhood Code | Meta | character | | Assessor neighborhood code |
+| Tieback Proration Rate | Meta | numeric | | Proration rate applied to the PIN |
+| Property Group | Meta | categorical | Non-Livable Space, Single-Family, Multi-Family, Condominium, Bed & Breakfast | |
+| Property Tax Bill Aggregate Rate | Other | numeric | | Tax bill rate for the taxing district containing a given PIN |
+| School District (Elementary) GreatSchools Rating | Other | numeric | | Average GreatSchools rating of elementary schools within the district of a given PIN |
+| School District (Secondary) GreatSchools Rating | Other | numeric | | Average GreatSchools rating of secondary schools within the district of a given PIN |
+| Number of PINs in Half Mile | Proximity | numeric | | Number of PINs within half mile |
+| Number of Bus Stops in Half Mile | Proximity | numeric | | Number of bus stops within half mile |
+| Number of Foreclosures Per 1000 PINs (Past 5 Years) | Proximity | numeric | | Number of foreclosures per 1000 PINs, within half mile (past 5 years) |
+| Number of Schools in Half Mile | Proximity | numeric | | Number of schools (any kind) within half mile |
+| Number of Schools with Rating in Half Mile | Proximity | numeric | | Number of schools with ratings within half mile |
+| Average School Rating in Half Mile | Proximity | numeric | | Average rating of schools within half mile |
+| Nearest Bike Trail Distance (Feet) | Proximity | numeric | | Nearest bike trail distance (feet) |
+| Nearest Cemetery Distance (Feet) | Proximity | numeric | | Nearest cemetery distance (feet) |
+| Nearest CTA Route Distance (Feet) | Proximity | numeric | | Nearest CTA route distance (feet) |
+| Nearest CTA Stop Distance (Feet) | Proximity | numeric | | Nearest CTA stop distance (feet) |
+| Nearest Hospital Distance (Feet) | Proximity | numeric | | Nearest hospital distance (feet) |
+| Lake Michigan Distance (Feet) | Proximity | numeric | | Distance to Lake Michigan shoreline (feet) |
+| Nearest Major Road Distance (Feet) | Proximity | numeric | | Nearest major road distance (feet) |
+| Nearest Metra Route Distance (Feet) | Proximity | numeric | | Nearest Metra route distance (feet) |
+| Nearest Metra Stop Distance (Feet) | Proximity | numeric | | Nearest Metra stop distance (feet) |
+| Nearest Park Distance (Feet) | Proximity | numeric | | Nearest park distance (feet) |
+| Nearest Railroad Distance (Feet) | Proximity | numeric | | Nearest railroad distance (feet) |
+| Nearest Water Distance (Feet) | Proximity | numeric | | Nearest water distance (feet) |
+| Nearest Golf Course Distance (Feet) | Proximity | numeric | | Nearest golf course distance (feet) |
+| Sale Year | Time | numeric | | Sale year calculated as the number of years since 0 B.C.E. |
+| Sale Day | Time | numeric | | Sale day calculated as the number of days since January 1st, 1997 |
+| Sale Quarter of Year | Time | character | | Character encoding of quarter of year (Q1 - Q4) |
+| Sale Month of Year | Time | character | | Character encoding of month of year (Jan - Dec) |
+| Sale Day of Year | Time | numeric | | Numeric encoding of day of year (1 - 365) |
+| Sale Day of Month | Time | numeric | | Numeric encoding of day of month (1 - 31) |
+| Sale Day of Week | Time | numeric | | Numeric encoding of day of week (1 - 7) |
+| Sale After COVID-19 | Time | logical | | Indicator for whether sale occurred after COVID-19 was widely publicized (around March 15, 2020) |
 
 #### Data Sources
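To make the structure of this data dictionary concrete, here is a minimal R sketch (illustrative only, not part of the patch) that hand-copies a few rows from the table above into a tibble and tallies them by category and type; it does not read any pipeline inputs or outputs.

```r
# Illustrative only: a few rows hand-copied from the feature table above.
# No pipeline files are read; this just shows the category/type breakdown.
library(dplyr)
library(tibble)

features <- tribble(
  ~feature,                       ~category,        ~type,
  "Median Population Age",        "ACS5",           "numeric",
  "Year Built",                   "Characteristic", "numeric",
  "Central Air Conditioning",     "Characteristic", "categorical",
  "Longitude",                    "Location",       "numeric",
  "Township Code",                "Meta",           "character",
  "Nearest Park Distance (Feet)", "Proximity",      "numeric",
  "Sale Year",                    "Time",           "numeric"
)

# Count the sampled features by category and type
features |>
  count(category, type, sort = TRUE)
```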
@@ -717,7 +717,7 @@ the following major changes to the residential modeling codebase:
   process was moved to
   [pipeline/00-ingest.R](pipeline/00-ingest.R), while the process to
   [finalize model values](https://gitlab.com/ccao-data-science---modeling/processes/finalize_model_values)
-  was moved to [pipeline/06-export.R](pipeline/07-export.R).
+  was moved to [pipeline/07-export.R](pipeline/07-export.R).
 - Added [DVC](https://dvc.org/) support/integration. This repository
   uses DVC in 2 ways:
   1. All input data in [`input/`](input/) is versioned, tracked, and
@@ -779,7 +779,7 @@ the following major changes to the residential modeling codebase:
   workflow to delete test run artifacts in S3 using GitHub Actions.
 - Updated [pipeline/05-finalize](pipeline/05-finalize.R) step to
   render a performance report using Quarto and factored S3/SNS
-  operations out into \[pipeline/06-upload.R\].
+  operations out into [pipeline/06-upload.R](pipeline/06-upload.R).
 
 # Ongoing Issues
 
@@ -1273,7 +1273,7 @@ sped up using the parallel processing built-in to LightGBM. Note that:
 
 ## Updating R dependencies
 
-There are two lockfiles that we use with renv to manage R dependencies:
+We use multiple renv lockfiles to manage R dependencies:
 
 1. **`renv.lock`** is the canonical list of dependencies that are used
    by the **core model pipeline**. Any dependencies that are required
@@ -1295,19 +1295,22 @@ straightforward: Running `renv::install("<package_name>")` and
 `renv::snapshot()` will add the dependency to `renv.lock`, as long as
 it is imported somewhere in the model pipeline via a `library()` call.
 
-The process for updating **model report dependencies** is more complex,
-since it requires the use of a separate `reporting` profile:
+The process for updating **dependencies for other lockfiles** is more
+complex, since it requires the use of a separate profile when running
+renv commands. Determine the name of the profile you’d like to update
+(`<profile_name>` in the code that follows) and run the following
+commands:
 
-1. Run `Sys.setenv(RENV_PROFILE = "reporting")` to set the renv profile
-   to `reporting`
+1. Run `renv::activate(profile = "<profile_name>")` to set the renv
+   profile to `<profile_name>`
 2. Make sure that the dependency is defined in the `DESCRIPTION` file
-   under the `Config/renv/profiles/reporting/dependencies` key
+   under the `Config/renv/profiles/<profile_name>/dependencies` key
 3. Run `renv::install("<package_name>")` to add or update the
    dependency as necessary
 4. Run `renv::snapshot(type = "explicit")` to update the profile’s
   lockfile with the dependencies defined in the `DESCRIPTION` file
-5. Run `Sys.unsetenv("RENV_PROFILE")` to switch the renv profile back
-   to the default
+5. Run `renv::activate()` if you would like to switch back to the
+   default renv profile
 
 ## Troubleshooting
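Taken together, the numbered steps in the final hunk correspond to an R session like the sketch below. This is illustrative only: it assumes a profile named `reporting` (the profile the patch series sets up for report dependencies) and uses `ggplot2` as a stand-in package name; any profile defined in the `DESCRIPTION` file works the same way.

```r
# Sketch of the profile-based renv workflow described above.
# Assumes a "reporting" profile exists and that its dependencies are
# listed under the Config/renv/profiles/reporting/dependencies key in DESCRIPTION.
renv::activate(profile = "reporting")  # switch to the profile's library and lockfile

renv::install("ggplot2")               # add or update a dependency (example package)

renv::snapshot(type = "explicit")      # write DESCRIPTION-declared deps to the profile's lockfile

renv::activate()                       # switch back to the default profile when done
```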