diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 00000000..b41839a2 --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1 @@ +Config/renv/profiles/reporting/dependencies: quarto, leaflet, plotly, sf diff --git a/Dockerfile b/Dockerfile index 86180185..549c5736 100644 --- a/Dockerfile +++ b/Dockerfile @@ -8,7 +8,13 @@ ENV RENV_PATHS_LIBRARY renv/library RUN apt-get update && apt-get install --no-install-recommends -y \ libcurl4-openssl-dev libssl-dev libxml2-dev libgit2-dev git \ libudunits2-dev python3-dev python3-pip libgdal-dev libgeos-dev \ - libproj-dev libfontconfig1-dev libharfbuzz-dev libfribidi-dev pandoc + libproj-dev libfontconfig1-dev libharfbuzz-dev libfribidi-dev pandoc \ + curl gdebi-core + +# Install Quarto +RUN curl -o quarto-linux-amd64.deb -L \ + https://github.com/quarto-dev/quarto-cli/releases/download/v1.3.450/quarto-1.3.450-linux-amd64.deb +RUN gdebi -n quarto-linux-amd64.deb # Install pipenv for Python dependencies RUN pip install pipenv @@ -26,11 +32,13 @@ RUN pipenv install --system --deploy # Copy R bootstrap files into the image COPY renv.lock . +COPY renv/profiles/reporting/renv.lock reporting-renv.lock COPY .Rprofile . 
COPY renv/ renv/ # Install R dependencies RUN Rscript -e 'renv::restore()' +RUN Rscript -e 'renv::restore(lockfile = "reporting-renv.lock")' # Copy the directory into the container ADD ./ model-res-avm/ diff --git a/R/helpers.R b/R/helpers.R index 6bfd069b..db76d059 100644 --- a/R/helpers.R +++ b/R/helpers.R @@ -36,7 +36,10 @@ model_get_s3_artifacts_for_run <- function(run_id, year) { bucket <- strsplit(s3_objs[1], "/")[[1]][3] # First get anything partitioned only by year - s3_objs_limited <- grep(".parquet$|.zip$|.rds$", s3_objs, value = TRUE) %>% + s3_objs_limited <- grep( + ".parquet$|.zip$|.rds$|.html$", s3_objs, + value = TRUE + ) %>% unname() # Next get the prefix of anything partitioned by year and run_id diff --git a/README.Rmd b/README.Rmd index 07055be8..71abcadf 100644 --- a/README.Rmd +++ b/README.Rmd @@ -59,6 +59,7 @@ graph LR evaluate("Evaluate") interpret("Interpret") finalize("Finalize") + upload("Upload") export("Export") ingest --> train @@ -67,8 +68,9 @@ graph LR assess --> evaluate evaluate --> finalize interpret --> finalize - finalize --> aws + finalize --> upload finalize --> export + upload --> aws aws --> ingest aws --> export ``` @@ -87,9 +89,11 @@ All inputs and outputs are stored on AWS S3 using a unique run identifier. Each 4. **Interpret**: Calculate SHAP values for all the estimated values from the assess stage. These are the _per feature_ contribution to the predicted value for an _individual observation_ (usually a single PIN). Also calculate the aggregate feature importance for the entire model. The primary output of this stage is a data frame of the contributions of each feature for each property. -5. **Finalize**: Add metadata and then upload all output objects to AWS (S3). All model outputs for every model run are stored in perpetuity in S3. Each run's performance can be visualized using the CCAO's internal Tableau dashboards. +5. 
**Finalize**: Save run timings and metadata and render a Quarto document containing a model performance report to `reports/performance.html`. -6. **Export**: Export assessed values to Desk Review spreadsheets for Valuations, as well as a delimited text format for upload to the system of record (iasWorld). NOTE: This stage is only run when a final model is selected. It is not run automatically or as part of the main pipeline. +6. **Upload**: Upload all output objects to AWS (S3). All model outputs for every model run are stored in perpetuity in S3. Each run's performance can be visualized using the CCAO's internal Tableau dashboards. NOTE: This stage is only run internally, since it requires access to the CCAO Data AWS account. + +7. **Export**: Export assessed values to Desk Review spreadsheets for Valuations, as well as a delimited text format for upload to the system of record (iasWorld). NOTE: This stage is only run when a final model is selected. It is not run automatically or as part of the main pipeline. ## Choices Made @@ -250,7 +254,10 @@ dbt_manifest <- fromJSON( get_column_description <- function(colname, dag_nodes, hardcoded_descriptions) { # Retrieve the description for a column `colname` either from a set of # dbt DAG nodes (`dag_nodes`) or a set of hardcoded descriptions - # (`hardcoded_descriptions`) + # (`hardcoded_descriptions`). 
Column descriptions that come from dbt DAG nodes + # will be truncated starting from the first period to reflect the fact that + # we use periods in our dbt documentation to separate high-level column + # summaries from their detailed notes # # Prefer the hardcoded descriptions, if they exist if (colname %in% hardcoded_descriptions$column) { @@ -267,7 +274,11 @@ get_column_description <- function(colname, dag_nodes, hardcoded_descriptions) { if (column_name == colname) { description <- node$columns[[column_name]]$description if (!is.null(description) && trimws(description) != "") { - return(gsub("\n", " ", description)) + # Strip everything after the first period, since we use the first + # period as a delimiter separating a column's high-level summary from + # its detailed notes in our dbt docs + summary_description <- strsplit(description, ".", fixed = TRUE)[[1]][1] + return(gsub("\n", " ", summary_description)) } } } @@ -464,7 +475,7 @@ This repository represents a significant departure from the old [residential mod ### [`assessment-year-2022`](https://github.com/ccao-data/model-res-avm/tree/2022-assessment-year) -* Moved previously separate processes into this repository and improved their integration with the overall modeling process. For example, the [etl_res_data](https://gitlab.com/ccao-data-science---modeling/processes/etl_res_data) process was moved to [pipeline/00-ingest.R](pipeline/00-ingest.R), while the process to [finalize model values](https://gitlab.com/ccao-data-science---modeling/processes/finalize_model_values) was moved to [pipeline/06-export.R](pipeline/06-export.R). +* Moved previously separate processes into this repository and improved their integration with the overall modeling process. 
For example, the [etl_res_data](https://gitlab.com/ccao-data-science---modeling/processes/etl_res_data) process was moved to [pipeline/00-ingest.R](pipeline/00-ingest.R), while the process to [finalize model values](https://gitlab.com/ccao-data-science---modeling/processes/finalize_model_values) was moved to [pipeline/07-export.R](pipeline/07-export.R). * Added [DVC](https://dvc.org/) support/integration. This repository uses DVC in 2 ways: 1. All input data in [`input/`](input/) is versioned, tracked, and stored using DVC. Previous input data sets are stored in perpetuity on S3. 2. [DVC pipelines](https://dvc.org/doc/user-guide/project-structure/pipelines-files) are used to sequentially run R pipeline scripts and track/cache inputs and outputs. @@ -487,6 +498,13 @@ This repository represents a significant departure from the old [residential mod * Dropped explicit spatial lag generation in the ingest stage. * Lots of other bugfixes and minor improvements. +### Upcoming + +* Infrastructure improvements + * Added [`build-and-run-model`](https://github.com/ccao-data/model-res-avm/actions/workflows/build-and-run-model.yaml) workflow to run the model using GitHub Actions and AWS Batch. + * Added [`delete-model-run`](https://github.com/ccao-data/model-res-avm/actions/workflows/delete-model-runs.yaml) workflow to delete test run artifacts in S3 using GitHub Actions. + * Updated [pipeline/05-finalize](pipeline/05-finalize.R) step to render a performance report using Quarto and factored S3/SNS operations out into [pipeline/06-upload.R](pipeline/06-upload.R). + # Ongoing Issues The CCAO faces a number of ongoing issues which make modeling difficult. Some of these issues are in the process of being solved; others are less tractable. We list them here for the sake of transparency and to provide a sense of the challenges we face. 
@@ -609,12 +627,15 @@ The code in this repository is written primarily in [R](https://www.r-project.or If you're on Windows, you'll also need to install [Rtools](https://cran.r-project.org/bin/windows/Rtools/) in order to build the necessary packages. You may also want to (optionally) install [DVC](https://dvc.org/doc/install) to pull data and run pipelines. +We also publish a Docker image containing model code and all of the dependencies necessary to run it. If you're comfortable using Docker, you can skip the installation steps below and instead pull the image from `ghcr.io/ccao-data/model-res-avm:master` to run the latest version of the model. + ## Installation 1. Clone this repository using git, or simply download it using the button at the top of the page. 2. Set your working directory to the local folder containing this repository's files, either using R's `setwd()` command or (preferably) using RStudio's [projects](https://support.posit.co/hc/en-us/articles/200526207-Using-Projects). 3. Install `renv`, R's package manager, by running `install.packages("renv")`. 4. Install all R package dependencies using `renv` by running `renv::restore()`. This step may take awhile. Linux users will likely need to install dependencies (via apt, yum, etc.) to build from source. +5. The `finalize` step of the model pipeline requires some additional dependencies for generating a model performance report. Install these additional dependencies by running `renv::restore(lockfile = "renv/profiles/reporting/renv.lock")`. These dependencies must be installed in addition to the core dependencies installed in step 4. If dependencies are not installed, the report will fail to generate and the pipeline stage will print the error message to the report file at `reports/performance.html`; the pipeline will continue to execute in spite of the failure. For installation issues, particularly related to package installation and dependencies, see [Troubleshooting](#troubleshooting). 
@@ -625,8 +646,8 @@ For installation issues, particularly related to package installation and depend To use this repository, simply open the [pipeline/](./pipeline) directory and run the R scripts in order. Non-CCAO users can skip the following stages: * [`pipeline/00-ingest.R`](pipeline/00-ingest.R) - Requires access to CCAO internal AWS services to pull data. See [Getting Data](#getting-data) if you are a member of the public. -* [`pipeline/05-finalize.R`](pipeline/05-finalize.R) - Requires access to CCAO internal AWS services to upload model results. -* [`pipeline/06-export.R`](pipeline/06-export.R) - Only required for CCAO internal processes. +* [`pipeline/06-upload.R`](pipeline/06-upload.R) - Requires access to CCAO internal AWS services to upload model results. +* [`pipeline/07-export.R`](pipeline/07-export.R) - Only required for CCAO internal processes. #### Using DVC @@ -667,7 +688,7 @@ Each R script has a set of associated parameters (tracked via `dvc.yaml`). DVC w ## Output -The full model pipeline produces a large number of outputs. A full list of these outputs and their purpose can be found in [`misc/file_dict.csv`](misc/file_dict.csv). For public users, all outputs are saved in the [`output/`](output/) directory, where they can be further used/examined after a model run. For CCAO employees, all outputs are uploaded to S3 via the [finalize stage](pipeline/05-finalize.R). Uploaded Parquet files are converted into the following Athena tables: +The full model pipeline produces a large number of outputs. A full list of these outputs and their purpose can be found in [`misc/file_dict.csv`](misc/file_dict.csv). For public users, all outputs are saved in the [`output/`](output/) directory, where they can be further used/examined after a model run. For CCAO employees, all outputs are uploaded to S3 via the [upload stage](pipeline/06-upload). 
Uploaded Parquet files are converted into the following Athena tables: #### Athena Tables @@ -743,6 +764,25 @@ Both [Tidymodels](https://tune.tidymodels.org/articles/extras/optimizations.html * The number of threads is set via the [num_threads](https://lightgbm.readthedocs.io/en/latest/Parameters.html#num_threads) parameter, which is passed to the model using the `set_args()` function from `parsnip`. By default, `num_threads` is equal to the full number of physical cores available. More (or faster) cores will decrease total training time. * This repository uses the CPU version of LightGBM included with the [LightGBM R package](https://lightgbm.readthedocs.io/en/latest/R/index.html). If you'd like to use the GPU version you'll need to [build it yourself](https://lightgbm.readthedocs.io/en/latest/R/index.html#installing-a-gpu-enabled-build) or wait for the [upcoming CUDA release](https://github.com/microsoft/LightGBM/issues/5153). +## Updating R dependencies + +We use multiple renv lockfiles in order to manage R dependencies: + +1. **`renv.lock`** is the canonical list of dependencies that are used by the **core model pipeline**. Any dependencies that are required to run the model itself should be defined in this lockfile. +2. **`renv/profiles/reporting/renv.lock`** is the canonical list of dependencies that are used to **generate a model performance report** in the `finalize` step of the pipeline. Any dependencies that are required to generate that report or others like it should be defined in this lockfile. + +Our goal in maintaining multiple lockfiles is to keep the list of dependencies that are required to run the model as short as possible. This choice adds overhead to the process of updating R dependencies, but incurs the benefit of a more maintainable model over the long term. 
+ +The process for **updating core model pipeline dependencies** is straightforward: Running `renv::install("<dependency>")` and `renv::snapshot()` will ensure that the dependency gets added or updated in `renv.lock`, as long as it is imported somewhere in the model pipeline via a `library()` call. + +The process for updating **dependencies for other lockfiles** is more complex, since it requires the use of a separate profile when running renv commands. Determine the name of the profile you'd like to update (`<profile>` in the code that follows) and run the following commands: + +1. Run `renv::activate(profile = "<profile>")` to set the renv profile to `<profile>` +2. Make sure that the dependency is defined in the `DESCRIPTION` file under the `Config/renv/profiles/<profile>/dependencies` key +3. Run `renv::install("<dependency>")` to add or update the dependency as necessary +4. Run `renv::snapshot(type = "explicit")` to update the profile's lockfile with the dependencies defined in the `DESCRIPTION` file +5. Run `renv::activate()` if you would like to switch back to the default renv profile + ## Troubleshooting The dependencies for this repository are numerous and not all of them may install correctly. 
Here are some common install issues (as seen in the R console) as well as their respective resolutions: diff --git a/README.md b/README.md index bec76f7e..64522247 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,7 @@ Table of Contents - [`assessment-year-2021`](#assessment-year-2021) - [`assessment-year-2022`](#assessment-year-2022) - [`assessment-year-2023`](#assessment-year-2023) + - [Upcoming](#upcoming) - [Ongoing Issues](#ongoing-issues) - [Data Quality and Integrity](#data-quality-and-integrity) - [Heterogeneity and Extremes](#heterogeneity-and-extremes) @@ -27,6 +28,7 @@ Table of Contents - [Output](#output) - [Getting Data](#getting-data) - [System Requirements](#system-requirements) + - [Updating R dependencies](#updating-r-dependencies) - [Troubleshooting](#troubleshooting) - [License](#license) - [Contributing](#contributing) @@ -100,6 +102,7 @@ graph LR evaluate("Evaluate") interpret("Interpret") finalize("Finalize") + upload("Upload") export("Export") ingest --> train @@ -108,8 +111,9 @@ graph LR assess --> evaluate evaluate --> finalize interpret --> finalize - finalize --> aws + finalize --> upload finalize --> export + upload --> aws aws --> ingest aws --> export ``` @@ -156,12 +160,17 @@ stand-alone script) or as part of the overall pipeline (with entire model. The primary output of this stage is a data frame of the contributions of each feature for each property. -5. **Finalize**: Add metadata and then upload all output objects to AWS - (S3). All model outputs for every model run are stored in perpetuity - in S3. Each run’s performance can be visualized using the CCAO’s - internal Tableau dashboards. +5. **Finalize**: Save run timings and metadata and render a Quarto + document containing a model performance report to + `reports/performance.html`. -6. **Export**: Export assessed values to Desk Review spreadsheets for +6. **Upload**: Upload all output objects to AWS (S3). All model outputs + for every model run are stored in perpetuity in S3. 
Each run’s + performance can be visualized using the CCAO’s internal Tableau + dashboards. NOTE: This stage is only run internally, since it + requires access to the CCAO Data AWS account. + +7. **Export**: Export assessed values to Desk Review spreadsheets for Valuations, as well as a delimited text format for upload to the system of record (iasWorld). NOTE: This stage is only run when a final model is selected. It is not run automatically or as part of @@ -331,102 +340,102 @@ districts](https://gitlab.com/ccao-data-science---modeling/models/ccao_res_avm/- and many others. The features in the table below are the ones that made the cut. They’re the right combination of easy to understand and impute, powerfully predictive, and well-behaved. Most of them are in use in the -model as of 2023-10-05. - -| Feature Name | Category | Type | Possible Values | Notes | -|:------------------------------------------------------------------------|:---------------|:------------|:-----------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Percent Population Age, Under 19 Years Old | ACS5 | numeric | | Percent of the population 17 years or younger. ACS variable (B01001_003E + B01001_004E + B01001_005E + B01001_006E + B01001_007E + B01001_027E + B01001_028E + B01001_029E + B01001_030E + B01001_031E) / B01001_001E | -| Percent Population Age, Over 65 Years Old | ACS5 | numeric | | Percent of the population 65 years or older. 
ACS variable (B01001_020E + B01001_021E + B01001_022E + B01001_023E + B01001_024E + B01001_025E + B01001_044E + B01001_045E + B01001_046E + B01001_046E + B01001_048E + B01001_049E) / B01001_001E | -| Median Population Age | ACS5 | numeric | | Median age for whole population. ACS variable B01002_001E | -| Percent Population Mobility, In Same House 1 Year Ago | ACS5 | numeric | | Percent of people (older than 1 year) who have not moved in the past 12 months. ACS variable B07003_004E / B07003_001E | -| Percent Population Mobility, Moved From Other State in Past Year | ACS5 | numeric | | Percent of people (older than 1 year) who moved from another state in the past 12 months. ACS variable B07003_013E / B07003_001E | -| Percent Households Family, Married | ACS5 | numeric | | Percent of households that are family, married (married). ACS variable B11001_003E / B11001_001E | -| Percent Households Nonfamily, Living Alone | ACS5 | numeric | | Percent of households that are non-family, alone (single). ACS variable B11001_008E / B11001_001E | -| Percent Population Education, High School Degree | ACS5 | numeric | | Percent of people older than 25 who attained a high school degree. ACS variable (B15002_011E + B15002_028E) / B15002_001E | -| Percent Population Education, Bachelor Degree | ACS5 | numeric | | Percent of people older than 25 who attained a bachelor degree. ACS variable (B15002_015E + B15002_032E) / B15002_001E | -| Percent Population Education, Graduate Degree | ACS5 | numeric | | Percent of people older than 25 who attained a graduate degree. ACS variable (B15002_016E + B15002_017E + B15002_018E + B15002_033E + B15002_034E + B15002_035E) / B15002_001E | -| Percent Population Income, Below Poverty Level | ACS5 | numeric | | Percent of people below poverty level. ACS variable B17001_002E / B17001_001E | -| Median Income, Household in Past Year | ACS5 | numeric | | Median income per household in the past 12 months. 
ACS variable B19013_001E | -| Median Income, Per Capita in Past Year | ACS5 | numeric | | Median income per capita in the past 12 months. ACS variable B19301_001E | -| Percent Population Income, Received SNAP in Past Year | ACS5 | numeric | | Percent of households that received SNAP in the past 12 months. ACS variable B22003_002E / B22003_001E | -| Percent Population Employment, Unemployed | ACS5 | numeric | | Percent of people 16 years and older unemployed. ACS variable B23025_005E / B23025_003E | -| Median Occupied Household, Total, Year Built | ACS5 | numeric | | Median year built for all occupied housing units. ACS variable B25037_001E | -| Median Occupied Household, Renter, Gross Rent | ACS5 | numeric | | Median gross rent for only renter-occupied units. ACS variable B25064_001E | -| Percent Occupied Households, Owner | ACS5 | numeric | | Percent of households that are owner-occupied. ACS variable B25003_002E / B25003_001E | -| Percent Occupied Households, Total, One or More Selected Conditions | ACS5 | numeric | | Selected conditions, including: incomplete plumbing or kitchens, overcrowding, 30% or more of the household income spent on rent or monthly owner costs ACS variable (B25123_003E + B25123_004E + B25123_005E + B25123_006E + B25123_009E + B25123_010E + B25123_011E + B25123_012E) / B25123_001E | -| Percent Population Mobility, Moved From Within Same County in Past Year | ACS5 | numeric | | Percent of people (older than 1 year) who moved in county in the past 12 months. ACS variable B07003_007E / B07003_001E | -| Year Built | Characteristic | numeric | | | -| Central Air Conditioning | Characteristic | categorical | Central A/C, No Central A/C | | -| Apartments | Characteristic | categorical | Two, Three, Four, Five, Six, None | Number of apartments for class 211 and 212 properties. CAUTION: Note the numerically encoded values DO NOT correspond to the number of apartments i.e. 
code 1 means 2 apartments, code 6 means 0 apartments | -| Attic Finish | Characteristic | categorical | Living Area, Partial, None | | -| Attic Type | Characteristic | categorical | Full, Partial, None | | -| Bedrooms | Characteristic | numeric | | Number of bedrooms in the building | -| Building Square Feet | Characteristic | numeric | | Square footage of the building, as measured from the exterior | -| Basement Type | Characteristic | categorical | Full, Slab, Partial, Crawl | | -| Basement Finish | Characteristic | categorical | Formal Rec Room, Apartment, Unfinished | | -| Exterior Wall Material | Characteristic | categorical | Frame, Masonry, Frame + Masonry, Stucco | | -| Full Baths | Characteristic | numeric | | Number of full bathrooms, defined as having a bath or shower. If this value is missing, the default value is set to 1 | -| Fireplaces | Characteristic | numeric | | Number of fireplaces, counted as the number of flues one can see from the outside of the building | -| Garage 1 Area Included | Characteristic | categorical | Yes, No | | -| Garage 1 Attached | Characteristic | categorical | Yes, No | | -| Garage 1 Ext. Wall Material | Characteristic | categorical | Frame, Masonry, Frame + Masonry, Stucco | | -| Garage 1 Size | Characteristic | categorical | 1 cars, 1.5 cars, 2 cars, 2.5 cars, 3 cars, 3.5 cars, 0 cars, 4 cars | | -| Half Baths | Characteristic | numeric | | Number of half baths, defined as bathrooms without a shower or bathtub | -| Land Square Feet | Characteristic | numeric | | Square footage of the land (not just the building) of the property. A single PIN can have multiple “land lines,” meaning it can be associated with more than one 200-class land lot | -| Central Heating | Characteristic | categorical | Warm Air Furnace, Hot Water Steam, Electric Heater, None | | -| Number of Commercial Units | Characteristic | numeric | | Number of commercial units. 
The vast majority are for properties with class 212 | -| Porch | Characteristic | categorical | None, Frame Enclosed, Masonry Enclosed | | -| Roof Material | Characteristic | categorical | Shingle + Asphalt, Tar + Gravel, Slate, Shake, Tile, Other | | -| Rooms | Characteristic | numeric | | Number of total rooms in the building (excluding baths). Not to be confused with bedrooms | -| Cathedral Ceiling | Characteristic | categorical | Yes, No | Deprecated. Field has not been updated recently enough to be useful for modeling | -| Design Plan | Characteristic | categorical | Architect, Stock Plan | | -| Type of Residence | Characteristic | categorical | 1 Story, 2 Story, 3 Story +, Split Level, 1.5 Story, Missing | | -| Recent Renovation | Characteristic | logical | | Indicates whether or not a property was renovated within the last 3 years. Renovation is indicated by the char_renovation characteristic flipping from “NO” to “YES” | -| Longitude | Location | numeric | | PIN location derived from the centroid of the largest polygon associated with the PIN | -| Latitude | Location | numeric | | PIN location derived from the centroid of the largest polygon associated with the PIN | -| Municipality Name | Location | character | | Municipality name for a given PIN. Taken from Cook County GIS shapefiles | -| FEMA Special Flood Hazard Area | Location | logical | | Indicator for a PIN within a FEMA Special Flood Hazard Area. Taken from FEMA site for 2021 only | -| First Street Factor | Location | numeric | | First Street flood factor (risk score) for a given PIN, scores 1 - 10. Provided to the CCAO by firststreet.org | -| First Street Risk Direction | Location | numeric | | First Street risk direction for a given PIN. Positive scores indicate increasing future flood risk, negative scores the opposite. Provided to the CCAO by firststreet.org | -| School Elementary District GEOID | Location | character | | Elementary school district ID for a given PIN. 
For CPS, elementary school attendance boundaries are used. Taken from Cook County GIS shapefiles | -| School Secondary District GEOID | Location | character | | Secondary school district ID for a given PIN. For CPS, secondary school attendance boundaries are used. Taken from Cook County GIS shapefiles | -| CMAP Walkability Score (No Transit) | Location | numeric | | CMAP walkability score for a given PIN, excluding transit walkability. Taken from CMAP’s ON TO 2050 walkability layer | -| CMAP Walkability Total Score | Location | numeric | | CMAP walkability score for a given PIN, including transit walkability. Taken from CMAP’s ON TO 2050 walkability layer | -| Airport Noise DNL | Location | numeric | | Airport noise calculated via kriging noise monitor data from CDA. See for more information | -| Township Code | Meta | character | | Numeric code identifying the Cook County township of a given PIN | -| Neighborhood Code | Meta | character | | Assessor neighborhood. First 2 digits are township code, last 3 digits are neighborhood code | -| Tieback Proration Rate | Meta | numeric | | Proration rate for a given PIN. Some buildings sit across multiple PINs. This number is intended to capture the split in building value | -| Property Group | Meta | categorical | Non-Livable Space, Single-Family, Multi-Family, Condominium, Bed & Breakfast | | -| Property Tax Bill Aggregate Rate | Other | numeric | | Tax bill rate for the taxing district containing a given PIN. Idea is to capture any downward pressure on price from higher tax burdens | -| School District (Elementary) GreatSchools Rating | Other | numeric | | Average Great Schools rating of elementary schools within the district of a given PIN. For CPS, which is a unified school district, the average of schools within attendance boundary is used | -| School District (Secondary) GreatSchools Rating | Other | numeric | | Average Great Schools rating of secondary schools within the district of a given PIN. 
For CPS, which is a unified school district, the average of schools within attendance boundary is used | -| Number of PINs in Half Mile | Proximity | numeric | | Number of PINs within half mile of a given PIN. Condo buildings are counted as a single PIN | -| Number of Bus Stops in Half Mile | Proximity | numeric | | Number of bus stops (CTA or PACE) within half mile of a given PIN. Taken from GTFS feeds retrieved from transitfeeds.com | -| Number of Foreclosures Per 1000 PINs (Past 5 Years) | Proximity | numeric | | Number of PIN-level foreclosure in the past 5 years, per 1000 PINs, within half mile of a given PIN. Taken from Illinois Public Records | -| Number of Schools in Half Mile | Proximity | numeric | | Number of schools within half mile of a given PIN. This includes preschools, small private schools, universities, etc | -| Number of Schools with Rating in Half Mile | Proximity | numeric | | Number of schools with Great Schools ratings within half mile of a given PIN | -| Average School Rating in Half Mile | Proximity | numeric | | Average Great Schools rating for all schools (with a rating) within half mile of a given PIN. Public schools must be within the same district as the PIN to be considered in the average | -| Nearest Bike Trail Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest bike trail (linestring). Taken from Cook County GIS shapefiles | -| Nearest Cemetery Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest cemetery (polygon). Taken from Cook County GIS shapefiles | -| Nearest CTA Route Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest CTA tracks. Taken from CTA GTFS feeds retrieved via transitfeeds.com | -| Nearest CTA Stop Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest CTA stop. Taken from CTA GTFS feeds retrieved via transitfeeds.com | -| Nearest Hospital Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest hospital (polygon). 
Taken from Cook County GIS shapefiles | -| Lake Michigan Distance (Feet) | Proximity | numeric | | Distance in feet to the Lake Michigan coastline. Taken from TIGER/Line coastlines file and filtered to Cook County only | -| Nearest Major Road Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest major road/highway. Pulled via OpenStreetMap, key=highway, value=motorway,primary,trunk | -| Nearest Metra Route Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest Metra tracks. Taken from Metra GTFS feeds retrieved via transitfeeds.com | -| Nearest Metra Stop Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest Metra stop. Taken from Metra GTFS feeds retrieved via transitfeeds.com | -| Nearest Park Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest park. Pull via OpenStreetMap, key=leisure, value=park | -| Nearest Railroad Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest rail (not including CTA). Taken from Cook County GIS shapefiles | -| Nearest Water Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest water. As identified by Cook County hydrology files | -| Nearest Golf Course Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest golf course (polygon). 
Taken from Cook County GIS shapefiles and OpenStreetMap | -| Sale Year | Time | numeric | | Sale year calculated as the number of years since 0 B.C.E | -| Sale Day | Time | numeric | | Sale day calculated as the number of days since January 1st, 1997 | -| Sale Quarter of Year | Time | character | | Character encoding of quarter of year (Q1 - Q4) | -| Sale Month of Year | Time | character | | Character encoding of month of year (Jan - Dec) | -| Sale Day of Year | Time | numeric | | Numeric encoding of day of year (1 - 365) | -| Sale Day of Month | Time | numeric | | Numeric encoding of day of month (1 - 31) | -| Sale Day of Week | Time | numeric | | Numeric encoding of day of week (1 - 7) | -| Sale After COVID-19 | Time | logical | | Indicator for whether sale occurred after COVID-19 was widely publicized (around March 15, 2020) | +model as of 2023-12-01. + +| Feature Name | Category | Type | Possible Values | Notes | +|:------------------------------------------------------------------------|:---------------|:------------|:-----------------------------------------------------------------------------|:------------------------------------------------------------------------------------------------------------------| +| Percent Population Age, Under 19 Years Old | ACS5 | numeric | | Percent of the people 17 years or younger | +| Percent Population Age, Over 65 Years Old | ACS5 | numeric | | Percent of the people 65 years or older | +| Median Population Age | ACS5 | numeric | | Median age for whole population | +| Percent Population Mobility, In Same House 1 Year Ago | ACS5 | numeric | | Percent of people (older than 1 year) who have not moved in the past 12 months | +| Percent Population Mobility, Moved From Other State in Past Year | ACS5 | numeric | | Percent of people (older than 1 year) who moved from another state in the past 12 months | +| Percent Households Family, Married | ACS5 | numeric | | Percent of households that are family, married | +| Percent 
Households Nonfamily, Living Alone | ACS5 | numeric | | Percent of households that are non-family, alone (single) | +| Percent Population Education, High School Degree | ACS5 | numeric | | Percent of people older than 25 who attained a high school degree | +| Percent Population Education, Bachelor Degree | ACS5 | numeric | | Percent of people older than 25 who attained a bachelor’s degree | +| Percent Population Education, Graduate Degree | ACS5 | numeric | | Percent of people older than 25 who attained a graduate degree | +| Percent Population Income, Below Poverty Level | ACS5 | numeric | | Percent of people above the poverty level in the last 12 months | +| Median Income, Household in Past Year | ACS5 | numeric | | Median income per household in the past 12 months | +| Median Income, Per Capita in Past Year | ACS5 | numeric | | Median income per capita in the past 12 months | +| Percent Population Income, Received SNAP in Past Year | ACS5 | numeric | | Percent of households that received SNAP in the past 12 months | +| Percent Population Employment, Unemployed | ACS5 | numeric | | Percent of people 16 years and older unemployed | +| Median Occupied Household, Total, Year Built | ACS5 | numeric | | Median year built for all occupied households | +| Median Occupied Household, Renter, Gross Rent | ACS5 | numeric | | Median gross rent for only renter-occupied units | +| Percent Occupied Households, Owner | ACS5 | numeric | | Percent of households that are owner-occupied | +| Percent Occupied Households, Total, One or More Selected Conditions | ACS5 | numeric | | Percent of occupied households with selected conditions | +| Percent Population Mobility, Moved From Within Same County in Past Year | ACS5 | numeric | | Percent of people (older than 1 year) who moved in county in the past 12 months | +| Year Built | Characteristic | numeric | | Year the property was constructed | +| Central Air Conditioning | Characteristic | categorical | Central A/C, No Central A/C | 
Indicator for central air | +| Apartments | Characteristic | categorical | Two, Three, Four, Five, Six, None | Number of apartments for class 211 and 212 properties | +| Attic Finish | Characteristic | categorical | Living Area, Partial, None | Attic finish | +| Attic Type | Characteristic | categorical | Full, Partial, None | Attic type | +| Bedrooms | Characteristic | numeric | | Number of bedrooms in the building | +| Building Square Feet | Characteristic | numeric | | Square footage of the building, as measured from the exterior | +| Basement Type | Characteristic | categorical | Full, Slab, Partial, Crawl | Basement type | +| Basement Finish | Characteristic | categorical | Formal Rec Room, Apartment, Unfinished | Basement finish | +| Exterior Wall Material | Characteristic | categorical | Frame, Masonry, Frame + Masonry, Stucco | Exterior wall construction | +| Full Baths | Characteristic | numeric | | Number of full bathrooms | +| Fireplaces | Characteristic | numeric | | Number of fireplaces | +| Garage 1 Area Included | Characteristic | categorical | Yes, No | Indicator for garage area inclusion | +| Garage 1 Attached | Characteristic | categorical | Yes, No | Indicator for garage attached | +| Garage 1 Ext. 
Wall Material | Characteristic | categorical | Frame, Masonry, Frame + Masonry, Stucco | Garage exterior wall construction | +| Garage 1 Size | Characteristic | categorical | 1 cars, 1.5 cars, 2 cars, 2.5 cars, 3 cars, 3.5 cars, 0 cars, 4 cars | Garage size (number of cars) | +| Half Baths | Characteristic | numeric | | Number of half baths | +| Land Square Feet | Characteristic | numeric | | Square footage of the land (not just the building) of the property | +| Central Heating | Characteristic | categorical | Warm Air Furnace, Hot Water Steam, Electric Heater, None | Interior heating type | +| Number of Commercial Units | Characteristic | numeric | | Number of commercial units | +| Porch | Characteristic | categorical | None, Frame Enclosed, Masonry Enclosed | Porch type | +| Roof Material | Characteristic | categorical | Shingle + Asphalt, Tar + Gravel, Slate, Shake, Tile, Other | Roof material / construction | +| Rooms | Characteristic | numeric | | Number of total rooms in the building (excluding baths) | +| Cathedral Ceiling | Characteristic | categorical | Yes, No | Deprecated | +| Design Plan | Characteristic | categorical | Architect, Stock Plan | Design plan | +| Type of Residence | Characteristic | categorical | 1 Story, 2 Story, 3 Story +, Split Level, 1.5 Story, Missing | Type of residence | +| Recent Renovation | Characteristic | logical | | Indicates whether or not a property was renovated within the last 3 years | +| Longitude | Location | numeric | | X coordinate in degrees (global longitude) | +| Latitude | Location | numeric | | Y coordinate in degrees (global latitude) | +| Municipality Name | Location | character | | | +| FEMA Special Flood Hazard Area | Location | logical | | FEMA Special Flood Hazard Area, derived from spatial intersection with FEMA floodplain maps | +| First Street Factor | Location | numeric | | First Street flood factor The flood factor is a risk score, where 10 is the highest risk and 1 is the lowest risk | +| First 
Street Risk Direction | Location | numeric | | First Street risk direction | +| School Elementary District GEOID | Location | character | | School district (elementary) GEOID | +| School Secondary District GEOID | Location | character | | School district (secondary) GEOID | +| CMAP Walkability Score (No Transit) | Location | numeric | | CMAP walkability score for a given PIN, excluding transit walkability | +| CMAP Walkability Total Score | Location | numeric | | CMAP walkability score for a given PIN, including transit walkability | +| Airport Noise DNL | Location | numeric | | O’Hare and Midway noise, measured as DNL | +| Township Code | Meta | character | | Cook County township code | +| Neighborhood Code | Meta | character | | Assessor neighborhood code | +| Tieback Proration Rate | Meta | numeric | | Proration rate applied to the PIN | +| Property Group | Meta | categorical | Non-Livable Space, Single-Family, Multi-Family, Condominium, Bed & Breakfast | | +| Property Tax Bill Aggregate Rate | Other | numeric | | Tax bill rate for the taxing district containing a given PIN | +| School District (Elementary) GreatSchools Rating | Other | numeric | | Average GreatSchools rating of elementary schools within the district of a given PIN | +| School District (Secondary) GreatSchools Rating | Other | numeric | | Average GreatSchools rating of secondary schools within the district of a given PIN | +| Number of PINs in Half Mile | Proximity | numeric | | Number of PINs within half mile | +| Number of Bus Stops in Half Mile | Proximity | numeric | | Number of bus stops within half mile | +| Number of Foreclosures Per 1000 PINs (Past 5 Years) | Proximity | numeric | | Number of foreclosures per 1000 PINs, within half mile (past 5 years) | +| Number of Schools in Half Mile | Proximity | numeric | | Number of schools (any kind) within half mile | +| Number of Schools with Rating in Half Mile | Proximity | numeric | | Number of schools (any kind) within half mile | +| Average 
School Rating in Half Mile | Proximity | numeric | | Average school rating of schools within half mile | +| Nearest Bike Trail Distance (Feet) | Proximity | numeric | | Nearest bike trail distance (feet) | +| Nearest Cemetery Distance (Feet) | Proximity | numeric | | Nearest cemetery distance (feet) | +| Nearest CTA Route Distance (Feet) | Proximity | numeric | | Nearest CTA route distance (feet) | +| Nearest CTA Stop Distance (Feet) | Proximity | numeric | | Nearest CTA stop distance (feet) | +| Nearest Hospital Distance (Feet) | Proximity | numeric | | Nearest hospital distance (feet) | +| Lake Michigan Distance (Feet) | Proximity | numeric | | Distance to Lake Michigan shoreline (feet) | +| Nearest Major Road Distance (Feet) | Proximity | numeric | | Nearest major road distance (feet) | +| Nearest Metra Route Distance (Feet) | Proximity | numeric | | Nearest Metra route distance (feet) | +| Nearest Metra Stop Distance (Feet) | Proximity | numeric | | Nearest Metra stop distance (feet) | +| Nearest Park Distance (Feet) | Proximity | numeric | | Nearest park distance (feet) | +| Nearest Railroad Distance (Feet) | Proximity | numeric | | Nearest railroad distance (feet) | +| Nearest Water Distance (Feet) | Proximity | numeric | | Nearest water distance (feet) | +| Nearest Golf Course Distance (Feet) | Proximity | numeric | | Nearest golf course distance (feet) | +| Sale Year | Time | numeric | | Sale year calculated as the number of years since 0 B.C.E | +| Sale Day | Time | numeric | | Sale day calculated as the number of days since January 1st, 1997 | +| Sale Quarter of Year | Time | character | | Character encoding of quarter of year (Q1 - Q4) | +| Sale Month of Year | Time | character | | Character encoding of month of year (Jan - Dec) | +| Sale Day of Year | Time | numeric | | Numeric encoding of day of year (1 - 365) | +| Sale Day of Month | Time | numeric | | Numeric encoding of day of month (1 - 31) | +| Sale Day of Week | Time | numeric | | Numeric 
encoding of day of week (1 - 7) | +| Sale After COVID-19 | Time | logical | | Indicator for whether sale occurred after COVID-19 was widely publicized (around March 15, 2020) | #### Data Sources @@ -708,7 +717,7 @@ the following major changes to the residential modeling codebase: process was moved to [pipeline/00-ingest.R](pipeline/00-ingest.R), while the process to [finalize model values](https://gitlab.com/ccao-data-science---modeling/processes/finalize_model_values) - was moved to [pipeline/06-export.R](pipeline/06-export.R). + was moved to [pipeline/07-export.R](pipeline/07-export.R). - Added [DVC](https://dvc.org/) support/integration. This repository uses DVC in 2 ways: 1. All input data in [`input/`](input/) is versioned, tracked, and @@ -759,6 +768,19 @@ the following major changes to the residential modeling codebase: - Dropped explicit spatial lag generation in the ingest stage. - Lots of other bugfixes and minor improvements. +### Upcoming + +- Infrastructure improvements + - Added + [`build-and-run-model`](https://github.com/ccao-data/model-res-avm/actions/workflows/build-and-run-model.yaml) + workflow to run the model using GitHub Actions and AWS Batch. + - Added + [`delete-model-run`](https://github.com/ccao-data/model-res-avm/actions/workflows/delete-model-runs.yaml) + workflow to delete test run artifacts in S3 using GitHub Actions. + - Updated [pipeline/05-finalize](pipeline/05-finalize.R) step to + render a performance report using Quarto and factored S3/SNS + operations out into [pipeline/06-upload.R](pipeline/06-upload.R). + # Ongoing Issues The CCAO faces a number of ongoing issues which make modeling difficult. @@ -1021,6 +1043,12 @@ If you’re on Windows, you’ll also need to install build the necessary packages. You may also want to (optionally) install [DVC](https://dvc.org/doc/install) to pull data and run pipelines. +We also publish a Docker image containing model code and all of the +dependencies necessary to run it. 
If you’re comfortable using Docker, +you can skip the installation steps below and instead pull the image +from `ghcr.io/ccao-data/model-res-avm:master` to run the latest version +of the model. + ## Installation 1. Clone this repository using git, or simply download it using the @@ -1035,6 +1063,15 @@ build the necessary packages. You may also want to (optionally) install `renv::restore()`. This step may take awhile. Linux users will likely need to install dependencies (via apt, yum, etc.) to build from source. +5. The `finalize` step of the model pipeline requires some additional + dependencies for generating a model performance report. Install + these additional dependencies by running + `renv::restore(lockfile = "renv/profiles/reporting/renv.lock")`. + These dependencies must be installed in addition to the core + dependencies installed in step 4. If dependencies are not installed, + the report will fail to generate and the pipeline stage will print + the error message to the report file at `reports/performance.html`; + the pipeline will continue to execute in spite of the failure. For installation issues, particularly related to package installation and dependencies, see [Troubleshooting](#troubleshooting). @@ -1050,9 +1087,9 @@ following stages: - [`pipeline/00-ingest.R`](pipeline/00-ingest.R) - Requires access to CCAO internal AWS services to pull data. See [Getting Data](#getting-data) if you are a member of the public. -- [`pipeline/05-finalize.R`](pipeline/05-finalize.R) - Requires access - to CCAO internal AWS services to upload model results. -- [`pipeline/06-export.R`](pipeline/06-export.R) - Only required for +- [`pipeline/06-upload.R`](pipeline/06-upload.R) - Requires access to + CCAO internal AWS services to upload model results. +- [`pipeline/07-export.R`](pipeline/07-export.R) - Only required for CCAO internal processes. 
#### Using DVC @@ -1115,9 +1152,8 @@ of these outputs and their purpose can be found in [`misc/file_dict.csv`](misc/file_dict.csv). For public users, all outputs are saved in the [`output/`](output/) directory, where they can be further used/examined after a model run. For CCAO employees, all -outputs are uploaded to S3 via the [finalize -stage](pipeline/05-finalize.R). Uploaded Parquet files are converted -into the following Athena tables: +outputs are uploaded to S3 via the [upload stage](pipeline/06-upload.R). +Uploaded Parquet files are converted into the following Athena tables: #### Athena Tables @@ -1235,6 +1271,47 @@ sped up using the parallel processing built-in to LightGBM. Note that: or wait for the [upcoming CUDA release](https://github.com/microsoft/LightGBM/issues/5153). +## Updating R dependencies + +We use multiple renv lockfiles in order to manage R dependencies: + +1. **`renv.lock`** is the canonical list of dependencies that are used + by the **core model pipeline**. Any dependencies that are required + to run the model itself should be defined in this lockfile. +2. **`renv/profiles/reporting/renv.lock`** is the canonical list of + dependencies that are used to **generate a model performance + report** in the `finalize` step of the pipeline. Any dependencies + that are required to generate that report or others like it should + be defined in this lockfile. + +Our goal in maintaining multiple lockfiles is to keep the list of +dependencies that are required to run the model as short as possible. +This choice adds overhead to the process of updating R dependencies, but +incurs the benefit of a more maintainable model over the long term. + +The process for **updating core model pipeline dependencies** is +straightforward: Running `renv::install("<package>")` and +`renv::snapshot()` will ensure that the dependency gets added or updated +in `renv.lock`, as long as it is imported somewhere in the model +pipeline via a `library()` call. 
+ +The process for updating **dependencies for other lockfiles** is more +complex, since it requires the use of a separate profile when running +renv commands. Determine the name of the profile you’d like to update +(`<profile>` in the code that follows) and run the following +commands: + +1. Run `renv::activate(profile = "<profile>")` to set the renv + profile to `<profile>` +2. Make sure that the dependency is defined in the `DESCRIPTION` file + under the `Config/renv/profiles/<profile>/dependencies` key +3. Run `renv::install("<dependency>")` to add or update the + dependency as necessary +4. Run `renv::snapshot(type = "explicit")` to update the reporting + lockfile with the dependencies defined in the `DESCRIPTION` file +5. Run `renv::activate()` if you would like to switch back to the + default renv profile + ## Troubleshooting The dependencies for this repository are numerous and not all of them diff --git a/dvc.yaml b/dvc.yaml index 861d8a6e..2af54dc4 100755 --- a/dvc.yaml +++ b/dvc.yaml @@ -124,24 +124,9 @@ stages: finalize: cmd: Rscript pipeline/05-finalize.R desc: > - Save run timings, upload pipeline run results to S3, and send an SNS - notification. Will also clean some of the generated outputs prior to - upload and attach a unique run ID + Save run timings and run metadata to disk and render a performance report + using Quarto. 
deps: - - output/parameter_final/model_parameter_final.parquet - - output/parameter_range/model_parameter_range.parquet - - output/parameter_search/model_parameter_search.parquet - - output/workflow/fit/model_workflow_fit.zip - - output/workflow/recipe/model_workflow_recipe.rds - - output/test_card/model_test_card.parquet - - output/assessment_card/model_assessment_card.parquet - - output/assessment_pin/model_assessment_pin.parquet - - output/performance/model_performance_test.parquet - - output/performance_quantile/model_performance_quantile_test.parquet - - output/performance/model_performance_assessment.parquet - - output/performance_quantile/model_performance_quantile_assessment.parquet - - output/shap/model_shap.parquet - - output/feature_importance/model_feature_importance.parquet - output/intermediate/timing/model_timing_train.parquet - output/intermediate/timing/model_timing_assess.parquet - output/intermediate/timing/model_timing_evaluate.parquet @@ -156,13 +141,43 @@ stages: - pv - ratio_study outs: + - output/intermediate/timing/model_timing_finalize.parquet: + cache: false - output/timing/model_timing.parquet: cache: false - output/metadata/model_metadata.parquet: cache: false + - reports/performance.html: + cache: false + + upload: + cmd: Rscript pipeline/06-upload.R + desc: > + Upload performance stats and report to S3, trigger Glue crawlers, and + publish to a model run SNS topic. Will also clean some of the generated + outputs prior to upload and attach a unique run ID. 
This step requires + access to the CCAO Data AWS account, and so is assumed to be internal-only + deps: + - output/parameter_final/model_parameter_final.parquet + - output/parameter_range/model_parameter_range.parquet + - output/parameter_search/model_parameter_search.parquet + - output/workflow/fit/model_workflow_fit.zip + - output/workflow/recipe/model_workflow_recipe.rds + - output/test_card/model_test_card.parquet + - output/assessment_card/model_assessment_card.parquet + - output/assessment_pin/model_assessment_pin.parquet + - output/performance/model_performance_test.parquet + - output/performance_quantile/model_performance_quantile_test.parquet + - output/performance/model_performance_assessment.parquet + - output/performance_quantile/model_performance_quantile_assessment.parquet + - output/shap/model_shap.parquet + - output/feature_importance/model_feature_importance.parquet + - output/metadata/model_metadata.parquet + - output/timing/model_timing.parquet + - reports/performance.html export: - cmd: Rscript pipeline/06-export.R + cmd: Rscript pipeline/07-export.R desc: > Generate Desk Review spreadsheets and iasWorld upload CSVs from a finished run. NOT automatically run since it is typically only run once. 
Manually diff --git a/misc/file_dict.csv b/misc/file_dict.csv index 5e1b0f8d..6826ce75 100644 --- a/misc/file_dict.csv +++ b/misc/file_dict.csv @@ -1,24 +1,25 @@ -type,name,stage_number,stage_name,s3_bucket,path_local,path_s3,athena_table,observation_unit,primary_key,run_type_limited,run_type_full,description,notes -input,training,0,ingest,ccao-data-dvc-us-east-1,input/training_data.parquet,,,pin,"year, meta_pin",No,No,Sales data used to train the model,"Excludes any PINs with multiple cards, therefore unit of observation is PIN" -input,assessment,0,ingest,ccao-data-dvc-us-east-1,input/assessment_data.parquet,,,card,"year, meta_pin, meta_card_num",No,No,Universe of properties that need assessed values,"Card-level, rather than PIN-level" -input,complex_id,0,ingest,ccao-data-dvc-us-east-1,input/complex_id_data.parquet,,,pin,"year, meta_pin",No,No,Programmatically created townhome complex IDs,"Purpose is to assign townhomes in the same ""complex"" the same value. Only run as-needed i.e. as little as possible" -input,land_site_rate,0,ingest,ccao-data-dvc-us-east-1,input/land_site_rate_data.parquet,,,pin,"year, meta_pin",No,No,Site-specific flat land values provided by Valuations,Applies to class 210 and 295 properties only -input,land_nbhd_rate,0,ingest,ccao-data-dvc-us-east-1,input/land_nbhd_rate_data.parquet,,,nbhd,"year, meta_nbhd",No,No,Neighborhood-level land rates (per sqft) provided by Valuations,Applies to all properties (except those with a PIN-specific flat value) -output,parameter_search,1,train,ccao-model-results-us-east-1,output/parameter_search/model_parameter_search.parquet,parameter_search/year={year}/{run_id}.parquet,parameter_search,model cv fold,"year, run_id, configuration, fold_id",No,If CV enabled,Tidymodels tuning output from cross-validation,Each row is the result from one fold assessment from one iteration 
-output,parameter_raw,1,train,ccao-model-results-us-east-1,output/parameter_search/model_parameter_search.parquet,parameter_raw/year={year}/{run_id}.parquet,,model cv fold,"year, run_id, configuration, fold_id",No,If CV enabled,"Raw, nested Tidymodels tuning output",Not useful in Athena but needed to make models reproducible -output,parameter_final,1,train,ccao-model-results-us-east-1,output/parameter_final/model_parameter_final.parquet,parameter_final/year={year}/{run_id}.parquet,parameter_final,model run,"year, run_id",Yes,Yes,Chosen set of hyperparameters for each run,"As chosen by tune::select_best() if using CV, otherwise the default set of hyperparameters specified in params.yaml (model.hyperparameter.default)" -output,parameter_range,1,train,ccao-model-results-us-east-1,output/parameter_range/model_parameter_range.parquet,parameter_range/year={year}/{run_id}.parquet,parameter_range,parameter,"year, run_id, parameter_name",No,If CV enabled,Range of hyperparameters searched during CV tuning,As specified in params.yaml (model.hyperparameter.range) -output,test_card,1,train,ccao-model-results-us-east-1,output/test_card/model_test_card.parquet,test_card/year={year}/{run_id}.parquet,test_card,card,"year, meta_pin, meta_card_num",Yes,Yes,Test set predictions at the card level,Only includes the minimal variables necessary to perform evaluation -output,workflow_fit,1,train,ccao-model-results-us-east-1,output/workflow/fit/model_workflow_fit.zip,workflow/fit/year={year}/{run_id}.zip,,model run,,Yes,Yes,Trained LightGBM model object + Tidymodels specification,Can be loaded with lightsnip::lgbm_load() to produce predictions using new data -output,workflow_recipe,1,train,ccao-model-results-us-east-1,output/workflow/recipe/model_workflow_recipe.rds,workflow/recipe/year={year}/{run_id}.rds,,model run,,Yes,Yes,Trained Tidymodels recipe object used for data preprocessing,Can be used to prepare new data in the same way as the original model training 
-output,assessment_card,2,assess,ccao-model-results-us-east-1,output/assessment_card/model_assessment_card.parquet,assessment_card/,assessment_card,card,"year, run_id, township_code, meta_pin, meta_card_num",No,Yes,Assessment results at the card level AKA raw model output,Also includes card-level characteristics. Multi-card PINs will have more than one row. NOTE: Each run adds new partitions to S3 which must be added via a Glue crawler -output,assessment_pin,2,assess,ccao-model-results-us-east-1,output/assessment_pin/model_assessment_pin.parquet,assessment_pin/,assessment_pin,pin,"year, run_id, township_code, meta_pin",No,Yes,Assessment results at the PIN level AKA aggregated and cleaned,"Aggregation depends on PIN specifics, see assess script for details. Includes PIN-level stats like YoY % changes, land, sales, etc. NOTE: Each run adds new partitions to S3 which must be added via a Glue crawler" -output,performance_test,3,evaluate,ccao-model-results-us-east-1,output/performance/model_performance_test.parquet,performance/year={year}/stage=test/{run_id}.parquet,performance,geography [by class],"year, run_id, stage, geography_type, geography_id, by_class, class",Test only,Test + assessment,Peformance metrics (optionally) broken out by class for different levels of geography,Test set includes the most recent 10% of sales -output,performance_assessment,3,evaluate,ccao-model-results-us-east-1,output/performance/model_performance_assessment.parquet,performance/year={year}/stage=assessment/{run_id}.parquet,performance,geography [by class],"year, run_id, stage, geography_type, geography_id, by_class, class",No,Test + assessment,Peformance metrics (optionally) broken out by class for different levels of geography,Assessment set uses the prior year sales to compare to the assessed value 
-output,performance_quantile_test,3,evaluate,ccao-model-results-us-east-1,output/performance_quantile/model_performance_quantile_test.parquet,performance_quantile/year={year}/stage=test/{run_id}.parquet,performance_quantile,geography [by class] by quantile,"year, run_id, stage, geography_type, geography_id, by_class, class, quantile",Test only,Test + assessment,Performance metrics by quantile within class and geography,Test set includes the most recent 10% of sales -output,performance_quantile_assessment,3,evaluate,ccao-model-results-us-east-1,output/performance_quantile/model_performance_quantile_assessment.parquet,performance_quantile/year={year}/stage=assessment/{run_id}.parquet,performance_quantile,geography [by class] by quantile,"year, run_id, stage, geography_type, geography_id, by_class, class, quantile",No,Test + assessment,Performance metrics by quantile within class and geography,Assessment set uses the prior year sales to compare to the assessed value -output,shap,4,interpret,ccao-model-results-us-east-1,output/shap/model_shap.parquet,shap/,shap,card,"year, run_id, township_code, meta_pin, meta_card_num",No,Yes,SHAP values for each feature for each card in the assessment data,NOTE: Each run adds new partitions to S3 which must be added via a Glue crawler -output,feature_importance,4,interpret,ccao-model-results-us-east-1,output/feature_importance/model_feature_importance.parquet,feature_importance/year={year}/{run_id}.parquet,feature_importance,predictor,"year, run_id, model_predictor_all_name",No,Yes,"Feature importance values (gain, cover, and frequency) for the run", -output,metadata,5,finalize,ccao-model-results-us-east-1,output/metadata/model_metadata.parquet,metadata/year={year}/{run_id}.parquet,metadata,model run,"year, run_id",Yes,Yes,"Information about each run, including parameters, run ID, git info, etc.", -intermediate,timing,,all,,output/intermediate/timing/,,,model stage,"year, msg",Yes,Yes,Parquet files for each stage containing the stage 
time elapsed,Converted into a one-row data frame in the finalize stage -output,timing,,all,ccao-model-results-us-east-1,output/timing/model_timing.parquet,timing/year={year}/{run_id}.parquet,timing,model run,"year, run_id",Yes,Yes,Finalized time elapsed for each stage of the run,"Each row represents one run, while columns represent the stages" \ No newline at end of file +type,name,stage_number,stage_name,s3_bucket,path_local,path_s3,athena_table,observation_unit,primary_key,run_type_limited,run_type_full,description,notes +input,training,0,ingest,ccao-data-dvc-us-east-1,input/training_data.parquet,,,pin,"year, meta_pin",No,No,Sales data used to train the model,"Excludes any PINs with multiple cards, therefore unit of observation is PIN" +input,assessment,0,ingest,ccao-data-dvc-us-east-1,input/assessment_data.parquet,,,card,"year, meta_pin, meta_card_num",No,No,Universe of properties that need assessed values,"Card-level, rather than PIN-level" +input,complex_id,0,ingest,ccao-data-dvc-us-east-1,input/complex_id_data.parquet,,,pin,"year, meta_pin",No,No,Programmatically created townhome complex IDs,"Purpose is to assign townhomes in the same ""complex"" the same value. Only run as-needed i.e. 
as little as possible" +input,land_site_rate,0,ingest,ccao-data-dvc-us-east-1,input/land_site_rate_data.parquet,,,pin,"year, meta_pin",No,No,Site-specific flat land values provided by Valuations,Applies to class 210 and 295 properties only +input,land_nbhd_rate,0,ingest,ccao-data-dvc-us-east-1,input/land_nbhd_rate_data.parquet,,,nbhd,"year, meta_nbhd",No,No,Neighborhood-level land rates (per sqft) provided by Valuations,Applies to all properties (except those with a PIN-specific flat value) +output,parameter_search,1,train,ccao-model-results-us-east-1,output/parameter_search/model_parameter_search.parquet,parameter_search/year={year}/{run_id}.parquet,parameter_search,model cv fold,"year, run_id, configuration, fold_id",No,If CV enabled,Tidymodels tuning output from cross-validation,Each row is the result from one fold assessment from one iteration +output,parameter_raw,1,train,ccao-model-results-us-east-1,output/parameter_search/model_parameter_search.parquet,parameter_raw/year={year}/{run_id}.parquet,,model cv fold,"year, run_id, configuration, fold_id",No,If CV enabled,"Raw, nested Tidymodels tuning output",Not useful in Athena but needed to make models reproducible +output,parameter_final,1,train,ccao-model-results-us-east-1,output/parameter_final/model_parameter_final.parquet,parameter_final/year={year}/{run_id}.parquet,parameter_final,model run,"year, run_id",Yes,Yes,Chosen set of hyperparameters for each run,"As chosen by tune::select_best() if using CV, otherwise the default set of hyperparameters specified in params.yaml (model.hyperparameter.default)" +output,parameter_range,1,train,ccao-model-results-us-east-1,output/parameter_range/model_parameter_range.parquet,parameter_range/year={year}/{run_id}.parquet,parameter_range,parameter,"year, run_id, parameter_name",No,If CV enabled,Range of hyperparameters searched during CV tuning,As specified in params.yaml (model.hyperparameter.range) 
+output,test_card,1,train,ccao-model-results-us-east-1,output/test_card/model_test_card.parquet,test_card/year={year}/{run_id}.parquet,test_card,card,"year, meta_pin, meta_card_num",Yes,Yes,Test set predictions at the card level,Only includes the minimal variables necessary to perform evaluation +output,workflow_fit,1,train,ccao-model-results-us-east-1,output/workflow/fit/model_workflow_fit.zip,workflow/fit/year={year}/{run_id}.zip,,model run,,Yes,Yes,Trained LightGBM model object + Tidymodels specification,Can be loaded with lightsnip::lgbm_load() to produce predictions using new data +output,workflow_recipe,1,train,ccao-model-results-us-east-1,output/workflow/recipe/model_workflow_recipe.rds,workflow/recipe/year={year}/{run_id}.rds,,model run,,Yes,Yes,Trained Tidymodels recipe object used for data preprocessing,Can be used to prepare new data in the same way as the original model training +output,assessment_card,2,assess,ccao-model-results-us-east-1,output/assessment_card/model_assessment_card.parquet,assessment_card/,assessment_card,card,"year, run_id, township_code, meta_pin, meta_card_num",No,Yes,Assessment results at the card level AKA raw model output,Also includes card-level characteristics. Multi-card PINs will have more than one row. NOTE: Each run adds new partitions to S3 which must be added via a Glue crawler +output,assessment_pin,2,assess,ccao-model-results-us-east-1,output/assessment_pin/model_assessment_pin.parquet,assessment_pin/,assessment_pin,pin,"year, run_id, township_code, meta_pin",No,Yes,Assessment results at the PIN level AKA aggregated and cleaned,"Aggregation depends on PIN specifics, see assess script for details. Includes PIN-level stats like YoY % changes, land, sales, etc. 
NOTE: Each run adds new partitions to S3 which must be added via a Glue crawler" +output,performance_test,3,evaluate,ccao-model-results-us-east-1,output/performance/model_performance_test.parquet,performance/year={year}/stage=test/{run_id}.parquet,performance,geography [by class],"year, run_id, stage, geography_type, geography_id, by_class, class",Test only,Test + assessment,Peformance metrics (optionally) broken out by class for different levels of geography,Test set includes the most recent 10% of sales +output,performance_assessment,3,evaluate,ccao-model-results-us-east-1,output/performance/model_performance_assessment.parquet,performance/year={year}/stage=assessment/{run_id}.parquet,performance,geography [by class],"year, run_id, stage, geography_type, geography_id, by_class, class",No,Test + assessment,Peformance metrics (optionally) broken out by class for different levels of geography,Assessment set uses the prior year sales to compare to the assessed value +output,performance_quantile_test,3,evaluate,ccao-model-results-us-east-1,output/performance_quantile/model_performance_quantile_test.parquet,performance_quantile/year={year}/stage=test/{run_id}.parquet,performance_quantile,geography [by class] by quantile,"year, run_id, stage, geography_type, geography_id, by_class, class, quantile",Test only,Test + assessment,Performance metrics by quantile within class and geography,Test set includes the most recent 10% of sales +output,performance_quantile_assessment,3,evaluate,ccao-model-results-us-east-1,output/performance_quantile/model_performance_quantile_assessment.parquet,performance_quantile/year={year}/stage=assessment/{run_id}.parquet,performance_quantile,geography [by class] by quantile,"year, run_id, stage, geography_type, geography_id, by_class, class, quantile",No,Test + assessment,Performance metrics by quantile within class and geography,Assessment set uses the prior year sales to compare to the assessed value 
+output,shap,4,interpret,ccao-model-results-us-east-1,output/shap/model_shap.parquet,shap/,shap,card,"year, run_id, township_code, meta_pin, meta_card_num",No,Yes,SHAP values for each feature for each card in the assessment data,NOTE: Each run adds new partitions to S3 which must be added via a Glue crawler +output,feature_importance,4,interpret,ccao-model-results-us-east-1,output/feature_importance/model_feature_importance.parquet,feature_importance/year={year}/{run_id}.parquet,feature_importance,predictor,"year, run_id, model_predictor_all_name",No,Yes,"Feature importance values (gain, cover, and frequency) for the run", +output,report,5,finalize,ccao-model-results-us-east-1,reports/performance.html,report/year={year}/report_type=performance/{run_id}.html,,model run,,No,Yes,Rendered Quarto doc with model performance statistics, +output,metadata,5,finalize,ccao-model-results-us-east-1,output/metadata/model_metadata.parquet,metadata/year={year}/{run_id}.parquet,metadata,model run,"year, run_id",Yes,Yes,"Information about each run, including parameters, run ID, git info, etc.", +intermediate,timing,,all,,output/intermediate/timing/,,,model stage,"year, msg",Yes,Yes,Parquet files for each stage containing the stage time elapsed,Converted into a one-row data frame in the finalize stage +output,timing,,all,ccao-model-results-us-east-1,output/timing/model_timing.parquet,timing/year={year}/{run_id}.parquet,timing,model run,"year, run_id",Yes,Yes,Finalized time elapsed for each stage of the run,"Each row represents one run, while columns represent the stages" diff --git a/pipeline/05-finalize.R b/pipeline/05-finalize.R index ee03e3d9..b9bd5ddf 100644 --- a/pipeline/05-finalize.R +++ b/pipeline/05-finalize.R @@ -2,21 +2,20 @@ # 1. Setup --------------------------------------------------------------------- #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -# NOTE: This script requires CCAO employee access. 
See wiki for S3 credentials -# setup and multi-factor authentication help +# Start the stage timer and clear logs from prior stage +tictoc::tic.clearlog() +tictoc::tic("Finalize") # Load libraries and scripts suppressPackageStartupMessages({ library(arrow) - library(aws.s3) - library(aws.ec2metadata) library(ccao) library(dplyr) library(here) library(lubridate) - library(paws.application.integration) library(purrr) library(tidyr) + library(tictoc) library(tune) library(yaml) }) @@ -154,10 +153,58 @@ metadata <- tibble::tibble( #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -# 3. Save Timings -------------------------------------------------------------- +# 3. Generate performance report ----------------------------------------------- +#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +# Wrap this block in an error handler so that the pipeline continues execution +# even if report generation fails. This is important because the report file is +# defined separately, so this script can't be sure that it is error-free +tryCatch( + { + suppressPackageStartupMessages({ + library(quarto) + }) + + message("Generating performance report") + + here("reports", "performance.qmd") %>% + quarto_render( + execute_params = list( + run_id = run_id, + year = params$assessment$year + ) + ) + }, + error = function(func) { + message("Encountered error during report generation:") + message(conditionMessage(func)) + + # Save an empty report so that this pipeline step produces the required + # output even in cases of failure + message("Saving an empty report file in order to continue execution") + sink(here("reports", "performance.html")) + cat("Encountered error in report generation:\n\n") + cat(conditionMessage(func)) + sink() + } +) + + + + +#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +# 4. 
Save Timings -------------------------------------------------------------- #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - message("Saving run timings") +# End the stage timer and write the time elapsed to a temporary file +tictoc::toc(log = TRUE) +bind_rows(tictoc::tic.log(format = FALSE)) %>% + arrow::write_parquet(gsub("//*", "/", file.path( + paths$intermediate$timing$local, + "model_timing_finalize.parquet" + ))) + # Filter ensure we only get timing files for stages that actually ran if (run_type == "full") { timings <- list.files( @@ -167,7 +214,7 @@ if (run_type == "full") { } else { timings <- list.files( paste0(paths$intermediate$timing, "/"), - pattern = "train|evaluate", + pattern = "train|evaluate|finalize", full.names = TRUE ) } @@ -182,7 +229,8 @@ timings_df <- purrr::map_dfr(timings, read_parquet) %>% order = recode( msg, "Train" = "01", "Assess" = "02", - "Evaluate" = "03", "Interpret" = "04" + "Evaluate" = "03", "Interpret" = "04", + "Finalize" = "05" ) ) %>% arrange(order) %>% @@ -198,267 +246,3 @@ timings_df <- purrr::map_dfr(timings, read_parquet) %>% # Clear any remaining logs from tictoc tictoc::tic.clearlog() - - - - -#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -# 4. Upload -------------------------------------------------------------------- -#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -message("Uploading run artifacts") - -# Only upload files if explicitly enabled -if (params$toggle$upload_to_s3) { - # Initialize a dictionary of paths AND S3 URIs specific to the run ID and year - paths <- model_file_dict( - run_id = run_id, - year = params$assessment$year - ) - - - ## 4.1. 
Train ---------------------------------------------------------------- - - # Upload lightgbm fit - aws.s3::put_object( - paths$output$workflow_fit$local, - paths$output$workflow_fit$s3 - ) - - # Upload Tidymodels recipe - aws.s3::put_object( - paths$output$workflow_recipe$local, - paths$output$workflow_recipe$s3 - ) - - # Upload finalized run parameters - read_parquet(paths$output$parameter_final$local) %>% - mutate(run_id = run_id) %>% - relocate(run_id) %>% - # Max_depth is set by lightsnip if link_max_depth is true, so we need to - # back out its value. Otherwise, use whichever value is chosen by CV - mutate(max_depth = { - if (link_max_depth) { - as.integer(floor(log2(num_leaves)) + add_to_linked_depth) - } else if (!is.null(.[["max_depth"]])) { - .$max_depth - } else { - NULL - } - }) %>% - write_parquet(paths$output$parameter_final$s3) - - # Upload the test set predictions - read_parquet(paths$output$test_card$local) %>% - mutate(run_id = run_id) %>% - relocate(run_id) %>% - write_parquet(paths$output$test_card$s3) - - # Upload the parameter search objects if CV was enabled. 
Requires some - # cleaning since the Tidymodels output is stored as a nested data frame - if (cv_enable) { - message("Uploading cross-validation artifacts") - - # Upload the raw parameters object to S3 in case we need to use it later - aws.s3::put_object( - paths$output$parameter_raw$local, - paths$output$parameter_raw$s3 - ) - - # Upload the parameter ranges used for CV - read_parquet(paths$output$parameter_range$local) %>% - mutate(run_id = run_id) %>% - relocate(run_id) %>% - write_parquet(paths$output$parameter_range$s3) - - # Clean and unnest the raw parameters data, then write the results to S3 - bind_cols( - read_parquet(paths$output$parameter_raw$local) %>% - tidyr::unnest(cols = .metrics) %>% - mutate(run_id = run_id) %>% - left_join( - rename(., notes = .notes) %>% - tidyr::unnest(cols = notes) %>% - rename(notes = note) - ) %>% - select(-.notes) %>% - rename_with(~ gsub("^\\.", "", .x)) %>% - tidyr::pivot_wider(names_from = "metric", values_from = "estimate") %>% - relocate( - all_of(c( - "run_id", - "iteration" = "iter", - "configuration" = "config", "fold_id" = "id" - )) - ) %>% - relocate(c(location, type, notes), .after = everything()), - read_parquet(paths$output$parameter_raw$local) %>% - tidyr::unnest(cols = .extracts) %>% - tidyr::unnest(cols = .extracts) %>% - dplyr::select(num_iterations = .extracts) - ) %>% - dplyr::select(-any_of(c("estimator")), -extracts) %>% - write_parquet(paths$output$parameter_search$s3) - } - - - # 4.2. Assess ---------------------------------------------------------------- - message("Uploading final assessment results") - - # Upload PIN and card-level values for full runs. 
These outputs are very - # large, so to help reduce file size and improve query performance we - # partition them by year, run ID, and township - if (run_type == "full") { - read_parquet(paths$output$assessment_card$local) %>% - mutate(run_id = run_id, year = params$assessment$year) %>% - group_by(year, run_id, township_code) %>% - arrow::write_dataset( - path = paths$output$assessment_card$s3, - format = "parquet", - hive_style = TRUE, - compression = "snappy" - ) - read_parquet(paths$output$assessment_pin$local) %>% - mutate(run_id = run_id, year = params$assessment$year) %>% - group_by(year, run_id, township_code) %>% - arrow::write_dataset( - path = paths$output$assessment_pin$s3, - format = "parquet", - hive_style = TRUE, - compression = "snappy" - ) - } - - - # 4.3. Evaluate -------------------------------------------------------------- - - # Upload test set performance - message("Uploading test set evaluation") - read_parquet(paths$output$performance_test$local) %>% - mutate(run_id = run_id) %>% - relocate(run_id) %>% - write_parquet(paths$output$performance_test$s3) - read_parquet(paths$output$performance_quantile_test$local) %>% - mutate(run_id = run_id) %>% - relocate(run_id) %>% - write_parquet(paths$output$performance_quantile_test$s3) - - # Upload assessment set performance if a full run - if (run_type == "full") { - message("Uploading assessment set evaluation") - read_parquet(paths$output$performance_assessment$local) %>% - mutate(run_id = run_id) %>% - relocate(run_id) %>% - write_parquet(paths$output$performance_assessment$s3) - read_parquet(paths$output$performance_quantile_assessment$local) %>% - mutate(run_id = run_id) %>% - relocate(run_id) %>% - write_parquet(paths$output$performance_quantile_assessment$s3) - } - - - # 4.4. Interpret ------------------------------------------------------------- - - # Upload SHAP values if a full run. SHAP values are one row per card and one - # column per feature, so the output is very large. 
Therefore, we partition - # the data by year, run, and township - if (run_type == "full" && shap_enable) { - message("Uploading SHAP values") - read_parquet(paths$output$shap$local) %>% - mutate(run_id = run_id, year = params$assessment$year) %>% - group_by(year, run_id, township_code) %>% - arrow::write_dataset( - path = paths$output$shap$s3, - format = "parquet", - hive_style = TRUE, - compression = "snappy" - ) - } - - # Upload feature importance metrics - if (run_type == "full") { - message("Uploading feature importance metrics") - read_parquet(paths$output$feature_importance$local) %>% - mutate(run_id = run_id) %>% - relocate(run_id) %>% - write_parquet(paths$output$feature_importance$s3) - } - - - # 4.5. Finalize -------------------------------------------------------------- - message("Uploading run metadata and timings") - - # Upload metadata - aws.s3::put_object( - paths$output$metadata$local, - paths$output$metadata$s3 - ) - - # Upload finalized timings - aws.s3::put_object( - paths$output$timing$local, - paths$output$timing$s3 - ) -} - - - - -#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -# 5. Wrap-Up ------------------------------------------------------------------- -#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -# This will run a Glue crawler to update schemas and send an email to any SNS -# subscribers. 
Only run when actually uploading -if (params$toggle$upload_to_s3) { - message("Sending run email and running model crawler") - - # If assessments and SHAP values were uploaded, trigger a Glue crawler to find - # any new partitions - if (run_type == "full") { - glue_srv <- paws.analytics::glue() - glue_srv$start_crawler("ccao-model-results-crawler") - } - - # If SNS ARN is available, notify subscribers via email upon run completion - if (!is.na(Sys.getenv("AWS_SNS_ARN_MODEL_STATUS", unset = NA))) { - pipeline_sns <- paws.application.integration::sns( - config = list(region = Sys.getenv("AWS_REGION")) - ) - - # Get pipeline total run time from file - pipeline_sns_total_time <- read_parquet(paths$output$timing$local) %>% - mutate(dur = lubridate::seconds_to_period(round(overall_sec_elapsed))) %>% - dplyr::pull(dur) - - # Get overall stats by township for the triad of interest, collapsed into - # a plaintext table - pipeline_sns_results <- arrow::read_parquet( - paths$output$performance_test$local, - col_select = c("geography_type", "geography_id", "by_class", "cod") - ) %>% - filter( - tolower(town_get_triad(geography_id, name = TRUE)) == - params$assessment$triad, - !by_class, geography_type == "township_code" - ) %>% - mutate(township_name = town_convert(geography_id)) %>% - select(cod, township_name) %>% - mutate(across(where(is.numeric), round, 2)) %>% - arrange(cod) %>% - knitr::kable(format = "rst") %>% - as.character() %>% - .[!grepl("=", .)] %>% - paste0(collapse = "\n") - - # Publish to SNS - pipeline_sns$publish( - Subject = paste("Model Run Complete:", run_id), - Message = paste0( - "Model run: ", run_id, " complete\n", - "Finished in: ", pipeline_sns_total_time, "\n\n", - pipeline_sns_results - ), - TopicArn = Sys.getenv("AWS_SNS_ARN_MODEL_STATUS") - ) - } -} diff --git a/pipeline/06-upload.R b/pipeline/06-upload.R new file mode 100644 index 00000000..c8db021e --- /dev/null +++ b/pipeline/06-upload.R @@ -0,0 +1,320 @@ +#- - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - +# 1. Setup --------------------------------------------------------------------- +#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +# NOTE: This script requires CCAO employee access. See wiki for S3 credentials +# setup and multi-factor authentication help + +# Load libraries and scripts +suppressPackageStartupMessages({ + library(arrow) + library(aws.s3) + library(aws.ec2metadata) + library(dplyr) + library(glue) + library(here) + library(knitr) + library(lubridate) + library(paws.analytics) + library(paws.application.integration) + library(tidyr) + library(yaml) +}) +source(here("R", "helpers.R")) + +# Initialize a dictionary of file paths. See misc/file_dict.csv for details +paths <- model_file_dict() + +# Load the parameters file containing the run settings +params <- read_yaml("params.yaml") + +# Load various overridden parameters as defined in the `finalize` step +metadata <- read_parquet(paths$output$metadata$local) +cv_enable <- metadata$cv_enable +shap_enable <- metadata$shap_enable +run_id <- metadata$run_id +run_type <- metadata$run_type + + + + +#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +# 2. Upload -------------------------------------------------------------------- +#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +message("Uploading run artifacts") + +# Only upload files if explicitly enabled +if (params$toggle$upload_to_s3) { + # Initialize a dictionary of paths AND S3 URIs specific to the run ID and year + paths <- model_file_dict( + run_id = run_id, + year = params$assessment$year + ) + + + ## 2.1. 
Train ---------------------------------------------------------------- + + # Upload lightgbm fit + aws.s3::put_object( + paths$output$workflow_fit$local, + paths$output$workflow_fit$s3 + ) + + # Upload Tidymodels recipe + aws.s3::put_object( + paths$output$workflow_recipe$local, + paths$output$workflow_recipe$s3 + ) + + # Upload finalized run parameters + read_parquet(paths$output$parameter_final$local) %>% + mutate(run_id = run_id) %>% + relocate(run_id) %>% + # Max_depth is set by lightsnip if link_max_depth is true, so we need to + # back out its value. Otherwise, use whichever value is chosen by CV + mutate(max_depth = { + if (link_max_depth) { + as.integer(floor(log2(num_leaves)) + add_to_linked_depth) + } else if (!is.null(.[["max_depth"]])) { + .$max_depth + } else { + NULL + } + }) %>% + write_parquet(paths$output$parameter_final$s3) + + # Upload the test set predictions + read_parquet(paths$output$test_card$local) %>% + mutate(run_id = run_id) %>% + relocate(run_id) %>% + write_parquet(paths$output$test_card$s3) + + # Upload the parameter search objects if CV was enabled. 
Requires some + # cleaning since the Tidymodels output is stored as a nested data frame + if (cv_enable) { + message("Uploading cross-validation artifacts") + + # Upload the raw parameters object to S3 in case we need to use it later + aws.s3::put_object( + paths$output$parameter_raw$local, + paths$output$parameter_raw$s3 + ) + + # Upload the parameter ranges used for CV + read_parquet(paths$output$parameter_range$local) %>% + mutate(run_id = run_id) %>% + relocate(run_id) %>% + write_parquet(paths$output$parameter_range$s3) + + # Clean and unnest the raw parameters data, then write the results to S3 + bind_cols( + read_parquet(paths$output$parameter_raw$local) %>% + tidyr::unnest(cols = .metrics) %>% + mutate(run_id = run_id) %>% + left_join( + rename(., notes = .notes) %>% + tidyr::unnest(cols = notes) %>% + rename(notes = note) + ) %>% + select(-.notes) %>% + rename_with(~ gsub("^\\.", "", .x)) %>% + tidyr::pivot_wider(names_from = "metric", values_from = "estimate") %>% + relocate( + all_of(c( + "run_id", + "iteration" = "iter", + "configuration" = "config", "fold_id" = "id" + )) + ) %>% + relocate(c(location, type, notes), .after = everything()), + read_parquet(paths$output$parameter_raw$local) %>% + tidyr::unnest(cols = .extracts) %>% + tidyr::unnest(cols = .extracts) %>% + dplyr::select(num_iterations = .extracts) + ) %>% + dplyr::select(-any_of(c("estimator")), -extracts) %>% + write_parquet(paths$output$parameter_search$s3) + } + + + # 2.2. Assess ---------------------------------------------------------------- + message("Uploading final assessment results") + + # Upload PIN and card-level values for full runs. 
These outputs are very + # large, so to help reduce file size and improve query performance we + # partition them by year, run ID, and township + if (run_type == "full") { + read_parquet(paths$output$assessment_card$local) %>% + mutate(run_id = run_id, year = params$assessment$year) %>% + group_by(year, run_id, township_code) %>% + arrow::write_dataset( + path = paths$output$assessment_card$s3, + format = "parquet", + hive_style = TRUE, + compression = "snappy" + ) + read_parquet(paths$output$assessment_pin$local) %>% + mutate(run_id = run_id, year = params$assessment$year) %>% + group_by(year, run_id, township_code) %>% + arrow::write_dataset( + path = paths$output$assessment_pin$s3, + format = "parquet", + hive_style = TRUE, + compression = "snappy" + ) + } + + + # 2.3. Evaluate -------------------------------------------------------------- + + # Upload test set performance + message("Uploading test set evaluation") + read_parquet(paths$output$performance_test$local) %>% + mutate(run_id = run_id) %>% + relocate(run_id) %>% + write_parquet(paths$output$performance_test$s3) + read_parquet(paths$output$performance_quantile_test$local) %>% + mutate(run_id = run_id) %>% + relocate(run_id) %>% + write_parquet(paths$output$performance_quantile_test$s3) + + # Upload assessment set performance if a full run + if (run_type == "full") { + message("Uploading assessment set evaluation") + read_parquet(paths$output$performance_assessment$local) %>% + mutate(run_id = run_id) %>% + relocate(run_id) %>% + write_parquet(paths$output$performance_assessment$s3) + read_parquet(paths$output$performance_quantile_assessment$local) %>% + mutate(run_id = run_id) %>% + relocate(run_id) %>% + write_parquet(paths$output$performance_quantile_assessment$s3) + } + + + # 2.4. Interpret ------------------------------------------------------------- + + # Upload SHAP values if a full run. SHAP values are one row per card and one + # column per feature, so the output is very large. 
Therefore, we partition + # the data by year, run, and township + if (run_type == "full" && shap_enable) { + message("Uploading SHAP values") + read_parquet(paths$output$shap$local) %>% + mutate(run_id = run_id, year = params$assessment$year) %>% + group_by(year, run_id, township_code) %>% + arrow::write_dataset( + path = paths$output$shap$s3, + format = "parquet", + hive_style = TRUE, + compression = "snappy" + ) + } + + # Upload feature importance metrics + if (run_type == "full") { + message("Uploading feature importance metrics") + read_parquet(paths$output$feature_importance$local) %>% + mutate(run_id = run_id) %>% + relocate(run_id) %>% + write_parquet(paths$output$feature_importance$s3) + } + + + # 2.5. Finalize -------------------------------------------------------------- + message("Uploading run metadata, timings, and performance report") + + # Upload metadata + aws.s3::put_object( + paths$output$metadata$local, + paths$output$metadata$s3 + ) + + # Upload finalized timings + aws.s3::put_object( + paths$output$timing$local, + paths$output$timing$s3 + ) + + # Upload performance report + aws.s3::put_object( + paths$output$report$local, + paths$output$report$s3 + ) +} + + + + +#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +# 3. Crawl and notify ---------------------------------------------------------- +#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +# This will run a Glue crawler to update schemas and send an email to any SNS +# subscribers. 
Only run when actually uploading +if (params$toggle$upload_to_s3) { + message("Sending run email and running model crawler") + + # If assessments and SHAP values were uploaded, trigger a Glue crawler to find + # any new partitions + if (run_type == "full") { + glue_srv <- paws.analytics::glue() + glue_srv$start_crawler("ccao-model-results-crawler") + } + + # If SNS ARN is available, notify subscribers via email upon run completion + if (!is.na(Sys.getenv("AWS_SNS_ARN_MODEL_STATUS", unset = NA))) { + pipeline_sns <- paws.application.integration::sns( + config = list(region = Sys.getenv("AWS_REGION")) + ) + + # Get pipeline total run time from file + pipeline_sns_total_time <- read_parquet(paths$output$timing$local) %>% + mutate(dur = lubridate::seconds_to_period(round(overall_sec_elapsed))) %>% + dplyr::pull(dur) + + # Get overall stats by township for the triad of interest, collapsed into + # a plaintext table + pipeline_sns_results <- arrow::read_parquet( + paths$output$performance_test$local, + col_select = c("geography_type", "geography_id", "by_class", "cod") + ) %>% + filter( + tolower(town_get_triad(geography_id, name = TRUE)) == + params$assessment$triad, + !by_class, geography_type == "township_code" + ) %>% + mutate(township_name = town_convert(geography_id)) %>% + select(cod, township_name) %>% + mutate(across(where(is.numeric), round, 2)) %>% + arrange(cod) %>% + knitr::kable(format = "rst") %>% + as.character() %>% + .[!grepl("=", .)] %>% + paste0(collapse = "\n") + + # Get a link to the uploaded Quarto report + report_path_parts <- strsplit(paths$output$report$s3[1], "/")[[1]] + report_bucket <- report_path_parts[3] + report_path <- report_path_parts[4:length(report_path_parts)] %>% + paste(collapse = "/") + # Use direct link to the console instead of to the object so that we don't + # have to bother with signed URLs + report_url <- paste0( + "https://s3.console.aws.amazon.com/s3/object/", + "{report_bucket}/{report_path}?region=us-east-1&tab=overview" 
+ ) %>% + glue::glue() + + # Publish to SNS + pipeline_sns$publish( + Subject = paste("Model Run Complete:", run_id), + Message = paste0( + "Model run: ", run_id, " complete\n", + "Finished in: ", pipeline_sns_total_time, "\n\n", + "Report link: ", report_url, "\n\n", + pipeline_sns_results + ), + TopicArn = Sys.getenv("AWS_SNS_ARN_MODEL_STATUS") + ) + } +} diff --git a/pipeline/06-export.R b/pipeline/07-export.R similarity index 100% rename from pipeline/06-export.R rename to pipeline/07-export.R diff --git a/pipeline/07-api.R b/pipeline/08-api.R similarity index 100% rename from pipeline/07-api.R rename to pipeline/08-api.R diff --git a/renv/profiles/reporting/renv.lock b/renv/profiles/reporting/renv.lock new file mode 100644 index 00000000..2b58417f --- /dev/null +++ b/renv/profiles/reporting/renv.lock @@ -0,0 +1,1288 @@ +{ + "R": { + "Version": "4.2.2", + "Repositories": [ + { + "Name": "CRAN", + "URL": "https://packagemanager.posit.co/cran/latest" + } + ] + }, + "Packages": { + "DBI": { + "Package": "DBI", + "Version": "1.1.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "methods" + ], + "Hash": "b2866e62bab9378c3cc9476a1954226b" + }, + "KernSmooth": { + "Package": "KernSmooth", + "Version": "2.23-22", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "stats" + ], + "Hash": "2fecebc3047322fa5930f74fae5de70f" + }, + "MASS": { + "Package": "MASS", + "Version": "7.3-60", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "grDevices", + "graphics", + "methods", + "stats", + "utils" + ], + "Hash": "a56a6365b3fa73293ea8d084be0d9bb0" + }, + "Matrix": { + "Package": "Matrix", + "Version": "1.6-0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "grDevices", + "graphics", + "grid", + "lattice", + "methods", + "stats", + "utils" + ], + "Hash": "31262fd18481fab05c5e7258dac163ca" + }, + "R6": { + "Package": "R6", + "Version": "2.5.1", + "Source": 
"Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "470851b6d5d0ac559e9d01bb352b4021" + }, + "RColorBrewer": { + "Package": "RColorBrewer", + "Version": "1.1-3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "45f0398006e83a5b10b72a90663d8d8c" + }, + "Rcpp": { + "Package": "Rcpp", + "Version": "1.0.11", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "methods", + "utils" + ], + "Hash": "ae6cbbe1492f4de79c45fce06f967ce8" + }, + "askpass": { + "Package": "askpass", + "Version": "1.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "sys" + ], + "Hash": "e8a22846fff485f0be3770c2da758713" + }, + "base64enc": { + "Package": "base64enc", + "Version": "0.1-3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "543776ae6848fde2f48ff3816d0628bc" + }, + "bslib": { + "Package": "bslib", + "Version": "0.5.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "base64enc", + "cachem", + "grDevices", + "htmltools", + "jquerylib", + "jsonlite", + "memoise", + "mime", + "rlang", + "sass" + ], + "Hash": "283015ddfbb9d7bf15ea9f0b5698f0d9" + }, + "cachem": { + "Package": "cachem", + "Version": "1.0.8", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "fastmap", + "rlang" + ], + "Hash": "c35768291560ce302c0a6589f92e837d" + }, + "class": { + "Package": "class", + "Version": "7.3-22", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "MASS", + "R", + "stats", + "utils" + ], + "Hash": "f91f6b29f38b8c280f2b9477787d4bb2" + }, + "classInt": { + "Package": "classInt", + "Version": "0.4-10", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "KernSmooth", + "R", + "class", + "e1071", + "grDevices", + "graphics", + "stats" + ], + "Hash": "f5a40793b1ae463a7ffb3902a95bf864" + }, + "cli": { + "Package": "cli", + "Version": "3.6.1", + "Source": 
"Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "utils" + ], + "Hash": "89e6d8219950eac806ae0c489052048a" + }, + "colorspace": { + "Package": "colorspace", + "Version": "2.1-0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "grDevices", + "graphics", + "methods", + "stats" + ], + "Hash": "f20c47fd52fae58b4e377c37bb8c335b" + }, + "cpp11": { + "Package": "cpp11", + "Version": "0.4.4", + "Source": "Repository", + "Repository": "RSPM", + "Hash": "3f7d8664d7324406cd10cd650ad85e5f" + }, + "crosstalk": { + "Package": "crosstalk", + "Version": "1.2.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R6", + "htmltools", + "jsonlite", + "lazyeval" + ], + "Hash": "6aa54f69598c32177e920eb3402e8293" + }, + "curl": { + "Package": "curl", + "Version": "5.0.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "2118af9cb164c8d2dddc7b89eaf732d9" + }, + "data.table": { + "Package": "data.table", + "Version": "1.14.8", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "methods" + ], + "Hash": "b4c06e554f33344e044ccd7fdca750a9" + }, + "digest": { + "Package": "digest", + "Version": "0.6.33", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "utils" + ], + "Hash": "b18a9cf3c003977b0cc49d5e76ebe48d" + }, + "dplyr": { + "Package": "dplyr", + "Version": "1.1.2", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "R6", + "cli", + "generics", + "glue", + "lifecycle", + "magrittr", + "methods", + "pillar", + "rlang", + "tibble", + "tidyselect", + "utils", + "vctrs" + ], + "Hash": "dea6970ff715ca541c387de363ff405e" + }, + "e1071": { + "Package": "e1071", + "Version": "1.7-13", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "class", + "grDevices", + "graphics", + "methods", + "proxy", + "stats", + "utils" + ], + "Hash": "1046cb48d06cb40c2900d8878f03a0fe" + }, + 
"ellipsis": { + "Package": "ellipsis", + "Version": "0.3.2", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "rlang" + ], + "Hash": "bb0eec2fe32e88d9e2836c2f73ea2077" + }, + "evaluate": { + "Package": "evaluate", + "Version": "0.21", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "methods" + ], + "Hash": "d59f3b464e8da1aef82dc04b588b8dfb" + }, + "fansi": { + "Package": "fansi", + "Version": "1.0.4", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "grDevices", + "utils" + ], + "Hash": "1d9e7ad3c8312a192dea7d3db0274fde" + }, + "farver": { + "Package": "farver", + "Version": "2.1.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "8106d78941f34855c440ddb946b8f7a5" + }, + "fastmap": { + "Package": "fastmap", + "Version": "1.1.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "f7736a18de97dea803bde0a2daaafb27" + }, + "fontawesome": { + "Package": "fontawesome", + "Version": "0.5.2", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "htmltools", + "rlang" + ], + "Hash": "c2efdd5f0bcd1ea861c2d4e2a883a67d" + }, + "fs": { + "Package": "fs", + "Version": "1.6.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "methods" + ], + "Hash": "47b5f30c720c23999b913a1a635cf0bb" + }, + "generics": { + "Package": "generics", + "Version": "0.1.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "methods" + ], + "Hash": "15e9634c0fcd294799e9b2e929ed1b86" + }, + "ggplot2": { + "Package": "ggplot2", + "Version": "3.4.2", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "MASS", + "R", + "cli", + "glue", + "grDevices", + "grid", + "gtable", + "isoband", + "lifecycle", + "mgcv", + "rlang", + "scales", + "stats", + "tibble", + "vctrs", + "withr" + ], + "Hash": "3a147ee02e85a8941aad9909f1b43b7b" + }, + "glue": { + "Package": "glue", + "Version": "1.6.2", + 
"Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "methods" + ], + "Hash": "4f2596dfb05dac67b9dc558e5c6fba2e" + }, + "gridExtra": { + "Package": "gridExtra", + "Version": "2.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "grDevices", + "graphics", + "grid", + "gtable", + "utils" + ], + "Hash": "7d7f283939f563670a697165b2cf5560" + }, + "gtable": { + "Package": "gtable", + "Version": "0.3.3", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "cli", + "glue", + "grid", + "lifecycle", + "rlang" + ], + "Hash": "b44addadb528a0d227794121c00572a0" + }, + "highr": { + "Package": "highr", + "Version": "0.10", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "xfun" + ], + "Hash": "06230136b2d2b9ba5805e1963fa6e890" + }, + "htmltools": { + "Package": "htmltools", + "Version": "0.5.6", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "base64enc", + "digest", + "ellipsis", + "fastmap", + "grDevices", + "rlang", + "utils" + ], + "Hash": "a2326a66919a3311f7fbb1e3bf568283" + }, + "htmlwidgets": { + "Package": "htmlwidgets", + "Version": "1.6.2", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "grDevices", + "htmltools", + "jsonlite", + "knitr", + "rmarkdown", + "yaml" + ], + "Hash": "a865aa85bcb2697f47505bfd70422471" + }, + "httr": { + "Package": "httr", + "Version": "1.4.6", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "R6", + "curl", + "jsonlite", + "mime", + "openssl" + ], + "Hash": "7e5e3cbd2a7bc07880c94e22348fb661" + }, + "isoband": { + "Package": "isoband", + "Version": "0.2.7", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "grid", + "utils" + ], + "Hash": "0080607b4a1a7b28979aecef976d8bc2" + }, + "jquerylib": { + "Package": "jquerylib", + "Version": "0.1.4", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "htmltools" + ], + 
"Hash": "5aab57a3bd297eee1c1d862735972182" + }, + "jsonlite": { + "Package": "jsonlite", + "Version": "1.8.7", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "methods" + ], + "Hash": "266a20443ca13c65688b2116d5220f76" + }, + "knitr": { + "Package": "knitr", + "Version": "1.43", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "evaluate", + "highr", + "methods", + "tools", + "xfun", + "yaml" + ], + "Hash": "9775eb076713f627c07ce41d8199d8f6" + }, + "labeling": { + "Package": "labeling", + "Version": "0.4.2", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "graphics", + "stats" + ], + "Hash": "3d5108641f47470611a32d0bdf357a72" + }, + "later": { + "Package": "later", + "Version": "1.3.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "Rcpp", + "rlang" + ], + "Hash": "40401c9cf2bc2259dfe83311c9384710" + }, + "lattice": { + "Package": "lattice", + "Version": "0.21-8", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "grDevices", + "graphics", + "grid", + "stats", + "utils" + ], + "Hash": "0b8a6d63c8770f02a8b5635f3c431e6b" + }, + "lazyeval": { + "Package": "lazyeval", + "Version": "0.2.2", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "d908914ae53b04d4c0c0fd72ecc35370" + }, + "leaflet": { + "Package": "leaflet", + "Version": "2.2.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "RColorBrewer", + "crosstalk", + "htmltools", + "htmlwidgets", + "jquerylib", + "leaflet.providers", + "magrittr", + "methods", + "png", + "raster", + "scales", + "sp", + "stats", + "viridis", + "xfun" + ], + "Hash": "6e09cb2c9dc2e5a1e71a413e60c3834e" + }, + "leaflet.providers": { + "Package": "leaflet.providers", + "Version": "2.0.0", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "htmltools" + ], + "Hash": "c0b81ad9d5d932772f7a457ac398cf36" + }, + 
"lifecycle": { + "Package": "lifecycle", + "Version": "1.0.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cli", + "glue", + "rlang" + ], + "Hash": "001cecbeac1cff9301bdc3775ee46a86" + }, + "magrittr": { + "Package": "magrittr", + "Version": "2.0.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "7ce2733a9826b3aeb1775d56fd305472" + }, + "memoise": { + "Package": "memoise", + "Version": "2.0.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "cachem", + "rlang" + ], + "Hash": "e2817ccf4a065c5d9d7f2cfbe7c1d78c" + }, + "mgcv": { + "Package": "mgcv", + "Version": "1.9-0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "Matrix", + "R", + "graphics", + "methods", + "nlme", + "splines", + "stats", + "utils" + ], + "Hash": "086028ca0460d0c368028d3bda58f31b" + }, + "mime": { + "Package": "mime", + "Version": "0.12", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "tools" + ], + "Hash": "18e9c28c1d3ca1560ce30658b22ce104" + }, + "munsell": { + "Package": "munsell", + "Version": "0.5.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "colorspace", + "methods" + ], + "Hash": "6dfe8bf774944bd5595785e3229d8771" + }, + "nlme": { + "Package": "nlme", + "Version": "3.1-162", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "graphics", + "lattice", + "stats", + "utils" + ], + "Hash": "0984ce8da8da9ead8643c5cbbb60f83e" + }, + "openssl": { + "Package": "openssl", + "Version": "2.0.6", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "askpass" + ], + "Hash": "0f7cd2962e3044bb940cca4f4b5cecbe" + }, + "packrat": { + "Package": "packrat", + "Version": "0.9.2", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "tools", + "utils" + ], + "Hash": "55ddd2d4a1959535f18393478b0c14a6" + }, + "pillar": { + "Package": "pillar", + "Version": 
"1.9.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "cli", + "fansi", + "glue", + "lifecycle", + "rlang", + "utf8", + "utils", + "vctrs" + ], + "Hash": "15da5a8412f317beeee6175fbc76f4bb" + }, + "pkgconfig": { + "Package": "pkgconfig", + "Version": "2.0.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "utils" + ], + "Hash": "01f28d4278f15c76cddbea05899c5d6f" + }, + "plotly": { + "Package": "plotly", + "Version": "4.10.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "RColorBrewer", + "base64enc", + "crosstalk", + "data.table", + "digest", + "dplyr", + "ggplot2", + "htmltools", + "htmlwidgets", + "httr", + "jsonlite", + "lazyeval", + "magrittr", + "promises", + "purrr", + "rlang", + "scales", + "tibble", + "tidyr", + "tools", + "vctrs", + "viridisLite" + ], + "Hash": "56914cc61df53f2d0283d5498680867e" + }, + "png": { + "Package": "png", + "Version": "0.1-8", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "bd54ba8a0a5faded999a7aab6e46b374" + }, + "processx": { + "Package": "processx", + "Version": "3.8.2", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "R6", + "ps", + "utils" + ], + "Hash": "3efbd8ac1be0296a46c55387aeace0f3" + }, + "promises": { + "Package": "promises", + "Version": "1.2.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R6", + "Rcpp", + "fastmap", + "later", + "magrittr", + "rlang", + "stats" + ], + "Hash": "0d8a15c9d000970ada1ab21405387dee" + }, + "proxy": { + "Package": "proxy", + "Version": "0.4-27", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "stats", + "utils" + ], + "Hash": "e0ef355c12942cf7a6b91a6cfaea8b3e" + }, + "ps": { + "Package": "ps", + "Version": "1.7.5", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "utils" + ], + "Hash": "709d852d33178db54b17c722e5b1e594" + }, + "purrr": { + 
"Package": "purrr", + "Version": "1.0.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cli", + "lifecycle", + "magrittr", + "rlang", + "vctrs" + ], + "Hash": "d71c815267c640f17ddbf7f16144b4bb" + }, + "quarto": { + "Package": "quarto", + "Version": "1.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "jsonlite", + "later", + "processx", + "rmarkdown", + "rsconnect", + "rstudioapi", + "utils", + "yaml" + ], + "Hash": "79e1cff980960b566ddc4ddb1a49a13d" + }, + "rappdirs": { + "Package": "rappdirs", + "Version": "0.3.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "5e3c5dc0b071b21fa128676560dbe94d" + }, + "raster": { + "Package": "raster", + "Version": "3.6-26", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "Rcpp", + "methods", + "sp", + "terra" + ], + "Hash": "7d6eda494f34a644420ac1bfd2a8023a" + }, + "renv": { + "Package": "renv", + "Version": "1.0.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "utils" + ], + "Hash": "c321cd99d56443dbffd1c9e673c0c1a2" + }, + "rlang": { + "Package": "rlang", + "Version": "1.1.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "utils" + ], + "Hash": "a85c767b55f0bf9b7ad16c6d7baee5bb" + }, + "rmarkdown": { + "Package": "rmarkdown", + "Version": "2.25", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "bslib", + "evaluate", + "fontawesome", + "htmltools", + "jquerylib", + "jsonlite", + "knitr", + "methods", + "stringr", + "tinytex", + "tools", + "utils", + "xfun", + "yaml" + ], + "Hash": "d65e35823c817f09f4de424fcdfa812a" + }, + "rsconnect": { + "Package": "rsconnect", + "Version": "1.1.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cli", + "curl", + "digest", + "jsonlite", + "lifecycle", + "openssl", + "packrat", + "renv", + "rlang", + "rstudioapi", + "tools", + "yaml" + ], 
+ "Hash": "672fc66985074d17c86b6335105143b8" + }, + "rstudioapi": { + "Package": "rstudioapi", + "Version": "0.15.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "5564500e25cffad9e22244ced1379887" + }, + "s2": { + "Package": "s2", + "Version": "1.1.4", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "Rcpp", + "wk" + ], + "Hash": "f1cbe03bb3346f8e817518ffa20f9f5a" + }, + "sass": { + "Package": "sass", + "Version": "0.4.7", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R6", + "fs", + "htmltools", + "rappdirs", + "rlang" + ], + "Hash": "6bd4d33b50ff927191ec9acbf52fd056" + }, + "scales": { + "Package": "scales", + "Version": "1.2.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "R6", + "RColorBrewer", + "farver", + "labeling", + "lifecycle", + "munsell", + "rlang", + "viridisLite" + ], + "Hash": "906cb23d2f1c5680b8ce439b44c6fa63" + }, + "sf": { + "Package": "sf", + "Version": "1.0-14", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "DBI", + "R", + "Rcpp", + "classInt", + "grDevices", + "graphics", + "grid", + "magrittr", + "methods", + "s2", + "stats", + "tools", + "units", + "utils" + ], + "Hash": "e2111252a76984ca50bf8d6314348681" + }, + "sp": { + "Package": "sp", + "Version": "2.1-1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "grDevices", + "graphics", + "grid", + "lattice", + "methods", + "stats", + "utils" + ], + "Hash": "e9090fe4ff468d366aa6a76a9b3ec078" + }, + "stringi": { + "Package": "stringi", + "Version": "1.7.12", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "stats", + "tools", + "utils" + ], + "Hash": "ca8bd84263c77310739d2cf64d84d7c9" + }, + "stringr": { + "Package": "stringr", + "Version": "1.5.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cli", + "glue", + "lifecycle", + "magrittr", + "rlang", + "stringi", + "vctrs" 
+ ], + "Hash": "671a4d384ae9d32fc47a14e98bfa3dc8" + }, + "sys": { + "Package": "sys", + "Version": "3.4.2", + "Source": "Repository", + "Repository": "RSPM", + "Hash": "3a1be13d68d47a8cd0bfd74739ca1555" + }, + "terra": { + "Package": "terra", + "Version": "1.7-55", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "Rcpp", + "methods" + ], + "Hash": "c011cc748506148c793428eb8ec101f9" + }, + "tibble": { + "Package": "tibble", + "Version": "3.2.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "fansi", + "lifecycle", + "magrittr", + "methods", + "pillar", + "pkgconfig", + "rlang", + "utils", + "vctrs" + ], + "Hash": "a84e2cc86d07289b3b6f5069df7a004c" + }, + "tidyr": { + "Package": "tidyr", + "Version": "1.3.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cli", + "cpp11", + "dplyr", + "glue", + "lifecycle", + "magrittr", + "purrr", + "rlang", + "stringr", + "tibble", + "tidyselect", + "utils", + "vctrs" + ], + "Hash": "e47debdc7ce599b070c8e78e8ac0cfcf" + }, + "tidyselect": { + "Package": "tidyselect", + "Version": "1.2.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cli", + "glue", + "lifecycle", + "rlang", + "vctrs", + "withr" + ], + "Hash": "79540e5fcd9e0435af547d885f184fd5" + }, + "tinytex": { + "Package": "tinytex", + "Version": "0.47", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "xfun" + ], + "Hash": "8d4ccb733843e513c1c1cdd66a759f0d" + }, + "units": { + "Package": "units", + "Version": "0.8-4", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "Rcpp" + ], + "Hash": "e0fbcea25008a7540c83c2c294135de0" + }, + "utf8": { + "Package": "utf8", + "Version": "1.2.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "1fe17157424bb09c48a8b3b550c753bc" + }, + "vctrs": { + "Package": "vctrs", + "Version": "0.6.3", + "Source": "Repository", + 
"Repository": "CRAN", + "Requirements": [ + "R", + "cli", + "glue", + "lifecycle", + "rlang" + ], + "Hash": "d0ef2856b83dc33ea6e255caf6229ee2" + }, + "viridis": { + "Package": "viridis", + "Version": "0.6.4", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "ggplot2", + "gridExtra", + "viridisLite" + ], + "Hash": "80cd127bc8c9d3d9f0904ead9a9102f1" + }, + "viridisLite": { + "Package": "viridisLite", + "Version": "0.4.2", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "c826c7c4241b6fc89ff55aaea3fa7491" + }, + "withr": { + "Package": "withr", + "Version": "2.5.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "grDevices", + "graphics", + "stats" + ], + "Hash": "c0e49a9760983e81e55cdd9be92e7182" + }, + "wk": { + "Package": "wk", + "Version": "0.9.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "f58cfa8d9c3a78a309d455a647dee853" + }, + "xfun": { + "Package": "xfun", + "Version": "0.39", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "stats", + "tools" + ], + "Hash": "8f56e9acb54fb525e66464d57ab58bcb" + }, + "yaml": { + "Package": "yaml", + "Version": "2.3.7", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "0d0056cc5383fbc240ccd0cb584bf436" + } + } +} diff --git a/renv/profiles/reporting/renv/.gitignore b/renv/profiles/reporting/renv/.gitignore new file mode 100644 index 00000000..0ec0cbba --- /dev/null +++ b/renv/profiles/reporting/renv/.gitignore @@ -0,0 +1,7 @@ +library/ +local/ +cellar/ +lock/ +python/ +sandbox/ +staging/ diff --git a/reports/performance.qmd b/reports/performance.qmd new file mode 100644 index 00000000..88d2f510 --- /dev/null +++ b/reports/performance.qmd @@ -0,0 +1,27 @@ +--- +title: "Model performance for `r params$run_id`" +execute: + echo: false + warning: false +format: + html: + embed-resources: true + toc: true + toc_float: true + fig-align: center + 
fontsize: 12pt +editor: source +params: + run_id: 2023-03-14-clever-damani + year: '2023' +--- + +This document is a stub. + +```{r setup} +library(plotly) +library(leaflet) +library(sf) + +print("All packages loaded!") +```