diff --git a/README.Rmd b/README.Rmd
index ee2eb477..e082e5c7 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -217,20 +217,78 @@ Model accuracy for each parameter combination is measured on a validation set us
The residential model uses a variety of individual and aggregate features to determine a property's assessed value. We've tested a long list of possible features over time, including [walk score](https://gitlab.com/ccao-data-science---modeling/models/ccao_res_avm/-/blob/9407d1fae1986c5ce1f5434aa91d3f8cf06c8ea1/output/test_new_variables/county_walkscore.html), [crime rate](https://gitlab.com/ccao-data-science---modeling/models/ccao_res_avm/-/blob/9407d1fae1986c5ce1f5434aa91d3f8cf06c8ea1/output/test_new_variables/chicago_crimerate.html), [school districts](https://gitlab.com/ccao-data-science---modeling/models/ccao_res_avm/-/blob/9407d1fae1986c5ce1f5434aa91d3f8cf06c8ea1/output/test_new_variables/county_school_boundaries_mean_encoded.html), and many others. The features in the table below are the ones that made the cut. They're the right combination of easy to understand and impute, powerfully predictive, and well-behaved. Most of them are in use in the model as of `r Sys.Date()`.
+
```{r feature_guide, message=FALSE, results='asis', echo=FALSE}
library(dplyr)
library(tidyr)
library(yaml)
+library(jsonlite)
+library(purrr)
+library(tibble)
+
+# Features that are derived inside the model itself never show up in the
+# dbt DAG, so their descriptions have to be maintained by hand here
+hardcoded_descriptions <- enframe(c(
+  sale_year = "Sale year calculated as the number of years since 0 B.C.E",
+  sale_day = "Sale day calculated as the number of days since January 1st, 1997",
+  sale_quarter_of_year = "Character encoding of quarter of year (Q1 - Q4)",
+  sale_month_of_year = "Character encoding of month of year (Jan - Dec)",
+  sale_day_of_year = "Numeric encoding of day of year (1 - 365)",
+  sale_day_of_month = "Numeric encoding of day of month (1 - 31)",
+  sale_day_of_week = "Numeric encoding of day of week (1 - 7)",
+  sale_post_covid = "Indicator for whether sale occurred after COVID-19 was widely publicized (around March 15, 2020)"
+), name = "column", value = "description")
+
+# Fetch the dbt manifest (which contains the DAG) from our prod docs site
+dbt_manifest <- "https://ccao-data.github.io/data-architecture/manifest.json" %>%
+  fromJSON()
+
+get_column_description <- function(colname, dag_nodes, hardcoded_descriptions) {
+  # Look up the description for the column `colname`.
+  #
+  # Hardcoded descriptions (a data frame with `column` and `description`
+  # columns) take precedence; otherwise the first non-blank description
+  # found among `dag_nodes` (a named list of dbt nodes, each with a
+  # `columns` list) is used, with newlines flattened to spaces.
+  # Returns "" when no description is found.
+  hardcoded_idx <- match(colname, hardcoded_descriptions$column)
+  if (!is.na(hardcoded_idx)) {
+    return(hardcoded_descriptions$description[[hardcoded_idx]])
+  }
+  # Visit nodes in sorted-name order so that the node which wins when
+  # several nodes document the same column stays deterministic
+  for (node_name in sort(names(dag_nodes))) {
+    # Index by name instead of scanning every column; `[[` yields NULL
+    # (rather than an error) when the node lacks `columns` or the column
+    description <- dag_nodes[[node_name]]$columns[[colname]]$description
+    if (
+      is.character(description) &&
+        length(description) == 1L &&
+        !is.na(description) &&
+        nzchar(trimws(description))
+    ) {
+      return(gsub("\n", " ", description))
+    }
+  }
+  ""
+}
+
params <- read_yaml("params.yaml")
-ccao::vars_dict %>%
- filter(
- var_is_predictor,
- var_name_model != "meta_sale_price",
- var_model_type %in% c("all", "res")
- ) %>%
+
+param_tbl <- as_tibble(params$model$predictor$all)
+
+# Build the description column for the param tibble: translate each
+# predictor from its model name to its Athena name, then look it up.
+# map_chr() guarantees a character vector with one string per predictor
+param_notes <- param_tbl$value %>%
+  ccao::vars_rename(names_from = "model", names_to = "athena") %>%
+  map_chr(\(x) get_column_description(x, dbt_manifest$nodes, hardcoded_descriptions))
+
+param_tbl %>%
+ add_column(description=param_notes) %>%
inner_join(
- as_tibble(params$model$predictor$all),
- by = c("var_name_model" = "value")
+ ccao::vars_dict,
+ by = c("value" = "var_name_model")
) %>%
group_by(var_name_pretty) %>%
mutate(row = paste0("X", row_number())) %>%
@@ -238,7 +296,7 @@ ccao::vars_dict %>%
`Feature Name` = var_name_pretty,
Category = var_type,
Type = var_data_type,
- Notes = var_notes,
+ Notes = description,
var_value, row
) %>%
mutate(Category = recode(
@@ -253,7 +311,7 @@ ccao::vars_dict %>%
values_from = var_value
) %>%
unite("Possible Values", starts_with("X"), sep = ", ", na.rm = TRUE) %>%
- mutate(Notes = replace_na(Notes, "")) %>%
+ mutate(Notes = replace_na(Notes, "")) %>% # Notes is character, so the fill value must be too
arrange(Category) %>%
relocate(Notes, .after = everything()) %>%
knitr::kable(format = "markdown")
diff --git a/README.md b/README.md
index 18b242cd..e0742637 100644
--- a/README.md
+++ b/README.md
@@ -331,101 +331,102 @@ districts](https://gitlab.com/ccao-data-science---modeling/models/ccao_res_avm/-
and many others. The features in the table below are the ones that made
the cut. They’re the right combination of easy to understand and impute,
powerfully predictive, and well-behaved. Most of them are in use in the
-model as of 2023-07-13.
-
-| Feature Name | Category | Type | Possible Values | Notes |
-|:------------------------------------------------------------------------|:---------------|:------------|:---------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| Percent Population Age, Under 19 Years Old | ACS5 | numeric | | Percent of the population 17 years or younger. (B01001_003E + B01001_004E + B01001_005E + B01001_006E + B01001_007E + B01001_027E + B01001_028E + B01001_029E + B01001_030E + B01001_031E) / B01001_001E |
-| Percent Population Age, Over 65 Years Old | ACS5 | numeric | | Percent of the population 65 years or older. (B01001_020E + B01001_021E + B01001_022E + B01001_023E + B01001_024E + B01001_025E + B01001_044E + B01001_045E + B01001_046E + B01001_046E + B01001_048E + B01001_049E) / B01001_001E |
-| Median Population Age | ACS5 | numeric | | Median age for whole population. B01002_001E |
-| Percent Population Mobility, In Same House 1 Year Ago | ACS5 | numeric | | Percent of people (older than 1 year) who have not moved in the past 12 months. B07003_004E / B07003_001E |
-| Percent Population Mobility, Moved From Other State in Past Year | ACS5 | numeric | | Percent of people (older than 1 year) who moved from another state in the past 12 months. B07003_013E / B07003_001E |
-| Percent Households Family, Married | ACS5 | numeric | | Percent of households that are family, married (married). B11001_003E / B11001_001E |
-| Percent Households Nonfamily, Living Alone | ACS5 | numeric | | Percent of households that are non-family, alone (single). B11001_008E / B11001_001E |
-| Percent Population Education, High School Degree | ACS5 | numeric | | Percent of people older than 25 who attained a high school degree. (B15002_011E + B15002_028E) / B15002_001E |
-| Percent Population Education, Bachelor Degree | ACS5 | numeric | | Percent of people older than 25 who attained a bachelor degree. (B15002_015E + B15002_032E) / B15002_001E |
-| Percent Population Education, Graduate Degree | ACS5 | numeric | | Percent of people older than 25 who attained a graduate degree. (B15002_016E + B15002_017E + B15002_018E + B15002_033E + B15002_034E + B15002_035E) / B15002_001E |
-| Percent Population Income, Below Poverty Level | ACS5 | numeric | | Percent of people below poverty level. B17001_002E / B17001_001E |
-| Median Income, Household in Past Year | ACS5 | numeric | | Median income per household in the past 12 months. B19013_001E |
-| Median Income, Per Capita in Past Year | ACS5 | numeric | | Median income per capita in the past 12 months. B19301_001E |
-| Percent Population Income, Received SNAP in Past Year | ACS5 | numeric | | Percent of households that received SNAP in the past 12 months. B22003_002E / B22003_001E |
-| Percent Population Employment, Unemployed | ACS5 | numeric | | Percent of people 16 years and older unemployed. B23025_005E / B23025_003E |
-| Median Occupied Household, Total, Year Built | ACS5 | numeric | | Median year built for all occupied housing units. B25037_001E |
-| Median Occupied Household, Renter, Gross Rent | ACS5 | numeric | | Median gross rent for only renter-occupied units. B25064_001E |
-| Percent Occupied Households, Owner | ACS5 | numeric | | Percent of households that are owner-occupied. B25003_002E / B25003_001E |
-| Percent Occupied Households, Total, One or More Selected Conditions | ACS5 | numeric | | Selected conditions, including: incomplete plumbing or kitchens, overcrowding, 30% or more of the household income spent on rent or monthly owner costs. (B25123_003E + B25123_004E + B25123_005E + B25123_006E + B25123_009E + B25123_010E + B25123_011E + B25123_012E) / B25123_001E |
-| Percent Population Mobility, Moved From Within Same County in Past Year | ACS5 | numeric | | Percent of people (older than 1 year) who moved in county in the past 12 months. B07003_007E / B07003_001E |
-| Year Built | Characteristic | numeric | | |
-| Central Air Conditioning | Characteristic | categorical | Central A/C, No Central A/C | |
-| Apartments | Characteristic | categorical | Two, Three, Four, Five, Six, None | Number of apartments for class 211 and 212 properties. CAUTION: Note the numerically encoded values DO NOT correspond to the number of apartments i.e. code 1 means 2 apartments, code 6 means 0 apartments |
-| Attic Finish | Characteristic | categorical | Living Area, Partial, None | |
-| Attic Type | Characteristic | categorical | Full, Partial, None | |
-| Bedrooms | Characteristic | numeric | | Number of bedrooms in the building |
-| Building Square Feet | Characteristic | numeric | | Square footage of the building, as measured from the exterior |
-| Basement Type | Characteristic | categorical | Full, Slab, Partial, Crawl | |
-| Basement Finish | Characteristic | categorical | Formal Rec Room, Apartment, Unfinished | |
-| Exterior Wall Material | Characteristic | categorical | Frame, Masonry, Frame + Masonry, Stucco | |
-| Full Baths | Characteristic | numeric | | Number of full bathrooms, defined as having a bath or shower. If this value is missing, the default value is set to 1 |
-| Fireplaces | Characteristic | numeric | | Number of fireplaces, counted as the number of flues one can see from the outside of the building |
-| Garage 1 Area Included | Characteristic | categorical | Yes, No | Is Garage 1 physically included within the building area? If yes, the garage area is subtracted from the building square feet calculation by the field agent |
-| Garage 1 Attached | Characteristic | categorical | Yes, No | |
-| Garage 1 Ext. Wall Material | Characteristic | categorical | Frame, Masonry, Frame + Masonry, Stucco | |
-| Garage 1 Size | Characteristic | categorical | 1 cars, 1.5 cars, 2 cars, 2.5 cars, 3 cars, 3.5 cars, 0 cars, 4 cars | |
-| Half Baths | Characteristic | numeric | | Number of half baths, defined as bathrooms without a shower or bathtub |
-| Land Square Feet | Characteristic | numeric | | Square footage of the land (not just the building) of the property. Note that a single PIN can have multiple “land lines,” meaning it can be associated with more than one 200-class land lot |
-| Central Heating | Characteristic | categorical | Warm Air Furnace, Hot Water Steam, Electric Heater, None | |
-| Number of Commercial Units | Characteristic | numeric | | Number of commercial units (the vast majority are for properties with class 212) |
-| Porch | Characteristic | categorical | None, Frame Enclosed, Masonry Enclosed | |
-| Roof Material | Characteristic | categorical | Shingle + Asphalt, Tar + Gravel, Slate, Shake, Tile, Other | |
-| Rooms | Characteristic | numeric | | Number of total rooms in the building (excluding baths). Not to be confused with bedrooms |
-| Cathedral Ceiling | Characteristic | categorical | Yes, No | Field has not been updated recently enough to be useful for modeling. |
-| Design Plan | Characteristic | categorical | Architect, Stock Plan | |
-| Type of Residence | Characteristic | categorical | 1 Story, 2 Story, 3 Story +, Split Level, 1.5 Story, Missing | |
-| Recent Renovation | Characteristic | logical | | Indicates whether or not a property was renovated within the last 3 years. Renovation is indicated by the char_renovation characteristic flipping from “NO” to “YES” |
-| Longitude | Location | numeric | | PIN location derived from the centroid of the largest polygon associated with the PIN |
-| Latitude | Location | numeric | | PIN location derived from the centroid of the largest polygon associated with the PIN |
-| Municipality Name | Location | character | | Municipality name for a given PIN. Taken from Cook County GIS shapefiles |
-| FEMA Special Flood Hazard Area | Location | logical | | Indicator for a PIN within a FEMA Special Flood Hazard Area. Taken from FEMA site for 2021 only |
-| First Street Factor | Location | numeric | | First Street flood factor (risk score) for a given PIN, scores 1 - 10. Provided to the CCAO by firststreet.org |
-| First Street Risk Direction | Location | numeric | | First Street risk direction for a given PIN. Positive scores indicate increasing future flood risk, negative scores the opposite. Provided to the CCAO by firststreet.org |
-| School Elementary District GEOID | Location | character | | Elementary school district ID for a given PIN. For CPS, elementary school attendance boundaries are used. Taken from Cook County GIS shapefiles |
-| School Secondary District GEOID | Location | character | | Secondary school district ID for a given PIN. For CPS, secondary school attendance boundaries are used. Taken from Cook County GIS shapefiles |
-| CMAP Walkability Score (No Transit) | Location | numeric | | CMAP walkability score for a given PIN, excluding transit walkability. Taken from CMAP’s ON TO 2050 walkability layer |
-| CMAP Walkability Total Score | Location | numeric | | CMAP walkability score for a given PIN, including transit walkability. Taken from CMAP’s ON TO 2050 walkability layer |
-| Airport Noise DNL | Location | numeric | | Airport noise calculated via kriging noise monitor data from CDA. See GitLab issue \#70 in the residential modeling repository for more information |
-| Township Code | Meta | character | | Numeric code identifying the Cook County township of a given PIN |
-| Neighborhood Code | Meta | character | | Assessor neighborhood. First 2 digits are township code, last 3 digits are neighborhood code |
-| Tieback Proration Rate | Meta | numeric | | Proration rate for a given PIN. Some buildings sit across multiple PINs. This number is intended to capture the split in building value |
-| Property Tax Bill Aggregate Rate | Other | numeric | | Tax bill rate for the taxing district containing a given PIN. Idea is to capture any downward pressure on price from higher tax burdens |
-| School District (Elementary) GreatSchools Rating | Other | numeric | | Average Great Schools rating of elementary schools within the district of a given PIN. For CPS, which is a unified school district, the average of schools within attendance boundary is used |
-| School District (Secondary) GreatSchools Rating | Other | numeric | | Average Great Schools rating of secondary schools within the district of a given PIN. For CPS, which is a unified school district, the average of schools within attendance boundary is used |
-| Number of PINs in Half Mile | Proximity | numeric | | Number of PINs within half mile of a given PIN. Condo buildings are counted as a single PIN |
-| Number of Bus Stops in Half Mile | Proximity | numeric | | Number of bus stops (CTA or PACE) within half mile of a given PIN. Taken from GTFS feeds retrieved from transitfeeds.com |
-| Number of Foreclosures Per 1000 PINs (Past 5 Years) | Proximity | numeric | | Number of PIN-level foreclosure in the past 5 years, per 1000 PINs, within half mile of a given PIN. Taken from Illinois Public Records |
-| Number of Schools in Half Mile | Proximity | numeric | | Number of schools within half mile of a given PIN. This includes preschools, small private schools, universities, etc |
-| Number of Schools with Rating in Half Mile | Proximity | numeric | | Number of schools with Great Schools ratings within half mile of a given PIN |
-| Average School Rating in Half Mile | Proximity | numeric | | Average Great Schools rating for all schools (with a rating) within half mile of a given PIN. Public schools must be within the same district as the PIN to be considered in the average |
-| Nearest Bike Trail Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest bike trail (linestring). Taken from Cook County GIS shapefiles |
-| Nearest Cemetery Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest cemetery (polygon). Taken from Cook County GIS shapefiles |
-| Nearest CTA Route Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest CTA tracks. Taken from CTA GTFS feeds retrieved via transitfeeds.com |
-| Nearest CTA Stop Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest CTA stop. Taken from CTA GTFS feeds retrieved via transitfeeds.com |
-| Nearest Hospital Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest hospital (polygon). Taken from Cook County GIS shapefiles |
-| Lake Michigan Distance (Feet) | Proximity | numeric | | Distance in feet to the Lake Michigan coastline. Taken from TIGER/Line coastlines file and filtered to Cook County only |
-| Nearest Major Road Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest major road/highway. Pulled via OpenStreetMap, key=highway, value=motorway,primary,trunk |
-| Nearest Metra Route Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest Metra tracks. Taken from Metra GTFS feeds retrieved via transitfeeds.com |
-| Nearest Metra Stop Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest Metra stop. Taken from Metra GTFS feeds retrieved via transitfeeds.com |
-| Nearest Park Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest park. Pull via OpenStreetMap, key=leisure, value=park |
-| Nearest Railroad Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest rail (not including CTA). Taken from Cook County GIS shapefiles |
-| Nearest Water Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest water, as identified by Cook County hydrology files |
-| Nearest Golf Course Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest golf course (polygon). Taken from Cook County GIS shapefiles and OpenStreetMap |
-| Sale Year | Time | numeric | | Sale year calculated as the number of years since 0 B.C.E |
-| Sale Day | Time | numeric | | Sale day calculated as the number of days since January 1st, 1997 |
-| Sale Quarter of Year | Time | character | | Character encoding of quarter of year (Q1 - Q4) |
-| Sale Month of Year | Time | character | | Character encoding of month of year (Jan - Dec) |
-| Sale Day of Year | Time | numeric | | Numeric encoding of day of year (1 - 365) |
-| Sale Day of Month | Time | numeric | | Numeric encoding of day of month (1 - 31) |
-| Sale Day of Week | Time | numeric | | Numeric encoding of day of week (1 - 7) |
-| Sale After COVID-19 | Time | logical | | Indicator for whether sale occurred after COVID-19 was widely publicized (around March 15, 2020) |
+model as of 2023-10-04.
+
+| Feature Name | Category | Type | Possible Values | Notes |
+|:------------------------------------------------------------------------|:---------------|:------------|:-----------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| Percent Population Age, Under 19 Years Old | ACS5 | numeric | | Percent of the population 17 years or younger. ACS variable (B01001_003E + B01001_004E + B01001_005E + B01001_006E + B01001_007E + B01001_027E + B01001_028E + B01001_029E + B01001_030E + B01001_031E) / B01001_001E |
+| Percent Population Age, Over 65 Years Old | ACS5 | numeric | | Percent of the population 65 years or older. ACS variable (B01001_020E + B01001_021E + B01001_022E + B01001_023E + B01001_024E + B01001_025E + B01001_044E + B01001_045E + B01001_046E + B01001_046E + B01001_048E + B01001_049E) / B01001_001E |
+| Median Population Age | ACS5 | numeric | | Median age for whole population. ACS variable B01002_001E |
+| Percent Population Mobility, In Same House 1 Year Ago | ACS5 | numeric | | Percent of people (older than 1 year) who have not moved in the past 12 months. ACS variable B07003_004E / B07003_001E |
+| Percent Population Mobility, Moved From Other State in Past Year | ACS5 | numeric | | Percent of people (older than 1 year) who moved from another state in the past 12 months. ACS variable B07003_013E / B07003_001E |
+| Percent Households Family, Married | ACS5 | numeric | | Percent of households that are family, married (married). ACS variable B11001_003E / B11001_001E |
+| Percent Households Nonfamily, Living Alone | ACS5 | numeric | | Percent of households that are non-family, alone (single). ACS variable B11001_008E / B11001_001E |
+| Percent Population Education, High School Degree | ACS5 | numeric | | Percent of people older than 25 who attained a high school degree. ACS variable (B15002_011E + B15002_028E) / B15002_001E |
+| Percent Population Education, Bachelor Degree | ACS5 | numeric | | Percent of people older than 25 who attained a bachelor degree. ACS variable (B15002_015E + B15002_032E) / B15002_001E |
+| Percent Population Education, Graduate Degree | ACS5 | numeric | | Percent of people older than 25 who attained a graduate degree. ACS variable (B15002_016E + B15002_017E + B15002_018E + B15002_033E + B15002_034E + B15002_035E) / B15002_001E |
+| Percent Population Income, Below Poverty Level | ACS5 | numeric | | Percent of people below poverty level. ACS variable B17001_002E / B17001_001E |
+| Median Income, Household in Past Year | ACS5 | numeric | | Median income per household in the past 12 months. ACS variable B19013_001E |
+| Median Income, Per Capita in Past Year | ACS5 | numeric | | Median income per capita in the past 12 months. ACS variable B19301_001E |
+| Percent Population Income, Received SNAP in Past Year | ACS5 | numeric | | Percent of households that received SNAP in the past 12 months. ACS variable B22003_002E / B22003_001E |
+| Percent Population Employment, Unemployed | ACS5 | numeric | | Percent of people 16 years and older unemployed. ACS variable B23025_005E / B23025_003E |
+| Median Occupied Household, Total, Year Built | ACS5 | numeric | | Median year built for all occupied housing units. ACS variable B25037_001E |
+| Median Occupied Household, Renter, Gross Rent | ACS5 | numeric | | Median gross rent for only renter-occupied units. ACS variable B25064_001E |
+| Percent Occupied Households, Owner | ACS5 | numeric | | Percent of households that are owner-occupied. ACS variable B25003_002E / B25003_001E |
+| Percent Occupied Households, Total, One or More Selected Conditions | ACS5 | numeric | | Selected conditions, including: incomplete plumbing or kitchens, overcrowding, 30% or more of the household income spent on rent or monthly owner costs. ACS variable (B25123_003E + B25123_004E + B25123_005E + B25123_006E + B25123_009E + B25123_010E + B25123_011E + B25123_012E) / B25123_001E |
+| Percent Population Mobility, Moved From Within Same County in Past Year | ACS5 | numeric | | Percent of people (older than 1 year) who moved in county in the past 12 months. ACS variable B07003_007E / B07003_001E |
+| Year Built | Characteristic | numeric | | |
+| Central Air Conditioning | Characteristic | categorical | Central A/C, No Central A/C | |
+| Apartments | Characteristic | categorical | Two, Three, Four, Five, Six, None | Number of apartments for class 211 and 212 properties. CAUTION: Note the numerically encoded values DO NOT correspond to the number of apartments i.e. code 1 means 2 apartments, code 6 means 0 apartments |
+| Attic Finish | Characteristic | categorical | Living Area, Partial, None | |
+| Attic Type | Characteristic | categorical | Full, Partial, None | |
+| Bedrooms | Characteristic | numeric | | Number of bedrooms in the building |
+| Building Square Feet | Characteristic | numeric | | Square footage of the building, as measured from the exterior |
+| Basement Type | Characteristic | categorical | Full, Slab, Partial, Crawl | |
+| Basement Finish | Characteristic | categorical | Formal Rec Room, Apartment, Unfinished | |
+| Exterior Wall Material | Characteristic | categorical | Frame, Masonry, Frame + Masonry, Stucco | |
+| Full Baths | Characteristic | numeric | | Number of full bathrooms, defined as having a bath or shower. If this value is missing, the default value is set to 1 |
+| Fireplaces | Characteristic | numeric | | Number of fireplaces, counted as the number of flues one can see from the outside of the building |
+| Garage 1 Area Included | Characteristic | categorical | Yes, No | |
+| Garage 1 Attached | Characteristic | categorical | Yes, No | |
+| Garage 1 Ext. Wall Material | Characteristic | categorical | Frame, Masonry, Frame + Masonry, Stucco | |
+| Garage 1 Size | Characteristic | categorical | 1 cars, 1.5 cars, 2 cars, 2.5 cars, 3 cars, 3.5 cars, 0 cars, 4 cars | |
+| Half Baths | Characteristic | numeric | | Number of half baths, defined as bathrooms without a shower or bathtub |
+| Land Square Feet | Characteristic | numeric | | Square footage of the land (not just the building) of the property. A single PIN can have multiple “land lines,” meaning it can be associated with more than one 200-class land lot |
+| Central Heating | Characteristic | categorical | Warm Air Furnace, Hot Water Steam, Electric Heater, None | |
+| Number of Commercial Units | Characteristic | numeric | | Number of commercial units. The vast majority are for properties with class 212 |
+| Porch | Characteristic | categorical | None, Frame Enclosed, Masonry Enclosed | |
+| Roof Material | Characteristic | categorical | Shingle + Asphalt, Tar + Gravel, Slate, Shake, Tile, Other | |
+| Rooms | Characteristic | numeric | | Number of total rooms in the building (excluding baths). Not to be confused with bedrooms |
+| Cathedral Ceiling | Characteristic | categorical | Yes, No | Deprecated. Field has not been updated recently enough to be useful for modeling |
+| Design Plan | Characteristic | categorical | Architect, Stock Plan | |
+| Type of Residence | Characteristic | categorical | 1 Story, 2 Story, 3 Story +, Split Level, 1.5 Story, Missing | |
+| Recent Renovation | Characteristic | logical | | Indicates whether or not a property was renovated within the last 3 years. Renovation is indicated by the char_renovation characteristic flipping from “NO” to “YES” |
+| Longitude | Location | numeric | | PIN location derived from the centroid of the largest polygon associated with the PIN |
+| Latitude | Location | numeric | | PIN location derived from the centroid of the largest polygon associated with the PIN |
+| FEMA Special Flood Hazard Area | Location | logical | | Indicator for a PIN within a FEMA Special Flood Hazard Area. Taken from FEMA site for 2021 only |
+| First Street Factor | Location | numeric | | First Street flood factor (risk score) for a given PIN, scores 1 - 10. Provided to the CCAO by firststreet.org |
+| First Street Risk Direction | Location | numeric | | First Street risk direction for a given PIN. Positive scores indicate increasing future flood risk, negative scores the opposite. Provided to the CCAO by firststreet.org |
+| Airport Noise DNL | Location | numeric | | Airport noise calculated via kriging noise monitor data from CDA. See GitLab issue \#70 in the residential modeling repository for more information |
+| School Elementary District GEOID | Location | character | | Elementary school district ID for a given PIN. For CPS, elementary school attendance boundaries are used. Taken from Cook County GIS shapefiles |
+| School Secondary District GEOID | Location | character | | Secondary school district ID for a given PIN. For CPS, secondary school attendance boundaries are used. Taken from Cook County GIS shapefiles |
+| CMAP Walkability Score (No Transit) | Location | numeric | | CMAP walkability score for a given PIN, excluding transit walkability. Taken from CMAP’s ON TO 2050 walkability layer |
+| CMAP Walkability Total Score | Location | numeric | | CMAP walkability score for a given PIN, including transit walkability. Taken from CMAP’s ON TO 2050 walkability layer |
+| Municipality Name | Location | character | | Municipality name for a given PIN. Taken from Cook County GIS shapefiles |
+| Township Code | Meta | character | | Numeric code identifying the Cook County township of a given PIN |
+| Neighborhood Code | Meta | character | | Assessor neighborhood. First 2 digits are township code, last 3 digits are neighborhood code |
+| Property Group | Meta | categorical | Non-Livable Space, Single-Family, Multi-Family, Condominium, Bed & Breakfast | |
+| Tieback Proration Rate | Meta | numeric | | Proration rate for a given PIN. Some buildings sit across multiple PINs. This number is intended to capture the split in building value |
+| Property Tax Bill Aggregate Rate | Other | numeric | | Tax bill rate for the taxing district containing a given PIN. Idea is to capture any downward pressure on price from higher tax burdens |
+| School District (Elementary) GreatSchools Rating | Other | numeric | | Average Great Schools rating of elementary schools within the district of a given PIN. For CPS, which is a unified school district, the average of schools within attendance boundary is used |
+| School District (Secondary) GreatSchools Rating | Other | numeric | | Average Great Schools rating of secondary schools within the district of a given PIN. For CPS, which is a unified school district, the average of schools within attendance boundary is used |
+| Number of PINs in Half Mile | Proximity | numeric | | Number of PINs within half mile of a given PIN. Condo buildings are counted as a single PIN |
+| Number of Bus Stops in Half Mile | Proximity | numeric | | Number of bus stops (CTA or PACE) within half mile of a given PIN. Taken from GTFS feeds retrieved from transitfeeds.com |
+| Number of Foreclosures Per 1000 PINs (Past 5 Years) | Proximity | numeric | | Number of PIN-level foreclosure in the past 5 years, per 1000 PINs, within half mile of a given PIN. Taken from Illinois Public Records |
+| Number of Schools in Half Mile | Proximity | numeric | | Number of schools within half mile of a given PIN. This includes preschools, small private schools, universities, etc |
+| Number of Schools with Rating in Half Mile | Proximity | numeric | | Number of schools with Great Schools ratings within half mile of a given PIN |
+| Average School Rating in Half Mile | Proximity | numeric | | Average Great Schools rating for all schools (with a rating) within half mile of a given PIN. Public schools must be within the same district as the PIN to be considered in the average |
+| Nearest Bike Trail Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest bike trail (linestring). Taken from Cook County GIS shapefiles |
+| Nearest Cemetery Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest cemetery (polygon). Taken from Cook County GIS shapefiles |
+| Nearest CTA Route Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest CTA tracks. Taken from CTA GTFS feeds retrieved via transitfeeds.com |
+| Nearest CTA Stop Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest CTA stop. Taken from CTA GTFS feeds retrieved via transitfeeds.com |
+| Nearest Hospital Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest hospital (polygon). Taken from Cook County GIS shapefiles |
+| Lake Michigan Distance (Feet) | Proximity | numeric | | Distance in feet to the Lake Michigan coastline. Taken from TIGER/Line coastlines file and filtered to Cook County only |
+| Nearest Major Road Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest major road/highway. Pulled via OpenStreetMap, key=highway, value=motorway,primary,trunk |
+| Nearest Metra Route Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest Metra tracks. Taken from Metra GTFS feeds retrieved via transitfeeds.com |
+| Nearest Metra Stop Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest Metra stop. Taken from Metra GTFS feeds retrieved via transitfeeds.com |
+| Nearest Park Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest park. Pull via OpenStreetMap, key=leisure, value=park |
+| Nearest Railroad Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest rail (not including CTA). Taken from Cook County GIS shapefiles |
+| Nearest Water Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest water, as identified by Cook County hydrology files |
+| Nearest Golf Course Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest golf course (polygon). Taken from Cook County GIS shapefiles and OpenStreetMap |
+| Sale Year | Time | numeric | | Sale year calculated as the number of years since 0 B.C.E |
+| Sale Day | Time | numeric | | Sale day calculated as the number of days since January 1st, 1997 |
+| Sale Quarter of Year | Time | character | | Character encoding of quarter of year (Q1 - Q4) |
+| Sale Month of Year | Time | character | | Character encoding of month of year (Jan - Dec) |
+| Sale Day of Year | Time | numeric | | Numeric encoding of day of year (1 - 365) |
+| Sale Day of Month | Time | numeric | | Numeric encoding of day of month (1 - 31) |
+| Sale Day of Week | Time | numeric | | Numeric encoding of day of week (1 - 7) |
+| Sale After COVID-19 | Time | logical | | Indicator for whether sale occurred after COVID-19 was widely publicized (around March 15, 2020) |
#### Data Sources