diff --git a/README.Rmd b/README.Rmd index ee2eb477..e082e5c7 100644 --- a/README.Rmd +++ b/README.Rmd @@ -217,20 +217,78 @@ Model accuracy for each parameter combination is measured on a validation set us The residential model uses a variety of individual and aggregate features to determine a property's assessed value. We've tested a long list of possible features over time, including [walk score](https://gitlab.com/ccao-data-science---modeling/models/ccao_res_avm/-/blob/9407d1fae1986c5ce1f5434aa91d3f8cf06c8ea1/output/test_new_variables/county_walkscore.html), [crime rate](https://gitlab.com/ccao-data-science---modeling/models/ccao_res_avm/-/blob/9407d1fae1986c5ce1f5434aa91d3f8cf06c8ea1/output/test_new_variables/chicago_crimerate.html), [school districts](https://gitlab.com/ccao-data-science---modeling/models/ccao_res_avm/-/blob/9407d1fae1986c5ce1f5434aa91d3f8cf06c8ea1/output/test_new_variables/county_school_boundaries_mean_encoded.html), and many others. The features in the table below are the ones that made the cut. They're the right combination of easy to understand and impute, powerfully predictive, and well-behaved. Most of them are in use in the model as of `r Sys.Date()`. 
+
 ```{r feature_guide, message=FALSE, results='asis', echo=FALSE}
 library(dplyr)
 library(tidyr)
 library(yaml)
+library(jsonlite)
+library(purrr)
+library(tibble)
+
+# Some values are derived in the model itself, so they are not documented
+# in the dbt DAG and need to be documented here
+hardcoded_descriptions <- tribble(
+  ~"column", ~"description",
+  "sale_year", "Sale year calculated as the number of years since 0 B.C.E",
+  "sale_day", "Sale day calculated as the number of days since January 1st, 1997",
+  "sale_quarter_of_year", "Character encoding of quarter of year (Q1 - Q4)",
+  "sale_month_of_year", "Character encoding of month of year (Jan - Dec)",
+  "sale_day_of_year", "Numeric encoding of day of year (1 - 365)",
+  "sale_day_of_month", "Numeric encoding of day of month (1 - 31)",
+  "sale_day_of_week", "Numeric encoding of day of week (1 - 7)",
+  "sale_post_covid", "Indicator for whether sale occurred after COVID-19 was widely publicized (around March 15, 2020)"
+)
+
+# Load the dbt DAG from our prod docs site
+dbt_manifest <- fromJSON("https://ccao-data.github.io/data-architecture/manifest.json")
+
+# Look up the human-readable description for the column `colname`.
+#
+# `hardcoded_descriptions` (a data frame with `column` and `description`
+# columns) takes precedence; otherwise each node in `dag_nodes` (the `nodes`
+# element of the dbt manifest) is checked for a column of that name with a
+# non-blank description. Returns one string, or "" when nothing matches.
+get_column_description <- function(colname, dag_nodes, hardcoded_descriptions) {
+  # Prefer the hardcoded descriptions, if they exist
+  hardcoded_idx <- match(colname, hardcoded_descriptions$column)
+  if (!is.na(hardcoded_idx)) {
+    return(hardcoded_descriptions$description[[hardcoded_idx]])
+  }
+  # Otherwise fall back to the dbt DAG. Index the lists by name rather than
+  # walking them with `ls()`, which errors on a NULL `columns` entry and hides
+  # names that start with a dot. Sorting keeps the node order `ls()` gave, so
+  # the same node still wins when several nodes document the same column
+  for (node_name in sort(names(dag_nodes))) {
+    found <- dag_nodes[[node_name]][["columns"]][[colname]][["description"]]
+    has_text <- is.character(found) && length(found) == 1L &&
+      !is.na(found) && nzchar(trimws(found))
+    if (has_text) {
+      return(gsub("\n", " ", found, fixed = TRUE))
+    }
+  }
+  # No match in either the hardcoded descriptions or the dbt DAG, so fall
+  # back to an empty string
+  ""
+}
+
 params <- read_yaml("params.yaml")
-ccao::vars_dict %>%
-  filter(
-    var_is_predictor,
-    var_name_model != "meta_sale_price",
-    var_model_type %in% c("all", "res")
-  ) %>%
+
+param_tbl <- as_tibble(params$model$predictor$all)
+
+# Make a vector of column descriptions that we can add to the param tibble
+# as a new column. `map_chr()` fails loudly unless every lookup yields one
+# string, so descriptions can never silently shift onto the wrong rows
+param_notes <- param_tbl$value %>%
+  ccao::vars_rename(names_from = "model", names_to = "athena") %>%
+  map_chr(
+    \(x) get_column_description(x, dbt_manifest$nodes, hardcoded_descriptions)
+  )
+
+param_tbl %>%
+  add_column(description = param_notes) %>%
   inner_join(
-    as_tibble(params$model$predictor$all),
-    by = c("var_name_model" = "value")
+    ccao::vars_dict,
+    by = c("value" = "var_name_model")
   ) %>%
   group_by(var_name_pretty) %>%
   mutate(row = paste0("X", row_number())) %>%
@@ -238,7 +296,7 @@ ccao::vars_dict %>%
     `Feature Name` = var_name_pretty,
     Category = var_type,
     Type = var_data_type,
-    Notes = var_notes,
+    Notes = description,
     var_value, row
   ) %>%
   mutate(Category = recode(
@@ -253,7 +311,7 @@ ccao::vars_dict %>%
     values_from = var_value
   ) %>%
   unite("Possible Values", starts_with("X"), sep = ", ", na.rm = TRUE) %>%
-  mutate(Notes = replace_na(Notes, "")) %>%
+  mutate(Notes = replace_na(Notes, replace = "")) %>%
   arrange(Category) %>%
   relocate(Notes, .after = everything()) %>%
   knitr::kable(format = "markdown")
diff --git a/README.md b/README.md
index 18b242cd..e0742637 100644
--- a/README.md
+++ b/README.md
@@ -331,101 +331,102 @@ districts](https://gitlab.com/ccao-data-science---modeling/models/ccao_res_avm/-
 and many others. The features in the table below are the ones that made
 the cut. They’re the right combination of easy to understand and impute,
 powerfully predictive, and well-behaved. Most of them are in use in the
-model as of 2023-07-13.
- -| Feature Name | Category | Type | Possible Values | Notes | -|:------------------------------------------------------------------------|:---------------|:------------|:---------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Percent Population Age, Under 19 Years Old | ACS5 | numeric | | Percent of the population 17 years or younger. (B01001_003E + B01001_004E + B01001_005E + B01001_006E + B01001_007E + B01001_027E + B01001_028E + B01001_029E + B01001_030E + B01001_031E) / B01001_001E | -| Percent Population Age, Over 65 Years Old | ACS5 | numeric | | Percent of the population 65 years or older. (B01001_020E + B01001_021E + B01001_022E + B01001_023E + B01001_024E + B01001_025E + B01001_044E + B01001_045E + B01001_046E + B01001_046E + B01001_048E + B01001_049E) / B01001_001E | -| Median Population Age | ACS5 | numeric | | Median age for whole population. B01002_001E | -| Percent Population Mobility, In Same House 1 Year Ago | ACS5 | numeric | | Percent of people (older than 1 year) who have not moved in the past 12 months. B07003_004E / B07003_001E | -| Percent Population Mobility, Moved From Other State in Past Year | ACS5 | numeric | | Percent of people (older than 1 year) who moved from another state in the past 12 months. B07003_013E / B07003_001E | -| Percent Households Family, Married | ACS5 | numeric | | Percent of households that are family, married (married). B11001_003E / B11001_001E | -| Percent Households Nonfamily, Living Alone | ACS5 | numeric | | Percent of households that are non-family, alone (single). 
B11001_008E / B11001_001E | -| Percent Population Education, High School Degree | ACS5 | numeric | | Percent of people older than 25 who attained a high school degree. (B15002_011E + B15002_028E) / B15002_001E | -| Percent Population Education, Bachelor Degree | ACS5 | numeric | | Percent of people older than 25 who attained a bachelor degree. (B15002_015E + B15002_032E) / B15002_001E | -| Percent Population Education, Graduate Degree | ACS5 | numeric | | Percent of people older than 25 who attained a graduate degree. (B15002_016E + B15002_017E + B15002_018E + B15002_033E + B15002_034E + B15002_035E) / B15002_001E | -| Percent Population Income, Below Poverty Level | ACS5 | numeric | | Percent of people below poverty level. B17001_002E / B17001_001E | -| Median Income, Household in Past Year | ACS5 | numeric | | Median income per household in the past 12 months. B19013_001E | -| Median Income, Per Capita in Past Year | ACS5 | numeric | | Median income per capita in the past 12 months. B19301_001E | -| Percent Population Income, Received SNAP in Past Year | ACS5 | numeric | | Percent of households that received SNAP in the past 12 months. B22003_002E / B22003_001E | -| Percent Population Employment, Unemployed | ACS5 | numeric | | Percent of people 16 years and older unemployed. B23025_005E / B23025_003E | -| Median Occupied Household, Total, Year Built | ACS5 | numeric | | Median year built for all occupied housing units. B25037_001E | -| Median Occupied Household, Renter, Gross Rent | ACS5 | numeric | | Median gross rent for only renter-occupied units. B25064_001E | -| Percent Occupied Households, Owner | ACS5 | numeric | | Percent of households that are owner-occupied. B25003_002E / B25003_001E | -| Percent Occupied Households, Total, One or More Selected Conditions | ACS5 | numeric | | Selected conditions, including: incomplete plumbing or kitchens, overcrowding, 30% or more of the household income spent on rent or monthly owner costs. 
(B25123_003E + B25123_004E + B25123_005E + B25123_006E + B25123_009E + B25123_010E + B25123_011E + B25123_012E) / B25123_001E | -| Percent Population Mobility, Moved From Within Same County in Past Year | ACS5 | numeric | | Percent of people (older than 1 year) who moved in county in the past 12 months. B07003_007E / B07003_001E | -| Year Built | Characteristic | numeric | | | -| Central Air Conditioning | Characteristic | categorical | Central A/C, No Central A/C | | -| Apartments | Characteristic | categorical | Two, Three, Four, Five, Six, None | Number of apartments for class 211 and 212 properties. CAUTION: Note the numerically encoded values DO NOT correspond to the number of apartments i.e. code 1 means 2 apartments, code 6 means 0 apartments | -| Attic Finish | Characteristic | categorical | Living Area, Partial, None | | -| Attic Type | Characteristic | categorical | Full, Partial, None | | -| Bedrooms | Characteristic | numeric | | Number of bedrooms in the building | -| Building Square Feet | Characteristic | numeric | | Square footage of the building, as measured from the exterior | -| Basement Type | Characteristic | categorical | Full, Slab, Partial, Crawl | | -| Basement Finish | Characteristic | categorical | Formal Rec Room, Apartment, Unfinished | | -| Exterior Wall Material | Characteristic | categorical | Frame, Masonry, Frame + Masonry, Stucco | | -| Full Baths | Characteristic | numeric | | Number of full bathrooms, defined as having a bath or shower. If this value is missing, the default value is set to 1 | -| Fireplaces | Characteristic | numeric | | Number of fireplaces, counted as the number of flues one can see from the outside of the building | -| Garage 1 Area Included | Characteristic | categorical | Yes, No | Is Garage 1 physically included within the building area? 
If yes, the garage area is subtracted from the building square feet calculation by the field agent | -| Garage 1 Attached | Characteristic | categorical | Yes, No | | -| Garage 1 Ext. Wall Material | Characteristic | categorical | Frame, Masonry, Frame + Masonry, Stucco | | -| Garage 1 Size | Characteristic | categorical | 1 cars, 1.5 cars, 2 cars, 2.5 cars, 3 cars, 3.5 cars, 0 cars, 4 cars | | -| Half Baths | Characteristic | numeric | | Number of half baths, defined as bathrooms without a shower or bathtub | -| Land Square Feet | Characteristic | numeric | | Square footage of the land (not just the building) of the property. Note that a single PIN can have multiple “land lines,” meaning it can be associated with more than one 200-class land lot | -| Central Heating | Characteristic | categorical | Warm Air Furnace, Hot Water Steam, Electric Heater, None | | -| Number of Commercial Units | Characteristic | numeric | | Number of commercial units (the vast majority are for properties with class 212) | -| Porch | Characteristic | categorical | None, Frame Enclosed, Masonry Enclosed | | -| Roof Material | Characteristic | categorical | Shingle + Asphalt, Tar + Gravel, Slate, Shake, Tile, Other | | -| Rooms | Characteristic | numeric | | Number of total rooms in the building (excluding baths). Not to be confused with bedrooms | -| Cathedral Ceiling | Characteristic | categorical | Yes, No | Field has not been updated recently enough to be useful for modeling. | -| Design Plan | Characteristic | categorical | Architect, Stock Plan | | -| Type of Residence | Characteristic | categorical | 1 Story, 2 Story, 3 Story +, Split Level, 1.5 Story, Missing | | -| Recent Renovation | Characteristic | logical | | Indicates whether or not a property was renovated within the last 3 years. 
Renovation is indicated by the char_renovation characteristic flipping from “NO” to “YES” | -| Longitude | Location | numeric | | PIN location derived from the centroid of the largest polygon associated with the PIN | -| Latitude | Location | numeric | | PIN location derived from the centroid of the largest polygon associated with the PIN | -| Municipality Name | Location | character | | Municipality name for a given PIN. Taken from Cook County GIS shapefiles | -| FEMA Special Flood Hazard Area | Location | logical | | Indicator for a PIN within a FEMA Special Flood Hazard Area. Taken from FEMA site for 2021 only | -| First Street Factor | Location | numeric | | First Street flood factor (risk score) for a given PIN, scores 1 - 10. Provided to the CCAO by firststreet.org | -| First Street Risk Direction | Location | numeric | | First Street risk direction for a given PIN. Positive scores indicate increasing future flood risk, negative scores the opposite. Provided to the CCAO by firststreet.org | -| School Elementary District GEOID | Location | character | | Elementary school district ID for a given PIN. For CPS, elementary school attendance boundaries are used. Taken from Cook County GIS shapefiles | -| School Secondary District GEOID | Location | character | | Secondary school district ID for a given PIN. For CPS, secondary school attendance boundaries are used. Taken from Cook County GIS shapefiles | -| CMAP Walkability Score (No Transit) | Location | numeric | | CMAP walkability score for a given PIN, excluding transit walkability. Taken from CMAP’s ON TO 2050 walkability layer | -| CMAP Walkability Total Score | Location | numeric | | CMAP walkability score for a given PIN, including transit walkability. Taken from CMAP’s ON TO 2050 walkability layer | -| Airport Noise DNL | Location | numeric | | Airport noise calculated via kriging noise monitor data from CDA. 
See GitLab issue \#70 in the residential modeling repository for more information | -| Township Code | Meta | character | | Numeric code identifying the Cook County township of a given PIN | -| Neighborhood Code | Meta | character | | Assessor neighborhood. First 2 digits are township code, last 3 digits are neighborhood code | -| Tieback Proration Rate | Meta | numeric | | Proration rate for a given PIN. Some buildings sit across multiple PINs. This number is intended to capture the split in building value | -| Property Tax Bill Aggregate Rate | Other | numeric | | Tax bill rate for the taxing district containing a given PIN. Idea is to capture any downward pressure on price from higher tax burdens | -| School District (Elementary) GreatSchools Rating | Other | numeric | | Average Great Schools rating of elementary schools within the district of a given PIN. For CPS, which is a unified school district, the average of schools within attendance boundary is used | -| School District (Secondary) GreatSchools Rating | Other | numeric | | Average Great Schools rating of secondary schools within the district of a given PIN. For CPS, which is a unified school district, the average of schools within attendance boundary is used | -| Number of PINs in Half Mile | Proximity | numeric | | Number of PINs within half mile of a given PIN. Condo buildings are counted as a single PIN | -| Number of Bus Stops in Half Mile | Proximity | numeric | | Number of bus stops (CTA or PACE) within half mile of a given PIN. Taken from GTFS feeds retrieved from transitfeeds.com | -| Number of Foreclosures Per 1000 PINs (Past 5 Years) | Proximity | numeric | | Number of PIN-level foreclosure in the past 5 years, per 1000 PINs, within half mile of a given PIN. Taken from Illinois Public Records | -| Number of Schools in Half Mile | Proximity | numeric | | Number of schools within half mile of a given PIN. 
This includes preschools, small private schools, universities, etc | -| Number of Schools with Rating in Half Mile | Proximity | numeric | | Number of schools with Great Schools ratings within half mile of a given PIN | -| Average School Rating in Half Mile | Proximity | numeric | | Average Great Schools rating for all schools (with a rating) within half mile of a given PIN. Public schools must be within the same district as the PIN to be considered in the average | -| Nearest Bike Trail Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest bike trail (linestring). Taken from Cook County GIS shapefiles | -| Nearest Cemetery Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest cemetery (polygon). Taken from Cook County GIS shapefiles | -| Nearest CTA Route Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest CTA tracks. Taken from CTA GTFS feeds retrieved via transitfeeds.com | -| Nearest CTA Stop Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest CTA stop. Taken from CTA GTFS feeds retrieved via transitfeeds.com | -| Nearest Hospital Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest hospital (polygon). Taken from Cook County GIS shapefiles | -| Lake Michigan Distance (Feet) | Proximity | numeric | | Distance in feet to the Lake Michigan coastline. Taken from TIGER/Line coastlines file and filtered to Cook County only | -| Nearest Major Road Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest major road/highway. Pulled via OpenStreetMap, key=highway, value=motorway,primary,trunk | -| Nearest Metra Route Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest Metra tracks. Taken from Metra GTFS feeds retrieved via transitfeeds.com | -| Nearest Metra Stop Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest Metra stop. 
Taken from Metra GTFS feeds retrieved via transitfeeds.com | -| Nearest Park Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest park. Pull via OpenStreetMap, key=leisure, value=park | -| Nearest Railroad Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest rail (not including CTA). Taken from Cook County GIS shapefiles | -| Nearest Water Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest water, as identified by Cook County hydrology files | -| Nearest Golf Course Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest golf course (polygon). Taken from Cook County GIS shapefiles and OpenStreetMap | -| Sale Year | Time | numeric | | Sale year calculated as the number of years since 0 B.C.E | -| Sale Day | Time | numeric | | Sale day calculated as the number of days since January 1st, 1997 | -| Sale Quarter of Year | Time | character | | Character encoding of quarter of year (Q1 - Q4) | -| Sale Month of Year | Time | character | | Character encoding of month of year (Jan - Dec) | -| Sale Day of Year | Time | numeric | | Numeric encoding of day of year (1 - 365) | -| Sale Day of Month | Time | numeric | | Numeric encoding of day of month (1 - 31) | -| Sale Day of Week | Time | numeric | | Numeric encoding of day of week (1 - 7) | -| Sale After COVID-19 | Time | logical | | Indicator for whether sale occurred after COVID-19 was widely publicized (around March 15, 2020) | +model as of 2023-10-04. 
+ +| Feature Name | Category | Type | Possible Values | Notes | +|:------------------------------------------------------------------------|:---------------|:------------|:-----------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Percent Population Age, Under 19 Years Old | ACS5 | numeric | | Percent of the population 17 years or younger. ACS variable (B01001_003E + B01001_004E + B01001_005E + B01001_006E + B01001_007E + B01001_027E + B01001_028E + B01001_029E + B01001_030E + B01001_031E) / B01001_001E | +| Percent Population Age, Over 65 Years Old | ACS5 | numeric | | Percent of the population 65 years or older. ACS variable (B01001_020E + B01001_021E + B01001_022E + B01001_023E + B01001_024E + B01001_025E + B01001_044E + B01001_045E + B01001_046E + B01001_046E + B01001_048E + B01001_049E) / B01001_001E | +| Median Population Age | ACS5 | numeric | | Median age for whole population. ACS variable B01002_001E | +| Percent Population Mobility, In Same House 1 Year Ago | ACS5 | numeric | | Percent of people (older than 1 year) who have not moved in the past 12 months. ACS variable B07003_004E / B07003_001E | +| Percent Population Mobility, Moved From Other State in Past Year | ACS5 | numeric | | Percent of people (older than 1 year) who moved from another state in the past 12 months. ACS variable B07003_013E / B07003_001E | +| Percent Households Family, Married | ACS5 | numeric | | Percent of households that are family, married (married). ACS variable B11001_003E / B11001_001E | +| Percent Households Nonfamily, Living Alone | ACS5 | numeric | | Percent of households that are non-family, alone (single). 
ACS variable B11001_008E / B11001_001E | +| Percent Population Education, High School Degree | ACS5 | numeric | | Percent of people older than 25 who attained a high school degree. ACS variable (B15002_011E + B15002_028E) / B15002_001E | +| Percent Population Education, Bachelor Degree | ACS5 | numeric | | Percent of people older than 25 who attained a bachelor degree. ACS variable (B15002_015E + B15002_032E) / B15002_001E | +| Percent Population Education, Graduate Degree | ACS5 | numeric | | Percent of people older than 25 who attained a graduate degree. ACS variable (B15002_016E + B15002_017E + B15002_018E + B15002_033E + B15002_034E + B15002_035E) / B15002_001E | +| Percent Population Income, Below Poverty Level | ACS5 | numeric | | Percent of people below poverty level. ACS variable B17001_002E / B17001_001E | +| Median Income, Household in Past Year | ACS5 | numeric | | Median income per household in the past 12 months. ACS variable B19013_001E | +| Median Income, Per Capita in Past Year | ACS5 | numeric | | Median income per capita in the past 12 months. ACS variable B19301_001E | +| Percent Population Income, Received SNAP in Past Year | ACS5 | numeric | | Percent of households that received SNAP in the past 12 months. ACS variable B22003_002E / B22003_001E | +| Percent Population Employment, Unemployed | ACS5 | numeric | | Percent of people 16 years and older unemployed. ACS variable B23025_005E / B23025_003E | +| Median Occupied Household, Total, Year Built | ACS5 | numeric | | Median year built for all occupied housing units. ACS variable B25037_001E | +| Median Occupied Household, Renter, Gross Rent | ACS5 | numeric | | Median gross rent for only renter-occupied units. ACS variable B25064_001E | +| Percent Occupied Households, Owner | ACS5 | numeric | | Percent of households that are owner-occupied. 
ACS variable B25003_002E / B25003_001E | +| Percent Occupied Households, Total, One or More Selected Conditions | ACS5 | numeric | | Selected conditions, including: incomplete plumbing or kitchens, overcrowding, 30% or more of the household income spent on rent or monthly owner costs. ACS variable (B25123_003E + B25123_004E + B25123_005E + B25123_006E + B25123_009E + B25123_010E + B25123_011E + B25123_012E) / B25123_001E | +| Percent Population Mobility, Moved From Within Same County in Past Year | ACS5 | numeric | | Percent of people (older than 1 year) who moved in county in the past 12 months. ACS variable B07003_007E / B07003_001E | +| Year Built | Characteristic | numeric | | | +| Central Air Conditioning | Characteristic | categorical | Central A/C, No Central A/C | | +| Apartments | Characteristic | categorical | Two, Three, Four, Five, Six, None | Number of apartments for class 211 and 212 properties. CAUTION: Note the numerically encoded values DO NOT correspond to the number of apartments i.e. code 1 means 2 apartments, code 6 means 0 apartments | +| Attic Finish | Characteristic | categorical | Living Area, Partial, None | | +| Attic Type | Characteristic | categorical | Full, Partial, None | | +| Bedrooms | Characteristic | numeric | | Number of bedrooms in the building | +| Building Square Feet | Characteristic | numeric | | Square footage of the building, as measured from the exterior | +| Basement Type | Characteristic | categorical | Full, Slab, Partial, Crawl | | +| Basement Finish | Characteristic | categorical | Formal Rec Room, Apartment, Unfinished | | +| Exterior Wall Material | Characteristic | categorical | Frame, Masonry, Frame + Masonry, Stucco | | +| Full Baths | Characteristic | numeric | | Number of full bathrooms, defined as having a bath or shower. 
If this value is missing, the default value is set to 1 | +| Fireplaces | Characteristic | numeric | | Number of fireplaces, counted as the number of flues one can see from the outside of the building | +| Garage 1 Area Included | Characteristic | categorical | Yes, No | | +| Garage 1 Attached | Characteristic | categorical | Yes, No | | +| Garage 1 Ext. Wall Material | Characteristic | categorical | Frame, Masonry, Frame + Masonry, Stucco | | +| Garage 1 Size | Characteristic | categorical | 1 cars, 1.5 cars, 2 cars, 2.5 cars, 3 cars, 3.5 cars, 0 cars, 4 cars | | +| Half Baths | Characteristic | numeric | | Number of half baths, defined as bathrooms without a shower or bathtub | +| Land Square Feet | Characteristic | numeric | | Square footage of the land (not just the building) of the property. A single PIN can have multiple “land lines,” meaning it can be associated with more than one 200-class land lot | +| Central Heating | Characteristic | categorical | Warm Air Furnace, Hot Water Steam, Electric Heater, None | | +| Number of Commercial Units | Characteristic | numeric | | Number of commercial units. The vast majority are for properties with class 212 | +| Porch | Characteristic | categorical | None, Frame Enclosed, Masonry Enclosed | | +| Roof Material | Characteristic | categorical | Shingle + Asphalt, Tar + Gravel, Slate, Shake, Tile, Other | | +| Rooms | Characteristic | numeric | | Number of total rooms in the building (excluding baths). Not to be confused with bedrooms | +| Cathedral Ceiling | Characteristic | categorical | Yes, No | Deprecated. Field has not been updated recently enough to be useful for modeling | +| Design Plan | Characteristic | categorical | Architect, Stock Plan | | +| Type of Residence | Characteristic | categorical | 1 Story, 2 Story, 3 Story +, Split Level, 1.5 Story, Missing | | +| Recent Renovation | Characteristic | logical | | Indicates whether or not a property was renovated within the last 3 years. 
Renovation is indicated by the char_renovation characteristic flipping from “NO” to “YES” | +| Longitude | Location | numeric | | PIN location derived from the centroid of the largest polygon associated with the PIN | +| Latitude | Location | numeric | | PIN location derived from the centroid of the largest polygon associated with the PIN | +| FEMA Special Flood Hazard Area | Location | logical | | Indicator for a PIN within a FEMA Special Flood Hazard Area. Taken from FEMA site for 2021 only | +| First Street Factor | Location | numeric | | First Street flood factor (risk score) for a given PIN, scores 1 - 10. Provided to the CCAO by firststreet.org | +| First Street Risk Direction | Location | numeric | | First Street risk direction for a given PIN. Positive scores indicate increasing future flood risk, negative scores the opposite. Provided to the CCAO by firststreet.org | +| Airport Noise DNL | Location | numeric | | Airport noise calculated via kriging noise monitor data from CDA. See GitLab issue \#70 in the residential modeling repository for more information | +| School Elementary District GEOID | Location | character | | Elementary school district ID for a given PIN. For CPS, elementary school attendance boundaries are used. Taken from Cook County GIS shapefiles | +| School Secondary District GEOID | Location | character | | Secondary school district ID for a given PIN. For CPS, secondary school attendance boundaries are used. Taken from Cook County GIS shapefiles | +| CMAP Walkability Score (No Transit) | Location | numeric | | CMAP walkability score for a given PIN, excluding transit walkability. Taken from CMAP’s ON TO 2050 walkability layer | +| CMAP Walkability Total Score | Location | numeric | | CMAP walkability score for a given PIN, including transit walkability. Taken from CMAP’s ON TO 2050 walkability layer | +| Municipality Name | Location | character | | Municipality name for a given PIN. 
Taken from Cook County GIS shapefiles | +| Township Code | Meta | character | | Numeric code identifying the Cook County township of a given PIN | +| Neighborhood Code | Meta | character | | Assessor neighborhood. First 2 digits are township code, last 3 digits are neighborhood code | +| Property Group | Meta | categorical | Non-Livable Space, Single-Family, Multi-Family, Condominium, Bed & Breakfast | | +| Tieback Proration Rate | Meta | numeric | | Proration rate for a given PIN. Some buildings sit across multiple PINs. This number is intended to capture the split in building value | +| Property Tax Bill Aggregate Rate | Other | numeric | | Tax bill rate for the taxing district containing a given PIN. Idea is to capture any downward pressure on price from higher tax burdens | +| School District (Elementary) GreatSchools Rating | Other | numeric | | Average Great Schools rating of elementary schools within the district of a given PIN. For CPS, which is a unified school district, the average of schools within attendance boundary is used | +| School District (Secondary) GreatSchools Rating | Other | numeric | | Average Great Schools rating of secondary schools within the district of a given PIN. For CPS, which is a unified school district, the average of schools within attendance boundary is used | +| Number of PINs in Half Mile | Proximity | numeric | | Number of PINs within half mile of a given PIN. Condo buildings are counted as a single PIN | +| Number of Bus Stops in Half Mile | Proximity | numeric | | Number of bus stops (CTA or PACE) within half mile of a given PIN. Taken from GTFS feeds retrieved from transitfeeds.com | +| Number of Foreclosures Per 1000 PINs (Past 5 Years) | Proximity | numeric | | Number of PIN-level foreclosure in the past 5 years, per 1000 PINs, within half mile of a given PIN. Taken from Illinois Public Records | +| Number of Schools in Half Mile | Proximity | numeric | | Number of schools within half mile of a given PIN. 
This includes preschools, small private schools, universities, etc | +| Number of Schools with Rating in Half Mile | Proximity | numeric | | Number of schools with Great Schools ratings within half mile of a given PIN | +| Average School Rating in Half Mile | Proximity | numeric | | Average Great Schools rating for all schools (with a rating) within half mile of a given PIN. Public schools must be within the same district as the PIN to be considered in the average | +| Nearest Bike Trail Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest bike trail (linestring). Taken from Cook County GIS shapefiles | +| Nearest Cemetery Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest cemetery (polygon). Taken from Cook County GIS shapefiles | +| Nearest CTA Route Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest CTA tracks. Taken from CTA GTFS feeds retrieved via transitfeeds.com | +| Nearest CTA Stop Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest CTA stop. Taken from CTA GTFS feeds retrieved via transitfeeds.com | +| Nearest Hospital Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest hospital (polygon). Taken from Cook County GIS shapefiles | +| Lake Michigan Distance (Feet) | Proximity | numeric | | Distance in feet to the Lake Michigan coastline. Taken from TIGER/Line coastlines file and filtered to Cook County only | +| Nearest Major Road Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest major road/highway. Pulled via OpenStreetMap, key=highway, value=motorway,primary,trunk | +| Nearest Metra Route Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest Metra tracks. Taken from Metra GTFS feeds retrieved via transitfeeds.com | +| Nearest Metra Stop Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest Metra stop. 
Taken from Metra GTFS feeds retrieved via transitfeeds.com | +| Nearest Park Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest park. Pulled via OpenStreetMap, key=leisure, value=park | +| Nearest Railroad Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest rail (not including CTA). Taken from Cook County GIS shapefiles | +| Nearest Water Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest water, as identified by Cook County hydrology files | +| Nearest Golf Course Distance (Feet) | Proximity | numeric | | Distance in feet to the nearest golf course (polygon). Taken from Cook County GIS shapefiles and OpenStreetMap | +| Sale Year | Time | numeric | | Sale year calculated as the number of years since 0 B.C.E | +| Sale Day | Time | numeric | | Sale day calculated as the number of days since January 1st, 1997 | +| Sale Quarter of Year | Time | character | | Character encoding of quarter of year (Q1 - Q4) | +| Sale Month of Year | Time | character | | Character encoding of month of year (Jan - Dec) | +| Sale Day of Year | Time | numeric | | Numeric encoding of day of year (1 - 365) | +| Sale Day of Month | Time | numeric | | Numeric encoding of day of month (1 - 31) | +| Sale Day of Week | Time | numeric | | Numeric encoding of day of week (1 - 7) | +| Sale After COVID-19 | Time | logical | | Indicator for whether sale occurred after COVID-19 was widely publicized (around March 15, 2020) | #### Data Sources