Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use dbt DAG to populate notes in the README feature table #18

Merged
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 75 additions & 9 deletions README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -221,24 +221,90 @@ The residential model uses a variety of individual and aggregate features to det
library(dplyr)
library(tidyr)
library(yaml)
library(jsonlite)
library(purrr)
library(tibble)

# Descriptions for features that the model derives on its own. The dbt DAG
# has no documentation for these columns, so their notes are maintained here
# by hand, one `column` name paired with one `description`
hardcoded_descriptions <- tibble(
  column = c(
    "sale_year",
    "sale_day",
    "sale_quarter_of_year",
    "sale_month_of_year",
    "sale_day_of_year",
    "sale_day_of_month",
    "sale_day_of_week",
    "sale_post_covid"
  ),
  description = c(
    "Sale year calculated as the number of years since 0 B.C.E",
    "Sale day calculated as the number of days since January 1st, 1997",
    "Character encoding of quarter of year (Q1 - Q4)",
    "Character encoding of month of year (Jan - Dec)",
    "Numeric encoding of day of year (1 - 365)",
    "Numeric encoding of day of month (1 - 31)",
    "Numeric encoding of day of week (1 - 7)",
    # nolint start: line_length_linter
    "Indicator for whether sale occurred after COVID-19 was widely publicized (around March 15, 2020)"
    # nolint end
  )
)
jeancochrane marked this conversation as resolved.
Show resolved Hide resolved
jeancochrane marked this conversation as resolved.
Show resolved Hide resolved

# Fetch the dbt manifest (the serialized DAG) published on the prod docs site
# and parse it from JSON
dbt_manifest <- jsonlite::fromJSON(
  txt = "https://ccao-data.github.io/data-architecture/manifest.json"
)

# nolint start: cyclomp_linter
get_column_description <- function(colname, dag_nodes, hardcoded_descriptions) {
  # Retrieve the description for a column `colname` either from a set of
  # hardcoded descriptions (`hardcoded_descriptions`) or from a set of dbt DAG
  # nodes (`dag_nodes`)
  #
  # Args:
  #   colname: Name of the column to describe (a single string)
  #   dag_nodes: Named list of dbt DAG nodes. A node may hold a `columns`
  #     list keyed by column name, whose entries may hold a `description`
  #   hardcoded_descriptions: Data frame with `column` and `description`
  #     columns. A match here always wins over the dbt DAG
  #
  # Returns:
  #   The hardcoded description if one exists; otherwise the first non-blank
  #   DAG description with newlines replaced by spaces; otherwise ""

  # Prefer the hardcoded descriptions, if they exist
  hardcoded_idx <- match(colname, hardcoded_descriptions$column)
  if (!is.na(hardcoded_idx)) {
    return(hardcoded_descriptions$description[[hardcoded_idx]])
  }

  # If no hardcoded description exists, fall back to checking the dbt DAG.
  # Node names are read with names() rather than ls(): ls() raises an error
  # on NULL, rejects unnamed lists, and hides names that start with a dot.
  # Sorting keeps the traversal order deterministic when several nodes
  # document the same column
  for (node_name in sort(as.character(names(dag_nodes)))) {
    description <- find_node_column_description(dag_nodes[[node_name]], colname)
    if (!is.null(description)) {
      return(gsub("\n", " ", description))
    }
  }

  # No match in either the hardcoded descriptions or the dbt DAG, so fall
  # back to an empty string
  ""
}

find_node_column_description <- function(node, colname) {
  # Return the non-blank description of `colname` in a single DAG `node`, or
  # NULL when the node has no `columns`, does not document the column, or
  # documents it with a missing or blank description.
  #
  # `[[` is used instead of `$` because `$` partial-matches keys on lists
  if (!is.list(node)) {
    return(NULL)
  }
  columns <- node[["columns"]]
  if (!is.list(columns)) {
    return(NULL)
  }
  column <- columns[[colname]]
  if (!is.list(column)) {
    return(NULL)
  }
  description <- column[["description"]]
  is_usable <- length(description) == 1L &&
    !is.na(description) &&
    nzchar(trimws(description))
  if (is_usable) description else NULL
}
# nolint end

params <- read_yaml("params.yaml")
ccao::vars_dict %>%
filter(
var_is_predictor,
var_name_model != "meta_sale_price",
var_model_type %in% c("all", "res")
) %>%

# Predictors listed in params.yaml, held in a tibble whose `value` column is
# used below as the model-style variable name
param_tbl <- as_tibble(params$model$predictor$all)

# Build one description per predictor, in the same order as `param_tbl`, so
# the result can be attached to the tibble as a new column. Names are first
# translated from the "model" naming scheme to the "athena" scheme before
# the lookup
param_notes <- unlist(
  map(
    ccao::vars_rename(
      param_tbl$value,
      names_from = "model",
      names_to = "athena"
    ),
    function(athena_name) {
      get_column_description(
        athena_name, dbt_manifest$nodes, hardcoded_descriptions
      )
    }
  )
)
jeancochrane marked this conversation as resolved.
Show resolved Hide resolved

param_tbl %>%
add_column(description = param_notes) %>%
jeancochrane marked this conversation as resolved.
Show resolved Hide resolved
inner_join(
as_tibble(params$model$predictor$all),
by = c("var_name_model" = "value")
ccao::vars_dict,
by = c("value" = "var_name_model")
jeancochrane marked this conversation as resolved.
Show resolved Hide resolved
) %>%
group_by(var_name_pretty) %>%
mutate(row = paste0("X", row_number())) %>%
distinct(
`Feature Name` = var_name_pretty,
Category = var_type,
Type = var_data_type,
Notes = var_notes,
Notes = description,
var_value, row
) %>%
mutate(Category = recode(
Expand Down
Loading