From b5cdf123625308bea6fe6bbe5c209423af789281 Mon Sep 17 00:00:00 2001
From: Bryn Pickering <17178478+brynpickering@users.noreply.github.com>
Date: Wed, 14 Feb 2024 16:37:56 +0000
Subject: [PATCH] Add schema updater (#568)

Co-authored-by: Stefan Pfenninger <stefan@pfenninger.org>
---
 CHANGELOG.md                        |  3 +
 docs/reference/api/schema.md        |  1 +
 docs/user_defined_math/customise.md | 52 +++++++++++++++++
 mkdocs.yml                          |  1 +
 src/calliope/util/schema.py         | 66 +++++++++++++++++++--
 tests/test_core_util.py             | 90 +++++++++++++++++++++++++----
 6 files changed, 199 insertions(+), 14 deletions(-)
 create mode 100644 docs/reference/api/schema.md

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9f17b7c6..1b6a18ce 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,9 @@ Requires the [PyMdown tabbed extension](https://facelessuser.github.io/pymdown-e
 |new| Variables and global expressions can have a `default` value, which is used to fill missing array elements when doing math operations.
 These default values ensure that `NaN` doesn't creep into the built optimisation problem math and are set to values that lead to them having no impact on the optimal solution.
 
+|new| Utility function `calliope.util.schema.update_model_schema(...)` to add user-defined parameters to the model schema / update existing parameters using YAML schema syntax.
+`calliope.util.schema.reset()` can be used to clean the model schema and return to the original, pre-defined schema.
+
 |fixed| Timeseries clustering file can be a non-ISO standard date format.
 Both the index and the values of the timeseries (both being date strings) should be in the user-defined `config.init.time_format`.
 
diff --git a/docs/reference/api/schema.md b/docs/reference/api/schema.md
new file mode 100644
index 00000000..a3f46cfc
--- /dev/null
+++ b/docs/reference/api/schema.md
@@ -0,0 +1 @@
+::: calliope.util.schema
\ No newline at end of file
diff --git a/docs/user_defined_math/customise.md b/docs/user_defined_math/customise.md
index 8b6f7bdb..a95e2ade 100644
--- a/docs/user_defined_math/customise.md
+++ b/docs/user_defined_math/customise.md
@@ -32,6 +32,58 @@ config:
     add_math: [my_new_math_1.yaml, storage_inter_cluster, my_new_math_2.md]
 ```
 
+## Adding your parameters to the YAML schema
+
+Our YAML schemas are used to validate user inputs.
+The model definition schema includes metadata on all our pre-defined parameters, which you can find rendered in our [reference page][model-definition-schema].
+
+When you add your own math you are likely to be adding new parameters to the model.
+You can update the Calliope model definition schema to include your new entries using [`calliope.util.schema.update_model_schema(...)`][calliope.util.schema.update_model_schema].
+This ensures that your parameters have default values attached to them and, if you choose to [write your own documentation](#writing-your-own-math-documentation), that this metadata is added to their descriptions.
+
+Entries in the schema look like this:
+
+```yaml
+flow_cap_max:
+  $ref: "#/$defs/TechParamNullNumber"  # (1)!
+  default: .inf
+  x-type: float
+  title: Maximum rated flow capacity.
+  description: >-
+    Limits `flow_cap` to a maximum.
+  x-unit: power.
+```
+
+1. This is a cross-reference to a much longer schema entry that says the parameter type is either `None`, a simple number, or an indexed parameter dictionary with the `data`, `index`, and `dims` keys.
+
+When you add your own parameters to the schema, you will need to know the top-level key under which the parameter will be found in your YAML definition: [`nodes`](../creating/nodes.md), [`techs`](../creating/techs.md), or [`parameters`](../creating/parameters.md).
+As a general rule, if it includes the `techs` dimension, put it under `techs`; if it includes `nodes` but _not_ `techs` then put it under `nodes`; if it includes neither dimension, put it under `parameters`.
+
+The dictionary you supply for each parameter can include the following:
+
+* title (str): Short description of the parameter.
+* description (str): Long description of the parameter.
+* type (str or array): expected type of entry.
+We recommend you use the pre-defined cross-reference `$ref: "#/$defs/TechParamNullNumber"` instead of explicitly using this key, to allow the parameter to be either numeric or an indexed parameter.
+If you are adding a cost, you can use the cross reference `$ref: "#/$defs/TechCostNullNumber"`.
+If you want to allow non-numeric data (e.g., strings), you would set `type: string` instead of using the cross-reference.
+* default (any): default value.
+This will be used in generating the optimisation problem.
+* x-type (str): type of the non-NaN array entries in the internal calliope representation of the parameter.
+This is usually one of `float` or `str`.
+* x-unit (str): Unit of the parameter to use in documentation.
+* x-operate-param (bool): If True, this parameter's schema data will only be loaded into the optimisation problem if running in "operate" mode.
+
+!!! note
+
+    Schema attributes which start with `x-` are Calliope-specific.
+    They are not used at all for YAML validation and instead get picked up by us using the utility function [calliope.util.schema.extract_from_schema][].
+
+!!! warning
+
+    The schema is updated in-place so your edits to it will remain active as long as you are running in the same session.
+    You can reset your updates to the schema and return to the pre-defined schema by calling [`calliope.util.schema.reset()`][calliope.util.schema.reset].
+
 ## Writing your own math documentation
 
 You can write your model's mathematical formulation to view it in a rich-text format (as we do for our [pre-defined math](../pre_defined_math/index.md) in this documentation).
diff --git a/mkdocs.yml b/mkdocs.yml
index ba4c2cc7..1744e20f 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -146,6 +146,7 @@ nav:
       - reference/api/backend_model.md
       - reference/api/helper_functions.md
       - reference/api/example_models.md
+      - reference/api/schema.md
       - reference/api/attrdict.md
       - reference/api/exceptions.md
       - reference/api/logging.md
diff --git a/src/calliope/util/schema.py b/src/calliope/util/schema.py
index c4c1b14d..9770aa8e 100644
--- a/src/calliope/util/schema.py
+++ b/src/calliope/util/schema.py
@@ -1,4 +1,13 @@
+# Copyright (C) since 2013 Calliope contributors listed in AUTHORS.
+# Licensed under the Apache 2.0 License (see LICENSE file).
+
+"""
+Load, update, and access attributes in the Calliope pre-defined YAML schemas
+"""
+
+import importlib
 import re
+import sys
 from copy import deepcopy
 from typing import Literal, Optional
 
@@ -14,6 +23,11 @@
 MATH_SCHEMA = load_config("math_schema.yaml")
 
 
+def reset():
+    """Reset all module-level schemas to the pre-defined dictionaries by reloading this module."""
+    importlib.reload(sys.modules[__name__])  # re-executes the module top level; objects already imported by name elsewhere are not rebound
+
+
 def update_then_validate_config(
     config_key: str, config_dict: AttrDict, **update_kwargs
 ) -> AttrDict:
@@ -27,6 +41,46 @@ def update_then_validate_config(
     return to_validate
 
 
+def update_model_schema(
+    top_level_property: Literal["nodes", "techs", "parameters"],
+    new_entries: dict,
+    allow_override: bool = True,
+):
+    """Update existing entries in the model schema or add a new parameter to the model schema.
+
+    Available attributes:
+
+    * title (str): Short description of the parameter.
+    * description (str): Long description of the parameter.
+    * type (str): expected type of entry. Pre-defined entries tend to use `$ref: "#/$defs/TechParamNullNumber"` instead, to allow type to be either numeric or an indexed parameter.
+    * default (any): default value. This will be used in generating the optimisation problem.
+    * x-type (str): type of the non-NaN array entries in the internal calliope representation of the parameter.
+    * x-unit (str): Unit of the parameter to use in documentation.
+    * x-operate-param (bool): If True, this parameter's schema data will only be loaded into the optimisation problem if running in "operate" mode.
+
+    Args:
+        top_level_property (Literal["nodes", "techs", "parameters"]): Top-level key under which parameters are to be updated/added.
+        new_entries (dict): Data to update the schema with.
+        allow_override (bool, optional): If True, allow existing entries in the schema to be overwritten. Defaults to True.
+    """
+    new_schema = deepcopy(MODEL_SCHEMA)  # work on a copy; MODEL_SCHEMA is only touched after the check below passes
+    to_update: AttrDict
+    if top_level_property == "parameters":  # parameters are listed directly under `properties`
+        to_update = new_schema["properties"][top_level_property]["properties"]
+    else:  # nodes/techs entries sit beneath the name-matching pattern property
+        to_update = new_schema["properties"][top_level_property]["patternProperties"][
+            "^[^_^\\d][\\w]*$"
+        ]["properties"]
+
+    to_update.union(AttrDict(new_entries), allow_override=allow_override)  # presumably errors on clashing keys when allow_override is False; confirm against AttrDict.union
+
+    validator = jsonschema.Draft202012Validator
+    validator.META_SCHEMA["unevaluatedProperties"] = False  # NOTE: assigns on the validator class's shared META_SCHEMA dict, not on a copy
+    validator.check_schema(new_schema)  # raises jsonschema.SchemaError if the updated schema is malformed
+
+    MODEL_SCHEMA.union(new_schema, allow_override=True)  # merge the checked copy back into the live module-level schema
+
+
 def validate_dict(to_validate: dict, schema: dict, dict_descriptor: str) -> None:
     """
     Validate a dictionary under a given schema.
@@ -37,8 +91,11 @@ def validate_dict(to_validate: dict, schema: dict, dict_descriptor: str) -> None
         dict_descriptor (str): Description of the dictionary to validate, to use if an error is raised.
 
     Raises:
-        jsonschema.SchemaError: If the schema itself is malformed, a SchemaError will be raised at the first issue. Other issues than that raised may still exist.
-        calliope.exceptions.ModelError: If the dictionary is not valid according to the schema, a list of the issues found will be collated and raised.
+        jsonschema.SchemaError:
+            If the schema itself is malformed, a SchemaError will be raised at the first issue.
+            Other issues than that raised may still exist.
+        calliope.exceptions.ModelError:
+            If the dictionary is not valid according to the schema, a list of the issues found will be collated and raised.
     """
     errors = []
     validator = jsonschema.Draft202012Validator
@@ -95,8 +152,9 @@ def extract_from_schema(
             Defaults to None, i.e., all property branches are included.
 
     Returns:
-        dict: Flat dictionary of property name : keyword value.
-        Property trees are discarded since property names must be unique.
+        dict:
+            Flat dictionary of property name : keyword value.
+            Property trees are discarded since property names must be unique.
     """
     extracted_keywords: dict = {}
     KeywordValidatingValidator = _extend_with_keyword(
diff --git a/tests/test_core_util.py b/tests/test_core_util.py
index 5cd8fdba..98f012bb 100644
--- a/tests/test_core_util.py
+++ b/tests/test_core_util.py
@@ -9,9 +9,9 @@
 import numpy as np
 import pandas as pd
 import pytest
+from calliope.util import schema
 from calliope.util.generate_runs import generate_runs
 from calliope.util.logging import log_time
-from calliope.util.schema import extract_from_schema, validate_dict
 
 from .common.util import check_error_or_warning
 
@@ -122,7 +122,7 @@ class TestValidateDict:
         reason="Checking the schema itself doesn't seem to be working properly; no clear idea of _why_ yet..."
     )
     @pytest.mark.parametrize(
-        ["schema", "expected_path"],
+        ["schema_dict", "expected_path"],
         [
             ({"foo": 2}, ""),
             ({"properties": {"bar": {"foo": "string"}}}, " at `properties.bar`"),
@@ -135,10 +135,10 @@ class TestValidateDict:
             ),
         ],
     )
-    def test_malformed_schema(self, schema, expected_path):
+    def test_malformed_schema(self, schema_dict, expected_path):
         to_validate = {"bar": [1, 2, 3]}
         with pytest.raises(jsonschema.SchemaError) as err:
-            validate_dict(to_validate, schema, "foobar")
+            schema.validate_dict(to_validate, schema_dict, "foobar")
         assert check_error_or_warning(
             err,
             f"The foobar schema is malformed{expected_path}: Unevaluated properties are not allowed ('foo' was unexpected)",
@@ -152,7 +152,7 @@ def test_malformed_schema(self, schema, expected_path):
         ],
     )
     def test_invalid_dict(self, to_validate, expected_path):
-        schema = {
+        schema_dict = {
             "properties": {
                 "valid": {
                     "type": "object",
@@ -163,7 +163,7 @@ def test_invalid_dict(self, to_validate, expected_path):
             "additionalProperties": False,
         }
         with pytest.raises(calliope.exceptions.ModelError) as err:
-            validate_dict(to_validate, schema, "foobar")
+            schema.validate_dict(to_validate, schema_dict, "foobar")
         assert check_error_or_warning(
             err,
             [
@@ -188,7 +188,7 @@ def test_validate_math(self, base_math, dict_path):
         to_validate = base_math.union(
             calliope.AttrDict.from_yaml(dict_path), allow_override=True
         )
-        validate_dict(to_validate, math_schema, "")
+        schema.validate_dict(to_validate, math_schema, "")
 
 
 class TestExtractFromSchema:
@@ -339,7 +339,7 @@ def test_extract_config_defaults(
         self, sample_config_schema, expected_config_defaults
     ):
         extracted_defaults = pd.Series(
-            extract_from_schema(sample_config_schema, "default")
+            schema.extract_from_schema(sample_config_schema, "default")
         )
         pd.testing.assert_series_equal(
             extracted_defaults.sort_index(), expected_config_defaults
@@ -349,7 +349,7 @@ def test_extract_model_def_defaults(
         self, sample_model_def_schema, expected_model_def_defaults
     ):
         extracted_defaults = pd.Series(
-            extract_from_schema(sample_model_def_schema, "default")
+            schema.extract_from_schema(sample_model_def_schema, "default")
         )
         pd.testing.assert_series_equal(
             extracted_defaults.sort_index(), expected_model_def_defaults
@@ -379,10 +379,80 @@ def test_extract_defaults_subset(
         prop_keys,
     ):
         extracted_defaults = pd.Series(
-            extract_from_schema(sample_model_def_schema, "default", schema_key)
+            schema.extract_from_schema(sample_model_def_schema, "default", schema_key)
         )
         pd.testing.assert_series_equal(
             expected_model_def_defaults.loc[prop_keys].sort_index(),
             extracted_defaults.sort_index(),
             check_dtype=False,
         )
+
+
+class TestUpdateSchema:
+
+    @pytest.mark.parametrize("top_level", ["parameters", "nodes", "techs"])
+    def test_add_new_schema(self, top_level):  # a brand-new entry is visible via extract_from_schema
+        schema.update_model_schema(
+            top_level,
+            {
+                f"{top_level}_foo": {
+                    "type": "number",
+                    "description": "bar",
+                    "default": 1,
+                }
+            },
+            allow_override=False,
+        )
+
+        extracted_defaults = schema.extract_from_schema(schema.MODEL_SCHEMA, "default")
+        assert extracted_defaults[f"{top_level}_foo"] == 1
+        extracted_descriptions = schema.extract_from_schema(
+            schema.MODEL_SCHEMA, "description"
+        )
+        assert extracted_descriptions[f"{top_level}_foo"] == "bar"
+
+        schema.reset()  # undo the schema edit for later tests; NOTE(review): skipped if an assertion above fails
+
+    @pytest.mark.parametrize("top_level", ["parameters", "nodes", "techs"])
+    def test_update_schema(self, top_level):  # an existing entry can be overwritten when allow_override=True
+        schema.update_model_schema(
+            top_level, {f"{top_level}_foo": {"default": 1}}, allow_override=False
+        )
+
+        extracted_defaults = schema.extract_from_schema(schema.MODEL_SCHEMA, "default")
+        assert extracted_defaults[f"{top_level}_foo"] == 1
+
+        schema.update_model_schema(
+            top_level, {f"{top_level}_foo": {"default": 2}}, allow_override=True
+        )
+
+        extracted_defaults = pd.Series(
+            schema.extract_from_schema(schema.MODEL_SCHEMA, "default")
+        )
+        assert extracted_defaults[f"{top_level}_foo"] == 2
+
+        schema.reset()  # undo the schema edit for later tests; NOTE(review): skipped if an assertion above fails
+
+    @pytest.mark.parametrize("top_level", ["parameters", "nodes", "techs"])
+    def test_update_schema_malformed(self, top_level):  # an invalid `type` value must be rejected as a schema error
+        with pytest.raises(jsonschema.SchemaError):
+            schema.update_model_schema(
+                top_level,
+                {f"{top_level}_foo": {"type": "i_am_not_a_type"}},
+                allow_override=True,
+            )
+        schema.reset()  # reset anyway so this test leaves the module-level schema in its pre-defined state
+
+    def test_reset_schema(self):  # reset() must drop entries added by update_model_schema
+        schema.update_model_schema(
+            "techs",
+            {"foo": {"type": "number", "description": "bar", "default": 1}},
+            allow_override=False,
+        )
+        schema.reset()
+        assert (
+            "foo"
+            not in schema.MODEL_SCHEMA["properties"]["techs"]["patternProperties"][
+                "^[^_^\\d][\\w]*$"
+            ]["properties"]
+        )