From 46ae5a005f0d3c8443d3b14c1cd92cde5ae81f45 Mon Sep 17 00:00:00 2001
From: Ivan Ruiz Manuel <72193617+irm-codebase@users.noreply.github.com>
Date: Tue, 19 Nov 2024 10:14:09 +0100
Subject: [PATCH] Add data table schema w/ tests

---
 src/calliope/schemas/config_schema.py     | 25 +------
 src/calliope/schemas/data_table_schema.py | 84 +++++++++++++++++++++++
 src/calliope/util/schema.py               | 26 ++++++-
 tests/test_data_table_schema.py           | 77 +++++++++++++++++++++
 4 files changed, 189 insertions(+), 23 deletions(-)
 create mode 100644 src/calliope/schemas/data_table_schema.py
 create mode 100644 tests/test_data_table_schema.py

diff --git a/src/calliope/schemas/config_schema.py b/src/calliope/schemas/config_schema.py
index 4e55beeb..e9113ef6 100644
--- a/src/calliope/schemas/config_schema.py
+++ b/src/calliope/schemas/config_schema.py
@@ -2,39 +2,20 @@
 # Licensed under the Apache 2.0 License (see LICENSE file).
 """Implements the Calliope configuration class."""
 
-from collections.abc import Hashable
 from datetime import datetime
 from pathlib import Path
-from typing import Annotated, Literal, Self, TypeVar
+from typing import Literal, Self
 
 import jsonref
-from pydantic import AfterValidator, BaseModel, Field, model_validator
-from pydantic_core import PydanticCustomError
+from pydantic import BaseModel, Field, model_validator
 
 from calliope.attrdict import AttrDict
 from calliope.util import tools
+from calliope.util.schema import UniqueList
 
 MODES_T = Literal["plan", "operate", "spores"]
 CONFIG_T = Literal["init", "build", "solve"]
 
-# ==
-# Taken from https://github.com/pydantic/pydantic-core/pull/820#issuecomment-1670475909
-T = TypeVar("T", bound=Hashable)
-
-
-def _validate_unique_list(v: list[T]) -> list[T]:
-    if len(v) != len(set(v)):
-        raise PydanticCustomError("unique_list", "List must be unique")
-    return v
-
-
-UniqueList = Annotated[
-    list[T],
-    AfterValidator(_validate_unique_list),
-    Field(json_schema_extra={"uniqueItems": True}),
-]
-# ==
-
 
 def hide_from_schema(to_hide: list[str]):
     """Hide fields from the generated schema.
diff --git a/src/calliope/schemas/data_table_schema.py b/src/calliope/schemas/data_table_schema.py
new file mode 100644
index 00000000..c693150f
--- /dev/null
+++ b/src/calliope/schemas/data_table_schema.py
@@ -0,0 +1,84 @@
+"""Implements the data table configuration class."""
+
+from typing import Self
+
+from pydantic import BaseModel, model_validator
+
+from calliope.util.schema import AttrStr, UniqueList
+from calliope.util.tools import listify
+
+# Get rid of pyright false positives (see https://github.com/microsoft/pylance-release/issues/5457)
+# pyright: reportInvalidTypeForm=false
+
+
+class DataTable(BaseModel):
+    """Data table validation model."""
+
+    data: str
+    """
+    Absolute or relative filepath.
+    Relative paths are based on the model config file used to initialise the model.
+    """
+    rows: None | AttrStr | UniqueList[AttrStr] = None
+    """
+    Names of dimensions defined row-wise.
+    Each name should correspond to a column in your data that contains index items.
+    These columns must be to the left of the columns containing your data.
+    """
+    columns: None | AttrStr | UniqueList[AttrStr] = None
+    """
+    Names of dimensions defined column-wise.
+    Each name should correspond to a row in your data that contains index items.
+    These rows must be above the rows containing your data.
+    """
+    select: None | dict[AttrStr, AttrStr | UniqueList[AttrStr]] = None
+    """
+    Select one or more index items from a dimension.
+    Selection takes place before `drop` and `add_dims`, so you can select a single
+    value from a data dimension and then drop the dimension so it doesn't find its way
+    through to the final dataset.
+    """
+    drop: None | AttrStr | UniqueList[AttrStr] = None
+    """
+    Enables removing rows and/or columns that contain irrelevant data/metadata.
+    These could include comments on the source of the data, the data license, or the parameter units.
+    You can also drop a dimension and then reintroduce it in `add_dims`, but with different index items.
+    """
+    add_dims: None | dict[AttrStr, AttrStr] = None
+    """
+    Data dimensions to add after loading in the array.
+    These allow you to use the same file to assign values to different parameters/dimension index items
+    (e.g., setting `flow_cap_min` and `flow_cap_max` to the same value),
+    or to add a dimension which would otherwise be a column containing the same information in each row
+    (e.g., assigning the cost class to monetary for a file containing cost data).
+    """
+    rename_dims: None | dict[AttrStr, AttrStr] = None
+    """
+    Mapping between dimension names in the data table being loaded to equivalent Calliope dimension names.
+    For instance, the "time" column in the data table would need to be mapped to "timesteps": `{"time": "timesteps"}`.
+    """
+    template: None | AttrStr = None
+    """
+    Reference to a template from which to inherit common configuration options.
+    """
+
+    @model_validator(mode="after")
+    def check_row_and_columns(self) -> Self:
+        """Ensure users specify a valid data table shape."""
+        rows = set(listify(self.rows))
+        columns = set(listify(self.columns))
+        if not rows and not columns:
+            raise ValueError("Either row or columns must be defined for data_table.")
+        elif rows & columns:
+            raise ValueError("Rows and columns must not overlap.")
+
+        if self.add_dims:
+            if self.add_dims.keys() & (rows | columns):
+                raise ValueError("Added dimensions must not be in columns or rows.")
+
+        if self.rename_dims:
+            if set(self.rename_dims.values()) - (rows | columns):
+                raise ValueError(
+                    "Renamed dimensions must be in either rows or columns."
+                )
+        return self
diff --git a/src/calliope/util/schema.py b/src/calliope/util/schema.py
index 361cd9a9..f207c35a 100644
--- a/src/calliope/util/schema.py
+++ b/src/calliope/util/schema.py
@@ -5,10 +5,13 @@
 import importlib
 import re
 import sys
+from collections.abc import Hashable
 from copy import deepcopy
-from typing import Literal
+from typing import Annotated, Literal, TypeVar
 
 import jsonschema
+from pydantic import AfterValidator, Field, constr
+from pydantic_core import PydanticCustomError
 
 from calliope.attrdict import AttrDict
 from calliope.exceptions import print_warnings_and_raise_errors
@@ -19,6 +22,27 @@
 DATA_TABLE_SCHEMA = load_config("data_table_schema.yaml")
 MATH_SCHEMA = load_config("math_schema.yaml")
 
+# Regex for most calliope attributes: no leading "_", "^" or digit, then word chars
+FIELD_REGEX = r"^[^_^\d][\w]*$"
+AttrStr = constr(pattern=FIELD_REGEX)
+# ==
+# Taken from https://github.com/pydantic/pydantic-core/pull/820#issuecomment-1670475909
+T = TypeVar("T", bound=Hashable)
+
+
+def _validate_unique_list(v: list[T]) -> list[T]:
+    if len(v) != len(set(v)):
+        raise PydanticCustomError("unique_list", "List must be unique")
+    return v
+
+
+UniqueList = Annotated[
+    list[T],
+    AfterValidator(_validate_unique_list),
+    Field(json_schema_extra={"uniqueItems": True}),
+]
+# ==
+
 
 def reset():
     """Reset all module-level schema to the pre-defined dictionaries."""
diff --git a/tests/test_data_table_schema.py b/tests/test_data_table_schema.py
new file mode 100644
index 00000000..4e7b5c81
--- /dev/null
+++ b/tests/test_data_table_schema.py
@@ -0,0 +1,77 @@
+"""Test data table schema validation."""
+
+import pytest
+from pydantic import ValidationError
+
+from calliope.attrdict import AttrDict
+from calliope.schemas.data_table_schema import DataTable
+
+from .common.util import check_error_or_warning
+
+FULL_TABLE_CONFIG = """
+data: time_varying_df
+rows: timesteps
+columns: [comment, nodes, techs]
+select:
+    nodes: [node1, node2]
+    techs: pv
+drop: comment
+add_dims:
+    parameters: something
+    costs: monetary
+rename_dims:
+    location: nodes
+template: some_template
+"""
+
+
+@pytest.mark.parametrize(
+    "data_table",
+    [{"rows": "timesteps"}, {"rows": "timesteps", "columns": ["techs", "nodes"]}],
+)
+def test_path_not_provided(data_table):
+    """Not providing the path should result in a failure."""
+    with pytest.raises(ValidationError):
+        DataTable(**data_table)
+
+
+@pytest.mark.parametrize("data_table", [{"data": "foo"}])
+def test_incomplete_column_or_row(data_table):
+    """Not providing either rows or columns is invalid."""
+    with pytest.raises(ValidationError) as excinfo:
+        DataTable(**data_table)
+    assert check_error_or_warning(
+        excinfo, "Either row or columns must be defined for data_table."
+    )
+
+
+@pytest.mark.parametrize(
+    ("rows", "columns"),
+    [
+        ("nodes", "nodes"),
+        (["nodes", "techs"], "techs"),
+        (["nodes", "techs", "params"], ["params", "costs"]),
+    ],
+)
+def test_row_column_overlap(rows, columns):
+    """Rows and columns must not share any values."""
+    with pytest.raises(ValidationError) as excinfo:
+        DataTable(data="foobar", rows=rows, columns=columns)
+    assert check_error_or_warning(excinfo, "Rows and columns must not overlap.")
+
+
+@pytest.mark.parametrize(
+    ("rows", "columns", "add_dims"), [("nodes", None, {"nodes": "MEX"})]
+)
+def test_add_dims_overlap(rows, columns, add_dims):
+    with pytest.raises(ValidationError) as excinfo:
+        DataTable(data="foo", rows=rows, columns=columns, add_dims=add_dims)
+    assert check_error_or_warning(
+        excinfo, "Added dimensions must not be in columns or rows."
+    )
+
+
+@pytest.mark.parametrize("data_table", [FULL_TABLE_CONFIG])
+def test_full_table_config(data_table):
+    """Test a fully fledged data table configuration."""
+    DataTable(**AttrDict.from_yaml_string(data_table))