diff --git a/.gitignore b/.gitignore
index 878c90183..64f3b3fe4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -171,3 +171,4 @@ dmypy.json
 
 # Cython debug symbols
 cython_debug/
+test/test/
diff --git a/openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_baseline_model.z.license b/openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_baseline_model.z.license
new file mode 100644
index 000000000..57abd4c9e
--- /dev/null
+++ b/openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_baseline_model.z.license
@@ -0,0 +1,3 @@
+SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project
+
+SPDX-License-Identifier: MPL-2.0
diff --git a/openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_model_card.md.license b/openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_model_card.md.license
new file mode 100644
index 000000000..57abd4c9e
--- /dev/null
+++ b/openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_model_card.md.license
@@ -0,0 +1,3 @@
+SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project
+
+SPDX-License-Identifier: MPL-2.0
diff --git a/openstef/data_classes/prediction_job.py b/openstef/data_classes/prediction_job.py
index 6df22145d..31119250c 100644
--- a/openstef/data_classes/prediction_job.py
+++ b/openstef/data_classes/prediction_job.py
@@ -27,6 +27,7 @@ class PredictionJobDataClass(BaseModel):
         - ``"linear"``
         - ``"linear_quantile"``
         - ``"xgb_multioutput_quantile"``
+        - ``"flatliner"``
 
     If unsure what to pick, choose ``"xgb"``.
 
diff --git a/openstef/enums.py b/openstef/enums.py
index 84a010409..cef096121 100644
--- a/openstef/enums.py
+++ b/openstef/enums.py
@@ -13,6 +13,7 @@ class MLModelType(Enum):
     LINEAR = "linear"
     LINEAR_QUANTILE = "linear_quantile"
     ARIMA = "arima"
+    FLATLINER = "flatliner"
 
 
 class ForecastType(Enum):
diff --git a/openstef/model/confidence_interval_applicator.py b/openstef/model/confidence_interval_applicator.py
index 8a4025a5e..3a31bcbc1 100644
--- a/openstef/model/confidence_interval_applicator.py
+++ b/openstef/model/confidence_interval_applicator.py
@@ -73,8 +73,11 @@ def add_confidence_interval(
                 result = self._add_quantiles_to_forecast_quantile_regression(
                     temp_forecast, self.model.quantiles
                 )
-                self.logger.warning('Quantiles are requested the model was not trained on. Using the quantiles the model was trained on',
-                                    requested_quantiles=pj["quantiles"], trained_quantiles=self.model.quantiles)
+                self.logger.warning(
+                    "Quantiles are requested the model was not trained on. Using the quantiles the model was trained on",
+                    requested_quantiles=pj["quantiles"],
+                    trained_quantiles=self.model.quantiles,
+                )
             return result
 
         return self._add_quantiles_to_forecast_default(temp_forecast, pj["quantiles"])
diff --git a/openstef/model/model_creator.py b/openstef/model/model_creator.py
index f2ec76442..837c6052a 100644
--- a/openstef/model/model_creator.py
+++ b/openstef/model/model_creator.py
@@ -13,6 +13,7 @@
 from openstef.model.regressors.linear import LinearOpenstfRegressor
 from openstef.model.regressors.linear_quantile import LinearQuantileOpenstfRegressor
 from openstef.model.regressors.regressor import OpenstfRegressor
+from openstef.model.regressors.flatliner import FlatlinerRegressor
 from openstef.model.regressors.xgb import XGBOpenstfRegressor
 from openstef.model.regressors.xgb_quantile import XGBQuantileOpenstfRegressor
 from openstef.model.regressors.xgb_multioutput_quantile import (
@@ -105,6 +106,9 @@
         "imputation_strategy",
         "fill_value",
     ],
+    MLModelType.FLATLINER: [
+        "quantiles",
+    ],
     MLModelType.LINEAR_QUANTILE: [
         "alpha",
         "quantiles",
@@ -134,6 +138,7 @@ class ModelCreator:
         MLModelType.LINEAR: LinearOpenstfRegressor,
         MLModelType.LINEAR_QUANTILE: LinearQuantileOpenstfRegressor,
         MLModelType.ARIMA: ARIMAOpenstfRegressor,
+        MLModelType.FLATLINER: FlatlinerRegressor,
     }
 
     @staticmethod
diff --git a/openstef/model/regressors/dazls.py b/openstef/model/regressors/dazls.py
index 91abcc8db..9e3ad30ac 100644
--- a/openstef/model/regressors/dazls.py
+++ b/openstef/model/regressors/dazls.py
@@ -52,6 +52,7 @@ def fit(self, features, target):
         Args:
             features: inputs for domain and adaptation model (domain_model_input, adaptation_model_input)
             target: the expected output (y_train)
+
         """
         x, y = (
             features.loc[:, self.baseline_input_columns],
@@ -76,6 +77,7 @@ def predict(self, x: np.array):
 
         Returns:
             prediction: The output prediction after both models.
+
         """
         model_test_data = x.loc[:, self.baseline_input_columns]
 
@@ -90,6 +92,7 @@ def score(self, truth, prediction):
 
         Returns:
             RMSE and R2 scores
+
         """
         rmse = (mean_squared_error(truth, prediction)) ** 0.5
         r2_score_value = r2_score(truth, prediction)
@@ -100,6 +103,7 @@ def __str__(self):
 
         Returns:
             Summary represented by a string
+
         """
         summary_str = (
             f"{self.__name__} model summary:\n\n"
diff --git a/openstef/model/regressors/flatliner.py b/openstef/model/regressors/flatliner.py
new file mode 100644
index 000000000..e5bd2a416
--- /dev/null
+++ b/openstef/model/regressors/flatliner.py
@@ -0,0 +1,100 @@
+# SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project # noqa E501>
+#
+# SPDX-License-Identifier: MPL-2.0
+import re
+from typing import Dict, Union, Set, Optional, List
+
+import numpy as np
+import pandas as pd
+from sklearn.base import RegressorMixin
+from sklearn.linear_model import QuantileRegressor
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.utils.validation import check_is_fitted
+
+from openstef.feature_engineering.missing_values_transformer import (
+    MissingValuesTransformer,
+)
+from openstef.model.regressors.regressor import OpenstfRegressor
+
+
+class FlatlinerRegressor(OpenstfRegressor, RegressorMixin):
+    feature_names_: List[str] = []
+
+    def __init__(self, quantiles=None):
+        """Initialize FlatlinerRegressor.
+
+        The model always predicts 0.0, regardless of the input features. The model is
+        meant to be used for flatliner locations that still expect a prediction while
+        preserving the prediction interface.
+        """
+        super().__init__()
+        self.quantiles = quantiles
+
+    @property
+    def feature_names(self) -> list:
+        """The names of the features used to train the model."""
+        check_is_fitted(self)
+        return self.feature_names_
+
+    @staticmethod
+    def _get_importance_names():
+        return {
+            "gain_importance_name": "total_gain",
+            "weight_importance_name": "weight",
+        }
+
+    @property
+    def can_predict_quantiles(self) -> bool:
+        """Attribute that indicates if the model predicts particular quantiles."""
+        return True
+
+    def fit(self, x: pd.DataFrame, y: pd.Series, **kwargs) -> RegressorMixin:
+        """Fits flatliner model.
+
+        Args:
+            x: Feature matrix
+            y: Labels
+
+        Returns:
+            Fitted FlatlinerRegressor (self)
+
+        """
+        self.feature_names_ = list(x.columns)
+        self.feature_importances_ = np.ones(len(self.feature_names_)) / (
+            len(self.feature_names_) or 1.0
+        )
+
+        return self
+
+    def predict(self, x: pd.DataFrame, quantile: float = 0.5, **kwargs) -> np.array:
+        """Makes a prediction for a desired quantile.
+
+        Args:
+            x: Feature matrix
+            quantile: Quantile for which a prediction is desired,
+                accepted only to preserve the quantile-model interface,
+                the flatliner ignores it and predicts 0.0 for every quantile
+
+        Returns:
+            Array of zeros with one entry per row of x
+
+        Note:
+            No error is raised for an unknown quantile, the value is not inspected
+
+        """
+        check_is_fitted(self)
+
+        return np.zeros(x.shape[0])
+
+    def _get_feature_importance_from_linear(self, quantile: float = 0.5) -> np.array:
+        check_is_fitted(self)
+        return np.array([0.0 for _ in self.feature_names_])
+
+    @classmethod
+    def _get_param_names(cls):
+        return [
+            "quantiles",
+        ]
+
+    def __sklearn_is_fitted__(self) -> bool:
+        return True
diff --git a/test/unit/model/regressors/test_flatliner.py b/test/unit/model/regressors/test_flatliner.py
new file mode 100644
index 000000000..3fcf5a5ad
--- /dev/null
+++ b/test/unit/model/regressors/test_flatliner.py
@@ -0,0 +1,66 @@
+# SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project # noqa E501>
+#
+# SPDX-License-Identifier: MPL-2.0
+import unittest
+from unittest.mock import MagicMock
+
+import numpy as np
+import pandas as pd
+import sklearn
+from sklearn.utils.estimator_checks import check_estimator
+
+from openstef.feature_engineering.apply_features import apply_features
+from openstef.model.regressors.flatliner import FlatlinerRegressor
+from test.unit.utils.base import BaseTestCase
+from test.unit.utils.data import TestData
+
+train_input = TestData.load("reference_sets/307-train-data.csv")
+
+
+class MockModel:
+    coef_ = np.array([1, 1, 3])
+
+
+class TestFlatlinerRegressor(BaseTestCase):
+    def setUp(self) -> None:
+        self.quantiles = [0.9, 0.5, 0.6, 0.1]
+
+    @unittest.skip  # Use this during development; this test requires disallowing NaN values, which we explicitly do allow.
+    def test_sklearn_compliant(self):
+        # Use sklearn built-in check, this will raise an exception if some check fails
+        # During these tests the fit and predict methods are elaborately tested
+        # More info: https://scikit-learn.org/stable/modules/generated/sklearn.utils.estimator_checks.check_estimator.html
+        check_estimator(FlatlinerRegressor(quantiles=tuple(self.quantiles)))
+
+    def test_quantile_fit(self):
+        """Test happy flow of the training of model"""
+        # Arrange
+        model = FlatlinerRegressor()
+
+        # Act
+        model.fit(train_input.iloc[:, 1:], train_input.iloc[:, 0])
+
+        # Assert
+        # check if the model was fitted (raises NotFittedError when not fitted)
+        self.assertIsNone(sklearn.utils.validation.check_is_fitted(model))
+
+        # check if model is sklearn compatible
+        self.assertIsInstance(model, sklearn.base.BaseEstimator)
+
+        result: np.ndarray = model.predict(train_input.iloc[:, 1:])
+
+        self.assertEqual(len(result), len(train_input.iloc[:, 1:]))
+        self.assertTrue((result == 0).all())
+
+    def test_get_feature_names_from_linear(self):
+        # Arrange
+        model = FlatlinerRegressor()
+        model.feature_names_ = ["a", "b", "c"]
+
+        # Act
+        feature_importance = model._get_feature_importance_from_linear(quantile=0.5)
+
+        # Assert
+        self.assertTrue(
+            (feature_importance == np.array([0, 0, 0], dtype=np.float32)).all()
+        )
diff --git a/test/unit/model/test_confidence_interval_applicator.py b/test/unit/model/test_confidence_interval_applicator.py
index 3c5b396bf..c48547491 100644
--- a/test/unit/model/test_confidence_interval_applicator.py
+++ b/test/unit/model/test_confidence_interval_applicator.py
@@ -26,20 +26,26 @@ class MockModel:
         }
     )
 
-    @staticmethod
-    def predict(input, quantile):
+    can_predict_quantiles_ = True
+
+    def predict(self, input, quantile):
+        if self.can_predict_quantiles and quantile not in self.quantiles:
+            # When model is trained on quantiles, it should fail if quantile is not in
+            # trained quantiles
+            raise ValueError("Quantile not in trained quantiles")
+
         stdev_forecast = pd.DataFrame({"forecast": [5, 6, 7], "stdev": [0.5, 0.6, 0.7]})
         return stdev_forecast["stdev"].rename(quantile)
 
     @property
     def can_predict_quantiles(self):
-        return True
-    
+        return self.can_predict_quantiles_
+
     @property
     def quantiles(self):
         return [0.01, 0.10, 0.25, 0.50, 0.75, 0.90, 0.99]
-    
-    
+
+
 class MockNonQuantileModel(MockModel):
     @property
     def can_predict_quantiles(self):
@@ -182,11 +188,10 @@ def test_add_standard_deviation_to_forecast_in_past(self):
             actual_stdev_forecast["stdev"].max(), 14
         )  # => MockModel.standard_deviation.stdev.max())
 
-
     def test_add_quantiles_to_forecast_untrained_quantiles_with_quantile_model(self):
         """For quantile models, the trained quantiles can used if the quantiles of the pj are incompatible"""
         # Set up
-        pj = {"quantiles": [0.12, 0.5, 0.65]} # numbers are arbitrary
+        pj = {"quantiles": [0.12, 0.5, 0.65]}  # numbers are arbitrary
         model = MockModel()
         forecast = pd.DataFrame({"forecast": [5, 6, 7], "tAhead": [-1.0, 0.0, 1.0]})
         forecast.index = [
@@ -194,11 +199,10 @@ def test_add_quantiles_to_forecast_untrained_quantiles_with_quantile_model(self)
             pd.Timestamp(2012, 5, 1, 1, 45),
             pd.Timestamp(2012, 5, 1, 2, 00),
         ]
+        model.can_predict_quantiles_ = True
         # Specify expectation
         expected_quantiles = model.quantiles
-        expected_columns = [
-            f"quantile_P{int(q * 100):02d}" for q in expected_quantiles
-        ]
+        expected_columns = [f"quantile_P{int(q * 100):02d}" for q in expected_quantiles]
 
         # Act
         pp_forecast = ConfidenceIntervalApplicator(
@@ -208,12 +212,11 @@ def test_add_quantiles_to_forecast_untrained_quantiles_with_quantile_model(self)
         # Assert
         for expected_column in expected_columns:
             self.assertTrue(expected_column in pp_forecast.columns)
-    
-    
+
     def test_add_quantiles_to_forecast_untrained_quantiles_with_nonquantile_model(self):
         """For nonquantile models, the quantiles of the pj should be used, also if the model was not trained on those"""
         # Set up
-        pj = {"quantiles": [0.12, 0.5, 0.65]} # numbers are arbitrary
+        pj = {"quantiles": [0.12, 0.5, 0.65]}  # numbers are arbitrary
         model = MockModel()
         forecast = pd.DataFrame({"forecast": [5, 6, 7], "tAhead": [-1.0, 0.0, 1.0]})
         forecast.index = [
@@ -221,11 +224,10 @@ def test_add_quantiles_to_forecast_untrained_quantiles_with_nonquantile_model(se
             pd.Timestamp(2012, 5, 1, 1, 45),
             pd.Timestamp(2012, 5, 1, 2, 00),
         ]
+        model.can_predict_quantiles_ = False
         # Specify expectation
-        expected_quantiles = pj['quantiles']
-        expected_columns = [
-            f"quantile_P{int(q * 100):02d}" for q in expected_quantiles
-        ]
+        expected_quantiles = pj["quantiles"]
+        expected_columns = [f"quantile_P{int(q * 100):02d}" for q in expected_quantiles]
 
         # Act
         pp_forecast = ConfidenceIntervalApplicator(
@@ -235,5 +237,3 @@ def test_add_quantiles_to_forecast_untrained_quantiles_with_nonquantile_model(se
         # Assert
         for expected_column in expected_columns:
             self.assertTrue(expected_column in pp_forecast.columns)
-    
-    
\ No newline at end of file
diff --git a/test/unit/model/test_model_creator.py b/test/unit/model/test_model_creator.py
index 0ec8a9cb0..5465325a3 100644
--- a/test/unit/model/test_model_creator.py
+++ b/test/unit/model/test_model_creator.py
@@ -28,6 +28,8 @@ def test_create_model_happy_flow(self):
                 MLModelType("linear_quantile"),
                 "xgb_multioutput_quantile",
                 MLModelType("xgb_multioutput_quantile"),
+                "flatliner",
+                MLModelType("flatliner"),
             ]:
                 self.assertTrue(model.can_predict_quantiles)
             else: