fix: Made use of non_null params consistent for linear quantile model…

… so null columns are handled correctly. (#542) * fix: Made use of non_null params consistent for linear quantile model so null columns are handled correctly. Signed-off-by: Egor Dmitriev <egor.dmitriev@alliander.com> * fix: Fixed (test) issues introduced by xgboost / sklearn upgrade. Signed-off-by: Egor Dmitriev <egor.dmitriev@alliander.com> * Format Python code with Black Signed-off-by: black <action@github.com> --------- Signed-off-by: Egor Dmitriev <egor.dmitriev@alliander.com> Signed-off-by: black <action@github.com> Co-authored-by: black <action@github.com>
OpenSTEF · Jul 17, 2024 · b16e1d1 · b16e1d1
1 parent 42f9ad5
commit b16e1d1
Show file tree

Hide file tree

Showing 4 changed files with 103 additions and 15 deletions.
diff --git a/openstef/feature_engineering/missing_values_transformer.py b/openstef/feature_engineering/missing_values_transformer.py
@@ -7,7 +7,7 @@
 import pandas as pd
 from sklearn.impute import SimpleImputer
 from sklearn.preprocessing import FunctionTransformer
-from sklearn.utils.validation import check_array
+from sklearn.utils.validation import check_array, check_is_fitted
 
 
 class MissingValuesTransformer:
@@ -42,6 +42,20 @@ def __init__(
         self.missing_values = missing_values
         self.imputation_strategy = imputation_strategy
         self.fill_value = fill_value
+        self.is_fitted_ = False
+
+        # Build the proper imputation transformer
+        # - Identity function if strategy is None
+        # - SimpleImputer with the dedicated strategy
+        if self.imputation_strategy is None:
+            self.imputer_ = FunctionTransformer(func=self._identity)
+        else:
+            self.imputer_ = SimpleImputer(
+                missing_values=self.missing_values,
+                strategy=self.imputation_strategy,
+                fill_value=self.fill_value,
+            ).set_output(transform="pandas")
+            self.imputer_._validate_params()
 
     def fit(self, x, y=None):
         """Fit the imputer on the input data."""
@@ -56,23 +70,13 @@ def fit(self, x, y=None):
         is_column_null = x.isnull().all(axis="index")
         self.non_null_feature_names = list(x.columns[~is_column_null])
 
-        # Build the proper imputation transformer
-        # - Identity function if strategy is None
-        # - SimpleImputer with the dedicated strategy
-        if self.imputation_strategy is None:
-            self.imputer_ = FunctionTransformer(func=self._identity)
-        else:
-            self.imputer_ = SimpleImputer(
-                missing_values=self.missing_values,
-                strategy=self.imputation_strategy,
-                fill_value=self.fill_value,
-            ).set_output(transform="pandas")
-
         # Imputers do not support labels
-        self.imputer_.fit(X=x, y=None)
+        self.imputer_.fit(X=x[self.non_null_feature_names], y=None)
+        self.is_fitted_ = True
 
     def transform(self, x) -> pd.DataFrame:
         """Transform the input data by imputing missing values."""
+        check_is_fitted(self)
         _ = check_array(x, force_all_finite="allow-nan")
         if not isinstance(x, pd.DataFrame):
             x = pd.DataFrame(np.asarray(x))

diff --git a/openstef/model/regressors/linear_quantile.py b/openstef/model/regressors/linear_quantile.py
@@ -231,7 +231,7 @@ def _get_feature_importance_from_linear(self, quantile: float = 0.5) -> np.array
         return np.array(
             [
                 reg_feature_importances_dict.get(c, 0)
-                for c in self.imputer_.in_feature_names
+                for c in self.imputer_.non_null_feature_names
             ]
         )
 

diff --git a/openstef/model/regressors/xgb.py b/openstef/model/regressors/xgb.py
@@ -1,6 +1,10 @@
 # SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501>
 #
 # SPDX-License-Identifier: MPL-2.0
+from typing import Optional
+
+import numpy as np
+from sklearn.base import RegressorMixin
 
 from xgboost import XGBRegressor
 
@@ -27,3 +31,22 @@ def _get_importance_names():
             "gain_importance_name": "total_gain",
             "weight_importance_name": "weight",
         }
+
+    def fit(
+        self,
+        x: np.array,
+        y: np.array,
+        *,
+        early_stopping_rounds: Optional[int] = None,
+        callbacks: Optional[list] = None,
+        eval_metric: Optional[str] = None,
+        **kwargs
+    ):
+        if early_stopping_rounds is not None:
+            self.set_params(early_stopping_rounds=early_stopping_rounds)
+        if callbacks is not None:
+            self.set_params(callbacks=callbacks)
+        if eval_metric is not None:
+            self.set_params(eval_metric=eval_metric)
+
+        super().fit(x, y, **kwargs)
diff --git a/test/unit/feature_engineering/test_missing_values_transformer.py b/test/unit/feature_engineering/test_missing_values_transformer.py
@@ -0,0 +1,61 @@
+# SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501>
+#
+# SPDX-License-Identifier: MPL-2.0
+from test.unit.utils.base import BaseTestCase
+
+import unittest
+import pandas as pd
+import numpy as np
+from sklearn.exceptions import NotFittedError
+from openstef.feature_engineering.missing_values_transformer import (
+    MissingValuesTransformer,
+)
+
+
+class MissingValuesTransformerTests(BaseTestCase):
+    def setUp(self):
+        self.data = pd.DataFrame(
+            {"A": [1, np.nan, 3], "B": [4, 5, np.nan], "C": [np.nan, np.nan, np.nan]}
+        )
+
+    def test_imputation_with_mean_strategy_fills_missing_values(self):
+        transformer = MissingValuesTransformer(imputation_strategy="mean")
+        transformed = transformer.fit_transform(self.data)
+        self.assertEqual(transformed.isnull().sum().sum(), 0)
+        self.assertAlmostEqual(transformed.loc[1, "A"], 2)
+        self.assertAlmostEqual(transformed.loc[2, "B"], 4.5)
+
+    def test_imputation_with_constant_strategy_fills_missing_values(self):
+        transformer = MissingValuesTransformer(
+            imputation_strategy="constant", fill_value=0
+        )
+        transformed = transformer.fit_transform(self.data)
+        self.assertEqual(transformed.isnull().sum().sum(), 0)
+        self.assertEqual(transformed.loc[1, "A"], 0)
+        self.assertEqual(transformed.loc[2, "B"], 0)
+
+    def test_columns_always_null_are_removed(self):
+        transformer = MissingValuesTransformer()
+        transformer.fit(self.data)
+        self.assertNotIn("C", transformer.non_null_feature_names)
+
+    def test_non_dataframe_input_is_converted_and_processed(self):
+        transformer = MissingValuesTransformer(imputation_strategy="mean")
+        array = np.array([[1, np.nan], [np.nan, 2]])
+        transformed = transformer.fit_transform(array)
+        self.assertIsInstance(transformed, pd.DataFrame)
+        self.assertEqual(transformed.isnull().sum().sum(), 0)
+
+    def test_fitting_transformer_without_strategy_keeps_data_unchanged(self):
+        transformer = MissingValuesTransformer()
+        transformed = transformer.fit_transform(self.data)
+        pd.testing.assert_frame_equal(transformed, self.data.drop(columns=["C"]))
+
+    def test_calling_transform_before_fit_raises_error(self):
+        transformer = MissingValuesTransformer()
+        with self.assertRaises(NotFittedError):
+            transformer.transform(self.data)
+
+    def test_imputation_with_unsupported_strategy_raises_value_error(self):
+        with self.assertRaises(ValueError):
+            MissingValuesTransformer(imputation_strategy="unsupported")