Skip to content

Commit

Permalink
fix: Made use of non_null params consistent for linear quantile model…
Browse files Browse the repository at this point in the history
… so null columns are handled correctly. (#542)

* fix: Made use of non_null params consistent for linear quantile model so null columns are handled correctly.

Signed-off-by: Egor Dmitriev <egor.dmitriev@alliander.com>

* fix: Fixed (test) issues introduced by xgboost / sklearn upgrade.

Signed-off-by: Egor Dmitriev <egor.dmitriev@alliander.com>

* Format Python code with Black

Signed-off-by: black <action@github.com>

---------

Signed-off-by: Egor Dmitriev <egor.dmitriev@alliander.com>
Signed-off-by: black <action@github.com>
Co-authored-by: black <action@github.com>
  • Loading branch information
egordm and actions-user authored Jul 17, 2024
1 parent 42f9ad5 commit b16e1d1
Show file tree
Hide file tree
Showing 4 changed files with 103 additions and 15 deletions.
32 changes: 18 additions & 14 deletions openstef/feature_engineering/missing_values_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.utils.validation import check_array
from sklearn.utils.validation import check_array, check_is_fitted


class MissingValuesTransformer:
Expand Down Expand Up @@ -42,6 +42,20 @@ def __init__(
self.missing_values = missing_values
self.imputation_strategy = imputation_strategy
self.fill_value = fill_value
self.is_fitted_ = False

# Build the proper imputation transformer
# - Identity function if strategy is None
# - SimpleImputer with the dedicated strategy
if self.imputation_strategy is None:
self.imputer_ = FunctionTransformer(func=self._identity)
else:
self.imputer_ = SimpleImputer(
missing_values=self.missing_values,
strategy=self.imputation_strategy,
fill_value=self.fill_value,
).set_output(transform="pandas")
self.imputer_._validate_params()

def fit(self, x, y=None):
"""Fit the imputer on the input data."""
Expand All @@ -56,23 +70,13 @@ def fit(self, x, y=None):
is_column_null = x.isnull().all(axis="index")
self.non_null_feature_names = list(x.columns[~is_column_null])

# Build the proper imputation transformer
# - Identity function if strategy is None
# - SimpleImputer with the dedicated strategy
if self.imputation_strategy is None:
self.imputer_ = FunctionTransformer(func=self._identity)
else:
self.imputer_ = SimpleImputer(
missing_values=self.missing_values,
strategy=self.imputation_strategy,
fill_value=self.fill_value,
).set_output(transform="pandas")

# Imputers do not support labels
self.imputer_.fit(X=x, y=None)
self.imputer_.fit(X=x[self.non_null_feature_names], y=None)
self.is_fitted_ = True

def transform(self, x) -> pd.DataFrame:
"""Transform the input data by imputing missing values."""
check_is_fitted(self)
_ = check_array(x, force_all_finite="allow-nan")
if not isinstance(x, pd.DataFrame):
x = pd.DataFrame(np.asarray(x))
Expand Down
2 changes: 1 addition & 1 deletion openstef/model/regressors/linear_quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ def _get_feature_importance_from_linear(self, quantile: float = 0.5) -> np.array
return np.array(
[
reg_feature_importances_dict.get(c, 0)
for c in self.imputer_.in_feature_names
for c in self.imputer_.non_null_feature_names
]
)

Expand Down
23 changes: 23 additions & 0 deletions openstef/model/regressors/xgb.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
# SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501>
#
# SPDX-License-Identifier: MPL-2.0
from typing import Optional

import numpy as np
from sklearn.base import RegressorMixin

from xgboost import XGBRegressor

Expand All @@ -27,3 +31,22 @@ def _get_importance_names():
"gain_importance_name": "total_gain",
"weight_importance_name": "weight",
}

def fit(
self,
x: np.array,
y: np.array,
*,
early_stopping_rounds: Optional[int] = None,
callbacks: Optional[list] = None,
eval_metric: Optional[str] = None,
**kwargs
):
if early_stopping_rounds is not None:
self.set_params(early_stopping_rounds=early_stopping_rounds)
if callbacks is not None:
self.set_params(callbacks=callbacks)
if eval_metric is not None:
self.set_params(eval_metric=eval_metric)

super().fit(x, y, **kwargs)
61 changes: 61 additions & 0 deletions test/unit/feature_engineering/test_missing_values_transformer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501>
#
# SPDX-License-Identifier: MPL-2.0
from test.unit.utils.base import BaseTestCase

import unittest
import pandas as pd
import numpy as np
from sklearn.exceptions import NotFittedError
from openstef.feature_engineering.missing_values_transformer import (
MissingValuesTransformer,
)


class MissingValuesTransformerTests(BaseTestCase):
def setUp(self):
self.data = pd.DataFrame(
{"A": [1, np.nan, 3], "B": [4, 5, np.nan], "C": [np.nan, np.nan, np.nan]}
)

def test_imputation_with_mean_strategy_fills_missing_values(self):
transformer = MissingValuesTransformer(imputation_strategy="mean")
transformed = transformer.fit_transform(self.data)
self.assertEqual(transformed.isnull().sum().sum(), 0)
self.assertAlmostEqual(transformed.loc[1, "A"], 2)
self.assertAlmostEqual(transformed.loc[2, "B"], 4.5)

def test_imputation_with_constant_strategy_fills_missing_values(self):
transformer = MissingValuesTransformer(
imputation_strategy="constant", fill_value=0
)
transformed = transformer.fit_transform(self.data)
self.assertEqual(transformed.isnull().sum().sum(), 0)
self.assertEqual(transformed.loc[1, "A"], 0)
self.assertEqual(transformed.loc[2, "B"], 0)

def test_columns_always_null_are_removed(self):
transformer = MissingValuesTransformer()
transformer.fit(self.data)
self.assertNotIn("C", transformer.non_null_feature_names)

def test_non_dataframe_input_is_converted_and_processed(self):
transformer = MissingValuesTransformer(imputation_strategy="mean")
array = np.array([[1, np.nan], [np.nan, 2]])
transformed = transformer.fit_transform(array)
self.assertIsInstance(transformed, pd.DataFrame)
self.assertEqual(transformed.isnull().sum().sum(), 0)

def test_fitting_transformer_without_strategy_keeps_data_unchanged(self):
transformer = MissingValuesTransformer()
transformed = transformer.fit_transform(self.data)
pd.testing.assert_frame_equal(transformed, self.data.drop(columns=["C"]))

def test_calling_transform_before_fit_raises_error(self):
transformer = MissingValuesTransformer()
with self.assertRaises(NotFittedError):
transformer.transform(self.data)

def test_imputation_with_unsupported_strategy_raises_value_error(self):
with self.assertRaises(ValueError):
MissingValuesTransformer(imputation_strategy="unsupported")

0 comments on commit b16e1d1

Please sign in to comment.