From e0ebfb591a3a8cf69e7a47f8a4a58aad6e62285b Mon Sep 17 00:00:00 2001 From: Egor Dmitriev Date: Fri, 4 Oct 2024 10:38:20 +0200 Subject: [PATCH 01/22] feature(KTP-1279): Changed feature scaling in linear model. Added exponential sample weighting in linear model. Signed-off-by: Egor Dmitriev --- openstef/model/model_creator.py | 2 ++ openstef/model/regressors/linear_quantile.py | 34 ++++++++++++++++---- 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/openstef/model/model_creator.py b/openstef/model/model_creator.py index 40515fb6f..e4aa33588 100644 --- a/openstef/model/model_creator.py +++ b/openstef/model/model_creator.py @@ -116,6 +116,8 @@ "missing_values", "imputation_strategy", "fill_value", + "weight_scale_percentile", + "weight_exponent", ], ModelType.ARIMA: [ "backtest_max_horizon", diff --git a/openstef/model/regressors/linear_quantile.py b/openstef/model/regressors/linear_quantile.py index 5e64fa4b3..57cfa6961 100644 --- a/openstef/model/regressors/linear_quantile.py +++ b/openstef/model/regressors/linear_quantile.py @@ -8,7 +8,7 @@ import pandas as pd from sklearn.base import RegressorMixin from sklearn.linear_model import QuantileRegressor -from sklearn.preprocessing import MinMaxScaler +from sklearn.preprocessing import StandardScaler from sklearn.utils.validation import check_is_fitted from openstef.feature_engineering.missing_values_transformer import ( @@ -25,8 +25,8 @@ class LinearQuantileOpenstfRegressor(OpenstfRegressor, RegressorMixin): solver: str imputer_: MissingValuesTransformer - x_scaler_: MinMaxScaler - y_scaler_: MinMaxScaler + x_scaler_: StandardScaler + y_scaler_: StandardScaler models_: Dict[float, QuantileRegressor] is_fitted_: bool = False @@ -47,6 +47,8 @@ def __init__( missing_values: Union[int, float, str, None] = np.nan, imputation_strategy: Optional[str] = "mean", fill_value: Union[str, int, float] = None, + weight_scale_percentile: int = 95, + weight_exponent: float = 1, ): """Initialize 
LinearQuantileOpenstfRegressor. @@ -82,13 +84,15 @@ def __init__( self.quantiles = quantiles self.alpha = alpha self.solver = solver + self.weight_scale_percentile = weight_scale_percentile + self.weight_exponent = weight_exponent self.imputer_ = MissingValuesTransformer( missing_values=missing_values, imputation_strategy=imputation_strategy, fill_value=fill_value, ) - self.x_scaler_ = MinMaxScaler(feature_range=(-1, 1)) - self.y_scaler_ = MinMaxScaler(feature_range=(-1, 1)) + self.x_scaler_ = StandardScaler() + self.y_scaler_ = StandardScaler() self.models_ = { quantile: QuantileRegressor(alpha=alpha, quantile=quantile, solver=solver) for quantile in quantiles @@ -177,7 +181,7 @@ def fit(self, x: pd.DataFrame, y: pd.Series, **kwargs) -> RegressorMixin: y_scaled = self.y_scaler_.fit_transform(y.to_frame())[:, 0] # Add more focus on extreme / peak values - sample_weight = np.abs(y_scaled) + sample_weight = self._calculate_sample_weights(y.values.squeeze()) # Fit quantile regressors for quantile in self.quantiles: @@ -191,6 +195,16 @@ def fit(self, x: pd.DataFrame, y: pd.Series, **kwargs) -> RegressorMixin: return self + def _calculate_sample_weights(self, y: np.array): + return np.clip( + _weight_exp( + _scale_percentile(y, percentile=self.weight_scale_percentile), + exponent=self.weight_exponent, + ), + a_min=0, + a_max=1, + ) + def predict(self, x: pd.DataFrame, quantile: float = 0.5, **kwargs) -> np.array: """Makes a prediction for a desired quantile. 
@@ -245,3 +259,11 @@ def _get_param_names(cls): def __sklearn_is_fitted__(self) -> bool: return self.is_fitted_ + + +def _scale_percentile(x: np.ndarray, percentile: int = 95): + return np.abs(x / np.percentile(np.abs(x), percentile)) + + +def _weight_exp(x: np.ndarray, exponent: float = 1): + return np.abs(x) ** exponent From 870b8a8b864b8877f9fd2b0b603714333ccf42b0 Mon Sep 17 00:00:00 2001 From: Egor Dmitriev Date: Fri, 4 Oct 2024 10:45:31 +0200 Subject: [PATCH 02/22] feature(KTP-1279): Added test to change linear model parameters. Signed-off-by: Egor Dmitriev --- .../model/regressors/test_linear_quantile.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/test/unit/model/regressors/test_linear_quantile.py b/test/unit/model/regressors/test_linear_quantile.py index b6ed316a1..94b93810b 100644 --- a/test/unit/model/regressors/test_linear_quantile.py +++ b/test/unit/model/regressors/test_linear_quantile.py @@ -10,6 +10,7 @@ from sklearn.utils.estimator_checks import check_estimator from openstef.feature_engineering.apply_features import apply_features +from openstef.model.model_creator import ModelCreator from openstef.model.regressors.linear_quantile import LinearQuantileOpenstfRegressor from test.unit.utils.base import BaseTestCase from test.unit.utils.data import TestData @@ -144,3 +145,22 @@ def test_ignore_features(self): self.assertNotIn("E1B_AMI_I", input_data_filtered.columns) self.assertNotIn("E4A_I", input_data_filtered.columns) self.assertIn("load", input_data_filtered.columns) + + def test_create_model(self): + # Arrange + kwargs = { + "weight_scale_percentile": 50, + "weight_exponent": 2, + } + + # Act + model = ModelCreator.create_model( + model_type="linear_quantile", + quantiles=[0.5], + **kwargs, + ) + + # Assert + self.assertIsInstance(model, LinearQuantileOpenstfRegressor) + self.assertEqual(model.weight_scale_percentile, 50) + self.assertEqual(model.weight_exponent, 2) From c7f550aa470d30a57245e15d209c7e6004dd0d1e Mon Sep 17 
00:00:00 2001 From: Egor Dmitriev Date: Fri, 4 Oct 2024 10:53:50 +0200 Subject: [PATCH 03/22] style: Code style fixes. Signed-off-by: Egor Dmitriev --- openstef/model/regressors/custom_regressor.py | 6 ++---- openstef/model/standard_deviation_generator.py | 6 +++--- test/unit/data_classes/test_split_function.py | 6 +++--- test/unit/model/test_custom_models.py | 3 +-- test/unit/pipeline/test_create_basecase.py | 12 ++++++------ test/unit/pipeline/test_pipeline_train_model.py | 3 +-- 6 files changed, 16 insertions(+), 20 deletions(-) diff --git a/openstef/model/regressors/custom_regressor.py b/openstef/model/regressors/custom_regressor.py index 33939cedd..3c3e75116 100644 --- a/openstef/model/regressors/custom_regressor.py +++ b/openstef/model/regressors/custom_regressor.py @@ -23,13 +23,11 @@ class CustomOpenstfRegressor(OpenstfRegressor): @staticmethod @abstractmethod - def valid_kwargs() -> list[str]: - ... + def valid_kwargs() -> list[str]: ... @classmethod @abstractmethod - def objective(self) -> Type[RegressorObjective]: - ... + def objective(self) -> Type[RegressorObjective]: ... 
def load_custom_model(custom_model_path) -> CustomOpenstfRegressor: diff --git a/openstef/model/standard_deviation_generator.py b/openstef/model/standard_deviation_generator.py index f268b4b0e..957077152 100644 --- a/openstef/model/standard_deviation_generator.py +++ b/openstef/model/standard_deviation_generator.py @@ -40,9 +40,9 @@ def generate_standard_deviation_data(self, model: RegressorMixin) -> RegressorMi confidence_interval_horizon = self._calculate_standard_deviation( sub_val.iloc[:, 0], predicted ) - confidence_interval_horizon[ - "horizon" - ] = horizon # Label with respective horizon + confidence_interval_horizon["horizon"] = ( + horizon # Label with respective horizon + ) self.standard_deviation = pd.concat( [self.standard_deviation, confidence_interval_horizon] ) diff --git a/test/unit/data_classes/test_split_function.py b/test/unit/data_classes/test_split_function.py index 5938bd3da..47e6c9062 100644 --- a/test/unit/data_classes/test_split_function.py +++ b/test/unit/data_classes/test_split_function.py @@ -62,9 +62,9 @@ def test_load(self): # Non Callable object split_func_dc = copy.deepcopy(self.split_func_with_strings) - split_func_dc[ - "function" - ] = "test.unit.data_classes.test_split_function.dummy_not_func" + split_func_dc["function"] = ( + "test.unit.data_classes.test_split_function.dummy_not_func" + ) with self.assertRaises(ValueError): _ = split_func_dc.load() diff --git a/test/unit/model/test_custom_models.py b/test/unit/model/test_custom_models.py index 3f51829c3..22c889a2f 100644 --- a/test/unit/model/test_custom_models.py +++ b/test/unit/model/test_custom_models.py @@ -16,8 +16,7 @@ ) -class DummyObjective(RegressorObjective): - ... +class DummyObjective(RegressorObjective): ... 
class DummyRegressor(CustomOpenstfRegressor): diff --git a/test/unit/pipeline/test_create_basecase.py b/test/unit/pipeline/test_create_basecase.py index fdd3b0999..1da571b54 100644 --- a/test/unit/pipeline/test_create_basecase.py +++ b/test/unit/pipeline/test_create_basecase.py @@ -19,9 +19,9 @@ def setUp(self) -> None: self.PJ = TestData.get_prediction_job(pid=307) forecast_input = TestData.load("reference_sets/307-test-data.csv") # Set last 7 days to nan, just like operationally - forecast_input.loc[ - forecast_input.index.max() - timedelta(days=7) :, "load" - ] = np.nan + forecast_input.loc[forecast_input.index.max() - timedelta(days=7) :, "load"] = ( + np.nan + ) # Shift so the input matches 'now' offset_seconds = ( pd.to_datetime(datetime.utcnow(), utc=True) @@ -80,9 +80,9 @@ def test_create_basecase_forecast_pipeline_constant_load(self): forecast_input.loc[ forecast_input.index.max() - timedelta(days=21) :, "load" ] = forecast_input.loc[forecast_input.index.max() - timedelta(days=14), "load"] - forecast_input.loc[ - forecast_input.index.max() - timedelta(days=7) :, "load" - ] = np.nan + forecast_input.loc[forecast_input.index.max() - timedelta(days=7) :, "load"] = ( + np.nan + ) base_case_forecast = create_basecase_forecast_pipeline(self.PJ, forecast_input) diff --git a/test/unit/pipeline/test_pipeline_train_model.py b/test/unit/pipeline/test_pipeline_train_model.py index 2809816ef..3bcc3a3ed 100644 --- a/test/unit/pipeline/test_pipeline_train_model.py +++ b/test/unit/pipeline/test_pipeline_train_model.py @@ -39,8 +39,7 @@ from openstef.validation import validation -class DummyObjective(RegressorObjective): - ... +class DummyObjective(RegressorObjective): ... 
class DummyRegressor(CustomOpenstfRegressor): From ad2eba67ea9df5a168779350c9ac975c426c5fb9 Mon Sep 17 00:00:00 2001 From: black Date: Fri, 4 Oct 2024 08:55:29 +0000 Subject: [PATCH 04/22] Format Python code with Black Signed-off-by: black --- openstef/model/regressors/custom_regressor.py | 6 ++++-- openstef/model/standard_deviation_generator.py | 6 +++--- test/unit/data_classes/test_split_function.py | 6 +++--- test/unit/model/test_custom_models.py | 3 ++- test/unit/pipeline/test_create_basecase.py | 12 ++++++------ test/unit/pipeline/test_pipeline_train_model.py | 3 ++- 6 files changed, 20 insertions(+), 16 deletions(-) diff --git a/openstef/model/regressors/custom_regressor.py b/openstef/model/regressors/custom_regressor.py index 3c3e75116..33939cedd 100644 --- a/openstef/model/regressors/custom_regressor.py +++ b/openstef/model/regressors/custom_regressor.py @@ -23,11 +23,13 @@ class CustomOpenstfRegressor(OpenstfRegressor): @staticmethod @abstractmethod - def valid_kwargs() -> list[str]: ... + def valid_kwargs() -> list[str]: + ... @classmethod @abstractmethod - def objective(self) -> Type[RegressorObjective]: ... + def objective(self) -> Type[RegressorObjective]: + ... 
def load_custom_model(custom_model_path) -> CustomOpenstfRegressor: diff --git a/openstef/model/standard_deviation_generator.py b/openstef/model/standard_deviation_generator.py index 957077152..f268b4b0e 100644 --- a/openstef/model/standard_deviation_generator.py +++ b/openstef/model/standard_deviation_generator.py @@ -40,9 +40,9 @@ def generate_standard_deviation_data(self, model: RegressorMixin) -> RegressorMi confidence_interval_horizon = self._calculate_standard_deviation( sub_val.iloc[:, 0], predicted ) - confidence_interval_horizon["horizon"] = ( - horizon # Label with respective horizon - ) + confidence_interval_horizon[ + "horizon" + ] = horizon # Label with respective horizon self.standard_deviation = pd.concat( [self.standard_deviation, confidence_interval_horizon] ) diff --git a/test/unit/data_classes/test_split_function.py b/test/unit/data_classes/test_split_function.py index 47e6c9062..5938bd3da 100644 --- a/test/unit/data_classes/test_split_function.py +++ b/test/unit/data_classes/test_split_function.py @@ -62,9 +62,9 @@ def test_load(self): # Non Callable object split_func_dc = copy.deepcopy(self.split_func_with_strings) - split_func_dc["function"] = ( - "test.unit.data_classes.test_split_function.dummy_not_func" - ) + split_func_dc[ + "function" + ] = "test.unit.data_classes.test_split_function.dummy_not_func" with self.assertRaises(ValueError): _ = split_func_dc.load() diff --git a/test/unit/model/test_custom_models.py b/test/unit/model/test_custom_models.py index 22c889a2f..3f51829c3 100644 --- a/test/unit/model/test_custom_models.py +++ b/test/unit/model/test_custom_models.py @@ -16,7 +16,8 @@ ) -class DummyObjective(RegressorObjective): ... +class DummyObjective(RegressorObjective): + ... 
class DummyRegressor(CustomOpenstfRegressor): diff --git a/test/unit/pipeline/test_create_basecase.py b/test/unit/pipeline/test_create_basecase.py index 1da571b54..fdd3b0999 100644 --- a/test/unit/pipeline/test_create_basecase.py +++ b/test/unit/pipeline/test_create_basecase.py @@ -19,9 +19,9 @@ def setUp(self) -> None: self.PJ = TestData.get_prediction_job(pid=307) forecast_input = TestData.load("reference_sets/307-test-data.csv") # Set last 7 days to nan, just like operationally - forecast_input.loc[forecast_input.index.max() - timedelta(days=7) :, "load"] = ( - np.nan - ) + forecast_input.loc[ + forecast_input.index.max() - timedelta(days=7) :, "load" + ] = np.nan # Shift so the input matches 'now' offset_seconds = ( pd.to_datetime(datetime.utcnow(), utc=True) @@ -80,9 +80,9 @@ def test_create_basecase_forecast_pipeline_constant_load(self): forecast_input.loc[ forecast_input.index.max() - timedelta(days=21) :, "load" ] = forecast_input.loc[forecast_input.index.max() - timedelta(days=14), "load"] - forecast_input.loc[forecast_input.index.max() - timedelta(days=7) :, "load"] = ( - np.nan - ) + forecast_input.loc[ + forecast_input.index.max() - timedelta(days=7) :, "load" + ] = np.nan base_case_forecast = create_basecase_forecast_pipeline(self.PJ, forecast_input) diff --git a/test/unit/pipeline/test_pipeline_train_model.py b/test/unit/pipeline/test_pipeline_train_model.py index 3bcc3a3ed..2809816ef 100644 --- a/test/unit/pipeline/test_pipeline_train_model.py +++ b/test/unit/pipeline/test_pipeline_train_model.py @@ -39,7 +39,8 @@ from openstef.validation import validation -class DummyObjective(RegressorObjective): ... +class DummyObjective(RegressorObjective): + ... class DummyRegressor(CustomOpenstfRegressor): From 20fd99c3b43920be5fa8b230cdbaeee894eeaade Mon Sep 17 00:00:00 2001 From: Egor Dmitriev Date: Fri, 4 Oct 2024 11:04:09 +0200 Subject: [PATCH 05/22] feature(KTP-1279): Added additional test condition for linear model params. 
Signed-off-by: Egor Dmitriev --- test/unit/model/regressors/test_linear_quantile.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/unit/model/regressors/test_linear_quantile.py b/test/unit/model/regressors/test_linear_quantile.py index 94b93810b..8f22bb0df 100644 --- a/test/unit/model/regressors/test_linear_quantile.py +++ b/test/unit/model/regressors/test_linear_quantile.py @@ -164,3 +164,4 @@ def test_create_model(self): self.assertIsInstance(model, LinearQuantileOpenstfRegressor) self.assertEqual(model.weight_scale_percentile, 50) self.assertEqual(model.weight_exponent, 2) + self.assertEqual(model.quantiles, [0.5]) From 094391aad7dd286af6609f2af78728a599fb2426 Mon Sep 17 00:00:00 2001 From: Egor Dmitriev Date: Fri, 4 Oct 2024 11:13:46 +0200 Subject: [PATCH 06/22] style: Code style fixes. Signed-off-by: Egor Dmitriev --- openstef/model/regressors/custom_regressor.py | 6 ++---- openstef/model/standard_deviation_generator.py | 6 +++--- test/unit/data_classes/test_split_function.py | 6 +++--- test/unit/model/test_custom_models.py | 3 +-- test/unit/pipeline/test_create_basecase.py | 12 ++++++------ test/unit/pipeline/test_pipeline_train_model.py | 3 +-- 6 files changed, 16 insertions(+), 20 deletions(-) diff --git a/openstef/model/regressors/custom_regressor.py b/openstef/model/regressors/custom_regressor.py index 33939cedd..3c3e75116 100644 --- a/openstef/model/regressors/custom_regressor.py +++ b/openstef/model/regressors/custom_regressor.py @@ -23,13 +23,11 @@ class CustomOpenstfRegressor(OpenstfRegressor): @staticmethod @abstractmethod - def valid_kwargs() -> list[str]: - ... + def valid_kwargs() -> list[str]: ... @classmethod @abstractmethod - def objective(self) -> Type[RegressorObjective]: - ... + def objective(self) -> Type[RegressorObjective]: ... 
def load_custom_model(custom_model_path) -> CustomOpenstfRegressor: diff --git a/openstef/model/standard_deviation_generator.py b/openstef/model/standard_deviation_generator.py index f268b4b0e..957077152 100644 --- a/openstef/model/standard_deviation_generator.py +++ b/openstef/model/standard_deviation_generator.py @@ -40,9 +40,9 @@ def generate_standard_deviation_data(self, model: RegressorMixin) -> RegressorMi confidence_interval_horizon = self._calculate_standard_deviation( sub_val.iloc[:, 0], predicted ) - confidence_interval_horizon[ - "horizon" - ] = horizon # Label with respective horizon + confidence_interval_horizon["horizon"] = ( + horizon # Label with respective horizon + ) self.standard_deviation = pd.concat( [self.standard_deviation, confidence_interval_horizon] ) diff --git a/test/unit/data_classes/test_split_function.py b/test/unit/data_classes/test_split_function.py index 5938bd3da..47e6c9062 100644 --- a/test/unit/data_classes/test_split_function.py +++ b/test/unit/data_classes/test_split_function.py @@ -62,9 +62,9 @@ def test_load(self): # Non Callable object split_func_dc = copy.deepcopy(self.split_func_with_strings) - split_func_dc[ - "function" - ] = "test.unit.data_classes.test_split_function.dummy_not_func" + split_func_dc["function"] = ( + "test.unit.data_classes.test_split_function.dummy_not_func" + ) with self.assertRaises(ValueError): _ = split_func_dc.load() diff --git a/test/unit/model/test_custom_models.py b/test/unit/model/test_custom_models.py index 3f51829c3..22c889a2f 100644 --- a/test/unit/model/test_custom_models.py +++ b/test/unit/model/test_custom_models.py @@ -16,8 +16,7 @@ ) -class DummyObjective(RegressorObjective): - ... +class DummyObjective(RegressorObjective): ... 
class DummyRegressor(CustomOpenstfRegressor): diff --git a/test/unit/pipeline/test_create_basecase.py b/test/unit/pipeline/test_create_basecase.py index fdd3b0999..1da571b54 100644 --- a/test/unit/pipeline/test_create_basecase.py +++ b/test/unit/pipeline/test_create_basecase.py @@ -19,9 +19,9 @@ def setUp(self) -> None: self.PJ = TestData.get_prediction_job(pid=307) forecast_input = TestData.load("reference_sets/307-test-data.csv") # Set last 7 days to nan, just like operationally - forecast_input.loc[ - forecast_input.index.max() - timedelta(days=7) :, "load" - ] = np.nan + forecast_input.loc[forecast_input.index.max() - timedelta(days=7) :, "load"] = ( + np.nan + ) # Shift so the input matches 'now' offset_seconds = ( pd.to_datetime(datetime.utcnow(), utc=True) @@ -80,9 +80,9 @@ def test_create_basecase_forecast_pipeline_constant_load(self): forecast_input.loc[ forecast_input.index.max() - timedelta(days=21) :, "load" ] = forecast_input.loc[forecast_input.index.max() - timedelta(days=14), "load"] - forecast_input.loc[ - forecast_input.index.max() - timedelta(days=7) :, "load" - ] = np.nan + forecast_input.loc[forecast_input.index.max() - timedelta(days=7) :, "load"] = ( + np.nan + ) base_case_forecast = create_basecase_forecast_pipeline(self.PJ, forecast_input) diff --git a/test/unit/pipeline/test_pipeline_train_model.py b/test/unit/pipeline/test_pipeline_train_model.py index 2809816ef..3bcc3a3ed 100644 --- a/test/unit/pipeline/test_pipeline_train_model.py +++ b/test/unit/pipeline/test_pipeline_train_model.py @@ -39,8 +39,7 @@ from openstef.validation import validation -class DummyObjective(RegressorObjective): - ... +class DummyObjective(RegressorObjective): ... 
class DummyRegressor(CustomOpenstfRegressor): From 0667ec25227b69acf9819f9e60da7c5d69c3dcf6 Mon Sep 17 00:00:00 2001 From: black Date: Fri, 4 Oct 2024 09:14:22 +0000 Subject: [PATCH 07/22] Format Python code with Black Signed-off-by: black --- openstef/model/regressors/custom_regressor.py | 6 ++++-- openstef/model/standard_deviation_generator.py | 6 +++--- test/unit/data_classes/test_split_function.py | 6 +++--- test/unit/model/test_custom_models.py | 3 ++- test/unit/pipeline/test_create_basecase.py | 12 ++++++------ test/unit/pipeline/test_pipeline_train_model.py | 3 ++- 6 files changed, 20 insertions(+), 16 deletions(-) diff --git a/openstef/model/regressors/custom_regressor.py b/openstef/model/regressors/custom_regressor.py index 3c3e75116..33939cedd 100644 --- a/openstef/model/regressors/custom_regressor.py +++ b/openstef/model/regressors/custom_regressor.py @@ -23,11 +23,13 @@ class CustomOpenstfRegressor(OpenstfRegressor): @staticmethod @abstractmethod - def valid_kwargs() -> list[str]: ... + def valid_kwargs() -> list[str]: + ... @classmethod @abstractmethod - def objective(self) -> Type[RegressorObjective]: ... + def objective(self) -> Type[RegressorObjective]: + ... 
def load_custom_model(custom_model_path) -> CustomOpenstfRegressor: diff --git a/openstef/model/standard_deviation_generator.py b/openstef/model/standard_deviation_generator.py index 957077152..f268b4b0e 100644 --- a/openstef/model/standard_deviation_generator.py +++ b/openstef/model/standard_deviation_generator.py @@ -40,9 +40,9 @@ def generate_standard_deviation_data(self, model: RegressorMixin) -> RegressorMi confidence_interval_horizon = self._calculate_standard_deviation( sub_val.iloc[:, 0], predicted ) - confidence_interval_horizon["horizon"] = ( - horizon # Label with respective horizon - ) + confidence_interval_horizon[ + "horizon" + ] = horizon # Label with respective horizon self.standard_deviation = pd.concat( [self.standard_deviation, confidence_interval_horizon] ) diff --git a/test/unit/data_classes/test_split_function.py b/test/unit/data_classes/test_split_function.py index 47e6c9062..5938bd3da 100644 --- a/test/unit/data_classes/test_split_function.py +++ b/test/unit/data_classes/test_split_function.py @@ -62,9 +62,9 @@ def test_load(self): # Non Callable object split_func_dc = copy.deepcopy(self.split_func_with_strings) - split_func_dc["function"] = ( - "test.unit.data_classes.test_split_function.dummy_not_func" - ) + split_func_dc[ + "function" + ] = "test.unit.data_classes.test_split_function.dummy_not_func" with self.assertRaises(ValueError): _ = split_func_dc.load() diff --git a/test/unit/model/test_custom_models.py b/test/unit/model/test_custom_models.py index 22c889a2f..3f51829c3 100644 --- a/test/unit/model/test_custom_models.py +++ b/test/unit/model/test_custom_models.py @@ -16,7 +16,8 @@ ) -class DummyObjective(RegressorObjective): ... +class DummyObjective(RegressorObjective): + ... 
class DummyRegressor(CustomOpenstfRegressor): diff --git a/test/unit/pipeline/test_create_basecase.py b/test/unit/pipeline/test_create_basecase.py index 1da571b54..fdd3b0999 100644 --- a/test/unit/pipeline/test_create_basecase.py +++ b/test/unit/pipeline/test_create_basecase.py @@ -19,9 +19,9 @@ def setUp(self) -> None: self.PJ = TestData.get_prediction_job(pid=307) forecast_input = TestData.load("reference_sets/307-test-data.csv") # Set last 7 days to nan, just like operationally - forecast_input.loc[forecast_input.index.max() - timedelta(days=7) :, "load"] = ( - np.nan - ) + forecast_input.loc[ + forecast_input.index.max() - timedelta(days=7) :, "load" + ] = np.nan # Shift so the input matches 'now' offset_seconds = ( pd.to_datetime(datetime.utcnow(), utc=True) @@ -80,9 +80,9 @@ def test_create_basecase_forecast_pipeline_constant_load(self): forecast_input.loc[ forecast_input.index.max() - timedelta(days=21) :, "load" ] = forecast_input.loc[forecast_input.index.max() - timedelta(days=14), "load"] - forecast_input.loc[forecast_input.index.max() - timedelta(days=7) :, "load"] = ( - np.nan - ) + forecast_input.loc[ + forecast_input.index.max() - timedelta(days=7) :, "load" + ] = np.nan base_case_forecast = create_basecase_forecast_pipeline(self.PJ, forecast_input) diff --git a/test/unit/pipeline/test_pipeline_train_model.py b/test/unit/pipeline/test_pipeline_train_model.py index 3bcc3a3ed..2809816ef 100644 --- a/test/unit/pipeline/test_pipeline_train_model.py +++ b/test/unit/pipeline/test_pipeline_train_model.py @@ -39,7 +39,8 @@ from openstef.validation import validation -class DummyObjective(RegressorObjective): ... +class DummyObjective(RegressorObjective): + ... class DummyRegressor(CustomOpenstfRegressor): From f85f2e694210e64669ae9b58e66b23cbce897de6 Mon Sep 17 00:00:00 2001 From: Egor Dmitriev Date: Fri, 4 Oct 2024 11:14:50 +0200 Subject: [PATCH 08/22] feature(KTP-1279): Added additional test condition for linear model params. 
Signed-off-by: Egor Dmitriev --- test/unit/model/regressors/test_linear_quantile.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/unit/model/regressors/test_linear_quantile.py b/test/unit/model/regressors/test_linear_quantile.py index 8f22bb0df..94b93810b 100644 --- a/test/unit/model/regressors/test_linear_quantile.py +++ b/test/unit/model/regressors/test_linear_quantile.py @@ -164,4 +164,3 @@ def test_create_model(self): self.assertIsInstance(model, LinearQuantileOpenstfRegressor) self.assertEqual(model.weight_scale_percentile, 50) self.assertEqual(model.weight_exponent, 2) - self.assertEqual(model.quantiles, [0.5]) From 6f4afa7dae9c228eeddd9dc3e87b4aeffc91f7f9 Mon Sep 17 00:00:00 2001 From: Egor Dmitriev Date: Mon, 7 Oct 2024 16:08:03 +0200 Subject: [PATCH 09/22] feature(KTP-1279): Added weight floor. Added documentation for sample weight calculation. Signed-off-by: Egor Dmitriev --- openstef/model/model_creator.py | 1 + openstef/model/regressors/linear_quantile.py | 21 +++++++++++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/openstef/model/model_creator.py b/openstef/model/model_creator.py index e4aa33588..cc91b32d5 100644 --- a/openstef/model/model_creator.py +++ b/openstef/model/model_creator.py @@ -118,6 +118,7 @@ "fill_value", "weight_scale_percentile", "weight_exponent", + "weight_floor", ], ModelType.ARIMA: [ "backtest_max_horizon", diff --git a/openstef/model/regressors/linear_quantile.py b/openstef/model/regressors/linear_quantile.py index 57cfa6961..96242e2fc 100644 --- a/openstef/model/regressors/linear_quantile.py +++ b/openstef/model/regressors/linear_quantile.py @@ -49,6 +49,7 @@ def __init__( fill_value: Union[str, int, float] = None, weight_scale_percentile: int = 95, weight_exponent: float = 1, + weight_floor: float = 0.1, ): """Initialize LinearQuantileOpenstfRegressor. 
@@ -86,6 +87,7 @@ def __init__( self.solver = solver self.weight_scale_percentile = weight_scale_percentile self.weight_exponent = weight_exponent + self.weight_floor = weight_floor self.imputer_ = MissingValuesTransformer( missing_values=missing_values, imputation_strategy=imputation_strategy, @@ -196,12 +198,29 @@ def fit(self, x: pd.DataFrame, y: pd.Series, **kwargs) -> RegressorMixin: return self def _calculate_sample_weights(self, y: np.array): + """ + Calculate sample weights based on the y values of arbitrary scale. The resulting + weights are in the range [0, 1] and are used to put more emphasis on certain + samples. + + The sample weighting function does: + * Rescale data to a [-1, 1] range using quantile scaling. 90% of the data will + be within this range. Rest is outside. + * Calculate the weight by taking the exponent of scaled data. + * exponent=0: Results in uniform weights for all samples. + * exponent=1: Results in linearly increasing weights for samples that are + closer to the extremes. + * exponent>1: Results in exponentially increasing weights for samples that are + closer to the extremes. + * Clip the data to [0, 1] range with weight_floor as the minimum weight. + * Weight floor is used to make sure that all the samples are considered. 
+ """ return np.clip( _weight_exp( _scale_percentile(y, percentile=self.weight_scale_percentile), exponent=self.weight_exponent, ), - a_min=0, + a_min=self.weight_floor, a_max=1, ) From 6c5e67d21c453459d72a52c80cb21c75c4eaa197 Mon Sep 17 00:00:00 2001 From: black Date: Mon, 7 Oct 2024 14:10:24 +0000 Subject: [PATCH 10/22] Format Python code with Black Signed-off-by: black --- openstef/model/regressors/linear_quantile.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/openstef/model/regressors/linear_quantile.py b/openstef/model/regressors/linear_quantile.py index 96242e2fc..00401cc12 100644 --- a/openstef/model/regressors/linear_quantile.py +++ b/openstef/model/regressors/linear_quantile.py @@ -198,10 +198,8 @@ def fit(self, x: pd.DataFrame, y: pd.Series, **kwargs) -> RegressorMixin: return self def _calculate_sample_weights(self, y: np.array): - """ - Calculate sample weights based on the y values of arbitrary scale. The resulting - weights are in the range [0, 1] and are used to put more emphasis on certain - samples. + """Calculate sample weights based on the y values of arbitrary scale. The resulting weights are in the range [0, + 1] and are used to put more emphasis on certain samples. The sample weighting function does: * Rescale data to a [-1, 1] range using quantile scaling. 90% of the data will @@ -214,6 +212,7 @@ def _calculate_sample_weights(self, y: np.array): closer to the extremes. * Clip the data to [0, 1] range with weight_floor as the minimum weight. * Weight floor is used to make sure that all the samples are considered. 
+ """ return np.clip( _weight_exp( From 385c356398e960fe8247b36f3e1c2cf8780855f9 Mon Sep 17 00:00:00 2001 From: Clara De Smet Date: Thu, 10 Oct 2024 13:51:30 +0200 Subject: [PATCH 11/22] Merge branch 'main' into feature/KTP-1279-linear-sample-weight Signed-off-by: Clara De Smet --- .../missing_values_transformer.py | 47 +++++++++++-- openstef/model/model_creator.py | 1 + openstef/model/regressors/flatliner.py | 7 +- openstef/model/regressors/linear_quantile.py | 9 ++- setup.py | 2 +- .../test_missing_values_transformer.py | 69 +++++++++++++++---- .../model/regressors/test_linear_quantile.py | 15 ++-- .../pipeline/test_pipeline_train_model.py | 7 +- 8 files changed, 121 insertions(+), 36 deletions(-) diff --git a/openstef/feature_engineering/missing_values_transformer.py b/openstef/feature_engineering/missing_values_transformer.py index b2ced0895..7c46e5192 100644 --- a/openstef/feature_engineering/missing_values_transformer.py +++ b/openstef/feature_engineering/missing_values_transformer.py @@ -27,6 +27,7 @@ def __init__( missing_values: Union[int, float, str, None] = np.nan, imputation_strategy: str = None, fill_value: Union[str, int, float] = None, + no_fill_future_values_features: List[str] = None, ): """Initialize missing values handler. @@ -37,11 +38,14 @@ def __init__( Can be one of "mean", "median", "most_frequent", "constant" or None. fill_value: When strategy == "constant", fill_value is used to replace all occurrences of missing_values. - + no_fill_future_values_features: The features for which it does not make sense + to fill future values. Rows that contain trailing null values for these + features will be removed from the data. 
""" self.missing_values = missing_values self.imputation_strategy = imputation_strategy self.fill_value = fill_value + self.no_fill_future_values_features = no_fill_future_values_features or [] self.is_fitted_ = False # Build the proper imputation transformer @@ -57,6 +61,11 @@ def __init__( ).set_output(transform="pandas") self.imputer_._validate_params() + @staticmethod + def _determine_trailing_null_rows(x: pd.DataFrame) -> pd.Series: + """Determine rows with trailing null values in a DataFrame.""" + return ~x.bfill().isnull().any(axis="columns") + def fit(self, x, y=None): """Fit the imputer on the input data.""" _ = check_array(x, force_all_finite="allow-nan") @@ -69,9 +78,17 @@ def fit(self, x, y=None): # Remove always null columns is_column_null = x.isnull().all(axis="index") self.non_null_feature_names = list(x.columns[~is_column_null]) + x = x[self.non_null_feature_names] + + # Remove trailing null rows for features that should + # not be imputed in the future + trailing_null_rows = self._determine_trailing_null_rows( + x[self.no_fill_future_values_features] + ) + x = x.loc[trailing_null_rows] # Imputers do not support labels - self.imputer_.fit(X=x[self.non_null_feature_names], y=None) + self.imputer_.fit(X=x, y=None) self.is_fitted_ = True def transform(self, x) -> pd.DataFrame: @@ -83,9 +100,11 @@ def transform(self, x) -> pd.DataFrame: x = x[self.non_null_feature_names] - return self.imputer_.transform(x) + transformed = self.imputer_.transform(x) - def fit_transform(self, x, y=None): + return transformed + + def fit_transform(self, x, y=None) -> tuple[pd.DataFrame, Optional[pd.Series]]: """Fit the imputer on the input data and transform it. 
Returns: @@ -93,7 +112,25 @@ def fit_transform(self, x, y=None): """ self.fit(x, y) - return self.transform(x) + + if not isinstance(x, pd.DataFrame): + x = pd.DataFrame(np.asarray(x)) + + x = x[self.non_null_feature_names] + + # Remove trailing null rows for features that should + # not be imputed in the future + non_trailing_null_rows = self._determine_trailing_null_rows( + x[self.no_fill_future_values_features] + ) + x = x.loc[non_trailing_null_rows] + + x = self.transform(x) + + if y is not None: + y = y.loc[non_trailing_null_rows] + + return x, y @classmethod def _identity(cls, x): diff --git a/openstef/model/model_creator.py b/openstef/model/model_creator.py index cc91b32d5..32c108868 100644 --- a/openstef/model/model_creator.py +++ b/openstef/model/model_creator.py @@ -119,6 +119,7 @@ "weight_scale_percentile", "weight_exponent", "weight_floor", + "no_fill_future_values_features", ], ModelType.ARIMA: [ "backtest_max_horizon", diff --git a/openstef/model/regressors/flatliner.py b/openstef/model/regressors/flatliner.py index 764773d52..995052bbf 100644 --- a/openstef/model/regressors/flatliner.py +++ b/openstef/model/regressors/flatliner.py @@ -2,18 +2,13 @@ # # SPDX-License-Identifier: MPL-2.0 import re -from typing import Dict, Union, Set, Optional, List +from typing import List import numpy as np import pandas as pd from sklearn.base import RegressorMixin -from sklearn.linear_model import QuantileRegressor -from sklearn.preprocessing import MinMaxScaler from sklearn.utils.validation import check_is_fitted -from openstef.feature_engineering.missing_values_transformer import ( - MissingValuesTransformer, -) from openstef.model.regressors.regressor import OpenstfRegressor diff --git a/openstef/model/regressors/linear_quantile.py b/openstef/model/regressors/linear_quantile.py index 00401cc12..2e126136e 100644 --- a/openstef/model/regressors/linear_quantile.py +++ b/openstef/model/regressors/linear_quantile.py @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: MPL-2.0 
import re -from typing import Dict, Union, Set, Optional +from typing import Dict, Union, Set, Optional, List import numpy as np import pandas as pd @@ -50,6 +50,7 @@ def __init__( weight_scale_percentile: int = 95, weight_exponent: float = 1, weight_floor: float = 0.1, + no_fill_future_values_features: List[str] = None, ): """Initialize LinearQuantileOpenstfRegressor. @@ -72,6 +73,9 @@ def __init__( missing_values: Value to be considered as missing value imputation_strategy: Imputation strategy fill_value: Fill value + no_fill_future_values_features: The features for which it does not make sense + to fill future values. Rows that contain trailing null values for these + features will be removed from the data. """ super().__init__() @@ -92,6 +96,7 @@ def __init__( missing_values=missing_values, imputation_strategy=imputation_strategy, fill_value=fill_value, + no_fill_future_values_features=no_fill_future_values_features, ) self.x_scaler_ = StandardScaler() self.y_scaler_ = StandardScaler() @@ -171,7 +176,7 @@ def fit(self, x: pd.DataFrame, y: pd.Series, **kwargs) -> RegressorMixin: x = self._remove_ignored_features(x) # Fix nan columns - x = self.imputer_.fit_transform(x) + x, y = self.imputer_.fit_transform(x, y) if x.isna().any().any(): raise ValueError( "There are nan values in the input data. 
Set " diff --git a/setup.py b/setup.py index 45d849e24..9a672bfc1 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ def read_long_description_from_readme(): setup( name="openstef", - version="3.4.38", + version="3.4.39", packages=find_packages(include=["openstef", "openstef.*"]), description="Open short term energy forecaster", long_description=read_long_description_from_readme(), diff --git a/test/unit/feature_engineering/test_missing_values_transformer.py b/test/unit/feature_engineering/test_missing_values_transformer.py index 6a50157e7..ac93a30f3 100644 --- a/test/unit/feature_engineering/test_missing_values_transformer.py +++ b/test/unit/feature_engineering/test_missing_values_transformer.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: MPL-2.0 from test.unit.utils.base import BaseTestCase -import unittest import pandas as pd import numpy as np from sklearn.exceptions import NotFittedError @@ -15,41 +14,83 @@ class MissingValuesTransformerTests(BaseTestCase): def setUp(self): self.data = pd.DataFrame( - {"A": [1, np.nan, 3], "B": [4, 5, np.nan], "C": [np.nan, np.nan, np.nan]} + { + "A": [np.nan, 2, 3, 4], + "B": [3, np.nan, 4, 5], + "C": [3, 4, 5, np.nan], + "D": [np.nan, np.nan, np.nan, np.nan], + }, + index=[0, 1, 1, 2], ) def test_imputation_with_mean_strategy_fills_missing_values(self): transformer = MissingValuesTransformer(imputation_strategy="mean") - transformed = transformer.fit_transform(self.data) + transformed, _ = transformer.fit_transform(self.data) self.assertEqual(transformed.isnull().sum().sum(), 0) - self.assertAlmostEqual(transformed.loc[1, "A"], 2) - self.assertAlmostEqual(transformed.loc[2, "B"], 4.5) + self.assertAlmostEqual(transformed.iloc[0]["A"], 3) + self.assertAlmostEqual(transformed.iloc[1]["B"], 4) def test_imputation_with_constant_strategy_fills_missing_values(self): transformer = MissingValuesTransformer( imputation_strategy="constant", fill_value=0 ) - transformed = transformer.fit_transform(self.data) + transformed, _ = 
transformer.fit_transform(self.data) self.assertEqual(transformed.isnull().sum().sum(), 0) - self.assertEqual(transformed.loc[1, "A"], 0) - self.assertEqual(transformed.loc[2, "B"], 0) + self.assertEqual(transformed.iloc[0]["A"], 0) + self.assertEqual(transformed.iloc[1]["B"], 0) def test_columns_always_null_are_removed(self): transformer = MissingValuesTransformer() transformer.fit(self.data) - self.assertNotIn("C", transformer.non_null_feature_names) + self.assertNotIn("D", transformer.non_null_feature_names) + + def test_determining_non_trailing_null_rows(self): + transformer = MissingValuesTransformer(no_fill_future_values_features=["C"]) + transformer.fit(self.data) + non_trailing_null_rows = transformer._determine_trailing_null_rows( + self.data[transformer.non_null_feature_names] + ) + pd.testing.assert_series_equal( + non_trailing_null_rows, + pd.Series([True, True, True, False], index=[0, 1, 1, 2]), + ) + + def test_fitting_with_labels_removes_rows_with_trailing_nulls(self): + transformer = MissingValuesTransformer(no_fill_future_values_features=["C"]) + _, y_transformed = transformer.fit_transform( + self.data, y=pd.Series([1, 2, 3, 4], index=self.data.index) + ) + self.assertEqual(y_transformed.tolist(), [1, 2, 3]) def test_non_dataframe_input_is_converted_and_processed(self): transformer = MissingValuesTransformer(imputation_strategy="mean") - array = np.array([[1, np.nan], [np.nan, 2]]) - transformed = transformer.fit_transform(array) + array = np.array([[1, np.nan, np.nan], [np.nan, 2, np.nan]]) + transformed, _ = transformer.fit_transform(array) self.assertIsInstance(transformed, pd.DataFrame) self.assertEqual(transformed.isnull().sum().sum(), 0) + self.assertEqual(transformed.shape, (2, 2)) - def test_fitting_transformer_without_strategy_keeps_data_unchanged(self): + def test_fitting_transformer_without_strategy_keeps_valid_data_unchanged(self): transformer = MissingValuesTransformer() - transformed = transformer.fit_transform(self.data) - 
pd.testing.assert_frame_equal(transformed, self.data.drop(columns=["C"])) + transformed, _ = transformer.fit_transform(self.data) + pd.testing.assert_frame_equal(transformed, self.data.drop(columns=["D"])) + + def test_call_transform_on_fitted_transformer_does_not_remove_trailing_null_rows( + self, + ): + transformer = MissingValuesTransformer(no_fill_future_values_features=["C"]) + transformer.fit(self.data) + new_data = pd.DataFrame( + { + "A": [1, 2, 3, 4], + "B": [1, 2, 3, 4], + "C": [1, 2, 3, 4], + "D": [1, 2, 3, 4], + }, + index=[0, 1, 1, 2], + ) + transformed = transformer.transform(new_data) + pd.testing.assert_frame_equal(transformed, new_data.drop(columns=["D"])) def test_calling_transform_before_fit_raises_error(self): transformer = MissingValuesTransformer() diff --git a/test/unit/model/regressors/test_linear_quantile.py b/test/unit/model/regressors/test_linear_quantile.py index 94b93810b..08be99757 100644 --- a/test/unit/model/regressors/test_linear_quantile.py +++ b/test/unit/model/regressors/test_linear_quantile.py @@ -56,11 +56,14 @@ def test_imputer(self): # Arrange n_sample = train_input.shape[0] X = train_input.iloc[:, 1:].copy(deep=True) - sp = np.ones(n_sample) - sp[-1] = np.nan - X["Sparse"] = sp + X["sparse"] = np.ones(n_sample) + X.loc[X.index[-2], "sparse"] = np.nan + X["sparse_2"] = np.ones(n_sample) + X.loc[X.index[-1], "sparse_2"] = np.nan model1 = LinearQuantileOpenstfRegressor(imputation_strategy=None) - model2 = LinearQuantileOpenstfRegressor(imputation_strategy="mean") + model2 = LinearQuantileOpenstfRegressor( + imputation_strategy="mean", no_fill_future_values_features=["sparse_2"] + ) # Act # Model should give error if nan values are present. 
@@ -76,6 +79,10 @@ def test_imputer(self): X_ = pd.DataFrame(model2.imputer_.transform(X), columns=X.columns) self.assertTrue((model2.predict(X_) == model2.predict(X)).all()) + # check if last row is removed because of trailing null values + X_transformed, _ = model2.imputer_.fit_transform(X) + self.assertEqual(X_transformed.shape[0], n_sample - 1) + def test_value_error_raised(self): # Check if Value Error is raised when 0.5 is not in the requested quantiles list with self.assertRaises(ValueError): diff --git a/test/unit/pipeline/test_pipeline_train_model.py b/test/unit/pipeline/test_pipeline_train_model.py index 2809816ef..67009cb5a 100644 --- a/test/unit/pipeline/test_pipeline_train_model.py +++ b/test/unit/pipeline/test_pipeline_train_model.py @@ -125,8 +125,6 @@ def test_train_model_pipeline_core_happy_flow(self): but it can/should include predictors (e.g. weather data) """ - # Select 50 data points to speedup test - train_input = self.train_input.iloc[:50, :] # Remove modeltypes which are optional, and add a dummy regressor for model_type in list(ModelType) + [__name__ + ".DummyRegressor"]: with self.subTest(model_type=model_type): @@ -136,7 +134,9 @@ def test_train_model_pipeline_core_happy_flow(self): model_type.value if hasattr(model_type, "value") else model_type ) model_specs = self.model_specs - train_input = self.train_input + + # Select 150 data points to speedup test + train_input = self.train_input.iloc[:150, :] # Use default parameters model_specs.hyper_params = {} @@ -155,7 +155,6 @@ def test_train_model_pipeline_core_happy_flow(self): function=split_dummy_arima, arguments={}, ) - train_input = self.train_input[:150] model, report, modelspecs, _ = train_model_pipeline_core( pj=pj, model_specs=model_specs, input_data=train_input From 75764b501004d1a89658bd2b9a4058f3522077fd Mon Sep 17 00:00:00 2001 From: black Date: Thu, 10 Oct 2024 12:57:44 +0000 Subject: [PATCH 12/22] Format Python code with Black Signed-off-by: black --- 
openstef/feature_engineering/missing_values_transformer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/openstef/feature_engineering/missing_values_transformer.py b/openstef/feature_engineering/missing_values_transformer.py index 7c46e5192..faa3b2fc0 100644 --- a/openstef/feature_engineering/missing_values_transformer.py +++ b/openstef/feature_engineering/missing_values_transformer.py @@ -41,6 +41,7 @@ def __init__( no_fill_future_values_features: The features for which it does not make sense to fill future values. Rows that contain trailing null values for these features will be removed from the data. + """ self.missing_values = missing_values self.imputation_strategy = imputation_strategy From 873e98445c0bd898128e4b740301700ff4e90caf Mon Sep 17 00:00:00 2001 From: Clara De Smet Date: Thu, 10 Oct 2024 15:53:11 +0200 Subject: [PATCH 13/22] Fixed linter suggestion Signed-off-by: Clara De Smet --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c41a8e0bf..cabf66998 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ OpenSTEF is a Python package designed for generating short-term forecasts in the pip install openstef ``` -### Remark regarding installation within a **conda environment on Windows**: +### Remark regarding installation within a **conda environment on Windows** A version of the pywin32 package will be installed as a secondary dependency along with the installation of the openstef package. Since conda relies on an old version of pywin32, the new installation can break conda's functionality. 
The following command can solve this issue: ```shell From 5dbed25847bf5c20e7671c42ab86f46ffc871d18 Mon Sep 17 00:00:00 2001 From: Clara De Smet Date: Fri, 11 Oct 2024 14:25:37 +0200 Subject: [PATCH 14/22] Added documentation --- openstef/model/regressors/custom_regressor.py | 6 ++---- openstef/model/regressors/linear_quantile.py | 7 +++++-- openstef/model/standard_deviation_generator.py | 6 +++--- test/unit/data_classes/test_split_function.py | 6 +++--- test/unit/model/test_custom_models.py | 3 +-- test/unit/pipeline/test_create_basecase.py | 12 ++++++------ test/unit/pipeline/test_pipeline_train_model.py | 3 +-- 7 files changed, 21 insertions(+), 22 deletions(-) diff --git a/openstef/model/regressors/custom_regressor.py b/openstef/model/regressors/custom_regressor.py index 33939cedd..3c3e75116 100644 --- a/openstef/model/regressors/custom_regressor.py +++ b/openstef/model/regressors/custom_regressor.py @@ -23,13 +23,11 @@ class CustomOpenstfRegressor(OpenstfRegressor): @staticmethod @abstractmethod - def valid_kwargs() -> list[str]: - ... + def valid_kwargs() -> list[str]: ... @classmethod @abstractmethod - def objective(self) -> Type[RegressorObjective]: - ... + def objective(self) -> Type[RegressorObjective]: ... def load_custom_model(custom_model_path) -> CustomOpenstfRegressor: diff --git a/openstef/model/regressors/linear_quantile.py b/openstef/model/regressors/linear_quantile.py index 2e126136e..a4154f8bf 100644 --- a/openstef/model/regressors/linear_quantile.py +++ b/openstef/model/regressors/linear_quantile.py @@ -73,6 +73,9 @@ def __init__( missing_values: Value to be considered as missing value imputation_strategy: Imputation strategy fill_value: Fill value + weight_scale_percentile: Percentile used in scaling of the samples + weight_exponent: Exponent used in sample weighting + weight_floor: Minimum weight for samples no_fill_future_values_features: The features for which it does not make sense to fill future values.
Rows that contain trailing null values for these features will be removed from the data. @@ -203,8 +206,8 @@ def fit(self, x: pd.DataFrame, y: pd.Series, **kwargs) -> RegressorMixin: return self def _calculate_sample_weights(self, y: np.array): - """Calculate sample weights based on the y values of arbitrary scale. The resulting weights are in the range [0, - 1] and are used to put more emphasis on certain samples. + """Calculate sample weights based on the y values of arbitrary scale. + The resulting weights are in the range [0, 1] and are used to put more emphasis on certain samples. The sample weighting function does: * Rescale data to a [-1, 1] range using quantile scaling. 90% of the data will diff --git a/openstef/model/standard_deviation_generator.py b/openstef/model/standard_deviation_generator.py index f268b4b0e..957077152 100644 --- a/openstef/model/standard_deviation_generator.py +++ b/openstef/model/standard_deviation_generator.py @@ -40,9 +40,9 @@ def generate_standard_deviation_data(self, model: RegressorMixin) -> RegressorMi confidence_interval_horizon = self._calculate_standard_deviation( sub_val.iloc[:, 0], predicted ) - confidence_interval_horizon[ - "horizon" - ] = horizon # Label with respective horizon + confidence_interval_horizon["horizon"] = ( + horizon # Label with respective horizon + ) self.standard_deviation = pd.concat( [self.standard_deviation, confidence_interval_horizon] ) diff --git a/test/unit/data_classes/test_split_function.py b/test/unit/data_classes/test_split_function.py index 5938bd3da..47e6c9062 100644 --- a/test/unit/data_classes/test_split_function.py +++ b/test/unit/data_classes/test_split_function.py @@ -62,9 +62,9 @@ def test_load(self): # Non Callable object split_func_dc = copy.deepcopy(self.split_func_with_strings) - split_func_dc[ - "function" - ] = "test.unit.data_classes.test_split_function.dummy_not_func" + split_func_dc["function"] = ( + "test.unit.data_classes.test_split_function.dummy_not_func" + ) with 
self.assertRaises(ValueError): _ = split_func_dc.load() diff --git a/test/unit/model/test_custom_models.py b/test/unit/model/test_custom_models.py index 3f51829c3..22c889a2f 100644 --- a/test/unit/model/test_custom_models.py +++ b/test/unit/model/test_custom_models.py @@ -16,8 +16,7 @@ ) -class DummyObjective(RegressorObjective): - ... +class DummyObjective(RegressorObjective): ... class DummyRegressor(CustomOpenstfRegressor): diff --git a/test/unit/pipeline/test_create_basecase.py b/test/unit/pipeline/test_create_basecase.py index fdd3b0999..1da571b54 100644 --- a/test/unit/pipeline/test_create_basecase.py +++ b/test/unit/pipeline/test_create_basecase.py @@ -19,9 +19,9 @@ def setUp(self) -> None: self.PJ = TestData.get_prediction_job(pid=307) forecast_input = TestData.load("reference_sets/307-test-data.csv") # Set last 7 days to nan, just like operationally - forecast_input.loc[ - forecast_input.index.max() - timedelta(days=7) :, "load" - ] = np.nan + forecast_input.loc[forecast_input.index.max() - timedelta(days=7) :, "load"] = ( + np.nan + ) # Shift so the input matches 'now' offset_seconds = ( pd.to_datetime(datetime.utcnow(), utc=True) @@ -80,9 +80,9 @@ def test_create_basecase_forecast_pipeline_constant_load(self): forecast_input.loc[ forecast_input.index.max() - timedelta(days=21) :, "load" ] = forecast_input.loc[forecast_input.index.max() - timedelta(days=14), "load"] - forecast_input.loc[ - forecast_input.index.max() - timedelta(days=7) :, "load" - ] = np.nan + forecast_input.loc[forecast_input.index.max() - timedelta(days=7) :, "load"] = ( + np.nan + ) base_case_forecast = create_basecase_forecast_pipeline(self.PJ, forecast_input) diff --git a/test/unit/pipeline/test_pipeline_train_model.py b/test/unit/pipeline/test_pipeline_train_model.py index 67009cb5a..da741ca9e 100644 --- a/test/unit/pipeline/test_pipeline_train_model.py +++ b/test/unit/pipeline/test_pipeline_train_model.py @@ -39,8 +39,7 @@ from openstef.validation import validation -class 
DummyObjective(RegressorObjective): - ... +class DummyObjective(RegressorObjective): ... class DummyRegressor(CustomOpenstfRegressor): From b5d24309557544b001cc0530d59696e3017ae583 Mon Sep 17 00:00:00 2001 From: black Date: Fri, 11 Oct 2024 12:26:09 +0000 Subject: [PATCH 15/22] Format Python code with Black Signed-off-by: black --- openstef/model/regressors/custom_regressor.py | 6 ++++-- openstef/model/regressors/linear_quantile.py | 4 ++-- openstef/model/standard_deviation_generator.py | 6 +++--- test/unit/data_classes/test_split_function.py | 6 +++--- test/unit/model/test_custom_models.py | 3 ++- test/unit/pipeline/test_create_basecase.py | 12 ++++++------ test/unit/pipeline/test_pipeline_train_model.py | 3 ++- 7 files changed, 22 insertions(+), 18 deletions(-) diff --git a/openstef/model/regressors/custom_regressor.py b/openstef/model/regressors/custom_regressor.py index 3c3e75116..33939cedd 100644 --- a/openstef/model/regressors/custom_regressor.py +++ b/openstef/model/regressors/custom_regressor.py @@ -23,11 +23,13 @@ class CustomOpenstfRegressor(OpenstfRegressor): @staticmethod @abstractmethod - def valid_kwargs() -> list[str]: ... + def valid_kwargs() -> list[str]: + ... @classmethod @abstractmethod - def objective(self) -> Type[RegressorObjective]: ... + def objective(self) -> Type[RegressorObjective]: + ... def load_custom_model(custom_model_path) -> CustomOpenstfRegressor: diff --git a/openstef/model/regressors/linear_quantile.py b/openstef/model/regressors/linear_quantile.py index a4154f8bf..b61a92f46 100644 --- a/openstef/model/regressors/linear_quantile.py +++ b/openstef/model/regressors/linear_quantile.py @@ -206,8 +206,8 @@ def fit(self, x: pd.DataFrame, y: pd.Series, **kwargs) -> RegressorMixin: return self def _calculate_sample_weights(self, y: np.array): - """Calculate sample weights based on the y values of arbitrary scale. - The resulting weights are in the range [0, 1] and are used to put more emphasis on certain samples. 
+ """Calculate sample weights based on the y values of arbitrary scale. The resulting weights are in the range [0, + 1] and are used to put more emphasis on certain samples. The sample weighting function does: * Rescale data to a [-1, 1] range using quantile scaling. 90% of the data will diff --git a/openstef/model/standard_deviation_generator.py b/openstef/model/standard_deviation_generator.py index 957077152..f268b4b0e 100644 --- a/openstef/model/standard_deviation_generator.py +++ b/openstef/model/standard_deviation_generator.py @@ -40,9 +40,9 @@ def generate_standard_deviation_data(self, model: RegressorMixin) -> RegressorMi confidence_interval_horizon = self._calculate_standard_deviation( sub_val.iloc[:, 0], predicted ) - confidence_interval_horizon["horizon"] = ( - horizon # Label with respective horizon - ) + confidence_interval_horizon[ + "horizon" + ] = horizon # Label with respective horizon self.standard_deviation = pd.concat( [self.standard_deviation, confidence_interval_horizon] ) diff --git a/test/unit/data_classes/test_split_function.py b/test/unit/data_classes/test_split_function.py index 47e6c9062..5938bd3da 100644 --- a/test/unit/data_classes/test_split_function.py +++ b/test/unit/data_classes/test_split_function.py @@ -62,9 +62,9 @@ def test_load(self): # Non Callable object split_func_dc = copy.deepcopy(self.split_func_with_strings) - split_func_dc["function"] = ( - "test.unit.data_classes.test_split_function.dummy_not_func" - ) + split_func_dc[ + "function" + ] = "test.unit.data_classes.test_split_function.dummy_not_func" with self.assertRaises(ValueError): _ = split_func_dc.load() diff --git a/test/unit/model/test_custom_models.py b/test/unit/model/test_custom_models.py index 22c889a2f..3f51829c3 100644 --- a/test/unit/model/test_custom_models.py +++ b/test/unit/model/test_custom_models.py @@ -16,7 +16,8 @@ ) -class DummyObjective(RegressorObjective): ... +class DummyObjective(RegressorObjective): + ... 
class DummyRegressor(CustomOpenstfRegressor): diff --git a/test/unit/pipeline/test_create_basecase.py b/test/unit/pipeline/test_create_basecase.py index 1da571b54..fdd3b0999 100644 --- a/test/unit/pipeline/test_create_basecase.py +++ b/test/unit/pipeline/test_create_basecase.py @@ -19,9 +19,9 @@ def setUp(self) -> None: self.PJ = TestData.get_prediction_job(pid=307) forecast_input = TestData.load("reference_sets/307-test-data.csv") # Set last 7 days to nan, just like operationally - forecast_input.loc[forecast_input.index.max() - timedelta(days=7) :, "load"] = ( - np.nan - ) + forecast_input.loc[ + forecast_input.index.max() - timedelta(days=7) :, "load" + ] = np.nan # Shift so the input matches 'now' offset_seconds = ( pd.to_datetime(datetime.utcnow(), utc=True) @@ -80,9 +80,9 @@ def test_create_basecase_forecast_pipeline_constant_load(self): forecast_input.loc[ forecast_input.index.max() - timedelta(days=21) :, "load" ] = forecast_input.loc[forecast_input.index.max() - timedelta(days=14), "load"] - forecast_input.loc[forecast_input.index.max() - timedelta(days=7) :, "load"] = ( - np.nan - ) + forecast_input.loc[ + forecast_input.index.max() - timedelta(days=7) :, "load" + ] = np.nan base_case_forecast = create_basecase_forecast_pipeline(self.PJ, forecast_input) diff --git a/test/unit/pipeline/test_pipeline_train_model.py b/test/unit/pipeline/test_pipeline_train_model.py index da741ca9e..67009cb5a 100644 --- a/test/unit/pipeline/test_pipeline_train_model.py +++ b/test/unit/pipeline/test_pipeline_train_model.py @@ -39,7 +39,8 @@ from openstef.validation import validation -class DummyObjective(RegressorObjective): ... +class DummyObjective(RegressorObjective): + ... 
class DummyRegressor(CustomOpenstfRegressor): From da443b7767877cd04b748866ff4c00f1ba0f5659 Mon Sep 17 00:00:00 2001 From: Clara De Smet Date: Fri, 11 Oct 2024 15:00:12 +0200 Subject: [PATCH 16/22] Bumped version of black formatting --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 195389c20..807f44796 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -14,7 +14,7 @@ repos: - repo: https://github.com/ambv/black - rev: 22.1.0 + rev: 24.3.0 hooks: - id: black language_version: python3.11 From b51c69048fd66b6f5afcfc80d183ab945da91174 Mon Sep 17 00:00:00 2001 From: Clara De Smet Date: Fri, 11 Oct 2024 15:10:37 +0200 Subject: [PATCH 17/22] Updated documentation --- openstef/model/regressors/linear_quantile.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/openstef/model/regressors/linear_quantile.py b/openstef/model/regressors/linear_quantile.py index b61a92f46..0f9c459bc 100644 --- a/openstef/model/regressors/linear_quantile.py +++ b/openstef/model/regressors/linear_quantile.py @@ -206,9 +206,8 @@ def fit(self, x: pd.DataFrame, y: pd.Series, **kwargs) -> RegressorMixin: return self def _calculate_sample_weights(self, y: np.array): - """Calculate sample weights based on the y values of arbitrary scale. The resulting weights are in the range [0, - 1] and are used to put more emphasis on certain samples. - + """Calculate sample weights based on the y values of arbitrary scale. + The resulting weights are in the range [0,1] and are used to put more emphasis on certain samples. The sample weighting function does: * Rescale data to a [-1, 1] range using quantile scaling. 90% of the data will be within this range. Rest is outside. 
From fafc48b1bc8d1459de95637c2bffbdf7a005eca0 Mon Sep 17 00:00:00 2001 From: black Date: Fri, 11 Oct 2024 13:11:12 +0000 Subject: [PATCH 18/22] Format Python code with Black Signed-off-by: black --- openstef/model/regressors/linear_quantile.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/openstef/model/regressors/linear_quantile.py b/openstef/model/regressors/linear_quantile.py index 0f9c459bc..06f74ce6d 100644 --- a/openstef/model/regressors/linear_quantile.py +++ b/openstef/model/regressors/linear_quantile.py @@ -206,9 +206,9 @@ def fit(self, x: pd.DataFrame, y: pd.Series, **kwargs) -> RegressorMixin: return self def _calculate_sample_weights(self, y: np.array): - """Calculate sample weights based on the y values of arbitrary scale. - The resulting weights are in the range [0,1] and are used to put more emphasis on certain samples. - The sample weighting function does: + """Calculate sample weights based on the y values of arbitrary scale. The resulting weights are in the range + [0,1] and are used to put more emphasis on certain samples. The sample weighting function does: + * Rescale data to a [-1, 1] range using quantile scaling. 90% of the data will be within this range. Rest is outside. * Calculate the weight by taking the exponent of scaled data. From ac5b8e3aa2245d99cb9b39ee9335fe32286d4745 Mon Sep 17 00:00:00 2001 From: Clara De Smet Date: Fri, 11 Oct 2024 15:12:28 +0200 Subject: [PATCH 19/22] Removed blank line --- openstef/model/regressors/linear_quantile.py | 1 - 1 file changed, 1 deletion(-) diff --git a/openstef/model/regressors/linear_quantile.py b/openstef/model/regressors/linear_quantile.py index 0f9c459bc..3162ac43f 100644 --- a/openstef/model/regressors/linear_quantile.py +++ b/openstef/model/regressors/linear_quantile.py @@ -219,7 +219,6 @@ def _calculate_sample_weights(self, y: np.array): closer to the extremes. * Clip the data to [0, 1] range with weight_floor as the minimum weight. 
* Weight floor is used to make sure that all the samples are considered. - """ return np.clip( _weight_exp( From 0c8025958bad4bf808873f5cab1c70f0b3fc1136 Mon Sep 17 00:00:00 2001 From: black Date: Fri, 11 Oct 2024 13:13:58 +0000 Subject: [PATCH 20/22] Format Python code with Black Signed-off-by: black --- openstef/model/regressors/linear_quantile.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/openstef/model/regressors/linear_quantile.py b/openstef/model/regressors/linear_quantile.py index 79706c102..fa83970fd 100644 --- a/openstef/model/regressors/linear_quantile.py +++ b/openstef/model/regressors/linear_quantile.py @@ -206,7 +206,8 @@ def fit(self, x: pd.DataFrame, y: pd.Series, **kwargs) -> RegressorMixin: return self def _calculate_sample_weights(self, y: np.array): - """Calculate sample weights based on the y values of arbitrary scale. The resulting weights are in the range + """Calculate sample weights based on the y values of arbitrary scale. The resulting weights are in the range. + [0,1] and are used to put more emphasis on certain samples. The sample weighting function does: * Rescale data to a [-1, 1] range using quantile scaling. 90% of the data will @@ -219,6 +220,7 @@ def _calculate_sample_weights(self, y: np.array): closer to the extremes. * Clip the data to [0, 1] range with weight_floor as the minimum weight. * Weight floor is used to make sure that all the samples are considered. 
+ """ return np.clip( _weight_exp( From f6a09a26039c17545bd553f682afc38fe25511ee Mon Sep 17 00:00:00 2001 From: Clara De Smet Date: Fri, 11 Oct 2024 15:20:54 +0200 Subject: [PATCH 21/22] Reformatting docs --- openstef/model/regressors/linear_quantile.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/openstef/model/regressors/linear_quantile.py b/openstef/model/regressors/linear_quantile.py index 79706c102..b98ed22c5 100644 --- a/openstef/model/regressors/linear_quantile.py +++ b/openstef/model/regressors/linear_quantile.py @@ -206,9 +206,10 @@ def fit(self, x: pd.DataFrame, y: pd.Series, **kwargs) -> RegressorMixin: return self def _calculate_sample_weights(self, y: np.array): - """Calculate sample weights based on the y values of arbitrary scale. The resulting weights are in the range - [0,1] and are used to put more emphasis on certain samples. The sample weighting function does: + """Calculate sample weights based on the y values of arbitrary scale. + The resulting weights are in the range [0,1] and are used to put more emphasis + on certain samples. The sample weighting function does: * Rescale data to a [-1, 1] range using quantile scaling. 90% of the data will be within this range. Rest is outside. * Calculate the weight by taking the exponent of scaled data. 
From 1143f1d773d9a4d7f0dc5f89472d026e28684309 Mon Sep 17 00:00:00 2001 From: Clara De Smet Date: Fri, 11 Oct 2024 15:22:25 +0200 Subject: [PATCH 22/22] Reformatting docs --- openstef/model/regressors/linear_quantile.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/openstef/model/regressors/linear_quantile.py b/openstef/model/regressors/linear_quantile.py index fa83970fd..aa4fc57d7 100644 --- a/openstef/model/regressors/linear_quantile.py +++ b/openstef/model/regressors/linear_quantile.py @@ -206,9 +206,10 @@ def fit(self, x: pd.DataFrame, y: pd.Series, **kwargs) -> RegressorMixin: return self def _calculate_sample_weights(self, y: np.array): - """Calculate sample weights based on the y values of arbitrary scale. The resulting weights are in the range. + """Calculate sample weights based on the y values of arbitrary scale. - [0,1] and are used to put more emphasis on certain samples. The sample weighting function does: + The resulting weights are in the range [0,1] and are used to put more emphasis + on certain samples. The sample weighting function does: * Rescale data to a [-1, 1] range using quantile scaling. 90% of the data will be within this range. Rest is outside.