Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feature(KTP-1279): Changed linear model scaling and improved sample weighting feature. #565

Merged
merged 25 commits into from
Oct 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
e0ebfb5
feature(KTP-1279): Changed feature scaling in linear model. Added exp…
egordm Oct 4, 2024
870b8a8
feature(KTP-1279): Added test to change linear model parameters.
egordm Oct 4, 2024
c7f550a
style: Code style fixes.
egordm Oct 4, 2024
ad2eba6
Format Python code with Black
actions-user Oct 4, 2024
20fd99c
feature(KTP-1279): Added additional test condition for linear model p…
egordm Oct 4, 2024
094391a
style: Code style fixes.
egordm Oct 4, 2024
0667ec2
Format Python code with Black
actions-user Oct 4, 2024
f85f2e6
feature(KTP-1279): Added additional test condition for linear model p…
egordm Oct 4, 2024
6f4afa7
feature(KTP-1279): Added weight floor. Added documentation for sample…
egordm Oct 7, 2024
6c5e67d
Format Python code with Black
actions-user Oct 7, 2024
385c356
Merge branch 'main' into feature/KTP-1279-linear-sample-weight
clara-de-smet Oct 10, 2024
cada386
Merge branch 'main' into feature/KTP-1279-linear-sample-weight
clara-de-smet Oct 10, 2024
75764b5
Format Python code with Black
actions-user Oct 10, 2024
873e984
Fixed linter suggestion
clara-de-smet Oct 10, 2024
5dbed25
Added documentation
clara-de-smet Oct 11, 2024
b5d2430
Format Python code with Black
actions-user Oct 11, 2024
da443b7
Bumped version of black formatting
clara-de-smet Oct 11, 2024
b51c690
Updated documentation
clara-de-smet Oct 11, 2024
fafc48b
Format Python code with Black
actions-user Oct 11, 2024
ac5b8e3
Removed blank line
clara-de-smet Oct 11, 2024
be8d70c
Merge branch 'feature/KTP-1279-linear-sample-weight' of https://githu…
clara-de-smet Oct 11, 2024
0c80259
Format Python code with Black
actions-user Oct 11, 2024
f6a09a2
Reformatting docs
clara-de-smet Oct 11, 2024
482953b
Merge branch 'feature/KTP-1279-linear-sample-weight' of https://githu…
clara-de-smet Oct 11, 2024
1143f1d
Reformatting docs
clara-de-smet Oct 11, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

repos:
- repo: https://github.com/ambv/black
rev: 22.1.0
rev: 24.3.0
hooks:
- id: black
language_version: python3.11
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ OpenSTEF is a Python package designed for generating short-term forecasts in the
pip install openstef
```

### Remark regarding installation within a **conda environment on Windows**:
### Remark regarding installation within a **conda environment on Windows**

A version of the pywin32 package will be installed as a secondary dependency along with the installation of the openstef package. Since conda relies on an old version of pywin32, the new installation can break conda's functionality. The following command can solve this issue:
```shell
Expand Down
1 change: 1 addition & 0 deletions openstef/feature_engineering/missing_values_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def __init__(
no_fill_future_values_features: The features for which it does not make sense
to fill future values. Rows that contain trailing null values for these
features will be removed from the data.

"""
self.missing_values = missing_values
self.imputation_strategy = imputation_strategy
Expand Down
3 changes: 3 additions & 0 deletions openstef/model/model_creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,9 @@
"missing_values",
"imputation_strategy",
"fill_value",
"weight_scale_percentile",
"weight_exponent",
"weight_floor",
"no_fill_future_values_features",
],
ModelType.ARIMA: [
Expand Down
56 changes: 50 additions & 6 deletions openstef/model/regressors/linear_quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import pandas as pd
from sklearn.base import RegressorMixin
from sklearn.linear_model import QuantileRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.utils.validation import check_is_fitted

from openstef.feature_engineering.missing_values_transformer import (
Expand All @@ -25,8 +25,8 @@ class LinearQuantileOpenstfRegressor(OpenstfRegressor, RegressorMixin):
solver: str

imputer_: MissingValuesTransformer
x_scaler_: MinMaxScaler
y_scaler_: MinMaxScaler
x_scaler_: StandardScaler
y_scaler_: StandardScaler
models_: Dict[float, QuantileRegressor]

is_fitted_: bool = False
Expand All @@ -47,6 +47,9 @@ def __init__(
missing_values: Union[int, float, str, None] = np.nan,
imputation_strategy: Optional[str] = "mean",
fill_value: Union[str, int, float] = None,
weight_scale_percentile: int = 95,
weight_exponent: float = 1,
weight_floor: float = 0.1,
no_fill_future_values_features: List[str] = None,
):
"""Initialize LinearQuantileOpenstfRegressor.
Expand All @@ -70,6 +73,9 @@ def __init__(
missing_values: Value to be considered as missing value
imputation_strategy: Imputation strategy
fill_value: Fill value
weight_scale_percentile: Percentile used in scaling of the samples
weight_exponent: Exponent used in sample weighting
weight_floor: Minimum weight for samples
no_fill_future_values_features: The features for which it does not make sense
to fill future values. Rows that contain trailing null values for these
features will be removed from the data.
Expand All @@ -86,14 +92,17 @@ def __init__(
self.quantiles = quantiles
self.alpha = alpha
self.solver = solver
self.weight_scale_percentile = weight_scale_percentile
self.weight_exponent = weight_exponent
self.weight_floor = weight_floor
self.imputer_ = MissingValuesTransformer(
missing_values=missing_values,
imputation_strategy=imputation_strategy,
fill_value=fill_value,
no_fill_future_values_features=no_fill_future_values_features,
)
self.x_scaler_ = MinMaxScaler(feature_range=(-1, 1))
self.y_scaler_ = MinMaxScaler(feature_range=(-1, 1))
self.x_scaler_ = StandardScaler()
self.y_scaler_ = StandardScaler()
self.models_ = {
quantile: QuantileRegressor(alpha=alpha, quantile=quantile, solver=solver)
for quantile in quantiles
Expand Down Expand Up @@ -182,7 +191,7 @@ def fit(self, x: pd.DataFrame, y: pd.Series, **kwargs) -> RegressorMixin:
y_scaled = self.y_scaler_.fit_transform(y.to_frame())[:, 0]

# Add more focus on extreme / peak values
sample_weight = np.abs(y_scaled)
sample_weight = self._calculate_sample_weights(y.values.squeeze())

# Fit quantile regressors
for quantile in self.quantiles:
Expand All @@ -196,6 +205,33 @@ def fit(self, x: pd.DataFrame, y: pd.Series, **kwargs) -> RegressorMixin:

return self

def _calculate_sample_weights(self, y: np.ndarray) -> np.ndarray:
    """Calculate sample weights from target values of arbitrary scale.

    The resulting weights lie in the range [weight_floor, 1] and put more
    emphasis on samples with a large absolute value. The weighting does:

    * Scale the absolute values of ``y`` by their ``weight_scale_percentile``-th
      percentile. That percentage of the samples (95% by default) ends up in
      [0, 1]; the remaining samples end up above 1.
    * Raise the scaled values to the power ``weight_exponent``.
        * exponent=0: Results in uniform weights for all samples.
        * exponent=1: Results in weights that grow linearly with the absolute
          value of the sample.
        * exponent>1: Results in weights that grow faster than linearly
          (as a power law), favouring the extremes even more.
    * Clip the result to [weight_floor, 1].
        * The weight floor makes sure that every sample is still considered.

    Args:
        y: Target values to derive the weights from.

    Returns:
        Array of sample weights, one per element of ``y``.

    """
    scaled = _scale_percentile(y, percentile=self.weight_scale_percentile)
    weights = _weight_exp(scaled, exponent=self.weight_exponent)
    # Lower bound is the configured floor, upper bound is fixed at 1.
    return np.clip(weights, self.weight_floor, 1)

def predict(self, x: pd.DataFrame, quantile: float = 0.5, **kwargs) -> np.array:
"""Makes a prediction for a desired quantile.

Expand Down Expand Up @@ -250,3 +286,11 @@ def _get_param_names(cls):

def __sklearn_is_fitted__(self) -> bool:
    """Return the value of the ``is_fitted_`` flag.

    NOTE(review): presumably the hook consulted by scikit-learn's
    ``check_is_fitted`` -- confirm against the scikit-learn developer docs.
    """
    return self.is_fitted_


def _scale_percentile(x: np.ndarray, percentile: int = 95) -> np.ndarray:
    """Scale the absolute values of ``x`` by a percentile of those absolute values.

    Args:
        x: Values of arbitrary scale.
        percentile: Percentile (0-100) of ``|x|`` that is mapped to 1.

    Returns:
        Non-negative array ``|x| / percentile(|x|)``. When the percentile is 0
        (for example when all values are zero), zero values map to 0 and
        non-zero values map to ``inf`` instead of producing NaN.

    """
    magnitude = np.abs(x)
    scale = np.percentile(magnitude, percentile)
    if scale == 0:
        # Plain division would turn zero samples into NaN (0 / 0), and NaN
        # survives a later clip. Keep zeros at 0 and let non-zero samples
        # saturate at +inf, which is what the division yields for them anyway.
        return np.where(magnitude > 0, np.inf, 0.0)
    return magnitude / scale


def _weight_exp(x: np.ndarray, exponent: float = 1):
    """Raise the absolute values of ``x`` to the power ``exponent``.

    Args:
        x: Input values; their sign is discarded.
        exponent: Power applied to ``|x|``.

    Returns:
        ``|x| ** exponent``, element-wise.

    """
    magnitude = np.abs(x)
    return magnitude**exponent
20 changes: 20 additions & 0 deletions test/unit/model/regressors/test_linear_quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from sklearn.utils.estimator_checks import check_estimator

from openstef.feature_engineering.apply_features import apply_features
from openstef.model.model_creator import ModelCreator
from openstef.model.regressors.linear_quantile import LinearQuantileOpenstfRegressor
from test.unit.utils.base import BaseTestCase
from test.unit.utils.data import TestData
Expand Down Expand Up @@ -151,3 +152,22 @@ def test_ignore_features(self):
self.assertNotIn("E1B_AMI_I", input_data_filtered.columns)
self.assertNotIn("E4A_I", input_data_filtered.columns)
self.assertIn("load", input_data_filtered.columns)

def test_create_model(self):
    """ModelCreator passes the sample-weight settings on to the linear quantile model."""
    # Arrange
    expected_percentile, expected_exponent = 50, 2

    # Act
    model = ModelCreator.create_model(
        model_type="linear_quantile",
        quantiles=[0.5],
        weight_scale_percentile=expected_percentile,
        weight_exponent=expected_exponent,
    )

    # Assert
    self.assertIsInstance(model, LinearQuantileOpenstfRegressor)
    self.assertEqual(model.weight_scale_percentile, expected_percentile)
    self.assertEqual(model.weight_exponent, expected_exponent)
Loading