From bbc73cfd7f5a85f6ac63432d3294abecdcb81d2a Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 10 Sep 2023 19:13:26 +0200 Subject: [PATCH 01/21] FIX _safe_divide should handle zero-division with numpy scalar (#27312) Co-authored-by: Christian Lorentzen --- sklearn/ensemble/_gb.py | 20 +++++++++++++------ .../ensemble/tests/test_gradient_boosting.py | 8 +++++--- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index 73bb9e08ae619..e198babdb28d7 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -20,6 +20,7 @@ # Arnaud Joly, Jacob Schreiber # License: BSD 3 clause +import math import warnings from abc import ABCMeta, abstractmethod from numbers import Integral, Real @@ -64,11 +65,16 @@ def _safe_divide(numerator, denominator): """Prevents overflow and division by zero.""" - with np.errstate(divide="raise"): - try: - return numerator / denominator - except FloatingPointError: - return 0.0 + try: + # Cast to Python float to trigger a ZeroDivisionError without relying + # on `np.errstate` that is not supported by Pyodide. + result = float(numerator) / float(denominator) + if math.isinf(result): + warnings.warn("overflow encountered in _safe_divide", RuntimeWarning) + return result + except ZeroDivisionError: + warnings.warn("divide by zero encountered in _safe_divide", RuntimeWarning) + return 0.0 def _init_raw_predictions(X, estimator, loss, use_predict_proba): @@ -235,7 +241,9 @@ def compute_update(y_, indices, neg_gradient, raw_prediction, k): # update each leaf (= perform line search) for leaf in np.nonzero(tree.children_left == TREE_LEAF)[0]: - indices = np.nonzero(terminal_regions == leaf)[0] # of terminal regions + indices = np.nonzero(masked_terminal_regions == leaf)[ + 0 + ] # of terminal regions y_ = y.take(indices, axis=0) sw = None if sample_weight is None else sample_weight[indices] update = compute_update(y_, indices, neg_gradient, raw_prediction, k) diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py index 6671936d2eaae..9d61464d063f0 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/tests/test_gradient_boosting.py @@ -1445,11 +1445,13 @@ def test_huber_vs_mean_and_median(): def test_safe_divide(): """Test that _safe_divide handles division by zero.""" - assert _safe_divide(np.array([1e300]), 0) == 0 - + with pytest.warns(RuntimeWarning, match="divide"): + assert _safe_divide(np.float64(1e300), 0) == 0 + with pytest.warns(RuntimeWarning, match="divide"): + assert _safe_divide(np.float64(0.0), np.float64(0.0)) == 0 with pytest.warns(RuntimeWarning, match="overflow"): # np.finfo(float).max = 1.7976931348623157e+308 - _safe_divide(np.array([1e300]), 1e-10) + _safe_divide(np.float64(1e300), 1e-10) def test_squared_error_exact_backward_compat(): From 7ba963f3e538eb7fe62767af2672c76f8174a763 Mon Sep 17 00:00:00 2001 From: Yuusuke Hiramatsu <140389350+hiramatsuyuusuke@users.noreply.github.com> Date: Mon, 11 Sep 2023 16:38:05 +0900 Subject: [PATCH 02/21] DOC take `Examples` out of a dropdown (#27034) --- doc/modules/svm.rst | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/doc/modules/svm.rst b/doc/modules/svm.rst index 0ac34cdcb6a10..8f97b8dee8806 100644 --- a/doc/modules/svm.rst +++ b/doc/modules/svm.rst @@ -521,9 +521,8 @@ is advised to use :class:`~sklearn.model_selection.GridSearchCV` with * :ref:`sphx_glr_auto_examples_svm_plot_rbf_parameters.py` * 
:ref:`sphx_glr_auto_examples_svm_plot_svm_nonlinear.py` -|details-start| -**Custom Kernels** -|details-split| +Custom Kernels +-------------- You can define your own kernels by either giving the kernel as a python function or by precomputing the Gram matrix. @@ -539,8 +538,9 @@ classifiers, except that: use of ``fit()`` and ``predict()`` you will have unexpected results. -Using Python functions as kernels -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +|details-start| +**Using Python functions as kernels** +|details-split| You can use your own defined kernels by passing a function to the ``kernel`` parameter. @@ -558,13 +558,13 @@ instance that will use that kernel:: ... return np.dot(X, Y.T) ... >>> clf = svm.SVC(kernel=my_kernel) + +|details-end| -.. topic:: Examples: - - * :ref:`sphx_glr_auto_examples_svm_plot_custom_kernel.py`. -Using the Gram matrix -~~~~~~~~~~~~~~~~~~~~~ +|details-start| +**Using the Gram matrix** +|details-split| You can pass pre-computed kernels by using the ``kernel='precomputed'`` option. You should then pass Gram matrix instead of X to the `fit` and @@ -589,6 +589,10 @@ test vectors must be provided: |details-end| +.. topic:: Examples: + + * :ref:`sphx_glr_auto_examples_svm_plot_custom_kernel.py`. + .. _svm_mathematical_formulation: Mathematical formulation From a6a6a47fb3172c7fcbf0c52870c1279b24114ad8 Mon Sep 17 00:00:00 2001 From: Puneeth K <32433964+punndcoder28@users.noreply.github.com> Date: Mon, 11 Sep 2023 13:14:54 +0530 Subject: [PATCH 03/21] DOC Adding dropdown for module 2.1 Gaussian Mixtures (#26694) --- doc/modules/mixture.rst | 125 +++++++++++++++++++++------------------- 1 file changed, 67 insertions(+), 58 deletions(-) diff --git a/doc/modules/mixture.rst b/doc/modules/mixture.rst index fbf0551da93a4..e9cc94b1d493d 100644 --- a/doc/modules/mixture.rst +++ b/doc/modules/mixture.rst @@ -68,33 +68,36 @@ full covariance. * See :ref:`sphx_glr_auto_examples_mixture_plot_gmm_pdf.py` for an example on plotting the density estimation. -Pros and cons of class :class:`GaussianMixture` ------------------------------------------------ +|details-start| +**Pros and cons of class GaussianMixture** +|details-split| -Pros -.... +.. topic:: Pros: -:Speed: It is the fastest algorithm for learning mixture models + :Speed: It is the fastest algorithm for learning mixture models -:Agnostic: As this algorithm maximizes only the likelihood, it - will not bias the means towards zero, or bias the cluster sizes to - have specific structures that might or might not apply. + :Agnostic: As this algorithm maximizes only the likelihood, it + will not bias the means towards zero, or bias the cluster sizes to + have specific structures that might or might not apply. -Cons -.... +.. topic:: Cons: -:Singularities: When one has insufficiently many points per - mixture, estimating the covariance matrices becomes difficult, - and the algorithm is known to diverge and find solutions with - infinite likelihood unless one regularizes the covariances artificially. + :Singularities: When one has insufficiently many points per + mixture, estimating the covariance matrices becomes difficult, + and the algorithm is known to diverge and find solutions with + infinite likelihood unless one regularizes the covariances artificially. -:Number of components: This algorithm will always use all the - components it has access to, needing held-out data - or information theoretical criteria to decide how many components to use - in the absence of external cues. 
+ :Number of components: This algorithm will always use all the + components it has access to, needing held-out data + or information theoretical criteria to decide how many components to use + in the absence of external cues. -Selecting the number of components in a classical Gaussian Mixture Model ------------------------------------------------------------------------- +|details-end| + + +|details-start| +**Selecting the number of components in a classical Gaussian Mixture model** +|details-split| The BIC criterion can be used to select the number of components in a Gaussian Mixture in an efficient way. In theory, it recovers the true number of @@ -114,10 +117,13 @@ model. * See :ref:`sphx_glr_auto_examples_mixture_plot_gmm_selection.py` for an example of model selection performed with classical Gaussian mixture. +|details-end| + .. _expectation_maximization: -Estimation algorithm Expectation-maximization ------------------------------------------------ +|details-start| +**Estimation algorithm expectation-maximization** +|details-split| The main difficulty in learning Gaussian mixture models from unlabeled data is that one usually doesn't know which points came from @@ -135,8 +141,11 @@ parameters to maximize the likelihood of the data given those assignments. Repeating this process is guaranteed to always converge to a local optimum. -Choice of the Initialization Method ------------------------------------ +|details-end| + +|details-start| +**Choice of the Initialization method** +|details-split| There is a choice of four initialization methods (as well as inputting user defined initial means) to generate the initial centers for the model components: @@ -172,6 +181,8 @@ random * See :ref:`sphx_glr_auto_examples_mixture_plot_gmm_init.py` for an example of using different initializations in Gaussian Mixture. +|details-end| + .. _bgmm: Variational Bayesian Gaussian Mixture @@ -183,8 +194,7 @@ similar to the one defined by :class:`GaussianMixture`. .. _variational_inference: -Estimation algorithm: variational inference ---------------------------------------------- +**Estimation algorithm: variational inference** Variational inference is an extension of expectation-maximization that maximizes a lower bound on model evidence (including @@ -282,48 +292,47 @@ from the two resulting mixtures. ``weight_concentration_prior_type`` for different values of the parameter ``weight_concentration_prior``. +|details-start| +**Pros and cons of variational inference with BayesianGaussianMixture** +|details-split| -Pros and cons of variational inference with :class:`BayesianGaussianMixture` ----------------------------------------------------------------------------- - -Pros -..... +.. topic:: Pros: -:Automatic selection: when ``weight_concentration_prior`` is small enough and - ``n_components`` is larger than what is found necessary by the model, the - Variational Bayesian mixture model has a natural tendency to set some mixture - weights values close to zero. This makes it possible to let the model choose - a suitable number of effective components automatically. Only an upper bound - of this number needs to be provided. Note however that the "ideal" number of - active components is very application specific and is typically ill-defined - in a data exploration setting. 
+ :Automatic selection: when ``weight_concentration_prior`` is small enough and + ``n_components`` is larger than what is found necessary by the model, the + Variational Bayesian mixture model has a natural tendency to set some mixture + weights values close to zero. This makes it possible to let the model choose + a suitable number of effective components automatically. Only an upper bound + of this number needs to be provided. Note however that the "ideal" number of + active components is very application specific and is typically ill-defined + in a data exploration setting. -:Less sensitivity to the number of parameters: unlike finite models, which will - almost always use all components as much as they can, and hence will produce - wildly different solutions for different numbers of components, the - variational inference with a Dirichlet process prior - (``weight_concentration_prior_type='dirichlet_process'``) won't change much - with changes to the parameters, leading to more stability and less tuning. + :Less sensitivity to the number of parameters: unlike finite models, which will + almost always use all components as much as they can, and hence will produce + wildly different solutions for different numbers of components, the + variational inference with a Dirichlet process prior + (``weight_concentration_prior_type='dirichlet_process'``) won't change much + with changes to the parameters, leading to more stability and less tuning. -:Regularization: due to the incorporation of prior information, - variational solutions have less pathological special cases than - expectation-maximization solutions. + :Regularization: due to the incorporation of prior information, + variational solutions have less pathological special cases than + expectation-maximization solutions. -Cons -..... +.. topic:: Cons: -:Speed: the extra parametrization necessary for variational inference makes - inference slower, although not by much. + :Speed: the extra parametrization necessary for variational inference makes + inference slower, although not by much. -:Hyperparameters: this algorithm needs an extra hyperparameter - that might need experimental tuning via cross-validation. + :Hyperparameters: this algorithm needs an extra hyperparameter + that might need experimental tuning via cross-validation. -:Bias: there are many implicit biases in the inference algorithms (and also in - the Dirichlet process if used), and whenever there is a mismatch between - these biases and the data it might be possible to fit better models using a - finite mixture. + :Bias: there are many implicit biases in the inference algorithms (and also in + the Dirichlet process if used), and whenever there is a mismatch between + these biases and the data it might be possible to fit better models using a + finite mixture. +|details-end| .. _dirichlet_process: From d3aaa75e4a8a890f806e8af710fda615490ee0a9 Mon Sep 17 00:00:00 2001 From: Puneeth K <32433964+punndcoder28@users.noreply.github.com> Date: Mon, 11 Sep 2023 13:20:14 +0530 Subject: [PATCH 04/21] DOC Adding Dropdown to module 7.2 Realworld Datasets (#26693) --- sklearn/datasets/descr/lfw.rst | 12 +++++++----- sklearn/datasets/descr/twenty_newsgroups.rst | 19 +++++++++++++------ 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/sklearn/datasets/descr/lfw.rst b/sklearn/datasets/descr/lfw.rst index e7fc35c3caabc..8105d7d6d633a 100644 --- a/sklearn/datasets/descr/lfw.rst +++ b/sklearn/datasets/descr/lfw.rst @@ -32,8 +32,9 @@ face detector from various online websites. 
Features real, between 0 and 255 ================= ======================= -Usage -~~~~~ +|details-start| +**Usage** +|details-split| ``scikit-learn`` provides two loaders that will automatically download, cache, parse the metadata files, decode the jpeg and convert the @@ -111,6 +112,8 @@ The :func:`sklearn.datasets.fetch_lfw_pairs` datasets is subdivided into an evaluation ``10_folds`` set meant to compute performance metrics using a 10-folds cross validation scheme. +|details-end| + .. topic:: References: * `Labeled Faces in the Wild: A Database for Studying Face Recognition @@ -120,7 +123,6 @@ an evaluation ``10_folds`` set meant to compute performance metrics using a University of Massachusetts, Amherst, Technical Report 07-49, October, 2007. -Examples -~~~~~~~~ +.. topic:: Examples: -:ref:`sphx_glr_auto_examples_applications_plot_face_recognition.py` + * :ref:`sphx_glr_auto_examples_applications_plot_face_recognition.py` diff --git a/sklearn/datasets/descr/twenty_newsgroups.rst b/sklearn/datasets/descr/twenty_newsgroups.rst index 8e373c6ec3e74..669e158244134 100644 --- a/sklearn/datasets/descr/twenty_newsgroups.rst +++ b/sklearn/datasets/descr/twenty_newsgroups.rst @@ -27,8 +27,9 @@ extractor. Features text ================= ========== -Usage -~~~~~ +|details-start| +**Usage** +|details-split| The :func:`sklearn.datasets.fetch_20newsgroups` function is a data fetching / caching functions that downloads the data archive from @@ -89,8 +90,11 @@ list of the categories to load to the >>> newsgroups_train.target[:10] array([0, 1, 1, 1, 0, 1, 1, 0, 0, 0]) -Converting text to vectors -~~~~~~~~~~~~~~~~~~~~~~~~~~ +|details-end| + +|details-start| +**Converting text to vectors** +|details-split| In order to feed predictive or clustering models with the text data, one first need to turn the text into vectors of numerical values suitable @@ -122,9 +126,11 @@ returns ready-to-use token counts features instead of file names. .. _`20 newsgroups website`: http://people.csail.mit.edu/jrennie/20Newsgroups/ .. _`TF-IDF`: https://en.wikipedia.org/wiki/Tf-idf +|details-end| -Filtering text for more realistic training -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +|details-start| +**Filtering text for more realistic training** +|details-split| It is easy for a classifier to overfit on particular things that appear in the 20 Newsgroups data, such as newsgroup headers. Many classifiers achieve very @@ -218,6 +224,7 @@ It loses even more if we also strip this metadata from the training data: Some other classifiers cope better with this harder version of the task. Try the :ref:`sphx_glr_auto_examples_model_selection_plot_grid_search_text_feature_extraction.py` example with and without the `remove` option to compare the results. +|details-end| .. 
topic:: Data Considerations From 358a1c91e491d72542359cf10c63e77d3678424d Mon Sep 17 00:00:00 2001 From: Mohit Joshi <46566524+work-mohit@users.noreply.github.com> Date: Mon, 11 Sep 2023 15:00:09 +0530 Subject: [PATCH 05/21] TST Extend tests for `scipy.sparse.*array`in `sklearn/ensemble/tests/test_bagging.py` (#27170) --- sklearn/ensemble/tests/test_bagging.py | 56 +++++++++++++------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/sklearn/ensemble/tests/test_bagging.py b/sklearn/ensemble/tests/test_bagging.py index 2c1067ccfc248..0ee0dd8be4956 100644 --- a/sklearn/ensemble/tests/test_bagging.py +++ b/sklearn/ensemble/tests/test_bagging.py @@ -9,7 +9,6 @@ import joblib import numpy as np import pytest -from scipy.sparse import csc_matrix, csr_matrix from sklearn.base import BaseEstimator from sklearn.datasets import load_diabetes, load_iris, make_hastie_10_2 @@ -31,6 +30,7 @@ from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor from sklearn.utils import check_random_state from sklearn.utils._testing import assert_array_almost_equal, assert_array_equal +from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS rng = check_random_state(0) @@ -83,9 +83,9 @@ def test_classification(): @pytest.mark.parametrize( - "sparse_format, params, method", + "sparse_container, params, method", product( - [csc_matrix, csr_matrix], + CSR_CONTAINERS + CSC_CONTAINERS, [ { "max_samples": 0.5, @@ -105,7 +105,7 @@ def test_classification(): ["predict", "predict_proba", "predict_log_proba", "decision_function"], ), ) -def test_sparse_classification(sparse_format, params, method): +def test_sparse_classification(sparse_container, params, method): # Check classification for various parameter settings on sparse input. class CustomSVC(SVC): @@ -121,8 +121,8 @@ def fit(self, X, y): scale(iris.data), iris.target, random_state=rng ) - X_train_sparse = sparse_format(X_train) - X_test_sparse = sparse_format(X_test) + X_train_sparse = sparse_container(X_train) + X_test_sparse = sparse_container(X_test) # Trained on sparse format sparse_classifier = BaggingClassifier( estimator=CustomSVC(kernel="linear", decision_function_shape="ovr"), @@ -174,7 +174,8 @@ def test_regression(): ).predict(X_test) -def test_sparse_regression(): +@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS) +def test_sparse_regression(sparse_container): # Check regression for various parameter settings on sparse input. 
rng = check_random_state(0) X_train, X_test, y_train, y_test = train_test_split( @@ -206,29 +207,28 @@ def fit(self, X, y): {"max_samples": 0.5, "bootstrap": True, "bootstrap_features": False}, ] - for sparse_format in [csc_matrix, csr_matrix]: - X_train_sparse = sparse_format(X_train) - X_test_sparse = sparse_format(X_test) - for params in parameter_sets: - # Trained on sparse format - sparse_classifier = BaggingRegressor( - estimator=CustomSVR(), random_state=1, **params - ).fit(X_train_sparse, y_train) - sparse_results = sparse_classifier.predict(X_test_sparse) - - # Trained on dense format - dense_results = ( - BaggingRegressor(estimator=CustomSVR(), random_state=1, **params) - .fit(X_train, y_train) - .predict(X_test) - ) + X_train_sparse = sparse_container(X_train) + X_test_sparse = sparse_container(X_test) + for params in parameter_sets: + # Trained on sparse format + sparse_classifier = BaggingRegressor( + estimator=CustomSVR(), random_state=1, **params + ).fit(X_train_sparse, y_train) + sparse_results = sparse_classifier.predict(X_test_sparse) + + # Trained on dense format + dense_results = ( + BaggingRegressor(estimator=CustomSVR(), random_state=1, **params) + .fit(X_train, y_train) + .predict(X_test) + ) - sparse_type = type(X_train_sparse) - types = [i.data_type_ for i in sparse_classifier.estimators_] + sparse_type = type(X_train_sparse) + types = [i.data_type_ for i in sparse_classifier.estimators_] - assert_array_almost_equal(sparse_results, dense_results) - assert all([t == sparse_type for t in types]) - assert_array_almost_equal(sparse_results, dense_results) + assert_array_almost_equal(sparse_results, dense_results) + assert all([t == sparse_type for t in types]) + assert_array_almost_equal(sparse_results, dense_results) class DummySizeEstimator(BaseEstimator): From 8d48624ec9a3f2785b35a6510964c7847d39175f Mon Sep 17 00:00:00 2001 From: Brendan Lu <132311610+brendanlu@users.noreply.github.com> Date: Mon, 11 Sep 2023 19:31:21 +1000 Subject: [PATCH 06/21] TST Extend tests for `scipy.sparse.*array` in `sklearn/feature_selection/tests/test_mutual_info.py` (#27173) Co-authored-by: Guillaume Lemaitre --- sklearn/feature_selection/tests/test_mutual_info.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/feature_selection/tests/test_mutual_info.py b/sklearn/feature_selection/tests/test_mutual_info.py index 349147f66e36c..26367544baa53 100644 --- a/sklearn/feature_selection/tests/test_mutual_info.py +++ b/sklearn/feature_selection/tests/test_mutual_info.py @@ -1,6 +1,5 @@ import numpy as np import pytest -from scipy.sparse import csr_matrix from sklearn.feature_selection import mutual_info_classif, mutual_info_regression from sklearn.feature_selection._mutual_info import _compute_mi @@ -9,6 +8,7 @@ assert_allclose, assert_array_equal, ) +from sklearn.utils.fixes import CSR_CONTAINERS def test_compute_mi_dd(): @@ -176,12 +176,13 @@ def test_mutual_info_classif_mixed(global_dtype): assert mi_nn[2] == mi[2] -def test_mutual_info_options(global_dtype): +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_mutual_info_options(global_dtype, csr_container): X = np.array( [[0, 0, 0], [1, 1, 0], [2, 0, 1], [2, 0, 1], [2, 0, 1]], dtype=global_dtype ) y = np.array([0, 1, 2, 2, 1], dtype=global_dtype) - X_csr = csr_matrix(X) + X_csr = csr_container(X) for mutual_info in (mutual_info_regression, mutual_info_classif): with pytest.raises(ValueError): From c366f93394bba397468cb1e36e013f7a1bb44708 Mon Sep 17 00:00:00 2001 From: Mohit Joshi 
<46566524+work-mohit@users.noreply.github.com> Date: Mon, 11 Sep 2023 15:02:44 +0530 Subject: [PATCH 07/21] TST Extend tests for `scipy.sparse.*array` in `sklearn/feature_selection/tests/test_base.py` (#27175) --- sklearn/feature_selection/tests/test_base.py | 27 +++++++++++--------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/sklearn/feature_selection/tests/test_base.py b/sklearn/feature_selection/tests/test_base.py index bf883797ddabd..5e2bb27bafd17 100644 --- a/sklearn/feature_selection/tests/test_base.py +++ b/sklearn/feature_selection/tests/test_base.py @@ -1,10 +1,10 @@ import numpy as np import pytest from numpy.testing import assert_array_equal -from scipy import sparse as sp from sklearn.base import BaseEstimator from sklearn.feature_selection._base import SelectorMixin +from sklearn.utils.fixes import CSC_CONTAINERS class StepSelector(SelectorMixin, BaseEstimator): @@ -60,17 +60,18 @@ def test_transform_dense(): sel.transform(np.array([[1], [2]])) -def test_transform_sparse(): - sparse = sp.csc_matrix +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_transform_sparse(csc_container): + X_sp = csc_container(X) sel = StepSelector() - Xt_actual = sel.fit(sparse(X)).transform(sparse(X)) - Xt_actual2 = sel.fit_transform(sparse(X)) + Xt_actual = sel.fit(X_sp).transform(X_sp) + Xt_actual2 = sel.fit_transform(X_sp) assert_array_equal(Xt, Xt_actual.toarray()) assert_array_equal(Xt, Xt_actual2.toarray()) # Check dtype matches - assert np.int32 == sel.transform(sparse(X).astype(np.int32)).dtype - assert np.float32 == sel.transform(sparse(X).astype(np.float32)).dtype + assert np.int32 == sel.transform(X_sp.astype(np.int32)).dtype + assert np.float32 == sel.transform(X_sp.astype(np.float32)).dtype # Check wrong shape raises error with pytest.raises(ValueError): @@ -95,15 +96,17 @@ def test_inverse_transform_dense(): sel.inverse_transform(np.array([[1], [2]])) -def test_inverse_transform_sparse(): - sparse = sp.csc_matrix +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_inverse_transform_sparse(csc_container): + X_sp = csc_container(X) + Xt_sp = csc_container(Xt) sel = StepSelector() - Xinv_actual = sel.fit(sparse(X)).inverse_transform(sparse(Xt)) + Xinv_actual = sel.fit(X_sp).inverse_transform(Xt_sp) assert_array_equal(Xinv, Xinv_actual.toarray()) # Check dtype matches - assert np.int32 == sel.inverse_transform(sparse(Xt).astype(np.int32)).dtype - assert np.float32 == sel.inverse_transform(sparse(Xt).astype(np.float32)).dtype + assert np.int32 == sel.inverse_transform(Xt_sp.astype(np.int32)).dtype + assert np.float32 == sel.inverse_transform(Xt_sp.astype(np.float32)).dtype # Check wrong shape raises error with pytest.raises(ValueError): From 11cf0eec3eae4b684b962e11ef3eadf0357eaa9a Mon Sep 17 00:00:00 2001 From: Mohit Joshi <46566524+work-mohit@users.noreply.github.com> Date: Mon, 11 Sep 2023 15:03:32 +0530 Subject: [PATCH 08/21] TST Extend tests for `scipy.sparse.*array` in `sklearn/feature_selection/tests/test_chi2.py` (#27176) Co-authored-by: Omar Salman --- sklearn/feature_selection/tests/test_chi2.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/sklearn/feature_selection/tests/test_chi2.py b/sklearn/feature_selection/tests/test_chi2.py index 4fdc652a998a9..c50def36f1b6c 100644 --- a/sklearn/feature_selection/tests/test_chi2.py +++ b/sklearn/feature_selection/tests/test_chi2.py @@ -8,11 +8,11 @@ import numpy as np import pytest import scipy.stats -from scipy.sparse import coo_matrix, csr_matrix 
from sklearn.feature_selection import SelectKBest, chi2 from sklearn.feature_selection._univariate_selection import _chisquare from sklearn.utils._testing import assert_array_almost_equal, assert_array_equal +from sklearn.utils.fixes import COO_CONTAINERS, CSR_CONTAINERS # Feature 0 is highly informative for class 1; # feature 1 is the same everywhere; @@ -26,7 +26,8 @@ def mkchi2(k): return SelectKBest(chi2, k=k) -def test_chi2(): +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_chi2(csr_container): # Test Chi2 feature extraction chi2 = mkchi2(k=1).fit(X, y) @@ -37,7 +38,7 @@ def test_chi2(): chi2 = mkchi2(k=2).fit(X, y) assert_array_equal(sorted(chi2.get_support(indices=True)), [0, 2]) - Xsp = csr_matrix(X, dtype=np.float64) + Xsp = csr_container(X, dtype=np.float64) chi2 = mkchi2(k=2).fit(Xsp, y) assert_array_equal(sorted(chi2.get_support(indices=True)), [0, 2]) Xtrans = chi2.transform(Xsp) @@ -49,18 +50,20 @@ def test_chi2(): assert_array_almost_equal(Xtrans, Xtrans2) -def test_chi2_coo(): +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +def test_chi2_coo(coo_container): # Check that chi2 works with a COO matrix # (as returned by CountVectorizer, DictVectorizer) - Xcoo = coo_matrix(X) + Xcoo = coo_container(X) mkchi2(k=2).fit_transform(Xcoo, y) # if we got here without an exception, we're safe -def test_chi2_negative(): +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_chi2_negative(csr_container): # Check for proper error on negative numbers in the input X. X, y = [[0, 1], [-1e-20, 1]], [0, 1] - for X in (X, np.array(X), csr_matrix(X)): + for X in (X, np.array(X), csr_container(X)): with pytest.raises(ValueError): chi2(X, y) From 8eef53fa1d194009ee2a02948c9d14a45407fc7d Mon Sep 17 00:00:00 2001 From: Mohit Joshi <46566524+work-mohit@users.noreply.github.com> Date: Mon, 11 Sep 2023 15:04:42 +0530 Subject: [PATCH 09/21] TST Extend tests for `scipy.sparse.*array` in `sklearn/feature_selection/tests/test_rfe.py` (#27177) --- sklearn/feature_selection/tests/test_rfe.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/sklearn/feature_selection/tests/test_rfe.py b/sklearn/feature_selection/tests/test_rfe.py index 0f141f3461d7f..a2b5b1c8d4cf0 100644 --- a/sklearn/feature_selection/tests/test_rfe.py +++ b/sklearn/feature_selection/tests/test_rfe.py @@ -7,7 +7,6 @@ import numpy as np import pytest from numpy.testing import assert_allclose, assert_array_almost_equal, assert_array_equal -from scipy import sparse from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.compose import TransformedTargetRegressor @@ -23,6 +22,7 @@ from sklearn.svm import SVC, SVR, LinearSVR from sklearn.utils import check_random_state from sklearn.utils._testing import ignore_warnings +from sklearn.utils.fixes import CSR_CONTAINERS class MockClassifier: @@ -79,13 +79,14 @@ def test_rfe_features_importance(): assert_array_equal(rfe.get_support(), rfe_svc.get_support()) -def test_rfe(): +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_rfe(csr_container): generator = check_random_state(0) iris = load_iris() # Add some irrelevant features. Random seed is set to make sure that # irrelevant features are always irrelevant. 
X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))] - X_sparse = sparse.csr_matrix(X) + X_sparse = csr_container(X) y = iris.target # dense model @@ -173,7 +174,8 @@ def test_rfe_mockclassifier(): assert X_r.shape == iris.data.shape -def test_rfecv(): +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_rfecv(csr_container): generator = check_random_state(0) iris = load_iris() # Add some irrelevant features. Random seed is set to make sure that @@ -197,7 +199,7 @@ def test_rfecv(): # same in sparse rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=1) - X_sparse = sparse.csr_matrix(X) + X_sparse = csr_container(X) rfecv_sparse.fit(X_sparse, y) X_r_sparse = rfecv_sparse.transform(X_sparse) assert_array_equal(X_r_sparse.toarray(), iris.data) @@ -241,14 +243,14 @@ def test_scorer(estimator, X, y): assert_array_equal(X_r, iris.data) rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=2) - X_sparse = sparse.csr_matrix(X) + X_sparse = csr_container(X) rfecv_sparse.fit(X_sparse, y) X_r_sparse = rfecv_sparse.transform(X_sparse) assert_array_equal(X_r_sparse.toarray(), iris.data) # Verifying that steps < 1 don't blow up. rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=0.2) - X_sparse = sparse.csr_matrix(X) + X_sparse = csr_container(X) rfecv_sparse.fit(X_sparse, y) X_r_sparse = rfecv_sparse.transform(X_sparse) assert_array_equal(X_r_sparse.toarray(), iris.data) From af6c35d26e76028952a551e8756bec333209d616 Mon Sep 17 00:00:00 2001 From: Mohit Joshi <46566524+work-mohit@users.noreply.github.com> Date: Mon, 11 Sep 2023 15:17:58 +0530 Subject: [PATCH 10/21] TST Extend tests for `scipy.sparse.*array` in `sklearn/feature_selection/tests/test_sequential.py` (#27178) --- sklearn/feature_selection/tests/test_sequential.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/feature_selection/tests/test_sequential.py b/sklearn/feature_selection/tests/test_sequential.py index a515bf22cdda3..82d65c55a0195 100644 --- a/sklearn/feature_selection/tests/test_sequential.py +++ b/sklearn/feature_selection/tests/test_sequential.py @@ -1,6 +1,5 @@ import numpy as np import pytest -import scipy from numpy.testing import assert_array_equal from sklearn.cluster import KMeans @@ -12,6 +11,7 @@ from sklearn.neighbors import KNeighborsClassifier from sklearn.pipeline import make_pipeline from sklearn.preprocessing import StandardScaler +from sklearn.utils.fixes import CSR_CONTAINERS def test_bad_n_features_to_select(): @@ -184,11 +184,12 @@ def test_sanity(seed, direction, n_features_to_select, expected_selected_feature assert_array_equal(sfs.get_support(indices=True), expected_selected_features) -def test_sparse_support(): +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse_support(csr_container): # Make sure sparse data is supported X, y = make_regression(n_features=10) - X = scipy.sparse.csr_matrix(X) + X = csr_container(X) sfs = SequentialFeatureSelector( LinearRegression(), n_features_to_select="auto", cv=2 ) From cfbab7769c30f49a01e398e906ed596d3581730d Mon Sep 17 00:00:00 2001 From: Tialo <65392801+Tialo@users.noreply.github.com> Date: Mon, 11 Sep 2023 13:16:28 +0300 Subject: [PATCH 11/21] TST Extend tests for `scipy.sparse.*array` in `sklearn/utils/tests/test_utils.py` (#27201) --- sklearn/utils/tests/test_utils.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index 0dd23cb02d399..0f0c9c898b17a 100644 --- 
a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -6,7 +6,6 @@ import numpy as np import pytest -import scipy.sparse as sp from sklearn import config_context from sklearn.utils import ( @@ -35,6 +34,7 @@ assert_array_equal, assert_no_warnings, ) +from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS # toy array X_toy = np.arange(9).reshape((3, 3)) @@ -160,21 +160,23 @@ def test_resample_stratify_2dy(): assert y.ndim == 2 -def test_resample_stratify_sparse_error(): +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_resample_stratify_sparse_error(csr_container): # resample must be ndarray rng = np.random.RandomState(0) n_samples = 100 X = rng.normal(size=(n_samples, 2)) y = rng.randint(0, 2, size=n_samples) - stratify = sp.csr_matrix(y) + stratify = csr_container(y) with pytest.raises(TypeError, match="A sparse matrix was passed"): X, y = resample(X, y, n_samples=50, random_state=rng, stratify=stratify) -def test_safe_mask(): +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_safe_mask(csr_container): random_state = check_random_state(0) X = random_state.rand(5, 4) - X_csr = sp.csr_matrix(X) + X_csr = csr_container(X) mask = [False, False, True, True, True] mask = safe_mask(X, mask) @@ -514,14 +516,15 @@ def to_tuple(A): # to make the inner arrays hashable assert set(to_tuple(A)) == S -def test_shuffle_dont_convert_to_array(): +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_shuffle_dont_convert_to_array(csc_container): # Check that shuffle does not try to convert to numpy arrays with float # dtypes can let any indexable datastructure pass-through. a = ["a", "b", "c"] b = np.array(["a", "b", "c"], dtype=object) c = [1, 2, 3] d = MockDataFrame(np.array([["a", 0], ["b", 1], ["c", 2]], dtype=object)) - e = sp.csc_matrix(np.arange(6).reshape(3, 2)) + e = csc_container(np.arange(6).reshape(3, 2)) a_s, b_s, c_s, d_s, e_s = shuffle(a, b, c, d, e, random_state=0) assert a_s == ["c", "b", "a"] From dfe01c22273b905365d9454cea394367786e4b6a Mon Sep 17 00:00:00 2001 From: Tialo <65392801+Tialo@users.noreply.github.com> Date: Mon, 11 Sep 2023 13:17:17 +0300 Subject: [PATCH 12/21] TST Extend tests for `scipy.sparse.*array` in `sklearn/utils/tests/test_set_output.py` (#27202) --- sklearn/utils/tests/test_set_output.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/utils/tests/test_set_output.py b/sklearn/utils/tests/test_set_output.py index d1722a1553f9c..afc059a899ccd 100644 --- a/sklearn/utils/tests/test_set_output.py +++ b/sklearn/utils/tests/test_set_output.py @@ -3,7 +3,6 @@ import numpy as np import pytest from numpy.testing import assert_array_equal -from scipy.sparse import csr_matrix from sklearn._config import config_context, get_config from sklearn.utils._set_output import ( @@ -12,6 +11,7 @@ _SetOutputMixin, _wrap_in_pandas_container, ) +from sklearn.utils.fixes import CSR_CONTAINERS def test__wrap_in_pandas_container_dense(): @@ -41,10 +41,11 @@ def test__wrap_in_pandas_container_dense_update_columns_and_index(): assert_array_equal(new_df.index, X_df.index) -def test__wrap_in_pandas_container_error_validation(): +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test__wrap_in_pandas_container_error_validation(csr_container): """Check errors in _wrap_in_pandas_container.""" X = np.asarray([[1, 0, 3], [0, 0, 1]]) - X_csr = csr_matrix(X) + X_csr = csr_container(X) match = "Pandas output does not support sparse data" with pytest.raises(ValueError, 
match=match): _wrap_in_pandas_container(X_csr, columns=["a", "b", "c"]) From b0da1b7706054f0b78f0a0582a9362a188e1fa38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcus=20Fraa=C3=9F?= Date: Mon, 11 Sep 2023 12:42:58 +0200 Subject: [PATCH 13/21] DOC Add missing links to examples/impute (#27053) --- doc/modules/impute.rst | 4 ++++ sklearn/impute/_base.py | 3 +++ sklearn/impute/_iterative.py | 4 ++++ sklearn/impute/_knn.py | 3 +++ 4 files changed, 14 insertions(+) diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index a1985bc460b77..0bb62fa03bc63 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -87,6 +87,8 @@ string values or pandas categoricals when using the ``'most_frequent'`` or ['a' 'y'] ['b' 'y']] +For another example on usage, see :ref:`sphx_glr_auto_examples_impute_plot_missing_values.py`. + .. _iterative_imputer: @@ -220,6 +222,8 @@ neighbors of samples with missing values:: [5.5, 6. , 5. ], [8. , 8. , 7. ]]) +For another example on usage, see :ref:`sphx_glr_auto_examples_impute_plot_missing_values.py`. + .. topic:: References .. [OL2001] Olga Troyanskaya, Michael Cantor, Gavin Sherlock, Pat Brown, diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index a65210bce6960..e182a6ca73e61 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -256,6 +256,9 @@ class SimpleImputer(_BaseImputer): [[ 7. 2. 3. ] [ 4. 3.5 6. ] [10. 3.5 9. ]] + + For a more detailed example see + :ref:`sphx_glr_auto_examples_impute_plot_missing_values.py`. """ _parameter_constraints: dict = { diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index a0087a5a10d55..e08f1bd773a34 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -273,6 +273,10 @@ class IterativeImputer(_BaseImputer): array([[ 6.9584..., 2. , 3. ], [ 4. , 2.6000..., 6. ], [10. , 4.9999..., 9. ]]) + + For a more detailed example see + :ref:`sphx_glr_auto_examples_impute_plot_missing_values.py` or + :ref:`sphx_glr_auto_examples_impute_plot_iterative_imputer_variants_comparison.py`. """ _parameter_constraints: dict = { diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py index e36b49f262b2d..7da7785369c0d 100644 --- a/sklearn/impute/_knn.py +++ b/sklearn/impute/_knn.py @@ -121,6 +121,9 @@ class KNNImputer(_BaseImputer): [3. , 4. , 3. ], [5.5, 6. , 5. ], [8. , 8. , 7. ]]) + + For a more detailed example see + :ref:`sphx_glr_auto_examples_impute_plot_missing_values.py`. """ _parameter_constraints: dict = { From e7ae63f784c5f85af41cf8f346d194775f01f333 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 11 Sep 2023 15:58:14 +0200 Subject: [PATCH 14/21] FIX avoid unecessary `if copy` branch for sparse array/matrix (#27336) Co-authored-by: Olivier Grisel --- doc/whats_new/v1.4.rst | 6 ++++++ sklearn/utils/tests/test_validation.py | 27 +++++++++++++++++++------- sklearn/utils/validation.py | 26 ++++++++++++------------- 3 files changed, 39 insertions(+), 20 deletions(-) diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index d54c276b3292b..612bbfebb6502 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -300,6 +300,12 @@ Changelog which can be used to check whether a given set of parameters would be consumed. :pr:`26831` by `Adrin Jalali`_. +- |Fix| :func:`sklearn.utils.check_array` should accept both matrix and array from + the sparse SciPy module. 
+  The previous implementation would fail when `copy=True` because it called
+  `np.may_share_memory`, which does not work with SciPy sparse arrays and
+  returns an incorrect result for SciPy sparse matrices.
+  :pr:`27336` by :user:`Guillaume Lemaitre `.
+
 Code and Documentation Contributors
 -----------------------------------
 
diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py
index 96b429846f331..0e6d46e4c4945 100644
--- a/sklearn/utils/tests/test_validation.py
+++ b/sklearn/utils/tests/test_validation.py
@@ -49,7 +49,13 @@
     skip_if_array_api_compat_not_configured,
 )
 from sklearn.utils.estimator_checks import _NotAnArray
-from sklearn.utils.fixes import parse_version
+from sklearn.utils.fixes import (
+    COO_CONTAINERS,
+    CSC_CONTAINERS,
+    CSR_CONTAINERS,
+    DOK_CONTAINERS,
+    parse_version,
+)
 from sklearn.utils.validation import (
     FLOAT_DTYPES,
     _allclose_dense_sparse,
@@ -356,13 +362,20 @@ def test_check_array():
     assert X is X_checked
 
     # allowed sparse != None
-    X_csc = sp.csc_matrix(X_C)
-    X_coo = X_csc.tocoo()
-    X_dok = X_csc.todok()
-    X_int = X_csc.astype(int)
-    X_float = X_csc.astype(float)
 
-    Xs = [X_csc, X_coo, X_dok, X_int, X_float]
+    # try different types of sparse format
+    Xs = []
+    Xs.extend(
+        [
+            sparse_container(X_C)
+            for sparse_container in CSR_CONTAINERS
+            + CSC_CONTAINERS
+            + COO_CONTAINERS
+            + DOK_CONTAINERS
+        ]
+    )
+    Xs.extend([Xs[0].astype(np.int64), Xs[0].astype(np.float64)])
+
     accept_sparses = [["csr", "coo"], ["coo", "dok"]]
     # scipy sparse matrices do not support the object dtype so
     # this dtype is skipped in this loop
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index 22c8394957cc1..ac75c8afe5e5b 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -962,6 +962,19 @@ def is_sparse(dtype):
                 allow_nan=force_all_finite == "allow-nan",
             )
 
+    if copy:
+        if _is_numpy_namespace(xp):
+            # only make a copy if `array` and `array_orig` may share memory
+            if np.may_share_memory(array, array_orig):
+                array = _asarray_with_order(
+                    array, dtype=dtype, order=order, copy=True, xp=xp
+                )
+        else:
+            # always make a copy for non-numpy arrays
+            array = _asarray_with_order(
+                array, dtype=dtype, order=order, copy=True, xp=xp
+            )
+
     if ensure_min_samples > 0:
         n_samples = _num_samples(array)
         if n_samples < ensure_min_samples:
@@ -980,19 +993,6 @@ def is_sparse(dtype):
             % (n_features, array.shape, ensure_min_features, context)
         )
 
-    if copy:
-        if _is_numpy_namespace(xp):
-            # only make a copy if `array` and `array_orig` may share memory`
-            if np.may_share_memory(array, array_orig):
-                array = _asarray_with_order(
-                    array, dtype=dtype, order=order, copy=True, xp=xp
-                )
-        else:
-            # always make a copy for non-numpy arrays
-            array = _asarray_with_order(
-                array, dtype=dtype, order=order, copy=True, xp=xp
-            )
-
     return array
 
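To illustrate the fix above, a minimal sketch of the behavior being repaired
(assuming SciPy >= 1.8, which provides ``csr_array``)::

    import numpy as np
    from scipy.sparse import csr_array
    from sklearn.utils import check_array

    X = csr_array(np.eye(3))
    # Before this fix, requesting a copy sent sparse input through the
    # NumPy-specific ``np.may_share_memory`` check, which fails for SciPy
    # sparse arrays; a sparse input with ``copy=True`` now simply works and
    # returns a new object.
    X_checked = check_array(X, accept_sparse="csr", copy=True)
    assert X_checked is not X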
From 83a8015702213b39510a0f4898bc6879bcdf3ac2 Mon Sep 17 00:00:00 2001
From: Adrin Jalali 
Date: Mon, 11 Sep 2023 16:19:24 +0200
Subject: [PATCH 15/21] FEAT add metadata routing to *SearchCV (#27058)

---
 doc/whats_new/v1.4.rst                        |   8 +
 sklearn/model_selection/_search.py            | 165 +++++++--
 .../_search_successive_halving.py             |  28 +-
 sklearn/model_selection/tests/test_search.py  |  81 ++++-
 sklearn/tests/metadata_routing_common.py      |  54 +--
 .../test_metaestimators_metadata_routing.py   | 341 +++++++++++++-----
 sklearn/utils/_metadata_requests.py           |   6 +-
 7 files changed, 509 insertions(+), 174 deletions(-)

diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst
index 612bbfebb6502..6531102bba9fd 100644
--- a/doc/whats_new/v1.4.rst
+++ b/doc/whats_new/v1.4.rst
@@ -212,6 +212,14 @@ Changelog
   is enabled and should be passed via the `params` parameter. :pr:`26896` by
   `Adrin Jalali`_.
 
+- |Feature| :class:`~model_selection.GridSearchCV`,
+  :class:`~model_selection.RandomizedSearchCV`,
+  :class:`~model_selection.HalvingGridSearchCV`, and
+  :class:`~model_selection.HalvingRandomSearchCV` now support metadata routing
+  in their ``fit`` and ``score``, and route metadata to the underlying
+  estimator's ``fit``, the CV splitter, and the scorer. :pr:`27058` by `Adrin
+  Jalali`_.
+
 - |Enhancement| :func:`sklearn.model_selection.train_test_split` now supports
   Array API compatible inputs. :pr:`26855` by `Tim Head`_.
 
diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py
index dcf0d2e41c905..9de03c2c663ec 100644
--- a/sklearn/model_selection/_search.py
+++ b/sklearn/model_selection/_search.py
@@ -27,10 +27,21 @@
 from ..base import BaseEstimator, MetaEstimatorMixin, _fit_context, clone, is_classifier
 from ..exceptions import NotFittedError
 from ..metrics import check_scoring
-from ..metrics._scorer import _check_multimetric_scoring, get_scorer_names
-from ..utils import check_random_state
+from ..metrics._scorer import (
+    _check_multimetric_scoring,
+    _MultimetricScorer,
+    get_scorer_names,
+)
+from ..utils import Bunch, check_random_state
 from ..utils._param_validation import HasMethods, Interval, StrOptions
 from ..utils._tags import _safe_tags
+from ..utils.metadata_routing import (
+    MetadataRouter,
+    MethodMapping,
+    _raise_for_params,
+    _routing_enabled,
+    process_routing,
+)
 from ..utils.metaestimators import available_if
 from ..utils.parallel import Parallel, delayed
 from ..utils.random import sample_without_replacement
@@ -429,7 +440,7 @@ def _more_tags(self):
             },
         }
 
-    def score(self, X, y=None):
+    def score(self, X, y=None, **params):
         """Return the score on the given data, if the estimator has been refit.
 
         This uses the score defined by ``scoring`` where provided, and the
@@ -446,6 +457,14 @@ def score(self, X, y=None):
             Target relative to X for classification or regression;
             None for unsupervised learning.
 
+        **params : dict
+            Parameters to be passed to the underlying scorer(s).
+
+            .. versionadded:: 1.4
+                Only available if `enable_metadata_routing=True`. See
+                :ref:`Metadata Routing User Guide ` for more
+                details.
+
         Returns
        -------
         score : float
@@ -454,6 +473,14 @@ def score(self, X, y=None):
         """
         _check_refit(self, "score")
         check_is_fitted(self)
+
+        _raise_for_params(params, self, "score")
+
+        if _routing_enabled():
+            score_params = process_routing(self, "score", **params).scorer["score"]
+        else:
+            score_params = dict()
+
         if self.scorer_ is None:
             raise ValueError(
                 "No score function explicitly defined, "
@@ -465,10 +492,10 @@ def score(self, X, y=None):
                 scorer = self.scorer_[self.refit]
             else:
                 scorer = self.scorer_
-            return scorer(self.best_estimator_, X, y)
+            return scorer(self.best_estimator_, X, y, **score_params)
 
         # callable
-        score = self.scorer_(self.best_estimator_, X, y)
+        score = self.scorer_(self.best_estimator_, X, y, **score_params)
         if self.multimetric_:
             score = score[self.refit]
         return score
@@ -754,11 +781,62 @@ def _select_best_index(refit, refit_metric, results):
             best_index = results[f"rank_test_{refit_metric}"].argmin()
         return best_index
 
+    def _get_scorers(self, convert_multimetric):
+        """Get the scorer(s) to be used.
+
+        This is used in ``fit`` and ``get_metadata_routing``.
+ + Parameters + ---------- + convert_multimetric : bool + Whether to convert a dict of scorers to a _MultimetricScorer. This + is used in ``get_metadata_routing`` to include the routing info for + multiple scorers. + + Returns + ------- + scorers, refit_metric + """ + refit_metric = "score" + + if callable(self.scoring): + scorers = self.scoring + elif self.scoring is None or isinstance(self.scoring, str): + scorers = check_scoring(self.estimator, self.scoring) + else: + scorers = _check_multimetric_scoring(self.estimator, self.scoring) + self._check_refit_for_multimetric(scorers) + refit_metric = self.refit + if convert_multimetric and isinstance(scorers, dict): + scorers = _MultimetricScorer( + scorers=scorers, raise_exc=(self.error_score == "raise") + ) + + return scorers, refit_metric + + def _get_routed_params_for_fit(self, params): + """Get the parameters to be used for routing. + + This is a method instead of a snippet in ``fit`` since it's used twice, + here in ``fit``, and in ``HalvingRandomSearchCV.fit``. + """ + if _routing_enabled(): + routed_params = process_routing(self, "fit", **params) + else: + params = params.copy() + groups = params.pop("groups", None) + routed_params = Bunch( + estimator=Bunch(fit=params), + splitter=Bunch(split={"groups": groups}), + scorer=Bunch(score={}), + ) + return routed_params + @_fit_context( # *SearchCV.estimator is not validated yet prefer_skip_nested_validation=False ) - def fit(self, X, y=None, *, groups=None, **fit_params): + def fit(self, X, y=None, **params): """Run fit with all sets of parameters. Parameters @@ -773,13 +851,9 @@ def fit(self, X, y=None, *, groups=None, **fit_params): Target relative to X for classification or regression; None for unsupervised learning. - groups : array-like of shape (n_samples,), default=None - Group labels for the samples used while splitting the dataset into - train/test set. Only used in conjunction with a "Group" :term:`cv` - instance (e.g., :class:`~sklearn.model_selection.GroupKFold`). - - **fit_params : dict of str -> object - Parameters passed to the `fit` method of the estimator. + **params : dict of str -> object + Parameters passed to the ``fit`` method of the estimator, the scorer, + and the CV splitter. If a fit parameter is an array-like whose length is equal to `num_samples` then it will be split across CV groups along with `X` @@ -792,22 +866,18 @@ def fit(self, X, y=None, *, groups=None, **fit_params): Instance of fitted estimator. """ estimator = self.estimator - refit_metric = "score" + # Here we keep a dict of scorers as is, and only convert to a + # _MultimetricScorer at a later stage. 
Issue: + # https://github.com/scikit-learn/scikit-learn/issues/27001 + scorers, refit_metric = self._get_scorers(convert_multimetric=False) - if callable(self.scoring): - scorers = self.scoring - elif self.scoring is None or isinstance(self.scoring, str): - scorers = check_scoring(self.estimator, self.scoring) - else: - scorers = _check_multimetric_scoring(self.estimator, self.scoring) - self._check_refit_for_multimetric(scorers) - refit_metric = self.refit + X, y = indexable(X, y) + params = _check_method_params(X, params=params) - X, y, groups = indexable(X, y, groups) - fit_params = _check_method_params(X, params=fit_params) + routed_params = self._get_routed_params_for_fit(params) cv_orig = check_cv(self.cv, y, classifier=is_classifier(estimator)) - n_splits = cv_orig.get_n_splits(X, y, groups) + n_splits = cv_orig.get_n_splits(X, y, **routed_params.splitter.split) base_estimator = clone(self.estimator) @@ -815,9 +885,8 @@ def fit(self, X, y=None, *, groups=None, **fit_params): fit_and_score_kwargs = dict( scorer=scorers, - fit_params=fit_params, - # TODO(SLEP6): pass score params along - score_params=None, + fit_params=routed_params.estimator.fit, + score_params=routed_params.scorer.score, return_train_score=self.return_train_score, return_n_test_samples=True, return_times=True, @@ -857,7 +926,8 @@ def evaluate_candidates(candidate_params, cv=None, more_results=None): **fit_and_score_kwargs, ) for (cand_idx, parameters), (split_idx, (train, test)) in product( - enumerate(candidate_params), enumerate(cv.split(X, y, groups)) + enumerate(candidate_params), + enumerate(cv.split(X, y, **routed_params.splitter.split)), ) ) @@ -935,9 +1005,9 @@ def evaluate_candidates(candidate_params, cv=None, more_results=None): refit_start_time = time.time() if y is not None: - self.best_estimator_.fit(X, y, **fit_params) + self.best_estimator_.fit(X, y, **routed_params.estimator.fit) else: - self.best_estimator_.fit(X, **fit_params) + self.best_estimator_.fit(X, **routed_params.estimator.fit) refit_end_time = time.time() self.refit_time_ = refit_end_time - refit_start_time @@ -1057,6 +1127,39 @@ def _store(key_name, array, weights=None, splits=False, rank=False): return results + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.4 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. + """ + router = MetadataRouter(owner=self.__class__.__name__) + router.add( + estimator=self.estimator, + method_mapping=MethodMapping().add(caller="fit", callee="fit"), + ) + + scorer, _ = self._get_scorers(convert_multimetric=True) + router.add( + scorer=scorer, + method_mapping=MethodMapping() + .add(caller="score", callee="score") + .add(caller="fit", callee="score"), + ) + router.add( + splitter=self.cv, + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + return router + class GridSearchCV(BaseSearchCV): """Exhaustive search over specified parameter values for an estimator. 
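To illustrate the feature this patch adds, a minimal usage sketch (assuming
the existing ``set_config(enable_metadata_routing=True)`` opt-in and the
``set_fit_request`` API)::

    import numpy as np
    from sklearn import set_config
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV

    # Metadata routing is opt-in; the legacy **fit_params behavior applies
    # otherwise.
    set_config(enable_metadata_routing=True)

    X, y = make_classification(random_state=0)
    sample_weight = np.ones_like(y, dtype=np.float64)

    # The sub-estimator must explicitly request ``sample_weight`` so that
    # the search knows to route it to the estimator's ``fit``.
    clf = LogisticRegression().set_fit_request(sample_weight=True)
    search = GridSearchCV(clf, param_grid={"C": [0.1, 1.0]}, cv=2)

    # ``sample_weight`` is routed to every internal ``fit`` call; the CV
    # splitter and the scorer receive only the metadata they themselves
    # request.
    search.fit(X, y, sample_weight=sample_weight)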
diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index 708092d09a2a5..b1cf5ee50965c 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -27,20 +27,20 @@ def __init__(self, *, base_cv, fraction, subsample_test, random_state): self.subsample_test = subsample_test self.random_state = random_state - def split(self, X, y, groups=None): - for train_idx, test_idx in self.base_cv.split(X, y, groups): + def split(self, X, y, **kwargs): + for train_idx, test_idx in self.base_cv.split(X, y, **kwargs): train_idx = resample( train_idx, replace=False, random_state=self.random_state, - n_samples=int(self.fraction * train_idx.shape[0]), + n_samples=int(self.fraction * len(train_idx)), ) if self.subsample_test: test_idx = resample( test_idx, replace=False, random_state=self.random_state, - n_samples=int(self.fraction * test_idx.shape[0]), + n_samples=int(self.fraction * len(test_idx)), ) yield train_idx, test_idx @@ -123,7 +123,7 @@ def __init__( self.min_resources = min_resources self.aggressive_elimination = aggressive_elimination - def _check_input_parameters(self, X, y, groups): + def _check_input_parameters(self, X, y, split_params): # We need to enforce that successive calls to cv.split() yield the same # splits: see https://github.com/scikit-learn/scikit-learn/issues/15149 if not _yields_constant_splits(self._checked_cv_orig): @@ -154,7 +154,7 @@ def _check_input_parameters(self, X, y, groups): self.min_resources_ = self.min_resources if self.min_resources_ in ("smallest", "exhaust"): if self.resource == "n_samples": - n_splits = self._checked_cv_orig.get_n_splits(X, y, groups) + n_splits = self._checked_cv_orig.get_n_splits(X, y, **split_params) # please see https://gph.is/1KjihQe for a justification magic_factor = 2 self.min_resources_ = n_splits * magic_factor @@ -215,7 +215,7 @@ def _select_best_index(refit, refit_metric, results): # Halving*SearchCV.estimator is not validated yet prefer_skip_nested_validation=False ) - def fit(self, X, y=None, groups=None, **fit_params): + def fit(self, X, y=None, **params): """Run fit with all sets of parameters. Parameters @@ -229,12 +229,7 @@ def fit(self, X, y=None, groups=None, **fit_params): Target relative to X for classification or regression; None for unsupervised learning. - groups : array-like of shape (n_samples,), default=None - Group labels for the samples used while splitting the dataset into - train/test set. Only used in conjunction with a "Group" :term:`cv` - instance (e.g., :class:`~sklearn.model_selection.GroupKFold`). - - **fit_params : dict of string -> object + **params : dict of string -> object Parameters passed to the ``fit`` method of the estimator. 
Returns @@ -246,15 +241,14 @@ def fit(self, X, y=None, groups=None, **fit_params): self.cv, y, classifier=is_classifier(self.estimator) ) + routed_params = self._get_routed_params_for_fit(params) self._check_input_parameters( - X=X, - y=y, - groups=groups, + X=X, y=y, split_params=routed_params.splitter.split ) self._n_samples_orig = _num_samples(X) - super().fit(X, y=y, groups=groups, **fit_params) + super().fit(X, y=y, **params) # Set best_score_: BaseSearchCV does not set it, as refit is a callable self.best_score_ = self.cv_results_["mean_test_score"][self.best_index_] diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 50b519118a2b3..1db38671ef481 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -22,9 +22,14 @@ make_multilabel_classification, ) from sklearn.ensemble import HistGradientBoostingClassifier +from sklearn.exceptions import FitFailedWarning from sklearn.experimental import enable_halving_search_cv # noqa from sklearn.impute import SimpleImputer -from sklearn.linear_model import LinearRegression, Ridge, SGDClassifier +from sklearn.linear_model import ( + LinearRegression, + Ridge, + SGDClassifier, +) from sklearn.metrics import ( accuracy_score, confusion_matrix, @@ -51,11 +56,15 @@ train_test_split, ) from sklearn.model_selection._search import BaseSearchCV -from sklearn.model_selection._validation import FitFailedWarning from sklearn.model_selection.tests.common import OneTimeSplitter from sklearn.neighbors import KernelDensity, KNeighborsClassifier, LocalOutlierFactor from sklearn.pipeline import Pipeline from sklearn.svm import SVC, LinearSVC +from sklearn.tests.metadata_routing_common import ( + ConsumingScorer, + _Registry, + check_recorded_metadata, +) from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor from sklearn.utils._mocking import CheckingClassifier, MockDataFrame from sklearn.utils._testing import ( @@ -68,6 +77,7 @@ assert_array_equal, ignore_warnings, ) +from sklearn.utils.validation import _num_samples # Neither of the following two estimators inherit from BaseEstimator, @@ -2449,3 +2459,70 @@ def test_search_estimator_param(SearchCV, param_search): assert params["clf"][0].C == orig_C # testing that the GS is setting the parameter of the step correctly assert gs.best_estimator_.named_steps["clf"].C == 0.01 + + +# Metadata Routing Tests +# ====================== + + +@pytest.mark.usefixtures("enable_slep006") +@pytest.mark.parametrize( + "SearchCV, param_search", + [ + (GridSearchCV, "param_grid"), + (RandomizedSearchCV, "param_distributions"), + ], +) +def test_multi_metric_search_forwards_metadata(SearchCV, param_search): + """Test that *SearchCV forwards metadata correctly when passed multiple metrics.""" + X, y = make_classification(random_state=42) + n_samples = _num_samples(X) + rng = np.random.RandomState(0) + score_weights = rng.rand(n_samples) + score_metadata = rng.rand(n_samples) + + est = LinearSVC(dual="auto") + param_grid_search = {param_search: {"C": [1]}} + + scorer_registry = _Registry() + scorer = ConsumingScorer(registry=scorer_registry).set_score_request( + sample_weight="score_weights", metadata="score_metadata" + ) + scoring = dict(my_scorer=scorer, accuracy="accuracy") + SearchCV(est, refit="accuracy", cv=2, scoring=scoring, **param_grid_search).fit( + X, y, score_weights=score_weights, score_metadata=score_metadata + ) + assert len(scorer_registry) + for _scorer in scorer_registry: + 
check_recorded_metadata( + obj=_scorer, + method="score", + split_params=("sample_weight", "metadata"), + sample_weight=score_weights, + metadata=score_metadata, + ) + + +@pytest.mark.parametrize( + "SearchCV, param_search", + [ + (GridSearchCV, "param_grid"), + (RandomizedSearchCV, "param_distributions"), + (HalvingGridSearchCV, "param_grid"), + ], +) +def test_score_rejects_params_with_no_routing_enabled(SearchCV, param_search): + """*SearchCV should reject **params when metadata routing is not enabled + since this is added only when routing is enabled.""" + X, y = make_classification(random_state=42) + est = LinearSVC(dual="auto") + param_grid_search = {param_search: {"C": [1]}} + + gs = SearchCV(est, cv=2, **param_grid_search).fit(X, y) + + with pytest.raises(ValueError, match="is only supported if"): + gs.score(X, y, metadata=1) + + +# End of Metadata Routing Tests +# ============================= diff --git a/sklearn/tests/metadata_routing_common.py b/sklearn/tests/metadata_routing_common.py index b5f45d3f34c72..2bc7a00aae585 100644 --- a/sklearn/tests/metadata_routing_common.py +++ b/sklearn/tests/metadata_routing_common.py @@ -49,7 +49,6 @@ def check_recorded_metadata(obj, method, split_params=tuple(), **kwargs): split_params : tuple, default=empty specifies any parameters which are to be checked as being a subset of the original values. - """ records = getattr(obj, "_records", dict()).get(method, dict()) assert set(kwargs.keys()) == set(records.keys()) @@ -70,11 +69,16 @@ def assert_request_is_empty(metadata_request, exclude=None): """Check if a metadata request dict is empty. One can exclude a method or a list of methods from the check using the - ``exclude`` parameter. + ``exclude`` parameter. If metadata_request is a MetadataRouter, then + ``exclude`` can be of the form ``{"object" : [method, ...]}``. """ if isinstance(metadata_request, MetadataRouter): - for _, route_mapping in metadata_request: - assert_request_is_empty(route_mapping.router) + for name, route_mapping in metadata_request: + if exclude is not None and name in exclude: + _exclude = exclude[name] + else: + _exclude = None + assert_request_is_empty(route_mapping.router, exclude=_exclude) return exclude = [] if exclude is None else exclude @@ -87,7 +91,7 @@ def assert_request_is_empty(metadata_request, exclude=None): for prop, alias in mmr.requests.items() if isinstance(alias, str) or alias is not None ] - assert not len(props) + assert not props def assert_request_equal(request, dictionary): @@ -123,7 +127,6 @@ class ConsumingRegressor(RegressorMixin, BaseEstimator): a reference to the estimator later on. Since that reference is not required in all tests, registration can be skipped by leaving this value as None. - """ def __init__(self, registry=None): @@ -151,9 +154,6 @@ def predict(self, X, sample_weight="default", metadata="default"): pass # pragma: no cover # when needed, uncomment the implementation - # if self.registry is not None: - # self.registry.append(self) - # record_metadata_not_default( # self, "predict", sample_weight=sample_weight, metadata=metadata # ) @@ -188,9 +188,13 @@ class ConsumingClassifier(ClassifierMixin, BaseEstimator): required in all tests, registration can be skipped by leaving this value as None. + alpha : float, default=0 + This parameter is only used to test the ``*SearchCV`` objects, and + doesn't do anything. 
""" - def __init__(self, registry=None): + def __init__(self, registry=None, alpha=0.0): + self.alpha = alpha self.registry = registry def partial_fit(self, X, y, sample_weight="default", metadata="default"): @@ -214,35 +218,35 @@ def fit(self, X, y, sample_weight="default", metadata="default"): return self def predict(self, X, sample_weight="default", metadata="default"): - if self.registry is not None: - self.registry.append(self) - record_metadata_not_default( self, "predict", sample_weight=sample_weight, metadata=metadata ) return np.zeros(shape=(len(X),)) def predict_proba(self, X, sample_weight="default", metadata="default"): - if self.registry is not None: - self.registry.append(self) + pass # pragma: no cover - record_metadata_not_default( - self, "predict_proba", sample_weight=sample_weight, metadata=metadata - ) - return np.asarray([[0.0, 1.0]] * len(X)) + # uncomment when needed + # record_metadata_not_default( + # self, "predict_proba", sample_weight=sample_weight, metadata=metadata + # ) + # return np.asarray([[0.0, 1.0]] * len(X)) def predict_log_proba(self, X, sample_weight="default", metadata="default"): pass # pragma: no cover - # when needed, uncomment the implementation - # if self.registry is not None: - # self.registry.append(self) - + # uncomment when needed # record_metadata_not_default( # self, "predict_log_proba", sample_weight=sample_weight, metadata=metadata # ) # return np.zeros(shape=(len(X), 2)) + def decision_function(self, X, sample_weight="default", metadata="default"): + record_metadata_not_default( + self, "predict_proba", sample_weight=sample_weight, metadata=metadata + ) + return np.zeros(shape=(len(X),)) + class ConsumingTransformer(TransformerMixin, BaseEstimator): """A transformer which accepts metadata on fit and transform. 
@@ -324,8 +328,8 @@ def split(self, X, y=None, groups="default", metadata="default"): yield test_indices, train_indices yield train_indices, test_indices - def get_n_splits(self, X=None, y=None, groups=None): - pass # pragma: no cover + def get_n_splits(self, X=None, y=None, groups=None, metadata=None): + return 2 def _iter_test_indices(self, X=None, y=None, groups=None): split_index = len(X) // 2 diff --git a/sklearn/tests/test_metaestimators_metadata_routing.py b/sklearn/tests/test_metaestimators_metadata_routing.py index 4a548fe9f067f..d9bbe399728b8 100644 --- a/sklearn/tests/test_metaestimators_metadata_routing.py +++ b/sklearn/tests/test_metaestimators_metadata_routing.py @@ -7,7 +7,14 @@ from sklearn import config_context from sklearn.calibration import CalibratedClassifierCV from sklearn.exceptions import UnsetMetadataPassedError +from sklearn.experimental import enable_halving_search_cv # noqa from sklearn.linear_model import LogisticRegressionCV +from sklearn.model_selection import ( + GridSearchCV, + HalvingGridSearchCV, + HalvingRandomSearchCV, + RandomizedSearchCV, +) from sklearn.multioutput import ( ClassifierChain, MultiOutputClassifier, @@ -49,7 +56,7 @@ def enable_slep006(): "estimator": ConsumingRegressor, "X": X, "y": y_multi, - "routing_methods": ["fit", "partial_fit"], + "estimator_routing_methods": ["fit", "partial_fit"], }, { "metaestimator": MultiOutputClassifier, @@ -57,7 +64,7 @@ def enable_slep006(): "estimator": ConsumingClassifier, "X": X, "y": y_multi, - "routing_methods": ["fit", "partial_fit"], + "estimator_routing_methods": ["fit", "partial_fit"], }, { "metaestimator": CalibratedClassifierCV, @@ -65,7 +72,7 @@ def enable_slep006(): "estimator": ConsumingClassifier, "X": X, "y": y, - "routing_methods": ["fit"], + "estimator_routing_methods": ["fit"], "preserves_metadata": False, }, { @@ -74,7 +81,7 @@ def enable_slep006(): "estimator": ConsumingClassifier, "X": X, "y": y_multi, - "routing_methods": ["fit"], + "estimator_routing_methods": ["fit"], }, { "metaestimator": RegressorChain, @@ -82,7 +89,72 @@ def enable_slep006(): "estimator": ConsumingRegressor, "X": X, "y": y_multi, - "routing_methods": ["fit"], + "estimator_routing_methods": ["fit"], + }, + { + "metaestimator": LogisticRegressionCV, + "X": X, + "y": y_multi, + "scorer_name": "scoring", + "scorer_routing_methods": ["fit", "score"], + "cv_name": "cv", + "cv_routing_methods": ["fit"], + }, + { + "metaestimator": GridSearchCV, + "estimator_name": "estimator", + "estimator": ConsumingClassifier, + "init_args": {"param_grid": {"alpha": [0.1, 0.2]}}, + "X": X, + "y": y, + "estimator_routing_methods": ["fit"], + "preserves_metadata": "subset", + "scorer_name": "scoring", + "scorer_routing_methods": ["fit", "score"], + "cv_name": "cv", + "cv_routing_methods": ["fit"], + }, + { + "metaestimator": RandomizedSearchCV, + "estimator_name": "estimator", + "estimator": ConsumingClassifier, + "init_args": {"param_distributions": {"alpha": [0.1, 0.2]}}, + "X": X, + "y": y, + "estimator_routing_methods": ["fit"], + "preserves_metadata": "subset", + "scorer_name": "scoring", + "scorer_routing_methods": ["fit", "score"], + "cv_name": "cv", + "cv_routing_methods": ["fit"], + }, + { + "metaestimator": HalvingGridSearchCV, + "estimator_name": "estimator", + "estimator": ConsumingClassifier, + "init_args": {"param_grid": {"alpha": [0.1, 0.2]}}, + "X": X, + "y": y, + "estimator_routing_methods": ["fit"], + "preserves_metadata": "subset", + "scorer_name": "scoring", + "scorer_routing_methods": ["fit", "score"], + 
"cv_name": "cv", + "cv_routing_methods": ["fit"], + }, + { + "metaestimator": HalvingRandomSearchCV, + "estimator_name": "estimator", + "estimator": ConsumingClassifier, + "init_args": {"param_distributions": {"alpha": [0.1, 0.2]}}, + "X": X, + "y": y, + "estimator_routing_methods": ["fit"], + "preserves_metadata": "subset", + "scorer_name": "scoring", + "scorer_routing_methods": ["fit", "score"], + "cv_name": "cv", + "cv_routing_methods": ["fit"], }, ] """List containing all metaestimators to be tested and their settings @@ -92,38 +164,79 @@ def enable_slep006(): - metaestimator: The metaestmator to be tested - estimator_name: The name of the argument for the sub-estimator - estimator: The sub-estimator +- init_args: The arguments to be passed to the metaestimator's constructor - X: X-data to fit and predict - y: y-data to fit -- routing_methods: list of all methods to check for routing -- preserves_metadata: Whether the metaestimator passes the metadata to the - sub-estimator without modification or not. If it does, we check that the - values are identical. If it doesn't, no check is performed. TODO Maybe - something smarter could be done if the data is modified. - +- estimator_routing_methods: list of all methods to check for routing metadata + to the sub-estimator +- preserves_metadata: + - True (default): the metaestimator passes the metadata to the + sub-estimator without modification. We check that the values recorded by + the sub-estimator are identical to what we've passed to the + metaestimator. + - False: no check is performed regarding values, we only check that a + metadata with the expected names/keys are passed. + - "subset": we check that the recorded metadata by the sub-estimator is a + subset of what is passed to the metaestimator. +- scorer_name: The name of the argument for the scorer +- scorer_routing_methods: list of all methods to check for routing metadata + to the scorer +- cv_name: The name of the argument for the CV splitter +- cv_routing_methods: list of all methods to check for routing metadata + to the splitter """ -# ids used for pytest fixture +# IDs used by pytest to get meaningful verbose messages when running the tests METAESTIMATOR_IDS = [str(row["metaestimator"].__name__) for row in METAESTIMATORS] -CV_SCORERS: list = [ - { - "cv_estimator": LogisticRegressionCV, - "scorer_name": "scoring", - "routing_methods": ["fit", "score"], - }, -] -CV_SPLITTERS: list = [ - { - "cv_estimator": LogisticRegressionCV, - "splitter_name": "cv", - "routing_methods": ["fit"], - } -] +def get_init_args(metaestimator_info): + """Get the init args for a metaestimator -# IDs used by pytest to get meaningful verbose messages when running the tests -CV_SCORER_IDS = [x["cv_estimator"].__name__ for x in CV_SCORERS] -CV_SPLITTER_IDS = [x["cv_estimator"].__name__ for x in CV_SPLITTERS] + This is a helper function to get the init args for a metaestimator from + the METAESTIMATORS list. It returns an empty dict if no init args are + required. + + Returns + ------- + kwargs : dict + The init args for the metaestimator. + + (estimator, estimator_registry) : (estimator, registry) + The sub-estimator and the corresponding registry. + + (scorer, scorer_registry) : (scorer, registry) + The scorer and the corresponding registry. + + (cv, cv_registry) : (CV splitter, registry) + The CV splitter and the corresponding registry. 
+ """ + kwargs = metaestimator_info.get("init_args", {}) + estimator, estimator_registry = None, None + scorer, scorer_registry = None, None + cv, cv_registry = None, None + if "estimator" in metaestimator_info: + estimator_name = metaestimator_info["estimator_name"] + estimator_registry = _Registry() + estimator = metaestimator_info["estimator"](estimator_registry) + kwargs[estimator_name] = estimator + if "scorer_name" in metaestimator_info: + scorer_name = metaestimator_info["scorer_name"] + scorer_registry = _Registry() + scorer = ConsumingScorer(registry=scorer_registry) + kwargs[scorer_name] = scorer + if "cv_name" in metaestimator_info: + cv_name = metaestimator_info["cv_name"] + cv_registry = _Registry() + cv = ConsumingSplitter(registry=cv_registry) + kwargs[cv_name] = cv + + return ( + kwargs, + (estimator, estimator_registry), + (scorer, scorer_registry), + (cv, cv_registry), + ) def test_registry_copy(): @@ -135,134 +248,170 @@ def test_registry_copy(): assert a is copy.deepcopy(a) -@pytest.mark.parametrize( - "metaestimator", - METAESTIMATORS, - ids=METAESTIMATOR_IDS, -) +@pytest.mark.parametrize("metaestimator", METAESTIMATORS, ids=METAESTIMATOR_IDS) def test_default_request(metaestimator): # Check that by default request is empty and the right type cls = metaestimator["metaestimator"] - estimator = metaestimator["estimator"]() - estimator_name = metaestimator["estimator_name"] - instance = cls(**{estimator_name: estimator}) - assert_request_is_empty(instance.get_metadata_routing()) + kwargs, *_ = get_init_args(metaestimator) + instance = cls(**kwargs) + if "cv_name" in metaestimator: + # Our GroupCV splitters request groups by default, which we should + # ignore in this test. + exclude = {"splitter": ["split"]} + else: + exclude = None + assert_request_is_empty(instance.get_metadata_routing(), exclude=exclude) assert isinstance(instance.get_metadata_routing(), MetadataRouter) -@pytest.mark.parametrize( - "metaestimator", - METAESTIMATORS, - ids=METAESTIMATOR_IDS, -) -def test_error_on_missing_requests(metaestimator): - # Test that a UnsetMetadataPassedError is raised when it should. 
+@pytest.mark.parametrize("metaestimator", METAESTIMATORS, ids=METAESTIMATOR_IDS) +def test_error_on_missing_requests_for_sub_estimator(metaestimator): + # Test that a UnsetMetadataPassedError is raised when the sub-estimator's + # requests are not set + if "estimator" not in metaestimator: + # This test only makes sense for metaestimators which have a + # sub-estimator + return + cls = metaestimator["metaestimator"] - estimator = metaestimator["estimator"]() - estimator_name = metaestimator["estimator_name"] X = metaestimator["X"] y = metaestimator["y"] - routing_methods = metaestimator["routing_methods"] + routing_methods = metaestimator["estimator_routing_methods"] for method_name in routing_methods: for key in ["sample_weight", "metadata"]: + kwargs, (estimator, _), (scorer, _), *_ = get_init_args(metaestimator) + if scorer: + scorer.set_score_request(**{key: True}) val = {"sample_weight": sample_weight, "metadata": metadata}[key] - kwargs = {key: val} + method_kwargs = {key: val} msg = ( f"[{key}] are passed but are not explicitly set as requested or not" f" for {estimator.__class__.__name__}.{method_name}" ) - instance = cls(**{estimator_name: estimator}) - if "fit" not in method_name: # instance needs to be fitted first - instance.fit(X, y) # pragma: no cover + instance = cls(**kwargs) with pytest.raises(UnsetMetadataPassedError, match=re.escape(msg)): method = getattr(instance, method_name) - method(X, y, **kwargs) + method(X, y, **method_kwargs) -@pytest.mark.parametrize( - "metaestimator", - METAESTIMATORS, - ids=METAESTIMATOR_IDS, -) -def test_setting_request_removes_error(metaestimator): - # When the metadata is explicitly requested, there should be no errors. +@pytest.mark.parametrize("metaestimator", METAESTIMATORS, ids=METAESTIMATOR_IDS) +def test_setting_request_on_sub_estimator_removes_error(metaestimator): + # When the metadata is explicitly requested on the sub-estimator, there + # should be no errors. + if "estimator" not in metaestimator: + # This test only makes sense for metaestimators which have a + # sub-estimator + return + def set_request(estimator, method_name): # e.g. 
call set_fit_request on estimator set_request_for_method = getattr(estimator, f"set_{method_name}_request") set_request_for_method(sample_weight=True, metadata=True) cls = metaestimator["metaestimator"] - estimator_name = metaestimator["estimator_name"] X = metaestimator["X"] y = metaestimator["y"] - routing_methods = metaestimator["routing_methods"] + routing_methods = metaestimator["estimator_routing_methods"] preserves_metadata = metaestimator.get("preserves_metadata", True) for method_name in routing_methods: for key in ["sample_weight", "metadata"]: val = {"sample_weight": sample_weight, "metadata": metadata}[key] - kwargs = {key: val} + method_kwargs = {key: val} - registry = _Registry() - estimator = metaestimator["estimator"](registry=registry) + kwargs, (estimator, registry), (scorer, _), (cv, _) = get_init_args( + metaestimator + ) + if scorer: + set_request(scorer, "score") + if cv: + cv.set_split_request(groups=True, metadata=True) set_request(estimator, method_name) - instance = cls(**{estimator_name: estimator}) + instance = cls(**kwargs) method = getattr(instance, method_name) - method(X, y, **kwargs) + method(X, y, **method_kwargs) - if preserves_metadata: - # sanity check that registry is not empty, or else the test - # passes trivially - assert registry + # sanity check that registry is not empty, or else the test passes + # trivially + assert registry + if preserves_metadata is True: + for estimator in registry: + check_recorded_metadata(estimator, method_name, **method_kwargs) + elif preserves_metadata == "subset": for estimator in registry: - check_recorded_metadata(estimator, method_name, **kwargs) + check_recorded_metadata( + estimator, + method_name, + split_params=method_kwargs.keys(), + **method_kwargs, + ) -@pytest.mark.parametrize("cv_scorer", CV_SCORERS, ids=CV_SCORER_IDS) -def test_metadata_is_routed_correctly_to_scorer(cv_scorer): +@pytest.mark.parametrize("metaestimator", METAESTIMATORS, ids=METAESTIMATOR_IDS) +def test_metadata_is_routed_correctly_to_scorer(metaestimator): """Test that any requested metadata is correctly routed to the underlying scorers in CV estimators. 
""" - registry = _Registry() - cls = cv_scorer["cv_estimator"] - scorer_name = cv_scorer["scorer_name"] - scorer = ConsumingScorer(registry=registry) - scorer.set_score_request(sample_weight=True) - routing_methods = cv_scorer["routing_methods"] + if "scorer_name" not in metaestimator: + # This test only makes sense for CV estimators + return + + cls = metaestimator["metaestimator"] + routing_methods = metaestimator["scorer_routing_methods"] for method_name in routing_methods: - instance = cls(**{scorer_name: scorer}) + kwargs, (estimator, _), (scorer, registry), (cv, _) = get_init_args( + metaestimator + ) + if estimator: + estimator.set_fit_request(sample_weight=True, metadata=True) + scorer.set_score_request(sample_weight=True) + if cv: + cv.set_split_request(groups=True, metadata=True) + instance = cls(**kwargs) method = getattr(instance, method_name) - kwargs = {"sample_weight": sample_weight} - if "fit" not in method_name: # instance needs to be fitted first + method_kwargs = {"sample_weight": sample_weight} + if "fit" not in method_name: instance.fit(X, y) - method(X, y, **kwargs) + method(X, y, **method_kwargs) + + assert registry for _scorer in registry: check_recorded_metadata( obj=_scorer, method="score", split_params=("sample_weight",), - **kwargs, + **method_kwargs, ) -@pytest.mark.parametrize("cv_splitter", CV_SPLITTERS, ids=CV_SPLITTER_IDS) -def test_metadata_is_routed_correctly_to_splitter(cv_splitter): +@pytest.mark.parametrize("metaestimator", METAESTIMATORS, ids=METAESTIMATOR_IDS) +def test_metadata_is_routed_correctly_to_splitter(metaestimator): """Test that any requested metadata is correctly routed to the underlying splitters in CV estimators. """ - registry = _Registry() - cls = cv_splitter["cv_estimator"] - splitter_name = cv_splitter["splitter_name"] - splitter = ConsumingSplitter(registry=registry) - routing_methods = cv_splitter["routing_methods"] + if "cv_routing_methods" not in metaestimator: + # This test is only for metaestimators accepting a CV splitter + return + + cls = metaestimator["metaestimator"] + routing_methods = metaestimator["cv_routing_methods"] for method_name in routing_methods: - instance = cls(**{splitter_name: splitter}) + kwargs, (estimator, _), (scorer, _), (cv, registry) = get_init_args( + metaestimator + ) + if estimator: + estimator.set_fit_request(sample_weight=False, metadata=False) + if scorer: + scorer.set_score_request(sample_weight=False, metadata=False) + cv.set_split_request(groups=True, metadata=True) + instance = cls(**kwargs) + method_kwargs = {"groups": groups, "metadata": metadata} method = getattr(instance, method_name) - kwargs = {"groups": groups} - method(X, y, **kwargs) + method(X, y, **method_kwargs) + assert registry for _splitter in registry: - check_recorded_metadata(obj=_splitter, method="split", **kwargs) + check_recorded_metadata(obj=_splitter, method="split", **method_kwargs) diff --git a/sklearn/utils/_metadata_requests.py b/sklearn/utils/_metadata_requests.py index cafece0406e82..9c53e21c7286c 100644 --- a/sklearn/utils/_metadata_requests.py +++ b/sklearn/utils/_metadata_requests.py @@ -985,7 +985,7 @@ def route_params(self, *, caller, params): def validate_metadata(self, *, method, params): """Validate given metadata for a method. - This raises a ``ValueError`` if some of the passed metadata are not + This raises a ``TypeError`` if some of the passed metadata are not understood by child objects. 
Parameters @@ -1010,8 +1010,8 @@ def validate_metadata(self, *, method, params): extra_keys = set(params.keys()) - param_names - self_params if extra_keys: raise TypeError( - f"{method} got unexpected argument(s) {extra_keys}, which are " - "not requested metadata in any object." + f"{self.owner}.{method} got unexpected argument(s) {extra_keys}, which" + " are not requested metadata in any object." ) def _serialize(self): From 8ed76c74705276b32c565c2dfeb5f1e952a59487 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 11 Sep 2023 17:37:40 +0200 Subject: [PATCH 16/21] MAINT cython typedefs in _cd_fast (#27334) --- sklearn/linear_model/_cd_fast.pyx | 28 +++++++++++++--------------- sklearn/utils/_typedefs.pxd | 1 + sklearn/utils/_typedefs.pyx | 1 + 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/sklearn/linear_model/_cd_fast.pyx b/sklearn/linear_model/_cd_fast.pyx index 38ba169ae8015..37b051a3ef6e3 100644 --- a/sklearn/linear_model/_cd_fast.pyx +++ b/sklearn/linear_model/_cd_fast.pyx @@ -14,15 +14,13 @@ from cython cimport floating import warnings from ..exceptions import ConvergenceWarning -from ..utils._cython_blas cimport (_axpy, _dot, _asum, _gemv, _nrm2, - _copy, _scal) +from ..utils._cython_blas cimport ( + _axpy, _dot, _asum, _gemv, _nrm2, _copy, _scal +) from ..utils._cython_blas cimport ColMajor, Trans, NoTrans - - +from ..utils._typedefs cimport uint32_t from ..utils._random cimport our_rand_r -ctypedef cnp.float64_t DOUBLE -ctypedef cnp.uint32_t UINT32_t cnp.import_array() @@ -37,7 +35,7 @@ cdef enum: RAND_R_MAX = 2147483647 -cdef inline UINT32_t rand_int(UINT32_t end, UINT32_t* random_state) noexcept nogil: +cdef inline uint32_t rand_int(uint32_t end, uint32_t* random_state) noexcept nogil: """Generate a random integer in [0; end).""" return our_rand_r(random_state) % end @@ -156,8 +154,8 @@ def enet_coordinate_descent( cdef unsigned int ii cdef unsigned int n_iter = 0 cdef unsigned int f_iter - cdef UINT32_t rand_r_state_seed = rng.randint(0, RAND_R_MAX) - cdef UINT32_t* rand_r_state = &rand_r_state_seed + cdef uint32_t rand_r_state_seed = rng.randint(0, RAND_R_MAX) + cdef uint32_t* rand_r_state = &rand_r_state_seed if alpha == 0 and beta == 0: warnings.warn("Coordinate descent with no regularization may lead to " @@ -370,8 +368,8 @@ def sparse_enet_coordinate_descent( cdef unsigned int jj cdef unsigned int n_iter = 0 cdef unsigned int f_iter - cdef UINT32_t rand_r_state_seed = rng.randint(0, RAND_R_MAX) - cdef UINT32_t* rand_r_state = &rand_r_state_seed + cdef uint32_t rand_r_state_seed = rng.randint(0, RAND_R_MAX) + cdef uint32_t* rand_r_state = &rand_r_state_seed cdef bint center = False cdef bint no_sample_weights = sample_weight is None cdef int kk @@ -628,8 +626,8 @@ def enet_coordinate_descent_gram( cdef unsigned int ii cdef unsigned int n_iter = 0 cdef unsigned int f_iter - cdef UINT32_t rand_r_state_seed = rng.randint(0, RAND_R_MAX) - cdef UINT32_t* rand_r_state = &rand_r_state_seed + cdef uint32_t rand_r_state_seed = rng.randint(0, RAND_R_MAX) + cdef uint32_t* rand_r_state = &rand_r_state_seed cdef floating y_norm2 = np.dot(y, y) cdef floating* w_ptr = &w[0] @@ -808,8 +806,8 @@ def enet_coordinate_descent_multi_task( cdef unsigned int jj cdef unsigned int n_iter = 0 cdef unsigned int f_iter - cdef UINT32_t rand_r_state_seed = rng.randint(0, RAND_R_MAX) - cdef UINT32_t* rand_r_state = &rand_r_state_seed + cdef uint32_t rand_r_state_seed = rng.randint(0, RAND_R_MAX) + cdef uint32_t* rand_r_state = &rand_r_state_seed cdef const floating* X_ptr = 
&X[0, 0]
     cdef const floating* Y_ptr = &Y[0, 0]
diff --git a/sklearn/utils/_typedefs.pxd b/sklearn/utils/_typedefs.pxd
index 0ef8f35b93a96..254aef54aeda1 100644
--- a/sklearn/utils/_typedefs.pxd
+++ b/sklearn/utils/_typedefs.pxd
@@ -15,6 +15,7 @@
 #   use these consistently throughout the codebase.
 # NOTE: Extend this list as needed when converting more cython extensions.
 ctypedef unsigned char uint8_t
+ctypedef unsigned int uint32_t
 ctypedef Py_ssize_t intp_t
 ctypedef float float32_t
 ctypedef double float64_t
diff --git a/sklearn/utils/_typedefs.pyx b/sklearn/utils/_typedefs.pyx
index 22e18cdae8d2e..90039ae7f7bcc 100644
--- a/sklearn/utils/_typedefs.pyx
+++ b/sklearn/utils/_typedefs.pyx
@@ -8,6 +8,7 @@ import numpy as np

 ctypedef fused testing_type_t:
     uint8_t
+    uint32_t
     intp_t
     float32_t
     float64_t

From c5f22295029575f050e5ea007cd60130d8538ec2 Mon Sep 17 00:00:00 2001
From: Christian Lorentzen
Date: Mon, 11 Sep 2023 18:01:02 +0200
Subject: [PATCH 17/21] MAINT better fused type names in loss module (#27330)

---
 sklearn/_loss/_loss.pxd    |  12 ++-
 sklearn/_loss/_loss.pyx.tp | 168 +++++++++++++++++++------------------
 2 files changed, 93 insertions(+), 87 deletions(-)

diff --git a/sklearn/_loss/_loss.pxd b/sklearn/_loss/_loss.pxd
index b1ddbadcc5f2c..69bef42b9ed6e 100644
--- a/sklearn/_loss/_loss.pxd
+++ b/sklearn/_loss/_loss.pxd
@@ -1,13 +1,17 @@
 # cython: language_level=3

-# Fused types for y_true, y_pred, raw_prediction
-ctypedef fused Y_DTYPE_C:
+# Fused types for input like y_true, raw_prediction, sample_weights.
+ctypedef fused floating_in:
     double
     float

-# Fused types for gradient and hessian
-ctypedef fused G_DTYPE_C:
+# Fused types for output like gradient and hessian
+# We use different fused types for input (floating_in) and output (floating_out),
+# such that input and output can have different dtypes in the same function call. A
+# single fused type can only take on a single value (type) for all arguments in one
+# function call.
+ctypedef fused floating_out:
     double
     float

diff --git a/sklearn/_loss/_loss.pyx.tp b/sklearn/_loss/_loss.pyx.tp
index d01ff43bdc0b4..8efeeea77f0e6 100644
--- a/sklearn/_loss/_loss.pyx.tp
+++ b/sklearn/_loss/_loss.pyx.tp
@@ -268,8 +268,8 @@ cdef inline double log1pexp(double x) noexcept nogil:

 cdef inline void sum_exp_minus_max(
     const int i,
-    const Y_DTYPE_C[:, :] raw_prediction,  # IN
-    Y_DTYPE_C *p                           # OUT
+    const floating_in[:, :] raw_prediction,  # IN
+    floating_in *p                            # OUT
 ) noexcept nogil:
     # Thread local buffers are used to store results of this function via p.
     # The results are stored as follows:
@@ -744,7 +744,7 @@ cdef inline double_pair cgrad_hess_half_binomial(
     double raw_prediction
 ) noexcept nogil:
     # with y_pred = expit(raw)
-    # hessian = y_pred * (1 - y_pred) = exp(raw) / (1 + exp(raw))**2
+    # hessian = y_pred * (1 - y_pred) = exp( raw) / (1 + exp( raw))**2
     #                                 = exp(-raw) / (1 + exp(-raw))**2
     cdef double_pair gh
     gh.val2 = exp(-raw_prediction)  # used as temporary
@@ -835,7 +835,9 @@ cdef class CyLossFunction:
         """
         pass

-    cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil:
+    cdef double_pair cy_grad_hess(
+        self, double y_true, double raw_prediction
+    ) noexcept nogil:
        """Compute gradient and hessian.

         Gradient and hessian of loss w.r.t. raw_prediction for a single sample.
@@ -862,10 +864,10 @@ cdef class CyLossFunction: def loss( self, - const Y_DTYPE_C[::1] y_true, # IN - const Y_DTYPE_C[::1] raw_prediction, # IN - const Y_DTYPE_C[::1] sample_weight, # IN - G_DTYPE_C[::1] loss_out, # OUT + const floating_in[::1] y_true, # IN + const floating_in[::1] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + floating_out[::1] loss_out, # OUT int n_threads=1 ): """Compute the pointwise loss value for each input. @@ -892,10 +894,10 @@ cdef class CyLossFunction: def gradient( self, - const Y_DTYPE_C[::1] y_true, # IN - const Y_DTYPE_C[::1] raw_prediction, # IN - const Y_DTYPE_C[::1] sample_weight, # IN - G_DTYPE_C[::1] gradient_out, # OUT + const floating_in[::1] y_true, # IN + const floating_in[::1] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + floating_out[::1] gradient_out, # OUT int n_threads=1 ): """Compute gradient of loss w.r.t raw_prediction for each input. @@ -922,11 +924,11 @@ cdef class CyLossFunction: def loss_gradient( self, - const Y_DTYPE_C[::1] y_true, # IN - const Y_DTYPE_C[::1] raw_prediction, # IN - const Y_DTYPE_C[::1] sample_weight, # IN - G_DTYPE_C[::1] loss_out, # OUT - G_DTYPE_C[::1] gradient_out, # OUT + const floating_in[::1] y_true, # IN + const floating_in[::1] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + floating_out[::1] loss_out, # OUT + floating_out[::1] gradient_out, # OUT int n_threads=1 ): """Compute loss and gradient of loss w.r.t raw_prediction. @@ -960,11 +962,11 @@ cdef class CyLossFunction: def gradient_hessian( self, - const Y_DTYPE_C[::1] y_true, # IN - const Y_DTYPE_C[::1] raw_prediction, # IN - const Y_DTYPE_C[::1] sample_weight, # IN - G_DTYPE_C[::1] gradient_out, # OUT - G_DTYPE_C[::1] hessian_out, # OUT + const floating_in[::1] y_true, # IN + const floating_in[::1] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + floating_out[::1] gradient_out, # OUT + floating_out[::1] hessian_out, # OUT int n_threads=1 ): """Compute gradient and hessian of loss w.r.t raw_prediction. 
@@ -1022,10 +1024,10 @@ cdef class {{name}}(CyLossFunction): def loss( self, - const Y_DTYPE_C[::1] y_true, # IN - const Y_DTYPE_C[::1] raw_prediction, # IN - const Y_DTYPE_C[::1] sample_weight, # IN - G_DTYPE_C[::1] loss_out, # OUT + const floating_in[::1] y_true, # IN + const floating_in[::1] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + floating_out[::1] loss_out, # OUT int n_threads=1 ): cdef: @@ -1048,11 +1050,11 @@ cdef class {{name}}(CyLossFunction): {{if closs_grad is not None}} def loss_gradient( self, - const Y_DTYPE_C[::1] y_true, # IN - const Y_DTYPE_C[::1] raw_prediction, # IN - const Y_DTYPE_C[::1] sample_weight, # IN - G_DTYPE_C[::1] loss_out, # OUT - G_DTYPE_C[::1] gradient_out, # OUT + const floating_in[::1] y_true, # IN + const floating_in[::1] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + floating_out[::1] loss_out, # OUT + floating_out[::1] gradient_out, # OUT int n_threads=1 ): cdef: @@ -1080,10 +1082,10 @@ cdef class {{name}}(CyLossFunction): def gradient( self, - const Y_DTYPE_C[::1] y_true, # IN - const Y_DTYPE_C[::1] raw_prediction, # IN - const Y_DTYPE_C[::1] sample_weight, # IN - G_DTYPE_C[::1] gradient_out, # OUT + const floating_in[::1] y_true, # IN + const floating_in[::1] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + floating_out[::1] gradient_out, # OUT int n_threads=1 ): cdef: @@ -1105,11 +1107,11 @@ cdef class {{name}}(CyLossFunction): def gradient_hessian( self, - const Y_DTYPE_C[::1] y_true, # IN - const Y_DTYPE_C[::1] raw_prediction, # IN - const Y_DTYPE_C[::1] sample_weight, # IN - G_DTYPE_C[::1] gradient_out, # OUT - G_DTYPE_C[::1] hessian_out, # OUT + const floating_in[::1] y_true, # IN + const floating_in[::1] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + floating_out[::1] gradient_out, # OUT + floating_out[::1] hessian_out, # OUT int n_threads=1 ): cdef: @@ -1158,18 +1160,18 @@ cdef class CyHalfMultinomialLoss(CyLossFunction): # opposite are welcome. def loss( self, - const Y_DTYPE_C[::1] y_true, # IN - const Y_DTYPE_C[:, :] raw_prediction, # IN - const Y_DTYPE_C[::1] sample_weight, # IN - G_DTYPE_C[::1] loss_out, # OUT + const floating_in[::1] y_true, # IN + const floating_in[:, :] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + floating_out[::1] loss_out, # OUT int n_threads=1 ): cdef: int i, k int n_samples = y_true.shape[0] int n_classes = raw_prediction.shape[1] - Y_DTYPE_C max_value, sum_exps - Y_DTYPE_C* p # temporary buffer + floating_in max_value, sum_exps + floating_in* p # temporary buffer # We assume n_samples > n_classes. In this case having the inner loop # over n_classes is a good default. @@ -1181,7 +1183,7 @@ cdef class CyHalfMultinomialLoss(CyLossFunction): with nogil, parallel(num_threads=n_threads): # Define private buffer variables as each thread might use its # own. 
- p = malloc(sizeof(Y_DTYPE_C) * (n_classes + 2)) + p = malloc(sizeof(floating_in) * (n_classes + 2)) for i in prange(n_samples, schedule='static'): sum_exp_minus_max(i, raw_prediction, p) @@ -1197,7 +1199,7 @@ cdef class CyHalfMultinomialLoss(CyLossFunction): free(p) else: with nogil, parallel(num_threads=n_threads): - p = malloc(sizeof(Y_DTYPE_C) * (n_classes + 2)) + p = malloc(sizeof(floating_in) * (n_classes + 2)) for i in prange(n_samples, schedule='static'): sum_exp_minus_max(i, raw_prediction, p) @@ -1218,26 +1220,26 @@ cdef class CyHalfMultinomialLoss(CyLossFunction): def loss_gradient( self, - const Y_DTYPE_C[::1] y_true, # IN - const Y_DTYPE_C[:, :] raw_prediction, # IN - const Y_DTYPE_C[::1] sample_weight, # IN - G_DTYPE_C[::1] loss_out, # OUT - G_DTYPE_C[:, :] gradient_out, # OUT + const floating_in[::1] y_true, # IN + const floating_in[:, :] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + floating_out[::1] loss_out, # OUT + floating_out[:, :] gradient_out, # OUT int n_threads=1 ): cdef: int i, k int n_samples = y_true.shape[0] int n_classes = raw_prediction.shape[1] - Y_DTYPE_C max_value, sum_exps - Y_DTYPE_C* p # temporary buffer + floating_in max_value, sum_exps + floating_in* p # temporary buffer if sample_weight is None: # inner loop over n_classes with nogil, parallel(num_threads=n_threads): # Define private buffer variables as each thread might use its # own. - p = malloc(sizeof(Y_DTYPE_C) * (n_classes + 2)) + p = malloc(sizeof(floating_in) * (n_classes + 2)) for i in prange(n_samples, schedule='static'): sum_exp_minus_max(i, raw_prediction, p) @@ -1256,7 +1258,7 @@ cdef class CyHalfMultinomialLoss(CyLossFunction): free(p) else: with nogil, parallel(num_threads=n_threads): - p = malloc(sizeof(Y_DTYPE_C) * (n_classes + 2)) + p = malloc(sizeof(floating_in) * (n_classes + 2)) for i in prange(n_samples, schedule='static'): sum_exp_minus_max(i, raw_prediction, p) @@ -1280,25 +1282,25 @@ cdef class CyHalfMultinomialLoss(CyLossFunction): def gradient( self, - const Y_DTYPE_C[::1] y_true, # IN - const Y_DTYPE_C[:, :] raw_prediction, # IN - const Y_DTYPE_C[::1] sample_weight, # IN - G_DTYPE_C[:, :] gradient_out, # OUT + const floating_in[::1] y_true, # IN + const floating_in[:, :] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + floating_out[:, :] gradient_out, # OUT int n_threads=1 ): cdef: int i, k int n_samples = y_true.shape[0] int n_classes = raw_prediction.shape[1] - Y_DTYPE_C sum_exps - Y_DTYPE_C* p # temporary buffer + floating_in sum_exps + floating_in* p # temporary buffer if sample_weight is None: # inner loop over n_classes with nogil, parallel(num_threads=n_threads): # Define private buffer variables as each thread might use its # own. 
- p = malloc(sizeof(Y_DTYPE_C) * (n_classes + 2)) + p = malloc(sizeof(floating_in) * (n_classes + 2)) for i in prange(n_samples, schedule='static'): sum_exp_minus_max(i, raw_prediction, p) @@ -1312,7 +1314,7 @@ cdef class CyHalfMultinomialLoss(CyLossFunction): free(p) else: with nogil, parallel(num_threads=n_threads): - p = malloc(sizeof(Y_DTYPE_C) * (n_classes + 2)) + p = malloc(sizeof(floating_in) * (n_classes + 2)) for i in prange(n_samples, schedule='static'): sum_exp_minus_max(i, raw_prediction, p) @@ -1329,26 +1331,26 @@ cdef class CyHalfMultinomialLoss(CyLossFunction): def gradient_hessian( self, - const Y_DTYPE_C[::1] y_true, # IN - const Y_DTYPE_C[:, :] raw_prediction, # IN - const Y_DTYPE_C[::1] sample_weight, # IN - G_DTYPE_C[:, :] gradient_out, # OUT - G_DTYPE_C[:, :] hessian_out, # OUT + const floating_in[::1] y_true, # IN + const floating_in[:, :] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + floating_out[:, :] gradient_out, # OUT + floating_out[:, :] hessian_out, # OUT int n_threads=1 ): cdef: int i, k int n_samples = y_true.shape[0] int n_classes = raw_prediction.shape[1] - Y_DTYPE_C sum_exps - Y_DTYPE_C* p # temporary buffer + floating_in sum_exps + floating_in* p # temporary buffer if sample_weight is None: # inner loop over n_classes with nogil, parallel(num_threads=n_threads): # Define private buffer variables as each thread might use its # own. - p = malloc(sizeof(Y_DTYPE_C) * (n_classes + 2)) + p = malloc(sizeof(floating_in) * (n_classes + 2)) for i in prange(n_samples, schedule='static'): sum_exp_minus_max(i, raw_prediction, p) @@ -1364,7 +1366,7 @@ cdef class CyHalfMultinomialLoss(CyLossFunction): free(p) else: with nogil, parallel(num_threads=n_threads): - p = malloc(sizeof(Y_DTYPE_C) * (n_classes + 2)) + p = malloc(sizeof(floating_in) * (n_classes + 2)) for i in prange(n_samples, schedule='static'): sum_exp_minus_max(i, raw_prediction, p) @@ -1387,26 +1389,26 @@ cdef class CyHalfMultinomialLoss(CyLossFunction): # diagonal (in the classes) approximation as implemented above. def gradient_proba( self, - const Y_DTYPE_C[::1] y_true, # IN - const Y_DTYPE_C[:, :] raw_prediction, # IN - const Y_DTYPE_C[::1] sample_weight, # IN - G_DTYPE_C[:, :] gradient_out, # OUT - G_DTYPE_C[:, :] proba_out, # OUT + const floating_in[::1] y_true, # IN + const floating_in[:, :] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + floating_out[:, :] gradient_out, # OUT + floating_out[:, :] proba_out, # OUT int n_threads=1 ): cdef: int i, k int n_samples = y_true.shape[0] int n_classes = raw_prediction.shape[1] - Y_DTYPE_C sum_exps - Y_DTYPE_C* p # temporary buffer + floating_in sum_exps + floating_in* p # temporary buffer if sample_weight is None: # inner loop over n_classes with nogil, parallel(num_threads=n_threads): # Define private buffer variables as each thread might use its # own. 
-                p = malloc(sizeof(Y_DTYPE_C) * (n_classes + 2))
+                p = malloc(sizeof(floating_in) * (n_classes + 2))

                 for i in prange(n_samples, schedule='static'):
                     sum_exp_minus_max(i, raw_prediction, p)
@@ -1420,7 +1422,7 @@
             free(p)
         else:
             with nogil, parallel(num_threads=n_threads):
-                p = malloc(sizeof(Y_DTYPE_C) * (n_classes + 2))
+                p = malloc(sizeof(floating_in) * (n_classes + 2))

                 for i in prange(n_samples, schedule='static'):
                     sum_exp_minus_max(i, raw_prediction, p)

From 3f110690d084bde4ef02b290f173a473ff930a0f Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Mon, 11 Sep 2023 18:26:17 +0200
Subject: [PATCH 18/21] DOC Rework outlier detection estimators example
 (#25878)

Co-authored-by: ArturoAmorQ
Co-authored-by: Tim Head
Co-authored-by: Julien Jerphanion
Co-authored-by: Guillaume Lemaitre
---
 .../plot_outlier_detection_bench.py           | 562 +++++++++++++-----
 .../preprocessing/plot_scaling_importance.py  |   2 +
 2 files changed, 410 insertions(+), 154 deletions(-)

diff --git a/examples/miscellaneous/plot_outlier_detection_bench.py b/examples/miscellaneous/plot_outlier_detection_bench.py
index a0901a31b8a2e..b4fae93131971 100644
--- a/examples/miscellaneous/plot_outlier_detection_bench.py
+++ b/examples/miscellaneous/plot_outlier_detection_bench.py
@@ -3,197 +3,451 @@
 Evaluation of outlier detection estimators
 ==========================================

-This example benchmarks outlier detection algorithms, :ref:`local_outlier_factor`
-(LOF) and :ref:`isolation_forest` (IForest), using ROC curves on
-classical anomaly detection datasets. The algorithm performance
-is assessed in an outlier detection context:
+This example compares two outlier detection algorithms, namely
+:ref:`local_outlier_factor` (LOF) and :ref:`isolation_forest` (IForest), on
+real-world datasets available in :class:`sklearn.datasets`. The goal is to show
+that different algorithms perform well on different datasets.

-1. The algorithms are trained on the whole dataset which is assumed to
-contain outliers.
+The algorithms are trained in an outlier detection context:

-2. The ROC curve from :class:`~sklearn.metrics.RocCurveDisplay` is computed
-on the same dataset using the knowledge of the labels.
+1. The ROC curves are computed using knowledge of the ground-truth labels
+and displayed using :class:`~sklearn.metrics.RocCurveDisplay`.
+
+2. The performance is assessed in terms of the ROC-AUC.
 """

 # Author: Pharuj Rajborirug
+#         Arturo Amor
 # License: BSD 3 clause

-print(__doc__)
+# %%
+# Dataset preprocessing and model training
+# ========================================
+#
+# Different outlier detection models require different preprocessing. In the
+# presence of categorical variables,
+# :class:`~sklearn.preprocessing.OrdinalEncoder` is often a good strategy for
+# tree-based models such as :class:`~sklearn.ensemble.IsolationForest`, whereas
+# neighbors-based models such as :class:`~sklearn.neighbors.LocalOutlierFactor`
+# would be impacted by the ordering induced by ordinal encoding. To avoid
+# inducing an ordering, one should rather use
+# :class:`~sklearn.preprocessing.OneHotEncoder`.
+#
+# Neighbors-based models may also require scaling of the numerical features (see
+# for instance :ref:`neighbors_scaling`). In the presence of outliers, a good
+# option is to use a :class:`~sklearn.preprocessing.RobustScaler`.
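+
+# %%
+# As a minimal sketch of why the choice of encoder matters for neighbors-based
+# models (the three category values below are made up for illustration),
+# ordinal encoding places "c" twice as far from "a" as "b" is, whereas one-hot
+# encoding keeps all categories equidistant:
+
+import numpy as np
+
+from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
+
+categories = np.array([["a"], ["b"], ["c"]])
+print(OrdinalEncoder().fit_transform(categories))  # [[0.], [1.], [2.]]
+print(OneHotEncoder(sparse_output=False).fit_transform(categories))  # identity matrix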
+
+from sklearn.compose import ColumnTransformer
+from sklearn.ensemble import IsolationForest
+from sklearn.neighbors import LocalOutlierFactor
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import (
+    OneHotEncoder,
+    OrdinalEncoder,
+    RobustScaler,
+)
+
+
+def make_estimator(name, categorical_columns=None, iforest_kw=None, lof_kw=None):
+    """Create an outlier detection estimator based on its name."""
+    if name == "LOF":
+        outlier_detector = LocalOutlierFactor(**(lof_kw or {}))
+        if categorical_columns is None:
+            preprocessor = RobustScaler()
+        else:
+            preprocessor = ColumnTransformer(
+                transformers=[("categorical", OneHotEncoder(), categorical_columns)],
+                remainder=RobustScaler(),
+            )
+    else:  # name == "IForest"
+        outlier_detector = IsolationForest(**(iforest_kw or {}))
+        if categorical_columns is None:
+            preprocessor = None
+        else:
+            ordinal_encoder = OrdinalEncoder(
+                handle_unknown="use_encoded_value", unknown_value=-1
+            )
+            preprocessor = ColumnTransformer(
+                transformers=[
+                    ("categorical", ordinal_encoder, categorical_columns),
+                ],
+                remainder="passthrough",
+            )
+
+    return make_pipeline(preprocessor, outlier_detector)
+

 # %%
+# The following `fit_predict` function returns the average outlier score of X.
+
+from time import perf_counter
+
+
+def fit_predict(estimator, X):
+    tic = perf_counter()
+    if estimator[-1].__class__.__name__ == "LocalOutlierFactor":
+        estimator.fit(X)
+        y_pred = estimator[-1].negative_outlier_factor_
+    else:  # "IsolationForest"
+        y_pred = estimator.fit(X).decision_function(X)
+    toc = perf_counter()
+    print(f"Duration for {model_name}: {toc - tic:.2f} s")
+    return y_pred
+
+
+# %%
+# In the rest of this example, we process one dataset per section. After loading
+# the data, the targets are modified to consist of two classes: 0 representing
+# inliers and 1 representing outliers. Due to computational constraints of the
+# scikit-learn documentation, the sample size of some datasets is reduced using
+# a stratified :class:`~sklearn.model_selection.train_test_split`.
+#
+# Furthermore, we set `n_neighbors` to match the expected number of anomalies
+# `expected_n_anomalies = n_samples * expected_anomaly_fraction`. This is a good
+# heuristic as long as the proportion of outliers is not very low, the reason
+# being that `n_neighbors` should be greater than the number of samples in the
+# less populated cluster (see
+# :ref:`sphx_glr_auto_examples_neighbors_plot_lof_outlier_detection.py`).
+#
+# KDDCup99 - SA dataset
+# ---------------------
+#
+# The :ref:`kddcup99_dataset` was generated using a closed network and
+# hand-injected attacks. The SA dataset is a subset of it obtained by simply
+# selecting all the normal data and an anomaly proportion of around 3%.
+# %% import numpy as np -import pandas as pd - -from sklearn.datasets import fetch_covtype, fetch_kddcup99, fetch_openml -from sklearn.preprocessing import LabelBinarizer - -rng = np.random.RandomState(42) - - -def preprocess_dataset(dataset_name): - # loading and vectorization - print(f"Loading {dataset_name} data") - if dataset_name in ["http", "smtp", "SA", "SF"]: - dataset = fetch_kddcup99(subset=dataset_name, percent10=True, random_state=rng) - X = dataset.data - y = dataset.target - lb = LabelBinarizer() - - if dataset_name == "SF": - idx = rng.choice(X.shape[0], int(X.shape[0] * 0.1), replace=False) - X = X[idx] # reduce the sample size - y = y[idx] - x1 = lb.fit_transform(X[:, 1].astype(str)) - X = np.c_[X[:, :1], x1, X[:, 2:]] - elif dataset_name == "SA": - idx = rng.choice(X.shape[0], int(X.shape[0] * 0.1), replace=False) - X = X[idx] # reduce the sample size - y = y[idx] - x1 = lb.fit_transform(X[:, 1].astype(str)) - x2 = lb.fit_transform(X[:, 2].astype(str)) - x3 = lb.fit_transform(X[:, 3].astype(str)) - X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]] - y = (y != b"normal.").astype(int) - if dataset_name == "forestcover": - dataset = fetch_covtype() - X = dataset.data - y = dataset.target - idx = rng.choice(X.shape[0], int(X.shape[0] * 0.1), replace=False) - X = X[idx] # reduce the sample size - y = y[idx] - - # inliers are those with attribute 2 - # outliers are those with attribute 4 - s = (y == 2) + (y == 4) - X = X[s, :] - y = y[s] - y = (y != 2).astype(int) - if dataset_name in ["glass", "wdbc", "cardiotocography"]: - dataset = fetch_openml( - name=dataset_name, version=1, as_frame=False, parser="pandas" - ) - X = dataset.data - y = dataset.target - - if dataset_name == "glass": - s = y == "tableware" - y = s.astype(int) - if dataset_name == "wdbc": - s = y == "2" - y = s.astype(int) - X_mal, y_mal = X[s], y[s] - X_ben, y_ben = X[~s], y[~s] - - # downsampled to 39 points (9.8% outliers) - idx = rng.choice(y_mal.shape[0], 39, replace=False) - X_mal2 = X_mal[idx] - y_mal2 = y_mal[idx] - X = np.concatenate((X_ben, X_mal2), axis=0) - y = np.concatenate((y_ben, y_mal2), axis=0) - if dataset_name == "cardiotocography": - s = y == "3" - y = s.astype(int) - # 0 represents inliers, and 1 represents outliers - y = pd.Series(y, dtype="category") - return (X, y) - - -# %% -# Define an outlier prediction function -# ------------------------------------- -# There is no particular reason to choose algorithms -# :class:`~sklearn.neighbors.LocalOutlierFactor` and -# :class:`~sklearn.ensemble.IsolationForest`. The goal is to show that -# different algorithm performs well on different datasets. The following -# `compute_prediction` function returns average outlier score of X. 
+from sklearn.datasets import fetch_kddcup99
+from sklearn.model_selection import train_test_split
+
+X, y = fetch_kddcup99(
+    subset="SA", percent10=True, random_state=42, return_X_y=True, as_frame=True
+)
+y = (y != b"normal.").astype(np.int32)
+X, _, y, _ = train_test_split(X, y, train_size=0.1, stratify=y, random_state=42)
+n_samples, anomaly_frac = X.shape[0], y.mean()
+print(f"{n_samples} datapoints with {y.sum()} anomalies ({anomaly_frac:.02%})")
+
+# %%
+# The SA dataset contains 41 features out of which 3 are categorical:
+# "protocol_type", "service" and "flag".
+
+# %%
+y_true = {}
+y_pred = {"LOF": {}, "IForest": {}}
+model_names = ["LOF", "IForest"]
+cat_columns = ["protocol_type", "service", "flag"]
+
+y_true["KDDCup99 - SA"] = y
+for model_name in model_names:
+    model = make_estimator(
+        name=model_name,
+        categorical_columns=cat_columns,
+        lof_kw={"n_neighbors": int(n_samples * anomaly_frac)},
+        iforest_kw={"random_state": 42},
+    )
+    y_pred[model_name]["KDDCup99 - SA"] = fit_predict(model, X)
+
+# %%
+# Forest covertypes dataset
+# -------------------------
+#
+# The :ref:`covtype_dataset` is a multiclass dataset where the target is the
+# dominant species of tree in a given patch of forest. It contains 54 features,
+# some of which ("Wilderness_Area" and "Soil_Type") are already binary encoded.
+# Though originally meant as a classification task, one can regard inliers as
+# samples encoded with label 2 and outliers as those with label 4.
+
+# %%
+from sklearn.datasets import fetch_covtype
+
+X, y = fetch_covtype(return_X_y=True, as_frame=True)
+s = (y == 2) + (y == 4)
+X = X.loc[s]
+y = y.loc[s]
+y = (y != 2).astype(np.int32)
+
+X, _, y, _ = train_test_split(X, y, train_size=0.05, stratify=y, random_state=42)
+X_forestcover = X  # save X for later use
+n_samples, anomaly_frac = X.shape[0], y.mean()
+print(f"{n_samples} datapoints with {y.sum()} anomalies ({anomaly_frac:.02%})")
+
+# %%
+y_true["forestcover"] = y
+for model_name in model_names:
+    model = make_estimator(
+        name=model_name,
+        lof_kw={"n_neighbors": int(n_samples * anomaly_frac)},
+        iforest_kw={"random_state": 42},
+    )
+    y_pred[model_name]["forestcover"] = fit_predict(model, X)
+
+# %%
+# Ames Housing dataset
+# --------------------
+#
+# The `Ames housing dataset `_ is originally a
+# regression dataset where the target is the sales price of houses in Ames,
+# Iowa. Here we convert it into an outlier detection problem by regarding
+# houses with a price over 70 USD/sqft as outliers. To make the problem easier,
+# we drop intermediate prices between 40 and 70 USD/sqft.
+
+# %%
 import matplotlib.pyplot as plt

 from sklearn.datasets import fetch_openml

+X, y = fetch_openml(
+    name="ames_housing", version=1, return_X_y=True, as_frame=True, parser="pandas"
+)
+y = y.div(X["Lot_Area"])
+
+# None values in pandas 1.5.1 were mapped to np.nan in pandas 2.0.1
+X["Misc_Feature"] = X["Misc_Feature"].cat.add_categories("NoInfo").fillna("NoInfo")
+X["Mas_Vnr_Type"] = X["Mas_Vnr_Type"].cat.add_categories("NoInfo").fillna("NoInfo")
+
+X.drop(columns="Lot_Area", inplace=True)
+mask = (y < 40) | (y > 70)
+X = X.loc[mask]
+y = y.loc[mask]
+y.hist(bins=20, edgecolor="black")
+plt.xlabel("House price in USD/sqft")
+_ = plt.title("Distribution of house prices in Ames")

+# %%
+y = (y > 70).astype(np.int32)
+
+n_samples, anomaly_frac = X.shape[0], y.mean()
+print(f"{n_samples} datapoints with {y.sum()} anomalies ({anomaly_frac:.02%})")
+
+# %%
+# The dataset contains 46 categorical features. In this case it is easier to
+# use a :class:`~sklearn.compose.make_column_selector` to find them instead of
+# passing a list made by hand.
+
+# %%
+from sklearn.compose import make_column_selector as selector
+
+categorical_columns_selector = selector(dtype_include="category")
+cat_columns = categorical_columns_selector(X)
+
+y_true["ames_housing"] = y
+for model_name in model_names:
+    model = make_estimator(
+        name=model_name,
+        categorical_columns=cat_columns,
+        lof_kw={"n_neighbors": int(n_samples * anomaly_frac)},
+        iforest_kw={"random_state": 42},
+    )
+    y_pred[model_name]["ames_housing"] = fit_predict(model, X)
+
+# %%
+# Cardiotocography dataset
+# ------------------------
+#
+# The `Cardiotocography dataset `_ is a multiclass
+# dataset of fetal cardiotocograms, the classes being the fetal heart rate (FHR)
+# pattern encoded with labels from 1 to 10. Here we set class 3 (the minority
+# class) to represent the outliers. It contains 30 numerical features, some of
+# which are binary encoded and some are continuous.
+
+# %%
+X, y = fetch_openml(
+    name="cardiotocography", version=1, return_X_y=True, as_frame=False, parser="pandas"
+)
+X_cardiotocography = X  # save X for later use
+s = y == "3"
+y = s.astype(np.int32)
+
+n_samples, anomaly_frac = X.shape[0], y.mean()
+print(f"{n_samples} datapoints with {y.sum()} anomalies ({anomaly_frac:.02%})")
+
+# %%
+y_true["cardiotocography"] = y
+for model_name in model_names:
+    model = make_estimator(
+        name=model_name,
+        lof_kw={"n_neighbors": int(n_samples * anomaly_frac)},
+        iforest_kw={"random_state": 42},
+    )
+    y_pred[model_name]["cardiotocography"] = fit_predict(model, X)
+
+# %%
+# Plot and interpret results
+# ==========================
+#
+# The algorithm performance relates to how good the true positive rate (TPR) is
+# at low values of the false positive rate (FPR). The best algorithms have the
+# curve on the top-left of the plot and the area under curve (AUC) close to 1.
+# The diagonal dashed line represents a random classification of outliers and
+# inliers.
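+
+# %%
+# As a quick sanity check (an illustrative sketch, not part of the benchmark
+# itself), scoring the samples of the last dataset at random should give a ROC
+# AUC close to 0.5, i.e. the chance level marked by the dashed diagonal:
+
+from sklearn.metrics import roc_auc_score
+
+random_scores = np.random.RandomState(42).uniform(size=n_samples)
+print(f"ROC AUC of random scores: {roc_auc_score(y, random_scores):.2f}")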
+
+# %%
 import math

 from sklearn.metrics import RocCurveDisplay

-# plotting parameters
 cols = 2
-linewidth = 1
 pos_label = 0  # means 0 belongs to the positive class
-rows = math.ceil(len(datasets_name) / cols)
-
-fig, axs = plt.subplots(rows, cols, figsize=(10, 10 * 2), sharex=True, sharey=True)
+datasets_names = y_true.keys()
+rows = math.ceil(len(datasets_names) / cols)

-for ax, dataset_name in zip(axs.ravel(), datasets_name):
-    (X, y) = preprocess_dataset(dataset_name=dataset_name)
+fig, axs = plt.subplots(nrows=rows, ncols=cols, squeeze=False, figsize=(10, rows * 4))

-    for model_idx, model_name in enumerate(models_name):
-        y_pred = compute_prediction(X, model_name=model_name)
+for ax, dataset_name in zip(axs.ravel(), datasets_names):
+    for model_idx, model_name in enumerate(model_names):
         display = RocCurveDisplay.from_predictions(
-            y,
-            y_pred,
+            y_true[dataset_name],
+            y_pred[model_name][dataset_name],
             pos_label=pos_label,
             name=model_name,
-            linewidth=linewidth,
             ax=ax,
-            plot_chance_level=(model_idx == len(models_name) - 1),
-            chance_level_kw={
-                "linewidth": linewidth,
-                "linestyle": ":",
-            },
+            plot_chance_level=(model_idx == len(model_names) - 1),
+            chance_level_kw={"linestyle": ":"},
         )
         ax.set_title(dataset_name)
-    ax.legend(loc="lower right")
-plt.tight_layout(pad=2.0)  # spacing between subplots
+_ = plt.tight_layout(pad=2.0)  # spacing between subplots
+
+# %%
+# We observe that once the number of neighbors is tuned, LOF and IForest perform
+# similarly in terms of ROC AUC for the forestcover and cardiotocography
+# datasets. The score for IForest is slightly better for the SA dataset and LOF
+# performs considerably better on the Ames housing dataset than IForest.
+#
+# Ablation study
+# ==============
+#
+# In this section we explore the impact of the hyperparameter `n_neighbors` and
+# the choice of scaling the numerical variables on the LOF model. Here we use
+# the :ref:`covtype_dataset` dataset as the binary encoded categories introduce
+# a natural scale of Euclidean distances between 0 and 1. We then want a scaling
+# method that avoids granting a privilege to non-binary features and that is
+# robust enough to outliers so that the task of finding them does not become too
+# difficult.
+
+# %%
+X = X_forestcover
+y = y_true["forestcover"]
+
+n_samples = X.shape[0]
+n_neighbors_list = (n_samples * np.array([0.2, 0.02, 0.01, 0.001])).astype(np.int32)
+model = make_pipeline(RobustScaler(), LocalOutlierFactor())
+
+linestyles = ["solid", "dashed", "dashdot", ":", (5, (10, 3))]
+
+fig, ax = plt.subplots()
+for model_idx, (linestyle, n_neighbors) in enumerate(zip(linestyles, n_neighbors_list)):
+    model.set_params(localoutlierfactor__n_neighbors=n_neighbors)
+    model.fit(X)
+    y_pred = model[-1].negative_outlier_factor_
+    display = RocCurveDisplay.from_predictions(
+        y,
+        y_pred,
+        pos_label=pos_label,
+        name=f"n_neighbors = {n_neighbors}",
+        ax=ax,
+        plot_chance_level=(model_idx == len(n_neighbors_list) - 1),
+        chance_level_kw={"linestyle": (0, (1, 10))},
+        linestyle=linestyle,
+        linewidth=2,
+    )
+_ = ax.set_title("RobustScaler with varying n_neighbors\non forestcover dataset")
+
+# %%
+# We observe that the number of neighbors has a big impact on the performance of
+# the model. If one has access to (at least some) ground truth labels, it is
+# then important to tune `n_neighbors` accordingly. A convenient way to do so is
+# to explore values for `n_neighbors` of the order of magnitude of the expected
+# contamination.
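+
+# %%
+# Concretely (an illustrative sketch rather than part of the study), with the
+# roughly 2% of anomalies assumed for this dataset below, the heuristic amounts
+# to starting the search around `0.02 * n_samples` neighbors:
+
+print(f"heuristic starting point: n_neighbors = {int(n_samples * 0.02)}")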
+
+# %%
+from sklearn.preprocessing import MinMaxScaler, SplineTransformer, StandardScaler
+
+preprocessor_list = [
+    None,
+    RobustScaler(),
+    StandardScaler(),
+    MinMaxScaler(),
+    SplineTransformer(),
+]
+expected_anomaly_fraction = 0.02
+lof = LocalOutlierFactor(n_neighbors=int(n_samples * expected_anomaly_fraction))
+
+fig, ax = plt.subplots()
+for model_idx, (linestyle, preprocessor) in enumerate(
+    zip(linestyles, preprocessor_list)
+):
+    model = make_pipeline(preprocessor, lof)
+    model.fit(X)
+    y_pred = model[-1].negative_outlier_factor_
+    display = RocCurveDisplay.from_predictions(
+        y,
+        y_pred,
+        pos_label=pos_label,
+        name=str(preprocessor).split("(")[0],
+        ax=ax,
+        plot_chance_level=(model_idx == len(preprocessor_list) - 1),
+        chance_level_kw={"linestyle": (0, (1, 10))},
+        linestyle=linestyle,
+        linewidth=2,
+    )
+_ = ax.set_title("Fixed n_neighbors with varying preprocessing\non forestcover dataset")
+
+# %%
+# On the one hand, :class:`~sklearn.preprocessing.RobustScaler` scales each
+# feature independently by using the interquartile range (IQR) by default, which
+# is the range between the 25th and 75th percentiles of the data. It centers the
+# data by subtracting the median and then scales it by dividing by the IQR. The
+# IQR is robust to outliers: the median and the interquartile range are less
+# affected by extreme values than the range, the mean and the standard
+# deviation. Furthermore, :class:`~sklearn.preprocessing.RobustScaler` does not
+# squash marginal outlier values, contrary to
+# :class:`~sklearn.preprocessing.StandardScaler`.
+#
+# On the other hand, :class:`~sklearn.preprocessing.MinMaxScaler` scales each
+# feature individually such that its range maps into the range between zero and
+# one. If there are outliers in the data, they can skew it towards either the
+# minimum or maximum values, leading to a completely different distribution of
+# data with large marginal outliers: all non-outlier values can be collapsed
+# almost together as a result.
+#
+# We also evaluated no preprocessing at all (by passing `None` to the pipeline),
+# :class:`~sklearn.preprocessing.StandardScaler` and
+# :class:`~sklearn.preprocessing.SplineTransformer`. Please refer to their
+# respective documentation for more details.
+#
+# Note that the optimal preprocessing depends on the dataset, as shown below:
+
+# %%
+X = X_cardiotocography
+y = y_true["cardiotocography"]
+
+n_samples, expected_anomaly_fraction = X.shape[0], 0.025
+lof = LocalOutlierFactor(n_neighbors=int(n_samples * expected_anomaly_fraction))
+
+fig, ax = plt.subplots()
+for model_idx, (linestyle, preprocessor) in enumerate(
+    zip(linestyles, preprocessor_list)
+):
+    model = make_pipeline(preprocessor, lof)
+    model.fit(X)
+    y_pred = model[-1].negative_outlier_factor_
+    display = RocCurveDisplay.from_predictions(
+        y,
+        y_pred,
+        pos_label=pos_label,
+        name=str(preprocessor).split("(")[0],
+        ax=ax,
+        plot_chance_level=(model_idx == len(preprocessor_list) - 1),
+        chance_level_kw={"linestyle": (0, (1, 10))},
+        linestyle=linestyle,
+        linewidth=2,
+    )
+ax.set_title(
+    "Fixed n_neighbors with varying preprocessing\non cardiotocography dataset"
+)
 plt.show()
diff --git a/examples/preprocessing/plot_scaling_importance.py b/examples/preprocessing/plot_scaling_importance.py
index 6e0ae0ae1c109..5b78fe0636d8c 100644
--- a/examples/preprocessing/plot_scaling_importance.py
+++ b/examples/preprocessing/plot_scaling_importance.py
@@ -52,6 +52,8 @@
 scaled_X_train = scaler.fit_transform(X_train)
 
 # %%
+# .. _neighbors_scaling:
+#
 # Effect of rescaling on a k-neighbors model
 # ===========================================
 #

From 0b6f442efea6554a692089a8112bb5e89bad38b2 Mon Sep 17 00:00:00 2001
From: Rahil Parikh <75483881+rprkh@users.noreply.github.com>
Date: Mon, 11 Sep 2023 23:04:45 +0530
Subject: [PATCH 19/21] TST Extend tests for `scipy.sparse.*array` in `sklearn/model_selection/tests/test_search.py` (#27286)

Co-authored-by: Guillaume Lemaitre
---
 sklearn/model_selection/tests/test_search.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py
index 1db38671ef481..d2cfdd7f7b2ed 100644
--- a/sklearn/model_selection/tests/test_search.py
+++ b/sklearn/model_selection/tests/test_search.py
@@ -11,7 +11,6 @@
 
 import numpy as np
 import pytest
-import scipy.sparse as sp
 from scipy.stats import bernoulli, expon, uniform
 
 from sklearn.base import BaseEstimator, ClassifierMixin, is_classifier
@@ -77,6 +76,7 @@
     assert_array_equal,
     ignore_warnings,
 )
+from sklearn.utils.fixes import CSR_CONTAINERS
 from sklearn.utils.validation import _num_samples
 
 
@@ -489,7 +489,8 @@ def test_grid_search_bad_param_grid():
         search.fit(X, y)
 
 
-def test_grid_search_sparse():
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_grid_search_sparse(csr_container):
     # Test that grid search works with both dense and sparse matrices
     X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)
 
@@ -499,7 +500,7 @@ def test_grid_search_sparse():
     y_pred = cv.predict(X_[180:])
     C = cv.best_estimator_.C
 
-    X_ = sp.csr_matrix(X_)
+    X_ = csr_container(X_)
     clf = LinearSVC(dual="auto")
     cv = GridSearchCV(clf, {"C": [0.1, 1.0]})
     cv.fit(X_[:180].tocoo(), y_[:180])
@@ -510,7 +511,8 @@ def test_grid_search_sparse():
     assert C == C2
 
 
-def test_grid_search_sparse_scoring():
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_grid_search_sparse_scoring(csr_container):
     X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)
 
     clf = LinearSVC(dual="auto")
@@ -519,7 +521,7 @@ def test_grid_search_sparse_scoring():
     y_pred = cv.predict(X_[180:])
    	C = cv.best_estimator_.C
 
-    X_ = sp.csr_matrix(X_)
+    X_ = csr_container(X_)
     clf = LinearSVC(dual="auto")
     cv = GridSearchCV(clf, {"C": [0.1, 1.0]}, scoring="f1")
     cv.fit(X_[:180], y_[:180])

From 3dd49068286b70bcab4295d60e595bc44a9641fc Mon Sep 17 00:00:00 2001
From: Alexander Al-Feghali
Date: Mon, 11 Sep 2023 13:35:34 -0400
Subject: [PATCH 20/21] TST Extend tests for `scipy.sparse.*array` in `sklearn\model_selection\tests\test_search.py` (#27326)

Co-authored-by: Guillaume Lemaitre

From 4b8799798f95b61e80c73c329b7ec2c7ec431dde Mon Sep 17 00:00:00 2001
From: Rahil Parikh <75483881+rprkh@users.noreply.github.com>
Date: Mon, 11 Sep 2023 23:10:14 +0530
Subject: [PATCH 21/21] TST Extend tests for `scipy.sparse.*array` in `sklearn/tests/test_pipeline.py` (#27278)

---
 sklearn/tests/test_pipeline.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py
index 76fa44a04a1d6..c57209d3c00d2 100644
--- a/sklearn/tests/test_pipeline.py
+++ b/sklearn/tests/test_pipeline.py
@@ -10,7 +10,6 @@
 
 import joblib
 import numpy as np
 import pytest
-from scipy import sparse
 
 from sklearn.base import BaseEstimator, TransformerMixin, clone, is_classifier
 from sklearn.cluster import KMeans
@@ -46,6 +45,7 @@
     assert_array_almost_equal,
     assert_array_equal,
 )
+from sklearn.utils.fixes import CSR_CONTAINERS
 from sklearn.utils.validation import check_is_fitted
 
 iris = load_iris()
@@ -485,7 +485,8 @@ def test_predict_methods_with_predict_params(method_name):
     assert pipe.named_steps["clf"].got_attribute
 
 
-def test_feature_union():
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_feature_union(csr_container):
     # basic sanity check for feature union
     X = iris.data
     X -= X.mean(axis=0)
@@ -504,7 +505,7 @@ def test_feature_union():
     # test if it also works for sparse input
     # We use a different svd object to control the random_state stream
     fs = FeatureUnion([("svd", svd), ("select", select)])
-    X_sp = sparse.csr_matrix(X)
+    X_sp = csr_container(X)
     X_sp_transformed = fs.fit_transform(X_sp, y)
     assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())
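
A minimal sketch of the parametrization pattern shared by the three patches
above, assuming `CSR_CONTAINERS` (imported from `sklearn.utils.fixes`) lists
the CSR classes available in the installed SciPy (`scipy.sparse.csr_matrix`,
plus `scipy.sparse.csr_array` when available); the test name and assertion
here are hypothetical, and each test body runs once per container:

    import numpy as np
    import pytest

    from sklearn.utils.fixes import CSR_CONTAINERS


    @pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
    def test_csr_round_trip(csr_container):
        # Converting dense data to any supported CSR container and back
        # should preserve the values exactly.
        X = np.array([[0.0, 1.0], [2.0, 0.0]])
        assert np.array_equal(csr_container(X).toarray(), X)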