Merging main

Signed-off-by: Adam Li <adam2392@gmail.com>
neurodata · Oct 17, 2023 · 1adb209 · 1adb209
2 parents 09f7785 + caeb09e
commit 1adb209
Show file tree

Hide file tree

Showing 17 changed files with 469 additions and 275 deletions.
diff --git a/build_tools/cirrus/arm_tests.yml b/build_tools/cirrus/arm_tests.yml
@@ -11,6 +11,10 @@ linux_aarch64_test_task:
     LOCK_FILE: build_tools/cirrus/py39_conda_forge_linux-aarch64_conda.lock
     CONDA_PKGS_DIRS: /root/.conda/pkgs
     HOME: /  # $HOME is not defined in image and is required to install mambaforge
+    # Upload tokens have been encrypted via the CirrusCI interface:
+    # https://cirrus-ci.org/guide/writing-tasks/#encrypted-variables
+    # See `maint_tools/update_tracking_issue.py` for details on the permissions the token requires.
+    BOT_GITHUB_TOKEN: ENCRYPTED[9b50205e2693f9e4ce9a3f0fcb897a259289062fda2f5a3b8aaa6c56d839e0854a15872f894a70fca337dd4787274e0f]
   ccache_cache:
     folder: /root/.cache/ccache
   conda_cache:

diff --git a/doc/conf.py b/doc/conf.py
@@ -312,15 +312,18 @@
 html_show_search_summary = False
 
 
+# The "summary-anchor" IDs will be overwritten via JavaScript to be unique.
+# See `doc/themes/scikit-learn-modern/static/js/details-permalink.js`.
 rst_prolog = """
 .. |details-start| raw:: html
 
-    <details>
+    <details id="summary-anchor">
     <summary class="btn btn-light">
 
 .. |details-split| raw:: html
 
     <span class="tooltiptext">Click for more details</span>
+    <a class="headerlink" href="#summary-anchor" title="Permalink to this heading">¶</a>
     </summary>
     <div class="card">
 

diff --git a/doc/modules/grid_search.rst b/doc/modules/grid_search.rst
@@ -612,7 +612,7 @@ Here, ``<estimator>`` is the parameter name of the nested estimator,
 in this case ``estimator``.
 If the meta-estimator is constructed as a collection of estimators as in
 `pipeline.Pipeline`, then ``<estimator>`` refers to the name of the estimator,
-see :ref:`pipeline_nested_parameters`.  In practice, there can be several
+see :ref:`pipeline_nested_parameters`. In practice, there can be several
 levels of nesting::
 
   >>> from sklearn.pipeline import Pipeline

diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst
@@ -181,9 +181,15 @@ take several parameters:
   of the python function is negated by the scorer object, conforming to
   the cross validation convention that scorers return higher values for better models.
 
-* for classification metrics only: whether the python function you provided requires continuous decision
-  certainties (``needs_threshold=True``).  The default value is
-  False.
+* for classification metrics only: whether the python function you provided requires
+  continuous decision certainties. If the scoring function only accepts probability
+  estimates (e.g. :func:`metrics.log_loss`) then one needs to set the parameter
+  `response_method`, thus in this case `response_method="predict_proba"`. Some scoring
+  functions do not necessarily require probability estimates but rather non-thresholded
+  decision values (e.g. :func:`metrics.roc_auc_score`). In this case, one provides a
+  list such as `response_method=["decision_function", "predict_proba"]`. The scorer
+  will then use the first available method, in the order given in the list,
+  to compute the scores.
 
 * any additional parameters, such as ``beta`` or ``labels`` in :func:`f1_score`.
 

diff --git a/doc/themes/scikit-learn-modern/layout.html b/doc/themes/scikit-learn-modern/layout.html
@@ -36,6 +36,7 @@
   <link rel="stylesheet" href="{{ pathto('_static/' + styles[0], 1) }}" type="text/css" />
 <script id="documentation_options" data-url_root="{{ pathto('', 1) }}" src="{{ pathto('_static/documentation_options.js', 1) }}"></script>
 <script src="{{ pathto('_static/js/vendor/jquery-3.6.3.slim.min.js', 1) }}"></script>
+<script src="{{ pathto('_static/js/details-permalink.js', 1) }}"></script>
 {%- block extrahead %} {% endblock %}
 </head>
 <body>

diff --git a/doc/themes/scikit-learn-modern/static/css/theme.css b/doc/themes/scikit-learn-modern/static/css/theme.css
@@ -149,6 +149,15 @@ div.clearer {
 
 /* details / summary */
 
+/* Enables section links to be visible when anchor-linked */
+div.sk-page-content details::before {
+  display: block;
+  height: 52px;
+  margin-top: -52px;
+  visibility: hidden;
+  content: "";
+}
+
 div.sk-page-content details {
     margin: 4ex 0pt;
 }
@@ -202,6 +211,10 @@ div.sk-page-content summary:hover .tooltiptext {
   visibility: visible;
 }
 
+div.sk-page-content summary:hover .headerlink {
+  visibility: visible;
+}
+
 /* Button */
 
 .sk-btn-primary {

diff --git a/doc/themes/scikit-learn-modern/static/js/details-permalink.js b/doc/themes/scikit-learn-modern/static/js/details-permalink.js
@@ -0,0 +1,47 @@
+// Function to create permalinks for <details> elements so that they can be linked to.
+// The assumption is that such a block will be defined as follows:
+//     <details id="summary-anchor">
+//     <summary class="btn btn-light">
+//     Some title
+//     <span class="tooltiptext">Click for more details</span>
+//     <a class="headerlink" href="#summary-anchor" title="Permalink to this heading">¶</a>
+//     </summary>
+//     <div class="card">
+//     Some details
+//     </div>
+//     </details>
+// We seek to replace `#summary-anchor` with a unique identifier based on the
+// summary text.
+// This syntax is defined in `doc/conf.py` in the `rst_prolog` variable.
+function updateIdAndHrefBasedOnSummaryText() {
+    var allDetailsElements = document.querySelectorAll('details');
+    // Counts occurrences of each summary-derived ID so that duplicates get a
+    // numeric suffix in the anchor ID
+    var anchorIDCounters = {};
+
+    allDetailsElements.forEach(function (detailsElement) {
+        // Get the <summary> element within the current <details>
+        var summaryElement = detailsElement.querySelector('summary');
+
+        // The ID uses the first line, lowercased, and spaces replaced with dashes
+        var anchorID = summaryElement.textContent.trim().split("\n")[0].replace(/\s+/g, '-').toLowerCase();
+
+        // Suffix the anchor ID with a counter if it already exists
+        if (anchorIDCounters[anchorID]) {
+            anchorIDCounters[anchorID] += 1;
+            anchorID = anchorID + '-' + anchorIDCounters[anchorID];
+        } else {
+            anchorIDCounters[anchorID] = 1;
+        }
+
+        detailsElement.setAttribute('id', anchorID);
+
+        var anchorElement = summaryElement.querySelector('a.headerlink');
+        anchorElement.setAttribute('href', '#' + anchorID);
+    });
+}
+
+// Add an event listener to execute the function when the page is loaded
+document.addEventListener('DOMContentLoaded', function () {
+    updateIdAndHrefBasedOnSummaryText();
+});
diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst
@@ -2,6 +2,23 @@
 
 .. currentmodule:: sklearn
 
+.. _changes_1_3_2:
+
+Version 1.3.2
+=============
+
+**October 2023**
+
+Changelog
+---------
+
+:mod:`sklearn.tree`
+...................
+
+- |Fix| Do not leak data via non-initialized memory in decision tree pickle files and make
+  the generation of those files deterministic. :pr:`27580` by :user:`Loïc Estève <lesteve>`.
+
+
 .. _changes_1_3_1:
 
 Version 1.3.1

diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst
@@ -253,6 +253,11 @@ Changelog
   :pr:`26315` and :pr:`27098` by :user:`Mateusz Sokół <mtsokol>`,
   :user:`Olivier Grisel <ogrisel>` and :user:`Edoardo Abati <EdAbati>`.
 
+- |Fix| Fixes a bug in :class:`decomposition.KernelPCA` by forcing the output of
+  the internal :class:`preprocessing.KernelCenterer` to be a default array. When the
+  arpack solver was used, it would expect an array with a `dtype` attribute.
+  :pr:`27583` by :user:`Guillaume Lemaitre <glemaitre>`.
+
 :mod:`sklearn.ensemble`
 .......................
 
@@ -354,6 +359,14 @@ Changelog
   :func:`sklearn.metrics.zero_one_loss` now support Array API compatible inputs.
   :pr:`27137` by :user:`Edoardo Abati <EdAbati>`.
 
+- |API| Deprecated `needs_threshold` and `needs_proba` from :func:`metrics.make_scorer`.
+  These parameters will be removed in version 1.6. Instead, use `response_method` that
+  accepts `"predict"`, `"predict_proba"` or `"decision_function"` or a list of such
+  values. `needs_proba=True` is equivalent to `response_method="predict_proba"` and
+  `needs_threshold=True` is equivalent to
+  `response_method=("decision_function", "predict_proba")`.
+  :pr:`26840` by :user:`Guillaume Lemaitre <glemaitre>`.
+
 - |Fix| Fixes a bug for metrics using `zero_division=np.nan`
   (e.g. :func:`~metrics.precision_score`) within a parallel loop
   (e.g. :func:`~model_selection.cross_val_score`) where the singleton for `np.nan`
@@ -366,6 +379,11 @@ Changelog
   :func:`metrics.root_mean_squared_log_error` instead.
   :pr:`26734` by :user:`Alejandro Martin Gil <101AlexMartin>`.
 
+- |Fix| :func:`metrics.make_scorer` now raises an error when using a regressor on a
+  scorer requesting a non-thresholded decision function (from `decision_function` or
+  `predict_proba`). Such scorers are specific to classification.
+  :pr:`26840` by :user:`Guillaume Lemaitre <glemaitre>`.
+
 :mod:`sklearn.model_selection`
 ..............................
 

diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py
@@ -274,14 +274,15 @@ def test_hdbscan_callable_metric():
     assert n_clusters == n_clusters_true
 
 
-@pytest.mark.parametrize("tree", ["kd", "ball"])
+@pytest.mark.parametrize("tree", ["kd_tree", "ball_tree"])
 def test_hdbscan_precomputed_non_brute(tree):
     """
     Tests that HDBSCAN correctly raises an error when passing precomputed data
     while requesting a tree-based algorithm.
     """
-    hdb = HDBSCAN(metric="precomputed", algorithm=f"prims_{tree}tree")
-    with pytest.raises(ValueError):
+    hdb = HDBSCAN(metric="precomputed", algorithm=tree)
+    msg = "precomputed is not a valid metric for"
+    with pytest.raises(ValueError, match=msg):
         hdb.fit(X)
 
 

diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py
@@ -432,7 +432,7 @@ def fit(self, X, y=None):
             raise ValueError("Cannot fit_inverse_transform with a precomputed kernel.")
         X = self._validate_data(X, accept_sparse="csr", copy=self.copy_X)
         self.gamma_ = 1 / X.shape[1] if self.gamma is None else self.gamma
-        self._centerer = KernelCenterer()
+        self._centerer = KernelCenterer().set_output(transform="default")
         K = self._get_kernel(X)
         self._fit_transform(K)
 

diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py
@@ -3,7 +3,8 @@
 import numpy as np
 import pytest
 
-from sklearn.datasets import make_blobs, make_circles
+import sklearn
+from sklearn.datasets import load_iris, make_blobs, make_circles
 from sklearn.decomposition import PCA, KernelPCA
 from sklearn.exceptions import NotFittedError
 from sklearn.linear_model import Perceptron
@@ -551,3 +552,15 @@ def test_kernel_pca_inverse_correct_gamma():
     X2_recon = kpca2.inverse_transform(kpca1.transform(X))
 
     assert_allclose(X1_recon, X2_recon)
+
+
+def test_kernel_pca_pandas_output():
+    """Check that KernelPCA works with pandas output when the solver is arpack.
+
+    Non-regression test for:
+    https://github.com/scikit-learn/scikit-learn/issues/27579
+    """
+    pytest.importorskip("pandas")
+    X, _ = load_iris(as_frame=True, return_X_y=True)
+    with sklearn.config_context(transform_output="pandas"):
+        KernelPCA(n_components=2, eigen_solver="arpack").fit_transform(X)