Merge branch 'submodulev2' into reed

neurodata · Aug 11, 2023 · 423fa49 · 423fa49
2 parents d4d677e + 6ec023b
commit 423fa49
Show file tree

Hide file tree

Showing 125 changed files with 4,085 additions and 1,809 deletions.
diff --git a/.cirrus.star b/.cirrus.star
@@ -14,7 +14,7 @@ def main(ctx):
 
     # Nightly jobs always run
     if env.get("CIRRUS_CRON", "") == "nightly":
-        return fs.read(arm_wheel_yaml)
+        return fs.read(arm_wheel_yaml) + fs.read(arm_tests_yaml)
 
     # Get commit message for event. We can not use `git` here because there is
     # no command line access in starlark. Thus we need to query the GitHub API
@@ -26,10 +26,12 @@ def main(ctx):
     response = http.get(url).json()
     commit_msg = response["message"]
 
-    if "[skip ci]" in commit_msg:
-        return []
+    jobs_to_run = ""
 
     if "[cd build]" in commit_msg or "[cd build cirrus]" in commit_msg:
-        return fs.read(arm_wheel_yaml) + fs.read(arm_tests_yaml)
+        jobs_to_run += fs.read(arm_wheel_yaml)
+
+    if "[cirrus arm]" in commit_msg:
+        jobs_to_run += fs.read(arm_tests_yaml)
 
-    return fs.read(arm_tests_yaml)
+    return jobs_to_run
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
@@ -103,6 +103,18 @@ jobs:
             python: 311
             platform_id: macosx_x86_64
 
+          # MacOS arm64
+          # The latest Python version is built and tested on CirrusCI
+          - os: macos-latest
+            python: 38
+            platform_id: macosx_arm64
+          - os: macos-latest
+            python: 39
+            platform_id: macosx_arm64
+          - os: macos-latest
+            python: 310
+            platform_id: macosx_arm64
+
     steps:
       - name: Checkout scikit-learn
         uses: actions/checkout@v3

diff --git a/.gitignore b/.gitignore
@@ -100,6 +100,9 @@ sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd
 sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx
 sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd
 sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx
+sklearn/neighbors/_ball_tree.pyx
+sklearn/neighbors/_binary_tree.pxi
+sklearn/neighbors/_kd_tree.pyx
 
 # Default JupyterLite content
 jupyterlite_contents
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
@@ -171,7 +171,6 @@ jobs:
         DISTRIB: 'conda'
         LOCK_FILE: './build_tools/azure/pylatest_conda_forge_mkl_linux-64_conda.lock'
         COVERAGE: 'true'
-        SHOW_SHORT_SUMMARY: 'true'
         SKLEARN_TESTS_GLOBAL_RANDOM_SEED: '42'  # default global random seed
 
 # Check compilation with Ubuntu 22.04 LTS (Jammy Jellyfish) and scipy from conda-forge

diff --git a/build_tools/azure/posix-docker.yml b/build_tools/azure/posix-docker.yml
@@ -22,7 +22,6 @@ jobs:
     # Set in azure-pipelines.yml
     DISTRIB: ''
     DOCKER_CONTAINER: ''
-    SHOW_SHORT_SUMMARY: 'false'
     CREATE_ISSUE_ON_TRACKER: 'true'
     CCACHE_DIR: $(Pipeline.Workspace)/ccache
     CCACHE_COMPRESS: '1'

diff --git a/build_tools/azure/posix.yml b/build_tools/azure/posix.yml
@@ -22,7 +22,6 @@ jobs:
     PYTEST_XDIST_VERSION: 'latest'
     COVERAGE: 'true'
     CREATE_ISSUE_ON_TRACKER: 'true'
-    SHOW_SHORT_SUMMARY: 'false'
   strategy:
     matrix:
       ${{ insert }}: ${{ parameters.matrix }}

diff --git a/build_tools/azure/test_script.sh b/build_tools/azure/test_script.sh
@@ -49,7 +49,7 @@ if [[ "$COVERAGE" == "true" ]]; then
 fi
 
 if [[ -n "$CHECK_WARNINGS" ]]; then
-    TEST_CMD="$TEST_CMD -Werror::DeprecationWarning -Werror::FutureWarning -Werror::numpy.VisibleDeprecationWarning"
+    TEST_CMD="$TEST_CMD -Werror::DeprecationWarning -Werror::FutureWarning -Werror::sklearn.utils.fixes.VisibleDeprecationWarning"
 
     # numpy 1.19.0's tostring() deprecation is ignored until scipy and joblib
     # remove its usage
@@ -75,10 +75,6 @@ if [[ "$PYTEST_XDIST_VERSION" != "none" ]]; then
     TEST_CMD="$TEST_CMD -n$XDIST_WORKERS"
 fi
 
-if [[ "$SHOW_SHORT_SUMMARY" == "true" ]]; then
-    TEST_CMD="$TEST_CMD -ra"
-fi
-
 if [[ -n "$SELECTED_TESTS" ]]; then
     TEST_CMD="$TEST_CMD -k $SELECTED_TESTS"
 

diff --git a/build_tools/cirrus/arm_tests.yml b/build_tools/cirrus/arm_tests.yml
@@ -17,4 +17,10 @@ linux_aarch64_test_task:
     folder: /root/.conda/pkgs
     fingerprint_script: cat build_tools/cirrus/py39_conda_forge_linux-aarch64_conda.lock
 
-  test_script: bash build_tools/cirrus/build_test_arm.sh
+  test_script: |
+    bash build_tools/cirrus/build_test_arm.sh
+    # On success, this script is run to update the tracking issue.
+    bash build_tools/cirrus/update_tracking_issue.sh true
+
+  on_failure:
+    update_tracker_script: bash build_tools/cirrus/update_tracking_issue.sh false
diff --git a/build_tools/cirrus/arm_wheel.yml b/build_tools/cirrus/arm_wheel.yml
@@ -16,12 +16,8 @@ macos_arm64_wheel_task:
     # See `maint_tools/update_tracking_issue.py` for details on the permissions the token requires.
     BOT_GITHUB_TOKEN: ENCRYPTED[9b50205e2693f9e4ce9a3f0fcb897a259289062fda2f5a3b8aaa6c56d839e0854a15872f894a70fca337dd4787274e0f]
   matrix:
-    - env:
-        CIBW_BUILD: cp38-macosx_arm64
-    - env:
-        CIBW_BUILD: cp39-macosx_arm64
-    - env:
-        CIBW_BUILD: cp310-macosx_arm64
+    # Only the latest Python version is built and tested on CirrusCI, the other
+    # macos arm64 builds are on GitHub Actions
     - env:
         CIBW_BUILD: cp311-macosx_arm64
 
@@ -60,12 +56,16 @@ linux_arm64_wheel_task:
     # See `maint_tools/update_tracking_issue.py` for details on the permissions the token requires.
     BOT_GITHUB_TOKEN: ENCRYPTED[9b50205e2693f9e4ce9a3f0fcb897a259289062fda2f5a3b8aaa6c56d839e0854a15872f894a70fca337dd4787274e0f]
   matrix:
+    # Only the latest Python version is tested
     - env:
         CIBW_BUILD: cp38-manylinux_aarch64
+        CIBW_TEST_SKIP: "*_aarch64"
     - env:
         CIBW_BUILD: cp39-manylinux_aarch64
+        CIBW_TEST_SKIP: "*_aarch64"
     - env:
         CIBW_BUILD: cp310-manylinux_aarch64
+        CIBW_TEST_SKIP: "*_aarch64"
     - env:
         CIBW_BUILD: cp311-manylinux_aarch64
 

diff --git a/build_tools/cirrus/build_test_arm.sh b/build_tools/cirrus/build_test_arm.sh
@@ -25,7 +25,7 @@ setup_ccache() {
 MAMBAFORGE_URL="https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-aarch64.sh"
 
 # Install Mambaforge
-wget $MAMBAFORGE_URL -O mambaforge.sh
+curl -L $MAMBAFORGE_URL -o mambaforge.sh
 MAMBAFORGE_PATH=$HOME/mambaforge
 bash ./mambaforge.sh -b -p $MAMBAFORGE_PATH
 export PATH=$MAMBAFORGE_PATH/bin:$PATH

diff --git a/build_tools/update_environments_and_lock_files.py b/build_tools/update_environments_and_lock_files.py
@@ -556,15 +556,15 @@ def check_conda_version():
     # Avoid issues with glibc (https://github.com/conda/conda-lock/issues/292)
     # or osx (https://github.com/conda/conda-lock/issues/408) virtual package.
     # The glibc one has been fixed in conda 23.1.0 and the osx has been fixed
-    # in main and will be fixed when conda >= 23.6 is released.
+    # in conda 23.7.0.
     conda_info_output = execute_command(["conda", "info", "--json"])
 
     conda_info = json.loads(conda_info_output)
     conda_version = Version(conda_info["conda_version"])
 
-    if Version("22.9.0") < conda_version < Version("23.6"):
+    if Version("22.9.0") < conda_version < Version("23.7"):
         raise RuntimeError(
-            f"conda version should be <= 22.9.0 or >= 23.6 got: {conda_version}"
+            f"conda version should be <= 22.9.0 or >= 23.7 got: {conda_version}"
         )
 
 

diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst
@@ -542,6 +542,7 @@ message, the following actions are taken.
     [pypy]                 Build & test with PyPy
     [pyodide]              Build & test with Pyodide
     [azure parallel]       Run Azure CI jobs in parallel
+    [cirrus arm]           Run Cirrus CI ARM test
     [float32]              Run float32 tests by setting `SKLEARN_RUN_FLOAT32_TESTS=1`. See :ref:`environment_variable` for more details
     [doc skip]             Docs are not built
     [doc quick]            Docs built, but excludes example gallery plots

diff --git a/doc/glossary.rst b/doc/glossary.rst
@@ -205,6 +205,29 @@ General Concepts
         exceptional behaviours on the estimator using semantic :term:`estimator
         tags`.
 
+    cross-fitting
+    cross fitting
+        A resampling method that iteratively partitions data into mutually
+        exclusive subsets to fit two stages. During the first stage, the
+        mutually exclusive subsets enable predictions or transformations to be
+        computed on data not seen during training. The computed data is then
+        used in the second stage. The objective is to avoid having any
+        overfitting in the first stage introduce bias into the input data
+        distribution of the second stage.
+        For examples of its use, see: :class:`~preprocessing.TargetEncoder`,
+        :class:`~ensemble.StackingClassifier`,
+        :class:`~ensemble.StackingRegressor` and
+        :class:`~calibration.CalibratedClassifierCV`.
+
+    cross-validation
+    cross validation
+        A resampling method that iteratively partitions data into mutually
+        exclusive 'train' and 'test' subsets so model performance can be
+        evaluated on unseen data. This conserves data as it avoids the need to
+        hold out a 'validation' dataset and accounts for variability as multiple
+        rounds of cross validation are generally performed.
+        See :ref:`User Guide <cross_validation>` for more details.
+
     deprecation
         We use deprecation to slowly violate our :term:`backwards
         compatibility` assurances, usually to:

diff --git a/doc/install.rst b/doc/install.rst
@@ -61,7 +61,7 @@ Installing the latest release
          ><span class="sk-expandable" data-packager="pip" data-os="linux">Install python3 and python3-pip using the package manager of the Linux Distribution.</span
          ><span class="sk-expandable" data-packager="conda"
             >Install conda using the <a href="https://docs.conda.io/projects/conda/en/latest/user-guide/install/">Anaconda or miniconda</a>
-             installers or the <a href="https://https://github.com/conda-forge/miniforge#miniforge">miniforge</a> installers
+             installers or the <a href="https://github.com/conda-forge/miniforge#miniforge">miniforge</a> installers
              (no administrator permission required for any of those).</span>
        </div>
 
@@ -279,14 +279,14 @@ and in the `main`, `conda-forge` and `intel` conda channels:
 
   conda install scikit-learn-intelex
 
-This package has an Intel optimized version of many estimators. Whenever 
-an alternative implementation doesn't exist, scikit-learn implementation 
-is used as a fallback. Those optimized solvers come from the oneDAL 
-C++ library and are optimized for the x86_64 architecture, and are 
+This package has an Intel optimized version of many estimators. Whenever
+an alternative implementation doesn't exist, scikit-learn implementation
+is used as a fallback. Those optimized solvers come from the oneDAL
+C++ library and are optimized for the x86_64 architecture, and are
 optimized for multi-core Intel CPUs.
 
 Note that those solvers are not enabled by default, please refer to the
-`scikit-learn-intelex <https://intel.github.io/scikit-learn-intelex/what-is-patching.html>`_ 
+`scikit-learn-intelex <https://intel.github.io/scikit-learn-intelex/what-is-patching.html>`_
 documentation for more details on usage scenarios. Direct export example:
 
 .. prompt:: bash $

diff --git a/doc/modules/array_api.rst b/doc/modules/array_api.rst
@@ -83,17 +83,26 @@ the tensors directly::
     >>> X_trans.device.type
     'cuda'
 
-.. _array_api_estimators:
+.. _array_api_supported:
 
-Estimators with support for `Array API`-compatible inputs
-=========================================================
+Support for `Array API`-compatible inputs
+=========================================
+
+Estimators and other tools in scikit-learn that support Array API compatible inputs.
+
+Estimators
+----------
 
 - :class:`decomposition.PCA` (with `svd_solver="full"`,
   `svd_solver="randomized"` and `power_iteration_normalizer="QR"`)
 - :class:`discriminant_analysis.LinearDiscriminantAnalysis` (with `solver="svd"`)
 
-Coverage for more estimators is expected to grow over time. Please follow the
-dedicated `meta-issue on GitHub
+Tools
+-----
+
+- :func:`model_selection.train_test_split`
+
+Coverage is expected to grow over time. Please follow the dedicated `meta-issue on GitHub
 <https://github.com/scikit-learn/scikit-learn/issues/22352>`_ to track progress.
 
 Common estimator checks

diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst
@@ -66,10 +66,8 @@ it takes a variable number of estimators and returns a pipeline,
 filling in the names automatically::
 
     >>> from sklearn.pipeline import make_pipeline
-    >>> from sklearn.naive_bayes import MultinomialNB
-    >>> from sklearn.preprocessing import Binarizer
-    >>> make_pipeline(Binarizer(), MultinomialNB())
-    Pipeline(steps=[('binarizer', Binarizer()), ('multinomialnb', MultinomialNB())])
+    >>> make_pipeline(PCA(), SVC())
+    Pipeline(steps=[('pca', PCA()), ('svc', SVC())])
 
 Accessing steps
 ...............

diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst
@@ -102,6 +102,7 @@ where the number of samples is very small.
 .. image:: ../images/grid_search_cross_validation.png
    :width: 500px
    :height: 300px
+   :alt: A depiction of a 5 fold cross validation on a training set, while holding out a test set.
    :align: center
 
 Computing cross-validated metrics

diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst
@@ -10,12 +10,12 @@ Ensembles: Gradient boosting, random forests, bagging, voting, stacking
 base estimators built with a given learning algorithm in order to improve
 generalizability / robustness over a single estimator.
 
-Two very famous examples of ensemble methods are `gradient-boosted trees
-<gradient_boosting>`_ and `random forests <forest>`_.
+Two very famous examples of ensemble methods are :ref:`gradient-boosted trees
+<gradient_boosting>` and :ref:`random forests <forest>`.
 
 More generally, ensemble models can be applied to any base learner beyond
 trees, in averaging methods such as :ref:`Bagging methods <bagging>`,
-`model stacking <stacking>`_, or `Voting <voting_classifier>`_, or in
+:ref:`model stacking <stacking>`, or :ref:`Voting <voting_classifier>`, or in
 boosting, as :ref:`AdaBoost <adaboost>`.
 
 .. contents::

diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst
@@ -37,7 +37,7 @@ solves a problem of the form:
    :align: center
    :scale: 50%
 
-:class:`LinearRegression` will take in its ``fit`` method arrays X, y
+:class:`LinearRegression` will take in its ``fit`` method arrays ``X``, ``y``
 and will store the coefficients :math:`w` of the linear model in its
 ``coef_`` member::
 
@@ -114,7 +114,7 @@ of shrinkage and thus the coefficients become more robust to collinearity.
 
 
 As with other linear models, :class:`Ridge` will take in its ``fit`` method
-arrays X, y and will store the coefficients :math:`w` of the linear model in
+arrays ``X``, ``y`` and will store the coefficients :math:`w` of the linear model in
 its ``coef_`` member::
 
     >>> from sklearn import linear_model
@@ -889,12 +889,16 @@ the probability of the positive class :math:`P(y_i=1|X_i)` as
 
 .. math:: \hat{p}(X_i) = \operatorname{expit}(X_i w + w_0) = \frac{1}{1 + \exp(-X_i w - w_0)}.
 
+
 As an optimization problem, binary
 class logistic regression with regularization term :math:`r(w)` minimizes the
 following cost function:
 
-.. math:: \min_{w} C \sum_{i=1}^n \left(-y_i \log(\hat{p}(X_i)) - (1 - y_i) \log(1 - \hat{p}(X_i))\right) + r(w).
-
+.. math::
+    :name: regularized-logistic-loss
+   
+    \min_{w} C \sum_{i=1}^n \left(-y_i \log(\hat{p}(X_i)) - (1 - y_i) \log(1 - \hat{p}(X_i))\right) + r(w).
+   
 
 We currently provide four choices for the regularization term  :math:`r(w)`  via
 the `penalty` argument:

diff --git a/doc/modules/neighbors.rst b/doc/modules/neighbors.rst
@@ -188,13 +188,9 @@ distance can be supplied to compute the weights.
 
 .. |classification_1| image:: ../auto_examples/neighbors/images/sphx_glr_plot_classification_001.png
    :target: ../auto_examples/neighbors/plot_classification.html
-   :scale: 50
-
-.. |classification_2| image:: ../auto_examples/neighbors/images/sphx_glr_plot_classification_002.png
-   :target: ../auto_examples/neighbors/plot_classification.html
-   :scale: 50
+   :scale: 75
 
-.. centered:: |classification_1| |classification_2|
+.. centered:: |classification_1|
 
 .. topic:: Examples:
 

diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
@@ -910,16 +910,16 @@ For continuous targets, the formulation is similar to binary classification:
 where :math:`L_i` is the set of observations with category :math:`i` and
 :math:`n_i` is the number of observations with category :math:`i`.
 
-:meth:`~TargetEncoder.fit_transform` internally relies on a cross fitting
+:meth:`~TargetEncoder.fit_transform` internally relies on a :term:`cross fitting`
 scheme to prevent target information from leaking into the train-time
 representation, especially for non-informative high-cardinality categorical
 variables, and help prevent the downstream model from overfitting spurious
 correlations. Note that as a result, `fit(X, y).transform(X)` does not equal
 `fit_transform(X, y)`. In :meth:`~TargetEncoder.fit_transform`, the training
-data is split into *k* folds (determined by the `cv` parameter) and encodes each
-fold using the encodings trained on the other *k-1* folds. The following diagram
-shows the cross fitting scheme in :meth:`~TargetEncoder.fit_transform` with
-the default `cv=5`:
+data is split into *k* folds (determined by the `cv` parameter) and each fold is
+encoded using the encodings learnt using the other *k-1* folds. The following
+diagram shows the :term:`cross fitting` scheme in
+:meth:`~TargetEncoder.fit_transform` with the default `cv=5`:
 
 .. image:: ../images/target_encoder_cross_validation.svg
    :width: 600
@@ -929,10 +929,10 @@ the default `cv=5`:
 the whole training set. This is never used in
 :meth:`~TargetEncoder.fit_transform` but is saved to the attribute `encodings_`,
 for use when :meth:`~TargetEncoder.transform` is called. Note that the encodings
-learned for each fold during the cross fitting scheme are not saved to an
-attribute.
+learned for each fold during the :term:`cross fitting` scheme are not saved to
+an attribute.
 
-The :meth:`~TargetEncoder.fit` method does **not** use any cross fitting
+The :meth:`~TargetEncoder.fit` method does **not** use any :term:`cross fitting`
 schemes and learns one encoding on the entire training set, which is used to
 encode categories in :meth:`~TargetEncoder.transform`.
 This encoding is the same as the 'full data'