diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 94f6dce73..8b6fd222d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.7.2 + rev: v0.7.3 hooks: - id: ruff types_or: [python, pyi, jupyter] diff --git a/docs/release-notes/0.11.0.md b/docs/release-notes/0.11.0.md new file mode 100644 index 000000000..317175f50 --- /dev/null +++ b/docs/release-notes/0.11.0.md @@ -0,0 +1,49 @@ +(v0.11.0)= +### 0.11.0 {small}`2024-11-07` + +Release candidates: + +- (v0.11.0rc3)= + {guilabel}`rc3` 2024-10-14 +- (v0.11.0rc2)= + {guilabel}`rc2` 2024-09-24 +- (v0.11.0rc1)= + {guilabel}`rc1` 2024-09-04 + +#### Bug fixes + +- Ensure {func}`anndata.concat` of {class}`~anndata.AnnData` object with {class}`scipy.sparse.spmatrix` and {class}`scipy.sparse.sparray` dask arrays uses the correct fill value of 0. {user}`ilan-gold` ({pr}`1719`) +- Ensure that views of AwkwardArrays have their "view" attributes removed on saving an {class}`~anndata.AnnData` object to disk. {user}`grst` ({pr}`1736`) + +#### Breaking changes + +- {guilabel}`rc3` Drop support for `python` 3.9 {user}`ilan-gold` ({pr}`1712`) +- {guilabel}`rc2` A new `anndata.io` module contains all `read_*` and `write_*` functions, and all imports of such functions should go through this module. Old ways of importing these functions i.e., `from anndata import read_csv` or `from anndata._io.specs import read_elem` will still work, but are now considered deprecated and give a warning on import with the exception of {func}`anndata.io.read_zarr` and {func}`anndata.io.read_h5ad`, which will remain at the top-level `anndata` without warning. 
{user}`ilan-gold` ({pr}`1682`) +- {guilabel}`rc1` Removed deprecated modules `anndata.core` and `anndata.readwrite` {user}`ivirshup` ({pr}`1197`) +- {guilabel}`rc1` No longer export `sparse_dataset` from `anndata.experimental`, instead exporting {func}`anndata.io.sparse_dataset` {user}`ilan-gold` ({pr}`1642`) +- {guilabel}`rc1` Move `RWAble` and `InMemoryElem` out of `experimental`, renaming `RWAble` to {type}`~anndata.typing.AxisStorable` and `InMemoryElem` to {type}`~anndata.typing.RWAble` {user}`ilan-gold` ({pr}`1643`) + +#### Development Process + +- {guilabel}`rc2` Add extra `dask` dependency for installation i.e., `pip install anndata[dask]` {user}`ilan-gold` ({pr}`1677`) +- {guilabel}`rc2` Remove `shall_` from variable names in `settings` {user}`ilan-gold` ({pr}`1685`) +- {guilabel}`rc1` Create new `cupy` installation options for cuda 11 & 12 called `cu11` and `cu12` {user}`Intron7` ({pr}`1596`) + +#### Documentation + +- {guilabel}`rc1` Correct {attr}`anndata.AnnData.X` type to include {class}`~anndata.abc.CSRDataset` and {class}`~anndata.abc.CSCDataset` as possible types and begin deprecation process for non-csr/csc {class}`scipy.sparse.spmatrix` types in {attr}`anndata.AnnData.X` {user}`ilan-gold` ({pr}`1616`) + +#### Features + +- Add support for ellipsis indexing of the {class}`~anndata.AnnData` object {user}`ilan-gold` ({pr}`1729`) +- {guilabel}`rc1` `scipy.sparse.csr_array` and `scipy.sparse.csc_array` are now supported when constructing `AnnData` objects {user}`ilan-gold` {user}`isaac-virshup` ({pr}`1028`) +- {guilabel}`rc1` Allow `axis` parameter of e.g. 
{func}`anndata.concat` to accept `'obs'` and `'var'` {user}`flying-sheep` ({pr}`1244`) +- {guilabel}`rc1` Add `settings` object with methods for altering internally-used options, like checking for uniqueness on `obs`' index {user}`ilan-gold` ({pr}`1270`) +- {guilabel}`rc1` Add {attr}`~anndata.settings.remove_unused_categories` option to {attr}`anndata.settings` to override current behavior {user}`ilan-gold` ({pr}`1340`) +- {guilabel}`rc1` Add {func}`~anndata.experimental.read_elem_as_dask` function to handle i/o with sparse and dense arrays {user}`ilan-gold` ({pr}`1469`) +- {guilabel}`rc1` Add ability to convert strings to categoricals on write in {meth}`~anndata.AnnData.write_h5ad` and {meth}`~anndata.AnnData.write_zarr` via `convert_strings_to_categoricals` parameter {user}`falexwolf` ({pr}`1474`) +- {guilabel}`rc1` Add {attr}`~anndata.settings.check_uniqueness` option to {attr}`anndata.settings` to override current behavior {user}`ilan-gold` ({pr}`1507`) +- {guilabel}`rc1` Add functionality to write from GPU {class}`dask.array.Array` to disk {user}`ilan-gold` ({pr}`1550`) +- {guilabel}`rc1` Read and write support for nullable string arrays ({class}`pandas.arrays.StringArray`). Use pandas’ {doc}`pandas:user_guide/options` `mode.string_storage` to control which storage mode is used when reading `dtype="string"` columns. 
{user}`flying-sheep` ({pr}`1558`) +- {guilabel}`rc1` Export {func}`~anndata.io.write_elem` and {func}`~anndata.io.read_elem` directly from the main package instead of `experimental` {user}`ilan-gold` ({pr}`1598`) +- {guilabel}`rc1` Allow reading sparse data (via {func}`~anndata.io.read_elem` or {func}`~anndata.io.sparse_dataset`) into either {class}`scipy.sparse.csr_array` or {class}`scipy.sparse.csc_array` via {attr}`anndata.settings.use_sparse_array_on_read` {user}`ilan-gold` ({pr}`1633`) diff --git a/docs/release-notes/0.11.0rc1.md b/docs/release-notes/0.11.0rc1.md deleted file mode 100644 index f5a98086d..000000000 --- a/docs/release-notes/0.11.0rc1.md +++ /dev/null @@ -1,32 +0,0 @@ -(v0.11.0rc1)= -### 0.11.0rc1 {small}`2024-09-04` - -#### Breaking changes - -- Removed deprecated modules `anndata.core` and `anndata.readwrite` {user}`ivirshup` ({pr}`1197`) -- No longer export `sparse_dataset` from `anndata.experimental`, instead exporting {func}`anndata.io.sparse_dataset` {user}`ilan-gold` ({pr}`1642`) -- Move `RWAble` and `InMemoryElem` out of `experimental`, renaming `RWAble` to {type}`~anndata.typing.AxisStorable` and `InMemoryElem` to {type}`~anndata.typing.RWAble` {user}`ilan-gold` ({pr}`1643`) - -#### Development Process - -- create new `cupy` installation options for cuda 11 & 12 called `cu11` and `cu12` {user}`Intron7` ({pr}`1596`) - -#### Documentation - -- Correct {attr}`anndata.AnnData.X` type to include {class}`~anndata.abc.CSRDataset` and {class}`~anndata.abc.CSCDataset` as possible types and being deprecation process for non-csr/csc {class}`scipy.sparse.spmatrix` types in {attr}`anndata.AnnData.X` {user}`ilan-gold` ({pr}`1616`) - -#### Features - -- `scipy.sparse.csr_array` and `scipy.sparse.csc_array` are now supported when constructing `AnnData` objects {user}`ilan-gold` {user}`isaac-virshup` ({pr}`1028`) -- Allow `axis` parameter of e.g. 
{func}`anndata.concat` to accept `'obs'` and `'var'` {user}`flying-sheep` ({pr}`1244`) -- Add `settings` object with methods for altering internally-used options, like checking for uniqueness on `obs`' index {user}`ilan-gold` ({pr}`1270`) -- Add {attr}`~anndata.settings.remove_unused_categories` option to {attr}`anndata.settings` to override current behavior {user}`ilan-gold` ({pr}`1340`) -- Add {func}`~anndata.experimental.read_elem_as_dask` function to handle i/o with sparse and dense arrays {user}`ilan-gold` ({pr}`1469`) -- Add ability to convert strings to categoricals on write in {meth}`~anndata.AnnData.write_h5ad` and {meth}`~anndata.AnnData.write_zarr` via `convert_strings_to_categoricals` parameter {user}` falexwolf` ({pr}`1474`) -- Add {attr}`~anndata.settings.check_uniqueness` option to {attr}`anndata.settings` to override current behavior {user}`ilan-gold` ({pr}`1507`) -- Add functionality to write from GPU {class}`dask.array.Array` to disk {user}`ilan-gold` ({pr}`1550`) -- Read and write support for nullable string arrays ({class}`pandas.arrays.StringArray`). - Use pandas’ {doc}`pandas:user_guide/options` `mode.string_storage` to control which storage mode is used when reading `dtype="string"` columns. 
- {user}`flying-sheep` ({pr}`1558`) -- Export {func}`~anndata.io.write_elem` and {func}`~anndata.io.read_elem` directly from the main package instead of `experimental` {user}`ilan-gold` ({pr}`1598`) -- Allow reading sparse data (via {func}`~anndata.io.read_elem` or {func}`~anndata.io.sparse_dataset`) into either {class}`scipy.sparse.csr_array` or {class}`scipy.sparse.csc_array` via {attr}`anndata.settings.use_sparse_array_on_read` {user}`ilan-gold` ({pr}`1633`) diff --git a/docs/release-notes/0.11.0rc2.md b/docs/release-notes/0.11.0rc2.md deleted file mode 100644 index ecb5049d2..000000000 --- a/docs/release-notes/0.11.0rc2.md +++ /dev/null @@ -1,12 +0,0 @@ -(v0.11.0rc2)= -### 0.11.0rc2 {small}`2024-09-24` - -#### Development Process - -- Add extra `dask` dependency for installation i.e., `pip install anndata[dask]` {user}`ilan-gold` ({pr}`1677`) -- Remove `shall_` from variable names in `settings` {user}`ilan-gold` ({pr}`1685`) - -#### Breaking changes - -- A new `anndata.io` module contains all `read_*` and `write_*` functions, and all imports of such functions should go through this module. -Old ways of importing these functions i.e., `from anndata import read_csv` or `from anndata._io.specs import read_elem` will still work, but are now considered deprecated and give a warning on import with the exception of {func}`anndata.io.read_zarr` and {func}`anndata.io.read_h5ad`, which will remain at the top-level `anndata` without warning. 
`user`{ilan-gold} ({pr}`1682`) diff --git a/docs/release-notes/0.11.0rc3.md b/docs/release-notes/0.11.0rc3.md deleted file mode 100644 index 417b003cb..000000000 --- a/docs/release-notes/0.11.0rc3.md +++ /dev/null @@ -1,6 +0,0 @@ -(v0.11.0rc3)= -### 0.11.0rc3 {small}`2024-10-14` - -### Breaking changes - -- Drop support for `python` 3.9 {user}`ilan-gold` ({pr}`1712`) diff --git a/docs/release-notes/1719.bugfix.md b/docs/release-notes/1719.bugfix.md deleted file mode 100644 index e7511f0fa..000000000 --- a/docs/release-notes/1719.bugfix.md +++ /dev/null @@ -1 +0,0 @@ -Ensure {func}`anndata.concat` of {class}`~anndata.AnnData` object with {class}`scipy.sparse.spmatrix` and {class}`scipy.sparse.sparray` dask arrays uses the correct fill value of 0. {user}`ilan-gold` diff --git a/docs/release-notes/1725.bugfix.md b/docs/release-notes/1725.bugfix.md new file mode 100644 index 000000000..c9a6dca3d --- /dev/null +++ b/docs/release-notes/1725.bugfix.md @@ -0,0 +1,2 @@ + +Remove upper pin on `dask` and exclude versions broken with sparse indexing {user}`ilan-gold` diff --git a/docs/release-notes/1729.feature.md b/docs/release-notes/1729.feature.md deleted file mode 100644 index a7f55361b..000000000 --- a/docs/release-notes/1729.feature.md +++ /dev/null @@ -1 +0,0 @@ -Add support for ellipsis indexing of the {class}`~anndata.AnnData` object {user}`ilan-gold` diff --git a/docs/release-notes/1736.bugfix.md b/docs/release-notes/1736.bugfix.md deleted file mode 100644 index 58cd65448..000000000 --- a/docs/release-notes/1736.bugfix.md +++ /dev/null @@ -1,2 +0,0 @@ -Ensure that views of AwkwardArrays have their "view" attributes removed on saving an {class}`~anndata.AnnData` object -to disk. 
{user}`grst` diff --git a/docs/release-notes/1743.bugfix.md b/docs/release-notes/1743.bugfix.md new file mode 100644 index 000000000..f8f489aff --- /dev/null +++ b/docs/release-notes/1743.bugfix.md @@ -0,0 +1 @@ +Fix chunking with -1 in `chunks` argument of {func}`~anndata.experimental.read_elem_as_dask` {user}`ilan-gold` diff --git a/docs/release-notes/1754.bugfix.md b/docs/release-notes/1754.bugfix.md new file mode 100644 index 000000000..492553ef1 --- /dev/null +++ b/docs/release-notes/1754.bugfix.md @@ -0,0 +1 @@ +Fix `cupy<0.13` imports in non-gpu environments {user}`ilan-gold` diff --git a/pyproject.toml b/pyproject.toml index 127ae3dc1..3cc1b31a3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -107,7 +107,7 @@ gpu = ["cupy"] cu12 = ["cupy-cuda12x"] cu11 = ["cupy-cuda11x"] # https://github.com/dask/dask/issues/11290 -dask = ["dask[array]>=2022.09.2,<2024.8.0"] +dask = ["dask[array]>=2022.09.2,!=2024.8.*,!=2024.9.*"] [tool.hatch.version] source = "vcs" @@ -181,6 +181,7 @@ select = [ "ICN", # Follow import conventions "PTH", # Pathlib instead of os.path "PT", # Pytest conventions + "PYI", # Typing ] ignore = [ # line too long -> we accept long comment lines; formatter gets rid of long code lines diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py index 7ef9f8ac4..8a8eaf949 100644 --- a/src/anndata/_core/anndata.py +++ b/src/anndata/_core/anndata.py @@ -1169,9 +1169,7 @@ def _inplace_subset_obs(self, index: Index1D): self._init_as_actual(adata_subset) # TODO: Update, possibly remove - def __setitem__( - self, index: Index, val: int | float | np.ndarray | sparse.spmatrix - ): + def __setitem__(self, index: Index, val: float | np.ndarray | sparse.spmatrix): if self.is_view: raise ValueError("Object is view and cannot be accessed with `[]`.") obs, var = self._normalize_indices(index) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 48770be9c..a34f627e7 100644 --- 
a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -105,12 +105,16 @@ def read_sparse_as_dask( if chunks is not None: if len(chunks) != 2: raise ValueError("`chunks` must be a tuple of two integers") - if chunks[minor_dim] != shape[minor_dim]: + if chunks[minor_dim] not in {shape[minor_dim], -1, None}: raise ValueError( "Only the major axis can be chunked. " f"Try setting chunks to {((-1, _DEFAULT_STRIDE) if is_csc else (_DEFAULT_STRIDE, -1))}" ) - stride = chunks[major_dim] + stride = ( + chunks[major_dim] + if chunks[major_dim] not in {None, -1} + else shape[major_dim] + ) shape_minor, shape_major = shape if is_csc else shape[::-1] chunks_major = compute_chunk_layout_for_axis_shape(stride, shape_major) @@ -142,7 +146,11 @@ def read_h5_array( shape = tuple(elem.shape) dtype = elem.dtype chunks: tuple[int, ...] = ( - chunks if chunks is not None else (_DEFAULT_STRIDE,) * len(shape) + tuple( + c if c not in {None, -1} else s for c, s in zip(chunks, shape, strict=True) + ) + if chunks is not None + else (_DEFAULT_STRIDE,) * len(shape) ) chunk_layout = tuple( diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 3b43def7c..ca13f8e59 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -398,6 +398,7 @@ def read_elem_as_dask( Defaults to `(1000, adata.shape[1])` for CSR sparse, `(adata.shape[0], 1000)` for CSC sparse, and the on-disk chunking otherwise for dense. + Can use `-1` or `None` to indicate use of the size of the corresponding dimension. Returns ------- @@ -451,6 +452,11 @@ def read_elem_as_dask( ... g["X"], chunks=(500, adata.shape[1]) ... 
) >>> adata.layers["dense"] = ad.experimental.read_elem_as_dask(g["layers/dense"]) + + We also support using -1 and None as a chunk size to signify reading the whole axis: + + >>> adata.X = ad.experimental.read_elem_as_dask(g["X"], chunks=(500, -1)) + >>> adata.X = ad.experimental.read_elem_as_dask(g["X"], chunks=(500, None)) """ return DaskReader(_LAZY_REGISTRY).read_elem(elem, chunks=chunks) diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 66f8a9e29..2d9eb9980 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -25,6 +25,9 @@ "ArrayStorageType", "GroupStorageType", "StorageType", + "_ReadInternal", + "_ReadDaskInternal", + "_WriteInternal", ] ArrayStorageType: TypeAlias = ZarrArray | H5Array diff --git a/src/anndata/compat/__init__.py b/src/anndata/compat/__init__.py index b5ec3d415..255ffa548 100644 --- a/src/anndata/compat/__init__.py +++ b/src/anndata/compat/__init__.py @@ -140,7 +140,16 @@ def __repr__(): return "mock dask.array.core.Array" -if find_spec("cupy") or TYPE_CHECKING: +# https://github.com/scverse/anndata/issues/1749 +def is_cupy_importable() -> bool: + try: + import cupy # noqa: F401 + except ImportError: + return False + return True + + +if is_cupy_importable() or TYPE_CHECKING: from cupy import ndarray as CupyArray from cupyx.scipy.sparse import csc_matrix as CupyCSCMatrix from cupyx.scipy.sparse import csr_matrix as CupyCSRMatrix diff --git a/tests/conftest.py b/tests/conftest.py index 13fabdb93..9054812f5 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,9 +3,16 @@ from functools import partial from typing import TYPE_CHECKING +import dask import joblib import pytest -from dask.base import normalize_seq, normalize_token, tokenize +from dask.base import normalize_token, tokenize +from packaging.version import Version + +if Version(dask.__version__) < Version("2024.8.0"): + from dask.base import normalize_seq +else: + from dask.tokenize import normalize_seq from scipy import sparse import anndata as 
ad diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py index e46cd7d81..3ca5324b8 100644 --- a/tests/test_io_elementwise.py +++ b/tests/test_io_elementwise.py @@ -284,6 +284,8 @@ def test_read_lazy_2d_dask(sparse_format, store): (2, (200, 400)), (1, None), (2, None), + (2, (400, -1)), + (2, (400, None)), ], ) def test_read_lazy_subsets_nd_dask(store, n_dims, chunks): @@ -316,28 +318,36 @@ def test_read_lazy_h5_cluster(sparse_format, tmp_path): @pytest.mark.parametrize( - ("arr_type", "chunks"), + ("arr_type", "chunks", "expected_chunksize"), [ - ("dense", (100, 100)), - ("csc", (SIZE, 10)), - ("csr", (10, SIZE * 2)), - ("csc", None), - ("csr", None), + ("dense", (100, 100), (100, 100)), + ("csc", (SIZE, 10), (SIZE, 10)), + ("csr", (10, SIZE * 2), (10, SIZE * 2)), + ("csc", None, (SIZE, 1000)), + ("csr", None, (1000, SIZE * 2)), + ("csr", (10, -1), (10, SIZE * 2)), + ("csc", (-1, 10), (SIZE, 10)), + ("csr", (10, None), (10, SIZE * 2)), + ("csc", (None, 10), (SIZE, 10)), + ("csc", (None, None), (SIZE, SIZE * 2)), + ("csr", (None, None), (SIZE, SIZE * 2)), + ("csr", (-1, -1), (SIZE, SIZE * 2)), + ("csc", (-1, -1), (SIZE, SIZE * 2)), ], ) -def test_read_lazy_2d_chunk_kwargs(store, arr_type, chunks): +def test_read_lazy_2d_chunk_kwargs( + store: H5Group | ZarrGroup, + arr_type: Literal["csr", "csc", "dense"], + chunks: None | tuple[int | None, int | None], + expected_chunksize: tuple[int, int], +): if arr_type == "dense": arr_store = create_dense_store(store) X_dask_from_disk = read_elem_as_dask(arr_store["X"], chunks=chunks) else: arr_store = create_sparse_store(arr_type, store) X_dask_from_disk = read_elem_as_dask(arr_store["X"], chunks=chunks) - if chunks is not None: - assert X_dask_from_disk.chunksize == chunks - else: - minor_index = int(arr_type == "csr") - # assert that sparse chunks are set correctly by default - assert X_dask_from_disk.chunksize[minor_index] == SIZE * (1 + minor_index) + assert X_dask_from_disk.chunksize == 
expected_chunksize X_from_disk = read_elem(arr_store["X"]) assert_equal(X_from_disk, X_dask_from_disk)