From 7c2ebe2619781d15fa187aff9a86c7f5448994fb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 18 Nov 2024 22:02:26 +0100 Subject: [PATCH 01/87] chore: update pre-commit hooks (#2502) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.7.3 → v0.7.4](https://github.com/astral-sh/ruff-pre-commit/compare/v0.7.3...v0.7.4) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index dd9660fa5f..7273985572 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,7 +7,7 @@ default_language_version: python: python3 repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.7.3 + rev: v0.7.4 hooks: - id: ruff args: ["--fix", "--show-fixes"] From d005ff77bc4efaf52f8e43d17f11d27f43d19ee0 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Thu, 21 Nov 2024 16:04:23 +0100 Subject: [PATCH 02/87] [v3] Import crc32c through numcodecs (#2510) * Import crc32c through numcodecs * remove more crc32c references --- pyproject.toml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 09797ae3d4..42990f4e8f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,9 +28,8 @@ requires-python = ">=3.11" dependencies = [ 'asciitree', 'numpy>=1.25', - 'numcodecs>=0.14', + 'numcodecs[crc32c]>=0.14', 'fsspec>=2022.10.0', - 'crc32c>=2.3', 'typing_extensions>=4.6', 'donfig>=0.8', ] @@ -193,7 +192,6 @@ dependencies = [ 'fsspec @ git+https://github.com/fsspec/filesystem_spec', 's3fs @ git+https://github.com/fsspec/s3fs', 'universal_pathlib @ git+https://github.com/fsspec/universal_pathlib', - 'crc32c @ git+https://github.com/ICRAR/crc32c', 'typing_extensions @ git+https://github.com/python/typing_extensions', 'donfig @ git+https://github.com/pytroll/donfig', # test deps @@ -227,7 +225,6 @@ dependencies = [ 'fsspec==2022.10.0', 's3fs==2022.10.0', 'universal_pathlib==0.0.22', - 'crc32c==2.3.*', 'typing_extensions==4.6.*', # 4.5 needed for @deprecated, 4.6 for Buffer 'donfig==0.8.*', # test deps From 3dd04ce7194cebd68149ec71eb79e22b66bc57f0 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Thu, 21 Nov 2024 22:58:36 +0100 Subject: [PATCH 03/87] Remove asciitree dependency (#2511) * rm asciitree * no format --- .pre-commit-config.yaml | 4 +--- pyproject.toml | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7273985572..0511cfaf86 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -28,10 +28,8 @@ repos: files: src|tests additional_dependencies: # Package dependencies - - asciitree - - crc32c - donfig - - numcodecs + - numcodecs[crc32c] - numpy - typing_extensions - universal-pathlib diff --git a/pyproject.toml b/pyproject.toml index 42990f4e8f..63ecdd85be 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,6 @@ maintainers = [ requires-python = ">=3.11" # If you add a new dependency here, please also add it to .pre-commit-config.yml dependencies = [ - 'asciitree', 'numpy>=1.25', 'numcodecs[crc32c]>=0.14', 'fsspec>=2022.10.0', From 76904eac556a71817eb7ea2e54df703cba919a12 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Fri, 22 Nov 2024 04:42:51 +0100 Subject: [PATCH 04/87] Refactor/narrow array name type (#2499) * 
array name is not nullable * alter test for nullable array name * assert a.name is not None * assert a.name is not None, outside of the conditional --- src/zarr/core/array.py | 23 +++++++++-------------- src/zarr/testing/strategies.py | 1 + tests/test_array.py | 4 ++-- 3 files changed, 12 insertions(+), 16 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 249168723f..1e815d4d0e 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -810,7 +810,7 @@ def path(self) -> str: return self.store_path.path @property - def name(self) -> str | None: + def name(self) -> str: """Array name following h5py convention. Returns @@ -818,16 +818,14 @@ def name(self) -> str | None: str The name of the array. """ - if self.path: - # follow h5py convention: add leading slash - name = self.path - if name[0] != "/": - name = "/" + name - return name - return None + # follow h5py convention: add leading slash + name = self.path + if not name.startswith("/"): + name = "/" + name + return name @property - def basename(self) -> str | None: + def basename(self) -> str: """Final component of name. Returns @@ -835,9 +833,7 @@ def basename(self) -> str | None: str The basename or final component of the array name. """ - if self.name is not None: - return self.name.split("/")[-1] - return None + return self.name.split("/")[-1] @property def cdata_shape(self) -> ChunkCoords: @@ -1626,8 +1622,7 @@ def path(self) -> str: return self._async_array.path @property - def name(self) -> str | None: - """Array name following h5py convention.""" + def name(self) -> str: return self._async_array.name @property diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py index aed5b82e57..f0a7e97d3a 100644 --- a/src/zarr/testing/strategies.py +++ b/src/zarr/testing/strategies.py @@ -153,6 +153,7 @@ def arrays( assert isinstance(a, Array) if a.metadata.zarr_format == 3: assert a.fill_value is not None + assert a.name is not None assert isinstance(root[array_path], Array) assert nparray.shape == a.shape assert chunks == a.chunks diff --git a/tests/test_array.py b/tests/test_array.py index 975873053d..f0f36cf70d 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -122,8 +122,8 @@ def test_array_name_properties_no_group( ) -> None: arr = Array.create(store=store, shape=(100,), chunks=(10,), zarr_format=zarr_format, dtype="i4") assert arr.path == "" - assert arr.name is None - assert arr.basename is None + assert arr.name == "/" + assert arr.basename == "" @pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"]) From 7be0ac9c41978fd770023276950522808631998e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 26 Nov 2024 15:33:43 +0100 Subject: [PATCH 05/87] chore: update pre-commit hooks (#2517) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.7.4 → v0.8.0](https://github.com/astral-sh/ruff-pre-commit/compare/v0.7.4...v0.8.0) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0511cfaf86..6506f7349d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,7 +7,7 @@ default_language_version: python: python3 repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.7.4 + rev: 
v0.8.0
     hooks:
       - id: ruff
         args: ["--fix", "--show-fixes"]

From 29612464491cb53c782b46e9246fd951a4185695 Mon Sep 17 00:00:00 2001
From: Dimitri Papadopoulos Orfanos <3234522+DimitriPapadopoulos@users.noreply.github.com>
Date: Fri, 29 Nov 2024 16:56:38 +0100
Subject: [PATCH 06/87] Multiple imports for an import name (#2504)

---
 tests/test_indexing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_indexing.py b/tests/test_indexing.py
index 479ce201b0..3dc93ba474 100644
--- a/tests/test_indexing.py
+++ b/tests/test_indexing.py
@@ -12,7 +12,7 @@

 import zarr
 from zarr import Array
-from zarr.core.buffer import BufferPrototype, default_buffer_prototype
+from zarr.core.buffer import default_buffer_prototype
 from zarr.core.indexing import (
     BasicSelection,
     CoordinateSelection,

From 206d145a5aedfccdbd2b44e4c40916b14530b239 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Fri, 29 Nov 2024 10:37:17 -0600
Subject: [PATCH 07/87] Added Array.info_complete (#2514)

Now that Store.getsize is a thing, we can do info_complete which
includes the number of chunks written and the size of those bytes.

Co-authored-by: Davis Bennett
Co-authored-by: Norman Rzepka
---
 src/zarr/core/array.py | 53 ++++++++++++++++++++++-----
 tests/test_array.py    | 82 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 127 insertions(+), 8 deletions(-)

diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py
index 1e815d4d0e..71a6f9d380 100644
--- a/src/zarr/core/array.py
+++ b/src/zarr/core/array.py
@@ -1346,18 +1346,53 @@ def info(self) -> Any:
         AsyncArray.info_complete
             All information about a group, including dynamic information
             like the number of bytes and chunks written.
+
+        Examples
+        --------
+
+        >>> arr = await zarr.api.asynchronous.create(
+        ...     path="array", shape=(3, 4, 5), chunks=(2, 2, 2)
+        ... )
+        >>> arr.info
+        Type               : Array
+        Zarr format        : 3
+        Data type          : DataType.float64
+        Shape              : (3, 4, 5)
+        Chunk shape        : (2, 2, 2)
+        Order              : C
+        Read-only          : False
+        Store type         : MemoryStore
+        Codecs             : [{'endian': <Endian.little: 'little'>}]
+        No. bytes          : 480
         """
         return self._info()

     async def info_complete(self) -> Any:
-        # TODO: get the size of the object from the store.
-        extra = {
-            "count_chunks_initialized": await self.nchunks_initialized(),
-            # count_bytes_stored isn't yet implemented.
-        }
-        return self._info(extra=extra)
-
-    def _info(self, extra: dict[str, int] | None = None) -> Any:
+        """
+        Return all the information for an array, including dynamic information like a storage size.
+
+        In addition to the static information, this provides
+
+        - The count of chunks initialized
+        - The sum of the bytes written
+
+        Returns
+        -------
+        ArrayInfo
+
+        See Also
+        --------
+        AsyncArray.info
+            A property giving just the statically known information about an array.
+        
+ """ + return self._info( + await self.nchunks_initialized(), + await self.store_path.store.getsize_prefix(self.store_path.path), + ) + + def _info( + self, count_chunks_initialized: int | None = None, count_bytes_stored: int | None = None + ) -> Any: kwargs: dict[str, Any] = {} if self.metadata.zarr_format == 2: assert isinstance(self.metadata, ArrayV2Metadata) @@ -1386,6 +1421,8 @@ def _info(self, extra: dict[str, int] | None = None) -> Any: _read_only=self.read_only, _store_type=type(self.store_path.store).__name__, _count_bytes=self.dtype.itemsize * self.size, + _count_bytes_stored=count_bytes_stored, + _count_chunks_initialized=count_chunks_initialized, **kwargs, ) diff --git a/tests/test_array.py b/tests/test_array.py index f0f36cf70d..86da801d1f 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1,3 +1,4 @@ +import dataclasses import json import math import pickle @@ -474,6 +475,87 @@ def test_info_v3(self) -> None: ) assert result == expected + def test_info_complete(self) -> None: + arr = zarr.create(shape=(4, 4), chunks=(2, 2), zarr_format=3) + result = arr.info_complete() + expected = ArrayInfo( + _zarr_format=3, + _data_type=DataType.parse("float64"), + _shape=(4, 4), + _chunk_shape=(2, 2), + _order="C", + _read_only=False, + _store_type="MemoryStore", + _codecs=[BytesCodec()], + _count_bytes=128, + _count_chunks_initialized=0, + _count_bytes_stored=373, # the metadata? + ) + assert result == expected + + arr[:2, :2] = 10 + result = arr.info_complete() + expected = dataclasses.replace( + expected, _count_chunks_initialized=1, _count_bytes_stored=405 + ) + assert result == expected + + async def test_info_v2_async(self) -> None: + arr = await zarr.api.asynchronous.create(shape=(4, 4), chunks=(2, 2), zarr_format=2) + result = arr.info + expected = ArrayInfo( + _zarr_format=2, + _data_type=np.dtype("float64"), + _shape=(4, 4), + _chunk_shape=(2, 2), + _order="C", + _read_only=False, + _store_type="MemoryStore", + _count_bytes=128, + ) + assert result == expected + + async def test_info_v3_async(self) -> None: + arr = await zarr.api.asynchronous.create(shape=(4, 4), chunks=(2, 2), zarr_format=3) + result = arr.info + expected = ArrayInfo( + _zarr_format=3, + _data_type=DataType.parse("float64"), + _shape=(4, 4), + _chunk_shape=(2, 2), + _order="C", + _read_only=False, + _store_type="MemoryStore", + _codecs=[BytesCodec()], + _count_bytes=128, + ) + assert result == expected + + async def test_info_complete_async(self) -> None: + arr = await zarr.api.asynchronous.create(shape=(4, 4), chunks=(2, 2), zarr_format=3) + result = await arr.info_complete() + expected = ArrayInfo( + _zarr_format=3, + _data_type=DataType.parse("float64"), + _shape=(4, 4), + _chunk_shape=(2, 2), + _order="C", + _read_only=False, + _store_type="MemoryStore", + _codecs=[BytesCodec()], + _count_bytes=128, + _count_chunks_initialized=0, + _count_bytes_stored=373, # the metadata? 
+ ) + assert result == expected + + await arr.setitem((slice(2), slice(2)), 10) + result = await arr.info_complete() + expected = dataclasses.replace( + expected, _count_chunks_initialized=1, _count_bytes_stored=405 + ) + assert result == expected + @pytest.mark.parametrize("store", ["memory"], indirect=True) @pytest.mark.parametrize("zarr_format", [2, 3]) From cdd6a7484bbc2430b1883a2bf2d2ad8ad7943885 Mon Sep 17 00:00:00 2001 From: Hannes Spitz <44113112+brokkoli71@users.noreply.github.com> Date: Fri, 29 Nov 2024 17:59:16 +0100 Subject: [PATCH 08/87] Fix iterating over sharding index (#2392) * test_sharding_with_empty_inner_chunk * tests for failing read with sharding * replace morton order by np unravel index * format * Revert "replace morton order by np unravel index" This reverts commit adc3240108ea61a9df9dfa8e40f7cf20a94eed77. * skip morton indices out of bound * improve test_sharding_with_chunks_per_shard * format --------- Co-authored-by: Norman Rzepka --- src/zarr/core/indexing.py | 11 +++++++++-- tests/test_codecs/test_codecs.py | 20 ++++++++++++++++++++ tests/test_codecs/test_sharding.py | 29 +++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 2 deletions(-) diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index 3d47f5f183..ca227be094 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -1346,8 +1346,15 @@ def decode_morton(z: int, chunk_shape: ChunkCoords) -> ChunkCoords: def morton_order_iter(chunk_shape: ChunkCoords) -> Iterator[ChunkCoords]: - for i in range(product(chunk_shape)): - yield decode_morton(i, chunk_shape) + i = 0 + order: list[ChunkCoords] = [] + while len(order) < product(chunk_shape): + m = decode_morton(i, chunk_shape) + if m not in order and all(x < y for x, y in zip(m, chunk_shape, strict=False)): + order.append(m) + i += 1 + for j in range(product(chunk_shape)): + yield order[j] def c_order_iter(chunks_per_shard: ChunkCoords) -> Iterator[ChunkCoords]: diff --git a/tests/test_codecs/test_codecs.py b/tests/test_codecs/test_codecs.py index dfb8e1c595..2025e72937 100644 --- a/tests/test_codecs/test_codecs.py +++ b/tests/test_codecs/test_codecs.py @@ -204,6 +204,26 @@ def test_morton() -> None: ] +@pytest.mark.parametrize( + "shape", + [ + [2, 2, 2], + [5, 2], + [2, 5], + [2, 9, 2], + [3, 2, 12], + [2, 5, 1], + [4, 3, 6, 2, 7], + [3, 2, 1, 6, 4, 5, 2], + ], +) +def test_morton2(shape) -> None: + order = list(morton_order_iter(shape)) + for i, x in enumerate(order): + assert x not in order[:i] # no duplicates + assert all(x[j] < shape[j] for j in range(len(shape))) # all indices are within bounds + + @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) def test_write_partial_chunks(store: Store) -> None: data = np.arange(0, 256, dtype="uint16").reshape((16, 16)) diff --git a/tests/test_codecs/test_sharding.py b/tests/test_codecs/test_sharding.py index 78f32fef0e..51c82067f3 100644 --- a/tests/test_codecs/test_sharding.py +++ b/tests/test_codecs/test_sharding.py @@ -393,3 +393,32 @@ async def test_sharding_with_empty_inner_chunk( print("read data") data_read = await a.getitem(...) 
assert np.array_equal(data_read, data) + + +@pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) +@pytest.mark.parametrize( + "index_location", + [ShardingCodecIndexLocation.start, ShardingCodecIndexLocation.end], +) +@pytest.mark.parametrize("chunks_per_shard", [(5, 2), (2, 5), (5, 5)]) +async def test_sharding_with_chunks_per_shard( + store: Store, index_location: ShardingCodecIndexLocation, chunks_per_shard: tuple[int] +) -> None: + chunk_shape = (2, 1) + shape = [x * y for x, y in zip(chunks_per_shard, chunk_shape, strict=False)] + data = np.ones(np.prod(shape), dtype="int32").reshape(shape) + fill_value = 42 + + path = f"test_sharding_with_chunks_per_shard_{index_location}" + spath = StorePath(store, path) + a = Array.create( + spath, + shape=shape, + chunk_shape=shape, + dtype="int32", + fill_value=fill_value, + codecs=[ShardingCodec(chunk_shape=chunk_shape, index_location=index_location)], + ) + a[...] = data + data_read = a[...] + assert np.array_equal(data_read, data) From f8e3432305279e1104219ea9cf998a75009c40a5 Mon Sep 17 00:00:00 2001 From: Dimitri Papadopoulos Orfanos <3234522+DimitriPapadopoulos@users.noreply.github.com> Date: Fri, 29 Nov 2024 18:51:23 +0100 Subject: [PATCH 09/87] Imported name is not used anywhere in the module (#2520) --- tests/test_store/test_logging.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_store/test_logging.py b/tests/test_store/test_logging.py index aed1f06fa2..c0630dffd8 100644 --- a/tests/test_store/test_logging.py +++ b/tests/test_store/test_logging.py @@ -5,7 +5,6 @@ import pytest import zarr -import zarr.storage from zarr.core.buffer import default_buffer_prototype from zarr.storage.logging import LoggingStore From 90b3aea9e6fee4dd25abbe40bcff0c6508910a17 Mon Sep 17 00:00:00 2001 From: Dimitri Papadopoulos Orfanos <3234522+DimitriPapadopoulos@users.noreply.github.com> Date: Sun, 1 Dec 2024 21:40:53 +0100 Subject: [PATCH 10/87] Update ruff and associated changes (#2522) * Upgrade ruff to 0.8.1 Remove deprecated rules: https://astral.sh/blog/ruff-v0.8.0#removal-of-six-deprecated-rules * Apply ruff/flake8-pyi rule PYI061 PYI061 `Literal[None, ...]` can be replaced with `Literal[...] 
| None` --- .pre-commit-config.yaml | 2 +- pyproject.toml | 5 ----- src/zarr/core/group.py | 4 ++-- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6506f7349d..6068d0003d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,7 +7,7 @@ default_language_version: python: python3 repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.8.0 + rev: v0.8.1 hooks: - id: ruff args: ["--fix", "--show-fixes"] diff --git a/pyproject.toml b/pyproject.toml index 63ecdd85be..c4c3a7a4e2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -294,11 +294,7 @@ extend-select = [ "W", # pycodestyle warnings ] ignore = [ - "ANN101", # deprecated - "ANN102", # deprecated "ANN401", - "PT004", # deprecated - "PT005", # deprecated "PT011", # TODO: apply this rule "PT012", # TODO: apply this rule "RET505", @@ -306,7 +302,6 @@ ignore = [ "RUF005", "SIM108", "TRY003", - "UP027", # deprecated "UP038", # https://github.com/astral-sh/ruff/issues/7871 # https://docs.astral.sh/ruff/formatter/#conflicting-lint-rules "W191", diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 13a8c7209a..2ca6e209fd 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -434,7 +434,7 @@ async def from_store( async def open( cls, store: StoreLike, - zarr_format: Literal[2, 3, None] = 3, + zarr_format: Literal[2, 3] | None = 3, use_consolidated: bool | str | None = None, ) -> AsyncGroup: """Open a new AsyncGroup @@ -1691,7 +1691,7 @@ def from_store( def open( cls, store: StoreLike, - zarr_format: Literal[2, 3, None] = 3, + zarr_format: Literal[2, 3] | None = 3, ) -> Group: """Open a group from an initialized store. From 501ae9eaf1f5df774b2981db7020a4e6d1490606 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Mon, 2 Dec 2024 03:25:39 +0100 Subject: [PATCH 11/87] [v3] Makes data contiguous in v2 codec (#2515) * fixes #2501 * typing * only use c-contiguous * more tests * typing * tests * astype with copy=False --------- Co-authored-by: Deepak Cherian --- src/zarr/codecs/_v2.py | 6 ++++- src/zarr/core/buffer/core.py | 8 +++++- tests/test_v2.py | 51 +++++++++++++++++++++++++++++++++--- 3 files changed, 60 insertions(+), 5 deletions(-) diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index 30504ad204..df0d8ecb0a 100644 --- a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -5,7 +5,7 @@ from typing import TYPE_CHECKING import numcodecs -from numcodecs.compat import ensure_ndarray_like +from numcodecs.compat import ensure_bytes, ensure_ndarray_like from zarr.abc.codec import ArrayBytesCodec from zarr.registry import get_ndbuffer_class @@ -68,6 +68,9 @@ async def _encode_single( ) -> Buffer | None: chunk = chunk_array.as_ndarray_like() + # ensure contiguous and correct order + chunk = chunk.astype(chunk_spec.dtype, order=chunk_spec.order, copy=False) + # apply filters if self.filters: for f in self.filters: @@ -83,6 +86,7 @@ async def _encode_single( else: cdata = chunk + cdata = ensure_bytes(cdata) return chunk_spec.prototype.buffer.from_bytes(cdata) def compute_encoded_size(self, _input_byte_length: int, _chunk_spec: ArraySpec) -> int: diff --git a/src/zarr/core/buffer/core.py b/src/zarr/core/buffer/core.py index 9a07583c93..7ddedfe064 100644 --- a/src/zarr/core/buffer/core.py +++ b/src/zarr/core/buffer/core.py @@ -80,7 +80,13 @@ def reshape( def view(self, dtype: npt.DTypeLike) -> Self: ... - def astype(self, dtype: npt.DTypeLike, order: Literal["K", "A", "C", "F"] = ...) -> Self: ... 

+    def astype(
+        self,
+        dtype: npt.DTypeLike,
+        order: Literal["K", "A", "C", "F"] = ...,
+        *,
+        copy: bool = ...,
+    ) -> Self: ...

     def fill(self, value: Any) -> None: ...

diff --git a/tests/test_v2.py b/tests/test_v2.py
index e6b50ab2ae..68c07e2024 100644
--- a/tests/test_v2.py
+++ b/tests/test_v2.py
@@ -1,6 +1,6 @@
 import json
 from collections.abc import Iterator
-from typing import Any
+from typing import Any, Literal

 import numcodecs.vlen
 import numpy as np
@@ -126,9 +126,54 @@ async def test_create_dtype_str(dtype: Any) -> None:


 @pytest.mark.parametrize("filters", [[], [numcodecs.Delta(dtype="<i4")], [numcodecs.Zlib()]])
-def test_v2_filters_codecs(filters: Any) -> None:
+@pytest.mark.parametrize("order", ["C", "F"])
+def test_v2_filters_codecs(filters: Any, order: Literal["C", "F"]) -> None:
     array_fixture = [42]
-    arr = zarr.create(shape=1, dtype="<i4", filters=filters)
+    arr = zarr.create(shape=1, dtype="<i4", filters=filters, order=order)
     arr[:] = array_fixture
     assert_array_equal(arr[:], array_fixture)
+
+
+@pytest.mark.parametrize("array_order", ["C", "F"])
+@pytest.mark.parametrize("data_order", ["C", "F"])
+def test_v2_non_contiguous(array_order: Literal["C", "F"], data_order: Literal["C", "F"]) -> None:
+    arr = zarr.Array.create(
+        MemoryStore({}),
+        shape=(10, 8),
+        chunks=(3, 3),
+        fill_value=np.nan,
+        dtype="float64",
+        zarr_format=2,
+        exists_ok=True,
+        order=array_order,
+    )
+
+    # Non-contiguous write
+    a = np.arange(arr.shape[0] * arr.shape[1]).reshape(arr.shape, order=data_order)
+    arr[slice(6, 9, None), slice(3, 6, None)] = a[
+        slice(6, 9, None), slice(3, 6, None)
+    ]  # The slice on the RHS is important
+    np.testing.assert_array_equal(
+        arr[slice(6, 9, None), slice(3, 6, None)], a[slice(6, 9, None), slice(3, 6, None)]
+    )
+
+    arr = zarr.Array.create(
+        MemoryStore({}),
+        shape=(10, 8),
+        chunks=(3, 3),
+        fill_value=np.nan,
+        dtype="float64",
+        zarr_format=2,
+        exists_ok=True,
+        order=array_order,
+    )
+
+    # Contiguous write
+    a = np.arange(9).reshape((3, 3), order=data_order)
+    if data_order == "F":
+        assert a.flags.f_contiguous
+    else:
+        assert a.flags.c_contiguous
+    arr[slice(6, 9, None), slice(3, 6, None)] = a
+    np.testing.assert_array_equal(arr[slice(6, 9, None), slice(3, 6, None)], a)

From 58ff3ec25583de1d370684af18b755dc5b6e8444 Mon Sep 17 00:00:00 2001
From: Deepak Cherian
Date: Mon, 2 Dec 2024 09:39:25 -0700
Subject: [PATCH 12/87] Error when attempting to set with an array of
 incompatible shape. (#2512)

* Error when attempt to set with an array of incompatible shape.

Closes #2469

* Fix typing

* fix isinstance check
---
 src/zarr/core/array.py |  8 ++++++++
 tests/test_indexing.py | 18 ++++++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py
index 71a6f9d380..a6317e7a9e 100644
--- a/src/zarr/core/array.py
+++ b/src/zarr/core/array.py
@@ -2881,6 +2881,14 @@ def set_coordinate_selection(
         if hasattr(value, "shape") and len(value.shape) > 1:
             value = np.array(value).reshape(-1)

+        if not is_scalar(value, self.dtype) and (
+            isinstance(value, NDArrayLike) and indexer.shape != value.shape
+        ):
+            raise ValueError(
+                f"Attempting to set a selection of {indexer.sel_shape[0]} "
+                f"elements with an array of {value.shape[0]} elements."
+            )
+
         sync(self._async_array._set_selection(indexer, value, fields=fields, prototype=prototype))

     @_deprecate_positional_args
diff --git a/tests/test_indexing.py b/tests/test_indexing.py
index 3dc93ba474..04eb53e364 100644
--- a/tests/test_indexing.py
+++ b/tests/test_indexing.py
@@ -1936,3 +1936,21 @@ def test_zero_sized_chunks(store: StorePath, shape: list[int]) -> None:
     z = Array.create(store=store, shape=shape, chunk_shape=shape, zarr_format=3, dtype="f8")
     z[...] 
= 42 assert_array_equal(z[...], np.zeros(shape, dtype="f8")) + + +@pytest.mark.parametrize("store", ["memory"], indirect=["store"]) +def test_vectorized_indexing_incompatible_shape(store) -> None: + # GH2469 + shape = (4, 4) + chunks = (2, 2) + fill_value = 32767 + arr = zarr.create( + shape, + store=store, + chunks=chunks, + dtype=np.int16, + fill_value=fill_value, + codecs=[zarr.codecs.BytesCodec(), zarr.codecs.BloscCodec()], + ) + with pytest.raises(ValueError, match="Attempting to set"): + arr[np.array([1, 2]), np.array([1, 2])] = np.array([[-1, -2], [-3, -4]]) From 44bcd16f208b3eefc167bc95e26f7d2c8834f805 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 2 Dec 2024 21:53:41 -0700 Subject: [PATCH 13/87] Bump sphinx-autoapi from 3.3.3 to 3.4.0 in the actions group (#2525) Bumps the actions group with 1 update: [sphinx-autoapi](https://github.com/readthedocs/sphinx-autoapi). Updates `sphinx-autoapi` from 3.3.3 to 3.4.0 - [Release notes](https://github.com/readthedocs/sphinx-autoapi/releases) - [Changelog](https://github.com/readthedocs/sphinx-autoapi/blob/main/CHANGELOG.rst) - [Commits](https://github.com/readthedocs/sphinx-autoapi/compare/v3.3.3...v3.4.0) --- updated-dependencies: - dependency-name: sphinx-autoapi dependency-type: direct:production update-type: version-update:semver-minor dependency-group: actions ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c4c3a7a4e2..16595c6b5b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,7 +81,7 @@ gpu = [ docs = [ 'sphinx==8.1.3', 'sphinx-autobuild>=2021.3.14', - 'sphinx-autoapi==3.3.3', + 'sphinx-autoapi==3.4.0', 'sphinx_design', 'sphinx-issues', 'sphinx-copybutton', From bda158a600592c1eb90706980f8c76f27e6a10a8 Mon Sep 17 00:00:00 2001 From: Dimitri Papadopoulos Orfanos <3234522+DimitriPapadopoulos@users.noreply.github.com> Date: Tue, 3 Dec 2024 06:28:53 +0100 Subject: [PATCH 14/87] =?UTF-8?q?zarr.store=20=E2=86=92=20zarr.storage=20(?= =?UTF-8?q?#2523)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Also discard commented out code. Code commented out without a comment doesn't help. The reason here was circular imports. --- src/zarr/api/asynchronous.py | 6 +++--- src/zarr/storage/common.py | 2 -- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 3f36614cc2..e5d09f8c3a 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -653,12 +653,12 @@ async def open_group( Store or path to directory in file system or name of zip file. Strings are interpreted as paths on the local file system - and used as the ``root`` argument to :class:`zarr.store.LocalStore`. + and used as the ``root`` argument to :class:`zarr.storage.LocalStore`. Dictionaries are used as the ``store_dict`` argument in - :class:`zarr.store.MemoryStore``. + :class:`zarr.storage.MemoryStore``. - By default (``store=None``) a new :class:`zarr.store.MemoryStore` + By default (``store=None``) a new :class:`zarr.storage.MemoryStore` is created. 
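
        For example (illustrative; the path is arbitrary):

        >>> group = await zarr.api.asynchronous.open_group(store={})  # in-memory
        >>> group = await zarr.api.asynchronous.open_group(store="data/example.zarr")

    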
mode : {'r', 'r+', 'a', 'w', 'w-'}, optional diff --git a/src/zarr/storage/common.py b/src/zarr/storage/common.py index 1e33967414..e9d57197e1 100644 --- a/src/zarr/storage/common.py +++ b/src/zarr/storage/common.py @@ -12,8 +12,6 @@ from zarr.storage.local import LocalStore from zarr.storage.memory import MemoryStore -# from zarr.store.remote import RemoteStore - if TYPE_CHECKING: from zarr.core.buffer import BufferPrototype From 6ee3d400a8f337fefe1c5373e27fc9db2f958b92 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Wed, 4 Dec 2024 22:06:31 -0700 Subject: [PATCH 15/87] Move stateful tests to public testing API (#2531) * Move stateful tests to public testing API * consolidate a bit --- pyproject.toml | 1 + src/zarr/testing/stateful.py | 437 ++++++++++++++++++++ tests/test_store/test_stateful.py | 33 ++ tests/test_store/test_stateful_hierarchy.py | 223 ---------- tests/test_store/test_stateful_store.py | 249 ----------- 5 files changed, 471 insertions(+), 472 deletions(-) create mode 100644 src/zarr/testing/stateful.py create mode 100644 tests/test_store/test_stateful.py delete mode 100644 tests/test_store/test_stateful_hierarchy.py delete mode 100644 tests/test_store/test_stateful_store.py diff --git a/pyproject.toml b/pyproject.toml index 16595c6b5b..888d0e0eb4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -342,6 +342,7 @@ ignore_errors = true [[tool.mypy.overrides]] module = [ + "zarr.testing.stateful", # lots of hypothesis decorator errors "tests.package_with_entrypoint.*", "tests.test_codecs.test_codecs", "tests.test_codecs.test_transpose", diff --git a/src/zarr/testing/stateful.py b/src/zarr/testing/stateful.py new file mode 100644 index 0000000000..cc0f220807 --- /dev/null +++ b/src/zarr/testing/stateful.py @@ -0,0 +1,437 @@ +import builtins +from typing import Any + +import hypothesis.extra.numpy as npst +import hypothesis.strategies as st +import numpy as np +from hypothesis import assume, note +from hypothesis.stateful import ( + RuleBasedStateMachine, + initialize, + invariant, + precondition, + rule, +) +from hypothesis.strategies import DataObject + +import zarr +from zarr import Array +from zarr.abc.store import Store +from zarr.core.buffer import Buffer, BufferPrototype, cpu, default_buffer_prototype +from zarr.core.sync import SyncMixin +from zarr.storage import LocalStore, MemoryStore +from zarr.testing.strategies import key_ranges, node_names, np_array_and_chunks, numpy_arrays +from zarr.testing.strategies import keys as zarr_keys + +MAX_BINARY_SIZE = 100 + + +def split_prefix_name(path: str) -> tuple[str, str]: + split = path.rsplit("/", maxsplit=1) + if len(split) > 1: + prefix, name = split + else: + prefix = "" + (name,) = split + return prefix, name + + +class ZarrHierarchyStateMachine(SyncMixin, RuleBasedStateMachine): + """ + This state machine models operations that modify a zarr store's + hierarchy. That is, user actions that modify arrays/groups as well + as list operations. It is intended to be used by external stores, and + compares their results to a MemoryStore that is assumed to be perfect. + """ + + def __init__(self, store: Store) -> None: + super().__init__() + + self.store = store + + self.model = MemoryStore() + zarr.group(store=self.model) + + # Track state of the hierarchy, these should contain fully qualified paths + self.all_groups: set[str] = set() + self.all_arrays: set[str] = set() + + @initialize() + def init_store(self) -> None: + # This lets us reuse the fixture provided store. 
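
        # @initialize() runs at the start of every example that Hypothesis
        # generates, and the same store instance is reused across examples,
        # so it must be emptied here to keep examples independent of each other.
        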
+ self._sync(self.store.clear()) + zarr.group(store=self.store) + + def can_add(self, path: str) -> bool: + return path not in self.all_groups and path not in self.all_arrays + + # -------------------- store operations ----------------------- + @rule(name=node_names, data=st.data()) + def add_group(self, name: str, data: DataObject) -> None: + if self.all_groups: + parent = data.draw(st.sampled_from(sorted(self.all_groups)), label="Group parent") + else: + parent = "" + path = f"{parent}/{name}".lstrip("/") + assume(self.can_add(path)) + note(f"Adding group: path='{path}'") + self.all_groups.add(path) + zarr.group(store=self.store, path=path) + zarr.group(store=self.model, path=path) + + @rule( + data=st.data(), + name=node_names, + array_and_chunks=np_array_and_chunks(arrays=numpy_arrays(zarr_formats=st.just(3))), + ) + def add_array( + self, + data: DataObject, + name: str, + array_and_chunks: tuple[np.ndarray[Any, Any], tuple[int, ...]], + ) -> None: + array, chunks = array_and_chunks + fill_value = data.draw(npst.from_dtype(array.dtype)) + if self.all_groups: + parent = data.draw(st.sampled_from(sorted(self.all_groups)), label="Array parent") + else: + parent = "" + # TODO: support creating deeper paths + # TODO: support overwriting potentially by just skipping `self.can_add` + path = f"{parent}/{name}".lstrip("/") + assume(self.can_add(path)) + note(f"Adding array: path='{path}' shape={array.shape} chunks={chunks}") + for store in [self.store, self.model]: + zarr.array(array, chunks=chunks, path=path, store=store, fill_value=fill_value) + self.all_arrays.add(path) + + # @precondition(lambda self: bool(self.all_groups)) + # @precondition(lambda self: bool(self.all_arrays)) + # @rule(data=st.data()) + # def move_array(self, data): + # array_path = data.draw(st.sampled_from(self.all_arrays), label="Array move source") + # to_group = data.draw(st.sampled_from(self.all_groups), label="Array move destination") + + # # fixme renaiming to self? + # array_name = os.path.basename(array_path) + # assume(self.model.can_add(to_group, array_name)) + # new_path = f"{to_group}/{array_name}".lstrip("/") + # note(f"moving array '{array_path}' -> '{new_path}'") + # self.model.rename(array_path, new_path) + # self.repo.store.rename(array_path, new_path) + + # @precondition(lambda self: len(self.all_groups) >= 2) + # @rule(data=st.data()) + # def move_group(self, data): + # from_group = data.draw(st.sampled_from(self.all_groups), label="Group move source") + # to_group = data.draw(st.sampled_from(self.all_groups), label="Group move destination") + # assume(not to_group.startswith(from_group)) + + # from_group_name = os.path.basename(from_group) + # assume(self.model.can_add(to_group, from_group_name)) + # # fixme renaiming to self? 
+ # new_path = f"{to_group}/{from_group_name}".lstrip("/") + # note(f"moving group '{from_group}' -> '{new_path}'") + # self.model.rename(from_group, new_path) + # self.repo.store.rename(from_group, new_path) + + @precondition(lambda self: len(self.all_arrays) >= 1) + @rule(data=st.data()) + def delete_array_using_del(self, data: DataObject) -> None: + array_path = data.draw( + st.sampled_from(sorted(self.all_arrays)), label="Array deletion target" + ) + prefix, array_name = split_prefix_name(array_path) + note(f"Deleting array '{array_path}' ({prefix=!r}, {array_name=!r}) using del") + for store in [self.model, self.store]: + group = zarr.open_group(path=prefix, store=store) + group[array_name] # check that it exists + del group[array_name] + self.all_arrays.remove(array_path) + + @precondition(lambda self: len(self.all_groups) >= 2) # fixme don't delete root + @rule(data=st.data()) + def delete_group_using_del(self, data: DataObject) -> None: + group_path = data.draw( + st.sampled_from(sorted(self.all_groups)), label="Group deletion target" + ) + prefix, group_name = split_prefix_name(group_path) + note(f"Deleting group '{group_path=!r}', {prefix=!r}, {group_name=!r} using delete") + members = zarr.open_group(store=self.model, path=group_path).members(max_depth=None) + for _, obj in members: + if isinstance(obj, Array): + self.all_arrays.remove(obj.path) + else: + self.all_groups.remove(obj.path) + for store in [self.store, self.model]: + group = zarr.open_group(store=store, path=prefix) + group[group_name] # check that it exists + del group[group_name] + if group_path != "/": + # The root group is always present + self.all_groups.remove(group_path) + + # # --------------- assertions ----------------- + # def check_group_arrays(self, group): + # # note(f"Checking arrays of '{group}'") + # g1 = self.model.get_group(group) + # g2 = zarr.open_group(path=group, mode="r", store=self.repo.store) + # model_arrays = sorted(g1.arrays(), key=itemgetter(0)) + # our_arrays = sorted(g2.arrays(), key=itemgetter(0)) + # for (n1, a1), (n2, a2) in zip_longest(model_arrays, our_arrays): + # assert n1 == n2 + # assert_array_equal(a1, a2) + + # def check_subgroups(self, group_path): + # g1 = self.model.get_group(group_path) + # g2 = zarr.open_group(path=group_path, mode="r", store=self.repo.store) + # g1_children = [name for (name, _) in g1.groups()] + # g2_children = [name for (name, _) in g2.groups()] + # # note(f"Checking {len(g1_children)} subgroups of group '{group_path}'") + # assert g1_children == g2_children + + # def check_list_prefix_from_group(self, group): + # prefix = f"meta/root/{group}" + # model_list = sorted(self.model.list_prefix(prefix)) + # al_list = sorted(self.repo.store.list_prefix(prefix)) + # # note(f"Checking {len(model_list)} keys under '{prefix}'") + # assert model_list == al_list + + # prefix = f"data/root/{group}" + # model_list = sorted(self.model.list_prefix(prefix)) + # al_list = sorted(self.repo.store.list_prefix(prefix)) + # # note(f"Checking {len(model_list)} keys under '{prefix}'") + # assert model_list == al_list + + # @precondition(lambda self: self.model.is_persistent_session()) + # @rule(data=st.data()) + # def check_group_path(self, data): + # t0 = time.time() + # group = data.draw(st.sampled_from(self.all_groups)) + # self.check_list_prefix_from_group(group) + # self.check_subgroups(group) + # self.check_group_arrays(group) + # t1 = time.time() + # note(f"Checks took {t1 - t0} sec.") + + @invariant() + def check_list_prefix_from_root(self) -> None: + model_list = 
self._sync_iter(self.model.list_prefix("")) + store_list = self._sync_iter(self.store.list_prefix("")) + note(f"Checking {len(model_list)} keys") + assert sorted(model_list) == sorted(store_list) + + +class SyncStoreWrapper(zarr.core.sync.SyncMixin): + def __init__(self, store: Store) -> None: + """Synchronous Store wrapper + + This class holds synchronous methods that map to async methods of Store classes. + The synchronous wrapper is needed because hypothesis' stateful testing infra does + not support asyncio so we redefine sync versions of the Store API. + https://github.com/HypothesisWorks/hypothesis/issues/3712#issuecomment-1668999041 + """ + self.store = store + + @property + def read_only(self) -> bool: + return self.store.read_only + + def set(self, key: str, data_buffer: Buffer) -> None: + return self._sync(self.store.set(key, data_buffer)) + + def list(self) -> builtins.list[str]: + return self._sync_iter(self.store.list()) + + def get(self, key: str, prototype: BufferPrototype) -> Buffer | None: + return self._sync(self.store.get(key, prototype=prototype)) + + def get_partial_values( + self, key_ranges: builtins.list[Any], prototype: BufferPrototype + ) -> builtins.list[Buffer | None]: + return self._sync(self.store.get_partial_values(prototype=prototype, key_ranges=key_ranges)) + + def delete(self, path: str) -> None: + return self._sync(self.store.delete(path)) + + def is_empty(self, prefix: str) -> bool: + return self._sync(self.store.is_empty(prefix=prefix)) + + def clear(self) -> None: + return self._sync(self.store.clear()) + + def exists(self, key: str) -> bool: + return self._sync(self.store.exists(key)) + + def list_dir(self, prefix: str) -> None: + raise NotImplementedError + + def list_prefix(self, prefix: str) -> None: + raise NotImplementedError + + def set_partial_values(self, key_start_values: Any) -> None: + raise NotImplementedError + + @property + def supports_listing(self) -> bool: + return self.store.supports_listing + + @property + def supports_partial_writes(self) -> bool: + return self.supports_partial_writes + + @property + def supports_writes(self) -> bool: + return self.store.supports_writes + + +class ZarrStoreStateMachine(RuleBasedStateMachine): + """ " + Zarr store state machine + + This is a subclass of a Hypothesis RuleBasedStateMachine. + It is testing a framework to ensure that the state of a Zarr store matches + an expected state after a set of random operations. It contains a store + (currently, a Zarr MemoryStore) and a model, a simplified version of a + zarr store (in this case, a dict). It also contains rules which represent + actions that can be applied to a zarr store. Rules apply an action to both + the store and the model, and invariants assert that the state of the model + is equal to the state of the store. Hypothesis then generates sequences of + rules, running invariants after each rule. It raises an error if a sequence + produces discontinuity between state of the model and state of the store + (ie. an invariant is violated). 
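
    A minimal driver sketch (``MyStore()`` stands in for the store instance
    under test; the real test suite passes a fixture-provided store):

        from hypothesis.stateful import Settings, run_state_machine_as_test

        run_state_machine_as_test(
            lambda: ZarrStoreStateMachine(MyStore()),
            settings=Settings(report_multiple_bugs=True),
        )

    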
+ https://hypothesis.readthedocs.io/en/latest/stateful.html + """ + + def __init__(self, store: Store) -> None: + super().__init__() + self.model: dict[str, Buffer] = {} + self.store = SyncStoreWrapper(store) + self.prototype = default_buffer_prototype() + + @initialize() + def init_store(self) -> None: + self.store.clear() + + @rule(key=zarr_keys, data=st.binary(min_size=0, max_size=MAX_BINARY_SIZE)) + def set(self, key: str, data: DataObject) -> None: + note(f"(set) Setting {key!r} with {data}") + assert not self.store.read_only + data_buf = cpu.Buffer.from_bytes(data) + self.store.set(key, data_buf) + self.model[key] = data_buf + + @precondition(lambda self: len(self.model.keys()) > 0) + @rule(key=zarr_keys, data=st.data()) + def get(self, key: str, data: DataObject) -> None: + key = data.draw( + st.sampled_from(sorted(self.model.keys())) + ) # hypothesis wants to sample from sorted list + note("(get)") + store_value = self.store.get(key, self.prototype) + # to bytes here necessary because data_buf set to model in set() + assert self.model[key] == store_value + + @rule(key=zarr_keys, data=st.data()) + def get_invalid_zarr_keys(self, key: str, data: DataObject) -> None: + note("(get_invalid)") + assume(key not in self.model) + assert self.store.get(key, self.prototype) is None + + @precondition(lambda self: len(self.model.keys()) > 0) + @rule(data=st.data()) + def get_partial_values(self, data: DataObject) -> None: + key_range = data.draw( + key_ranges(keys=st.sampled_from(sorted(self.model.keys())), max_size=MAX_BINARY_SIZE) + ) + note(f"(get partial) {key_range=}") + obs_maybe = self.store.get_partial_values(key_range, self.prototype) + observed = [] + + for obs in obs_maybe: + assert obs is not None + observed.append(obs.to_bytes()) + + model_vals_ls = [] + + for key, byte_range in key_range: + start = byte_range[0] or 0 + step = byte_range[1] + stop = start + step if step is not None else None + model_vals_ls.append(self.model[key][start:stop]) + + assert all( + obs == exp.to_bytes() for obs, exp in zip(observed, model_vals_ls, strict=True) + ), ( + observed, + model_vals_ls, + ) + + @precondition(lambda self: len(self.model.keys()) > 0) + @rule(data=st.data()) + def delete(self, data: DataObject) -> None: + key = data.draw(st.sampled_from(sorted(self.model.keys()))) + note(f"(delete) Deleting {key=}") + + self.store.delete(key) + del self.model[key] + + @rule() + def clear(self) -> None: + assert not self.store.read_only + note("(clear)") + self.store.clear() + self.model.clear() + + assert self.store.is_empty("") + + assert len(self.model.keys()) == len(list(self.store.list())) == 0 + + @rule() + # Local store can be non-empty when there are subdirectories but no files + @precondition(lambda self: not isinstance(self.store.store, LocalStore)) + def is_empty(self) -> None: + note("(is_empty)") + + # make sure they either both are or both aren't empty (same state) + assert self.store.is_empty("") == (not self.model) + + @rule(key=zarr_keys) + def exists(self, key: str) -> None: + note("(exists)") + + assert self.store.exists(key) == (key in self.model) + + @invariant() + def check_paths_equal(self) -> None: + note("Checking that paths are equal") + paths = sorted(self.store.list()) + + assert sorted(self.model.keys()) == paths + + @invariant() + def check_vals_equal(self) -> None: + note("Checking values equal") + for key, val in self.model.items(): + store_item = self.store.get(key, self.prototype) + assert val == store_item + + @invariant() + def check_num_zarr_keys_equal(self) -> 
None: + note("check num zarr_keys equal") + + assert len(self.model) == len(list(self.store.list())) + + @invariant() + def check_zarr_keys(self) -> None: + keys = list(self.store.list()) + + if not keys: + assert self.store.is_empty("") is True + + else: + assert self.store.is_empty("") is False + + for key in keys: + assert self.store.exists(key) is True + note("checking keys / exists / empty") diff --git a/tests/test_store/test_stateful.py b/tests/test_store/test_stateful.py new file mode 100644 index 0000000000..ae10ca8d79 --- /dev/null +++ b/tests/test_store/test_stateful.py @@ -0,0 +1,33 @@ +# Stateful tests for arbitrary Zarr stores. +import pytest +from hypothesis.stateful import ( + Settings, + run_state_machine_as_test, +) + +from zarr.abc.store import Store +from zarr.storage import LocalStore, MemoryStore, ZipStore +from zarr.testing.stateful import ZarrHierarchyStateMachine, ZarrStoreStateMachine + + +def test_zarr_hierarchy(sync_store: Store): + def mk_test_instance_sync() -> ZarrHierarchyStateMachine: + return ZarrHierarchyStateMachine(sync_store) + + if isinstance(sync_store, ZipStore): + pytest.skip(reason="ZipStore does not support delete") + if isinstance(sync_store, MemoryStore): + run_state_machine_as_test( + mk_test_instance_sync, settings=Settings(report_multiple_bugs=False) + ) + + +def test_zarr_store(sync_store: Store) -> None: + def mk_test_instance_sync() -> None: + return ZarrStoreStateMachine(sync_store) + + if isinstance(sync_store, ZipStore): + pytest.skip(reason="ZipStore does not support delete") + if isinstance(sync_store, LocalStore): + pytest.skip(reason="This test has errors") + run_state_machine_as_test(mk_test_instance_sync, settings=Settings(report_multiple_bugs=True)) diff --git a/tests/test_store/test_stateful_hierarchy.py b/tests/test_store/test_stateful_hierarchy.py deleted file mode 100644 index 844e1227da..0000000000 --- a/tests/test_store/test_stateful_hierarchy.py +++ /dev/null @@ -1,223 +0,0 @@ -import hypothesis.extra.numpy as npst -import hypothesis.strategies as st -import pytest -from hypothesis import assume, note -from hypothesis.stateful import ( - RuleBasedStateMachine, - Settings, - initialize, - invariant, - precondition, - rule, - run_state_machine_as_test, -) - -import zarr -from zarr import Array -from zarr.abc.store import Store -from zarr.core.sync import SyncMixin -from zarr.storage import MemoryStore, ZipStore -from zarr.testing.strategies import node_names, np_array_and_chunks, numpy_arrays - - -def split_prefix_name(path): - split = path.rsplit("/", maxsplit=1) - if len(split) > 1: - prefix, name = split - else: - prefix = "" - (name,) = split - return prefix, name - - -class ZarrHierarchyStateMachine(SyncMixin, RuleBasedStateMachine): - """ - This state machine models operations that modify a zarr store's - hierarchy. That is, user actions that modify arrays/groups as well - as list operations. It is intended to be used by external stores, and - compares their results to a MemoryStore that is assumed to be perfect. - """ - - def __init__(self, store) -> None: - super().__init__() - - self.store = store - - self.model = MemoryStore() - zarr.group(store=self.model) - - # Track state of the hierarchy, these should contain fully qualified paths - self.all_groups = set() - self.all_arrays = set() - - @initialize() - def init_store(self): - # This lets us reuse the fixture provided store. 
- self._sync(self.store.clear()) - zarr.group(store=self.store) - - def can_add(self, path): - return path not in self.all_groups and path not in self.all_arrays - - # -------------------- store operations ----------------------- - @rule(name=node_names, data=st.data()) - def add_group(self, name, data): - if self.all_groups: - parent = data.draw(st.sampled_from(sorted(self.all_groups)), label="Group parent") - else: - parent = "" - path = f"{parent}/{name}".lstrip("/") - assume(self.can_add(path)) - note(f"Adding group: path='{path}'") - self.all_groups.add(path) - zarr.group(store=self.store, path=path) - zarr.group(store=self.model, path=path) - - @rule( - data=st.data(), - name=node_names, - array_and_chunks=np_array_and_chunks(arrays=numpy_arrays(zarr_formats=st.just(3))), - ) - def add_array(self, data, name, array_and_chunks): - array, chunks = array_and_chunks - fill_value = data.draw(npst.from_dtype(array.dtype)) - if self.all_groups: - parent = data.draw(st.sampled_from(sorted(self.all_groups)), label="Array parent") - else: - parent = "" - # TODO: support creating deeper paths - # TODO: support overwriting potentially by just skipping `self.can_add` - path = f"{parent}/{name}".lstrip("/") - assume(self.can_add(path)) - note(f"Adding array: path='{path}' shape={array.shape} chunks={chunks}") - for store in [self.store, self.model]: - zarr.array(array, chunks=chunks, path=path, store=store, fill_value=fill_value) - self.all_arrays.add(path) - - # @precondition(lambda self: bool(self.all_groups)) - # @precondition(lambda self: bool(self.all_arrays)) - # @rule(data=st.data()) - # def move_array(self, data): - # array_path = data.draw(st.sampled_from(self.all_arrays), label="Array move source") - # to_group = data.draw(st.sampled_from(self.all_groups), label="Array move destination") - - # # fixme renaiming to self? - # array_name = os.path.basename(array_path) - # assume(self.model.can_add(to_group, array_name)) - # new_path = f"{to_group}/{array_name}".lstrip("/") - # note(f"moving array '{array_path}' -> '{new_path}'") - # self.model.rename(array_path, new_path) - # self.repo.store.rename(array_path, new_path) - - # @precondition(lambda self: len(self.all_groups) >= 2) - # @rule(data=st.data()) - # def move_group(self, data): - # from_group = data.draw(st.sampled_from(self.all_groups), label="Group move source") - # to_group = data.draw(st.sampled_from(self.all_groups), label="Group move destination") - # assume(not to_group.startswith(from_group)) - - # from_group_name = os.path.basename(from_group) - # assume(self.model.can_add(to_group, from_group_name)) - # # fixme renaiming to self? 
- # new_path = f"{to_group}/{from_group_name}".lstrip("/") - # note(f"moving group '{from_group}' -> '{new_path}'") - # self.model.rename(from_group, new_path) - # self.repo.store.rename(from_group, new_path) - - @precondition(lambda self: len(self.all_arrays) >= 1) - @rule(data=st.data()) - def delete_array_using_del(self, data): - array_path = data.draw( - st.sampled_from(sorted(self.all_arrays)), label="Array deletion target" - ) - prefix, array_name = split_prefix_name(array_path) - note(f"Deleting array '{array_path}' ({prefix=!r}, {array_name=!r}) using del") - for store in [self.model, self.store]: - group = zarr.open_group(path=prefix, store=store) - group[array_name] # check that it exists - del group[array_name] - self.all_arrays.remove(array_path) - - @precondition(lambda self: len(self.all_groups) >= 2) # fixme don't delete root - @rule(data=st.data()) - def delete_group_using_del(self, data): - group_path = data.draw( - st.sampled_from(sorted(self.all_groups)), label="Group deletion target" - ) - prefix, group_name = split_prefix_name(group_path) - note(f"Deleting group '{group_path=!r}', {prefix=!r}, {group_name=!r} using delete") - members = zarr.open_group(store=self.model, path=group_path).members(max_depth=None) - for _, obj in members: - if isinstance(obj, Array): - self.all_arrays.remove(obj.path) - else: - self.all_groups.remove(obj.path) - for store in [self.store, self.model]: - group = zarr.open_group(store=store, path=prefix) - group[group_name] # check that it exists - del group[group_name] - if group_path != "/": - # The root group is always present - self.all_groups.remove(group_path) - - # # --------------- assertions ----------------- - # def check_group_arrays(self, group): - # # note(f"Checking arrays of '{group}'") - # g1 = self.model.get_group(group) - # g2 = zarr.open_group(path=group, mode="r", store=self.repo.store) - # model_arrays = sorted(g1.arrays(), key=itemgetter(0)) - # our_arrays = sorted(g2.arrays(), key=itemgetter(0)) - # for (n1, a1), (n2, a2) in zip_longest(model_arrays, our_arrays): - # assert n1 == n2 - # assert_array_equal(a1, a2) - - # def check_subgroups(self, group_path): - # g1 = self.model.get_group(group_path) - # g2 = zarr.open_group(path=group_path, mode="r", store=self.repo.store) - # g1_children = [name for (name, _) in g1.groups()] - # g2_children = [name for (name, _) in g2.groups()] - # # note(f"Checking {len(g1_children)} subgroups of group '{group_path}'") - # assert g1_children == g2_children - - # def check_list_prefix_from_group(self, group): - # prefix = f"meta/root/{group}" - # model_list = sorted(self.model.list_prefix(prefix)) - # al_list = sorted(self.repo.store.list_prefix(prefix)) - # # note(f"Checking {len(model_list)} keys under '{prefix}'") - # assert model_list == al_list - - # prefix = f"data/root/{group}" - # model_list = sorted(self.model.list_prefix(prefix)) - # al_list = sorted(self.repo.store.list_prefix(prefix)) - # # note(f"Checking {len(model_list)} keys under '{prefix}'") - # assert model_list == al_list - - # @precondition(lambda self: self.model.is_persistent_session()) - # @rule(data=st.data()) - # def check_group_path(self, data): - # t0 = time.time() - # group = data.draw(st.sampled_from(self.all_groups)) - # self.check_list_prefix_from_group(group) - # self.check_subgroups(group) - # self.check_group_arrays(group) - # t1 = time.time() - # note(f"Checks took {t1 - t0} sec.") - - @invariant() - def check_list_prefix_from_root(self): - model_list = self._sync_iter(self.model.list_prefix("")) - 
store_list = self._sync_iter(self.store.list_prefix(""))
-        note(f"Checking {len(model_list)} keys")
-        assert sorted(model_list) == sorted(store_list)
-
-
-def test_zarr_hierarchy(sync_store: Store):
-    def mk_test_instance_sync() -> ZarrHierarchyStateMachine:
-        return ZarrHierarchyStateMachine(sync_store)
-
-    if isinstance(sync_store, ZipStore):
-        pytest.skip(reason="ZipStore does not support delete")
-    if isinstance(sync_store, MemoryStore):
-        run_state_machine_as_test(
-            mk_test_instance_sync, settings=Settings(report_multiple_bugs=False)
-        )
diff --git a/tests/test_store/test_stateful_store.py b/tests/test_store/test_stateful_store.py
deleted file mode 100644
index 751c1ac746..0000000000
--- a/tests/test_store/test_stateful_store.py
+++ /dev/null
@@ -1,249 +0,0 @@
-# Stateful tests for arbitrary Zarr stores.
-import hypothesis.strategies as st
-import pytest
-from hypothesis import assume, note
-from hypothesis.stateful import (
-    RuleBasedStateMachine,
-    Settings,
-    initialize,
-    invariant,
-    precondition,
-    rule,
-    run_state_machine_as_test,
-)
-from hypothesis.strategies import DataObject
-
-import zarr
-from zarr.abc.store import Store
-from zarr.core.buffer import BufferPrototype, cpu, default_buffer_prototype
-from zarr.storage import LocalStore, ZipStore
-from zarr.testing.strategies import key_ranges
-from zarr.testing.strategies import keys as zarr_keys
-
-MAX_BINARY_SIZE = 100
-
-
-class SyncStoreWrapper(zarr.core.sync.SyncMixin):
-    def __init__(self, store: Store) -> None:
-        """Synchronous Store wrapper
-
-        This class holds synchronous methods that map to async methods of Store classes.
-        The synchronous wrapper is needed because hypothesis' stateful testing infra does
-        not support asyncio so we redefine sync versions of the Store API.
-        https://github.com/HypothesisWorks/hypothesis/issues/3712#issuecomment-1668999041
-        """
-        self.store = store
-
-    @property
-    def read_only(self) -> bool:
-        return self.store.read_only
-
-    def set(self, key: str, data_buffer: zarr.core.buffer.Buffer) -> None:
-        return self._sync(self.store.set(key, data_buffer))
-
-    def list(self) -> list:
-        return self._sync_iter(self.store.list())
-
-    def get(self, key: str, prototype: BufferPrototype) -> zarr.core.buffer.Buffer | None:
-        return self._sync(self.store.get(key, prototype=prototype))
-
-    def get_partial_values(
-        self, key_ranges: list, prototype: BufferPrototype
-    ) -> list[zarr.core.buffer.Buffer | None]:
-        return self._sync(self.store.get_partial_values(prototype=prototype, key_ranges=key_ranges))
-
-    def delete(self, path: str) -> None:
-        return self._sync(self.store.delete(path))
-
-    def is_empty(self, prefix: str) -> bool:
-        return self._sync(self.store.is_empty(prefix=prefix))
-
-    def clear(self) -> None:
-        return self._sync(self.store.clear())
-
-    def exists(self, key) -> bool:
-        return self._sync(self.store.exists(key))
-
-    def list_dir(self, prefix):
-        raise NotImplementedError
-
-    def list_prefix(self, prefix: str):
-        raise NotImplementedError
-
-    def set_partial_values(self, key_start_values):
-        raise NotImplementedError
-
-    @property
-    def supports_listing(self) -> bool:
-        return self.store.supports_listing
-
-    @property
-    def supports_partial_writes(self) -> bool:
-        # delegate to the wrapped store; returning self.supports_partial_writes
-        # here would recurse forever
-        return self.store.supports_partial_writes
-
-    @property
-    def supports_writes(self) -> bool:
-        return self.store.supports_writes
-
-
-class ZarrStoreStateMachine(RuleBasedStateMachine):
-    """
-    Zarr store state machine
-
-    This is a subclass of a Hypothesis RuleBasedStateMachine. It tests that the
-    state of a Zarr store matches an expected state after a set of random
-    operations. It contains a store (currently, a Zarr MemoryStore) and a model,
-    a simplified version of a zarr store (in this case, a dict). It also
-    contains rules which represent actions that can be applied to a zarr store.
-    Rules apply an action to both the store and the model, and invariants assert
-    that the state of the model is equal to the state of the store. Hypothesis
-    then generates sequences of rules, running invariants after each rule. It
-    raises an error if a sequence produces a discrepancy between the state of
-    the model and the state of the store (i.e. an invariant is violated).
-    https://hypothesis.readthedocs.io/en/latest/stateful.html
-    """
-
-    def __init__(self, store: Store) -> None:
-        super().__init__()
-        self.model: dict[str, bytes] = {}
-        self.store = SyncStoreWrapper(store)
-        self.prototype = default_buffer_prototype()
-
-    @initialize()
-    def init_store(self):
-        self.store.clear()
-
-    @rule(key=zarr_keys, data=st.binary(min_size=0, max_size=MAX_BINARY_SIZE))
-    def set(self, key: str, data: DataObject) -> None:
-        note(f"(set) Setting {key!r} with {data}")
-        assert not self.store.read_only
-        data_buf = cpu.Buffer.from_bytes(data)
-        self.store.set(key, data_buf)
-        self.model[key] = data_buf
-
-    @precondition(lambda self: len(self.model.keys()) > 0)
-    @rule(key=zarr_keys, data=st.data())
-    def get(self, key: str, data: DataObject) -> None:
-        key = data.draw(
-            st.sampled_from(sorted(self.model.keys()))
-        )  # hypothesis wants to sample from sorted list
-        note("(get)")
-        store_value = self.store.get(key, self.prototype)
-        # to bytes here necessary because data_buf set to model in set()
-        assert self.model[key].to_bytes() == (store_value.to_bytes())
-
-    @rule(key=zarr_keys, data=st.data())
-    def get_invalid_zarr_keys(self, key: str, data: DataObject) -> None:
-        note("(get_invalid)")
-        assume(key not in self.model)
-        assert self.store.get(key, self.prototype) is None
-
-    @precondition(lambda self: len(self.model.keys()) > 0)
-    @rule(data=st.data())
-    def get_partial_values(self, data: DataObject) -> None:
-        key_range = data.draw(
-            key_ranges(keys=st.sampled_from(sorted(self.model.keys())), max_size=MAX_BINARY_SIZE)
-        )
-        note(f"(get partial) {key_range=}")
-        obs_maybe = self.store.get_partial_values(key_range, self.prototype)
-        observed = []
-
-        for obs in obs_maybe:
-            assert obs is not None
-            observed.append(obs.to_bytes())
-
-        model_vals_ls = []
-
-        for key, byte_range in key_range:
-            start = byte_range[0] or 0
-            step = byte_range[1]
-            stop = start + step if step is not None else None
-            model_vals_ls.append(self.model[key][start:stop])
-
-        assert all(
-            obs == exp.to_bytes() for obs, exp in zip(observed, model_vals_ls, strict=True)
-        ), (
-            observed,
-            model_vals_ls,
-        )
-
-    @precondition(lambda self: len(self.model.keys()) > 0)
-    @rule(data=st.data())
-    def delete(self, data: DataObject) -> None:
-        key = data.draw(st.sampled_from(sorted(self.model.keys())))
-        note(f"(delete) Deleting {key=}")
-
-        self.store.delete(key)
-        del self.model[key]
-
-    @rule()
-    def clear(self) -> None:
-        assert not self.store.read_only
-        note("(clear)")
-        self.store.clear()
-        self.model.clear()
-
-        assert self.store.is_empty("")
-
-        assert len(self.model.keys()) == len(list(self.store.list())) == 0
-
-    @rule()
-    # Local store can be non-empty when there are subdirectories but no files
-    @precondition(lambda self: not isinstance(self.store.store, LocalStore))
-    def is_empty(self) -> None:
-        note("(is_empty)")
-
-        # make sure they either both are or both aren't empty (same state)
-        assert self.store.is_empty("") == (not self.model)
-
-    @rule(key=zarr_keys)
-    def exists(self, key: str) -> None:
-        note("(exists)")
-
-        assert self.store.exists(key) == (key in self.model)
-
-    @invariant()
-    def check_paths_equal(self) -> None:
-        note("Checking that paths are equal")
-        paths = sorted(self.store.list())
-
-        assert sorted(self.model.keys()) == paths
-
-    @invariant()
-    def check_vals_equal(self) -> None:
-        note("Checking values equal")
-        for key, val in self.model.items():
-            store_item = self.store.get(key, self.prototype).to_bytes()
-            assert val.to_bytes() == store_item
-
-    @invariant()
-    def check_num_zarr_keys_equal(self) -> None:
-        note("check num zarr_keys equal")
-
-        assert len(self.model) == len(list(self.store.list()))
-
-    @invariant()
-    def check_zarr_keys(self) -> None:
-        keys = list(self.store.list())
-
-        if not keys:
-            assert self.store.is_empty("") is True
-
-        else:
-            assert self.store.is_empty("") is False
-
-        for key in keys:
-            assert self.store.exists(key) is True
-        note("checking keys / exists / empty")
-
-
-def test_zarr_hierarchy(sync_store: Store) -> None:
-    def mk_test_instance_sync() -> ZarrStoreStateMachine:
-        return ZarrStoreStateMachine(sync_store)
-
-    if isinstance(sync_store, ZipStore):
-        pytest.skip(reason="ZipStore does not support delete")
-    if isinstance(sync_store, LocalStore):
-        pytest.skip(reason="This test has errors")
-    run_state_machine_as_test(mk_test_instance_sync, settings=Settings(report_multiple_bugs=True))

From ae6e8e6d0ec64bd3bff7ee8419152a1e30f10537 Mon Sep 17 00:00:00 2001
From: Norman Rzepka
Date: Thu, 5 Dec 2024 09:06:04 +0100
Subject: [PATCH 16/87] add a complete step to the test CI (#2521)

---
 .github/workflows/test.yml | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index c388ba31de..1c25dcb1f4 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -93,3 +93,22 @@ jobs:
       - name: Run Tests
         run: |
           hatch env run --env ${{ matrix.dependency-set }} run
+
+  test-complete:
+    name: Test complete
+
+    needs:
+      [
+        test,
+        test-upstream-and-min-deps,
+      ]
+    if: always()
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check failure
+        if: |
+          contains(needs.*.result, 'failure') ||
+          contains(needs.*.result, 'cancelled')
+        run: exit 1
+      - name: Success
+        run: echo Success!
\ No newline at end of file

From 2fe12a7f424ac78c18a688d0a75049a3df67c98f Mon Sep 17 00:00:00 2001
From: Ross Barnowski
Date: Thu, 5 Dec 2024 00:22:35 -0800
Subject: [PATCH 17/87] Proposed updates to contributor guide. (#2513)

Co-authored-by: Davis Bennett
Co-authored-by: David Stansby
---
 docs/contributing.rst | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/docs/contributing.rst b/docs/contributing.rst
index a65b3d104d..8038330239 100644
--- a/docs/contributing.rst
+++ b/docs/contributing.rst
@@ -92,12 +92,11 @@ the following::
     $ mkdir -p ~/pyenv/zarr-dev
     $ python -m venv ~/pyenv/zarr-dev
     $ source ~/pyenv/zarr-dev/bin/activate
-    $ pip install -r requirements_dev_minimal.txt -r requirements_dev_numpy.txt
-    $ pip install -e .[docs]
+    $ pip install -e .[test,docs]

 To verify that your development environment is working, you can run the unit tests::

-    $ python -m pytest -v zarr
+    $ python -m pytest -v tests

 Creating a branch
 ~~~~~~~~~~~~~~~~~
@@ -149,7 +148,7 @@ and invoke::

 Some tests require optional dependencies to be installed, otherwise the tests will be skipped. 
To install all optional dependencies, run:: - $ pip install -r requirements_dev_optional.txt + $ pip install pytest-doctestplus To also run the doctests within docstrings (requires optional dependencies to be installed), run:: @@ -234,7 +233,7 @@ should run and pass as doctests under Python 3.8. To run doctests, activate your development environment, install optional requirements, and run:: - $ python -m pytest -v --doctest-plus zarr + $ python -m pytest -v --doctest-plus tests Zarr uses Sphinx for documentation, hosted on readthedocs.org. Documentation is written in the RestructuredText markup language (.rst files) in the ``docs`` folder. From fd688c41a103c518564c917b54073a518db0e1f9 Mon Sep 17 00:00:00 2001 From: Florian Aymanns Date: Thu, 5 Dec 2024 19:47:42 +0100 Subject: [PATCH 18/87] Parse chunk shape to check for float values (#2535) * Test that shapes and chunk shapes containing floats raise a TypeError * Parse chunk shape * Move check for floats from create to normalize_chunks --------- Co-authored-by: Joe Hamman --- src/zarr/core/chunk_grids.py | 3 +++ tests/test_api.py | 8 ++++++++ 2 files changed, 11 insertions(+) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index afecc6824f..ea050e39ef 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -138,6 +138,9 @@ def normalize_chunks(chunks: Any, shape: tuple[int, ...], typesize: int) -> tupl s if c == -1 or c is None else int(c) for s, c in zip(shape, chunks, strict=False) ) + if not all(isinstance(c, numbers.Integral) for c in chunks): + raise TypeError("non integer value in chunks") + return tuple(int(c) for c in chunks) diff --git a/tests/test_api.py b/tests/test_api.py index c7fc88241f..11977e8e32 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -47,6 +47,14 @@ def test_create_array(memory_store: Store) -> None: assert z.shape == (400,) assert z.chunks == (40,) + # create array with float shape + with pytest.raises(TypeError): + z = create(shape=(400.5, 100), store=store, overwrite=True) + + # create array with float chunk shape + with pytest.raises(TypeError): + z = create(shape=(400, 100), chunks=(16, 16.5), store=store, overwrite=True) + @pytest.mark.parametrize("path", ["foo", "/", "/foo", "///foo/bar"]) @pytest.mark.parametrize("node_type", ["array", "group"]) From 0cca6b061e1f98594b206c69ecfc9f1c8972b4ea Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Fri, 6 Dec 2024 02:21:51 -0800 Subject: [PATCH 19/87] chore(deps): make fsspec and upath optional dependencies (#2534) * chore(deps): make fsspec and upath optional dependencies * bump minimal env * release notes --- docs/release.rst | 26 ++++++++++++++++++++++++-- pyproject.toml | 11 +++++------ tests/test_store/test_core.py | 10 +++++++--- tests/test_store/test_remote.py | 6 +++--- 4 files changed, 39 insertions(+), 14 deletions(-) diff --git a/docs/release.rst b/docs/release.rst index 0b6775c4a6..7f424c00e2 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -18,10 +18,32 @@ Release notes See `GH1777 `_ for more details on the upcoming 3.0 release. +.. release_3.0.0-beta: + +3.0.0-beta series +----------------- + +.. warning:: + Zarr-Python 3.0.0-beta is a pre-release of the upcoming 3.0 release. This release is not feature complete or + expected to be ready for production applications. + +.. note:: + The complete release notes for 3.0 have not been added to this document yet. See the + `3.0.0-beta `_ release on GitHub + for a record of changes included in this release. 
+ +Dependency Changes +~~~~~~~~~~~~~~~~~~ + +* fsspec was moved from a required dependency to an optional one. Users should install + fsspec and any relevant implementations (e.g. s3fs) before using the ``RemoteStore``. + By :user:`Joe Hamman ` :issue:`2391`. + + .. release_3.0.0-alpha: -3.0.0-alpha ------------ +3.0.0-alpha series +------------------ .. warning:: Zarr-Python 3.0.0-alpha is a pre-release of the upcoming 3.0 release. This release is not feature complete or diff --git a/pyproject.toml b/pyproject.toml index 888d0e0eb4..5f2d7569b9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,8 +28,7 @@ requires-python = ">=3.11" dependencies = [ 'numpy>=1.25', 'numcodecs[crc32c]>=0.14', - 'fsspec>=2022.10.0', - 'typing_extensions>=4.6', + 'typing_extensions>=4.9', 'donfig>=0.8', ] @@ -54,16 +53,16 @@ license = {text = "MIT License"} keywords = ["Python", "compressed", "ndimensional-arrays", "zarr"] [project.optional-dependencies] +fsspec = [ + "fsspec>=2023.10.0", +] test = [ "coverage", "pytest", "pytest-cov", - "msgpack", "s3fs", "pytest-asyncio", "moto[s3]", - "flask-cors", - "flask", "requests", "mypy", "hypothesis", @@ -224,7 +223,7 @@ dependencies = [ 'fsspec==2022.10.0', 's3fs==2022.10.0', 'universal_pathlib==0.0.22', - 'typing_extensions==4.6.*', # 4.5 needed for @deprecated, 4.6 for Buffer + 'typing_extensions==4.9.*', 'donfig==0.8.*', # test deps 'hypothesis', diff --git a/tests/test_store/test_core.py b/tests/test_store/test_core.py index 4d3f305e53..81ed3744a9 100644 --- a/tests/test_store/test_core.py +++ b/tests/test_store/test_core.py @@ -3,7 +3,6 @@ import pytest from _pytest.compat import LEGACY_PATH -from upath import UPath from zarr.core.common import AccessModeLiteral from zarr.storage._utils import normalize_path @@ -72,6 +71,7 @@ async def test_make_store_path_invalid() -> None: async def test_make_store_path_fsspec(monkeypatch) -> None: + pytest.importorskip("fsspec") store_path = await make_store_path("http://foo.com/bar") assert isinstance(store_path.store, RemoteStore) @@ -106,13 +106,17 @@ async def test_unsupported() -> None: "foo/bar///", Path("foo/bar"), b"foo/bar", - UPath("foo/bar"), ], ) -def test_normalize_path_valid(path: str | bytes | Path | UPath) -> None: +def test_normalize_path_valid(path: str | bytes | Path) -> None: assert normalize_path(path) == "foo/bar" +def test_normalize_path_upath() -> None: + upath = pytest.importorskip("upath") + assert normalize_path(upath.UPath("foo/bar")) == "foo/bar" + + def test_normalize_path_none(): assert normalize_path(None) == "" diff --git a/tests/test_store/test_remote.py b/tests/test_store/test_remote.py index aee620796c..c7f33e4b39 100644 --- a/tests/test_store/test_remote.py +++ b/tests/test_store/test_remote.py @@ -4,10 +4,8 @@ import os from typing import TYPE_CHECKING -import fsspec import pytest from botocore.session import Session -from upath import UPath import zarr.api.asynchronous from zarr.core.buffer import Buffer, cpu, default_buffer_prototype @@ -21,6 +19,7 @@ import botocore.client +fsspec = pytest.importorskip("fsspec") s3fs = pytest.importorskip("s3fs") requests = pytest.importorskip("requests") moto_server = pytest.importorskip("moto.moto_server.threaded_moto_server") @@ -182,7 +181,8 @@ async def test_remote_store_from_uri(self, store: RemoteStore): assert dict(group.attrs) == {"key": "value-3"} def test_from_upath(self) -> None: - path = UPath( + upath = pytest.importorskip("upath") + path = upath.UPath( f"s3://{test_bucket_name}/foo/bar/", endpoint_url=endpoint_url, anon=False, From 
333a9e3220841debbcd4472d43f0347e64389f2e Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Sat, 7 Dec 2024 09:31:54 -0800 Subject: [PATCH 20/87] [v3] implement / deprecate zarr.tree (#2537) --- src/zarr/api/asynchronous.py | 26 ++++++++++++++++++++++++-- src/zarr/api/synchronous.py | 7 +++++-- src/zarr/core/group.py | 2 +- tests/test_api.py | 7 ++++--- 4 files changed, 34 insertions(+), 8 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index e5d09f8c3a..26822f725b 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -7,6 +7,7 @@ import numpy as np import numpy.typing as npt +from typing_extensions import deprecated from zarr.core.array import Array, AsyncArray, get_array_metadata from zarr.core.buffer import NDArrayLike @@ -493,8 +494,29 @@ async def save_group( await asyncio.gather(*aws) -async def tree(*args: Any, **kwargs: Any) -> None: - raise NotImplementedError +@deprecated("Use AsyncGroup.tree instead.") +async def tree(grp: AsyncGroup, expand: bool | None = None, level: int | None = None) -> Any: + """Provide a rich display of the hierarchy. + + Parameters + ---------- + grp : Group + Zarr or h5py group. + expand : bool, optional + Only relevant for HTML representation. If True, tree will be fully expanded. + level : int, optional + Maximum depth to descend into hierarchy. + + Returns + ------- + TreeRepr + A pretty-printable object displaying the hierarchy. + + .. deprecated:: 3.0.0 + `zarr.tree()` is deprecated and will be removed in a future release. + Use `group.tree()` instead. + """ + return await grp.tree(expand=expand, level=level) async def array( diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 9616c41355..8e8ecf40b8 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -2,6 +2,8 @@ from typing import TYPE_CHECKING, Any, Literal +from typing_extensions import deprecated + import zarr.api.asynchronous as async_api from zarr._compat import _deprecate_positional_args from zarr.core.array import Array, AsyncArray @@ -155,8 +157,9 @@ def save_group( ) -def tree(*args: Any, **kwargs: Any) -> None: - return sync(async_api.tree(*args, **kwargs)) +@deprecated("Use Group.tree instead.") +def tree(grp: Group, expand: bool | None = None, level: int | None = None) -> Any: + return sync(async_api.tree(grp._async_group, expand=expand, level=level)) # TODO: add type annotations for kwargs diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 2ca6e209fd..adc6399276 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -1449,7 +1449,7 @@ async def tree(self, expand: bool | None = None, level: int | None = None) -> An from zarr.core._tree import group_tree_async if expand is not None: - raise NotImplementedError("'expanded' is not yet implemented.") + raise NotImplementedError("'expand' is not yet implemented.") return await group_tree_async(self, max_depth=level) async def empty( diff --git a/tests/test_api.py b/tests/test_api.py index 11977e8e32..90f6dae110 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -294,15 +294,16 @@ def test_load_array(memory_store: Store) -> None: def test_tree() -> None: + pytest.importorskip("rich") g1 = zarr.group() g1.create_group("foo") g3 = g1.create_group("bar") g3.create_group("baz") g5 = g3.create_group("qux") g5.create_array("baz", shape=100, chunks=10) - # TODO: complete after tree has been reimplemented - # assert repr(zarr.tree(g1)) == repr(g1.tree()) - # assert str(zarr.tree(g1)) == str(g1.tree()) + 
with pytest.warns(DeprecationWarning):
+        assert repr(zarr.tree(g1)) == repr(g1.tree())
+        assert str(zarr.tree(g1)) == str(g1.tree())
 
 
 # @pytest.mark.parametrize("stores_from_path", [False, True])

From ed0f199e8542b410ae655c8168a49a0be6237cd4 Mon Sep 17 00:00:00 2001
From: David Stansby
Date: Sun, 8 Dec 2024 23:18:01 +0000
Subject: [PATCH 21/87] Improve installation page (#2538)

* Make code blocks console

* Clean up installation page

* Split into pip/conda

* Fix doc headings

* Fix contributing link

* Fix contributing link
---
 docs/installation.rst  | 38 ++++++++++++++++++++++++--------------
 src/zarr/core/group.py | 13 +++++++------
 2 files changed, 31 insertions(+), 20 deletions(-)

diff --git a/docs/installation.rst b/docs/installation.rst
index 86da6d1035..b39b54b250 100644
--- a/docs/installation.rst
+++ b/docs/installation.rst
@@ -1,27 +1,37 @@
 Installation
 ============
 
-Zarr depends on NumPy. It is generally best to `install NumPy
-`_ first using whatever method is most
-appropriate for your operating system and Python distribution. Other dependencies should be
-installed automatically if using one of the installation methods below.
+pip
+---
 
-Note: Zarr has endorsed `Scientific-Python SPEC 0 `_ and now follows the version support window as outlined below:
+.. code-block:: console
 
-- Python: 36 months after initial release
-- Core package dependencies (e.g. NumPy): 24 months after initial release
+    $ pip install zarr
 
-Install Zarr from PyPI::
+There are a number of optional dependency groups you can install for extra functionality.
+These can be installed using ``pip install "zarr[<extra>]"``, e.g. ``pip install "zarr[gpu]"``
 
-    $ pip install zarr
+- ``gpu``: support for GPUs
+- ``fsspec``: support for reading/writing to remote data stores
+- ``tree``: support for pretty printing of directory trees
 
-Alternatively, install Zarr via conda::
+conda
+-----
+
+.. code-block:: console
 
     $ conda install -c conda-forge zarr
 
-To install the latest development version of Zarr, you can use pip with the
-latest GitHub main::
+Conda does not support optional dependencies, so you will have to manually install any packages
+needed to enable extra functionality.
+
+Dependency support
+------------------
+Zarr has endorsed `Scientific-Python SPEC 0 `_ and now follows the version support window as outlined below:
 
-    $ pip install git+https://github.com/zarr-developers/zarr-python.git
+- Python: 36 months after initial release
+- Core package dependencies (e.g. NumPy): 24 months after initial release
 
-To work with Zarr source code in development, see `Contributing `_. 
\ No newline at end of file
+Development
+-----------
+To install the latest development version of Zarr, see `the contributing guide `_.
diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py
index adc6399276..3613e3e12b 100644
--- a/src/zarr/core/group.py
+++ b/src/zarr/core/group.py
@@ -1847,8 +1847,8 @@ def __repr__(self) -> str:
     async def update_attributes_async(self, new_attributes: dict[str, Any]) -> Group:
         """Update the attributes of this group.
 
-        Example
-        -------
+        Examples
+        --------
         >>> import zarr
         >>> group = zarr.group()
         >>> await group.update_attributes_async({"foo": "bar"})
@@ -1947,8 +1947,9 @@ def synchronizer(self) -> None:
     def update_attributes(self, new_attributes: dict[str, Any]) -> Group:
         """Update the attributes of this group. 
- Example - ------- + + Examples + -------- >>> import zarr >>> group = zarr.group() >>> group.update_attributes({"foo": "bar"}) @@ -2027,8 +2028,8 @@ def __contains__(self, member: str) -> bool: def groups(self) -> Generator[tuple[str, Group], None]: """Return the sub-groups of this group as a generator of (name, group) pairs. - Example - ------- + Examples + -------- >>> import zarr >>> group = zarr.group() >>> group.create_group("subgroup") From 1a7595760aadd4fa53528efc68b64b757f5ee342 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Dec 2024 22:06:20 +0000 Subject: [PATCH 22/87] chore: update pre-commit hooks (#2547) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.8.1 → v0.8.2](https://github.com/astral-sh/ruff-pre-commit/compare/v0.8.1...v0.8.2) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6068d0003d..2ad539daf4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,7 +7,7 @@ default_language_version: python: python3 repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.8.1 + rev: v0.8.2 hooks: - id: ruff args: ["--fix", "--show-fixes"] From ec67b128dec2bc28c08a9f73308c9731e77c10c2 Mon Sep 17 00:00:00 2001 From: Dimitri Papadopoulos Orfanos <3234522+DimitriPapadopoulos@users.noreply.github.com> Date: Thu, 12 Dec 2024 04:55:23 +0100 Subject: [PATCH 23/87] Improve pre-commit configuration (#2551) --- .pre-commit-config.yaml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2ad539daf4..b300752b31 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,10 +1,9 @@ ci: autoupdate_commit_msg: "chore: update pre-commit hooks" + autoupdate_schedule: "monthly" autofix_commit_msg: "style: pre-commit fixes" autofix_prs: false default_stages: [pre-commit, pre-push] -default_language_version: - python: python3 repos: - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.8.2 @@ -16,7 +15,7 @@ repos: rev: v2.3.0 hooks: - id: codespell - args: ["-L", "ba,ihs,kake,nd,noe,nwo,te,fo,zar", "-S", "fixture"] + args: ["-L", "fo,ihs,kake,te", "-S", "fixture"] - repo: https://github.com/pre-commit/pre-commit-hooks rev: v5.0.0 hooks: From 4e53a70728c69330bc096877a6172618cb717aa5 Mon Sep 17 00:00:00 2001 From: Dimitri Papadopoulos Orfanos <3234522+DimitriPapadopoulos@users.noreply.github.com> Date: Thu, 12 Dec 2024 11:03:46 +0100 Subject: [PATCH 24/87] Apply ruff preview rule RUF036 (#2524) RUF036 `None` not at the end of the type annotation. The `None` literal represents the absence of a value. For readability, it's preferred to write the more informative type expressions first. 
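A minimal before/after sketch, taken from the ``fill_value`` field changed
below:

    # before: `None` sits at the front of the union
    fill_value: None | int | float | str | bytes = 0
    # after: `None` moved to the end, as RUF036 prefers
    fill_value: int | float | str | bytes | None = 0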
--- src/zarr/core/array.py | 2 +- src/zarr/core/common.py | 2 +- src/zarr/core/metadata/common.py | 2 +- src/zarr/core/metadata/v2.py | 2 +- src/zarr/core/metadata/v3.py | 10 +++++----- src/zarr/storage/_utils.py | 2 +- src/zarr/testing/store.py | 2 +- src/zarr/testing/strategies.py | 2 +- tests/test_codecs/test_vlen.py | 4 ++-- tests/test_metadata/test_v2.py | 2 +- tests/test_metadata/test_v3.py | 4 ++-- tests/test_store/test_memory.py | 8 ++++---- 12 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index a6317e7a9e..5e5089911e 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -604,7 +604,7 @@ async def _create_v2( dtype: npt.DTypeLike, chunks: ChunkCoords, dimension_separator: Literal[".", "/"] | None = None, - fill_value: None | float = None, + fill_value: float | None = None, order: MemoryOrder | None = None, filters: list[dict[str, JSON]] | None = None, compressor: dict[str, JSON] | None = None, diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index e76ddd030d..a4bf33451c 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -36,7 +36,7 @@ ChunkCoordsLike = Iterable[int] ZarrFormat = Literal[2, 3] NodeType = Literal["array", "group"] -JSON = None | str | int | float | Mapping[str, "JSON"] | tuple["JSON", ...] +JSON = str | int | float | Mapping[str, "JSON"] | tuple["JSON", ...] | None MemoryOrder = Literal["C", "F"] AccessModeLiteral = Literal["r", "r+", "a", "w", "w-"] diff --git a/src/zarr/core/metadata/common.py b/src/zarr/core/metadata/common.py index 3adb65cf02..44d3eb292b 100644 --- a/src/zarr/core/metadata/common.py +++ b/src/zarr/core/metadata/common.py @@ -6,7 +6,7 @@ from zarr.core.common import JSON -def parse_attributes(data: None | dict[str, JSON]) -> dict[str, JSON]: +def parse_attributes(data: dict[str, JSON] | None) -> dict[str, JSON]: if data is None: return {} diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index f18f2e4e8d..50f375203f 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -44,7 +44,7 @@ class ArrayV2Metadata(Metadata): shape: ChunkCoords chunks: tuple[int, ...] dtype: np.dtype[Any] - fill_value: None | int | float | str | bytes = 0 + fill_value: int | float | str | bytes | None = 0 order: MemoryOrder = "C" filters: tuple[numcodecs.abc.Codec, ...] | None = None dimension_separator: Literal[".", "/"] = "." 
diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 6ea9ed69f1..b800ae4d73 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -225,9 +225,9 @@ def __init__( chunk_key_encoding: dict[str, JSON] | ChunkKeyEncoding, fill_value: Any, codecs: Iterable[Codec | dict[str, JSON]], - attributes: None | dict[str, JSON], - dimension_names: None | Iterable[str], - storage_transformers: None | Iterable[dict[str, JSON]] = None, + attributes: dict[str, JSON] | None, + dimension_names: Iterable[str] | None, + storage_transformers: Iterable[dict[str, JSON]] | None = None, ) -> None: """ Because the class is a frozen dataclass, we set attributes using object.__setattr__ @@ -540,7 +540,7 @@ class DataType(Enum): bytes = "bytes" @property - def byte_count(self) -> None | int: + def byte_count(self) -> int | None: data_type_byte_counts = { DataType.bool: 1, DataType.int8: 1, @@ -626,7 +626,7 @@ def from_numpy(cls, dtype: np.dtype[Any]) -> DataType: return DataType[dtype_to_data_type[dtype.str]] @classmethod - def parse(cls, dtype: None | DataType | Any) -> DataType: + def parse(cls, dtype: DataType | Any | None) -> DataType: if dtype is None: return DataType[DEFAULT_DTYPE] if isinstance(dtype, DataType): diff --git a/src/zarr/storage/_utils.py b/src/zarr/storage/_utils.py index ae39468897..7ba82b00fd 100644 --- a/src/zarr/storage/_utils.py +++ b/src/zarr/storage/_utils.py @@ -45,7 +45,7 @@ def normalize_path(path: str | bytes | Path | None) -> str: def _normalize_interval_index( - data: Buffer, interval: None | tuple[int | None, int | None] + data: Buffer, interval: tuple[int | None, int | None] | None ) -> tuple[int, int]: """ Convert an implicit interval into an explicit start and length diff --git a/src/zarr/testing/store.py b/src/zarr/testing/store.py index d26d83e566..b793f2d67b 100644 --- a/src/zarr/testing/store.py +++ b/src/zarr/testing/store.py @@ -106,7 +106,7 @@ def test_store_supports_listing(self, store: S) -> None: @pytest.mark.parametrize("data", [b"\x01\x02\x03\x04", b""]) @pytest.mark.parametrize("byte_range", [None, (0, None), (1, None), (1, 2), (None, 1)]) async def test_get( - self, store: S, key: str, data: bytes, byte_range: None | tuple[int | None, int | None] + self, store: S, key: str, data: bytes, byte_range: tuple[int | None, int | None] | None ) -> None: """ Ensure that data can be read from the store using the store.get method. 
diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py index f0a7e97d3a..85a67e3e69 100644 --- a/src/zarr/testing/strategies.py +++ b/src/zarr/testing/strategies.py @@ -117,7 +117,7 @@ def arrays( shapes: st.SearchStrategy[tuple[int, ...]] = array_shapes, compressors: st.SearchStrategy = compressors, stores: st.SearchStrategy[StoreLike] = stores, - paths: st.SearchStrategy[None | str] = paths, + paths: st.SearchStrategy[str | None] = paths, array_names: st.SearchStrategy = array_names, arrays: st.SearchStrategy | None = None, attrs: st.SearchStrategy = attrs, diff --git a/tests/test_codecs/test_vlen.py b/tests/test_codecs/test_vlen.py index aaea5dab83..05b2e25267 100644 --- a/tests/test_codecs/test_vlen.py +++ b/tests/test_codecs/test_vlen.py @@ -25,7 +25,7 @@ @pytest.mark.parametrize("as_object_array", [False, True]) @pytest.mark.parametrize("codecs", [None, [VLenUTF8Codec()], [VLenUTF8Codec(), ZstdCodec()]]) def test_vlen_string( - store: Store, dtype: None | np.dtype[Any], as_object_array: bool, codecs: None | list[Codec] + store: Store, dtype: np.dtype[Any] | None, as_object_array: bool, codecs: list[Codec] | None ) -> None: strings = ["hello", "world", "this", "is", "a", "test"] data = np.array(strings, dtype=dtype).reshape((2, 3)) @@ -62,7 +62,7 @@ def test_vlen_string( @pytest.mark.parametrize("store", ["memory", "local"], indirect=["store"]) @pytest.mark.parametrize("as_object_array", [False, True]) @pytest.mark.parametrize("codecs", [None, [VLenBytesCodec()], [VLenBytesCodec(), ZstdCodec()]]) -def test_vlen_bytes(store: Store, as_object_array: bool, codecs: None | list[Codec]) -> None: +def test_vlen_bytes(store: Store, as_object_array: bool, codecs: list[Codec] | None) -> None: bstrings = [b"hello", b"world", b"this", b"is", b"a", b"test"] data = np.array(bstrings).reshape((2, 3)) assert data.dtype == "|S5" diff --git a/tests/test_metadata/test_v2.py b/tests/test_metadata/test_v2.py index 003aef331f..69dbd4645b 100644 --- a/tests/test_metadata/test_v2.py +++ b/tests/test_metadata/test_v2.py @@ -43,7 +43,7 @@ def test_metadata_to_dict( fill_value: Any, order: Literal["C", "F"], dimension_separator: Literal[".", "/"] | None, - attributes: None | dict[str, Any], + attributes: dict[str, Any] | None, ) -> None: shape = (1, 2, 3) chunks = (1,) * len(shape) diff --git a/tests/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py index 4e4ba23313..6f7fba6dd1 100644 --- a/tests/test_metadata/test_v3.py +++ b/tests/test_metadata/test_v3.py @@ -240,8 +240,8 @@ def test_metadata_to_dict( chunk_key_encoding: Literal["v2", "default"], dimension_separator: Literal[".", "/"] | None, dimension_names: Literal["nones", "strings", "missing"], - attributes: None | dict[str, Any], - storage_transformers: None | tuple[dict[str, JSON]], + attributes: dict[str, Any] | None, + storage_transformers: tuple[dict[str, JSON]] | None, ) -> None: shape = (1, 2, 3) data_type = DataType.uint8 diff --git a/tests/test_store/test_memory.py b/tests/test_store/test_memory.py index 56ccb4d3be..4ca4ebb817 100644 --- a/tests/test_store/test_memory.py +++ b/tests/test_store/test_memory.py @@ -21,14 +21,14 @@ async def get(self, store: MemoryStore, key: str) -> Buffer: @pytest.fixture(params=[None, True]) def store_kwargs( self, request: pytest.FixtureRequest - ) -> dict[str, str | None | dict[str, Buffer]]: + ) -> dict[str, str | dict[str, Buffer] | None]: kwargs = {"store_dict": None} if request.param is True: kwargs["store_dict"] = {} return kwargs @pytest.fixture - def store(self, store_kwargs: 
str | None | dict[str, Buffer]) -> MemoryStore: + def store(self, store_kwargs: str | dict[str, Buffer] | None) -> MemoryStore: return self.store_cls(**store_kwargs) def test_store_repr(self, store: MemoryStore) -> None: @@ -61,14 +61,14 @@ async def get(self, store: MemoryStore, key: str) -> Buffer: @pytest.fixture(params=[None, True]) def store_kwargs( self, request: pytest.FixtureRequest - ) -> dict[str, str | None | dict[str, Buffer]]: + ) -> dict[str, str | dict[str, Buffer] | None]: kwargs = {"store_dict": None} if request.param is True: kwargs["store_dict"] = {} return kwargs @pytest.fixture - def store(self, store_kwargs: str | None | dict[str, gpu.Buffer]) -> GpuMemoryStore: + def store(self, store_kwargs: str | dict[str, gpu.Buffer] | None) -> GpuMemoryStore: return self.store_cls(**store_kwargs) def test_store_repr(self, store: GpuMemoryStore) -> None: From 01b73a72c0b6f9ec81de5d168980524dfa5197a5 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Fri, 13 Dec 2024 00:39:05 +0100 Subject: [PATCH 25/87] rename exists_ok to overwrite (#2548) --- src/zarr/api/asynchronous.py | 18 +++++------ src/zarr/core/array.py | 30 ++++++++--------- src/zarr/core/group.py | 40 +++++++++++------------ tests/conftest.py | 2 +- tests/test_array.py | 10 +++--- tests/test_group.py | 62 ++++++++++++++++++------------------ tests/test_v2.py | 4 +-- 7 files changed, 83 insertions(+), 83 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 26822f725b..b5dbb0cfa5 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -71,7 +71,7 @@ _OVERWRITE_MODES: tuple[AccessModeLiteral, ...] = ("a", "r+", "w") -def _infer_exists_ok(mode: AccessModeLiteral) -> bool: +def _infer_overwrite(mode: AccessModeLiteral) -> bool: """ Check that an ``AccessModeLiteral`` is compatible with overwriting an existing Zarr node. 
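+
+    The modes in ``_OVERWRITE_MODES`` ("a", "r+" and "w") are treated as
+    permitting overwrites; all other modes are not.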
""" @@ -414,14 +414,14 @@ async def save_array( arr = np.array(arr) shape = arr.shape chunks = getattr(arr, "chunks", None) # for array-likes with chunks attribute - exists_ok = kwargs.pop("exists_ok", None) or _infer_exists_ok(mode) + overwrite = kwargs.pop("overwrite", None) or _infer_overwrite(mode) new = await AsyncArray.create( store_path, zarr_format=zarr_format, shape=shape, dtype=arr.dtype, chunks=chunks, - exists_ok=exists_ok, + overwrite=overwrite, **kwargs, ) await new.setitem(slice(None), arr) @@ -647,7 +647,7 @@ async def group( return await AsyncGroup.from_store( store=store_path, zarr_format=_zarr_format, - exists_ok=overwrite, + overwrite=overwrite, attributes=attributes, ) @@ -753,12 +753,12 @@ async def open_group( except (KeyError, FileNotFoundError): pass if mode in _CREATE_MODES: - exists_ok = _infer_exists_ok(mode) + overwrite = _infer_overwrite(mode) _zarr_format = zarr_format or _default_zarr_version() return await AsyncGroup.from_store( store_path, zarr_format=_zarr_format, - exists_ok=exists_ok, + overwrite=overwrite, attributes=attributes, ) raise FileNotFoundError(f"Unable to find group: {store_path}") @@ -933,7 +933,7 @@ async def create( dtype=dtype, compressor=compressor, fill_value=fill_value, - exists_ok=overwrite, + overwrite=overwrite, filters=filters, dimension_separator=dimension_separator, zarr_format=zarr_format, @@ -1120,12 +1120,12 @@ async def open_array( return await AsyncArray.open(store_path, zarr_format=zarr_format) except FileNotFoundError: if not store_path.read_only and mode in _CREATE_MODES: - exists_ok = _infer_exists_ok(mode) + overwrite = _infer_overwrite(mode) _zarr_format = zarr_format or _default_zarr_version() return await create( store=store_path, zarr_format=_zarr_format, - overwrite=exists_ok, + overwrite=overwrite, **kwargs, ) raise diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 5e5089911e..aab7e2a527 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -266,7 +266,7 @@ async def create( filters: list[dict[str, JSON]] | None = None, compressor: dict[str, JSON] | None = None, # runtime - exists_ok: bool = False, + overwrite: bool = False, data: npt.ArrayLike | None = None, ) -> AsyncArray[ArrayV2Metadata]: ... @@ -294,7 +294,7 @@ async def create( codecs: Iterable[Codec | dict[str, JSON]] | None = None, dimension_names: Iterable[str] | None = None, # runtime - exists_ok: bool = False, + overwrite: bool = False, data: npt.ArrayLike | None = None, ) -> AsyncArray[ArrayV3Metadata]: ... @@ -322,7 +322,7 @@ async def create( codecs: Iterable[Codec | dict[str, JSON]] | None = None, dimension_names: Iterable[str] | None = None, # runtime - exists_ok: bool = False, + overwrite: bool = False, data: npt.ArrayLike | None = None, ) -> AsyncArray[ArrayV3Metadata]: ... @@ -355,7 +355,7 @@ async def create( filters: list[dict[str, JSON]] | None = None, compressor: dict[str, JSON] | None = None, # runtime - exists_ok: bool = False, + overwrite: bool = False, data: npt.ArrayLike | None = None, ) -> AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata]: ... @@ -387,7 +387,7 @@ async def create( filters: list[dict[str, JSON]] | None = None, compressor: dict[str, JSON] | None = None, # runtime - exists_ok: bool = False, + overwrite: bool = False, data: npt.ArrayLike | None = None, ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: """ @@ -429,7 +429,7 @@ async def create( compressor : dict[str, JSON], optional The compressor used to compress the data (default is None). V2 only. 
V3 arrays should not have 'compressor' parameter.
-        exists_ok : bool, optional
+        overwrite : bool, optional
             If True, overwrite any pre-existing array or group at the given path (default is False).
         data : npt.ArrayLike, optional
             The data to be inserted into the array (default is None).
@@ -489,7 +489,7 @@ async def create(
             codecs=codecs,
             dimension_names=dimension_names,
             attributes=attributes,
-            exists_ok=exists_ok,
+            overwrite=overwrite,
             order=order,
         )
     elif zarr_format == 2:
@@ -522,7 +522,7 @@ async def create(
             filters=filters,
             compressor=compressor,
             attributes=attributes,
-            exists_ok=exists_ok,
+            overwrite=overwrite,
         )
     else:
         raise ValueError(f"Unsupported zarr_format. Got: {zarr_format}")
@@ -552,9 +552,9 @@ async def _create_v3(
         codecs: Iterable[Codec | dict[str, JSON]] | None = None,
         dimension_names: Iterable[str] | None = None,
         attributes: dict[str, JSON] | None = None,
-        exists_ok: bool = False,
+        overwrite: bool = False,
     ) -> AsyncArray[ArrayV3Metadata]:
-        if exists_ok:
+        if overwrite:
             if store_path.store.supports_deletes:
                 await store_path.delete_dir()
             else:
@@ -609,9 +609,9 @@ async def _create_v2(
         filters: list[dict[str, JSON]] | None = None,
         compressor: dict[str, JSON] | None = None,
         attributes: dict[str, JSON] | None = None,
-        exists_ok: bool = False,
+        overwrite: bool = False,
     ) -> AsyncArray[ArrayV2Metadata]:
-        if exists_ok:
+        if overwrite:
             if store_path.store.supports_deletes:
                 await store_path.delete_dir()
             else:
@@ -1463,7 +1463,7 @@ def create(
         filters: list[dict[str, JSON]] | None = None,
         compressor: dict[str, JSON] | None = None,
         # runtime
-        exists_ok: bool = False,
+        overwrite: bool = False,
     ) -> Array:
         """Creates a new Array instance from an initialized store.
 
@@ -1493,7 +1493,7 @@ def create(
             The filters used to compress the data (default is None).
         compressor : dict[str, JSON], optional
             The compressor used to compress the data (default is None).
-        exists_ok : bool, optional
+        overwrite : bool, optional
             If True, overwrite any pre-existing array or group at the given path (default is False).
 
         Returns
@@ -1518,7 +1518,7 @@ def create(
                 order=order,
                 filters=filters,
                 compressor=compressor,
-                exists_ok=exists_ok,
+                overwrite=overwrite,
             ),
         )
         return cls(async_array)
diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py
index 3613e3e12b..f46c5126b2 100644
--- a/src/zarr/core/group.py
+++ b/src/zarr/core/group.py
@@ -410,12 +410,12 @@ async def from_store(
         store: StoreLike,
         *,
         attributes: dict[str, Any] | None = None,
-        exists_ok: bool = False,
+        overwrite: bool = False,
         zarr_format: ZarrFormat = 3,
     ) -> AsyncGroup:
         store_path = await make_store_path(store)
 
-        if exists_ok:
+        if overwrite:
             if store_path.store.supports_deletes:
                 await store_path.delete_dir()
             else:
@@ -629,7 +629,7 @@ async def setitem(self, key: str, value: Any) -> None:
         """
         path = self.store_path / key
         await async_api.save_array(
-            store=path, arr=value, zarr_format=self.metadata.zarr_format, exists_ok=True
+            store=path, arr=value, zarr_format=self.metadata.zarr_format, overwrite=True
         )
 
     async def getitem(
@@ -919,7 +919,7 @@ async def create_group(
         self,
         name: str,
         *,
-        exists_ok: bool = False,
+        overwrite: bool = False,
         attributes: dict[str, Any] | None = None,
     ) -> AsyncGroup:
         """Create a sub-group.
 
         Parameters
         ----------
         name : str
             Group name.
-        exists_ok : bool, optional
+        overwrite : bool, optional
             If True, do not raise an error if the group already exists.
         attributes : dict, optional
             Group attributes. 
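 
         A minimal usage sketch (the group name is illustrative)::
 
             subgroup = await group.create_group("foo", overwrite=True)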
@@ -941,7 +941,7 @@ async def create_group( return await type(self).from_store( self.store_path / name, attributes=attributes, - exists_ok=exists_ok, + overwrite=overwrite, zarr_format=self.metadata.zarr_format, ) @@ -960,8 +960,8 @@ async def require_group(self, name: str, overwrite: bool = False) -> AsyncGroup: g : AsyncGroup """ if overwrite: - # TODO: check that exists_ok=True errors if an array exists where the group is being created - grp = await self.create_group(name, exists_ok=True) + # TODO: check that overwrite=True errors if an array exists where the group is being created + grp = await self.create_group(name, overwrite=True) else: try: item: ( @@ -1018,7 +1018,7 @@ async def create_array( filters: list[dict[str, JSON]] | None = None, compressor: dict[str, JSON] | None = None, # runtime - exists_ok: bool = False, + overwrite: bool = False, data: npt.ArrayLike | None = None, ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: """ @@ -1052,7 +1052,7 @@ async def create_array( Filters for the array. compressor : dict[str, JSON] | None = None The compressor for the array. - exists_ok : bool = False + overwrite : bool = False If True, a pre-existing array or group at the path of this array will be overwritten. If False, the presence of a pre-existing array or group is an error. @@ -1077,7 +1077,7 @@ async def create_array( order=order, filters=filters, compressor=compressor, - exists_ok=exists_ok, + overwrite=overwrite, zarr_format=self.metadata.zarr_format, data=data, ) @@ -1651,7 +1651,7 @@ def from_store( *, attributes: dict[str, Any] | None = None, zarr_format: ZarrFormat = 3, - exists_ok: bool = False, + overwrite: bool = False, ) -> Group: """Instantiate a group from an initialized store. @@ -1663,7 +1663,7 @@ def from_store( A dictionary of JSON-serializable values with user-defined attributes. zarr_format : {2, 3}, optional Zarr storage format version. - exists_ok : bool, optional + overwrite : bool, optional If True, do not raise an error if the group already exists. Returns @@ -1680,7 +1680,7 @@ def from_store( AsyncGroup.from_store( store, attributes=attributes, - exists_ok=exists_ok, + overwrite=overwrite, zarr_format=zarr_format, ), ) @@ -2217,7 +2217,7 @@ def create_array( filters: list[dict[str, JSON]] | None = None, compressor: dict[str, JSON] | None = None, # runtime - exists_ok: bool = False, + overwrite: bool = False, data: npt.ArrayLike | None = None, ) -> Array: """Create a zarr array within this AsyncGroup. @@ -2251,7 +2251,7 @@ def create_array( Filters for the array. compressor : dict[str, JSON] | None = None The compressor for the array. - exists_ok : bool = False + overwrite : bool = False If True, a pre-existing array or group at the path of this array will be overwritten. If False, the presence of a pre-existing array or group is an error. @@ -2280,7 +2280,7 @@ def create_array( order=order, filters=filters, compressor=compressor, - exists_ok=exists_ok, + overwrite=overwrite, data=data, ) ) @@ -2558,7 +2558,7 @@ def array( filters: list[dict[str, JSON]] | None = None, compressor: dict[str, JSON] | None = None, # runtime - exists_ok: bool = False, + overwrite: bool = False, data: npt.ArrayLike | None = None, ) -> Array: """Create a zarr array within this AsyncGroup. @@ -2592,7 +2592,7 @@ def array( Filters for the array. compressor : dict[str, JSON] | None = None The compressor for the array. - exists_ok : bool = False + overwrite : bool = False If True, a pre-existing array or group at the path of this array will be overwritten. 
If False, the presence of a pre-existing array or group is an error. @@ -2622,7 +2622,7 @@ def array( order=order, filters=filters, compressor=compressor, - exists_ok=exists_ok, + overwrite=overwrite, data=data, ) ) diff --git a/tests/conftest.py b/tests/conftest.py index 35f31d39b3..fbef922931 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -100,7 +100,7 @@ async def async_group(request: pytest.FixtureRequest, tmpdir: LEGACY_PATH) -> As store, attributes=param.attributes, zarr_format=param.zarr_format, - exists_ok=False, + overwrite=False, ) diff --git a/tests/test_array.py b/tests/test_array.py index 86da801d1f..263b536784 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -27,12 +27,12 @@ @pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"]) @pytest.mark.parametrize("zarr_format", [2, 3]) -@pytest.mark.parametrize("exists_ok", [True, False]) +@pytest.mark.parametrize("overwrite", [True, False]) @pytest.mark.parametrize("extant_node", ["array", "group"]) def test_array_creation_existing_node( store: LocalStore | MemoryStore, zarr_format: ZarrFormat, - exists_ok: bool, + overwrite: bool, extant_node: Literal["array", "group"], ) -> None: """ @@ -53,14 +53,14 @@ def test_array_creation_existing_node( new_shape = (2, 2) new_dtype = "float32" - if exists_ok: + if overwrite: if not store.supports_deletes: pytest.skip("store does not support deletes") arr_new = Array.create( spath / "extant", shape=new_shape, dtype=new_dtype, - exists_ok=exists_ok, + overwrite=overwrite, zarr_format=zarr_format, ) assert arr_new.shape == new_shape @@ -71,7 +71,7 @@ def test_array_creation_existing_node( spath / "extant", shape=new_shape, dtype=new_dtype, - exists_ok=exists_ok, + overwrite=overwrite, zarr_format=zarr_format, ) diff --git a/tests/test_group.py b/tests/test_group.py index afa290207d..416e10af9a 100644 --- a/tests/test_group.py +++ b/tests/test_group.py @@ -40,7 +40,7 @@ async def store(request: pytest.FixtureRequest, tmpdir: LEGACY_PATH) -> Store: @pytest.fixture(params=[True, False]) -def exists_ok(request: pytest.FixtureRequest) -> bool: +def overwrite(request: pytest.FixtureRequest) -> bool: result = request.param if not isinstance(result, bool): raise TypeError("Wrong type returned by test fixture.") @@ -154,7 +154,7 @@ def test_group_members(store: Store, zarr_format: ZarrFormat, consolidated_metad subsubsubgroup = subsubgroup.create_group("subsubsubgroup") members_expected["subarray"] = group.create_array( - "subarray", shape=(100,), dtype="uint8", chunk_shape=(10,), exists_ok=True + "subarray", shape=(100,), dtype="uint8", chunk_shape=(10,), overwrite=True ) # add an extra object to the domain of the group. @@ -227,7 +227,7 @@ def test_group(store: Store, zarr_format: ZarrFormat) -> None: # create an array from the "bar" group data = np.arange(0, 4 * 4, dtype="uint16").reshape((4, 4)) arr = bar.create_array( - "baz", shape=data.shape, dtype=data.dtype, chunk_shape=(2, 2), exists_ok=True + "baz", shape=data.shape, dtype=data.dtype, chunk_shape=(2, 2), overwrite=True ) arr[:] = data @@ -252,23 +252,23 @@ def test_group(store: Store, zarr_format: ZarrFormat) -> None: assert dict(bar3.attrs) == {"baz": "qux", "name": "bar"} -def test_group_create(store: Store, exists_ok: bool, zarr_format: ZarrFormat) -> None: +def test_group_create(store: Store, overwrite: bool, zarr_format: ZarrFormat) -> None: """ Test that `Group.from_store` works as expected. 
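 
     Both values of ``overwrite`` are exercised; with ``overwrite=False`` a second
     creation at the same path is expected to raise ``ContainsGroupError``.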
""" attributes = {"foo": 100} group = Group.from_store( - store, attributes=attributes, zarr_format=zarr_format, exists_ok=exists_ok + store, attributes=attributes, zarr_format=zarr_format, overwrite=overwrite ) assert group.attrs == attributes - if not exists_ok: + if not overwrite: with pytest.raises(ContainsGroupError): - _ = Group.from_store(store, exists_ok=exists_ok, zarr_format=zarr_format) + _ = Group.from_store(store, overwrite=overwrite, zarr_format=zarr_format) -def test_group_open(store: Store, zarr_format: ZarrFormat, exists_ok: bool) -> None: +def test_group_open(store: Store, zarr_format: ZarrFormat, overwrite: bool) -> None: """ Test the `Group.open` method. """ @@ -280,24 +280,24 @@ def test_group_open(store: Store, zarr_format: ZarrFormat, exists_ok: bool) -> N # create the group attrs = {"path": "foo"} group_created = Group.from_store( - store, attributes=attrs, zarr_format=zarr_format, exists_ok=exists_ok + store, attributes=attrs, zarr_format=zarr_format, overwrite=overwrite ) assert group_created.attrs == attrs assert group_created.metadata.zarr_format == zarr_format assert group_created.store_path == spath - # attempt to create a new group in place, to test exists_ok + # attempt to create a new group in place, to test overwrite new_attrs = {"path": "bar"} - if not exists_ok: + if not overwrite: with pytest.raises(ContainsGroupError): - Group.from_store(store, attributes=attrs, zarr_format=zarr_format, exists_ok=exists_ok) + Group.from_store(store, attributes=attrs, zarr_format=zarr_format, overwrite=overwrite) else: if not store.supports_deletes: pytest.skip( - "Store does not support deletes but `exists_ok` is True, requiring deletes to override a group" + "Store does not support deletes but `overwrite` is True, requiring deletes to override a group" ) group_created_again = Group.from_store( - store, attributes=new_attrs, zarr_format=zarr_format, exists_ok=exists_ok + store, attributes=new_attrs, zarr_format=zarr_format, overwrite=overwrite ) assert group_created_again.attrs == new_attrs assert group_created_again.metadata.zarr_format == zarr_format @@ -597,7 +597,7 @@ async def test_group_update_attributes_async(store: Store, zarr_format: ZarrForm def test_group_create_array( store: Store, zarr_format: ZarrFormat, - exists_ok: bool, + overwrite: bool, method: Literal["create_array", "array"], ) -> None: """ @@ -616,7 +616,7 @@ def test_group_create_array( else: raise AssertionError - if not exists_ok: + if not overwrite: if method == "create_array": with pytest.raises(ContainsArrayError): group.create_array(name="array", shape=shape, dtype=dtype, data=data) @@ -698,12 +698,12 @@ def test_group_array_creation( @pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"]) @pytest.mark.parametrize("zarr_format", [2, 3]) -@pytest.mark.parametrize("exists_ok", [True, False]) +@pytest.mark.parametrize("overwrite", [True, False]) @pytest.mark.parametrize("extant_node", ["array", "group"]) def test_group_creation_existing_node( store: Store, zarr_format: ZarrFormat, - exists_ok: bool, + overwrite: bool, extant_node: Literal["array", "group"], ) -> None: """ @@ -725,14 +725,14 @@ def test_group_creation_existing_node( new_attributes = {"new": True} - if exists_ok: + if overwrite: if not store.supports_deletes: - pytest.skip("store does not support deletes but exists_ok is True") + pytest.skip("store does not support deletes but overwrite is True") node_new = Group.from_store( spath / "extant", attributes=new_attributes, zarr_format=zarr_format, - 
exists_ok=exists_ok, + overwrite=overwrite, ) assert node_new.attrs == new_attributes else: @@ -741,13 +741,13 @@ def test_group_creation_existing_node( spath / "extant", attributes=new_attributes, zarr_format=zarr_format, - exists_ok=exists_ok, + overwrite=overwrite, ) async def test_asyncgroup_create( store: Store, - exists_ok: bool, + overwrite: bool, zarr_format: ZarrFormat, ) -> None: """ @@ -758,19 +758,19 @@ async def test_asyncgroup_create( agroup = await AsyncGroup.from_store( store, attributes=attributes, - exists_ok=exists_ok, + overwrite=overwrite, zarr_format=zarr_format, ) assert agroup.metadata == GroupMetadata(zarr_format=zarr_format, attributes=attributes) assert agroup.store_path == await make_store_path(store) - if not exists_ok: + if not overwrite: with pytest.raises(ContainsGroupError): agroup = await AsyncGroup.from_store( spath, attributes=attributes, - exists_ok=exists_ok, + overwrite=overwrite, zarr_format=zarr_format, ) # create an array at our target path @@ -782,7 +782,7 @@ async def test_asyncgroup_create( _ = await AsyncGroup.from_store( StorePath(store=store) / collision_name, attributes=attributes, - exists_ok=exists_ok, + overwrite=overwrite, zarr_format=zarr_format, ) @@ -805,7 +805,7 @@ async def test_asyncgroup_open( group_w = await AsyncGroup.from_store( store=store, attributes=attributes, - exists_ok=False, + overwrite=False, zarr_format=zarr_format, ) @@ -819,7 +819,7 @@ async def test_asyncgroup_open_wrong_format( store: Store, zarr_format: ZarrFormat, ) -> None: - _ = await AsyncGroup.from_store(store=store, exists_ok=False, zarr_format=zarr_format) + _ = await AsyncGroup.from_store(store=store, overwrite=False, zarr_format=zarr_format) zarr_format_wrong: ZarrFormat # try opening with the wrong zarr format if zarr_format == 3: @@ -932,7 +932,7 @@ async def test_asyncgroup_create_group( async def test_asyncgroup_create_array( - store: Store, zarr_format: ZarrFormat, exists_ok: bool + store: Store, zarr_format: ZarrFormat, overwrite: bool ) -> None: """ Test that the AsyncGroup.create_array method works correctly. 
We ensure that array properties @@ -941,7 +941,7 @@ async def test_asyncgroup_create_array( agroup = await AsyncGroup.from_store(store=store, zarr_format=zarr_format) - if not exists_ok: + if not overwrite: with pytest.raises(ContainsGroupError): agroup = await AsyncGroup.from_store(store=store, zarr_format=zarr_format) diff --git a/tests/test_v2.py b/tests/test_v2.py index 68c07e2024..890d4039a3 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -145,7 +145,7 @@ def test_v2_non_contiguous(array_order: Literal["C", "F"], data_order: Literal[" fill_value=np.nan, dtype="float64", zarr_format=2, - exists_ok=True, + overwrite=True, order=array_order, ) @@ -165,7 +165,7 @@ def test_v2_non_contiguous(array_order: Literal["C", "F"], data_order: Literal[" fill_value=np.nan, dtype="float64", zarr_format=2, - exists_ok=True, + overwrite=True, order=array_order, ) From 122760fc3440f2f35f5904e9dc7973d5188bdaca Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Fri, 13 Dec 2024 17:04:00 +0100 Subject: [PATCH 26/87] Feat/latency store (#2474) * feat: add wrapperstore * feat: add latencystore * rename noisysetter -> noisygetter * rename _wrapped to _store * loggingstore inherits from wrapperstore * Update src/zarr/storage/wrapper.py Co-authored-by: Joe Hamman * back to asynciterators * update docstrings --------- Co-authored-by: Joe Hamman Co-authored-by: Deepak Cherian --- src/zarr/storage/__init__.py | 2 + src/zarr/storage/logging.py | 12 +-- src/zarr/storage/wrapper.py | 139 +++++++++++++++++++++++++++++++ src/zarr/testing/store.py | 96 +++++++++++++++------ tests/test_store/test_wrapper.py | 46 ++++++++++ 5 files changed, 266 insertions(+), 29 deletions(-) create mode 100644 src/zarr/storage/wrapper.py create mode 100644 tests/test_store/test_wrapper.py diff --git a/src/zarr/storage/__init__.py b/src/zarr/storage/__init__.py index 6703aa2723..17b11f54a6 100644 --- a/src/zarr/storage/__init__.py +++ b/src/zarr/storage/__init__.py @@ -3,6 +3,7 @@ from zarr.storage.logging import LoggingStore from zarr.storage.memory import MemoryStore from zarr.storage.remote import RemoteStore +from zarr.storage.wrapper import WrapperStore from zarr.storage.zip import ZipStore __all__ = [ @@ -12,6 +13,7 @@ "RemoteStore", "StoreLike", "StorePath", + "WrapperStore", "ZipStore", "make_store_path", ] diff --git a/src/zarr/storage/logging.py b/src/zarr/storage/logging.py index bc90b4f30f..9ec3a9be18 100644 --- a/src/zarr/storage/logging.py +++ b/src/zarr/storage/logging.py @@ -7,15 +7,19 @@ from contextlib import contextmanager from typing import TYPE_CHECKING, Any -from zarr.abc.store import ByteRangeRequest, Store +from zarr.abc.store import Store +from zarr.storage.wrapper import WrapperStore if TYPE_CHECKING: from collections.abc import AsyncIterator, Generator, Iterable + from zarr.abc.store import ByteRangeRequest from zarr.core.buffer import Buffer, BufferPrototype + counter: defaultdict[str, int] + -class LoggingStore(Store): +class LoggingStore(WrapperStore[Store]): """ Store wrapper that logs all calls to the wrapped store. 
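 
     A sketch of typical usage (the wrapped store is illustrative)::
 
         store = LoggingStore(MemoryStore(), log_level="INFO")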
@@ -34,7 +38,6 @@ class LoggingStore(Store): Counter of number of times each method has been called """ - _store: Store counter: defaultdict[str, int] def __init__( @@ -43,11 +46,10 @@ def __init__( log_level: str = "DEBUG", log_handler: logging.Handler | None = None, ) -> None: - self._store = store + super().__init__(store) self.counter = defaultdict(int) self.log_level = log_level self.log_handler = log_handler - self._configure_logger(log_level, log_handler) def _configure_logger( diff --git a/src/zarr/storage/wrapper.py b/src/zarr/storage/wrapper.py new file mode 100644 index 0000000000..c160100084 --- /dev/null +++ b/src/zarr/storage/wrapper.py @@ -0,0 +1,139 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Generic, TypeVar + +if TYPE_CHECKING: + from collections.abc import AsyncGenerator, AsyncIterator, Iterable + from types import TracebackType + from typing import Any, Self + + from zarr.abc.store import ByteRangeRequest + from zarr.core.buffer import Buffer, BufferPrototype + from zarr.core.common import BytesLike + +from zarr.abc.store import Store + +T_Store = TypeVar("T_Store", bound=Store) + + +class WrapperStore(Store, Generic[T_Store]): + """ + A store class that wraps an existing ``Store`` instance. + By default all of the store methods are delegated to the wrapped store instance, which is + accessible via the ``._store`` attribute of this class. + + Use this class to modify or extend the behavior of the other store classes. + """ + + _store: T_Store + + def __init__(self, store: T_Store) -> None: + self._store = store + + @classmethod + async def open(cls: type[Self], store_cls: type[T_Store], *args: Any, **kwargs: Any) -> Self: + store = store_cls(*args, **kwargs) + await store._open() + return cls(store=store) + + def __enter__(self) -> Self: + return type(self)(self._store.__enter__()) + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + traceback: TracebackType | None, + ) -> None: + return self._store.__exit__(exc_type, exc_value, traceback) + + async def _open(self) -> None: + await self._store._open() + + async def _ensure_open(self) -> None: + await self._store._ensure_open() + + async def is_empty(self, prefix: str) -> bool: + return await self._store.is_empty(prefix) + + async def clear(self) -> None: + return await self._store.clear() + + @property + def read_only(self) -> bool: + return self._store.read_only + + def _check_writable(self) -> None: + return self._store._check_writable() + + def __eq__(self, value: object) -> bool: + return type(self) is type(value) and self._store.__eq__(value) + + async def get( + self, key: str, prototype: BufferPrototype, byte_range: ByteRangeRequest | None = None + ) -> Buffer | None: + return await self._store.get(key, prototype, byte_range) + + async def get_partial_values( + self, + prototype: BufferPrototype, + key_ranges: Iterable[tuple[str, ByteRangeRequest]], + ) -> list[Buffer | None]: + return await self._store.get_partial_values(prototype, key_ranges) + + async def exists(self, key: str) -> bool: + return await self._store.exists(key) + + async def set(self, key: str, value: Buffer) -> None: + await self._store.set(key, value) + + async def set_if_not_exists(self, key: str, value: Buffer) -> None: + return await self._store.set_if_not_exists(key, value) + + async def _set_many(self, values: Iterable[tuple[str, Buffer]]) -> None: + await self._store._set_many(values) + + @property + def supports_writes(self) -> bool: + return 
self._store.supports_writes + + @property + def supports_deletes(self) -> bool: + return self._store.supports_deletes + + async def delete(self, key: str) -> None: + await self._store.delete(key) + + @property + def supports_partial_writes(self) -> bool: + return self._store.supports_partial_writes + + async def set_partial_values( + self, key_start_values: Iterable[tuple[str, int, BytesLike]] + ) -> None: + return await self._store.set_partial_values(key_start_values) + + @property + def supports_listing(self) -> bool: + return self._store.supports_listing + + def list(self) -> AsyncIterator[str]: + return self._store.list() + + def list_prefix(self, prefix: str) -> AsyncIterator[str]: + return self._store.list_prefix(prefix) + + def list_dir(self, prefix: str) -> AsyncIterator[str]: + return self._store.list_dir(prefix) + + async def delete_dir(self, prefix: str) -> None: + return await self._store.delete_dir(prefix) + + def close(self) -> None: + self._store.close() + + async def _get_many( + self, requests: Iterable[tuple[str, BufferPrototype, ByteRangeRequest | None]] + ) -> AsyncGenerator[tuple[str, Buffer | None], None]: + async for req in self._store._get_many(requests): + yield req diff --git a/src/zarr/testing/store.py b/src/zarr/testing/store.py index b793f2d67b..53dee012bf 100644 --- a/src/zarr/testing/store.py +++ b/src/zarr/testing/store.py @@ -1,9 +1,20 @@ +from __future__ import annotations + +import asyncio import pickle -from typing import Any, Generic, TypeVar +from typing import TYPE_CHECKING, Generic, TypeVar + +from zarr.storage.wrapper import WrapperStore + +if TYPE_CHECKING: + from typing import Any + + from zarr.abc.store import ByteRangeRequest + from zarr.core.buffer.core import BufferPrototype import pytest -from zarr.abc.store import Store +from zarr.abc.store import ByteRangeRequest, Store from zarr.core.buffer import Buffer, default_buffer_prototype from zarr.core.sync import _collect_aiterator from zarr.storage._utils import _normalize_interval_index @@ -319,25 +330,62 @@ async def test_set_if_not_exists(self, store: S) -> None: result = await store.get("k2", default_buffer_prototype()) assert result == new - async def test_getsize(self, store: S) -> None: - key = "k" - data = self.buffer_cls.from_bytes(b"0" * 10) - await self.set(store, key, data) - - result = await store.getsize(key) - assert isinstance(result, int) - assert result > 0 - - async def test_getsize_raises(self, store: S) -> None: - with pytest.raises(FileNotFoundError): - await store.getsize("not-a-real-key") - - async def test_getsize_prefix(self, store: S) -> None: - prefix = "array/c/" - for i in range(10): - data = self.buffer_cls.from_bytes(b"0" * 10) - await self.set(store, f"{prefix}/{i}", data) - - result = await store.getsize_prefix(prefix) - assert isinstance(result, int) - assert result > 0 + +class LatencyStore(WrapperStore[Store]): + """ + A wrapper class that takes any store class in its constructor and + adds latency to the `set` and `get` methods. This can be used for + performance testing. + """ + + get_latency: float + set_latency: float + + def __init__(self, cls: Store, *, get_latency: float = 0, set_latency: float = 0) -> None: + self.get_latency = float(get_latency) + self.set_latency = float(set_latency) + self._store = cls + + async def set(self, key: str, value: Buffer) -> None: + """ + Add latency to the ``set`` method. + + Calls ``asyncio.sleep(self.set_latency)`` before invoking the wrapped ``set`` method. 
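A rough sketch of how this latency injection can be used in timing experiments, assuming ``LatencyStore`` stays importable from ``zarr.testing.store`` (the buffer helpers mirror the tests in this series):

.. code-block:: python

    import asyncio
    import time

    from zarr.core.buffer.cpu import Buffer, buffer_prototype
    from zarr.storage import MemoryStore
    from zarr.testing.store import LatencyStore


    async def main() -> None:
        # Wrap an in-memory store so every get/set pays a 100 ms penalty.
        store = LatencyStore(await MemoryStore.open(), get_latency=0.1, set_latency=0.1)
        t0 = time.perf_counter()
        await store.set("foo", Buffer.from_bytes(b"bar"))
        await store.get("foo", buffer_prototype)
        print(f"round trip: {time.perf_counter() - t0:.2f}s")  # roughly 0.20s


    asyncio.run(main())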
+ + Parameters + ---------- + key : str + The key to set + value : Buffer + The value to set + + Returns + ------- + None + """ + await asyncio.sleep(self.set_latency) + await self._store.set(key, value) + + async def get( + self, key: str, prototype: BufferPrototype, byte_range: ByteRangeRequest | None = None + ) -> Buffer | None: + """ + Add latency to the ``get`` method. + + Calls ``asyncio.sleep(self.get_latency)`` before invoking the wrapped ``get`` method. + + Parameters + ---------- + key : str + The key to get + prototype : BufferPrototype + The BufferPrototype to use. + byte_range : ByteRangeRequest, optional + An optional byte range. + + Returns + ------- + buffer : Buffer or None + """ + await asyncio.sleep(self.get_latency) + return await self._store.get(key, prototype=prototype, byte_range=byte_range) diff --git a/tests/test_store/test_wrapper.py b/tests/test_store/test_wrapper.py new file mode 100644 index 0000000000..1caf9c9ae4 --- /dev/null +++ b/tests/test_store/test_wrapper.py @@ -0,0 +1,46 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest + +from zarr.core.buffer.cpu import Buffer, buffer_prototype +from zarr.storage.wrapper import WrapperStore + +if TYPE_CHECKING: + from zarr.abc.store import Store + from zarr.core.buffer.core import BufferPrototype + + +@pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=True) +async def test_wrapped_set(store: Store, capsys: pytest.CaptureFixture[str]) -> None: + # define a class that prints when it sets + class NoisySetter(WrapperStore): + async def set(self, key: str, value: Buffer) -> None: + print(f"setting {key}") + await super().set(key, value) + + key = "foo" + value = Buffer.from_bytes(b"bar") + store_wrapped = NoisySetter(store) + await store_wrapped.set(key, value) + captured = capsys.readouterr() + assert f"setting {key}" in captured.out + assert await store_wrapped.get(key, buffer_prototype) == value + + +@pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=True) +async def test_wrapped_get(store: Store, capsys: pytest.CaptureFixture[str]) -> None: + # define a class that prints when it sets + class NoisyGetter(WrapperStore): + def get(self, key: str, prototype: BufferPrototype) -> None: + print(f"getting {key}") + return super().get(key, prototype=prototype) + + key = "foo" + value = Buffer.from_bytes(b"bar") + store_wrapped = NoisyGetter(store) + await store_wrapped.set(key, value) + assert await store_wrapped.get(key, buffer_prototype) == value + captured = capsys.readouterr() + assert f"getting {key}" in captured.out From ab1a7b3758a7e14952b546e4f18d1d9e59168f5d Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Sat, 14 Dec 2024 20:20:56 -0500 Subject: [PATCH 27/87] rename RemoteStore -> FsspecStore (#2557) * rename RemoteStore -> FsspecStore * release note * fix store type * fixup doc build --- docs/release.rst | 2 ++ src/zarr/storage/__init__.py | 4 +-- src/zarr/storage/common.py | 4 +-- src/zarr/storage/{remote.py => fsspec.py} | 22 +++++++------- tests/conftest.py | 16 +++++----- tests/test_store/test_core.py | 4 +-- .../{test_remote.py => test_fsspec.py} | 30 +++++++++---------- 7 files changed, 42 insertions(+), 40 deletions(-) rename src/zarr/storage/{remote.py => fsspec.py} (96%) rename tests/test_store/{test_remote.py => test_fsspec.py} (89%) diff --git a/docs/release.rst b/docs/release.rst index 7f424c00e2..dd60502e85 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -39,6 +39,8 @@ Dependency Changes fsspec and any relevant 
implementations (e.g. s3fs) before using the ``RemoteStore``. By :user:`Joe Hamman ` :issue:`2391`. +* ``RemoteStore`` was renamed to ``FsspecStore``. + By :user:`Joe Hamman ` :issue:`2557`. .. release_3.0.0-alpha: diff --git a/src/zarr/storage/__init__.py b/src/zarr/storage/__init__.py index 17b11f54a6..6f3ec59b01 100644 --- a/src/zarr/storage/__init__.py +++ b/src/zarr/storage/__init__.py @@ -1,16 +1,16 @@ from zarr.storage.common import StoreLike, StorePath, make_store_path +from zarr.storage.fsspec import FsspecStore from zarr.storage.local import LocalStore from zarr.storage.logging import LoggingStore from zarr.storage.memory import MemoryStore -from zarr.storage.remote import RemoteStore from zarr.storage.wrapper import WrapperStore from zarr.storage.zip import ZipStore __all__ = [ + "FsspecStore", "LocalStore", "LoggingStore", "MemoryStore", - "RemoteStore", "StoreLike", "StorePath", "WrapperStore", diff --git a/src/zarr/storage/common.py b/src/zarr/storage/common.py index e9d57197e1..973c8b13e3 100644 --- a/src/zarr/storage/common.py +++ b/src/zarr/storage/common.py @@ -281,7 +281,7 @@ async def make_store_path( TypeError If the StoreLike object is not one of the supported types. """ - from zarr.storage.remote import RemoteStore # circular import + from zarr.storage.fsspec import FsspecStore # circular import used_storage_options = False path_normalized = normalize_path(path) @@ -302,7 +302,7 @@ async def make_store_path( if _is_fsspec_uri(store_like): used_storage_options = True - store = RemoteStore.from_url( + store = FsspecStore.from_url( store_like, storage_options=storage_options, read_only=_read_only ) else: diff --git a/src/zarr/storage/remote.py b/src/zarr/storage/fsspec.py similarity index 96% rename from src/zarr/storage/remote.py rename to src/zarr/storage/fsspec.py index 2b8329c9fa..c9edd8f8ac 100644 --- a/src/zarr/storage/remote.py +++ b/src/zarr/storage/fsspec.py @@ -22,7 +22,7 @@ ) -class RemoteStore(Store): +class FsspecStore(Store): """ A remote Store based on FSSpec @@ -61,8 +61,8 @@ class RemoteStore(Store): See Also -------- - RemoteStore.from_upath - RemoteStore.from_url + FsspecStore.from_upath + FsspecStore.from_url """ # based on FSSpec @@ -96,7 +96,7 @@ def __init__( if "://" in path and not path.startswith("http"): # `not path.startswith("http")` is a special case for the http filesystem (¯\_(ツ)_/¯) scheme, _ = path.split("://", maxsplit=1) - raise ValueError(f"path argument to RemoteStore must not include scheme ({scheme}://)") + raise ValueError(f"path argument to FsspecStore must not include scheme ({scheme}://)") @classmethod def from_upath( @@ -104,9 +104,9 @@ def from_upath( upath: Any, read_only: bool = False, allowed_exceptions: tuple[type[Exception], ...] = ALLOWED_EXCEPTIONS, - ) -> RemoteStore: + ) -> FsspecStore: """ - Create a RemoteStore from an upath object. + Create a FsspecStore from an upath object. Parameters ---------- @@ -120,7 +120,7 @@ def from_upath( Returns ------- - RemoteStore + FsspecStore """ return cls( fs=upath.fs, @@ -136,9 +136,9 @@ def from_url( storage_options: dict[str, Any] | None = None, read_only: bool = False, allowed_exceptions: tuple[type[Exception], ...] = ALLOWED_EXCEPTIONS, - ) -> RemoteStore: + ) -> FsspecStore: """ - Create a RemoteStore from a URL. + Create a FsspecStore from a URL. 
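For example, a read-only store for an S3 prefix can be constructed like this (``example-bucket`` is a placeholder, and fsspec plus s3fs must be installed):

.. code-block:: python

    from zarr.storage import FsspecStore

    # Placeholder URL shown for illustration only.
    store = FsspecStore.from_url(
        "s3://example-bucket/data.zarr",
        storage_options={"anon": True},
        read_only=True,
    )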
Parameters ---------- @@ -154,7 +154,7 @@ def from_url( Returns ------- - RemoteStore + FsspecStore """ try: from fsspec import url_to_fs @@ -185,7 +185,7 @@ async def clear(self) -> None: pass def __repr__(self) -> str: - return f"" + return f"" def __eq__(self, other: object) -> bool: return ( diff --git a/tests/conftest.py b/tests/conftest.py index fbef922931..ee31d0d071 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -13,7 +13,7 @@ from zarr.abc.store import Store from zarr.core.sync import sync from zarr.storage import LocalStore, MemoryStore, StorePath, ZipStore -from zarr.storage.remote import RemoteStore +from zarr.storage.fsspec import FsspecStore if TYPE_CHECKING: from collections.abc import Generator @@ -25,14 +25,14 @@ async def parse_store( - store: Literal["local", "memory", "remote", "zip"], path: str -) -> LocalStore | MemoryStore | RemoteStore | ZipStore: + store: Literal["local", "memory", "fsspec", "zip"], path: str +) -> LocalStore | MemoryStore | FsspecStore | ZipStore: if store == "local": return await LocalStore.open(path) if store == "memory": return await MemoryStore.open() - if store == "remote": - return await RemoteStore.open(url=path) + if store == "fsspec": + return await FsspecStore.open(url=path) if store == "zip": return await ZipStore.open(path + "/zarr.zip", mode="w") raise AssertionError @@ -56,8 +56,8 @@ async def local_store(tmpdir: LEGACY_PATH) -> LocalStore: @pytest.fixture -async def remote_store(url: str) -> RemoteStore: - return await RemoteStore.open(url) +async def remote_store(url: str) -> FsspecStore: + return await FsspecStore.open(url) @pytest.fixture @@ -87,7 +87,7 @@ def sync_store(request: pytest.FixtureRequest, tmp_path: LEGACY_PATH) -> Store: @dataclass class AsyncGroupRequest: zarr_format: ZarrFormat - store: Literal["local", "remote", "memory", "zip"] + store: Literal["local", "fsspec", "memory", "zip"] attributes: dict[str, Any] = field(default_factory=dict) diff --git a/tests/test_store/test_core.py b/tests/test_store/test_core.py index 81ed3744a9..48f8d2a529 100644 --- a/tests/test_store/test_core.py +++ b/tests/test_store/test_core.py @@ -7,9 +7,9 @@ from zarr.core.common import AccessModeLiteral from zarr.storage._utils import normalize_path from zarr.storage.common import StoreLike, StorePath, make_store_path +from zarr.storage.fsspec import FsspecStore from zarr.storage.local import LocalStore from zarr.storage.memory import MemoryStore -from zarr.storage.remote import RemoteStore @pytest.mark.parametrize("path", [None, "", "bar"]) @@ -73,7 +73,7 @@ async def test_make_store_path_invalid() -> None: async def test_make_store_path_fsspec(monkeypatch) -> None: pytest.importorskip("fsspec") store_path = await make_store_path("http://foo.com/bar") - assert isinstance(store_path.store, RemoteStore) + assert isinstance(store_path.store, FsspecStore) @pytest.mark.parametrize( diff --git a/tests/test_store/test_remote.py b/tests/test_store/test_fsspec.py similarity index 89% rename from tests/test_store/test_remote.py rename to tests/test_store/test_fsspec.py index c7f33e4b39..b307f2cdf4 100644 --- a/tests/test_store/test_remote.py +++ b/tests/test_store/test_fsspec.py @@ -10,7 +10,7 @@ import zarr.api.asynchronous from zarr.core.buffer import Buffer, cpu, default_buffer_prototype from zarr.core.sync import _collect_aiterator, sync -from zarr.storage import RemoteStore +from zarr.storage import FsspecStore from zarr.testing.store import StoreTests if TYPE_CHECKING: @@ -84,7 +84,7 @@ def s3(s3_base: None) -> 
Generator[s3fs.S3FileSystem, None, None]: async def test_basic() -> None: - store = RemoteStore.from_url( + store = FsspecStore.from_url( f"s3://{test_bucket_name}/foo/spam/", storage_options={"endpoint_url": endpoint_url, "anon": False}, ) @@ -102,8 +102,8 @@ async def test_basic() -> None: assert out[0].to_bytes() == data[1:] -class TestRemoteStoreS3(StoreTests[RemoteStore, cpu.Buffer]): - store_cls = RemoteStore +class TestFsspecStoreS3(StoreTests[FsspecStore, cpu.Buffer]): + store_cls = FsspecStore buffer_cls = cpu.Buffer @pytest.fixture @@ -114,36 +114,36 @@ def store_kwargs(self, request) -> dict[str, str | bool]: return {"fs": fs, "path": path} @pytest.fixture - def store(self, store_kwargs: dict[str, str | bool]) -> RemoteStore: + def store(self, store_kwargs: dict[str, str | bool]) -> FsspecStore: return self.store_cls(**store_kwargs) - async def get(self, store: RemoteStore, key: str) -> Buffer: + async def get(self, store: FsspecStore, key: str) -> Buffer: # make a new, synchronous instance of the filesystem because this test is run in sync code new_fs = fsspec.filesystem( "s3", endpoint_url=store.fs.endpoint_url, anon=store.fs.anon, asynchronous=False ) return self.buffer_cls.from_bytes(new_fs.cat(f"{store.path}/{key}")) - async def set(self, store: RemoteStore, key: str, value: Buffer) -> None: + async def set(self, store: FsspecStore, key: str, value: Buffer) -> None: # make a new, synchronous instance of the filesystem because this test is run in sync code new_fs = fsspec.filesystem( "s3", endpoint_url=store.fs.endpoint_url, anon=store.fs.anon, asynchronous=False ) new_fs.write_bytes(f"{store.path}/{key}", value.to_bytes()) - def test_store_repr(self, store: RemoteStore) -> None: - assert str(store) == "" + def test_store_repr(self, store: FsspecStore) -> None: + assert str(store) == "" - def test_store_supports_writes(self, store: RemoteStore) -> None: + def test_store_supports_writes(self, store: FsspecStore) -> None: assert store.supports_writes - def test_store_supports_partial_writes(self, store: RemoteStore) -> None: + def test_store_supports_partial_writes(self, store: FsspecStore) -> None: assert not store.supports_partial_writes - def test_store_supports_listing(self, store: RemoteStore) -> None: + def test_store_supports_listing(self, store: FsspecStore) -> None: assert store.supports_listing - async def test_remote_store_from_uri(self, store: RemoteStore): + async def test_fsspec_store_from_uri(self, store: FsspecStore) -> None: storage_options = { "endpoint_url": endpoint_url, "anon": False, @@ -188,7 +188,7 @@ def test_from_upath(self) -> None: anon=False, asynchronous=True, ) - result = RemoteStore.from_upath(path) + result = FsspecStore.from_upath(path) assert result.fs.endpoint_url == endpoint_url assert result.fs.asynchronous assert result.path == f"{test_bucket_name}/foo/bar" @@ -197,7 +197,7 @@ def test_init_raises_if_path_has_scheme(self, store_kwargs) -> None: # regression test for https://github.com/zarr-developers/zarr-python/issues/2342 store_kwargs["path"] = "s3://" + store_kwargs["path"] with pytest.raises( - ValueError, match="path argument to RemoteStore must not include scheme .*" + ValueError, match="path argument to FsspecStore must not include scheme .*" ): self.store_cls(**store_kwargs) From 9972066af88cb5e64f1ba5d4564d66572703e6cc Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 16 Dec 2024 12:23:13 +0000 Subject: [PATCH 28/87] Bump pypa/gh-action-pypi-publish in the actions 
group (#2562) Bumps the actions group with 1 update: [pypa/gh-action-pypi-publish](https://github.com/pypa/gh-action-pypi-publish). Updates `pypa/gh-action-pypi-publish` from 1.12.2 to 1.12.3 - [Release notes](https://github.com/pypa/gh-action-pypi-publish/releases) - [Commits](https://github.com/pypa/gh-action-pypi-publish/compare/v1.12.2...v1.12.3) --- updated-dependencies: - dependency-name: pypa/gh-action-pypi-publish dependency-type: direct:production update-type: version-update:semver-patch dependency-group: actions ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/releases.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/releases.yml b/.github/workflows/releases.yml index 34d6696413..375d9651d5 100644 --- a/.github/workflows/releases.yml +++ b/.github/workflows/releases.yml @@ -55,7 +55,7 @@ jobs: with: name: releases path: dist - - uses: pypa/gh-action-pypi-publish@v1.12.2 + - uses: pypa/gh-action-pypi-publish@v1.12.3 with: user: __token__ password: ${{ secrets.pypi_password }} From 77d0b112cb8211d3e033c502f35c96d230b78647 Mon Sep 17 00:00:00 2001 From: David Stansby Date: Mon, 16 Dec 2024 12:43:18 +0000 Subject: [PATCH 29/87] Clean up optional dependency groups (#2541) * Clean up optional dependency groups * Fix hatch envs * Remove jupyter extra * Drop upath and tree dependency groups * Change fsspec group to remote group * Add a v3 what's new * Fix optional group * Fix spelling --------- Co-authored-by: Joe Hamman --- docs/guide/index.rst | 1 + docs/guide/whatsnew_v3.rst | 14 ++++++++++++++ pyproject.toml | 33 ++++++++++----------------------- 3 files changed, 25 insertions(+), 23 deletions(-) create mode 100644 docs/guide/whatsnew_v3.rst diff --git a/docs/guide/index.rst b/docs/guide/index.rst index f841dbb85d..e532a13e20 100644 --- a/docs/guide/index.rst +++ b/docs/guide/index.rst @@ -4,5 +4,6 @@ Guide .. toctree:: :maxdepth: 1 + whatsnew_v3 storage consolidated_metadata diff --git a/docs/guide/whatsnew_v3.rst b/docs/guide/whatsnew_v3.rst new file mode 100644 index 0000000000..302c3cf20c --- /dev/null +++ b/docs/guide/whatsnew_v3.rst @@ -0,0 +1,14 @@ +What's new in v3 +================ + +This page gives an overview of major changes and additions in version 3. + + +Dependencies +------------ +- The new ``remote`` dependency group can be used to install a supported version of + ``fsspec``, required for remote data access. +- The new ``gpu`` dependency group can be used to install a supported version of + ``cuda``, required for GPU functionality. +- The ``jupyter`` optional dependency group has been removed, since v3 contains no + jupyter specific functionality. 
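To make the new groups concrete, a sketch of the intended usage; the bucket name is a placeholder, and passing ``storage_options`` through ``zarr.open`` relies on the fsspec URL handling shown elsewhere in this series:

.. code-block:: python

    # pip install "zarr[remote]"   # pulls in fsspec for URL-based stores
    # pip install "zarr[gpu]"      # pulls in cupy-cuda12x for GPU buffers
    import zarr

    # With the remote extra installed (plus s3fs for s3:// URLs), fsspec
    # URLs can be opened directly; "example-bucket" is illustrative only.
    z = zarr.open(
        "s3://example-bucket/data.zarr",
        mode="r",
        storage_options={"anon": True},
    )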
diff --git a/pyproject.toml b/pyproject.toml index 5f2d7569b9..b438a2c292 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,9 +53,14 @@ license = {text = "MIT License"} keywords = ["Python", "compressed", "ndimensional-arrays", "zarr"] [project.optional-dependencies] -fsspec = [ +# User extras +remote = [ "fsspec>=2023.10.0", ] +gpu = [ + "cupy-cuda12x", +] +# Development extras test = [ "coverage", "pytest", @@ -68,15 +73,7 @@ test = [ "hypothesis", "universal-pathlib", ] - -jupyter = [ - 'notebook', - 'ipytree>=0.2.2', - 'ipywidgets>=8.0.0', -] -gpu = [ - "cupy-cuda12x", -] +optional = ["rich", "universal-pathlib"] docs = [ 'sphinx==8.1.3', 'sphinx-autobuild>=2021.3.14', @@ -87,19 +84,9 @@ docs = [ 'pydata-sphinx-theme', 'numpydoc', 'numcodecs[msgpack]', - 'msgpack', -] -extra = [ - 'msgpack', -] -optional = [ - 'universal-pathlib>=0.0.22', - 'rich' -] -tree = [ - 'rich', ] + [project.urls] "Bug Tracker" = "https://github.com/zarr-developers/zarr-python/issues" Changelog = "https://zarr.readthedocs.io/en/stable/release.html" @@ -129,7 +116,7 @@ dependencies = [ "numpy~={matrix:numpy}", "universal_pathlib", ] -features = ["test", "extra"] +features = ["test"] [[tool.hatch.envs.test.matrix]] python = ["3.11", "3.12", "3.13"] @@ -160,7 +147,7 @@ dependencies = [ "numpy~={matrix:numpy}", "universal_pathlib", ] -features = ["test", "extra", "gpu"] +features = ["test", "gpu"] [[tool.hatch.envs.gputest.matrix]] python = ["3.11", "3.12", "3.13"] From 775979fce31175149a35aee04f0f7b8cb29ab9a4 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Mon, 16 Dec 2024 14:57:02 +0100 Subject: [PATCH 30/87] add warnings when using non-spec features with v3 (#2556) * add warnings when using non-spec features with v3 * tweak signature * may change in the future * wording --- pyproject.toml | 1 + src/zarr/api/asynchronous.py | 9 +++++++++ src/zarr/codecs/vlen_utf8.py | 19 +++++++++++++++++++ src/zarr/core/array.py | 13 +++++++++++-- src/zarr/core/metadata/v3.py | 10 +++++----- 5 files changed, 45 insertions(+), 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b438a2c292..7b516bbc05 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -363,6 +363,7 @@ filterwarnings = [ "ignore:The loop argument is deprecated since Python 3.8.*:DeprecationWarning", "ignore:Creating a zarr.buffer.gpu.*:UserWarning", "ignore:Duplicate name:UserWarning", # from ZipFile + "ignore:.*is currently not part in the Zarr version 3 specification.*:UserWarning", ] markers = [ "gpu: mark a test as requiring CuPy and GPU" diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index b5dbb0cfa5..2d1c26e145 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -195,6 +195,14 @@ async def consolidate_metadata( v = dataclasses.replace(v, consolidated_metadata=ConsolidatedMetadata(metadata={})) members_metadata[k] = v + if any(m.zarr_format == 3 for m in members_metadata.values()): + warnings.warn( + "Consolidated metadata is currently not part in the Zarr version 3 specification. 
It " + "may not be supported by other zarr implementations and may change in the future.", + category=UserWarning, + stacklevel=1, + ) + ConsolidatedMetadata._flat_to_nested(members_metadata) consolidated_metadata = ConsolidatedMetadata(metadata=members_metadata) @@ -203,6 +211,7 @@ async def consolidate_metadata( group, metadata=metadata, ) + await group._save_metadata() return group diff --git a/src/zarr/codecs/vlen_utf8.py b/src/zarr/codecs/vlen_utf8.py index 43544e0809..e5b895ae0c 100644 --- a/src/zarr/codecs/vlen_utf8.py +++ b/src/zarr/codecs/vlen_utf8.py @@ -2,6 +2,7 @@ from dataclasses import dataclass from typing import TYPE_CHECKING +from warnings import warn import numpy as np from numcodecs.vlen import VLenBytes, VLenUTF8 @@ -25,6 +26,15 @@ @dataclass(frozen=True) class VLenUTF8Codec(ArrayBytesCodec): + def __init__(self) -> None: + warn( + "The codec `vlen-utf8` is currently not part in the Zarr version 3 specification. It " + "may not be supported by other zarr implementations and may change in the future.", + category=UserWarning, + stacklevel=2, + ) + super().__init__() + @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: _, configuration_parsed = parse_named_configuration( @@ -71,6 +81,15 @@ def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) - @dataclass(frozen=True) class VLenBytesCodec(ArrayBytesCodec): + def __init__(self) -> None: + warn( + "The codec `vlen-bytes` is currently not part in the Zarr version 3 specification. It " + "may not be supported by other zarr implementations and may change in the future.", + category=UserWarning, + stacklevel=2, + ) + super().__init__() + @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: _, configuration_parsed = parse_named_configuration( diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index aab7e2a527..b57712717b 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -6,6 +6,7 @@ from itertools import starmap from logging import getLogger from typing import TYPE_CHECKING, Any, Generic, Literal, cast, overload +from warnings import warn import numpy as np import numpy.typing as npt @@ -539,7 +540,7 @@ async def _create_v3( store_path: StorePath, *, shape: ShapeLike, - dtype: npt.DTypeLike, + dtype: np.dtype[Any], chunk_shape: ChunkCoords, fill_value: Any | None = None, order: MemoryOrder | None = None, @@ -580,6 +581,14 @@ async def _create_v3( else DefaultChunkKeyEncoding(separator=chunk_key_encoding[1]) ) + if dtype.kind in "UTS": + warn( + f"The dtype `{dtype}` is currently not part in the Zarr version 3 specification. 
It " + "may not be supported by other zarr implementations and may change in the future.", + category=UserWarning, + stacklevel=2, + ) + metadata = ArrayV3Metadata( shape=shape, data_type=dtype, @@ -601,7 +610,7 @@ async def _create_v2( store_path: StorePath, *, shape: ChunkCoords, - dtype: npt.DTypeLike, + dtype: np.dtype[Any], chunks: ChunkCoords, dimension_separator: Literal[".", "/"] | None = None, fill_value: float | None = None, diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index b800ae4d73..3e925e08bd 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -95,14 +95,14 @@ def validate_codecs(codecs: tuple[Codec, ...], dtype: DataType) -> None: # we need to have special codecs if we are decoding vlen strings or bytestrings # TODO: use codec ID instead of class name - codec_id = abc.__class__.__name__ - if dtype == DataType.string and not codec_id == "VLenUTF8Codec": + codec_class_name = abc.__class__.__name__ + if dtype == DataType.string and not codec_class_name == "VLenUTF8Codec": raise ValueError( - f"For string dtype, ArrayBytesCodec must be `VLenUTF8Codec`, got `{codec_id}`." + f"For string dtype, ArrayBytesCodec must be `VLenUTF8Codec`, got `{codec_class_name}`." ) - if dtype == DataType.bytes and not codec_id == "VLenBytesCodec": + if dtype == DataType.bytes and not codec_class_name == "VLenBytesCodec": raise ValueError( - f"For bytes dtype, ArrayBytesCodec must be `VLenBytesCodec`, got `{codec_id}`." + f"For bytes dtype, ArrayBytesCodec must be `VLenBytesCodec`, got `{codec_class_name}`." ) From c0f7ece3aba07c208431f1401f5635c072cb3033 Mon Sep 17 00:00:00 2001 From: David Stansby Date: Mon, 16 Dec 2024 16:03:02 +0000 Subject: [PATCH 31/87] Shorten contributing page title (#2565) --- docs/contributing.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/contributing.rst b/docs/contributing.rst index 8038330239..6b0567f38d 100644 --- a/docs/contributing.rst +++ b/docs/contributing.rst @@ -1,5 +1,5 @@ -Contributing to Zarr -==================== +Contributing +============ Zarr is a community maintained project. We welcome contributions in the form of bug reports, bug fixes, documentation, enhancement proposals and more. 
This page provides From a615ee90b0409e890998f12ad1a593173f1d729a Mon Sep 17 00:00:00 2001 From: David Stansby Date: Tue, 17 Dec 2024 11:38:56 +0000 Subject: [PATCH 32/87] Remove specs pages from docs (#2555) * Remove specs pages from docs * Add spec doc redirects * Fix redirects * Fix index page typo Co-authored-by: Josh Moore --------- Co-authored-by: Josh Moore --- docs/conf.py | 8 ++++++++ docs/contributing.rst | 3 ++- docs/index.rst | 4 ++-- docs/release.rst | 20 ++++++++++---------- docs/spec.rst | 11 ----------- docs/spec/v1.rst | 7 ------- docs/spec/v2.rst | 7 ------- docs/spec/v3.rst | 7 ------- pyproject.toml | 1 + test.py | 7 +++++++ 10 files changed, 30 insertions(+), 45 deletions(-) delete mode 100644 docs/spec.rst delete mode 100644 docs/spec/v1.rst delete mode 100644 docs/spec/v2.rst delete mode 100644 docs/spec/v3.rst create mode 100644 test.py diff --git a/docs/conf.py b/docs/conf.py index 72c6130a16..5f714421d3 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -47,6 +47,7 @@ "sphinx_issues", "sphinx_copybutton", "sphinx_design", + 'sphinx_reredirects', ] issues_github_path = "zarr-developers/zarr-python" @@ -81,6 +82,13 @@ version = get_version("zarr") release = get_version("zarr") +redirects = { + "spec": "https://zarr-specs.readthedocs.io", + "spec/v1": 'https://zarr-specs.readthedocs.io/en/latest/v1/v1.0.html', + "spec/v2": "https://zarr-specs.readthedocs.io/en/latest/v2/v2.0.html", + "spec/v3": "https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html", +} + # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. # diff --git a/docs/contributing.rst b/docs/contributing.rst index 6b0567f38d..0ead6c8267 100644 --- a/docs/contributing.rst +++ b/docs/contributing.rst @@ -307,7 +307,8 @@ Data format compatibility The data format used by Zarr is defined by a specification document, which should be platform-independent and contain sufficient detail to construct an interoperable software library to read and/or write Zarr data using any programming language. The -latest version of the specification document is available from the :ref:`spec` page. +latest version of the specification document is available on the +`Zarr specifications website `_. Here, **data format compatibility** means that all software libraries that implement a particular version of the Zarr storage specification are interoperable, in the sense diff --git a/docs/index.rst b/docs/index.rst index d0b41ed634..82ed2889f4 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -12,7 +12,6 @@ Zarr-Python tutorial guide/index api/index - spec release license contributing @@ -26,7 +25,8 @@ Zarr-Python `Installation `_ | `Source Repository `_ | `Issue Tracker `_ | -`Zulip Chat `_ +`Zulip Chat `_ | +`Zarr specifications `_ Zarr is a file storage format for chunked, compressed, N-dimensional arrays based on an open-source specification. diff --git a/docs/release.rst b/docs/release.rst index dd60502e85..be0919f08b 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -218,17 +218,17 @@ Typing Maintenance ~~~~~~~~~~~ -* Remedy a situation where ``zarr-python`` was importing ``DummyStorageTransformer`` from the test suite. +* Remedy a situation where ``zarr-python`` was importing ``DummyStorageTransformer`` from the test suite. The dependency relationship is now reversed: the test suite imports this class from ``zarr-python``. By :user:`Davis Bennett ` :issue:`1601`. -* [V3] Update minimum supported Python and Numpy versions. 
+* [V3] Update minimum supported Python and Numpy versions. By :user:`Joe Hamman ` :issue:`1638` * use src layout and use hatch for packaging. By :user:`Davis Bennett ` :issue:`1592`. -* temporarily disable mypy in v3 directory. +* temporarily disable mypy in v3 directory. By :user:`Joe Hamman ` :issue:`1649`. * create hatch test env. @@ -315,10 +315,10 @@ Maintenance Documentation ~~~~~~~~~~~~~ -* Specify docs hatch env for v3 branch. +* Specify docs hatch env for v3 branch. By :user:`Max Jones ` :issue:`1655`. -* Development installation/contributing docs updates. +* Development installation/contributing docs updates. By :user:`Alden Keefe Sampson ` :issue:`1643`. * chore: update project settings per scientific python repo-review. @@ -336,7 +336,7 @@ Enhancements ~~~~~~~~~~~~ * Added support for creating a copy of data when converting a `zarr.Array` to a numpy array. - By :user:`David Stansby ` (:issue:`2106`) and + By :user:`David Stansby ` (:issue:`2106`) and :user:`Joe Hamman ` (:issue:`2123`). Maintenance @@ -2191,7 +2191,7 @@ Other changes ~~~~~~~~~~~~~ To accommodate support for hierarchies and filters, the Zarr metadata format -has been modified. See the :ref:`spec_v2` for more information. To migrate an +has been modified. See the ``spec_v2`` for more information. To migrate an array stored using Zarr version 1.x, use the :func:`zarr.storage.migrate_1to2` function. @@ -2237,14 +2237,14 @@ abstraction layer between the core array logic and data storage (:issue:`21`). In this release, any object that implements the ``MutableMapping`` interface can be used as an array store. See the tutorial sections on :ref:`tutorial_persist` -and :ref:`tutorial_storage`, the :ref:`spec_v1`, and the +and :ref:`tutorial_storage`, the ``spec_v1``, and the :mod:`zarr.storage` module documentation for more information. Please note also that the file organization and file name conventions used when storing a Zarr array in a directory on the file system have changed. Persistent Zarr arrays created using previous versions of the software will not be compatible with this version. See the -:mod:`zarr.storage` API docs and the :ref:`spec_v1` for more +:mod:`zarr.storage` API docs and the ``spec_v1`` for more information. Compression @@ -2257,7 +2257,7 @@ as the default compressor, however other compressors including zlib, BZ2 and LZMA are also now supported via the Python standard library. New compressors can also be dynamically registered for use with Zarr. See the tutorial sections on :ref:`tutorial_compress` and -:ref:`tutorial_tips_blosc`, the :ref:`spec_v1`, and the +:ref:`tutorial_tips_blosc`, the ``spec_v1``, and the :mod:`zarr.compressors` module documentation for more information. Synchronization diff --git a/docs/spec.rst b/docs/spec.rst deleted file mode 100644 index 8aca0bbd80..0000000000 --- a/docs/spec.rst +++ /dev/null @@ -1,11 +0,0 @@ -.. _spec: - -Specifications -============== - -.. toctree:: - :maxdepth: 1 - - spec/v3 - spec/v2 - spec/v1 diff --git a/docs/spec/v1.rst b/docs/spec/v1.rst deleted file mode 100644 index 27a0490e0a..0000000000 --- a/docs/spec/v1.rst +++ /dev/null @@ -1,7 +0,0 @@ -.. _spec_v1: - -Zarr Storage Specification Version 1 -==================================== - -The V1 Specification has been migrated to its website → -https://zarr-specs.readthedocs.io/. diff --git a/docs/spec/v2.rst b/docs/spec/v2.rst deleted file mode 100644 index deb6d46ce6..0000000000 --- a/docs/spec/v2.rst +++ /dev/null @@ -1,7 +0,0 @@ -.. 
_spec_v2: - -Zarr Storage Specification Version 2 -==================================== - -The V2 Specification has been migrated to its website → -https://zarr-specs.readthedocs.io/. diff --git a/docs/spec/v3.rst b/docs/spec/v3.rst deleted file mode 100644 index 3d39f35ba6..0000000000 --- a/docs/spec/v3.rst +++ /dev/null @@ -1,7 +0,0 @@ -.. _spec_v3: - -Zarr Storage Specification Version 3 -==================================== - -The V3 Specification has been migrated to its website → -https://zarr-specs.readthedocs.io/. diff --git a/pyproject.toml b/pyproject.toml index 7b516bbc05..6c8110cbf9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,6 +81,7 @@ docs = [ 'sphinx_design', 'sphinx-issues', 'sphinx-copybutton', + 'sphinx-reredirects', 'pydata-sphinx-theme', 'numpydoc', 'numcodecs[msgpack]', diff --git a/test.py b/test.py new file mode 100644 index 0000000000..29dac92c8b --- /dev/null +++ b/test.py @@ -0,0 +1,7 @@ +import zarr + +store = zarr.DirectoryStore("data") +r = zarr.open_group(store=store) +z = r.full("myArray", 42, shape=(), dtype="i4", compressor=None) + +print(z.oindex[...]) From a7714c70298d82bf1a2c9555e208fde9ad7ac3c4 Mon Sep 17 00:00:00 2001 From: David Stansby Date: Tue, 17 Dec 2024 16:13:04 +0000 Subject: [PATCH 33/87] Trim trailing whitespace (#2563) --- .github/workflows/gpu_test.yml | 2 +- .github/workflows/releases.yml | 2 +- .github/workflows/test.yml | 4 ++-- .pre-commit-config.yaml | 1 + README-v3.md | 2 +- bench/compress_normal.txt | 40 +++++++++++++++++----------------- docs/guide/storage.rst | 10 ++++----- docs/roadmap.rst | 8 +++---- docs/tutorial.rst | 4 ++-- 9 files changed, 37 insertions(+), 36 deletions(-) diff --git a/.github/workflows/gpu_test.yml b/.github/workflows/gpu_test.yml index 0403a9c06e..b13da7d36f 100644 --- a/.github/workflows/gpu_test.yml +++ b/.github/workflows/gpu_test.yml @@ -55,7 +55,7 @@ jobs: cache: 'pip' - name: Install Hatch and CuPy run: | - python -m pip install --upgrade pip + python -m pip install --upgrade pip pip install hatch - name: Set Up Hatch Env run: | diff --git a/.github/workflows/releases.yml b/.github/workflows/releases.yml index 375d9651d5..1b23260c2e 100644 --- a/.github/workflows/releases.yml +++ b/.github/workflows/releases.yml @@ -23,7 +23,7 @@ jobs: - name: Install PyBuild run: | - python -m pip install --upgrade pip + python -m pip install --upgrade pip pip install hatch - name: Build wheel and sdist run: hatch build diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 1c25dcb1f4..1157fccc86 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -52,7 +52,7 @@ jobs: cache: 'pip' - name: Install Hatch run: | - python -m pip install --upgrade pip + python -m pip install --upgrade pip pip install hatch - name: Set Up Hatch Env run: | @@ -84,7 +84,7 @@ jobs: cache: 'pip' - name: Install Hatch run: | - python -m pip install --upgrade pip + python -m pip install --upgrade pip pip install hatch - name: Set Up Hatch Env run: | diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b300752b31..4a93e9ce87 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -20,6 +20,7 @@ repos: rev: v5.0.0 hooks: - id: check-yaml + - id: trailing-whitespace - repo: https://github.com/pre-commit/mirrors-mypy rev: v1.13.0 hooks: diff --git a/README-v3.md b/README-v3.md index 8348038e5a..598e646377 100644 --- a/README-v3.md +++ b/README-v3.md @@ -38,7 +38,7 @@ hatch env create test ## Run the Tests ``` -hatch run test:run +hatch run test:run ``` or diff 
--git a/bench/compress_normal.txt b/bench/compress_normal.txt index d527cf03d4..e5d6be6aeb 100644 --- a/bench/compress_normal.txt +++ b/bench/compress_normal.txt @@ -19,7 +19,7 @@ Line # Hits Time Per Hit % Time Line Contents ============================================================== 137 def compress(source, char* cname, int clevel, int shuffle): 138 """Compress data in a numpy array. - 139 + 139 140 Parameters 141 ---------- 142 source : array-like @@ -30,14 +30,14 @@ Line # Hits Time Per Hit % Time Line Contents 147 Compression level. 148 shuffle : int 149 Shuffle filter. - 150 + 150 151 Returns 152 ------- 153 dest : bytes-like 154 Compressed data. - 155 + 155 156 """ - 157 + 157 158 cdef: 159 char *source_ptr 160 char *dest_ptr @@ -45,18 +45,18 @@ Line # Hits Time Per Hit % Time Line Contents 162 size_t nbytes, cbytes, itemsize 163 200 506 2.5 0.2 array.array char_array_template = array.array('b', []) 164 array.array dest - 165 + 165 166 # setup source buffer 167 200 458 2.3 0.2 PyObject_GetBuffer(source, &source_buffer, PyBUF_ANY_CONTIGUOUS) 168 200 119 0.6 0.0 source_ptr = source_buffer.buf - 169 + 169 170 # setup destination 171 200 239 1.2 0.1 nbytes = source_buffer.len 172 200 103 0.5 0.0 itemsize = source_buffer.itemsize 173 200 2286 11.4 0.8 dest = array.clone(char_array_template, nbytes + BLOSC_MAX_OVERHEAD, 174 zero=False) 175 200 129 0.6 0.0 dest_ptr = dest.data.as_voidptr - 176 + 176 177 # perform compression 178 200 1734 8.7 0.6 if _get_use_threads(): 179 # allow blosc to use threads internally @@ -67,24 +67,24 @@ Line # Hits Time Per Hit % Time Line Contents 184 cbytes = blosc_compress(clevel, shuffle, itemsize, nbytes, 185 source_ptr, dest_ptr, 186 nbytes + BLOSC_MAX_OVERHEAD) - 187 + 187 188 else: 189 with nogil: 190 cbytes = blosc_compress_ctx(clevel, shuffle, itemsize, nbytes, 191 source_ptr, dest_ptr, 192 nbytes + BLOSC_MAX_OVERHEAD, cname, 193 0, 1) - 194 + 194 195 # release source buffer 196 200 616 3.1 0.2 PyBuffer_Release(&source_buffer) - 197 + 197 198 # check compression was successful 199 200 120 0.6 0.0 if cbytes <= 0: 200 raise RuntimeError('error during blosc compression: %d' % cbytes) - 201 + 201 202 # resize after compression 203 200 1896 9.5 0.6 array.resize(dest, cbytes) - 204 + 204 205 200 186 0.9 0.1 return dest ******************************************************************************* @@ -100,19 +100,19 @@ Line # Hits Time Per Hit % Time Line Contents ============================================================== 75 def decompress(source, dest): 76 """Decompress data. - 77 + 77 78 Parameters 79 ---------- 80 source : bytes-like 81 Compressed data, including blosc header. 82 dest : array-like 83 Object to decompress into. - 84 + 84 85 Notes 86 ----- 87 Assumes that the size of the destination buffer is correct for the size of 88 the uncompressed data. 
- 89 + 89 90 """ 91 cdef: 92 int ret @@ -122,7 +122,7 @@ Line # Hits Time Per Hit % Time Line Contents 96 array.array source_array 97 Py_buffer dest_buffer 98 size_t nbytes - 99 + 99 100 # setup source buffer 101 200 573 2.9 0.2 if PY2 and isinstance(source, array.array): 102 # workaround fact that array.array does not support new-style buffer @@ -134,13 +134,13 @@ Line # Hits Time Per Hit % Time Line Contents 108 200 112 0.6 0.0 release_source_buffer = True 109 200 144 0.7 0.1 PyObject_GetBuffer(source, &source_buffer, PyBUF_ANY_CONTIGUOUS) 110 200 98 0.5 0.0 source_ptr = source_buffer.buf - 111 + 111 112 # setup destination buffer 113 200 552 2.8 0.2 PyObject_GetBuffer(dest, &dest_buffer, 114 PyBUF_ANY_CONTIGUOUS | PyBUF_WRITEABLE) 115 200 100 0.5 0.0 dest_ptr = dest_buffer.buf 116 200 84 0.4 0.0 nbytes = dest_buffer.len - 117 + 117 118 # perform decompression 119 200 1856 9.3 0.8 if _get_use_threads(): 120 # allow blosc to use threads internally @@ -149,12 +149,12 @@ Line # Hits Time Per Hit % Time Line Contents 123 else: 124 with nogil: 125 ret = blosc_decompress_ctx(source_ptr, dest_ptr, nbytes, 1) - 126 + 126 127 # release buffers 128 200 754 3.8 0.3 if release_source_buffer: 129 200 326 1.6 0.1 PyBuffer_Release(&source_buffer) 130 200 165 0.8 0.1 PyBuffer_Release(&dest_buffer) - 131 + 131 132 # handle errors 133 200 128 0.6 0.1 if ret <= 0: 134 raise RuntimeError('error during blosc decompression: %d' % ret) diff --git a/docs/guide/storage.rst b/docs/guide/storage.rst index 69de796b3d..730b0bfcc8 100644 --- a/docs/guide/storage.rst +++ b/docs/guide/storage.rst @@ -4,7 +4,7 @@ Storage Zarr-Python supports multiple storage backends, including: local file systems, Zip files, remote stores via ``fsspec`` (S3, HTTP, etc.), and in-memory stores. In Zarr-Python 3, stores must implement the abstract store API from -:class:`zarr.abc.store.Store`. +:class:`zarr.abc.store.Store`. .. note:: Unlike Zarr-Python 2 where the store interface was built around a generic ``MutableMapping`` @@ -50,8 +50,8 @@ filesystem. Zip Store ~~~~~~~~~ -The :class:`zarr.storage.ZipStore` stores the contents of a Zarr hierarchy in a single -Zip file. The `Zip Store specification_` is currently in draft form. +The :class:`zarr.storage.ZipStore` stores the contents of a Zarr hierarchy in a single +Zip file. The `Zip Store specification_` is currently in draft form. .. code-block:: python @@ -65,7 +65,7 @@ Remote Store The :class:`zarr.storage.RemoteStore` stores the contents of a Zarr hierarchy in following the same logical layout as the ``LocalStore``, except the store is assumed to be on a remote storage system -such as cloud object storage (e.g. AWS S3, Google Cloud Storage, Azure Blob Store). The +such as cloud object storage (e.g. AWS S3, Google Cloud Storage, Azure Blob Store). The :class:`zarr.storage.RemoteStore` is backed by `Fsspec_` and can support any Fsspec backend that implements the `AbstractFileSystem` API, @@ -80,7 +80,7 @@ Memory Store ~~~~~~~~~~~~ The :class:`zarr.storage.RemoteStore` a in-memory store that allows for serialization of -Zarr data (metadata and chunks) to a dictionary. +Zarr data (metadata and chunks) to a dictionary. .. code-block:: python diff --git a/docs/roadmap.rst b/docs/roadmap.rst index 93f2a26896..d9fc32b775 100644 --- a/docs/roadmap.rst +++ b/docs/roadmap.rst @@ -16,7 +16,7 @@ Roadmap - Martin Durrant / @martindurant .. note:: - + This document was written in the early stages of the 3.0 refactor. Some aspects of the design have changed since this was originally written. 
Questions and discussion about the contents of this document should be directed to @@ -227,7 +227,7 @@ expose the required methods as async methods. async def get_partial_values(self, key_ranges: List[Tuple[str, int, int]) -> bytes: ... - + async def set(self, key: str, value: Union[bytes, bytearray, memoryview]) -> None: ... # required for writable stores @@ -246,10 +246,10 @@ expose the required methods as async methods. # additional (optional methods) async def getsize(self, prefix: str) -> int: ... - + async def rename(self, src: str, dest: str) -> None ... - + Recognizing that there are many Zarr applications today that rely on the ``MutableMapping`` interface supported by Zarr-Python 2, a wrapper store diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 5d977c48a5..71254900d5 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -1015,12 +1015,12 @@ class from ``fsspec``. The following example demonstrates how to access a ZIP-archived Zarr group on s3 using `s3fs `_ and ``ZipFileSystem``: >>> s3_path = "s3://path/to/my.zarr.zip" - >>> + >>> >>> s3 = s3fs.S3FileSystem() >>> f = s3.open(s3_path) >>> fs = ZipFileSystem(f, mode="r") >>> store = FSMap("", fs, check=False) - >>> + >>> >>> # caching may improve performance when repeatedly reading the same data >>> cache = zarr.storage.LRUStoreCache(store, max_size=2**28) >>> z = zarr.group(store=cache) From f360fc6c26d1b006b616a5282cb9120ec96e5531 Mon Sep 17 00:00:00 2001 From: Hannes Spitz <44113112+brokkoli71@users.noreply.github.com> Date: Wed, 18 Dec 2024 16:00:13 +0100 Subject: [PATCH 34/87] Remove config warning if only one implementation exists (#2571) * add test_warning_on_missing_codec_config * improve config tests * remove warning if only one implementation exists --- src/zarr/registry.py | 2 + tests/test_config.py | 133 ++++++++++++++++++++++++++----------------- 2 files changed, 84 insertions(+), 51 deletions(-) diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 12b0738016..9055bb1447 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -138,6 +138,8 @@ def get_codec_class(key: str, reload_config: bool = False) -> type[Codec]: config_entry = config.get("codecs", {}).get(key) if config_entry is None: + if len(codec_classes) == 1: + return next(iter(codec_classes.values())) warnings.warn( f"Codec '{key}' not configured in config. 
Selecting any implementation.", stacklevel=2 ) diff --git a/tests/test_config.py b/tests/test_config.py index 2e919a0add..e3f5ec25e3 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -143,6 +143,7 @@ class MockEnvCodecPipeline(CodecPipeline): assert get_pipeline_class(reload_config=True) == MockEnvCodecPipeline +@pytest.mark.filterwarnings("error") @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) def test_config_codec_implementation(store: Store) -> None: # has default value @@ -156,24 +157,29 @@ async def _encode_single( ) -> CodecOutput | None: _mock.call() - config.set({"codecs.blosc": fully_qualified_name(MockBloscCodec)}) register_codec("blosc", MockBloscCodec) - assert get_codec_class("blosc") == MockBloscCodec - - # test if codec is used - arr = Array.create( - store=store, - shape=(100,), - chunks=(10,), - zarr_format=3, - dtype="i4", - codecs=[BytesCodec(), {"name": "blosc", "configuration": {}}], - ) - arr[:] = range(100) - _mock.call.assert_called() + with config.set({"codecs.blosc": fully_qualified_name(MockBloscCodec)}): + assert get_codec_class("blosc") == MockBloscCodec + + # test if codec is used + arr = Array.create( + store=store, + shape=(100,), + chunks=(10,), + zarr_format=3, + dtype="i4", + codecs=[BytesCodec(), {"name": "blosc", "configuration": {}}], + ) + arr[:] = range(100) + _mock.call.assert_called() + + # test set codec with environment variable + class NewBloscCodec(BloscCodec): + pass - with mock.patch.dict(os.environ, {"ZARR_CODECS__BLOSC": fully_qualified_name(BloscCodec)}): - assert get_codec_class("blosc", reload_config=True) == BloscCodec + register_codec("blosc", NewBloscCodec) + with mock.patch.dict(os.environ, {"ZARR_CODECS__BLOSC": fully_qualified_name(NewBloscCodec)}): + assert get_codec_class("blosc", reload_config=True) == NewBloscCodec @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) @@ -183,18 +189,17 @@ def test_config_ndbuffer_implementation(store: Store) -> None: # set custom ndbuffer with TestNDArrayLike implementation register_ndbuffer(NDBufferUsingTestNDArrayLike) - config.set({"ndbuffer": fully_qualified_name(NDBufferUsingTestNDArrayLike)}) - assert get_ndbuffer_class() == NDBufferUsingTestNDArrayLike - arr = Array.create( - store=store, - shape=(100,), - chunks=(10,), - zarr_format=3, - dtype="i4", - ) - got = arr[:] - print(type(got)) - assert isinstance(got, TestNDArrayLike) + with config.set({"ndbuffer": fully_qualified_name(NDBufferUsingTestNDArrayLike)}): + assert get_ndbuffer_class() == NDBufferUsingTestNDArrayLike + arr = Array.create( + store=store, + shape=(100,), + chunks=(10,), + zarr_format=3, + dtype="i4", + ) + got = arr[:] + assert isinstance(got, TestNDArrayLike) def test_config_buffer_implementation() -> None: @@ -208,27 +213,53 @@ def test_config_buffer_implementation() -> None: arr[:] = np.arange(100) register_buffer(TestBuffer) - config.set({"buffer": fully_qualified_name(TestBuffer)}) - assert get_buffer_class() == TestBuffer - - # no error using TestBuffer - data = np.arange(100) - arr[:] = np.arange(100) - assert np.array_equal(arr[:], data) - - data2d = np.arange(1000).reshape(100, 10) - arr_sharding = zeros( - shape=(100, 10), - store=StoreExpectingTestBuffer(), - codecs=[ShardingCodec(chunk_shape=(10, 10))], - ) - arr_sharding[:] = data2d - assert np.array_equal(arr_sharding[:], data2d) + with config.set({"buffer": fully_qualified_name(TestBuffer)}): + assert get_buffer_class() == TestBuffer - arr_Crc32c = zeros( - shape=(100, 10), - 
store=StoreExpectingTestBuffer(), - codecs=[BytesCodec(), Crc32cCodec()], - ) - arr_Crc32c[:] = data2d - assert np.array_equal(arr_Crc32c[:], data2d) + # no error using TestBuffer + data = np.arange(100) + arr[:] = np.arange(100) + assert np.array_equal(arr[:], data) + + data2d = np.arange(1000).reshape(100, 10) + arr_sharding = zeros( + shape=(100, 10), + store=StoreExpectingTestBuffer(), + codecs=[ShardingCodec(chunk_shape=(10, 10))], + ) + arr_sharding[:] = data2d + assert np.array_equal(arr_sharding[:], data2d) + + arr_Crc32c = zeros( + shape=(100, 10), + store=StoreExpectingTestBuffer(), + codecs=[BytesCodec(), Crc32cCodec()], + ) + arr_Crc32c[:] = data2d + assert np.array_equal(arr_Crc32c[:], data2d) + + +@pytest.mark.filterwarnings("error") +def test_warning_on_missing_codec_config() -> None: + class NewCodec(BytesCodec): + pass + + class NewCodec2(BytesCodec): + pass + + # error if codec is not registered + with pytest.raises(KeyError): + get_codec_class("missing_codec") + + # no warning if only one implementation is available + register_codec("new_codec", NewCodec) + get_codec_class("new_codec") + + # warning because multiple implementations are available but none is selected in the config + register_codec("new_codec", NewCodec2) + with pytest.warns(UserWarning): + get_codec_class("new_codec") + + # no warning if multiple implementations are available and one is selected in the config + with config.set({"codecs.new_codec": fully_qualified_name(NewCodec)}): + get_codec_class("new_codec") From 4455726a0dbdeebb00d146b3b7a4bfa4e63374b5 Mon Sep 17 00:00:00 2001 From: David Stansby Date: Wed, 18 Dec 2024 19:10:50 +0000 Subject: [PATCH 35/87] Remove license page from docs (#2570) * Remove license page from docs * Add license redirect --- docs/conf.py | 1 + docs/index.rst | 1 - docs/license.rst | 4 ---- 3 files changed, 1 insertion(+), 5 deletions(-) delete mode 100644 docs/license.rst diff --git a/docs/conf.py b/docs/conf.py index 5f714421d3..8b22e33c6d 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -87,6 +87,7 @@ "spec/v1": 'https://zarr-specs.readthedocs.io/en/latest/v1/v1.0.html', "spec/v2": "https://zarr-specs.readthedocs.io/en/latest/v2/v2.0.html", "spec/v3": "https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html", + "license": "https://github.com/zarr-developers/zarr-python/blob/main/LICENSE.txt" } # The language for content autogenerated by Sphinx. Refer to documentation diff --git a/docs/index.rst b/docs/index.rst index 82ed2889f4..4d6188d3a0 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -13,7 +13,6 @@ Zarr-Python guide/index api/index release - license contributing roadmap diff --git a/docs/license.rst b/docs/license.rst deleted file mode 100644 index 8f93aa7d66..0000000000 --- a/docs/license.rst +++ /dev/null @@ -1,4 +0,0 @@ -License -======= - -.. 
include:: ../LICENSE.txt From f035d453894ec83234f69f7627b540390ee2b6eb Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Wed, 18 Dec 2024 22:57:07 +0100 Subject: [PATCH 36/87] docs/add docstrings to synchronous API (#2549) * add docstrings and complete function signatures to synchronous api, and tests for the above * clean up types and docstrings * Update src/zarr/api/synchronous.py Co-authored-by: Joe Hamman * Update src/zarr/api/synchronous.py Co-authored-by: Joe Hamman * Update src/zarr/api/synchronous.py Co-authored-by: Joe Hamman * Update src/zarr/api/synchronous.py Co-authored-by: Joe Hamman * Update src/zarr/api/synchronous.py Co-authored-by: Joe Hamman * Update src/zarr/api/synchronous.py Co-authored-by: Joe Hamman * Update src/zarr/api/synchronous.py Co-authored-by: Joe Hamman * remove doomed docstring tests * allow bool in create --------- Co-authored-by: Joe Hamman --- src/zarr/api/asynchronous.py | 8 +- src/zarr/api/synchronous.py | 685 ++++++++++++++++++++++++++++++++--- src/zarr/core/array.py | 8 +- tests/test_api.py | 4 +- tests/test_array.py | 18 +- 5 files changed, 652 insertions(+), 71 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 2d1c26e145..e859df44a6 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -774,9 +774,9 @@ async def open_group( async def create( - shape: ChunkCoords, + shape: ChunkCoords | int, *, # Note: this is a change from v2 - chunks: ChunkCoords | None = None, # TODO: v2 allowed chunks=True + chunks: ChunkCoords | int | None = None, # TODO: v2 allowed chunks=True dtype: npt.DTypeLike | None = None, compressor: dict[str, JSON] | None = None, # TODO: default and type change fill_value: Any | None = 0, # TODO: need type @@ -798,7 +798,7 @@ async def create( meta_array: Any | None = None, # TODO: need type attributes: dict[str, JSON] | None = None, # v3 only - chunk_shape: ChunkCoords | None = None, + chunk_shape: ChunkCoords | int | None = None, chunk_key_encoding: ( ChunkKeyEncoding | tuple[Literal["default"], Literal[".", "/"]] @@ -1104,6 +1104,8 @@ async def open_array( ---------- store : Store or str Store or path to directory in file system or name of zip file. + zarr_version : {2, 3, None}, optional + The zarr format to use when saving. Deprecated in favor of zarr_format. zarr_format : {2, 3, None}, optional The zarr format to use when saving. path : str, optional diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 8e8ecf40b8..6ae062865c 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -11,8 +11,15 @@ from zarr.core.sync import sync if TYPE_CHECKING: + from collections.abc import Iterable + + import numpy.typing as npt + + from zarr.abc.codec import Codec + from zarr.api.asynchronous import ArrayLike, PathLike from zarr.core.buffer import NDArrayLike - from zarr.core.common import JSON, AccessModeLiteral, ChunkCoords, ZarrFormat + from zarr.core.chunk_key_encodings import ChunkKeyEncoding + from zarr.core.common import JSON, AccessModeLiteral, ChunkCoords, MemoryOrder, ZarrFormat from zarr.storage import StoreLike __all__ = [ @@ -44,8 +51,38 @@ ] -def consolidate_metadata(*args: Any, **kwargs: Any) -> Group: - return Group(sync(async_api.consolidate_metadata(*args, **kwargs))) +def consolidate_metadata( + store: StoreLike, + path: str | None = None, + zarr_format: ZarrFormat | None = None, +) -> Group: + """ + Consolidate the metadata of all nodes in a hierarchy. 
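As a usage sketch of the wrapper above (the in-memory store and group names here are illustrative, not part of the patch):

```python
import zarr
from zarr.storage import MemoryStore

store = MemoryStore()
root = zarr.group(store=store)
root.create_group("a")
root.create_group("b")

# Rewrites the root metadata so it embeds the metadata of every child
# node; readers can then open the hierarchy with a single metadata fetch.
consolidated = zarr.consolidate_metadata(store)
assert consolidated.metadata.consolidated_metadata is not None
```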
+ + Upon completion, the metadata of the root node in the Zarr hierarchy will be + updated to include all the metadata of child nodes. + + Parameters + ---------- + store : StoreLike + The store-like object whose metadata you wish to consolidate. + path : str, optional + A path to a group in the store to consolidate at. Only children + below that group will be consolidated. + + By default, the root node is used so all the metadata in the + store is consolidated. + zarr_format : {2, 3, None}, optional + The zarr format of the hierarchy. By default the zarr format + is inferred. + + Returns + ------- + group: Group + The group, with the ``consolidated_metadata`` field set to include + the metadata of each child node. + """ + return Group(sync(async_api.consolidate_metadata(store, path=path, zarr_format=zarr_format))) def copy(*args: Any, **kwargs: Any) -> tuple[int, int, int]: @@ -61,9 +98,39 @@ def copy_store(*args: Any, **kwargs: Any) -> tuple[int, int, int]: def load( - store: StoreLike, zarr_version: ZarrFormat | None = None, path: str | None = None + store: StoreLike, + path: str | None = None, + zarr_format: ZarrFormat | None = None, + zarr_version: ZarrFormat | None = None, ) -> NDArrayLike | dict[str, NDArrayLike]: - return sync(async_api.load(store=store, zarr_version=zarr_version, path=path)) + """Load data from an array or group into memory. + + Parameters + ---------- + store : Store or str + Store or path to directory in file system or name of zip file. + path : str or None, optional + The path within the store from which to load. + + Returns + ------- + out + If the path contains an array, out will be a numpy array. If the path contains + a group, out will be a dict-like object where keys are array names and values + are numpy arrays. + + See Also + -------- + save, savez + + Notes + ----- + If loading data from a group of arrays, data will not be immediately loaded into + memory. Rather, arrays will be loaded into memory as they are requested. + """ + return sync( + async_api.load(store=store, zarr_version=zarr_version, zarr_format=zarr_format, path=path) + ) @_deprecate_positional_args @@ -74,8 +141,36 @@ def open( zarr_version: ZarrFormat | None = None, # deprecated zarr_format: ZarrFormat | None = None, path: str | None = None, + storage_options: dict[str, Any] | None = None, **kwargs: Any, # TODO: type kwargs as valid args to async_api.open ) -> Array | Group: + """Convenience function to open a group or array using file-mode-like semantics. + + Parameters + ---------- + store : Store or str, optional + Store or path to directory in file system or name of zip file. + mode : {'r', 'r+', 'a', 'w', 'w-'}, optional + Persistence mode: 'r' means read only (must exist); 'r+' means + read/write (must exist); 'a' means read/write (create if doesn't + exist); 'w' means create (overwrite if exists); 'w-' means create + (fail if exists). + zarr_format : {2, 3, None}, optional + The zarr format to use when saving. + path : str or None, optional + The path within the store to open. + storage_options : dict + If using an fsspec URL to create the store, these will be passed to + the backend implementation. Ignored otherwise. + **kwargs + Additional parameters are passed through to :func:`zarr.api.asynchronous.open_array` or + :func:`zarr.api.asynchronous.open_group`. + + Returns + ------- + z : array or group + Return type depends on what exists in the given store. 
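A quick sketch of the dispatch this docstring describes, assuming a plain dict used as an in-memory store (the same convention this patch documents for ``open_group``):

```python
import numpy as np
import zarr

store: dict = {}  # treated as an in-memory store
zarr.save_array(store, np.arange(10), path="x")

node = zarr.open(store=store, path="x", mode="r")
assert isinstance(node, zarr.Array)       # an array lives at "x"
assert (node[:] == np.arange(10)).all()
```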
+ """ obj = sync( async_api.open( store=store, @@ -83,6 +178,7 @@ def open( zarr_version=zarr_version, zarr_format=zarr_format, path=path, + storage_options=storage_options, **kwargs, ) ) @@ -93,6 +189,9 @@ def open( def open_consolidated(*args: Any, use_consolidated: Literal[True] = True, **kwargs: Any) -> Group: + """ + Alias for :func:`open_group` with ``use_consolidated=True``. + """ return Group( sync(async_api.open_consolidated(*args, use_consolidated=use_consolidated, **kwargs)) ) @@ -106,6 +205,21 @@ def save( path: str | None = None, **kwargs: Any, # TODO: type kwargs as valid args to async_api.save ) -> None: + """Convenience function to save an array or group of arrays to the local file system. + + Parameters + ---------- + store : Store or str + Store or path to directory in file system or name of zip file. + *args : ndarray + NumPy arrays with data to save. + zarr_format : {2, 3, None}, optional + The zarr format to use when saving. + path : str or None, optional + The path within the group where the arrays will be saved. + **kwargs + NumPy arrays with data to save. + """ return sync( async_api.save( store, *args, zarr_version=zarr_version, zarr_format=zarr_format, path=path, **kwargs @@ -121,8 +235,28 @@ def save_array( zarr_version: ZarrFormat | None = None, # deprecated zarr_format: ZarrFormat | None = None, path: str | None = None, + storage_options: dict[str, Any] | None = None, **kwargs: Any, # TODO: type kwargs as valid args to async_api.save_array ) -> None: + """Convenience function to save a NumPy array to the local file system, following a + similar API to the NumPy save() function. + + Parameters + ---------- + store : Store or str + Store or path to directory in file system or name of zip file. + arr : ndarray + NumPy array with data to save. + zarr_format : {2, 3, None}, optional + The zarr format to use when saving. + path : str or None, optional + The path within the store where the array will be saved. + storage_options : dict + If using an fsspec URL to create the store, these will be passed to + the backend implementation. Ignored otherwise. + **kwargs + Passed through to :func:`create`, e.g., compressor. + """ return sync( async_api.save_array( store=store, @@ -130,6 +264,7 @@ def save_array( zarr_version=zarr_version, zarr_format=zarr_format, path=path, + storage_options=storage_options, **kwargs, ) ) @@ -144,6 +279,26 @@ def save_group( storage_options: dict[str, Any] | None = None, **kwargs: NDArrayLike, ) -> None: + """Convenience function to save several NumPy arrays to the local file system, following a + similar API to the NumPy savez()/savez_compressed() functions. + + Parameters + ---------- + store : Store or str + Store or path to directory in file system or name of zip file. + *args : ndarray + NumPy arrays with data to save. + zarr_format : {2, 3, None}, optional + The zarr format to use when saving. + path : str or None, optional + Path within the store where the group will be saved. + storage_options : dict + If using an fsspec URL to create the store, these will be passed to + the backend implementation. Ignored otherwise. + **kwargs + NumPy arrays with data to save. + """ + return sync( async_api.save_group( store, @@ -159,28 +314,98 @@ def save_group( @deprecated("Use Group.tree instead.") def tree(grp: Group, expand: bool | None = None, level: int | None = None) -> Any: + """Provide a rich display of the hierarchy. + + Parameters + ---------- + grp : Group + Zarr or h5py group. 
+ expand : bool, optional + Only relevant for HTML representation. If True, tree will be fully expanded. + level : int, optional + Maximum depth to descend into hierarchy. + + Returns + ------- + TreeRepr + A pretty-printable object displaying the hierarchy. + + .. deprecated:: 3.0.0 + `zarr.tree()` is deprecated and will be removed in a future release. + Use `group.tree()` instead. + """ return sync(async_api.tree(grp._async_group, expand=expand, level=level)) # TODO: add type annotations for kwargs -def array(data: NDArrayLike, **kwargs: Any) -> Array: +def array(data: npt.ArrayLike, **kwargs: Any) -> Array: + """Create an array filled with `data`. + + Parameters + ---------- + data : array_like + The data to fill the array with. + **kwargs + Passed through to :func:`create`. + + Returns + ------- + array : Array + The new array. + """ + return Array(sync(async_api.array(data=data, **kwargs))) @_deprecate_positional_args def group( store: StoreLike | None = None, - *, # Note: this is a change from v2 + *, overwrite: bool = False, - chunk_store: StoreLike | None = None, # not used in async_api - cache_attrs: bool | None = None, # default changed, not used in async_api - synchronizer: Any | None = None, # not used in async_api + chunk_store: StoreLike | None = None, # not used + cache_attrs: bool | None = None, # not used, default changed + synchronizer: Any | None = None, # not used path: str | None = None, zarr_version: ZarrFormat | None = None, # deprecated zarr_format: ZarrFormat | None = None, - meta_array: Any | None = None, # not used in async_api + meta_array: Any | None = None, # not used attributes: dict[str, JSON] | None = None, + storage_options: dict[str, Any] | None = None, ) -> Group: + """Create a group. + + Parameters + ---------- + store : Store or str, optional + Store or path to directory in file system. + overwrite : bool, optional + If True, delete any pre-existing data in `store` at `path` before + creating the group. + chunk_store : Store, optional + Separate storage for chunks. If not provided, `store` will be used + for storage of both chunks and metadata. + cache_attrs : bool, optional + If True (default), user attributes will be cached for attribute read + operations. If False, user attributes are reloaded from the store prior + to all attribute read operations. + synchronizer : object, optional + Array synchronizer. + path : str, optional + Group path within store. + meta_array : array-like, optional + An array instance to use for determining arrays to create and return + to users. Use `numpy.empty(())` by default. + zarr_format : {2, 3, None}, optional + The zarr format to use when saving. + storage_options : dict + If using an fsspec URL to create the store, these will be passed to + the backend implementation. Ignored otherwise. + + Returns + ------- + g : Group + The new group. + """ return Group( sync( async_api.group( @@ -194,6 +419,7 @@ def group( zarr_format=zarr_format, meta_array=meta_array, attributes=attributes, + storage_options=storage_options, ) ) ) @@ -215,6 +441,67 @@ def open_group( attributes: dict[str, JSON] | None = None, use_consolidated: bool | str | None = None, ) -> Group: + """Open a group using file-mode-like semantics. + + Parameters + ---------- + store : Store, str, or mapping, optional + Store or path to directory in file system or name of zip file. + + Strings are interpreted as paths on the local file system + and used as the ``root`` argument to :class:`zarr.storage.LocalStore`. 
+ + Dictionaries are used as the ``store_dict`` argument in + :class:`zarr.storage.MemoryStore``. + + By default (``store=None``) a new :class:`zarr.storage.MemoryStore` + is created. + + mode : {'r', 'r+', 'a', 'w', 'w-'}, optional + Persistence mode: 'r' means read only (must exist); 'r+' means + read/write (must exist); 'a' means read/write (create if doesn't + exist); 'w' means create (overwrite if exists); 'w-' means create + (fail if exists). + cache_attrs : bool, optional + If True (default), user attributes will be cached for attribute read + operations. If False, user attributes are reloaded from the store prior + to all attribute read operations. + synchronizer : object, optional + Array synchronizer. + path : str, optional + Group path within store. + chunk_store : Store or str, optional + Store or path to directory in file system or name of zip file. + storage_options : dict + If using an fsspec URL to create the store, these will be passed to + the backend implementation. Ignored otherwise. + meta_array : array-like, optional + An array instance to use for determining arrays to create and return + to users. Use `numpy.empty(())` by default. + attributes : dict + A dictionary of JSON-serializable values with user-defined attributes. + use_consolidated : bool or str, default None + Whether to use consolidated metadata. + + By default, consolidated metadata is used if it's present in the + store (in the ``zarr.json`` for Zarr v3 and in the ``.zmetadata`` file + for Zarr v2). + + To explicitly require consolidated metadata, set ``use_consolidated=True``, + which will raise an exception if consolidated metadata is not found. + + To explicitly *not* use consolidated metadata, set ``use_consolidated=False``, + which will fall back to using the regular, non consolidated metadata. + + Zarr v2 allowed configuring the key storing the consolidated metadata + (``.zmetadata`` by default). Specify the custom key as ``use_consolidated`` + to load consolidated metadata from a non-default key. + + Returns + ------- + g : Group + The new group. 
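The two explicit ``use_consolidated`` modes described above, as a sketch (the store contents are invented for the example):

```python
import zarr

store: dict = {}
zarr.group(store=store).create_group("child")
zarr.consolidate_metadata(store)

# Fail loudly if consolidated metadata is absent:
g = zarr.open_group(store=store, mode="r", use_consolidated=True)

# Or bypass it and read each node's metadata directly:
g = zarr.open_group(store=store, mode="r", use_consolidated=False)
```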
+ """ return Group( sync( async_api.open_group( @@ -236,84 +523,382 @@ def open_group( # TODO: add type annotations for kwargs -def create(*args: Any, **kwargs: Any) -> Array: - return Array(sync(async_api.create(*args, **kwargs))) +def create( + shape: ChunkCoords | int, + *, # Note: this is a change from v2 + chunks: ChunkCoords | int | bool | None = None, + dtype: npt.DTypeLike | None = None, + compressor: dict[str, JSON] | None = None, # TODO: default and type change + fill_value: Any | None = 0, # TODO: need type + order: MemoryOrder | None = None, + store: str | StoreLike | None = None, + synchronizer: Any | None = None, + overwrite: bool = False, + path: PathLike | None = None, + chunk_store: StoreLike | None = None, + filters: list[dict[str, JSON]] | None = None, # TODO: type has changed + cache_metadata: bool | None = None, + cache_attrs: bool | None = None, + read_only: bool | None = None, + object_codec: Codec | None = None, # TODO: type has changed + dimension_separator: Literal[".", "/"] | None = None, + write_empty_chunks: bool = False, # TODO: default has changed + zarr_version: ZarrFormat | None = None, # deprecated + zarr_format: ZarrFormat | None = None, + meta_array: Any | None = None, # TODO: need type + attributes: dict[str, JSON] | None = None, + # v3 only + chunk_shape: ChunkCoords | int | None = None, + chunk_key_encoding: ( + ChunkKeyEncoding + | tuple[Literal["default"], Literal[".", "/"]] + | tuple[Literal["v2"], Literal[".", "/"]] + | None + ) = None, + codecs: Iterable[Codec | dict[str, JSON]] | None = None, + dimension_names: Iterable[str] | None = None, + storage_options: dict[str, Any] | None = None, + **kwargs: Any, +) -> Array: + """Create an array. + + Parameters + ---------- + shape : int or tuple of ints + Array shape. + chunks : int or tuple of ints, optional + Chunk shape. If True, will be guessed from `shape` and `dtype`. If + False, will be set to `shape`, i.e., single chunk for the whole array. + If an int, the chunk size in each dimension will be given by the value + of `chunks`. Default is True. + dtype : str or dtype, optional + NumPy dtype. + compressor : Codec, optional + Primary compressor. + fill_value : object + Default value to use for uninitialized portions of the array. + order : {'C', 'F'}, optional + Memory layout to be used within each chunk. + Default is set in Zarr's config (`array.order`). + store : Store or str + Store or path to directory in file system or name of zip file. + synchronizer : object, optional + Array synchronizer. + overwrite : bool, optional + If True, delete all pre-existing data in `store` at `path` before + creating the array. + path : str, optional + Path under which array is stored. + chunk_store : MutableMapping, optional + Separate storage for chunks. If not provided, `store` will be used + for storage of both chunks and metadata. + filters : sequence of Codecs, optional + Sequence of filters to use to encode chunk data prior to compression. + cache_metadata : bool, optional + If True, array configuration metadata will be cached for the + lifetime of the object. If False, array metadata will be reloaded + prior to all data access and modification operations (may incur + overhead depending on storage and data access pattern). + cache_attrs : bool, optional + If True (default), user attributes will be cached for attribute read + operations. If False, user attributes are reloaded from the store prior + to all attribute read operations. 
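Stepping out of the parameter list for a moment, a hedged sketch of the call shape this keyword-only signature supports (in-memory dict store; the values are arbitrary):

```python
import zarr

z = zarr.create(
    shape=(1000, 1000),
    chunks=(100, 100),  # forwarded as chunk_shape when zarr_format=3
    dtype="i4",
    fill_value=0,
    store={},           # in-memory
    overwrite=True,
    zarr_format=3,
)
z[:100, :100] = 1
```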
+ read_only : bool, optional + True if array should be protected against modification. + object_codec : Codec, optional + A codec to encode object arrays, only needed if dtype=object. + dimension_separator : {'.', '/'}, optional + Separator placed between the dimensions of a chunk. + + .. versionadded:: 2.8 + + write_empty_chunks : bool, optional + If True (default), all chunks will be stored regardless of their + contents. If False, each chunk is compared to the array's fill value + prior to storing. If a chunk is uniformly equal to the fill value, then + that chunk is not be stored, and the store entry for that chunk's key + is deleted. This setting enables sparser storage, as only chunks with + non-fill-value data are stored, at the expense of overhead associated + with checking the data of each chunk. + + .. versionadded:: 2.11 + + zarr_format : {2, 3, None}, optional + The zarr format to use when saving. + meta_array : array-like, optional + An array instance to use for determining arrays to create and return + to users. Use `numpy.empty(())` by default. + + .. versionadded:: 2.13 + storage_options : dict + If using an fsspec URL to create the store, these will be passed to + the backend implementation. Ignored otherwise. + + Returns + ------- + z : Array + The array. + """ + return Array( + sync( + async_api.create( + shape=shape, + chunks=chunks, + dtype=dtype, + compressor=compressor, + fill_value=fill_value, + order=order, + store=store, + synchronizer=synchronizer, + overwrite=overwrite, + path=path, + chunk_store=chunk_store, + filters=filters, + cache_metadata=cache_metadata, + cache_attrs=cache_attrs, + read_only=read_only, + object_codec=object_codec, + dimension_separator=dimension_separator, + write_empty_chunks=write_empty_chunks, + zarr_version=zarr_version, + zarr_format=zarr_format, + meta_array=meta_array, + attributes=attributes, + chunk_shape=chunk_shape, + chunk_key_encoding=chunk_key_encoding, + codecs=codecs, + dimension_names=dimension_names, + storage_options=storage_options, + **kwargs, + ) + ) + ) # TODO: add type annotations for kwargs def empty(shape: ChunkCoords, **kwargs: Any) -> Array: + """Create an empty array. + + Parameters + ---------- + shape : int or tuple of int + Shape of the empty array. + **kwargs + Keyword arguments passed to :func:`zarr.api.asynchronous.create`. + + Returns + ------- + Array + The new array. + + Notes + ----- + The contents of an empty Zarr array are not defined. On attempting to + retrieve data from an empty Zarr array, any values may be returned, + and these are not guaranteed to be stable from one access to the next. + """ return Array(sync(async_api.empty(shape, **kwargs))) # TODO: move ArrayLike to common module # TODO: add type annotations for kwargs -def empty_like(a: async_api.ArrayLike, **kwargs: Any) -> Array: +def empty_like(a: ArrayLike, **kwargs: Any) -> Array: + """Create an empty array like `a`. + + Parameters + ---------- + a : array-like + The array to create an empty array like. + **kwargs + Keyword arguments passed to :func:`zarr.api.asynchronous.create`. + + Returns + ------- + Array + The new array. + """ return Array(sync(async_api.empty_like(a, **kwargs))) # TODO: add type annotations for kwargs and fill_value def full(shape: ChunkCoords, fill_value: Any, **kwargs: Any) -> Array: + """Create an array, with `fill_value` being used as the default value for + uninitialized portions of the array. + + Parameters + ---------- + shape : int or tuple of int + Shape of the empty array. 
+ fill_value : scalar + Fill value. + **kwargs + Keyword arguments passed to :func:`zarr.api.asynchronous.create`. + + Returns + ------- + Array + The new array. + """ return Array(sync(async_api.full(shape=shape, fill_value=fill_value, **kwargs))) # TODO: move ArrayLike to common module # TODO: add type annotations for kwargs -def full_like(a: async_api.ArrayLike, **kwargs: Any) -> Array: +def full_like(a: ArrayLike, **kwargs: Any) -> Array: + """Create a filled array like `a`. + + Parameters + ---------- + a : array-like + The array to create an empty array like. + **kwargs + Keyword arguments passed to :func:`zarr.api.asynchronous.create`. + + Returns + ------- + Array + The new array. + """ return Array(sync(async_api.full_like(a, **kwargs))) # TODO: add type annotations for kwargs def ones(shape: ChunkCoords, **kwargs: Any) -> Array: + """Create an array, with one being used as the default value for + uninitialized portions of the array. + + Parameters + ---------- + shape : int or tuple of int + Shape of the empty array. + **kwargs + Keyword arguments passed to :func:`zarr.api.asynchronous.create`. + + Returns + ------- + Array + The new array. + """ return Array(sync(async_api.ones(shape, **kwargs))) # TODO: add type annotations for kwargs -def ones_like(a: async_api.ArrayLike, **kwargs: Any) -> Array: +def ones_like(a: ArrayLike, **kwargs: Any) -> Array: + """Create an array of ones like `a`. + + Parameters + ---------- + a : array-like + The array to create an empty array like. + **kwargs + Keyword arguments passed to :func:`zarr.api.asynchronous.create`. + + Returns + ------- + Array + The new array. + """ return Array(sync(async_api.ones_like(a, **kwargs))) # TODO: update this once async_api.open_array is fully implemented -def open_array(*args: Any, **kwargs: Any) -> Array: - return Array(sync(async_api.open_array(*args, **kwargs))) +def open_array( + store: StoreLike | None = None, + *, + zarr_version: ZarrFormat | None = None, + path: PathLike = "", + storage_options: dict[str, Any] | None = None, + **kwargs: Any, +) -> Array: + """Open an array using file-mode-like semantics. + + Parameters + ---------- + store : Store or str + Store or path to directory in file system or name of zip file. + zarr_version : {2, 3, None}, optional + The zarr format to use when saving. + path : str, optional + Path in store to array. + storage_options : dict + If using an fsspec URL to create the store, these will be passed to + the backend implementation. Ignored otherwise. + **kwargs + Any keyword arguments to pass to ``create``. + + Returns + ------- + AsyncArray + The opened array. + """ + return Array( + sync( + async_api.open_array( + store=store, + zarr_version=zarr_version, + path=path, + storage_options=storage_options, + **kwargs, + ) + ) + ) # TODO: add type annotations for kwargs -def open_like(a: async_api.ArrayLike, **kwargs: Any) -> Array: - return Array(sync(async_api.open_like(a, **kwargs))) +def open_like(a: ArrayLike, path: str, **kwargs: Any) -> Array: + """Open a persistent array like `a`. + + Parameters + ---------- + a : Array + The shape and data-type of a define these same attributes of the returned array. + path : str + The path to the new array. + **kwargs + Any keyword arguments to pass to the array constructor. + + Returns + ------- + AsyncArray + The opened array. 
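A sketch of ``open_array`` against a pre-existing array (the array at ``path`` is created first; the names are illustrative):

```python
import zarr

store: dict = {}
zarr.create(shape=(100,), chunks=(10,), dtype="f8", store=store, path="data")

arr = zarr.open_array(store=store, path="data")
assert arr.shape == (100,)
```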
+ """ + return Array(sync(async_api.open_like(a, path=path, **kwargs))) # TODO: add type annotations for kwargs -def zeros(*args: Any, **kwargs: Any) -> Array: - return Array(sync(async_api.zeros(*args, **kwargs))) +def zeros(shape: ChunkCoords, **kwargs: Any) -> Array: + """Create an array, with zero being used as the default value for + uninitialized portions of the array. + Parameters + ---------- + shape : int or tuple of int + Shape of the empty array. + **kwargs + Keyword arguments passed to :func:`zarr.api.asynchronous.create`. -# TODO: add type annotations for kwargs -def zeros_like(a: async_api.ArrayLike, **kwargs: Any) -> Array: - return Array(sync(async_api.zeros_like(a, **kwargs))) + Returns + ------- + Array + The new array. + """ + return Array(sync(async_api.zeros(shape=shape, **kwargs))) -consolidate_metadata.__doc__ = async_api.copy.__doc__ -copy.__doc__ = async_api.copy.__doc__ -copy_all.__doc__ = async_api.copy_all.__doc__ -copy_store.__doc__ = async_api.copy_store.__doc__ -load.__doc__ = async_api.load.__doc__ -open.__doc__ = async_api.open.__doc__ -open_consolidated.__doc__ = async_api.open_consolidated.__doc__ -save.__doc__ = async_api.save.__doc__ -save_array.__doc__ = async_api.save_array.__doc__ -save_group.__doc__ = async_api.save_group.__doc__ -tree.__doc__ = async_api.tree.__doc__ -array.__doc__ = async_api.array.__doc__ -group.__doc__ = async_api.group.__doc__ -open_group.__doc__ = async_api.open_group.__doc__ -create.__doc__ = async_api.create.__doc__ -empty.__doc__ = async_api.empty.__doc__ -empty_like.__doc__ = async_api.empty_like.__doc__ -full.__doc__ = async_api.full.__doc__ -full_like.__doc__ = async_api.full_like.__doc__ -ones.__doc__ = async_api.ones.__doc__ -ones_like.__doc__ = async_api.ones_like.__doc__ -open_array.__doc__ = async_api.open_array.__doc__ -open_like.__doc__ = async_api.open_like.__doc__ -zeros.__doc__ = async_api.zeros.__doc__ -zeros_like.__doc__ = async_api.zeros_like.__doc__ +# TODO: add type annotations for kwargs +def zeros_like(a: ArrayLike, **kwargs: Any) -> Array: + """Create an array of zeros like `a`. + + Parameters + ---------- + a : array-like + The array to create an empty array like. + **kwargs + Keyword arguments passed to :func:`zarr.api.asynchronous.create`. + + Returns + ------- + Array + The new array. 
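A sketch of the two creation helpers above (the ``store={}`` in-memory shorthand is an assumption of the example, not part of the patch):

```python
import zarr

z = zarr.zeros(shape=(4, 4), chunks=(2, 2), dtype="f4", store={})
assert (z[:] == 0).all()

w = zarr.zeros_like(z, store={})
assert w.shape == z.shape and w.dtype == z.dtype
```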
+ """ + return Array(sync(async_api.zeros_like(a, **kwargs))) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index b57712717b..e5fc707f0a 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -285,7 +285,7 @@ async def create( fill_value: Any | None = None, attributes: dict[str, JSON] | None = None, # v3 only - chunk_shape: ChunkCoords | None = None, + chunk_shape: ShapeLike | None = None, chunk_key_encoding: ( ChunkKeyEncoding | tuple[Literal["default"], Literal[".", "/"]] @@ -313,7 +313,7 @@ async def create( fill_value: Any | None = None, attributes: dict[str, JSON] | None = None, # v3 only - chunk_shape: ChunkCoords | None = None, + chunk_shape: ShapeLike | None = None, chunk_key_encoding: ( ChunkKeyEncoding | tuple[Literal["default"], Literal[".", "/"]] @@ -340,7 +340,7 @@ async def create( fill_value: Any | None = None, attributes: dict[str, JSON] | None = None, # v3 only - chunk_shape: ChunkCoords | None = None, + chunk_shape: ShapeLike | None = None, chunk_key_encoding: ( ChunkKeyEncoding | tuple[Literal["default"], Literal[".", "/"]] @@ -372,7 +372,7 @@ async def create( fill_value: Any | None = None, attributes: dict[str, JSON] | None = None, # v3 only - chunk_shape: ChunkCoords | None = None, + chunk_shape: ShapeLike | None = None, chunk_key_encoding: ( ChunkKeyEncoding | tuple[Literal["default"], Literal[".", "/"]] diff --git a/tests/test_api.py b/tests/test_api.py index 90f6dae110..f98565ad68 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -49,11 +49,11 @@ def test_create_array(memory_store: Store) -> None: # create array with float shape with pytest.raises(TypeError): - z = create(shape=(400.5, 100), store=store, overwrite=True) + z = create(shape=(400.5, 100), store=store, overwrite=True) # type: ignore [arg-type] # create array with float chunk shape with pytest.raises(TypeError): - z = create(shape=(400, 100), chunks=(16, 16.5), store=store, overwrite=True) + z = create(shape=(400, 100), chunks=(16, 16.5), store=store, overwrite=True) # type: ignore [arg-type] @pytest.mark.parametrize("path", ["foo", "/", "/foo", "///foo/bar"]) diff --git a/tests/test_array.py b/tests/test_array.py index 263b536784..3eb317e50e 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -558,8 +558,7 @@ async def test_info_complete_async(self) -> None: @pytest.mark.parametrize("store", ["memory"], indirect=True) -@pytest.mark.parametrize("zarr_format", [2, 3]) -def test_resize_1d(store: MemoryStore, zarr_format: int) -> None: +def test_resize_1d(store: MemoryStore, zarr_format: ZarrFormat) -> None: z = zarr.create( shape=105, chunks=10, dtype="i4", fill_value=0, store=store, zarr_format=zarr_format ) @@ -597,8 +596,7 @@ def test_resize_1d(store: MemoryStore, zarr_format: int) -> None: @pytest.mark.parametrize("store", ["memory"], indirect=True) -@pytest.mark.parametrize("zarr_format", [2, 3]) -def test_resize_2d(store: MemoryStore, zarr_format: int) -> None: +def test_resize_2d(store: MemoryStore, zarr_format: ZarrFormat) -> None: z = zarr.create( shape=(105, 105), chunks=(10, 10), @@ -659,8 +657,7 @@ def test_resize_2d(store: MemoryStore, zarr_format: int) -> None: @pytest.mark.parametrize("store", ["memory"], indirect=True) -@pytest.mark.parametrize("zarr_format", [2, 3]) -def test_append_1d(store: MemoryStore, zarr_format: int) -> None: +def test_append_1d(store: MemoryStore, zarr_format: ZarrFormat) -> None: a = np.arange(105) z = zarr.create(shape=a.shape, chunks=10, dtype=a.dtype, store=store, zarr_format=zarr_format) z[:] = a @@ -689,8 +686,7 
@@ def test_append_1d(store: MemoryStore, zarr_format: int) -> None: @pytest.mark.parametrize("store", ["memory"], indirect=True) -@pytest.mark.parametrize("zarr_format", [2, 3]) -def test_append_2d(store: MemoryStore, zarr_format: int) -> None: +def test_append_2d(store: MemoryStore, zarr_format: ZarrFormat) -> None: a = np.arange(105 * 105, dtype="i4").reshape((105, 105)) z = zarr.create( shape=a.shape, chunks=(10, 10), dtype=a.dtype, store=store, zarr_format=zarr_format @@ -713,8 +709,7 @@ def test_append_2d(store: MemoryStore, zarr_format: int) -> None: @pytest.mark.parametrize("store", ["memory"], indirect=True) -@pytest.mark.parametrize("zarr_format", [2, 3]) -def test_append_2d_axis(store: MemoryStore, zarr_format: int) -> None: +def test_append_2d_axis(store: MemoryStore, zarr_format: ZarrFormat) -> None: a = np.arange(105 * 105, dtype="i4").reshape((105, 105)) z = zarr.create( shape=a.shape, chunks=(10, 10), dtype=a.dtype, store=store, zarr_format=zarr_format @@ -735,8 +730,7 @@ def test_append_2d_axis(store: MemoryStore, zarr_format: int) -> None: @pytest.mark.parametrize("store", ["memory"], indirect=True) -@pytest.mark.parametrize("zarr_format", [2, 3]) -def test_append_bad_shape(store: MemoryStore, zarr_format: int) -> None: +def test_append_bad_shape(store: MemoryStore, zarr_format: ZarrFormat) -> None: a = np.arange(100) z = zarr.create(shape=a.shape, chunks=10, dtype=a.dtype, store=store, zarr_format=zarr_format) z[:] = a From 5bf7bcfcfe6426bea37df5d0691211f6a71daa39 Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Thu, 19 Dec 2024 01:23:27 -0800 Subject: [PATCH 37/87] deps: add packaging to required deps (#2573) * deps: add packaging to required deps * temporarily pin numpy version for mypy --- .pre-commit-config.yaml | 3 ++- pyproject.toml | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4a93e9ce87..ea1cd4dbab 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -28,9 +28,10 @@ repos: files: src|tests additional_dependencies: # Package dependencies + - packaging - donfig - numcodecs[crc32c] - - numpy + - numpy==2.1 # until https://github.com/numpy/numpy/issues/28034 is resolved - typing_extensions - universal-pathlib # Tests diff --git a/pyproject.toml b/pyproject.toml index 6c8110cbf9..75bbbf15d3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,6 +26,7 @@ maintainers = [ requires-python = ">=3.11" # If you add a new dependency here, please also add it to .pre-commit-config.yml dependencies = [ + 'packaging>=22.0', 'numpy>=1.25', 'numcodecs[crc32c]>=0.14', 'typing_extensions>=4.9', @@ -173,6 +174,7 @@ serve = "sphinx-autobuild docs docs/_build --host 0.0.0.0" [tool.hatch.envs.upstream] python = "3.13" dependencies = [ + 'packaging @ git+https://github.com/pypa/packaging', 'numpy', # from scientific-python-nightly-wheels 'numcodecs @ git+https://github.com/zarr-developers/numcodecs', 'fsspec @ git+https://github.com/fsspec/filesystem_spec', @@ -206,6 +208,7 @@ See Spec 0000 for details and drop schedule: https://scientific-python.org/specs """ python = "3.11" dependencies = [ + 'packaging==22.*', 'numpy==1.25.*', 'numcodecs==0.14.*', # 0.14 needed for zarr3 codecs 'fsspec==2022.10.0', From 1cc39177483235df698df3148d26997d19a92de9 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Thu, 19 Dec 2024 17:28:57 +0100 Subject: [PATCH 38/87] correct array.nbytes, and add tests (#2576) * correct array.nbytes, and add tests * use nbytes in array info construction * stronger docstrings 
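Before the diff, the behavioral change is easiest to see worked out on a small array (a sketch; 100 ``i4`` elements in 10-element chunks):

```python
import zarr

z = zarr.create(shape=(100,), chunks=(10,), dtype="i4", store={})

# New definition: the logical capacity of the array in bytes.
assert z.nbytes == z.size * z.dtype.itemsize == 400

# The old definition was nchunks * itemsize, which here would have
# reported 10 * 4 = 40 bytes -- not a meaningful size for this array.
```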
--- src/zarr/core/array.py | 24 ++++++++++++++++++++---- tests/test_array.py | 18 ++++++++++++++++++ 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index e5fc707f0a..2849907f98 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -977,9 +977,17 @@ def _iter_chunk_regions( @property def nbytes(self) -> int: """ - The number of bytes that can be stored in this array. + The total number of bytes that can be stored in the chunks of this array. + + Notes + ----- + This value is calculated by multiplying the number of elements in the array and the size + of each element, the latter of which is determined by the dtype of the array. + For this reason, ``nbytes`` will likely be inaccurate for arrays with variable-length + dtypes. It is not possible to determine the size of an array with variable-length elements + from the shape and dtype alone. """ - return self.nchunks * self.dtype.itemsize + return self.size * self.dtype.itemsize async def _get_selection( self, @@ -1429,7 +1437,7 @@ def _info( _order=self.order, _read_only=self.read_only, _store_type=type(self.store_path.store).__name__, - _count_bytes=self.dtype.itemsize * self.size, + _count_bytes=self.nbytes, _count_bytes_stored=count_bytes_stored, _count_chunks_initialized=count_chunks_initialized, **kwargs, @@ -1740,7 +1748,15 @@ def _iter_chunk_coords( @property def nbytes(self) -> int: """ - The number of bytes that can be stored in this array. + The total number of bytes that can be stored in the chunks of this array. + + Notes + ----- + This value is calculated by multiplying the number of elements in the array and the size + of each element, the latter of which is determined by the dtype of the array. + For this reason, ``nbytes`` will likely be inaccurate for arrays with variable-length + dtypes. It is not possible to determine the size of an array with variable-length elements + from the shape and dtype alone. """ return self._async_array.nbytes diff --git a/tests/test_array.py b/tests/test_array.py index 3eb317e50e..cf722c7385 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -776,3 +776,21 @@ async def test_special_complex_fill_values_roundtrip(fill_value: Any, expected: assert content is not None actual = json.loads(content.to_bytes()) assert actual["fill_value"] == expected + + +@pytest.mark.parametrize("shape", [(1,), (2, 3), (4, 5, 6)]) +@pytest.mark.parametrize("dtype", ["uint8", "float32"]) +@pytest.mark.parametrize("array_type", ["async", "sync"]) +async def test_nbytes( + shape: tuple[int, ...], dtype: str, array_type: Literal["async", "sync"] +) -> None: + """ + Test that the ``nbytes`` attribute of an Array or AsyncArray correctly reports the capacity of + the chunks of that array. 
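One case of the caveat in the new docstring, worked out concretely (a sketch, not part of the patch):

```python
import zarr

# Fixed-size dtypes: shape times itemsize is exact.
z = zarr.create(shape=(2, 3), dtype="float32", store={})
assert z.nbytes == 2 * 3 * 4 == 24

# Variable-length dtypes (e.g. strings) have no fixed itemsize, so for
# them nbytes is only nominal, as the docstring warns.
```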
+ """ + store = MemoryStore() + arr = Array.create(store=store, shape=shape, dtype=dtype, fill_value=0) + if array_type == "async": + assert arr._async_array.nbytes == np.prod(arr.shape) * arr.dtype.itemsize + else: + assert arr.nbytes == np.prod(arr.shape) * arr.dtype.itemsize From 4cb8ddd4ebb667da9cb0667a64b415d7097465d5 Mon Sep 17 00:00:00 2001 From: Hannes Spitz <44113112+brokkoli71@users.noreply.github.com> Date: Thu, 19 Dec 2024 18:28:10 +0100 Subject: [PATCH 39/87] Add default compressors to config (#2470) * add default compressor to config * modify _default_compressor to _default_filters_and_compressor * fix test_metadata_to_dict * wip debugging * format * fix v2 decode string dtype * fix config default tests * format * Update src/zarr/codecs/_v2.py * rename v2_dtype_kind_to_default_filters_and_compressor to v2_default_compressors * recover test_v2.py * incorporate feedback * incorporate feedback * fix mypy * allow only one default compressor * put `v2_default_compressor` under `array` * deprecate zarr.storage.default_compressor * test v3_default_codecs * use v3_default_codecs * fix tests that expected codecs==["bytes"] * fix test_default_codecs * fail-fast: false * fix string codecs for np1.25 * format * add docstrings to create in asynchronous.py and array.py * add docstrings to creation in group.py * Apply suggestions from code review Co-authored-by: David Stansby * apply suggestions from review * correct code double backticks * correct attribute links in docstring * link zarr.core.config in docstrings * improve docstring readability * correct config docstring * correct config docstring * improve config docstring --------- Co-authored-by: Norman Rzepka Co-authored-by: David Stansby --- src/zarr/api/asynchronous.py | 60 ++++++++-- src/zarr/codecs/__init__.py | 18 --- src/zarr/codecs/_v2.py | 13 ++- src/zarr/core/array.py | 139 +++++++++++++++++------ src/zarr/core/config.py | 55 +++++++-- src/zarr/core/group.py | 131 +++++++++++++++++---- src/zarr/core/metadata/v2.py | 23 +++- src/zarr/core/metadata/v3.py | 5 + src/zarr/storage/__init__.py | 22 ++++ tests/test_array.py | 22 ++-- tests/test_config.py | 57 +++++++++- tests/test_group.py | 7 +- tests/test_metadata/test_consolidated.py | 12 +- tests/test_v2.py | 115 +++++++++++++------ 14 files changed, 529 insertions(+), 150 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index e859df44a6..8b20676e8b 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -17,10 +17,12 @@ ChunkCoords, MemoryOrder, ZarrFormat, + parse_dtype, ) from zarr.core.config import config from zarr.core.group import AsyncGroup, ConsolidatedMetadata, GroupMetadata from zarr.core.metadata import ArrayMetadataDict, ArrayV2Metadata, ArrayV3Metadata +from zarr.core.metadata.v2 import _default_filters_and_compressor from zarr.errors import NodeTypeValidationError from zarr.storage import ( StoreLike, @@ -401,7 +403,7 @@ async def save_array( arr : ndarray NumPy array with data to save. zarr_format : {2, 3, None}, optional - The zarr format to use when saving. + The zarr format to use when saving (default is 3 if not specified). path : str or None, optional The path within the store where the array will be saved. storage_options : dict @@ -817,19 +819,45 @@ async def create( shape : int or tuple of ints Array shape. chunks : int or tuple of ints, optional - Chunk shape. If True, will be guessed from `shape` and `dtype`. If - False, will be set to `shape`, i.e., single chunk for the whole array. 
- If an int, the chunk size in each dimension will be given by the value - of `chunks`. Default is True. + The shape of the array's chunks. + V2 only. V3 arrays should use `chunk_shape` instead. + If not specified, default values are guessed based on the shape and dtype. dtype : str or dtype, optional NumPy dtype. + chunk_shape : int or tuple of ints, optional + The shape of the Array's chunks (default is None). + V3 only. V2 arrays should use `chunks` instead. + chunk_key_encoding : ChunkKeyEncoding, optional + A specification of how the chunk keys are represented in storage. + V3 only. V2 arrays should use `dimension_separator` instead. + Default is ``("default", "/")``. + codecs : Sequence of Codecs or dicts, optional + An iterable of Codec or dict serializations of Codecs. The elements of + this collection specify the transformation from array values to stored bytes. + V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. + + If no codecs are provided, default codecs will be used: + + - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec``. + - For bytes or objects, the default is ``VLenBytesCodec``. + + These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. compressor : Codec, optional - Primary compressor. - fill_value : object + Primary compressor to compress chunk data. + V2 only. V3 arrays should use ``codecs`` instead. + + If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: + + - For numeric arrays, the default is ``ZstdCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec``. + - For bytes or objects, the default is ``VLenBytesCodec``. + + These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. fill_value : object Default value to use for uninitialized portions of the array. order : {'C', 'F'}, optional Memory layout to be used within each chunk. - Default is set in Zarr's config (`array.order`). + If not specified, default is taken from the Zarr config ```array.order```. store : Store or str Store or path to directory in file system or name of zip file. synchronizer : object, optional @@ -844,6 +872,8 @@ async def create( for storage of both chunks and metadata. filters : sequence of Codecs, optional Sequence of filters to use to encode chunk data prior to compression. + V2 only. If neither ``compressor`` nor ``filters`` are provided, a default + compressor will be used. (see ``compressor`` for details). cache_metadata : bool, optional If True, array configuration metadata will be cached for the lifetime of the object. If False, array metadata will be reloaded @@ -859,7 +889,8 @@ async def create( A codec to encode object arrays, only needed if dtype=object. dimension_separator : {'.', '/'}, optional Separator placed between the dimensions of a chunk. - + V2 only. V3 arrays should use ``chunk_key_encoding`` instead. + Default is ".". .. versionadded:: 2.8 write_empty_chunks : bool, optional @@ -875,6 +906,7 @@ async def create( zarr_format : {2, 3, None}, optional The zarr format to use when saving. + Default is 3. meta_array : array-like, optional An array instance to use for determining arrays to create and return to users. Use `numpy.empty(())` by default. 
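Since the docstring above points at ``array.v3_default_codecs`` in the config, a hedged sketch of overriding it per-context; ``gzip`` here stands in for any registered codec name:

```python
import zarr
from zarr.core.config import config

with config.set({"array.v3_default_codecs.numeric": ["bytes", "gzip"]}):
    z = zarr.create(shape=(10,), dtype="i4", store={}, zarr_format=3)

# Within the context, numeric v3 arrays pick up bytes+gzip instead of
# the shipped default of bytes+zstd.
print([type(c).__name__ for c in z.metadata.codecs])
```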
@@ -894,9 +926,13 @@ async def create( or _default_zarr_version() ) - if zarr_format == 2 and chunks is None: - chunks = shape - elif zarr_format == 3 and chunk_shape is None: + if zarr_format == 2: + if chunks is None: + chunks = shape + dtype = parse_dtype(dtype, zarr_format) + if not filters and not compressor: + filters, compressor = _default_filters_and_compressor(dtype) + elif zarr_format == 3 and chunk_shape is None: # type: ignore[redundant-expr] if chunks is not None: chunk_shape = chunks chunks = None diff --git a/src/zarr/codecs/__init__.py b/src/zarr/codecs/__init__.py index e407d94892..165dbe476d 100644 --- a/src/zarr/codecs/__init__.py +++ b/src/zarr/codecs/__init__.py @@ -1,10 +1,5 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any - -if TYPE_CHECKING: - import numpy as np - from zarr.codecs.blosc import BloscCname, BloscCodec, BloscShuffle from zarr.codecs.bytes import BytesCodec, Endian from zarr.codecs.crc32c_ import Crc32cCodec @@ -13,7 +8,6 @@ from zarr.codecs.transpose import TransposeCodec from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec from zarr.codecs.zstd import ZstdCodec -from zarr.core.metadata.v3 import DataType __all__ = [ "BloscCname", @@ -30,15 +24,3 @@ "VLenUTF8Codec", "ZstdCodec", ] - - -def _get_default_array_bytes_codec( - np_dtype: np.dtype[Any], -) -> BytesCodec | VLenUTF8Codec | VLenBytesCodec: - dtype = DataType.from_numpy(np_dtype) - if dtype == DataType.string: - return VLenUTF8Codec() - elif dtype == DataType.bytes: - return VLenBytesCodec() - else: - return BytesCodec() diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index df0d8ecb0a..53edc1f4a1 100644 --- a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -5,6 +5,7 @@ from typing import TYPE_CHECKING import numcodecs +import numpy as np from numcodecs.compat import ensure_bytes, ensure_ndarray_like from zarr.abc.codec import ArrayBytesCodec @@ -46,7 +47,17 @@ async def _decode_single( # special case object dtype, because incorrect handling can lead to # segfaults and other bad things happening if chunk_spec.dtype != object: - chunk = chunk.view(chunk_spec.dtype) + try: + chunk = chunk.view(chunk_spec.dtype) + except TypeError: + # this will happen if the dtype of the chunk + # does not match the dtype of the array spec i.g. if + # the dtype of the chunk_spec is a string dtype, but the chunk + # is an object array. In this case, we need to convert the object + # array to the correct dtype. + + chunk = np.array(chunk).astype(chunk_spec.dtype) + elif chunk.dtype != object: # If we end up here, someone must have hacked around with the filters. 
# We cannot deal with object arrays unless there is an object diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 2849907f98..07ed0e5069 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -13,7 +13,6 @@ from zarr._compat import _deprecate_positional_args from zarr.abc.store import Store, set_or_delete -from zarr.codecs import _get_default_array_bytes_codec from zarr.codecs._v2 import V2Codec from zarr.core._info import ArrayInfo from zarr.core.attributes import Attributes @@ -78,7 +77,8 @@ ArrayV3MetadataDict, T_ArrayMetadata, ) -from zarr.core.metadata.v3 import parse_node_type_array +from zarr.core.metadata.v2 import _default_filters_and_compressor +from zarr.core.metadata.v3 import DataType, parse_node_type_array from zarr.core.sync import sync from zarr.errors import MetadataValidationError from zarr.registry import get_pipeline_class @@ -409,27 +409,53 @@ async def create( attributes : dict[str, JSON], optional The attributes of the array (default is None). chunk_shape : ChunkCoords, optional - The shape of the array's chunks (default is None). + The shape of the array's chunks + V3 only. V2 arrays should use `chunks` instead. + If not specified, default are guessed based on the shape and dtype. chunk_key_encoding : ChunkKeyEncoding, optional - The chunk key encoding (default is None). - codecs : Iterable[Codec | dict[str, JSON]], optional - The codecs used to encode the data (default is None). + A specification of how the chunk keys are represented in storage. + V3 only. V2 arrays should use `dimension_separator` instead. + Default is ``("default", "/")``. + codecs : Sequence of Codecs or dicts, optional + An iterable of Codec or dict serializations of Codecs. The elements of + this collection specify the transformation from array values to stored bytes. + V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. + + If no codecs are provided, default codecs will be used: + + - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec``. + - For bytes or objects, the default is ``VLenBytesCodec``. + + These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. dimension_names : Iterable[str], optional The names of the dimensions (default is None). + V3 only. V2 arrays should not use this parameter. chunks : ShapeLike, optional - The shape of the array's chunks (default is None). - V2 only. V3 arrays should not have 'chunks' parameter. + The shape of the array's chunks. + V2 only. V3 arrays should use ``chunk_shape`` instead. + If not specified, default are guessed based on the shape and dtype. dimension_separator : Literal[".", "/"], optional - The dimension separator (default is None). - V2 only. V3 arrays cannot have a dimension separator. + The dimension separator (default is "."). + V2 only. V3 arrays should use ``chunk_key_encoding`` instead. order : Literal["C", "F"], optional - The order of the array (default is None). + The order of the array (default is specified by ``array.order`` in :mod:`zarr.core.config`). filters : list[dict[str, JSON]], optional - The filters used to compress the data (default is None). - V2 only. V3 arrays should not have 'filters' parameter. + Sequence of filters to use to encode chunk data prior to compression. + V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` + nor ``filters`` are provided, a default compressor will be used. 
(see + ``compressor`` for details) compressor : dict[str, JSON], optional The compressor used to compress the data (default is None). - V2 only. V3 arrays should not have 'compressor' parameter. + V2 only. V3 arrays should use ``codecs`` instead. + + If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: + + - For numeric arrays, the default is ``ZstdCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec``. + - For bytes or objects, the default is ``VLenBytesCodec``. + + These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. overwrite : bool, optional Whether to raise an error if the store already exists (default is False). data : npt.ArrayLike, optional @@ -494,14 +520,6 @@ async def create( order=order, ) elif zarr_format == 2: - if dtype is str or dtype == "str": - # another special case: zarr v2 added the vlen-utf8 codec - vlen_codec: dict[str, JSON] = {"id": "vlen-utf8"} - if filters and not any(x["id"] == "vlen-utf8" for x in filters): - filters = list(filters) + [vlen_codec] - else: - filters = [vlen_codec] - if codecs is not None: raise ValueError( "codecs cannot be used for arrays with version 2. Use filters and compressor instead." @@ -564,11 +582,7 @@ async def _create_v3( await ensure_no_existing_node(store_path, zarr_format=3) shape = parse_shapelike(shape) - codecs = ( - list(codecs) - if codecs is not None - else [_get_default_array_bytes_codec(np.dtype(dtype))] - ) + codecs = list(codecs) if codecs is not None else _get_default_codecs(np.dtype(dtype)) if chunk_key_encoding is None: chunk_key_encoding = ("default", "/") @@ -634,6 +648,14 @@ async def _create_v2( if dimension_separator is None: dimension_separator = "." + dtype = parse_dtype(dtype, zarr_format=2) + if not filters and not compressor: + filters, compressor = _default_filters_and_compressor(dtype) + if np.issubdtype(dtype, np.str_): + filters = filters or [] + if not any(x["id"] == "vlen-utf8" for x in filters): + filters = list(filters) + [{"id": "vlen-utf8"}] + metadata = ArrayV2Metadata( shape=shape, dtype=np.dtype(dtype), @@ -1493,23 +1515,53 @@ def create( dtype : npt.DTypeLike The data type of the array. chunk_shape : ChunkCoords, optional - The shape of the Array's chunks (default is None). + The shape of the Array's chunks. + V3 only. V2 arrays should use `chunks` instead. + If not specified, default are guessed based on the shape and dtype. chunk_key_encoding : ChunkKeyEncoding, optional - The chunk key encoding (default is None). - codecs : Iterable[Codec | dict[str, JSON]], optional - The codecs used to encode the data (default is None). + A specification of how the chunk keys are represented in storage. + V3 only. V2 arrays should use `dimension_separator` instead. + Default is ``("default", "/")``. + codecs : Sequence of Codecs or dicts, optional + An iterable of Codec or dict serializations of Codecs. The elements of + this collection specify the transformation from array values to stored bytes. + V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. + + If no codecs are provided, default codecs will be used: + + - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec``. + - For bytes or objects, the default is ``VLenBytesCodec``. + + These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. 
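A sketch of the v2 fallback just described (in-memory store; the exact codec printed depends on the ``array.v2_default_compressor`` config this patch introduces):

```python
import zarr

# Neither ``compressor`` nor ``filters`` is given, so the dtype-dependent
# default applies; for a numeric dtype that is a zstd compressor.
z = zarr.create(shape=(10,), dtype="i4", store={}, zarr_format=2)
assert z.metadata.compressor is not None
print(z.metadata.compressor)
```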
dimension_names : Iterable[str], optional The names of the dimensions (default is None). + V3 only. V2 arrays should not use this parameter. chunks : ChunkCoords, optional - The shape of the Array's chunks (default is None). + The shape of the array's chunks. + V2 only. V3 arrays should use ``chunk_shape`` instead. + If not specified, default are guessed based on the shape and dtype. dimension_separator : Literal[".", "/"], optional - The dimension separator (default is None). + The dimension separator (default is "."). + V2 only. V3 arrays should use ``chunk_key_encoding`` instead. order : Literal["C", "F"], optional - The order of the array (default is None). + The order of the array (default is specified by ``array.order`` in :mod:`zarr.core.config`). filters : list[dict[str, JSON]], optional - The filters used to compress the data (default is None). + Sequence of filters to use to encode chunk data prior to compression. + V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` + nor ``filters`` are provided, a default compressor will be used. (see + ``compressor`` for details) compressor : dict[str, JSON], optional - The compressor used to compress the data (default is None). + Primary compressor to compress chunk data. + V2 only. V3 arrays should use ``codecs`` instead. + + If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: + + - For numeric arrays, the default is ``ZstdCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec``. + - For bytes or objects, the default is ``VLenBytesCodec``. + + These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. overwrite : bool, optional Whether to raise an error if the store already exists (default is False). @@ -3342,3 +3394,18 @@ def _build_parents( ) return parents + + +def _get_default_codecs( + np_dtype: np.dtype[Any], +) -> list[dict[str, JSON]]: + default_codecs = config.get("array.v3_default_codecs") + dtype = DataType.from_numpy(np_dtype) + if dtype == DataType.string: + dtype_key = "string" + elif dtype == DataType.bytes: + dtype_key = "bytes" + else: + dtype_key = "numeric" + + return [{"name": codec_id, "configuration": {}} for codec_id in default_codecs[dtype_key]] diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 29f5e139fe..1feb4a6c2f 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -1,3 +1,32 @@ +""" +The config module is responsible for managing the configuration of zarr and is based on the Donfig python library. +For selecting custom implementations of codecs, pipelines, buffers and ndbuffers, first register the implementations +in the registry and then select them in the config. + +Example: + An implementation of the bytes codec in a class ``your.module.NewBytesCodec`` requires the value of ``codecs.bytes`` + to be ``your.module.NewBytesCodec``. Donfig can be configured programmatically, by environment variables, or from + YAML files in standard locations. + + .. code-block:: python + + from your.module import NewBytesCodec + from zarr.core.config import register_codec, config + + register_codec("bytes", NewBytesCodec) + config.set({"codecs.bytes": "your.module.NewBytesCodec"}) + + Instead of setting the value programmatically with ``config.set``, you can also set the value with an environment + variable. The environment variable ``ZARR_CODECS__BYTES`` can be set to ``your.module.NewBytesCodec``. The double + underscore ``__`` is used to indicate nested access. 
+
+    .. code-block:: bash
+
+        export ZARR_CODECS__BYTES="your.module.NewBytesCodec"
+
+For more information, see the Donfig documentation at https://github.com/pytroll/donfig.
+"""
+
 from __future__ import annotations

 from typing import Any, Literal, cast
@@ -10,7 +39,7 @@ class BadConfigError(ValueError):


 class Config(DConfig):  # type: ignore[misc]
-    """Will collect configuration from config files and environment variables
+    """The Config will collect configuration from config files and environment variables

     Example environment variables:
     Grabs environment variables of the form "ZARR_FOO__BAR_BAZ=123" and
@@ -28,21 +57,25 @@ def reset(self) -> None:
         self.refresh()


-# The config module is responsible for managing the configuration of zarr and is based on the Donfig python library.
-# For selecting custom implementations of codecs, pipelines, buffers and ndbuffers, first register the implementations
-# in the registry and then select them in the config.
-# e.g. an implementation of the bytes codec in a class "NewBytesCodec", requires the value of codecs.bytes.name to be
-# "NewBytesCodec".
-# Donfig can be configured programmatically, by environment variables, or from YAML files in standard locations
-# e.g. export ZARR_CODECS__BYTES__NAME="NewBytesCodec"
-# (for more information see github.com/pytroll/donfig)
-# Default values below point to the standard implementations of zarr-python
+# The default configuration for zarr
 config = Config(
     "zarr",
     defaults=[
         {
             "default_zarr_version": 3,
-            "array": {"order": "C"},
+            "array": {
+                "order": "C",
+                "v2_default_compressor": {
+                    "numeric": "zstd",
+                    "string": "vlen-utf8",
+                    "bytes": "vlen-bytes",
+                },
+                "v3_default_codecs": {
+                    "numeric": ["bytes", "zstd"],
+                    "string": ["vlen-utf8"],
+                    "bytes": ["vlen-bytes"],
+                },
+            },
             "async": {"concurrency": 10, "timeout": None},
             "threading": {"max_workers": None},
             "json_indent": 2,
diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py
index f46c5126b2..2d7a21911a 100644
--- a/src/zarr/core/group.py
+++ b/src/zarr/core/group.py
@@ -1034,24 +1034,52 @@ async def create_array(
         dtype : np.DtypeLike = float64
             The data type of the array.
         chunk_shape : tuple[int, ...] | None = None
-            The shape of the chunks of the array. V3 only.
+            The shape of the chunks of the array.
+            V3 only. V2 arrays should use `chunks` instead.
+            If not specified, defaults are guessed based on the shape and dtype.
         chunk_key_encoding : ChunkKeyEncoding | tuple[Literal["default"], Literal[".", "/"]] | tuple[Literal["v2"], Literal[".", "/"]] | None = None
             A specification of how the chunk keys are represented in storage.
+            V3 only. V2 arrays should use `dimension_separator` instead.
+            Default is ``("default", "/")``.
         codecs : Iterable[Codec | dict[str, JSON]] | None = None
-            An iterable of Codec or dict serializations thereof. The elements of
+            An iterable of Codec or dict serializations of Codecs. The elements of
             this collection specify the transformation from array values to stored bytes.
+            V3 only. V2 arrays should use ``filters`` and ``compressor`` instead.
+
+            If no codecs are provided, default codecs will be used:
+
+            - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``.
+            - For Unicode strings, the default is ``VLenUTF8Codec``.
+            - For bytes or objects, the default is ``VLenBytesCodec``.
+
+            These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
         dimension_names : Iterable[str] | None = None
             The names of the dimensions of the array. V3 only.
chunks : ChunkCoords | None = None
-            The shape of the chunks of the array. V2 only.
+            The shape of the chunks of the array.
+            V2 only. V3 arrays should use ``chunk_shape`` instead.
+            If not specified, defaults are guessed based on the shape and dtype.
         dimension_separator : Literal[".", "/"] | None = None
-            The delimiter used for the chunk keys.
+            The delimiter used for the chunk keys. (default: ".")
+            V2 only. V3 arrays should use ``chunk_key_encoding`` instead.
         order : Literal["C", "F"] | None = None
-            The memory order of the array.
+            The memory order of the array (default is specified by ``array.order`` in :mod:`zarr.core.config`).
         filters : list[dict[str, JSON]] | None = None
-            Filters for the array.
+            Sequence of filters to use to encode chunk data prior to compression.
+            V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor``
+            nor ``filters`` are provided, a default compressor will be used (see
+            ``compressor`` for details).
         compressor : dict[str, JSON] | None = None
-            The compressor for the array.
+            The compressor used to compress the data (default is None).
+            V2 only. V3 arrays should use ``codecs`` instead.
+
+            If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used:
+
+            - For numeric arrays, the default is ``ZstdCodec``.
+            - For Unicode strings, the default is ``VLenUTF8Codec``.
+            - For bytes or objects, the default is ``VLenBytesCodec``.
+
+            These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`.
         overwrite : bool = False
             If True, a pre-existing array or group at the path of this array will
             be overwritten. If False, the presence of a pre-existing array or group is
@@ -2222,7 +2250,7 @@ def create_array(
     ) -> Array:
         """Create a zarr array within this AsyncGroup.

-        This method lightly wraps AsyncArray.create.
+        This method lightly wraps `AsyncArray.create`.

         Parameters
         ----------
@@ -2233,24 +2261,52 @@ def create_array(
         dtype : np.DtypeLike = float64
             The data type of the array.
         chunk_shape : tuple[int, ...] | None = None
-            The shape of the chunks of the array. V3 only.
+            The shape of the chunks of the array.
+            V3 only. V2 arrays should use `chunks` instead.
+            If not specified, defaults are guessed based on the shape and dtype.
         chunk_key_encoding : ChunkKeyEncoding | tuple[Literal["default"], Literal[".", "/"]] | tuple[Literal["v2"], Literal[".", "/"]] | None = None
             A specification of how the chunk keys are represented in storage.
+            V3 only. V2 arrays should use `dimension_separator` instead.
+            Default is ``("default", "/")``.
         codecs : Iterable[Codec | dict[str, JSON]] | None = None
-            An iterable of Codec or dict serializations thereof. The elements of this collection
-            specify the transformation from array values to stored bytes.
+            An iterable of Codec or dict serializations of Codecs. The elements of
+            this collection specify the transformation from array values to stored bytes.
+            V3 only. V2 arrays should use ``filters`` and ``compressor`` instead.
+
+            If no codecs are provided, default codecs will be used:
+
+            - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``.
+            - For Unicode strings, the default is ``VLenUTF8Codec``.
+            - For bytes or objects, the default is ``VLenBytesCodec``.
+
+            These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
         dimension_names : Iterable[str] | None = None
             The names of the dimensions of the array. V3 only.
chunks : ChunkCoords | None = None
-            The shape of the chunks of the array. V2 only.
+            The shape of the chunks of the array.
+            V2 only. V3 arrays should use ``chunk_shape`` instead.
+            If not specified, defaults are guessed based on the shape and dtype.
         dimension_separator : Literal[".", "/"] | None = None
-            The delimiter used for the chunk keys.
+            The delimiter used for the chunk keys. (default: ".")
+            V2 only. V3 arrays should use ``chunk_key_encoding`` instead.
         order : Literal["C", "F"] | None = None
-            The memory order of the array.
+            The memory order of the array (default is specified by ``array.order`` in :mod:`zarr.core.config`).
         filters : list[dict[str, JSON]] | None = None
-            Filters for the array.
+            Sequence of filters to use to encode chunk data prior to compression.
+            V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor``
+            nor ``filters`` are provided, a default compressor will be used (see
+            ``compressor`` for details).
         compressor : dict[str, JSON] | None = None
-            The compressor for the array.
+            The compressor used to compress the data (default is None).
+            V2 only. V3 arrays should use ``codecs`` instead.
+
+            If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used:
+
+            - For numeric arrays, the default is ``ZstdCodec``.
+            - For Unicode strings, the default is ``VLenUTF8Codec``.
+            - For bytes or objects, the default is ``VLenBytesCodec``.
+
+            These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`.
         overwrite : bool = False
             If True, a pre-existing array or group at the path of this array will
             be overwritten. If False, the presence of a pre-existing array or group is
@@ -2260,6 +2316,7 @@ def create_array(

         Returns
         -------
+        Array

         """
@@ -2574,24 +2631,52 @@ def array(
         dtype : np.DtypeLike = float64
             The data type of the array.
         chunk_shape : tuple[int, ...] | None = None
-            The shape of the chunks of the array. V3 only.
+            The shape of the chunks of the array.
+            V3 only. V2 arrays should use `chunks` instead.
+            If not specified, defaults are guessed based on the shape and dtype.
         chunk_key_encoding : ChunkKeyEncoding | tuple[Literal["default"], Literal[".", "/"]] | tuple[Literal["v2"], Literal[".", "/"]] | None = None
             A specification of how the chunk keys are represented in storage.
+            V3 only. V2 arrays should use `dimension_separator` instead.
+            Default is ``("default", "/")``.
         codecs : Iterable[Codec | dict[str, JSON]] | None = None
-            An iterable of Codec or dict serializations thereof. The elements of
+            An iterable of Codec or dict serializations of Codecs. The elements of
             this collection specify the transformation from array values to stored bytes.
+            V3 only. V2 arrays should use ``filters`` and ``compressor`` instead.
+
+            If no codecs are provided, default codecs will be used:
+
+            - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``.
+            - For Unicode strings, the default is ``VLenUTF8Codec``.
+            - For bytes or objects, the default is ``VLenBytesCodec``.
+
+            These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
         dimension_names : Iterable[str] | None = None
             The names of the dimensions of the array. V3 only.
         chunks : ChunkCoords | None = None
-            The shape of the chunks of the array. V2 only.
+            The shape of the chunks of the array.
+            V2 only. V3 arrays should use ``chunk_shape`` instead.
+            If not specified, defaults are guessed based on the shape and dtype.
dimension_separator : Literal[".", "/"] | None = None
-            The delimiter used for the chunk keys.
+            The delimiter used for the chunk keys. (default: ".")
+            V2 only. V3 arrays should use ``chunk_key_encoding`` instead.
         order : Literal["C", "F"] | None = None
-            The memory order of the array.
+            The memory order of the array (default is specified by ``array.order`` in :mod:`zarr.core.config`).
         filters : list[dict[str, JSON]] | None = None
-            Filters for the array.
+            Sequence of filters to use to encode chunk data prior to compression.
+            V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor``
+            nor ``filters`` are provided, a default compressor will be used (see
+            ``compressor`` for details).
         compressor : dict[str, JSON] | None = None
-            The compressor for the array.
+            The compressor used to compress the data (default is None).
+            V2 only. V3 arrays should use ``codecs`` instead.
+
+            If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used:
+
+            - For numeric arrays, the default is ``ZstdCodec``.
+            - For Unicode strings, the default is ``VLenUTF8Codec``.
+            - For bytes or objects, the default is ``VLenBytesCodec``.
+
+            These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`.
         overwrite : bool = False
             If True, a pre-existing array or group at the path of this array will
             be overwritten. If False, the presence of a pre-existing array or group is
diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py
index 50f375203f..bd0fbecf4a 100644
--- a/src/zarr/core/metadata/v2.py
+++ b/src/zarr/core/metadata/v2.py
@@ -4,7 +4,7 @@
 from collections.abc import Iterable
 from enum import Enum
 from functools import cached_property
-from typing import TYPE_CHECKING, TypedDict, cast
+from typing import TYPE_CHECKING, Any, TypedDict, cast

 from zarr.abc.metadata import Metadata
@@ -71,6 +71,7 @@ def __init__(
         shape_parsed = parse_shapelike(shape)
         dtype_parsed = parse_dtype(dtype)
         chunks_parsed = parse_shapelike(chunks)
+        compressor_parsed = parse_compressor(compressor)

         order_parsed = parse_indexing_order(order)
         dimension_separator_parsed = parse_separator(dimension_separator)
@@ -326,3 +327,23 @@ def _default_fill_value(dtype: np.dtype[Any]) -> Any:
         return ""
     else:
         return dtype.type(0)
+
+
+def _default_filters_and_compressor(
+    dtype: np.dtype[Any],
+) -> tuple[list[dict[str, JSON]], dict[str, JSON] | None]:
+    """Get the default filters and compressor for a dtype, keyed by the dtype's kind code.
+ + https://numpy.org/doc/2.1/reference/generated/numpy.dtype.kind.html + """ + default_compressor = config.get("array.v2_default_compressor") + if dtype.kind in "biufcmM": + dtype_key = "numeric" + elif dtype.kind in "U": + dtype_key = "string" + elif dtype.kind in "OSV": + dtype_key = "bytes" + else: + raise ValueError(f"Unsupported dtype kind {dtype.kind}") + + return [{"id": default_compressor[dtype_key]}], None diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 3e925e08bd..8dcceb7f31 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -37,6 +37,7 @@ ) from zarr.core.config import config from zarr.core.metadata.common import parse_attributes +from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING from zarr.core.strings import _STRING_DTYPE as STRING_NP_DTYPE from zarr.errors import MetadataValidationError, NodeTypeValidationError from zarr.registry import get_codec_class @@ -606,6 +607,10 @@ def from_numpy(cls, dtype: np.dtype[Any]) -> DataType: return DataType.string elif dtype.kind == "S": return DataType.bytes + elif not _NUMPY_SUPPORTS_VLEN_STRING and dtype.kind == "O": + # numpy < 2.0 does not support vlen string dtype + # so we fall back on object array of strings + return DataType.string dtype_to_data_type = { "|b1": "bool", "bool": "bool", diff --git a/src/zarr/storage/__init__.py b/src/zarr/storage/__init__.py index 6f3ec59b01..514361bd6b 100644 --- a/src/zarr/storage/__init__.py +++ b/src/zarr/storage/__init__.py @@ -1,3 +1,8 @@ +import sys +import warnings +from types import ModuleType +from typing import Any + from zarr.storage.common import StoreLike, StorePath, make_store_path from zarr.storage.fsspec import FsspecStore from zarr.storage.local import LocalStore @@ -17,3 +22,20 @@ "ZipStore", "make_store_path", ] + + +class VerboseModule(ModuleType): + def __setattr__(self, attr: str, value: Any) -> None: + if attr == "default_compressor": + warnings.warn( + "setting zarr.storage.default_compressor is deprecated, use " + "zarr.config to configure array.v2_default_compressor " + "e.g. config.set({'codecs.zstd':'numcodecs.Zstd', 'array.v2_default_compressor.numeric': 'zstd'})", + DeprecationWarning, + stacklevel=1, + ) + else: + super().__setattr__(attr, value) + + +sys.modules[__name__].__class__ = VerboseModule diff --git a/tests/test_array.py b/tests/test_array.py index cf722c7385..c89b6187c3 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -5,12 +5,14 @@ from itertools import accumulate from typing import Any, Literal +import numcodecs import numpy as np import pytest +from numcodecs import Zstd import zarr.api.asynchronous from zarr import Array, AsyncArray, Group -from zarr.codecs import BytesCodec, VLenBytesCodec +from zarr.codecs import BytesCodec, VLenBytesCodec, ZstdCodec from zarr.core._info import ArrayInfo from zarr.core.array import chunks_initialized from zarr.core.buffer import default_buffer_prototype @@ -374,7 +376,7 @@ async def test_chunks_initialized() -> None: def test_nbytes_stored() -> None: - arr = zarr.create(shape=(100,), chunks=(10,), dtype="i4") + arr = zarr.create(shape=(100,), chunks=(10,), dtype="i4", codecs=[BytesCodec()]) result = arr.nbytes_stored() assert result == 366 # the size of the metadata document. This is a fragile test. 
arr[:50] = 1 @@ -386,7 +388,9 @@ def test_nbytes_stored() -> None: async def test_nbytes_stored_async() -> None: - arr = await zarr.api.asynchronous.create(shape=(100,), chunks=(10,), dtype="i4") + arr = await zarr.api.asynchronous.create( + shape=(100,), chunks=(10,), dtype="i4", codecs=[BytesCodec()] + ) result = await arr.nbytes_stored() assert result == 366 # the size of the metadata document. This is a fragile test. await arr.setitem(slice(50), 1) @@ -456,6 +460,7 @@ def test_info_v2(self) -> None: _read_only=False, _store_type="MemoryStore", _count_bytes=128, + _filters=(numcodecs.Zstd(),), ) assert result == expected @@ -470,13 +475,13 @@ def test_info_v3(self) -> None: _order="C", _read_only=False, _store_type="MemoryStore", - _codecs=[BytesCodec()], + _codecs=[BytesCodec(), ZstdCodec()], _count_bytes=128, ) assert result == expected def test_info_complete(self) -> None: - arr = zarr.create(shape=(4, 4), chunks=(2, 2), zarr_format=3) + arr = zarr.create(shape=(4, 4), chunks=(2, 2), zarr_format=3, codecs=[BytesCodec()]) result = arr.info_complete() expected = ArrayInfo( _zarr_format=3, @@ -511,6 +516,7 @@ async def test_info_v2_async(self) -> None: _order="C", _read_only=False, _store_type="MemoryStore", + _filters=(Zstd(level=0),), _count_bytes=128, ) assert result == expected @@ -526,13 +532,15 @@ async def test_info_v3_async(self) -> None: _order="C", _read_only=False, _store_type="MemoryStore", - _codecs=[BytesCodec()], + _codecs=[BytesCodec(), ZstdCodec()], _count_bytes=128, ) assert result == expected async def test_info_complete_async(self) -> None: - arr = await zarr.api.asynchronous.create(shape=(4, 4), chunks=(2, 2), zarr_format=3) + arr = await zarr.api.asynchronous.create( + shape=(4, 4), chunks=(2, 2), zarr_format=3, codecs=[BytesCodec()] + ) result = await arr.info_complete() expected = ArrayInfo( _zarr_format=3, diff --git a/tests/test_config.py b/tests/test_config.py index e3f5ec25e3..8dd15fb75b 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -8,10 +8,18 @@ import pytest import zarr -from zarr import Array, zeros -from zarr.abc.codec import CodecInput, CodecOutput, CodecPipeline +from zarr import Array, AsyncArray, zeros +from zarr.abc.codec import Codec, CodecInput, CodecOutput, CodecPipeline from zarr.abc.store import ByteSetter, Store -from zarr.codecs import BloscCodec, BytesCodec, Crc32cCodec, ShardingCodec +from zarr.codecs import ( + BloscCodec, + BytesCodec, + Crc32cCodec, + GzipCodec, + ShardingCodec, + VLenBytesCodec, + VLenUTF8Codec, +) from zarr.core.array_spec import ArraySpec from zarr.core.buffer import NDBuffer from zarr.core.codec_pipeline import BatchedCodecPipeline @@ -28,6 +36,7 @@ register_ndbuffer, register_pipeline, ) +from zarr.storage import MemoryStore from zarr.testing.buffer import ( NDBufferUsingTestNDArrayLike, StoreExpectingTestBuffer, @@ -41,7 +50,19 @@ def test_config_defaults_set() -> None: assert config.defaults == [ { "default_zarr_version": 3, - "array": {"order": "C"}, + "array": { + "order": "C", + "v2_default_compressor": { + "numeric": "zstd", + "string": "vlen-utf8", + "bytes": "vlen-bytes", + }, + "v3_default_codecs": { + "bytes": ["vlen-bytes"], + "numeric": ["bytes", "zstd"], + "string": ["vlen-utf8"], + }, + }, "async": {"concurrency": 10, "timeout": None}, "threading": {"max_workers": None}, "json_indent": 2, @@ -263,3 +284,31 @@ class NewCodec2(BytesCodec): # no warning if multiple implementations are available and one is selected in the config with config.set({"codecs.new_codec": 
fully_qualified_name(NewCodec)}): get_codec_class("new_codec") + + +@pytest.mark.parametrize( + ("dtype", "expected_codecs"), + [ + ("int", [BytesCodec(), GzipCodec()]), + ("bytes", [VLenBytesCodec()]), + ("str", [VLenUTF8Codec()]), + ], +) +async def test_default_codecs(dtype: str, expected_codecs: list[Codec]) -> None: + with config.set( + { + "array.v3_default_codecs": { + "numeric": ["bytes", "gzip"], # test setting non-standard codecs + "string": ["vlen-utf8"], + "bytes": ["vlen-bytes"], + } + } + ): + arr = await AsyncArray.create( + shape=(100,), + chunk_shape=(100,), + dtype=np.dtype(dtype), + zarr_format=3, + store=MemoryStore(), + ) + assert arr.metadata.codecs == expected_codecs diff --git a/tests/test_group.py b/tests/test_group.py index 416e10af9a..e0bc304b9b 100644 --- a/tests/test_group.py +++ b/tests/test_group.py @@ -8,6 +8,7 @@ import numpy as np import pytest +from numcodecs import Zstd import zarr import zarr.api.asynchronous @@ -496,6 +497,7 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat "shape": (1,), "chunks": (1,), "order": "C", + "filters": (Zstd(level=0),), "zarr_format": zarr_format, }, "subgroup": { @@ -521,7 +523,10 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat "configuration": {"separator": "/"}, "name": "default", }, - "codecs": ({"configuration": {"endian": "little"}, "name": "bytes"},), + "codecs": ( + {"configuration": {"endian": "little"}, "name": "bytes"}, + {"configuration": {}, "name": "zstd"}, + ), "data_type": "float64", "fill_value": fill_value, "node_type": "array", diff --git a/tests/test_metadata/test_consolidated.py b/tests/test_metadata/test_consolidated.py index 8ae9cc81fd..7f0c49338e 100644 --- a/tests/test_metadata/test_consolidated.py +++ b/tests/test_metadata/test_consolidated.py @@ -5,6 +5,7 @@ import numpy as np import pytest +from numcodecs import Zstd import zarr.api.asynchronous import zarr.api.synchronous @@ -71,7 +72,10 @@ async def test_consolidated(self, memory_store_with_hierarchy: Store) -> None: "configuration": {"separator": "/"}, "name": "default", }, - "codecs": ({"configuration": {"endian": "little"}, "name": "bytes"},), + "codecs": ( + {"configuration": {"endian": "little"}, "name": "bytes"}, + {"configuration": {}, "name": "zstd"}, + ), "data_type": "float64", "fill_value": np.float64(0.0), "node_type": "array", @@ -215,7 +219,10 @@ def test_consolidated_sync(self, memory_store): "configuration": {"separator": "/"}, "name": "default", }, - "codecs": ({"configuration": {"endian": "little"}, "name": "bytes"},), + "codecs": ( + {"configuration": {"endian": "little"}, "name": "bytes"}, + {"configuration": {}, "name": "zstd"}, + ), "data_type": "float64", "fill_value": np.float64(0.0), "node_type": "array", @@ -486,6 +493,7 @@ async def test_consolidated_metadata_v2(self): attributes={"key": "a"}, chunks=(1,), fill_value=None, + filters=(Zstd(level=0),), order="C", ), "g1": GroupMetadata( diff --git a/tests/test_v2.py b/tests/test_v2.py index 890d4039a3..ef06c13e26 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -11,7 +11,7 @@ import zarr import zarr.core.buffer import zarr.storage -from zarr import Array +from zarr import Array, config from zarr.storage import MemoryStore, StorePath @@ -82,36 +82,59 @@ def test_codec_pipeline() -> None: @pytest.mark.parametrize("dtype", ["|S", "|V"]) async def test_v2_encode_decode(dtype): - store = zarr.storage.MemoryStore() - g = zarr.group(store=store, zarr_format=2) - g.create_array( - name="foo", - 
shape=(3,), - chunks=(3,), - dtype=dtype, - fill_value=b"X", - ) - - result = await store.get("foo/.zarray", zarr.core.buffer.default_buffer_prototype()) - assert result is not None - - serialized = json.loads(result.to_bytes()) - expected = { - "chunks": [3], - "compressor": None, - "dtype": f"{dtype}0", - "fill_value": "WA==", - "filters": None, - "order": "C", - "shape": [3], - "zarr_format": 2, - "dimension_separator": ".", - } - assert serialized == expected - - data = zarr.open_array(store=store, path="foo")[:] - expected = np.full((3,), b"X", dtype=dtype) - np.testing.assert_equal(data, expected) + with config.set({"array.v2_default_compressor.bytes": "vlen-bytes"}): + store = zarr.storage.MemoryStore() + g = zarr.group(store=store, zarr_format=2) + g.create_array( + name="foo", + shape=(3,), + chunks=(3,), + dtype=dtype, + fill_value=b"X", + ) + + result = await store.get("foo/.zarray", zarr.core.buffer.default_buffer_prototype()) + assert result is not None + + serialized = json.loads(result.to_bytes()) + expected = { + "chunks": [3], + "compressor": None, + "dtype": f"{dtype}0", + "fill_value": "WA==", + "filters": [{"id": "vlen-bytes"}], + "order": "C", + "shape": [3], + "zarr_format": 2, + "dimension_separator": ".", + } + assert serialized == expected + + data = zarr.open_array(store=store, path="foo")[:] + expected = np.full((3,), b"X", dtype=dtype) + np.testing.assert_equal(data, expected) + + +@pytest.mark.parametrize("dtype_value", [["|S", b"Y"], ["|U", "Y"], ["O", b"Y"]]) +def test_v2_encode_decode_with_data(dtype_value): + dtype, value = dtype_value + with config.set( + { + "array.v2_default_compressor": { + "string": "vlen-utf8", + "bytes": "vlen-bytes", + }, + } + ): + expected = np.full((3,), value, dtype=dtype) + a = zarr.create( + shape=(3,), + zarr_format=2, + dtype=dtype, + ) + a[:] = expected + data = a[:] + np.testing.assert_equal(data, expected) @pytest.mark.parametrize("dtype", [str, "str"]) @@ -119,10 +142,10 @@ async def test_create_dtype_str(dtype: Any) -> None: arr = zarr.create(shape=3, dtype=dtype, zarr_format=2) assert arr.dtype.kind == "O" assert arr.metadata.to_dict()["dtype"] == "|O" - assert arr.metadata.filters == (numcodecs.vlen.VLenUTF8(),) - arr[:] = ["a", "bb", "ccc"] + assert arr.metadata.filters == (numcodecs.vlen.VLenBytes(),) + arr[:] = [b"a", b"bb", b"ccc"] result = arr[:] - np.testing.assert_array_equal(result, np.array(["a", "bb", "ccc"], dtype="object")) + np.testing.assert_array_equal(result, np.array([b"a", b"bb", b"ccc"], dtype="object")) @pytest.mark.parametrize("filters", [[], [numcodecs.Delta(dtype=" None: + with config.set( + { + "array.v2_default_compressor": { + "numeric": "zstd", + "string": "vlen-utf8", + "bytes": "vlen-bytes", + }, + } + ): + dtype, expected = dtype_expected + arr = zarr.create(shape=(3,), path="foo", store={}, zarr_format=2, dtype=dtype) + assert arr.metadata.filters[0].codec_id == expected From 6930fe85b41739478a23d4bd55439e04d68d71cc Mon Sep 17 00:00:00 2001 From: Dimitri Papadopoulos Orfanos <3234522+DimitriPapadopoulos@users.noreply.github.com> Date: Fri, 20 Dec 2024 03:31:48 +0100 Subject: [PATCH 40/87] Remove temporary doc specific to branch V3 (#2578) --- README-v3.md | 49 ------------------------------------------------- 1 file changed, 49 deletions(-) delete mode 100644 README-v3.md diff --git a/README-v3.md b/README-v3.md deleted file mode 100644 index 598e646377..0000000000 --- a/README-v3.md +++ /dev/null @@ -1,49 +0,0 @@ -# V3 Contributor Guide - -A bare-bones guide to contributing to V3. 
- -Developed for the Feb. 2024 Zarr Sprint. - -## Clone V3 branch - -[Fork](https://github.com/zarr-developers/zarr-python/fork) zarr-python and clone it locally. - -``` -git clone {your remote} -git remote add upstream https://github.com/zarr-developers/zarr-python -git fetch upstream -git checkout --track upstream/v3 -``` - -## Set up your environment - -Zarr uses [hatch](https://hatch.pypa.io/) for its build system. - -``` -mamba install hatch -``` - -or - -``` -pip install hatch -``` - -Then - -``` -hatch env create test -``` - -## Run the Tests - -``` -hatch run test:run -``` - -or - -``` -hatch -e test shell -pytest -v -``` \ No newline at end of file From 6dc6d0785ee7bdb3c07ad0140e7f373505256496 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Fri, 20 Dec 2024 14:10:30 +0100 Subject: [PATCH 41/87] Feat/write empty chunks (#2429) * add write_empty_chunks to config.array namespace * use write_empty_chunks from config in write_batch * implement config-sensitive write_empty_chunks in write_batch, and add a test * add literacy to test * add warnings when write_empty_chunks is used as a kwarg * init * add ArrayConfig * docstring * ignore warning * fix v2 test * add test to ensure that write_empty_chunks can be set via the global config * fix tests * remove write_empty_chunks from Array.create; separate metadata order from config order * remove missing overload * Update src/zarr/core/array.py Co-authored-by: Norman Rzepka * Update src/zarr/core/array.py Co-authored-by: Norman Rzepka --------- Co-authored-by: Norman Rzepka --- src/zarr/api/asynchronous.py | 82 ++++++++++++++++++--------- src/zarr/api/synchronous.py | 28 +++++----- src/zarr/codecs/sharding.py | 8 ++- src/zarr/codecs/transpose.py | 2 +- src/zarr/core/array.py | 98 +++++++++++++++++++++++---------- src/zarr/core/array_spec.py | 89 +++++++++++++++++++++++++++--- src/zarr/core/codec_pipeline.py | 28 ++++++---- src/zarr/core/common.py | 29 ++++++++++ src/zarr/core/config.py | 1 + src/zarr/core/metadata/v2.py | 6 +- src/zarr/core/metadata/v3.py | 9 ++- tests/test_api.py | 37 ++++++++++++- tests/test_array.py | 87 +++++++++++++++++++++++++++-- tests/test_config.py | 1 + tests/test_v2.py | 3 +- 15 files changed, 399 insertions(+), 109 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 8b20676e8b..14078944d7 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -10,6 +10,7 @@ from typing_extensions import deprecated from zarr.core.array import Array, AsyncArray, get_array_metadata +from zarr.core.array_spec import ArrayConfig, ArrayConfigParams from zarr.core.buffer import NDArrayLike from zarr.core.common import ( JSON, @@ -17,6 +18,8 @@ ChunkCoords, MemoryOrder, ZarrFormat, + _warn_order_kwarg, + _warn_write_empty_chunks_kwarg, parse_dtype, ) from zarr.core.config import config @@ -794,7 +797,7 @@ async def create( read_only: bool | None = None, object_codec: Codec | None = None, # TODO: type has changed dimension_separator: Literal[".", "/"] | None = None, - write_empty_chunks: bool = False, # TODO: default has changed + write_empty_chunks: bool | None = None, zarr_version: ZarrFormat | None = None, # deprecated zarr_format: ZarrFormat | None = None, meta_array: Any | None = None, # TODO: need type @@ -810,6 +813,7 @@ async def create( codecs: Iterable[Codec | dict[str, JSON]] | None = None, dimension_names: Iterable[str] | None = None, storage_options: dict[str, Any] | None = None, + config: ArrayConfig | ArrayConfigParams | None = None, **kwargs: Any, ) -> 
AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]:
    """Create an array.

@@ -856,8 +860,10 @@ async def create(
        These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`.
     fill_value : object
         Default value to use for uninitialized portions of the array.
     order : {'C', 'F'}, optional
+        Deprecated in favor of the ``config`` keyword argument.
+        Pass ``{'order': <value>}`` to ``create`` instead of using this parameter.
         Memory layout to be used within each chunk.
-        If not specified, default is taken from the Zarr config ``array.order``.
+        If not specified, the ``array.order`` parameter in the global config will be used.
     store : Store or str
         Store or path to directory in file system or name of zip file.
     synchronizer : object, optional
@@ -891,30 +897,26 @@ async def create(
         Separator placed between the dimensions of a chunk.
         V2 only. V3 arrays should use ``chunk_key_encoding`` instead.
         Default is ".".
-
-        .. versionadded:: 2.8
-
     write_empty_chunks : bool, optional
-        If True (default), all chunks will be stored regardless of their
+        Deprecated in favor of the ``config`` keyword argument.
+        Pass ``{'write_empty_chunks': <value>}`` to ``create`` instead of using this parameter.
+        If True, all chunks will be stored regardless of their
         contents. If False, each chunk is compared to the array's fill value
         prior to storing. If a chunk is uniformly equal to the fill value, then
         that chunk is not stored, and the store entry for that chunk's key
-        is deleted. This setting enables sparser storage, as only chunks with
-        non-fill-value data are stored, at the expense of overhead associated
-        with checking the data of each chunk.
-
-        .. versionadded:: 2.11
-
+        is deleted.
     zarr_format : {2, 3, None}, optional
         The zarr format to use when saving.
         Default is 3.
     meta_array : array-like, optional
         An array instance to use for determining arrays to create and return
         to users. Use `numpy.empty(())` by default.
     storage_options : dict
         If using an fsspec URL to create the store, these will be passed to
         the backend implementation. Ignored otherwise.
+    config : ArrayConfig or ArrayConfigParams, optional
+        Runtime configuration of the array. If provided, will override the
+        default values from `zarr.config.array`.
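+
+        For example, a sketch (with illustrative shape, chunks, and dtype) of
+        setting both runtime options at creation time::
+
+            arr = await zarr.api.asynchronous.create(
+                shape=(100,),
+                chunks=(10,),
+                dtype="i4",
+                config={"order": "F", "write_empty_chunks": True},
+            )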
Returns ------- @@ -951,19 +953,16 @@ async def create( warnings.warn("object_codec is not yet implemented", RuntimeWarning, stacklevel=2) if read_only is not None: warnings.warn("read_only is not yet implemented", RuntimeWarning, stacklevel=2) - if dimension_separator is not None: - if zarr_format == 3: - raise ValueError( - "dimension_separator is not supported for zarr format 3, use chunk_key_encoding instead" - ) - else: - warnings.warn( - "dimension_separator is not yet implemented", - RuntimeWarning, - stacklevel=2, - ) - if write_empty_chunks: - warnings.warn("write_empty_chunks is not yet implemented", RuntimeWarning, stacklevel=2) + if dimension_separator is not None and zarr_format == 3: + raise ValueError( + "dimension_separator is not supported for zarr format 3, use chunk_key_encoding instead" + ) + + if order is not None: + _warn_order_kwarg() + if write_empty_chunks is not None: + _warn_write_empty_chunks_kwarg() + if meta_array is not None: warnings.warn("meta_array is not yet implemented", RuntimeWarning, stacklevel=2) @@ -971,6 +970,30 @@ async def create( if mode is None: mode = "a" store_path = await make_store_path(store, path=path, mode=mode, storage_options=storage_options) + + config_dict: ArrayConfigParams = {} + + if write_empty_chunks is not None: + if config is not None: + msg = ( + "Both write_empty_chunks and config keyword arguments are set. " + "This is redundant. When both are set, write_empty_chunks will be ignored and " + "config will be used." + ) + warnings.warn(UserWarning(msg), stacklevel=1) + config_dict["write_empty_chunks"] = write_empty_chunks + if order is not None: + if config is not None: + msg = ( + "Both order and config keyword arguments are set. " + "This is redundant. When both are set, order will be ignored and " + "config will be used." 
+                )
+                warnings.warn(UserWarning(msg), stacklevel=1)
+            config_dict["order"] = order
+
+    config_parsed = ArrayConfig.from_dict(config_dict)
+
     return await AsyncArray.create(
         store_path,
         shape=shape,
@@ -987,7 +1010,7 @@ async def create(
         codecs=codecs,
         dimension_names=dimension_names,
         attributes=attributes,
-        order=order,
+        config=config_parsed,
         **kwargs,
     )

@@ -1163,6 +1186,11 @@ async def open_array(

     zarr_format = _handle_zarr_version_or_format(zarr_version=zarr_version, zarr_format=zarr_format)

+    if "order" in kwargs:
+        _warn_order_kwarg()
+    if "write_empty_chunks" in kwargs:
+        _warn_write_empty_chunks_kwarg()
+
     try:
         return await AsyncArray.open(store_path, zarr_format=zarr_format)
     except FileNotFoundError:
diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py
index 6ae062865c..cd1ef8b38d 100644
--- a/src/zarr/api/synchronous.py
+++ b/src/zarr/api/synchronous.py
@@ -17,6 +17,7 @@

     from zarr.abc.codec import Codec
     from zarr.api.asynchronous import ArrayLike, PathLike
+    from zarr.core.array_spec import ArrayConfig, ArrayConfigParams
     from zarr.core.buffer import NDArrayLike
     from zarr.core.chunk_key_encodings import ChunkKeyEncoding
     from zarr.core.common import JSON, AccessModeLiteral, ChunkCoords, MemoryOrder, ZarrFormat
@@ -542,7 +543,7 @@ def create(
     read_only: bool | None = None,
     object_codec: Codec | None = None,  # TODO: type has changed
     dimension_separator: Literal[".", "/"] | None = None,
-    write_empty_chunks: bool = False,  # TODO: default has changed
+    write_empty_chunks: bool | None = None,  # TODO: default has changed
     zarr_version: ZarrFormat | None = None,  # deprecated
     zarr_format: ZarrFormat | None = None,
     meta_array: Any | None = None,  # TODO: need type
@@ -558,6 +559,7 @@ def create(
     codecs: Iterable[Codec | dict[str, JSON]] | None = None,
     dimension_names: Iterable[str] | None = None,
     storage_options: dict[str, Any] | None = None,
+    config: ArrayConfig | ArrayConfigParams | None = None,
     **kwargs: Any,
 ) -> Array:
     """Create an array.
@@ -578,8 +580,10 @@ def create(
     fill_value : object
         Default value to use for uninitialized portions of the array.
     order : {'C', 'F'}, optional
+        Deprecated in favor of the ``config`` keyword argument.
+        Pass ``{'order': <value>}`` to ``create`` instead of using this parameter.
         Memory layout to be used within each chunk.
-        Default is set in Zarr's config (`array.order`).
+        If not specified, the ``array.order`` parameter in the global config will be used.
     store : Store or str
         Store or path to directory in file system or name of zip file.
     synchronizer : object, optional
@@ -609,30 +613,25 @@ def create(
         A codec to encode object arrays, only needed if dtype=object.
     dimension_separator : {'.', '/'}, optional
         Separator placed between the dimensions of a chunk.
-
-        .. versionadded:: 2.8
-
     write_empty_chunks : bool, optional
-        If True (default), all chunks will be stored regardless of their
+        Deprecated in favor of the ``config`` keyword argument.
+        Pass ``{'write_empty_chunks': <value>}`` to ``create`` instead of using this parameter.
+        If True, all chunks will be stored regardless of their
         contents. If False, each chunk is compared to the array's fill value
         prior to storing. If a chunk is uniformly equal to the fill value, then
        that chunk is not stored, and the store entry for that chunk's key
-        is deleted. This setting enables sparser storage, as only chunks with
-        non-fill-value data are stored, at the expense of overhead associated
-        with checking the data of each chunk.
-
-        .. versionadded:: 2.11
-
+        is deleted.
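+
+        For example, a sketch of the recommended replacement, passing the
+        same setting through ``config`` instead of this parameter::
+
+            arr = zarr.create(
+                shape=(100,),
+                chunks=(10,),
+                dtype="i4",
+                config={"write_empty_chunks": False},
+            )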
zarr_format : {2, 3, None}, optional The zarr format to use when saving. meta_array : array-like, optional An array instance to use for determining arrays to create and return to users. Use `numpy.empty(())` by default. - - .. versionadded:: 2.13 storage_options : dict If using an fsspec URL to create the store, these will be passed to the backend implementation. Ignored otherwise. + config : ArrayConfig or ArrayConfigParams, optional + Runtime configuration of the array. If provided, will override the + default values from `zarr.config.array`. Returns ------- @@ -669,6 +668,7 @@ def create( codecs=codecs, dimension_names=dimension_names, storage_options=storage_options, + config=config, **kwargs, ) ) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 5372d5ec50..a01145b3b2 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -20,7 +20,7 @@ from zarr.abc.store import ByteGetter, ByteRangeRequest, ByteSetter from zarr.codecs.bytes import BytesCodec from zarr.codecs.crc32c_ import Crc32cCodec -from zarr.core.array_spec import ArraySpec +from zarr.core.array_spec import ArrayConfig, ArraySpec from zarr.core.buffer import ( Buffer, BufferPrototype, @@ -665,7 +665,9 @@ def _get_index_chunk_spec(self, chunks_per_shard: ChunkCoords) -> ArraySpec: shape=chunks_per_shard + (2,), dtype=np.dtype(" ArraySpec: shape=self.chunk_shape, dtype=shard_spec.dtype, fill_value=shard_spec.fill_value, - order=shard_spec.order, + config=shard_spec.config, prototype=shard_spec.prototype, ) diff --git a/src/zarr/codecs/transpose.py b/src/zarr/codecs/transpose.py index 3a471beaf5..1aa1eb40e2 100644 --- a/src/zarr/codecs/transpose.py +++ b/src/zarr/codecs/transpose.py @@ -84,7 +84,7 @@ def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec: shape=tuple(chunk_spec.shape[self.order[i]] for i in range(chunk_spec.ndim)), dtype=chunk_spec.dtype, fill_value=chunk_spec.fill_value, - order=chunk_spec.order, + config=chunk_spec.config, prototype=chunk_spec.prototype, ) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 07ed0e5069..717eff36dc 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -15,6 +15,7 @@ from zarr.abc.store import Store, set_or_delete from zarr.codecs._v2 import V2Codec from zarr.core._info import ArrayInfo +from zarr.core.array_spec import ArrayConfig, ArrayConfigParams, normalize_array_config from zarr.core.attributes import Attributes from zarr.core.buffer import ( BufferPrototype, @@ -37,12 +38,14 @@ MemoryOrder, ShapeLike, ZarrFormat, + _warn_order_kwarg, concurrent_map, parse_dtype, + parse_order, parse_shapelike, product, ) -from zarr.core.config import config, parse_indexing_order +from zarr.core.config import config as zarr_config from zarr.core.indexing import ( BasicIndexer, BasicSelection, @@ -187,8 +190,8 @@ class AsyncArray(Generic[T_ArrayMetadata]): The metadata of the array. store_path : StorePath The path to the Zarr store. - order : {'C', 'F'}, optional - The order of the array data in memory, by default None. + config : ArrayConfig, optional + The runtime configuration of the array, by default None. Attributes ---------- @@ -198,21 +201,21 @@ class AsyncArray(Generic[T_ArrayMetadata]): The path to the Zarr store. codec_pipeline : CodecPipeline The codec pipeline used for encoding and decoding chunks. - order : {'C', 'F'} - The order of the array data in memory. + _config : ArrayConfig + The runtime configuration of the array. 
""" metadata: T_ArrayMetadata store_path: StorePath codec_pipeline: CodecPipeline = field(init=False) - order: MemoryOrder + _config: ArrayConfig @overload def __init__( self: AsyncArray[ArrayV2Metadata], metadata: ArrayV2Metadata | ArrayV2MetadataDict, store_path: StorePath, - order: MemoryOrder | None = None, + config: ArrayConfig | None = None, ) -> None: ... @overload @@ -220,14 +223,14 @@ def __init__( self: AsyncArray[ArrayV3Metadata], metadata: ArrayV3Metadata | ArrayV3MetadataDict, store_path: StorePath, - order: MemoryOrder | None = None, + config: ArrayConfig | None = None, ) -> None: ... def __init__( self, metadata: ArrayMetadata | ArrayMetadataDict, store_path: StorePath, - order: MemoryOrder | None = None, + config: ArrayConfig | None = None, ) -> None: if isinstance(metadata, dict): zarr_format = metadata["zarr_format"] @@ -241,11 +244,12 @@ def __init__( raise ValueError(f"Invalid zarr_format: {zarr_format}. Expected 2 or 3") metadata_parsed = parse_array_metadata(metadata) - order_parsed = parse_indexing_order(order or config.get("array.order")) + + config = ArrayConfig.from_dict({}) if config is None else config object.__setattr__(self, "metadata", metadata_parsed) object.__setattr__(self, "store_path", store_path) - object.__setattr__(self, "order", order_parsed) + object.__setattr__(self, "_config", config) object.__setattr__(self, "codec_pipeline", create_codec_pipeline(metadata=metadata_parsed)) # this overload defines the function signature when zarr_format is 2 @@ -269,6 +273,7 @@ async def create( # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, + config: ArrayConfig | ArrayConfigParams | None = None, ) -> AsyncArray[ArrayV2Metadata]: ... # this overload defines the function signature when zarr_format is 3 @@ -297,9 +302,9 @@ async def create( # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, + config: ArrayConfig | ArrayConfigParams | None = None, ) -> AsyncArray[ArrayV3Metadata]: ... - # this overload is necessary to handle the case where the `zarr_format` kwarg is unspecified @overload @classmethod async def create( @@ -325,8 +330,8 @@ async def create( # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, + config: ArrayConfig | ArrayConfigParams | None = None, ) -> AsyncArray[ArrayV3Metadata]: ... - @overload @classmethod async def create( @@ -358,6 +363,7 @@ async def create( # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, + config: ArrayConfig | ArrayConfigParams | None = None, ) -> AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata]: ... @classmethod @@ -390,6 +396,7 @@ async def create( # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, + config: ArrayConfig | ArrayConfigParams | None = None, ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: """ Method to create a new asynchronous array instance. @@ -439,7 +446,11 @@ async def create( The dimension separator (default is "."). V2 only. V3 arrays should use ``chunk_key_encoding`` instead. order : Literal["C", "F"], optional - The order of the array (default is specified by ``array.order`` in :mod:`zarr.core.config`). + The memory of the array (default is "C"). + If ``zarr_format`` is 2, this parameter sets the memory order of the array. + If `zarr_format`` is 3, then this parameter is deprecated, because memory order + is a runtime parameter for Zarr 3 arrays. The recommended way to specify the memory + order for Zarr 3 arrays is via the ``config`` parameter, e.g. ``{'config': 'C'}``. 
filters : list[dict[str, JSON]], optional
            Sequence of filters to use to encode chunk data prior to compression.
            V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor``
@@ -491,6 +502,7 @@ async def create(
            _chunks = normalize_chunks(chunks, shape, dtype_parsed.itemsize)
        else:
            _chunks = normalize_chunks(chunk_shape, shape, dtype_parsed.itemsize)
+        config_parsed = normalize_array_config(config)

        result: AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata]
        if zarr_format == 3:
@@ -506,6 +518,10 @@ async def create(
                raise ValueError(
                    "compressor cannot be used for arrays with version 3. Use bytes-to-bytes codecs instead."
                )
+
+            if order is not None:
+                _warn_order_kwarg()
+
            result = await cls._create_v3(
                store_path,
                shape=shape,
@@ -517,7 +533,7 @@ async def create(
                dimension_names=dimension_names,
                attributes=attributes,
                overwrite=overwrite,
-                order=order,
+                config=config_parsed,
            )
        elif zarr_format == 2:
            if codecs is not None:
@@ -530,6 +546,12 @@ async def create(
                )
            if dimension_names is not None:
                raise ValueError("dimension_names cannot be used for arrays with version 2.")
+
+            if order is None:
+                order_parsed = parse_order(zarr_config.get("array.order"))
+            else:
+                order_parsed = order
+
            result = await cls._create_v2(
                store_path,
                shape=shape,
@@ -537,7 +559,8 @@ async def create(
                chunks=_chunks,
                dimension_separator=dimension_separator,
                fill_value=fill_value,
-                order=order,
+                order=order_parsed,
+                config=config_parsed,
                filters=filters,
                compressor=compressor,
                attributes=attributes,
@@ -560,8 +583,8 @@ async def _create_v3(
        shape: ShapeLike,
        dtype: np.dtype[Any],
        chunk_shape: ChunkCoords,
+        config: ArrayConfig,
        fill_value: Any | None = None,
-        order: MemoryOrder | None = None,
        chunk_key_encoding: (
            ChunkKeyEncoding
            | tuple[Literal["default"], Literal[".", "/"]]
@@ -614,7 +637,7 @@ async def _create_v3(
            attributes=attributes or {},
        )

-        array = cls(metadata=metadata, store_path=store_path, order=order)
+        array = cls(metadata=metadata, store_path=store_path, config=config)
        await array._save_metadata(metadata, ensure_parents=True)
        return array

@@ -626,9 +649,10 @@ async def _create_v2(
        shape: ChunkCoords,
        dtype: np.dtype[Any],
        chunks: ChunkCoords,
+        order: MemoryOrder,
+        config: ArrayConfig,
        dimension_separator: Literal[".", "/"] | None = None,
        fill_value: float | None = None,
-        order: MemoryOrder | None = None,
        filters: list[dict[str, JSON]] | None = None,
        compressor: dict[str, JSON] | None = None,
        attributes: dict[str, JSON] | None = None,
@@ -642,9 +666,6 @@ async def _create_v2(
        else:
            await ensure_no_existing_node(store_path, zarr_format=2)

-        if order is None:
-            order = parse_indexing_order(config.get("array.order"))
-
        if dimension_separator is None:
            dimension_separator = "."

@@ -667,7 +688,7 @@ async def _create_v2(
            filters=filters,
            attributes=attributes,
        )
-        array = cls(metadata=metadata, store_path=store_path, order=order)
+        array = cls(metadata=metadata, store_path=store_path, config=config)
        await array._save_metadata(metadata, ensure_parents=True)
        return array

@@ -806,6 +827,17 @@ def dtype(self) -> np.dtype[Any]:
        """
        return self.metadata.dtype

+    @property
+    def order(self) -> MemoryOrder:
+        """Returns the memory order of the array.
+
+        Returns
+        -------
+        MemoryOrder
+            Memory order of the array
+        """
+        return self._config.order
+
    @property
    def attrs(self) -> dict[str, JSON]:
        """Returns the attributes of the array.
@@ -1036,7 +1068,7 @@ async def _get_selection(
        out_buffer = prototype.nd_buffer.create(
            shape=indexer.shape,
            dtype=out_dtype,
-            order=self.order,
+            order=self._config.order,
            fill_value=self.metadata.fill_value,
        )
        if product(indexer.shape) > 0:
@@ -1045,7 +1077,9 @@ async def _get_selection(
                [
                    (
                        self.store_path / self.metadata.encode_chunk_key(chunk_coords),
-                        self.metadata.get_chunk_spec(chunk_coords, self.order, prototype=prototype),
+                        self.metadata.get_chunk_spec(
+                            chunk_coords, self._config, prototype=prototype
+                        ),
                        chunk_selection,
                        out_selection,
                    )
@@ -1167,7 +1201,7 @@ async def _set_selection(
                [
                    (
                        self.store_path / self.metadata.encode_chunk_key(chunk_coords),
-                        self.metadata.get_chunk_spec(chunk_coords, self.order, prototype),
+                        self.metadata.get_chunk_spec(chunk_coords, self._config, prototype),
                        chunk_selection,
                        out_selection,
                    )
@@ -1270,7 +1304,7 @@ async def _delete_key(key: str) -> None:
                for chunk_coords in old_chunk_coords.difference(new_chunk_coords)
            ],
            _delete_key,
-            config.get("async.concurrency"),
+            zarr_config.get("async.concurrency"),
        )

        # Write new metadata
@@ -1503,6 +1537,7 @@ def create(
        compressor: dict[str, JSON] | None = None,
        # runtime
        overwrite: bool = False,
+        config: ArrayConfig | ArrayConfigParams | None = None,
    ) -> Array:
        """Creates a new Array instance from an initialized store.

@@ -1545,7 +1580,11 @@ def create(
            The dimension separator (default is ".").
            V2 only. V3 arrays should use ``chunk_key_encoding`` instead.
        order : Literal["C", "F"], optional
-            The order of the array (default is specified by ``array.order`` in :mod:`zarr.core.config`).
+            The memory order of the array (default is "C").
+            If ``zarr_format`` is 2, this parameter sets the memory order of the array.
+            If ``zarr_format`` is 3, then this parameter is deprecated, because memory order
+            is a runtime parameter for Zarr 3 arrays. The recommended way to specify the memory
+            order for Zarr 3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``.
        filters : list[dict[str, JSON]], optional
            Sequence of filters to use to encode chunk data prior to compression.
            V2 only. V3 arrays should use ``codecs`` instead.
If neither ``compressor``
+            nor ``filters`` are provided, a default compressor will be used (see
+            ``compressor`` for details).
        compressor : dict[str, JSON], optional
            The compressor used to compress the data (default is None).
            V2 only. V3 arrays should use ``codecs`` instead.
+
+            If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used:
+
+            - For numeric arrays, the default is ``ZstdCodec``.
+            - For Unicode strings, the default is ``VLenUTF8Codec``.
+            - For bytes or objects, the default is ``VLenBytesCodec``.
+
+            These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`.
        overwrite : bool, optional
            Whether to raise an error if the store already exists (default is False).
@@ -1588,6 +1627,7 @@ def create(
                filters=filters,
                compressor=compressor,
                overwrite=overwrite,
+                config=config,
            ),
        )
        return cls(async_array)
@@ -3399,7 +3439,7 @@ def _build_parents(
 def _get_default_codecs(
     np_dtype: np.dtype[Any],
 ) -> list[dict[str, JSON]]:
-    default_codecs = config.get("array.v3_default_codecs")
+    default_codecs = zarr_config.get("array.v3_default_codecs")
     dtype = DataType.from_numpy(np_dtype)
     if dtype == DataType.string:
         dtype_key = "string"
diff --git a/src/zarr/core/array_spec.py b/src/zarr/core/array_spec.py
index c4d9c363fa..ee6934d05f 100644
--- a/src/zarr/core/array_spec.py
+++ b/src/zarr/core/array_spec.py
@@ -1,23 +1,95 @@
 from __future__ import annotations

-from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any
+from dataclasses import dataclass, fields
+from typing import TYPE_CHECKING, Any, Literal, Self, TypedDict, cast

 import numpy as np

-from zarr.core.common import MemoryOrder, parse_fill_value, parse_order, parse_shapelike
+from zarr.core.common import (
+    MemoryOrder,
+    parse_bool,
+    parse_fill_value,
+    parse_order,
+    parse_shapelike,
+)
+from zarr.core.config import config as zarr_config

 if TYPE_CHECKING:
+    from typing import NotRequired
+
     from zarr.core.buffer import BufferPrototype
     from zarr.core.common import ChunkCoords


+class ArrayConfigParams(TypedDict):
+    """
+    A TypedDict model of the attributes of an ArrayConfig class, but with no required fields.
+    This allows for partial construction of an ArrayConfig, with the assumption that the unset
+    keys will be taken from a global configuration.
+    """
+
+    order: NotRequired[MemoryOrder]
+    write_empty_chunks: NotRequired[bool]
+
+
+@dataclass(frozen=True)
+class ArrayConfig:
+    """
+    A model of the runtime configuration of an array.
+
+    Parameters
+    ----------
+    order : MemoryOrder
+        The memory layout of the arrays returned when reading data from the store.
+    write_empty_chunks : bool
+        If True, empty chunks will be written to the store.
+    """
+
+    order: MemoryOrder
+    write_empty_chunks: bool
+
+    def __init__(self, order: MemoryOrder, write_empty_chunks: bool) -> None:
+        order_parsed = parse_order(order)
+        write_empty_chunks_parsed = parse_bool(write_empty_chunks)
+
+        object.__setattr__(self, "order", order_parsed)
+        object.__setattr__(self, "write_empty_chunks", write_empty_chunks_parsed)
+
+    @classmethod
+    def from_dict(cls, data: ArrayConfigParams) -> Self:
+        """
+        Create an ArrayConfig from a dict. The keys of that dict are a subset of the
+        attributes of the ArrayConfig class. Any keys missing from that dict will be set to
+        the values in the ``array`` namespace of ``zarr.config``.
+        """
+        kwargs_out: ArrayConfigParams = {}
+        for f in fields(ArrayConfig):
+            field_name = cast(Literal["order", "write_empty_chunks"], f.name)
+            if field_name not in data:
+                kwargs_out[field_name] = zarr_config.get(f"array.{field_name}")
+            else:
+                kwargs_out[field_name] = data[field_name]
+        return cls(**kwargs_out)
+
+
+def normalize_array_config(data: ArrayConfig | ArrayConfigParams | None) -> ArrayConfig:
+    """
+    Convert various types of data to an ArrayConfig.
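+
+    A sketch of the three accepted inputs::
+
+        normalize_array_config(None)                    # all values from zarr.config defaults
+        normalize_array_config({"order": "F"})          # partial dict; missing keys from config
+        normalize_array_config(ArrayConfig("C", True))  # an ArrayConfig passes through unchanged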
+ """ + if data is None: + return ArrayConfig.from_dict({}) + elif isinstance(data, ArrayConfig): + return data + else: + return ArrayConfig.from_dict(data) + + @dataclass(frozen=True) class ArraySpec: shape: ChunkCoords dtype: np.dtype[Any] fill_value: Any - order: MemoryOrder + config: ArrayConfig prototype: BufferPrototype def __init__( @@ -25,20 +97,23 @@ def __init__( shape: ChunkCoords, dtype: np.dtype[Any], fill_value: Any, - order: MemoryOrder, + config: ArrayConfig, prototype: BufferPrototype, ) -> None: shape_parsed = parse_shapelike(shape) dtype_parsed = np.dtype(dtype) fill_value_parsed = parse_fill_value(fill_value) - order_parsed = parse_order(order) object.__setattr__(self, "shape", shape_parsed) object.__setattr__(self, "dtype", dtype_parsed) object.__setattr__(self, "fill_value", fill_value_parsed) - object.__setattr__(self, "order", order_parsed) + object.__setattr__(self, "config", config) object.__setattr__(self, "prototype", prototype) @property def ndim(self) -> int: return len(self.shape) + + @property + def order(self) -> MemoryOrder: + return self.config.order diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 038a2eeac2..5a1f069823 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -360,7 +360,7 @@ async def _read_key( _read_key, config.get("async.concurrency"), ) - chunk_array_batch = await self.decode_batch( + chunk_array_decoded = await self.decode_batch( [ (chunk_bytes, chunk_spec) for chunk_bytes, (_, chunk_spec, _, _) in zip( @@ -369,23 +369,27 @@ async def _read_key( ], ) - chunk_array_batch = [ + chunk_array_merged = [ self._merge_chunk_array( chunk_array, value, out_selection, chunk_spec, chunk_selection, drop_axes ) for chunk_array, (_, chunk_spec, chunk_selection, out_selection) in zip( - chunk_array_batch, batch_info, strict=False - ) - ] - - chunk_array_batch = [ - None - if chunk_array is None or chunk_array.all_equal(chunk_spec.fill_value) - else chunk_array - for chunk_array, (_, chunk_spec, _, _) in zip( - chunk_array_batch, batch_info, strict=False + chunk_array_decoded, batch_info, strict=False ) ] + chunk_array_batch: list[NDBuffer | None] = [] + for chunk_array, (_, chunk_spec, _, _) in zip( + chunk_array_merged, batch_info, strict=False + ): + if chunk_array is None: + chunk_array_batch.append(None) # type: ignore[unreachable] + else: + if not chunk_spec.config.write_empty_chunks and chunk_array.all_equal( + chunk_spec.fill_value + ): + chunk_array_batch.append(None) + else: + chunk_array_batch.append(chunk_array) chunk_bytes_batch = await self.encode_batch( [ diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index a4bf33451c..3db00b1a06 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -3,6 +3,7 @@ import asyncio import functools import operator +import warnings from collections.abc import Iterable, Mapping from enum import Enum from itertools import starmap @@ -160,6 +161,12 @@ def parse_order(data: Any) -> Literal["C", "F"]: raise ValueError(f"Expected one of ('C', 'F'), got {data} instead.") +def parse_bool(data: Any) -> bool: + if isinstance(data, bool): + return data + raise ValueError(f"Expected bool, got {data} instead.") + + def parse_dtype(dtype: Any, zarr_format: ZarrFormat) -> np.dtype[Any]: if dtype is str or dtype == "str": if zarr_format == 2: @@ -168,3 +175,25 @@ def parse_dtype(dtype: Any, zarr_format: ZarrFormat) -> np.dtype[Any]: else: return _STRING_DTYPE return np.dtype(dtype) + + +def _warn_write_empty_chunks_kwarg() -> 
None:
+    # TODO: link to docs page on array configuration in this message
+    msg = (
+        "The `write_empty_chunks` keyword argument is deprecated and will be removed in future versions. "
+        "To control whether empty chunks are written to storage, either use the `config` keyword "
+        "argument, as in `config={'write_empty_chunks': True}`, "
+        "or change the global 'array.write_empty_chunks' configuration variable."
+    )
+    warnings.warn(msg, RuntimeWarning, stacklevel=2)
+
+
+def _warn_order_kwarg() -> None:
+    # TODO: link to docs page on array configuration in this message
+    msg = (
+        "The `order` keyword argument has no effect for zarr v3 arrays. "
+        "To control the memory layout of the array, either use the `config` keyword "
+        "argument, as in `config={'order': 'C'}`, "
+        "or change the global 'array.order' configuration variable."
+    )
+    warnings.warn(msg, RuntimeWarning, stacklevel=2)
diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py
index 1feb4a6c2f..a14305aef8 100644
--- a/src/zarr/core/config.py
+++ b/src/zarr/core/config.py
@@ -65,6 +65,7 @@ def reset(self) -> None:
         "default_zarr_version": 3,
         "array": {
             "order": "C",
+            "write_empty_chunks": False,
             "v2_default_compressor": {
                 "numeric": "zstd",
                 "string": "vlen-utf8",
diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py
index bd0fbecf4a..bf6b576edd 100644
--- a/src/zarr/core/metadata/v2.py
+++ b/src/zarr/core/metadata/v2.py
@@ -22,7 +22,7 @@
 import numcodecs
 import numpy as np
 
-from zarr.core.array_spec import ArraySpec
+from zarr.core.array_spec import ArrayConfig, ArraySpec
 from zarr.core.chunk_grids import RegularChunkGrid
 from zarr.core.chunk_key_encodings import parse_separator
 from zarr.core.common import ZARRAY_JSON, ZATTRS_JSON, MemoryOrder, parse_shapelike
@@ -186,13 +186,13 @@ def to_dict(self) -> dict[str, JSON]:
         return zarray_dict
 
     def get_chunk_spec(
-        self, _chunk_coords: ChunkCoords, order: MemoryOrder, prototype: BufferPrototype
+        self, _chunk_coords: ChunkCoords, array_config: ArrayConfig, prototype: BufferPrototype
    ) -> ArraySpec:
         return ArraySpec(
             shape=self.chunks,
             dtype=self.dtype,
             fill_value=self.fill_value,
-            order=order,
+            config=array_config,
             prototype=prototype,
         )
 
diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py
index 8dcceb7f31..4cf5860ffd 100644
--- a/src/zarr/core/metadata/v3.py
+++ b/src/zarr/core/metadata/v3.py
@@ -24,14 +24,13 @@
 import numpy.typing as npt
 
 from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec
-from zarr.core.array_spec import ArraySpec
+from zarr.core.array_spec import ArrayConfig, ArraySpec
 from zarr.core.chunk_grids import ChunkGrid, RegularChunkGrid
 from zarr.core.chunk_key_encodings import ChunkKeyEncoding
 from zarr.core.common import (
     JSON,
     ZARR_JSON,
     ChunkCoords,
-    MemoryOrder,
     parse_named_configuration,
     parse_shapelike,
 )
@@ -252,7 +251,7 @@ def __init__(
             shape=shape_parsed,
             dtype=data_type_parsed.to_numpy(),
             fill_value=fill_value_parsed,
-            order="C",  # TODO: order is not needed here.
+            config=ArrayConfig.from_dict({}),  # TODO: config is not needed here.
             prototype=default_buffer_prototype(),  # TODO: prototype is not needed here. 
) codecs_parsed = [c.evolve_from_array_spec(array_spec) for c in codecs_parsed_partial] @@ -298,7 +297,7 @@ def ndim(self) -> int: return len(self.shape) def get_chunk_spec( - self, _chunk_coords: ChunkCoords, order: MemoryOrder, prototype: BufferPrototype + self, _chunk_coords: ChunkCoords, array_config: ArrayConfig, prototype: BufferPrototype ) -> ArraySpec: assert isinstance( self.chunk_grid, RegularChunkGrid @@ -307,7 +306,7 @@ def get_chunk_spec( shape=self.chunk_grid.chunk_shape, dtype=self.dtype, fill_value=self.fill_value, - order=order, + config=array_config, prototype=prototype, ) diff --git a/tests/test_api.py b/tests/test_api.py index f98565ad68..d25ec54bfe 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -56,6 +56,21 @@ def test_create_array(memory_store: Store) -> None: z = create(shape=(400, 100), chunks=(16, 16.5), store=store, overwrite=True) # type: ignore [arg-type] +@pytest.mark.parametrize("write_empty_chunks", [True, False]) +def test_write_empty_chunks_warns(write_empty_chunks: bool) -> None: + """ + Test that using the `write_empty_chunks` kwarg on array access will raise a warning. + """ + match = "The `write_empty_chunks` keyword argument .*" + with pytest.warns(RuntimeWarning, match=match): + _ = zarr.array( + data=np.arange(10), shape=(10,), dtype="uint8", write_empty_chunks=write_empty_chunks + ) + + with pytest.warns(RuntimeWarning, match=match): + _ = zarr.create(shape=(10,), dtype="uint8", write_empty_chunks=write_empty_chunks) + + @pytest.mark.parametrize("path", ["foo", "/", "/foo", "///foo/bar"]) @pytest.mark.parametrize("node_type", ["array", "group"]) def test_open_normalized_path( @@ -245,10 +260,26 @@ def test_open_with_mode_w_minus(tmp_path: pathlib.Path) -> None: zarr.open(store=tmp_path, mode="w-") -@pytest.mark.parametrize("order", ["C", "F", None]) @pytest.mark.parametrize("zarr_format", [2, 3]) -def test_array_order(order: MemoryOrder | None, zarr_format: ZarrFormat) -> None: - arr = zarr.ones(shape=(2, 2), order=order, zarr_format=zarr_format) +def test_array_order(zarr_format: ZarrFormat) -> None: + arr = zarr.ones(shape=(2, 2), order=None, zarr_format=zarr_format) + expected = zarr.config.get("array.order") + assert arr.order == expected + + vals = np.asarray(arr) + if expected == "C": + assert vals.flags.c_contiguous + elif expected == "F": + assert vals.flags.f_contiguous + else: + raise AssertionError + + +@pytest.mark.parametrize("order", ["C", "F"]) +@pytest.mark.parametrize("zarr_format", [2, 3]) +def test_array_order_warns(order: MemoryOrder | None, zarr_format: ZarrFormat) -> None: + with pytest.warns(RuntimeWarning, match="The `order` keyword argument .*"): + arr = zarr.ones(shape=(2, 2), order=order, zarr_format=zarr_format) expected = order or zarr.config.get("array.order") assert arr.order == expected diff --git a/tests/test_array.py b/tests/test_array.py index c89b6187c3..1899e384dc 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -748,14 +748,42 @@ def test_append_bad_shape(store: MemoryStore, zarr_format: ZarrFormat) -> None: @pytest.mark.parametrize("order", ["C", "F", None]) -@pytest.mark.parametrize("zarr_format", [2, 3]) @pytest.mark.parametrize("store", ["memory"], indirect=True) -def test_array_create_order( +def test_array_create_metadata_order_v2( order: MemoryOrder | None, zarr_format: int, store: MemoryStore ) -> None: - arr = Array.create(store=store, shape=(2, 2), order=order, zarr_format=zarr_format, dtype="i4") + """ + Test that the ``order`` attribute in zarr v2 array metadata is set 
correctly via the ``order`` + keyword argument to ``Array.create``. When ``order`` is ``None``, the value of the + ``array.order`` config is used. + """ + arr = Array.create(store=store, shape=(2, 2), order=order, zarr_format=2, dtype="i4") + expected = order or zarr.config.get("array.order") - assert arr.order == expected + assert arr.metadata.order == expected # type: ignore[union-attr] + + +@pytest.mark.parametrize("order_config", ["C", "F", None]) +@pytest.mark.parametrize("store", ["memory"], indirect=True) +def test_array_create_order( + order_config: MemoryOrder | None, + zarr_format: int, + store: MemoryStore, +) -> None: + """ + Test that the arrays generated by array indexing have a memory order defined by the config order + value + """ + if order_config is None: + config = {} + expected = zarr.config.get("array.order") + else: + config = {"order": order_config} + expected = order_config + + arr = Array.create( + store=store, shape=(2, 2), zarr_format=zarr_format, dtype="i4", config=config + ) vals = np.asarray(arr) if expected == "C": @@ -766,6 +794,57 @@ def test_array_create_order( raise AssertionError +@pytest.mark.parametrize("write_empty_chunks", [True, False]) +def test_write_empty_chunks_config(write_empty_chunks: bool) -> None: + """ + Test that the value of write_empty_chunks is sensitive to the global config when not set + explicitly + """ + with zarr.config.set({"array.write_empty_chunks": write_empty_chunks}): + arr = Array.create({}, shape=(2, 2), dtype="i4") + assert arr._async_array._config.write_empty_chunks == write_empty_chunks + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +@pytest.mark.parametrize("write_empty_chunks", [True, False]) +@pytest.mark.parametrize("fill_value", [0, 5]) +def test_write_empty_chunks_behavior( + zarr_format: ZarrFormat, store: MemoryStore, write_empty_chunks: bool, fill_value: int +) -> None: + """ + Check that the write_empty_chunks value of the config is applied correctly. We expect that + when write_empty_chunks is True, writing chunks equal to the fill value will result in + those chunks appearing in the store. + + When write_empty_chunks is False, writing chunks that are equal to the fill value will result in + those chunks not being present in the store. In particular, they should be deleted if they were + already present. 
+ """ + + arr = Array.create( + store=store, + shape=(2,), + zarr_format=zarr_format, + dtype="i4", + fill_value=fill_value, + chunk_shape=(1,), + config={"write_empty_chunks": write_empty_chunks}, + ) + + assert arr._async_array._config.write_empty_chunks == write_empty_chunks + + # initialize the store with some non-fill value chunks + arr[:] = fill_value + 1 + assert arr.nchunks_initialized == arr.nchunks + + arr[:] = fill_value + + if not write_empty_chunks: + assert arr.nchunks_initialized == 0 + else: + assert arr.nchunks_initialized == arr.nchunks + + @pytest.mark.parametrize( ("fill_value", "expected"), [ diff --git a/tests/test_config.py b/tests/test_config.py index 8dd15fb75b..ea8e70a994 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -52,6 +52,7 @@ def test_config_defaults_set() -> None: "default_zarr_version": 3, "array": { "order": "C", + "write_empty_chunks": False, "v2_default_compressor": { "numeric": "zstd", "string": "vlen-utf8", diff --git a/tests/test_v2.py b/tests/test_v2.py index ef06c13e26..80897db8e5 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -152,7 +152,8 @@ async def test_create_dtype_str(dtype: Any) -> None: @pytest.mark.parametrize("order", ["C", "F"]) def test_v2_filters_codecs(filters: Any, order: Literal["C", "F"]) -> None: array_fixture = [42] - arr = zarr.create(shape=1, dtype=" Date: Fri, 20 Dec 2024 21:59:00 +0100 Subject: [PATCH 42/87] Multiple imports for an import name (#2580) --- src/zarr/core/metadata/v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index bf6b576edd..af26034b1d 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -4,7 +4,7 @@ from collections.abc import Iterable from enum import Enum from functools import cached_property -from typing import TYPE_CHECKING, Any, TypedDict, cast +from typing import TYPE_CHECKING, TypedDict, cast from zarr.abc.metadata import Metadata From 1ac02ea80ea8f227b67a8955102eb7925ee71244 Mon Sep 17 00:00:00 2001 From: Hannes Spitz <44113112+brokkoli71@users.noreply.github.com> Date: Sat, 28 Dec 2024 17:26:32 +0100 Subject: [PATCH 43/87] test and fix indexing for scalar arrays (#2583) * test and fix indexing for scalar arrays * fix mypy --- src/zarr/api/asynchronous.py | 2 +- tests/test_array.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 14078944d7..c4d1ec8627 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -579,7 +579,7 @@ async def array( z = await create(**kwargs) # fill with data - await z.setitem(slice(None), data) + await z.setitem(Ellipsis, data) return z diff --git a/tests/test_array.py b/tests/test_array.py index 1899e384dc..891538bc43 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -881,3 +881,10 @@ async def test_nbytes( assert arr._async_array.nbytes == np.prod(arr.shape) * arr.dtype.itemsize else: assert arr.nbytes == np.prod(arr.shape) * arr.dtype.itemsize + + +async def test_scalar_array() -> None: + arr = zarr.array(1.5) + assert arr[...] 
== 1.5 + assert arr[()] == 1.5 + assert arr.shape == () From 190b8672bae220755d136e6b8c51551124331dd2 Mon Sep 17 00:00:00 2001 From: David Stansby Date: Tue, 31 Dec 2024 15:20:14 +0000 Subject: [PATCH 44/87] Un-mark memory store as "for testing" (#2601) --- src/zarr/storage/memory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/storage/memory.py b/src/zarr/storage/memory.py index 74d7758863..1f8dd75768 100644 --- a/src/zarr/storage/memory.py +++ b/src/zarr/storage/memory.py @@ -19,7 +19,7 @@ class MemoryStore(Store): """ - In-memory store for testing purposes. + In-memory store. Parameters ---------- From 2998561337de1c0581250183dc7a8307303a74bd Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Wed, 1 Jan 2025 22:36:19 +0100 Subject: [PATCH 45/87] remove test.py (#2612) --- test.py | 7 ------- 1 file changed, 7 deletions(-) delete mode 100644 test.py diff --git a/test.py b/test.py deleted file mode 100644 index 29dac92c8b..0000000000 --- a/test.py +++ /dev/null @@ -1,7 +0,0 @@ -import zarr - -store = zarr.DirectoryStore("data") -r = zarr.open_group(store=store) -z = r.full("myArray", 42, shape=(), dtype="i4", compressor=None) - -print(z.oindex[...]) From b9699f5c5a9b1f76a7509c333277334dbc2d415d Mon Sep 17 00:00:00 2001 From: David Stansby Date: Thu, 2 Jan 2025 15:17:35 +0000 Subject: [PATCH 46/87] Note that whole directories can be deleted in LocalStore (#2606) --- src/zarr/storage/local.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/zarr/storage/local.py b/src/zarr/storage/local.py index f9b1747c31..f4226792cb 100644 --- a/src/zarr/storage/local.py +++ b/src/zarr/storage/local.py @@ -189,6 +189,18 @@ async def set_partial_values( await concurrent_map(args, asyncio.to_thread, limit=None) # TODO: fix limit async def delete(self, key: str) -> None: + """ + Remove a key from the store. + + Parameters + ---------- + key : str + + Notes + ----- + If ``key`` is a directory within this store, the entire directory + at ``store.root / key`` is deleted. 
+ """ # docstring inherited self._check_writable() path = self.root / key From 25355036835a91b82fff1b816f647785b5ee6521 Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Thu, 2 Jan 2025 09:20:09 -0800 Subject: [PATCH 47/87] fix: run-coverage command now tracks src directory (#2615) --- pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 75bbbf15d3..a92c30ab9f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -136,8 +136,8 @@ numpy = ["1.25", "2.1"] features = ["gpu"] [tool.hatch.envs.test.scripts] -run-coverage = "pytest --cov-config=pyproject.toml --cov=pkg --cov=tests" -run-coverage-gpu = "pip install cupy-cuda12x && pytest -m gpu --cov-config=pyproject.toml --cov=pkg --cov=tests" +run-coverage = "pytest --cov-config=pyproject.toml --cov=pkg --cov=src" +run-coverage-gpu = "pip install cupy-cuda12x && pytest -m gpu --cov-config=pyproject.toml --cov=pkg --cov=src" run = "run-coverage --no-cov" run-verbose = "run-coverage --verbose" run-mypy = "mypy src" @@ -157,7 +157,7 @@ numpy = ["1.25", "2.1"] version = ["minimal"] [tool.hatch.envs.gputest.scripts] -run-coverage = "pytest -m gpu --cov-config=pyproject.toml --cov=pkg --cov=tests" +run-coverage = "pytest -m gpu --cov-config=pyproject.toml --cov=pkg --cov=src" run = "run-coverage --no-cov" run-verbose = "run-coverage --verbose" run-mypy = "mypy src" From f407a41f7cb2ffa408211b252f95fe8d407ed02c Mon Sep 17 00:00:00 2001 From: David Stansby Date: Thu, 2 Jan 2025 18:07:28 +0000 Subject: [PATCH 48/87] Don't document zarr.codec submodules (#2605) --- docs/conf.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/docs/conf.py b/docs/conf.py index 8b22e33c6d..53fba058e7 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -17,6 +17,7 @@ import sys from typing import Any +import sphinx import sphinx.application from importlib.metadata import version as get_version @@ -60,6 +61,20 @@ autoapi_keep_files = True autoapi_options = [ 'members', 'undoc-members', 'show-inheritance', 'show-module-summary', 'imported-members', ] +def skip_submodules( + app: sphinx.application.Sphinx, + what: str, + name: str, + obj: object, + skip: bool, + options: dict[str, Any] + ) -> bool: + # Skip documenting zarr.codecs submodules + # codecs are documented in the main zarr.codecs namespace + if what == "module" and name.startswith("zarr.codecs."): + skip = True + return skip + # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] @@ -179,6 +194,7 @@ def setup(app: sphinx.application.Sphinx) -> None: app.add_css_file("custom.css") + app.connect("autoapi-skip-member", skip_submodules) # The name of an image file (relative to this directory) to use as a favicon of From 79078522b6053fff845b7d0562fedf3dc5a94727 Mon Sep 17 00:00:00 2001 From: David Stansby Date: Thu, 2 Jan 2025 18:53:17 +0000 Subject: [PATCH 49/87] Improve deprecation of zarr.creation and zarr.convenience (#2609) --- src/zarr/convenience.py | 12 +++++++++++- src/zarr/creation.py | 12 +++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/src/zarr/convenience.py b/src/zarr/convenience.py index 2551d455a4..88f10663b7 100644 --- a/src/zarr/convenience.py +++ b/src/zarr/convenience.py @@ -1,3 +1,12 @@ +""" +Convenience helpers. + +.. warning:: + + This sub-module is deprecated. All functions here are defined + in the top level zarr namespace instead. 
+""" + import warnings from zarr.api.synchronous import ( @@ -29,7 +38,8 @@ ] warnings.warn( - "zarr.convenience is deprecated, use zarr.api.synchronous", + "zarr.convenience is deprecated. " + "Import these functions from the top level zarr. namespace instead.", DeprecationWarning, stacklevel=2, ) diff --git a/src/zarr/creation.py b/src/zarr/creation.py index 63f93ba6f6..8197c4950c 100644 --- a/src/zarr/creation.py +++ b/src/zarr/creation.py @@ -1,3 +1,12 @@ +""" +Helpers for creating arrays. + +.. warning:: + + This sub-module is deprecated. All functions here are defined + in the top level zarr namespace instead. +""" + import warnings from zarr.api.synchronous import ( @@ -31,7 +40,8 @@ ] warnings.warn( - "zarr.creation is deprecated, use zarr.api.synchronous", + "zarr.creation is deprecated. " + "Import these functions from the top level zarr. namespace instead.", DeprecationWarning, stacklevel=2, ) From cc4dff94b35a7b50cedc5364b3c8e15615a8876f Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Thu, 2 Jan 2025 23:18:14 +0100 Subject: [PATCH 50/87] top-level functions for reading, creating data (#2463) * add functions for easy read-only data access * sync funcs * make read-only funcs top-level exports * add create_array, create_group, and tests * add top-level imports * add test for top-level exports * add test for read * add asserts * Apply suggestions from code review * handle sharding in create_array * tweak * make logic of _auto_partition better for shard shape * add dtype parsing, and tweak auto_partitioning func * sketch of docstring; remove auto chunks / shard shape * tweak docstring * docstrings * ensure tests pass * tuple -> list * allow data in create_array * docstring * remove auto_partition * make shape shapelike * use create_array everywhere in group class * remove readers * fix dodgy imports * compressors -> compression, auto chunking, auto sharding, auto compression, auto filters * use sane shard shape when there are too few chunks * fix: allow user-specified filters and compression * np.dtype[np.generic] -> np.dtype[Any] * handle singleton compressor / filters input * default codec config now uses the full config dict * test for auto sharding * test * adds a shards property * add (typed) functions for resolving codecs * better codec parsing * add warning if auto sharding is used * remove read_array * rename compression to compressors, and make the docstring for create_array more clear on what filters and compressors mean * compression -> compressors, shard_shape -> shards, chunk_shape -> chunks * use typerror instead of valuerror; docstring * default order is None * fix circular dep * format * fix some tests * use filters=auto and compressors=auto in Group.create_array * compression -> compressors * Update src/zarr/core/group.py Co-authored-by: Norman Rzepka * fix mypy * narrow type of filters param and compression param * remove data kwarg to create_array * mypy fixes * ensure that we accept dict form of compressor in _parse_chunk_encoding_v2 * fix properties test * add tests for compressors and filters kwargs to create_array * add tests for codec inference * add test for illegal shards kwarg for v2 arrays * remove redundant test function * tests and types * rm print * types * resolve cyclic import * add create_array to async and sync API * docs for create_array * rename (Async)Array.create to _create * adds array_bytes_codec kwarg * tests * tests for no filters+compressors * widen type of FiltersParam to include single numcodecs codec instances * don't alias None to 
default codecs in _create_v2 * allow single codec instances for filters, and None for filters / compressor, and condense some tests * add docstring for None * single-item tuple for compressors in v2 * Update src/zarr/core/array.py * tweaks * pr feedback 1 * tests * mypy * rename array_bytes_codec to serializer * Update src/zarr/api/asynchronous.py Co-authored-by: Joe Hamman * docstrings * *params -> *like * *params -> *like, in tests * adds deprecated compressor arg to Group.create_array * docs --------- Co-authored-by: Joe Hamman Co-authored-by: Norman Rzepka Co-authored-by: Joe Hamman --- src/zarr/__init__.py | 4 + src/zarr/api/asynchronous.py | 109 +++- src/zarr/api/synchronous.py | 227 ++++++- src/zarr/core/_info.py | 5 +- src/zarr/core/array.py | 780 ++++++++++++++++++++--- src/zarr/core/array_spec.py | 8 +- src/zarr/core/buffer/core.py | 10 +- src/zarr/core/chunk_grids.py | 57 +- src/zarr/core/chunk_key_encodings.py | 15 +- src/zarr/core/common.py | 6 + src/zarr/core/config.py | 28 +- src/zarr/core/group.py | 623 ++++++++++-------- src/zarr/core/metadata/v2.py | 41 +- src/zarr/core/metadata/v3.py | 42 +- src/zarr/registry.py | 77 ++- src/zarr/testing/strategies.py | 10 +- tests/test_api.py | 32 +- tests/test_array.py | 383 +++++++++-- tests/test_buffer.py | 39 +- tests/test_codecs/test_blosc.py | 17 +- tests/test_codecs/test_codecs.py | 93 +-- tests/test_codecs/test_endian.py | 18 +- tests/test_codecs/test_gzip.py | 10 +- tests/test_codecs/test_sharding.py | 218 ++++--- tests/test_codecs/test_transpose.py | 47 +- tests/test_codecs/test_vlen.py | 23 +- tests/test_codecs/test_zstd.py | 10 +- tests/test_config.py | 64 +- tests/test_group.py | 90 +-- tests/test_indexing.py | 8 +- tests/test_metadata/test_consolidated.py | 47 +- tests/test_v2.py | 46 +- tests/test_zarr.py | 11 + 33 files changed, 2399 insertions(+), 799 deletions(-) create mode 100644 tests/test_zarr.py diff --git a/src/zarr/__init__.py b/src/zarr/__init__.py index 51116a929e..bcbdaf7c19 100644 --- a/src/zarr/__init__.py +++ b/src/zarr/__init__.py @@ -6,6 +6,8 @@ copy_all, copy_store, create, + create_array, + create_group, empty, empty_like, full, @@ -46,6 +48,8 @@ "copy_all", "copy_store", "create", + "create_array", + "create_group", "empty", "empty_like", "full", diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index c4d1ec8627..75c043fc1a 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -9,8 +9,8 @@ import numpy.typing as npt from typing_extensions import deprecated -from zarr.core.array import Array, AsyncArray, get_array_metadata -from zarr.core.array_spec import ArrayConfig, ArrayConfigParams +from zarr.core.array import Array, AsyncArray, create_array, get_array_metadata +from zarr.core.array_spec import ArrayConfig, ArrayConfigLike from zarr.core.buffer import NDArrayLike from zarr.core.common import ( JSON, @@ -18,14 +18,14 @@ ChunkCoords, MemoryOrder, ZarrFormat, + _default_zarr_format, _warn_order_kwarg, _warn_write_empty_chunks_kwarg, parse_dtype, ) -from zarr.core.config import config from zarr.core.group import AsyncGroup, ConsolidatedMetadata, GroupMetadata from zarr.core.metadata import ArrayMetadataDict, ArrayV2Metadata, ArrayV3Metadata -from zarr.core.metadata.v2 import _default_filters_and_compressor +from zarr.core.metadata.v2 import _default_compressor, _default_filters from zarr.errors import NodeTypeValidationError from zarr.storage import ( StoreLike, @@ -49,6 +49,7 @@ "copy_all", "copy_store", "create", + "create_array", "empty", 
"empty_like", "full", @@ -150,11 +151,6 @@ def _handle_zarr_version_or_format( return zarr_format -def _default_zarr_version() -> ZarrFormat: - """Return the default zarr_version""" - return cast(ZarrFormat, int(config.get("default_zarr_version", 3))) - - async def consolidate_metadata( store: StoreLike, path: str | None = None, @@ -300,8 +296,8 @@ async def open( path : str or None, optional The path within the store to open. storage_options : dict - If using an fsspec URL to create the store, these will be passed to - the backend implementation. Ignored otherwise. + If the store is backed by an fsspec-based implementation, then this dict will be passed to + the Store constructor for that implementation. Ignored otherwise. **kwargs Additional parameters are passed through to :func:`zarr.creation.open_array` or :func:`zarr.hierarchy.open_group`. @@ -417,7 +413,7 @@ async def save_array( """ zarr_format = ( _handle_zarr_version_or_format(zarr_version=zarr_version, zarr_format=zarr_format) - or _default_zarr_version() + or _default_zarr_format() ) if not isinstance(arr, NDArrayLike): raise TypeError("arr argument must be numpy or other NDArrayLike array") @@ -429,7 +425,7 @@ async def save_array( shape = arr.shape chunks = getattr(arr, "chunks", None) # for array-likes with chunks attribute overwrite = kwargs.pop("overwrite", None) or _infer_overwrite(mode) - new = await AsyncArray.create( + new = await AsyncArray._create( store_path, zarr_format=zarr_format, shape=shape, @@ -477,7 +473,7 @@ async def save_group( zarr_version=zarr_version, zarr_format=zarr_format, ) - or _default_zarr_version() + or _default_zarr_format() ) for arg in args: @@ -657,7 +653,7 @@ async def group( try: return await AsyncGroup.open(store=store_path, zarr_format=zarr_format) except (KeyError, FileNotFoundError): - _zarr_format = zarr_format or _default_zarr_version() + _zarr_format = zarr_format or _default_zarr_format() return await AsyncGroup.from_store( store=store_path, zarr_format=_zarr_format, @@ -666,6 +662,56 @@ async def group( ) +async def create_group( + *, + store: StoreLike, + path: str | None = None, + overwrite: bool = False, + zarr_format: ZarrFormat | None = None, + attributes: dict[str, Any] | None = None, + storage_options: dict[str, Any] | None = None, +) -> AsyncGroup: + """Create a group. + + Parameters + ---------- + store : Store or str + Store or path to directory in file system. + path : str, optional + Group path within store. + overwrite : bool, optional + If True, pre-existing data at ``path`` will be deleted before + creating the group. + zarr_format : {2, 3, None}, optional + The zarr format to use when saving. + If no ``zarr_format`` is provided, the default format will be used. + This default can be changed by modifying the value of ``default_zarr_format`` + in :mod:`zarr.core.config`. + storage_options : dict + If using an fsspec URL to create the store, these will be passed to + the backend implementation. Ignored otherwise. + + Returns + ------- + AsyncGroup + The new group. 
+ """ + + if zarr_format is None: + zarr_format = _default_zarr_format() + + mode: Literal["a"] = "a" + + store_path = await make_store_path(store, path=path, mode=mode, storage_options=storage_options) + + return await AsyncGroup.from_store( + store=store_path, + zarr_format=zarr_format, + overwrite=overwrite, + attributes=attributes, + ) + + async def open_group( store: StoreLike | None = None, *, # Note: this is a change from v2 @@ -768,7 +814,7 @@ async def open_group( pass if mode in _CREATE_MODES: overwrite = _infer_overwrite(mode) - _zarr_format = zarr_format or _default_zarr_version() + _zarr_format = zarr_format or _default_zarr_format() return await AsyncGroup.from_store( store_path, zarr_format=_zarr_format, @@ -813,7 +859,7 @@ async def create( codecs: Iterable[Codec | dict[str, JSON]] | None = None, dimension_names: Iterable[str] | None = None, storage_options: dict[str, Any] | None = None, - config: ArrayConfig | ArrayConfigParams | None = None, + config: ArrayConfig | ArrayConfigLike | None = None, **kwargs: Any, ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: """Create an array. @@ -843,8 +889,8 @@ async def create( If no codecs are provided, default codecs will be used: - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. - - For Unicode strings, the default is ``VLenUTF8Codec``. - - For bytes or objects, the default is ``VLenBytesCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``. + - For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``. These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. compressor : Codec, optional @@ -857,7 +903,8 @@ async def create( - For Unicode strings, the default is ``VLenUTF8Codec``. - For bytes or objects, the default is ``VLenBytesCodec``. - These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. fill_value : object + These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. + fill_value : object Default value to use for uninitialized portions of the array. order : {'C', 'F'}, optional Deprecated in favor of the ``config`` keyword argument. @@ -878,8 +925,8 @@ async def create( for storage of both chunks and metadata. filters : sequence of Codecs, optional Sequence of filters to use to encode chunk data prior to compression. - V2 only. If neither ``compressor`` nor ``filters`` are provided, a default - compressor will be used. (see ``compressor`` for details). + V2 only. If no ``filters`` are provided, a default set of filters will be used. + These defaults can be changed by modifying the value of ``array.v2_default_filters`` in :mod:`zarr.core.config`. cache_metadata : bool, optional If True, array configuration metadata will be cached for the lifetime of the object. If False, array metadata will be reloaded @@ -914,7 +961,7 @@ async def create( storage_options : dict If using an fsspec URL to create the store, these will be passed to the backend implementation. Ignored otherwise. - config : ArrayConfig or ArrayConfigParams, optional + config : ArrayConfig or ArrayConfigLike, optional Runtime configuration of the array. If provided, will override the default values from `zarr.config.array`. 
@@ -925,15 +972,17 @@ async def create( """ zarr_format = ( _handle_zarr_version_or_format(zarr_version=zarr_version, zarr_format=zarr_format) - or _default_zarr_version() + or _default_zarr_format() ) if zarr_format == 2: if chunks is None: chunks = shape dtype = parse_dtype(dtype, zarr_format) - if not filters and not compressor: - filters, compressor = _default_filters_and_compressor(dtype) + if not filters: + filters = _default_filters(dtype) + if not compressor: + compressor = _default_compressor(dtype) elif zarr_format == 3 and chunk_shape is None: # type: ignore[redundant-expr] if chunks is not None: chunk_shape = chunks @@ -971,7 +1020,7 @@ async def create( mode = "a" store_path = await make_store_path(store, path=path, mode=mode, storage_options=storage_options) - config_dict: ArrayConfigParams = {} + config_dict: ArrayConfigLike = {} if write_empty_chunks is not None: if config is not None: @@ -994,7 +1043,7 @@ async def create( config_parsed = ArrayConfig.from_dict(config_dict) - return await AsyncArray.create( + return await AsyncArray._create( store_path, shape=shape, chunks=chunks, @@ -1173,7 +1222,7 @@ async def open_array( If using an fsspec URL to create the store, these will be passed to the backend implementation. Ignored otherwise. **kwargs - Any keyword arguments to pass to ``create``. + Any keyword arguments to pass to :func:`create`. Returns ------- @@ -1196,7 +1245,7 @@ async def open_array( except FileNotFoundError: if not store_path.read_only and mode in _CREATE_MODES: overwrite = _infer_overwrite(mode) - _zarr_format = zarr_format or _default_zarr_version() + _zarr_format = zarr_format or _default_zarr_format() return await create( store=store_path, zarr_format=_zarr_format, diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index cd1ef8b38d..52815748ad 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -5,6 +5,7 @@ from typing_extensions import deprecated import zarr.api.asynchronous as async_api +import zarr.core.array from zarr._compat import _deprecate_positional_args from zarr.core.array import Array, AsyncArray from zarr.core.group import Group @@ -17,10 +18,23 @@ from zarr.abc.codec import Codec from zarr.api.asynchronous import ArrayLike, PathLike - from zarr.core.array_spec import ArrayConfig, ArrayConfigParams + from zarr.core.array import ( + CompressorsLike, + FiltersLike, + SerializerLike, + ShardsLike, + ) + from zarr.core.array_spec import ArrayConfig, ArrayConfigLike from zarr.core.buffer import NDArrayLike - from zarr.core.chunk_key_encodings import ChunkKeyEncoding - from zarr.core.common import JSON, AccessModeLiteral, ChunkCoords, MemoryOrder, ZarrFormat + from zarr.core.chunk_key_encodings import ChunkKeyEncoding, ChunkKeyEncodingLike + from zarr.core.common import ( + JSON, + AccessModeLiteral, + ChunkCoords, + MemoryOrder, + ShapeLike, + ZarrFormat, + ) from zarr.storage import StoreLike __all__ = [ @@ -30,6 +44,7 @@ "copy_all", "copy_store", "create", + "create_array", "empty", "empty_like", "full", @@ -523,6 +538,54 @@ def open_group( ) +def create_group( + store: StoreLike, + *, + path: str | None = None, + zarr_format: ZarrFormat | None = None, + overwrite: bool = False, + attributes: dict[str, Any] | None = None, + storage_options: dict[str, Any] | None = None, +) -> Group: + """Create a group. + + Parameters + ---------- + store : Store or str + Store or path to directory in file system. + path : str, optional + Group path within store. 
+    overwrite : bool, optional
+        If True, pre-existing data at ``path`` will be deleted before
+        creating the group.
+    zarr_format : {2, 3, None}, optional
+        The zarr format to use when saving.
+        If no ``zarr_format`` is provided, the default format will be used.
+        This default can be changed by modifying the value of ``default_zarr_format``
+        in :mod:`zarr.core.config`.
+    storage_options : dict
+        If using an fsspec URL to create the store, these will be passed to
+        the backend implementation. Ignored otherwise.
+
+    Returns
+    -------
+    Group
+        The new group.
+    """
+    return Group(
+        sync(
+            async_api.create_group(
+                store=store,
+                path=path,
+                overwrite=overwrite,
+                storage_options=storage_options,
+                zarr_format=zarr_format,
+                attributes=attributes,
+            )
+        )
+    )
+
+
 # TODO: add type annotations for kwargs
 def create(
     shape: ChunkCoords | int,
@@ -559,7 +622,7 @@ def create(
     codecs: Iterable[Codec | dict[str, JSON]] | None = None,
     dimension_names: Iterable[str] | None = None,
     storage_options: dict[str, Any] | None = None,
-    config: ArrayConfig | ArrayConfigParams | None = None,
+    config: ArrayConfig | ArrayConfigLike | None = None,
     **kwargs: Any,
 ) -> Array:
     """Create an array.
@@ -629,7 +692,7 @@ def create(
     storage_options : dict
         If using an fsspec URL to create the store, these will be passed to
         the backend implementation. Ignored otherwise.
-    config : ArrayConfig or ArrayConfigParams, optional
+    config : ArrayConfig or ArrayConfigLike, optional
         Runtime configuration of the array. If provided, will override the
         default values from `zarr.config.array`.
 
@@ -675,6 +738,160 @@ def create(
     )
 
 
+def create_array(
+    store: str | StoreLike,
+    *,
+    name: str | None = None,
+    shape: ShapeLike,
+    dtype: npt.DTypeLike,
+    chunks: ChunkCoords | Literal["auto"] = "auto",
+    shards: ShardsLike | None = None,
+    filters: FiltersLike = "auto",
+    compressors: CompressorsLike = "auto",
+    serializer: SerializerLike = "auto",
+    fill_value: Any | None = None,
+    order: MemoryOrder | None = None,
+    zarr_format: ZarrFormat | None = 3,
+    attributes: dict[str, JSON] | None = None,
+    chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingLike | None = None,
+    dimension_names: Iterable[str] | None = None,
+    storage_options: dict[str, Any] | None = None,
+    overwrite: bool = False,
+    config: ArrayConfig | ArrayConfigLike | None = None,
+) -> Array:
+    """Create an array.
+
+    This function wraps :func:`zarr.core.array.create_array`.
+
+    Parameters
+    ----------
+    store : str or Store
+        Store or path to directory in file system or name of zip file.
+    name : str or None, optional
+        The name of the array within the store. If ``name`` is ``None``, the array will be located
+        at the root of the store.
+    shape : ChunkCoords
+        Shape of the array.
+    dtype : npt.DTypeLike
+        Data type of the array.
+    chunks : ChunkCoords, optional
+        Chunk shape of the array.
+        If not specified, defaults are guessed based on the shape and dtype.
+    shards : ChunkCoords, optional
+        Shard shape of the array. The default value of ``None`` results in no sharding at all.
+    filters : Iterable[Codec], optional
+        Iterable of filters to apply to each chunk of the array, in order, before serializing that
+        chunk to bytes.
+
+        For Zarr v3, a "filter" is a codec that takes an array and returns an array,
+        and these values must be instances of ``ArrayArrayCodec``, or dict representations
+        of ``ArrayArrayCodec``.
+        If ``filters`` and ``compressors`` are not specified, then the default codecs for
+        Zarr v3 will be used. 
+        These defaults can be changed by modifying the value of ``array.v3_default_codecs``
+        in :mod:`zarr.core.config`.
+        Use ``None`` to omit default filters.
+
+        For Zarr v2, a "filter" can be any numcodecs codec; you should ensure that
+        the order of your filters is consistent with the behavior of each filter.
+        If no ``filters`` are provided, a default set of filters will be used.
+        These defaults can be changed by modifying the value of ``array.v2_default_filters``
+        in :mod:`zarr.core.config`.
+        Use ``None`` to omit default filters.
+    compressors : Iterable[Codec], optional
+        List of compressors to apply to the array. Compressors are applied in order, and after any
+        filters are applied (if any are specified).
+
+        For Zarr v3, a "compressor" is a codec that takes a bytestream and
+        returns another bytestream. Multiple compressors may be provided for Zarr v3.
+        If ``filters`` and ``compressors`` are not specified, then the default codecs for
+        Zarr v3 will be used.
+        These defaults can be changed by modifying the value of ``array.v3_default_codecs``
+        in :mod:`zarr.core.config`.
+        Use ``None`` to omit default compressors.
+
+        For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may
+        be provided for Zarr v2.
+        If no ``compressors`` are provided, a default compressor will be used.
+        These defaults can be changed by modifying the value of ``array.v2_default_compressor``
+        in :mod:`zarr.core.config`.
+        Use ``None`` to omit the default compressor.
+    serializer : dict[str, JSON] | ArrayBytesCodec, optional
+        Array-to-bytes codec to use for encoding the array data.
+        Zarr v3 only. Zarr v2 arrays use implicit array-to-bytes conversion.
+        If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used.
+    fill_value : Any, optional
+        Fill value for the array.
+    order : {"C", "F"}, optional
+        The memory order of the array (default is "C").
+        For Zarr v2, this parameter sets the memory order of the array.
+        For Zarr v3, this parameter is deprecated, because memory order
+        is a runtime parameter for Zarr v3 arrays. The recommended way to specify the memory
+        order for Zarr v3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``.
+        If no ``order`` is provided, a default order will be used.
+        This default can be changed by modifying the value of ``array.order`` in :mod:`zarr.core.config`.
+    zarr_format : {2, 3}, optional
+        The zarr format to use when saving.
+    attributes : dict, optional
+        Attributes for the array.
+    chunk_key_encoding : ChunkKeyEncoding, optional
+        A specification of how the chunk keys are represented in storage.
+        For Zarr v3, the default is ``{"name": "default", "separator": "/"}``.
+        For Zarr v2, the default is ``{"name": "v2", "separator": "."}``.
+    dimension_names : Iterable[str], optional
+        The names of the dimensions (default is None).
+        Zarr v3 only. Zarr v2 arrays should not use this parameter.
+    storage_options : dict, optional
+        If using an fsspec URL to create the store, these will be passed to the backend implementation.
+        Ignored otherwise.
+    overwrite : bool, default False
+        Whether to overwrite an array with the same name in the store, if one exists.
+    config : ArrayConfig or ArrayConfigLike, optional
+        Runtime configuration for the array.
+
+    Returns
+    -------
+    Array
+        The array. 
+
+    Examples
+    --------
+    >>> import zarr
+    >>> store = zarr.storage.MemoryStore(mode='w')
+    >>> arr = zarr.create_array(
+    >>>     store=store,
+    >>>     shape=(100,100),
+    >>>     chunks=(10,10),
+    >>>     dtype='i4',
+    >>>     fill_value=0)
+
+    """
+    return Array(
+        sync(
+            zarr.core.array.create_array(
+                store,
+                name=name,
+                shape=shape,
+                dtype=dtype,
+                chunks=chunks,
+                shards=shards,
+                filters=filters,
+                compressors=compressors,
+                serializer=serializer,
+                fill_value=fill_value,
+                order=order,
+                zarr_format=zarr_format,
+                attributes=attributes,
+                chunk_key_encoding=chunk_key_encoding,
+                dimension_names=dimension_names,
+                storage_options=storage_options,
+                overwrite=overwrite,
+                config=config,
+            )
+        )
+    )
+
+
 # TODO: add type annotations for kwargs
 def empty(shape: ChunkCoords, **kwargs: Any) -> Array:
     """Create an empty array.
diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py
index 4708967390..12bcc02e96 100644
--- a/src/zarr/core/_info.py
+++ b/src/zarr/core/_info.py
@@ -6,6 +6,7 @@
 import numpy as np
 
 from zarr.abc.codec import Codec
+from zarr.core.common import ZarrFormat
 from zarr.core.metadata.v3 import DataType
 
 
@@ -20,7 +21,7 @@ class GroupInfo:
 
     _name: str
     _type: Literal["Group"] = "Group"
-    _zarr_format: Literal[2, 3]
+    _zarr_format: ZarrFormat
     _read_only: bool
     _store_type: str
     _count_members: int | None = None
@@ -76,7 +77,7 @@ class ArrayInfo:
     """
 
     _type: Literal["Array"] = "Array"
-    _zarr_format: Literal[2, 3]
+    _zarr_format: ZarrFormat
     _data_type: np.dtype[Any] | DataType
     _shape: tuple[int, ...]
     _chunk_shape: tuple[int, ...] | None = None
diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py
index 717eff36dc..0a5b5f085a 100644
--- a/src/zarr/core/array.py
+++ b/src/zarr/core/array.py
@@ -1,21 +1,35 @@
 from __future__ import annotations
 
 import json
+import warnings
 from asyncio import gather
+from collections.abc import Iterable
 from dataclasses import dataclass, field
 from itertools import starmap
 from logging import getLogger
-from typing import TYPE_CHECKING, Any, Generic, Literal, cast, overload
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Generic,
+    Literal,
+    TypeAlias,
+    TypedDict,
+    cast,
+    overload,
+)
 from warnings import warn
 
+import numcodecs
 import numpy as np
 import numpy.typing as npt
+from typing_extensions import deprecated
 
 from zarr._compat import _deprecate_positional_args
+from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec
 from zarr.abc.store import Store, set_or_delete
 from zarr.codecs._v2 import V2Codec
 from zarr.core._info import ArrayInfo
-from zarr.core.array_spec import ArrayConfig, ArrayConfigParams, normalize_array_config
+from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, parse_array_config
 from zarr.core.attributes import Attributes
 from zarr.core.buffer import (
     BufferPrototype,
@@ -23,9 +37,10 @@
     NDBuffer,
     default_buffer_prototype,
 )
-from zarr.core.chunk_grids import RegularChunkGrid, normalize_chunks
+from zarr.core.chunk_grids import RegularChunkGrid, _auto_partition, normalize_chunks
 from zarr.core.chunk_key_encodings import (
     ChunkKeyEncoding,
+    ChunkKeyEncodingLike,
     DefaultChunkKeyEncoding,
     V2ChunkKeyEncoding,
 )
@@ -38,6 +53,7 @@
     MemoryOrder,
     ShapeLike,
     ZarrFormat,
+    _default_zarr_format,
     _warn_order_kwarg,
     concurrent_map,
     parse_dtype,
@@ -80,21 +96,34 @@
     ArrayV3MetadataDict,
     T_ArrayMetadata,
 )
-from zarr.core.metadata.v2 import _default_filters_and_compressor
+from zarr.core.metadata.v2 import (
+    _default_compressor,
+    _default_filters,
+    parse_compressor,
+    parse_filters,
+)
 from 
zarr.core.metadata.v3 import DataType, parse_node_type_array from zarr.core.sync import sync from zarr.errors import MetadataValidationError -from zarr.registry import get_pipeline_class +from zarr.registry import ( + _parse_array_array_codec, + _parse_array_bytes_codec, + _parse_bytes_bytes_codec, + _resolve_codec, + get_pipeline_class, +) from zarr.storage import StoreLike, make_store_path from zarr.storage.common import StorePath, ensure_no_existing_node if TYPE_CHECKING: - from collections.abc import Iterable, Iterator, Sequence + from collections.abc import Iterator, Sequence from typing import Self - from zarr.abc.codec import Codec, CodecPipeline + from zarr.abc.codec import CodecPipeline + from zarr.codecs.sharding import ShardingCodecIndexLocation from zarr.core.group import AsyncGroup + # Array and AsyncArray are defined in the base ``zarr`` namespace __all__ = ["create_codec_pipeline", "parse_array_metadata"] @@ -149,9 +178,9 @@ async def get_array_metadata( (store_path / ZATTRS_JSON).get(), ) if zarr_json_bytes is not None and zarray_bytes is not None: - # TODO: revisit this exception type - # alternatively, we could warn and favor v3 - raise ValueError("Both zarr.json and .zarray objects exist") + # warn and favor v3 + msg = f"Both zarr.json (Zarr v3) and .zarray (Zarr v2) metadata objects exist at {store_path}. Zarr v3 will be used." + warnings.warn(msg, stacklevel=1) if zarr_json_bytes is None and zarray_bytes is None: raise FileNotFoundError(store_path) # set zarr_format based on which keys were found @@ -273,7 +302,7 @@ async def create( # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, - config: ArrayConfig | ArrayConfigParams | None = None, + config: ArrayConfig | ArrayConfigLike | None = None, ) -> AsyncArray[ArrayV2Metadata]: ... # this overload defines the function signature when zarr_format is 3 @@ -302,7 +331,7 @@ async def create( # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, - config: ArrayConfig | ArrayConfigParams | None = None, + config: ArrayConfig | ArrayConfigLike | None = None, ) -> AsyncArray[ArrayV3Metadata]: ... @overload @@ -330,8 +359,9 @@ async def create( # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, - config: ArrayConfig | ArrayConfigParams | None = None, + config: ArrayConfig | ArrayConfigLike | None = None, ) -> AsyncArray[ArrayV3Metadata]: ... + @overload @classmethod async def create( @@ -363,10 +393,12 @@ async def create( # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, - config: ArrayConfig | ArrayConfigParams | None = None, + config: ArrayConfig | ArrayConfigLike | None = None, ) -> AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata]: ... @classmethod + @deprecated("Use zarr.api.asynchronous.create_array instead.") + @_deprecate_positional_args async def create( cls, store: StoreLike, @@ -396,10 +428,9 @@ async def create( # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, - config: ArrayConfig | ArrayConfigParams | None = None, + config: ArrayConfig | ArrayConfigLike | None = None, ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: - """ - Method to create a new asynchronous array instance. + """Method to create a new asynchronous array instance. Parameters ---------- @@ -431,8 +462,8 @@ async def create( If no codecs are provided, default codecs will be used: - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. - - For Unicode strings, the default is ``VLenUTF8Codec``. 
- - For bytes or objects, the default is ``VLenBytesCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``. + - For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``. These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. dimension_names : Iterable[str], optional @@ -453,14 +484,14 @@ async def create( order for Zarr 3 arrays is via the ``config`` parameter, e.g. ``{'config': 'C'}``. filters : list[dict[str, JSON]], optional Sequence of filters to use to encode chunk data prior to compression. - V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` - nor ``filters`` are provided, a default compressor will be used. (see - ``compressor`` for details) + V2 only. V3 arrays should use ``codecs`` instead. If no ``filters`` + are provided, a default set of filters will be used. + These defaults can be changed by modifying the value of ``array.v2_default_filters`` in :mod:`zarr.core.config`. compressor : dict[str, JSON], optional The compressor used to compress the data (default is None). V2 only. V3 arrays should use ``codecs`` instead. - If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: + If no ``compressor`` is provided, a default compressor will be used: - For numeric arrays, the default is ``ZstdCodec``. - For Unicode strings, the default is ``VLenUTF8Codec``. @@ -471,24 +502,77 @@ async def create( Whether to raise an error if the store already exists (default is False). data : npt.ArrayLike, optional The data to be inserted into the array (default is None). + config : ArrayConfig or ArrayConfigLike, optional + Runtime configuration for the array. Returns ------- AsyncArray The created asynchronous array instance. - Examples - -------- - >>> import zarr - >>> store = zarr.storage.MemoryStore(mode='w') - >>> async_arr = await zarr.core.array.AsyncArray.create( - >>> store=store, - >>> shape=(100,100), - >>> chunks=(10,10), - >>> dtype='i4', - >>> fill_value=0) - + .. deprecated:: 3.0.0 + Deprecated in favor of :func:`zarr.api.asynchronous.create_array`. 
+ """ + return await cls._create( + store, + # v2 and v3 + shape=shape, + dtype=dtype, + zarr_format=zarr_format, + fill_value=fill_value, + attributes=attributes, + # v3 only + chunk_shape=chunk_shape, + chunk_key_encoding=chunk_key_encoding, + codecs=codecs, + dimension_names=dimension_names, + # v2 only + chunks=chunks, + dimension_separator=dimension_separator, + order=order, + filters=filters, + compressor=compressor, + # runtime + overwrite=overwrite, + data=data, + config=config, + ) + @classmethod + async def _create( + cls, + store: StoreLike, + *, + # v2 and v3 + shape: ShapeLike, + dtype: npt.DTypeLike, + zarr_format: ZarrFormat = 3, + fill_value: Any | None = None, + attributes: dict[str, JSON] | None = None, + # v3 only + chunk_shape: ShapeLike | None = None, + chunk_key_encoding: ( + ChunkKeyEncoding + | tuple[Literal["default"], Literal[".", "/"]] + | tuple[Literal["v2"], Literal[".", "/"]] + | None + ) = None, + codecs: Iterable[Codec | dict[str, JSON]] | None = None, + dimension_names: Iterable[str] | None = None, + # v2 only + chunks: ShapeLike | None = None, + dimension_separator: Literal[".", "/"] | None = None, + order: MemoryOrder | None = None, + filters: list[dict[str, JSON]] | None = None, + compressor: dict[str, JSON] | None = None, + # runtime + overwrite: bool = False, + data: npt.ArrayLike | None = None, + config: ArrayConfig | ArrayConfigLike | None = None, + ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: + """Method to create a new asynchronous array instance. + See :func:`AsyncArray.create` for more details. + Deprecated in favor of :func:`zarr.api.asynchronous.create_array`. """ store_path = await make_store_path(store) @@ -502,7 +586,7 @@ async def create( _chunks = normalize_chunks(chunks, shape, dtype_parsed.itemsize) else: _chunks = normalize_chunks(chunk_shape, shape, dtype_parsed.itemsize) - config_parsed = normalize_array_config(config) + config_parsed = parse_array_config(config) result: AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata] if zarr_format == 3: @@ -653,8 +737,8 @@ async def _create_v2( config: ArrayConfig, dimension_separator: Literal[".", "/"] | None = None, fill_value: float | None = None, - filters: list[dict[str, JSON]] | None = None, - compressor: dict[str, JSON] | None = None, + filters: Iterable[dict[str, JSON] | numcodecs.abc.Codec] | None = None, + compressor: dict[str, JSON] | numcodecs.abc.Codec | None = None, attributes: dict[str, JSON] | None = None, overwrite: bool = False, ) -> AsyncArray[ArrayV2Metadata]: @@ -670,12 +754,14 @@ async def _create_v2( dimension_separator = "." dtype = parse_dtype(dtype, zarr_format=2) - if not filters and not compressor: - filters, compressor = _default_filters_and_compressor(dtype) + + # inject VLenUTF8 for str dtype if not already present if np.issubdtype(dtype, np.str_): filters = filters or [] - if not any(x["id"] == "vlen-utf8" for x in filters): - filters = list(filters) + [{"id": "vlen-utf8"}] + from numcodecs.vlen import VLenUTF8 + + if not any(isinstance(x, VLenUTF8) or x["id"] == "vlen-utf8" for x in filters): + filters = list(filters) + [VLenUTF8()] metadata = ArrayV2Metadata( shape=shape, @@ -787,6 +873,7 @@ def shape(self) -> ChunkCoords: @property def chunks(self) -> ChunkCoords: """Returns the chunk shape of the Array. + If sharding is used the inner chunk shape is returned. Only defined for arrays using using `RegularChunkGrid`. If array doesn't use `RegularChunkGrid`, `NotImplementedError` is raised. 
@@ -796,14 +883,22 @@ def chunks(self) -> ChunkCoords:
         ChunkCoords:
             The chunk shape of the Array.
         """
-        if isinstance(self.metadata.chunk_grid, RegularChunkGrid):
-            return self.metadata.chunk_grid.chunk_shape
+        return self.metadata.chunks
 
-        msg = (
-            f"The `chunks` attribute is only defined for arrays using `RegularChunkGrid`."
-            f"This array has a {self.metadata.chunk_grid} instead."
-        )
-        raise NotImplementedError(msg)
+    @property
+    def shards(self) -> ChunkCoords | None:
+        """Returns the shard shape of the Array.
+        Returns None if sharding is not used.
+
+        Only defined for arrays using `RegularChunkGrid`.
+        If array doesn't use `RegularChunkGrid`, `NotImplementedError` is raised.
+
+        Returns
+        -------
+        ChunkCoords:
+            The shard shape of the Array.
+        """
+        return self.metadata.shards
 
     @property
     def size(self) -> int:
@@ -1115,7 +1210,7 @@ async def getitem(
         --------
         >>> import zarr
         >>> store = zarr.storage.MemoryStore(mode='w')
-        >>> async_arr = await zarr.core.array.AsyncArray.create(
+        >>> async_arr = await zarr.api.asynchronous.create_array(
         ...     store=store,
         ...     shape=(100,100),
         ...     chunks=(10,10),
@@ -1508,6 +1603,7 @@ class Array:
     _async_array: AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata]
 
     @classmethod
+    @deprecated("Use zarr.create_array instead.")
     @_deprecate_positional_args
     def create(
         cls,
@@ -1537,7 +1633,7 @@ def create(
         compressor: dict[str, JSON] | None = None,
         # runtime
         overwrite: bool = False,
-        config: ArrayConfig | ArrayConfigParams | None = None,
+        config: ArrayConfig | ArrayConfigLike | None = None,
     ) -> Array:
         """Creates a new Array instance from an initialized store.
 
@@ -1565,8 +1661,8 @@ def create(
         If no codecs are provided, default codecs will be used:
 
         - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``.
-        - For Unicode strings, the default is ``VLenUTF8Codec``.
-        - For bytes or objects, the default is ``VLenBytesCodec``.
+        - For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``.
+        - For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``.
 
         These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
     dimension_names : Iterable[str], optional
@@ -1587,14 +1683,14 @@ def create(
         order for Zarr 3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``.
     filters : list[dict[str, JSON]], optional
         Sequence of filters to use to encode chunk data prior to compression.
-        V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor``
-        nor ``filters`` are provided, a default compressor will be used. (see
-        ``compressor`` for details)
+        V2 only. V3 arrays should use ``codecs`` instead. If no ``filters``
+        are provided, a default set of filters will be used.
+        These defaults can be changed by modifying the value of ``array.v2_default_filters`` in :mod:`zarr.core.config`.
     compressor : dict[str, JSON], optional
         Primary compressor to compress chunk data.
         V2 only. V3 arrays should use ``codecs`` instead.
-        If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used:
+        If no ``compressor`` is provided, a default compressor will be used:
 
         - For numeric arrays, the default is ``ZstdCodec``.
         - For Unicode strings, the default is ``VLenUTF8Codec``.
@@ -1608,9 +1704,71 @@ def create(
         -------
         Array
             Array created from the store.
+
+        .. deprecated:: 3.0.0
+            Deprecated in favor of :func:`zarr.create_array`. 
+ """ + return cls._create( + store, + # v2 and v3 + shape=shape, + dtype=dtype, + zarr_format=zarr_format, + attributes=attributes, + fill_value=fill_value, + # v3 only + chunk_shape=chunk_shape, + chunk_key_encoding=chunk_key_encoding, + codecs=codecs, + dimension_names=dimension_names, + # v2 only + chunks=chunks, + dimension_separator=dimension_separator, + order=order, + filters=filters, + compressor=compressor, + # runtime + overwrite=overwrite, + config=config, + ) + + @classmethod + def _create( + cls, + store: StoreLike, + *, + # v2 and v3 + shape: ChunkCoords, + dtype: npt.DTypeLike, + zarr_format: ZarrFormat = 3, + fill_value: Any | None = None, + attributes: dict[str, JSON] | None = None, + # v3 only + chunk_shape: ChunkCoords | None = None, + chunk_key_encoding: ( + ChunkKeyEncoding + | tuple[Literal["default"], Literal[".", "/"]] + | tuple[Literal["v2"], Literal[".", "/"]] + | None + ) = None, + codecs: Iterable[Codec | dict[str, JSON]] | None = None, + dimension_names: Iterable[str] | None = None, + # v2 only + chunks: ChunkCoords | None = None, + dimension_separator: Literal[".", "/"] | None = None, + order: MemoryOrder | None = None, + filters: list[dict[str, JSON]] | None = None, + compressor: dict[str, JSON] | None = None, + # runtime + overwrite: bool = False, + config: ArrayConfig | ArrayConfigLike | None = None, + ) -> Array: + """Creates a new Array instance from an initialized store. + See :func:`Array.create` for more details. + Deprecated in favor of :func:`zarr.create_array`. """ async_array = sync( - AsyncArray.create( + AsyncArray._create( store=store, shape=shape, dtype=dtype, @@ -1717,6 +1875,10 @@ def shape(self, value: ChunkCoords) -> None: @property def chunks(self) -> ChunkCoords: """Returns a tuple of integers describing the length of each dimension of a chunk of the array. + If sharding is used the inner chunk shape is returned. + + Only defined for arrays using using `RegularChunkGrid`. + If array doesn't use `RegularChunkGrid`, `NotImplementedError` is raised. Returns ------- @@ -1725,6 +1887,21 @@ def chunks(self) -> ChunkCoords: """ return self._async_array.chunks + @property + def shards(self) -> ChunkCoords | None: + """Returns a tuple of integers describing the length of each dimension of a shard of the array. + Returns None if sharding is not used. + + Only defined for arrays using using `RegularChunkGrid`. + If array doesn't use `RegularChunkGrid`, `NotImplementedError` is raised. + + Returns + ------- + tuple | None + A tuple of integers representing the length of each dimension of a shard or None if sharding is not used. + """ + return self._async_array.shards + @property def size(self) -> int: """Returns the total number of elements in the array. 
@@ -1973,10 +2150,10 @@ def __getitem__(self, selection: Selection) -> NDArrayLike: >>> import zarr >>> import numpy as np >>> data = np.arange(100, dtype="uint16") - >>> z = Array.create( + >>> z = zarr.create_array( >>> StorePath(MemoryStore(mode="w")), >>> shape=data.shape, - >>> chunk_shape=(10,), + >>> chunks=(10,), >>> dtype=data.dtype, >>> ) >>> z[:] = data @@ -2007,10 +2184,10 @@ def __getitem__(self, selection: Selection) -> NDArrayLike: Setup a 2-dimensional array:: >>> data = np.arange(100, dtype="uint16").reshape(10, 10) - >>> z = Array.create( + >>> z = zarr.create_array( >>> StorePath(MemoryStore(mode="w")), >>> shape=data.shape, - >>> chunk_shape=(10, 10), + >>> chunks=(10, 10), >>> dtype=data.dtype, >>> ) >>> z[:] = data @@ -2238,10 +2415,10 @@ def get_basic_selection( >>> import zarr >>> import numpy as np >>> data = np.arange(100, dtype="uint16") - >>> z = Array.create( + >>> z = zarr.create_array( >>> StorePath(MemoryStore(mode="w")), >>> shape=data.shape, - >>> chunk_shape=(3,), + >>> chunks=(3,), >>> dtype=data.dtype, >>> ) >>> z[:] = data @@ -2267,10 +2444,10 @@ def get_basic_selection( Setup a 3-dimensional array:: >>> data = np.arange(1000).reshape(10, 10, 10) - >>> z = Array.create( + >>> z = zarr.create_array( >>> StorePath(MemoryStore(mode="w")), >>> shape=data.shape, - >>> chunk_shape=(5, 5, 5), + >>> chunks=(5, 5, 5), >>> dtype=data.dtype, >>> ) >>> z[:] = data @@ -2462,10 +2639,10 @@ def get_orthogonal_selection( >>> import zarr >>> import numpy as np >>> data = np.arange(100).reshape(10, 10) - >>> z = Array.create( + >>> z = zarr.create_array( >>> StorePath(MemoryStore(mode="w")), >>> shape=data.shape, - >>> chunk_shape=data.shape, + >>> chunks=data.shape, >>> dtype=data.dtype, >>> ) >>> z[:] = data @@ -2696,10 +2873,10 @@ def get_mask_selection( >>> import zarr >>> import numpy as np >>> data = np.arange(100).reshape(10, 10) - >>> z = Array.create( + >>> z = zarr.create_array( >>> StorePath(MemoryStore(mode="w")), >>> shape=data.shape, - >>> chunk_shape=data.shape, + >>> chunks=data.shape, >>> dtype=data.dtype, >>> ) >>> z[:] = data @@ -2856,10 +3033,10 @@ def get_coordinate_selection( >>> import zarr >>> import numpy as np >>> data = np.arange(0, 100, dtype="uint16").reshape((10, 10)) - >>> z = Array.create( + >>> z = zarr.create_array( >>> StorePath(MemoryStore(mode="w")), >>> shape=data.shape, - >>> chunk_shape=(3, 3), + >>> chunks=(3, 3), >>> dtype=data.dtype, >>> ) >>> z[:] = data @@ -3044,10 +3221,10 @@ def get_block_selection( >>> import zarr >>> import numpy as np >>> data = np.arange(0, 100, dtype="uint16").reshape((10, 10)) - >>> z = Array.create( + >>> z = zarr.create_array( >>> StorePath(MemoryStore(mode="w")), >>> shape=data.shape, - >>> chunk_shape=(3, 3), + >>> chunks=(3, 3), >>> dtype=data.dtype, >>> ) >>> z[:] = data @@ -3448,4 +3625,459 @@ def _get_default_codecs( else: dtype_key = "numeric" - return [{"name": codec_id, "configuration": {}} for codec_id in default_codecs[dtype_key]] + return cast(list[dict[str, JSON]], default_codecs[dtype_key]) + + +FiltersLike: TypeAlias = ( + Iterable[dict[str, JSON] | ArrayArrayCodec | numcodecs.abc.Codec] + | ArrayArrayCodec + | Iterable[numcodecs.abc.Codec] + | numcodecs.abc.Codec + | Literal["auto"] + | None +) +CompressorLike: TypeAlias = dict[str, JSON] | BytesBytesCodec | numcodecs.abc.Codec | None +CompressorsLike: TypeAlias = ( + Iterable[dict[str, JSON] | BytesBytesCodec | numcodecs.abc.Codec] + | dict[str, JSON] + | BytesBytesCodec + | numcodecs.abc.Codec + | Literal["auto"] + | None +) 
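The aliases above are deliberately permissive, so several spellings of the same intent all type-check as ``CompressorsLike``. A sketch of inputs that should be accepted under the definition above, assuming the ``ZstdCodec`` exported from ``zarr.codecs``::

    from zarr.codecs import ZstdCodec

    compressors_auto = "auto"                 # fall back to the configured defaults
    compressors_none = None                   # disable compression entirely
    compressors_single = ZstdCodec(level=3)   # a single BytesBytesCodec instance
    compressors_many = (ZstdCodec(level=3),)  # an iterable of codecs (Zarr v3 only)
    compressors_dict = {"name": "zstd", "configuration": {"level": 3}}  # dict form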
+SerializerLike: TypeAlias = dict[str, JSON] | ArrayBytesCodec | Literal["auto"]
+
+
+class ShardsConfigParam(TypedDict):
+    shape: ChunkCoords
+    index_location: ShardingCodecIndexLocation | None
+
+
+ShardsLike: TypeAlias = ChunkCoords | ShardsConfigParam | Literal["auto"]
+
+
+async def create_array(
+    store: str | StoreLike,
+    *,
+    name: str | None = None,
+    shape: ShapeLike,
+    dtype: npt.DTypeLike,
+    chunks: ChunkCoords | Literal["auto"] = "auto",
+    shards: ShardsLike | None = None,
+    filters: FiltersLike = "auto",
+    compressors: CompressorsLike = "auto",
+    serializer: SerializerLike = "auto",
+    fill_value: Any | None = None,
+    order: MemoryOrder | None = None,
+    zarr_format: ZarrFormat | None = 3,
+    attributes: dict[str, JSON] | None = None,
+    chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingLike | None = None,
+    dimension_names: Iterable[str] | None = None,
+    storage_options: dict[str, Any] | None = None,
+    overwrite: bool = False,
+    config: ArrayConfig | ArrayConfigLike | None = None,
+) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]:
+    """Create an array.
+
+    Parameters
+    ----------
+    store : str or Store
+        Store or path to directory in file system or name of zip file.
+    name : str or None, optional
+        The name of the array within the store. If ``name`` is ``None``, the array will be located
+        at the root of the store.
+    shape : ChunkCoords
+        Shape of the array.
+    dtype : npt.DTypeLike
+        Data type of the array.
+    chunks : ChunkCoords, optional
+        Chunk shape of the array.
+        If not specified, defaults are guessed based on the shape and dtype.
+    shards : ChunkCoords, optional
+        Shard shape of the array. The default value of ``None`` results in no sharding at all.
+    filters : Iterable[Codec], optional
+        Iterable of filters to apply to each chunk of the array, in order, before serializing that
+        chunk to bytes.
+
+        For Zarr v3, a "filter" is a codec that takes an array and returns an array,
+        and these values must be instances of ``ArrayArrayCodec``, or dict representations
+        of ``ArrayArrayCodec``.
+        If ``filters`` and ``compressors`` are not specified, then the default codecs for
+        Zarr v3 will be used.
+        These defaults can be changed by modifying the value of ``array.v3_default_codecs``
+        in :mod:`zarr.core.config`.
+        Use ``None`` to omit default filters.
+
+        For Zarr v2, a "filter" can be any numcodecs codec; you should ensure that the
+        order of your filters is consistent with the behavior of each filter.
+        If no ``filters`` are provided, a default set of filters will be used.
+        These defaults can be changed by modifying the value of ``array.v2_default_filters``
+        in :mod:`zarr.core.config`.
+        Use ``None`` to omit default filters.
+    compressors : Iterable[Codec], optional
+        List of compressors to apply to the array. Compressors are applied in order, and after any
+        filters are applied (if any are specified).
+
+        For Zarr v3, a "compressor" is a codec that takes a bytestream and
+        returns another bytestream. Multiple compressors may be provided for Zarr v3.
+        If ``filters`` and ``compressors`` are not specified, then the default codecs for
+        Zarr v3 will be used.
+        These defaults can be changed by modifying the value of ``array.v3_default_codecs``
+        in :mod:`zarr.core.config`.
+        Use ``None`` to omit default compressors.
+
+        For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may
+        be provided for Zarr v2.
+        If no ``compressors`` are provided, a default compressor will be used.
+        These defaults can be changed by modifying the value of ``array.v2_default_compressor``
+        in :mod:`zarr.core.config`.
+        Use ``None`` to omit the default compressor.
+    serializer : dict[str, JSON] | ArrayBytesCodec, optional
+        Array-to-bytes codec to use for encoding the array data.
+        Zarr v3 only. Zarr v2 arrays use implicit array-to-bytes conversion.
+        If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used.
+    fill_value : Any, optional
+        Fill value for the array.
+    order : {"C", "F"}, optional
+        The memory order of the array (default is "C").
+        For Zarr v2, this parameter sets the memory order of the array.
+        For Zarr v3, this parameter is deprecated, because memory order
+        is a runtime parameter for Zarr v3 arrays. The recommended way to specify the memory
+        order for Zarr v3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``.
+        If no ``order`` is provided, a default order will be used.
+        This default can be changed by modifying the value of ``array.order`` in :mod:`zarr.core.config`.
+    zarr_format : {2, 3}, optional
+        The zarr format to use when saving.
+    attributes : dict, optional
+        Attributes for the array.
+    chunk_key_encoding : ChunkKeyEncoding, optional
+        A specification of how the chunk keys are represented in storage.
+        For Zarr v3, the default is ``{"name": "default", "separator": "/"}``.
+        For Zarr v2, the default is ``{"name": "v2", "separator": "."}``.
+    dimension_names : Iterable[str], optional
+        The names of the dimensions (default is None).
+        Zarr v3 only. Zarr v2 arrays should not use this parameter.
+    storage_options : dict, optional
+        If using an fsspec URL to create the store, these will be passed to the backend implementation.
+        Ignored otherwise.
+    overwrite : bool, default False
+        Whether to overwrite an array with the same name in the store, if one exists.
+    config : ArrayConfig or ArrayConfigLike, optional
+        Runtime configuration for the array.
+
+    Returns
+    -------
+    AsyncArray
+        The array.
+
+    Examples
+    --------
+    >>> import zarr
+    >>> store = zarr.storage.MemoryStore(mode='w')
+    >>> async_arr = await zarr.api.asynchronous.create_array(
+    >>>     store=store,
+    >>>     shape=(100,100),
+    >>>     chunks=(10,10),
+    >>>     dtype='i4',
+    >>>     fill_value=0)
+
+    """
+
+    if zarr_format is None:
+        zarr_format = _default_zarr_format()
+
+    from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation
+
+    mode: Literal["a"] = "a"
+    dtype_parsed = parse_dtype(dtype, zarr_format=zarr_format)
+    config_parsed = parse_array_config(config)
+    shape_parsed = parse_shapelike(shape)
+    chunk_key_encoding_parsed = _parse_chunk_key_encoding(
+        chunk_key_encoding, zarr_format=zarr_format
+    )
+    store_path = await make_store_path(store, path=name, mode=mode, storage_options=storage_options)
+    shard_shape_parsed, chunk_shape_parsed = _auto_partition(
+        array_shape=shape_parsed, shard_shape=shards, chunk_shape=chunks, dtype=dtype_parsed
+    )
+    chunks_out: tuple[int, ...]
+    result: AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata]
+
+    if zarr_format == 2:
+        if shard_shape_parsed is not None:
+            msg = (
+                "Zarr v2 arrays can only be created with `shard_shape` set to `None`. "
+                f"Got `shard_shape={shards}` instead."
+            )
+
+            raise ValueError(msg)
+        if serializer != "auto":
+            raise ValueError("Zarr v2 arrays do not support `serializer`.")
+
+        filters_parsed, compressor_parsed = _parse_chunk_encoding_v2(
+            compressor=compressors, filters=filters, dtype=np.dtype(dtype)
+        )
+
+        if dimension_names is not None:
+            raise ValueError("Zarr v2 arrays do not support dimension names.")
+        if order is None:
+            order_parsed = zarr_config.get("array.order")
+        else:
+            order_parsed = order
+
+        result = await AsyncArray._create_v2(
+            store_path=store_path,
+            shape=shape_parsed,
+            dtype=dtype_parsed,
+            chunks=chunk_shape_parsed,
+            dimension_separator=chunk_key_encoding_parsed.separator,
+            fill_value=fill_value,
+            order=order_parsed,
+            filters=filters_parsed,
+            compressor=compressor_parsed,
+            attributes=attributes,
+            overwrite=overwrite,
+            config=config_parsed,
+        )
+    else:
+        array_array, array_bytes, bytes_bytes = _parse_chunk_encoding_v3(
+            compressors=compressors,
+            filters=filters,
+            serializer=serializer,
+            dtype=dtype_parsed,
+        )
+        sub_codecs = cast(tuple[Codec, ...], (*array_array, array_bytes, *bytes_bytes))
+        codecs_out: tuple[Codec, ...]
+        if shard_shape_parsed is not None:
+            index_location = None
+            if isinstance(shards, dict):
+                # read the raw value first; constructing the enum directly from
+                # ``shards.get("index_location", None)`` would raise ValueError
+                # whenever the key is missing or explicitly None
+                index_location = shards.get("index_location", None)
+            if index_location is None:
+                index_location = ShardingCodecIndexLocation.end
+            else:
+                index_location = ShardingCodecIndexLocation(index_location)
+            sharding_codec = ShardingCodec(
+                chunk_shape=chunk_shape_parsed, codecs=sub_codecs, index_location=index_location
+            )
+            sharding_codec.validate(
+                shape=chunk_shape_parsed,
+                dtype=dtype_parsed,
+                chunk_grid=RegularChunkGrid(chunk_shape=shard_shape_parsed),
+            )
+            codecs_out = (sharding_codec,)
+            chunks_out = shard_shape_parsed
+        else:
+            chunks_out = chunk_shape_parsed
+            codecs_out = sub_codecs
+
+        result = await AsyncArray._create_v3(
+            store_path=store_path,
+            shape=shape_parsed,
+            dtype=dtype_parsed,
+            fill_value=fill_value,
+            attributes=attributes,
+            chunk_shape=chunks_out,
+            chunk_key_encoding=chunk_key_encoding_parsed,
+            codecs=codecs_out,
+            dimension_names=dimension_names,
+            overwrite=overwrite,
+            config=config_parsed,
+        )
+
+    return result
+
+
+def _parse_chunk_key_encoding(
+    data: ChunkKeyEncoding | ChunkKeyEncodingLike | None, zarr_format: ZarrFormat
+) -> ChunkKeyEncoding:
+    """
+    Take an implicit specification of a chunk key encoding and parse it into a ChunkKeyEncoding object.
+    """
+    if data is None:
+        if zarr_format == 2:
+            result = ChunkKeyEncoding.from_dict({"name": "v2", "separator": "."})
+        else:
+            result = ChunkKeyEncoding.from_dict({"name": "default", "separator": "/"})
+    elif isinstance(data, ChunkKeyEncoding):
+        result = data
+    else:
+        result = ChunkKeyEncoding.from_dict(data)
+    if zarr_format == 2 and result.name != "v2":
+        msg = (
+            "Invalid chunk key encoding. For Zarr v2 arrays, the `name` field of the "
+            f"chunk key encoding must be 'v2'. Got `name` = {result.name} instead."
+        )
+        raise ValueError(msg)
+    return result
+
+
+def _get_default_chunk_encoding_v3(
+    np_dtype: np.dtype[Any],
+) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]:
+    """
+    Get the default ArrayArrayCodecs, ArrayBytesCodec, and BytesBytesCodec for a given dtype.
+ """ + default_codecs = zarr_config.get("array.v3_default_codecs") + dtype = DataType.from_numpy(np_dtype) + if dtype == DataType.string: + dtype_key = "string" + elif dtype == DataType.bytes: + dtype_key = "bytes" + else: + dtype_key = "numeric" + + codec_dicts = default_codecs[dtype_key] + codecs = tuple(_resolve_codec(c) for c in codec_dicts) + array_bytes_maybe = None + array_array: list[ArrayArrayCodec] = [] + bytes_bytes: list[BytesBytesCodec] = [] + + for codec in codecs: + if isinstance(codec, ArrayBytesCodec): + if array_bytes_maybe is not None: + raise ValueError( + f"Got two instances of ArrayBytesCodec: {array_bytes_maybe} and {codec}. " + "Only one array-to-bytes codec is allowed." + ) + array_bytes_maybe = codec + elif isinstance(codec, ArrayArrayCodec): + array_array.append(codec) + elif isinstance(codec, BytesBytesCodec): + bytes_bytes.append(codec) + else: + raise TypeError(f"Unexpected codec type: {type(codec)}") + + if array_bytes_maybe is None: + raise ValueError("Required ArrayBytesCodec was not found.") + + return tuple(array_array), array_bytes_maybe, tuple(bytes_bytes) + + +def _get_default_chunk_encoding_v2( + np_dtype: np.dtype[Any], +) -> tuple[tuple[numcodecs.abc.Codec, ...] | None, numcodecs.abc.Codec | None]: + """ + Get the default chunk encoding for zarr v2 arrays, given a dtype + """ + + compressor_dict = _default_compressor(np_dtype) + filter_dicts = _default_filters(np_dtype) + + compressor = None + if compressor_dict is not None: + compressor = numcodecs.get_codec(compressor_dict) + + filters = None + if filter_dicts is not None: + filters = tuple(numcodecs.get_codec(f) for f in filter_dicts) + + return filters, compressor + + +def _parse_chunk_encoding_v2( + *, + compressor: CompressorsLike, + filters: FiltersLike, + dtype: np.dtype[Any], +) -> tuple[tuple[numcodecs.abc.Codec, ...] | None, numcodecs.abc.Codec | None]: + """ + Generate chunk encoding classes for v2 arrays with optional defaults. + """ + default_filters, default_compressor = _get_default_chunk_encoding_v2(dtype) + + _filters: tuple[numcodecs.abc.Codec, ...] | None + _compressor: numcodecs.abc.Codec | None + + if compressor is None or compressor == (): + _compressor = None + elif compressor == "auto": + _compressor = default_compressor + elif isinstance(compressor, tuple | list) and len(compressor) == 1: + _compressor = parse_compressor(compressor[0]) + else: + if isinstance(compressor, Iterable) and not isinstance(compressor, dict): + msg = f"For Zarr v2 arrays, the `compressor` must be a single codec. Got an iterable with type {type(compressor)} instead." + raise TypeError(msg) + _compressor = parse_compressor(compressor) + + if filters is None: + _filters = None + elif filters == "auto": + _filters = default_filters + else: + if isinstance(filters, Iterable): + for idx, f in enumerate(filters): + if not isinstance(f, numcodecs.abc.Codec): + msg = ( + "For Zarr v2 arrays, all elements of `filters` must be numcodecs codecs. " + f"Element at index {idx} has type {type(f)}, which is not a numcodecs codec." + ) + raise TypeError(msg) + _filters = parse_filters(filters) + + return _filters, _compressor + + +def _parse_chunk_encoding_v3( + *, + compressors: CompressorsLike, + filters: FiltersLike, + serializer: SerializerLike, + dtype: np.dtype[Any], +) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: + """ + Generate chunk encoding classes for v3 arrays with optional defaults. 
+ """ + default_array_array, default_array_bytes, default_bytes_bytes = _get_default_chunk_encoding_v3( + dtype + ) + maybe_bytes_bytes: Iterable[Codec | dict[str, JSON]] + maybe_array_array: Iterable[Codec | dict[str, JSON]] + out_bytes_bytes: tuple[BytesBytesCodec, ...] + if compressors is None: + out_bytes_bytes = () + + elif compressors == "auto": + out_bytes_bytes = default_bytes_bytes + + else: + if isinstance(compressors, dict | Codec): + maybe_bytes_bytes = (compressors,) + elif compressors is None: + maybe_bytes_bytes = () + else: + maybe_bytes_bytes = cast(Iterable[Codec | dict[str, JSON]], compressors) + + out_bytes_bytes = tuple(_parse_bytes_bytes_codec(c) for c in maybe_bytes_bytes) + out_array_array: tuple[ArrayArrayCodec, ...] + if filters is None: + out_array_array = () + elif filters == "auto": + out_array_array = default_array_array + else: + if isinstance(filters, dict | Codec): + maybe_array_array = (filters,) + elif filters is None: + maybe_array_array = () + else: + maybe_array_array = cast(Iterable[Codec | dict[str, JSON]], filters) + out_array_array = tuple(_parse_array_array_codec(c) for c in maybe_array_array) + + if serializer == "auto": + out_array_bytes = default_array_bytes + else: + out_array_bytes = _parse_array_bytes_codec(serializer) + + return out_array_array, out_array_bytes, out_bytes_bytes + + +def _parse_deprecated_compressor( + compressor: CompressorLike | None, compressors: CompressorsLike +) -> CompressorsLike | None: + if compressor: + if compressors != "auto": + raise ValueError("Cannot specify both `compressor` and `compressors`.") + warn( + "The `compressor` argument is deprecated. Use `compressors` instead.", + category=UserWarning, + stacklevel=2, + ) + compressors = (compressor,) + return compressors diff --git a/src/zarr/core/array_spec.py b/src/zarr/core/array_spec.py index ee6934d05f..b1a6a3cad0 100644 --- a/src/zarr/core/array_spec.py +++ b/src/zarr/core/array_spec.py @@ -21,7 +21,7 @@ from zarr.core.common import ChunkCoords -class ArrayConfigParams(TypedDict): +class ArrayConfigLike(TypedDict): """ A TypedDict model of the attributes of an ArrayConfig class, but with no required fields. This allows for partial construction of an ArrayConfig, with the assumption that the unset @@ -56,13 +56,13 @@ def __init__(self, order: MemoryOrder, write_empty_chunks: bool) -> None: object.__setattr__(self, "write_empty_chunks", write_empty_chunks_parsed) @classmethod - def from_dict(cls, data: ArrayConfigParams) -> Self: + def from_dict(cls, data: ArrayConfigLike) -> Self: """ Create an ArrayConfig from a dict. The keys of that dict are a subset of the attributes of the ArrayConfig class. Any keys missing from that dict will be set to the the values in the ``array`` namespace of ``zarr.config``. """ - kwargs_out: ArrayConfigParams = {} + kwargs_out: ArrayConfigLike = {} for f in fields(ArrayConfig): field_name = cast(Literal["order", "write_empty_chunks"], f.name) if field_name not in data: @@ -72,7 +72,7 @@ def from_dict(cls, data: ArrayConfigParams) -> Self: return cls(**kwargs_out) -def normalize_array_config(data: ArrayConfig | ArrayConfigParams | None) -> ArrayConfig: +def parse_array_config(data: ArrayConfig | ArrayConfigLike | None) -> ArrayConfig: """ Convert various types of data to an ArrayConfig. 
""" diff --git a/src/zarr/core/buffer/core.py b/src/zarr/core/buffer/core.py index 7ddedfe064..85a7351fc7 100644 --- a/src/zarr/core/buffer/core.py +++ b/src/zarr/core/buffer/core.py @@ -16,11 +16,6 @@ import numpy as np import numpy.typing as npt -from zarr.registry import ( - get_buffer_class, - get_ndbuffer_class, -) - if TYPE_CHECKING: from collections.abc import Iterable, Sequence from typing import Self @@ -507,4 +502,9 @@ class BufferPrototype(NamedTuple): # The default buffer prototype used throughout the Zarr codebase. def default_buffer_prototype() -> BufferPrototype: + from zarr.registry import ( + get_buffer_class, + get_ndbuffer_class, + ) + return BufferPrototype(buffer=get_buffer_class(), nd_buffer=get_ndbuffer_class()) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index ea050e39ef..d3e40c26ed 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -4,10 +4,11 @@ import math import numbers import operator +import warnings from abc import abstractmethod from dataclasses import dataclass from functools import reduce -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Literal import numpy as np @@ -26,6 +27,8 @@ from collections.abc import Iterator from typing import Self + from zarr.core.array import ShardsLike + def _guess_chunks( shape: ShapeLike, @@ -194,3 +197,55 @@ def get_nchunks(self, array_shape: ChunkCoords) -> int: itertools.starmap(ceildiv, zip(array_shape, self.chunk_shape, strict=True)), 1, ) + + +def _auto_partition( + *, + array_shape: tuple[int, ...], + chunk_shape: tuple[int, ...] | Literal["auto"], + shard_shape: ShardsLike | None, + dtype: np.dtype[Any], +) -> tuple[tuple[int, ...] | None, tuple[int, ...]]: + """ + Automatically determine the shard shape and chunk shape for an array, given the shape and dtype of the array. + If `shard_shape` is `None` and the chunk_shape is "auto", the chunks will be set heuristically based + on the dtype and shape of the array. + If `shard_shape` is "auto", then the shard shape will be set heuristically from the dtype and shape + of the array; if the `chunk_shape` is also "auto", then the chunks will be set heuristically as well, + given the dtype and shard shape. Otherwise, the chunks will be returned as-is. + """ + item_size = dtype.itemsize + if shard_shape is None: + _shards_out: None | tuple[int, ...] = None + if chunk_shape == "auto": + _chunks_out = _guess_chunks(array_shape, item_size) + else: + _chunks_out = chunk_shape + else: + if chunk_shape == "auto": + # aim for a 1MiB chunk + _chunks_out = _guess_chunks(array_shape, item_size, max_bytes=1024) + else: + _chunks_out = chunk_shape + + if shard_shape == "auto": + warnings.warn( + "Automatic shard shape inference is experimental and may change without notice.", + UserWarning, + stacklevel=2, + ) + _shards_out = () + for a_shape, c_shape in zip(array_shape, _chunks_out, strict=True): + # TODO: make a better heuristic than this. + # for each axis, if there are more than 8 chunks along that axis, then put + # 2 chunks in each shard for that axis. 
+ if a_shape // c_shape > 8: + _shards_out += (c_shape * 2,) + else: + _shards_out += (c_shape,) + elif isinstance(shard_shape, dict): + _shards_out = tuple(shard_shape["shape"]) + else: + _shards_out = shard_shape + + return _shards_out, _chunks_out diff --git a/src/zarr/core/chunk_key_encodings.py b/src/zarr/core/chunk_key_encodings.py index ed12ee3065..95ce9108f3 100644 --- a/src/zarr/core/chunk_key_encodings.py +++ b/src/zarr/core/chunk_key_encodings.py @@ -2,7 +2,7 @@ from abc import abstractmethod from dataclasses import dataclass -from typing import Literal, cast +from typing import Literal, TypedDict, cast from zarr.abc.metadata import Metadata from zarr.core.common import ( @@ -20,6 +20,11 @@ def parse_separator(data: JSON) -> SeparatorLiteral: return cast(SeparatorLiteral, data) +class ChunkKeyEncodingLike(TypedDict): + name: Literal["v2", "default"] + separator: SeparatorLiteral + + @dataclass(frozen=True) class ChunkKeyEncoding(Metadata): name: str @@ -31,10 +36,16 @@ def __init__(self, *, separator: SeparatorLiteral) -> None: object.__setattr__(self, "separator", separator_parsed) @classmethod - def from_dict(cls, data: dict[str, JSON] | ChunkKeyEncoding) -> ChunkKeyEncoding: + def from_dict( + cls, data: dict[str, JSON] | ChunkKeyEncoding | ChunkKeyEncodingLike + ) -> ChunkKeyEncoding: if isinstance(data, ChunkKeyEncoding): return data + # handle ChunkKeyEncodingParams + if "name" in data and "separator" in data: + data = {"name": data["name"], "configuration": {"separator": data["separator"]}} + # configuration is optional for chunk key encodings name_parsed, config_parsed = parse_named_configuration(data, require_configuration=False) if name_parsed == "default": diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 3db00b1a06..d53f3847a5 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -18,6 +18,7 @@ import numpy as np +from zarr.core.config import config as zarr_config from zarr.core.strings import _STRING_DTYPE if TYPE_CHECKING: @@ -197,3 +198,8 @@ def _warn_order_kwarg() -> None: "or change the global 'array.order' configuration variable." 
) warnings.warn(msg, RuntimeWarning, stacklevel=2) + + +def _default_zarr_format() -> ZarrFormat: + """Return the default zarr_version""" + return cast(ZarrFormat, int(zarr_config.get("default_zarr_format", 3))) diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index a14305aef8..421a100f1b 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -62,19 +62,33 @@ def reset(self) -> None: "zarr", defaults=[ { - "default_zarr_version": 3, + "default_zarr_format": 3, "array": { "order": "C", "write_empty_chunks": False, "v2_default_compressor": { - "numeric": "zstd", - "string": "vlen-utf8", - "bytes": "vlen-bytes", + "numeric": {"id": "zstd", "level": 0, "checksum": False}, + "string": {"id": "zstd", "level": 0, "checksum": False}, + "bytes": {"id": "zstd", "level": 0, "checksum": False}, + }, + "v2_default_filters": { + "numeric": None, + "string": [{"id": "vlen-utf8"}], + "bytes": [{"id": "vlen-bytes"}], }, "v3_default_codecs": { - "numeric": ["bytes", "zstd"], - "string": ["vlen-utf8"], - "bytes": ["vlen-bytes"], + "numeric": [ + {"name": "bytes", "configuration": {"endian": "little"}}, + {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, + ], + "string": [ + {"name": "vlen-utf8"}, + {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, + ], + "bytes": [ + {"name": "vlen-bytes"}, + {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, + ], }, }, "async": {"concurrency": 10, "timeout": None}, diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 2d7a21911a..29b25689c4 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -18,7 +18,18 @@ from zarr.abc.metadata import Metadata from zarr.abc.store import Store, set_or_delete from zarr.core._info import GroupInfo -from zarr.core.array import Array, AsyncArray, _build_parents +from zarr.core.array import ( + Array, + AsyncArray, + CompressorLike, + CompressorsLike, + FiltersLike, + SerializerLike, + ShardsLike, + _build_parents, + _parse_deprecated_compressor, + create_array, +) from zarr.core.attributes import Attributes from zarr.core.buffer import default_buffer_prototype from zarr.core.common import ( @@ -46,9 +57,10 @@ from collections.abc import AsyncGenerator, Generator, Iterable, Iterator from typing import Any - from zarr.abc.codec import Codec + from zarr.core.array_spec import ArrayConfig, ArrayConfigLike from zarr.core.buffer import Buffer, BufferPrototype - from zarr.core.chunk_key_encodings import ChunkKeyEncoding + from zarr.core.chunk_key_encodings import ChunkKeyEncoding, ChunkKeyEncodingLike + from zarr.core.common import MemoryOrder logger = logging.getLogger("zarr.group") @@ -58,7 +70,7 @@ def parse_zarr_format(data: Any) -> ZarrFormat: """Parse the zarr_format field from metadata.""" if data in (2, 3): - return cast(Literal[2, 3], data) + return cast(ZarrFormat, data) msg = f"Invalid zarr_format. Expected one of 2 or 3. Got {data}." 
    raise ValueError(msg)


@@ -434,7 +446,7 @@ async def from_store(
     async def open(
         cls,
         store: StoreLike,
-        zarr_format: Literal[2, 3] | None = 3,
+        zarr_format: ZarrFormat | None = 3,
         use_consolidated: bool | str | None = None,
     ) -> AsyncGroup:
         """Open a new AsyncGroup
@@ -501,9 +513,9 @@ async def open(
                 (store_path / str(consolidated_key)).get(),
             )
             if zarr_json_bytes is not None and zgroup_bytes is not None:
-                # TODO: revisit this exception type
-                # alternatively, we could warn and favor v3
-                raise ValueError("Both zarr.json and .zgroup objects exist")
+                # warn and favor v3
+                msg = f"Both zarr.json (Zarr v3) and .zgroup (Zarr v2) metadata objects exist at {store_path}. Zarr v3 will be used."
+                warnings.warn(msg, stacklevel=1)
             if zarr_json_bytes is None and zgroup_bytes is None:
                 raise FileNotFoundError(
                     f"could not find zarr.json or .zgroup objects in {store_path}"
@@ -998,116 +1010,136 @@ async def create_array(
         name: str,
         *,
         shape: ShapeLike,
-        dtype: npt.DTypeLike = "float64",
-        fill_value: Any | None = None,
+        dtype: npt.DTypeLike,
+        chunks: ChunkCoords | Literal["auto"] = "auto",
+        shards: ShardsLike | None = None,
+        filters: FiltersLike = "auto",
+        compressors: CompressorsLike = "auto",
+        compressor: CompressorLike = None,
+        serializer: SerializerLike = "auto",
+        fill_value: Any | None = 0,
+        order: MemoryOrder | None = None,
         attributes: dict[str, JSON] | None = None,
-        # v3 only
-        chunk_shape: ChunkCoords | None = None,
-        chunk_key_encoding: (
-            ChunkKeyEncoding
-            | tuple[Literal["default"], Literal[".", "/"]]
-            | tuple[Literal["v2"], Literal[".", "/"]]
-            | None
-        ) = None,
-        codecs: Iterable[Codec | dict[str, JSON]] | None = None,
+        chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingLike | None = None,
         dimension_names: Iterable[str] | None = None,
-        # v2 only
-        chunks: ShapeLike | None = None,
-        dimension_separator: Literal[".", "/"] | None = None,
-        order: Literal["C", "F"] | None = None,
-        filters: list[dict[str, JSON]] | None = None,
-        compressor: dict[str, JSON] | None = None,
-        # runtime
+        storage_options: dict[str, Any] | None = None,
         overwrite: bool = False,
-        data: npt.ArrayLike | None = None,
+        config: ArrayConfig | ArrayConfigLike | None = None,
     ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]:
-        """
-        Create a Zarr array within this AsyncGroup.
-        This method lightly wraps AsyncArray.create.
+        """Create an array within this group.
+
+        This method lightly wraps :func:`zarr.core.array.create_array`.

        Parameters
        ----------
        name : str
-            The name of the array.
-        shape : tuple[int, ...]
-            The shape of the array.
-        dtype : np.DtypeLike = float64
-            The data type of the array.
-        chunk_shape : tuple[int, ...] | None = None
-            The shape of the chunks of the array.
-            V3 only. V2 arrays should use `chunks` instead.
+            The name of the array relative to the group.
+        shape : ChunkCoords
+            Shape of the array.
+        dtype : npt.DTypeLike
+            Data type of the array.
+        chunks : ChunkCoords, optional
+            Chunk shape of the array.
+            If not specified, defaults are guessed based on the shape and dtype.
-        chunk_key_encoding : ChunkKeyEncoding | tuple[Literal["default"], Literal[".", "/"]] | tuple[Literal["v2"], Literal[".", "/"]] | None = None
+        shards : ChunkCoords, optional
+            Shard shape of the array. The default value of ``None`` results in no sharding at all.
+        filters : Iterable[Codec], optional
+            Iterable of filters to apply to each chunk of the array, in order, before serializing that
+            chunk to bytes.
+
+            For Zarr v3, a "filter" is a codec that takes an array and returns an array,
+            and these values must be instances of ``ArrayArrayCodec``, or dict representations
+            of ``ArrayArrayCodec``.
+            If ``filters`` and ``compressors`` are not specified, then the default codecs for
+            Zarr v3 will be used.
+            These defaults can be changed by modifying the value of ``array.v3_default_codecs``
+            in :mod:`zarr.core.config`.
+            Use ``None`` to omit default filters.
+
+            For Zarr v2, a "filter" can be any numcodecs codec; you should ensure that the
+            order of your filters is consistent with the behavior of each filter.
+            If no ``filters`` are provided, a default set of filters will be used.
+            These defaults can be changed by modifying the value of ``array.v2_default_filters``
+            in :mod:`zarr.core.config`.
+            Use ``None`` to omit default filters.
+        compressors : Iterable[Codec], optional
+            List of compressors to apply to the array. Compressors are applied in order, and after any
+            filters are applied (if any are specified).
+
+            For Zarr v3, a "compressor" is a codec that takes a bytestream and
+            returns another bytestream. Multiple compressors may be provided for Zarr v3.
+            If ``filters`` and ``compressors`` are not specified, then the default codecs for
+            Zarr v3 will be used.
+            These defaults can be changed by modifying the value of ``array.v3_default_codecs``
+            in :mod:`zarr.core.config`.
+            Use ``None`` to omit default compressors.
+
+            For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may
+            be provided for Zarr v2.
+            If no ``compressors`` are provided, a default compressor will be used.
+            These defaults can be changed by modifying the value of ``array.v2_default_compressor``
+            in :mod:`zarr.core.config`.
+            Use ``None`` to omit the default compressor.
+        compressor : Codec, optional
+            Deprecated in favor of ``compressors``.
+        serializer : dict[str, JSON] | ArrayBytesCodec, optional
+            Array-to-bytes codec to use for encoding the array data.
+            Zarr v3 only. Zarr v2 arrays use implicit array-to-bytes conversion.
+            If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used.
+        fill_value : Any, optional
+            Fill value for the array.
+        order : {"C", "F"}, optional
+            The memory order of the array (default is "C").
+            For Zarr v2, this parameter sets the memory order of the array.
+            For Zarr v3, this parameter is deprecated, because memory order
+            is a runtime parameter for Zarr v3 arrays. The recommended way to specify the memory
+            order for Zarr v3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``.
+            If no ``order`` is provided, a default order will be used.
+            This default can be changed by modifying the value of ``array.order`` in :mod:`zarr.core.config`.
+        attributes : dict, optional
+            Attributes for the array.
+        chunk_key_encoding : ChunkKeyEncoding, optional
             A specification of how the chunk keys are represented in storage.
-            V3 only. V2 arrays should use `dimension_separator` instead.
-            Default is ``("default", "/")``.
-        codecs : Iterable[Codec | dict[str, JSON]] | None = None
-            An iterable of Codec or dict serializations of Codecs. The elements of
-            this collection specify the transformation from array values to stored bytes.
-            V3 only. V2 arrays should use ``filters`` and ``compressor`` instead.
-
-            If no codecs are provided, default codecs will be used:
-
-            - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``.
-            - For Unicode strings, the default is ``VLenUTF8Codec``.
-            - For bytes or objects, the default is ``VLenBytesCodec``.
-
-            These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
-        dimension_names : Iterable[str] | None = None
-            The names of the dimensions of the array. V3 only.
-        chunks : ChunkCoords | None = None
-            The shape of the chunks of the array.
-            V2 only. V3 arrays should use ``chunk_shape`` instead.
-            If not specified, default are guessed based on the shape and dtype.
-        dimension_separator : Literal[".", "/"] | None = None
-            The delimiter used for the chunk keys. (default: ".")
-            V2 only. V3 arrays should use ``chunk_key_encoding`` instead.
-        order : Literal["C", "F"] | None = None
-            The memory order of the array (default is specified by ``array.order`` in :mod:`zarr.core.config`).
-        filters : list[dict[str, JSON]] | None = None
-            Sequence of filters to use to encode chunk data prior to compression.
-            V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor``
-            nor ``filters`` are provided, a default compressor will be used. (see
-            ``compressor`` for details)
-        compressor : dict[str, JSON] | None = None
-            The compressor used to compress the data (default is None).
-            V2 only. V3 arrays should use ``codecs`` instead.
-
-            If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used:
-
-            - For numeric arrays, the default is ``ZstdCodec``.
-            - For Unicode strings, the default is ``VLenUTF8Codec``.
-            - For bytes or objects, the default is ``VLenBytesCodec``.
-
-            These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`.
-        overwrite : bool = False
-            If True, a pre-existing array or group at the path of this array will
-            be overwritten. If False, the presence of a pre-existing array or group is
-            an error.
+            For Zarr v3, the default is ``{"name": "default", "separator": "/"}``.
+            For Zarr v2, the default is ``{"name": "v2", "separator": "."}``.
+        dimension_names : Iterable[str], optional
+            The names of the dimensions (default is None).
+            Zarr v3 only. Zarr v2 arrays should not use this parameter.
+        storage_options : dict, optional
+            If using an fsspec URL to create the store, these will be passed to the backend implementation.
+            Ignored otherwise.
+        overwrite : bool, default False
+            Whether to overwrite an array with the same name in the store, if one exists.
+        config : ArrayConfig or ArrayConfigLike, optional
+            Runtime configuration for the array.
Returns ------- AsyncArray """ - return await AsyncArray.create( - self.store_path / name, + + compressors = _parse_deprecated_compressor(compressor, compressors) + return await create_array( + store=self.store_path, + name=name, shape=shape, dtype=dtype, - chunk_shape=chunk_shape, + chunks=chunks, + shards=shards, + filters=filters, + compressors=compressors, + serializer=serializer, fill_value=fill_value, + order=order, + zarr_format=self.metadata.zarr_format, + attributes=attributes, chunk_key_encoding=chunk_key_encoding, - codecs=codecs, dimension_names=dimension_names, - attributes=attributes, - chunks=chunks, - dimension_separator=dimension_separator, - order=order, - filters=filters, - compressor=compressor, + storage_options=storage_options, overwrite=overwrite, - zarr_format=self.metadata.zarr_format, - data=data, + config=config, ) @deprecated("Use AsyncGroup.create_array instead.") @@ -1719,7 +1751,7 @@ def from_store( def open( cls, store: StoreLike, - zarr_format: Literal[2, 3] | None = 3, + zarr_format: ZarrFormat | None = 3, ) -> Group: """Open a group from an initialized store. @@ -1755,8 +1787,8 @@ def __getitem__(self, path: str) -> Array | Group: -------- >>> import zarr >>> group = Group.from_store(zarr.storage.MemoryStore() - >>> group.create_array(name="subarray", shape=(10,), chunk_shape=(10,)) - >>> group.create_group(name="subgroup").create_array(name="subarray", shape=(10,), chunk_shape=(10,)) + >>> group.create_array(name="subarray", shape=(10,), chunks=(10,)) + >>> group.create_group(name="subgroup").create_array(name="subarray", shape=(10,), chunks=(10,)) >>> group["subarray"] >>> group["subgroup"] @@ -1790,7 +1822,7 @@ def get(self, path: str, default: DefaultT | None = None) -> Array | Group | Def -------- >>> import zarr >>> group = Group.from_store(zarr.storage.MemoryStore() - >>> group.create_array(name="subarray", shape=(10,), chunk_shape=(10,)) + >>> group.create_array(name="subarray", shape=(10,), chunks=(10,)) >>> group.create_group(name="subgroup") >>> group.get("subarray") @@ -1816,7 +1848,7 @@ def __delitem__(self, key: str) -> None: -------- >>> import zarr >>> group = Group.from_store(zarr.storage.MemoryStore() - >>> group.create_array(name="subarray", shape=(10,), chunk_shape=(10,)) + >>> group.create_array(name="subarray", shape=(10,), chunks=(10,)) >>> del group["subarray"] >>> "subarray" in group False @@ -1831,8 +1863,8 @@ def __iter__(self) -> Iterator[str]: >>> g1 = zarr.group() >>> g2 = g1.create_group('foo') >>> g3 = g1.create_group('bar') - >>> d1 = g1.create_array('baz', shape=(10,), chunk_shape=(10,)) - >>> d2 = g1.create_array('quux', shape=(10,), chunk_shape=(10,)) + >>> d1 = g1.create_array('baz', shape=(10,), chunks=(10,)) + >>> d2 = g1.create_array('quux', shape=(10,), chunks=(10,)) >>> for name in g1: ... print(name) baz @@ -2023,8 +2055,8 @@ def keys(self) -> Generator[str, None]: >>> g1 = zarr.group() >>> g2 = g1.create_group('foo') >>> g3 = g1.create_group('bar') - >>> d1 = g1.create_array('baz', shape=(10,), chunk_shape=(10,)) - >>> d2 = g1.create_array('quux', shape=(10,), chunk_shape=(10,)) + >>> d1 = g1.create_array('baz', shape=(10,), chunks=(10,)) + >>> d2 = g1.create_array('quux', shape=(10,), chunks=(10,)) >>> for name in g1.keys(): ... 
print(name)
         baz
@@ -2042,7 +2074,7 @@ def __contains__(self, member: str) -> bool:
         >>> import zarr
         >>> g1 = zarr.group()
         >>> g2 = g1.create_group('foo')
-        >>> d1 = g1.create_array('bar', shape=(10,), chunk_shape=(10,))
+        >>> d1 = g1.create_array('bar', shape=(10,), chunks=(10,))
         >>> 'foo' in g1
         True
         >>> 'bar' in g1
@@ -2105,7 +2137,7 @@ def arrays(self) -> Generator[tuple[str, Array], None]:
         --------
         >>> import zarr
         >>> group = zarr.group()
-        >>> group.create_array("subarray", shape=(10,), chunk_shape=(10,))
+        >>> group.create_array("subarray", shape=(10,), chunks=(10,))
         >>> for name, subarray in group.arrays():
         ...     print(name, subarray)
         subarray
@@ -2120,7 +2152,7 @@ def array_keys(self) -> Generator[str, None]:
         --------
         >>> import zarr
         >>> group = zarr.group()
-        >>> group.create_array("subarray", shape=(10,), chunk_shape=(10,))
+        >>> group.create_array("subarray", shape=(10,), chunks=(10,))
         >>> for name in group.array_keys():
         ...     print(name)
         subarray
@@ -2136,7 +2168,7 @@ def array_values(self) -> Generator[Array, None]:
         --------
         >>> import zarr
         >>> group = zarr.group()
-        >>> group.create_array("subarray", shape=(10,), chunk_shape=(10,))
+        >>> group.create_array("subarray", shape=(10,), chunks=(10,))
         >>> for subarray in group.array_values():
         ...     print(subarray)
@@ -2225,120 +2257,134 @@ def create_array(
         name: str,
         *,
         shape: ShapeLike,
-        dtype: npt.DTypeLike = "float64",
-        fill_value: Any | None = None,
+        dtype: npt.DTypeLike,
+        chunks: ChunkCoords | Literal["auto"] = "auto",
+        shards: ShardsLike | None = None,
+        filters: FiltersLike = "auto",
+        compressors: CompressorsLike = "auto",
+        compressor: CompressorLike = None,
+        serializer: SerializerLike = "auto",
+        fill_value: Any | None = 0,
+        order: MemoryOrder | None = "C",
         attributes: dict[str, JSON] | None = None,
-        # v3 only
-        chunk_shape: ChunkCoords | None = None,
-        chunk_key_encoding: (
-            ChunkKeyEncoding
-            | tuple[Literal["default"], Literal[".", "/"]]
-            | tuple[Literal["v2"], Literal[".", "/"]]
-            | None
-        ) = None,
-        codecs: Iterable[Codec | dict[str, JSON]] | None = None,
+        chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingLike | None = None,
         dimension_names: Iterable[str] | None = None,
-        # v2 only
-        chunks: ChunkCoords | None = None,
-        dimension_separator: Literal[".", "/"] | None = None,
-        order: Literal["C", "F"] | None = None,
-        filters: list[dict[str, JSON]] | None = None,
-        compressor: dict[str, JSON] | None = None,
-        # runtime
+        storage_options: dict[str, Any] | None = None,
         overwrite: bool = False,
-        data: npt.ArrayLike | None = None,
+        config: ArrayConfig | ArrayConfigLike | None = None,
    ) -> Array:
-        """Create a zarr array within this AsyncGroup.
+        """Create an array within this group.

-        This method lightly wraps `AsyncArray.create`.
+        This method lightly wraps :func:`zarr.core.array.create_array`.

        Parameters
        ----------
        name : str
-            The name of the array.
-        shape : tuple[int, ...]
-            The shape of the array.
-        dtype : np.DtypeLike = float64
-            The data type of the array.
-        chunk_shape : tuple[int, ...] | None = None
-            The shape of the chunks of the array.
-            V3 only. V2 arrays should use `chunks` instead.
+            The name of the array relative to the group.
+        shape : ChunkCoords
+            Shape of the array.
+        dtype : npt.DTypeLike
+            Data type of the array.
+        chunks : ChunkCoords, optional
+            Chunk shape of the array.
+            If not specified, defaults are guessed based on the shape and dtype.
-        chunk_key_encoding : ChunkKeyEncoding | tuple[Literal["default"], Literal[".", "/"]] | tuple[Literal["v2"], Literal[".", "/"]] | None = None
+        shards : ChunkCoords, optional
+            Shard shape of the array. The default value of ``None`` results in no sharding at all.
+        filters : Iterable[Codec], optional
+            Iterable of filters to apply to each chunk of the array, in order, before serializing that
+            chunk to bytes.
+
+            For Zarr v3, a "filter" is a codec that takes an array and returns an array,
+            and these values must be instances of ``ArrayArrayCodec``, or dict representations
+            of ``ArrayArrayCodec``.
+            If ``filters`` and ``compressors`` are not specified, then the default codecs for
+            Zarr v3 will be used.
+            These defaults can be changed by modifying the value of ``array.v3_default_codecs``
+            in :mod:`zarr.core.config`.
+            Use ``None`` to omit default filters.
+
+            For Zarr v2, a "filter" can be any numcodecs codec; you should ensure that the
+            order of your filters is consistent with the behavior of each filter.
+            If no ``filters`` are provided, a default set of filters will be used.
+            These defaults can be changed by modifying the value of ``array.v2_default_filters``
+            in :mod:`zarr.core.config`.
+            Use ``None`` to omit default filters.
+        compressors : Iterable[Codec], optional
+            List of compressors to apply to the array. Compressors are applied in order, and after any
+            filters are applied (if any are specified).
+
+            For Zarr v3, a "compressor" is a codec that takes a bytestream and
+            returns another bytestream. Multiple compressors may be provided for Zarr v3.
+            If ``filters`` and ``compressors`` are not specified, then the default codecs for
+            Zarr v3 will be used.
+            These defaults can be changed by modifying the value of ``array.v3_default_codecs``
+            in :mod:`zarr.core.config`.
+            Use ``None`` to omit default compressors.
+
+            For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may
+            be provided for Zarr v2.
+            If no ``compressors`` are provided, a default compressor will be used.
+            These defaults can be changed by modifying the value of ``array.v2_default_compressor``
+            in :mod:`zarr.core.config`.
+            Use ``None`` to omit the default compressor.
+        compressor : Codec, optional
+            Deprecated in favor of ``compressors``.
+        serializer : dict[str, JSON] | ArrayBytesCodec, optional
+            Array-to-bytes codec to use for encoding the array data.
+            Zarr v3 only. Zarr v2 arrays use implicit array-to-bytes conversion.
+            If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used.
+        fill_value : Any, optional
+            Fill value for the array.
+        order : {"C", "F"}, optional
+            The memory order of the array (default is "C").
+            For Zarr v2, this parameter sets the memory order of the array.
+            For Zarr v3, this parameter is deprecated, because memory order
+            is a runtime parameter for Zarr v3 arrays. The recommended way to specify the memory
+            order for Zarr v3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``.
+            If no ``order`` is provided, a default order will be used.
+            This default can be changed by modifying the value of ``array.order`` in :mod:`zarr.core.config`.
+        attributes : dict, optional
+            Attributes for the array.
+        chunk_key_encoding : ChunkKeyEncoding, optional
             A specification of how the chunk keys are represented in storage.
-            V3 only. V2 arrays should use `dimension_separator` instead.
-            Default is ``("default", "/")``.
-        codecs : Iterable[Codec | dict[str, JSON]] | None = None
-            An iterable of Codec or dict serializations of Codecs. The elements of
-            this collection specify the transformation from array values to stored bytes.
-            V3 only. V2 arrays should use ``filters`` and ``compressor`` instead.
-
-            If no codecs are provided, default codecs will be used:
-
-            - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``.
-            - For Unicode strings, the default is ``VLenUTF8Codec``.
-            - For bytes or objects, the default is ``VLenBytesCodec``.
-
-            These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
-        dimension_names : Iterable[str] | None = None
-            The names of the dimensions of the array. V3 only.
-        chunks : ChunkCoords | None = None
-            The shape of the chunks of the array.
-            V2 only. V3 arrays should use ``chunk_shape`` instead.
-            If not specified, default are guessed based on the shape and dtype.
-        dimension_separator : Literal[".", "/"] | None = None
-            The delimiter used for the chunk keys. (default: ".")
-            V2 only. V3 arrays should use ``chunk_key_encoding`` instead.
-        order : Literal["C", "F"] | None = None
-            The memory order of the array (default is specified by ``array.order`` in :mod:`zarr.core.config`).
-        filters : list[dict[str, JSON]] | None = None
-            Sequence of filters to use to encode chunk data prior to compression.
-            V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor``
-            nor ``filters`` are provided, a default compressor will be used. (see
-            ``compressor`` for details)
-        compressor : dict[str, JSON] | None = None
-            The compressor used to compress the data (default is None).
-            V2 only. V3 arrays should use ``codecs`` instead.
-
-            If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used:
-
-            - For numeric arrays, the default is ``ZstdCodec``.
-            - For Unicode strings, the default is ``VLenUTF8Codec``.
-            - For bytes or objects, the default is ``VLenBytesCodec``.
-
-            These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`.
-        overwrite : bool = False
-            If True, a pre-existing array or group at the path of this array will
-            be overwritten. If False, the presence of a pre-existing array or group is
-            an error.
-        data : npt.ArrayLike | None = None
-            Array data to initialize the array with.
+            For Zarr v3, the default is ``{"name": "default", "separator": "/"}``.
+            For Zarr v2, the default is ``{"name": "v2", "separator": "."}``.
+        dimension_names : Iterable[str], optional
+            The names of the dimensions (default is None).
+            Zarr v3 only. Zarr v2 arrays should not use this parameter.
+        storage_options : dict, optional
+            If using an fsspec URL to create the store, these will be passed to the backend implementation.
+            Ignored otherwise.
+        overwrite : bool, default False
+            Whether to overwrite an array with the same name in the store, if one exists.
+        config : ArrayConfig or ArrayConfigLike, optional
+            Runtime configuration for the array.
Returns ------- - - Array - + AsyncArray """ + compressors = _parse_deprecated_compressor(compressor, compressors) return Array( self._sync( self._async_group.create_array( name=name, shape=shape, dtype=dtype, + chunks=chunks, + shards=shards, fill_value=fill_value, attributes=attributes, - chunk_shape=chunk_shape, chunk_key_encoding=chunk_key_encoding, - codecs=codecs, + compressors=compressors, + serializer=serializer, dimension_names=dimension_names, - chunks=chunks, - dimension_separator=dimension_separator, order=order, filters=filters, - compressor=compressor, overwrite=overwrite, - data=data, + storage_options=storage_options, + config=config, ) ) ) @@ -2594,121 +2640,136 @@ def array( self, name: str, *, - shape: ChunkCoords, - dtype: npt.DTypeLike = "float64", - fill_value: Any | None = None, + shape: ShapeLike, + dtype: npt.DTypeLike, + chunks: ChunkCoords | Literal["auto"] = "auto", + shards: ChunkCoords | Literal["auto"] | None = None, + filters: FiltersLike = "auto", + compressors: CompressorsLike = "auto", + compressor: CompressorLike = None, + serializer: SerializerLike = "auto", + fill_value: Any | None = 0, + order: MemoryOrder | None = "C", attributes: dict[str, JSON] | None = None, - # v3 only - chunk_shape: ChunkCoords | None = None, - chunk_key_encoding: ( - ChunkKeyEncoding - | tuple[Literal["default"], Literal[".", "/"]] - | tuple[Literal["v2"], Literal[".", "/"]] - | None - ) = None, - codecs: Iterable[Codec | dict[str, JSON]] | None = None, + chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingLike | None = None, dimension_names: Iterable[str] | None = None, - # v2 only - chunks: ChunkCoords | None = None, - dimension_separator: Literal[".", "/"] | None = None, - order: Literal["C", "F"] | None = None, - filters: list[dict[str, JSON]] | None = None, - compressor: dict[str, JSON] | None = None, - # runtime + storage_options: dict[str, Any] | None = None, overwrite: bool = False, + config: ArrayConfig | ArrayConfigLike | None = None, data: npt.ArrayLike | None = None, ) -> Array: - """Create a zarr array within this AsyncGroup. + """Create an array within this group. - This method lightly wraps `AsyncArray.create`. + This method lightly wraps :func:`zarr.core.array.create_array`. Parameters ---------- name : str - The name of the array. - shape : tuple[int, ...] - The shape of the array. - dtype : np.DtypeLike = float64 - The data type of the array. - chunk_shape : tuple[int, ...] | None = None - The shape of the chunks of the array. - V3 only. V2 arrays should use `chunks` instead. + The name of the array relative to the group. If ``path`` is ``None``, the array will be located + at the root of the store. + shape : ChunkCoords + Shape of the array. + dtype : npt.DTypeLike + Data type of the array. + chunks : ChunkCoords, optional + Chunk shape of the array. If not specified, default are guessed based on the shape and dtype. - chunk_key_encoding : ChunkKeyEncoding | tuple[Literal["default"], Literal[".", "/"]] | tuple[Literal["v2"], Literal[".", "/"]] | None = None + shards : ChunkCoords, optional + Shard shape of the array. The default value of ``None`` results in no sharding at all. + filters : Iterable[Codec], optional + Iterable of filters to apply to each chunk of the array, in order, before serializing that + chunk to bytes. + + For Zarr v3, a "filter" is a codec that takes an array and returns an array, + and these values must be instances of ``ArrayArrayCodec``, or dict representations + of ``ArrayArrayCodec``. 
+            If ``filters`` and ``compressors`` are not specified, then the default codecs for
+            Zarr v3 will be used.
+            These defaults can be changed by modifying the value of ``array.v3_default_codecs``
+            in :mod:`zarr.core.config`.
+            Use ``None`` to omit default filters.
+
+            For Zarr v2, a "filter" can be any numcodecs codec; you should ensure that the
+            order of your filters is consistent with the behavior of each filter.
+            If no ``filters`` are provided, a default set of filters will be used.
+            These defaults can be changed by modifying the value of ``array.v2_default_filters``
+            in :mod:`zarr.core.config`.
+            Use ``None`` to omit default filters.
+        compressors : Iterable[Codec], optional
+            List of compressors to apply to the array. Compressors are applied in order, and after any
+            filters are applied (if any are specified).
+
+            For Zarr v3, a "compressor" is a codec that takes a bytestream, and
+            returns another bytestream. Multiple compressors may be provided for Zarr v3.
+            If ``filters`` and ``compressors`` are not specified, then the default codecs for
+            Zarr v3 will be used.
+            These defaults can be changed by modifying the value of ``array.v3_default_codecs``
+            in :mod:`zarr.core.config`.
+            Use ``None`` to omit default compressors.
+
+            For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may
+            be provided for Zarr v2.
+            If no ``compressors`` are provided, a default compressor will be used.
+            These defaults can be changed by modifying the value of ``array.v2_default_compressor``
+            in :mod:`zarr.core.config`.
+            Use ``None`` to omit the default compressor.
+        compressor : Codec, optional
+            Deprecated in favor of ``compressors``.
+        serializer : dict[str, JSON] | ArrayBytesCodec, optional
+            Array-to-bytes codec to use for encoding the array data.
+            Zarr v3 only. Zarr v2 arrays use implicit array-to-bytes conversion.
+            If no ``serializer`` is provided, the ``zarr.codecs.BytesCodec`` codec will be used.
+        fill_value : Any, optional
+            Fill value for the array.
+        order : {"C", "F"}, optional
+            The memory order of the array (default is "C").
+            For Zarr v2, this parameter sets the memory order of the array.
+            For Zarr v3, this parameter is deprecated, because memory order
+            is a runtime parameter for Zarr v3 arrays. The recommended way to specify the memory
+            order for Zarr v3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``.
+            If no ``order`` is provided, a default order will be used.
+            This default can be changed by modifying the value of ``array.order`` in :mod:`zarr.core.config`.
+        attributes : dict, optional
+            Attributes for the array.
+        chunk_key_encoding : ChunkKeyEncoding, optional
             A specification of how the chunk keys are represented in storage.
-            V3 only. V2 arrays should use `dimension_separator` instead.
-            Default is ``("default", "/")``.
-        codecs : Iterable[Codec | dict[str, JSON]] | None = None
-            An iterable of Codec or dict serializations of Codecs. The elements of
-            this collection specify the transformation from array values to stored bytes.
-            V3 only. V2 arrays should use ``filters`` and ``compressor`` instead.
-
-            If no codecs are provided, default codecs will be used:
-
-            - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``.
-            - For Unicode strings, the default is ``VLenUTF8Codec``.
-            - For bytes or objects, the default is ``VLenBytesCodec``.
-
-            These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
-        dimension_names : Iterable[str] | None = None
-            The names of the dimensions of the array. V3 only.
-        chunks : ChunkCoords | None = None
-            The shape of the chunks of the array.
-            V2 only. V3 arrays should use ``chunk_shape`` instead.
-            If not specified, default are guessed based on the shape and dtype.
-        dimension_separator : Literal[".", "/"] | None = None
-            The delimiter used for the chunk keys. (default: ".")
-            V2 only. V3 arrays should use ``chunk_key_encoding`` instead.
-        order : Literal["C", "F"] | None = None
-            The memory order of the array (default is specified by ``array.order`` in :mod:`zarr.core.config`).
-        filters : list[dict[str, JSON]] | None = None
-            Sequence of filters to use to encode chunk data prior to compression.
-            V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor``
-            nor ``filters`` are provided, a default compressor will be used. (see
-            ``compressor`` for details)
-        compressor : dict[str, JSON] | None = None
-            The compressor used to compress the data (default is None).
-            V2 only. V3 arrays should use ``codecs`` instead.
-
-            If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used:
-
-            - For numeric arrays, the default is ``ZstdCodec``.
-            - For Unicode strings, the default is ``VLenUTF8Codec``.
-            - For bytes or objects, the default is ``VLenBytesCodec``.
-
-            These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`.
-        overwrite : bool = False
-            If True, a pre-existing array or group at the path of this array will
-            be overwritten. If False, the presence of a pre-existing array or group is
-            an error.
-        data : npt.ArrayLike | None = None
-            Array data to initialize the array with.
+            For Zarr v3, the default is ``{"name": "default", "separator": "/"}``.
+            For Zarr v2, the default is ``{"name": "v2", "separator": "."}``.
+        dimension_names : Iterable[str], optional
+            The names of the dimensions (default is None).
+            Zarr v3 only. Zarr v2 arrays should not use this parameter.
+        storage_options : dict, optional
+            If using an fsspec URL to create the store, these will be passed to the backend implementation.
+            Ignored otherwise.
+        overwrite : bool, default False
+            Whether to overwrite an array with the same name in the store, if one exists.
+        config : ArrayConfig or ArrayConfigLike, optional
+            Runtime configuration for the array.
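
To make the deprecation of ``compressor`` concrete, a small sketch (hypothetical array names; an in-memory v2 group): both spellings are routed through ``_parse_deprecated_compressor`` and should produce the same chunk encoding:

    import numcodecs

    import zarr

    root = zarr.group(store={}, zarr_format=2)

    # deprecated singular keyword: one numcodecs compressor for a v2 array
    a = root.create_array(
        "a", shape=(10,), dtype="uint8", compressor=numcodecs.Zstd(level=3)
    )
    # preferred plural keyword; Zarr v2 still accepts at most one compressor
    b = root.create_array(
        "b", shape=(10,), dtype="uint8", compressors=numcodecs.Zstd(level=3)
    )
    assert a.metadata.compressor == b.metadata.compressor
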
Returns
         -------
-
-        Array
-
+        Array
         """
+        compressors = _parse_deprecated_compressor(compressor, compressors)
         return Array(
             self._sync(
                 self._async_group.create_array(
                     name=name,
                     shape=shape,
                     dtype=dtype,
+                    chunks=chunks,
+                    shards=shards,
                     fill_value=fill_value,
                     attributes=attributes,
-                    chunk_shape=chunk_shape,
                     chunk_key_encoding=chunk_key_encoding,
-                    codecs=codecs,
+                    compressors=compressors,
+                    serializer=serializer,
                     dimension_names=dimension_names,
-                    chunks=chunks,
-                    dimension_separator=dimension_separator,
                     order=order,
                     filters=filters,
-                    compressor=compressor,
                     overwrite=overwrite,
-                    data=data,
+                    storage_options=storage_options,
+                    config=config,
                 )
             )
         )
diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py
index af26034b1d..bc7fd32cbf 100644
--- a/src/zarr/core/metadata/v2.py
+++ b/src/zarr/core/metadata/v2.py
@@ -6,6 +6,8 @@
 from functools import cached_property
 from typing import TYPE_CHECKING, TypedDict, cast
 
+import numcodecs.abc
+
 from zarr.abc.metadata import Metadata
 
 if TYPE_CHECKING:
@@ -14,7 +16,7 @@
     import numpy.typing as npt
 
     from zarr.core.buffer import Buffer, BufferPrototype
-    from zarr.core.common import JSON, ChunkCoords
+    from zarr.core.common import ChunkCoords
 
 import json
 from dataclasses import dataclass, field, fields, replace
@@ -25,7 +27,7 @@
 from zarr.core.array_spec import ArrayConfig, ArraySpec
 from zarr.core.chunk_grids import RegularChunkGrid
 from zarr.core.chunk_key_encodings import parse_separator
-from zarr.core.common import ZARRAY_JSON, ZATTRS_JSON, MemoryOrder, parse_shapelike
+from zarr.core.common import JSON, ZARRAY_JSON, ZATTRS_JSON, MemoryOrder, parse_shapelike
 from zarr.core.config import config, parse_indexing_order
 from zarr.core.metadata.common import parse_attributes
@@ -42,7 +44,7 @@ class ArrayV2MetadataDict(TypedDict):
 @dataclass(frozen=True, kw_only=True)
 class ArrayV2Metadata(Metadata):
     shape: ChunkCoords
-    chunks: tuple[int, ...]
+    chunks: ChunkCoords
     dtype: np.dtype[Any]
     fill_value: int | float | str | bytes | None = 0
     order: MemoryOrder = "C"
@@ -100,6 +102,10 @@ def ndim(self) -> int:
     def chunk_grid(self) -> RegularChunkGrid:
         return RegularChunkGrid(chunk_shape=self.chunks)
 
+    @property
+    def shards(self) -> ChunkCoords | None:
+        return None
+
     def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]:
         def _json_convert(
             o: Any,
@@ -235,6 +241,9 @@ def parse_filters(data: object) -> tuple[numcodecs.abc.Codec, ...] | None:
                 msg = f"Invalid filter at index {idx}. Expected a numcodecs.abc.Codec or a dict representation of numcodecs.abc.Codec. Got {type(val)} instead."
                 raise TypeError(msg)
         return tuple(out)
+    # take a single codec instance and wrap it in a tuple
+    if isinstance(data, numcodecs.abc.Codec):
+        return (data,)
     msg = f"Invalid filters. Expected None, an iterable of numcodecs.abc.Codec or dict representations of numcodecs.abc.Codec. Got {type(data)} instead."
     raise TypeError(msg)
@@ -329,9 +338,9 @@ def _default_fill_value(dtype: np.dtype[Any]) -> Any:
         return dtype.type(0)
 
 
-def _default_filters_and_compressor(
+def _default_compressor(
     dtype: np.dtype[Any],
-) -> tuple[list[dict[str, JSON]], dict[str, JSON] | None]:
+) -> dict[str, JSON] | None:
     """Get the default filters and compressor for a dtype.
 
https://numpy.org/doc/2.1/reference/generated/numpy.dtype.kind.html
@@ -346,4 +355,24 @@
     else:
         raise ValueError(f"Unsupported dtype kind {dtype.kind}")
 
-    return [{"id": default_compressor[dtype_key]}], None
+    return cast(dict[str, JSON] | None, default_compressor.get(dtype_key, None))
+
+
+def _default_filters(
+    dtype: np.dtype[Any],
+) -> list[dict[str, JSON]] | None:
+    """Get the default filters for a dtype.
+
+    https://numpy.org/doc/2.1/reference/generated/numpy.dtype.kind.html
+    """
+    default_filters = config.get("array.v2_default_filters")
+    if dtype.kind in "biufcmM":
+        dtype_key = "numeric"
+    elif dtype.kind in "U":
+        dtype_key = "string"
+    elif dtype.kind in "OSV":
+        dtype_key = "bytes"
+    else:
+        raise ValueError(f"Unsupported dtype kind {dtype.kind}")
+
+    return cast(list[dict[str, JSON]] | None, default_filters.get(dtype_key, None))
diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py
index 4cf5860ffd..0821dd9bc9 100644
--- a/src/zarr/core/metadata/v3.py
+++ b/src/zarr/core/metadata/v3.py
@@ -296,6 +296,40 @@ def dtype(self) -> np.dtype[Any]:
     def ndim(self) -> int:
         return len(self.shape)
 
+    @property
+    def chunks(self) -> ChunkCoords:
+        if isinstance(self.chunk_grid, RegularChunkGrid):
+            from zarr.codecs.sharding import ShardingCodec
+
+            if len(self.codecs) == 1 and isinstance(self.codecs[0], ShardingCodec):
+                sharding_codec = self.codecs[0]
+                assert isinstance(sharding_codec, ShardingCodec)  # for mypy
+                return sharding_codec.chunk_shape
+            else:
+                return self.chunk_grid.chunk_shape
+
+        msg = (
+            f"The `chunks` attribute is only defined for arrays using `RegularChunkGrid`. "
+            f"This array has a {self.chunk_grid} instead."
+        )
+        raise NotImplementedError(msg)
+
+    @property
+    def shards(self) -> ChunkCoords | None:
+        if isinstance(self.chunk_grid, RegularChunkGrid):
+            from zarr.codecs.sharding import ShardingCodec
+
+            if len(self.codecs) == 1 and isinstance(self.codecs[0], ShardingCodec):
+                return self.chunk_grid.chunk_shape
+            else:
+                return None
+
+        msg = (
+            f"The `shards` attribute is only defined for arrays using `RegularChunkGrid`. "
+            f"This array has a {self.chunk_grid} instead."
+        )
+        raise NotImplementedError(msg)
+
     def get_chunk_spec(
         self, _chunk_coords: ChunkCoords, array_config: ArrayConfig, prototype: BufferPrototype
     ) -> ArraySpec:
@@ -449,7 +483,7 @@ def parse_fill_value(
             return np.bytes_(fill_value)
 
     # the rest are numeric types
-    np_dtype = cast(np.dtype[np.generic], data_type.to_numpy())
+    np_dtype = cast(np.dtype[Any], data_type.to_numpy())
 
     if isinstance(fill_value, Sequence) and not isinstance(fill_value, str):
         if data_type in (DataType.complex64, DataType.complex128):
@@ -513,8 +547,8 @@ def default_fill_value(dtype: DataType) -> str | bytes | np.generic:
         return b""
     else:
         np_dtype = dtype.to_numpy()
-        np_dtype = cast(np.dtype[np.generic], np_dtype)
-        return np_dtype.type(0)
+        np_dtype = cast(np.dtype[Any], np_dtype)
+        return np_dtype.type(0)  # type: ignore[misc]
 
 
 # For type checking
@@ -586,7 +620,7 @@ def to_numpy_shortname(self) -> str:
         }
         return data_type_to_numpy[self]
 
-    def to_numpy(self) -> np.dtypes.StringDType | np.dtypes.ObjectDType | np.dtype[np.generic]:
+    def to_numpy(self) -> np.dtypes.StringDType | np.dtypes.ObjectDType | np.dtype[Any]:
         # note: it is not possible to round trip DataType <-> np.dtype
         # due to the fact that DataType.string and DataType.bytes both
         # generally return np.dtype("O") from this function, even though
diff --git a/src/zarr/registry.py b/src/zarr/registry.py
index 9055bb1447..704db3f704 100644
--- a/src/zarr/registry.py
+++ b/src/zarr/registry.py
@@ -10,8 +10,15 @@
 if TYPE_CHECKING:
     from importlib.metadata import EntryPoint
 
-    from zarr.abc.codec import Codec, CodecPipeline
+    from zarr.abc.codec import (
+        ArrayArrayCodec,
+        ArrayBytesCodec,
+        BytesBytesCodec,
+        Codec,
+        CodecPipeline,
+    )
     from zarr.core.buffer import Buffer, NDBuffer
+    from zarr.core.common import JSON
 
 __all__ = [
     "Registry",
@@ -151,6 +158,74 @@ def get_codec_class(key: str, reload_config: bool = False) -> type[Codec]:
     raise KeyError(key)
 
 
+def _resolve_codec(data: dict[str, JSON]) -> Codec:
+    """
+    Get a codec instance from a dict representation of that codec.
+    """
+    # TODO: narrow the type of the input to only those dicts that map on to codec class instances.
+    return get_codec_class(data["name"]).from_dict(data)  # type: ignore[arg-type]
+
+
+def _parse_bytes_bytes_codec(data: dict[str, JSON] | Codec) -> BytesBytesCodec:
+    """
+    Normalize the input to a ``BytesBytesCodec`` instance.
+    If the input is already a ``BytesBytesCodec``, it is returned as is. If the input is a dict, it
+    is converted to a ``BytesBytesCodec`` instance via the ``_resolve_codec`` function.
+    """
+    from zarr.abc.codec import BytesBytesCodec
+
+    if isinstance(data, dict):
+        result = _resolve_codec(data)
+        if not isinstance(result, BytesBytesCodec):
+            msg = f"Expected a dict representation of a BytesBytesCodec; got a dict representation of a {type(result)} instead."
+            raise TypeError(msg)
+    else:
+        if not isinstance(data, BytesBytesCodec):
+            raise TypeError(f"Expected a BytesBytesCodec. Got {type(data)} instead.")
+        result = data
+    return result
+
+
+def _parse_array_bytes_codec(data: dict[str, JSON] | Codec) -> ArrayBytesCodec:
+    """
+    Normalize the input to an ``ArrayBytesCodec`` instance.
+    If the input is already an ``ArrayBytesCodec``, it is returned as is. If the input is a dict, it
+    is converted to an ``ArrayBytesCodec`` instance via the ``_resolve_codec`` function.
+    """
+    from zarr.abc.codec import ArrayBytesCodec
+
+    if isinstance(data, dict):
+        result = _resolve_codec(data)
+        if not isinstance(result, ArrayBytesCodec):
+            msg = f"Expected a dict representation of an ArrayBytesCodec; got a dict representation of a {type(result)} instead."
+            raise TypeError(msg)
+    else:
+        if not isinstance(data, ArrayBytesCodec):
+            raise TypeError(f"Expected an ArrayBytesCodec. Got {type(data)} instead.")
+        result = data
+    return result
+
+
+def _parse_array_array_codec(data: dict[str, JSON] | Codec) -> ArrayArrayCodec:
+    """
+    Normalize the input to an ``ArrayArrayCodec`` instance.
+    If the input is already an ``ArrayArrayCodec``, it is returned as is. If the input is a dict, it
+    is converted to an ``ArrayArrayCodec`` instance via the ``_resolve_codec`` function.
+    """
+    from zarr.abc.codec import ArrayArrayCodec
+
+    if isinstance(data, dict):
+        result = _resolve_codec(data)
+        if not isinstance(result, ArrayArrayCodec):
+            msg = f"Expected a dict representation of an ArrayArrayCodec; got a dict representation of a {type(result)} instead."
+            raise TypeError(msg)
+    else:
+        if not isinstance(data, ArrayArrayCodec):
+            raise TypeError(f"Expected an ArrayArrayCodec. Got {type(data)} instead.")
+        result = data
+    return result
+
+
 def get_pipeline_class(reload_config: bool = False) -> type[CodecPipeline]:
     if reload_config:
         _reload_config()
diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py
index 85a67e3e69..ae0487e447 100644
--- a/src/zarr/testing/strategies.py
+++ b/src/zarr/testing/strategies.py
@@ -1,4 +1,4 @@
-from typing import Any, Literal
+from typing import Any
 
 import hypothesis.extra.numpy as npst
 import hypothesis.strategies as st
@@ -8,8 +8,10 @@
 
 import zarr
 from zarr.core.array import Array
+from zarr.core.common import ZarrFormat
 from zarr.core.sync import sync
 from zarr.storage import MemoryStore, StoreLike
+from zarr.storage.common import _dereference_path
 
 # Copied from Xarray
 _attr_keys = st.text(st.characters(), min_size=1)
@@ -68,7 +70,7 @@ def v2_dtypes() -> st.SearchStrategy[np.dtype]:
 # So we map a clear to reset the store.
 stores = st.builds(MemoryStore, st.just({})).map(lambda x: sync(x.clear()))
 compressors = st.sampled_from([None, "default"])
-zarr_formats: st.SearchStrategy[Literal[2, 3]] = st.sampled_from([2, 3])
+zarr_formats: st.SearchStrategy[ZarrFormat] = st.sampled_from([2, 3])
 array_shapes = npst.array_shapes(max_dims=4, min_side=0)
 
 
@@ -77,7 +79,7 @@ def numpy_arrays(
     draw: st.DrawFn,
     *,
     shapes: st.SearchStrategy[tuple[int, ...]] = array_shapes,
-    zarr_formats: st.SearchStrategy[Literal[2, 3]] = zarr_formats,
+    zarr_formats: st.SearchStrategy[ZarrFormat] = zarr_formats,
 ) -> Any:
     """
     Generate numpy arrays that can be saved in the provided Zarr format.
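
Stepping back from the diff: the three ``_parse_*_codec`` helpers added to ``src/zarr/registry.py`` above share a single normalize-or-raise pattern. A minimal sketch of that behavior (``GzipCodec`` is just one registered bytes-to-bytes codec; any other would do):

    from zarr.codecs import GzipCodec
    from zarr.registry import _parse_bytes_bytes_codec

    # a codec instance of the right kind passes through unchanged
    codec = _parse_bytes_bytes_codec(GzipCodec(level=1))
    assert isinstance(codec, GzipCodec)

    # a dict is resolved through the codec registry via its "name" key
    codec = _parse_bytes_bytes_codec({"name": "gzip", "configuration": {"level": 1}})
    assert codec == GzipCodec(level=1)

    # a dict that resolves to the wrong kind of codec, or a non-codec value,
    # raises TypeError
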
@@ -137,7 +139,7 @@ def arrays( expected_attrs = {} if attributes is None else attributes - array_path = path + ("/" if not path.endswith("/") else "") + name + array_path = _dereference_path(path, name) root = zarr.open_group(store, mode="w", zarr_format=zarr_format) a = root.create_array( diff --git a/tests/test_api.py b/tests/test_api.py index d25ec54bfe..80e8555e11 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -13,6 +13,8 @@ from zarr.abc.store import Store from zarr.api.synchronous import ( create, + create_array, + create_group, group, load, open, @@ -21,13 +23,13 @@ save_array, save_group, ) -from zarr.core.common import MemoryOrder, ZarrFormat +from zarr.core.common import JSON, MemoryOrder, ZarrFormat from zarr.errors import MetadataValidationError from zarr.storage._utils import normalize_path from zarr.storage.memory import MemoryStore -def test_create_array(memory_store: Store) -> None: +def test_create(memory_store: Store) -> None: store = memory_store # create array @@ -56,6 +58,22 @@ def test_create_array(memory_store: Store) -> None: z = create(shape=(400, 100), chunks=(16, 16.5), store=store, overwrite=True) # type: ignore [arg-type] +# TODO: parametrize over everything this function takes +@pytest.mark.parametrize("store", ["memory"], indirect=True) +def test_create_array(store: Store) -> None: + attrs: dict[str, JSON] = {"foo": 100} # explicit type annotation to avoid mypy error + shape = (10, 10) + path = "foo" + data_val = 1 + array_w = create_array( + store, name=path, shape=shape, attributes=attrs, chunks=shape, dtype="uint8" + ) + array_w[:] = data_val + assert array_w.shape == shape + assert array_w.attrs == attrs + assert np.array_equal(array_w[:], np.zeros(shape, dtype=array_w.dtype) + data_val) + + @pytest.mark.parametrize("write_empty_chunks", [True, False]) def test_write_empty_chunks_warns(write_empty_chunks: bool) -> None: """ @@ -113,6 +131,16 @@ async def test_open_array(memory_store: MemoryStore) -> None: open(store="doesnotexist", mode="r") +@pytest.mark.parametrize("store", ["memory"], indirect=True) +async def test_create_group(store: Store, zarr_format: ZarrFormat) -> None: + attrs = {"foo": 100} + path = "node" + node = create_group(store, path=path, attributes=attrs, zarr_format=zarr_format) + assert isinstance(node, Group) + assert node.attrs == attrs + assert node.metadata.zarr_format == zarr_format + + async def test_open_group(memory_store: MemoryStore) -> None: store = memory_store diff --git a/tests/test_array.py b/tests/test_array.py index 891538bc43..72ff68d954 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -2,21 +2,38 @@ import json import math import pickle +import re from itertools import accumulate -from typing import Any, Literal +from typing import TYPE_CHECKING, Any, Literal import numcodecs import numpy as np import pytest -from numcodecs import Zstd import zarr.api.asynchronous from zarr import Array, AsyncArray, Group -from zarr.codecs import BytesCodec, VLenBytesCodec, ZstdCodec +from zarr.codecs import ( + BytesCodec, + GzipCodec, + TransposeCodec, + VLenBytesCodec, + VLenUTF8Codec, + ZstdCodec, +) from zarr.core._info import ArrayInfo -from zarr.core.array import chunks_initialized +from zarr.core.array import ( + CompressorsLike, + FiltersLike, + _get_default_chunk_encoding_v2, + _get_default_chunk_encoding_v3, + _parse_chunk_encoding_v2, + _parse_chunk_encoding_v3, + chunks_initialized, + create_array, +) from zarr.core.buffer import default_buffer_prototype from zarr.core.buffer.cpu import NDBuffer 
+from zarr.core.chunk_grids import _auto_partition from zarr.core.common import JSON, MemoryOrder, ZarrFormat from zarr.core.group import AsyncGroup from zarr.core.indexing import ceildiv @@ -26,6 +43,9 @@ from zarr.storage import LocalStore, MemoryStore from zarr.storage.common import StorePath +if TYPE_CHECKING: + from zarr.core.array_spec import ArrayConfigLike + @pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"]) @pytest.mark.parametrize("zarr_format", [2, 3]) @@ -58,7 +78,7 @@ def test_array_creation_existing_node( if overwrite: if not store.supports_deletes: pytest.skip("store does not support deletes") - arr_new = Array.create( + arr_new = zarr.create_array( spath / "extant", shape=new_shape, dtype=new_dtype, @@ -69,7 +89,7 @@ def test_array_creation_existing_node( assert arr_new.dtype == new_dtype else: with pytest.raises(expected_exception): - arr_new = Array.create( + arr_new = zarr.create_array( spath / "extant", shape=new_shape, dtype=new_dtype, @@ -123,7 +143,9 @@ async def test_create_creates_parents( def test_array_name_properties_no_group( store: LocalStore | MemoryStore, zarr_format: ZarrFormat ) -> None: - arr = Array.create(store=store, shape=(100,), chunks=(10,), zarr_format=zarr_format, dtype="i4") + arr = zarr.create_array( + store=store, shape=(100,), chunks=(10,), zarr_format=zarr_format, dtype="i4" + ) assert arr.path == "" assert arr.name == "/" assert arr.basename == "" @@ -161,17 +183,17 @@ def test_array_v3_fill_value_default( shape = (10,) default_fill_value = 0 if specifiy_fill_value: - arr = Array.create( + arr = zarr.create_array( store=store, shape=shape, dtype=dtype_str, zarr_format=3, - chunk_shape=shape, + chunks=shape, fill_value=None, ) else: - arr = Array.create( - store=store, shape=shape, dtype=dtype_str, zarr_format=3, chunk_shape=shape + arr = zarr.create_array( + store=store, shape=shape, dtype=dtype_str, zarr_format=3, chunks=shape ) assert arr.fill_value == np.dtype(dtype_str).type(default_fill_value) @@ -185,12 +207,12 @@ def test_array_v3_fill_value_default( ) def test_array_v3_fill_value(store: MemoryStore, fill_value: int, dtype_str: str) -> None: shape = (10,) - arr = Array.create( + arr = zarr.create_array( store=store, shape=shape, dtype=dtype_str, zarr_format=3, - chunk_shape=shape, + chunks=shape, fill_value=fill_value, ) @@ -201,12 +223,12 @@ def test_array_v3_fill_value(store: MemoryStore, fill_value: int, dtype_str: str def test_create_positional_args_deprecated() -> None: store = MemoryStore() with pytest.warns(FutureWarning, match="Pass"): - Array.create(store, (2, 2), dtype="f8") + zarr.Array.create(store, (2, 2), dtype="f8") def test_selection_positional_args_deprecated() -> None: store = MemoryStore() - arr = Array.create(store, shape=(2, 2), dtype="f8") + arr = zarr.create_array(store, shape=(2, 2), dtype="f8") with pytest.warns(FutureWarning, match="Pass out"): arr.get_basic_selection(..., NDBuffer(array=np.empty((2, 2)))) @@ -242,12 +264,12 @@ def test_selection_positional_args_deprecated() -> None: @pytest.mark.parametrize("store", ["memory"], indirect=True) async def test_array_v3_nan_fill_value(store: MemoryStore) -> None: shape = (10,) - arr = Array.create( + arr = zarr.create_array( store=store, shape=shape, dtype=np.float64, zarr_format=3, - chunk_shape=shape, + chunks=shape, fill_value=np.nan, ) arr[:] = np.nan @@ -263,7 +285,7 @@ async def test_array_v3_nan_fill_value(store: MemoryStore) -> None: async def test_serializable_async_array( store: LocalStore | MemoryStore, zarr_format: 
ZarrFormat ) -> None: - expected = await AsyncArray.create( + expected = await zarr.api.asynchronous.create_array( store=store, shape=(100,), chunks=(10,), zarr_format=zarr_format, dtype="i4" ) # await expected.setitems(list(range(100))) @@ -279,7 +301,7 @@ async def test_serializable_async_array( @pytest.mark.parametrize("store", ["local"], indirect=["store"]) @pytest.mark.parametrize("zarr_format", [2, 3]) def test_serializable_sync_array(store: LocalStore, zarr_format: ZarrFormat) -> None: - expected = Array.create( + expected = zarr.create_array( store=store, shape=(100,), chunks=(10,), zarr_format=zarr_format, dtype="i4" ) expected[:] = list(range(100)) @@ -320,7 +342,7 @@ def test_nchunks(test_cls: type[Array] | type[AsyncArray[Any]], nchunks: int) -> """ store = MemoryStore() shape = 100 - arr = Array.create(store, shape=(shape,), chunks=(ceildiv(shape, nchunks),), dtype="i4") + arr = zarr.create_array(store, shape=(shape,), chunks=(ceildiv(shape, nchunks),), dtype="i4") expected = nchunks if test_cls == Array: observed = arr.nchunks @@ -335,7 +357,7 @@ async def test_nchunks_initialized(test_cls: type[Array] | type[AsyncArray[Any]] Test that nchunks_initialized accurately returns the number of stored chunks. """ store = MemoryStore() - arr = Array.create(store, shape=(100,), chunks=(10,), dtype="i4") + arr = zarr.create_array(store, shape=(100,), chunks=(10,), dtype="i4") # write chunks one at a time for idx, region in enumerate(arr._iter_chunk_regions()): @@ -363,7 +385,7 @@ async def test_chunks_initialized() -> None: Test that chunks_initialized accurately returns the keys of stored chunks. """ store = MemoryStore() - arr = Array.create(store, shape=(100,), chunks=(10,), dtype="i4") + arr = zarr.create_array(store, shape=(100,), chunks=(10,), dtype="i4") chunks_accumulated = tuple( accumulate(tuple(tuple(v.split(" ")) for v in arr._iter_chunk_keys())) @@ -402,44 +424,54 @@ async def test_nbytes_stored_async() -> None: def test_default_fill_values() -> None: - a = Array.create(MemoryStore(), shape=5, chunk_shape=5, dtype=" None: with pytest.raises(ValueError, match="At least one ArrayBytesCodec is required."): - Array.create(MemoryStore(), shape=5, chunk_shape=5, dtype=" None: +def test_update_attrs(zarr_format: ZarrFormat) -> None: # regression test for https://github.com/zarr-developers/zarr-python/issues/2328 store = MemoryStore() - arr = Array.create(store=store, shape=5, chunk_shape=5, dtype="f8", zarr_format=zarr_format) + arr = zarr.create_array( + store=store, shape=(5,), chunks=(5,), dtype="f8", zarr_format=zarr_format + ) arr.attrs["foo"] = "bar" assert arr.attrs["foo"] == "bar" @@ -460,7 +492,7 @@ def test_info_v2(self) -> None: _read_only=False, _store_type="MemoryStore", _count_bytes=128, - _filters=(numcodecs.Zstd(),), + _compressor=numcodecs.Zstd(), ) assert result == expected @@ -516,8 +548,8 @@ async def test_info_v2_async(self) -> None: _order="C", _read_only=False, _store_type="MemoryStore", - _filters=(Zstd(level=0),), _count_bytes=128, + _compressor=numcodecs.Zstd(), ) assert result == expected @@ -757,7 +789,7 @@ def test_array_create_metadata_order_v2( keyword argument to ``Array.create``. When ``order`` is ``None``, the value of the ``array.order`` config is used. 
""" - arr = Array.create(store=store, shape=(2, 2), order=order, zarr_format=2, dtype="i4") + arr = zarr.create_array(store=store, shape=(2, 2), order=order, zarr_format=2, dtype="i4") expected = order or zarr.config.get("array.order") assert arr.metadata.order == expected # type: ignore[union-attr] @@ -767,13 +799,14 @@ def test_array_create_metadata_order_v2( @pytest.mark.parametrize("store", ["memory"], indirect=True) def test_array_create_order( order_config: MemoryOrder | None, - zarr_format: int, + zarr_format: ZarrFormat, store: MemoryStore, ) -> None: """ Test that the arrays generated by array indexing have a memory order defined by the config order value """ + config: ArrayConfigLike = {} if order_config is None: config = {} expected = zarr.config.get("array.order") @@ -781,7 +814,7 @@ def test_array_create_order( config = {"order": order_config} expected = order_config - arr = Array.create( + arr = zarr.create_array( store=store, shape=(2, 2), zarr_format=zarr_format, dtype="i4", config=config ) @@ -801,7 +834,7 @@ def test_write_empty_chunks_config(write_empty_chunks: bool) -> None: explicitly """ with zarr.config.set({"array.write_empty_chunks": write_empty_chunks}): - arr = Array.create({}, shape=(2, 2), dtype="i4") + arr = zarr.create_array({}, shape=(2, 2), dtype="i4") assert arr._async_array._config.write_empty_chunks == write_empty_chunks @@ -821,13 +854,13 @@ def test_write_empty_chunks_behavior( already present. """ - arr = Array.create( + arr = zarr.create_array( store=store, shape=(2,), zarr_format=zarr_format, dtype="i4", fill_value=fill_value, - chunk_shape=(1,), + chunks=(1,), config={"write_empty_chunks": write_empty_chunks}, ) @@ -858,7 +891,7 @@ def test_write_empty_chunks_behavior( ) async def test_special_complex_fill_values_roundtrip(fill_value: Any, expected: list[Any]) -> None: store = MemoryStore() - Array.create(store=store, shape=(1,), dtype=np.complex64, fill_value=fill_value) + zarr.create_array(store=store, shape=(1,), dtype=np.complex64, fill_value=fill_value) content = await store.get("zarr.json", prototype=default_buffer_prototype()) assert content is not None actual = json.loads(content.to_bytes()) @@ -876,13 +909,281 @@ async def test_nbytes( the chunks of that array. """ store = MemoryStore() - arr = Array.create(store=store, shape=shape, dtype=dtype, fill_value=0) + arr = zarr.create_array(store=store, shape=shape, dtype=dtype, fill_value=0) if array_type == "async": assert arr._async_array.nbytes == np.prod(arr.shape) * arr.dtype.itemsize else: assert arr.nbytes == np.prod(arr.shape) * arr.dtype.itemsize +@pytest.mark.parametrize( + ("array_shape", "chunk_shape"), + [((256,), (2,))], +) +def test_auto_partition_auto_shards( + array_shape: tuple[int, ...], chunk_shape: tuple[int, ...] +) -> None: + """ + Test that automatically picking a shard size returns a tuple of 2 * the chunk shape for any axis + where there are 8 or more chunks. + """ + dtype = np.dtype("uint8") + expected_shards: tuple[int, ...] 
= () + for cs, a_len in zip(chunk_shape, array_shape, strict=False): + if a_len // cs >= 8: + expected_shards += (2 * cs,) + else: + expected_shards += (cs,) + + auto_shards, _ = _auto_partition( + array_shape=array_shape, chunk_shape=chunk_shape, shard_shape="auto", dtype=dtype + ) + assert auto_shards == expected_shards + + +def test_chunks_and_shards() -> None: + store = StorePath(MemoryStore()) + shape = (100, 100) + chunks = (5, 5) + shards = (10, 10) + + arr_v3 = zarr.create_array(store=store / "v3", shape=shape, chunks=chunks, dtype="i4") + assert arr_v3.chunks == chunks + assert arr_v3.shards is None + + arr_v3_sharding = zarr.create_array( + store=store / "v3_sharding", + shape=shape, + chunks=chunks, + shards=shards, + dtype="i4", + ) + assert arr_v3_sharding.chunks == chunks + assert arr_v3_sharding.shards == shards + + arr_v2 = zarr.create_array( + store=store / "v2", shape=shape, chunks=chunks, zarr_format=2, dtype="i4" + ) + assert arr_v2.chunks == chunks + assert arr_v2.shards is None + + +def test_create_array_default_fill_values() -> None: + a = zarr.create_array(MemoryStore(), shape=(5,), chunks=(5,), dtype=" None: + """ + Test that the default ``filters`` and ``compressors`` are removed when ``create_array`` is invoked. + """ + + # v2 + arr = await create_array( + store=store, + dtype=dtype, + shape=(10,), + zarr_format=2, + compressors=empty_value, + filters=empty_value, + ) + # The v2 metadata stores None and () separately + assert arr.metadata.filters == empty_value # type: ignore[union-attr] + # The v2 metadata does not allow tuple for compressor, therefore it is turned into None + assert arr.metadata.compressor is None # type: ignore[union-attr] + + # v3 + arr = await create_array( + store=store, + dtype=dtype, + shape=(10,), + compressors=empty_value, + filters=empty_value, + ) + if dtype == "str": + assert arr.metadata.codecs == [VLenUTF8Codec()] # type: ignore[union-attr] + else: + assert arr.metadata.codecs == [BytesCodec()] # type: ignore[union-attr] + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +@pytest.mark.parametrize("dtype", ["uint8", "float32", "str"]) +@pytest.mark.parametrize( + "compressors", + [ + "auto", + None, + (), + (ZstdCodec(level=3),), + (ZstdCodec(level=3), GzipCodec(level=0)), + ZstdCodec(level=3), + {"name": "zstd", "configuration": {"level": 3}}, + ({"name": "zstd", "configuration": {"level": 3}},), + ], +) +@pytest.mark.parametrize( + "filters", + [ + "auto", + None, + (), + ( + TransposeCodec( + order=[ + 0, + ] + ), + ), + ( + TransposeCodec( + order=[ + 0, + ] + ), + TransposeCodec( + order=[ + 0, + ] + ), + ), + TransposeCodec( + order=[ + 0, + ] + ), + {"name": "transpose", "configuration": {"order": [0]}}, + ({"name": "transpose", "configuration": {"order": [0]}},), + ], +) +async def test_create_array_v3_chunk_encoding( + store: MemoryStore, compressors: CompressorsLike, filters: FiltersLike, dtype: str +) -> None: + """ + Test various possibilities for the compressors and filters parameter to create_array + """ + arr = await create_array( + store=store, + dtype=dtype, + shape=(10,), + zarr_format=3, + filters=filters, + compressors=compressors, + ) + aa_codecs_expected, _, bb_codecs_expected = _parse_chunk_encoding_v3( + filters=filters, compressors=compressors, serializer="auto", dtype=np.dtype(dtype) + ) + # TODO: find a better way to get the filters / compressors from the array. 
+ assert arr.codec_pipeline.array_array_codecs == aa_codecs_expected # type: ignore[attr-defined] + assert arr.codec_pipeline.bytes_bytes_codecs == bb_codecs_expected # type: ignore[attr-defined] + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +@pytest.mark.parametrize("dtype", ["uint8", "float32", "str"]) +@pytest.mark.parametrize( + "compressors", + [ + "auto", + None, + numcodecs.Zstd(level=3), + (), + (numcodecs.Zstd(level=3),), + ], +) +@pytest.mark.parametrize( + "filters", ["auto", None, numcodecs.GZip(level=1), (numcodecs.GZip(level=1),)] +) +async def test_create_array_v2_chunk_encoding( + store: MemoryStore, compressors: CompressorsLike, filters: FiltersLike, dtype: str +) -> None: + arr = await create_array( + store=store, + dtype=dtype, + shape=(10,), + zarr_format=2, + compressors=compressors, + filters=filters, + ) + filters_expected, compressor_expected = _parse_chunk_encoding_v2( + filters=filters, compressor=compressors, dtype=np.dtype(dtype) + ) + # TODO: find a better way to get the filters/compressor from the array. + assert arr.metadata.compressor == compressor_expected # type: ignore[union-attr] + assert arr.metadata.filters == filters_expected # type: ignore[union-attr] + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +@pytest.mark.parametrize("dtype", ["uint8", "float32", "str"]) +async def test_create_array_v3_default_filters_compressors(store: MemoryStore, dtype: str) -> None: + """ + Test that the default ``filters`` and ``compressors`` are used when ``create_array`` is invoked with + ``zarr_format`` = 3 and ``filters`` and ``compressors`` are not specified. + """ + arr = await create_array( + store=store, + dtype=dtype, + shape=(10,), + zarr_format=3, + ) + expected_aa, expected_ab, expected_bb = _get_default_chunk_encoding_v3(np_dtype=np.dtype(dtype)) + # TODO: define the codec pipeline class such that these fields are required, which will obviate the + # type ignore statements + assert arr.codec_pipeline.array_array_codecs == expected_aa # type: ignore[attr-defined] + assert arr.codec_pipeline.bytes_bytes_codecs == expected_bb # type: ignore[attr-defined] + assert arr.codec_pipeline.array_bytes_codec == expected_ab # type: ignore[attr-defined] + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +@pytest.mark.parametrize("dtype", ["uint8", "float32", "str"]) +async def test_create_array_v2_default_filters_compressors(store: MemoryStore, dtype: str) -> None: + """ + Test that the default ``filters`` and ``compressors`` are used when ``create_array`` is invoked with + ``zarr_format`` = 2 and ``filters`` and ``compressors`` are not specified. + """ + arr = await create_array( + store=store, + dtype=dtype, + shape=(10,), + zarr_format=2, + ) + expected_filters, expected_compressors = _get_default_chunk_encoding_v2( + np_dtype=np.dtype(dtype) + ) + assert arr.metadata.filters == expected_filters # type: ignore[union-attr] + assert arr.metadata.compressor == expected_compressors # type: ignore[union-attr] + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +async def test_create_array_v2_no_shards(store: MemoryStore) -> None: + """ + Test that creating a Zarr v2 array with ``shard_shape`` set to a non-None value raises an error. + """ + msg = re.escape( + "Zarr v2 arrays can only be created with `shard_shape` set to `None`. Got `shard_shape=(5,)` instead." 
+ ) + with pytest.raises(ValueError, match=msg): + _ = await create_array( + store=store, + dtype="uint8", + shape=(10,), + shards=(5,), + zarr_format=2, + ) + + async def test_scalar_array() -> None: arr = zarr.array(1.5) assert arr[...] == 1.5 diff --git a/tests/test_buffer.py b/tests/test_buffer.py index 7a275516c6..e3cab0f214 100644 --- a/tests/test_buffer.py +++ b/tests/test_buffer.py @@ -5,9 +5,8 @@ import numpy as np import pytest -from zarr import AsyncArray +import zarr from zarr.codecs.blosc import BloscCodec -from zarr.codecs.bytes import BytesCodec from zarr.codecs.crc32c_ import Crc32cCodec from zarr.codecs.gzip import GzipCodec from zarr.codecs.transpose import TransposeCodec @@ -47,10 +46,10 @@ async def test_async_array_prototype() -> None: """Test the use of a custom buffer prototype""" expect = np.zeros((9, 9), dtype="uint16", order="F") - a = await AsyncArray.create( + a = await zarr.api.asynchronous.create_array( StorePath(StoreExpectingTestBuffer()) / "test_async_array_prototype", shape=expect.shape, - chunk_shape=(5, 5), + chunks=(5, 5), dtype=expect.dtype, fill_value=0, ) @@ -76,10 +75,10 @@ async def test_async_array_gpu_prototype() -> None: """Test the use of the GPU buffer prototype""" expect = cp.zeros((9, 9), dtype="uint16", order="F") - a = await AsyncArray.create( + a = await zarr.api.asynchronous.create_array( StorePath(MemoryStore()) / "test_async_array_gpu_prototype", shape=expect.shape, - chunk_shape=(5, 5), + chunks=(5, 5), dtype=expect.dtype, fill_value=0, ) @@ -98,20 +97,14 @@ async def test_async_array_gpu_prototype() -> None: @pytest.mark.asyncio async def test_codecs_use_of_prototype() -> None: expect = np.zeros((10, 10), dtype="uint16", order="F") - a = await AsyncArray.create( + a = await zarr.api.asynchronous.create_array( StorePath(StoreExpectingTestBuffer()) / "test_codecs_use_of_prototype", shape=expect.shape, - chunk_shape=(5, 5), + chunks=(5, 5), dtype=expect.dtype, fill_value=0, - codecs=[ - TransposeCodec(order=(1, 0)), - BytesCodec(), - BloscCodec(), - Crc32cCodec(), - GzipCodec(), - ZstdCodec(), - ], + compressors=[BloscCodec(), Crc32cCodec(), GzipCodec(), ZstdCodec()], + filters=[TransposeCodec(order=(1, 0))], ) expect[:] = np.arange(100).reshape(10, 10) @@ -133,20 +126,14 @@ async def test_codecs_use_of_prototype() -> None: @pytest.mark.asyncio async def test_codecs_use_of_gpu_prototype() -> None: expect = cp.zeros((10, 10), dtype="uint16", order="F") - a = await AsyncArray.create( + a = await zarr.api.asynchronous.create_array( StorePath(MemoryStore()) / "test_codecs_use_of_gpu_prototype", shape=expect.shape, - chunk_shape=(5, 5), + chunks=(5, 5), dtype=expect.dtype, fill_value=0, - codecs=[ - TransposeCodec(order=(1, 0)), - BytesCodec(), - BloscCodec(), - Crc32cCodec(), - GzipCodec(), - ZstdCodec(), - ], + compressors=[BloscCodec(), Crc32cCodec(), GzipCodec(), ZstdCodec()], + filters=[TransposeCodec(order=(1, 0))], ) expect[:] = cp.arange(100).reshape(10, 10) diff --git a/tests/test_codecs/test_blosc.py b/tests/test_codecs/test_blosc.py index 416a2f784e..34044d7d62 100644 --- a/tests/test_codecs/test_blosc.py +++ b/tests/test_codecs/test_blosc.py @@ -3,9 +3,9 @@ import numpy as np import pytest -from zarr import AsyncArray +import zarr from zarr.abc.store import Store -from zarr.codecs import BloscCodec, BytesCodec, ShardingCodec +from zarr.codecs import BloscCodec from zarr.core.buffer import default_buffer_prototype from zarr.storage.common import StorePath @@ -16,13 +16,13 @@ async def test_blosc_evolve(store: Store, dtype: str) -> 
None: typesize = np.dtype(dtype).itemsize path = "blosc_evolve" spath = StorePath(store, path) - await AsyncArray.create( + await zarr.api.asynchronous.create_array( spath, shape=(16, 16), - chunk_shape=(16, 16), + chunks=(16, 16), dtype=dtype, fill_value=0, - codecs=[BytesCodec(), BloscCodec()], + compressors=BloscCodec(), ) buf = await store.get(f"{path}/zarr.json", prototype=default_buffer_prototype()) assert buf is not None @@ -36,13 +36,14 @@ async def test_blosc_evolve(store: Store, dtype: str) -> None: path2 = "blosc_evolve_sharding" spath2 = StorePath(store, path2) - await AsyncArray.create( + await zarr.api.asynchronous.create_array( spath2, shape=(16, 16), - chunk_shape=(16, 16), + chunks=(16, 16), + shards=(16, 16), dtype=dtype, fill_value=0, - codecs=[ShardingCodec(chunk_shape=(16, 16), codecs=[BytesCodec(), BloscCodec()])], + compressors=BloscCodec(), ) buf = await store.get(f"{path2}/zarr.json", prototype=default_buffer_prototype()) assert buf is not None diff --git a/tests/test_codecs/test_codecs.py b/tests/test_codecs/test_codecs.py index 2025e72937..e36a332440 100644 --- a/tests/test_codecs/test_codecs.py +++ b/tests/test_codecs/test_codecs.py @@ -7,6 +7,9 @@ import numpy as np import pytest +import zarr +import zarr.api +import zarr.api.asynchronous from zarr import Array, AsyncArray, config from zarr.codecs import ( BytesCodec, @@ -19,7 +22,6 @@ from zarr.storage import StorePath if TYPE_CHECKING: - from zarr.abc.codec import Codec from zarr.abc.store import Store from zarr.core.buffer.core import NDArrayLike from zarr.core.common import MemoryOrder @@ -75,27 +77,18 @@ async def test_order( data = np.arange(0, 256, dtype="uint16").reshape((32, 8), order=input_order) path = "order" spath = StorePath(store, path=path) - codecs_: list[Codec] = ( - [ - ShardingCodec( - chunk_shape=(16, 8), - codecs=[TransposeCodec(order=order_from_dim(store_order, data.ndim)), BytesCodec()], - ) - ] - if with_sharding - else [TransposeCodec(order=order_from_dim(store_order, data.ndim)), BytesCodec()] - ) - with config.set({"array.order": runtime_write_order}): - a = await AsyncArray.create( - spath, - shape=data.shape, - chunk_shape=(32, 8), - dtype=data.dtype, - fill_value=0, - chunk_key_encoding=("v2", "."), - codecs=codecs_, - ) + a = await zarr.api.asynchronous.create_array( + spath, + shape=data.shape, + chunks=(16, 8) if with_sharding else (32, 8), + shards=(32, 8) if with_sharding else None, + dtype=data.dtype, + fill_value=0, + chunk_key_encoding={"name": "v2", "separator": "."}, + filters=[TransposeCodec(order=order_from_dim(store_order, data.ndim))], + config={"order": runtime_write_order}, + ) await _AsyncArrayProxy(a)[:, :].set(data) read_data = await _AsyncArrayProxy(a)[:, :].get() @@ -131,16 +124,15 @@ def test_order_implicit( data = np.arange(0, 256, dtype="uint16").reshape((16, 16), order=input_order) path = "order_implicit" spath = StorePath(store, path) - codecs_: list[Codec] | None = [ShardingCodec(chunk_shape=(8, 8))] if with_sharding else None with config.set({"array.order": runtime_write_order}): - a = Array.create( + a = zarr.create_array( spath, shape=data.shape, - chunk_shape=(16, 16), + chunks=(8, 8) if with_sharding else (16, 16), + shards=(16, 16) if with_sharding else None, dtype=data.dtype, fill_value=0, - codecs=codecs_, ) a[:, :] = data @@ -161,10 +153,10 @@ def test_order_implicit( @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) def test_open(store: Store) -> None: spath = StorePath(store) - a = Array.create( + a = 
zarr.create_array( spath, shape=(16, 16), - chunk_shape=(16, 16), + chunks=(16, 16), dtype="int32", fill_value=0, ) @@ -228,10 +220,10 @@ def test_morton2(shape) -> None: def test_write_partial_chunks(store: Store) -> None: data = np.arange(0, 256, dtype="uint16").reshape((16, 16)) spath = StorePath(store) - a = Array.create( + a = zarr.create_array( spath, shape=data.shape, - chunk_shape=(20, 20), + chunks=(20, 20), dtype=data.dtype, fill_value=1, ) @@ -244,10 +236,10 @@ async def test_delete_empty_chunks(store: Store) -> None: data = np.ones((16, 16)) path = "delete_empty_chunks" spath = StorePath(store, path) - a = await AsyncArray.create( + a = await zarr.api.asynchronous.create_array( spath, shape=data.shape, - chunk_shape=(32, 32), + chunks=(32, 32), dtype=data.dtype, fill_value=1, ) @@ -262,25 +254,25 @@ async def test_dimension_names(store: Store) -> None: data = np.arange(0, 256, dtype="uint16").reshape((16, 16)) path = "dimension_names" spath = StorePath(store, path) - await AsyncArray.create( + await zarr.api.asynchronous.create_array( spath, shape=data.shape, - chunk_shape=(16, 16), + chunks=(16, 16), dtype=data.dtype, fill_value=0, dimension_names=("x", "y"), ) - assert (await AsyncArray.open(spath)).metadata.dimension_names == ( + assert (await zarr.api.asynchronous.open_array(store=spath)).metadata.dimension_names == ( "x", "y", ) path2 = "dimension_names2" spath2 = StorePath(store, path2) - await AsyncArray.create( + await zarr.api.asynchronous.create_array( spath2, shape=data.shape, - chunk_shape=(16, 16), + chunks=(16, 16), dtype=data.dtype, fill_value=0, ) @@ -293,7 +285,7 @@ async def test_dimension_names(store: Store) -> None: @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) def test_invalid_metadata(store: Store) -> None: - spath2 = StorePath(store, "invalid_endian") + spath2 = StorePath(store, "invalid_codec_order") with pytest.raises(TypeError): Array.create( spath2, @@ -302,7 +294,7 @@ def test_invalid_metadata(store: Store) -> None: dtype=np.dtype("uint8"), fill_value=0, codecs=[ - BytesCodec(endian="big"), + BytesCodec(), TransposeCodec(order=order_from_dim("F", 2)), ], ) @@ -315,8 +307,8 @@ def test_invalid_metadata(store: Store) -> None: dtype=np.dtype("uint8"), fill_value=0, codecs=[ - BytesCodec(), TransposeCodec(order="F"), # type: ignore[arg-type] + BytesCodec(), ], ) spath4 = StorePath(store, "invalid_missing_bytes_codec") @@ -370,17 +362,34 @@ def test_invalid_metadata(store: Store) -> None: ) +@pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) +def test_invalid_metadata_create_array(store: Store) -> None: + spath = StorePath(store, "warning_inefficient_codecs") + with pytest.warns(UserWarning): + zarr.create_array( + spath, + shape=(16, 16), + chunks=(16, 16), + dtype=np.dtype("uint8"), + fill_value=0, + serializer=ShardingCodec(chunk_shape=(8, 8)), + compressors=[ + GzipCodec(), + ], + ) + + @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) async def test_resize(store: Store) -> None: data = np.zeros((16, 18), dtype="uint16") path = "resize" spath = StorePath(store, path) - a = await AsyncArray.create( + a = await zarr.api.asynchronous.create_array( spath, shape=data.shape, - chunk_shape=(10, 10), + chunks=(10, 10), dtype=data.dtype, - chunk_key_encoding=("v2", "."), + chunk_key_encoding={"name": "v2", "separator": "."}, fill_value=1, ) diff --git a/tests/test_codecs/test_endian.py b/tests/test_codecs/test_endian.py index db4e77451c..ae9d1f6f1f 100644 --- 
a/tests/test_codecs/test_endian.py +++ b/tests/test_codecs/test_endian.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from zarr import AsyncArray +import zarr from zarr.abc.store import Store from zarr.codecs import BytesCodec from zarr.storage.common import StorePath @@ -17,14 +17,14 @@ async def test_endian(store: Store, endian: Literal["big", "little"]) -> None: data = np.arange(0, 256, dtype="uint16").reshape((16, 16)) path = "endian" spath = StorePath(store, path) - a = await AsyncArray.create( + a = await zarr.api.asynchronous.create_array( spath, shape=data.shape, - chunk_shape=(16, 16), + chunks=(16, 16), dtype=data.dtype, fill_value=0, - chunk_key_encoding=("v2", "."), - codecs=[BytesCodec(endian=endian)], + chunk_key_encoding={"name": "v2", "separator": "."}, + serializer=BytesCodec(endian=endian), ) await _AsyncArrayProxy(a)[:, :].set(data) @@ -43,14 +43,14 @@ async def test_endian_write( data = np.arange(0, 256, dtype=dtype_input_endian).reshape((16, 16)) path = "endian" spath = StorePath(store, path) - a = await AsyncArray.create( + a = await zarr.api.asynchronous.create_array( spath, shape=data.shape, - chunk_shape=(16, 16), + chunks=(16, 16), dtype="uint16", fill_value=0, - chunk_key_encoding=("v2", "."), - codecs=[BytesCodec(endian=dtype_store_endian)], + chunk_key_encoding={"name": "v2", "separator": "."}, + serializer=BytesCodec(endian=dtype_store_endian), ) await _AsyncArrayProxy(a)[:, :].set(data) diff --git a/tests/test_codecs/test_gzip.py b/tests/test_codecs/test_gzip.py index 7b4d231813..f47f9710b1 100644 --- a/tests/test_codecs/test_gzip.py +++ b/tests/test_codecs/test_gzip.py @@ -1,9 +1,9 @@ import numpy as np import pytest -from zarr import Array +import zarr from zarr.abc.store import Store -from zarr.codecs import BytesCodec, GzipCodec +from zarr.codecs import GzipCodec from zarr.storage.common import StorePath @@ -11,13 +11,13 @@ def test_gzip(store: Store) -> None: data = np.arange(0, 256, dtype="uint16").reshape((16, 16)) - a = Array.create( + a = zarr.create_array( StorePath(store), shape=data.shape, - chunk_shape=(16, 16), + chunks=(16, 16), dtype=data.dtype, fill_value=0, - codecs=[BytesCodec(), GzipCodec()], + compressors=GzipCodec(), ) a[:, :] = data diff --git a/tests/test_codecs/test_sharding.py b/tests/test_codecs/test_sharding.py index 51c82067f3..3f14007351 100644 --- a/tests/test_codecs/test_sharding.py +++ b/tests/test_codecs/test_sharding.py @@ -5,11 +5,13 @@ import numpy.typing as npt import pytest -from zarr import Array, AsyncArray +import zarr +import zarr.api +import zarr.api.asynchronous +from zarr import Array from zarr.abc.store import Store from zarr.codecs import ( BloscCodec, - BytesCodec, ShardingCodec, ShardingCodecIndexLocation, TransposeCodec, @@ -45,23 +47,16 @@ def test_sharding( """ data = array_fixture spath = StorePath(store) - arr = Array.create( + + arr = zarr.create_array( spath, shape=tuple(s + offset for s in data.shape), - chunk_shape=(64,) * data.ndim, + chunks=(32,) * data.ndim, + shards={"shape": (64,) * data.ndim, "index_location": index_location}, dtype=data.dtype, fill_value=6, - codecs=[ - ShardingCodec( - chunk_shape=(32,) * data.ndim, - codecs=[ - TransposeCodec(order=order_from_dim("F", data.ndim)), - BytesCodec(), - BloscCodec(cname="lz4"), - ], - index_location=index_location, - ) - ], + filters=[TransposeCodec(order=order_from_dim("F", data.ndim))], + compressors=BloscCodec(cname="lz4"), ) write_region = tuple(slice(offset, None) for dim in range(data.ndim)) arr[write_region] = data @@ -89,23 +84,15 @@ 
def test_sharding_partial( ) -> None: data = array_fixture spath = StorePath(store) - a = Array.create( + a = zarr.create_array( spath, shape=tuple(a + 10 for a in data.shape), - chunk_shape=(64, 64, 64), + chunks=(32, 32, 32), + shards={"shape": (64, 64, 64), "index_location": index_location}, + compressors=BloscCodec(cname="lz4"), + filters=[TransposeCodec(order=order_from_dim("F", data.ndim))], dtype=data.dtype, fill_value=0, - codecs=[ - ShardingCodec( - chunk_shape=(32, 32, 32), - codecs=[ - TransposeCodec(order=order_from_dim("F", data.ndim)), - BytesCodec(), - BloscCodec(cname="lz4"), - ], - index_location=index_location, - ) - ], ) a[10:, 10:, 10:] = data @@ -132,19 +119,15 @@ def test_sharding_partial_readwrite( ) -> None: data = array_fixture spath = StorePath(store) - a = Array.create( + a = zarr.create_array( spath, shape=data.shape, - chunk_shape=data.shape, + chunks=(1, data.shape[1], data.shape[2]), + shards={"shape": data.shape, "index_location": index_location}, dtype=data.dtype, fill_value=0, - codecs=[ - ShardingCodec( - chunk_shape=(1, data.shape[1], data.shape[2]), - codecs=[BytesCodec()], - index_location=index_location, - ) - ], + filters=None, + compressors=None, ) a[:] = data @@ -168,23 +151,15 @@ def test_sharding_partial_read( ) -> None: data = array_fixture spath = StorePath(store) - a = Array.create( + a = zarr.create_array( spath, shape=tuple(a + 10 for a in data.shape), - chunk_shape=(64, 64, 64), + chunks=(32, 32, 32), + shards={"shape": (64, 64, 64), "index_location": index_location}, + compressors=BloscCodec(cname="lz4"), + filters=[TransposeCodec(order=order_from_dim("F", data.ndim))], dtype=data.dtype, fill_value=1, - codecs=[ - ShardingCodec( - chunk_shape=(32, 32, 32), - codecs=[ - TransposeCodec(order=order_from_dim("F", data.ndim)), - BytesCodec(), - BloscCodec(cname="lz4"), - ], - index_location=index_location, - ) - ], ) read_data = a[0:10, 0:10, 0:10] @@ -205,23 +180,15 @@ def test_sharding_partial_overwrite( ) -> None: data = array_fixture[:10, :10, :10] spath = StorePath(store) - a = Array.create( + a = zarr.create_array( spath, shape=tuple(a + 10 for a in data.shape), - chunk_shape=(64, 64, 64), + chunks=(32, 32, 32), + shards={"shape": (64, 64, 64), "index_location": index_location}, + compressors=BloscCodec(cname="lz4"), + filters=[TransposeCodec(order=order_from_dim("F", data.ndim))], dtype=data.dtype, fill_value=1, - codecs=[ - ShardingCodec( - chunk_shape=(32, 32, 32), - codecs=[ - TransposeCodec(order=order_from_dim("F", data.ndim)), - BytesCodec(), - BloscCodec(cname="lz4"), - ], - index_location=index_location, - ) - ], ) a[:10, :10, :10] = data @@ -283,26 +250,66 @@ def test_nested_sharding( assert np.array_equal(data, read_data) +@pytest.mark.parametrize( + "array_fixture", + [ + ArrayRequest(shape=(128,) * 3, dtype="uint16", order="F"), + ], + indirect=["array_fixture"], +) +@pytest.mark.parametrize( + "outer_index_location", + ["start", "end"], +) +@pytest.mark.parametrize( + "inner_index_location", + ["start", "end"], +) +@pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"]) +def test_nested_sharding_create_array( + store: Store, + array_fixture: npt.NDArray[Any], + outer_index_location: ShardingCodecIndexLocation, + inner_index_location: ShardingCodecIndexLocation, +) -> None: + data = array_fixture + spath = StorePath(store) + a = zarr.create_array( + spath, + shape=data.shape, + chunks=(32, 32, 32), + dtype=data.dtype, + fill_value=0, + serializer=ShardingCodec( + chunk_shape=(32, 32, 32), + 
codecs=[ShardingCodec(chunk_shape=(16, 16, 16), index_location=inner_index_location)], + index_location=outer_index_location, + ), + filters=None, + compressors=None, + ) + print(a.metadata.to_dict()) + + a[:, :, :] = data + + read_data = a[0 : data.shape[0], 0 : data.shape[1], 0 : data.shape[2]] + assert data.shape == read_data.shape + assert np.array_equal(data, read_data) + + @pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"]) def test_open_sharding(store: Store) -> None: path = "open_sharding" spath = StorePath(store, path) - a = Array.create( + a = zarr.create_array( spath, shape=(16, 16), - chunk_shape=(16, 16), + chunks=(8, 8), + shards=(16, 16), + filters=[TransposeCodec(order=order_from_dim("F", 2))], + compressors=BloscCodec(), dtype="int32", fill_value=0, - codecs=[ - ShardingCodec( - chunk_shape=(8, 8), - codecs=[ - TransposeCodec(order=order_from_dim("F", 2)), - BytesCodec(), - BloscCodec(), - ], - ) - ], ) b = Array.open(spath) assert a.metadata == b.metadata @@ -312,21 +319,14 @@ def test_open_sharding(store: Store) -> None: def test_write_partial_sharded_chunks(store: Store) -> None: data = np.arange(0, 16 * 16, dtype="uint16").reshape((16, 16)) spath = StorePath(store) - a = Array.create( + a = zarr.create_array( spath, shape=(40, 40), - chunk_shape=(20, 20), + chunks=(10, 10), + shards=(20, 20), dtype=data.dtype, + compressors=BloscCodec(), fill_value=1, - codecs=[ - ShardingCodec( - chunk_shape=(10, 10), - codecs=[ - BytesCodec(), - BloscCodec(), - ], - ) - ], ) a[0:16, 0:16] = data assert np.array_equal(a[0:16, 0:16], data) @@ -338,14 +338,16 @@ async def test_delete_empty_shards(store: Store) -> None: pytest.skip("store does not support deletes") path = "delete_empty_shards" spath = StorePath(store, path) - a = await AsyncArray.create( + a = await zarr.api.asynchronous.create_array( spath, shape=(16, 16), - chunk_shape=(8, 16), + chunks=(8, 8), + shards=(8, 16), dtype="uint16", + compressors=None, fill_value=1, - codecs=[ShardingCodec(chunk_shape=(8, 8))], ) + print(a.metadata.to_dict()) await _AsyncArrayProxy(a)[:, :].set(np.zeros((16, 16))) await _AsyncArrayProxy(a)[8:, :].set(np.ones((8, 16))) await _AsyncArrayProxy(a)[:, 8:].set(np.ones((16, 8))) @@ -380,13 +382,13 @@ async def test_sharding_with_empty_inner_chunk( path = f"sharding_with_empty_inner_chunk_{index_location}" spath = StorePath(store, path) - a = await AsyncArray.create( + a = await zarr.api.asynchronous.create_array( spath, shape=(16, 16), - chunk_shape=(8, 8), + chunks=(4, 4), + shards={"shape": (8, 8), "index_location": index_location}, dtype="uint32", fill_value=fill_value, - codecs=[ShardingCodec(chunk_shape=(4, 4), index_location=index_location)], ) data[:4, :4] = fill_value await a.setitem(..., data) @@ -405,20 +407,44 @@ async def test_sharding_with_chunks_per_shard( store: Store, index_location: ShardingCodecIndexLocation, chunks_per_shard: tuple[int] ) -> None: chunk_shape = (2, 1) - shape = [x * y for x, y in zip(chunks_per_shard, chunk_shape, strict=False)] + shape = tuple(x * y for x, y in zip(chunks_per_shard, chunk_shape, strict=False)) data = np.ones(np.prod(shape), dtype="int32").reshape(shape) fill_value = 42 path = f"test_sharding_with_chunks_per_shard_{index_location}" spath = StorePath(store, path) - a = Array.create( + a = zarr.create_array( spath, shape=shape, - chunk_shape=shape, + chunks=chunk_shape, + shards={"shape": shape, "index_location": index_location}, dtype="int32", fill_value=fill_value, - codecs=[ShardingCodec(chunk_shape=chunk_shape, 
index_location=index_location)], ) a[...] = data data_read = a[...] assert np.array_equal(data_read, data) + + +@pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) +def test_invalid_metadata(store: Store) -> None: + spath1 = StorePath(store, "invalid_inner_chunk_shape") + with pytest.raises(ValueError): + zarr.create_array( + spath1, + shape=(16, 16), + shards=(16, 16), + chunks=(8,), + dtype=np.dtype("uint8"), + fill_value=0, + ) + spath2 = StorePath(store, "invalid_inner_chunk_shape") + with pytest.raises(ValueError): + zarr.create_array( + spath2, + shape=(16, 16), + shards=(16, 16), + chunks=(8, 7), + dtype=np.dtype("uint8"), + fill_value=0, + ) diff --git a/tests/test_codecs/test_transpose.py b/tests/test_codecs/test_transpose.py index 2b3914150e..65159f174b 100644 --- a/tests/test_codecs/test_transpose.py +++ b/tests/test_codecs/test_transpose.py @@ -1,19 +1,15 @@ -from typing import TYPE_CHECKING - import numpy as np import pytest -from zarr import Array, AsyncArray, config +import zarr +from zarr import AsyncArray, config from zarr.abc.store import Store -from zarr.codecs import BytesCodec, ShardingCodec, TransposeCodec +from zarr.codecs import TransposeCodec from zarr.core.common import MemoryOrder from zarr.storage.common import StorePath from .test_codecs import _AsyncArrayProxy -if TYPE_CHECKING: - from zarr.abc.codec import Codec - @pytest.mark.parametrize("input_order", ["F", "C"]) @pytest.mark.parametrize("runtime_write_order", ["F", "C"]) @@ -29,25 +25,16 @@ async def test_transpose( ) -> None: data = np.arange(0, 256, dtype="uint16").reshape((1, 32, 8), order=input_order) spath = StorePath(store, path="transpose") - codecs_: list[Codec] = ( - [ - ShardingCodec( - chunk_shape=(1, 16, 8), - codecs=[TransposeCodec(order=(2, 1, 0)), BytesCodec()], - ) - ] - if with_sharding - else [TransposeCodec(order=(2, 1, 0)), BytesCodec()] - ) with config.set({"array.order": runtime_write_order}): - a = await AsyncArray.create( + a = await zarr.api.asynchronous.create_array( spath, shape=data.shape, - chunk_shape=(1, 32, 8), + chunks=(1, 16, 8) if with_sharding else (1, 32, 8), + shards=(1, 32, 8) if with_sharding else None, dtype=data.dtype, fill_value=0, - chunk_key_encoding=("v2", "."), - codecs=codecs_, + chunk_key_encoding={"name": "v2", "separator": "."}, + filters=[TransposeCodec(order=(2, 1, 0))], ) await _AsyncArrayProxy(a)[:, :].set(data) @@ -75,13 +62,13 @@ def test_transpose_non_self_inverse(store: Store, order: list[int]) -> None: shape = [i + 3 for i in range(len(order))] data = np.arange(0, np.prod(shape), dtype="uint16").reshape(shape) spath = StorePath(store, "transpose_non_self_inverse") - a = Array.create( + a = zarr.create_array( spath, shape=data.shape, - chunk_shape=data.shape, + chunks=data.shape, dtype=data.dtype, fill_value=0, - codecs=[TransposeCodec(order=order), BytesCodec()], + filters=[TransposeCodec(order=order)], ) a[:, :] = data read_data = a[:, :] @@ -94,14 +81,14 @@ def test_transpose_invalid( ) -> None: data = np.arange(0, 256, dtype="uint16").reshape((1, 32, 8)) spath = StorePath(store, "transpose_invalid") - for order in [(1, 0), (3, 2, 1), (3, 3, 1)]: - with pytest.raises(ValueError): - Array.create( + for order in [(1, 0), (3, 2, 1), (3, 3, 1), "F", "C"]: + with pytest.raises((ValueError, TypeError)): + zarr.create_array( spath, shape=data.shape, - chunk_shape=(1, 32, 8), + chunks=(1, 32, 8), dtype=data.dtype, fill_value=0, - chunk_key_encoding=("v2", "."), - codecs=[TransposeCodec(order=order), BytesCodec()], + 
chunk_key_encoding={"name": "v2", "separator": "."}, + filters=[TransposeCodec(order=order)], ) diff --git a/tests/test_codecs/test_vlen.py b/tests/test_codecs/test_vlen.py index 05b2e25267..f4ee135601 100644 --- a/tests/test_codecs/test_vlen.py +++ b/tests/test_codecs/test_vlen.py @@ -3,10 +3,11 @@ import numpy as np import pytest +import zarr from zarr import Array from zarr.abc.codec import Codec from zarr.abc.store import Store -from zarr.codecs import VLenBytesCodec, VLenUTF8Codec, ZstdCodec +from zarr.codecs import ZstdCodec from zarr.core.metadata.v3 import ArrayV3Metadata, DataType from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING from zarr.storage.common import StorePath @@ -23,21 +24,21 @@ @pytest.mark.parametrize("store", ["memory", "local"], indirect=["store"]) @pytest.mark.parametrize("dtype", numpy_str_dtypes) @pytest.mark.parametrize("as_object_array", [False, True]) -@pytest.mark.parametrize("codecs", [None, [VLenUTF8Codec()], [VLenUTF8Codec(), ZstdCodec()]]) +@pytest.mark.parametrize("compressor", [None, ZstdCodec()]) def test_vlen_string( - store: Store, dtype: np.dtype[Any] | None, as_object_array: bool, codecs: list[Codec] | None + store: Store, dtype: np.dtype[Any] | None, as_object_array: bool, compressor: Codec | None ) -> None: strings = ["hello", "world", "this", "is", "a", "test"] data = np.array(strings, dtype=dtype).reshape((2, 3)) sp = StorePath(store, path="string") - a = Array.create( + a = zarr.create_array( sp, shape=data.shape, - chunk_shape=data.shape, + chunks=data.shape, dtype=data.dtype, fill_value="", - codecs=codecs, + compressors=compressor, ) assert isinstance(a.metadata, ArrayV3Metadata) # needed for mypy @@ -61,20 +62,20 @@ def test_vlen_string( @pytest.mark.parametrize("store", ["memory", "local"], indirect=["store"]) @pytest.mark.parametrize("as_object_array", [False, True]) -@pytest.mark.parametrize("codecs", [None, [VLenBytesCodec()], [VLenBytesCodec(), ZstdCodec()]]) -def test_vlen_bytes(store: Store, as_object_array: bool, codecs: list[Codec] | None) -> None: +@pytest.mark.parametrize("compressor", [None, ZstdCodec()]) +def test_vlen_bytes(store: Store, as_object_array: bool, compressor: Codec | None) -> None: bstrings = [b"hello", b"world", b"this", b"is", b"a", b"test"] data = np.array(bstrings).reshape((2, 3)) assert data.dtype == "|S5" sp = StorePath(store, path="string") - a = Array.create( + a = zarr.create_array( sp, shape=data.shape, - chunk_shape=data.shape, + chunks=data.shape, dtype=data.dtype, fill_value=b"", - codecs=codecs, + compressors=compressor, ) assert isinstance(a.metadata, ArrayV3Metadata) # needed for mypy diff --git a/tests/test_codecs/test_zstd.py b/tests/test_codecs/test_zstd.py index 29efc29466..a57476fb61 100644 --- a/tests/test_codecs/test_zstd.py +++ b/tests/test_codecs/test_zstd.py @@ -1,9 +1,9 @@ import numpy as np import pytest -from zarr import Array +import zarr from zarr.abc.store import Store -from zarr.codecs import BytesCodec, ZstdCodec +from zarr.codecs import ZstdCodec from zarr.storage.common import StorePath @@ -12,13 +12,13 @@ def test_zstd(store: Store, checksum: bool) -> None: data = np.arange(0, 256, dtype="uint16").reshape((16, 16)) - a = Array.create( + a = zarr.create_array( StorePath(store, path="zstd"), shape=data.shape, - chunk_shape=(16, 16), + chunks=(16, 16), dtype=data.dtype, fill_value=0, - codecs=[BytesCodec(), ZstdCodec(level=0, checksum=checksum)], + compressors=ZstdCodec(level=0, checksum=checksum), ) a[:, :] = data diff --git a/tests/test_config.py b/tests/test_config.py 
index ea8e70a994..20e3c6044f 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -8,7 +8,8 @@ import pytest import zarr -from zarr import Array, AsyncArray, zeros +import zarr.api +from zarr import zeros from zarr.abc.codec import Codec, CodecInput, CodecOutput, CodecPipeline from zarr.abc.store import ByteSetter, Store from zarr.codecs import ( @@ -49,19 +50,33 @@ def test_config_defaults_set() -> None: # regression test for available defaults assert config.defaults == [ { - "default_zarr_version": 3, + "default_zarr_format": 3, "array": { "order": "C", "write_empty_chunks": False, "v2_default_compressor": { - "numeric": "zstd", - "string": "vlen-utf8", - "bytes": "vlen-bytes", + "numeric": {"id": "zstd", "level": 0, "checksum": False}, + "string": {"id": "zstd", "level": 0, "checksum": False}, + "bytes": {"id": "zstd", "level": 0, "checksum": False}, + }, + "v2_default_filters": { + "numeric": None, + "string": [{"id": "vlen-utf8"}], + "bytes": [{"id": "vlen-bytes"}], }, "v3_default_codecs": { - "bytes": ["vlen-bytes"], - "numeric": ["bytes", "zstd"], - "string": ["vlen-utf8"], + "bytes": [ + {"name": "vlen-bytes"}, + {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, + ], + "numeric": [ + {"name": "bytes", "configuration": {"endian": "little"}}, + {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, + ], + "string": [ + {"name": "vlen-utf8"}, + {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, + ], }, }, "async": {"concurrency": 10, "timeout": None}, @@ -139,7 +154,7 @@ async def write( assert get_pipeline_class() == MockCodecPipeline # test if codec is used - arr = Array.create( + arr = zarr.create_array( store=store, shape=(100,), chunks=(10,), @@ -184,13 +199,13 @@ async def _encode_single( assert get_codec_class("blosc") == MockBloscCodec # test if codec is used - arr = Array.create( + arr = zarr.create_array( store=store, shape=(100,), chunks=(10,), zarr_format=3, dtype="i4", - codecs=[BytesCodec(), {"name": "blosc", "configuration": {}}], + compressors=[{"name": "blosc", "configuration": {}}], ) arr[:] = range(100) _mock.call.assert_called() @@ -213,7 +228,7 @@ def test_config_ndbuffer_implementation(store: Store) -> None: register_ndbuffer(NDBufferUsingTestNDArrayLike) with config.set({"ndbuffer": fully_qualified_name(NDBufferUsingTestNDArrayLike)}): assert get_ndbuffer_class() == NDBufferUsingTestNDArrayLike - arr = Array.create( + arr = zarr.create_array( store=store, shape=(100,), chunks=(10,), @@ -291,23 +306,32 @@ class NewCodec2(BytesCodec): ("dtype", "expected_codecs"), [ ("int", [BytesCodec(), GzipCodec()]), - ("bytes", [VLenBytesCodec()]), - ("str", [VLenUTF8Codec()]), + ("bytes", [VLenBytesCodec(), GzipCodec()]), + ("str", [VLenUTF8Codec(), GzipCodec()]), ], ) async def test_default_codecs(dtype: str, expected_codecs: list[Codec]) -> None: with config.set( { - "array.v3_default_codecs": { - "numeric": ["bytes", "gzip"], # test setting non-standard codecs - "string": ["vlen-utf8"], - "bytes": ["vlen-bytes"], + "array.v3_default_codecs": { # test setting non-standard codecs + "numeric": [ + {"name": "bytes", "configuration": {"endian": "little"}}, + {"name": "gzip", "configuration": {"level": 5}}, + ], + "string": [ + {"name": "vlen-utf8"}, + {"name": "gzip", "configuration": {"level": 5}}, + ], + "bytes": [ + {"name": "vlen-bytes"}, + {"name": "gzip", "configuration": {"level": 5}}, + ], } } ): - arr = await AsyncArray.create( + arr = await zarr.api.asynchronous.create_array( shape=(100,), - 
chunk_shape=(100,), + chunks=(100,), dtype=np.dtype(dtype), zarr_format=3, store=MemoryStore(), diff --git a/tests/test_group.py b/tests/test_group.py index e0bc304b9b..6b3c40412e 100644 --- a/tests/test_group.py +++ b/tests/test_group.py @@ -155,9 +155,8 @@ def test_group_members(store: Store, zarr_format: ZarrFormat, consolidated_metad subsubsubgroup = subsubgroup.create_group("subsubsubgroup") members_expected["subarray"] = group.create_array( - "subarray", shape=(100,), dtype="uint8", chunk_shape=(10,), overwrite=True + "subarray", shape=(100,), dtype="uint8", chunks=(10,), overwrite=True ) - # add an extra object to the domain of the group. # the list of children should ignore this object. sync( @@ -227,9 +226,7 @@ def test_group(store: Store, zarr_format: ZarrFormat) -> None: # create an array from the "bar" group data = np.arange(0, 4 * 4, dtype="uint16").reshape((4, 4)) - arr = bar.create_array( - "baz", shape=data.shape, dtype=data.dtype, chunk_shape=(2, 2), overwrite=True - ) + arr = bar.create_array("baz", shape=data.shape, dtype=data.dtype, chunks=(2, 2), overwrite=True) arr[:] = data # check the array @@ -313,8 +310,8 @@ def test_group_getitem(store: Store, zarr_format: ZarrFormat, consolidated: bool group = Group.from_store(store, zarr_format=zarr_format) subgroup = group.create_group(name="subgroup") - subarray = group.create_array(name="subarray", shape=(10,), chunk_shape=(10,)) - subsubarray = subgroup.create_array(name="subarray", shape=(10,), chunk_shape=(10,)) + subarray = group.create_array(name="subarray", shape=(10,), chunks=(10,), dtype="uint8") + subsubarray = subgroup.create_array(name="subarray", shape=(10,), chunks=(10,), dtype="uint8") if consolidated: group = zarr.api.synchronous.consolidate_metadata(store=store, zarr_format=zarr_format) @@ -391,7 +388,7 @@ def test_group_delitem(store: Store, zarr_format: ZarrFormat, consolidated: bool group = Group.from_store(store, zarr_format=zarr_format) subgroup = group.create_group(name="subgroup") - subarray = group.create_array(name="subarray", shape=(10,), chunk_shape=(10,)) + subarray = group.create_array(name="subarray", shape=(10,), chunks=(10,), dtype="uint8") if consolidated: group = zarr.api.synchronous.consolidate_metadata(store=store, zarr_format=zarr_format) @@ -472,19 +469,21 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat expected_group_values = [group.create_group(name=name) for name in expected_group_keys] expected_groups = list(zip(expected_group_keys, expected_group_values, strict=False)) + fill_value = 3 + dtype = "uint8" + expected_group_values[0].create_group("subgroup") - expected_group_values[0].create_array("subarray", shape=(1,)) + expected_group_values[0].create_array( + "subarray", shape=(1,), dtype=dtype, fill_value=fill_value + ) expected_array_keys = ["a0", "a1"] + expected_array_values = [ - group.create_array(name=name, shape=(1,)) for name in expected_array_keys + group.create_array(name=name, shape=(1,), dtype=dtype, fill_value=fill_value) + for name in expected_array_keys ] expected_arrays = list(zip(expected_array_keys, expected_array_values, strict=False)) - fill_value: float | None - if zarr_format == 2: - fill_value = None - else: - fill_value = np.float64(0.0) if consolidate: group = zarr.consolidate_metadata(store) @@ -492,12 +491,13 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat metadata = { "subarray": { "attributes": {}, - "dtype": "float64", + "dtype": dtype, "fill_value": fill_value, "shape": (1,), 
"chunks": (1,), "order": "C", - "filters": (Zstd(level=0),), + "filters": None, + "compressor": Zstd(level=0), "zarr_format": zarr_format, }, "subgroup": { @@ -527,7 +527,7 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat {"configuration": {"endian": "little"}, "name": "bytes"}, {"configuration": {}, "name": "zstd"}, ), - "data_type": "float64", + "data_type": dtype, "fill_value": fill_value, "node_type": "array", "shape": (1,), @@ -614,20 +614,24 @@ def test_group_create_array( data = np.arange(np.prod(shape)).reshape(shape).astype(dtype) if method == "create_array": - array = group.create_array(name="array", shape=shape, dtype=dtype, data=data) + array = group.create_array(name="array", shape=shape, dtype=dtype) + array[:] = data elif method == "array": with pytest.warns(DeprecationWarning): - array = group.array(name="array", shape=shape, dtype=dtype, data=data) + array = group.array(name="array", shape=shape, dtype=dtype) + array[:] = data else: raise AssertionError if not overwrite: if method == "create_array": with pytest.raises(ContainsArrayError): - group.create_array(name="array", shape=shape, dtype=dtype, data=data) + a = group.create_array(name="array", shape=shape, dtype=dtype) + a[:] = data elif method == "array": with pytest.raises(ContainsArrayError), pytest.warns(DeprecationWarning): - group.array(name="array", shape=shape, dtype=dtype, data=data) + a = group.array(name="array", shape=shape, dtype=dtype) + a[:] = data assert array.shape == shape assert array.dtype == np.dtype(dtype) assert np.array_equal(array[:], data) @@ -780,7 +784,7 @@ async def test_asyncgroup_create( ) # create an array at our target path collision_name = "foo" - _ = await AsyncArray.create( + _ = await zarr.api.asynchronous.create_array( spath / collision_name, shape=(10,), dtype="uint8", zarr_format=zarr_format ) with pytest.raises(ContainsArrayError): @@ -870,9 +874,7 @@ async def test_asyncgroup_getitem(store: Store, zarr_format: ZarrFormat) -> None agroup = await AsyncGroup.from_store(store=store, zarr_format=zarr_format) array_name = "sub_array" - sub_array = await agroup.create_array( - name=array_name, shape=(10,), dtype="uint8", chunk_shape=(2,) - ) + sub_array = await agroup.create_array(name=array_name, shape=(10,), dtype="uint8", chunks=(2,)) assert await agroup.getitem(array_name) == sub_array sub_group_path = "sub_group" @@ -894,7 +896,7 @@ async def test_asyncgroup_delitem(store: Store, zarr_format: ZarrFormat) -> None name=array_name, shape=(10,), dtype="uint8", - chunk_shape=(2,), + chunks=(2,), attributes={"foo": 100}, ) await agroup.delitem(array_name) @@ -960,7 +962,7 @@ async def test_asyncgroup_create_array( name=sub_node_path, shape=shape, dtype=dtype, - chunk_shape=chunk_shape, + chunks=chunk_shape, attributes=attributes, ) assert isinstance(subnode, AsyncArray) @@ -1014,11 +1016,11 @@ async def test_group_members_async(store: Store, consolidated_metadata: bool) -> group = await AsyncGroup.from_store( store=store, ) - a0 = await group.create_array("a0", shape=(1,)) + a0 = await group.create_array("a0", shape=(1,), dtype="uint8") g0 = await group.create_group("g0") - a1 = await g0.create_array("a1", shape=(1,)) + a1 = await g0.create_array("a1", shape=(1,), dtype="uint8") g1 = await g0.create_group("g1") - a2 = await g1.create_array("a2", shape=(1,)) + a2 = await g1.create_array("a2", shape=(1,), dtype="uint8") g2 = await g1.create_group("g2") # immediate children @@ -1101,7 +1103,7 @@ async def test_require_group(store: LocalStore | 
MemoryStore, zarr_format: ZarrF assert foo_group.attrs == {} _ = await foo_group.create_array( - "bar", shape=(10,), dtype="uint8", chunk_shape=(2,), attributes={"foo": 100} + "bar", shape=(10,), dtype="uint8", chunks=(2,), attributes={"foo": 100} ) # test that overwriting a group w/ children fails @@ -1179,9 +1181,9 @@ async def test_require_array(store: Store, zarr_format: ZarrFormat) -> None: async def test_members_name(store: Store, consolidate: bool, zarr_format: ZarrFormat): group = Group.from_store(store=store, zarr_format=zarr_format) a = group.create_group(name="a") - a.create_array("array", shape=(1,)) + a.create_array("array", shape=(1,), dtype="uint8") b = a.create_group(name="b") - b.create_array("array", shape=(1,)) + b.create_array("array", shape=(1,), dtype="uint8") if consolidate: group = zarr.api.synchronous.consolidate_metadata(store) @@ -1284,12 +1286,12 @@ async def test_group_delitem_consolidated(self, store: Store) -> None: g0 = await root.create_group("g0") g1 = await g0.create_group("g1") g2 = await g1.create_group("g2") - await g2.create_array("data", shape=(1,)) + await g2.create_array("data", shape=(1,), dtype="uint8") x0 = await root.create_group("x0") x1 = await x0.create_group("x1") x2 = await x1.create_group("x2") - await x2.create_array("data", shape=(1,)) + await x2.create_array("data", shape=(1,), dtype="uint8") await zarr.api.asynchronous.consolidate_metadata(store) @@ -1360,8 +1362,8 @@ def test_info(self): A = zarr.group(store=store, path="A") B = A.create_group(name="B") - B.create_array(name="x", shape=(1,)) - B.create_array(name="y", shape=(2,)) + B.create_array(name="x", shape=(1,), dtype="uint8") + B.create_array(name="y", shape=(2,), dtype="uint8") result = A.info expected = GroupInfo( @@ -1420,8 +1422,18 @@ def test_delitem_removes_children(store: Store, zarr_format: ZarrFormat) -> None g1 = zarr.group(store=store, zarr_format=zarr_format) g1.create_group("0") g1.create_group("0/0") - arr = g1.create_array("0/0/0", shape=(1,)) + arr = g1.create_array("0/0/0", shape=(1,), dtype="uint8") arr[:] = 1 del g1["0"] with pytest.raises(KeyError): g1["0/0"] + + +@pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) +def test_deprecated_compressor(store: Store) -> None: + g = zarr.group(store=store, zarr_format=2) + with pytest.warns(UserWarning, match="The `compressor` argument is deprecated.*"): + a = g.create_array( + "foo", shape=(100,), chunks=(10,), dtype="i4", compressor={"id": "blosc"} + ) + assert a.metadata.compressor.codec_id == "blosc" diff --git a/tests/test_indexing.py b/tests/test_indexing.py index 04eb53e364..fc83af695b 100644 --- a/tests/test_indexing.py +++ b/tests/test_indexing.py @@ -47,12 +47,12 @@ def zarr_array_from_numpy_array( a: npt.NDArray[Any], chunk_shape: ChunkCoords | None = None, ) -> zarr.Array: - z = zarr.Array.create( + z = zarr.create_array( store=store / str(uuid4()), shape=a.shape, dtype=a.dtype, - chunk_shape=chunk_shape or a.shape, - chunk_key_encoding=("v2", "."), + chunks=chunk_shape or a.shape, + chunk_key_encoding={"name": "v2", "separator": "."}, ) z[()] = a return z @@ -1933,7 +1933,7 @@ def test_indexing_with_zarr_array(store: StorePath) -> None: @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) @pytest.mark.parametrize("shape", [(0, 2, 3), (0), (3, 0)]) def test_zero_sized_chunks(store: StorePath, shape: list[int]) -> None: - z = Array.create(store=store, shape=shape, chunk_shape=shape, zarr_format=3, dtype="f8") + z = zarr.create_array(store=store, shape=shape, 
chunks=shape, zarr_format=3, dtype="f8") z[...] = 42 assert_array_equal(z[...], np.zeros(shape, dtype="f8")) diff --git a/tests/test_metadata/test_consolidated.py b/tests/test_metadata/test_consolidated.py index 7f0c49338e..aaace6f5cd 100644 --- a/tests/test_metadata/test_consolidated.py +++ b/tests/test_metadata/test_consolidated.py @@ -31,16 +31,19 @@ @pytest.fixture async def memory_store_with_hierarchy(memory_store: Store) -> None: g = await group(store=memory_store, attributes={"foo": "bar"}) - await g.create_array(name="air", shape=(1, 2, 3)) - await g.create_array(name="lat", shape=(1,)) - await g.create_array(name="lon", shape=(2,)) - await g.create_array(name="time", shape=(3,)) + dtype = "uint8" + await g.create_array(name="air", shape=(1, 2, 3), dtype=dtype) + await g.create_array(name="lat", shape=(1,), dtype=dtype) + await g.create_array(name="lon", shape=(2,), dtype=dtype) + await g.create_array(name="time", shape=(3,), dtype=dtype) child = await g.create_group("child", attributes={"key": "child"}) - await child.create_array("array", shape=(4, 4), attributes={"key": "child"}) + await child.create_array("array", shape=(4, 4), attributes={"key": "child"}, dtype=dtype) grandchild = await child.create_group("grandchild", attributes={"key": "grandchild"}) - await grandchild.create_array("array", shape=(4, 4), attributes={"key": "grandchild"}) + await grandchild.create_array( + "array", shape=(4, 4), attributes={"key": "grandchild"}, dtype=dtype + ) await grandchild.create_group("empty_group", attributes={"key": "empty"}) return memory_store @@ -74,10 +77,10 @@ async def test_consolidated(self, memory_store_with_hierarchy: Store) -> None: }, "codecs": ( {"configuration": {"endian": "little"}, "name": "bytes"}, - {"configuration": {}, "name": "zstd"}, + {"configuration": {"level": 0, "checksum": False}, "name": "zstd"}, ), - "data_type": "float64", - "fill_value": np.float64(0.0), + "data_type": "uint8", + "fill_value": 0, "node_type": "array", # "shape": (1, 2, 3), "zarr_format": 3, @@ -205,10 +208,11 @@ async def test_consolidated(self, memory_store_with_hierarchy: Store) -> None: def test_consolidated_sync(self, memory_store): g = zarr.api.synchronous.group(store=memory_store, attributes={"foo": "bar"}) - g.create_array(name="air", shape=(1, 2, 3)) - g.create_array(name="lat", shape=(1,)) - g.create_array(name="lon", shape=(2,)) - g.create_array(name="time", shape=(3,)) + dtype = "uint8" + g.create_array(name="air", shape=(1, 2, 3), dtype=dtype) + g.create_array(name="lat", shape=(1,), dtype=dtype) + g.create_array(name="lon", shape=(2,), dtype=dtype) + g.create_array(name="time", shape=(3,), dtype=dtype) zarr.api.synchronous.consolidate_metadata(memory_store) group2 = zarr.api.synchronous.Group.open(memory_store) @@ -221,10 +225,10 @@ def test_consolidated_sync(self, memory_store): }, "codecs": ( {"configuration": {"endian": "little"}, "name": "bytes"}, - {"configuration": {}, "name": "zstd"}, + {"configuration": {"level": 0, "checksum": False}, "name": "zstd"}, ), - "data_type": "float64", - "fill_value": np.float64(0.0), + "data_type": dtype, + "fill_value": 0, "node_type": "array", # "shape": (1, 2, 3), "zarr_format": 3, @@ -475,7 +479,8 @@ async def test_open_consolidated_raises_async(self, zarr_format: ZarrFormat): async def test_consolidated_metadata_v2(self): store = zarr.storage.MemoryStore() g = await AsyncGroup.from_store(store, attributes={"key": "root"}, zarr_format=2) - await g.create_array(name="a", shape=(1,), attributes={"key": "a"}) + dtype = "uint8" + await 
g.create_array(name="a", shape=(1,), attributes={"key": "a"}, dtype=dtype) g1 = await g.create_group(name="g1", attributes={"key": "g1"}) await g1.create_group(name="g2", attributes={"key": "g2"}) @@ -489,11 +494,11 @@ async def test_consolidated_metadata_v2(self): metadata={ "a": ArrayV2Metadata( shape=(1,), - dtype="float64", + dtype=dtype, attributes={"key": "a"}, chunks=(1,), - fill_value=None, - filters=(Zstd(level=0),), + fill_value=0, + compressor=Zstd(level=0), order="C", ), "g1": GroupMetadata( @@ -518,7 +523,7 @@ async def test_consolidated_metadata_v2(self): async def test_use_consolidated_false( self, memory_store: zarr.storage.MemoryStore, zarr_format: ZarrFormat ) -> None: - with zarr.config.set(default_zarr_version=zarr_format): + with zarr.config.set(default_zarr_format=zarr_format): g = await group(store=memory_store, attributes={"foo": "bar"}) await g.create_group(name="a") diff --git a/tests/test_v2.py b/tests/test_v2.py index 80897db8e5..72127f4ede 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -11,7 +11,7 @@ import zarr import zarr.core.buffer import zarr.storage -from zarr import Array, config +from zarr import config from zarr.storage import MemoryStore, StorePath @@ -23,7 +23,7 @@ async def store() -> Iterator[StorePath]: def test_simple(store: StorePath) -> None: data = np.arange(0, 256, dtype="uint16").reshape((16, 16)) - a = Array.create( + a = zarr.create_array( store / "simple_v2", zarr_format=2, shape=data.shape, @@ -82,7 +82,12 @@ def test_codec_pipeline() -> None: @pytest.mark.parametrize("dtype", ["|S", "|V"]) async def test_v2_encode_decode(dtype): - with config.set({"array.v2_default_compressor.bytes": "vlen-bytes"}): + with config.set( + { + "array.v2_default_filters.bytes": [{"id": "vlen-bytes"}], + "array.v2_default_compressor.bytes": None, + } + ): store = zarr.storage.MemoryStore() g = zarr.group(store=store, zarr_format=2) g.create_array( @@ -120,9 +125,9 @@ def test_v2_encode_decode_with_data(dtype_value): dtype, value = dtype_value with config.set( { - "array.v2_default_compressor": { - "string": "vlen-utf8", - "bytes": "vlen-bytes", + "array.v2_default_filters": { + "string": [{"id": "vlen-utf8"}], + "bytes": [{"id": "vlen-bytes"}], }, } ): @@ -162,7 +167,7 @@ def test_v2_filters_codecs(filters: Any, order: Literal["C", "F"]) -> None: @pytest.mark.parametrize("array_order", ["C", "F"]) @pytest.mark.parametrize("data_order", ["C", "F"]) def test_v2_non_contiguous(array_order: Literal["C", "F"], data_order: Literal["C", "F"]) -> None: - arr = zarr.Array.create( + arr = zarr.create_array( MemoryStore({}), shape=(10, 8), chunks=(3, 3), @@ -182,7 +187,7 @@ def test_v2_non_contiguous(array_order: Literal["C", "F"], data_order: Literal[" arr[slice(6, 9, None), slice(3, 6, None)], a[slice(6, 9, None), slice(3, 6, None)] ) - arr = zarr.Array.create( + arr = zarr.create_array( MemoryStore({}), shape=(10, 8), chunks=(3, 3), @@ -210,18 +215,31 @@ def test_default_compressor_deprecation_warning(): @pytest.mark.parametrize( "dtype_expected", - [["b", "zstd"], ["i", "zstd"], ["f", "zstd"], ["|S1", "vlen-bytes"], ["|U1", "vlen-utf8"]], + [ + ["b", "zstd", None], + ["i", "zstd", None], + ["f", "zstd", None], + ["|S1", "zstd", "vlen-bytes"], + ["|U1", "zstd", "vlen-utf8"], + ], ) def test_default_filters_and_compressor(dtype_expected: Any) -> None: with config.set( { "array.v2_default_compressor": { - "numeric": "zstd", - "string": "vlen-utf8", - "bytes": "vlen-bytes", + "numeric": {"id": "zstd", "level": "0"}, + "string": {"id": "zstd", "level": "0"}, + 
"bytes": {"id": "zstd", "level": "0"}, + }, + "array.v2_default_filters": { + "numeric": [], + "string": [{"id": "vlen-utf8"}], + "bytes": [{"id": "vlen-bytes"}], }, } ): - dtype, expected = dtype_expected + dtype, expected_compressor, expected_filter = dtype_expected arr = zarr.create(shape=(3,), path="foo", store={}, zarr_format=2, dtype=dtype) - assert arr.metadata.filters[0].codec_id == expected + assert arr.metadata.compressor.codec_id == expected_compressor + if expected_filter is not None: + assert arr.metadata.filters[0].codec_id == expected_filter diff --git a/tests/test_zarr.py b/tests/test_zarr.py new file mode 100644 index 0000000000..2aa62e4231 --- /dev/null +++ b/tests/test_zarr.py @@ -0,0 +1,11 @@ +import zarr + + +def test_exports() -> None: + """ + Ensure that everything in __all__ can be imported. + """ + from zarr import __all__ + + for export in __all__: + getattr(zarr, export) From a460fbcfe4c2ccaa71e28a6594f019fba8e3655e Mon Sep 17 00:00:00 2001 From: David Stansby Date: Fri, 3 Jan 2025 00:05:11 +0000 Subject: [PATCH 51/87] Improve docstrings of zarr.api.synchronous (#2610) --- src/zarr/api/synchronous.py | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 52815748ad..e4a842ef8f 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -160,7 +160,7 @@ def open( storage_options: dict[str, Any] | None = None, **kwargs: Any, # TODO: type kwargs as valid args to async_api.open ) -> Array | Group: - """Convenience function to open a group or array using file-mode-like semantics. + """Open a group or array using file-mode-like semantics. Parameters ---------- @@ -221,7 +221,7 @@ def save( path: str | None = None, **kwargs: Any, # TODO: type kwargs as valid args to async_api.save ) -> None: - """Convenience function to save an array or group of arrays to the local file system. + """Save an array or group of arrays to the local file system. Parameters ---------- @@ -254,8 +254,9 @@ def save_array( storage_options: dict[str, Any] | None = None, **kwargs: Any, # TODO: type kwargs as valid args to async_api.save_array ) -> None: - """Convenience function to save a NumPy array to the local file system, following a - similar API to the NumPy save() function. + """Save a NumPy array to the local file system. + + Follows a similar API to the NumPy save() function. Parameters ---------- @@ -295,8 +296,9 @@ def save_group( storage_options: dict[str, Any] | None = None, **kwargs: NDArrayLike, ) -> None: - """Convenience function to save several NumPy arrays to the local file system, following a - similar API to the NumPy savez()/savez_compressed() functions. + """Save several NumPy arrays to the local file system. + + Follows a similar API to the NumPy savez()/savez_compressed() functions. Parameters ---------- @@ -920,7 +922,7 @@ def empty(shape: ChunkCoords, **kwargs: Any) -> Array: # TODO: move ArrayLike to common module # TODO: add type annotations for kwargs def empty_like(a: ArrayLike, **kwargs: Any) -> Array: - """Create an empty array like `a`. + """Create an empty array like another array. Parameters ---------- @@ -939,8 +941,7 @@ def empty_like(a: ArrayLike, **kwargs: Any) -> Array: # TODO: add type annotations for kwargs and fill_value def full(shape: ChunkCoords, fill_value: Any, **kwargs: Any) -> Array: - """Create an array, with `fill_value` being used as the default value for - uninitialized portions of the array. 
+ """Create an array with a default fill value. Parameters ---------- @@ -962,7 +963,7 @@ def full(shape: ChunkCoords, fill_value: Any, **kwargs: Any) -> Array: # TODO: move ArrayLike to common module # TODO: add type annotations for kwargs def full_like(a: ArrayLike, **kwargs: Any) -> Array: - """Create a filled array like `a`. + """Create a filled array like another array. Parameters ---------- @@ -981,8 +982,7 @@ def full_like(a: ArrayLike, **kwargs: Any) -> Array: # TODO: add type annotations for kwargs def ones(shape: ChunkCoords, **kwargs: Any) -> Array: - """Create an array, with one being used as the default value for - uninitialized portions of the array. + """Create an array with a fill value of one. Parameters ---------- @@ -1001,7 +1001,7 @@ def ones(shape: ChunkCoords, **kwargs: Any) -> Array: # TODO: add type annotations for kwargs def ones_like(a: ArrayLike, **kwargs: Any) -> Array: - """Create an array of ones like `a`. + """Create an array of ones like another array. Parameters ---------- @@ -1063,7 +1063,7 @@ def open_array( # TODO: add type annotations for kwargs def open_like(a: ArrayLike, path: str, **kwargs: Any) -> Array: - """Open a persistent array like `a`. + """Open a persistent array like another array. Parameters ---------- @@ -1084,8 +1084,7 @@ def open_like(a: ArrayLike, path: str, **kwargs: Any) -> Array: # TODO: add type annotations for kwargs def zeros(shape: ChunkCoords, **kwargs: Any) -> Array: - """Create an array, with zero being used as the default value for - uninitialized portions of the array. + """Create an array with a fill value of zero. Parameters ---------- @@ -1104,7 +1103,7 @@ def zeros(shape: ChunkCoords, **kwargs: Any) -> Array: # TODO: add type annotations for kwargs def zeros_like(a: ArrayLike, **kwargs: Any) -> Array: - """Create an array of zeros like `a`. + """Create an array of zeros like another array. Parameters ---------- From c070940e1f6c94b658c5e78c22e0d6466ea9591d Mon Sep 17 00:00:00 2001 From: David Stansby Date: Fri, 3 Jan 2025 01:03:26 +0000 Subject: [PATCH 52/87] Improve exception docs (#2624) --- src/zarr/errors.py | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/src/zarr/errors.py b/src/zarr/errors.py index 5eb696d935..441cdab9a3 100644 --- a/src/zarr/errors.py +++ b/src/zarr/errors.py @@ -1,22 +1,41 @@ from typing import Any +__all__ = [ + "BaseZarrError", + "ContainsArrayAndGroupError", + "ContainsArrayError", + "ContainsGroupError", + "MetadataValidationError", + "NodeTypeValidationError", +] + + +class BaseZarrError(ValueError): + """ + Base error which all zarr errors are sub-classed from. + """ -class _BaseZarrError(ValueError): _msg = "" def __init__(self, *args: Any) -> None: super().__init__(self._msg.format(*args)) -class ContainsGroupError(_BaseZarrError): +class ContainsGroupError(BaseZarrError): + """Raised when a group already exists at a certain path.""" + _msg = "A group exists in store {!r} at path {!r}." -class ContainsArrayError(_BaseZarrError): +class ContainsArrayError(BaseZarrError): + """Raised when an array already exists at a certain path.""" + _msg = "An array exists in store {!r} at path {!r}." -class ContainsArrayAndGroupError(_BaseZarrError): +class ContainsArrayAndGroupError(BaseZarrError): + """Raised when both array and group metadata are found at the same path.""" + _msg = ( "Array and group metadata documents (.zarray and .zgroup) were both found in store " "{!r} at path {!r}. 
" @@ -25,8 +44,8 @@ class ContainsArrayAndGroupError(_BaseZarrError): ) -class MetadataValidationError(_BaseZarrError): - """An exception raised when the Zarr metadata is invalid in some way""" +class MetadataValidationError(BaseZarrError): + """Raised when the Zarr metadata is invalid in some way""" _msg = "Invalid value for '{}'. Expected '{}'. Got '{}'." @@ -38,10 +57,3 @@ class NodeTypeValidationError(MetadataValidationError): This can be raised when the value is invalid or unexpected given the context, for example an 'array' node when we expected a 'group'. """ - - -__all__ = [ - "ContainsArrayAndGroupError", - "ContainsArrayError", - "ContainsGroupError", -] From d6384f56520cd206140126f728b9cfb58366e7de Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Fri, 3 Jan 2025 06:31:22 -0800 Subject: [PATCH 53/87] docs: split tutorial into multiple user guide sections (#2589) * docs: split tutorial into multiple user guide sections * Apply suggestions from code review Co-authored-by: David Stansby * respond to david's review * add todos for remainig api changes * docs: add new top level about page (#2592) * docs: add new top level about page * fixup * fixup * fixup * docs: add docs on extending zarr 3 (#2597) * docs: add docs on extending zarr 3 * Apply suggestions from code review Co-authored-by: David Stansby * move note up * remove test.py (#2612) * Note that whole directories can be deleted in LocalStore (#2606) * fix: run-coverage command now tracks src directory (#2615) * fix doc build * Update docs/user-guide/extending.rst --------- Co-authored-by: Norman Rzepka Co-authored-by: David Stansby Co-authored-by: Davis Bennett * Use doctests for guide (#2623) * Use doctests for arrays.rst * Use doctests for attributes.rst * Use doctests for config.rst * Use doctests for consolidated metadata * Use doctests for groups.rst * Use doctests for preformance.rst * Use doctests for storage.rst * Remove ipython config for docs * Fix performance doctest output * Enable doctests * Add a doctest CI run * Remove rmtrees * Delete data dir before doctests * Fix doctests * fix doctests for arrays.rst * fix doctests for consolidated_metadata.rst * fixes for doctest * tests * debugging * debugging * debugging * debugging * debugging --------- Co-authored-by: David Stansby Co-authored-by: Norman Rzepka Co-authored-by: Davis Bennett Co-authored-by: Deepak Cherian --- .github/workflows/test.yml | 27 +- .gitignore | 3 + data/donotdelete | 1 - docs/about.rst | 24 + docs/conf.py | 3 +- docs/getting_started.rst | 18 - docs/guide/consolidated_metadata.rst | 74 - docs/guide/index.rst | 9 - docs/guide/storage.rst | 101 -- docs/guide/whatsnew_v3.rst | 14 - docs/index.rst | 14 +- docs/release.rst | 54 +- docs/tutorial.rst | 1722 --------------------- docs/user-guide/arrays.rst | 612 ++++++++ docs/user-guide/attributes.rst | 30 + docs/user-guide/config.rst | 88 ++ docs/user-guide/consolidated_metadata.rst | 119 ++ docs/user-guide/extending.rst | 91 ++ docs/user-guide/groups.rst | 141 ++ docs/user-guide/index.rst | 31 + docs/user-guide/performance.rst | 230 +++ docs/user-guide/storage.rst | 110 ++ pyproject.toml | 15 +- src/zarr/core/_tree.py | 5 +- tests/test_api.py | 2 +- tests/test_tree.py | 9 +- 26 files changed, 1565 insertions(+), 1982 deletions(-) delete mode 100644 data/donotdelete create mode 100644 docs/about.rst delete mode 100644 docs/guide/consolidated_metadata.rst delete mode 100644 docs/guide/index.rst delete mode 100644 docs/guide/storage.rst delete mode 100644 docs/guide/whatsnew_v3.rst delete mode 100644 
docs/tutorial.rst create mode 100644 docs/user-guide/arrays.rst create mode 100644 docs/user-guide/attributes.rst create mode 100644 docs/user-guide/config.rst create mode 100644 docs/user-guide/consolidated_metadata.rst create mode 100644 docs/user-guide/extending.rst create mode 100644 docs/user-guide/groups.rst create mode 100644 docs/user-guide/index.rst create mode 100644 docs/user-guide/performance.rst create mode 100644 docs/user-guide/storage.rst diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 1157fccc86..5309ea4565 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -94,6 +94,30 @@ jobs: run: | hatch env run --env ${{ matrix.dependency-set }} run + doctests: + name: doctests + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 # required for hatch version discovery, which is needed for numcodecs.zarr3 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.13' + cache: 'pip' + - name: Install Hatch + run: | + python -m pip install --upgrade pip + pip install hatch + - name: Set Up Hatch Env + run: | + hatch env create doctest + hatch env run -e doctest list-env + - name: Run Tests + run: | + hatch env run --env doctest run + test-complete: name: Test complete @@ -101,6 +125,7 @@ jobs: [ test, test-upstream-and-min-deps, + doctests ] if: always() runs-on: ubuntu-latest @@ -111,4 +136,4 @@ jobs: contains(needs.*.result, 'cancelled') run: exit 1 - name: Success - run: echo Success! \ No newline at end of file + run: echo Success! diff --git a/.gitignore b/.gitignore index 199ab10578..153ca39df0 100644 --- a/.gitignore +++ b/.gitignore @@ -52,6 +52,9 @@ coverage.xml # Sphinx documentation docs/_build/ docs/_autoapi +docs/data +data +data.zip # PyBuilder target/ diff --git a/data/donotdelete b/data/donotdelete deleted file mode 100644 index b0c96f7ee5..0000000000 --- a/data/donotdelete +++ /dev/null @@ -1 +0,0 @@ -This directory is used for data files created during testing. diff --git a/docs/about.rst b/docs/about.rst new file mode 100644 index 0000000000..7a0af998c0 --- /dev/null +++ b/docs/about.rst @@ -0,0 +1,24 @@ +About +===== + +Zarr is a format for the storage of chunked, compressed, N-dimensional arrays +inspired by `HDF5 `_, `h5py +`_ and `bcolz `_. + +These documents describe the Zarr-Python implementation. More information +about the Zarr format can be found on the `main website `_. + +Projects using Zarr +------------------- + +If you are using Zarr-Python, we would `love to hear about it +`_. + +Funding +------- +The project is fiscally sponsored by `NumFOCUS `_, a US +501(c)(3) public charity, and development is supported by the +`MRC Centre for Genomics and Global Health `_ +and the `Chan Zuckerberg Initiative `_. + +.. _NumCodecs: https://numcodecs.readthedocs.io/ diff --git a/docs/conf.py b/docs/conf.py index 53fba058e7..dfd1ae07bb 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -102,7 +102,8 @@ def skip_submodules( "spec/v1": 'https://zarr-specs.readthedocs.io/en/latest/v1/v1.0.html', "spec/v2": "https://zarr-specs.readthedocs.io/en/latest/v2/v2.0.html", "spec/v3": "https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html", - "license": "https://github.com/zarr-developers/zarr-python/blob/main/LICENSE.txt" + "license": "https://github.com/zarr-developers/zarr-python/blob/main/LICENSE.txt", + "tutorial": "user-guide", } # The language for content autogenerated by Sphinx. 
Refer to documentation diff --git a/docs/getting_started.rst b/docs/getting_started.rst index 77d45325e4..5950e2ae44 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -1,18 +1,6 @@ Getting Started =============== -Zarr is a format for the storage of chunked, compressed, N-dimensional arrays -inspired by `HDF5 `_, `h5py -`_ and `bcolz `_. - -The project is fiscally sponsored by `NumFOCUS `_, a US -501(c)(3) public charity, and development is supported by the -`MRC Centre for Genomics and Global Health `_ -and the `Chan Zuckerberg Initiative `_. - -These documents describe the Zarr Python implementation. More information -about the Zarr format can be found on the `main website `_. - Highlights ---------- @@ -31,12 +19,6 @@ Feedback and bug reports are very welcome, please get in touch via the `GitHub issue tracker `_. See :doc:`contributing` for further information about contributing to Zarr. -Projects using Zarr -------------------- - -If you are using Zarr, we would `love to hear about it -`_. - .. toctree:: :caption: Getting Started :hidden: diff --git a/docs/guide/consolidated_metadata.rst b/docs/guide/consolidated_metadata.rst deleted file mode 100644 index 5010d32481..0000000000 --- a/docs/guide/consolidated_metadata.rst +++ /dev/null @@ -1,74 +0,0 @@ -Consolidated Metadata -===================== - -Zarr-Python implements the `Consolidated Metadata_` extension to the Zarr Spec. -Consolidated metadata can reduce the time needed to load the metadata for an -entire hierarchy, especially when the metadata is being served over a network. -Consolidated metadata essentially stores all the metadata for a hierarchy in the -metadata of the root Group. - -Usage ------ - -If consolidated metadata is present in a Zarr Group's metadata then it is used -by default. The initial read to open the group will need to communicate with -the store (reading from a file for a :class:`zarr.storage.LocalStore`, making a -network request for a :class:`zarr.storage.RemoteStore`). After that, any subsequent -metadata reads get child Group or Array nodes will *not* require reads from the store. - -In Python, the consolidated metadata is available on the ``.consolidated_metadata`` -attribute of the ``GroupMetadata`` object. - -.. code-block:: python - - >>> import zarr - >>> store = zarr.storage.MemoryStore({}, mode="w") - >>> group = zarr.open_group(store=store) - >>> group.create_array(shape=(1,), name="a") - >>> group.create_array(shape=(2, 2), name="b") - >>> group.create_array(shape=(3, 3, 3), name="c") - >>> zarr.consolidate_metadata(store) - -If we open that group, the Group's metadata has a :class:`zarr.ConsolidatedMetadata` -that can be used. - -.. code-block:: python - - >>> consolidated = zarr.open_group(store=store) - >>> consolidated.metadata.consolidated_metadata.metadata - {'b': ArrayV3Metadata(shape=(2, 2), fill_value=np.float64(0.0), ...), - 'a': ArrayV3Metadata(shape=(1,), fill_value=np.float64(0.0), ...), - 'c': ArrayV3Metadata(shape=(3, 3, 3), fill_value=np.float64(0.0), ...)} - -Operations on the group to get children automatically use the consolidated metadata. - -.. code-block:: python - - >>> consolidated["a"] # no read / HTTP request to the Store is required - - -With nested groups, the consolidated metadata is available on the children, recursively. - -... 
code-block:: python - - >>> child = group.create_group("child", attributes={"kind": "child"}) - >>> grandchild = child.create_group("child", attributes={"kind": "grandchild"}) - >>> consolidated = zarr.consolidate_metadata(store) - - >>> consolidated["child"].metadata.consolidated_metadata - ConsolidatedMetadata(metadata={'child': GroupMetadata(attributes={'kind': 'grandchild'}, zarr_format=3, )}, ...) - -Synchronization and Concurrency -------------------------------- - -Consolidated metadata is intended for read-heavy use cases on slowly changing -hierarchies. For hierarchies where new nodes are constantly being added, -removed, or modified, consolidated metadata may not be desirable. - -1. It will add some overhead to each update operation, since the metadata - would need to be re-consolidated to keep it in sync with the store. -2. Readers using consolidated metadata will regularly see a "past" version - of the metadata, at the time they read the root node with its consolidated - metadata. - -.. _Consolidated Metadata: https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#consolidated-metadata \ No newline at end of file diff --git a/docs/guide/index.rst b/docs/guide/index.rst deleted file mode 100644 index e532a13e20..0000000000 --- a/docs/guide/index.rst +++ /dev/null @@ -1,9 +0,0 @@ -Guide -===== - -.. toctree:: - :maxdepth: 1 - - whatsnew_v3 - storage - consolidated_metadata diff --git a/docs/guide/storage.rst b/docs/guide/storage.rst deleted file mode 100644 index 730b0bfcc8..0000000000 --- a/docs/guide/storage.rst +++ /dev/null @@ -1,101 +0,0 @@ -Storage -======= - -Zarr-Python supports multiple storage backends, including: local file systems, -Zip files, remote stores via ``fsspec`` (S3, HTTP, etc.), and in-memory stores. In -Zarr-Python 3, stores must implement the abstract store API from -:class:`zarr.abc.store.Store`. - -.. note:: - Unlike Zarr-Python 2 where the store interface was built around a generic ``MutableMapping`` - API, Zarr-Python 3 utilizes a custom store API that utilizes Python's AsyncIO library. - -Implicit Store Creation ------------------------ - -In most cases, it is not required to create a ``Store`` object explicitly. Passing a string -to Zarr's top level API will result in the store being created automatically. - -.. code-block:: python - - >>> import zarr - >>> zarr.open("data/foo/bar", mode="r") # implicitly creates a read-only LocalStore - - >>> zarr.open("s3://foo/bar", mode="r") # implicitly creates a read-only RemoteStore - - >>> data = {} - >>> zarr.open(data, mode="w") # implicitly creates a MemoryStore - - -Explicit Store Creation ------------------------ - -In some cases, it may be helpful to create a store instance directly. Zarr-Python offers four -built-in store: :class:`zarr.storage.LocalStore`, :class:`zarr.storage.RemoteStore`, -:class:`zarr.storage.ZipStore`, and :class:`zarr.storage.MemoryStore`. - -Local Store -~~~~~~~~~~~ - -The :class:`zarr.storage.LocalStore` stores data in a nested set of directories on a local -filesystem. - -.. code-block:: python - - >>> import zarr - >>> store = zarr.storage.LocalStore("data/foo/bar", read_only=True) - >>> zarr.open(store=store) - - -Zip Store -~~~~~~~~~ - -The :class:`zarr.storage.ZipStore` stores the contents of a Zarr hierarchy in a single -Zip file. The `Zip Store specification_` is currently in draft form. - -.. 
code-block:: python - - >>> import zarr - >>> store = zarr.storage.ZipStore("data.zip", mode="w") - >>> zarr.open(store=store, shape=(2,)) - >> import zarr - >>> store = zarr.storage.RemoteStore.from_url("gs://foo/bar", read_only=True) - >>> zarr.open(store=store) - shape=(10, 20) dtype=float32> - -Memory Store -~~~~~~~~~~~~ - -The :class:`zarr.storage.RemoteStore` a in-memory store that allows for serialization of -Zarr data (metadata and chunks) to a dictionary. - -.. code-block:: python - - >>> import zarr - >>> data = {} - >>> store = zarr.storage.MemoryStore(data) - >>> zarr.open(store=store, shape=(2, )) - - -Developing custom stores ------------------------- - -Zarr-Python :class:`zarr.abc.store.Store` API is meant to be extended. The Store Abstract Base -Class includes all of the methods needed to be a fully operational store in Zarr Python. -Zarr also provides a test harness for custom stores: :class:`zarr.testing.store.StoreTests`. - -.. _Zip Store Specification: https://github.com/zarr-developers/zarr-specs/pull/311 -.. _Fsspec: https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#consolidated-metadata diff --git a/docs/guide/whatsnew_v3.rst b/docs/guide/whatsnew_v3.rst deleted file mode 100644 index 302c3cf20c..0000000000 --- a/docs/guide/whatsnew_v3.rst +++ /dev/null @@ -1,14 +0,0 @@ -What's new in v3 -================ - -This page gives an overview of major changes and additions in version 3. - - -Dependencies ------------- -- The new ``remote`` dependency group can be used to install a supported version of - ``fsspec``, required for remote data access. -- The new ``gpu`` dependency group can be used to install a supported version of - ``cuda``, required for GPU functionality. -- The ``jupyter`` optional dependency group has been removed, since v3 contains no - jupyter specific functionality. diff --git a/docs/index.rst b/docs/index.rst index 4d6188d3a0..37d560f655 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -9,8 +9,8 @@ Zarr-Python :hidden: getting_started - tutorial - guide/index + about + user-guide/index api/index release contributing @@ -52,20 +52,20 @@ Zarr is a file storage format for chunked, compressed, N-dimensional arrays base .. grid-item-card:: :img-top: _static/index_user_guide.svg - Tutorial - ^^^^^^^^ + Guide + ^^^^^ - The tutorial provides working examples of Zarr classes and functions. + A detailed guide for how to use Zarr-Python. +++ - .. button-ref:: tutorial + .. button-ref:: user-guide :ref-type: ref :expand: :color: dark :click-parent: - To the Tutorial + To the user guide .. grid-item-card:: :img-top: _static/index_api.svg diff --git a/docs/release.rst b/docs/release.rst index be0919f08b..ce15c68f4a 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -1906,7 +1906,7 @@ Enhancements * **Advanced indexing**. The ``Array`` class has several new methods and properties that enable a selection of items in an array to be retrieved or - updated. See the :ref:`tutorial_indexing` tutorial section for more + updated. See the :ref:`user-guide-indexing` tutorial section for more information. There is also a `notebook `_ with extended examples and performance benchmarks. :issue:`78`, :issue:`89`, @@ -1919,15 +1919,15 @@ Enhancements compressor codecs for Zstd and LZ4. This change is backwards-compatible with existing code, as all codec classes defined by Numcodecs are imported into the :mod:`zarr.codecs` namespace. 
However, it is recommended to import codecs from - the new package, see the tutorial sections on :ref:`tutorial_compress` and - :ref:`tutorial_filters` for examples. With contributions by + the new package, see the tutorial sections on :ref:`user-guide-compress` and + :ref:`user-guide-filters` for examples. With contributions by :user:`John Kirkham `; :issue:`74`, :issue:`102`, :issue:`120`, :issue:`123`, :issue:`139`. * **New storage class for DBM-style databases**. The :class:`zarr.storage.DBMStore` class enables any DBM-style database such as gdbm, ndbm or Berkeley DB, to be used as the backing store for an array or group. See the - tutorial section on :ref:`tutorial_storage` for some examples. :issue:`133`, + tutorial section on :ref:`user-guide-storage` for some examples. :issue:`133`, :issue:`186`. * **New storage class for LMDB databases**. The :class:`zarr.storage.LMDBStore` class @@ -1943,7 +1943,7 @@ Enhancements :func:`zarr.hierarchy.Group.tree` method which enables a tree representation of a group hierarchy to be printed. Also provides an interactive tree representation when used within a Jupyter notebook. See the - :ref:`tutorial_diagnostics` tutorial section for examples. By + :ref:`user-guide-diagnostics` tutorial section for examples. By :user:`John Kirkham `; :issue:`82`, :issue:`140`, :issue:`184`. * **Visitor API**. The ``Group`` class now implements the h5py visitor API, see @@ -1963,7 +1963,7 @@ Enhancements store. The functions :func:`zarr.convenience.save` and :func:`zarr.convenience.load` are also available and provide a convenient way to save an entire NumPy array to disk and load back into memory later. See the - tutorial section :ref:`tutorial_persist` for examples. :issue:`104`, + tutorial section :ref:`user-guide-persist` for examples. :issue:`104`, :issue:`105`, :issue:`141`, :issue:`181`. * **IPython completions**. The ``Group`` class now implements ``__dir__()`` and @@ -1973,7 +1973,7 @@ Enhancements * **New info property; changes to __repr__**. The ``Group`` and ``Array`` classes have a new ``info`` property which can be used to print diagnostic information, including compression ratio where available. See the - tutorial section on :ref:`tutorial_diagnostics` for examples. The string + tutorial section on :ref:`user-guide-diagnostics` for examples. The string representation (``__repr__``) of these classes has been simplified to ensure it is cheap and quick to compute in all circumstances. :issue:`83`, :issue:`115`, :issue:`132`, :issue:`148`. @@ -1981,7 +1981,7 @@ Enhancements * **Chunk options**. When creating an array, ``chunks=False`` can be specified, which will result in an array with a single chunk only. Alternatively, ``chunks=True`` will trigger an automatic chunk shape guess. See - :ref:`tutorial_chunks` for more on the ``chunks`` parameter. :issue:`106`, + :ref:`user-guide-chunks` for more on the ``chunks`` parameter. :issue:`106`, :issue:`107`, :issue:`183`. * **Zero-dimensional arrays** and are now supported; by @@ -2006,7 +2006,7 @@ Enhancements creating an array with ``dtype=object`` was possible but could under certain circumstances lead to unexpected errors and/or segmentation faults. To make it easier to properly configure an object array, a new ``object_codec`` parameter has been - added to array creation functions. See the tutorial section on :ref:`tutorial_objects` + added to array creation functions. See the tutorial section on :ref:`user-guide-objects` for more information and examples. 
Also, runtime checks have been added in both Zarr and Numcodecs so that segmentation faults are no longer possible, even with a badly configured array. This API change is backwards compatible and previous code that created @@ -2062,16 +2062,16 @@ Documentation with any of the material as previously implemented, and so the changes have been made in-place in the document without incrementing the document version number. See the section on changes in the specification document for more information. -* A new :ref:`tutorial_indexing` section has been added to the tutorial. -* A new :ref:`tutorial_strings` section has been added to the tutorial +* A new :ref:`user-guide-indexing` section has been added to the tutorial. +* A new :ref:`user-guide-strings` section has been added to the tutorial (:issue:`135`, :issue:`175`). -* The :ref:`tutorial_chunks` tutorial section has been reorganised and updated. -* The :ref:`tutorial_persist` and :ref:`tutorial_storage` tutorial sections have +* The :ref:`user-guide-chunks` tutorial section has been reorganised and updated. +* The :ref:`user-guide-persist` and :ref:`user-guide-storage` tutorial sections have been updated with new examples (:issue:`100`, :issue:`101`, :issue:`103`). -* A new tutorial section on :ref:`tutorial_pickle` has been added (:issue:`91`). -* A new tutorial section on :ref:`tutorial_datetime` has been added. -* A new tutorial section on :ref:`tutorial_diagnostics` has been added. -* The tutorial sections on :ref:`tutorial_sync` and :ref:`tutorial_tips_blosc` have been +* A new tutorial section on :ref:`user-guide-pickle` has been added (:issue:`91`). +* A new tutorial section on :ref:`user-guide-datetime` has been added. +* A new tutorial section on :ref:`user-guide-diagnostics` has been added. +* The tutorial sections on :ref:`user-guide-sync` and :ref:`user-guide-tips-blosc` have been updated to provide information about how to avoid program hangs when using the Blosc compressor with multiple processes (:issue:`199`, :issue:`201`). @@ -2177,14 +2177,14 @@ Hierarchies ~~~~~~~~~~~ Support has been added for organizing arrays into hierarchies via groups. See -the tutorial section on :ref:`tutorial_groups` and the :mod:`zarr.hierarchy` +the tutorial section on :ref:`user-guide-groups` and the :mod:`zarr.hierarchy` API docs for more information. Filters ~~~~~~~ Support has been added for configuring filters to preprocess chunk data prior -to compression. See the tutorial section on :ref:`tutorial_filters` and the +to compression. See the tutorial section on :ref:`user-guide-filters` and the :mod:`zarr.codecs` API docs for more information. Other changes @@ -2210,7 +2210,7 @@ Thanks to :user:`Matthew Rocklin `, :user:`Stephan Hoyer ` and * The bundled Blosc library has been upgraded to version 1.10.0. The 'zstd' internal compression library is now available within Blosc. See the tutorial - section on :ref:`tutorial_compress` for an example. + section on :ref:`user-guide-compress` for an example. * When using the Blosc compressor, the default internal compression library is now 'lz4'. * The default number of internal threads for the Blosc compressor has been @@ -2236,8 +2236,8 @@ The main motivation for re-organizing the code was to create an abstraction layer between the core array logic and data storage (:issue:`21`). In this release, any object that implements the ``MutableMapping`` interface can be used as -an array store. 
See the tutorial sections on :ref:`tutorial_persist` -and :ref:`tutorial_storage`, the ``spec_v1``, and the +an array store. See the tutorial sections on :ref:`user-guide-persist` +and :ref:`user-guide-storage`, the ``spec_v1``, and the :mod:`zarr.storage` module documentation for more information. Please note also that the file organization and file name conventions @@ -2256,8 +2256,8 @@ chunks. This release still bundles the c-blosc library and uses Blosc as the default compressor, however other compressors including zlib, BZ2 and LZMA are also now supported via the Python standard library. New compressors can also be dynamically registered for use -with Zarr. See the tutorial sections on :ref:`tutorial_compress` and -:ref:`tutorial_tips_blosc`, the ``spec_v1``, and the +with Zarr. See the tutorial sections on :ref:`user-guide-compress` and +:ref:`user-guide-tips-blosc`, the ``spec_v1``, and the :mod:`zarr.compressors` module documentation for more information. Synchronization @@ -2266,7 +2266,7 @@ Synchronization The synchronization code has also been refactored to create a layer of abstraction, enabling Zarr arrays to be used in parallel computations with a number of alternative synchronization methods. For more -information see the tutorial section on :ref:`tutorial_sync` and the +information see the tutorial section on :ref:`user-guide-sync` and the :mod:`zarr.sync` module documentation. Changes to the Blosc extension @@ -2288,7 +2288,7 @@ is running within a single-threaded or multi-threaded program and adapts its internal behaviour accordingly (:issue:`27`). There is no need for the user to make any API calls to switch Blosc between contextual and non-contextual (global lock) mode. See also the tutorial section on -:ref:`tutorial_tips_blosc`. +:ref:`user-guide-tips-blosc`. Other changes ~~~~~~~~~~~~~ @@ -2302,7 +2302,7 @@ option present in the previous release, and this has been removed. The memory layout within chunks can now be set as either "C" (row-major) or "F" (column-major), which can help to provide better compression for some data (:issue:`7`). See the tutorial -section on :ref:`tutorial_chunks_order` for more information. +section on :ref:`user-guide-chunks-order` for more information. A bug has been fixed within the ``__getitem__`` and ``__setitem__`` machinery for slicing arrays, to properly handle getting and setting diff --git a/docs/tutorial.rst b/docs/tutorial.rst deleted file mode 100644 index 71254900d5..0000000000 --- a/docs/tutorial.rst +++ /dev/null @@ -1,1722 +0,0 @@ -.. _tutorial: - -Tutorial -======== - -Zarr provides classes and functions for working with N-dimensional arrays that -behave like NumPy arrays but whose data is divided into chunks and each chunk is -compressed. If you are already familiar with HDF5 then Zarr arrays provide -similar functionality, but with some additional flexibility. - -.. _tutorial_create: - -Creating an array ------------------ - -Zarr has several functions for creating arrays. For example:: - - >>> import zarr - >>> z = zarr.zeros((10000, 10000), chunks=(1000, 1000), dtype='i4') - >>> z - - -The code above creates a 2-dimensional array of 32-bit integers with 10000 rows -and 10000 columns, divided into chunks where each chunk has 1000 rows and 1000 -columns (and so there will be 100 chunks in total). - -For a complete list of array creation routines see the :mod:`zarr.creation` -module documentation. - -.. 
_tutorial_array: - -Reading and writing data ------------------------- - -Zarr arrays support a similar interface to NumPy arrays for reading and writing -data. For example, the entire array can be filled with a scalar value:: - - >>> z[:] = 42 - -Regions of the array can also be written to, e.g.:: - - >>> import numpy as np - >>> z[0, :] = np.arange(10000) - >>> z[:, 0] = np.arange(10000) - -The contents of the array can be retrieved by slicing, which will load the -requested region into memory as a NumPy array, e.g.:: - - >>> z[0, 0] - 0 - >>> z[-1, -1] - 42 - >>> z[0, :] - array([ 0, 1, 2, ..., 9997, 9998, 9999], dtype=int32) - >>> z[:, 0] - array([ 0, 1, 2, ..., 9997, 9998, 9999], dtype=int32) - >>> z[:] - array([[ 0, 1, 2, ..., 9997, 9998, 9999], - [ 1, 42, 42, ..., 42, 42, 42], - [ 2, 42, 42, ..., 42, 42, 42], - ..., - [9997, 42, 42, ..., 42, 42, 42], - [9998, 42, 42, ..., 42, 42, 42], - [9999, 42, 42, ..., 42, 42, 42]], dtype=int32) - -.. _tutorial_persist: - -Persistent arrays ------------------ - -In the examples above, compressed data for each chunk of the array was stored in -main memory. Zarr arrays can also be stored on a file system, enabling -persistence of data between sessions. For example:: - - >>> z1 = zarr.open('data/example.zarr', mode='w', shape=(10000, 10000), - ... chunks=(1000, 1000), dtype='i4') - -The array above will store its configuration metadata and all compressed chunk -data in a directory called 'data/example.zarr' relative to the current working -directory. The :func:`zarr.convenience.open` function provides a convenient way -to create a new persistent array or continue working with an existing -array. Note that although the function is called "open", there is no need to -close an array: data are automatically flushed to disk, and files are -automatically closed whenever an array is modified. - -Persistent arrays support the same interface for reading and writing data, -e.g.:: - - >>> z1[:] = 42 - >>> z1[0, :] = np.arange(10000) - >>> z1[:, 0] = np.arange(10000) - -Check that the data have been written and can be read again:: - - >>> z2 = zarr.open('data/example.zarr', mode='r') - >>> np.all(z1[:] == z2[:]) - True - -If you are just looking for a fast and convenient way to save NumPy arrays to -disk then load back into memory later, the functions -:func:`zarr.convenience.save` and :func:`zarr.convenience.load` may be -useful. E.g.:: - - >>> a = np.arange(10) - >>> zarr.save('data/example.zarr', a) - >>> zarr.load('data/example.zarr') - array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) - -Please note that there are a number of other options for persistent array -storage, see the section on :ref:`tutorial_storage` below. - -.. _tutorial_resize: - -Resizing and appending ----------------------- - -A Zarr array can be resized, which means that any of its dimensions can be -increased or decreased in length. For example:: - - >>> z = zarr.zeros(shape=(10000, 10000), chunks=(1000, 1000)) - >>> z[:] = 42 - >>> z.resize(20000, 10000) - >>> z.shape - (20000, 10000) - -Note that when an array is resized, the underlying data are not rearranged in -any way. If one or more dimensions are shrunk, any chunks falling outside the -new array shape will be deleted from the underlying store. - -For convenience, Zarr arrays also provide an ``append()`` method, which can be -used to append data to any axis. 
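To illustrate the shrinking case first, a minimal sketch (hypothetical values,
reusing the ``z`` from the resize example above)::

    >>> z.resize(5000, 2500)
    >>> z.shape
    (5000, 2500)

Only the chunks that now fall entirely outside the new shape are deleted from the
store; the data in the remaining chunks are untouched. Returning to ``append()``,
note that it returns the new shape of the array.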
E.g.:: - - >>> a = np.arange(10000000, dtype='i4').reshape(10000, 1000) - >>> z = zarr.array(a, chunks=(1000, 100)) - >>> z.shape - (10000, 1000) - >>> z.append(a) - (20000, 1000) - >>> z.append(np.vstack([a, a]), axis=1) - (20000, 2000) - >>> z.shape - (20000, 2000) - -.. _tutorial_compress: - -Compressors ------------ - -A number of different compressors can be used with Zarr. A separate package -called NumCodecs_ is available which provides a common interface to various -compressor libraries including Blosc, Zstandard, LZ4, Zlib, BZ2 and -LZMA. Different compressors can be provided via the ``compressor`` keyword -argument accepted by all array creation functions. For example:: - - >>> from numcodecs import Blosc - >>> compressor = Blosc(cname='zstd', clevel=3, shuffle=Blosc.BITSHUFFLE) - >>> data = np.arange(100000000, dtype='i4').reshape(10000, 10000) - >>> z = zarr.array(data, chunks=(1000, 1000), compressor=compressor) - >>> z.compressor - Blosc(cname='zstd', clevel=3, shuffle=BITSHUFFLE, blocksize=0) - -This array above will use Blosc as the primary compressor, using the Zstandard -algorithm (compression level 3) internally within Blosc, and with the -bit-shuffle filter applied. - -When using a compressor, it can be useful to get some diagnostics on the -compression ratio. Zarr arrays provide a ``info`` property which can be used to -print some diagnostics, e.g.:: - - >>> z.info - Type : zarr.Array - Data type : int32 - Shape : (10000, 10000) - Chunk shape : (1000, 1000) - Order : C - Read-only : False - Compressor : Blosc(cname='zstd', clevel=3, shuffle=BITSHUFFLE, - : blocksize=0) - Store type : zarr.storage.KVStore - No. bytes : 400000000 (381.5M) - No. bytes stored : 3379344 (3.2M) - Storage ratio : 118.4 - Chunks initialized : 100/100 - -If you don't specify a compressor, by default Zarr uses the Blosc -compressor. Blosc is generally very fast and can be configured in a variety of -ways to improve the compression ratio for different types of data. Blosc is in -fact a "meta-compressor", which means that it can use a number of different -compression algorithms internally to compress the data. Blosc also provides -highly optimized implementations of byte- and bit-shuffle filters, which can -improve compression ratios for some data. A list of the internal compression -libraries available within Blosc can be obtained via:: - - >>> from numcodecs import blosc - >>> blosc.list_compressors() - ['blosclz', 'lz4', 'lz4hc', 'snappy', 'zlib', 'zstd'] - -In addition to Blosc, other compression libraries can also be used. For example, -here is an array using Zstandard compression, level 1:: - - >>> from numcodecs import Zstd - >>> z = zarr.array(np.arange(100000000, dtype='i4').reshape(10000, 10000), - ... chunks=(1000, 1000), compressor=Zstd(level=1)) - >>> z.compressor - Zstd(level=1) - -Here is an example using LZMA with a custom filter pipeline including LZMA's -built-in delta filter:: - - >>> import lzma - >>> lzma_filters = [dict(id=lzma.FILTER_DELTA, dist=4), - ... dict(id=lzma.FILTER_LZMA2, preset=1)] - >>> from numcodecs import LZMA - >>> compressor = LZMA(filters=lzma_filters) - >>> z = zarr.array(np.arange(100000000, dtype='i4').reshape(10000, 10000), - ... 
chunks=(1000, 1000), compressor=compressor) - >>> z.compressor - LZMA(format=1, check=-1, preset=None, filters=[{'dist': 4, 'id': 3}, {'id': 33, 'preset': 1}]) - -The default compressor can be changed by setting the value of the -``zarr.storage.default_compressor`` variable, e.g.:: - - >>> import zarr.storage - >>> from numcodecs import Zstd, Blosc - >>> # switch to using Zstandard - ... zarr.storage.default_compressor = Zstd(level=1) - >>> z = zarr.zeros(100000000, chunks=1000000) - >>> z.compressor - Zstd(level=1) - >>> # switch back to Blosc defaults - ... zarr.storage.default_compressor = Blosc() - -To disable compression, set ``compressor=None`` when creating an array, e.g.:: - - >>> z = zarr.zeros(100000000, chunks=1000000, compressor=None) - >>> z.compressor is None - True - -.. _tutorial_filters: - -Filters -------- - -In some cases, compression can be improved by transforming the data in some -way. For example, if nearby values tend to be correlated, then shuffling the -bytes within each numerical value or storing the difference between adjacent -values may increase compression ratio. Some compressors provide built-in filters -that apply transformations to the data prior to compression. For example, the -Blosc compressor has built-in implementations of byte- and bit-shuffle filters, -and the LZMA compressor has a built-in implementation of a delta -filter. However, to provide additional flexibility for implementing and using -filters in combination with different compressors, Zarr also provides a -mechanism for configuring filters outside of the primary compressor. - -Here is an example using a delta filter with the Blosc compressor:: - - >>> from numcodecs import Blosc, Delta - >>> filters = [Delta(dtype='i4')] - >>> compressor = Blosc(cname='zstd', clevel=1, shuffle=Blosc.SHUFFLE) - >>> data = np.arange(100000000, dtype='i4').reshape(10000, 10000) - >>> z = zarr.array(data, chunks=(1000, 1000), filters=filters, compressor=compressor) - >>> z.info - Type : zarr.Array - Data type : int32 - Shape : (10000, 10000) - Chunk shape : (1000, 1000) - Order : C - Read-only : False - Filter [0] : Delta(dtype='`_ documentation. - -.. _tutorial_groups: - -Groups ------- - -Zarr supports hierarchical organization of arrays via groups. As with arrays, -groups can be stored in memory, on disk, or via other storage systems that -support a similar interface. - -To create a group, use the :func:`zarr.group` function:: - - >>> root = zarr.group() - >>> root - - -Groups have a similar API to the Group class from `h5py -`_. For example, groups can contain other groups:: - - >>> foo = root.create_group('foo') - >>> bar = foo.create_group('bar') - -Groups can also contain arrays, e.g.:: - - >>> z1 = bar.zeros('baz', shape=(10000, 10000), chunks=(1000, 1000), dtype='i4') - >>> z1 - - -Arrays are known as "datasets" in HDF5 terminology. 
For compatibility with h5py, -Zarr groups also implement the ``create_dataset()`` and ``require_dataset()`` -methods, e.g.:: - - >>> z = bar.create_dataset('quux', shape=(10000, 10000), chunks=(1000, 1000), dtype='i4') - >>> z - - -Members of a group can be accessed via the suffix notation, e.g.:: - - >>> root['foo'] - - -The '/' character can be used to access multiple levels of the hierarchy in one -call, e.g.:: - - >>> root['foo/bar'] - - >>> root['foo/bar/baz'] - - -The :func:`zarr.hierarchy.Group.tree` method can be used to print a tree -representation of the hierarchy, e.g.:: - - >>> root.tree() - / - └── foo - └── bar - ├── baz (10000, 10000) int32 - └── quux (10000, 10000) int32 - -The :func:`zarr.convenience.open` function provides a convenient way to create or -re-open a group stored in a directory on the file-system, with sub-groups stored in -sub-directories, e.g.:: - - >>> root = zarr.open('data/group.zarr', mode='w') - >>> root - - >>> z = root.zeros('foo/bar/baz', shape=(10000, 10000), chunks=(1000, 1000), dtype='i4') - >>> z - - -Groups can be used as context managers (in a ``with`` statement). -If the underlying store has a ``close`` method, it will be called on exit. - -For more information on groups see the :mod:`zarr.hierarchy` and -:mod:`zarr.convenience` API docs. - -.. _tutorial_diagnostics: - -Array and group diagnostics ---------------------------- - -Diagnostic information about arrays and groups is available via the ``info`` -property. E.g.:: - - >>> root = zarr.group() - >>> foo = root.create_group('foo') - >>> bar = foo.zeros('bar', shape=1000000, chunks=100000, dtype='i8') - >>> bar[:] = 42 - >>> baz = foo.zeros('baz', shape=(1000, 1000), chunks=(100, 100), dtype='f4') - >>> baz[:] = 4.2 - >>> root.info - Name : / - Type : zarr.hierarchy.Group - Read-only : False - Store type : zarr.storage.MemoryStore - No. members : 1 - No. arrays : 0 - No. groups : 1 - Groups : foo - - >>> foo.info - Name : /foo - Type : zarr.hierarchy.Group - Read-only : False - Store type : zarr.storage.MemoryStore - No. members : 2 - No. arrays : 2 - No. groups : 0 - Arrays : bar, baz - - >>> bar.info - Name : /foo/bar - Type : zarr.Array - Data type : int64 - Shape : (1000000,) - Chunk shape : (100000,) - Order : C - Read-only : False - Compressor : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0) - Store type : zarr.storage.MemoryStore - No. bytes : 8000000 (7.6M) - No. bytes stored : 33240 (32.5K) - Storage ratio : 240.7 - Chunks initialized : 10/10 - - >>> baz.info - Name : /foo/baz - Type : zarr.Array - Data type : float32 - Shape : (1000, 1000) - Chunk shape : (100, 100) - Order : C - Read-only : False - Compressor : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0) - Store type : zarr.storage.MemoryStore - No. bytes : 4000000 (3.8M) - No. bytes stored : 23943 (23.4K) - Storage ratio : 167.1 - Chunks initialized : 100/100 - -Groups also have the :func:`zarr.hierarchy.Group.tree` method, e.g.:: - - >>> root.tree() - / - └── foo - ├── bar (1000000,) int64 - └── baz (1000, 1000) float32 - - -.. note:: - - :func:`zarr.Group.tree` requires the optional `rich `_ - dependency. It can be installed with the ``[tree]`` extra. - -If you're using Zarr within a Jupyter notebook (requires -`ipytree `_), calling ``tree()`` will generate an -interactive tree representation, see the `repr_tree.ipynb notebook -`_ -for more examples. - -.. 
_tutorial_attrs: - -User attributes ---------------- - -Zarr arrays and groups support custom key/value attributes, which can be useful for -storing application-specific metadata. For example:: - - >>> root = zarr.group() - >>> root.attrs['foo'] = 'bar' - >>> z = root.zeros('zzz', shape=(10000, 10000)) - >>> z.attrs['baz'] = 42 - >>> z.attrs['qux'] = [1, 4, 7, 12] - >>> sorted(root.attrs) - ['foo'] - >>> 'foo' in root.attrs - True - >>> root.attrs['foo'] - 'bar' - >>> sorted(z.attrs) - ['baz', 'qux'] - >>> z.attrs['baz'] - 42 - >>> z.attrs['qux'] - [1, 4, 7, 12] - -Internally Zarr uses JSON to store array attributes, so attribute values must be -JSON serializable. - -.. _tutorial_indexing: - -Advanced indexing ------------------ - -As of version 2.2, Zarr arrays support several methods for advanced or "fancy" -indexing, which enable a subset of data items to be extracted or updated in an -array without loading the entire array into memory. - -Note that although this functionality is similar to some of the advanced -indexing capabilities available on NumPy arrays and on h5py datasets, **the Zarr -API for advanced indexing is different from both NumPy and h5py**, so please -read this section carefully. For a complete description of the indexing API, -see the documentation for the :class:`zarr.Array` class. - -Indexing with coordinate arrays -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Items from a Zarr array can be extracted by providing an integer array of -coordinates. E.g.:: - - >>> z = zarr.array(np.arange(10) ** 2) - >>> z[:] - array([ 0, 1, 4, 9, 16, 25, 36, 49, 64, 81]) - >>> z.get_coordinate_selection([2, 5]) - array([ 4, 25]) - -Coordinate arrays can also be used to update data, e.g.:: - - >>> z.set_coordinate_selection([2, 5], [-1, -2]) - >>> z[:] - array([ 0, 1, -1, 9, 16, -2, 36, 49, 64, 81]) - -For multidimensional arrays, coordinates must be provided for each dimension, -e.g.:: - - >>> z = zarr.array(np.arange(15).reshape(3, 5)) - >>> z[:] - array([[ 0, 1, 2, 3, 4], - [ 5, 6, 7, 8, 9], - [10, 11, 12, 13, 14]]) - >>> z.get_coordinate_selection(([0, 2], [1, 3])) - array([ 1, 13]) - >>> z.set_coordinate_selection(([0, 2], [1, 3]), [-1, -2]) - >>> z[:] - array([[ 0, -1, 2, 3, 4], - [ 5, 6, 7, 8, 9], - [10, 11, 12, -2, 14]]) - -For convenience, coordinate indexing is also available via the ``vindex`` -property, as well as the square bracket operator, e.g.:: - - >>> z.vindex[[0, 2], [1, 3]] - array([-1, -2]) - >>> z.vindex[[0, 2], [1, 3]] = [-3, -4] - >>> z[:] - array([[ 0, -3, 2, 3, 4], - [ 5, 6, 7, 8, 9], - [10, 11, 12, -4, 14]]) - >>> z[[0, 2], [1, 3]] - array([-3, -4]) - -When the indexing arrays have different shapes, they are broadcast together. -That is, the following two calls are equivalent:: - - >>> z[1, [1, 3]] - array([6, 8]) - >>> z[[1, 1], [1, 3]] - array([6, 8]) - -Indexing with a mask array -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Items can also be extracted by providing a Boolean mask. 
E.g.:: - - >>> z = zarr.array(np.arange(10) ** 2) - >>> z[:] - array([ 0, 1, 4, 9, 16, 25, 36, 49, 64, 81]) - >>> sel = np.zeros_like(z, dtype=bool) - >>> sel[2] = True - >>> sel[5] = True - >>> z.get_mask_selection(sel) - array([ 4, 25]) - >>> z.set_mask_selection(sel, [-1, -2]) - >>> z[:] - array([ 0, 1, -1, 9, 16, -2, 36, 49, 64, 81]) - -Here's a multidimensional example:: - - >>> z = zarr.array(np.arange(15).reshape(3, 5)) - >>> z[:] - array([[ 0, 1, 2, 3, 4], - [ 5, 6, 7, 8, 9], - [10, 11, 12, 13, 14]]) - >>> sel = np.zeros_like(z, dtype=bool) - >>> sel[0, 1] = True - >>> sel[2, 3] = True - >>> z.get_mask_selection(sel) - array([ 1, 13]) - >>> z.set_mask_selection(sel, [-1, -2]) - >>> z[:] - array([[ 0, -1, 2, 3, 4], - [ 5, 6, 7, 8, 9], - [10, 11, 12, -2, 14]]) - -For convenience, mask indexing is also available via the ``vindex`` property, -e.g.:: - - >>> z.vindex[sel] - array([-1, -2]) - >>> z.vindex[sel] = [-3, -4] - >>> z[:] - array([[ 0, -3, 2, 3, 4], - [ 5, 6, 7, 8, 9], - [10, 11, 12, -4, 14]]) - -Mask indexing is conceptually the same as coordinate indexing, and is -implemented internally via the same machinery. Both styles of indexing allow -selecting arbitrary items from an array, also known as point selection. - -Orthogonal indexing -~~~~~~~~~~~~~~~~~~~ - -Zarr arrays also support methods for orthogonal indexing, which allows -selections to be made along each dimension of an array independently. For -example, this allows selecting a subset of rows and/or columns from a -2-dimensional array. E.g.:: - - >>> z = zarr.array(np.arange(15).reshape(3, 5)) - >>> z[:] - array([[ 0, 1, 2, 3, 4], - [ 5, 6, 7, 8, 9], - [10, 11, 12, 13, 14]]) - >>> z.get_orthogonal_selection(([0, 2], slice(None))) # select first and third rows - array([[ 0, 1, 2, 3, 4], - [10, 11, 12, 13, 14]]) - >>> z.get_orthogonal_selection((slice(None), [1, 3])) # select second and fourth columns - array([[ 1, 3], - [ 6, 8], - [11, 13]]) - >>> z.get_orthogonal_selection(([0, 2], [1, 3])) # select rows [0, 2] and columns [1, 4] - array([[ 1, 3], - [11, 13]]) - -Data can also be modified, e.g.:: - - >>> z.set_orthogonal_selection(([0, 2], [1, 3]), [[-1, -2], [-3, -4]]) - >>> z[:] - array([[ 0, -1, 2, -2, 4], - [ 5, 6, 7, 8, 9], - [10, -3, 12, -4, 14]]) - -For convenience, the orthogonal indexing functionality is also available via the -``oindex`` property, e.g.:: - - >>> z = zarr.array(np.arange(15).reshape(3, 5)) - >>> z.oindex[[0, 2], :] # select first and third rows - array([[ 0, 1, 2, 3, 4], - [10, 11, 12, 13, 14]]) - >>> z.oindex[:, [1, 3]] # select second and fourth columns - array([[ 1, 3], - [ 6, 8], - [11, 13]]) - >>> z.oindex[[0, 2], [1, 3]] # select rows [0, 2] and columns [1, 4] - array([[ 1, 3], - [11, 13]]) - >>> z.oindex[[0, 2], [1, 3]] = [[-1, -2], [-3, -4]] - >>> z[:] - array([[ 0, -1, 2, -2, 4], - [ 5, 6, 7, 8, 9], - [10, -3, 12, -4, 14]]) - -Any combination of integer, slice, 1D integer array and/or 1D Boolean array can -be used for orthogonal indexing. - -If the index contains at most one iterable, and otherwise contains only slices and integers, -orthogonal indexing is also available directly on the array: - - >>> z = zarr.array(np.arange(15).reshape(3, 5)) - >>> all(z.oindex[[0, 2], :] == z[[0, 2], :]) - True - -Block Indexing -~~~~~~~~~~~~~~ - -As of version 2.16.0, Zarr also support block indexing, which allows -selections of whole chunks based on their logical indices along each dimension -of an array. 
For example, this allows selecting a subset of chunk aligned rows and/or -columns from a 2-dimensional array. E.g.:: - - >>> import zarr - >>> import numpy as np - >>> z = zarr.array(np.arange(100).reshape(10, 10), chunks=(3, 3)) - -Retrieve items by specifying their block coordinates:: - - >>> z.get_block_selection(1) - array([[30, 31, 32, 33, 34, 35, 36, 37, 38, 39], - [40, 41, 42, 43, 44, 45, 46, 47, 48, 49], - [50, 51, 52, 53, 54, 55, 56, 57, 58, 59]]) - -Equivalent slicing:: - - >>> z[3:6] - array([[30, 31, 32, 33, 34, 35, 36, 37, 38, 39], - [40, 41, 42, 43, 44, 45, 46, 47, 48, 49], - [50, 51, 52, 53, 54, 55, 56, 57, 58, 59]]) - - -For convenience, the block selection functionality is also available via the -`blocks` property, e.g.:: - - >>> z.blocks[1] - array([[30, 31, 32, 33, 34, 35, 36, 37, 38, 39], - [40, 41, 42, 43, 44, 45, 46, 47, 48, 49], - [50, 51, 52, 53, 54, 55, 56, 57, 58, 59]]) - -Block index arrays may be multidimensional to index multidimensional arrays. -For example:: - - >>> z.blocks[0, 1:3] - array([[ 3, 4, 5, 6, 7, 8], - [13, 14, 15, 16, 17, 18], - [23, 24, 25, 26, 27, 28]]) - -Data can also be modified. Let's start by a simple 2D array:: - - >>> import zarr - >>> import numpy as np - >>> z = zarr.zeros((6, 6), dtype=int, chunks=2) - -Set data for a selection of items:: - - >>> z.set_block_selection((1, 0), 1) - >>> z[...] - array([[0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0], - [1, 1, 0, 0, 0, 0], - [1, 1, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0]]) - -For convenience, this functionality is also available via the ``blocks`` property. -E.g.:: - - >>> z.blocks[:, 2] = 7 - >>> z[...] - array([[0, 0, 0, 0, 7, 7], - [0, 0, 0, 0, 7, 7], - [1, 1, 0, 0, 7, 7], - [1, 1, 0, 0, 7, 7], - [0, 0, 0, 0, 7, 7], - [0, 0, 0, 0, 7, 7]]) - -Any combination of integer and slice can be used for block indexing:: - - >>> z.blocks[2, 1:3] - array([[0, 0, 7, 7], - [0, 0, 7, 7]]) - -Indexing fields in structured arrays -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -All selection methods support a ``fields`` parameter which allows retrieving or -replacing data for a specific field in an array with a structured dtype. E.g.:: - - >>> a = np.array([(b'aaa', 1, 4.2), - ... (b'bbb', 2, 8.4), - ... (b'ccc', 3, 12.6)], - ... dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')]) - >>> z = zarr.array(a) - >>> z['foo'] - array([b'aaa', b'bbb', b'ccc'], - dtype='|S3') - >>> z['baz'] - array([ 4.2, 8.4, 12.6]) - >>> z.get_basic_selection(slice(0, 2), fields='bar') - array([1, 2], dtype=int32) - >>> z.get_coordinate_selection([0, 2], fields=['foo', 'baz']) - array([(b'aaa', 4.2), (b'ccc', 12.6)], - dtype=[('foo', 'S3'), ('baz', '>> z = zarr.open('data/example.zarr', mode='w', shape=1000000, dtype='i4') - -...is short-hand for:: - - >>> store = zarr.DirectoryStore('data/example.zarr') - >>> z = zarr.create(store=store, overwrite=True, shape=1000000, dtype='i4') - -...and the following code:: - - >>> root = zarr.open('data/example.zarr', mode='w') - -...is short-hand for:: - - >>> store = zarr.DirectoryStore('data/example.zarr') - >>> root = zarr.group(store=store, overwrite=True) - -Any other compatible storage class could be used in place of -:class:`zarr.storage.DirectoryStore` in the code examples above. 
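At the simplest extreme, a plain Python ``dict`` satisfies the ``MutableMapping``
interface and can therefore act as a store. A minimal sketch (hypothetical shape and
chunking, using only creation calls shown elsewhere in this tutorial)::

    >>> store = dict()
    >>> z = zarr.create(store=store, shape=(10,), chunks=(5,), dtype='i4')
    >>> z[:] = 42
    >>> sorted(store)
    ['.zarray', '0', '1']

The array metadata and the two compressed chunks are simply entries in the dict.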
For example, -here is an array stored directly into a ZIP archive, via the -:class:`zarr.storage.ZipStore` class:: - - >>> store = zarr.ZipStore('data/example.zip', mode='w') - >>> root = zarr.group(store=store) - >>> z = root.zeros('foo/bar', shape=(1000, 1000), chunks=(100, 100), dtype='i4') - >>> z[:] = 42 - >>> store.close() - -Re-open and check that data have been written:: - - >>> store = zarr.ZipStore('data/example.zip', mode='r') - >>> root = zarr.group(store=store) - >>> z = root['foo/bar'] - >>> z[:] - array([[42, 42, 42, ..., 42, 42, 42], - [42, 42, 42, ..., 42, 42, 42], - [42, 42, 42, ..., 42, 42, 42], - ..., - [42, 42, 42, ..., 42, 42, 42], - [42, 42, 42, ..., 42, 42, 42], - [42, 42, 42, ..., 42, 42, 42]], dtype=int32) - >>> store.close() - -Note that there are some limitations on how ZIP archives can be used, because items -within a ZIP archive cannot be updated in place. This means that data in the array -should only be written once and write operations should be aligned with chunk -boundaries. Note also that the ``close()`` method must be called after writing -any data to the store, otherwise essential records will not be written to the -underlying ZIP archive. - -Another storage alternative is the :class:`zarr.storage.DBMStore` class, added -in Zarr version 2.2. This class allows any DBM-style database to be used for -storing an array or group. Here is an example using a Berkeley DB B-tree -database for storage (requires `bsddb3 -`_ to be installed):: - - >>> import bsddb3 - >>> store = zarr.DBMStore('data/example.bdb', open=bsddb3.btopen) - >>> root = zarr.group(store=store, overwrite=True) - >>> z = root.zeros('foo/bar', shape=(1000, 1000), chunks=(100, 100), dtype='i4') - >>> z[:] = 42 - >>> store.close() - -Also added in Zarr version 2.2 is the :class:`zarr.storage.LMDBStore` class which -enables the lightning memory-mapped database (LMDB) to be used for storing an array or -group (requires `lmdb `_ to be installed):: - - >>> store = zarr.LMDBStore('data/example.lmdb') - >>> root = zarr.group(store=store, overwrite=True) - >>> z = root.zeros('foo/bar', shape=(1000, 1000), chunks=(100, 100), dtype='i4') - >>> z[:] = 42 - >>> store.close() - -In Zarr version 2.3 is the :class:`zarr.storage.SQLiteStore` class which -enables the SQLite database to be used for storing an array or group (requires -Python is built with SQLite support):: - - >>> store = zarr.SQLiteStore('data/example.sqldb') - >>> root = zarr.group(store=store, overwrite=True) - >>> z = root.zeros('foo/bar', shape=(1000, 1000), chunks=(100, 100), dtype='i4') - >>> z[:] = 42 - >>> store.close() - -Also added in Zarr version 2.3 are two storage classes for interfacing with server-client -databases. The :class:`zarr.storage.RedisStore` class interfaces `Redis `_ -(an in memory data structure store), and the :class:`zarr.storage.MongoDB` class interfaces -with `MongoDB `_ (an object oriented NoSQL database). These stores -respectively require the `redis-py `_ and -`pymongo `_ packages to be installed. - -For compatibility with the `N5 `_ data format, Zarr also provides -an N5 backend (this is currently an experimental feature). 
Similar to the ZIP storage class, an -:class:`zarr.n5.N5Store` can be instantiated directly:: - - >>> store = zarr.N5Store('data/example.n5') - >>> root = zarr.group(store=store) - >>> z = root.zeros('foo/bar', shape=(1000, 1000), chunks=(100, 100), dtype='i4') - >>> z[:] = 42 - -For convenience, the N5 backend will automatically be chosen when the filename -ends with `.n5`:: - - >>> root = zarr.open('data/example.n5', mode='w') - -Distributed/cloud storage -~~~~~~~~~~~~~~~~~~~~~~~~~ - -It is also possible to use distributed storage systems. The Dask project has -implementations of the ``MutableMapping`` interface for Amazon S3 (`S3Map -`_), Hadoop -Distributed File System (`HDFSMap -`_) and -Google Cloud Storage (`GCSMap -`_), which -can be used with Zarr. - -Here is an example using S3Map to read an array created previously:: - - >>> import s3fs - >>> import zarr - >>> s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name='eu-west-2')) - >>> store = s3fs.S3Map(root='zarr-demo/store', s3=s3, check=False) - >>> root = zarr.group(store=store) - >>> z = root['foo/bar/baz'] - >>> z - - >>> z.info - Name : /foo/bar/baz - Type : zarr.Array - Data type : |S1 - Shape : (21,) - Chunk shape : (7,) - Order : C - Read-only : False - Compressor : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0) - Store type : zarr.storage.KVStore - No. bytes : 21 - No. bytes stored : 382 - Storage ratio : 0.1 - Chunks initialized : 3/3 - >>> z[:] - array([b'H', b'e', b'l', b'l', b'o', b' ', b'f', b'r', b'o', b'm', b' ', - b't', b'h', b'e', b' ', b'c', b'l', b'o', b'u', b'd', b'!'], - dtype='|S1') - >>> z[:].tobytes() - b'Hello from the cloud!' - -Zarr now also has a builtin storage backend for Azure Blob Storage. -The class is :class:`zarr.storage.ABSStore` (requires -`azure-storage-blob `_ -to be installed):: - - >>> import azure.storage.blob - >>> container_client = azure.storage.blob.ContainerClient(...) # doctest: +SKIP - >>> store = zarr.ABSStore(client=container_client, prefix='zarr-testing') # doctest: +SKIP - >>> root = zarr.group(store=store, overwrite=True) # doctest: +SKIP - >>> z = root.zeros('foo/bar', shape=(1000, 1000), chunks=(100, 100), dtype='i4') # doctest: +SKIP - >>> z[:] = 42 # doctest: +SKIP - -When using an actual storage account, provide ``account_name`` and -``account_key`` arguments to :class:`zarr.storage.ABSStore`, the -above client is just testing against the emulator. Please also note -that this is an experimental feature. - -Note that retrieving data from a remote service via the network can be significantly -slower than retrieving data from a local file system, and will depend on network latency -and bandwidth between the client and server systems. If you are experiencing poor -performance, there are several things you can try. One option is to increase the array -chunk size, which will reduce the number of chunks and thus reduce the number of network -round-trips required to retrieve data for an array (and thus reduce the impact of network -latency). Another option is to try to increase the compression ratio by changing -compression options or trying a different compressor (which will reduce the impact of -limited network bandwidth). - -As of version 2.2, Zarr also provides the :class:`zarr.storage.LRUStoreCache` -which can be used to implement a local in-memory cache layer over a remote -store. 
E.g.:: - - >>> s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name='eu-west-2')) - >>> store = s3fs.S3Map(root='zarr-demo/store', s3=s3, check=False) - >>> cache = zarr.LRUStoreCache(store, max_size=2**28) - >>> root = zarr.group(store=cache) - >>> z = root['foo/bar/baz'] - >>> from timeit import timeit - >>> # first data access is relatively slow, retrieved from store - ... timeit('print(z[:].tobytes())', number=1, globals=globals()) # doctest: +SKIP - b'Hello from the cloud!' - 0.1081731989979744 - >>> # second data access is faster, uses cache - ... timeit('print(z[:].tobytes())', number=1, globals=globals()) # doctest: +SKIP - b'Hello from the cloud!' - 0.0009490990014455747 - -If you are still experiencing poor performance with distributed/cloud storage, -please raise an issue on the GitHub issue tracker with any profiling data you -can provide, as there may be opportunities to optimise further either within -Zarr or within the mapping interface to the storage. - -IO with ``fsspec`` -~~~~~~~~~~~~~~~~~~ - -As of version 2.5, zarr supports passing URLs directly to `fsspec`_, -and having it create the "mapping" instance automatically. This means, that -for all of the backend storage implementations `supported by fsspec`_, -you can skip importing and configuring the storage explicitly. -For example:: - - >>> g = zarr.open_group("s3://zarr-demo/store", storage_options={'anon': True}) # doctest: +SKIP - >>> g['foo/bar/baz'][:].tobytes() # doctest: +SKIP - b'Hello from the cloud!' - -The provision of the protocol specifier "s3://" will select the correct backend. -Notice the kwargs ``storage_options``, used to pass parameters to that backend. - -As of version 2.6, write mode and complex URLs are also supported, such as:: - - >>> g = zarr.open_group("simplecache::s3://zarr-demo/store", - ... storage_options={"s3": {'anon': True}}) # doctest: +SKIP - >>> g['foo/bar/baz'][:].tobytes() # downloads target file # doctest: +SKIP - b'Hello from the cloud!' - >>> g['foo/bar/baz'][:].tobytes() # uses cached file # doctest: +SKIP - b'Hello from the cloud!' - -The second invocation here will be much faster. Note that the ``storage_options`` -have become more complex here, to account for the two parts of the supplied -URL. - -It is also possible to initialize the filesystem outside of Zarr and then pass -it through. This requires creating an :class:`zarr.storage.FSStore` object -explicitly. For example:: - - >>> import s3fs # doctest: +SKIP - >>> fs = s3fs.S3FileSystem(anon=True) # doctest: +SKIP - >>> store = zarr.storage.FSStore('/zarr-demo/store', fs=fs) # doctest: +SKIP - >>> g = zarr.open_group(store) # doctest: +SKIP - -This is useful in cases where you want to also use the same fsspec filesystem object -separately from Zarr. - -.. _fsspec: https://filesystem-spec.readthedocs.io/en/latest/ - -.. _supported by fsspec: https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations - -.. _tutorial_copy: - -Accessing ZIP archives on S3 -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The built-in :class:`zarr.storage.ZipStore` will only work with paths on the local file-system; however -it is possible to access ZIP-archived Zarr data on the cloud via the `ZipFileSystem `_ -class from ``fsspec``. 
The following example demonstrates how to access -a ZIP-archived Zarr group on s3 using `s3fs `_ and ``ZipFileSystem``: - - >>> s3_path = "s3://path/to/my.zarr.zip" - >>> - >>> s3 = s3fs.S3FileSystem() - >>> f = s3.open(s3_path) - >>> fs = ZipFileSystem(f, mode="r") - >>> store = FSMap("", fs, check=False) - >>> - >>> # caching may improve performance when repeatedly reading the same data - >>> cache = zarr.storage.LRUStoreCache(store, max_size=2**28) - >>> z = zarr.group(store=cache) - -This store can also be generated with ``fsspec``'s handler chaining, like so: - - >>> store = zarr.storage.FSStore(url=f"zip::{s3_path}", mode="r") - -This can be especially useful if you have a very large ZIP-archived Zarr array or group on s3 -and only need to access a small portion of it. - -Consolidating metadata -~~~~~~~~~~~~~~~~~~~~~~ - -Since there is a significant overhead for every connection to a cloud object -store such as S3, the pattern described in the previous section may incur -significant latency while scanning the metadata of the array hierarchy, even -though each individual metadata object is small. For cases such as these, once -the data are static and can be regarded as read-only, at least for the -metadata/structure of the array hierarchy, the many metadata objects can be -consolidated into a single one via -:func:`zarr.convenience.consolidate_metadata`. Doing this can greatly increase -the speed of reading the array metadata, e.g.:: - - >>> zarr.consolidate_metadata(store) # doctest: +SKIP - -This creates a special key with a copy of all of the metadata from all of the -metadata objects in the store. - -Later, to open a Zarr store with consolidated metadata, use -:func:`zarr.convenience.open_consolidated`, e.g.:: - - >>> root = zarr.open_consolidated(store) # doctest: +SKIP - -This uses the special key to read all of the metadata in a single call to the -backend storage. - -Note that, the hierarchy could still be opened in the normal way and altered, -causing the consolidated metadata to become out of sync with the real state of -the array hierarchy. In this case, -:func:`zarr.convenience.consolidate_metadata` would need to be called again. - -To protect against consolidated metadata accidentally getting out of sync, the -root group returned by :func:`zarr.convenience.open_consolidated` is read-only -for the metadata, meaning that no new groups or arrays can be created, and -arrays cannot be resized. However, data values with arrays can still be updated. - -Copying/migrating data ----------------------- - -If you have some data in an HDF5 file and would like to copy some or all of it -into a Zarr group, or vice-versa, the :func:`zarr.convenience.copy` and -:func:`zarr.convenience.copy_all` functions can be used. 
Here's an example -copying a group named 'foo' from an HDF5 file to a Zarr group:: - - >>> import h5py - >>> import zarr - >>> import numpy as np - >>> source = h5py.File('data/example.h5', mode='w') - >>> foo = source.create_group('foo') - >>> baz = foo.create_dataset('bar/baz', data=np.arange(100), chunks=(50,)) - >>> spam = source.create_dataset('spam', data=np.arange(100, 200), chunks=(30,)) - >>> zarr.tree(source) - / - ├── foo - │ └── bar - │ └── baz (100,) int64 - └── spam (100,) int64 - >>> dest = zarr.open_group('data/example.zarr', mode='w') - >>> from sys import stdout - >>> zarr.copy(source['foo'], dest, log=stdout) - copy /foo - copy /foo/bar - copy /foo/bar/baz (100,) int64 - all done: 3 copied, 0 skipped, 800 bytes copied - (3, 0, 800) - >>> dest.tree() # N.B., no spam - / - └── foo - └── bar - └── baz (100,) int64 - >>> source.close() - -If rather than copying a single group or array you would like to copy all -groups and arrays, use :func:`zarr.convenience.copy_all`, e.g.:: - - >>> source = h5py.File('data/example.h5', mode='r') - >>> dest = zarr.open_group('data/example2.zarr', mode='w') - >>> zarr.copy_all(source, dest, log=stdout) - copy /foo - copy /foo/bar - copy /foo/bar/baz (100,) int64 - copy /spam (100,) int64 - all done: 4 copied, 0 skipped, 1,600 bytes copied - (4, 0, 1600) - >>> dest.tree() - / - ├── foo - │ └── bar - │ └── baz (100,) int64 - └── spam (100,) int64 - -If you need to copy data between two Zarr groups, the -:func:`zarr.convenience.copy` and :func:`zarr.convenience.copy_all` functions can -be used and provide the most flexibility. However, if you want to copy data -in the most efficient way possible, without changing any configuration options, -the :func:`zarr.convenience.copy_store` function can be used. This function -copies data directly between the underlying stores, without any decompression or -re-compression, and so should be faster. E.g.:: - - >>> import zarr - >>> import numpy as np - >>> store1 = zarr.DirectoryStore('data/example.zarr') - >>> root = zarr.group(store1, overwrite=True) - >>> baz = root.create_dataset('foo/bar/baz', data=np.arange(100), chunks=(50,)) - >>> spam = root.create_dataset('spam', data=np.arange(100, 200), chunks=(30,)) - >>> root.tree() - / - ├── foo - │ └── bar - │ └── baz (100,) int64 - └── spam (100,) int64 - >>> from sys import stdout - >>> store2 = zarr.ZipStore('data/example.zip', mode='w') - >>> zarr.copy_store(store1, store2, log=stdout) - copy .zgroup - copy foo/.zgroup - copy foo/bar/.zgroup - copy foo/bar/baz/.zarray - copy foo/bar/baz/0 - copy foo/bar/baz/1 - copy spam/.zarray - copy spam/0 - copy spam/1 - copy spam/2 - copy spam/3 - all done: 11 copied, 0 skipped, 1,138 bytes copied - (11, 0, 1138) - >>> new_root = zarr.group(store2) - >>> new_root.tree() - / - ├── foo - │ └── bar - │ └── baz (100,) int64 - └── spam (100,) int64 - >>> new_root['foo/bar/baz'][:] - array([ 0, 1, 2, ..., 97, 98, 99]) - >>> store2.close() # ZIP stores need to be closed - -.. _tutorial_strings: - -String arrays -------------- - -There are several options for storing arrays of strings. - -If your strings are all ASCII strings, and you know the maximum length of the string in -your array, then you can use an array with a fixed-length bytes dtype. E.g.:: - - >>> z = zarr.zeros(10, dtype='S6') - >>> z - - >>> z[0] = b'Hello' - >>> z[1] = b'world!' 
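    >>> # NB: as with NumPy, a value longer than 6 bytes would be silently truncated to fit 'S6'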
- >>> z[:] - array([b'Hello', b'world!', b'', b'', b'', b'', b'', b'', b'', b''], - dtype='|S6') - -A fixed-length unicode dtype is also available, e.g.:: - - >>> greetings = ['¡Hola mundo!', 'Hej Världen!', 'Servus Woid!', 'Hei maailma!', - ... 'Xin chào thế giới', 'Njatjeta Botë!', 'Γεια σου κόσμε!', - ... 'こんにちは世界', '世界,你好!', 'Helló, világ!', 'Zdravo svete!', - ... 'เฮลโลเวิลด์'] - >>> text_data = greetings * 10000 - >>> z = zarr.array(text_data, dtype='U20') - >>> z - - >>> z[:] - array(['¡Hola mundo!', 'Hej Världen!', 'Servus Woid!', ..., - 'Helló, világ!', 'Zdravo svete!', 'เฮลโลเวิลด์'], - dtype='>> import numcodecs - >>> z = zarr.array(text_data, dtype=object, object_codec=numcodecs.VLenUTF8()) - >>> z - - >>> z.filters - [VLenUTF8()] - >>> z[:] - array(['¡Hola mundo!', 'Hej Världen!', 'Servus Woid!', ..., - 'Helló, világ!', 'Zdravo svete!', 'เฮลโลเวิลด์'], dtype=object) - -As a convenience, ``dtype=str`` (or ``dtype=unicode`` on Python 2.7) can be used, which -is a short-hand for ``dtype=object, object_codec=numcodecs.VLenUTF8()``, e.g.:: - - >>> z = zarr.array(text_data, dtype=str) - >>> z - - >>> z.filters - [VLenUTF8()] - >>> z[:] - array(['¡Hola mundo!', 'Hej Världen!', 'Servus Woid!', ..., - 'Helló, világ!', 'Zdravo svete!', 'เฮลโลเวิลด์'], dtype=object) - -Variable-length byte strings are also supported via ``dtype=object``. Again an -``object_codec`` is required, which can be one of :class:`numcodecs.vlen.VLenBytes` or -:class:`numcodecs.pickles.Pickle`. For convenience, ``dtype=bytes`` (or ``dtype=str`` on Python -2.7) can be used as a short-hand for ``dtype=object, object_codec=numcodecs.VLenBytes()``, -e.g.:: - - >>> bytes_data = [g.encode('utf-8') for g in greetings] * 10000 - >>> z = zarr.array(bytes_data, dtype=bytes) - >>> z - - >>> z.filters - [VLenBytes()] - >>> z[:] - array([b'\xc2\xa1Hola mundo!', b'Hej V\xc3\xa4rlden!', b'Servus Woid!', - ..., b'Hell\xc3\xb3, vil\xc3\xa1g!', b'Zdravo svete!', - b'\xe0\xb9\x80\xe0\xb8\xae\xe0\xb8\xa5\xe0\xb9\x82\xe0\xb8\xa5\xe0\xb9\x80\xe0\xb8\xa7\xe0\xb8\xb4\xe0\xb8\xa5\xe0\xb8\x94\xe0\xb9\x8c'], dtype=object) - -If you know ahead of time all the possible string values that can occur, you could -also use the :class:`numcodecs.categorize.Categorize` codec to encode each unique string value as an -integer. E.g.:: - - >>> categorize = numcodecs.Categorize(greetings, dtype=object) - >>> z = zarr.array(text_data, dtype=object, object_codec=categorize) - >>> z - - >>> z.filters - [Categorize(dtype='|O', astype='|u1', labels=['¡Hola mundo!', 'Hej Världen!', 'Servus Woid!', ...])] - >>> z[:] - array(['¡Hola mundo!', 'Hej Världen!', 'Servus Woid!', ..., - 'Helló, világ!', 'Zdravo svete!', 'เฮลโลเวิลด์'], dtype=object) - - -.. _tutorial_objects: - -Object arrays -------------- - -Zarr supports arrays with an "object" dtype. This allows arrays to contain any type of -object, such as variable length unicode strings, or variable length arrays of numbers, or -other possibilities. When creating an object array, a codec must be provided via the -``object_codec`` argument. This codec handles encoding (serialization) of Python objects. -The best codec to use will depend on what type of objects are present in the array. - -At the time of writing there are three codecs available that can serve as a general -purpose object codec and support encoding of a mixture of object types: -:class:`numcodecs.json.JSON`, :class:`numcodecs.msgpacks.MsgPack`. and :class:`numcodecs.pickles.Pickle`. 
- -For example, using the JSON codec:: - - >>> z = zarr.empty(5, dtype=object, object_codec=numcodecs.JSON()) - >>> z[0] = 42 - >>> z[1] = 'foo' - >>> z[2] = ['bar', 'baz', 'qux'] - >>> z[3] = {'a': 1, 'b': 2.2} - >>> z[:] - array([42, 'foo', list(['bar', 'baz', 'qux']), {'a': 1, 'b': 2.2}, None], dtype=object) - -Not all codecs support encoding of all object types. The -:class:`numcodecs.pickles.Pickle` codec is the most flexible, supporting encoding any type -of Python object. However, if you are sharing data with anyone other than yourself, then -Pickle is not recommended as it is a potential security risk. This is because malicious -code can be embedded within pickled data. The JSON and MsgPack codecs do not have any -security issues and support encoding of unicode strings, lists and dictionaries. -MsgPack is usually faster for both encoding and decoding. - -Ragged arrays -~~~~~~~~~~~~~ - -If you need to store an array of arrays, where each member array can be of any length -and stores the same primitive type (a.k.a. a ragged array), the -:class:`numcodecs.vlen.VLenArray` codec can be used, e.g.:: - - >>> z = zarr.empty(4, dtype=object, object_codec=numcodecs.VLenArray(int)) - >>> z - - >>> z.filters - [VLenArray(dtype='>> z[0] = np.array([1, 3, 5]) - >>> z[1] = np.array([4]) - >>> z[2] = np.array([7, 9, 14]) - >>> z[:] - array([array([1, 3, 5]), array([4]), array([ 7, 9, 14]), - array([], dtype=int64)], dtype=object) - -As a convenience, ``dtype='array:T'`` can be used as a short-hand for -``dtype=object, object_codec=numcodecs.VLenArray('T')``, where 'T' can be any NumPy -primitive dtype such as 'i4' or 'f8'. E.g.:: - - >>> z = zarr.empty(4, dtype='array:i8') - >>> z - - >>> z.filters - [VLenArray(dtype='>> z[0] = np.array([1, 3, 5]) - >>> z[1] = np.array([4]) - >>> z[2] = np.array([7, 9, 14]) - >>> z[:] - array([array([1, 3, 5]), array([4]), array([ 7, 9, 14]), - array([], dtype=int64)], dtype=object) - -.. _tutorial_chunks: - -Chunk optimizations -------------------- - -.. _tutorial_chunks_shape: - -Chunk size and shape -~~~~~~~~~~~~~~~~~~~~ - -In general, chunks of at least 1 megabyte (1M) uncompressed size seem to provide -better performance, at least when using the Blosc compression library. - -The optimal chunk shape will depend on how you want to access the data. E.g., -for a 2-dimensional array, if you only ever take slices along the first -dimension, then chunk across the second dimension. If you know you want to chunk -across an entire dimension you can use ``None`` or ``-1`` within the ``chunks`` -argument, e.g.:: - - >>> z1 = zarr.zeros((10000, 10000), chunks=(100, None), dtype='i4') - >>> z1.chunks - (100, 10000) - -Alternatively, if you only ever take slices along the second dimension, then -chunk across the first dimension, e.g.:: - - >>> z2 = zarr.zeros((10000, 10000), chunks=(None, 100), dtype='i4') - >>> z2.chunks - (10000, 100) - -If you require reasonable performance for both access patterns then you need to -find a compromise, e.g.:: - - >>> z3 = zarr.zeros((10000, 10000), chunks=(1000, 1000), dtype='i4') - >>> z3.chunks - (1000, 1000) - -If you are feeling lazy, you can let Zarr guess a chunk shape for your data by -providing ``chunks=True``, although please note that the algorithm for guessing -a chunk shape is based on simple heuristics and may be far from optimal. 
E.g.:: - - >>> z4 = zarr.zeros((10000, 10000), chunks=True, dtype='i4') - >>> z4.chunks - (625, 625) - -If you know you are always going to be loading the entire array into memory, you -can turn off chunks by providing ``chunks=False``, in which case there will be -one single chunk for the array:: - - >>> z5 = zarr.zeros((10000, 10000), chunks=False, dtype='i4') - >>> z5.chunks - (10000, 10000) - -.. _tutorial_chunks_order: - -Chunk memory layout -~~~~~~~~~~~~~~~~~~~ - -The order of bytes **within each chunk** of an array can be changed via the -``order`` keyword argument, to use either C or Fortran layout. For -multi-dimensional arrays, these two layouts may provide different compression -ratios, depending on the correlation structure within the data. E.g.:: - - >>> a = np.arange(100000000, dtype='i4').reshape(10000, 10000).T - >>> c = zarr.array(a, chunks=(1000, 1000)) - >>> c.info - Type : zarr.Array - Data type : int32 - Shape : (10000, 10000) - Chunk shape : (1000, 1000) - Order : C - Read-only : False - Compressor : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0) - Store type : zarr.storage.KVStore - No. bytes : 400000000 (381.5M) - No. bytes stored : 6696010 (6.4M) - Storage ratio : 59.7 - Chunks initialized : 100/100 - >>> f = zarr.array(a, chunks=(1000, 1000), order='F') - >>> f.info - Type : zarr.Array - Data type : int32 - Shape : (10000, 10000) - Chunk shape : (1000, 1000) - Order : F - Read-only : False - Compressor : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0) - Store type : zarr.storage.KVStore - No. bytes : 400000000 (381.5M) - No. bytes stored : 4684636 (4.5M) - Storage ratio : 85.4 - Chunks initialized : 100/100 - -In the above example, Fortran order gives a better compression ratio. This is an -artificial example but illustrates the general point that changing the order of -bytes within chunks of an array may improve the compression ratio, depending on -the structure of the data, the compression algorithm used, and which compression -filters (e.g., byte-shuffle) have been applied. - -.. _tutorial_chunks_empty_chunks: - -Empty chunks -~~~~~~~~~~~~ - -As of version 2.11, it is possible to configure how Zarr handles the storage of -chunks that are "empty" (i.e., every element in the chunk is equal to the array's fill value). -When creating an array with ``write_empty_chunks=False``, -Zarr will check whether a chunk is empty before compression and storage. If a chunk is empty, -then Zarr does not store it, and instead deletes the chunk from storage -if the chunk had been previously stored. - -This optimization prevents storing redundant objects and can speed up reads, but the cost is -added computation during array writes, since the contents of -each chunk must be compared to the fill value, and these advantages are contingent on the content of the array. -If you know that your data will form chunks that are almost always non-empty, then there is no advantage to the optimization described above. -In this case, creating an array with ``write_empty_chunks=True`` (the default) will instruct Zarr to write every chunk without checking for emptiness. - -The following example illustrates the effect of the ``write_empty_chunks`` flag on -the time required to write an array with different values.:: - - >>> import zarr - >>> import numpy as np - >>> import time - >>> from tempfile import TemporaryDirectory - >>> def timed_write(write_empty_chunks): - ... """ - ... Measure the time required and number of objects created when writing - ... 
to a Zarr array with random ints or fill value. - ... """ - ... chunks = (8192,) - ... shape = (chunks[0] * 1024,) - ... data = np.random.randint(0, 255, shape) - ... dtype = 'uint8' - ... - ... with TemporaryDirectory() as store: - ... arr = zarr.open(store, - ... shape=shape, - ... chunks=chunks, - ... dtype=dtype, - ... write_empty_chunks=write_empty_chunks, - ... fill_value=0, - ... mode='w') - ... # initialize all chunks - ... arr[:] = 100 - ... result = [] - ... for value in (data, arr.fill_value): - ... start = time.time() - ... arr[:] = value - ... elapsed = time.time() - start - ... result.append((elapsed, arr.nchunks_initialized)) - ... - ... return result - >>> for write_empty_chunks in (True, False): - ... full, empty = timed_write(write_empty_chunks) - ... print(f'\nwrite_empty_chunks={write_empty_chunks}:\n\tRandom Data: {full[0]:.4f}s, {full[1]} objects stored\n\t Empty Data: {empty[0]:.4f}s, {empty[1]} objects stored\n') - - write_empty_chunks=True: - Random Data: 0.1252s, 1024 objects stored - Empty Data: 0.1060s, 1024 objects stored - - - write_empty_chunks=False: - Random Data: 0.1359s, 1024 objects stored - Empty Data: 0.0301s, 0 objects stored - -In this example, writing random data is slightly slower with ``write_empty_chunks=True``, -but writing empty data is substantially faster and generates far fewer objects in storage. - -.. _tutorial_rechunking: - -Changing chunk shapes (rechunking) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Sometimes you are not free to choose the initial chunking of your input data, or -you might have data saved with chunking which is not optimal for the analysis you -have planned. In such cases it can be advantageous to re-chunk the data. For small -datasets, or when the mismatch between input and output chunks is small -such that only a few chunks of the input dataset need to be read to create each -chunk in the output array, it is sufficient to simply copy the data to a new array -with the desired chunking, e.g. :: - - >>> a = zarr.zeros((10000, 10000), chunks=(100,100), dtype='uint16', store='a.zarr') - >>> b = zarr.array(a, chunks=(100, 200), store='b.zarr') - -If the chunk shapes mismatch, however, a simple copy can lead to non-optimal data -access patterns and incur a substantial performance hit when using -file based stores. One of the most pathological examples is -switching from column-based chunking to row-based chunking e.g. :: - - >>> a = zarr.zeros((10000,10000), chunks=(10000, 1), dtype='uint16', store='a.zarr') - >>> b = zarr.array(a, chunks=(1,10000), store='b.zarr') - -which will require every chunk in the input data set to be repeatedly read when creating -each output chunk. If the entire array will fit within memory, this is simply resolved -by forcing the entire input array into memory as a numpy array before converting -back to zarr with the desired chunking. :: - - >>> a = zarr.zeros((10000,10000), chunks=(10000, 1), dtype='uint16', store='a.zarr') - >>> b = a[...] - >>> c = zarr.array(b, chunks=(1,10000), store='c.zarr') - -For data sets which have mismatched chunks and which do not fit in memory, a -more sophisticated approach to rechunking, such as offered by the -`rechunker `_ package and discussed -`here `_ -may offer a substantial improvement in performance. - -.. _tutorial_sync: - -Parallel computing and synchronization --------------------------------------- - -Zarr arrays have been designed for use as the source or sink for data in -parallel computations. 
By data source we mean that multiple concurrent read -operations may occur. By data sink we mean that multiple concurrent write -operations may occur, with each writer updating a different region of the -array. Zarr arrays have **not** been designed for situations where multiple -readers and writers are concurrently operating on the same array. - -Both multi-threaded and multi-process parallelism are possible. The bottleneck -for most storage and retrieval operations is compression/decompression, and the -Python global interpreter lock (GIL) is released wherever possible during these -operations, so Zarr will generally not block other Python threads from running. - -When using a Zarr array as a data sink, some synchronization (locking) may be -required to avoid data loss, depending on how data are being updated. If each -worker in a parallel computation is writing to a separate region of the array, -and if region boundaries are perfectly aligned with chunk boundaries, then no -synchronization is required. However, if region and chunk boundaries are not -perfectly aligned, then synchronization is required to avoid two workers -attempting to modify the same chunk at the same time, which could result in data -loss. - -To give a simple example, consider a 1-dimensional array of length 60, ``z``, -divided into three chunks of 20 elements each. If three workers are running and -each attempts to write to a 20 element region (i.e., ``z[0:20]``, ``z[20:40]`` -and ``z[40:60]``) then each worker will be writing to a separate chunk and no -synchronization is required. However, if two workers are running and each -attempts to write to a 30 element region (i.e., ``z[0:30]`` and ``z[30:60]``) -then it is possible both workers will attempt to modify the middle chunk at the -same time, and synchronization is required to prevent data loss. - -Zarr provides support for chunk-level synchronization. E.g., create an array -with thread synchronization:: - - >>> z = zarr.zeros((10000, 10000), chunks=(1000, 1000), dtype='i4', - ... synchronizer=zarr.ThreadSynchronizer()) - >>> z - - -This array is safe to read or write within a multi-threaded program. - -Zarr also provides support for process synchronization via file locking, -provided that all processes have access to a shared file system, and provided -that the underlying file system supports file locking (which is not the case for -some networked file systems). E.g.:: - - >>> synchronizer = zarr.ProcessSynchronizer('data/example.sync') - >>> z = zarr.open_array('data/example', mode='w', shape=(10000, 10000), - ... chunks=(1000, 1000), dtype='i4', - ... synchronizer=synchronizer) - >>> z - - -This array is safe to read or write from multiple processes. - -When using multiple processes to parallelize reads or writes on arrays using the Blosc -compression library, it may be necessary to set ``numcodecs.blosc.use_threads = False``, -as otherwise Blosc may share incorrect global state amongst processes causing programs -to hang. See also the section on :ref:`tutorial_tips_blosc` below. - -Please note that support for parallel computing is an area of ongoing research -and development. If you are using Zarr for parallel computing, we welcome -feedback, experience, discussion, ideas and advice, particularly about issues -related to data integrity and performance. - -.. _tutorial_pickle: - -Pickle support --------------- - -Zarr arrays and groups can be pickled, as long as the underlying store object can be -pickled. 
Instances of any of the storage classes provided in the :mod:`zarr.storage` -module can be pickled, as can the built-in ``dict`` class which can also be used for -storage. - -Note that if an array or group is backed by an in-memory store like a ``dict`` or -:class:`zarr.storage.MemoryStore`, then when it is pickled all of the store data will be -included in the pickled data. However, if an array or group is backed by a persistent -store like a :class:`zarr.storage.DirectoryStore`, :class:`zarr.storage.ZipStore` or -:class:`zarr.storage.DBMStore` then the store data **are not** pickled. The only thing -that is pickled is the necessary parameters to allow the store to re-open any -underlying files or databases upon being unpickled. - -E.g., pickle/unpickle an in-memory array:: - - >>> import pickle - >>> z1 = zarr.array(np.arange(100000)) - >>> s = pickle.dumps(z1) - >>> len(s) > 5000 # relatively large because data have been pickled - True - >>> z2 = pickle.loads(s) - >>> z1 == z2 - True - >>> np.all(z1[:] == z2[:]) - True - -E.g., pickle/unpickle an array stored on disk:: - - >>> z3 = zarr.open('data/walnuts.zarr', mode='w', shape=100000, dtype='i8') - >>> z3[:] = np.arange(100000) - >>> s = pickle.dumps(z3) - >>> len(s) < 200 # small because no data have been pickled - True - >>> z4 = pickle.loads(s) - >>> z3 == z4 - True - >>> np.all(z3[:] == z4[:]) - True - -.. _tutorial_datetime: - -Datetimes and timedeltas ------------------------- - -NumPy's ``datetime64`` ('M8') and ``timedelta64`` ('m8') dtypes are supported for Zarr -arrays, as long as the units are specified. E.g.:: - - >>> z = zarr.array(['2007-07-13', '2006-01-13', '2010-08-13'], dtype='M8[D]') - >>> z - - >>> z[:] - array(['2007-07-13', '2006-01-13', '2010-08-13'], dtype='datetime64[D]') - >>> z[0] - numpy.datetime64('2007-07-13') - >>> z[0] = '1999-12-31' - >>> z[:] - array(['1999-12-31', '2006-01-13', '2010-08-13'], dtype='datetime64[D]') - -.. _tutorial_tips: - -Usage tips ----------- - -.. _tutorial_tips_copy: - -Copying large arrays -~~~~~~~~~~~~~~~~~~~~ - -Data can be copied between large arrays without needing much memory, e.g.:: - - >>> z1 = zarr.empty((10000, 10000), chunks=(1000, 1000), dtype='i4') - >>> z1[:] = 42 - >>> z2 = zarr.empty_like(z1) - >>> z2[:] = z1 - -Internally the example above works chunk-by-chunk, extracting only the data from -``z1`` required to fill each chunk in ``z2``. The source of the data (``z1``) -could equally be an h5py Dataset. - -.. _tutorial_tips_blosc: - -Configuring Blosc -~~~~~~~~~~~~~~~~~ - -The Blosc compressor is able to use multiple threads internally to accelerate -compression and decompression. By default, Blosc uses up to 8 -internal threads. The number of Blosc threads can be changed to increase or -decrease this number, e.g.:: - - >>> from numcodecs import blosc - >>> blosc.set_nthreads(2) # doctest: +SKIP - 8 - -When a Zarr array is being used within a multi-threaded program, Zarr -automatically switches to using Blosc in a single-threaded -"contextual" mode. This is generally better as it allows multiple -program threads to use Blosc simultaneously and prevents CPU thrashing -from too many active threads. If you want to manually override this -behaviour, set the value of the ``blosc.use_threads`` variable to -``True`` (Blosc always uses multiple internal threads) or ``False`` -(Blosc always runs in single-threaded contextual mode). To re-enable -automatic switching, set ``blosc.use_threads`` to ``None``. 
- -Please note that if Zarr is being used within a multi-process program, Blosc may not -be safe to use in multi-threaded mode and may cause the program to hang. If using Blosc -in a multi-process program then it is recommended to set ``blosc.use_threads = False``. diff --git a/docs/user-guide/arrays.rst b/docs/user-guide/arrays.rst new file mode 100644 index 0000000000..b21f8e976c --- /dev/null +++ b/docs/user-guide/arrays.rst @@ -0,0 +1,612 @@ +.. only:: doctest + + >>> import shutil + >>> shutil.rmtree('data', ignore_errors=True) + +.. _user-guide-arrays: + +Working with arrays +=================== + +Creating an array +----------------- + +Zarr has several functions for creating arrays. For example:: + + >>> import zarr + >>> store = zarr.storage.MemoryStore() + >>> z = zarr.create_array(store=store, shape=(10000, 10000), chunks=(1000, 1000), dtype='int32') + >>> z + + +The code above creates a 2-dimensional array of 32-bit integers with 10000 rows +and 10000 columns, divided into chunks where each chunk has 1000 rows and 1000 +columns (and so there will be 100 chunks in total). The data is written to a +:class:`zarr.storage.MemoryStore` (e.g. an in-memory dict). See +:ref:`user-guide-persist` for details on storing arrays in other stores. + +For a complete list of array creation routines see the :mod:`zarr` +module documentation. + +.. _user-guide-array: + +Reading and writing data +------------------------ + +Zarr arrays support a similar interface to `NumPy `_ +arrays for reading and writing data. For example, the entire array can be filled +with a scalar value:: + + >>> z[:] = 42 + +Regions of the array can also be written to, e.g.:: + + >>> import numpy as np + >>> + >>> z[0, :] = np.arange(10000) + >>> z[:, 0] = np.arange(10000) + +The contents of the array can be retrieved by slicing, which will load the +requested region into memory as a NumPy array, e.g.:: + + >>> z[0, 0] + array(0, dtype=int32) + >>> z[-1, -1] + array(42, dtype=int32) + >>> z[0, :] + array([ 0, 1, 2, ..., 9997, 9998, 9999], + shape=(10000,), dtype=int32) + >>> z[:, 0] + array([ 0, 1, 2, ..., 9997, 9998, 9999], + shape=(10000,), dtype=int32) + >>> z[:] + array([[ 0, 1, 2, ..., 9997, 9998, 9999], + [ 1, 42, 42, ..., 42, 42, 42], + [ 2, 42, 42, ..., 42, 42, 42], + ..., + [9997, 42, 42, ..., 42, 42, 42], + [9998, 42, 42, ..., 42, 42, 42], + [9999, 42, 42, ..., 42, 42, 42]], + shape=(10000, 10000), dtype=int32) + +More information about NumPy-style indexing can be found in the +`NumPy documentation `_. + +.. _user-guide-persist: + +Persistent arrays +----------------- + +In the examples above, compressed data for each chunk of the array was stored in +main memory. Zarr arrays can also be stored on a file system, enabling +persistence of data between sessions. To do this, we can change the store +argument to point to a filesystem path:: + + >>> z1 = zarr.create_array(store='data/example-1.zarr', shape=(10000, 10000), chunks=(1000, 1000), dtype='int32') + +The array above will store its configuration metadata and all compressed chunk +data in a directory called ``'data/example-1.zarr'`` relative to the current working +directory. The :func:`zarr.create_array` function provides a convenient way +to create a new persistent array or continue working with an existing +array. Note that there is no need to close an array: data are automatically +flushed to disk, and files are automatically closed whenever an array is modified.
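+ +If you prefer to manage the store object yourself, an explicit store instance can be passed in place of the path string. A minimal sketch, assuming a new hypothetical directory ``'data/example-1a.zarr'`` (see the :ref:`user-guide-storage` guide for details on the store classes): + +.. code-block:: python + + import zarr + + # Hypothetical path; equivalent to passing the string directly via store= + store = zarr.storage.LocalStore('data/example-1a.zarr') + z = zarr.create_array(store=store, shape=(10000, 10000), chunks=(1000, 1000), dtype='int32')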
+ +Persistent arrays support the same interface for reading and writing data, +e.g.:: + + >>> z1[:] = 42 + >>> z1[0, :] = np.arange(10000) + >>> z1[:, 0] = np.arange(10000) + +Check that the data have been written and can be read again:: + + >>> z2 = zarr.open_array('data/example-1.zarr', mode='r') + >>> np.all(z1[:] == z2[:]) + np.True_ + +If you are just looking for a fast and convenient way to save NumPy arrays to +disk then load back into memory later, the functions +:func:`zarr.save` and :func:`zarr.load` may be +useful. E.g.:: + + >>> a = np.arange(10) + >>> zarr.save('data/example-2.zarr', a) + >>> zarr.load('data/example-2.zarr') + array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + +Please note that there are a number of other options for persistent array +storage, see the :ref:`Storage Guide ` for more details. + +.. _user-guide-resize: + +Resizing and appending +---------------------- + +A Zarr array can be resized, which means that any of its dimensions can be +increased or decreased in length. For example:: + + >>> z = zarr.create_array(store='data/example-3.zarr', shape=(10000, 10000), dtype='int32', chunks=(1000, 1000)) + >>> z[:] = 42 + >>> z.shape + (10000, 10000) + >>> z.resize((20000, 10000)) + >>> z.shape + (20000, 10000) + +Note that when an array is resized, the underlying data are not rearranged in +any way. If one or more dimensions are shrunk, any chunks falling outside the +new array shape will be deleted from the underlying store. + +:func:`zarr.Array.append` is provided as a convenience function, which can be +used to append data to any axis. E.g.:: + + >>> a = np.arange(10000000, dtype='int32').reshape(10000, 1000) + >>> z = zarr.create_array(store='data/example-4.zarr', shape=a.shape, dtype=a.dtype, chunks=(1000, 100)) + >>> z[:] = a + >>> z.shape + (10000, 1000) + >>> z.append(a) + (20000, 1000) + >>> z.append(np.vstack([a, a]), axis=1) + (20000, 2000) + >>> z.shape + (20000, 2000) + +.. _user-guide-compress: + +Compressors +----------- + +A number of different compressors can be used with Zarr. Zarr includes Blosc, +Zstandard and Gzip compressors. Additional compressors are available through +a separate package called NumCodecs_ which provides various +compressor libraries including LZ4, Zlib, BZ2 and LZMA. +Different compressors can be provided via the ``compressors`` keyword +argument accepted by all array creation functions. For example:: + + >>> compressors = zarr.codecs.BloscCodec(cname='zstd', clevel=3, shuffle=zarr.codecs.BloscShuffle.bitshuffle) + >>> data = np.arange(100000000, dtype='int32').reshape(10000, 10000) + >>> z = zarr.create_array(store='data/example-5.zarr', shape=data.shape, dtype=data.dtype, chunks=(1000, 1000), compressors=compressors) + >>> z[:] = data + >>> z.metadata.codecs + [BytesCodec(endian=), BloscCodec(typesize=4, cname=, clevel=3, shuffle=, blocksize=0)] + +The array above will use Blosc as the primary compressor, using the Zstandard +algorithm (compression level 3) internally within Blosc, and with the +bit-shuffle filter applied. + +When using a compressor, it can be useful to get some diagnostics on the +compression ratio. Zarr arrays provide the :attr:`zarr.Array.info` property +which can be used to print useful diagnostics, e.g.:: + + >>> z.info + Type : Array + Zarr format : 3 + Data type : DataType.int32 + Shape : (10000, 10000) + Chunk shape : (1000, 1000) + Order : C + Read-only : False + Store type : LocalStore + Codecs : [{'endian': }, {'typesize': 4, 'cname': , 'clevel': 3, 'shuffle': , 'blocksize': 0}] + No.
bytes : 400000000 (381.5M) + +The :func:`zarr.Array.info_complete` method inspects the underlying store and +prints additional diagnostics, e.g.:: + + >>> z.info_complete() + Type : Array + Zarr format : 3 + Data type : DataType.int32 + Shape : (10000, 10000) + Chunk shape : (1000, 1000) + Order : C + Read-only : False + Store type : LocalStore + Codecs : [{'endian': }, {'typesize': 4, 'cname': , 'clevel': 3, 'shuffle': , 'blocksize': 0}] + No. bytes : 400000000 (381.5M) + No. bytes stored : 9696302 + Storage ratio : 41.3 + Chunks Initialized : 100 + +.. note:: + :func:`zarr.Array.info_complete` will inspect the underlying store and may + be slow for large arrays. Use :attr:`zarr.Array.info` if detailed storage + statistics are not needed. + +If you don't specify a compressor, by default Zarr uses the Zstandard +compressor. + +In addition to Blosc and Zstandard, other compression libraries can also be used. For example, +here is an array using Gzip compression, level 1:: + + >>> data = np.arange(100000000, dtype='int32').reshape(10000, 10000) + >>> z = zarr.create_array(store='data/example-6.zarr', shape=data.shape, dtype=data.dtype, chunks=(1000, 1000), compressors=zarr.codecs.GzipCodec(level=1)) + >>> z[:] = data + >>> z.metadata.codecs + [BytesCodec(endian=), GzipCodec(level=1)] + +Here is an example using LZMA from NumCodecs_ with a custom filter pipeline including LZMA's +built-in delta filter:: + + >>> import lzma + >>> from numcodecs.zarr3 import LZMA + >>> + >>> lzma_filters = [dict(id=lzma.FILTER_DELTA, dist=4), dict(id=lzma.FILTER_LZMA2, preset=1)] + >>> compressors = LZMA(filters=lzma_filters) + >>> data = np.arange(100000000, dtype='int32').reshape(10000, 10000) + >>> z = zarr.create_array(store='data/example-7.zarr', shape=data.shape, dtype=data.dtype, chunks=(1000, 1000), compressors=compressors) + >>> z.metadata.codecs + [BytesCodec(endian=), _make_bytes_bytes_codec.._Codec(codec_name='numcodecs.lzma', codec_config={'id': 'lzma', 'filters': [{'id': 3, 'dist': 4}, {'id': 33, 'preset': 1}]})] + +The default compressor can be changed using Zarr's +:ref:`user-guide-config`, e.g.:: + + >>> with zarr.config.set({'array.v2_default_compressor.numeric': {'id': 'blosc'}}): + ... z = zarr.create_array(store={}, shape=(100000000,), chunks=(1000000,), dtype='int32', zarr_format=2) + >>> z.metadata.filters + >>> z.metadata.compressor + Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0) + +To disable compression, set ``compressors=None`` when creating an array, e.g.:: + + >>> z = zarr.create_array(store='data/example-8.zarr', shape=(100000000,), chunks=(1000000,), dtype='int32', compressors=None) + >>> z.metadata.codecs + [BytesCodec(endian=)] + +.. _user-guide-filters: + +Filters +------- + +In some cases, compression can be improved by transforming the data in some +way. For example, if nearby values tend to be correlated, then shuffling the +bytes within each numerical value or storing the difference between adjacent +values may increase compression ratio. Some compressors provide built-in filters +that apply transformations to the data prior to compression. For example, the +Blosc compressor has built-in implementations of byte- and bit-shuffle filters, +and the LZMA compressor has a built-in implementation of a delta +filter. However, to provide additional flexibility for implementing and using +filters in combination with different compressors, Zarr also provides a +mechanism for configuring filters outside of the primary compressor.
+ +Here is an example using a delta filter with the Blosc compressor:: + + >>> from numcodecs.zarr3 import Delta + >>> + >>> filters = [Delta(dtype='int32')] + >>> compressors = zarr.codecs.BloscCodec(cname='zstd', clevel=1, shuffle=zarr.codecs.BloscShuffle.shuffle) + >>> data = np.arange(100000000, dtype='int32').reshape(10000, 10000) + >>> z = zarr.create_array(store='data/example-9.zarr', shape=data.shape, dtype=data.dtype, chunks=(1000, 1000), filters=filters, compressors=compressors) + >>> z.info + Type : Array + Zarr format : 3 + Data type : DataType.int32 + Shape : (10000, 10000) + Chunk shape : (1000, 1000) + Order : C + Read-only : False + Store type : LocalStore + Codecs : [{'codec_name': 'numcodecs.delta', 'codec_config': {'id': 'delta', 'dtype': 'int32'}}, {'endian': }, {'typesize': 4, 'cname': , 'clevel': 1, 'shuffle': , 'blocksize': 0}] + No. bytes : 400000000 (381.5M) + +For more information about available filter codecs, see the `Numcodecs +`_ documentation. + +.. _user-guide-indexing: + +Advanced indexing +----------------- + +Zarr arrays support several methods for advanced or "fancy" +indexing, which enable a subset of data items to be extracted or updated in an +array without loading the entire array into memory. + +Note that although this functionality is similar to some of the advanced +indexing capabilities available on NumPy arrays and on h5py datasets, **the Zarr +API for advanced indexing is different from both NumPy and h5py**, so please +read this section carefully. For a complete description of the indexing API, +see the documentation for the :class:`zarr.Array` class. + +Indexing with coordinate arrays +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Items from a Zarr array can be extracted by providing an integer array of +coordinates. E.g.:: + + >>> data = np.arange(10) ** 2 + >>> z = zarr.create_array(store='data/example-10.zarr', shape=data.shape, dtype=data.dtype) + >>> z[:] = data + >>> z[:] + array([ 0, 1, 4, 9, 16, 25, 36, 49, 64, 81]) + >>> z.get_coordinate_selection([2, 5]) + array([ 4, 25]) + +Coordinate arrays can also be used to update data, e.g.:: + + >>> z.set_coordinate_selection([2, 5], [-1, -2]) + >>> z[:] + array([ 0, 1, -1, 9, 16, -2, 36, 49, 64, 81]) + +For multidimensional arrays, coordinates must be provided for each dimension, +e.g.:: + + >>> data = np.arange(15).reshape(3, 5) + >>> z = zarr.create_array(store='data/example-11.zarr', shape=data.shape, dtype=data.dtype) + >>> z[:] = data + >>> z[:] + array([[ 0, 1, 2, 3, 4], + [ 5, 6, 7, 8, 9], + [10, 11, 12, 13, 14]]) + >>> z.get_coordinate_selection(([0, 2], [1, 3])) + array([ 1, 13]) + >>> z.set_coordinate_selection(([0, 2], [1, 3]), [-1, -2]) + >>> z[:] + array([[ 0, -1, 2, 3, 4], + [ 5, 6, 7, 8, 9], + [10, 11, 12, -2, 14]]) + +For convenience, coordinate indexing is also available via the ``vindex`` +property, as well as the square bracket operator, e.g.:: + + >>> z.vindex[[0, 2], [1, 3]] + array([-1, -2]) + >>> z.vindex[[0, 2], [1, 3]] = [-3, -4] + >>> z[:] + array([[ 0, -3, 2, 3, 4], + [ 5, 6, 7, 8, 9], + [10, 11, 12, -4, 14]]) + >>> z[[0, 2], [1, 3]] + array([-3, -4]) + +When the indexing arrays have different shapes, they are broadcast together. +That is, the following two calls are equivalent:: + + >>> z[1, [1, 3]] + array([6, 8]) + >>> z[[1, 1], [1, 3]] + array([6, 8]) + +Indexing with a mask array +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Items can also be extracted by providing a Boolean mask. 
E.g.:: + + >>> data = np.arange(10) ** 2 + >>> z = zarr.create_array(store='data/example-12.zarr', shape=data.shape, dtype=data.dtype) + >>> z[:] = data + >>> z[:] + array([ 0, 1, 4, 9, 16, 25, 36, 49, 64, 81]) + >>> sel = np.zeros_like(z, dtype=bool) + >>> sel[2] = True + >>> sel[5] = True + >>> z.get_mask_selection(sel) + array([ 4, 25]) + >>> z.set_mask_selection(sel, [-1, -2]) + >>> z[:] + array([ 0, 1, -1, 9, 16, -2, 36, 49, 64, 81]) + +Here's a multidimensional example:: + + >>> data = np.arange(15).reshape(3, 5) + >>> z = zarr.create_array(store='data/example-13.zarr', shape=data.shape, dtype=data.dtype) + >>> z[:] = data + >>> z[:] + array([[ 0, 1, 2, 3, 4], + [ 5, 6, 7, 8, 9], + [10, 11, 12, 13, 14]]) + >>> sel = np.zeros_like(z, dtype=bool) + >>> sel[0, 1] = True + >>> sel[2, 3] = True + >>> z.get_mask_selection(sel) + array([ 1, 13]) + >>> z.set_mask_selection(sel, [-1, -2]) + >>> z[:] + array([[ 0, -1, 2, 3, 4], + [ 5, 6, 7, 8, 9], + [10, 11, 12, -2, 14]]) + +For convenience, mask indexing is also available via the ``vindex`` property, +e.g.:: + + >>> z.vindex[sel] + array([-1, -2]) + >>> z.vindex[sel] = [-3, -4] + >>> z[:] + array([[ 0, -3, 2, 3, 4], + [ 5, 6, 7, 8, 9], + [10, 11, 12, -4, 14]]) + +Mask indexing is conceptually the same as coordinate indexing, and is +implemented internally via the same machinery. Both styles of indexing allow +selecting arbitrary items from an array, also known as point selection. + +Orthogonal indexing +~~~~~~~~~~~~~~~~~~~ + +Zarr arrays also support methods for orthogonal indexing, which allows +selections to be made along each dimension of an array independently. For +example, this allows selecting a subset of rows and/or columns from a +2-dimensional array. E.g.:: + + >>> data = np.arange(15).reshape(3, 5) + >>> z = zarr.create_array(store='data/example-14.zarr', shape=data.shape, dtype=data.dtype) + >>> z[:] = data + >>> z[:] + array([[ 0, 1, 2, 3, 4], + [ 5, 6, 7, 8, 9], + [10, 11, 12, 13, 14]]) + >>> z.get_orthogonal_selection(([0, 2], slice(None))) # select first and third rows + array([[ 0, 1, 2, 3, 4], + [10, 11, 12, 13, 14]]) + >>> z.get_orthogonal_selection((slice(None), [1, 3])) # select second and fourth columns + array([[ 1, 3], + [ 6, 8], + [11, 13]]) + >>> z.get_orthogonal_selection(([0, 2], [1, 3])) # select rows [0, 2] and columns [1, 3] + array([[ 1, 3], + [11, 13]]) + +Data can also be modified, e.g.:: + + >>> z.set_orthogonal_selection(([0, 2], [1, 3]), [[-1, -2], [-3, -4]]) + +For convenience, the orthogonal indexing functionality is also available via the +``oindex`` property, e.g.:: + + >>> data = np.arange(15).reshape(3, 5) + >>> z = zarr.create_array(store='data/example-15.zarr', shape=data.shape, dtype=data.dtype) + >>> z[:] = data + >>> z.oindex[[0, 2], :] # select first and third rows + array([[ 0, 1, 2, 3, 4], + [10, 11, 12, 13, 14]]) + >>> z.oindex[:, [1, 3]] # select second and fourth columns + array([[ 1, 3], + [ 6, 8], + [11, 13]]) + >>> z.oindex[[0, 2], [1, 3]] # select rows [0, 2] and columns [1, 3] + array([[ 1, 3], + [11, 13]]) + >>> z.oindex[[0, 2], [1, 3]] = [[-1, -2], [-3, -4]] + >>> z[:] + array([[ 0, -1, 2, -2, 4], + [ 5, 6, 7, 8, 9], + [10, -3, 12, -4, 14]]) + +Any combination of integer, slice, 1D integer array and/or 1D Boolean array can +be used for orthogonal indexing.
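+ +For example, a 1D Boolean array can be combined with a slice or with an integer array. A brief sketch (shown as plain code rather than a tested example), reusing the ``z`` array from the example above: + +.. code-block:: python + + import numpy as np + + # Boolean mask along the first dimension, slice along the second + rows = np.array([True, False, True]) + z.oindex[rows, 1:4] # rows 0 and 2, columns 1 through 3 + + # Integer array along the first dimension, Boolean mask along the second + cols = np.array([False, True, False, True, False]) + z.oindex[[0, 2], cols] # rows 0 and 2, columns 1 and 3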
+ +If the index contains at most one iterable, and otherwise contains only slices and integers, +orthogonal indexing is also available directly on the array:: + + >>> data = np.arange(15).reshape(3, 5) + >>> z = zarr.create_array(store='data/example-16.zarr', shape=data.shape, dtype=data.dtype) + >>> z[:] = data + >>> np.all(z.oindex[[0, 2], :] == z[[0, 2], :]) + np.True_ + +Block Indexing +~~~~~~~~~~~~~~ + +Zarr also supports block indexing, which allows selections of whole chunks based on their +logical indices along each dimension of an array. For example, this allows selecting +a subset of chunk-aligned rows and/or columns from a 2-dimensional array. E.g.:: + + >>> data = np.arange(100).reshape(10, 10) + >>> z = zarr.create_array(store='data/example-17.zarr', shape=data.shape, dtype=data.dtype, chunks=(3, 3)) + >>> z[:] = data + +Retrieve items by specifying their block coordinates:: + + >>> z.get_block_selection(1) + array([[30, 31, 32, 33, 34, 35, 36, 37, 38, 39], + [40, 41, 42, 43, 44, 45, 46, 47, 48, 49], + [50, 51, 52, 53, 54, 55, 56, 57, 58, 59]]) + +Equivalent slicing:: + + >>> z[3:6] + array([[30, 31, 32, 33, 34, 35, 36, 37, 38, 39], + [40, 41, 42, 43, 44, 45, 46, 47, 48, 49], + [50, 51, 52, 53, 54, 55, 56, 57, 58, 59]]) + +For convenience, the block selection functionality is also available via the +`blocks` property, e.g.:: + + >>> z.blocks[1] + array([[30, 31, 32, 33, 34, 35, 36, 37, 38, 39], + [40, 41, 42, 43, 44, 45, 46, 47, 48, 49], + [50, 51, 52, 53, 54, 55, 56, 57, 58, 59]]) + +Block index arrays may be multidimensional to index multidimensional arrays. +For example:: + + >>> z.blocks[0, 1:3] + array([[ 3, 4, 5, 6, 7, 8], + [13, 14, 15, 16, 17, 18], + [23, 24, 25, 26, 27, 28]]) + +Data can also be modified. Let's start with a simple 2D array:: + + >>> z = zarr.create_array(store='data/example-18.zarr', shape=(6, 6), dtype=int, chunks=(2, 2)) + +Set data for a selection of items:: + + >>> z.set_block_selection((1, 0), 1) + >>> z[...] + array([[0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0]]) + +For convenience, this functionality is also available via the ``blocks`` property. +E.g.:: + + >>> z.blocks[:, 2] = 7 + >>> z[...] + array([[0, 0, 0, 0, 7, 7], + [0, 0, 0, 0, 7, 7], + [1, 1, 0, 0, 7, 7], + [1, 1, 0, 0, 7, 7], + [0, 0, 0, 0, 7, 7], + [0, 0, 0, 0, 7, 7]]) + +Any combination of integer and slice can be used for block indexing:: + + >>> z.blocks[2, 1:3] + array([[0, 0, 7, 7], + [0, 0, 7, 7]]) + >>> + >>> root = zarr.create_group('data/example-19.zarr') + >>> foo = root.create_array(name='foo', shape=(1000, 100), chunks=(10, 10), dtype='float32') + >>> bar = root.create_array(name='foo/bar', shape=(100,), dtype='int32') + >>> foo[:, :] = np.random.random((1000, 100)) + >>> bar[:] = np.arange(100) + >>> root.tree() + / + └── foo (1000, 100) float32 + + +.. _user-guide-sharding: + +Sharding +-------- + +Coming soon. + + +Missing features in 3.0 +----------------------- + + +The following features have not been ported to 3.0 yet. + +.. _user-guide-objects: + +Object arrays +~~~~~~~~~~~~~ + +See the Zarr-Python 2 documentation on `Object arrays `_ for more details. + +.. _user-guide-strings: + +Fixed-length string arrays +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +See the Zarr-Python 2 documentation on `Fixed-length string arrays `_ for more details. + +..
_user-guide-datetime: + +Datetime and Timedelta arrays +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +See the Zarr-Python 2 documentation on `Datetime and Timedelta `_ for more details. + +.. _user-guide-copy: + +Copying and migrating data +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +See the Zarr-Python 2 documentation on `Copying and migrating data `_ for more details. diff --git a/docs/user-guide/attributes.rst b/docs/user-guide/attributes.rst new file mode 100644 index 0000000000..ed48623e29 --- /dev/null +++ b/docs/user-guide/attributes.rst @@ -0,0 +1,30 @@ +.. _user-guide-attrs: + +Working with attributes +======================= + +Zarr arrays and groups support custom key/value attributes, which can be useful for +storing application-specific metadata. For example:: + + >>> import zarr + >>> store = zarr.storage.MemoryStore() + >>> root = zarr.create_group(store=store) + >>> root.attrs['foo'] = 'bar' + >>> z = root.create_array(name='zzz', shape=(10000, 10000), dtype='int32') + >>> z.attrs['baz'] = 42 + >>> z.attrs['qux'] = [1, 4, 7, 12] + >>> sorted(root.attrs) + ['foo'] + >>> 'foo' in root.attrs + True + >>> root.attrs['foo'] + 'bar' + >>> sorted(z.attrs) + ['baz', 'qux'] + >>> z.attrs['baz'] + 42 + >>> z.attrs['qux'] + [1, 4, 7, 12] + +Internally Zarr uses JSON to store array attributes, so attribute values must be +JSON serializable. diff --git a/docs/user-guide/config.rst b/docs/user-guide/config.rst new file mode 100644 index 0000000000..927d493e95 --- /dev/null +++ b/docs/user-guide/config.rst @@ -0,0 +1,88 @@ +.. _user-guide-config: + +Runtime configuration +===================== + +The :mod:`zarr.core.config` module is responsible for managing the configuration of zarr +and is based on the `donfig `_ Python library. + +Configuration values can be set using code like the following:: + + >>> import zarr + >>> + >>> zarr.config.set({'array.order': 'F'}) + + >>> + >>> # revert this change so it doesn't impact the rest of the docs + >>> zarr.config.set({'array.order': 'C'}) + + +Alternatively, configuration values can be set using environment variables, e.g. +``ZARR_ARRAY__ORDER=F``. + +The configuration can also be read from a YAML file in standard locations. +For more information, see the +`donfig documentation `_. + +Configuration options include the following: + +- Default Zarr format ``default_zarr_format`` +- Default array order in memory ``array.order`` +- Default codecs ``array.v3_default_codecs`` and ``array.v2_default_compressor`` +- Whether empty chunks are written to storage ``array.write_empty_chunks`` +- Async and threading options, e.g. ``async.concurrency`` and ``threading.max_workers`` +- Selections of implementations of codecs, codec pipelines and buffers + +For selecting custom implementations of codecs, pipelines, buffers and ndbuffers, +first register the implementations in the registry and then select them in the config. +For example, an implementation of the bytes codec in a class ``'custompackage.NewBytesCodec'`` +requires the value of ``codecs.bytes.name`` to be ``'custompackage.NewBytesCodec'``.
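+ +A rough sketch of what this looks like in code, assuming the hypothetical ``custompackage.NewBytesCodec`` above (the exact key layout can be checked against the ``codecs`` section of the default configuration shown below): + +.. code-block:: python + + import zarr + + # Point the registry entry for the "bytes" codec at a custom class path. + # 'custompackage.NewBytesCodec' is a placeholder, not a real package. + zarr.config.set({'codecs.bytes': 'custompackage.NewBytesCodec'})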
+ +This is the current default configuration:: + + >>> zarr.config.pprint() + {'array': {'order': 'C', + 'v2_default_compressor': {'bytes': {'checksum': False, + 'id': 'zstd', + 'level': 0}, + 'numeric': {'checksum': False, + 'id': 'zstd', + 'level': 0}, + 'string': {'checksum': False, + 'id': 'zstd', + 'level': 0}}, + 'v2_default_filters': {'bytes': [{'id': 'vlen-bytes'}], + 'numeric': None, + 'string': [{'id': 'vlen-utf8'}]}, + 'v3_default_codecs': {'bytes': [{'name': 'vlen-bytes'}, + {'configuration': {'checksum': False, + 'level': 0}, + 'name': 'zstd'}], + 'numeric': [{'configuration': {'endian': 'little'}, + 'name': 'bytes'}, + {'configuration': {'checksum': False, + 'level': 0}, + 'name': 'zstd'}], + 'string': [{'name': 'vlen-utf8'}, + {'configuration': {'checksum': False, + 'level': 0}, + 'name': 'zstd'}]}, + 'write_empty_chunks': False}, + 'async': {'concurrency': 10, 'timeout': None}, + 'buffer': 'zarr.core.buffer.cpu.Buffer', + 'codec_pipeline': {'batch_size': 1, + 'path': 'zarr.core.codec_pipeline.BatchedCodecPipeline'}, + 'codecs': {'blosc': 'zarr.codecs.blosc.BloscCodec', + 'bytes': 'zarr.codecs.bytes.BytesCodec', + 'crc32c': 'zarr.codecs.crc32c_.Crc32cCodec', + 'endian': 'zarr.codecs.bytes.BytesCodec', + 'gzip': 'zarr.codecs.gzip.GzipCodec', + 'sharding_indexed': 'zarr.codecs.sharding.ShardingCodec', + 'transpose': 'zarr.codecs.transpose.TransposeCodec', + 'vlen-bytes': 'zarr.codecs.vlen_utf8.VLenBytesCodec', + 'vlen-utf8': 'zarr.codecs.vlen_utf8.VLenUTF8Codec', + 'zstd': 'zarr.codecs.zstd.ZstdCodec'}, + 'default_zarr_format': 3, + 'json_indent': 2, + 'ndbuffer': 'zarr.core.buffer.cpu.NDBuffer', + 'threading': {'max_workers': None}} diff --git a/docs/user-guide/consolidated_metadata.rst b/docs/user-guide/consolidated_metadata.rst new file mode 100644 index 0000000000..d6b7a55de7 --- /dev/null +++ b/docs/user-guide/consolidated_metadata.rst @@ -0,0 +1,119 @@ +.. only:: doctest + + >>> from pprint import pprint + +.. _user-guide-consolidated-metadata: + +Consolidated metadata +===================== + +.. warning:: + The Consolidated Metadata feature in Zarr-Python is considered experimental for v3 + stores. `zarr-specs#309 `_ + has proposed a formal extension to the v3 specification to support consolidated metadata. + +Zarr-Python implements the `Consolidated Metadata`_ for v2 and v3 stores. +Consolidated metadata can reduce the time needed to load the metadata for an +entire hierarchy, especially when the metadata is being served over a network. +Consolidated metadata essentially stores all the metadata for a hierarchy in the +metadata of the root Group. + +Usage +----- + +If consolidated metadata is present in a Zarr Group's metadata then it is used +by default. The initial read to open the group will need to communicate with +the store (reading from a file for a :class:`zarr.storage.LocalStore`, making a +network request for a :class:`zarr.storage.FsspecStore`). After that, any subsequent +metadata reads get child Group or Array nodes will *not* require reads from the store. + +In Python, the consolidated metadata is available on the ``.consolidated_metadata`` +attribute of the ``GroupMetadata`` object. 
+ + >>> import zarr + >>> + >>> store = zarr.storage.MemoryStore() + >>> group = zarr.create_group(store=store) + >>> group.create_array(shape=(1,), name='a', dtype='float64') + + >>> group.create_array(shape=(2, 2), name='b', dtype='float64') + + >>> group.create_array(shape=(3, 3, 3), name='c', dtype='float64') + + >>> zarr.consolidate_metadata(store) + + +If we open that group, the Group's metadata has a :class:`zarr.core.group.ConsolidatedMetadata` +that can be used.: + + >>> consolidated = zarr.open_group(store=store) + >>> consolidated_metadata = consolidated.metadata.consolidated_metadata.metadata + >>> pprint(dict(sorted(consolidated_metadata.items()))) + {'a': ArrayV3Metadata(shape=(1,), + data_type=, + chunk_grid=RegularChunkGrid(chunk_shape=(1,)), + chunk_key_encoding=DefaultChunkKeyEncoding(name='default', + separator='/'), + fill_value=np.float64(0.0), + codecs=[BytesCodec(endian=), + ZstdCodec(level=0, checksum=False)], + attributes={}, + dimension_names=None, + zarr_format=3, + node_type='array', + storage_transformers=()), + 'b': ArrayV3Metadata(shape=(2, 2), + data_type=, + chunk_grid=RegularChunkGrid(chunk_shape=(2, 2)), + chunk_key_encoding=DefaultChunkKeyEncoding(name='default', + separator='/'), + fill_value=np.float64(0.0), + codecs=[BytesCodec(endian=), + ZstdCodec(level=0, checksum=False)], + attributes={}, + dimension_names=None, + zarr_format=3, + node_type='array', + storage_transformers=()), + 'c': ArrayV3Metadata(shape=(3, 3, 3), + data_type=, + chunk_grid=RegularChunkGrid(chunk_shape=(3, 3, 3)), + chunk_key_encoding=DefaultChunkKeyEncoding(name='default', + separator='/'), + fill_value=np.float64(0.0), + codecs=[BytesCodec(endian=), + ZstdCodec(level=0, checksum=False)], + attributes={}, + dimension_names=None, + zarr_format=3, + node_type='array', + storage_transformers=())} + +Operations on the group to get children automatically use the consolidated metadata.: + + >>> consolidated['a'] # no read / HTTP request to the Store is required + + +With nested groups, the consolidated metadata is available on the children, recursively.: + + >>> child = group.create_group('child', attributes={'kind': 'child'}) + >>> grandchild = child.create_group('child', attributes={'kind': 'grandchild'}) + >>> consolidated = zarr.consolidate_metadata(store) + >>> + >>> consolidated['child'].metadata.consolidated_metadata + ConsolidatedMetadata(metadata={'child': GroupMetadata(attributes={'kind': 'grandchild'}, zarr_format=3, consolidated_metadata=ConsolidatedMetadata(metadata={}, kind='inline', must_understand=False), node_type='group')}, kind='inline', must_understand=False) + +Synchronization and Concurrency +------------------------------- + +Consolidated metadata is intended for read-heavy use cases on slowly changing +hierarchies. For hierarchies where new nodes are constantly being added, +removed, or modified, consolidated metadata may not be desirable. + +1. It will add some overhead to each update operation, since the metadata + would need to be re-consolidated to keep it in sync with the store. +2. Readers using consolidated metadata will regularly see a "past" version + of the metadata, at the time they read the root node with its consolidated + metadata. + +.. 
_Consolidated Metadata: https://github.com/zarr-developers/zarr-specs/pull/309 diff --git a/docs/user-guide/extending.rst b/docs/user-guide/extending.rst new file mode 100644 index 0000000000..405dcb92c0 --- /dev/null +++ b/docs/user-guide/extending.rst @@ -0,0 +1,91 @@ + +Extending Zarr +============== + +Zarr-Python 3 was designed to be extensible. This means that you can extend +the library by writing custom classes and plugins. Currently, Zarr can be extended +in the following ways: + +Custom codecs +------------- + +.. note:: + This section explains how custom codecs can be created for Zarr version 3 data. For Zarr + version 2, codecs should subclass the + `numcodecs.abc.Codec `_ + base class and register through + `numcodecs.registry.register_codec `_. + +There are three types of codecs in Zarr: + +- array-to-array +- array-to-bytes +- bytes-to-bytes + +Array-to-array codecs are used to transform the array data before serializing +to bytes. Examples include delta encoding or scaling codecs. Array-to-bytes codecs are used +for serializing the array data to bytes. In Zarr, the main codec to use for numeric arrays +is the :class:`zarr.codecs.BytesCodec`. Bytes-to-bytes codecs transform the serialized bytestreams +of the array data. Examples include compression codecs, such as +:class:`zarr.codecs.GzipCodec`, :class:`zarr.codecs.BloscCodec` or +:class:`zarr.codecs.ZstdCodec`, and codecs that add a checksum to the bytestream, such as +:class:`zarr.codecs.Crc32cCodec`. + +Custom codecs for Zarr are implemented by subclassing the relevant base class, see +:class:`zarr.abc.codec.ArrayArrayCodec`, :class:`zarr.abc.codec.ArrayBytesCodec` and +:class:`zarr.abc.codec.BytesBytesCodec`. Most custom codecs should implement the +``_encode_single`` and ``_decode_single`` methods. These methods operate on single chunks +of the array data. Alternatively, custom codecs can implement the ``encode`` and ``decode`` +methods, which operate on batches of chunks, in case the codec is intended to implement +its own batch processing. + +Custom codecs should also implement the following methods: + +- ``compute_encoded_size``, which returns the byte size of the encoded data given the byte + size of the original data. It should raise ``NotImplementedError`` for codecs with + variable-sized outputs, such as compression codecs. +- ``validate`` (optional), which can be used to check that the codec metadata is compatible with the + array metadata. It should raise errors if not. +- ``resolve_metadata`` (optional), which is important for codecs that change the shape, + dtype or fill value of a chunk. +- ``evolve_from_array_spec`` (optional), which can be useful for automatically filling in + codec configuration metadata from the array metadata. + +To use custom codecs in Zarr, they need to be registered using the +`entrypoint mechanism `_. +Commonly, entrypoints are declared in the ``pyproject.toml`` of your package under the +``[project.entry-points."zarr.codecs"]`` section. Zarr will automatically discover and +load all codecs registered with the entrypoint mechanism from imported modules. + +.. code-block:: toml + + [project.entry-points."zarr.codecs"] + "custompackage.fancy_codec" = "custompackage:FancyCodec" + +New codecs need to have their own unique identifier. To avoid naming collisions, it is +strongly recommended to prefix the codec identifier with a unique name. For example, +the codecs from ``numcodecs`` are prefixed with ``numcodecs.``, e.g. ``numcodecs.delta``. + +..
note:: + Note that the extension mechanism for Zarr version 3 is still under development. + Requirements for custom codecs including the choice of codec identifiers might + change in the future. + +It is also possible to register codecs as replacements for existing codecs. This might be +useful for providing specialized implementations, such as GPU-based codecs. In case of +multiple codecs, the :mod:`zarr.core.config` mechanism can be used to select the preferred +implementation. + +Custom stores +------------- + +Coming soon. + +Custom array buffers +-------------------- + +Coming soon. + +Other extensions +---------------- + +In the future, Zarr will support writing custom data types and chunk grids. diff --git a/docs/user-guide/groups.rst b/docs/user-guide/groups.rst new file mode 100644 index 0000000000..62160ffde5 --- /dev/null +++ b/docs/user-guide/groups.rst @@ -0,0 +1,141 @@ +.. only:: doctest + + >>> import shutil + >>> shutil.rmtree('data', ignore_errors=True) + +.. _user-guide-groups: + +Working with groups +=================== + +Zarr supports hierarchical organization of arrays via groups. As with arrays, +groups can be stored in memory, on disk, or via other storage systems that +support a similar interface. + +To create a group, use the :func:`zarr.create_group` function:: + + >>> import zarr + >>> store = zarr.storage.MemoryStore() + >>> root = zarr.create_group(store=store) + >>> root + + +Groups have a similar API to the Group class from `h5py +`_. For example, groups can contain other groups:: + + >>> foo = root.create_group('foo') + >>> bar = foo.create_group('bar') + +Groups can also contain arrays, e.g.:: + + >>> z1 = bar.create_array(name='baz', shape=(10000, 10000), chunks=(1000, 1000), dtype='int32') + >>> z1 + + +Members of a group can be accessed via the suffix notation, e.g.:: + + >>> root['foo'] + + +The '/' character can be used to access multiple levels of the hierarchy in one +call, e.g.:: + + >>> root['foo/bar'] + + >>> root['foo/bar/baz'] + + +The :func:`zarr.Group.tree` method can be used to print a tree +representation of the hierarchy, e.g.:: + + >>> root.tree() + / + └── foo + └── bar + └── baz (10000, 10000) int32 + + +The :func:`zarr.open_group` function provides a convenient way to create or +re-open a group stored in a directory on the file-system, with sub-groups stored in +sub-directories, e.g.:: + + >>> root = zarr.open_group('data/group.zarr', mode='w') + >>> root + + >>> + >>> z = root.create_array(name='foo/bar/baz', shape=(10000, 10000), chunks=(1000, 1000), dtype='int32') + >>> z + + +.. TODO: uncomment after __enter__ and __exit__ are implemented +.. Groups can be used as context managers (in a ``with`` statement). +.. If the underlying store has a ``close`` method, it will be called on exit. + +For more information on groups see the :class:`zarr.Group` API docs. + +.. _user-guide-diagnostics: + +Array and group diagnostics +--------------------------- + +Diagnostic information about arrays and groups is available via the ``info`` +property.
E.g.:: + + >>> store = zarr.storage.MemoryStore() + >>> root = zarr.group(store=store) + >>> foo = root.create_group('foo') + >>> bar = foo.create_array(name='bar', shape=1000000, chunks=100000, dtype='int64') + >>> bar[:] = 42 + >>> baz = foo.create_array(name='baz', shape=(1000, 1000), chunks=(100, 100), dtype='float32') + >>> baz[:] = 4.2 + >>> root.info + Name : + Type : Group + Zarr format : 3 + Read-only : False + Store type : MemoryStore + >>> foo.info + Name : foo + Type : Group + Zarr format : 3 + Read-only : False + Store type : MemoryStore + >>> bar.info_complete() + Type : Array + Zarr format : 3 + Data type : DataType.int64 + Shape : (1000000,) + Chunk shape : (100000,) + Order : C + Read-only : False + Store type : MemoryStore + Codecs : [{'endian': }, {'level': 0, 'checksum': False}] + No. bytes : 8000000 (7.6M) + No. bytes stored : 1432 + Storage ratio : 5586.6 + Chunks Initialized : 0 + >>> baz.info + Type : Array + Zarr format : 3 + Data type : DataType.float32 + Shape : (1000, 1000) + Chunk shape : (100, 100) + Order : C + Read-only : False + Store type : MemoryStore + Codecs : [{'endian': }, {'level': 0, 'checksum': False}] + No. bytes : 4000000 (3.8M) + +Groups also have the :func:`zarr.Group.tree` method, e.g.:: + + >>> root.tree() + / + └── foo + ├── bar (1000000,) int64 + └── baz (1000, 1000) float32 + + +.. note:: + + :func:`zarr.Group.tree` requires the optional `rich `_ + dependency. It can be installed with the ``[tree]`` extra. diff --git a/docs/user-guide/index.rst b/docs/user-guide/index.rst new file mode 100644 index 0000000000..8647eeb3e6 --- /dev/null +++ b/docs/user-guide/index.rst @@ -0,0 +1,31 @@ +.. _user-guide: + +User guide +========== + +.. toctree:: + :maxdepth: 1 + + arrays + groups + attributes + storage + config + +.. Coming soon + installation + v3_migration + +Advanced Topics +--------------- + +.. toctree:: + :maxdepth: 1 + + performance + consolidated_metadata + extending + + +.. Coming soon + async diff --git a/docs/user-guide/performance.rst b/docs/user-guide/performance.rst new file mode 100644 index 0000000000..d2881fe536 --- /dev/null +++ b/docs/user-guide/performance.rst @@ -0,0 +1,230 @@ +.. only:: doctest + + >>> import shutil + >>> shutil.rmtree('data', ignore_errors=True) + +.. _user-guide-performance: + +Optimizing performance +====================== + +.. _user-guide-chunks: + +Chunk optimizations +------------------- + +.. _user-guide-chunks-shape: + +Chunk size and shape +~~~~~~~~~~~~~~~~~~~~ + +In general, chunks of at least 1 megabyte (1M) uncompressed size seem to provide +better performance, at least when using the Blosc compression library. + +The optimal chunk shape will depend on how you want to access the data. E.g., +for a 2-dimensional array, if you only ever take slices along the first +dimension, then chunk across the second dimension. 
If you know you want to chunk +across an entire dimension you can use the full size of that dimension within the +``chunks`` argument, e.g.:: + + >>> import zarr + >>> z1 = zarr.create_array(store={}, shape=(10000, 10000), chunks=(100, 10000), dtype='int32') + >>> z1.chunks + (100, 10000) + +Alternatively, if you only ever take slices along the second dimension, then +chunk across the first dimension, e.g.:: + + >>> z2 = zarr.create_array(store={}, shape=(10000, 10000), chunks=(10000, 100), dtype='int32') + >>> z2.chunks + (10000, 100) + +If you require reasonable performance for both access patterns then you need to +find a compromise, e.g.:: + + >>> z3 = zarr.create_array(store={}, shape=(10000, 10000), chunks=(1000, 1000), dtype='int32') + >>> z3.chunks + (1000, 1000) + +If you are feeling lazy, you can let Zarr guess a chunk shape for your data by +providing ``chunks='auto'``, although please note that the algorithm for guessing +a chunk shape is based on simple heuristics and may be far from optimal. E.g.:: + + >>> z4 = zarr.create_array(store={}, shape=(10000, 10000), chunks='auto', dtype='int32') + >>> z4.chunks + (625, 625) + +If you know you are always going to be loading the entire array into memory, you +can turn off chunks by providing ``chunks`` equal to ``shape``, in which case there +will be one single chunk for the array:: + + >>> z5 = zarr.create_array(store={}, shape=(10000, 10000), chunks=(10000, 10000), dtype='int32') + >>> z5.chunks + (10000, 10000) + +.. _user-guide-chunks-order: + +Chunk memory layout +~~~~~~~~~~~~~~~~~~~ + +The order of bytes **within each chunk** of an array can be changed via the +``order`` config option, to use either C or Fortran layout. For +multi-dimensional arrays, these two layouts may provide different compression +ratios, depending on the correlation structure within the data. E.g.:: + + >>> import numpy as np + >>> + >>> a = np.arange(100000000, dtype='int32').reshape(10000, 10000).T + >>> c = zarr.create_array(store={}, shape=a.shape, chunks=(1000, 1000), dtype=a.dtype, config={'order': 'C'}) + >>> c[:] = a + >>> c.info_complete() + Type : Array + Zarr format : 3 + Data type : DataType.int32 + Shape : (10000, 10000) + Chunk shape : (1000, 1000) + Order : C + Read-only : False + Store type : MemoryStore + Codecs : [{'endian': }, {'level': 0, 'checksum': False}] + No. bytes : 400000000 (381.5M) + No. bytes stored : 342588717 + Storage ratio : 1.2 + Chunks Initialized : 100 + >>> with zarr.config.set({'array.order': 'F'}): + ... f = zarr.create_array(store={}, shape=a.shape, chunks=(1000, 1000), dtype=a.dtype) + ... f[:] = a + >>> f.info_complete() + Type : Array + Zarr format : 3 + Data type : DataType.int32 + Shape : (10000, 10000) + Chunk shape : (1000, 1000) + Order : F + Read-only : False + Store type : MemoryStore + Codecs : [{'endian': }, {'level': 0, 'checksum': False}] + No. bytes : 400000000 (381.5M) + No. bytes stored : 342588717 + Storage ratio : 1.2 + Chunks Initialized : 100 + +In the above example, Fortran order gives a better compression ratio. This is an +artificial example but illustrates the general point that changing the order of +bytes within chunks of an array may improve the compression ratio, depending on +the structure of the data, the compression algorithm used, and which compression +filters (e.g., byte-shuffle) have been applied. + +.. 
_user-guide-chunks-empty-chunks: + +Empty chunks +~~~~~~~~~~~~ + +It is possible to configure how Zarr handles the storage of chunks that are "empty" +(i.e., every element in the chunk is equal to the array's fill value). When creating +an array with ``write_empty_chunks=False``, Zarr will check whether a chunk is empty before compression and storage. If a chunk is empty, +then Zarr does not store it, and instead deletes the chunk from storage +if the chunk had been previously stored. + +This optimization prevents storing redundant objects and can speed up reads, but the cost is +added computation during array writes, since the contents of +each chunk must be compared to the fill value. These advantages are also contingent on the content of the array. +If you know that your data will form chunks that are almost always non-empty, then there is no advantage to the optimization described above. +In this case, creating an array with ``write_empty_chunks=True`` will instruct Zarr to write every chunk without checking for emptiness. + +The following example illustrates the effect of the ``write_empty_chunks`` flag on +the time required to write an array with different values.:: + + >>> import zarr + >>> import numpy as np + >>> import time + >>> + >>> def timed_write(write_empty_chunks): + ... """ + ... Measure the time required and number of objects created when writing + ... to a Zarr array with random ints or fill value. + ... """ + ... chunks = (8192,) + ... shape = (chunks[0] * 1024,) + ... data = np.random.randint(0, 255, shape) + ... dtype = 'uint8' + ... arr = zarr.create_array( + ... f'data/example-{write_empty_chunks}.zarr', + ... shape=shape, + ... chunks=chunks, + ... dtype=dtype, + ... fill_value=0, + ... config={'write_empty_chunks': write_empty_chunks} + ... ) + ... # initialize all chunks + ... arr[:] = 100 + ... result = [] + ... for value in (data, arr.fill_value): + ... start = time.time() + ... arr[:] = value + ... elapsed = time.time() - start + ... result.append((elapsed, arr.nchunks_initialized)) + ... return result + ... # log results + >>> for write_empty_chunks in (True, False): + ... full, empty = timed_write(write_empty_chunks) + ... print(f'\nwrite_empty_chunks={write_empty_chunks}:\n\tRandom Data: {full[0]:.4f}s, {full[1]} objects stored\n\t Empty Data: {empty[0]:.4f}s, {empty[1]} objects stored\n') + write_empty_chunks=True: + Random Data: ..., 1024 objects stored + Empty Data: ...s, 1024 objects stored + + write_empty_chunks=False: + Random Data: ...s, 1024 objects stored + Empty Data: ...s, 0 objects stored + + +In this example, writing random data is slightly slower with ``write_empty_chunks=True``, +but writing empty data is substantially faster and generates far fewer objects in storage. + +.. _user-guide-rechunking: + +Changing chunk shapes (rechunking) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Coming soon. + +.. _user-guide-sync: + +Parallel computing and synchronization +-------------------------------------- + +Coming soon. + +.. _user-guide-pickle: + +Pickle support +-------------- + +Zarr arrays and groups can be pickled, as long as the underlying store object can be +pickled. With the exception of the :class:`zarr.storage.MemoryStore`, any of the +storage classes provided in the :mod:`zarr.storage` module can be pickled. + +If an array or group is backed by a persistent store such as a :class:`zarr.storage.LocalStore`, +:class:`zarr.storage.ZipStore` or :class:`zarr.storage.FsspecStore` then the store data +**are not** pickled.
The only thing that is pickled is the necessary parameters to allow the store +to re-open any underlying files or databases upon being unpickled. + +E.g., pickle/unpickle a local store array:: + + >>> import pickle + >>> data = np.arange(100000) + >>> z1 = zarr.create_array(store='data/example-2.zarr', shape=data.shape, chunks=data.shape, dtype=data.dtype) + >>> z1[:] = data + >>> s = pickle.dumps(z1) + >>> z2 = pickle.loads(s) + >>> z1 == z2 + True + >>> np.all(z1[:] == z2[:]) + np.True_ + +.. _user-guide-tips-blosc: + +Configuring Blosc +----------------- + +Coming soon. diff --git a/docs/user-guide/storage.rst b/docs/user-guide/storage.rst new file mode 100644 index 0000000000..7beb38a36f --- /dev/null +++ b/docs/user-guide/storage.rst @@ -0,0 +1,110 @@ +.. only:: doctest + + >>> import shutil + >>> shutil.rmtree('data', ignore_errors=True) + +.. _user-guide-storage: + +Storage guide +============= + +Zarr-Python supports multiple storage backends, including: local file systems, +Zip files, remote stores via fsspec_ (S3, HTTP, etc.), and in-memory stores. In +Zarr-Python 3, stores must implement the abstract store API from +:class:`zarr.abc.store.Store`. + +.. note:: + Unlike Zarr-Python 2 where the store interface was built around a generic ``MutableMapping`` + API, Zarr-Python 3 utilizes a custom store API that utilizes Python's AsyncIO library. + +Implicit Store Creation +----------------------- + +In most cases, it is not required to create a ``Store`` object explicitly. Passing a string +to Zarr's top level API will result in the store being created automatically.: + + >>> import zarr + >>> + >>> # Implicitly create a writable LocalStore + >>> zarr.create_group(store='data/foo/bar') + + >>> + >>> # Implicitly create a read-only FsspecStore + >>> zarr.open_group( + ... store='s3://noaa-nwm-retro-v2-zarr-pds', + ... mode='r', + ... storage_options={'anon': True} + ... ) + > + >>> + >>> # Implicitly creates a MemoryStore + >>> data = {} + >>> zarr.create_group(store=data) + + +Explicit Store Creation +----------------------- + +In some cases, it may be helpful to create a store instance directly. Zarr-Python offers four +built-in stores: :class:`zarr.storage.LocalStore`, :class:`zarr.storage.FsspecStore`, +:class:`zarr.storage.ZipStore`, and :class:`zarr.storage.MemoryStore`. + +Local Store +~~~~~~~~~~~ + +The :class:`zarr.storage.LocalStore` stores data in a nested set of directories on a local +filesystem.: + + >>> store = zarr.storage.LocalStore('data/foo/bar', read_only=True) + >>> zarr.open_group(store=store, mode='r') + + +Zip Store +~~~~~~~~~ + +The :class:`zarr.storage.ZipStore` stores the contents of a Zarr hierarchy in a single +Zip file. The `Zip Store specification`_ is currently in draft form.: + + >>> store = zarr.storage.ZipStore('data.zip', mode='w') + >>> zarr.create_array(store=store, shape=(2,), dtype='float64') + + +Remote Store +~~~~~~~~~~~~ + +The :class:`zarr.storage.FsspecStore` stores the contents of a Zarr hierarchy following the same +logical layout as the ``LocalStore``, except the store is assumed to be on a remote storage system +such as cloud object storage (e.g. AWS S3, Google Cloud Storage, Azure Blob Store). The +:class:`zarr.storage.FsspecStore` is backed by `fsspec`_ and can support any backend +that implements the `AbstractFileSystem `_ +API. ``storage_options`` can be used to configure the fsspec backend.: + + >>> store = zarr.storage.FsspecStore.from_url( + ... 's3://noaa-nwm-retro-v2-zarr-pds', + ... read_only=True, + ...
storage_options={'anon': True} + ... ) + >>> zarr.open_group(store=store, mode='r') + > + +Memory Store +~~~~~~~~~~~~ + +The :class:`zarr.storage.MemoryStore` is an in-memory store that allows for serialization of +Zarr data (metadata and chunks) to a dictionary.: + + >>> data = {} + >>> store = zarr.storage.MemoryStore(data) + >>> # TODO: replace with create_array after #2463 + >>> zarr.create_array(store=store, shape=(2,), dtype='float64') + + +Developing custom stores +------------------------ + +The Zarr-Python :class:`zarr.abc.store.Store` API is meant to be extended. The Store Abstract Base +Class includes all of the methods needed to be a fully operational store in Zarr Python. +Zarr also provides a test harness for custom stores: :class:`zarr.testing.store.StoreTests`. + +.. _Zip Store Specification: https://github.com/zarr-developers/zarr-specs/pull/311 +.. _fsspec: https://filesystem-spec.readthedocs.io diff --git a/pyproject.toml b/pyproject.toml index a92c30ab9f..aaedc09736 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -68,8 +68,10 @@ test = [ "pytest-cov", "s3fs", "pytest-asyncio", + "pytest-accept", "moto[s3]", "requests", + "rich", "mypy", "hypothesis", "universal-pathlib", @@ -86,6 +88,8 @@ docs = [ 'pydata-sphinx-theme', 'numpydoc', 'numcodecs[msgpack]', + 'rich', + 's3fs', ] @@ -144,6 +148,15 @@ run-mypy = "mypy src" run-hypothesis = "pytest --hypothesis-profile ci tests/test_properties.py tests/test_store/test_stateful*" list-env = "pip list" +[tool.hatch.envs.doctest] +features = ["test", "optional"] +description = "Test environment for doctests" + +[tool.hatch.envs.doctest.scripts] +run = "rm -r data/; pytest docs/user-guide --doctest-glob='*.rst'" +fix = "rm -r data/; pytest docs/user-guide --doctest-glob='*.rst' --accept" +list-env = "pip list" + [tool.hatch.envs.gputest] dependencies = [ "numpy~={matrix:numpy}", @@ -349,7 +362,7 @@ ignore_errors = true [tool.pytest.ini_options] minversion = "7" -testpaths = ["tests"] +testpaths = ["tests", "docs/user-guide"] log_cli_level = "INFO" xfail_strict = true asyncio_mode = "auto" diff --git a/src/zarr/core/_tree.py b/src/zarr/core/_tree.py index 8e3b0c306d..eed807ec95 100644 --- a/src/zarr/core/_tree.py +++ b/src/zarr/core/_tree.py @@ -1,4 +1,5 @@ import io +import os from collections.abc import Sequence from typing import Any @@ -24,8 +25,8 @@ def __init__(self, tree: rich.tree.Tree) -> None: self._tree = tree def __repr__(self) -> str: - terminal = rich.get_console() - console = rich.console.Console(file=io.StringIO(), color_system=terminal.color_system) + color_system = os.environ.get("OVERRIDE_COLOR_SYSTEM", rich.get_console().color_system) + console = rich.console.Console(file=io.StringIO(), color_system=color_system) console.print(self._tree) return str(console.file.getvalue()) diff --git a/tests/test_api.py b/tests/test_api.py index 80e8555e11..2b48d3bcdc 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -359,7 +359,7 @@ def test_tree() -> None: g3 = g1.create_group("bar") g3.create_group("baz") g5 = g3.create_group("qux") - g5.create_array("baz", shape=100, chunks=10) + g5.create_array("baz", shape=(100,), chunks=(10,), dtype="float64") with pytest.warns(DeprecationWarning): assert repr(zarr.tree(g1)) == repr(g1.tree()) assert str(zarr.tree(g1)) == str(g1.tree()) diff --git a/tests/test_tree.py b/tests/test_tree.py index e8bcae0ee3..b4a5106998 100644 --- a/tests/test_tree.py +++ b/tests/test_tree.py @@ -1,3 +1,4 @@ +import os import textwrap from typing import Any @@ -10,6 +11,8 @@
@pytest.mark.parametrize("root_name", [None, "root"]) def test_tree(root_name: Any) -> None: + os.environ["OVERRIDE_COLOR_SYSTEM"] = "truecolor" + g = zarr.group(path=root_name) A = g.create_group("A") B = g.create_group("B") @@ -18,9 +21,9 @@ def test_tree(root_name: Any) -> None: A.create_array(name="x", shape=(2), dtype="float64") A.create_array(name="y", shape=(0,), dtype="int8") - B.create_array(name="x", shape=(0,)) - C.create_array(name="x", shape=(0,)) - D.create_array(name="x", shape=(0,)) + B.create_array(name="x", shape=(0,), dtype="float64") + C.create_array(name="x", shape=(0,), dtype="float64") + D.create_array(name="x", shape=(0,), dtype="float64") result = repr(g.tree()) root = root_name or "" From 372995e42dee6f06c69d146d887be0db5b5bfe7b Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Fri, 3 Jan 2025 15:49:56 +0100 Subject: [PATCH 54/87] show pprint import (#2632) --- docs/user-guide/consolidated_metadata.rst | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/docs/user-guide/consolidated_metadata.rst b/docs/user-guide/consolidated_metadata.rst index d6b7a55de7..511761d34e 100644 --- a/docs/user-guide/consolidated_metadata.rst +++ b/docs/user-guide/consolidated_metadata.rst @@ -1,7 +1,3 @@ -.. only:: doctest - - >>> from pprint import pprint - .. _user-guide-consolidated-metadata: Consolidated metadata @@ -48,6 +44,7 @@ that can be used.: >>> consolidated = zarr.open_group(store=store) >>> consolidated_metadata = consolidated.metadata.consolidated_metadata.metadata + >>> from pprint import pprint >>> pprint(dict(sorted(consolidated_metadata.items()))) {'a': ArrayV3Metadata(shape=(1,), data_type=, From 4fe104a2d326749de72fed3fa2dd7b446208cc50 Mon Sep 17 00:00:00 2001 From: Juan Nunez-Iglesias Date: Sat, 4 Jan 2025 03:09:14 +1100 Subject: [PATCH 55/87] Fix open(..., mode='w') to create a group (#2629) * Add test for #2490 * Add 'w' to list of valid modes in open to create a group --------- Co-authored-by: Davis Bennett --- src/zarr/api/asynchronous.py | 2 +- tests/test_api.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 75c043fc1a..f54a824088 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -312,7 +312,7 @@ async def open( store_path = await make_store_path(store, mode=mode, path=path, storage_options=storage_options) # TODO: the mode check below seems wrong! 
- if "shape" not in kwargs and mode in {"a", "r", "r+"}: + if "shape" not in kwargs and mode in {"a", "r", "r+", "w"}: try: metadata_dict = await get_array_metadata(store_path, zarr_format=zarr_format) # TODO: remove this cast when we fix typing for array metadata dicts diff --git a/tests/test_api.py b/tests/test_api.py index 2b48d3bcdc..6700f6b5e3 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -1086,6 +1086,13 @@ async def test_open_falls_back_to_open_group_async() -> None: assert group.attrs == {"key": "value"} +def test_open_mode_write_creates_group(tmp_path: pathlib.Path) -> None: + # https://github.com/zarr-developers/zarr-python/issues/2490 + zarr_dir = tmp_path / "test.zarr" + group = zarr.open(zarr_dir, mode="w") + assert isinstance(group, Group) + + async def test_metadata_validation_error() -> None: with pytest.raises( MetadataValidationError, From 38953e365930cd4feac940ac0b958f521c4fde79 Mon Sep 17 00:00:00 2001 From: David Stansby Date: Fri, 3 Jan 2025 17:35:17 +0000 Subject: [PATCH 56/87] Clean up public store API (#2603) * Clean up public store API * fix import --- src/zarr/core/array.py | 2 +- src/zarr/core/group.py | 4 ++-- src/zarr/storage/__init__.py | 15 ++++++++------- src/zarr/storage/{common.py => _common.py} | 6 +++--- src/zarr/storage/{fsspec.py => _fsspec.py} | 2 +- src/zarr/storage/{local.py => _local.py} | 0 src/zarr/storage/{logging.py => _logging.py} | 2 +- src/zarr/storage/{memory.py => _memory.py} | 0 src/zarr/storage/{wrapper.py => _wrapper.py} | 0 src/zarr/storage/{zip.py => _zip.py} | 0 src/zarr/testing/store.py | 2 +- src/zarr/testing/strategies.py | 2 +- tests/conftest.py | 3 +-- tests/test_api.py | 2 +- tests/test_array.py | 3 +-- tests/test_buffer.py | 3 +-- tests/test_codecs/test_blosc.py | 2 +- tests/test_codecs/test_endian.py | 2 +- tests/test_codecs/test_gzip.py | 2 +- tests/test_codecs/test_sharding.py | 2 +- tests/test_codecs/test_transpose.py | 2 +- tests/test_codecs/test_vlen.py | 2 +- tests/test_codecs/test_zstd.py | 2 +- tests/test_group.py | 3 +-- tests/test_indexing.py | 3 +-- tests/test_metadata/test_consolidated.py | 2 +- tests/test_store/test_core.py | 5 +---- tests/test_store/test_local.py | 2 +- tests/test_store/test_logging.py | 2 +- tests/test_store/test_memory.py | 2 +- tests/test_store/test_wrapper.py | 2 +- tests/test_store/test_zip.py | 2 +- tests/test_sync.py | 2 +- 33 files changed, 39 insertions(+), 46 deletions(-) rename src/zarr/storage/{common.py => _common.py} (99%) rename src/zarr/storage/{fsspec.py => _fsspec.py} (99%) rename src/zarr/storage/{local.py => _local.py} (100%) rename src/zarr/storage/{logging.py => _logging.py} (99%) rename src/zarr/storage/{memory.py => _memory.py} (100%) rename src/zarr/storage/{wrapper.py => _wrapper.py} (100%) rename src/zarr/storage/{zip.py => _zip.py} (100%) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 0a5b5f085a..6d8aca20ec 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -113,7 +113,7 @@ get_pipeline_class, ) from zarr.storage import StoreLike, make_store_path -from zarr.storage.common import StorePath, ensure_no_existing_node +from zarr.storage._common import StorePath, ensure_no_existing_node if TYPE_CHECKING: from collections.abc import Iterator, Sequence diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 29b25689c4..7a0d2efc09 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -50,8 +50,8 @@ from zarr.core.metadata.v3 import V3JsonEncoder from zarr.core.sync import SyncMixin, sync from 
zarr.errors import MetadataValidationError -from zarr.storage import StoreLike, make_store_path -from zarr.storage.common import StorePath, ensure_no_existing_node +from zarr.storage import StoreLike, StorePath, make_store_path +from zarr.storage._common import ensure_no_existing_node if TYPE_CHECKING: from collections.abc import AsyncGenerator, Generator, Iterable, Iterator diff --git a/src/zarr/storage/__init__.py b/src/zarr/storage/__init__.py index 514361bd6b..c092ade03e 100644 --- a/src/zarr/storage/__init__.py +++ b/src/zarr/storage/__init__.py @@ -3,16 +3,17 @@ from types import ModuleType from typing import Any -from zarr.storage.common import StoreLike, StorePath, make_store_path -from zarr.storage.fsspec import FsspecStore -from zarr.storage.local import LocalStore -from zarr.storage.logging import LoggingStore -from zarr.storage.memory import MemoryStore -from zarr.storage.wrapper import WrapperStore -from zarr.storage.zip import ZipStore +from zarr.storage._common import StoreLike, StorePath, make_store_path +from zarr.storage._fsspec import FsspecStore +from zarr.storage._local import LocalStore +from zarr.storage._logging import LoggingStore +from zarr.storage._memory import GpuMemoryStore, MemoryStore +from zarr.storage._wrapper import WrapperStore +from zarr.storage._zip import ZipStore __all__ = [ "FsspecStore", + "GpuMemoryStore", "LocalStore", "LoggingStore", "MemoryStore", diff --git a/src/zarr/storage/common.py b/src/zarr/storage/_common.py similarity index 99% rename from src/zarr/storage/common.py rename to src/zarr/storage/_common.py index 973c8b13e3..523e470671 100644 --- a/src/zarr/storage/common.py +++ b/src/zarr/storage/_common.py @@ -8,9 +8,9 @@ from zarr.core.buffer import Buffer, default_buffer_prototype from zarr.core.common import ZARR_JSON, ZARRAY_JSON, ZGROUP_JSON, AccessModeLiteral, ZarrFormat from zarr.errors import ContainsArrayAndGroupError, ContainsArrayError, ContainsGroupError +from zarr.storage._local import LocalStore +from zarr.storage._memory import MemoryStore from zarr.storage._utils import normalize_path -from zarr.storage.local import LocalStore -from zarr.storage.memory import MemoryStore if TYPE_CHECKING: from zarr.core.buffer import BufferPrototype @@ -281,7 +281,7 @@ async def make_store_path( TypeError If the StoreLike object is not one of the supported types. 
""" - from zarr.storage.fsspec import FsspecStore # circular import + from zarr.storage._fsspec import FsspecStore # circular import used_storage_options = False path_normalized = normalize_path(path) diff --git a/src/zarr/storage/fsspec.py b/src/zarr/storage/_fsspec.py similarity index 99% rename from src/zarr/storage/fsspec.py rename to src/zarr/storage/_fsspec.py index c9edd8f8ac..89d80320dd 100644 --- a/src/zarr/storage/fsspec.py +++ b/src/zarr/storage/_fsspec.py @@ -4,7 +4,7 @@ from typing import TYPE_CHECKING, Any from zarr.abc.store import ByteRangeRequest, Store -from zarr.storage.common import _dereference_path +from zarr.storage._common import _dereference_path if TYPE_CHECKING: from collections.abc import AsyncIterator, Iterable diff --git a/src/zarr/storage/local.py b/src/zarr/storage/_local.py similarity index 100% rename from src/zarr/storage/local.py rename to src/zarr/storage/_local.py diff --git a/src/zarr/storage/logging.py b/src/zarr/storage/_logging.py similarity index 99% rename from src/zarr/storage/logging.py rename to src/zarr/storage/_logging.py index 9ec3a9be18..450913e9d3 100644 --- a/src/zarr/storage/logging.py +++ b/src/zarr/storage/_logging.py @@ -8,7 +8,7 @@ from typing import TYPE_CHECKING, Any from zarr.abc.store import Store -from zarr.storage.wrapper import WrapperStore +from zarr.storage._wrapper import WrapperStore if TYPE_CHECKING: from collections.abc import AsyncIterator, Generator, Iterable diff --git a/src/zarr/storage/memory.py b/src/zarr/storage/_memory.py similarity index 100% rename from src/zarr/storage/memory.py rename to src/zarr/storage/_memory.py diff --git a/src/zarr/storage/wrapper.py b/src/zarr/storage/_wrapper.py similarity index 100% rename from src/zarr/storage/wrapper.py rename to src/zarr/storage/_wrapper.py diff --git a/src/zarr/storage/zip.py b/src/zarr/storage/_zip.py similarity index 100% rename from src/zarr/storage/zip.py rename to src/zarr/storage/_zip.py diff --git a/src/zarr/testing/store.py b/src/zarr/testing/store.py index 53dee012bf..ada028c273 100644 --- a/src/zarr/testing/store.py +++ b/src/zarr/testing/store.py @@ -4,7 +4,7 @@ import pickle from typing import TYPE_CHECKING, Generic, TypeVar -from zarr.storage.wrapper import WrapperStore +from zarr.storage import WrapperStore if TYPE_CHECKING: from typing import Any diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py index ae0487e447..1bde01b8f9 100644 --- a/src/zarr/testing/strategies.py +++ b/src/zarr/testing/strategies.py @@ -11,7 +11,7 @@ from zarr.core.common import ZarrFormat from zarr.core.sync import sync from zarr.storage import MemoryStore, StoreLike -from zarr.storage.common import _dereference_path +from zarr.storage._common import _dereference_path # Copied from Xarray _attr_keys = st.text(st.characters(), min_size=1) diff --git a/tests/conftest.py b/tests/conftest.py index ee31d0d071..e9cd2b8120 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -12,8 +12,7 @@ from zarr import AsyncGroup, config from zarr.abc.store import Store from zarr.core.sync import sync -from zarr.storage import LocalStore, MemoryStore, StorePath, ZipStore -from zarr.storage.fsspec import FsspecStore +from zarr.storage import FsspecStore, LocalStore, MemoryStore, StorePath, ZipStore if TYPE_CHECKING: from collections.abc import Generator diff --git a/tests/test_api.py b/tests/test_api.py index 6700f6b5e3..aacd558f2a 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -25,8 +25,8 @@ ) from zarr.core.common import JSON, MemoryOrder, ZarrFormat 
from zarr.errors import MetadataValidationError +from zarr.storage import MemoryStore from zarr.storage._utils import normalize_path -from zarr.storage.memory import MemoryStore def test_create(memory_store: Store) -> None: diff --git a/tests/test_array.py b/tests/test_array.py index 72ff68d954..51ad289e80 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -40,8 +40,7 @@ from zarr.core.metadata.v3 import DataType from zarr.core.sync import sync from zarr.errors import ContainsArrayError, ContainsGroupError -from zarr.storage import LocalStore, MemoryStore -from zarr.storage.common import StorePath +from zarr.storage import LocalStore, MemoryStore, StorePath if TYPE_CHECKING: from zarr.core.array_spec import ArrayConfigLike diff --git a/tests/test_buffer.py b/tests/test_buffer.py index e3cab0f214..baef0b8109 100644 --- a/tests/test_buffer.py +++ b/tests/test_buffer.py @@ -12,8 +12,7 @@ from zarr.codecs.transpose import TransposeCodec from zarr.codecs.zstd import ZstdCodec from zarr.core.buffer import ArrayLike, BufferPrototype, NDArrayLike, cpu, gpu -from zarr.storage.common import StorePath -from zarr.storage.memory import MemoryStore +from zarr.storage import MemoryStore, StorePath from zarr.testing.buffer import ( NDBufferUsingTestNDArrayLike, StoreExpectingTestBuffer, diff --git a/tests/test_codecs/test_blosc.py b/tests/test_codecs/test_blosc.py index 34044d7d62..c1c5c92329 100644 --- a/tests/test_codecs/test_blosc.py +++ b/tests/test_codecs/test_blosc.py @@ -7,7 +7,7 @@ from zarr.abc.store import Store from zarr.codecs import BloscCodec from zarr.core.buffer import default_buffer_prototype -from zarr.storage.common import StorePath +from zarr.storage import StorePath @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) diff --git a/tests/test_codecs/test_endian.py b/tests/test_codecs/test_endian.py index ae9d1f6f1f..c0c4dd4e75 100644 --- a/tests/test_codecs/test_endian.py +++ b/tests/test_codecs/test_endian.py @@ -6,7 +6,7 @@ import zarr from zarr.abc.store import Store from zarr.codecs import BytesCodec -from zarr.storage.common import StorePath +from zarr.storage import StorePath from .test_codecs import _AsyncArrayProxy diff --git a/tests/test_codecs/test_gzip.py b/tests/test_codecs/test_gzip.py index f47f9710b1..4753036c87 100644 --- a/tests/test_codecs/test_gzip.py +++ b/tests/test_codecs/test_gzip.py @@ -4,7 +4,7 @@ import zarr from zarr.abc.store import Store from zarr.codecs import GzipCodec -from zarr.storage.common import StorePath +from zarr.storage import StorePath @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) diff --git a/tests/test_codecs/test_sharding.py b/tests/test_codecs/test_sharding.py index 3f14007351..484cfa4eda 100644 --- a/tests/test_codecs/test_sharding.py +++ b/tests/test_codecs/test_sharding.py @@ -17,7 +17,7 @@ TransposeCodec, ) from zarr.core.buffer import default_buffer_prototype -from zarr.storage.common import StorePath +from zarr.storage import StorePath from ..conftest import ArrayRequest from .test_codecs import _AsyncArrayProxy, order_from_dim diff --git a/tests/test_codecs/test_transpose.py b/tests/test_codecs/test_transpose.py index 65159f174b..18ea8e65d0 100644 --- a/tests/test_codecs/test_transpose.py +++ b/tests/test_codecs/test_transpose.py @@ -6,7 +6,7 @@ from zarr.abc.store import Store from zarr.codecs import TransposeCodec from zarr.core.common import MemoryOrder -from zarr.storage.common import StorePath +from zarr.storage import StorePath from .test_codecs import 
_AsyncArrayProxy diff --git a/tests/test_codecs/test_vlen.py b/tests/test_codecs/test_vlen.py index f4ee135601..f5599f2ac0 100644 --- a/tests/test_codecs/test_vlen.py +++ b/tests/test_codecs/test_vlen.py @@ -10,7 +10,7 @@ from zarr.codecs import ZstdCodec from zarr.core.metadata.v3 import ArrayV3Metadata, DataType from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING -from zarr.storage.common import StorePath +from zarr.storage import StorePath numpy_str_dtypes: list[type | str | None] = [None, str, "str", np.dtypes.StrDType] expected_zarr_string_dtype: np.dtype[Any] diff --git a/tests/test_codecs/test_zstd.py b/tests/test_codecs/test_zstd.py index a57476fb61..6068f53443 100644 --- a/tests/test_codecs/test_zstd.py +++ b/tests/test_codecs/test_zstd.py @@ -4,7 +4,7 @@ import zarr from zarr.abc.store import Store from zarr.codecs import ZstdCodec -from zarr.storage.common import StorePath +from zarr.storage import StorePath @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) diff --git a/tests/test_group.py b/tests/test_group.py index 6b3c40412e..a4ce04e822 100644 --- a/tests/test_group.py +++ b/tests/test_group.py @@ -21,8 +21,7 @@ from zarr.core.group import ConsolidatedMetadata, GroupMetadata from zarr.core.sync import sync from zarr.errors import ContainsArrayError, ContainsGroupError -from zarr.storage import LocalStore, MemoryStore, StorePath, ZipStore -from zarr.storage.common import make_store_path +from zarr.storage import LocalStore, MemoryStore, StorePath, ZipStore, make_store_path from .conftest import parse_store diff --git a/tests/test_indexing.py b/tests/test_indexing.py index fc83af695b..30d0d75f22 100644 --- a/tests/test_indexing.py +++ b/tests/test_indexing.py @@ -26,8 +26,7 @@ replace_ellipsis, ) from zarr.registry import get_ndbuffer_class -from zarr.storage.common import StorePath -from zarr.storage.memory import MemoryStore +from zarr.storage import MemoryStore, StorePath if TYPE_CHECKING: from collections.abc import AsyncGenerator diff --git a/tests/test_metadata/test_consolidated.py b/tests/test_metadata/test_consolidated.py index aaace6f5cd..2731abada4 100644 --- a/tests/test_metadata/test_consolidated.py +++ b/tests/test_metadata/test_consolidated.py @@ -21,7 +21,7 @@ from zarr.core.group import ConsolidatedMetadata, GroupMetadata from zarr.core.metadata import ArrayV3Metadata from zarr.core.metadata.v2 import ArrayV2Metadata -from zarr.storage.common import StorePath +from zarr.storage import StorePath if TYPE_CHECKING: from zarr.abc.store import Store diff --git a/tests/test_store/test_core.py b/tests/test_store/test_core.py index 48f8d2a529..5ab299442d 100644 --- a/tests/test_store/test_core.py +++ b/tests/test_store/test_core.py @@ -5,11 +5,8 @@ from _pytest.compat import LEGACY_PATH from zarr.core.common import AccessModeLiteral +from zarr.storage import FsspecStore, LocalStore, MemoryStore, StoreLike, StorePath, make_store_path from zarr.storage._utils import normalize_path -from zarr.storage.common import StoreLike, StorePath, make_store_path -from zarr.storage.fsspec import FsspecStore -from zarr.storage.local import LocalStore -from zarr.storage.memory import MemoryStore @pytest.mark.parametrize("path", [None, "", "bar"]) diff --git a/tests/test_store/test_local.py b/tests/test_store/test_local.py index c614d32c26..22597a2c3f 100644 --- a/tests/test_store/test_local.py +++ b/tests/test_store/test_local.py @@ -6,7 +6,7 @@ import zarr from zarr.core.buffer import Buffer, cpu -from zarr.storage.local import LocalStore +from 
zarr.storage import LocalStore from zarr.testing.store import StoreTests if TYPE_CHECKING: diff --git a/tests/test_store/test_logging.py b/tests/test_store/test_logging.py index c0630dffd8..b32a214db5 100644 --- a/tests/test_store/test_logging.py +++ b/tests/test_store/test_logging.py @@ -6,7 +6,7 @@ import zarr from zarr.core.buffer import default_buffer_prototype -from zarr.storage.logging import LoggingStore +from zarr.storage import LoggingStore if TYPE_CHECKING: from zarr.abc.store import Store diff --git a/tests/test_store/test_memory.py b/tests/test_store/test_memory.py index 4ca4ebb817..ba38889b52 100644 --- a/tests/test_store/test_memory.py +++ b/tests/test_store/test_memory.py @@ -3,7 +3,7 @@ import pytest from zarr.core.buffer import Buffer, cpu, gpu -from zarr.storage.memory import GpuMemoryStore, MemoryStore +from zarr.storage import GpuMemoryStore, MemoryStore from zarr.testing.store import StoreTests from zarr.testing.utils import gpu_test diff --git a/tests/test_store/test_wrapper.py b/tests/test_store/test_wrapper.py index 1caf9c9ae4..489bcd5a7a 100644 --- a/tests/test_store/test_wrapper.py +++ b/tests/test_store/test_wrapper.py @@ -5,7 +5,7 @@ import pytest from zarr.core.buffer.cpu import Buffer, buffer_prototype -from zarr.storage.wrapper import WrapperStore +from zarr.storage import WrapperStore if TYPE_CHECKING: from zarr.abc.store import Store diff --git a/tests/test_store/test_zip.py b/tests/test_store/test_zip.py index df22b76e1e..a83327d99a 100644 --- a/tests/test_store/test_zip.py +++ b/tests/test_store/test_zip.py @@ -10,7 +10,7 @@ import zarr from zarr.core.buffer import Buffer, cpu, default_buffer_prototype -from zarr.storage.zip import ZipStore +from zarr.storage import ZipStore from zarr.testing.store import StoreTests if TYPE_CHECKING: diff --git a/tests/test_sync.py b/tests/test_sync.py index 02b3b594fd..b0a6ecffd0 100644 --- a/tests/test_sync.py +++ b/tests/test_sync.py @@ -14,7 +14,7 @@ cleanup_resources, sync, ) -from zarr.storage.memory import MemoryStore +from zarr.storage import MemoryStore @pytest.fixture(params=[True, False]) From eef18f36daa0001da59a5a9a38f2e7ebf2cdc863 Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Fri, 3 Jan 2025 13:40:35 -0800 Subject: [PATCH 57/87] docs: add migration page to user guide (#2596) * docs: split tutorial into multiple user guide sections * docs: add migration page to user guide * rev migration guide again * fixup * Apply suggestions from code review Co-authored-by: David Stansby * update migration guide * update migration guide * fixup * removed config section --------- Co-authored-by: David Stansby --- docs/index.rst | 9 +- docs/user-guide/config.rst | 4 +- docs/user-guide/index.rst | 2 +- docs/user-guide/storage.rst | 2 + docs/user-guide/v3_migration.rst | 208 +++++++++++++++++++++++++++++++ 5 files changed, 217 insertions(+), 8 deletions(-) create mode 100644 docs/user-guide/v3_migration.rst diff --git a/docs/index.rst b/docs/index.rst index 37d560f655..df14f07e3d 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -52,15 +52,14 @@ Zarr is a file storage format for chunked, compressed, N-dimensional arrays base .. grid-item-card:: :img-top: _static/index_user_guide.svg - Guide - ^^^^^ + Guide + ^^^^^ - A detailed guide for how to use Zarr-Python. + A detailed guide for how to use Zarr-Python. +++ - .. button-ref:: user-guide - :ref-type: ref + .. 
button-ref:: user-guide/index
       :expand:
       :color: dark
       :click-parent:
diff --git a/docs/user-guide/config.rst b/docs/user-guide/config.rst
index 927d493e95..e38715b67e 100644
--- a/docs/user-guide/config.rst
+++ b/docs/user-guide/config.rst
@@ -3,8 +3,8 @@
 Runtime configuration
 =====================
 
-The :mod:`zarr.core.config` module is responsible for managing the configuration of zarr
-and is based on the `donfig `_ Python library.
+:mod:`zarr.config ` is responsible for managing the configuration of zarr and
+is based on the `donfig `_ Python library.
 
 Configuration values can be set using code like the following::
 
diff --git a/docs/user-guide/index.rst b/docs/user-guide/index.rst
index 8647eeb3e6..a9d1c9fa29 100644
--- a/docs/user-guide/index.rst
+++ b/docs/user-guide/index.rst
@@ -11,10 +11,10 @@
    attributes
    storage
    config
+   v3_migration
 
 .. Coming soon
     installation
-    v3_migration
 
 Advanced Topics
 ---------------
diff --git a/docs/user-guide/storage.rst b/docs/user-guide/storage.rst
index 7beb38a36f..46505271b4 100644
--- a/docs/user-guide/storage.rst
+++ b/docs/user-guide/storage.rst
@@ -99,6 +99,8 @@ Zarr data (metadata and chunks) to a dictionary.:
    >>> zarr.create_array(store=store, shape=(2,), dtype='float64')
 
 
+.. _user-guide-custom-stores:
+
 Developing custom stores
 ------------------------
 
diff --git a/docs/user-guide/v3_migration.rst b/docs/user-guide/v3_migration.rst
new file mode 100644
index 0000000000..fffe50e5e1
--- /dev/null
+++ b/docs/user-guide/v3_migration.rst
@@ -0,0 +1,208 @@
+3.0 Migration Guide
+===================
+
+Zarr-Python 3 represents a major refactor of the Zarr-Python codebase. Some of the
+goals motivating this refactor included:
+
+* adding support for the Zarr V3 specification (along with the Zarr V2 specification)
+* cleaning up internal and user facing APIs
+* improving performance (particularly in high latency storage environments like
+  cloud object stores)
+
+To accommodate this, Zarr-Python 3 introduces a number of changes to the API, including
+significant breaking changes and deprecations.
+
+This page provides a guide explaining breaking changes and deprecations to help you
+migrate your code from version 2 to version 3. If we have missed anything, please
+open a `GitHub issue `_
+so we can improve this guide.
+
+Compatibility target
+--------------------
+
+The goals described above necessitated some breaking changes to the API (hence the
+major version update), but where possible we have maintained backwards compatibility
+in the most widely used parts of the API. This includes the :class:`zarr.Array` and
+:class:`zarr.Group` classes and the "top-level API" (e.g. :func:`zarr.open_array` and
+:func:`zarr.open_group`).
+
+Getting ready for 3.0
+---------------------
+
+Before migrating to Zarr-Python 3, we suggest projects that depend on Zarr-Python take
+the following actions in order:
+
+1. Pin the supported Zarr-Python version to ``zarr>=2,<3``. This is a best practice
+   and will protect your users from any incompatibilities that may arise during the
+   release of Zarr-Python 3. This pin can be removed after migrating to Zarr-Python 3.
+2. Limit your imports from the Zarr-Python package. Most of the primary API ``zarr.*``
+   will be compatible in Zarr-Python 3. However, the following breaking API changes are
+   planned:
+
+   - ``numcodecs.*`` will no longer be available in ``zarr.*``. To migrate, import codecs
+     directly from ``numcodecs``:
+
+     .. code-block:: python
+
+        from numcodecs import Blosc
+        # instead of:
+        # from zarr import Blosc
+
+   - The ``zarr.v3_api_available`` feature flag is being removed. In Zarr-Python 3
+     the v3 API is always available, so you shouldn't need to use this flag.
+   - The following internal modules are being removed or significantly changed. If
+     your application relies on imports from any of the below modules, you will need
+     to either a) modify your application to no longer rely on these imports or b)
+     vendor the parts of the specific modules that you need.
+
+     * ``zarr.attrs`` has gone, with no replacement
+     * ``zarr.codecs`` has gone, use ``numcodecs`` instead
+     * ``zarr.context`` has gone, with no replacement
+     * ``zarr.core`` remains but should be considered private API
+     * ``zarr.hierarchy`` has gone, with no replacement (use ``zarr.Group`` in place of ``zarr.hierarchy.Group``)
+     * ``zarr.indexing`` has gone, with no replacement
+     * ``zarr.meta`` has gone, with no replacement
+     * ``zarr.meta_v1`` has gone, with no replacement
+     * ``zarr.sync`` has gone, with no replacement
+     * ``zarr.types`` has gone, with no replacement
+     * ``zarr.util`` has gone, with no replacement
+     * ``zarr.n5`` has gone, see below for an alternative N5 option
+
+3. Test that your package works with version 3.
+4. Update the pin to include ``zarr>=3,<4``.
+
+Zarr-Python 2 support window
+----------------------------
+
+Zarr-Python 2.x is still available, though we recommend migrating to Zarr-Python 3 for
+its performance improvements and new features. Security and bug fixes will be made to
+the 2.x series for at least six months following the first Zarr-Python 3 release.
+If you need to use the latest Zarr-Python 2 release, you can install it with:
+
+.. code-block:: console
+
+   $ pip install "zarr==2.*"
+
+.. note::
+   Development and maintenance of the 2.x release series has moved to the
+   `support/v2 `_ branch.
+   Issues and pull requests related to this branch are tagged with the
+   `V2 `_ label.
+
+Migrating to Zarr-Python 3
+--------------------------
+
+The following sections provide details on breaking changes in Zarr-Python 3.
+
+The Array class
+~~~~~~~~~~~~~~~
+
+1. Disallow direct construction - the signature for initializing the ``Array`` class has changed
+   significantly. Please use :func:`zarr.create_array` or :func:`zarr.open_array` instead of
+   directly constructing the :class:`zarr.Array` class.
+
+2. Defaulting to ``zarr_format=3`` - newly created arrays will use version 3 of the
+   Zarr specification. To continue using version 2, set ``zarr_format=2`` when creating arrays
+   or set ``default_zarr_version=2`` in Zarr's :ref:`runtime configuration `.
+
+The Group class
+~~~~~~~~~~~~~~~
+
+1. Disallow direct construction - use :func:`zarr.open_group` or :func:`zarr.create_group`
+   instead of directly constructing the :class:`zarr.Group` class.
+2. Most of the h5py compatibility methods are deprecated and will issue warnings if used.
+   The following functions are drop-in replacements that have the same signature and functionality:
+
+   - Use :func:`zarr.Group.create_array` in place of :func:`zarr.Group.create_dataset`
+   - Use :func:`zarr.Group.require_array` in place of :func:`zarr.Group.require_dataset`
+
+The Store class
+~~~~~~~~~~~~~~~
+
+The Store API has changed significantly in Zarr-Python 3. The most notable changes to the
+Store API are:
+
+1. Replaced the ``MutableMapping`` base class with a custom abstract base class
+   (:class:`zarr.abc.store.Store`).
+2. Switched to an asynchronous interface for all store methods that result in IO. This
+   change ensures that all store methods are non-blocking and are as performant as
+   possible.
+
+Beyond the changes to the store interface, a number of deprecated stores were also removed in
+Zarr-Python 3. See :issue:`1274` for more details on the removal of these stores.
+
+- ``N5Store`` - see https://github.com/zarr-developers/n5py for an alternative interface to
+  N5 formatted data.
+- ``ABSStore`` - use the :class:`zarr.storage.FsspecStore` instead along with fsspec's
+  `adlfs backend `_.
+
+The following stores have been removed altogether. Users who need these stores will have to
+implement their own version in zarr-python v3.
+
+- ``DBMStore``
+- ``LMDBStore``
+- ``SQLiteStore``
+- ``MongoDBStore``
+- ``RedisStore``
+
+At present, these five stores do not have an equivalent in Zarr-Python 3.
+If you are interested in developing a custom store that targets these backends, see
+:ref:`developing custom stores ` or open an
+`issue `_ to discuss your use case.
+
+Dependencies
+~~~~~~~~~~~~
+
+When installing using ``pip``:
+
+- The new ``remote`` dependency group can be used to install a supported version of
+  ``fsspec``, required for remote data access.
+- The new ``gpu`` dependency group can be used to install a supported version of
+  ``cuda``, required for GPU functionality.
+- The ``jupyter`` optional dependency group has been removed, since v3 contains no
+  jupyter specific functionality.
+
+Miscellaneous
+~~~~~~~~~~~~~
+
+- The keyword argument ``zarr_version`` available in most creation functions in :mod:`zarr`
+  (e.g. :func:`zarr.create`, :func:`zarr.open`, :func:`zarr.group`, :func:`zarr.array`) has
+  been deprecated in favor of ``zarr_format``.
+
+🚧 Work in Progress 🚧
+----------------------
+
+Zarr-Python 3 is still under active development, and is not yet fully complete.
+The following list summarizes areas of the codebase that we expect to build out
+after the 3.0.0 release. If features listed below are important to your use case
+of Zarr-Python, please open (or comment on) a
+`GitHub issue `_.
+
+- The following functions / methods have not been ported to Zarr-Python 3 yet:
+
+  * :func:`zarr.copy` (:issue:`2407`)
+  * :func:`zarr.copy_all` (:issue:`2407`)
+  * :func:`zarr.copy_store` (:issue:`2407`)
+  * :func:`zarr.Group.move` (:issue:`2108`)
+
+- The following features (corresponding to function arguments to functions in
+  :mod:`zarr`) have not been ported to Zarr-Python 3 yet. 
Using these features + will raise a warning or a ``NotImplementedError``: + + * ``cache_attrs`` + * ``cache_metadata`` + * ``chunk_store`` (:issue:`2495`) + * ``meta_array`` + * ``object_codec`` (:issue:`2617`) + * ``synchronizer`` (:issue:`1596`) + * ``dimension_separator`` + +- The following features that were supported by Zarr-Python 2 have not been ported + to Zarr-Python 3 yet: + + * Structured arrays / dtypes (:issue:`2134`) + * Fixed-length string dtypes (:issue:`2347`) + * Datetime and timedelta dtypes (:issue:`2616`) + * Object dtypes (:issue:`2617`) + * Ragged arrays (:issue:`2618`) + * Groups and Arrays do not implement ``__enter__`` and ``__exit__`` protocols (:issue:`2619`) From 13fc18870a905ea32311239664bcafe88368fe22 Mon Sep 17 00:00:00 2001 From: David Stansby Date: Fri, 3 Jan 2025 23:19:34 +0000 Subject: [PATCH 58/87] Move 'about' to end of table of contents (#2636) --- docs/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/index.rst b/docs/index.rst index df14f07e3d..5fe5b2a848 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -9,12 +9,12 @@ Zarr-Python :hidden: getting_started - about user-guide/index api/index release contributing roadmap + about **Version**: |version| From 23beb8f31e39c1c232b2d4a720a3fed7d3e70042 Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Fri, 3 Jan 2025 15:58:52 -0800 Subject: [PATCH 59/87] docs: replace getting_started page with quickstart (#2590) * docs: split tutorial into multiple user guide sections * docs: replace getting_started page with quickstart * Apply suggestions from code review Co-authored-by: Norman Rzepka * update quickstart * docs: replace getting_started page with quickstart * Apply suggestions from code review Co-authored-by: Norman Rzepka * update quickstart * Use docstests * update link to storage guide * remove ipython * add redirect --------- Co-authored-by: Norman Rzepka Co-authored-by: David Stansby --- docs/conf.py | 1 + docs/getting_started.rst | 28 ------ docs/index.rst | 13 +-- docs/quickstart.rst | 186 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 194 insertions(+), 34 deletions(-) delete mode 100644 docs/getting_started.rst create mode 100644 docs/quickstart.rst diff --git a/docs/conf.py b/docs/conf.py index dfd1ae07bb..d336740da2 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -104,6 +104,7 @@ def skip_submodules( "spec/v3": "https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html", "license": "https://github.com/zarr-developers/zarr-python/blob/main/LICENSE.txt", "tutorial": "user-guide", + "getting-started": "quickstart", } # The language for content autogenerated by Sphinx. Refer to documentation diff --git a/docs/getting_started.rst b/docs/getting_started.rst deleted file mode 100644 index 5950e2ae44..0000000000 --- a/docs/getting_started.rst +++ /dev/null @@ -1,28 +0,0 @@ -Getting Started -=============== - -Highlights ----------- - -* Create N-dimensional arrays with any NumPy dtype. -* Chunk arrays along any dimension. -* Compress and/or filter chunks using any NumCodecs_ codec. -* Store arrays in memory, on disk, inside a Zip file, on S3, ... -* Read an array concurrently from multiple threads or processes. -* Write to an array concurrently from multiple threads or processes. -* Organize arrays into hierarchies via groups. - -Contributing ------------- - -Feedback and bug reports are very welcome, please get in touch via -the `GitHub issue tracker `_. See -:doc:`contributing` for further information about contributing to Zarr. - -.. 
toctree:: - :caption: Getting Started - :hidden: - - installation - -.. _NumCodecs: https://numcodecs.readthedocs.io/ diff --git a/docs/index.rst b/docs/index.rst index 5fe5b2a848..5bbd04ec60 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -8,7 +8,8 @@ Zarr-Python :maxdepth: 1 :hidden: - getting_started + quickstart + installation user-guide/index api/index release @@ -34,20 +35,20 @@ Zarr is a file storage format for chunked, compressed, N-dimensional arrays base .. grid-item-card:: :img-top: _static/index_getting_started.svg - Getting Started - ^^^^^^^^^^^^^^^ + Quick Start + ^^^^^^^^^^^ - New to Zarr? Check out the getting started guide. It contains an + New to Zarr? Check out the quick start guide. It contains a brief introduction to Zarr's main concepts and links to additional tutorials. +++ - .. button-ref:: getting_started + .. button-ref:: quickstart :expand: :color: dark :click-parent: - To the getting started guide + To the Quick Start .. grid-item-card:: :img-top: _static/index_user_guide.svg diff --git a/docs/quickstart.rst b/docs/quickstart.rst new file mode 100644 index 0000000000..2d0e8ecef8 --- /dev/null +++ b/docs/quickstart.rst @@ -0,0 +1,186 @@ +.. only:: doctest + + >>> import shutil + >>> shutil.rmtree('data', ignore_errors=True) + >>> + >>> import numpy as np + >>> np.random.seed(0) + +Quickstart +========== + +Welcome to the Zarr-Python Quickstart guide! This page will help you get up and running with +the Zarr library in Python to efficiently manage and analyze multi-dimensional arrays. + +Zarr is a powerful library for storage of n-dimensional arrays, supporting chunking, +compression, and various backends, making it a versatile choice for scientific and +large-scale data. + +Installation +------------ + +Zarr requires Python 3.11 or higher. You can install it via `pip`: + +.. code-block:: bash + + pip install zarr + +or `conda`: + +.. code-block:: bash + + conda install --channel conda-forge zarr + +Creating an Array +----------------- + +To get started, you can create a simple Zarr array:: + + >>> import zarr + >>> import numpy as np + >>> + >>> # Create a 2D Zarr array + >>> z = zarr.create_array( + ... store="data/example-1.zarr", + ... shape=(100, 100), + ... chunks=(10, 10), + ... dtype="f4" + ... ) + >>> + >>> # Assign data to the array + >>> z[:, :] = np.random.random((100, 100)) + >>> z.info + Type : Array + Zarr format : 3 + Data type : DataType.float32 + Shape : (100, 100) + Chunk shape : (10, 10) + Order : C + Read-only : False + Store type : LocalStore + Codecs : [{'endian': }, {'level': 0, 'checksum': False}] + No. bytes : 40000 (39.1K) + +Here, we created a 2D array of shape ``(100, 100)``, chunked into blocks of +``(10, 10)``, and filled it with random floating-point data. This array was +written to a ``LocalStore`` in the ``data/example-1.zarr`` directory. + +Compression and Filters +~~~~~~~~~~~~~~~~~~~~~~~ + +Zarr supports data compression and filters. For example, to use Blosc compression:: + + >>> z = zarr.create_array( + ... "data/example-3.zarr", + ... mode="w", shape=(100, 100), + ... chunks=(10, 10), dtype="f4", + ... compressor=zarr.codecs.BloscCodec(cname="zstd", clevel=3, shuffle=zarr.codecs.BloscShuffle.SHUFFLE) + ... ) + >>> z[:, :] = np.random.random((100, 100)) + >>> + >>> z.info + Type : Array + Zarr format : 3 + Data type : DataType.float32 + Shape : (100, 100) + Chunk shape : (10, 10) + Order : C + Read-only : False + Store type : LocalStore + Codecs : [{'endian': }, {'level': 0, 'checksum': False}] + No. 
bytes : 40000 (39.1K)
+
+This compresses the data using the Zstandard codec with shuffle enabled for better compression.
+
+Hierarchical Groups
+-------------------
+
+Zarr allows you to create hierarchical groups, similar to directories::
+
+   >>> # Create nested groups and add arrays
+   >>> root = zarr.group("data/example-2.zarr")
+   >>> foo = root.create_group(name="foo")
+   >>> bar = root.create_array(
+   ...     name="bar", shape=(100, 10), chunks=(10, 10)
+   ... )
+   >>> spam = foo.create_array(name="spam", shape=(10,), dtype="i4")
+   >>>
+   >>> # Assign values
+   >>> bar[:, :] = np.random.random((100, 10))
+   >>> spam[:] = np.arange(10)
+   >>>
+   >>> # print the hierarchy
+   >>> root.tree()
+   /
+   └── foo
+       └── spam (10,) int32
+   
+
+This creates a sub-group ``foo`` and two arrays, ``bar`` and ``spam``.
+
+Persistent Storage
+------------------
+
+Zarr supports persistent storage to disk or cloud-compatible backends. While examples above
+utilized a :class:`zarr.storage.LocalStore`, a number of other storage options are available.
+
+Zarr integrates seamlessly with cloud object storage such as Amazon S3 and Google Cloud Storage
+using external libraries like `s3fs `_ or
+`gcsfs `_::
+
+   >>> import s3fs  # doctest: +SKIP
+   >>>
+   >>> z = zarr.create_array("s3://example-bucket/foo", mode="w", shape=(100, 100), chunks=(10, 10))  # doctest: +SKIP
+   >>> z[:, :] = np.random.random((100, 100))  # doctest: +SKIP
+
+A single-file store can also be created using the :class:`zarr.storage.ZipStore`::
+
+   >>> # Store the array in a ZIP file
+   >>> store = zarr.storage.ZipStore("data/example-3.zip", mode='w')
+   >>>
+   >>> z = zarr.create_array(
+   ...     store=store,
+   ...     mode="w",
+   ...     shape=(100, 100),
+   ...     chunks=(10, 10),
+   ...     dtype="f4"
+   ... )
+   >>>
+   >>> # write to the array
+   >>> z[:, :] = np.random.random((100, 100))
+   >>>
+   >>> # the ZipStore must be explicitly closed
+   >>> store.close()
+
+To open an existing array from a ZIP file::
+
+   >>> # Open the ZipStore in read-only mode
+   >>> store = zarr.storage.ZipStore("data/example-3.zip", read_only=True)
+   >>>
+   >>> z = zarr.open_array(store, mode='r')
+   >>>
+   >>> # read the data as a NumPy Array
+   >>> z[:]
+   array([[0.66734236, 0.15667458, 0.98720884, ..., 0.36229587, 0.67443246,
+           0.34315267],
+          [0.65787303, 0.9544212 , 0.4830079 , ..., 0.33097172, 0.60423803,
+           0.45621237],
+          [0.27632037, 0.9947008 , 0.42434934, ..., 0.94860053, 0.6226942 ,
+           0.6386924 ],
+          ...,
+          [0.12854576, 0.934397  , 0.19524333, ..., 0.11838563, 0.4967675 ,
+           0.43074256],
+          [0.82029045, 0.4671437 , 0.8090906 , ..., 0.7814118 , 0.42650765,
+           0.95929915],
+          [0.4335856 , 0.7565437 , 0.7828931 , ..., 0.48119593, 0.66220033,
+           0.6652362 ]], shape=(100, 100), dtype=float32)
+
+Read more about Zarr's storage options in the :ref:`User Guide <user-guide-storage>`.
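+
+Reading data back works the same way for any store. As a minimal sketch (assuming the
+``data/example-1.zarr`` array created at the start of this guide is still on disk),
+:func:`zarr.open_array` accepts the same store strings used above::
+
+   >>> # re-open the persisted array in read-only mode
+   >>> z = zarr.open_array("data/example-1.zarr", mode='r')
+   >>> z.shape
+   (100, 100)
+   >>> z.chunks
+   (10, 10)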
+ +Next Steps +---------- + +Now that you're familiar with the basics, explore the following resources: + +- `User Guide `_ +- `API Reference `_ From f1064a37716fe03624ab3012b0bf117b2e4c7f10 Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Fri, 3 Jan 2025 16:31:07 -0800 Subject: [PATCH 60/87] docs: consolidate developer docs + update contributing page for v3 (#2593) * docs: split tutorial into multiple user guide sections * docs: consolidate developer docs + update contributing page for v3 * second rev on dev docs * point to relevant issues on code coverage and testing * docs: consolidate developer docs + update contributing page for v3 * second rev on dev docs * point to relevant issues on code coverage and testing * fixup --- docs/conf.py | 2 + docs/{ => developers}/contributing.rst | 134 +++++++++---------------- docs/developers/index.rst | 10 ++ docs/{ => developers}/release.rst | 0 docs/{ => developers}/roadmap.rst | 0 docs/index.rst | 11 +- 6 files changed, 68 insertions(+), 89 deletions(-) rename docs/{ => developers}/contributing.rst (77%) create mode 100644 docs/developers/index.rst rename docs/{ => developers}/release.rst (100%) rename docs/{ => developers}/roadmap.rst (100%) diff --git a/docs/conf.py b/docs/conf.py index d336740da2..01b7490298 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -105,6 +105,8 @@ def skip_submodules( "license": "https://github.com/zarr-developers/zarr-python/blob/main/LICENSE.txt", "tutorial": "user-guide", "getting-started": "quickstart", + "release": "developers/release.html", + "roadmap": "developers/roadmap.html", } # The language for content autogenerated by Sphinx. Refer to documentation diff --git a/docs/contributing.rst b/docs/developers/contributing.rst similarity index 77% rename from docs/contributing.rst rename to docs/developers/contributing.rst index 0ead6c8267..7bf37ef1a3 100644 --- a/docs/contributing.rst +++ b/docs/developers/contributing.rst @@ -1,5 +1,5 @@ -Contributing -============ +Contributing to Zarr +==================== Zarr is a community maintained project. We welcome contributions in the form of bug reports, bug fixes, documentation, enhancement proposals and more. This page provides @@ -46,8 +46,7 @@ a bug report: interpreter can be obtained by running a Python interactive session, e.g.:: $ python - Python 3.6.1 (default, Mar 22 2017, 06:17:05) - [GCC 6.3.0 20170321] on linux + Python 3.12.7 | packaged by conda-forge | (main, Oct 4 2024, 15:57:01) [Clang 17.0.6 ] on darwin Enhancement proposals --------------------- @@ -73,7 +72,8 @@ The Zarr source code is hosted on GitHub at the following location: * `https://github.com/zarr-developers/zarr-python `_ You will need your own fork to work on the code. Go to the link above and hit -the "Fork" button. Then clone your fork to your local machine:: +the `"Fork" `_ button. +Then clone your fork to your local machine:: $ git clone git@github.com:your-user-name/zarr-python.git $ cd zarr-python @@ -82,21 +82,21 @@ the "Fork" button. Then clone your fork to your local machine:: Creating a development environment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -To work with the Zarr source code, it is recommended to set up a Python virtual -environment and install all Zarr dependencies using the same versions as are used by -the core developers and continuous integration services. 
Assuming you have a Python -3 interpreter already installed, and you have cloned the Zarr source code and your -current working directory is the root of the repository, you can do something like -the following:: +To work with the Zarr source code, it is recommended to use +`hatch `_ to create and manage development +environments. Hatch will automatically install all Zarr dependencies using the same +versions as are used by the core developers and continuous integration services. +Assuming you have a Python 3 interpreter already installed, and you have cloned the +Zarr source code and your current working directory is the root of the repository, +you can do something like the following:: - $ mkdir -p ~/pyenv/zarr-dev - $ python -m venv ~/pyenv/zarr-dev - $ source ~/pyenv/zarr-dev/bin/activate - $ pip install -e .[test,docs] + $ pip install hatch + $ hatch env show # list all available environments -To verify that your development environment is working, you can run the unit tests:: +To verify that your development environment is working, you can run the unit tests +for one of the test environments, e.g.:: - $ python -m pytest -v tests + $ hatch env run --env test.py3.12-2.1-optional run Creating a branch ~~~~~~~~~~~~~~~~~ @@ -109,9 +109,7 @@ new, separate branch for each piece of work you want to do. E.g.:: git checkout main git fetch upstream - git rebase upstream/main - git push - git checkout -b shiny-new-feature + git checkout -b shiny-new-feature upstream/main git push -u origin shiny-new-feature This changes your working directory to the 'shiny-new-feature' branch. Keep any changes in @@ -129,54 +127,27 @@ merge conflicts, these need to be resolved before submitting a pull request. Alternatively, you can merge the changes in from upstream/main instead of rebasing, which can be simpler:: - git fetch upstream - git merge upstream/main + git pull upstream main Again, any conflicts need to be resolved before submitting a pull request. Running the test suite ~~~~~~~~~~~~~~~~~~~~~~ -Zarr includes a suite of unit tests, as well as doctests included in -function and class docstrings and in the tutorial and storage -spec. The simplest way to run the unit tests is to activate your -development environment (see `creating a development environment`_ above) -and invoke:: - - $ python -m pytest -v zarr - -Some tests require optional dependencies to be installed, otherwise -the tests will be skipped. To install all optional dependencies, run:: - - $ pip install pytest-doctestplus - -To also run the doctests within docstrings (requires optional -dependencies to be installed), run:: - - $ python -m pytest -v --doctest-plus zarr - -To run the doctests within the tutorial and storage spec (requires -optional dependencies to be installed), run:: - - $ python -m doctest -o NORMALIZE_WHITESPACE -o ELLIPSIS docs/tutorial.rst docs/spec/v2.rst - -Note that some tests also require storage services to be running -locally. To run the Azure Blob Service storage tests, run an Azure -storage emulator (e.g., azurite) and set the environment variable -``ZARR_TEST_ABS=1``. If you're using Docker to run azurite, start the service with:: +Zarr includes a suite of unit tests. 
The simplest way to run the unit tests +is to activate your development environment +(see `creating a development environment`_ above) and invoke:: - docker run --rm -p 10000:10000 mcr.microsoft.com/azure-storage/azurite azurite-blob --loose --blobHost 0.0.0.0 - -To run the Mongo DB storage tests, run a Mongo -server locally and set the environment variable ``ZARR_TEST_MONGO=1``. -To run the Redis storage tests, run a Redis server locally on port -6379 and set the environment variable ``ZARR_TEST_REDIS=1``. + $ hatch env run --env test.py3.12-2.1-optional run All tests are automatically run via GitHub Actions for every pull request and must pass before code can be accepted. Test coverage is -also collected automatically via the Codecov service, and total -coverage over all builds must be 100% (although individual builds -may be lower due to Python 2/3 or other differences). +also collected automatically via the Codecov service. + +.. note:: + Previous versions of Zarr-Python made extensive use of doctests. These tests were + not maintained during the 3.0 refactor but may be brought back in the future. + See :issue:`2614` for more details. Code standards - using pre-commit ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -205,15 +176,17 @@ If you would like to skip the failing checks and push the code for further discu the ``--no-verify`` option with ``git commit``. - Test coverage ~~~~~~~~~~~~~ -Zarr maintains 100% test coverage under the latest Python stable release (currently -Python 3.8). Both unit tests and docstring doctests are included when computing -coverage. Running:: +.. note:: + Test coverage for Zarr-Python 3 is currently not at 100%. This is a known issue and help + is welcome to bring test coverage back to 100%. See :issue:`2613` for more details. + +Zarr strives to maintain 100% test coverage under the latest Python stable release +Both unit tests and docstring doctests are included when computing coverage. Running:: - $ python -m pytest -v --cov=zarr --cov-config=pyproject.toml zarr + $ hatch env run --env test.py3.12-2.1-optional run-coverage will automatically run the test suite with coverage and produce a coverage report. This should be 100% before code can be accepted into the main code base. @@ -229,28 +202,28 @@ Docstrings for user-facing classes and functions should follow the `numpydoc `_ standard, including sections for Parameters and Examples. All examples -should run and pass as doctests under Python 3.8. To run doctests, -activate your development environment, install optional requirements, -and run:: - - $ python -m pytest -v --doctest-plus tests +should run and pass as doctests under Python 3.11. Zarr uses Sphinx for documentation, hosted on readthedocs.org. Documentation is written in the RestructuredText markup language (.rst files) in the ``docs`` folder. The documentation consists both of prose and API documentation. All user-facing classes -and functions should be included in the API documentation, under the ``docs/api`` -folder. Any new features or important usage information should be included in the -tutorial (``docs/tutorial.rst``). Any changes should also be included in the release -notes (``docs/release.rst``). +and functions are included in the API documentation, under the ``docs/api`` folder +using the `autodoc `_ +extension to sphinx. Any new features or important usage information should be included in the +user-guide (``docs/user-guide``). Any changes should also be included in the release +notes (``docs/developers/release.rst``). 
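+
+For illustration, a hypothetical function documented in the numpydoc style (the
+function, its parameters, and its docstring are invented for this sketch) might
+look like::
+
+    def clip(value: float, lo: float, hi: float) -> float:
+        """Clamp ``value`` to the closed interval ``[lo, hi]``.
+
+        Parameters
+        ----------
+        value : float
+            The number to clamp.
+        lo, hi : float
+            Inclusive lower and upper bounds.
+
+        Examples
+        --------
+        >>> clip(5.0, 0.0, 1.0)
+        1.0
+        """
+        return max(lo, min(hi, value))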
The documentation can be built locally by running::
 
-    $ cd docs
-    $ make clean; make html
-    $ open _build/html/index.html
+    $ hatch --env docs run build
 
 The resulting built documentation will be available in the ``docs/_build/html`` folder.
 
+Hatch can also be used to serve a continuously updating version of the documentation
+during development at `http://0.0.0.0:8000/ <http://0.0.0.0:8000/>`_. This can be done by running::
+
+    $ hatch --env docs run serve
+
 Development best practices, policies and procedures
 ---------------------------------------------------
 
@@ -329,14 +302,7 @@ implements storage spec version 3, then the next library release should have ver
 number 3.0.0. Note however that the major version number of the Zarr library may
 not always correspond to the spec version number. For example, Zarr versions 2.x, 3.x,
 and 4.x might all implement the same version of the storage spec and thus maintain data
-format compatibility, although they will not maintain API compatibility. The version number
-of the storage specification that is currently implemented is stored under the
-``zarr.meta.ZARR_FORMAT`` variable.
-
-Note that the Zarr test suite includes a data fixture and tests to try and ensure that
-data format compatibility is not accidentally broken. See the
-:func:`test_format_compatibility` function in the :mod:`tests.test_storage` module
-for details.
+format compatibility, although they will not maintain API compatibility.
 
 When to make a release
 ~~~~~~~~~~~~~~~~~~~~~~
@@ -358,7 +324,7 @@ Release procedure
 
 .. note::
 
-   Most of the release process is now handled by github workflow which should
+   Most of the release process is now handled by a GitHub workflow which should
    automatically push a release to PyPI if a tag is pushed.
 
 Before releasing, make sure that all pull requests which will be
diff --git a/docs/developers/index.rst b/docs/developers/index.rst
new file mode 100644
index 0000000000..3feb0aff71
--- /dev/null
+++ b/docs/developers/index.rst
@@ -0,0 +1,10 @@
+
+Developer's Guide
+-----------------
+
+.. toctree::
+   :maxdepth: 1
+
+   contributing
+   release
+   roadmap
diff --git a/docs/release.rst b/docs/developers/release.rst
similarity index 100%
rename from docs/release.rst
rename to docs/developers/release.rst
diff --git a/docs/roadmap.rst b/docs/developers/roadmap.rst
similarity index 100%
rename from docs/roadmap.rst
rename to docs/developers/roadmap.rst
diff --git a/docs/index.rst b/docs/index.rst
index 5bbd04ec60..29baf4b94a 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -12,9 +12,8 @@ Zarr-Python
    installation
    user-guide/index
    api/index
-   release
-   contributing
-   roadmap
+   developers/index
+   developers/release
    about
 
 **Version**: |version|
@@ -93,11 +92,13 @@ Zarr is a file storage format for chunked, compressed, N-dimensional arrays base
     Contributor's Guide
     ^^^^^^^^^^^^^^^^^^^
 
-    Want to contribute to Zarr? We welcome contributions in the form of bug reports, bug fixes, documentation, enhancement proposals and more. The contributing guidelines will guide you through the process of improving Zarr.
+    Want to contribute to Zarr? We welcome contributions in the form of bug reports,
+    bug fixes, documentation, enhancement proposals and more. The contributing guidelines
+    will guide you through the process of improving Zarr.
 
     +++
 
-    .. button-ref:: contributing
+    .. 
button-ref:: developers/contributing :expand: :color: dark :click-parent: From 584d66dd4c2c8ccb5788aa6f9a27bfa76e3166bd Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Fri, 3 Jan 2025 17:18:27 -0800 Subject: [PATCH 61/87] docs: update doc homepage (#2594) * docs: split tutorial into multiple user guide sections * docs: replace getting_started page with quickstart * docs: update installation docs * docs: update doc homepage * fixups * fix ref to contributing guide * fixup --- docs/api/index.rst | 2 +- docs/conf.py | 1 + docs/developers/contributing.rst | 2 ++ docs/index.rst | 19 +++++++---- docs/installation.rst | 37 ---------------------- docs/user-guide/index.rst | 4 +-- docs/user-guide/installation.rst | 54 ++++++++++++++++++++++++++++++++ 7 files changed, 72 insertions(+), 47 deletions(-) delete mode 100644 docs/installation.rst create mode 100644 docs/user-guide/installation.rst diff --git a/docs/api/index.rst b/docs/api/index.rst index 8735180cd9..26d7ce0224 100644 --- a/docs/api/index.rst +++ b/docs/api/index.rst @@ -1,4 +1,4 @@ -API Reference +API reference ============= .. toctree:: diff --git a/docs/conf.py b/docs/conf.py index 01b7490298..3389c16549 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -107,6 +107,7 @@ def skip_submodules( "getting-started": "quickstart", "release": "developers/release.html", "roadmap": "developers/roadmap.html", + "installation": "user-guide/installation.html", } # The language for content autogenerated by Sphinx. Refer to documentation diff --git a/docs/developers/contributing.rst b/docs/developers/contributing.rst index 7bf37ef1a3..4358230eff 100644 --- a/docs/developers/contributing.rst +++ b/docs/developers/contributing.rst @@ -1,3 +1,5 @@ +.. _dev-guide-contributing: + Contributing to Zarr ==================== diff --git a/docs/index.rst b/docs/index.rst index ffe20e262c..29baf4b94a 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -9,7 +9,6 @@ Zarr-Python :hidden: quickstart - installation user-guide/index api/index developers/index @@ -18,16 +17,19 @@ Zarr-Python **Version**: |version| -**Download documentation**: `PDF/Zipped HTML `_ - **Useful links**: -`Installation `_ | `Source Repository `_ | `Issue Tracker `_ | `Zulip Chat `_ | `Zarr specifications `_ -Zarr is a file storage format for chunked, compressed, N-dimensional arrays based on an open-source specification. +Zarr-Python is a Python library for reading and writing Zarr groups and arrays. Highlights include: + +* Specification support for both Zarr v2 and v3. +* Create and read from N-dimensional arrays using NumPy-like semantics. +* Flexible storage enables reading and writing from local, cloud and in-memory stores. +* High performance: Enables fast I/O with support for asynchronous I/O and multi-threading. +* Extensible: Customizable with user-defined codecs and stores. .. grid:: 2 @@ -84,7 +86,7 @@ Zarr is a file storage format for chunked, compressed, N-dimensional arrays base :color: dark :click-parent: - To the api reference guide + To the API reference guide .. grid-item-card:: :img-top: _static/index_contribute.svg @@ -104,3 +106,8 @@ Zarr is a file storage format for chunked, compressed, N-dimensional arrays base :click-parent: To the contributor's guide + + +**Download documentation**: `PDF/Zipped HTML `_ + +.. _NumCodecs: https://numcodecs.readthedocs.io diff --git a/docs/installation.rst b/docs/installation.rst deleted file mode 100644 index b39b54b250..0000000000 --- a/docs/installation.rst +++ /dev/null @@ -1,37 +0,0 @@ -Installation -============ - -pip ---- - -.. 
code-block:: console - - $ pip install zarr - -There are a number of optional dependency groups you can install for extra functionality. -These can be installed using ``pip install "zarr[]"``, e.g. ``pip install "zarr[gpu]"`` - -- ``gpu``: support for GPUs -- ``fsspec``: support for reading/writing to remote data stores -- ``tree``: support for pretty printing of directory trees - -conda ------ - -.. code-block:: console - - $ conda install -c conda-forge zarr - -Conda does not support optional dependencies, so you will have to manually install any packages -needed to enable extra functionality. - -Dependency support ------------------- -Zarr has endorsed `Scientific-Python SPEC 0 `_ and now follows the version support window as outlined below: - -- Python: 36 months after initial release -- Core package dependencies (e.g. NumPy): 24 months after initial release - -Development ------------ -To install the latest development version of Zarr, see `the contributing guide `_. diff --git a/docs/user-guide/index.rst b/docs/user-guide/index.rst index a9d1c9fa29..a7bbd12453 100644 --- a/docs/user-guide/index.rst +++ b/docs/user-guide/index.rst @@ -6,6 +6,7 @@ User guide .. toctree:: :maxdepth: 1 + installation arrays groups attributes @@ -13,9 +14,6 @@ User guide config v3_migration -.. Coming soon - installation - Advanced Topics --------------- diff --git a/docs/user-guide/installation.rst b/docs/user-guide/installation.rst new file mode 100644 index 0000000000..a79f0763cb --- /dev/null +++ b/docs/user-guide/installation.rst @@ -0,0 +1,54 @@ +Installation +============ + +Required dependencies +--------------------- + +Required dependencies include: + +- `Python `_ (3.11 or later) +- `packaging `_ (22.0 or later) +- `numpy `_ (1.25 or later) +- `numcodecs[crc32c] `_ (0.14 or later) +- `typing_extensions `_ (4.9 or later) +- `donfig `_ (0.8 or later) + +pip +--- + +Zarr is available on `PyPI `_. Install it using ``pip``: + +.. code-block:: console + + $ pip install zarr + +There are a number of optional dependency groups you can install for extra functionality. +These can be installed using ``pip install "zarr[]"``, e.g. ``pip install "zarr[gpu]"`` + +- ``gpu``: support for GPUs +- ``remote``: support for reading/writing to remote data stores + +Additional optional dependencies include ``rich``, ``universal_pathlib``. These must be installed separately. + +conda +----- + +Zarr is also published to `conda-forge `_. Install it using ``conda``: + +.. code-block:: console + + $ conda install -c conda-forge zarr + +Conda does not support optional dependencies, so you will have to manually install any packages +needed to enable extra functionality. + +Dependency support +------------------ +Zarr has endorsed `Scientific-Python SPEC 0 `_ and now follows the version support window as outlined below: + +- Python: 36 months after initial release +- Core package dependencies (e.g. NumPy): 24 months after initial release + +Development +----------- +To install the latest development version of Zarr, see the :ref:`contributing guide `. 
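+
+To verify an installation, check the package version from Python. A minimal
+sketch (the printed version will vary with the release installed):
+
+.. code-block:: console
+
+    $ python -c "import zarr; print(zarr.__version__)"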
From 3194534fb79853ecdae9b0040553d1badf4413ac Mon Sep 17 00:00:00 2001 From: Juan Nunez-Iglesias Date: Sun, 5 Jan 2025 07:02:40 +1100 Subject: [PATCH 62/87] Fix create_dataset with data kwarg (#2638) * Add failing test for #2631 * Fix create_dataset with data argument --- src/zarr/core/group.py | 11 ++++++++++- tests/test_group.py | 12 ++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 7a0d2efc09..5cb42db5b4 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -1165,7 +1165,16 @@ async def create_dataset( .. deprecated:: 3.0.0 The h5py compatibility methods will be removed in 3.1.0. Use `AsyncGroup.create_array` instead. """ - return await self.create_array(name, shape=shape, **kwargs) + data = kwargs.pop("data", None) + # create_dataset in zarr 2.x requires shape but not dtype if data is + # provided. Allow this configuration by inferring dtype from data if + # necessary and passing it to create_array + if "dtype" not in kwargs and data is not None: + kwargs["dtype"] = data.dtype + array = await self.create_array(name, shape=shape, **kwargs) + if data is not None: + await array.setitem(slice(None), data) + return array @deprecated("Use AsyncGroup.require_array instead.") async def require_dataset( diff --git a/tests/test_group.py b/tests/test_group.py index a4ce04e822..19a9f9c9bb 100644 --- a/tests/test_group.py +++ b/tests/test_group.py @@ -1137,6 +1137,18 @@ async def test_require_groups(store: LocalStore | MemoryStore, zarr_format: Zarr assert no_group == () +def test_create_dataset_with_data(store: Store, zarr_format: ZarrFormat) -> None: + """Check that deprecated create_dataset method allows input data. + + See https://github.com/zarr-developers/zarr-python/issues/2631. + """ + root = Group.from_store(store=store, zarr_format=zarr_format) + arr = np.random.random((5, 5)) + with pytest.warns(DeprecationWarning): + data = root.create_dataset("random", data=arr, shape=arr.shape) + np.testing.assert_array_equal(np.asarray(data), arr) + + async def test_create_dataset(store: Store, zarr_format: ZarrFormat) -> None: root = await AsyncGroup.from_store(store=store, zarr_format=zarr_format) with pytest.warns(DeprecationWarning): From 0adcbe728d5d286c160974717bddc6e4fb881e5a Mon Sep 17 00:00:00 2001 From: rtobar Date: Sun, 5 Jan 2025 06:34:57 +0800 Subject: [PATCH 63/87] Align lines for improved rendering (#2648) These are currently misaligned in the source, which is probably why it's funnily rendered. --- docs/user-guide/v3_migration.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/user-guide/v3_migration.rst b/docs/user-guide/v3_migration.rst index fffe50e5e1..974266aac7 100644 --- a/docs/user-guide/v3_migration.rst +++ b/docs/user-guide/v3_migration.rst @@ -156,11 +156,11 @@ Dependencies When installing using ``pip``: - The new ``remote`` dependency group can be used to install a supported version of - ``fsspec``, required for remote data access. + ``fsspec``, required for remote data access. - The new ``gpu`` dependency group can be used to install a supported version of - ``cuda``, required for GPU functionality. + ``cuda``, required for GPU functionality. - The ``jupyter`` optional dependency group has been removed, since v3 contains no - jupyter specific functionality. + jupyter specific functionality. 
Miscellaneous ~~~~~~~~~~~~~ From df1880535c5e46d2e70585e7a8c091837a0f2296 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Sun, 5 Jan 2025 11:44:35 +0100 Subject: [PATCH 64/87] Adds documentation for sharding and sharding in Array.info (#2644) --- docs/user-guide/arrays.rst | 37 +++++++- docs/user-guide/performance.rst | 39 +++++++++ src/zarr/core/_info.py | 9 +- src/zarr/core/array.py | 10 +-- src/zarr/core/codec_pipeline.py | 21 +++-- tests/test_array.py | 136 +++++++++++++++++++---------- tests/test_codecs/test_sharding.py | 29 ++++++ 7 files changed, 220 insertions(+), 61 deletions(-) diff --git a/docs/user-guide/arrays.rst b/docs/user-guide/arrays.rst index b21f8e976c..110e12c3be 100644 --- a/docs/user-guide/arrays.rst +++ b/docs/user-guide/arrays.rst @@ -574,8 +574,41 @@ Any combination of integer and slice can be used for block indexing:: Sharding -------- -Coming soon. - +Using small chunk shapes in very large arrays can lead to a very large number of chunks. +This can become a performance issue for file systems and object storage. +With Zarr format 3, a new sharding feature has been added to address this issue. + +With sharding, multiple chunks can be stored in a single storage object (e.g. a file). +Within a shard, chunks are compressed and serialized separately. +This allows individual chunks to be read independently. +However, when writing data, a full shard must be written in one go for optimal +performance and to avoid concurrency issues. +That means that shards are the units of writing and chunks are the units of reading. +Users need to configure the chunk and shard shapes accordingly. + +Sharded arrays can be created by providing the ``shards`` parameter to :func:`zarr.create_array`. + + >>> a = zarr.create_array('data/example-20.zarr', shape=(10000, 10000), shards=(1000, 1000), chunks=(100, 100), dtype='uint8') + >>> a[:] = (np.arange(10000 * 10000) % 256).astype('uint8').reshape(10000, 10000) + >>> a.info_complete() + Type : Array + Zarr format : 3 + Data type : DataType.uint8 + Shape : (10000, 10000) + Shard shape : (1000, 1000) + Chunk shape : (100, 100) + Order : C + Read-only : False + Store type : LocalStore + Codecs : [{'chunk_shape': (100, 100), 'codecs': ({'endian': }, {'level': 0, 'checksum': False}), 'index_codecs': ({'endian': }, {}), 'index_location': }] + No. bytes : 100000000 (95.4M) + No. bytes stored : 3981060 + Storage ratio : 25.1 + Chunks Initialized : 100 + +In this example a shard shape of (1000, 1000) and a chunk shape of (100, 100) is used. +This means that 10*10 chunks are stored in each shard, and there are 10*10 shards in total. +Without the ``shards`` argument, there would be 10,000 chunks stored as individual files. Missing features in 3.0 ----------------------- diff --git a/docs/user-guide/performance.rst b/docs/user-guide/performance.rst index d2881fe536..f56b642fb1 100644 --- a/docs/user-guide/performance.rst +++ b/docs/user-guide/performance.rst @@ -62,6 +62,45 @@ will be one single chunk for the array:: >>> z5.chunks (10000, 10000) + +Sharding +~~~~~~~~ + +If you have large arrays but need small chunks to efficiently access the data, you can +use sharding. Sharding provides a mechanism to store multiple chunks in a single +storage object or file. This can be useful because traditional file systems and object +storage systems may have performance issues storing and accessing many files. +Additionally, small files can be inefficient to store if they are smaller than the +block size of the file system. 
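+
+As a minimal sketch (the in-memory store and the shapes here are illustrative,
+not tuned recommendations), an array can be written shard-at-a-time and read
+back chunk-at-a-time::
+
+    >>> import zarr
+    >>> z = zarr.create_array(store={}, shape=(10000, 10000), shards=(1000, 1000), chunks=(100, 100), dtype='uint8')
+    >>> z[0:1000, 0:1000] = 1    # write one full shard in a single operation
+    >>> z[0:100, 0:100].shape    # reading decodes only the chunks selected
+    (100, 100)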
+ +Picking a good combination of chunk shape and shard shape is important for performance. +The chunk shape determines what unit of your data can be read independently, while the +shard shape determines what unit of your data can be written efficiently. + +For an example, consider you have a 100 GB array and need to read small chunks of 1 MB. +Without sharding, each chunk would be one file resulting in 100,000 files. That can +already cause performance issues on some file systems. +With sharding, you could use a shard size of 1 GB. This would result in 1000 chunks per +file and 100 files in total, which seems manageable for most storage systems. +You would still be able to read each 1 MB chunk independently, but you would need to +write your data in 1 GB increments. + +To use sharding, you need to specify the ``shards`` parameter when creating the array. + + >>> z6 = zarr.create_array(store={}, shape=(10000, 10000, 1000), shards=(1000, 1000, 1000), chunks=(100, 100, 100), dtype='uint8') + >>> z6.info + Type : Array + Zarr format : 3 + Data type : DataType.uint8 + Shape : (10000, 10000, 1000) + Shard shape : (1000, 1000, 1000) + Chunk shape : (100, 100, 100) + Order : C + Read-only : False + Store type : MemoryStore + Codecs : [{'chunk_shape': (100, 100, 100), 'codecs': ({'endian': }, {'level': 0, 'checksum': False}), 'index_codecs': ({'endian': }, {}), 'index_location': }] + No. bytes : 100000000000 (93.1G) + .. _user-guide-chunks-order: Chunk memory layout diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py index 12bcc02e96..807e940508 100644 --- a/src/zarr/core/_info.py +++ b/src/zarr/core/_info.py @@ -80,6 +80,7 @@ class ArrayInfo: _zarr_format: ZarrFormat _data_type: np.dtype[Any] | DataType _shape: tuple[int, ...] + _shard_shape: tuple[int, ...] | None = None _chunk_shape: tuple[int, ...] | None = None _order: Literal["C", "F"] _read_only: bool @@ -96,7 +97,13 @@ def __repr__(self) -> str: Type : {_type} Zarr format : {_zarr_format} Data type : {_data_type} - Shape : {_shape} + Shape : {_shape}""") + + if self._shard_shape is not None: + template += textwrap.dedent(""" + Shard shape : {_shard_shape}""") + + template += textwrap.dedent(""" Chunk shape : {_chunk_shape} Order : {_order} Read-only : {_read_only} diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 6d8aca20ec..20e7f729aa 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -1573,14 +1573,8 @@ def _info( else: kwargs["_codecs"] = self.metadata.codecs kwargs["_data_type"] = self.metadata.data_type - # just regular? - chunk_grid = self.metadata.chunk_grid - if isinstance(chunk_grid, RegularChunkGrid): - kwargs["_chunk_shape"] = chunk_grid.chunk_shape - else: - raise NotImplementedError( - "'info' is not yet implemented for chunk grids of type {type(self.metadata.chunk_grid)}" - ) + kwargs["_chunk_shape"] = self.chunks + kwargs["_shard_shape"] = self.shards return ArrayInfo( _zarr_format=self.metadata.zarr_format, diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 5a1f069823..583ca01c5e 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -332,12 +332,21 @@ async def write_batch( drop_axes: tuple[int, ...] 
= (), ) -> None: if self.supports_partial_encode: - await self.encode_partial_batch( - [ - (byte_setter, value[out_selection], chunk_selection, chunk_spec) - for byte_setter, chunk_spec, chunk_selection, out_selection in batch_info - ], - ) + # Pass scalar values as is + if len(value.shape) == 0: + await self.encode_partial_batch( + [ + (byte_setter, value, chunk_selection, chunk_spec) + for byte_setter, chunk_spec, chunk_selection, out_selection in batch_info + ], + ) + else: + await self.encode_partial_batch( + [ + (byte_setter, value[out_selection], chunk_selection, chunk_spec) + for byte_setter, chunk_spec, chunk_selection, out_selection in batch_info + ], + ) else: # Read existing bytes if not total slice diff --git a/tests/test_array.py b/tests/test_array.py index 51ad289e80..628b873e72 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -20,6 +20,7 @@ VLenUTF8Codec, ZstdCodec, ) +from zarr.codecs.sharding import ShardingCodec from zarr.core._info import ArrayInfo from zarr.core.array import ( CompressorsLike, @@ -478,121 +479,168 @@ def test_update_attrs(zarr_format: ZarrFormat) -> None: assert arr2.attrs["foo"] == "bar" +@pytest.mark.parametrize(("chunks", "shards"), [((2, 2), None), ((2, 2), (4, 4))]) class TestInfo: - def test_info_v2(self) -> None: - arr = zarr.create(shape=(4, 4), chunks=(2, 2), zarr_format=2) + def test_info_v2(self, chunks: tuple[int, int], shards: tuple[int, int] | None) -> None: + arr = zarr.create_array(store={}, shape=(8, 8), dtype="f8", chunks=chunks, zarr_format=2) result = arr.info expected = ArrayInfo( _zarr_format=2, _data_type=np.dtype("float64"), - _shape=(4, 4), - _chunk_shape=(2, 2), + _shape=(8, 8), + _chunk_shape=chunks, + _shard_shape=None, _order="C", _read_only=False, _store_type="MemoryStore", - _count_bytes=128, + _count_bytes=512, _compressor=numcodecs.Zstd(), ) assert result == expected - def test_info_v3(self) -> None: - arr = zarr.create(shape=(4, 4), chunks=(2, 2), zarr_format=3) + def test_info_v3(self, chunks: tuple[int, int], shards: tuple[int, int] | None) -> None: + arr = zarr.create_array(store={}, shape=(8, 8), dtype="f8", chunks=chunks, shards=shards) result = arr.info expected = ArrayInfo( _zarr_format=3, _data_type=DataType.parse("float64"), - _shape=(4, 4), - _chunk_shape=(2, 2), + _shape=(8, 8), + _chunk_shape=chunks, + _shard_shape=shards, _order="C", _read_only=False, _store_type="MemoryStore", - _codecs=[BytesCodec(), ZstdCodec()], - _count_bytes=128, + _codecs=[BytesCodec(), ZstdCodec()] + if shards is None + else [ShardingCodec(chunk_shape=chunks, codecs=[BytesCodec(), ZstdCodec()])], + _count_bytes=512, ) assert result == expected - def test_info_complete(self) -> None: - arr = zarr.create(shape=(4, 4), chunks=(2, 2), zarr_format=3, codecs=[BytesCodec()]) + def test_info_complete(self, chunks: tuple[int, int], shards: tuple[int, int] | None) -> None: + arr = zarr.create_array( + store={}, + shape=(8, 8), + dtype="f8", + chunks=chunks, + shards=shards, + compressors=(), + ) result = arr.info_complete() expected = ArrayInfo( _zarr_format=3, _data_type=DataType.parse("float64"), - _shape=(4, 4), - _chunk_shape=(2, 2), + _shape=(8, 8), + _chunk_shape=chunks, + _shard_shape=shards, _order="C", _read_only=False, _store_type="MemoryStore", - _codecs=[BytesCodec()], - _count_bytes=128, + _codecs=[BytesCodec()] if shards is None else [ShardingCodec(chunk_shape=chunks)], + _count_bytes=512, _count_chunks_initialized=0, - _count_bytes_stored=373, # the metadata? 
+ _count_bytes_stored=373 if shards is None else 578, # the metadata? ) assert result == expected - arr[:2, :2] = 10 + arr[:4, :4] = 10 result = arr.info_complete() - expected = dataclasses.replace( - expected, _count_chunks_initialized=1, _count_bytes_stored=405 - ) + if shards is None: + expected = dataclasses.replace( + expected, _count_chunks_initialized=4, _count_bytes_stored=501 + ) + else: + expected = dataclasses.replace( + expected, _count_chunks_initialized=1, _count_bytes_stored=774 + ) assert result == expected - async def test_info_v2_async(self) -> None: - arr = await zarr.api.asynchronous.create(shape=(4, 4), chunks=(2, 2), zarr_format=2) + async def test_info_v2_async( + self, chunks: tuple[int, int], shards: tuple[int, int] | None + ) -> None: + arr = await zarr.api.asynchronous.create_array( + store={}, shape=(8, 8), dtype="f8", chunks=chunks, zarr_format=2 + ) result = arr.info expected = ArrayInfo( _zarr_format=2, _data_type=np.dtype("float64"), - _shape=(4, 4), + _shape=(8, 8), _chunk_shape=(2, 2), + _shard_shape=None, _order="C", _read_only=False, _store_type="MemoryStore", - _count_bytes=128, + _count_bytes=512, _compressor=numcodecs.Zstd(), ) assert result == expected - async def test_info_v3_async(self) -> None: - arr = await zarr.api.asynchronous.create(shape=(4, 4), chunks=(2, 2), zarr_format=3) + async def test_info_v3_async( + self, chunks: tuple[int, int], shards: tuple[int, int] | None + ) -> None: + arr = await zarr.api.asynchronous.create_array( + store={}, + shape=(8, 8), + dtype="f8", + chunks=chunks, + shards=shards, + ) result = arr.info expected = ArrayInfo( _zarr_format=3, _data_type=DataType.parse("float64"), - _shape=(4, 4), - _chunk_shape=(2, 2), + _shape=(8, 8), + _chunk_shape=chunks, + _shard_shape=shards, _order="C", _read_only=False, _store_type="MemoryStore", - _codecs=[BytesCodec(), ZstdCodec()], - _count_bytes=128, + _codecs=[BytesCodec(), ZstdCodec()] + if shards is None + else [ShardingCodec(chunk_shape=chunks, codecs=[BytesCodec(), ZstdCodec()])], + _count_bytes=512, ) assert result == expected - async def test_info_complete_async(self) -> None: - arr = await zarr.api.asynchronous.create( - shape=(4, 4), chunks=(2, 2), zarr_format=3, codecs=[BytesCodec()] + async def test_info_complete_async( + self, chunks: tuple[int, int], shards: tuple[int, int] | None + ) -> None: + arr = await zarr.api.asynchronous.create_array( + store={}, + dtype="f8", + shape=(8, 8), + chunks=chunks, + shards=shards, + compressors=None, ) result = await arr.info_complete() expected = ArrayInfo( _zarr_format=3, _data_type=DataType.parse("float64"), - _shape=(4, 4), - _chunk_shape=(2, 2), + _shape=(8, 8), + _chunk_shape=chunks, + _shard_shape=shards, _order="C", _read_only=False, _store_type="MemoryStore", - _codecs=[BytesCodec()], - _count_bytes=128, + _codecs=[BytesCodec()] if shards is None else [ShardingCodec(chunk_shape=chunks)], + _count_bytes=512, _count_chunks_initialized=0, - _count_bytes_stored=373, # the metadata? + _count_bytes_stored=373 if shards is None else 578, # the metadata? 
) assert result == expected - await arr.setitem((slice(2), slice(2)), 10) + await arr.setitem((slice(4), slice(4)), 10) result = await arr.info_complete() - expected = dataclasses.replace( - expected, _count_chunks_initialized=1, _count_bytes_stored=405 - ) + if shards is None: + expected = dataclasses.replace( + expected, _count_chunks_initialized=4, _count_bytes_stored=501 + ) + else: + expected = dataclasses.replace( + expected, _count_chunks_initialized=1, _count_bytes_stored=774 + ) assert result == expected diff --git a/tests/test_codecs/test_sharding.py b/tests/test_codecs/test_sharding.py index 484cfa4eda..2ba57d7a39 100644 --- a/tests/test_codecs/test_sharding.py +++ b/tests/test_codecs/test_sharding.py @@ -70,6 +70,35 @@ def test_sharding( assert np.array_equal(data, read_data) +@pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"]) +@pytest.mark.parametrize("index_location", ["start", "end"]) +@pytest.mark.parametrize("offset", [0, 10]) +def test_sharding_scalar( + store: Store, + index_location: ShardingCodecIndexLocation, + offset: int, +) -> None: + """ + Test that we can create an array with a sharding codec, write data to that array, and get + the same data out via indexing. + """ + spath = StorePath(store) + + arr = zarr.create_array( + spath, + shape=(128, 128), + chunks=(32, 32), + shards={"shape": (64, 64), "index_location": index_location}, + dtype="uint8", + fill_value=6, + filters=[TransposeCodec(order=order_from_dim("F", 2))], + compressors=BloscCodec(cname="lz4"), + ) + arr[:16, :16] = 10 # intentionally write partial chunks + read_data = arr[:16, :16] + np.testing.assert_array_equal(read_data, 10) + + @pytest.mark.parametrize("index_location", ["start", "end"]) @pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"]) @pytest.mark.parametrize( From 617e2cd84b28c4ca28b8b221abd77718561556bb Mon Sep 17 00:00:00 2001 From: David Stansby Date: Sun, 5 Jan 2025 10:58:14 +0000 Subject: [PATCH 65/87] Move deprecation notices to the top of docstrings (#2637) * Move deprecation notices to the top of docstrings * Turn off GL09 --- pyproject.toml | 3 ++- src/zarr/api/asynchronous.py | 8 ++++---- src/zarr/api/synchronous.py | 8 ++++---- src/zarr/core/array.py | 12 ++++++------ src/zarr/core/group.py | 28 ++++++++++++++++------------ 5 files changed, 32 insertions(+), 27 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index aaedc09736..0fa0e7b6b4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -397,7 +397,8 @@ ignore = [ checks = [ "GL06", "GL07", - "GL09", + # Currently broken; see https://github.com/numpy/numpydoc/issues/573 + # "GL09", "GL10", "SS02", "SS04", diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index f54a824088..f42b6d3f51 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -508,6 +508,10 @@ async def save_group( async def tree(grp: AsyncGroup, expand: bool | None = None, level: int | None = None) -> Any: """Provide a rich display of the hierarchy. + .. deprecated:: 3.0.0 + `zarr.tree()` is deprecated and will be removed in a future release. + Use `group.tree()` instead. + Parameters ---------- grp : Group @@ -521,10 +525,6 @@ async def tree(grp: AsyncGroup, expand: bool | None = None, level: int | None = ------- TreeRepr A pretty-printable object displaying the hierarchy. - - .. deprecated:: 3.0.0 - `zarr.tree()` is deprecated and will be removed in a future release. - Use `group.tree()` instead. 
""" return await grp.tree(expand=expand, level=level) diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index e4a842ef8f..200db9ec26 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -334,6 +334,10 @@ def save_group( def tree(grp: Group, expand: bool | None = None, level: int | None = None) -> Any: """Provide a rich display of the hierarchy. + .. deprecated:: 3.0.0 + `zarr.tree()` is deprecated and will be removed in a future release. + Use `group.tree()` instead. + Parameters ---------- grp : Group @@ -347,10 +351,6 @@ def tree(grp: Group, expand: bool | None = None, level: int | None = None) -> An ------- TreeRepr A pretty-printable object displaying the hierarchy. - - .. deprecated:: 3.0.0 - `zarr.tree()` is deprecated and will be removed in a future release. - Use `group.tree()` instead. """ return sync(async_api.tree(grp._async_group, expand=expand, level=level)) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 20e7f729aa..e5c4e4538c 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -432,6 +432,9 @@ async def create( ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: """Method to create a new asynchronous array instance. + .. deprecated:: 3.0.0 + Deprecated in favor of :func:`zarr.api.asynchronous.create_array`. + Parameters ---------- store : StoreLike @@ -509,9 +512,6 @@ async def create( ------- AsyncArray The created asynchronous array instance. - - .. deprecated:: 3.0.0 - Deprecated in favor of :func:`zarr.api.asynchronous.create_array`. """ return await cls._create( store, @@ -1631,6 +1631,9 @@ def create( ) -> Array: """Creates a new Array instance from an initialized store. + .. deprecated:: 3.0.0 + Deprecated in favor of :func:`zarr.create_array`. + Parameters ---------- store : StoreLike @@ -1698,9 +1701,6 @@ def create( ------- Array Array created from the store. - - .. deprecated:: 3.0.0 - Deprecated in favor of :func:`zarr.create_array`. """ return cls._create( store, diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 5cb42db5b4..a4503ce64e 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -1148,6 +1148,9 @@ async def create_dataset( ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: """Create an array. + .. deprecated:: 3.0.0 + The h5py compatibility methods will be removed in 3.1.0. Use `AsyncGroup.create_array` instead. + Arrays are known as "datasets" in HDF5 terminology. For compatibility with h5py, Zarr groups also implement the :func:`zarr.AsyncGroup.require_dataset` method. @@ -1161,9 +1164,6 @@ async def create_dataset( Returns ------- a : AsyncArray - - .. deprecated:: 3.0.0 - The h5py compatibility methods will be removed in 3.1.0. Use `AsyncGroup.create_array` instead. """ data = kwargs.pop("data", None) # create_dataset in zarr 2.x requires shape but not dtype if data is @@ -1188,6 +1188,9 @@ async def require_dataset( ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: """Obtain an array, creating if it doesn't exist. + .. deprecated:: 3.0.0 + The h5py compatibility methods will be removed in 3.1.0. Use `AsyncGroup.require_dataset` instead. + Arrays are known as "datasets" in HDF5 terminology. For compatibility with h5py, Zarr groups also implement the :func:`zarr.AsyncGroup.create_dataset` method. @@ -1208,9 +1211,6 @@ async def require_dataset( Returns ------- a : AsyncArray - - .. deprecated:: 3.0.0 - The h5py compatibility methods will be removed in 3.1.0. Use `AsyncGroup.require_dataset` instead. 
""" return await self.require_array(name, shape=shape, dtype=dtype, exact=exact, **kwargs) @@ -2402,6 +2402,10 @@ def create_array( def create_dataset(self, name: str, **kwargs: Any) -> Array: """Create an array. + .. deprecated:: 3.0.0 + The h5py compatibility methods will be removed in 3.1.0. Use `Group.create_array` instead. + + Arrays are known as "datasets" in HDF5 terminology. For compatibility with h5py, Zarr groups also implement the :func:`zarr.Group.require_dataset` method. @@ -2415,9 +2419,6 @@ def create_dataset(self, name: str, **kwargs: Any) -> Array: Returns ------- a : Array - - .. deprecated:: 3.0.0 - The h5py compatibility methods will be removed in 3.1.0. Use `Group.create_array` instead. """ return Array(self._sync(self._async_group.create_dataset(name, **kwargs))) @@ -2425,6 +2426,9 @@ def create_dataset(self, name: str, **kwargs: Any) -> Array: def require_dataset(self, name: str, *, shape: ShapeLike, **kwargs: Any) -> Array: """Obtain an array, creating if it doesn't exist. + .. deprecated:: 3.0.0 + The h5py compatibility methods will be removed in 3.1.0. Use `Group.require_array` instead. + Arrays are known as "datasets" in HDF5 terminology. For compatibility with h5py, Zarr groups also implement the :func:`zarr.Group.create_dataset` method. @@ -2440,9 +2444,6 @@ def require_dataset(self, name: str, *, shape: ShapeLike, **kwargs: Any) -> Arra Returns ------- a : Array - - .. deprecated:: 3.0.0 - The h5py compatibility methods will be removed in 3.1.0. Use `Group.require_array` instead. """ return Array(self._sync(self._async_group.require_array(name, shape=shape, **kwargs))) @@ -2669,6 +2670,9 @@ def array( ) -> Array: """Create an array within this group. + .. deprecated:: 3.0.0 + Use `Group.create_array` instead. + This method lightly wraps :func:`zarr.core.array.create_array`. Parameters From ec014f6e7e184f4ce0ebfb712c50b0b592bf9057 Mon Sep 17 00:00:00 2001 From: David Stansby Date: Mon, 6 Jan 2025 04:40:36 +0000 Subject: [PATCH 66/87] Improve API reference doc structure (#2635) Co-authored-by: Joe Hamman --- .gitignore | 2 +- docs/Makefile | 2 +- docs/api/index.rst | 7 ------- docs/conf.py | 3 ++- docs/index.rst | 6 +++--- 5 files changed, 7 insertions(+), 13 deletions(-) delete mode 100644 docs/api/index.rst diff --git a/.gitignore b/.gitignore index 153ca39df0..5663f62d04 100644 --- a/.gitignore +++ b/.gitignore @@ -51,7 +51,7 @@ coverage.xml # Sphinx documentation docs/_build/ -docs/_autoapi +docs/api docs/data data data.zip diff --git a/docs/Makefile b/docs/Makefile index fc8fa12915..f42ee840e9 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -52,7 +52,7 @@ help: .PHONY: clean clean: rm -rf $(BUILDDIR)/* - rm -rf $(BUILDDIR)/../_autoapi + rm -rf $(BUILDDIR)/../api .PHONY: html html: diff --git a/docs/api/index.rst b/docs/api/index.rst deleted file mode 100644 index 26d7ce0224..0000000000 --- a/docs/api/index.rst +++ /dev/null @@ -1,7 +0,0 @@ -API reference -============= - -.. 
toctree:: - :maxdepth: 1 - - ../_autoapi/zarr/index diff --git a/docs/conf.py b/docs/conf.py index 3389c16549..2a93e61d3e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -57,7 +57,7 @@ autoapi_add_toctree_entry = False autoapi_generate_api_docs = True autoapi_member_order = "groupwise" -autoapi_root = "_autoapi" +autoapi_root = "api" autoapi_keep_files = True autoapi_options = [ 'members', 'undoc-members', 'show-inheritance', 'show-module-summary', 'imported-members', ] @@ -108,6 +108,7 @@ def skip_submodules( "release": "developers/release.html", "roadmap": "developers/roadmap.html", "installation": "user-guide/installation.html", + "api": "api/zarr/index" } # The language for content autogenerated by Sphinx. Refer to documentation diff --git a/docs/index.rst b/docs/index.rst index 29baf4b94a..4cafc12711 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -10,7 +10,7 @@ Zarr-Python quickstart user-guide/index - api/index + API reference developers/index developers/release about @@ -81,12 +81,12 @@ Zarr-Python is a Python library for reading and writing Zarr groups and arrays. +++ - .. button-ref:: api/index + .. button-ref:: api/zarr/index :expand: :color: dark :click-parent: - To the API reference guide + To the API reference .. grid-item-card:: :img-top: _static/index_contribute.svg From 5c6267e69fdd69fd6fb0c5fc74f1de82b9b4b07d Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Mon, 6 Jan 2025 06:53:32 +0100 Subject: [PATCH 67/87] Consistent use of 'Zarr format 2 or 3' (#2645) --- docs/index.rst | 2 +- docs/user-guide/extending.rst | 6 +- docs/user-guide/v3_migration.rst | 2 +- pyproject.toml | 2 +- src/zarr/api/asynchronous.py | 22 +++--- src/zarr/api/synchronous.py | 38 +++++------ src/zarr/codecs/vlen_utf8.py | 4 +- src/zarr/core/array.py | 112 +++++++++++++++---------------- src/zarr/core/common.py | 2 +- src/zarr/core/group.py | 106 ++++++++++++++--------------- src/zarr/core/metadata/v2.py | 6 +- src/zarr/core/metadata/v3.py | 6 +- tests/test_array.py | 2 +- tests/test_metadata/test_v3.py | 4 +- 14 files changed, 157 insertions(+), 157 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index 4cafc12711..0dcfd7f90f 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -25,7 +25,7 @@ Zarr-Python Zarr-Python is a Python library for reading and writing Zarr groups and arrays. Highlights include: -* Specification support for both Zarr v2 and v3. +* Specification support for both Zarr format 2 and 3. * Create and read from N-dimensional arrays using NumPy-like semantics. * Flexible storage enables reading and writing from local, cloud and in-memory stores. * High performance: Enables fast I/O with support for asynchronous I/O and multi-threading. diff --git a/docs/user-guide/extending.rst b/docs/user-guide/extending.rst index 405dcb92c0..7647703fbb 100644 --- a/docs/user-guide/extending.rst +++ b/docs/user-guide/extending.rst @@ -10,8 +10,8 @@ Custom codecs ------------- .. note:: - This section explains how custom codecs can be created for Zarr version 3 data. For Zarr - version 2, codecs should subclass the + This section explains how custom codecs can be created for Zarr format 3 arrays. For Zarr + format 2, codecs should subclass the `numcodecs.abc.Codec `_ base class and register through `numcodecs.registry.register_codec `_. @@ -66,7 +66,7 @@ strongly recommended to prefix the codec identifier with a unique name. For exam the codecs from ``numcodecs`` are prefixed with ``numcodecs.``, e.g. ``numcodecs.delta``. .. 
note:: - Note that the extension mechanism for the Zarr version 3 is still under development. + Note that the extension mechanism for the Zarr format 3 is still under development. Requirements for custom codecs including the choice of codec identifiers might change in the future. diff --git a/docs/user-guide/v3_migration.rst b/docs/user-guide/v3_migration.rst index 974266aac7..d90b87a897 100644 --- a/docs/user-guide/v3_migration.rst +++ b/docs/user-guide/v3_migration.rst @@ -4,7 +4,7 @@ Zarr-Python 3 represents a major refactor of the Zarr-Python codebase. Some of the goals motivating this refactor included: -* adding support for the Zarr V3 specification (along with the Zarr V2 specification) +* adding support for the Zarr format 3 specification (along with the Zarr format 2 specification) * cleaning up internal and user facing APIs * improving performance (particularly in high latency storage environments like cloud object stores) diff --git a/pyproject.toml b/pyproject.toml index 0fa0e7b6b4..8bc861d837 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -380,7 +380,7 @@ filterwarnings = [ "ignore:The loop argument is deprecated since Python 3.8.*:DeprecationWarning", "ignore:Creating a zarr.buffer.gpu.*:UserWarning", "ignore:Duplicate name:UserWarning", # from ZipFile - "ignore:.*is currently not part in the Zarr version 3 specification.*:UserWarning", + "ignore:.*is currently not part in the Zarr format 3 specification.*:UserWarning", ] markers = [ "gpu: mark a test as requiring CuPy and GPU" diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index f42b6d3f51..060618dbd1 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -198,7 +198,7 @@ async def consolidate_metadata( if any(m.zarr_format == 3 for m in members_metadata.values()): warnings.warn( - "Consolidated metadata is currently not part in the Zarr version 3 specification. It " + "Consolidated metadata is currently not part in the Zarr format 3 specification. It " "may not be supported by other zarr implementations and may change in the future.", category=UserWarning, stacklevel=1, @@ -770,8 +770,8 @@ async def open_group( Whether to use consolidated metadata. By default, consolidated metadata is used if it's present in the - store (in the ``zarr.json`` for Zarr v3 and in the ``.zmetadata`` file - for Zarr v2). + store (in the ``zarr.json`` for Zarr format 3 and in the ``.zmetadata`` file + for Zarr format 2). To explicitly require consolidated metadata, set ``use_consolidated=True``, which will raise an exception if consolidated metadata is not found. @@ -779,7 +779,7 @@ async def open_group( To explicitly *not* use consolidated metadata, set ``use_consolidated=False``, which will fall back to using the regular, non consolidated metadata. - Zarr v2 allowed configuring the key storing the consolidated metadata + Zarr format 2 allowed configuring the key storing the consolidated metadata (``.zmetadata`` by default). Specify the custom key as ``use_consolidated`` to load consolidated metadata from a non-default key. @@ -870,21 +870,21 @@ async def create( Array shape. chunks : int or tuple of ints, optional The shape of the array's chunks. - V2 only. V3 arrays should use `chunk_shape` instead. + Zarr format 2 only. Zarr format 3 arrays should use `chunk_shape` instead. If not specified, default values are guessed based on the shape and dtype. dtype : str or dtype, optional NumPy dtype. chunk_shape : int or tuple of ints, optional The shape of the Array's chunks (default is None). 
- V3 only. V2 arrays should use `chunks` instead. + Zarr format 3 only. Zarr format 2 arrays should use `chunks` instead. chunk_key_encoding : ChunkKeyEncoding, optional A specification of how the chunk keys are represented in storage. - V3 only. V2 arrays should use `dimension_separator` instead. + Zarr format 3 only. Zarr format 2 arrays should use `dimension_separator` instead. Default is ``("default", "/")``. codecs : Sequence of Codecs or dicts, optional An iterable of Codec or dict serializations of Codecs. The elements of this collection specify the transformation from array values to stored bytes. - V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. + Zarr format 3 only. Zarr format 2 arrays should use ``filters`` and ``compressor`` instead. If no codecs are provided, default codecs will be used: @@ -895,7 +895,7 @@ async def create( These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. compressor : Codec, optional Primary compressor to compress chunk data. - V2 only. V3 arrays should use ``codecs`` instead. + Zarr format 2 only. Zarr format 3 arrays should use ``codecs`` instead. If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: @@ -925,7 +925,7 @@ async def create( for storage of both chunks and metadata. filters : sequence of Codecs, optional Sequence of filters to use to encode chunk data prior to compression. - V2 only. If no ``filters`` are provided, a default set of filters will be used. + Zarr format 2 only. If no ``filters`` are provided, a default set of filters will be used. These defaults can be changed by modifying the value of ``array.v2_default_filters`` in :mod:`zarr.core.config`. cache_metadata : bool, optional If True, array configuration metadata will be cached for the @@ -942,7 +942,7 @@ async def create( A codec to encode object arrays, only needed if dtype=object. dimension_separator : {'.', '/'}, optional Separator placed between the dimensions of a chunk. - V2 only. V3 arrays should use ``chunk_key_encoding`` instead. + Zarr format 2 only. Zarr format 3 arrays should use ``chunk_key_encoding`` instead. Default is ".". write_empty_chunks : bool, optional Deprecated in favor of the ``config`` keyword argument. diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 200db9ec26..7b3d842832 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -502,8 +502,8 @@ def open_group( Whether to use consolidated metadata. By default, consolidated metadata is used if it's present in the - store (in the ``zarr.json`` for Zarr v3 and in the ``.zmetadata`` file - for Zarr v2). + store (in the ``zarr.json`` for Zarr format 3 and in the ``.zmetadata`` file + for Zarr format 2). To explicitly require consolidated metadata, set ``use_consolidated=True``, which will raise an exception if consolidated metadata is not found. @@ -511,7 +511,7 @@ def open_group( To explicitly *not* use consolidated metadata, set ``use_consolidated=False``, which will fall back to using the regular, non consolidated metadata. - Zarr v2 allowed configuring the key storing the consolidated metadata + Zarr format 2 allows configuring the key storing the consolidated metadata (``.zmetadata`` by default). Specify the custom key as ``use_consolidated`` to load consolidated metadata from a non-default key. 
@@ -785,16 +785,16 @@
         Iterable of filters to apply to each chunk of the array, in order, before serializing that
         chunk to bytes.

-        For Zarr v3, a "filter" is a codec that takes an array and returns an array,
+        For Zarr format 3, a "filter" is a codec that takes an array and returns an array,
         and these values must be instances of ``ArrayArrayCodec``, or dict representations
         of ``ArrayArrayCodec``.
         If ``filters`` and ``compressors`` are not specified, then the default codecs for
-        Zarr v3 will be used.
+        Zarr format 3 will be used.
         These defaults can be changed by modifying the value of ``array.v3_default_codecs``
         in :mod:`zarr.core.config`.
         Use ``None`` to omit default filters.

-        For Zarr v2, a "filter" can be any numcodecs codec; you should ensure that the
+        For Zarr format 2, a "filter" can be any numcodecs codec; you should ensure that
         the order of your filters is consistent with the behavior of each filter.
         If no ``filters`` are provided, a default set of filters will be used.
         These defaults can be changed by modifying the value of ``array.v2_default_filters``
@@ -804,32 +804,32 @@
         List of compressors to apply to the array. Compressors are applied in order, and after any
         filters are applied (if any are specified).

-        For Zarr v3, a "compressor" is a codec that takes a bytestrea, and
-        returns another bytestream. Multiple compressors my be provided for Zarr v3.
+        For Zarr format 3, a "compressor" is a codec that takes a bytestream, and
+        returns another bytestream. Multiple compressors may be provided for Zarr format 3.
         If ``filters`` and ``compressors`` are not specified, then the default codecs for
-        Zarr v3 will be used.
+        Zarr format 3 will be used.
         These defaults can be changed by modifying the value of ``array.v3_default_codecs``
         in :mod:`zarr.core.config`.
         Use ``None`` to omit default compressors.

-        For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may
-        be provided for Zarr v2.
+        For Zarr format 2, a "compressor" can be any numcodecs codec. Only a single compressor may
+        be provided for Zarr format 2.
         If no ``compressors`` are provided, a default compressor will be used.
         These defaults can be changed by modifying the value of ``array.v2_default_compressor``
         in :mod:`zarr.core.config`.
         Use ``None`` to omit the default compressor.
     serializer : dict[str, JSON] | ArrayBytesCodec, optional
         Array-to-bytes codec to use for encoding the array data.
-        Zarr v3 only. Zarr v2 arrays use implicit array-to-bytes conversion.
+        Zarr format 3 only. Zarr format 2 arrays use implicit array-to-bytes conversion.
         If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used.
     fill_value : Any, optional
         Fill value for the array.
     order : {"C", "F"}, optional
         The memory of the array (default is "C").
-        For Zarr v2, this parameter sets the memory order of the array.
-        For Zarr v3, this parameter is deprecated, because memory order
-        is a runtime parameter for Zarr v3 arrays. The recommended way to specify the memory
-        order for Zarr v3 arrays is via the ``config`` parameter, e.g. ``{'config': 'C'}``.
+        For Zarr format 2, this parameter sets the memory order of the array.
+        For Zarr format 3, this parameter is deprecated, because memory order
+        is a runtime parameter for Zarr format 3 arrays. The recommended way to specify the memory
+        order for Zarr format 3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``.
         If no ``order`` is provided, a default order will be used.
This default can be changed by modifying the value of ``array.order`` in :mod:`zarr.core.config`. zarr_format : {2, 3}, optional @@ -838,11 +838,11 @@ def create_array( Attributes for the array. chunk_key_encoding : ChunkKeyEncoding, optional A specification of how the chunk keys are represented in storage. - For Zarr v3, the default is ``{"name": "default", "separator": "/"}}``. - For Zarr v2, the default is ``{"name": "v2", "separator": "."}}``. + For Zarr format 3, the default is ``{"name": "default", "separator": "/"}}``. + For Zarr format 2, the default is ``{"name": "v2", "separator": "."}}``. dimension_names : Iterable[str], optional The names of the dimensions (default is None). - Zarr v3 only. Zarr v2 arrays should not use this parameter. + Zarr format 3 only. Zarr format 2 arrays should not use this parameter. storage_options : dict, optional If using an fsspec URL to create the store, these will be passed to the backend implementation. Ignored otherwise. diff --git a/src/zarr/codecs/vlen_utf8.py b/src/zarr/codecs/vlen_utf8.py index e5b895ae0c..0ef423793d 100644 --- a/src/zarr/codecs/vlen_utf8.py +++ b/src/zarr/codecs/vlen_utf8.py @@ -28,7 +28,7 @@ class VLenUTF8Codec(ArrayBytesCodec): def __init__(self) -> None: warn( - "The codec `vlen-utf8` is currently not part in the Zarr version 3 specification. It " + "The codec `vlen-utf8` is currently not part in the Zarr format 3 specification. It " "may not be supported by other zarr implementations and may change in the future.", category=UserWarning, stacklevel=2, @@ -83,7 +83,7 @@ def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) - class VLenBytesCodec(ArrayBytesCodec): def __init__(self) -> None: warn( - "The codec `vlen-bytes` is currently not part in the Zarr version 3 specification. It " + "The codec `vlen-bytes` is currently not part in the Zarr format 3 specification. It " "may not be supported by other zarr implementations and may change in the future.", category=UserWarning, stacklevel=2, diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index e5c4e4538c..87ec4e48bc 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -179,7 +179,7 @@ async def get_array_metadata( ) if zarr_json_bytes is not None and zarray_bytes is not None: # warn and favor v3 - msg = f"Both zarr.json (Zarr v3) and .zarray (Zarr v2) metadata objects exist at {store_path}. Zarr v3 will be used." + msg = f"Both zarr.json (Zarr format 3) and .zarray (Zarr format 2) metadata objects exist at {store_path}. Zarr v3 will be used." warnings.warn(msg, stacklevel=1) if zarr_json_bytes is None and zarray_bytes is None: raise FileNotFoundError(store_path) @@ -451,16 +451,16 @@ async def create( The attributes of the array (default is None). chunk_shape : ChunkCoords, optional The shape of the array's chunks - V3 only. V2 arrays should use `chunks` instead. + Zarr format 3 only. Zarr format 2 arrays should use `chunks` instead. If not specified, default are guessed based on the shape and dtype. chunk_key_encoding : ChunkKeyEncoding, optional A specification of how the chunk keys are represented in storage. - V3 only. V2 arrays should use `dimension_separator` instead. + Zarr format 3 only. Zarr format 2 arrays should use `dimension_separator` instead. Default is ``("default", "/")``. codecs : Sequence of Codecs or dicts, optional An iterable of Codec or dict serializations of Codecs. The elements of this collection specify the transformation from array values to stored bytes. - V3 only. 
V2 arrays should use ``filters`` and ``compressor`` instead. + Zarr format 3 only. Zarr format 2 arrays should use ``filters`` and ``compressor`` instead. If no codecs are provided, default codecs will be used: @@ -471,14 +471,14 @@ async def create( These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. dimension_names : Iterable[str], optional The names of the dimensions (default is None). - V3 only. V2 arrays should not use this parameter. + Zarr format 3 only. Zarr format 2 arrays should not use this parameter. chunks : ShapeLike, optional The shape of the array's chunks. - V2 only. V3 arrays should use ``chunk_shape`` instead. + Zarr format 2 only. Zarr format 3 arrays should use ``chunk_shape`` instead. If not specified, default are guessed based on the shape and dtype. dimension_separator : Literal[".", "/"], optional The dimension separator (default is "."). - V2 only. V3 arrays should use ``chunk_key_encoding`` instead. + Zarr format 2 only. Zarr format 3 arrays should use ``chunk_key_encoding`` instead. order : Literal["C", "F"], optional The memory of the array (default is "C"). If ``zarr_format`` is 2, this parameter sets the memory order of the array. @@ -487,12 +487,12 @@ async def create( order for Zarr 3 arrays is via the ``config`` parameter, e.g. ``{'config': 'C'}``. filters : list[dict[str, JSON]], optional Sequence of filters to use to encode chunk data prior to compression. - V2 only. V3 arrays should use ``codecs`` instead. If no ``filters`` + Zarr format 2 only. Zarr format 3 arrays should use ``codecs`` instead. If no ``filters`` are provided, a default set of filters will be used. These defaults can be changed by modifying the value of ``array.v2_default_filters`` in :mod:`zarr.core.config`. compressor : dict[str, JSON], optional The compressor used to compress the data (default is None). - V2 only. V3 arrays should use ``codecs`` instead. + Zarr format 2 only. Zarr format 3 arrays should use ``codecs`` instead. If no ``compressor`` is provided, a default compressor will be used: @@ -592,15 +592,15 @@ async def _create( if zarr_format == 3: if dimension_separator is not None: raise ValueError( - "dimension_separator cannot be used for arrays with version 3. Use chunk_key_encoding instead." + "dimension_separator cannot be used for arrays with zarr_format 3. Use chunk_key_encoding instead." ) if filters is not None: raise ValueError( - "filters cannot be used for arrays with version 3. Use array-to-array codecs instead." + "filters cannot be used for arrays with zarr_format 3. Use array-to-array codecs instead." ) if compressor is not None: raise ValueError( - "compressor cannot be used for arrays with version 3. Use bytes-to-bytes codecs instead." + "compressor cannot be used for arrays with zarr_format 3. Use bytes-to-bytes codecs instead." ) if order is not None: @@ -622,14 +622,14 @@ async def _create( elif zarr_format == 2: if codecs is not None: raise ValueError( - "codecs cannot be used for arrays with version 2. Use filters and compressor instead." + "codecs cannot be used for arrays with zarr_format 2. Use filters and compressor instead." ) if chunk_key_encoding is not None: raise ValueError( - "chunk_key_encoding cannot be used for arrays with version 2. Use dimension_separator instead." + "chunk_key_encoding cannot be used for arrays with zarr_format 2. Use dimension_separator instead." 
) if dimension_names is not None: - raise ValueError("dimension_names cannot be used for arrays with version 2.") + raise ValueError("dimension_names cannot be used for arrays with zarr_format 2.") if order is None: order_parsed = parse_order(zarr_config.get("array.order")) @@ -704,7 +704,7 @@ async def _create_v3( if dtype.kind in "UTS": warn( - f"The dtype `{dtype}` is currently not part in the Zarr version 3 specification. It " + f"The dtype `{dtype}` is currently not part in the Zarr format 3 specification. It " "may not be supported by other zarr implementations and may change in the future.", category=UserWarning, stacklevel=2, @@ -785,7 +785,7 @@ def from_dict( data: dict[str, JSON], ) -> AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata]: """ - Create a Zarr array from a dictionary, with support for both Zarr v2 and v3 metadata. + Create a Zarr array from a dictionary, with support for both Zarr format 2 and 3 metadata. Parameters ---------- @@ -795,17 +795,17 @@ def from_dict( data : dict A dictionary representing the array data. This dictionary should include necessary metadata for the array, such as shape, dtype, and other attributes. The format of the metadata - will determine whether a Zarr v2 or v3 array is created. + will determine whether a Zarr format 2 or 3 array is created. Returns ------- AsyncArray[ArrayV3Metadata] or AsyncArray[ArrayV2Metadata] - The created Zarr array, either using v2 or v3 metadata based on the provided data. + The created Zarr array, either using Zarr format 2 or 3 metadata based on the provided data. Raises ------ ValueError - If the dictionary data is invalid or incompatible with either Zarr v2 or v3 array creation. + If the dictionary data is invalid or incompatible with either Zarr format 2 or 3 array creation. """ metadata = parse_array_metadata(data) return cls(metadata=metadata, store_path=store_path) @@ -1644,16 +1644,16 @@ def create( The data type of the array. chunk_shape : ChunkCoords, optional The shape of the Array's chunks. - V3 only. V2 arrays should use `chunks` instead. + Zarr format 3 only. Zarr format 2 arrays should use `chunks` instead. If not specified, default are guessed based on the shape and dtype. chunk_key_encoding : ChunkKeyEncoding, optional A specification of how the chunk keys are represented in storage. - V3 only. V2 arrays should use `dimension_separator` instead. + Zarr format 3 only. Zarr format 2 arrays should use `dimension_separator` instead. Default is ``("default", "/")``. codecs : Sequence of Codecs or dicts, optional An iterable of Codec or dict serializations of Codecs. The elements of this collection specify the transformation from array values to stored bytes. - V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. + Zarr format 3 only. Zarr format 2 arrays should use ``filters`` and ``compressor`` instead. If no codecs are provided, default codecs will be used: @@ -1664,14 +1664,14 @@ def create( These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. dimension_names : Iterable[str], optional The names of the dimensions (default is None). - V3 only. V2 arrays should not use this parameter. + Zarr format 3 only. Zarr format 2 arrays should not use this parameter. chunks : ChunkCoords, optional The shape of the array's chunks. - V2 only. V3 arrays should use ``chunk_shape`` instead. + Zarr format 2 only. Zarr format 3 arrays should use ``chunk_shape`` instead. 
If not specified, default are guessed based on the shape and dtype. dimension_separator : Literal[".", "/"], optional The dimension separator (default is "."). - V2 only. V3 arrays should use ``chunk_key_encoding`` instead. + Zarr format 2 only. Zarr format 3 arrays should use ``chunk_key_encoding`` instead. order : Literal["C", "F"], optional The memory of the array (default is "C"). If ``zarr_format`` is 2, this parameter sets the memory order of the array. @@ -1680,12 +1680,12 @@ def create( order for Zarr 3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``. filters : list[dict[str, JSON]], optional Sequence of filters to use to encode chunk data prior to compression. - V2 only. V3 arrays should use ``codecs`` instead. If no ``filters`` + Zarr format 2 only. Zarr format 3 arrays should use ``codecs`` instead. If no ``filters`` are provided, a default set of filters will be used. These defaults can be changed by modifying the value of ``array.v2_default_filters`` in :mod:`zarr.core.config`. compressor : dict[str, JSON], optional Primary compressor to compress chunk data. - V2 only. V3 arrays should use ``codecs`` instead. + Zarr format 2 only. Zarr format 3 arrays should use ``codecs`` instead. If no ``compressor`` is provided, a default compressor will be used: @@ -2239,7 +2239,7 @@ def __getitem__(self, selection: Selection) -> NDArrayLike: ----- Slices with step > 1 are supported, but slices with negative step are not. - For arrays with a structured dtype, see zarr v2 for examples of how to use + For arrays with a structured dtype, see Zarr format 2 for examples of how to use fields Currently the implementation for __getitem__ is provided by @@ -2338,7 +2338,7 @@ def __setitem__(self, selection: Selection, value: npt.ArrayLike) -> None: ----- Slices with step > 1 are supported, but slices with negative step are not. - For arrays with a structured dtype, see zarr v2 for examples of how to use + For arrays with a structured dtype, see Zarr format 2 for examples of how to use fields Currently the implementation for __setitem__ is provided by @@ -2470,7 +2470,7 @@ def get_basic_selection( ----- Slices with step > 1 are supported, but slices with negative step are not. - For arrays with a structured dtype, see zarr v2 for examples of how to use + For arrays with a structured dtype, see Zarr format 2 for examples of how to use the `fields` parameter. This method provides the implementation for accessing data via the @@ -2573,7 +2573,7 @@ def set_basic_selection( Notes ----- - For arrays with a structured dtype, see zarr v2 for examples of how to use + For arrays with a structured dtype, see Zarr format 2 for examples of how to use the `fields` parameter. This method provides the underlying implementation for modifying data via square @@ -3693,16 +3693,16 @@ async def create_array( Iterable of filters to apply to each chunk of the array, in order, before serializing that chunk to bytes. - For Zarr v3, a "filter" is a codec that takes an array and returns an array, + For Zarr format 3, a "filter" is a codec that takes an array and returns an array, and these values must be instances of ``ArrayArrayCodec``, or dict representations of ``ArrayArrayCodec``. If ``filters`` and ``compressors`` are not specified, then the default codecs for - Zarr v3 will be used. + Zarr format 3 will be used. These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. Use ``None`` to omit default filters. 
- For Zarr v2, a "filter" can be any numcodecs codec; you should ensure that the + For Zarr format 2, a "filter" can be any numcodecs codec; you should ensure that the order of your filters is consistent with the behavior of each filter. If no ``filters`` are provided, a default set of filters will be used. These defaults can be changed by modifying the value of ``array.v2_default_filters`` @@ -3712,32 +3712,32 @@ async def create_array( List of compressors to apply to the array. Compressors are applied in order, and after any filters are applied (if any are specified). - For Zarr v3, a "compressor" is a codec that takes a bytestrea, and - returns another bytestream. Multiple compressors my be provided for Zarr v3. + For Zarr format 3, a "compressor" is a codec that takes a bytestream, and + returns another bytestream. Multiple compressors may be provided for Zarr format 3. If ``filters`` and ``compressors`` are not specified, then the default codecs for - Zarr v3 will be used. + Zarr format 3 will be used. These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. Use ``None`` to omit default compressors. - For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may - be provided for Zarr v2. + For Zarr format 2, a "compressor" can be any numcodecs codec. Only a single compressor may + be provided for Zarr format 2. If no ``compressors`` are provided, a default compressor will be used. These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. Use ``None`` to omit the default compressor. serializer : dict[str, JSON] | ArrayBytesCodec, optional Array-to-bytes codec to use for encoding the array data. - Zarr v3 only. Zarr v2 arrays use implicit array-to-bytes conversion. + Zarr format 3 only. Zarr format 2 arrays use implicit array-to-bytes conversion. If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used. fill_value : Any, optional Fill value for the array. order : {"C", "F"}, optional The memory order of the array (default is "C"). - For Zarr v2, this parameter sets the memory order of the array. - For Zarr v3, this parameter is deprecated, because memory order - is a runtime parameter for Zarr v3 arrays. The recommended way to specify the memory - order for Zarr v3 arrays is via the ``config`` parameter, e.g. ``{'config': 'C'}``. + For Zarr format 2, this parameter sets the memory order of the array. + For Zarr format 3, this parameter is deprecated, because memory order + is a runtime parameter for Zarr format 3 arrays. The recommended way to specify the memory + order for Zarr format 3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``. If no ``order`` is provided, a default order will be used. This default can be changed by modifying the value of ``array.order`` in :mod:`zarr.core.config`. zarr_format : {2, 3}, optional @@ -3746,11 +3746,11 @@ async def create_array( Attributes for the array. chunk_key_encoding : ChunkKeyEncoding, optional A specification of how the chunk keys are represented in storage. - For Zarr v3, the default is ``{"name": "default", "separator": "/"}}``. - For Zarr v2, the default is ``{"name": "v2", "separator": "."}}``. + For Zarr format 3, the default is ``{"name": "default", "separator": "/"}``. + For Zarr format 2, the default is ``{"name": "v2", "separator": "."}``. dimension_names : Iterable[str], optional The names of the dimensions (default is None). - Zarr v3 only.
Zarr v2 arrays should not use this parameter. + Zarr format 3 only. Zarr format 2 arrays should not use this parameter. storage_options : dict, optional If using an fsspec URL to create the store, these will be passed to the backend implementation. Ignored otherwise. @@ -3799,20 +3799,20 @@ async def create_array( if zarr_format == 2: if shard_shape_parsed is not None: msg = ( - "Zarr v2 arrays can only be created with `shard_shape` set to `None`. " + "Zarr format 2 arrays can only be created with `shard_shape` set to `None`. " f"Got `shard_shape={shards}` instead." ) raise ValueError(msg) if serializer != "auto": - raise ValueError("Zarr v2 arrays do not support `serializer`.") + raise ValueError("Zarr format 2 arrays do not support `serializer`.") filters_parsed, compressor_parsed = _parse_chunk_encoding_v2( compressor=compressors, filters=filters, dtype=np.dtype(dtype) ) if dimension_names is not None: - raise ValueError("Zarr v2 arrays do not support dimension names.") + raise ValueError("Zarr format 2 arrays do not support dimension names.") if order is None: order_parsed = zarr_config.get("array.order") else: @@ -3895,7 +3895,7 @@ def _parse_chunk_key_encoding( result = ChunkKeyEncoding.from_dict(data) if zarr_format == 2 and result.name != "v2": msg = ( - "Invalid chunk key encoding. For Zarr v2 arrays, the `name` field of the " + "Invalid chunk key encoding. For Zarr format 2 arrays, the `name` field of the " f"chunk key encoding must be 'v2'. Got `name` = {result.name} instead." ) raise ValueError(msg) @@ -3948,7 +3948,7 @@ def _get_default_chunk_encoding_v2( np_dtype: np.dtype[Any], ) -> tuple[tuple[numcodecs.abc.Codec, ...] | None, numcodecs.abc.Codec | None]: """ - Get the default chunk encoding for zarr v2 arrays, given a dtype + Get the default chunk encoding for Zarr format 2 arrays, given a dtype """ compressor_dict = _default_compressor(np_dtype) @@ -3972,7 +3972,7 @@ def _parse_chunk_encoding_v2( dtype: np.dtype[Any], ) -> tuple[tuple[numcodecs.abc.Codec, ...] | None, numcodecs.abc.Codec | None]: """ - Generate chunk encoding classes for v2 arrays with optional defaults. + Generate chunk encoding classes for Zarr format 2 arrays with optional defaults. """ default_filters, default_compressor = _get_default_chunk_encoding_v2(dtype) @@ -3987,7 +3987,7 @@ def _parse_chunk_encoding_v2( _compressor = parse_compressor(compressor[0]) else: if isinstance(compressor, Iterable) and not isinstance(compressor, dict): - msg = f"For Zarr v2 arrays, the `compressor` must be a single codec. Got an iterable with type {type(compressor)} instead." + msg = f"For Zarr format 2 arrays, the `compressor` must be a single codec. Got an iterable with type {type(compressor)} instead." raise TypeError(msg) _compressor = parse_compressor(compressor) @@ -4000,7 +4000,7 @@ def _parse_chunk_encoding_v2( for idx, f in enumerate(filters): if not isinstance(f, numcodecs.abc.Codec): msg = ( - "For Zarr v2 arrays, all elements of `filters` must be numcodecs codecs. " + "For Zarr format 2 arrays, all elements of `filters` must be numcodecs codecs. " f"Element at index {idx} has type {type(f)}, which is not a numcodecs codec." 
) raise TypeError(msg) diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index d53f3847a5..7205b8c206 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -192,7 +192,7 @@ def _warn_write_empty_chunks_kwarg() -> None: def _warn_order_kwarg() -> None: # TODO: link to docs page on array configuration in this message msg = ( - "The `order` keyword argument has no effect for zarr v3 arrays. " + "The `order` keyword argument has no effect for Zarr format 3 arrays. " "To control the memory layout of the array, either use the `config` keyword " "argument, as in `config={'order': 'C'}`, " "or change the global 'array.order' configuration variable." diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index a4503ce64e..dac2270a53 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -459,8 +459,8 @@ async def open( Whether to use consolidated metadata. By default, consolidated metadata is used if it's present in the - store (in the ``zarr.json`` for Zarr v3 and in the ``.zmetadata`` file - for Zarr v2). + store (in the ``zarr.json`` for Zarr format 3 and in the ``.zmetadata`` file + for Zarr format 2). To explicitly require consolidated metadata, set ``use_consolidated=True``, which will raise an exception if consolidated metadata is not found. To explicitly *not* use consolidated metadata, set ``use_consolidated=False``, which will fall back to using the regular, non consolidated metadata. - Zarr v2 allowed configuring the key storing the consolidated metadata + Zarr format 2 allowed configuring the key storing the consolidated metadata (``.zmetadata`` by default). Specify the custom key as ``use_consolidated`` to load consolidated metadata from a non-default key. """ @@ -514,7 +514,7 @@ async def open( ) if zarr_json_bytes is not None and zgroup_bytes is not None: # warn and favor v3 - msg = f"Both zarr.json (Zarr v3) and .zgroup (Zarr v2) metadata objects exist at {store_path}. Zarr v3 will be used." + msg = f"Both zarr.json (Zarr format 3) and .zgroup (Zarr format 2) metadata objects exist at {store_path}. Zarr format 3 will be used." warnings.warn(msg, stacklevel=1) if zarr_json_bytes is None and zgroup_bytes is None: raise FileNotFoundError( @@ -548,7 +548,7 @@ async def open( # V3 groups are comprised of a zarr.json object assert zarr_json_bytes is not None if not isinstance(use_consolidated, bool | None): - raise TypeError("use_consolidated must be a bool or None for Zarr V3.") + raise TypeError("use_consolidated must be a bool or None for Zarr format 3.") return cls._from_bytes_v3( store_path, @@ -1048,16 +1048,16 @@ async def create_array( Iterable of filters to apply to each chunk of the array, in order, before serializing that chunk to bytes. - For Zarr v3, a "filter" is a codec that takes an array and returns an array, + For Zarr format 3, a "filter" is a codec that takes an array and returns an array, and these values must be instances of ``ArrayArrayCodec``, or dict representations of ``ArrayArrayCodec``. If ``filters`` and ``compressors`` are not specified, then the default codecs for - Zarr v3 will be used. + Zarr format 3 will be used. These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. Use ``None`` to omit default filters.
- For Zarr v2, a "filter" can be any numcodecs codec; you should ensure that the + For Zarr format 2, a "filter" can be any numcodecs codec; you should ensure that the order of your filters is consistent with the behavior of each filter. If no ``filters`` are provided, a default set of filters will be used. These defaults can be changed by modifying the value of ``array.v2_default_filters`` @@ -1067,16 +1067,16 @@ async def create_array( List of compressors to apply to the array. Compressors are applied in order, and after any filters are applied (if any are specified). - For Zarr v3, a "compressor" is a codec that takes a bytestrea, and - returns another bytestream. Multiple compressors my be provided for Zarr v3. + For Zarr format 3, a "compressor" is a codec that takes a bytestream, and + returns another bytestream. Multiple compressors may be provided for Zarr format 3. If ``filters`` and ``compressors`` are not specified, then the default codecs for - Zarr v3 will be used. + Zarr format 3 will be used. These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. Use ``None`` to omit default compressors. - For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may - be provided for Zarr v2. + For Zarr format 2, a "compressor" can be any numcodecs codec. Only a single compressor may + be provided for Zarr format 2. If no ``compressors`` are provided, a default compressor will be used. These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. @@ -1085,27 +1085,27 @@ async def create_array( Deprecated in favor of ``compressors``. serializer : dict[str, JSON] | ArrayBytesCodec, optional Array-to-bytes codec to use for encoding the array data. - Zarr v3 only. Zarr v2 arrays use implicit array-to-bytes conversion. + Zarr format 3 only. Zarr format 2 arrays use implicit array-to-bytes conversion. If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used. fill_value : Any, optional Fill value for the array. order : {"C", "F"}, optional The memory order of the array (default is "C"). - For Zarr v2, this parameter sets the memory order of the array. - For Zarr v3, this parameter is deprecated, because memory order - is a runtime parameter for Zarr v3 arrays. The recommended way to specify the memory - order for Zarr v3 arrays is via the ``config`` parameter, e.g. ``{'config': 'C'}``. + For Zarr format 2, this parameter sets the memory order of the array. + For Zarr format 3, this parameter is deprecated, because memory order + is a runtime parameter for Zarr format 3 arrays. The recommended way to specify the memory + order for Zarr format 3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``. If no ``order`` is provided, a default order will be used. This default can be changed by modifying the value of ``array.order`` in :mod:`zarr.core.config`. attributes : dict, optional Attributes for the array. chunk_key_encoding : ChunkKeyEncoding, optional A specification of how the chunk keys are represented in storage. - For Zarr v3, the default is ``{"name": "default", "separator": "/"}}``. - For Zarr v2, the default is ``{"name": "v2", "separator": "."}}``. + For Zarr format 3, the default is ``{"name": "default", "separator": "/"}``. + For Zarr format 2, the default is ``{"name": "v2", "separator": "."}``. dimension_names : Iterable[str], optional The names of the dimensions (default is None). - Zarr v3 only.
Zarr v2 arrays should not use this parameter. + Zarr format 3 only. Zarr format 2 arrays should not use this parameter. storage_options : dict, optional If using an fsspec URL to create the store, these will be passed to the backend implementation. Ignored otherwise. @@ -2304,16 +2304,16 @@ def create_array( Iterable of filters to apply to each chunk of the array, in order, before serializing that chunk to bytes. - For Zarr v3, a "filter" is a codec that takes an array and returns an array, + For Zarr format 3, a "filter" is a codec that takes an array and returns an array, and these values must be instances of ``ArrayArrayCodec``, or dict representations of ``ArrayArrayCodec``. If ``filters`` and ``compressors`` are not specified, then the default codecs for - Zarr v3 will be used. + Zarr format 3 will be used. These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. Use ``None`` to omit default filters. - For Zarr v2, a "filter" can be any numcodecs codec; you should ensure that the + For Zarr format 2, a "filter" can be any numcodecs codec; you should ensure that the order of your filters is consistent with the behavior of each filter. If no ``filters`` are provided, a default set of filters will be used. These defaults can be changed by modifying the value of ``array.v2_default_filters`` @@ -2323,16 +2323,16 @@ def create_array( List of compressors to apply to the array. Compressors are applied in order, and after any filters are applied (if any are specified). - For Zarr v3, a "compressor" is a codec that takes a bytestrea, and - returns another bytestream. Multiple compressors my be provided for Zarr v3. + For Zarr format 3, a "compressor" is a codec that takes a bytestream, and + returns another bytestream. Multiple compressors may be provided for Zarr format 3. If ``filters`` and ``compressors`` are not specified, then the default codecs for - Zarr v3 will be used. + Zarr format 3 will be used. These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. Use ``None`` to omit default compressors. - For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may - be provided for Zarr v2. + For Zarr format 2, a "compressor" can be any numcodecs codec. Only a single compressor may + be provided for Zarr format 2. If no ``compressors`` are provided, a default compressor will be used. These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. @@ -2341,27 +2341,27 @@ def create_array( Deprecated in favor of ``compressors``. serializer : dict[str, JSON] | ArrayBytesCodec, optional Array-to-bytes codec to use for encoding the array data. - Zarr v3 only. Zarr v2 arrays use implicit array-to-bytes conversion. + Zarr format 3 only. Zarr format 2 arrays use implicit array-to-bytes conversion. If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used. fill_value : Any, optional Fill value for the array. order : {"C", "F"}, optional The memory order of the array (default is "C"). - For Zarr v2, this parameter sets the memory order of the array. - For Zarr v3, this parameter is deprecated, because memory order - is a runtime parameter for Zarr v3 arrays. The recommended way to specify the memory - order for Zarr v3 arrays is via the ``config`` parameter, e.g. ``{'config': 'C'}``. + For Zarr format 2, this parameter sets the memory order of the array.
+ For Zarr format 3, this parameter is deprecated, because memory order + is a runtime parameter for Zarr format 3 arrays. The recommended way to specify the memory + order for Zarr format 3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``. If no ``order`` is provided, a default order will be used. This default can be changed by modifying the value of ``array.order`` in :mod:`zarr.core.config`. attributes : dict, optional Attributes for the array. chunk_key_encoding : ChunkKeyEncoding, optional A specification of how the chunk keys are represented in storage. - For Zarr v3, the default is ``{"name": "default", "separator": "/"}}``. - For Zarr v2, the default is ``{"name": "v2", "separator": "."}}``. + For Zarr format 3, the default is ``{"name": "default", "separator": "/"}``. + For Zarr format 2, the default is ``{"name": "v2", "separator": "."}``. dimension_names : Iterable[str], optional The names of the dimensions (default is None). - Zarr v3 only. Zarr v2 arrays should not use this parameter. + Zarr format 3 only. Zarr format 2 arrays should not use this parameter. storage_options : dict, optional If using an fsspec URL to create the store, these will be passed to the backend implementation. Ignored otherwise. @@ -2693,16 +2693,16 @@ def array( Iterable of filters to apply to each chunk of the array, in order, before serializing that chunk to bytes. - For Zarr v3, a "filter" is a codec that takes an array and returns an array, + For Zarr format 3, a "filter" is a codec that takes an array and returns an array, and these values must be instances of ``ArrayArrayCodec``, or dict representations of ``ArrayArrayCodec``. If ``filters`` and ``compressors`` are not specified, then the default codecs for - Zarr v3 will be used. + Zarr format 3 will be used. These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. Use ``None`` to omit default filters. - For Zarr v2, a "filter" can be any numcodecs codec; you should ensure that the + For Zarr format 2, a "filter" can be any numcodecs codec; you should ensure that the order of your filters is consistent with the behavior of each filter. If no ``filters`` are provided, a default set of filters will be used. These defaults can be changed by modifying the value of ``array.v2_default_filters`` @@ -2712,16 +2712,16 @@ def array( List of compressors to apply to the array. Compressors are applied in order, and after any filters are applied (if any are specified). - For Zarr v3, a "compressor" is a codec that takes a bytestrea, and - returns another bytestream. Multiple compressors my be provided for Zarr v3. + For Zarr format 3, a "compressor" is a codec that takes a bytestream, and + returns another bytestream. Multiple compressors may be provided for Zarr format 3. If ``filters`` and ``compressors`` are not specified, then the default codecs for - Zarr v3 will be used. + Zarr format 3 will be used. These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. Use ``None`` to omit default compressors. - For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may - be provided for Zarr v2. + For Zarr format 2, a "compressor" can be any numcodecs codec. Only a single compressor may + be provided for Zarr format 2. If no ``compressors`` are provided, a default compressor will be used. These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`.
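The format-specific rules spelled out in these docstrings are easiest to see side by side. The following is a minimal sketch, not part of the diff itself; the store paths and codec choices are illustrative::

    import numcodecs
    import zarr

    # Zarr format 2: numcodecs filters plus a single numcodecs compressor
    z2 = zarr.create_array(
        store='data/sketch-v2.zarr',
        shape=(100, 100),
        chunks=(10, 10),
        dtype='int32',
        zarr_format=2,
        filters=[numcodecs.Delta(dtype='int32')],
        compressors=numcodecs.Blosc(cname='zstd', clevel=3),
    )

    # Zarr format 3: ArrayArrayCodec filters and any number of
    # BytesBytesCodec compressors
    z3 = zarr.create_array(
        store='data/sketch-v3.zarr',
        shape=(100, 100),
        chunks=(10, 10),
        dtype='int32',
        zarr_format=3,
        compressors=[zarr.codecs.BloscCodec(cname='zstd', clevel=3)],
    )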
@@ -2730,27 +2730,27 @@ def array( Deprecated in favor of ``compressors``. serializer : dict[str, JSON] | ArrayBytesCodec, optional Array-to-bytes codec to use for encoding the array data. - Zarr v3 only. Zarr v2 arrays use implicit array-to-bytes conversion. + Zarr format 3 only. Zarr format 2 arrays use implicit array-to-bytes conversion. If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used. fill_value : Any, optional Fill value for the array. order : {"C", "F"}, optional The memory order of the array (default is "C"). - For Zarr v2, this parameter sets the memory order of the array. - For Zarr v3, this parameter is deprecated, because memory order - is a runtime parameter for Zarr v3 arrays. The recommended way to specify the memory - order for Zarr v3 arrays is via the ``config`` parameter, e.g. ``{'config': 'C'}``. + For Zarr format 2, this parameter sets the memory order of the array. + For Zarr format 3, this parameter is deprecated, because memory order + is a runtime parameter for Zarr format 3 arrays. The recommended way to specify the memory + order for Zarr format 3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``. If no ``order`` is provided, a default order will be used. This default can be changed by modifying the value of ``array.order`` in :mod:`zarr.core.config`. attributes : dict, optional Attributes for the array. chunk_key_encoding : ChunkKeyEncoding, optional A specification of how the chunk keys are represented in storage. - For Zarr v3, the default is ``{"name": "default", "separator": "/"}}``. - For Zarr v2, the default is ``{"name": "v2", "separator": "."}}``. + For Zarr format 3, the default is ``{"name": "default", "separator": "/"}``. + For Zarr format 2, the default is ``{"name": "v2", "separator": "."}``. dimension_names : Iterable[str], optional The names of the dimensions (default is None). - Zarr v3 only. Zarr v2 arrays should not use this parameter. + Zarr format 3 only. Zarr format 2 arrays should not use this parameter. storage_options : dict, optional If using an fsspec URL to create the store, these will be passed to the backend implementation. Ignored otherwise. diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index bc7fd32cbf..b95433068a 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -34,7 +34,7 @@ class ArrayV2MetadataDict(TypedDict): """ - A typed dictionary model for zarr v2 metadata. + A typed dictionary model for Zarr format 2 metadata. """ zarr_format: Literal[2] @@ -68,7 +68,7 @@ def __init__( attributes: dict[str, JSON] | None = None, ) -> None: """ - Metadata for a Zarr version 2 array. + Metadata for a Zarr format 2 array. """ shape_parsed = parse_shapelike(shape) dtype_parsed = parse_dtype(dtype) @@ -327,7 +327,7 @@ def _default_fill_value(dtype: np.dtype[Any]) -> Any: stored in the Array metadata into an in-memory value. This only gives the default fill value for some type. - This is useful for reading Zarr V2 arrays, which allow the fill + This is useful for reading Zarr format 2 arrays, which allow the fill value to be unspecified. """ if dtype.kind == "S": diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 0821dd9bc9..1265c832b2 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -468,7 +468,7 @@ def parse_fill_value( fill_value : Any A potential fill value. dtype : str - A valid Zarr V3 DataType. + A valid Zarr format 3 DataType.
Returns ------- @@ -676,10 +676,10 @@ def parse(cls, dtype: DataType | Any | None) -> DataType: try: dtype = np.dtype(dtype) except (ValueError, TypeError) as e: - raise ValueError(f"Invalid V3 data_type: {dtype}") from e + raise ValueError(f"Invalid Zarr format 3 data_type: {dtype}") from e # check that this is a valid v3 data_type try: data_type = DataType.from_numpy(dtype) except KeyError as e: - raise ValueError(f"Invalid V3 data_type: {dtype}") from e + raise ValueError(f"Invalid Zarr format 3 data_type: {dtype}") from e return data_type diff --git a/tests/test_array.py b/tests/test_array.py index 628b873e72..86885514a3 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1219,7 +1219,7 @@ async def test_create_array_v2_no_shards(store: MemoryStore) -> None: Test that creating a Zarr v2 array with ``shard_shape`` set to a non-None value raises an error. """ msg = re.escape( - "Zarr v2 arrays can only be created with `shard_shape` set to `None`. Got `shard_shape=(5,)` instead." + "Zarr format 2 arrays can only be created with `shard_shape` set to `None`. Got `shard_shape=(5,)` instead." ) with pytest.raises(ValueError, match=msg): _ = await create_array( diff --git a/tests/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py index 6f7fba6dd1..ef527f42ef 100644 --- a/tests/test_metadata/test_v3.py +++ b/tests/test_metadata/test_v3.py @@ -336,13 +336,13 @@ def test_invalid_dtype_raises() -> None: "codecs": (), "fill_value": np.datetime64(0, "ns"), } - with pytest.raises(ValueError, match=r"Invalid V3 data_type: .*"): + with pytest.raises(ValueError, match=r"Invalid Zarr format 3 data_type: .*"): ArrayV3Metadata.from_dict(metadata_dict) @pytest.mark.parametrize("data", ["datetime64[s]", "foo", object()]) def test_parse_invalid_dtype_raises(data): - with pytest.raises(ValueError, match=r"Invalid V3 data_type: .*"): + with pytest.raises(ValueError, match=r"Invalid Zarr format 3 data_type: .*"): DataType.parse(data) From 91385283bc9e0ddbd192d5d08df196d9bd90b8e7 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Mon, 6 Jan 2025 16:22:18 +0100 Subject: [PATCH 68/87] Adds filters, compressors and serializer props to Array (#2652) * adds filters, serializer, compressors properties to Array * adapt Array.info * fixes doctests * ugly numcodecs class names * always show filters and compressors in Array.info * format --- docs/user-guide/arrays.rst | 41 +++++--- docs/user-guide/consolidated_metadata.rst | 12 +-- docs/user-guide/groups.rst | 8 +- docs/user-guide/performance.rst | 12 ++- src/zarr/api/synchronous.py | 2 +- src/zarr/core/_info.py | 28 ++--- src/zarr/core/array.py | 119 +++++++++++++++++++--- src/zarr/core/group.py | 6 +- src/zarr/core/metadata/v3.py | 28 ++++- tests/test_array.py | 93 +++++++++++------ tests/test_config.py | 8 +- tests/test_info.py | 12 ++- 12 files changed, 265 insertions(+), 104 deletions(-) diff --git a/docs/user-guide/arrays.rst b/docs/user-guide/arrays.rst index 110e12c3be..ba85ce1cda 100644 --- a/docs/user-guide/arrays.rst +++ b/docs/user-guide/arrays.rst @@ -168,8 +168,8 @@ argument accepted by all array creation functions. 
For example:: >>> data = np.arange(100000000, dtype='int32').reshape(10000, 10000) >>> z = zarr.create_array(store='data/example-5.zarr', shape=data.shape, dtype=data.dtype, chunks=(1000, 1000), compressors=compressors) >>> z[:] = data - >>> z.metadata.codecs - [BytesCodec(endian=<Endian.little: 'little'>), BloscCodec(typesize=4, cname=<BloscCname.zstd: 'zstd'>, clevel=3, shuffle=<BloscShuffle.shuffle: 'shuffle'>, blocksize=0)] + >>> z.compressors + (BloscCodec(typesize=4, cname=<BloscCname.zstd: 'zstd'>, clevel=3, shuffle=<BloscShuffle.shuffle: 'shuffle'>, blocksize=0),) This array above will use Blosc as the primary compressor, using the Zstandard algorithm (compression level 3) internally within Blosc, and with the @@ -188,7 +188,9 @@ which can be used to print useful diagnostics, e.g.:: Order : C Read-only : False Store type : LocalStore - Codecs : [{'endian': <Endian.little: 'little'>}, {'typesize': 4, 'cname': <BloscCname.zstd: 'zstd'>, 'clevel': 3, 'shuffle': <BloscShuffle.shuffle: 'shuffle'>, 'blocksize': 0}] + Filters : () + Serializer : BytesCodec(endian=<Endian.little: 'little'>) + Compressors : (BloscCodec(typesize=4, cname=<BloscCname.zstd: 'zstd'>, clevel=3, shuffle=<BloscShuffle.shuffle: 'shuffle'>, blocksize=0),) No. bytes : 400000000 (381.5M) The :func:`zarr.Array.info_complete` method inspects the underlying store and @@ -203,7 +205,9 @@ prints additional diagnostics, e.g.:: Order : C Read-only : False Store type : LocalStore - Codecs : [{'endian': <Endian.little: 'little'>}, {'typesize': 4, 'cname': <BloscCname.zstd: 'zstd'>, 'clevel': 3, 'shuffle': <BloscShuffle.shuffle: 'shuffle'>, 'blocksize': 0}] + Filters : () + Serializer : BytesCodec(endian=<Endian.little: 'little'>) + Compressors : (BloscCodec(typesize=4, cname=<BloscCname.zstd: 'zstd'>, clevel=3, shuffle=<BloscShuffle.shuffle: 'shuffle'>, blocksize=0),) No. bytes : 400000000 (381.5M) No. bytes stored : 9696302 Storage ratio : 41.3 @@ -223,8 +227,8 @@ here is an array using Gzip compression, level 1:: >>> data = np.arange(100000000, dtype='int32').reshape(10000, 10000) >>> z = zarr.create_array(store='data/example-6.zarr', shape=data.shape, dtype=data.dtype, chunks=(1000, 1000), compressors=zarr.codecs.GzipCodec(level=1)) >>> z[:] = data - >>> z.metadata.codecs - [BytesCodec(endian=<Endian.little: 'little'>), GzipCodec(level=1)] + >>> z.compressors + (GzipCodec(level=1),) Here is an example using LZMA from NumCodecs_ with a custom filter pipeline including LZMA's built-in delta filter:: >>> import lzma >>> lzma_filters = [dict(id=lzma.FILTER_DELTA, dist=4), ... dict(id=lzma.FILTER_LZMA2, preset=1)] >>> from numcodecs import LZMA >>> compressors = LZMA(filters=lzma_filters) >>> data = np.arange(100000000, dtype='int32').reshape(10000, 10000) >>> z = zarr.create_array(store='data/example-7.zarr', shape=data.shape, dtype=data.dtype, chunks=(1000, 1000), compressors=compressors) - >>> z.metadata.codecs - [BytesCodec(endian=<Endian.little: 'little'>), _make_bytes_bytes_codec.<locals>._Codec(codec_name='numcodecs.lzma', codec_config={'id': 'lzma', 'filters': [{'id': 3, 'dist': 4}, {'id': 33, 'preset': 1}]})] + >>> z.compressors + (_make_bytes_bytes_codec.<locals>._Codec(codec_name='numcodecs.lzma', codec_config={'id': 'lzma', 'filters': [{'id': 3, 'dist': 4}, {'id': 33, 'preset': 1}]}),) The default compressor can be changed by setting the value of ``array.v2_default_compressor`` using Zarr's :ref:`user-guide-config`, e.g.:: >>> with zarr.config.set({'array.v2_default_compressor.numeric': {'id': 'blosc'}}): ... z = zarr.create_array(store={}, shape=(100000000,), chunks=(1000000,), dtype='int32', zarr_format=2) - >>> z.metadata.filters - >>> z.metadata.compressor - Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0) + >>> z.filters + () + >>> z.compressors + (Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0),) To disable compression, set ``compressors=None`` when creating an array, e.g.:: >>> z = zarr.create_array(store='data/example-8.zarr', shape=(100000000,), chunks=(1000000,), dtype='int32', compressors=None) - >>> z.metadata.codecs - [BytesCodec(endian=<Endian.little: 'little'>)] + >>> z.compressors + () .. 
_user-guide-filters: @@ -287,7 +292,9 @@ Here is an example using a delta filter with the Blosc compressor:: Order : C Read-only : False Store type : LocalStore - Codecs : [{'codec_name': 'numcodecs.delta', 'codec_config': {'id': 'delta', 'dtype': 'int32'}}, {'endian': <Endian.little: 'little'>}, {'typesize': 4, 'cname': <BloscCname.zstd: 'zstd'>, 'clevel': 1, 'shuffle': <BloscShuffle.shuffle: 'shuffle'>, 'blocksize': 0}] + Filters : (_make_array_array_codec.<locals>._Codec(codec_name='numcodecs.delta', codec_config={'id': 'delta', 'dtype': 'int32'}),) + Serializer : BytesCodec(endian=<Endian.little: 'little'>) + Compressors : (BloscCodec(typesize=4, cname=<BloscCname.zstd: 'zstd'>, clevel=1, shuffle=<BloscShuffle.shuffle: 'shuffle'>, blocksize=0),) No. bytes : 400000000 (381.5M) For more information about available filter codecs, see the `Numcodecs @@ -600,11 +607,13 @@ Sharded arrays can be created by providing the ``shards`` parameter to :func:`za Order : C Read-only : False Store type : LocalStore - Codecs : [{'chunk_shape': (100, 100), 'codecs': ({'endian': <Endian.little: 'little'>}, {'level': 0, 'checksum': False}), 'index_codecs': ({'endian': <Endian.little: 'little'>}, {}), 'index_location': <ShardingCodecIndexLocation.end: 'end'>}] + Filters : () + Serializer : BytesCodec(endian=<Endian.little: 'little'>) + Compressors : (ZstdCodec(level=0, checksum=False),) No. bytes : 100000000 (95.4M) No. bytes stored : 3981060 Storage ratio : 25.1 - Chunks Initialized : 100 + Shards Initialized : 100 In this example a shard shape of (1000, 1000) and a chunk shape of (100, 100) is used. This means that 10*10 chunks are stored in each shard, and there are 10*10 shards in total. diff --git a/docs/user-guide/consolidated_metadata.rst b/docs/user-guide/consolidated_metadata.rst index 511761d34e..3c015dcfca 100644 --- a/docs/user-guide/consolidated_metadata.rst +++ b/docs/user-guide/consolidated_metadata.rst @@ -52,8 +52,8 @@ that can be used.: chunk_key_encoding=DefaultChunkKeyEncoding(name='default', separator='/'), fill_value=np.float64(0.0), - codecs=[BytesCodec(endian=<Endian.little: 'little'>), - ZstdCodec(level=0, checksum=False)], + codecs=(BytesCodec(endian=<Endian.little: 'little'>), + ZstdCodec(level=0, checksum=False)), attributes={}, dimension_names=None, zarr_format=3, @@ -65,8 +65,8 @@ that can be used.: chunk_key_encoding=DefaultChunkKeyEncoding(name='default', separator='/'), fill_value=np.float64(0.0), - codecs=[BytesCodec(endian=<Endian.little: 'little'>), - ZstdCodec(level=0, checksum=False)], + codecs=(BytesCodec(endian=<Endian.little: 'little'>), + ZstdCodec(level=0, checksum=False)), attributes={}, dimension_names=None, zarr_format=3, @@ -78,8 +78,8 @@ that can be used.: chunk_key_encoding=DefaultChunkKeyEncoding(name='default', separator='/'), fill_value=np.float64(0.0), - codecs=[BytesCodec(endian=<Endian.little: 'little'>), - ZstdCodec(level=0, checksum=False)], + codecs=(BytesCodec(endian=<Endian.little: 'little'>), + ZstdCodec(level=0, checksum=False)), attributes={}, dimension_names=None, zarr_format=3, diff --git a/docs/user-guide/groups.rst b/docs/user-guide/groups.rst index 62160ffde5..da5f393246 100644 --- a/docs/user-guide/groups.rst +++ b/docs/user-guide/groups.rst @@ -109,7 +109,9 @@ property. E.g.:: Order : C Read-only : False Store type : MemoryStore - Codecs : [{'endian': <Endian.little: 'little'>}, {'level': 0, 'checksum': False}] + Filters : () + Serializer : BytesCodec(endian=<Endian.little: 'little'>) + Compressors : (ZstdCodec(level=0, checksum=False),) No. bytes : 8000000 (7.6M) No. bytes stored : 1432 Storage ratio : 5586.6 @@ -123,7 +125,9 @@ property. E.g.:: Order : C Read-only : False Store type : MemoryStore - Codecs : [{'endian': <Endian.little: 'little'>}, {'level': 0, 'checksum': False}] + Filters : () + Serializer : BytesCodec(endian=<Endian.little: 'little'>) + Compressors : (ZstdCodec(level=0, checksum=False),) No. 
bytes : 4000000 (3.8M) Groups also have the :func:`zarr.Group.tree` method, e.g.:: diff --git a/docs/user-guide/performance.rst b/docs/user-guide/performance.rst index f56b642fb1..265bef8efe 100644 --- a/docs/user-guide/performance.rst +++ b/docs/user-guide/performance.rst @@ -98,7 +98,9 @@ To use sharding, you need to specify the ``shards`` parameter when creating the Order : C Read-only : False Store type : MemoryStore - Codecs : [{'chunk_shape': (100, 100, 100), 'codecs': ({'endian': <Endian.little: 'little'>}, {'level': 0, 'checksum': False}), 'index_codecs': ({'endian': <Endian.little: 'little'>}, {}), 'index_location': <ShardingCodecIndexLocation.end: 'end'>}] + Filters : () + Serializer : BytesCodec(endian=<Endian.little: 'little'>) + Compressors : (ZstdCodec(level=0, checksum=False),) No. bytes : 100000000000 (93.1G) .. _user-guide-chunks-order: @@ -125,7 +127,9 @@ ratios, depending on the correlation structure within the data. E.g.:: Order : C Read-only : False Store type : MemoryStore - Codecs : [{'endian': <Endian.little: 'little'>}, {'level': 0, 'checksum': False}] + Filters : () + Serializer : BytesCodec(endian=<Endian.little: 'little'>) + Compressors : (ZstdCodec(level=0, checksum=False),) No. bytes : 400000000 (381.5M) No. bytes stored : 342588717 Storage ratio : 1.2 @@ -142,7 +146,9 @@ ratios, depending on the correlation structure within the data. E.g.:: Order : F Read-only : False Store type : MemoryStore - Codecs : [{'endian': <Endian.little: 'little'>}, {'level': 0, 'checksum': False}] + Filters : () + Serializer : BytesCodec(endian=<Endian.little: 'little'>) + Compressors : (ZstdCodec(level=0, checksum=False),) No. bytes : 400000000 (381.5M) No. bytes stored : 342588717 Storage ratio : 1.2 diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 7b3d842832..1a8e6df649 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -802,7 +802,7 @@ def create_array( Use ``None`` to omit default filters. compressors : Iterable[Codec], optional List of compressors to apply to the array. Compressors are applied in order, and after any - filters are applied (if any are specified). + filters are applied (if any are specified) and the data is serialized into bytes. For Zarr format 3, a "compressor" is a codec that takes a bytestream, and returns another bytestream. Multiple compressors may be provided for Zarr format 3. diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py index 807e940508..845552c8be 100644 --- a/src/zarr/core/_info.py +++ b/src/zarr/core/_info.py @@ -5,7 +5,7 @@ import numcodecs.abc import numpy as np -from zarr.abc.codec import Codec +from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec from zarr.core.common import ZarrFormat from zarr.core.metadata.v3 import DataType @@ -85,9 +85,9 @@ class ArrayInfo: _order: Literal["C", "F"] _read_only: bool _store_type: str - _compressor: numcodecs.abc.Codec | None = None - _filters: tuple[numcodecs.abc.Codec, ...] | None = None - _codecs: list[Codec] | None = None + _filters: tuple[numcodecs.abc.Codec, ...] | tuple[ArrayArrayCodec, ...] = () + _serializer: ArrayBytesCodec | None = None + _compressors: tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec, ...] 
= () _count_bytes: int | None = None _count_bytes_stored: int | None = None _count_chunks_initialized: int | None = None @@ -109,18 +109,19 @@ def __repr__(self) -> str: Read-only : {_read_only} Store type : {_store_type}""") - kwargs = dataclasses.asdict(self) + # We can't use dataclasses.asdict, because we only want a shallow dict + kwargs = {field.name: getattr(self, field.name) for field in dataclasses.fields(self)} + if self._chunk_shape is None: # for non-regular chunk grids kwargs["chunk_shape"] = "" - if self._compressor is not None: - template += "\nCompressor : {_compressor}" - if self._filters is not None: - template += "\nFilters : {_filters}" + template += "\nFilters : {_filters}" + + if self._serializer is not None: + template += "\nSerializer : {_serializer}" - if self._codecs is not None: - template += "\nCodecs : {_codecs}" + template += "\nCompressors : {_compressors}" if self._count_bytes is not None: template += "\nNo. bytes : {_count_bytes}" @@ -139,5 +140,8 @@ def __repr__(self) -> str: kwargs["_storage_ratio"] = f"{self._count_bytes / self._count_bytes_stored:.1f}" if self._count_chunks_initialized is not None: - template += "\nChunks Initialized : {_count_chunks_initialized}" + if self._shard_shape is not None: + template += "\nShards Initialized : {_count_chunks_initialized}" + else: + template += "\nChunks Initialized : {_count_chunks_initialized}" return template.format(**kwargs) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 87ec4e48bc..2fa342ce16 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -20,6 +20,7 @@ from warnings import warn import numcodecs +import numcodecs.abc import numpy as np import numpy.typing as npt from typing_extensions import deprecated @@ -911,6 +912,63 @@ def size(self) -> int: """ return np.prod(self.metadata.shape).item() + @property + def filters(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[ArrayArrayCodec, ...]: + """ + Filters that are applied to each chunk of the array, in order, before serializing that + chunk to bytes. + """ + if self.metadata.zarr_format == 2: + filters = self.metadata.filters + if filters is None: + return () + return filters + + return tuple( + codec for codec in self.metadata.inner_codecs if isinstance(codec, ArrayArrayCodec) + ) + + @property + def serializer(self) -> ArrayBytesCodec | None: + """ + Array-to-bytes codec to use for serializing the chunks into bytes. + """ + if self.metadata.zarr_format == 2: + return None + + return next( + codec for codec in self.metadata.inner_codecs if isinstance(codec, ArrayBytesCodec) + ) + + @property + @deprecated("Use AsyncArray.compressors instead.") + def compressor(self) -> numcodecs.abc.Codec | None: + """ + Compressor that is applied to each chunk of the array. + + .. deprecated:: 3.0.0 + `array.compressor` is deprecated and will be removed in a future release. + Use `array.compressors` instead. + """ + if self.metadata.zarr_format == 2: + return self.metadata.compressor + raise TypeError("`compressor` is not available for Zarr format 3 arrays.") + + @property + def compressors(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec, ...]: + """ + Compressors that are applied to each chunk of the array. Compressors are applied in order, and after any + filters are applied (if any are specified) and the data is serialized into bytes. 
+ """ + if self.metadata.zarr_format == 2: + if self.metadata.compressor is not None: + return (self.metadata.compressor,) + return () + + return tuple( + codec for codec in self.metadata.inner_codecs if isinstance(codec, BytesBytesCodec) + ) + @property def dtype(self) -> np.dtype[Any]: """Returns the data type of the array. @@ -1561,31 +1619,27 @@ async def info_complete(self) -> Any: def _info( self, count_chunks_initialized: int | None = None, count_bytes_stored: int | None = None ) -> Any: - kwargs: dict[str, Any] = {} - if self.metadata.zarr_format == 2: - assert isinstance(self.metadata, ArrayV2Metadata) - if self.metadata.compressor is not None: - kwargs["_compressor"] = self.metadata.compressor - if self.metadata.filters is not None: - kwargs["_filters"] = self.metadata.filters - kwargs["_data_type"] = self.metadata.dtype - kwargs["_chunk_shape"] = self.metadata.chunks + _data_type: np.dtype[Any] | DataType + if isinstance(self.metadata, ArrayV2Metadata): + _data_type = self.metadata.dtype else: - kwargs["_codecs"] = self.metadata.codecs - kwargs["_data_type"] = self.metadata.data_type - kwargs["_chunk_shape"] = self.chunks - kwargs["_shard_shape"] = self.shards + _data_type = self.metadata.data_type return ArrayInfo( _zarr_format=self.metadata.zarr_format, + _data_type=_data_type, _shape=self.shape, _order=self.order, + _shard_shape=self.shards, + _chunk_shape=self.chunks, _read_only=self.read_only, + _compressors=self.compressors, + _filters=self.filters, + _serializer=self.serializer, _store_type=type(self.store_path.store).__name__, _count_bytes=self.nbytes, _count_bytes_stored=count_bytes_stored, _count_chunks_initialized=count_chunks_initialized, - **kwargs, ) @@ -1967,6 +2021,41 @@ def read_only(self) -> bool: def fill_value(self) -> Any: return self.metadata.fill_value + @property + def filters(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[ArrayArrayCodec, ...]: + """ + Filters that are applied to each chunk of the array, in order, before serializing that + chunk to bytes. + """ + return self._async_array.filters + + @property + def serializer(self) -> None | ArrayBytesCodec: + """ + Array-to-bytes codec to use for serializing the chunks into bytes. + """ + return self._async_array.serializer + + @property + @deprecated("Use Array.compressors instead.") + def compressor(self) -> numcodecs.abc.Codec | None: + """ + Compressor that is applied to each chunk of the array. + + .. deprecated:: 3.0.0 + `array.compressor` is deprecated and will be removed in a future release. + Use `array.compressors` instead. + """ + return self._async_array.compressor + + @property + def compressors(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec, ...]: + """ + Compressors that are applied to each chunk of the array. Compressors are applied in order, and after any + filters are applied (if any are specified) and the data is serialized into bytes. + """ + return self._async_array.compressors + @property def cdata_shape(self) -> ChunkCoords: """ @@ -3710,7 +3799,7 @@ async def create_array( Use ``None`` to omit default filters. compressors : Iterable[Codec], optional List of compressors to apply to the array. Compressors are applied in order, and after any - filters are applied (if any are specified). + filters are applied (if any are specified) and the data is serialized into bytes. For Zarr format 3, a "compressor" is a codec that takes a bytestream, and returns another bytestream. Multiple compressors my be provided for Zarr format 3. 
diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index dac2270a53..d100e30492 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -1065,7 +1065,7 @@ async def create_array( Use ``None`` to omit default filters. compressors : Iterable[Codec], optional List of compressors to apply to the array. Compressors are applied in order, and after any - filters are applied (if any are specified). + filters are applied (if any are specified) and the data is serialized into bytes. For Zarr format 3, a "compressor" is a codec that takes a bytestream, and returns another bytestream. Multiple compressors may be provided for Zarr format 3. @@ -2321,7 +2321,7 @@ def create_array( Use ``None`` to omit default filters. compressors : Iterable[Codec], optional List of compressors to apply to the array. Compressors are applied in order, and after any - filters are applied (if any are specified). + filters are applied (if any are specified) and the data is serialized into bytes. For Zarr format 3, a "compressor" is a codec that takes a bytestream, and returns another bytestream. Multiple compressors may be provided for Zarr format 3. @@ -2710,7 +2710,7 @@ def array( Use ``None`` to omit default filters. compressors : Iterable[Codec], optional List of compressors to apply to the array. Compressors are applied in order, and after any - filters are applied (if any are specified). + filters are applied (if any are specified) and the data is serialized into bytes. For Zarr format 3, a "compressor" is a codec that takes a bytestream, and returns another bytestream. Multiple compressors may be provided for Zarr format 3. diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 1265c832b2..13a275a6a1 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -81,9 +81,7 @@ def parse_codecs(data: object) -> tuple[Codec, ...]: return out -def validate_codecs(codecs: tuple[Codec, ...], dtype: DataType) -> None: - """Check that the codecs are valid for the given dtype""" - +def validate_array_bytes_codec(codecs: tuple[Codec, ...]) -> ArrayBytesCodec: # ensure that we have at least one ArrayBytesCodec abcs: list[ArrayBytesCodec] = [codec for codec in codecs if isinstance(codec, ArrayBytesCodec)] if len(abcs) == 0: @@ -91,7 +89,18 @@ elif len(abcs) > 1: raise ValueError("Only one ArrayBytesCodec is allowed.") - abc = abcs[0] + return abcs[0] + + +def validate_codecs(codecs: tuple[Codec, ...], dtype: DataType) -> None: + """Check that the codecs are valid for the given dtype""" + from zarr.codecs.sharding import ShardingCodec + + abc = validate_array_bytes_codec(codecs) + + # Recursively resolve array-bytes codecs within sharding codecs + while isinstance(abc, ShardingCodec): + abc = validate_array_bytes_codec(abc.codecs) # we need to have special codecs if we are decoding vlen strings or bytestrings # TODO: use codec ID instead of class name @@ -254,7 +263,7 @@ def __init__( config=ArrayConfig.from_dict({}), # TODO: config is not needed here. prototype=default_buffer_prototype(), # TODO: prototype is not needed here. 
) - codecs_parsed = [c.evolve_from_array_spec(array_spec) for c in codecs_parsed_partial] + codecs_parsed = tuple(c.evolve_from_array_spec(array_spec) for c in codecs_parsed_partial) validate_codecs(codecs_parsed_partial, data_type_parsed) object.__setattr__(self, "shape", shape_parsed) @@ -330,6 +339,15 @@ def shards(self) -> ChunkCoords | None: ) raise NotImplementedError(msg) + @property + def inner_codecs(self) -> tuple[Codec, ...]: + if isinstance(self.chunk_grid, RegularChunkGrid): + from zarr.codecs.sharding import ShardingCodec + + if len(self.codecs) == 1 and isinstance(self.codecs[0], ShardingCodec): + return self.codecs[0].codecs + return self.codecs + def get_chunk_spec( self, _chunk_coords: ChunkCoords, array_config: ArrayConfig, prototype: BufferPrototype ) -> ArraySpec: diff --git a/tests/test_array.py b/tests/test_array.py index 86885514a3..410b2e58d0 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -20,7 +20,6 @@ VLenUTF8Codec, ZstdCodec, ) -from zarr.codecs.sharding import ShardingCodec from zarr.core._info import ArrayInfo from zarr.core.array import ( CompressorsLike, @@ -494,7 +493,7 @@ def test_info_v2(self, chunks: tuple[int, int], shards: tuple[int, int] | None) _read_only=False, _store_type="MemoryStore", _count_bytes=512, - _compressor=numcodecs.Zstd(), + _compressors=(numcodecs.Zstd(),), ) assert result == expected @@ -510,9 +509,8 @@ def test_info_v3(self, chunks: tuple[int, int], shards: tuple[int, int] | None) _order="C", _read_only=False, _store_type="MemoryStore", - _codecs=[BytesCodec(), ZstdCodec()] - if shards is None - else [ShardingCodec(chunk_shape=chunks, codecs=[BytesCodec(), ZstdCodec()])], + _compressors=(ZstdCodec(),), + _serializer=BytesCodec(), _count_bytes=512, ) assert result == expected @@ -536,7 +534,7 @@ def test_info_complete(self, chunks: tuple[int, int], shards: tuple[int, int] | _order="C", _read_only=False, _store_type="MemoryStore", - _codecs=[BytesCodec()] if shards is None else [ShardingCodec(chunk_shape=chunks)], + _serializer=BytesCodec(), _count_bytes=512, _count_chunks_initialized=0, _count_bytes_stored=373 if shards is None else 578, # the metadata? @@ -572,7 +570,7 @@ async def test_info_v2_async( _read_only=False, _store_type="MemoryStore", _count_bytes=512, - _compressor=numcodecs.Zstd(), + _compressors=(numcodecs.Zstd(),), ) assert result == expected @@ -596,9 +594,8 @@ async def test_info_v3_async( _order="C", _read_only=False, _store_type="MemoryStore", - _codecs=[BytesCodec(), ZstdCodec()] - if shards is None - else [ShardingCodec(chunk_shape=chunks, codecs=[BytesCodec(), ZstdCodec()])], + _compressors=(ZstdCodec(),), + _serializer=BytesCodec(), _count_bytes=512, ) assert result == expected @@ -624,7 +621,7 @@ async def test_info_complete_async( _order="C", _read_only=False, _store_type="MemoryStore", - _codecs=[BytesCodec()] if shards is None else [ShardingCodec(chunk_shape=chunks)], + _serializer=BytesCodec(), _count_bytes=512, _count_chunks_initialized=0, _count_bytes_stored=373 if shards is None else 578, # the metadata? 
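Because ``inner_codecs`` looks through a single outer sharding codec, the new properties report the per-chunk encoding for sharded arrays too. A minimal sketch (in-memory store; shapes and printed reprs illustrative)::

    import zarr

    z = zarr.create_array(
        store={}, shape=(100, 100), shards=(20, 20), chunks=(10, 10), dtype='int32'
    )
    # metadata.codecs holds one ShardingCodec; filters/serializer/compressors
    # are resolved from the codecs nested inside it
    print(z.serializer)   # e.g. BytesCodec(...)
    print(z.compressors)  # e.g. (ZstdCodec(level=0, checksum=False),)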
@@ -839,7 +836,8 @@ def test_array_create_metadata_order_v2( arr = zarr.create_array(store=store, shape=(2, 2), order=order, zarr_format=2, dtype="i4") expected = order or zarr.config.get("array.order") - assert arr.metadata.order == expected # type: ignore[union-attr] + assert arr.metadata.zarr_format == 2 # guard for mypy + assert arr.metadata.order == expected @pytest.mark.parametrize("order_config", ["C", "F", None]) @@ -1048,10 +1046,15 @@ async def test_create_array_no_filters_compressors( compressors=empty_value, filters=empty_value, ) + # Test metadata explicitly + assert arr.metadata.zarr_format == 2 # guard for mypy # The v2 metadata stores None and () separately - assert arr.metadata.filters == empty_value # type: ignore[union-attr] + assert arr.metadata.filters == empty_value # The v2 metadata does not allow tuple for compressor, therefore it is turned into None - assert arr.metadata.compressor is None # type: ignore[union-attr] + assert arr.metadata.compressor is None + + assert arr.filters == () + assert arr.compressors == () # v3 arr = await create_array( @@ -1061,10 +1064,13 @@ async def test_create_array_no_filters_compressors( compressors=empty_value, filters=empty_value, ) + assert arr.metadata.zarr_format == 3 # guard for mypy if dtype == "str": - assert arr.metadata.codecs == [VLenUTF8Codec()] # type: ignore[union-attr] + assert arr.metadata.codecs == (VLenUTF8Codec(),) + assert arr.serializer == VLenUTF8Codec() else: - assert arr.metadata.codecs == [BytesCodec()] # type: ignore[union-attr] + assert arr.metadata.codecs == (BytesCodec(),) + assert arr.serializer == BytesCodec() @pytest.mark.parametrize("store", ["memory"], indirect=True) @@ -1116,8 +1122,14 @@ async def test_create_array_no_filters_compressors( ({"name": "transpose", "configuration": {"order": [0]}},), ], ) +@pytest.mark.parametrize(("chunks", "shards"), [((6,), None), ((3,), (6,))]) async def test_create_array_v3_chunk_encoding( - store: MemoryStore, compressors: CompressorsLike, filters: FiltersLike, dtype: str + store: MemoryStore, + compressors: CompressorsLike, + filters: FiltersLike, + dtype: str, + chunks: tuple[int, ...], + shards: tuple[int, ...] | None, ) -> None: """ Test various possibilities for the compressors and filters parameter to create_array @@ -1125,17 +1137,18 @@ async def test_create_array_v3_chunk_encoding( arr = await create_array( store=store, dtype=dtype, - shape=(10,), + shape=(12,), + chunks=chunks, + shards=shards, zarr_format=3, filters=filters, compressors=compressors, ) - aa_codecs_expected, _, bb_codecs_expected = _parse_chunk_encoding_v3( + filters_expected, _, compressors_expected = _parse_chunk_encoding_v3( filters=filters, compressors=compressors, serializer="auto", dtype=np.dtype(dtype) ) - # TODO: find a better way to get the filters / compressors from the array. - assert arr.codec_pipeline.array_array_codecs == aa_codecs_expected # type: ignore[attr-defined] - assert arr.codec_pipeline.bytes_bytes_codecs == bb_codecs_expected # type: ignore[attr-defined] + assert arr.filters == filters_expected + assert arr.compressors == compressors_expected @pytest.mark.parametrize("store", ["memory"], indirect=True) @@ -1167,9 +1180,16 @@ async def test_create_array_v2_chunk_encoding( filters_expected, compressor_expected = _parse_chunk_encoding_v2( filters=filters, compressor=compressors, dtype=np.dtype(dtype) ) - # TODO: find a better way to get the filters/compressor from the array. 
- assert arr.metadata.compressor == compressor_expected # type: ignore[union-attr] - assert arr.metadata.filters == filters_expected # type: ignore[union-attr] + assert arr.metadata.zarr_format == 2 # guard for mypy + assert arr.metadata.compressor == compressor_expected + assert arr.metadata.filters == filters_expected + + # Normalize for property getters + compressor_expected = () if compressor_expected is None else (compressor_expected,) + filters_expected = () if filters_expected is None else filters_expected + + assert arr.compressors == compressor_expected + assert arr.filters == filters_expected @pytest.mark.parametrize("store", ["memory"], indirect=True) @@ -1185,12 +1205,12 @@ async def test_create_array_v3_default_filters_compressors(store: MemoryStore, d shape=(10,), zarr_format=3, ) - expected_aa, expected_ab, expected_bb = _get_default_chunk_encoding_v3(np_dtype=np.dtype(dtype)) - # TODO: define the codec pipeline class such that these fields are required, which will obviate the - # type ignore statements - assert arr.codec_pipeline.array_array_codecs == expected_aa # type: ignore[attr-defined] - assert arr.codec_pipeline.bytes_bytes_codecs == expected_bb # type: ignore[attr-defined] - assert arr.codec_pipeline.array_bytes_codec == expected_ab # type: ignore[attr-defined] + expected_filters, expected_serializer, expected_compressors = _get_default_chunk_encoding_v3( + np_dtype=np.dtype(dtype) + ) + assert arr.filters == expected_filters + assert arr.serializer == expected_serializer + assert arr.compressors == expected_compressors @pytest.mark.parametrize("store", ["memory"], indirect=True) @@ -1209,8 +1229,15 @@ async def test_create_array_v2_default_filters_compressors(store: MemoryStore, d expected_filters, expected_compressors = _get_default_chunk_encoding_v2( np_dtype=np.dtype(dtype) ) - assert arr.metadata.filters == expected_filters # type: ignore[union-attr] - assert arr.metadata.compressor == expected_compressors # type: ignore[union-attr] + assert arr.metadata.zarr_format == 2 # guard for mypy + assert arr.metadata.filters == expected_filters + assert arr.metadata.compressor == expected_compressors + + # Normalize for property getters + expected_filters = () if expected_filters is None else expected_filters + expected_compressors = () if expected_compressors is None else (expected_compressors,) + assert arr.filters == expected_filters + assert arr.compressors == expected_compressors @pytest.mark.parametrize("store", ["memory"], indirect=True) diff --git a/tests/test_config.py b/tests/test_config.py index 20e3c6044f..ca65c62166 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -305,12 +305,12 @@ class NewCodec2(BytesCodec): @pytest.mark.parametrize( ("dtype", "expected_codecs"), [ - ("int", [BytesCodec(), GzipCodec()]), - ("bytes", [VLenBytesCodec(), GzipCodec()]), - ("str", [VLenUTF8Codec(), GzipCodec()]), + ("int", (BytesCodec(), GzipCodec())), + ("bytes", (VLenBytesCodec(), GzipCodec())), + ("str", (VLenUTF8Codec(), GzipCodec())), ], ) -async def test_default_codecs(dtype: str, expected_codecs: list[Codec]) -> None: +async def test_default_codecs(dtype: str, expected_codecs: tuple[Codec, ...]) -> None: with config.set( { "array.v3_default_codecs": { # test setting non-standard codecs diff --git a/tests/test_info.py b/tests/test_info.py index 5d9264aa13..db0fd0ef76 100644 --- a/tests/test_info.py +++ b/tests/test_info.py @@ -59,7 +59,7 @@ def test_array_info(zarr_format: ZarrFormat) -> None: _order="C", _read_only=True, _store_type="MemoryStore", - 
_codecs=[BytesCodec()],
+        _serializer=BytesCodec(),
     )
     result = repr(info)
     assert result == textwrap.dedent(f"""\
@@ -71,7 +71,9 @@ def test_array_info(zarr_format: ZarrFormat) -> None:
         Order              : C
         Read-only          : True
         Store type         : MemoryStore
-        Codecs             : [{{'endian': <Endian.little: 'little'>}}]""")
+        Filters            : ()
+        Serializer         : BytesCodec(endian=<Endian.little: 'little'>)
+        Compressors        : ()""")


 @pytest.mark.parametrize("zarr_format", ZARR_FORMATS)
@@ -95,7 +97,7 @@ def test_array_info_complete(
         _order="C",
         _read_only=True,
         _store_type="MemoryStore",
-        _codecs=[BytesCodec()],
+        _serializer=BytesCodec(),
         _count_bytes=count_bytes,
         _count_bytes_stored=count_bytes_stored,
         _count_chunks_initialized=count_chunks_initialized,
@@ -110,7 +112,9 @@ def test_array_info_complete(
         Order              : C
         Read-only          : True
         Store type         : MemoryStore
-        Codecs             : [{{'endian': <Endian.little: 'little'>}}]
+        Filters            : ()
+        Serializer         : BytesCodec(endian=<Endian.little: 'little'>)
+        Compressors        : ()
         No. bytes          : {count_bytes} ({count_bytes_formatted})
         No. bytes stored   : {count_bytes_stored_formatted}
         Storage ratio      : {storage_ratio_formatted}

From 4d252a2cd0d72f537a0cf833fbba389d0cefa495 Mon Sep 17 00:00:00 2001
From: David Stansby
Date: Mon, 6 Jan 2025 17:08:22 +0000
Subject: [PATCH 69/87] Don't draw invalid shapes in `test_vindex` (#2651)

* Don't draw invalid shapes in test_vindex

* Update test_properties.py

* style: pre-commit fixes

---------

Co-authored-by: Deepak Cherian
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 tests/test_properties.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/test_properties.py b/tests/test_properties.py
index f70753ceb5..678dcae89c 100644
--- a/tests/test_properties.py
+++ b/tests/test_properties.py
@@ -6,7 +6,7 @@

 import hypothesis.extra.numpy as npst  # noqa: E402
 import hypothesis.strategies as st  # noqa: E402
-from hypothesis import assume, given  # noqa: E402
+from hypothesis import given  # noqa: E402

 from zarr.testing.strategies import arrays, basic_indices, numpy_arrays, zarr_formats  # noqa: E402

@@ -34,9 +34,8 @@ def test_basic_indexing(data: st.DataObject) -> None:

 @given(data=st.data())
 def test_vindex(data: st.DataObject) -> None:
-    zarray = data.draw(arrays())
     # integer_array_indices can't handle 0-size dimensions.
-    assume(all(s > 0 for s in zarray.shape))
+    zarray = data.draw(arrays(shapes=npst.array_shapes(max_dims=4, min_side=1)))
     nparray = zarray[:]

     indexer = data.draw(

From 22634ea2dabc0ad9ecfa932e21079fabe94c1f50 Mon Sep 17 00:00:00 2001
From: Norman Rzepka
Date: Mon, 6 Jan 2025 18:29:37 +0100
Subject: [PATCH 70/87] Separate defaults for filters, serializers and compressors in v3 (#2653)

---
 docs/user-guide/config.rst   | 29 +++++-----
 src/zarr/api/asynchronous.py | 3 +-
 src/zarr/api/synchronous.py  | 17 +++---
 src/zarr/core/array.py       | 105 ++++++++++++-----------------
 src/zarr/core/config.py      | 11 ++--
 src/zarr/core/group.py       | 51 ++++++++---------
 tests/test_config.py         | 39 +++++--------
 7 files changed, 108 insertions(+), 147 deletions(-)

diff --git a/docs/user-guide/config.rst b/docs/user-guide/config.rst
index e38715b67e..a17bce9d99 100644
--- a/docs/user-guide/config.rst
+++ b/docs/user-guide/config.rst
@@ -28,7 +28,7 @@ Configuration options include the following:

 - Default Zarr format ``default_zarr_version``
 - Default array order in memory ``array.order``
-- Default codecs ``array.v3_default_codecs`` and ``array.v2_default_compressor``
+- Default filters, serializers and compressors, e.g.
``array.v3_default_filters``, ``array.v3_default_serializer``, ``array.v3_default_compressors``, ``array.v2_default_filters`` and ``array.v2_default_compressor``
 - Whether empty chunks are written to storage ``array.write_empty_chunks``
 - Async and threading options, e.g. ``async.concurrency`` and ``threading.max_workers``
 - Selections of implementations of codecs, codec pipelines and buffers
@@ -54,19 +54,20 @@ This is the current default configuration::
             'v2_default_filters': {'bytes': [{'id': 'vlen-bytes'}],
                                    'numeric': None,
                                    'string': [{'id': 'vlen-utf8'}]},
-            'v3_default_codecs': {'bytes': [{'name': 'vlen-bytes'},
-                                            {'configuration': {'checksum': False,
-                                                               'level': 0},
-                                             'name': 'zstd'}],
-                                  'numeric': [{'configuration': {'endian': 'little'},
-                                               'name': 'bytes'},
-                                              {'configuration': {'checksum': False,
-                                                                 'level': 0},
-                                               'name': 'zstd'}],
-                                  'string': [{'name': 'vlen-utf8'},
-                                             {'configuration': {'checksum': False,
-                                                                'level': 0},
-                                              'name': 'zstd'}]},
+            'v3_default_compressors': {'bytes': [{'configuration': {'checksum': False,
+                                                                    'level': 0},
+                                                  'name': 'zstd'}],
+                                       'numeric': [{'configuration': {'checksum': False,
+                                                                      'level': 0},
+                                                    'name': 'zstd'}],
+                                       'string': [{'configuration': {'checksum': False,
+                                                                     'level': 0},
+                                                   'name': 'zstd'}]},
+            'v3_default_filters': {'bytes': [], 'numeric': [], 'string': []},
+            'v3_default_serializer': {'bytes': {'name': 'vlen-bytes'},
+                                      'numeric': {'configuration': {'endian': 'little'},
+                                                  'name': 'bytes'},
+                                      'string': {'name': 'vlen-utf8'}},
             'write_empty_chunks': False},
  'async': {'concurrency': 10, 'timeout': None},
  'buffer': 'zarr.core.buffer.cpu.Buffer',
diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py
index 060618dbd1..2e98a43f94 100644
--- a/src/zarr/api/asynchronous.py
+++ b/src/zarr/api/asynchronous.py
@@ -892,7 +892,8 @@ async def create(
         - For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``.
         - For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``.

-        These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
+        These defaults can be changed by modifying the value of ``array.v3_default_filters``,
+        ``array.v3_default_serializer`` and ``array.v3_default_compressors`` in :mod:`zarr.core.config`.
     compressor : Codec, optional
         Primary compressor to compress chunk data.
         Zarr format 2 only. Zarr format 3 arrays should use ``codecs`` instead.
diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py
index 1a8e6df649..f8bee9fcef 100644
--- a/src/zarr/api/synchronous.py
+++ b/src/zarr/api/synchronous.py
@@ -788,9 +788,8 @@ def create_array(

         For Zarr format 3, a "filter" is a codec that takes an array and returns an array,
         and these values must be instances of ``ArrayArrayCodec``, or dict representations
         of ``ArrayArrayCodec``.
-        If ``filters`` and ``compressors`` are not specified, then the default codecs for
-        Zarr format 3 will be used.
-        These defaults can be changed by modifying the value of ``array.v3_default_codecs``
+        If no ``filters`` are provided, a default set of filters will be used.
+        These defaults can be changed by modifying the value of ``array.v3_default_filters``
         in :mod:`zarr.core.config`.
         Use ``None`` to omit default filters.

@@ -806,22 +805,22 @@ def create_array(

         For Zarr format 3, a "compressor" is a codec that takes a bytestream, and
         returns another bytestream. Multiple compressors may be provided for Zarr format 3.
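
A minimal sketch of overriding one of the new split defaults named above (the ``gzip`` settings are illustrative, and only the ``numeric`` entry is overridden here, which is enough for the ``int32`` array below)::

    import zarr
    from zarr.storage import MemoryStore

    # Override only the default v3 compressors for numeric dtypes; the default
    # filters and serializer are configured under their own, separate keys.
    with zarr.config.set(
        {
            "array.v3_default_compressors": {
                "numeric": [{"name": "gzip", "configuration": {"level": 5}}]
            }
        }
    ):
        arr = zarr.zeros(shape=(10,), dtype="int32", zarr_format=3, store=MemoryStore())
        print(arr.compressors)  # expected: (GzipCodec(level=5),)
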
-        If ``filters`` and ``compressors`` are not specified, then the default codecs for
-        Zarr format 3 will be used.
-        These defaults can be changed by modifying the value of ``array.v3_default_codecs``
+        If no ``compressors`` are provided, a default set of compressors will be used.
+        These defaults can be changed by modifying the value of ``array.v3_default_compressors``
         in :mod:`zarr.core.config`.
         Use ``None`` to omit default compressors.

         For Zarr format 2, a "compressor" can be any numcodecs codec. Only a single
         compressor may be provided for Zarr format 2.
-        If no ``compressors`` are provided, a default compressor will be used.
-        These defaults can be changed by modifying the value of ``array.v2_default_compressor``
+        If no ``compressor`` is provided, a default compressor will be used;
+        this default can be changed by modifying the value of ``array.v2_default_compressor``
         in :mod:`zarr.core.config`.
         Use ``None`` to omit the default compressor.
     serializer : dict[str, JSON] | ArrayBytesCodec, optional
         Array-to-bytes codec to use for encoding the array data.
         Zarr format 3 only. Zarr format 2 arrays use implicit array-to-bytes conversion.
-        If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used.
+        If no ``serializer`` is provided, a default serializer will be used.
+        These defaults can be changed by modifying the value of ``array.v3_default_serializer``
+        in :mod:`zarr.core.config`.
     fill_value : Any, optional
         Fill value for the array.
     order : {"C", "F"}, optional
diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py
index 2fa342ce16..915158cb5a 100644
--- a/src/zarr/core/array.py
+++ b/src/zarr/core/array.py
@@ -110,7 +110,6 @@
     _parse_array_array_codec,
     _parse_array_bytes_codec,
     _parse_bytes_bytes_codec,
-    _resolve_codec,
     get_pipeline_class,
 )
 from zarr.storage import StoreLike, make_store_path
@@ -469,7 +468,8 @@ async def create(
         - For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``.
         - For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``.

-        These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
+        These defaults can be changed by modifying the value of ``array.v3_default_filters``,
+        ``array.v3_default_serializer`` and ``array.v3_default_compressors`` in :mod:`zarr.core.config`.
     dimension_names : Iterable[str], optional
         The names of the dimensions (default is None).
         Zarr format 3 only. Zarr format 2 arrays should not use this parameter.
@@ -1715,7 +1715,8 @@ def create(
         - For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``.
         - For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``.

-        These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
+        These defaults can be changed by modifying the value of ``array.v3_default_filters``,
+        ``array.v3_default_serializer`` and ``array.v3_default_compressors`` in :mod:`zarr.core.config`.
     dimension_names : Iterable[str], optional
         The names of the dimensions (default is None).
         Zarr format 3 only. Zarr format 2 arrays should not use this parameter.
@@ -3698,17 +3699,9 @@ def _build_parents(

 def _get_default_codecs(
     np_dtype: np.dtype[Any],
-) -> list[dict[str, JSON]]:
-    default_codecs = zarr_config.get("array.v3_default_codecs")
-    dtype = DataType.from_numpy(np_dtype)
-    if dtype == DataType.string:
-        dtype_key = "string"
-    elif dtype == DataType.bytes:
-        dtype_key = "bytes"
-    else:
-        dtype_key = "numeric"
-
-    return cast(list[dict[str, JSON]], default_codecs[dtype_key])
+) -> tuple[Codec, ...]:
+    filters, serializer, compressors = _get_default_chunk_encoding_v3(np_dtype)
+    return filters + (serializer,) + compressors


 FiltersLike: TypeAlias = (
@@ -3785,9 +3778,8 @@ async def create_array(
         For Zarr format 3, a "filter" is a codec that takes an array and returns an array,
         and these values must be instances of ``ArrayArrayCodec``, or dict representations
         of ``ArrayArrayCodec``.
-        If ``filters`` and ``compressors`` are not specified, then the default codecs for
-        Zarr format 3 will be used.
-        These defaults can be changed by modifying the value of ``array.v3_default_codecs``
+        If no ``filters`` are provided, a default set of filters will be used.
+        These defaults can be changed by modifying the value of ``array.v3_default_filters``
         in :mod:`zarr.core.config`.
         Use ``None`` to omit default filters.

@@ -3803,22 +3795,22 @@ async def create_array(

         For Zarr format 3, a "compressor" is a codec that takes a bytestream, and
         returns another bytestream. Multiple compressors may be provided for Zarr format 3.
-        If ``filters`` and ``compressors`` are not specified, then the default codecs for
-        Zarr format 3 will be used.
-        These defaults can be changed by modifying the value of ``array.v3_default_codecs``
+        If no ``compressors`` are provided, a default set of compressors will be used.
+        These defaults can be changed by modifying the value of ``array.v3_default_compressors``
         in :mod:`zarr.core.config`.
         Use ``None`` to omit default compressors.

         For Zarr format 2, a "compressor" can be any numcodecs codec. Only a single
         compressor may be provided for Zarr format 2.
-        If no ``compressors`` are provided, a default compressor will be used.
-        These defaults can be changed by modifying the value of ``array.v2_default_compressor``
+        If no ``compressor`` is provided, a default compressor will be used;
+        this default can be changed by modifying the value of ``array.v2_default_compressor``
         in :mod:`zarr.core.config`.
         Use ``None`` to omit the default compressor.
     serializer : dict[str, JSON] | ArrayBytesCodec, optional
         Array-to-bytes codec to use for encoding the array data.
         Zarr format 3 only. Zarr format 2 arrays use implicit array-to-bytes conversion.
-        If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used.
+        If no ``serializer`` is provided, a default serializer will be used.
+        These defaults can be changed by modifying the value of ``array.v3_default_serializer``
+        in :mod:`zarr.core.config`.
     fill_value : Any, optional
         Fill value for the array.
     order : {"C", "F"}, optional
@@ -3997,7 +3989,6 @@ def _get_default_chunk_encoding_v3(
     """
     Get the default ArrayArrayCodecs, ArrayBytesCodec, and BytesBytesCodec for a given dtype.
""" - default_codecs = zarr_config.get("array.v3_default_codecs") dtype = DataType.from_numpy(np_dtype) if dtype == DataType.string: dtype_key = "string" @@ -4006,31 +3997,15 @@ def _get_default_chunk_encoding_v3( else: dtype_key = "numeric" - codec_dicts = default_codecs[dtype_key] - codecs = tuple(_resolve_codec(c) for c in codec_dicts) - array_bytes_maybe = None - array_array: list[ArrayArrayCodec] = [] - bytes_bytes: list[BytesBytesCodec] = [] - - for codec in codecs: - if isinstance(codec, ArrayBytesCodec): - if array_bytes_maybe is not None: - raise ValueError( - f"Got two instances of ArrayBytesCodec: {array_bytes_maybe} and {codec}. " - "Only one array-to-bytes codec is allowed." - ) - array_bytes_maybe = codec - elif isinstance(codec, ArrayArrayCodec): - array_array.append(codec) - elif isinstance(codec, BytesBytesCodec): - bytes_bytes.append(codec) - else: - raise TypeError(f"Unexpected codec type: {type(codec)}") + default_filters = zarr_config.get("array.v3_default_filters").get(dtype_key) + default_serializer = zarr_config.get("array.v3_default_serializer").get(dtype_key) + default_compressors = zarr_config.get("array.v3_default_compressors").get(dtype_key) - if array_bytes_maybe is None: - raise ValueError("Required ArrayBytesCodec was not found.") + filters = tuple(_parse_array_array_codec(codec_dict) for codec_dict in default_filters) + serializer = _parse_array_bytes_codec(default_serializer) + compressors = tuple(_parse_bytes_bytes_codec(codec_dict) for codec_dict in default_compressors) - return tuple(array_array), array_bytes_maybe, tuple(bytes_bytes) + return filters, serializer, compressors def _get_default_chunk_encoding_v2( @@ -4111,34 +4086,15 @@ def _parse_chunk_encoding_v3( default_array_array, default_array_bytes, default_bytes_bytes = _get_default_chunk_encoding_v3( dtype ) - maybe_bytes_bytes: Iterable[Codec | dict[str, JSON]] - maybe_array_array: Iterable[Codec | dict[str, JSON]] - out_bytes_bytes: tuple[BytesBytesCodec, ...] - if compressors is None: - out_bytes_bytes = () - - elif compressors == "auto": - out_bytes_bytes = default_bytes_bytes - else: - if isinstance(compressors, dict | Codec): - maybe_bytes_bytes = (compressors,) - elif compressors is None: - maybe_bytes_bytes = () - else: - maybe_bytes_bytes = cast(Iterable[Codec | dict[str, JSON]], compressors) - - out_bytes_bytes = tuple(_parse_bytes_bytes_codec(c) for c in maybe_bytes_bytes) - out_array_array: tuple[ArrayArrayCodec, ...] if filters is None: - out_array_array = () + out_array_array: tuple[ArrayArrayCodec, ...] = () elif filters == "auto": out_array_array = default_array_array else: + maybe_array_array: Iterable[Codec | dict[str, JSON]] if isinstance(filters, dict | Codec): maybe_array_array = (filters,) - elif filters is None: - maybe_array_array = () else: maybe_array_array = cast(Iterable[Codec | dict[str, JSON]], filters) out_array_array = tuple(_parse_array_array_codec(c) for c in maybe_array_array) @@ -4148,6 +4104,19 @@ def _parse_chunk_encoding_v3( else: out_array_bytes = _parse_array_bytes_codec(serializer) + if compressors is None: + out_bytes_bytes: tuple[BytesBytesCodec, ...] 
= ()
+    elif compressors == "auto":
+        out_bytes_bytes = default_bytes_bytes
+    else:
+        maybe_bytes_bytes: Iterable[Codec | dict[str, JSON]]
+        if isinstance(compressors, dict | Codec):
+            maybe_bytes_bytes = (compressors,)
+        else:
+            maybe_bytes_bytes = cast(Iterable[Codec | dict[str, JSON]], compressors)
+
+        out_bytes_bytes = tuple(_parse_bytes_bytes_codec(c) for c in maybe_bytes_bytes)
+
     return out_array_array, out_array_bytes, out_bytes_bytes

diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py
index 421a100f1b..7920d220a4 100644
--- a/src/zarr/core/config.py
+++ b/src/zarr/core/config.py
@@ -76,17 +76,20 @@ def reset(self) -> None:
                 "string": [{"id": "vlen-utf8"}],
                 "bytes": [{"id": "vlen-bytes"}],
             },
-            "v3_default_codecs": {
+            "v3_default_filters": {"numeric": [], "string": [], "bytes": []},
+            "v3_default_serializer": {
+                "numeric": {"name": "bytes", "configuration": {"endian": "little"}},
+                "string": {"name": "vlen-utf8"},
+                "bytes": {"name": "vlen-bytes"},
+            },
+            "v3_default_compressors": {
                 "numeric": [
-                    {"name": "bytes", "configuration": {"endian": "little"}},
                     {"name": "zstd", "configuration": {"level": 0, "checksum": False}},
                 ],
                 "string": [
-                    {"name": "vlen-utf8"},
                     {"name": "zstd", "configuration": {"level": 0, "checksum": False}},
                 ],
                 "bytes": [
-                    {"name": "vlen-bytes"},
                     {"name": "zstd", "configuration": {"level": 0, "checksum": False}},
                 ],
             },
diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py
index d100e30492..ebdc63364e 100644
--- a/src/zarr/core/group.py
+++ b/src/zarr/core/group.py
@@ -1051,9 +1048,8 @@ async def create_array(
         For Zarr format 3, a "filter" is a codec that takes an array and returns an array,
         and these values must be instances of ``ArrayArrayCodec``, or dict representations
         of ``ArrayArrayCodec``.
-        If ``filters`` and ``compressors`` are not specified, then the default codecs for
-        Zarr format 3 will be used.
-        These defaults can be changed by modifying the value of ``array.v3_default_codecs``
+        If no ``filters`` are provided, a default set of filters will be used.
+        These defaults can be changed by modifying the value of ``array.v3_default_filters``
         in :mod:`zarr.core.config`.
         Use ``None`` to omit default filters.

@@ -1069,16 +1068,14 @@ async def create_array(

         For Zarr format 3, a "compressor" is a codec that takes a bytestream, and
         returns another bytestream. Multiple compressors may be provided for Zarr format 3.
-        If ``filters`` and ``compressors`` are not specified, then the default codecs for
-        Zarr format 3 will be used.
-        These defaults can be changed by modifying the value of ``array.v3_default_codecs``
+        If no ``compressors`` are provided, a default set of compressors will be used.
+        These defaults can be changed by modifying the value of ``array.v3_default_compressors``
         in :mod:`zarr.core.config`.
         Use ``None`` to omit default compressors.

         For Zarr format 2, a "compressor" can be any numcodecs codec. Only a single
         compressor may be provided for Zarr format 2.
-        If no ``compressors`` are provided, a default compressor will be used.
-        These defaults can be changed by modifying the value of ``array.v2_default_compressor``
+        If no ``compressor`` is provided, a default compressor will be used;
+        this default can be changed by modifying the value of ``array.v2_default_compressor``
         in :mod:`zarr.core.config`.
         Use ``None`` to omit the default compressor.
     compressor : Codec, optional
     serializer : dict[str, JSON] | ArrayBytesCodec, optional
         Array-to-bytes codec to use for encoding the array data.
         Zarr format 3 only. Zarr format 2 arrays use implicit array-to-bytes conversion.
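
A quick sketch of the ``serializer`` parameter described above (illustrative; ``MemoryStore`` stands in for any store, and the dict form mirrors the ``bytes`` codec used as the v3 serializer default)::

    import zarr
    from zarr.storage import MemoryStore

    root = zarr.group(store=MemoryStore())
    # Choose the array-to-bytes codec explicitly instead of relying on the
    # "array.v3_default_serializer" config entry.
    arr = root.create_array(
        "x",
        shape=(10,),
        dtype="int32",
        serializer={"name": "bytes", "configuration": {"endian": "little"}},
    )
    print(arr.serializer)
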
-        If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used.
+        If no ``serializer`` is provided, a default serializer will be used.
+        These defaults can be changed by modifying the value of ``array.v3_default_serializer``
+        in :mod:`zarr.core.config`.
     fill_value : Any, optional
         Fill value for the array.
     order : {"C", "F"}, optional
@@ -2307,9 +2306,8 @@ def create_array(
         For Zarr format 3, a "filter" is a codec that takes an array and returns an array,
         and these values must be instances of ``ArrayArrayCodec``, or dict representations
         of ``ArrayArrayCodec``.
-        If ``filters`` and ``compressors`` are not specified, then the default codecs for
-        Zarr format 3 will be used.
-        These defaults can be changed by modifying the value of ``array.v3_default_codecs``
+        If no ``filters`` are provided, a default set of filters will be used.
+        These defaults can be changed by modifying the value of ``array.v3_default_filters``
         in :mod:`zarr.core.config`.
         Use ``None`` to omit default filters.

@@ -2325,16 +2323,14 @@ def create_array(

         For Zarr format 3, a "compressor" is a codec that takes a bytestream, and
         returns another bytestream. Multiple compressors may be provided for Zarr format 3.
-        If ``filters`` and ``compressors`` are not specified, then the default codecs for
-        Zarr format 3 will be used.
-        These defaults can be changed by modifying the value of ``array.v3_default_codecs``
+        If no ``compressors`` are provided, a default set of compressors will be used.
+        These defaults can be changed by modifying the value of ``array.v3_default_compressors``
         in :mod:`zarr.core.config`.
         Use ``None`` to omit default compressors.

         For Zarr format 2, a "compressor" can be any numcodecs codec. Only a single
         compressor may be provided for Zarr format 2.
-        If no ``compressors`` are provided, a default compressor will be used.
-        These defaults can be changed by modifying the value of ``array.v2_default_compressor``
+        If no ``compressor`` is provided, a default compressor will be used;
+        this default can be changed by modifying the value of ``array.v2_default_compressor``
         in :mod:`zarr.core.config`.
         Use ``None`` to omit the default compressor.
     compressor : Codec, optional
     serializer : dict[str, JSON] | ArrayBytesCodec, optional
         Array-to-bytes codec to use for encoding the array data.
         Zarr format 3 only. Zarr format 2 arrays use implicit array-to-bytes conversion.
-        If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used.
+        If no ``serializer`` is provided, a default serializer will be used.
+        These defaults can be changed by modifying the value of ``array.v3_default_serializer``
+        in :mod:`zarr.core.config`.
     fill_value : Any, optional
         Fill value for the array.
     order : {"C", "F"}, optional
@@ -2696,9 +2694,8 @@ def array(
         For Zarr format 3, a "filter" is a codec that takes an array and returns an array,
         and these values must be instances of ``ArrayArrayCodec``, or dict representations
         of ``ArrayArrayCodec``.
-        If ``filters`` and ``compressors`` are not specified, then the default codecs for
-        Zarr format 3 will be used.
-        These defaults can be changed by modifying the value of ``array.v3_default_codecs``
+        If no ``filters`` are provided, a default set of filters will be used.
+        These defaults can be changed by modifying the value of ``array.v3_default_filters``
         in :mod:`zarr.core.config`.
         Use ``None`` to omit default filters.

@@ -2714,16 +2711,14 @@ def array(

         For Zarr format 3, a "compressor" is a codec that takes a bytestream, and
         returns another bytestream. Multiple compressors may be provided for Zarr format 3.
-        If ``filters`` and ``compressors`` are not specified, then the default codecs for
-        Zarr format 3 will be used.
-        These defaults can be changed by modifying the value of ``array.v3_default_codecs``
+        If no ``compressors`` are provided, a default set of compressors will be used.
+        These defaults can be changed by modifying the value of ``array.v3_default_compressors``
         in :mod:`zarr.core.config`.
         Use ``None`` to omit default compressors.

         For Zarr format 2, a "compressor" can be any numcodecs codec. Only a single
         compressor may be provided for Zarr format 2.
-        If no ``compressors`` are provided, a default compressor will be used.
-        These defaults can be changed by modifying the value of ``array.v2_default_compressor``
+        If no ``compressor`` is provided, a default compressor will be used;
+        this default can be changed by modifying the value of ``array.v2_default_compressor``
         in :mod:`zarr.core.config`.
         Use ``None`` to omit the default compressor.
     compressor : Codec, optional
     serializer : dict[str, JSON] | ArrayBytesCodec, optional
         Array-to-bytes codec to use for encoding the array data.
         Zarr format 3 only. Zarr format 2 arrays use implicit array-to-bytes conversion.
-        If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used.
+        If no ``serializer`` is provided, a default serializer will be used.
+        These defaults can be changed by modifying the value of ``array.v3_default_serializer``
+        in :mod:`zarr.core.config`.
     fill_value : Any, optional
         Fill value for the array.
     order : {"C", "F"}, optional
diff --git a/tests/test_config.py b/tests/test_config.py
index ca65c62166..c552ace840 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -10,7 +10,7 @@
 import zarr
 import zarr.api
 from zarr import zeros
-from zarr.abc.codec import Codec, CodecInput, CodecOutput, CodecPipeline
+from zarr.abc.codec import CodecInput, CodecOutput, CodecPipeline
 from zarr.abc.store import ByteSetter, Store
 from zarr.codecs import (
     BloscCodec,
@@ -18,8 +18,6 @@
     Crc32cCodec,
     GzipCodec,
     ShardingCodec,
-    VLenBytesCodec,
-    VLenUTF8Codec,
 )
 from zarr.core.array_spec import ArraySpec
 from zarr.core.buffer import NDBuffer
@@ -64,17 +62,20 @@ def test_config_defaults_set() -> None:
                     "string": [{"id": "vlen-utf8"}],
                     "bytes": [{"id": "vlen-bytes"}],
                 },
-                "v3_default_codecs": {
-                    "bytes": [
-                        {"name": "vlen-bytes"},
-                        {"name": "zstd", "configuration": {"level": 0, "checksum": False}},
-                    ],
+                "v3_default_filters": {"numeric": [], "string": [], "bytes": []},
+                "v3_default_serializer": {
+                    "numeric": {"name": "bytes", "configuration": {"endian": "little"}},
+                    "string": {"name": "vlen-utf8"},
+                    "bytes": {"name": "vlen-bytes"},
+                },
+                "v3_default_compressors": {
                     "numeric": [
-                        {"name": "bytes", "configuration": {"endian": "little"}},
                         {"name": "zstd", "configuration": {"level": 0, "checksum": False}},
                     ],
                     "string": [
-                        {"name": "vlen-utf8"},
+                        {"name": "zstd", "configuration": {"level": 0, "checksum": False}},
+                    ],
+                    "bytes": [
                         {"name": "zstd", "configuration": {"level": 0, "checksum": False}},
                     ],
                 },
@@ -302,28 +303,18 @@ class NewCodec2(BytesCodec):
         get_codec_class("new_codec")


-@pytest.mark.parametrize(
-    ("dtype", "expected_codecs"),
-    [
-        ("int", (BytesCodec(), GzipCodec())),
-        ("bytes", (VLenBytesCodec(), GzipCodec())),
-        ("str", (VLenUTF8Codec(), GzipCodec())),
-    ],
-)
-async def test_default_codecs(dtype: str, expected_codecs: tuple[Codec, ...]) -> None:
+@pytest.mark.parametrize("dtype", ["int", "bytes", "str"])
+async def test_default_codecs(dtype: str) -> None:
     with config.set(
         {
+
"array.v3_default_compressors": { # test setting non-standard codecs "numeric": [ - {"name": "bytes", "configuration": {"endian": "little"}}, {"name": "gzip", "configuration": {"level": 5}}, ], "string": [ - {"name": "vlen-utf8"}, {"name": "gzip", "configuration": {"level": 5}}, ], "bytes": [ - {"name": "vlen-bytes"}, {"name": "gzip", "configuration": {"level": 5}}, ], } @@ -336,4 +327,4 @@ async def test_default_codecs(dtype: str, expected_codecs: tuple[Codec, ...]) -> zarr_format=3, store=MemoryStore(), ) - assert arr.metadata.codecs == expected_codecs + assert arr.compressors == (GzipCodec(),) From 71f635464d6480d28635c7c370cfda84182bae79 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 6 Jan 2025 12:58:50 -0700 Subject: [PATCH 71/87] Add moto[server] to test deps for FsspecStore (#2657) --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 8bc861d837..05db0860a8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,7 +69,7 @@ test = [ "s3fs", "pytest-asyncio", "pytest-accept", - "moto[s3]", + "moto[s3,server]", "requests", "rich", "mypy", From bb8ab0fa1f847eb95a545cd17af1c6ba51e69f65 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 6 Jan 2025 14:42:07 -0800 Subject: [PATCH 72/87] chore: update pre-commit hooks (#2660) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.8.2 → v0.8.6](https://github.com/astral-sh/ruff-pre-commit/compare/v0.8.2...v0.8.6) - [github.com/pre-commit/mirrors-mypy: v1.13.0 → v1.14.1](https://github.com/pre-commit/mirrors-mypy/compare/v1.13.0...v1.14.1) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ea1cd4dbab..a9b4c8f444 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,7 +6,7 @@ ci: default_stages: [pre-commit, pre-push] repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.8.2 + rev: v0.8.6 hooks: - id: ruff args: ["--fix", "--show-fixes"] @@ -22,7 +22,7 @@ repos: - id: check-yaml - id: trailing-whitespace - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.13.0 + rev: v1.14.1 hooks: - id: mypy files: src|tests From f9c20243d207835d15d7091e05e8ec82a265b7d1 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Tue, 7 Jan 2025 08:48:44 +0100 Subject: [PATCH 73/87] Zstd: Don't persist the checksum param if false (#2655) --- src/zarr/core/metadata/v2.py | 8 +++++++- tests/test_metadata/test_v2.py | 16 ++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index b95433068a..29cf15a119 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -116,7 +116,13 @@ def _json_convert( else: return o.descr if isinstance(o, numcodecs.abc.Codec): - return o.get_config() + codec_config = o.get_config() + + # Hotfix for https://github.com/zarr-developers/zarr-python/issues/2647 + if codec_config["id"] == "zstd" and not codec_config.get("checksum", False): + codec_config.pop("checksum", None) + + return codec_config if np.isscalar(o): out: Any if hasattr(o, "dtype") and o.dtype.kind == "M" and hasattr(o, "view"): diff --git a/tests/test_metadata/test_v2.py 
b/tests/test_metadata/test_v2.py index 69dbd4645b..5a5bf5f73a 100644 --- a/tests/test_metadata/test_v2.py +++ b/tests/test_metadata/test_v2.py @@ -9,6 +9,7 @@ import zarr.api.asynchronous import zarr.storage from zarr.core.buffer import cpu +from zarr.core.buffer.core import default_buffer_prototype from zarr.core.group import ConsolidatedMetadata, GroupMetadata from zarr.core.metadata import ArrayV2Metadata from zarr.core.metadata.v2 import parse_zarr_format @@ -282,3 +283,18 @@ def test_from_dict_extra_fields() -> None: order="C", ) assert result == expected + + +def test_zstd_checksum() -> None: + arr = zarr.create_array( + {}, + shape=(10,), + chunks=(10,), + dtype="int32", + compressors={"id": "zstd", "level": 5, "checksum": False}, + zarr_format=2, + ) + metadata = json.loads( + arr.metadata.to_buffer_dict(default_buffer_prototype())[".zarray"].to_bytes() + ) + assert "checksum" not in metadata["compressor"] From bc5877be4f61895a29fd811882e188f84fa3f8f2 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Tue, 7 Jan 2025 15:31:51 +0100 Subject: [PATCH 74/87] Feat/concurrent members (#2519) * feat: add wrapperstore * feat: add latencystore * rename noisysetter -> noisygetter * rename _wrapped to _store * loggingstore inherits from wrapperstore * initial commit * working members traversal * bolt concurrent members implementation onto async group * update scratch file * use metadata / node builders for v3 node creation * fix key/name handling in recursion * add latency-based test * add latency-based concurrency tests for group.members * improve comments for test * add concurrency limit * add test for concurrency limiting * docstrings * remove function that was only calling itself * docstrings * relax timing requirement for concurrency test * Update src/zarr/core/group.py Co-authored-by: Deepak Cherian * exists_ok -> overwrite * simplify group_members_perf test, just require that the duration is less than the number of groups * latency * update test docstring * remove vestigial test --------- Co-authored-by: Deepak Cherian --- src/zarr/api/asynchronous.py | 1 - src/zarr/core/array.py | 3 +- src/zarr/core/group.py | 328 ++++++++++++++++++++++++++--------- src/zarr/storage/_logging.py | 8 +- tests/test_group.py | 67 +++++++ 5 files changed, 318 insertions(+), 89 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 2e98a43f94..37a5b76bba 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -188,7 +188,6 @@ async def consolidate_metadata( group.store_path.store._check_writable() members_metadata = {k: v.metadata async for k, v in group.members(max_depth=None)} - # While consolidating, we want to be explicit about when child groups # are empty by inserting an empty dict for consolidated_metadata.metadata for k, v in members_metadata.items(): diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 915158cb5a..e0aad8b6ad 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -1995,10 +1995,11 @@ def path(self) -> str: @property def name(self) -> str: + """Array name following h5py convention.""" return self._async_array.name @property - def basename(self) -> str | None: + def basename(self) -> str: """Final component of name.""" return self._async_array.basename diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index ebdc63364e..82970e4b7f 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -31,7 +31,7 @@ create_array, ) from zarr.core.attributes import Attributes -from 
zarr.core.buffer import default_buffer_prototype +from zarr.core.buffer import Buffer, default_buffer_prototype from zarr.core.common import ( JSON, ZARR_JSON, @@ -662,6 +662,7 @@ async def getitem( """ store_path = self.store_path / key logger.debug("key=%s, store_path=%s", key, store_path) + metadata: ArrayV2Metadata | ArrayV3Metadata | GroupMetadata # Consolidated metadata lets us avoid some I/O operations so try that first. if self.metadata.consolidated_metadata is not None: @@ -678,12 +679,9 @@ async def getitem( raise KeyError(key) else: zarr_json = json.loads(zarr_json_bytes.to_bytes()) - if zarr_json["node_type"] == "group": - return type(self).from_dict(store_path, zarr_json) - elif zarr_json["node_type"] == "array": - return AsyncArray.from_dict(store_path, zarr_json) - else: - raise ValueError(f"unexpected node_type: {zarr_json['node_type']}") + metadata = _build_metadata_v3(zarr_json) + return _build_node_v3(metadata, store_path) + elif self.metadata.zarr_format == 2: # Q: how do we like optimistically fetching .zgroup, .zarray, and .zattrs? # This guarantees that we will always make at least one extra request to the store @@ -698,21 +696,19 @@ async def getitem( # unpack the zarray, if this is None then we must be opening a group zarray = json.loads(zarray_bytes.to_bytes()) if zarray_bytes else None + zgroup = json.loads(zgroup_bytes.to_bytes()) if zgroup_bytes else None # unpack the zattrs, this can be None if no attrs were written zattrs = json.loads(zattrs_bytes.to_bytes()) if zattrs_bytes is not None else {} if zarray is not None: - # TODO: update this once the V2 array support is part of the primary array class - zarr_json = {**zarray, "attributes": zattrs} - return AsyncArray.from_dict(store_path, zarr_json) + metadata = _build_metadata_v2(zarray, zattrs) + return _build_node_v2(metadata=metadata, store_path=store_path) else: - zgroup = ( - json.loads(zgroup_bytes.to_bytes()) - if zgroup_bytes is not None - else {"zarr_format": self.metadata.zarr_format} - ) - zarr_json = {**zgroup, "attributes": zattrs} - return type(self).from_dict(store_path, zarr_json) + # this is just for mypy + if TYPE_CHECKING: + assert zgroup is not None + metadata = _build_metadata_v2(zgroup, zattrs) + return _build_node_v2(metadata=metadata, store_path=store_path) else: raise ValueError(f"unexpected zarr_format: {self.metadata.zarr_format}") @@ -1346,18 +1342,50 @@ async def members( """ if max_depth is not None and max_depth < 0: raise ValueError(f"max_depth must be None or >= 0. Got '{max_depth}' instead") - async for item in self._members(max_depth=max_depth, current_depth=0): + async for item in self._members(max_depth=max_depth): yield item - async def _members( - self, max_depth: int | None, current_depth: int - ) -> AsyncGenerator[ + def _members_consolidated( + self, max_depth: int | None, prefix: str = "" + ) -> Generator[ tuple[str, AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata] | AsyncGroup], None, ]: + consolidated_metadata = self.metadata.consolidated_metadata + + do_recursion = max_depth is None or max_depth > 0 + + # we kind of just want the top-level keys. 
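+        # When consolidated metadata is present, every child's metadata is already
+        # embedded in this group's metadata document, so the members listing below
+        # requires no further store I/O.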
+ if consolidated_metadata is not None: + for key in consolidated_metadata.metadata: + obj = self._getitem_consolidated( + self.store_path, key, prefix=self.name + ) # Metadata -> Group/Array + key = f"{prefix}/{key}".lstrip("/") + yield key, obj + + if do_recursion and isinstance(obj, AsyncGroup): + if max_depth is None: + new_depth = None + else: + new_depth = max_depth - 1 + yield from obj._members_consolidated(new_depth, prefix=key) + + async def _members( + self, max_depth: int | None + ) -> AsyncGenerator[ + tuple[str, AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata] | AsyncGroup], None + ]: + skip_keys: tuple[str, ...] + if self.metadata.zarr_format == 2: + skip_keys = (".zattrs", ".zgroup", ".zarray", ".zmetadata") + elif self.metadata.zarr_format == 3: + skip_keys = ("zarr.json",) + else: + raise ValueError(f"Unknown Zarr format: {self.metadata.zarr_format}") + if self.metadata.consolidated_metadata is not None: - # we should be able to do members without any additional I/O - members = self._members_consolidated(max_depth, current_depth) + members = self._members_consolidated(max_depth=max_depth) for member in members: yield member return @@ -1371,66 +1399,12 @@ async def _members( ) raise ValueError(msg) - # would be nice to make these special keys accessible programmatically, - # and scoped to specific zarr versions - # especially true for `.zmetadata` which is configurable - _skip_keys = ("zarr.json", ".zgroup", ".zattrs", ".zmetadata") - - # hmm lots of I/O and logic interleaved here. - # We *could* have an async gen over self.metadata.consolidated_metadata.metadata.keys() - # and plug in here. `getitem` will skip I/O. - # Kinda a shame to have all the asyncio task overhead though, when it isn't needed. - - async for key in self.store_path.store.list_dir(self.store_path.path): - if key in _skip_keys: - continue - try: - obj = await self.getitem(key) - yield (key, obj) - - if ( - ((max_depth is None) or (current_depth < max_depth)) - and hasattr(obj.metadata, "node_type") - and obj.metadata.node_type == "group" - ): - # the assert is just for mypy to know that `obj.metadata.node_type` - # implies an AsyncGroup, not an AsyncArray - assert isinstance(obj, AsyncGroup) - async for child_key, val in obj._members( - max_depth=max_depth, current_depth=current_depth + 1 - ): - yield f"{key}/{child_key}", val - except KeyError: - # keyerror is raised when `key` names an object (in the object storage sense), - # as opposed to a prefix, in the store under the prefix associated with this group - # in which case `key` cannot be the name of a sub-array or sub-group. - warnings.warn( - f"Object at {key} is not recognized as a component of a Zarr hierarchy.", - UserWarning, - stacklevel=1, - ) - - def _members_consolidated( - self, max_depth: int | None, current_depth: int, prefix: str = "" - ) -> Generator[ - tuple[str, AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata] | AsyncGroup], - None, - ]: - consolidated_metadata = self.metadata.consolidated_metadata - - # we kind of just want the top-level keys. 
- if consolidated_metadata is not None: - for key in consolidated_metadata.metadata: - obj = self._getitem_consolidated( - self.store_path, key, prefix=self.name - ) # Metadata -> Group/Array - key = f"{prefix}/{key}".lstrip("/") - yield key, obj - - if ((max_depth is None) or (current_depth < max_depth)) and isinstance( - obj, AsyncGroup - ): - yield from obj._members_consolidated(max_depth, current_depth + 1, prefix=key) + # enforce a concurrency limit by passing a semaphore to all the recursive functions + semaphore = asyncio.Semaphore(config.get("async.concurrency")) + async for member in _iter_members_deep( + self, max_depth=max_depth, skip_keys=skip_keys, semaphore=semaphore + ): + yield member async def keys(self) -> AsyncGenerator[str, None]: """Iterate over member names.""" @@ -2783,3 +2757,191 @@ def array( ) ) ) + + +async def _getitem_semaphore( + node: AsyncGroup, key: str, semaphore: asyncio.Semaphore | None +) -> AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata] | AsyncGroup: + """ + Combine node.getitem with an optional semaphore. If the semaphore parameter is an + asyncio.Semaphore instance, then the getitem operation is performed inside an async context + manager provided by that semaphore. If the semaphore parameter is None, then getitem is invoked + without a context manager. + """ + if semaphore is not None: + async with semaphore: + return await node.getitem(key) + else: + return await node.getitem(key) + + +async def _iter_members( + node: AsyncGroup, + skip_keys: tuple[str, ...], + semaphore: asyncio.Semaphore | None, +) -> AsyncGenerator[ + tuple[str, AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata] | AsyncGroup], None +]: + """ + Iterate over the arrays and groups contained in a group. + + Parameters + ---------- + node : AsyncGroup + The group to traverse. + skip_keys : tuple[str, ...] + A tuple of keys to skip when iterating over the possible members of the group. + semaphore : asyncio.Semaphore | None + An optional semaphore to use for concurrency control. + + Yields + ------ + tuple[str, AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata] | AsyncGroup] + """ + + # retrieve keys from storage + keys = [key async for key in node.store.list_dir(node.path)] + keys_filtered = tuple(filter(lambda v: v not in skip_keys, keys)) + + node_tasks = tuple( + asyncio.create_task(_getitem_semaphore(node, key, semaphore), name=key) + for key in keys_filtered + ) + + for fetched_node_coro in asyncio.as_completed(node_tasks): + try: + fetched_node = await fetched_node_coro + except KeyError as e: + # keyerror is raised when `key` names an object (in the object storage sense), + # as opposed to a prefix, in the store under the prefix associated with this group + # in which case `key` cannot be the name of a sub-array or sub-group. + warnings.warn( + f"Object at {e.args[0]} is not recognized as a component of a Zarr hierarchy.", + UserWarning, + stacklevel=1, + ) + continue + match fetched_node: + case AsyncArray() | AsyncGroup(): + yield fetched_node.basename, fetched_node + case _: + raise ValueError(f"Unexpected type: {type(fetched_node)}") + + +async def _iter_members_deep( + group: AsyncGroup, + *, + max_depth: int | None, + skip_keys: tuple[str, ...], + semaphore: asyncio.Semaphore | None = None, +) -> AsyncGenerator[ + tuple[str, AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata] | AsyncGroup], None +]: + """ + Iterate over the arrays and groups contained in a group, and optionally the + arrays and groups contained in those groups. 
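+
+    A sketch of intended usage (illustrative only; ``group`` is any ``AsyncGroup``,
+    and the ``skip_keys`` shown assume Zarr format 3 metadata documents):
+
+        >>> async for name, node in _iter_members_deep(
+        ...     group, max_depth=1, skip_keys=("zarr.json",)
+        ... ):
+        ...     print(name)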
+ + Parameters + ---------- + group : AsyncGroup + The group to traverse. + max_depth : int | None + The maximum depth of recursion. + skip_keys : tuple[str, ...] + A tuple of keys to skip when iterating over the possible members of the group. + semaphore : asyncio.Semaphore | None + An optional semaphore to use for concurrency control. + + Yields + ------ + tuple[str, AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata] | AsyncGroup] + """ + + to_recurse = {} + do_recursion = max_depth is None or max_depth > 0 + + if max_depth is None: + new_depth = None + else: + new_depth = max_depth - 1 + async for name, node in _iter_members(group, skip_keys=skip_keys, semaphore=semaphore): + yield name, node + if isinstance(node, AsyncGroup) and do_recursion: + to_recurse[name] = _iter_members_deep( + node, max_depth=new_depth, skip_keys=skip_keys, semaphore=semaphore + ) + + for prefix, subgroup_iter in to_recurse.items(): + async for name, node in subgroup_iter: + key = f"{prefix}/{name}".lstrip("/") + yield key, node + + +def _resolve_metadata_v2( + blobs: tuple[str | bytes | bytearray, str | bytes | bytearray], +) -> ArrayV2Metadata | GroupMetadata: + zarr_metadata = json.loads(blobs[0]) + attrs = json.loads(blobs[1]) + if "shape" in zarr_metadata: + return ArrayV2Metadata.from_dict(zarr_metadata | {"attrs": attrs}) + else: + return GroupMetadata.from_dict(zarr_metadata | {"attrs": attrs}) + + +def _build_metadata_v3(zarr_json: dict[str, Any]) -> ArrayV3Metadata | GroupMetadata: + """ + Take a dict and convert it into the correct metadata type. + """ + if "node_type" not in zarr_json: + raise KeyError("missing `node_type` key in metadata document.") + match zarr_json: + case {"node_type": "array"}: + return ArrayV3Metadata.from_dict(zarr_json) + case {"node_type": "group"}: + return GroupMetadata.from_dict(zarr_json) + case _: + raise ValueError("invalid value for `node_type` key in metadata document") + + +def _build_metadata_v2( + zarr_json: dict[str, Any], attrs_json: dict[str, Any] +) -> ArrayV2Metadata | GroupMetadata: + """ + Take a dict and convert it into the correct metadata type. + """ + match zarr_json: + case {"shape": _}: + return ArrayV2Metadata.from_dict(zarr_json | {"attributes": attrs_json}) + case _: + return GroupMetadata.from_dict(zarr_json | {"attributes": attrs_json}) + + +def _build_node_v3( + metadata: ArrayV3Metadata | GroupMetadata, store_path: StorePath +) -> AsyncArray[ArrayV3Metadata] | AsyncGroup: + """ + Take a metadata object and return a node (AsyncArray or AsyncGroup). + """ + match metadata: + case ArrayV3Metadata(): + return AsyncArray(metadata, store_path=store_path) + case GroupMetadata(): + return AsyncGroup(metadata, store_path=store_path) + case _: + raise ValueError(f"Unexpected metadata type: {type(metadata)}") + + +def _build_node_v2( + metadata: ArrayV2Metadata | GroupMetadata, store_path: StorePath +) -> AsyncArray[ArrayV2Metadata] | AsyncGroup: + """ + Take a metadata object and return a node (AsyncArray or AsyncGroup). 
+ """ + + match metadata: + case ArrayV2Metadata(): + return AsyncArray(metadata, store_path=store_path) + case GroupMetadata(): + return AsyncGroup(metadata, store_path=store_path) + case _: + raise ValueError(f"Unexpected metadata type: {type(metadata)}") diff --git a/src/zarr/storage/_logging.py b/src/zarr/storage/_logging.py index 450913e9d3..45ddeef40c 100644 --- a/src/zarr/storage/_logging.py +++ b/src/zarr/storage/_logging.py @@ -11,7 +11,7 @@ from zarr.storage._wrapper import WrapperStore if TYPE_CHECKING: - from collections.abc import AsyncIterator, Generator, Iterable + from collections.abc import AsyncGenerator, Generator, Iterable from zarr.abc.store import ByteRangeRequest from zarr.core.buffer import Buffer, BufferPrototype @@ -205,19 +205,19 @@ async def set_partial_values( with self.log(keys): return await self._store.set_partial_values(key_start_values=key_start_values) - async def list(self) -> AsyncIterator[str]: + async def list(self) -> AsyncGenerator[str, None]: # docstring inherited with self.log(): async for key in self._store.list(): yield key - async def list_prefix(self, prefix: str) -> AsyncIterator[str]: + async def list_prefix(self, prefix: str) -> AsyncGenerator[str, None]: # docstring inherited with self.log(prefix): async for key in self._store.list_prefix(prefix=prefix): yield key - async def list_dir(self, prefix: str) -> AsyncIterator[str]: + async def list_dir(self, prefix: str) -> AsyncGenerator[str, None]: # docstring inherited with self.log(prefix): async for key in self._store.list_dir(prefix=prefix): diff --git a/tests/test_group.py b/tests/test_group.py index 19a9f9c9bb..c2a5f751f3 100644 --- a/tests/test_group.py +++ b/tests/test_group.py @@ -3,6 +3,7 @@ import contextlib import operator import pickle +import time import warnings from typing import TYPE_CHECKING, Any, Literal @@ -22,6 +23,7 @@ from zarr.core.sync import sync from zarr.errors import ContainsArrayError, ContainsGroupError from zarr.storage import LocalStore, MemoryStore, StorePath, ZipStore, make_store_path +from zarr.testing.store import LatencyStore from .conftest import parse_store @@ -1440,6 +1442,71 @@ def test_delitem_removes_children(store: Store, zarr_format: ZarrFormat) -> None g1["0/0"] +@pytest.mark.parametrize("store", ["memory"], indirect=True) +def test_group_members_performance(store: MemoryStore) -> None: + """ + Test that the execution time of Group.members is less than the number of members times the + latency for accessing each member. + """ + get_latency = 0.1 + + # use the input store to create some groups + group_create = zarr.group(store=store) + num_groups = 10 + + # Create some groups + for i in range(num_groups): + group_create.create_group(f"group{i}") + + latency_store = LatencyStore(store, get_latency=get_latency) + # create a group with some latency on get operations + group_read = zarr.group(store=latency_store) + + # check how long it takes to iterate over the groups + # if .members is sensitive to IO latency, + # this should take (num_groups * get_latency) seconds + # otherwise, it should take only marginally more than get_latency seconds + start = time.time() + _ = group_read.members() + elapsed = time.time() - start + + assert elapsed < (num_groups * get_latency) + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +def test_group_members_concurrency_limit(store: MemoryStore) -> None: + """ + Test that the execution time of Group.members can be constrained by the async concurrency + configuration setting. 
+ """ + get_latency = 0.02 + + # use the input store to create some groups + group_create = zarr.group(store=store) + num_groups = 10 + + # Create some groups + for i in range(num_groups): + group_create.create_group(f"group{i}") + + latency_store = LatencyStore(store, get_latency=get_latency) + # create a group with some latency on get operations + group_read = zarr.group(store=latency_store) + + # check how long it takes to iterate over the groups + # if .members is sensitive to IO latency, + # this should take (num_groups * get_latency) seconds + # otherwise, it should take only marginally more than get_latency seconds + from zarr.core.config import config + + with config.set({"async.concurrency": 1}): + start = time.time() + _ = group_read.members() + elapsed = time.time() - start + + assert elapsed > num_groups * get_latency + + @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) def test_deprecated_compressor(store: Store) -> None: g = zarr.group(store=store, zarr_format=2) From 12f601258d7af950a853a9e7fbbdc32feae73901 Mon Sep 17 00:00:00 2001 From: Tom White Date: Tue, 7 Jan 2025 18:54:48 +0000 Subject: [PATCH 75/87] Fix `Group.array()` with `data` argument (#2668) --- src/zarr/core/group.py | 5 ++++- tests/test_group.py | 3 +-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 82970e4b7f..79ab31112a 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -2729,6 +2729,8 @@ def array( Whether to overwrite an array with the same name in the store, if one exists. config : ArrayConfig or ArrayConfigLike, optional Runtime configuration for the array. + data : array_like + The data to fill the array with. Returns ------- @@ -2737,7 +2739,7 @@ def array( compressors = _parse_deprecated_compressor(compressor, compressors) return Array( self._sync( - self._async_group.create_array( + self._async_group.create_dataset( name=name, shape=shape, dtype=dtype, @@ -2754,6 +2756,7 @@ def array( overwrite=overwrite, storage_options=storage_options, config=config, + data=data, ) ) ) diff --git a/tests/test_group.py b/tests/test_group.py index c2a5f751f3..1d3563fe68 100644 --- a/tests/test_group.py +++ b/tests/test_group.py @@ -619,8 +619,7 @@ def test_group_create_array( array[:] = data elif method == "array": with pytest.warns(DeprecationWarning): - array = group.array(name="array", shape=shape, dtype=dtype) - array[:] = data + array = group.array(name="array", data=data, shape=shape, dtype=dtype) else: raise AssertionError From 29ef41dc9dd24cd96aaa294b442ea6fb892d5763 Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Tue, 7 Jan 2025 16:04:00 -0800 Subject: [PATCH 76/87] Make make_store_path private (#2628) * Clean up public store API * chore: make_store_path is private --------- Co-authored-by: David Stansby --- src/zarr/api/asynchronous.py | 6 ++---- src/zarr/core/array.py | 5 +++-- src/zarr/core/group.py | 4 ++-- src/zarr/storage/__init__.py | 3 +-- tests/test_group.py | 3 ++- tests/test_store/test_core.py | 3 ++- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 37a5b76bba..8eba4fc152 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -27,16 +27,14 @@ from zarr.core.metadata import ArrayMetadataDict, ArrayV2Metadata, ArrayV3Metadata from zarr.core.metadata.v2 import _default_compressor, _default_filters from zarr.errors import NodeTypeValidationError -from zarr.storage import ( - StoreLike, - 
make_store_path, -) +from zarr.storage._common import make_store_path if TYPE_CHECKING: from collections.abc import Iterable from zarr.abc.codec import Codec from zarr.core.chunk_key_encodings import ChunkKeyEncoding + from zarr.storage import StoreLike # TODO: this type could use some more thought ArrayLike = AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata] | Array | npt.NDArray[Any] diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index e0aad8b6ad..ea29a6fc48 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -112,8 +112,8 @@ _parse_bytes_bytes_codec, get_pipeline_class, ) -from zarr.storage import StoreLike, make_store_path -from zarr.storage._common import StorePath, ensure_no_existing_node +from zarr.storage import StoreLike +from zarr.storage._common import StorePath, ensure_no_existing_node, make_store_path if TYPE_CHECKING: from collections.abc import Iterator, Sequence @@ -122,6 +122,7 @@ from zarr.abc.codec import CodecPipeline from zarr.codecs.sharding import ShardingCodecIndexLocation from zarr.core.group import AsyncGroup + from zarr.storage import StoreLike # Array and AsyncArray are defined in the base ``zarr`` namespace diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 79ab31112a..57d9c5cd8d 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -50,8 +50,8 @@ from zarr.core.metadata.v3 import V3JsonEncoder from zarr.core.sync import SyncMixin, sync from zarr.errors import MetadataValidationError -from zarr.storage import StoreLike, StorePath, make_store_path -from zarr.storage._common import ensure_no_existing_node +from zarr.storage import StoreLike, StorePath +from zarr.storage._common import ensure_no_existing_node, make_store_path if TYPE_CHECKING: from collections.abc import AsyncGenerator, Generator, Iterable, Iterator diff --git a/src/zarr/storage/__init__.py b/src/zarr/storage/__init__.py index c092ade03e..649857f773 100644 --- a/src/zarr/storage/__init__.py +++ b/src/zarr/storage/__init__.py @@ -3,7 +3,7 @@ from types import ModuleType from typing import Any -from zarr.storage._common import StoreLike, StorePath, make_store_path +from zarr.storage._common import StoreLike, StorePath from zarr.storage._fsspec import FsspecStore from zarr.storage._local import LocalStore from zarr.storage._logging import LoggingStore @@ -21,7 +21,6 @@ "StorePath", "WrapperStore", "ZipStore", - "make_store_path", ] diff --git a/tests/test_group.py b/tests/test_group.py index 1d3563fe68..788e81e603 100644 --- a/tests/test_group.py +++ b/tests/test_group.py @@ -22,7 +22,8 @@ from zarr.core.group import ConsolidatedMetadata, GroupMetadata from zarr.core.sync import sync from zarr.errors import ContainsArrayError, ContainsGroupError -from zarr.storage import LocalStore, MemoryStore, StorePath, ZipStore, make_store_path +from zarr.storage import LocalStore, MemoryStore, StorePath, ZipStore +from zarr.storage._common import make_store_path from zarr.testing.store import LatencyStore from .conftest import parse_store diff --git a/tests/test_store/test_core.py b/tests/test_store/test_core.py index 5ab299442d..7806f3ecef 100644 --- a/tests/test_store/test_core.py +++ b/tests/test_store/test_core.py @@ -5,7 +5,8 @@ from _pytest.compat import LEGACY_PATH from zarr.core.common import AccessModeLiteral -from zarr.storage import FsspecStore, LocalStore, MemoryStore, StoreLike, StorePath, make_store_path +from zarr.storage import FsspecStore, LocalStore, MemoryStore, StoreLike, StorePath +from zarr.storage._common import 
make_store_path from zarr.storage._utils import normalize_path From 8bb0b3457bc31925e2ad0e737f1b29de9da74cbf Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Wed, 8 Jan 2025 00:24:06 -0800 Subject: [PATCH 77/87] add known bugs to work in progress section of the v3 migration guide (#2670) --- docs/user-guide/v3_migration.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/user-guide/v3_migration.rst b/docs/user-guide/v3_migration.rst index d90b87a897..66fcca6d19 100644 --- a/docs/user-guide/v3_migration.rst +++ b/docs/user-guide/v3_migration.rst @@ -206,3 +206,5 @@ of Zarr-Python, please open (or comment on) a * Object dtypes (:issue:`2617`) * Ragged arrays (:issue:`2618`) * Groups and Arrays do not implement ``__enter__`` and ``__exit__`` protocols (:issue:`2619`) + * Big Endian dtypes (:issue:`2324`) + * Default filters for object dtypes for Zarr format 2 arrays (:issue:`2627`) From eb2542498e93613e85c9555dcd2ccc606378fd57 Mon Sep 17 00:00:00 2001 From: Will Moore Date: Wed, 8 Jan 2025 10:26:30 +0000 Subject: [PATCH 78/87] Fix json indent (#2546) * Fix usage of config json_indent in V3JsonEncoder * Add test for json_indent * parametrize json indent * Add None to indent test parameters * ruff fix * other ruff fixes * Update src/zarr/core/metadata/v3.py Co-authored-by: Joe Hamman * Use explicit json encoder args * Add types * Update byte counts for tests --------- Co-authored-by: Joe Hamman Co-authored-by: Deepak Cherian --- docs/user-guide/arrays.rst | 4 ++-- docs/user-guide/groups.rst | 4 ++-- docs/user-guide/performance.rst | 4 ++-- src/zarr/core/metadata/v3.py | 28 +++++++++++++++++++++++++--- tests/test_array.py | 25 ++++++++++++------------- tests/test_metadata/test_v3.py | 11 ++++++++++- 6 files changed, 53 insertions(+), 23 deletions(-) diff --git a/docs/user-guide/arrays.rst b/docs/user-guide/arrays.rst index ba85ce1cda..ae2c4b47eb 100644 --- a/docs/user-guide/arrays.rst +++ b/docs/user-guide/arrays.rst @@ -209,7 +209,7 @@ prints additional diagnostics, e.g.:: Serializer : BytesCodec(endian=) Compressors : (BloscCodec(typesize=4, cname=, clevel=3, shuffle=, blocksize=0),) No. bytes : 400000000 (381.5M) - No. bytes stored : 9696302 + No. bytes stored : 9696520 Storage ratio : 41.3 Chunks Initialized : 100 @@ -611,7 +611,7 @@ Sharded arrays can be created by providing the ``shards`` parameter to :func:`za Serializer : BytesCodec(endian=) Compressors : (ZstdCodec(level=0, checksum=False),) No. bytes : 100000000 (95.4M) - No. bytes stored : 3981060 + No. bytes stored : 3981552 Storage ratio : 25.1 Shards Initialized : 100 diff --git a/docs/user-guide/groups.rst b/docs/user-guide/groups.rst index da5f393246..1e72df3478 100644 --- a/docs/user-guide/groups.rst +++ b/docs/user-guide/groups.rst @@ -113,8 +113,8 @@ property. E.g.:: Serializer : BytesCodec(endian=) Compressors : (ZstdCodec(level=0, checksum=False),) No. bytes : 8000000 (7.6M) - No. bytes stored : 1432 - Storage ratio : 5586.6 + No. bytes stored : 1614 + Storage ratio : 4956.6 Chunks Initialized : 0 >>> baz.info Type : Array diff --git a/docs/user-guide/performance.rst b/docs/user-guide/performance.rst index 265bef8efe..42d830780f 100644 --- a/docs/user-guide/performance.rst +++ b/docs/user-guide/performance.rst @@ -131,7 +131,7 @@ ratios, depending on the correlation structure within the data. E.g.:: Serializer : BytesCodec(endian=) Compressors : (ZstdCodec(level=0, checksum=False),) No. bytes : 400000000 (381.5M) - No. bytes stored : 342588717 + No. 
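
Taken together, patch 76 removes ``make_store_path`` from the public
surface of ``zarr.storage``. A sketch of the import convention the diffs
establish::

    # Still public: these names remain re-exported from zarr.storage.
    from zarr.storage import StoreLike, StorePath

    # No longer public -- this now raises ImportError:
    #   from zarr.storage import make_store_path

    # Internal callers and the test suite use the private module instead:
    from zarr.storage._common import make_store_path
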
bytes stored : 342588911 Storage ratio : 1.2 Chunks Initialized : 100 >>> with zarr.config.set({'array.order': 'F'}): @@ -150,7 +150,7 @@ ratios, depending on the correlation structure within the data. E.g.:: Serializer : BytesCodec(endian=) Compressors : (ZstdCodec(level=0, checksum=False),) No. bytes : 400000000 (381.5M) - No. bytes stored : 342588717 + No. bytes stored : 342588911 Storage ratio : 1.2 Chunks Initialized : 100 diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 13a275a6a1..ab62508c80 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -7,6 +7,7 @@ from zarr.core.buffer.core import default_buffer_prototype if TYPE_CHECKING: + from collections.abc import Callable from typing import Self from zarr.core.buffer import Buffer, BufferPrototype @@ -143,9 +144,30 @@ def parse_storage_transformers(data: object) -> tuple[dict[str, JSON], ...]: class V3JsonEncoder(json.JSONEncoder): - def __init__(self, *args: Any, **kwargs: Any) -> None: - self.indent = kwargs.pop("indent", config.get("json_indent")) - super().__init__(*args, **kwargs) + def __init__( + self, + *, + skipkeys: bool = False, + ensure_ascii: bool = True, + check_circular: bool = True, + allow_nan: bool = True, + sort_keys: bool = False, + indent: int | None = None, + separators: tuple[str, str] | None = None, + default: Callable[[object], object] | None = None, + ) -> None: + if indent is None: + indent = config.get("json_indent") + super().__init__( + skipkeys=skipkeys, + ensure_ascii=ensure_ascii, + check_circular=check_circular, + allow_nan=allow_nan, + sort_keys=sort_keys, + indent=indent, + separators=separators, + default=default, + ) def default(self, o: object) -> Any: if isinstance(o, np.dtype): diff --git a/tests/test_array.py b/tests/test_array.py index 410b2e58d0..6600424147 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -399,13 +399,13 @@ async def test_chunks_initialized() -> None: def test_nbytes_stored() -> None: arr = zarr.create(shape=(100,), chunks=(10,), dtype="i4", codecs=[BytesCodec()]) result = arr.nbytes_stored() - assert result == 366 # the size of the metadata document. This is a fragile test. + assert result == 502 # the size of the metadata document. This is a fragile test. arr[:50] = 1 result = arr.nbytes_stored() - assert result == 566 # the size with 5 chunks filled. + assert result == 702 # the size with 5 chunks filled. arr[50:] = 2 result = arr.nbytes_stored() - assert result == 766 # the size with all chunks filled. + assert result == 902 # the size with all chunks filled. async def test_nbytes_stored_async() -> None: @@ -413,13 +413,13 @@ async def test_nbytes_stored_async() -> None: shape=(100,), chunks=(10,), dtype="i4", codecs=[BytesCodec()] ) result = await arr.nbytes_stored() - assert result == 366 # the size of the metadata document. This is a fragile test. + assert result == 502 # the size of the metadata document. This is a fragile test. await arr.setitem(slice(50), 1) result = await arr.nbytes_stored() - assert result == 566 # the size with 5 chunks filled. + assert result == 702 # the size with 5 chunks filled. await arr.setitem(slice(50, 100), 2) result = await arr.nbytes_stored() - assert result == 766 # the size with all chunks filled. + assert result == 902 # the size with all chunks filled. 
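
The revised byte counts in these ``nbytes_stored`` tests follow directly
from the encoder change: metadata documents now honor the configured
``json_indent``, which the old encoder effectively ignored. A sketch
mirroring the ``test_json_indent`` check added below::

    import json

    from zarr.core.buffer import default_buffer_prototype
    from zarr.core.config import config
    from zarr.core.group import GroupMetadata

    # Wider indents mean more bytes per stored zarr.json document, which
    # is why the expected sizes in the surrounding tests grew.
    for indent in (None, 2, 4):
        with config.set({"json_indent": indent}):
            doc = GroupMetadata().to_buffer_dict(default_buffer_prototype())
            raw = doc["zarr.json"].to_bytes()
            assert raw == json.dumps(json.loads(raw), indent=indent).encode()
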
def test_default_fill_values() -> None: @@ -537,7 +537,7 @@ def test_info_complete(self, chunks: tuple[int, int], shards: tuple[int, int] | _serializer=BytesCodec(), _count_bytes=512, _count_chunks_initialized=0, - _count_bytes_stored=373 if shards is None else 578, # the metadata? + _count_bytes_stored=521 if shards is None else 982, # the metadata? ) assert result == expected @@ -545,11 +545,11 @@ def test_info_complete(self, chunks: tuple[int, int], shards: tuple[int, int] | result = arr.info_complete() if shards is None: expected = dataclasses.replace( - expected, _count_chunks_initialized=4, _count_bytes_stored=501 + expected, _count_chunks_initialized=4, _count_bytes_stored=649 ) else: expected = dataclasses.replace( - expected, _count_chunks_initialized=1, _count_bytes_stored=774 + expected, _count_chunks_initialized=1, _count_bytes_stored=1178 ) assert result == expected @@ -624,7 +624,7 @@ async def test_info_complete_async( _serializer=BytesCodec(), _count_bytes=512, _count_chunks_initialized=0, - _count_bytes_stored=373 if shards is None else 578, # the metadata? + _count_bytes_stored=521 if shards is None else 982, # the metadata? ) assert result == expected @@ -632,13 +632,12 @@ async def test_info_complete_async( result = await arr.info_complete() if shards is None: expected = dataclasses.replace( - expected, _count_chunks_initialized=4, _count_bytes_stored=501 + expected, _count_chunks_initialized=4, _count_bytes_stored=553 ) else: expected = dataclasses.replace( - expected, _count_chunks_initialized=1, _count_bytes_stored=774 + expected, _count_chunks_initialized=1, _count_bytes_stored=1178 ) - assert result == expected @pytest.mark.parametrize("store", ["memory"], indirect=True) diff --git a/tests/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py index ef527f42ef..a47cbf43bb 100644 --- a/tests/test_metadata/test_v3.py +++ b/tests/test_metadata/test_v3.py @@ -10,7 +10,8 @@ from zarr.codecs.bytes import BytesCodec from zarr.core.buffer import default_buffer_prototype from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding, V2ChunkKeyEncoding -from zarr.core.group import parse_node_type +from zarr.core.config import config +from zarr.core.group import GroupMetadata, parse_node_type from zarr.core.metadata.v3 import ( ArrayV3Metadata, DataType, @@ -304,6 +305,14 @@ def test_metadata_to_dict( assert observed == expected +@pytest.mark.parametrize("indent", [2, 4, None]) +def test_json_indent(indent: int): + with config.set({"json_indent": indent}): + m = GroupMetadata() + d = m.to_buffer_dict(default_buffer_prototype())["zarr.json"].to_bytes() + assert d == json.dumps(json.loads(d), indent=indent).encode() + + # @pytest.mark.parametrize("fill_value", [-1, 0, 1, 2932897]) # @pytest.mark.parametrize("precision", ["ns", "D"]) # async def test_datetime_metadata(fill_value: int, precision: str) -> None: From 0c1aad5782d1c9e3668dbf773cde877e70e64ac6 Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Wed, 8 Jan 2025 05:51:05 -0800 Subject: [PATCH 79/87] fix: threadpool configuration (#2671) --- src/zarr/core/sync.py | 8 +++++--- tests/test_sync.py | 18 ++++++++++++++---- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/src/zarr/core/sync.py b/src/zarr/core/sync.py index f7d4529478..6a2de855e8 100644 --- a/src/zarr/core/sync.py +++ b/src/zarr/core/sync.py @@ -54,9 +54,7 @@ def _get_executor() -> ThreadPoolExecutor: global _executor if not _executor: max_workers = config.get("threading.max_workers", None) - print(max_workers) - # if max_workers is not None 
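
Patch 79's executor wiring can be exercised directly: with
``threading.max_workers`` set, the first synchronous operation creates the
thread pool and installs it as the default executor on Zarr's event loop.
A minimal sketch, assuming the rest of the config is left at defaults::

    import zarr

    # With max_workers=None (the default), no pool is created and asyncio
    # falls back to its own default executor.
    with zarr.config.set({"threading.max_workers": 2}):
        _ = zarr.zeros(shape=(1,))  # any operation triggers loop + pool setup
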
and max_workers > 0: - # raise ValueError(max_workers) + logger.debug("Creating Zarr ThreadPoolExecutor with max_workers=%s", max_workers) _executor = ThreadPoolExecutor(max_workers=max_workers, thread_name_prefix="zarr_pool") _get_loop().set_default_executor(_executor) return _executor @@ -118,6 +116,9 @@ def sync( # NB: if the loop is not running *yet*, it is OK to submit work # and we will wait for it loop = _get_loop() + if _executor is None and config.get("threading.max_workers", None) is not None: + # trigger executor creation and attach to loop + _ = _get_executor() if not isinstance(loop, asyncio.AbstractEventLoop): raise TypeError(f"loop cannot be of type {type(loop)}") if loop.is_closed(): @@ -153,6 +154,7 @@ def _get_loop() -> asyncio.AbstractEventLoop: # repeat the check just in case the loop got filled between the # previous two calls from another thread if loop[0] is None: + logger.debug("Creating Zarr event loop") new_loop = asyncio.new_event_loop() loop[0] = new_loop iothread[0] = threading.Thread(target=new_loop.run_forever, name="zarr_io") diff --git a/tests/test_sync.py b/tests/test_sync.py index b0a6ecffd0..e0002fc5a7 100644 --- a/tests/test_sync.py +++ b/tests/test_sync.py @@ -12,6 +12,7 @@ _get_lock, _get_loop, cleanup_resources, + loop, sync, ) from zarr.storage import MemoryStore @@ -148,11 +149,20 @@ def test_open_positional_args_deprecate(): @pytest.mark.parametrize("workers", [None, 1, 2]) -def test_get_executor(clean_state, workers) -> None: +def test_threadpool_executor(clean_state, workers: int | None) -> None: with zarr.config.set({"threading.max_workers": workers}): - e = _get_executor() - if workers is not None and workers != 0: - assert e._max_workers == workers + _ = zarr.zeros(shape=(1,)) # trigger executor creation + assert loop != [None] # confirm loop was created + if workers is None: + # confirm no executor was created if no workers were specified + # (this is the default behavior) + assert loop[0]._default_executor is None + else: + # confirm executor was created and attached to loop as the default executor + # note: python doesn't have a direct way to get the default executor so we + # use the private attribute + assert _get_executor() is loop[0]._default_executor + assert _get_executor()._max_workers == workers def test_cleanup_resources_idempotent() -> None: From bc26199ccb0d0e4b75dbc07fb8ab598027941823 Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Wed, 8 Jan 2025 09:22:38 -0800 Subject: [PATCH 80/87] api: hide zarr.core from api docs (#2669) * api: hide zarr.core from api docs * dont link to zarr.config doc module --- docs/conf.py | 2 +- docs/user-guide/config.rst | 2 +- src/zarr/core/__init__.py | 5 +++++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 2a93e61d3e..8410b9b0b3 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -71,7 +71,7 @@ def skip_submodules( ) -> bool: # Skip documenting zarr.codecs submodules # codecs are documented in the main zarr.codecs namespace - if what == "module" and name.startswith("zarr.codecs."): + if what == "module" and name.startswith("zarr.codecs.") or name.startswith("zarr.core"): skip = True return skip diff --git a/docs/user-guide/config.rst b/docs/user-guide/config.rst index a17bce9d99..871291b72b 100644 --- a/docs/user-guide/config.rst +++ b/docs/user-guide/config.rst @@ -3,7 +3,7 @@ Runtime configuration ===================== -:mod:`zarr.config ` is responsible for managing the configuration of zarr and +``zarr.config`` is responsible for managing the 
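
One subtlety in the ``conf.py`` hunk above: Python parses ``A and B or C``
as ``(A and B) or C``, so the new ``zarr.core`` clause fires for every
``what``, not only for modules. A hypothetical standalone predicate that
makes the grouping explicit::

    def _should_skip(what: str, name: str) -> bool:
        # Parenthesized form of the condition in docs/conf.py: codecs
        # submodules are skipped only when documenting a module, while
        # anything under zarr.core is skipped unconditionally.
        return (what == "module" and name.startswith("zarr.codecs.")) or name.startswith(
            "zarr.core"
        )

    assert _should_skip("module", "zarr.codecs.blosc")
    assert _should_skip("attribute", "zarr.core.buffer")  # skipped for any `what`
    assert not _should_skip("attribute", "zarr.codecs.blosc")
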
configuration of zarr and is based on the `donfig `_ Python library. Configuration values can be set using code like the following:: diff --git a/src/zarr/core/__init__.py b/src/zarr/core/__init__.py index cbacfe3422..03a108dbbf 100644 --- a/src/zarr/core/__init__.py +++ b/src/zarr/core/__init__.py @@ -1,3 +1,8 @@ +""" +The ``zarr.core`` module is considered private API and should not be imported +directly by 3rd-party code. +""" + from __future__ import annotations from zarr.core.buffer import Buffer, NDBuffer # noqa: F401 From 22ebded93aa88ae1e5f87f6711fa7057ca2e8478 Mon Sep 17 00:00:00 2001 From: David Stansby Date: Wed, 8 Jan 2025 17:38:43 +0000 Subject: [PATCH 81/87] Clean up release notes in preparation for v3 (#2634) --- docs/conf.py | 4 +- docs/developers/contributing.rst | 2 +- docs/developers/index.rst | 1 - docs/developers/release.rst | 2334 ------------------------------ docs/index.rst | 2 +- docs/release-notes.rst | 16 + docs/user-guide/v3_migration.rst | 2 + 7 files changed, 22 insertions(+), 2339 deletions(-) delete mode 100644 docs/developers/release.rst create mode 100644 docs/release-notes.rst diff --git a/docs/conf.py b/docs/conf.py index 8410b9b0b3..22d24c3515 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -105,10 +105,10 @@ def skip_submodules( "license": "https://github.com/zarr-developers/zarr-python/blob/main/LICENSE.txt", "tutorial": "user-guide", "getting-started": "quickstart", - "release": "developers/release.html", "roadmap": "developers/roadmap.html", "installation": "user-guide/installation.html", - "api": "api/zarr/index" + "api": "api/zarr/index", + "release": "release-notes" } # The language for content autogenerated by Sphinx. Refer to documentation diff --git a/docs/developers/contributing.rst b/docs/developers/contributing.rst index 4358230eff..31cf80bed6 100644 --- a/docs/developers/contributing.rst +++ b/docs/developers/contributing.rst @@ -213,7 +213,7 @@ and functions are included in the API documentation, under the ``docs/api`` fold using the `autodoc `_ extension to sphinx. Any new features or important usage information should be included in the user-guide (``docs/user-guide``). Any changes should also be included in the release -notes (``docs/developers/release.rst``). +notes (``docs/release-notes.rst``). The documentation can be built locally by running:: diff --git a/docs/developers/index.rst b/docs/developers/index.rst index 3feb0aff71..4bccb3a469 100644 --- a/docs/developers/index.rst +++ b/docs/developers/index.rst @@ -6,5 +6,4 @@ Developer's Guide :maxdepth: 1 contributing - release roadmap diff --git a/docs/developers/release.rst b/docs/developers/release.rst deleted file mode 100644 index ce15c68f4a..0000000000 --- a/docs/developers/release.rst +++ /dev/null @@ -1,2334 +0,0 @@ -Release notes -============= - -.. - # Copy the warning statement _under_ the latest release version - # and unindent for pre-releases. - - .. warning:: - Pre-release! Use :command:`pip install --pre zarr` to evaluate this release. - -.. - # Unindent the section between releases in order - # to document your changes. On releases it will be - # re-indented so that it does not show up in the notes. - -.. note:: - Zarr-Python 2.18.* is expected be the final release in the 2.* series. Work on Zarr-Python 3.0 is underway. - See `GH1777 `_ for more details on the upcoming - 3.0 release. - -.. release_3.0.0-beta: - -3.0.0-beta series ------------------ - -.. warning:: - Zarr-Python 3.0.0-beta is a pre-release of the upcoming 3.0 release. 
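
The amended ``config.rst`` page documents donfig-style configuration; for
reference, a short sketch of both the process-wide and the scoped form
(the values are illustrative, drawn from elsewhere in this series)::

    import zarr

    # Process-wide: applies until changed again.
    zarr.config.set({"array.order": "F"})

    # Scoped: restored on exit, handy in tests and benchmarks.
    with zarr.config.set({"async.concurrency": 4}):
        ...
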
This release is not feature complete or - expected to be ready for production applications. - -.. note:: - The complete release notes for 3.0 have not been added to this document yet. See the - `3.0.0-beta `_ release on GitHub - for a record of changes included in this release. - -Dependency Changes -~~~~~~~~~~~~~~~~~~ - -* fsspec was moved from a required dependency to an optional one. Users should install - fsspec and any relevant implementations (e.g. s3fs) before using the ``RemoteStore``. - By :user:`Joe Hamman ` :issue:`2391`. - -* ``RemoteStore`` was renamed to ``FsspecStore``. - By :user:`Joe Hamman ` :issue:`2557`. - -.. release_3.0.0-alpha: - -3.0.0-alpha series ------------------- - -.. warning:: - Zarr-Python 3.0.0-alpha is a pre-release of the upcoming 3.0 release. This release is not feature complete or - expected to be ready for production applications. - -.. note:: - The complete release notes for 3.0 have not been added to this document yet. See the - `3.0.0-alpha `_ release on GitHub - for a record of changes included in this release. - -Enhancements -~~~~~~~~~~~~ - -* Implement listing of the sub-arrays and sub-groups for a V3 ``Group``. - By :user:`Davis Bennett ` :issue:`1726`. - -* Bootstrap v3 branch with zarrita. - By :user:`Joe Hamman ` :issue:`1584`. - -* Extensible codecs for V3. - By :user:`Norman Rzepka ` :issue:`1588`. - -* Don't import from tests. - By :user:`Davis Bennett ` :issue:`1601`. - -* Listable V3 Stores. - By :user:`Joe Hamman ` :issue:`1634`. - -* Codecs without array metadata. - By :user:`Norman Rzepka ` :issue:`1632`. - -* fix sync group class methods. - By :user:`Joe Hamman ` :issue:`1652`. - -* implement eq for LocalStore. - By :user:`Charoula Kyriakides ` :issue:`1792`. - -* V3 reorg. - By :user:`Joe Hamman ` :issue:`1809`. - -* [v3] Sync with futures. - By :user:`Davis Bennett ` :issue:`1804`. - -* implement group.members. - By :user:`Davis Bennett ` :issue:`1726`. - -* Remove implicit groups. - By :user:`Joe Hamman ` :issue:`1827`. - -* feature(store): ``list_*`` -> AsyncGenerators. - By :user:`Joe Hamman ` :issue:`1844`. - -* Test codec entrypoints. - By :user:`Norman Rzepka ` :issue:`1835`. - -* Remove extra v3 sync module. - By :user:`Max Jones ` :issue:`1856`. - -* Use donfig for V3 configuration. - By :user:`Max Jones ` :issue:`1655`. - -* groundwork for V3 group tests. - By :user:`Davis Bennett ` :issue:`1743`. - -* [v3] First step to generalizes ndarray and bytes. - By :user:`Mads R. B. Kristensen ` :issue:`1826`. - -* Reworked codec pipelines. - By :user:`Norman Rzepka ` :issue:`1670`. - -* Followup on codecs. - By :user:`Norman Rzepka ` :issue:`1889`. - -* Protocols for Buffer and NDBuffer. - By :user:`Mads R. B. Kristensen ` :issue:`1899`. - -* [V3] Expand store tests. - By :user:`Davis Bennett ` :issue:`1900`. - -* [v3] Feature: Store open mode. - By :user:`Joe Hamman ` :issue:`1911`. - -* fix(types): Group.info -> NotImplementedError. - By :user:`Joe Hamman ` :issue:`1936`. - -* feature(typing): add py.typed file to package root. - By :user:`Joe Hamman ` :issue:`1935`. - -* Support all indexing variants. - By :user:`Norman Rzepka ` :issue:`1917`. - -* Feature: group and array name properties. - By :user:`Joe Hamman ` :issue:`1940`. - -* implement .chunks on v3 arrays. - By :user:`Ryan Abernathey ` :issue:`1929`. - -* Fixes bug in transpose. - By :user:`Norman Rzepka ` :issue:`1949`. - -* Buffer Prototype Argument. - By :user:`Mads R. B. Kristensen ` :issue:`1910`. - -* Feature: Top level V3 API. 
- By :user:`Joe Hamman ` :issue:`1884`. - -* Basic working FsspecStore. - By :user:`Martin Durant `; :issue:`1785`. - -Typing -~~~~~~ - -* Resolve Mypy errors in v3 branch. - By :user:`Daniel Jahn ` :issue:`1692`. - -* Allow dmypy to be run on v3 branch. - By :user:`David Stansby ` :issue:`1780`. - -* Remove unused typing ignore comments. - By :user:`David Stansby ` :issue:`1781`. - -* Check untyped defs on v3. - By :user:`David Stansby ` :issue:`1784`. - -* [v3] Enable some more strict mypy options. - By :user:`David Stansby ` :issue:`1793`. - -* [v3] Disallow generic Any typing. - By :user:`David Stansby ` :issue:`1794`. - -* Disallow incomplete type definitions. - By :user:`David Stansby ` :issue:`1814`. - -* Disallow untyped calls. - By :user:`David Stansby ` :issue:`1811`. - -* Fix some untyped calls. - By :user:`David Stansby ` :issue:`1865`. - -* Disallow untyped defs. - By :user:`David Stansby ` :issue:`1834`. - -* Add more typing to zarr.group. - By :user:`David Stansby ` :issue:`1870`. - -* Fix any generics in zarr.array. - By :user:`David Stansby ` :issue:`1861`. - -* Remove some unused mypy overrides. - By :user:`David Stansby ` :issue:`1894`. - -* Finish typing zarr.metadata. - By :user:`David Stansby ` :issue:`1880`. - -* Disallow implicit re-exports. - By :user:`David Stansby ` :issue:`1908`. - -* Make typing strict. - By :user:`David Stansby ` :issue:`1879`. - -* Enable extra mypy error codes. - By :user:`David Stansby ` :issue:`1909`. - -* Enable warn_unreachable for mypy. - By :user:`David Stansby ` :issue:`1937`. - -* Fix final typing errors. - By :user:`David Stansby ` :issue:`1939`. - -Maintenance -~~~~~~~~~~~ - -* Remedy a situation where ``zarr-python`` was importing ``DummyStorageTransformer`` from the test suite. - The dependency relationship is now reversed: the test suite imports this class from ``zarr-python``. - By :user:`Davis Bennett ` :issue:`1601`. - -* [V3] Update minimum supported Python and Numpy versions. - By :user:`Joe Hamman ` :issue:`1638` - -* use src layout and use hatch for packaging. - By :user:`Davis Bennett ` :issue:`1592`. - -* temporarily disable mypy in v3 directory. - By :user:`Joe Hamman ` :issue:`1649`. - -* create hatch test env. - By :user:`Ryan Abernathey ` :issue:`1650`. - -* removed unused environments and workflows. - By :user:`Ryan Abernathey ` :issue:`1651`. - -* Add env variables to sprint setup instructions. - By :user:`Max Jones ` :issue:`1654`. - -* Add test matrix for V3. - By :user:`Max Jones ` :issue:`1656`. - -* Remove attrs. - By :user:`Davis Bennett ` :issue:`1660`. - -* Specify hatch envs using GitHub actions matrix for v3 tests. - By :user:`Max Jones ` :issue:`1728`. - -* black -> ruff format + cleanup. - By :user:`Saransh Chopra ` :issue:`1639`. - -* Remove old v3. - By :user:`Davis Bennett ` :issue:`1742`. - -* V3 update pre commit. - By :user:`Joe Hamman ` :issue:`1808`. - -* remove windows testing on v3 branch. - By :user:`Joe Hamman ` :issue:`1817`. - -* fix: add mypy to test dependencies. - By :user:`Davis Bennett ` :issue:`1789`. - -* chore(ci): add numpy 2 release candidate to test matrix. - By :user:`Joe Hamman ` :issue:`1828`. - -* fix dependencies. - By :user:`Norman Rzepka ` :issue:`1840`. - -* Add pytest to mypy dependencies. - By :user:`David Stansby ` :issue:`1846`. - -* chore(pre-commit): update pre-commit versions and remove attrs dep mypy section. - By :user:`Joe Hamman ` :issue:`1848`. - -* Enable some ruff rules (RUF) and fix issues. - By :user:`Dimitri Papadopoulos Orfanos ` :issue:`1869`. 
- -* Configure Ruff to apply flake8-bugbear/isort/pyupgrade. - By :user:`Norman Rzepka ` :issue:`1890`. - -* chore(ci): remove mypy from test action in favor of pre-commit action. - By :user:`Joe Hamman ` :issue:`1887`. - -* Enable ruff/flake8-raise rules (RSE) and fix issues. - By :user:`Dimitri Papadopoulos Orfanos ` :issue:`1872`. - -* Apply assorted ruff/refurb rules (FURB). - By :user:`Dimitri Papadopoulos Orfanos ` :issue:`1873`. - -* Enable ruff/flake8-implicit-str-concat rules (ISC) and fix issues. - By :user:`Dimitri Papadopoulos Orfanos ` :issue:`1868`. - -* Add numpy to mypy pre-commit check env. - By :user:`David Stansby ` :issue:`1893`. - -* remove fixture files from src. - By :user:`Davis Bennett ` :issue:`1897`. - -* Fix list of packages in mypy pre-commit environment. - By :user:`David Stansby ` :issue:`1907`. - -* Run sphinx directly on readthedocs. - By :user:`David Stansby ` :issue:`1919`. - -* Apply preview ruff rules. - By :user:`Dimitri Papadopoulos Orfanos ` :issue:`1942`. - -* Enable and apply ruff rule RUF009. - By :user:`Dimitri Papadopoulos Orfanos ` :issue:`1941`. - -Documentation -~~~~~~~~~~~~~ - -* Specify docs hatch env for v3 branch. - By :user:`Max Jones ` :issue:`1655`. - -* Development installation/contributing docs updates. - By :user:`Alden Keefe Sampson ` :issue:`1643`. - -* chore: update project settings per scientific python repo-review. - By :user:`Joe Hamman ` :issue:`1863`. - -* doc: update release notes for 3.0.0.alpha. - By :user:`Joe Hamman ` :issue:`1959`. - -.. _release_2.18.3: - -2.18.3 ------- - -Enhancements -~~~~~~~~~~~~ -* Added support for creating a copy of data when converting a `zarr.Array` - to a numpy array. - By :user:`David Stansby ` (:issue:`2106`) and - :user:`Joe Hamman ` (:issue:`2123`). - -Maintenance -~~~~~~~~~~~ -* Removed support for Python 3.9. - By :user:`David Stansby ` (:issue:`2074`). - -* Fix a regression when using orthogonal indexing with a scalar. - By :user:`Deepak Cherian ` :issue:`1931` - -* Added compatibility with NumPy 2.1. - By :user:`David Stansby ` - -* Bump minimum NumPy version to 1.24. - :user:`Joe Hamman ` (:issue:`2127`). - -Deprecations -~~~~~~~~~~~~ - -* Deprecate :class:`zarr.n5.N5Store` and :class:`zarr.n5.N5FSStore`. These - stores are slated to be removed in Zarr Python 3.0. - By :user:`Joe Hamman ` :issue:`2085`. - -.. _release_2.18.2: - -2.18.2 ------- - -Enhancements -~~~~~~~~~~~~ - -* Add Zstd codec to old V3 code path. - By :user:`Ryan Abernathey ` - -.. _release_2.18.1: - -2.18.1 ------- - -Maintenance -~~~~~~~~~~~ -* Fix a regression when getting or setting a single value from arrays with size-1 chunks. - By :user:`Deepak Cherian ` :issue:`1874` - -.. _release_2.18.0: - -2.18.0 ------- - -Enhancements -~~~~~~~~~~~~ -* Performance improvement for reading and writing chunks if any of the dimensions is size 1. - By :user:`Deepak Cherian ` :issue:`1730`. - -Maintenance -~~~~~~~~~~~ -* Enable ruff/bugbear rules (B) and fix issues. - By :user:`Dimitri Papadopoulos Orfanos ` :issue:`1702`. - -* Minor updates to use `np.inf` instead of `np.PINF` / `np.NINF` in preparation for NumPy 2.0.0 release. - By :user:`Joe Hamman ` :issue:`1842`. - -Deprecations -~~~~~~~~~~~~ - -* Deprecate experimental v3 support by issuing a `FutureWarning`. - Also updated docs to warn about using the experimental v3 version. - By :user:`Joe Hamman ` :issue:`1802` and :issue:`1807`. 
- -* Deprecate the following stores: :class:`zarr.storage.DBMStore`, :class:`zarr.storage.LMDBStore`, - :class:`zarr.storage.SQLiteStore`, :class:`zarr.storage.MongoDBStore`, :class:`zarr.storage.RedisStore`, - and :class:`zarr.storage.ABSStore`. These stores are slated to be removed from Zarr-Python in version 3.0. - By :user:`Joe Hamman ` :issue:`1801`. - -.. _release_2.17.2: - -2.17.2 ------- - -Enhancements -~~~~~~~~~~~~ - -* [v3] Dramatically reduce number of ``__contains__`` requests in favor of optimistically calling `__getitem__` - and handling any error that may arise. - By :user:`Deepak Cherian ` :issue:`1741`. - -* [v3] Reuse the downloaded array metadata when creating an ``Array``. - By :user:`Deepak Cherian ` :issue:`1734`. - -* Optimize ``Array.info`` so that it calls `getsize` only once. - By :user:`Deepak Cherian ` :issue:`1733`. - -* Override IPython ``_repr_*_`` methods to avoid expensive lookups against object stores. - By :user:`Deepak Cherian ` :issue:`1716`. - -* FSStore now raises rather than return bad data. - By :user:`Martin Durant ` and :user:`Ian Carroll ` :issue:`1604`. - -* Avoid redundant ``__contains__``. - By :user:`Deepak Cherian ` :issue:`1739`. - -Docs -~~~~ - -* Fix link to GCSMap in ``tutorial.rst``. - By :user:`Daniel Jahn ` :issue:`1689`. - -* Endorse `SPEC0000 `_ and state version support policy in ``installation.rst``. - By :user:`Sanket Verma ` :issue:`1665`. - -* Migrate v1 and v2 specification to `Zarr-Specs `_. - By :user:`Sanket Verma ` :issue:`1582`. - -Maintenance -~~~~~~~~~~~ - -* Add CI test environment for Python 3.12 - By :user:`Joe Hamman ` :issue:`1719`. - -* Bump minimum supported NumPy version to 1.23 (per spec 0000) - By :user:`Joe Hamman ` :issue:`1719`. - -* Minor fixes: Using ``is`` instead of ``type`` and removing unnecessary ``None``. - By :user:`Dimitri Papadopoulos Orfanos ` :issue:`1737`. - -* Fix tests failure related to Pytest 8. - By :user:`David Stansby ` :issue:`1714`. - -.. _release_2.17.1: - -2.17.1 ------- - -Enhancements -~~~~~~~~~~~~ - -* Change occurrences of % and format() to f-strings. - By :user:`Dimitri Papadopoulos Orfanos ` :issue:`1423`. - -* Proper argument for numpy.reshape. - By :user:`Dimitri Papadopoulos Orfanos ` :issue:`1425`. - -* Add typing to dimension separator arguments. - By :user:`David Stansby ` :issue:`1620`. - -Docs -~~~~ - -* ZIP related tweaks. - By :user:`Davis Bennett ` :issue:`1641`. - -Maintenance -~~~~~~~~~~~ - -* Update config.yml with Zulip. - By :user:`Josh Moore `. - -* Replace Gitter with the new Zulip Chat link. - By :user:`Sanket Verma ` :issue:`1685`. - -* Fix RTD build. - By :user:`Sanket Verma ` :issue:`1694`. - -.. _release_2.17.0: - -2.17.0 ------- - -Enhancements -~~~~~~~~~~~~ - -* Added type hints to ``zarr.creation.create()``. - By :user:`David Stansby ` :issue:`1536`. - -* Pyodide support: Don't require fasteners on Emscripten. - By :user:`Hood Chatham ` :issue:`1663`. - -Docs -~~~~ - -* Minor correction and changes in documentation. - By :user:`Sanket Verma ` :issue:`1509`. - -* Fix typo in documentation. - By :user:`Dimitri Papadopoulos Orfanos ` :issue:`1554` - -* The documentation build now fails if there are any warnings. - By :user:`David Stansby ` :issue:`1548`. - -* Add links to ``numcodecs`` docs in the tutorial. - By :user:`David Stansby ` :issue:`1535`. - -* Enable offline formats for documentation builds. - By :user:`Sanket Verma ` :issue:`1551`. - -* Minor tweak to advanced indexing tutorial examples. - By :user:`Ross Barnowski ` :issue:`1550`. 
- -* Automatically document array members using sphinx-automodapi. - By :user:`David Stansby ` :issue:`1547`. - -* Add a markdown file documenting the current and former core-developer team. - By :user:`Joe Hamman ` :issue:`1628`. - -* Add Norman Rzepka to core-dev team. - By :user:`Joe Hamman ` :issue:`1630`. - -* Added section about accessing ZIP archives on s3. - By :user:`Jeff Peck ` :issue:`1613`, :issue:`1615`, and :user:`Davis Bennett ` :issue:`1641`. - -* Add V3 roadmap and design document. - By :user:`Joe Hamman ` :issue:`1583`. - -Maintenance -~~~~~~~~~~~ - -* Drop Python 3.8 and NumPy 1.20 - By :user:`Josh Moore `; :issue:`1557`. - -* Cache result of ``FSStore._fsspec_installed()``. - By :user:`Janick Martinez Esturo ` :issue:`1581`. - -* Extend copyright notice to 2023. - By :user:`Jack Kelly ` :issue:`1528`. - -* Change occurrence of ``io.open()`` into ``open()``. - By :user:`Dimitri Papadopoulos Orfanos ` :issue:`1421`. - -* Preserve ``dimension_separator`` when resizing arrays. - By :user:`Ziwen Liu ` :issue:`1533`. - -* Initialise some sets in tests with set literals instead of list literals. - By :user:`Dimitri Papadopoulos Orfanos ` :issue:`1534`. - -* Allow ``black`` code formatter to be run with any Python version. - By :user:`David Stansby ` :issue:`1549`. - -* Remove ``sphinx-rtd-theme`` dependency from ``pyproject.toml``. - By :user:`Sanket Verma ` :issue:`1563`. - -* Remove ``CODE_OF_CONDUCT.md`` file from the Zarr-Python repository. - By :user:`Sanket Verma ` :issue:`1572`. - -* Bump version of black in pre-commit. - By :user:`David Stansby ` :issue:`1559`. - -* Use list comprehension where applicable. - By :user:`Dimitri Papadopoulos Orfanos ` :issue:`1555`. - -* Use format specification mini-language to format string. - By :user:`Dimitri Papadopoulos Orfanos ` :issue:`1558`. - -* Single startswith() call instead of multiple ones. - By :user:`Dimitri Papadopoulos Orfanos ` :issue:`1556`. - -* Move codespell options around. - By :user:`Dimitri Papadopoulos Orfanos ` :issue:`1196`. - -* Remove unused mypy ignore comments. - By :user:`David Stansby ` :issue:`1602`. - -.. _release_2.16.1: - -2.16.1 ------- - -Maintenance -~~~~~~~~~~~ - -* Require ``setuptools_scm`` version ``1.5.4``\+ - By :user:`John A. Kirkham ` :issue:`1477`. - -* Add ``docs`` requirements to ``pyproject.toml`` - By :user:`John A. Kirkham ` :issue:`1494`. - -* Fixed caching issue in ``LRUStoreCache``. - By :user:`Mads R. B. Kristensen ` :issue:`1499`. - -.. _release_2.16.0: - -2.16.0 ------- - -Enhancements -~~~~~~~~~~~~ - -* Allow for partial codec specification in V3 array metadata. - By :user:`Joe Hamman ` :issue:`1443`. - -* Add ``__contains__`` method to ``KVStore``. - By :user:`Christoph Gohlke ` :issue:`1454`. - -* **Block Indexing**: Implemented blockwise (chunk blocks) indexing to ``zarr.Array``. - By :user:`Altay Sansal ` :issue:`1428` - -Maintenance -~~~~~~~~~~~ - -* Refactor the core array tests to reduce code duplication. - By :user:`Davis Bennett ` :issue:`1462`. - -* Style the codebase with ``ruff`` and ``black``. - By :user:`Davis Bennett ` :issue:`1459` - -* Ensure that chunks is tuple of ints upon array creation. - By :user:`Philipp Hanslovsky ` :issue:`1461` - -.. _release_2.15.0: - -2.15.0 ------- - -Enhancements -~~~~~~~~~~~~ - -* Implement more extensive fallback of getitem/setitem for orthogonal indexing. - By :user:`Andreas Albert ` :issue:`1029`. - -* Getitems supports ``meta_array``. - By :user:`Mads R. B. Kristensen ` :issue:`1131`. 
- -* ``open_array()`` now takes the ``meta_array`` argument. - By :user:`Mads R. B. Kristensen ` :issue:`1396`. - -Maintenance -~~~~~~~~~~~ - -* Remove ``codecov`` from GitHub actions. - By :user:`John A. Kirkham ` :issue:`1391`. - -* Replace ``np.product`` with ``np.prod`` due to deprecation. - By :user:`James Bourbeau ` :issue:`1405`. - -* Activate Py 3.11 builds. - By :user:`Joe Hamman ` :issue:`1415`. - -Documentation -~~~~~~~~~~~~~ - -* Add API reference for V3 Implementation in the docs. - By :user:`Sanket Verma ` :issue:`1345`. - -Bug fixes -~~~~~~~~~ - -* Fix the conda-forge error. Read :issue:`1347` for detailed info. - By :user:`Josh Moore ` :issue:`1364` and :issue:`1367`. - -* Fix ``ReadOnlyError`` when opening V3 store via fsspec reference file system. - By :user:`Joe Hamman ` :issue:`1383`. - -* Fix ``normalize_fill_value`` for structured arrays. - By :user:`Alan Du ` :issue:`1397`. - -.. _release_2.14.2: - -2.14.2 ------- - -Bug fixes -~~~~~~~~~ - -* Ensure ``zarr.group`` uses writeable mode to fix issue with :issue:`1304`. - By :user:`Brandur Thorgrimsson ` :issue:`1354`. - -.. _release_2.14.1: - -2.14.1 ------- - -Documentation -~~~~~~~~~~~~~ - -* Fix API links. - By :user:`Josh Moore ` :issue:`1346`. - -* Fix unit tests which prevented the conda-forge release. - By :user:`Josh Moore ` :issue:`1348`. - -.. _release_2.14.0: - -2.14.0 ------- - -Major changes -~~~~~~~~~~~~~ - -* Improve Zarr V3 support, adding partial store read/write and storage transformers. - Add new features from the `v3 spec `_: - - * storage transformers - * `get_partial_values` and `set_partial_values` - * efficient `get_partial_values` implementation for `FSStoreV3` - * sharding storage transformer - - By :user:`Jonathan Striebel `; :issue:`1096`, :issue:`1111`. - -* N5 nows supports Blosc. - Remove warnings emitted when using N5Store or N5FSStore with a blosc-compressed array. - By :user:`Davis Bennett `; :issue:`1331`. - -Bug fixes -~~~~~~~~~ - -* Allow reading utf-8 encoded json files - By :user:`Nathan Zimmerberg ` :issue:`1308`. - -* Ensure contiguous data is give to ``FSStore``. Only copying if needed. - By :user:`Mads R. B. Kristensen ` :issue:`1285`. - -* NestedDirectoryStore.listdir now returns chunk keys with the correct '/' dimension_separator. - By :user:`Brett Graham ` :issue:`1334`. - -* N5Store/N5FSStore dtype returns zarr Stores readable dtype. - By :user:`Marwan Zouinkhi ` :issue:`1339`. - -.. _release_2.13.6: - -2.13.6 ------- - -Maintenance -~~~~~~~~~~~ - -* Bump gh-action-pypi-publish to 1.6.4. - By :user:`Josh Moore ` :issue:`1320`. - -.. _release_2.13.5: - -2.13.5 ------- - -Bug fixes -~~~~~~~~~ - -* Ensure ``zarr.create`` uses writeable mode to fix issue with :issue:`1304`. - By :user:`James Bourbeau ` :issue:`1309`. - -.. _release_2.13.4: - -2.13.4 ------- - -Appreciation -~~~~~~~~~~~~~ - -Special thanks to Outreachy participants for contributing to most of the -maintenance PRs. Please read the blog post summarising the contribution phase -and welcoming new Outreachy interns: -https://zarr.dev/blog/welcoming-outreachy-2022-interns/ - - -Enhancements -~~~~~~~~~~~~ - -* Handle fsspec.FSMap using FSStore store. - By :user:`Rafal Wojdyla ` :issue:`1304`. - -Bug fixes -~~~~~~~~~ - -* Fix bug that caused double counting of groups in ``groups()`` and ``group_keys()`` methods with V3 stores. - By :user:`Ryan Abernathey ` :issue:`1228`. - -* Remove unnecessary calling of `contains_array` for key that ended in `.array.json`. - By :user:`Joe Hamman ` :issue:`1149`. 
- -* Fix bug that caused double counting of groups in ``groups()`` and ``group_keys()`` - methods with V3 stores. - By :user:`Ryan Abernathey ` :issue:`1228`. - -Documentation -~~~~~~~~~~~~~ - -* Fix minor indexing errors in tutorial and specification examples of documentation. - By :user:`Kola Babalola ` :issue:`1277`. - -* Add `requirements_rtfd.txt` in `contributing.rst`. - By :user:`AWA BRANDON AWA ` :issue:`1243`. - -* Add documentation for find/findall using visit. - By :user:`Weddy Gikunda ` :issue:`1241`. - -* Refresh of the main landing page. - By :user:`Josh Moore ` :issue:`1173`. - -Maintenance -~~~~~~~~~~~ - -* Migrate to ``pyproject.toml`` and remove redundant infrastructure. - By :user:`Saransh Chopra ` :issue:`1158`. - -* Require ``setuptools`` 64.0.0+ - By :user:`Saransh Chopra ` :issue:`1193`. - -* Pin action versions (pypi-publish, setup-miniconda) for dependabot - By :user:`Saransh Chopra ` :issue:`1205`. - -* Remove ``tox`` support - By :user:`Saransh Chopra ` :issue:`1219`. - -* Add workflow to label PRs with "needs release notes". - By :user:`Saransh Chopra ` :issue:`1239`. - -* Simplify if/else statement. - By :user:`Dimitri Papadopoulos Orfanos ` :issue:`1227`. - -* Get coverage up to 100%. - By :user:`John Kirkham ` :issue:`1264`. - -* Migrate coverage to ``pyproject.toml``. - By :user:`John Kirkham ` :issue:`1250`. - -* Use ``conda-incubator/setup-miniconda@v2.2.0``. - By :user:`John Kirkham ` :issue:`1263`. - -* Delete unused files. - By :user:`John Kirkham ` :issue:`1251`. - -* Skip labeller for bot PRs. - By :user:`Saransh Chopra ` :issue:`1271`. - -* Restore Flake8 configuration. - By :user:`John Kirkham ` :issue:`1249`. - -* Add missing newline at EOF. - By :user:`Dimitri Papadopoulos` :issue:`1253`. - -* Add `license_files` to `pyproject.toml`. - By :user:`John Kirkham ` :issue:`1247`. - -* Adding `pyupgrade` suggestions. - By :user:`Dimitri Papadopoulos Orfanos ` :issue:`1225`. - -* Fixed some linting errors. - By :user:`Weddy Gikunda ` :issue:`1226`. - -* Added the link to main website in readthedocs sidebar. - By :user:`Stephanie_nkwatoh ` :issue:`1216`. - -* Remove redundant wheel dependency in `pyproject.toml`. - By :user:`Dimitri Papadopoulos Orfanos ` :issue:`1233`. - -* Turned on `isloated_build` in `tox.ini` file. - By :user:`AWA BRANDON AWA ` :issue:`1210`. - -* Fixed `flake8` alert and avoid duplication of `Zarr Developers`. - By :user:`Dimitri Papadopoulos Orfanos ` :issue:`1203`. - -* Bump to NumPy 1.20+ in `environment.yml`. - By :user:`John Kirkham ` :issue:`1201`. - -* Bump to NumPy 1.20 in `pyproject.toml`. - By :user:`Dimitri Papadopoulos Orfanos ` :issue:`1192`. - -* Remove LGTM (`.lgtm.yml`) configuration file. - By :user:`Dimitri Papadopoulos Orfanos ` :issue:`1191`. - -* Codespell will skip `fixture` in pre-commit. - By :user:`Dimitri Papadopoulos Orfanos ` :issue:`1197`. - -* Add msgpack in `requirements_rtfd.txt`. - By :user:`Emmanuel Bolarinwa ` :issue:`1188`. - -* Added license to docs fixed a typo from `_spec_v2` to `_spec_v3`. - By :user:`AWA BRANDON AWA ` :issue:`1182`. - -* Fixed installation link in `README.md`. - By :user:`AWA BRANDON AWA ` :issue:`1177`. - -* Fixed typos in `installation.rst` and `release.rst`. - By :user:`Chizoba Nweke ` :issue:`1178`. - -* Set `docs/conf.py` language to `en`. - By :user:`AWA BRANDON AWA ` :issue:`1174`. - -* Added `installation.rst` to the docs. - By :user:`AWA BRANDON AWA ` :issue:`1170`. - -* Adjustment of year to `2015-2018` to `2015-2022` in the docs. 
- By :user:`Emmanuel Bolarinwa ` :issue:`1165`. - -* Updated `Forking the repository` section in `contributing.rst`. - By :user:`AWA BRANDON AWA ` :issue:`1171`. - -* Updated GitHub actions. - By :user:`Dimitri Papadopoulos Orfanos ` :issue:`1134`. - -* Update web links: `http:// → https://`. - By :user:`Dimitri Papadopoulos Orfanos ` :issue:`1313`. - -.. _release_2.13.3: - -2.13.3 ------- - -* Improve performance of slice selections with steps by omitting chunks with no relevant - data. - By :user:`Richard Shaw ` :issue:`843`. - -.. _release_2.13.2: - -2.13.2 ------- - -* Fix test failure on conda-forge builds (again). - By :user:`Josh Moore `; see - `zarr-feedstock#65 `_. - -.. _release_2.13.1: - -2.13.1 ------- - -* Fix test failure on conda-forge builds. - By :user:`Josh Moore `; see - `zarr-feedstock#65 `_. - -.. _release_2.13.0: - -2.13.0 ------- - -Major changes -~~~~~~~~~~~~~ - -* **Support of alternative array classes** by introducing a new argument, - meta_array, that specifies the type/class of the underlying array. The - meta_array argument can be any class instance that can be used as the like - argument in NumPy (see `NEP 35 - `_). - enabling support for CuPy through, for example, the creation of a CuPy CPU - compressor. - By :user:`Mads R. B. Kristensen ` :issue:`934`. - -* **Remove support for Python 3.7** in concert with NumPy dependency. - By :user:`Davis Bennett ` :issue:`1067`. - -* **Zarr v3: add support for the default root path** rather than requiring - that all API users pass an explicit path. - By :user:`Gregory R. Lee ` :issue:`1085`, :issue:`1142`. - - -Bug fixes -~~~~~~~~~ - -* Remove/relax erroneous "meta" path check (**regression**). - By :user:`Gregory R. Lee ` :issue:`1123`. - -* Cast all attribute keys to strings (and issue deprecation warning). - By :user:`Mattia Almansi ` :issue:`1066`. - -* Fix bug in N5 storage that prevented arrays located in the root of the hierarchy from - bearing the `n5` keyword. Along with fixing this bug, new tests were added for N5 routines - that had previously been excluded from testing, and type annotations were added to the N5 codebase. - By :user:`Davis Bennett ` :issue:`1092`. - -* Fix bug in LRUEStoreCache in which the current size wasn't reset on invalidation. - By :user:`BGCMHou ` and :user:`Josh Moore ` :issue:`1076`, :issue:`1077`. - -* Remove erroneous check that disallowed array keys starting with "meta". - By :user:`Gregory R. Lee ` :issue:`1105`. - -Documentation -~~~~~~~~~~~~~ - -* Typo fixes to close quotes. By :user:`Pavithra Eswaramoorthy ` - -* Added copy button to documentation. - By :user:`Altay Sansal ` :issue:`1124`. - -Maintenance -~~~~~~~~~~~ - -* Simplify release docs. - By :user:`Josh Moore ` :issue:`1119`. - -* Pin werkzeug to prevent test hangs. - By :user:`Davis Bennett ` :issue:`1098`. - -* Fix a few DeepSource.io alerts - By :user:`Dimitri Papadopoulos Orfanos ` :issue:`1080`. - -* Fix URLs. - By :user:`Dimitri Papadopoulos Orfanos `, :issue:`1074`. - -* Fix spelling. - By :user:`Dimitri Papadopoulos Orfanos `, :issue:`1073`. - -* Update GitHub issue templates with `YAML` format. - By :user:`Saransh Chopra ` :issue:`1079`. - -* Remove option to return None from _ensure_store. - By :user:`Gregory Lee ` :issue:`1068`. - -* Fix a typo of "integers". - By :user:`Richard Scott ` :issue:`1056`. - -.. 
_release_2.12.0: - -2.12.0 ------- - -Enhancements -~~~~~~~~~~~~ - -* **Add support for reading and writing Zarr V3.** The new `zarr._store.v3` - package has the necessary classes and functions for evaluating Zarr V3. - Since the format is not yet finalized, the classes and functions are not - automatically imported into the regular `zarr` name space. Setting the - `ZARR_V3_EXPERIMENTAL_API` environment variable will activate them. - By :user:`Gregory Lee `; :issue:`898`, :issue:`1006`, and :issue:`1007` - as well as by :user:`Josh Moore ` :issue:`1032`. - -* **Create FSStore from an existing fsspec filesystem**. If you have created - an fsspec filesystem outside of Zarr, you can now pass it as a keyword - argument to ``FSStore``. - By :user:`Ryan Abernathey `; :issue:`911`. - -* Add numpy encoder class for json.dumps - By :user:`Eric Prestat `; :issue:`933`. - -* Appending performance improvement to Zarr arrays, e.g., when writing to S3. - By :user:`hailiangzhang `; :issue:`1014`. - -* Add number encoder for ``json.dumps`` to support numpy integers in - ``chunks`` arguments. By :user:`Eric Prestat ` :issue:`697`. - -Bug fixes -~~~~~~~~~ - -* Fix bug that made it impossible to create an ``FSStore`` on unlistable filesystems - (e.g. some HTTP servers). - By :user:`Ryan Abernathey `; :issue:`993`. - - -Documentation -~~~~~~~~~~~~~ - -* Update resize doc to clarify surprising behavior. - By :user:`hailiangzhang `; :issue:`1022`. - -Maintenance -~~~~~~~~~~~ - -* Added Pre-commit configuration, incl. Yaml Check. - By :user:`Shivank Chaudhary `; :issue:`1015`, :issue:`1016`. - -* Fix URL to renamed file in Blosc repo. - By :user:`Andrew Thomas ` :issue:`1028`. - -* Activate Py 3.10 builds. - By :user:`Josh Moore ` :issue:`1027`. - -* Make all unignored zarr warnings errors. - By :user:`Josh Moore ` :issue:`1021`. - - -.. _release_2.11.3: - -2.11.3 ------- - -Bug fixes -~~~~~~~~~ - -* Fix missing case to fully revert change to default write_empty_chunks. - By :user:`Tom White `; :issue:`1005`. - - -.. _release_2.11.2: - -2.11.2 ------- - -Bug fixes -~~~~~~~~~ - -* Changes the default value of ``write_empty_chunks`` to ``True`` to prevent - unanticipated data losses when the data types do not have a proper default - value when empty chunks are read back in. - By :user:`Vyas Ramasubramani `; :issue:`965`, :issue:`1001`. - -.. _release_2.11.1: - -2.11.1 ------- - -Bug fixes -~~~~~~~~~ - -* Fix bug where indexing with a scalar numpy value returned a single-value array. - By :user:`Ben Jeffery ` :issue:`967`. - -* Removed `clobber` argument from `normalize_store_arg`. This enables to change - data within an opened consolidated group using mode `"r+"` (i.e region write). - By :user:`Tobias Kölling ` :issue:`975`. - -.. _release_2.11.0: - -2.11.0 ------- - -Enhancements -~~~~~~~~~~~~ - -* **Sparse changes with performance impact!** One of the advantages of the Zarr - format is that it is sparse, which means that chunks with no data (more - precisely, with data equal to the fill value, which is usually 0) don't need - to be written to disk at all. They will simply be assumed to be empty at read - time. However, until this release, the Zarr library would write these empty - chunks to disk anyway. This changes in this version: a small performance - penalty at write time leads to significant speedups at read time and in - filesystem operations in the case of sparse arrays. To revert to the old - behavior, pass the argument ``write_empty_chunks=True`` to the array creation - function. 
By :user:`Juan Nunez-Iglesias `; :issue:`853` and - :user:`Davis Bennett `; :issue:`738`. - -* **Fancy indexing**. Zarr arrays now support NumPy-style fancy indexing with - arrays of integer coordinates. This is equivalent to using zarr.Array.vindex. - Mixing slices and integer arrays is not supported. - By :user:`Juan Nunez-Iglesias `; :issue:`725`. - -* **New base class**. This release of Zarr Python introduces a new - ``BaseStore`` class that all provided store classes implemented in Zarr - Python now inherit from. This is done as part of refactoring to enable future - support of the Zarr version 3 spec. Existing third-party stores that are a - MutableMapping (e.g. dict) can be converted to a new-style key/value store - inheriting from ``BaseStore`` by passing them as the argument to the new - ``zarr.storage.KVStore`` class. For backwards compatibility, various - higher-level array creation and convenience functions still accept plain - Python dicts or other mutable mappings for the ``store`` argument, but will - internally convert these to a ``KVStore``. - By :user:`Gregory Lee `; :issue:`839`, :issue:`789`, and :issue:`950`. - -* Allow to assign array ``fill_values`` and update metadata accordingly. - By :user:`Ryan Abernathey `, :issue:`662`. - -* Allow to update array fill_values - By :user:`Matthias Bussonnier ` :issue:`665`. - -Bug fixes -~~~~~~~~~ - -* Fix bug where the checksum of zipfiles is wrong - By :user:`Oren Watson ` :issue:`930`. - -* Fix consolidate_metadata with FSStore. - By :user:`Joe Hamman ` :issue:`916`. - -* Unguarded next inside generator. - By :user:`Dimitri Papadopoulos Orfanos ` :issue:`889`. - -Documentation -~~~~~~~~~~~~~ - -* Update docs creation of dev env. - By :user:`Ray Bell ` :issue:`921`. - -* Update docs to use ``python -m pytest``. - By :user:`Ray Bell ` :issue:`923`. - -* Fix versionadded tag in zarr.Array docstring. - By :user:`Juan Nunez-Iglesias ` :issue:`852`. - -* Doctest seem to be stricter now, updating tostring() to tobytes(). - By :user:`John Kirkham ` :issue:`907`. - -* Minor doc fix. - By :user:`Mads R. B. Kristensen ` :issue:`937`. - -Maintenance -~~~~~~~~~~~ - -* Upgrade MongoDB in test env. - By :user:`Joe Hamman ` :issue:`939`. - -* Pass dimension_separator on fixture generation. - By :user:`Josh Moore ` :issue:`858`. - -* Activate Python 3.9 in GitHub Actions. - By :user:`Josh Moore ` :issue:`859`. - -* Drop shortcut ``fsspec[s3]`` for dependency. - By :user:`Josh Moore ` :issue:`920`. - -* and a swath of code-linting improvements by :user:`Dimitri Papadopoulos Orfanos `: - - - Unnecessary comprehension (:issue:`899`) - - - Unnecessary ``None`` provided as default (:issue:`900`) - - - use an if ``expression`` instead of `and`/`or` (:issue:`888`) - - - Remove unnecessary literal (:issue:`891`) - - - Decorate a few method with `@staticmethod` (:issue:`885`) - - - Drop unneeded ``return`` (:issue:`884`) - - - Drop explicit ``object`` inheritance from ``class``-es (:issue:`886`) - - - Unnecessary comprehension (:issue:`883`) - - - Codespell configuration (:issue:`882`) - - - Fix typos found by codespell (:issue:`880`) - - - Proper C-style formatting for integer (:issue:`913`) - - - Add LGTM.com / DeepSource.io configuration files (:issue:`909`) - -.. _release_2.10.3: - -2.10.3 ------- - -Bug fixes -~~~~~~~~~ - -* N5 keywords now emit UserWarning instead of raising a ValueError. - By :user:`Boaz Mohar `; :issue:`860`. - -* blocks_to_decompress not used in read_part function. - By :user:`Boaz Mohar `; :issue:`861`. 
- -* defines blocksize for array, updates hexdigest values. - By :user:`Andrew Fulton `; :issue:`867`. - -* Fix test failure on Debian and conda-forge builds. - By :user:`Josh Moore `; :issue:`871`. - -.. _release_2.10.2: - -2.10.2 ------- - -Bug fixes -~~~~~~~~~ - -* Fix NestedDirectoryStore datasets without dimension_separator metadata. - By :user:`Josh Moore `; :issue:`850`. - -.. _release_2.10.1: - -2.10.1 ------- - -Bug fixes -~~~~~~~~~ - -* Fix regression by setting normalize_keys=False in fsstore constructor. - By :user:`Davis Bennett `; :issue:`842`. - -.. _release_2.10.0: - -2.10.0 ------- - -Enhancements -~~~~~~~~~~~~ - -* Add N5FSStore. - By :user:`Davis Bennett `; :issue:`793`. - -Bug fixes -~~~~~~~~~ - -* Ignore None dim_separators in save_array. - By :user:`Josh Moore `; :issue:`831`. - -.. _release_2.9.5: - -2.9.5 ------ - -Bug fixes -~~~~~~~~~ - -* Fix FSStore.listdir behavior for nested directories. - By :user:`Gregory Lee `; :issue:`802`. - -.. _release_2.9.4: - -2.9.4 ------ - -Bug fixes -~~~~~~~~~ - -* Fix structured arrays that contain objects - By :user: `Attila Bergou `; :issue: `806` - -.. _release_2.9.3: - -2.9.3 ------ - -Maintenance -~~~~~~~~~~~ - -* Mark the fact that some tests that require ``fsspec``, without compromising the code coverage score. - By :user:`Ben Williams `; :issue:`823`. - -* Only inspect alternate node type if desired isn't present. - By :user:`Trevor Manz `; :issue:`696`. - -.. _release_2.9.2: - -2.9.2 ------ - -Maintenance -~~~~~~~~~~~ - -* Correct conda-forge deployment of Zarr by fixing some Zarr tests. - By :user:`Ben Williams `; :issue:`821`. - -.. _release_2.9.1: - -2.9.1 ------ - -Maintenance -~~~~~~~~~~~ - -* Correct conda-forge deployment of Zarr. - By :user:`Josh Moore `; :issue:`819`. - -.. _release_2.9.0: - -2.9.0 ------ - -This release of Zarr Python is the first release of Zarr to not support Python 3.6. - -Enhancements -~~~~~~~~~~~~ - -* Update ABSStore for compatibility with newer `azure.storage.blob`. - By :user:`Tom Augspurger `; :issue:`759`. - -* Pathlib support. - By :user:`Chris Barnes `; :issue:`768`. - -Documentation -~~~~~~~~~~~~~ - -* Clarify that arbitrary key/value pairs are OK for attributes. - By :user:`Stephan Hoyer `; :issue:`751`. - -* Clarify how to manually convert a DirectoryStore to a ZipStore. - By :user:`pmav99 `; :issue:`763`. - -Bug fixes -~~~~~~~~~ - -* Fix dimension_separator support. - By :user:`Josh Moore `; :issue:`775`. - -* Extract ABSStore to zarr._storage.absstore. - By :user:`Josh Moore `; :issue:`781`. - -* avoid NumPy 1.21.0 due to https://github.com/numpy/numpy/issues/19325 - By :user:`Gregory Lee `; :issue:`791`. - -Maintenance -~~~~~~~~~~~ - -* Drop 3.6 builds. - By :user:`Josh Moore `; :issue:`774`, :issue:`778`. - -* Fix build with Sphinx 4. - By :user:`Elliott Sales de Andrade `; :issue:`799`. - -* TST: add missing assert in test_hexdigest. - By :user:`Gregory Lee `; :issue:`801`. - -.. _release_2.8.3: - -2.8.3 ------ - -Bug fixes -~~~~~~~~~ - -* FSStore: default to normalize_keys=False - By :user:`Josh Moore `; :issue:`755`. -* ABSStore: compatibility with ``azure.storage.python>=12`` - By :user:`Tom Augspurger `; :issue:`618` - - -.. _release_2.8.2: - -2.8.2 ------ - -Documentation -~~~~~~~~~~~~~ - -* Add section on rechunking to tutorial - By :user:`David Baddeley `; :issue:`730`. - -Bug fixes -~~~~~~~~~ - -* Expand FSStore tests and fix implementation issues - By :user:`Davis Bennett `; :issue:`709`. 
- -Maintenance -~~~~~~~~~~~ - -* Updated ipytree warning for jlab3 - By :user:`Ian Hunt-Isaak `; :issue:`721`. - -* b170a48a - (issue-728, copy-nested) Updated ipytree warning for jlab3 (#721) (3 weeks ago) -* Activate dependabot - By :user:`Josh Moore `; :issue:`734`. - -* Update Python classifiers (Zarr is stable!) - By :user:`Josh Moore `; :issue:`731`. - -.. _release_2.8.1: - -2.8.1 ------ - -Bug fixes -~~~~~~~~~ - -* raise an error if create_dataset's dimension_separator is inconsistent - By :user:`Gregory R. Lee `; :issue:`724`. - -.. _release_2.8.0: - -2.8.0 ------ - -V2 Specification Update -~~~~~~~~~~~~~~~~~~~~~~~ - -* Introduce optional dimension_separator .zarray key for nested chunks. - By :user:`Josh Moore `; :issue:`715`, :issue:`716`. - -.. _release_2.7.1: - -2.7.1 ------ - -Bug fixes -~~~~~~~~~ - -* Update Array to respect FSStore's key_separator (#718) - By :user:`Gregory R. Lee `; :issue:`718`. - -.. _release_2.7.0: - -2.7.0 ------ - -Enhancements -~~~~~~~~~~~~ - -* Start stop for iterator (`islice()`) - By :user:`Sebastian Grill `; :issue:`621`. - -* Add capability to partially read and decompress chunks - By :user:`Andrew Fulton `; :issue:`667`. - -Bug fixes -~~~~~~~~~ - -* Make DirectoryStore __setitem__ resilient against antivirus file locking - By :user:`Eric Younkin `; :issue:`698`. - -* Compare test data's content generally - By :user:`John Kirkham `; :issue:`436`. - -* Fix dtype usage in zarr/meta.py - By :user:`Josh Moore `; :issue:`700`. - -* Fix FSStore key_seperator usage - By :user:`Josh Moore `; :issue:`669`. - -* Simplify text handling in DB Store - By :user:`John Kirkham `; :issue:`670`. - -* GitHub Actions migration - By :user:`Matthias Bussonnier `; - :issue:`641`, :issue:`671`, :issue:`674`, :issue:`676`, :issue:`677`, :issue:`678`, - :issue:`679`, :issue:`680`, :issue:`682`, :issue:`684`, :issue:`685`, :issue:`686`, - :issue:`687`, :issue:`695`, :issue:`706`. - -.. _release_2.6.1: - -2.6.1 ------ - -* Minor build fix - By :user:`Matthias Bussonnier `; :issue:`666`. - -.. _release_2.6.0: - -2.6.0 ------ - -This release of Zarr Python is the first release of Zarr to not support Python 3.5. - -* End Python 3.5 support. - By :user:`Chris Barnes `; :issue:`602`. - -* Fix ``open_group/open_array`` to allow opening of read-only store with - ``mode='r'`` :issue:`269` - -* Add `Array` tests for FSStore. - By :user:`Andrew Fulton `; :issue: `644`. - -* fix a bug in which ``attrs`` would not be copied on the root when using ``copy_all``; :issue:`613` - -* Fix ``FileNotFoundError`` with dask/s3fs :issue:`649` - -* Fix flaky fixture in test_storage.py :issue:`652` - -* Fix FSStore getitems fails with arrays that have a 0 length shape dimension :issue:`644` - -* Use async to fetch/write result concurrently when possible. :issue:`536`, See `this comment - `_ for some performance analysis - showing order of magnitude faster response in some benchmark. - -See `this link `_ -for the full list of closed and merged PR tagged with the 2.6 milestone. - -* Add ability to partially read and decompress arrays, see :issue:`667`. It is - only available to chunks stored using fsspec and using Blosc as a compressor. - - For certain analysis case when only a small portion of chunks is needed it can - be advantageous to only access and decompress part of the chunks. 
Doing partial reads and decompression adds high latency to many of the operations, so it - should be used only when the subset of the data is small compared to the full - chunks and is stored contiguously (that is to say, the last dimensions for C - layout, the first for F). Pass ``partial_decompress=True`` as argument when - creating an ``Array``, or when using ``open_array``. No option exists yet to - apply partial read and decompress on a per-operation basis. - -.. _release_2.5.0: - -2.5.0 ----- - -This release will be the last to support Python 3.5; the next version of Zarr will require Python 3.6+. - -* `DirectoryStore` now uses `os.scandir`, which should make listing large stores - faster; :issue:`563` - -* Remove a few remaining Python 2-isms. - By :user:`Poruri Sai Rahul `; :issue:`393`. - -* Fix minor bug in `N5Store`. - By :user:`gsakkis`, :issue:`550`. - -* Improve error message in Jupyter when trying to use the ``ipytree`` widget - without ``ipytree`` installed. - By :user:`Zain Patel `; :issue:`537` - -* Add typing information to many of the core functions; :issue:`589` - -* Explicitly close stores during testing. - By :user:`Elliott Sales de Andrade `; :issue:`442` - -* Many of the convenience functions to emit errors (``err_*`` from - ``zarr.errors``) have been replaced by ``ValueError`` subclasses. The corresponding - ``err_*`` functions have been removed; :issue:`590`, :issue:`614`. - -* Improve consistency of terminology regarding arrays and datasets in the - documentation. - By :user:`Josh Moore `; :issue:`571`. - -* Added support for generic URL opening by ``fsspec``, where the URLs have the - form "protocol://[server]/path" or can be chained URLs with "::" separators. - The additional argument ``storage_options`` is passed to the backend, see - the ``fsspec`` docs. - By :user:`Martin Durant `; :issue:`546` - -* Added support for fetching multiple items via the ``getitems`` method of a - store, if it exists. This allows for concurrent fetching of data blocks - from stores that implement this; presently HTTP, S3, GCS. Currently only - applies to reading. - By :user:`Martin Durant `; :issue:`606` - -* Efficient iteration expanded with option to pass start and stop index via - ``array.islice``. - By :user:`Sebastian Grill `, :issue:`615`. - -.. _release_2.4.0: - -2.4.0 ----- - -Enhancements -~~~~~~~~~~~~ - -* Add key normalization option for ``DirectoryStore``, ``NestedDirectoryStore``, - ``TempStore``, and ``N5Store``. - By :user:`James Bourbeau `; :issue:`459`. - -* Add ``recurse`` keyword to ``Group.array_keys`` and ``Group.arrays`` methods. - By :user:`James Bourbeau `; :issue:`458`. - -* Use uniform chunking for all dimensions when specifying ``chunks`` as an integer. - Also adds support for specifying ``-1`` to chunk across an entire dimension. - By :user:`James Bourbeau `; :issue:`456`. - -* Rename ``DictStore`` to ``MemoryStore``. - By :user:`James Bourbeau `; :issue:`455`. - -* Rewrite ``.tree()`` pretty representation to use ``ipytree``. - Allows it to work in both the Jupyter Notebook and JupyterLab. - By :user:`John Kirkham `; :issue:`450`. - -* Do not rename Blosc parameters in n5 backend and add `blocksize` parameter, - compatible with n5-blosc. By :user:`axtimwalde`, :issue:`485`. - -* Update ``DirectoryStore`` to create files with more permissive permissions. - By :user:`Eduardo Gonzalez ` and :user:`James Bourbeau `; :issue:`493`. - -* Use ``math.ceil`` for scalars. - By :user:`John Kirkham `; :issue:`500`. - -* Ensure contiguous data using ``astype``. 
- By :user:`John Kirkham `; :issue:`513`. - -* Refactor out ``_tofile``/``_fromfile`` from ``DirectoryStore``. - By :user:`John Kirkham `; :issue:`503`. - -* Add ``__enter__``/``__exit__`` methods to ``Group`` for ``h5py.File`` compatibility. - By :user:`Chris Barnes `; :issue:`509`. - -Bug fixes -~~~~~~~~~ - -* Fix SQLite store wrong modification. - By :user:`Tommy Tran `; :issue:`440`. - -* Add intermediate step (using ``zipfile.ZipInfo`` object) to write - inside ``ZipStore`` to solve a too-restrictive permission issue. - By :user:`Raphael Dussin `; :issue:`505`. - -* Fix '/' prepend bug in ``ABSStore``. - By :user:`Shikhar Goenka `; :issue:`525`. - -Documentation -~~~~~~~~~~~~~ -* Fix hyperlink in ``README.md``. - By :user:`Anderson Banihirwe `; :issue:`531`. - -* Replace "nuimber" with "number". - By :user:`John Kirkham `; :issue:`512`. - -* Fix azure link rendering in tutorial. - By :user:`James Bourbeau `; :issue:`507`. - -* Update ``README`` file to be more detailed. - By :user:`Zain Patel `; :issue:`495`. - -* Import blosc from numcodecs in tutorial. - By :user:`James Bourbeau `; :issue:`491`. - -* Adds logo to docs. - By :user:`James Bourbeau `; :issue:`462`. - -* Fix N5 link in tutorial. - By :user:`James Bourbeau `; :issue:`480`. - -* Fix typo in code snippet. - By :user:`Joe Jevnik `; :issue:`461`. - -* Fix URLs to point to zarr-python - By :user:`John Kirkham `; :issue:`453`. - -Maintenance -~~~~~~~~~~~ - -* Add documentation build to CI. - By :user:`James Bourbeau `; :issue:`516`. - -* Use ``ensure_ndarray`` in a few more places. - By :user:`John Kirkham `; :issue:`506`. - -* Support Python 3.8. - By :user:`John Kirkham `; :issue:`499`. - -* Require Numcodecs 0.6.4+ to use text handling functionality from it. - By :user:`John Kirkham `; :issue:`497`. - -* Update tests to use ``pytest.importorskip``. - By :user:`James Bourbeau `; :issue:`492`. - -* Removed support for Python 2. - By :user:`jhamman`; :issue:`393`, :issue:`470`. - -* Upgrade dependencies in the test matrices and resolve a - compatibility issue with testing against the Azure Storage - Emulator. By :user:`alimanfoo`; :issue:`468`, :issue:`467`. - -* Use ``unittest.mock`` on Python 3. - By :user:`Elliott Sales de Andrade `; :issue:`426`. - -* Drop ``decode`` from ``ConsolidatedMetadataStore``. - By :user:`John Kirkham `; :issue:`452`. - - -.. _release_2.3.2: - -2.3.2 ----- - -Enhancements -~~~~~~~~~~~~ - -* Use ``scandir`` in ``DirectoryStore``'s ``getsize`` method. - By :user:`John Kirkham `; :issue:`431`. - -Bug fixes -~~~~~~~~~ - -* Add and use utility functions to simplify reading and writing JSON. - By :user:`John Kirkham `; :issue:`429`, :issue:`430`. - -* Fix ``collections``'s ``DeprecationWarning``\ s. - By :user:`John Kirkham `; :issue:`432`. - -* Fix tests on big endian machines. - By :user:`Elliott Sales de Andrade `; :issue:`427`. - - -.. _release_2.3.1: - -2.3.1 ----- - -Bug fixes -~~~~~~~~~ - -* Makes ``azure-storage-blob`` optional for testing. - By :user:`John Kirkham `; :issue:`419`, :issue:`420`. - - -.. _release_2.3.0: - -2.3.0 ----- - -Enhancements -~~~~~~~~~~~~ - -* New storage backend, backed by Azure Blob Storage, class :class:`zarr.storage.ABSStore`. - All data is stored as block blobs. By :user:`Shikhar Goenka `, - :user:`Tim Crone ` and :user:`Zain Patel `; :issue:`345`. 
- -* Add "consolidated" metadata as an experimental feature: use - :func:`zarr.convenience.consolidate_metadata` to copy all metadata from the various - metadata keys within a dataset hierarchy under a single key, and - :func:`zarr.convenience.open_consolidated` to use this single key. This can greatly - cut down the number of calls to the storage backend, and so remove a lot of overhead - for reading remote data. - By :user:`Martin Durant `, :user:`Alistair Miles `, - :user:`Ryan Abernathey `, :issue:`268`, :issue:`332`, :issue:`338`. - -* Support has been added for structured arrays with sub-array shape and/or nested fields. By - :user:`Tarik Onalan `, :issue:`111`, :issue:`296`. - -* Adds the SQLite-backed :class:`zarr.storage.SQLiteStore` class enabling an - SQLite database to be used as the backing store for an array or group. - By :user:`John Kirkham `, :issue:`368`, :issue:`365`. - -* Efficient iteration over arrays by decompressing chunkwise. - By :user:`Jerome Kelleher `, :issue:`398`, :issue:`399`. - -* Adds the Redis-backed :class:`zarr.storage.RedisStore` class enabling a - Redis database to be used as the backing store for an array or group. - By :user:`Joe Hamman `, :issue:`299`, :issue:`372`. - -* Adds the MongoDB-backed :class:`zarr.storage.MongoDBStore` class enabling a - MongoDB database to be used as the backing store for an array or group. - By :user:`Noah D Brenowitz `, :user:`Joe Hamman `, - :issue:`299`, :issue:`372`, :issue:`401`. - -* **New storage class for N5 containers**. The :class:`zarr.n5.N5Store` has been - added, which uses :class:`zarr.storage.NestedDirectoryStore` to support - reading and writing from and to N5 containers. - By :user:`Jan Funke ` and :user:`John Kirkham `. - -Bug fixes -~~~~~~~~~ - -* The implementation of the :class:`zarr.storage.DirectoryStore` class has been modified to - ensure that writes are atomic and there are no race conditions where a chunk might appear - transiently missing during a write operation. By :user:`sbalmer `, :issue:`327`, - :issue:`263`. - -* Avoid raising in :class:`zarr.storage.DirectoryStore`'s ``__setitem__`` when file already exists. - By :user:`Justin Swaney `, :issue:`272`, :issue:`318`. - -* The required version of the `Numcodecs`_ package has been upgraded - to 0.6.2, which has enabled some code simplification and fixes a failing test involving - msgpack encoding. By :user:`John Kirkham `, :issue:`361`, :issue:`360`, :issue:`352`, - :issue:`355`, :issue:`324`. - -* Failing tests related to pickling/unpickling have been fixed. By :user:`Ryan Williams `, - :issue:`273`, :issue:`308`. - -* Corrects handling of ``NaT`` in ``datetime64`` and ``timedelta64`` in various - compressors (by :user:`John Kirkham `; :issue:`344`). - -* Ensure ``DictStore`` contains only ``bytes`` to facilitate comparisons and protect against writes. - By :user:`John Kirkham `, :issue:`350`. - -* Test and fix an issue (w.r.t. fill values) when storing complex data to ``Array``. - By :user:`John Kirkham `, :issue:`363`. - -* Always use a ``tuple`` when indexing a NumPy ``ndarray``. - By :user:`John Kirkham `, :issue:`376`. - -* Ensure when ``Array`` uses a ``dict``-based chunk store that it only contains - ``bytes`` to facilitate comparisons and protect against writes. Drop the copy - for the no filter/compressor case as this handles that case. - By :user:`John Kirkham `, :issue:`359`. - -Maintenance -~~~~~~~~~~~ - -* Simplify directory creation and removal in ``DirectoryStore.rename``. - By :user:`John Kirkham `, :issue:`249`. 
- -* CI and test environments have been upgraded to include Python 3.7, drop Python 3.4, and - upgrade all pinned package requirements. :user:`Alistair Miles `, :issue:`308`. - -* Start using pyup.io to maintain dependencies. - :user:`Alistair Miles `, :issue:`326`. - -* Configure flake8 line limit generally. - :user:`John Kirkham `, :issue:`335`. - -* Add missing coverage pragmas. - :user:`John Kirkham `, :issue:`343`, :issue:`355`. - -* Fix missing backslash in docs. - :user:`John Kirkham `, :issue:`254`, :issue:`353`. - -* Include tests for stores' ``popitem`` and ``pop`` methods. - By :user:`John Kirkham `, :issue:`378`, :issue:`380`. - -* Include tests for different compressors, endianness, and attributes. - By :user:`John Kirkham `, :issue:`378`, :issue:`380`. - -* Test validity of stores' contents. - By :user:`John Kirkham `, :issue:`359`, :issue:`408`. - - -.. _release_2.2.0: - -2.2.0 ------ - -Enhancements -~~~~~~~~~~~~ - -* **Advanced indexing**. The ``Array`` class has several new methods and - properties that enable a selection of items in an array to be retrieved or - updated. See the :ref:`user-guide-indexing` tutorial section for more - information. There is also a `notebook - `_ - with extended examples and performance benchmarks. :issue:`78`, :issue:`89`, - :issue:`112`, :issue:`172`. - -* **New package for compressor and filter codecs**. The classes previously - defined in the :mod:`zarr.codecs` module have been factored out into a - separate package called `Numcodecs`_. The `Numcodecs`_ package also includes - several new codec classes not previously available in Zarr, including - compressor codecs for Zstd and LZ4. This change is backwards-compatible with - existing code, as all codec classes defined by Numcodecs are imported into the - :mod:`zarr.codecs` namespace. However, it is recommended to import codecs from - the new package, see the tutorial sections on :ref:`user-guide-compress` and - :ref:`user-guide-filters` for examples. With contributions by - :user:`John Kirkham `; :issue:`74`, :issue:`102`, :issue:`120`, - :issue:`123`, :issue:`139`. - -* **New storage class for DBM-style databases**. The - :class:`zarr.storage.DBMStore` class enables any DBM-style database such as gdbm, - ndbm or Berkeley DB, to be used as the backing store for an array or group. See the - tutorial section on :ref:`user-guide-storage` for some examples. :issue:`133`, - :issue:`186`. - -* **New storage class for LMDB databases**. The :class:`zarr.storage.LMDBStore` class - enables an LMDB "Lightning" database to be used as the backing store for an array or - group. :issue:`192`. - -* **New storage class using a nested directory structure for chunk files**. The - :class:`zarr.storage.NestedDirectoryStore` has been added, which is similar to - the existing :class:`zarr.storage.DirectoryStore` class but nests chunk files - for multidimensional arrays into sub-directories. :issue:`155`, :issue:`177`. - -* **New tree() method for printing hierarchies**. The ``Group`` class has a new - :func:`zarr.hierarchy.Group.tree` method which enables a tree representation of - a group hierarchy to be printed. Also provides an interactive tree - representation when used within a Jupyter notebook. See the - :ref:`user-guide-diagnostics` tutorial section for examples. By - :user:`John Kirkham `; :issue:`82`, :issue:`140`, :issue:`184`. - -* **Visitor API**. 
The ``Group`` class now implements the h5py visitor API, see - docs for the :func:`zarr.hierarchy.Group.visit`, - :func:`zarr.hierarchy.Group.visititems` and - :func:`zarr.hierarchy.Group.visitvalues` methods. By - :user:`John Kirkham `, :issue:`92`, :issue:`122`. - -* **Viewing an array as a different dtype**. The ``Array`` class has a new - :func:`zarr.Array.astype` method, which is a convenience that enables an - array to be viewed as a different dtype. By :user:`John Kirkham `, - :issue:`94`, :issue:`96`. - -* **New open(), save(), load() convenience functions**. The function - :func:`zarr.convenience.open` provides a convenient way to open a persistent - array or group, using either a ``DirectoryStore`` or ``ZipStore`` as the backing - store. The functions :func:`zarr.convenience.save` and - :func:`zarr.convenience.load` are also available and provide a convenient way to - save an entire NumPy array to disk and load back into memory later. See the - tutorial section :ref:`user-guide-persist` for examples. :issue:`104`, - :issue:`105`, :issue:`141`, :issue:`181`. - -* **IPython completions**. The ``Group`` class now implements ``__dir__()`` and - ``_ipython_key_completions_()``, which enable tab-completion for group members - to be used in any IPython interactive environment. :issue:`170`. - -* **New info property; changes to __repr__**. The ``Group`` and - ``Array`` classes have a new ``info`` property which can be used to print - diagnostic information, including compression ratio where available. See the - tutorial section on :ref:`user-guide-diagnostics` for examples. The string - representation (``__repr__``) of these classes has been simplified to ensure - it is cheap and quick to compute in all circumstances. :issue:`83`, - :issue:`115`, :issue:`132`, :issue:`148`. - -* **Chunk options**. When creating an array, ``chunks=False`` can be specified, - which will result in an array with a single chunk only. Alternatively, - ``chunks=True`` will trigger an automatic chunk shape guess. See - :ref:`user-guide-chunks` for more on the ``chunks`` parameter. :issue:`106`, - :issue:`107`, :issue:`183`. - -* **Zero-dimensional arrays** are now supported; by - :user:`Prakhar Goel `, :issue:`154`, :issue:`161`. - -* **Arrays with one or more zero-length dimensions** are now fully supported; by - :user:`Prakhar Goel `, :issue:`150`, :issue:`154`, :issue:`160`. - -* **The .zattrs key is now optional** and will only be created when the first - custom attribute is set; :issue:`121`, :issue:`200`. - -* **New Group.move() method** supports moving a sub-group or array to a different - location within the same hierarchy. By :user:`John Kirkham `, - :issue:`191`, :issue:`193`, :issue:`196`. - -* **ZipStore is now thread-safe**; :issue:`194`, :issue:`192`. - -* **New Array.hexdigest() method** computes an ``Array``'s hash with ``hashlib``. - By :user:`John Kirkham `, :issue:`98`, :issue:`203`. - -* **Improved support for object arrays**. In previous versions of Zarr, - creating an array with ``dtype=object`` was possible but could under certain - circumstances lead to unexpected errors and/or segmentation faults. To make it easier - to properly configure an object array, a new ``object_codec`` parameter has been - added to array creation functions. See the tutorial section on :ref:`user-guide-objects` - for more information and examples. Also, runtime checks have been added in both Zarr - and Numcodecs so that segmentation faults are no longer possible, even with a badly - configured array. 
This API change is backwards compatible and previous code that created - an object array and provided an object codec via the ``filters`` parameter will - continue to work; however, a warning will be raised to encourage use of the - ``object_codec`` parameter. :issue:`208`, :issue:`212`. - -* **Added support for datetime64 and timedelta64 data types**; - :issue:`85`, :issue:`215`. - -* **Array and group attributes are now cached by default** to improve performance with - slow stores, e.g., stores accessing data via the network; :issue:`220`, :issue:`218`, - :issue:`204`. - -* **New LRUStoreCache class**. The class :class:`zarr.storage.LRUStoreCache` has been - added and provides a means to locally cache data in memory from a store that may be - slow, e.g., a store that retrieves data from a remote server via the network; - :issue:`223`. - -* **New copy functions**. The new functions :func:`zarr.convenience.copy` and - :func:`zarr.convenience.copy_all` provide a way to copy groups and/or arrays - between HDF5 and Zarr, or between two Zarr groups. The - :func:`zarr.convenience.copy_store` provides a more efficient way to copy - data directly between two Zarr stores. :issue:`87`, :issue:`113`, - :issue:`137`, :issue:`217`. - -Bug fixes -~~~~~~~~~ - -* Fixed bug where ``read_only`` keyword argument was ignored when creating an - array; :issue:`151`, :issue:`179`. - -* Fixed bugs when using a ``ZipStore`` opened in 'w' mode; :issue:`158`, - :issue:`182`. - -* Fill values can now be provided for fixed-length string arrays; :issue:`165`, - :issue:`176`. - -* Fixed a bug where the number of chunks initialized could be counted - incorrectly; :issue:`97`, :issue:`174`. - -* Fixed a bug related to the use of an ellipsis (...) in indexing statements; - :issue:`93`, :issue:`168`, :issue:`172`. - -* Fixed a bug preventing use of other integer types for indexing; :issue:`143`, - :issue:`147`. - -Documentation -~~~~~~~~~~~~~ - -* Some changes have been made to the Zarr Specification v2 document to clarify - ambiguities and add some missing information. These changes do not break compatibility - with any of the material as previously implemented, and so the changes have been made - in-place in the document without incrementing the document version number. See the - section on changes in the specification document for more information. -* A new :ref:`user-guide-indexing` section has been added to the tutorial. -* A new :ref:`user-guide-strings` section has been added to the tutorial - (:issue:`135`, :issue:`175`). -* The :ref:`user-guide-chunks` tutorial section has been reorganised and updated. -* The :ref:`user-guide-persist` and :ref:`user-guide-storage` tutorial sections have - been updated with new examples (:issue:`100`, :issue:`101`, :issue:`103`). -* A new tutorial section on :ref:`user-guide-pickle` has been added (:issue:`91`). -* A new tutorial section on :ref:`user-guide-datetime` has been added. -* A new tutorial section on :ref:`user-guide-diagnostics` has been added. -* The tutorial sections on :ref:`user-guide-sync` and :ref:`user-guide-tips-blosc` have been - updated to provide information about how to avoid program hangs when using the Blosc - compressor with multiple processes (:issue:`199`, :issue:`201`). - -Maintenance -~~~~~~~~~~~ - -* A data fixture has been included in the test suite to ensure data format - compatibility is maintained; :issue:`83`, :issue:`146`. -* The test suite has been migrated from nosetests to pytest; :issue:`189`, :issue:`225`. 
-* Various continuous integration updates and improvements; :issue:`118`, :issue:`124`, - :issue:`125`, :issue:`126`, :issue:`109`, :issue:`114`, :issue:`171`. -* Bump numcodecs dependency to 0.5.3, completely remove nose dependency, :issue:`237`. -* Fix compatibility issues with NumPy 1.14 regarding fill values for structured arrays, - :issue:`222`, :issue:`238`, :issue:`239`. - -Acknowledgments -~~~~~~~~~~~~~~~ - -Code was contributed to this release by :user:`Alistair Miles `, :user:`John -Kirkham ` and :user:`Prakhar Goel `. - -Documentation was contributed to this release by :user:`Mamy Ratsimbazafy ` -and :user:`Charles Noyes `. - -Thank you to :user:`John Kirkham `, :user:`Stephan Hoyer `, -:user:`Francesc Alted `, and :user:`Matthew Rocklin ` for code -reviews and/or comments on pull requests. - -.. _release_2.1.4: - -2.1.4 ----- - -* Resolved an issue where calling ``hasattr`` on a ``Group`` object erroneously - raised a ``KeyError``. By :user:`Vincent Schut `; :issue:`88`, - :issue:`95`. - -.. _release_2.1.3: - -2.1.3 ----- - -* Resolved an issue with :func:`zarr.creation.array` where dtype was given as - None (:issue:`80`). - -.. _release_2.1.2: - -2.1.2 ----- - -* Resolved an issue when no compression is used and chunks are stored in memory - (:issue:`79`). - -.. _release_2.1.1: - -2.1.1 ----- - -Various minor improvements, including: ``Group`` objects support member access -via dot notation (``__getattr__``); fixed metadata caching for ``Array.shape`` -property and derivatives; added ``Array.ndim`` property; fixed -``Array.__array__`` method arguments; fixed bug in pickling ``Array`` state; -fixed bug in pickling ``ThreadSynchronizer``. - -.. _release_2.1.0: - -2.1.0 ----- - -* Group objects now support member deletion via ``del`` statement - (:issue:`65`). -* Added :class:`zarr.storage.TempStore` class for convenience to provide - storage via a temporary directory - (:issue:`59`). -* Fixed performance issues with :class:`zarr.storage.ZipStore` class - (:issue:`66`). -* The Blosc extension has been modified to return bytes instead of array - objects from compress and decompress function calls. This should - improve compatibility and also provides a small performance increase for - compressing high compression ratio data - (:issue:`55`). -* Added ``overwrite`` keyword argument to array and group creation methods - on the :class:`zarr.hierarchy.Group` class - (:issue:`71`). -* Added ``cache_metadata`` keyword argument to array creation methods. -* The functions :func:`zarr.creation.open_array` and - :func:`zarr.hierarchy.open_group` now accept any store as the first argument - (:issue:`56`). - -.. _release_2.0.1: - -2.0.1 ----- - -The bundled Blosc library has been upgraded to version 1.11.1. - -.. _release_2.0.0: - -2.0.0 ----- - -Hierarchies -~~~~~~~~~~~ - -Support has been added for organizing arrays into hierarchies via groups. See -the tutorial section on :ref:`user-guide-groups` and the :mod:`zarr.hierarchy` -API docs for more information. - -Filters -~~~~~~~ - -Support has been added for configuring filters to preprocess chunk data prior -to compression. See the tutorial section on :ref:`user-guide-filters` and the -:mod:`zarr.codecs` API docs for more information. - -Other changes -~~~~~~~~~~~~~ - -To accommodate support for hierarchies and filters, the Zarr metadata format -has been modified. See the ``spec_v2`` for more information. To migrate an -array stored using Zarr version 1.x, use the :func:`zarr.storage.migrate_1to2` -function. 
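For illustration, a minimal sketch of the migration helper mentioned above, assuming a Zarr 1.x array stored in a local directory (the path is invented for the example)::

    import zarr
    from zarr.storage import DirectoryStore, migrate_1to2

    store = DirectoryStore("data/old_v1_array.zarr")
    migrate_1to2(store)  # rewrites the version 1 metadata keys in the version 2 layout
    z = zarr.open_array(store)  # the array is now readable as Zarr v2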
- -The bundled Blosc library has been upgraded to version 1.11.0. - -Acknowledgments -~~~~~~~~~~~~~~~ - -Thanks to :user:`Matthew Rocklin `, :user:`Stephan Hoyer ` and -:user:`Francesc Alted ` for contributions and comments. - -.. _release_1.1.0: - -1.1.0 ------ - -* The bundled Blosc library has been upgraded to version 1.10.0. The 'zstd' - internal compression library is now available within Blosc. See the tutorial - section on :ref:`user-guide-compress` for an example. -* When using the Blosc compressor, the default internal compression library - is now 'lz4'. -* The default number of internal threads for the Blosc compressor has been - increased to a maximum of 8 (previously 4). -* Added convenience functions :func:`zarr.blosc.list_compressors` and - :func:`zarr.blosc.get_nthreads`. - -.. _release_1.0.0: - -1.0.0 ------ - -This release includes a complete re-organization of the code base. The -major version number has been bumped to indicate that there have been -backwards-incompatible changes to the API and the on-disk storage -format. However, Zarr is still in an early stage of development, so -please do not take the version number as an indicator of maturity. - -Storage -~~~~~~~ - -The main motivation for re-organizing the code was to create an -abstraction layer between the core array logic and data storage (:issue:`21`). -In this release, any -object that implements the ``MutableMapping`` interface can be used as -an array store. See the tutorial sections on :ref:`user-guide-persist` -and :ref:`user-guide-storage`, the ``spec_v1``, and the -:mod:`zarr.storage` module documentation for more information. - -Please note also that the file organization and file name conventions -used when storing a Zarr array in a directory on the file system have -changed. Persistent Zarr arrays created using previous versions of the -software will not be compatible with this version. See the -:mod:`zarr.storage` API docs and the ``spec_v1`` for more -information. - -Compression -~~~~~~~~~~~ - -An abstraction layer has also been created between the core array -logic and the code for compressing and decompressing array -chunks. This release still bundles the c-blosc library and uses Blosc -as the default compressor, however other compressors including zlib, -BZ2 and LZMA are also now supported via the Python standard -library. New compressors can also be dynamically registered for use -with Zarr. See the tutorial sections on :ref:`user-guide-compress` and -:ref:`user-guide-tips-blosc`, the ``spec_v1``, and the -:mod:`zarr.compressors` module documentation for more information. - -Synchronization -~~~~~~~~~~~~~~~ - -The synchronization code has also been refactored to create a layer of -abstraction, enabling Zarr arrays to be used in parallel computations -with a number of alternative synchronization methods. For more -information see the tutorial section on :ref:`user-guide-sync` and the -:mod:`zarr.sync` module documentation. - -Changes to the Blosc extension -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -NumPy is no longer a build dependency for the :mod:`zarr.blosc` Cython -extension, so setup.py will run even if NumPy is not already -installed, and should automatically install NumPy as a runtime -dependency. Manual installation of NumPy prior to installing Zarr is -still recommended, however, as the automatic installation of NumPy may -fail or be sub-optimal on some platforms. 
- -Some optimizations have been made within the :mod:`zarr.blosc` -extension to avoid unnecessary memory copies, giving a ~10-20% -performance improvement for multi-threaded compression operations. - -The :mod:`zarr.blosc` extension now automatically detects whether it -is running within a single-threaded or multi-threaded program and -adapts its internal behaviour accordingly (:issue:`27`). There is no need for -the user to make any API calls to switch Blosc between contextual and -non-contextual (global lock) mode. See also the tutorial section on -:ref:`user-guide-tips-blosc`. - -Other changes -~~~~~~~~~~~~~ - -The internal code for managing chunks has been rewritten to be more -efficient. Now no state is maintained for chunks outside of the array -store, meaning that chunks do not carry any extra memory overhead not -accounted for by the store. This negates the need for the "lazy" -option present in the previous release, and this has been removed. - -The memory layout within chunks can now be set as either "C" -(row-major) or "F" (column-major), which can help to provide better -compression for some data (:issue:`7`). See the tutorial -section on :ref:`user-guide-chunks-order` for more information. - -A bug has been fixed within the ``__getitem__`` and ``__setitem__`` -machinery for slicing arrays, to properly handle getting and setting -partial slices. - -Acknowledgments -~~~~~~~~~~~~~~~ - -Thanks to :user:`Matthew Rocklin `, :user:`Stephan Hoyer `, -:user:`Francesc Alted `, :user:`Anthony Scopatz ` and -:user:`Martin Durant ` for contributions and comments. - -.. _release_0.4.0: - -0.4.0 ------ - -See `v0.4.0 release notes on GitHub -`_. - -.. _release_0.3.0: - -0.3.0 ------ - -See `v0.3.0 release notes on GitHub -`_. - -.. _Numcodecs: https://numcodecs.readthedocs.io/ diff --git a/docs/index.rst b/docs/index.rst index 0dcfd7f90f..6ab07b0693 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -11,8 +11,8 @@ Zarr-Python quickstart user-guide/index API reference + release-notes developers/index - developers/release about **Version**: |version| diff --git a/docs/release-notes.rst b/docs/release-notes.rst new file mode 100644 index 0000000000..175bd21aa5 --- /dev/null +++ b/docs/release-notes.rst @@ -0,0 +1,16 @@ +Release notes +============= + +.. _release_3.0.0: + +3.0.0 +----- + +3.0.0 is a new major release of Zarr-Python, with many breaking changes. +See the :ref:`v3 migration guide` for a listing of what's changed. + +Normal release note service will resume with further releases in the 3.0.0 +series. + +Release notes for the zarr-python 2.x and 1.x releases can be found here: +https://zarr.readthedocs.io/en/support-v2/release.html diff --git a/docs/user-guide/v3_migration.rst b/docs/user-guide/v3_migration.rst index 66fcca6d19..bda1ae64ed 100644 --- a/docs/user-guide/v3_migration.rst +++ b/docs/user-guide/v3_migration.rst @@ -1,3 +1,5 @@ +.. 
_v3 migration guide: + 3.0 Migration Guide =================== From 0328656b09b1395daaaba309798fefd78459c2ee Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Thu, 9 Jan 2025 01:57:19 -0700 Subject: [PATCH 82/87] Use dataclasses for ByteRangeRequests (#2585) * Use TypedDicts for more literate ByteRangeRequests * Update utility function * fixes sharding * Ignore mypy errors * Fix offset in _normalize_byte_range_index * Update get_partial_values for FsspecStore * Re-add fs._cat_ranges argument * Simplify typing * Update _normalize to return start, stop * Use explicit range * Use dataclasses * Update typing * Update docstring * Rename ExplicitRange to ExplicitByteRequest * Rename OffsetRange to OffsetByteRequest * Rename SuffixRange to SuffixByteRequest * Use match; case instead of if; elif * Revert "Use match; case instead of if; elif" This reverts commit a7d35f876b1b628b3216da61ee26ba0f3a9d9cf8. * Update ByteRangeRequest to ByteRequest * Remove ByteRange definition from common * Rename ExplicitByteRequest to RangeByteRequest * Provide more informative error message --------- Co-authored-by: Norman Rzepka --- src/zarr/abc/store.py | 50 ++++++++++++++++---- src/zarr/codecs/sharding.py | 24 ++++++---- src/zarr/core/common.py | 1 - src/zarr/storage/_common.py | 10 ++-- src/zarr/storage/_fsspec.py | 81 ++++++++++++++++++++------------- src/zarr/storage/_local.py | 43 ++++++++--------- src/zarr/storage/_logging.py | 6 +-- src/zarr/storage/_memory.py | 14 +++--- src/zarr/storage/_utils.py | 36 +++++++-------- src/zarr/storage/_wrapper.py | 8 ++-- src/zarr/storage/_zip.py | 33 ++++++++------ src/zarr/testing/stateful.py | 5 +- src/zarr/testing/store.py | 40 ++++++++++------ src/zarr/testing/strategies.py | 12 +++-- tests/test_store/test_fsspec.py | 3 +- 15 files changed, 221 insertions(+), 145 deletions(-) diff --git a/src/zarr/abc/store.py b/src/zarr/abc/store.py index bd0a7ad503..e6a5518a4b 100644 --- a/src/zarr/abc/store.py +++ b/src/zarr/abc/store.py @@ -2,6 +2,7 @@ from abc import ABC, abstractmethod from asyncio import gather +from dataclasses import dataclass from itertools import starmap from typing import TYPE_CHECKING, Protocol, runtime_checkable @@ -19,7 +20,34 @@ __all__ = ["ByteGetter", "ByteSetter", "Store", "set_or_delete"] -ByteRangeRequest: TypeAlias = tuple[int | None, int | None] + +@dataclass +class RangeByteRequest: + """Request a specific byte range""" + + start: int + """The start of the byte range request (inclusive).""" + end: int + """The end of the byte range request (exclusive).""" + + +@dataclass +class OffsetByteRequest: + """Request all bytes starting from a given byte offset""" + + offset: int + """The byte offset for the offset range request.""" + + +@dataclass +class SuffixByteRequest: + """Request up to the last `n` bytes""" + + suffix: int + """The number of bytes from the suffix to request.""" + + +ByteRequest: TypeAlias = RangeByteRequest | OffsetByteRequest | SuffixByteRequest class Store(ABC): @@ -141,14 +169,20 @@ async def get( self, key: str, prototype: BufferPrototype, - byte_range: ByteRangeRequest | None = None, + byte_range: ByteRequest | None = None, ) -> Buffer | None: """Retrieve the value associated with a given key. Parameters ---------- key : str - byte_range : tuple[int | None, int | None], optional + byte_range : ByteRequest, optional + + ByteRequest may be one of the following. If not provided, all data associated with the key is retrieved. 
+ + - RangeByteRequest(int, int): Request a specific range of bytes in the form (start, end). The end is exclusive. If the given range is zero-length or starts after the end of the object, an error will be returned. Additionally, if the range ends after the end of the object, the entire remainder of the object will be returned. Otherwise, the exact requested range will be returned. + - OffsetByteRequest(int): Request all bytes starting from a given byte offset. This is equivalent to bytes={int}- as an HTTP header. + - SuffixByteRequest(int): Request the last int bytes. Note that here, int is the size of the request, not the byte offset. This is equivalent to bytes=-{int} as an HTTP header. Returns ------- @@ -160,7 +194,7 @@ async def get( async def get_partial_values( self, prototype: BufferPrototype, - key_ranges: Iterable[tuple[str, ByteRangeRequest]], + key_ranges: Iterable[tuple[str, ByteRequest | None]], ) -> list[Buffer | None]: """Retrieve possibly partial values from given key_ranges. @@ -338,7 +372,7 @@ def close(self) -> None: self._is_open = False async def _get_many( - self, requests: Iterable[tuple[str, BufferPrototype, ByteRangeRequest | None]] + self, requests: Iterable[tuple[str, BufferPrototype, ByteRequest | None]] ) -> AsyncGenerator[tuple[str, Buffer | None], None]: """ Retrieve a collection of objects from storage. In general this method does not guarantee @@ -416,17 +450,17 @@ async def getsize_prefix(self, prefix: str) -> int: @runtime_checkable class ByteGetter(Protocol): async def get( - self, prototype: BufferPrototype, byte_range: ByteRangeRequest | None = None + self, prototype: BufferPrototype, byte_range: ByteRequest | None = None ) -> Buffer | None: ... @runtime_checkable class ByteSetter(Protocol): async def get( - self, prototype: BufferPrototype, byte_range: ByteRangeRequest | None = None + self, prototype: BufferPrototype, byte_range: ByteRequest | None = None ) -> Buffer | None: ... - async def set(self, value: Buffer, byte_range: ByteRangeRequest | None = None) -> None: ... + async def set(self, value: Buffer, byte_range: ByteRequest | None = None) -> None: ... async def delete(self) -> None: ... 
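To make the new request types concrete, here is a minimal sketch of how a caller might exercise them against a store; the ``MemoryStore`` setup is illustrative, while the ``ByteRequest`` classes and the ``Store.get`` signature are taken from the diff above::

    import asyncio

    from zarr.abc.store import OffsetByteRequest, RangeByteRequest, SuffixByteRequest
    from zarr.core.buffer import default_buffer_prototype
    from zarr.storage import MemoryStore

    async def main() -> None:
        store = MemoryStore()
        proto = default_buffer_prototype()
        await store.set("foo", proto.buffer.from_bytes(b"0123456789"))

        # bytes 2..5 (the end bound is exclusive)
        head = await store.get("foo", prototype=proto, byte_range=RangeByteRequest(2, 6))
        # everything from offset 4 onwards
        tail = await store.get("foo", prototype=proto, byte_range=OffsetByteRequest(4))
        # only the last 3 bytes
        suffix = await store.get("foo", prototype=proto, byte_range=SuffixByteRequest(3))

        assert head.to_bytes() == b"2345"
        assert tail.to_bytes() == b"456789"
        assert suffix.to_bytes() == b"789"

    asyncio.run(main())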
diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index a01145b3b2..160a74e892 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -17,7 +17,13 @@ Codec, CodecPipeline, ) -from zarr.abc.store import ByteGetter, ByteRangeRequest, ByteSetter +from zarr.abc.store import ( + ByteGetter, + ByteRequest, + ByteSetter, + RangeByteRequest, + SuffixByteRequest, +) from zarr.codecs.bytes import BytesCodec from zarr.codecs.crc32c_ import Crc32cCodec from zarr.core.array_spec import ArrayConfig, ArraySpec @@ -77,7 +83,7 @@ class _ShardingByteGetter(ByteGetter): chunk_coords: ChunkCoords async def get( - self, prototype: BufferPrototype, byte_range: ByteRangeRequest | None = None + self, prototype: BufferPrototype, byte_range: ByteRequest | None = None ) -> Buffer | None: assert byte_range is None, "byte_range is not supported within shards" assert ( @@ -90,7 +96,7 @@ async def get( class _ShardingByteSetter(_ShardingByteGetter, ByteSetter): shard_dict: ShardMutableMapping - async def set(self, value: Buffer, byte_range: ByteRangeRequest | None = None) -> None: + async def set(self, value: Buffer, byte_range: ByteRequest | None = None) -> None: assert byte_range is None, "byte_range is not supported within shards" self.shard_dict[self.chunk_coords] = value @@ -129,7 +135,7 @@ def get_chunk_slice(self, chunk_coords: ChunkCoords) -> tuple[int, int] | None: if (chunk_start, chunk_len) == (MAX_UINT_64, MAX_UINT_64): return None else: - return (int(chunk_start), int(chunk_len)) + return (int(chunk_start), int(chunk_start + chunk_len)) def set_chunk_slice(self, chunk_coords: ChunkCoords, chunk_slice: slice | None) -> None: localized_chunk = self._localize_chunk(chunk_coords) @@ -203,7 +209,7 @@ def create_empty( def __getitem__(self, chunk_coords: ChunkCoords) -> Buffer: chunk_byte_slice = self.index.get_chunk_slice(chunk_coords) if chunk_byte_slice: - return self.buf[chunk_byte_slice[0] : (chunk_byte_slice[0] + chunk_byte_slice[1])] + return self.buf[chunk_byte_slice[0] : chunk_byte_slice[1]] raise KeyError def __len__(self) -> int: @@ -504,7 +510,8 @@ async def _decode_partial_single( chunk_byte_slice = shard_index.get_chunk_slice(chunk_coords) if chunk_byte_slice: chunk_bytes = await byte_getter.get( - prototype=chunk_spec.prototype, byte_range=chunk_byte_slice + prototype=chunk_spec.prototype, + byte_range=RangeByteRequest(chunk_byte_slice[0], chunk_byte_slice[1]), ) if chunk_bytes: shard_dict[chunk_coords] = chunk_bytes @@ -696,11 +703,12 @@ async def _load_shard_index_maybe( shard_index_size = self._shard_index_size(chunks_per_shard) if self.index_location == ShardingCodecIndexLocation.start: index_bytes = await byte_getter.get( - prototype=numpy_buffer_prototype(), byte_range=(0, shard_index_size) + prototype=numpy_buffer_prototype(), + byte_range=RangeByteRequest(0, shard_index_size), ) else: index_bytes = await byte_getter.get( - prototype=numpy_buffer_prototype(), byte_range=(-shard_index_size, None) + prototype=numpy_buffer_prototype(), byte_range=SuffixByteRequest(shard_index_size) ) if index_bytes is not None: return await self._decode_shard_index(index_bytes, chunks_per_shard) diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 7205b8c206..ad3316b619 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -31,7 +31,6 @@ ZATTRS_JSON = ".zattrs" ZMETADATA_V2_JSON = ".zmetadata" -ByteRangeRequest = tuple[int | None, int | None] BytesLike = bytes | bytearray | memoryview ShapeLike = tuple[int, ...] 
| int ChunkCoords = tuple[int, ...] diff --git a/src/zarr/storage/_common.py b/src/zarr/storage/_common.py index 523e470671..6ab539bb0a 100644 --- a/src/zarr/storage/_common.py +++ b/src/zarr/storage/_common.py @@ -4,7 +4,7 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, Literal -from zarr.abc.store import ByteRangeRequest, Store +from zarr.abc.store import ByteRequest, Store from zarr.core.buffer import Buffer, default_buffer_prototype from zarr.core.common import ZARR_JSON, ZARRAY_JSON, ZGROUP_JSON, AccessModeLiteral, ZarrFormat from zarr.errors import ContainsArrayAndGroupError, ContainsArrayError, ContainsGroupError @@ -102,7 +102,7 @@ async def open( async def get( self, prototype: BufferPrototype | None = None, - byte_range: ByteRangeRequest | None = None, + byte_range: ByteRequest | None = None, ) -> Buffer | None: """ Read bytes from the store. @@ -111,7 +111,7 @@ async def get( ---------- prototype : BufferPrototype, optional The buffer prototype to use when reading the bytes. - byte_range : ByteRangeRequest, optional + byte_range : ByteRequest, optional The range of bytes to read. Returns @@ -123,7 +123,7 @@ async def get( prototype = default_buffer_prototype() return await self.store.get(self.path, prototype=prototype, byte_range=byte_range) - async def set(self, value: Buffer, byte_range: ByteRangeRequest | None = None) -> None: + async def set(self, value: Buffer, byte_range: ByteRequest | None = None) -> None: """ Write bytes to the store. @@ -131,7 +131,7 @@ async def set(self, value: Buffer, byte_range: ByteRangeRequest | None = None) - ---------- value : Buffer The buffer to write. - byte_range : ByteRangeRequest, optional + byte_range : ByteRequest, optional The range of bytes to write. If None, the entire buffer is written. 
Raises diff --git a/src/zarr/storage/_fsspec.py b/src/zarr/storage/_fsspec.py index 89d80320dd..99c8c778e7 100644 --- a/src/zarr/storage/_fsspec.py +++ b/src/zarr/storage/_fsspec.py @@ -3,7 +3,13 @@ import warnings from typing import TYPE_CHECKING, Any -from zarr.abc.store import ByteRangeRequest, Store +from zarr.abc.store import ( + ByteRequest, + OffsetByteRequest, + RangeByteRequest, + Store, + SuffixByteRequest, +) from zarr.storage._common import _dereference_path if TYPE_CHECKING: @@ -199,7 +205,7 @@ async def get( self, key: str, prototype: BufferPrototype, - byte_range: ByteRangeRequest | None = None, + byte_range: ByteRequest | None = None, ) -> Buffer | None: # docstring inherited if not self._is_open: @@ -207,23 +213,26 @@ async def get( path = _dereference_path(self.path, key) try: - if byte_range: - # fsspec uses start/end, not start/length - start, length = byte_range - if start is not None and length is not None: - end = start + length - elif length is not None: - end = length - else: - end = None - value = prototype.buffer.from_bytes( - await ( - self.fs._cat_file(path, start=byte_range[0], end=end) - if byte_range - else self.fs._cat_file(path) + if byte_range is None: + value = prototype.buffer.from_bytes(await self.fs._cat_file(path)) + elif isinstance(byte_range, RangeByteRequest): + value = prototype.buffer.from_bytes( + await self.fs._cat_file( + path, + start=byte_range.start, + end=byte_range.end, + ) ) - ) - + elif isinstance(byte_range, OffsetByteRequest): + value = prototype.buffer.from_bytes( + await self.fs._cat_file(path, start=byte_range.offset, end=None) + ) + elif isinstance(byte_range, SuffixByteRequest): + value = prototype.buffer.from_bytes( + await self.fs._cat_file(path, start=-byte_range.suffix, end=None) + ) + else: + raise ValueError(f"Unexpected byte_range, got {byte_range}.") except self.allowed_exceptions: return None except OSError as e: @@ -270,25 +279,35 @@ async def exists(self, key: str) -> bool: async def get_partial_values( self, prototype: BufferPrototype, - key_ranges: Iterable[tuple[str, ByteRangeRequest]], + key_ranges: Iterable[tuple[str, ByteRequest | None]], ) -> list[Buffer | None]: # docstring inherited if key_ranges: - paths, starts, stops = zip( - *( - ( - _dereference_path(self.path, k[0]), - k[1][0], - ((k[1][0] or 0) + k[1][1]) if k[1][1] is not None else None, - ) - for k in key_ranges - ), - strict=False, - ) + # _cat_ranges expects a list of paths, start, and end ranges, so we need to reformat each ByteRequest. + key_ranges = list(key_ranges) + paths: list[str] = [] + starts: list[int | None] = [] + stops: list[int | None] = [] + for key, byte_range in key_ranges: + paths.append(_dereference_path(self.path, key)) + if byte_range is None: + starts.append(None) + stops.append(None) + elif isinstance(byte_range, RangeByteRequest): + starts.append(byte_range.start) + stops.append(byte_range.end) + elif isinstance(byte_range, OffsetByteRequest): + starts.append(byte_range.offset) + stops.append(None) + elif isinstance(byte_range, SuffixByteRequest): + starts.append(-byte_range.suffix) + stops.append(None) + else: + raise ValueError(f"Unexpected byte_range, got {byte_range}.") else: return [] # TODO: expectations for exceptions or missing keys? 
- res = await self.fs._cat_ranges(list(paths), starts, stops, on_error="return") + res = await self.fs._cat_ranges(paths, starts, stops, on_error="return") # the following is an s3-specific condition we probably don't want to leak res = [b"" if (isinstance(r, OSError) and "not satisfiable" in str(r)) else r for r in res] for r in res: diff --git a/src/zarr/storage/_local.py b/src/zarr/storage/_local.py index f4226792cb..5eaa85c592 100644 --- a/src/zarr/storage/_local.py +++ b/src/zarr/storage/_local.py @@ -7,7 +7,13 @@ from pathlib import Path from typing import TYPE_CHECKING -from zarr.abc.store import ByteRangeRequest, Store +from zarr.abc.store import ( + ByteRequest, + OffsetByteRequest, + RangeByteRequest, + Store, + SuffixByteRequest, +) from zarr.core.buffer import Buffer from zarr.core.buffer.core import default_buffer_prototype from zarr.core.common import concurrent_map @@ -18,29 +24,20 @@ from zarr.core.buffer import BufferPrototype -def _get( - path: Path, prototype: BufferPrototype, byte_range: tuple[int | None, int | None] | None -) -> Buffer: - if byte_range is not None: - if byte_range[0] is None: - start = 0 - else: - start = byte_range[0] - - end = (start + byte_range[1]) if byte_range[1] is not None else None - else: +def _get(path: Path, prototype: BufferPrototype, byte_range: ByteRequest | None) -> Buffer: + if byte_range is None: return prototype.buffer.from_bytes(path.read_bytes()) with path.open("rb") as f: size = f.seek(0, io.SEEK_END) - if start is not None: - if start >= 0: - f.seek(start) - else: - f.seek(max(0, size + start)) - if end is not None: - if end < 0: - end = size + end - return prototype.buffer.from_bytes(f.read(end - f.tell())) + if isinstance(byte_range, RangeByteRequest): + f.seek(byte_range.start) + return prototype.buffer.from_bytes(f.read(byte_range.end - f.tell())) + elif isinstance(byte_range, OffsetByteRequest): + f.seek(byte_range.offset) + elif isinstance(byte_range, SuffixByteRequest): + f.seek(max(0, size - byte_range.suffix)) + else: + raise TypeError(f"Unexpected byte_range, got {byte_range}.") return prototype.buffer.from_bytes(f.read()) @@ -127,7 +124,7 @@ async def get( self, key: str, prototype: BufferPrototype | None = None, - byte_range: tuple[int | None, int | None] | None = None, + byte_range: ByteRequest | None = None, ) -> Buffer | None: # docstring inherited if prototype is None: @@ -145,7 +142,7 @@ async def get( async def get_partial_values( self, prototype: BufferPrototype, - key_ranges: Iterable[tuple[str, ByteRangeRequest]], + key_ranges: Iterable[tuple[str, ByteRequest | None]], ) -> list[Buffer | None]: # docstring inherited args = [] diff --git a/src/zarr/storage/_logging.py b/src/zarr/storage/_logging.py index 45ddeef40c..5ca716df2c 100644 --- a/src/zarr/storage/_logging.py +++ b/src/zarr/storage/_logging.py @@ -13,7 +13,7 @@ if TYPE_CHECKING: from collections.abc import AsyncGenerator, Generator, Iterable - from zarr.abc.store import ByteRangeRequest + from zarr.abc.store import ByteRequest from zarr.core.buffer import Buffer, BufferPrototype counter: defaultdict[str, int] @@ -161,7 +161,7 @@ async def get( self, key: str, prototype: BufferPrototype, - byte_range: tuple[int | None, int | None] | None = None, + byte_range: ByteRequest | None = None, ) -> Buffer | None: # docstring inherited with self.log(key): @@ -170,7 +170,7 @@ async def get( async def get_partial_values( self, prototype: BufferPrototype, - key_ranges: Iterable[tuple[str, ByteRangeRequest]], + key_ranges: Iterable[tuple[str, ByteRequest | None]], 
) -> list[Buffer | None]: # docstring inherited keys = ",".join([k[0] for k in key_ranges]) diff --git a/src/zarr/storage/_memory.py b/src/zarr/storage/_memory.py index 1f8dd75768..d35ecbe33d 100644 --- a/src/zarr/storage/_memory.py +++ b/src/zarr/storage/_memory.py @@ -3,10 +3,10 @@ from logging import getLogger from typing import TYPE_CHECKING, Self -from zarr.abc.store import ByteRangeRequest, Store +from zarr.abc.store import ByteRequest, Store from zarr.core.buffer import Buffer, gpu from zarr.core.common import concurrent_map -from zarr.storage._utils import _normalize_interval_index +from zarr.storage._utils import _normalize_byte_range_index if TYPE_CHECKING: from collections.abc import AsyncIterator, Iterable, MutableMapping @@ -75,7 +75,7 @@ async def get( self, key: str, prototype: BufferPrototype, - byte_range: tuple[int | None, int | None] | None = None, + byte_range: ByteRequest | None = None, ) -> Buffer | None: # docstring inherited if not self._is_open: @@ -83,20 +83,20 @@ async def get( assert isinstance(key, str) try: value = self._store_dict[key] - start, length = _normalize_interval_index(value, byte_range) - return prototype.buffer.from_buffer(value[start : start + length]) + start, stop = _normalize_byte_range_index(value, byte_range) + return prototype.buffer.from_buffer(value[start:stop]) except KeyError: return None async def get_partial_values( self, prototype: BufferPrototype, - key_ranges: Iterable[tuple[str, ByteRangeRequest]], + key_ranges: Iterable[tuple[str, ByteRequest | None]], ) -> list[Buffer | None]: # docstring inherited # All the key-range arguments go with the same prototype - async def _get(key: str, byte_range: ByteRangeRequest) -> Buffer | None: + async def _get(key: str, byte_range: ByteRequest | None) -> Buffer | None: return await self.get(key, prototype=prototype, byte_range=byte_range) return await concurrent_map(key_ranges, _get, limit=None) diff --git a/src/zarr/storage/_utils.py b/src/zarr/storage/_utils.py index 7ba82b00fd..4fc3171eb8 100644 --- a/src/zarr/storage/_utils.py +++ b/src/zarr/storage/_utils.py @@ -4,7 +4,10 @@ from pathlib import Path from typing import TYPE_CHECKING +from zarr.abc.store import OffsetByteRequest, RangeByteRequest, SuffixByteRequest + if TYPE_CHECKING: + from zarr.abc.store import ByteRequest from zarr.core.buffer import Buffer @@ -44,25 +47,22 @@ def normalize_path(path: str | bytes | Path | None) -> str: return result -def _normalize_interval_index( - data: Buffer, interval: tuple[int | None, int | None] | None -) -> tuple[int, int]: +def _normalize_byte_range_index(data: Buffer, byte_range: ByteRequest | None) -> tuple[int, int]: """ - Convert an implicit interval into an explicit start and length + Convert a ByteRequest into an explicit start and stop """ - if interval is None: + if byte_range is None: start = 0 - length = len(data) + stop = len(data) + 1 + elif isinstance(byte_range, RangeByteRequest): + start = byte_range.start + stop = byte_range.end + elif isinstance(byte_range, OffsetByteRequest): + start = byte_range.offset + stop = len(data) + 1 + elif isinstance(byte_range, SuffixByteRequest): + start = len(data) - byte_range.suffix + stop = len(data) + 1 else: - maybe_start, maybe_len = interval - if maybe_start is None: - start = 0 - else: - start = maybe_start - - if maybe_len is None: - length = len(data) - start - else: - length = maybe_len - - return (start, length) + raise ValueError(f"Unexpected byte_range, got {byte_range}.") + return (start, stop) diff --git 
a/src/zarr/storage/_wrapper.py b/src/zarr/storage/_wrapper.py index c160100084..255e965439 100644 --- a/src/zarr/storage/_wrapper.py +++ b/src/zarr/storage/_wrapper.py @@ -7,7 +7,7 @@ from types import TracebackType from typing import Any, Self - from zarr.abc.store import ByteRangeRequest + from zarr.abc.store import ByteRequest from zarr.core.buffer import Buffer, BufferPrototype from zarr.core.common import BytesLike @@ -70,14 +70,14 @@ def __eq__(self, value: object) -> bool: return type(self) is type(value) and self._store.__eq__(value) async def get( - self, key: str, prototype: BufferPrototype, byte_range: ByteRangeRequest | None = None + self, key: str, prototype: BufferPrototype, byte_range: ByteRequest | None = None ) -> Buffer | None: return await self._store.get(key, prototype, byte_range) async def get_partial_values( self, prototype: BufferPrototype, - key_ranges: Iterable[tuple[str, ByteRangeRequest]], + key_ranges: Iterable[tuple[str, ByteRequest | None]], ) -> list[Buffer | None]: return await self._store.get_partial_values(prototype, key_ranges) @@ -133,7 +133,7 @@ def close(self) -> None: self._store.close() async def _get_many( - self, requests: Iterable[tuple[str, BufferPrototype, ByteRangeRequest | None]] + self, requests: Iterable[tuple[str, BufferPrototype, ByteRequest | None]] ) -> AsyncGenerator[tuple[str, Buffer | None], None]: async for req in self._store._get_many(requests): yield req diff --git a/src/zarr/storage/_zip.py b/src/zarr/storage/_zip.py index a186b3cf59..e808b80e4e 100644 --- a/src/zarr/storage/_zip.py +++ b/src/zarr/storage/_zip.py @@ -7,7 +7,13 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, Literal -from zarr.abc.store import ByteRangeRequest, Store +from zarr.abc.store import ( + ByteRequest, + OffsetByteRequest, + RangeByteRequest, + Store, + SuffixByteRequest, +) from zarr.core.buffer import Buffer, BufferPrototype if TYPE_CHECKING: @@ -138,23 +144,24 @@ def _get( self, key: str, prototype: BufferPrototype, - byte_range: ByteRangeRequest | None = None, + byte_range: ByteRequest | None = None, ) -> Buffer | None: # docstring inherited try: with self._zf.open(key) as f: # will raise KeyError if byte_range is None: return prototype.buffer.from_bytes(f.read()) - start, length = byte_range - if start: - if start < 0: - start = f.seek(start, os.SEEK_END) + start - else: - start = f.seek(start, os.SEEK_SET) - if length: - return prototype.buffer.from_bytes(f.read(length)) + elif isinstance(byte_range, RangeByteRequest): + f.seek(byte_range.start) + return prototype.buffer.from_bytes(f.read(byte_range.end - f.tell())) + size = f.seek(0, os.SEEK_END) + if isinstance(byte_range, OffsetByteRequest): + f.seek(byte_range.offset) + elif isinstance(byte_range, SuffixByteRequest): + f.seek(max(0, size - byte_range.suffix)) else: - return prototype.buffer.from_bytes(f.read()) + raise TypeError(f"Unexpected byte_range, got {byte_range}.") + return prototype.buffer.from_bytes(f.read()) except KeyError: return None @@ -162,7 +169,7 @@ async def get( self, key: str, prototype: BufferPrototype, - byte_range: ByteRangeRequest | None = None, + byte_range: ByteRequest | None = None, ) -> Buffer | None: # docstring inherited assert isinstance(key, str) @@ -173,7 +180,7 @@ async def get( async def get_partial_values( self, prototype: BufferPrototype, - key_ranges: Iterable[tuple[str, ByteRangeRequest]], + key_ranges: Iterable[tuple[str, ByteRequest | None]], ) -> list[Buffer | None]: # docstring inherited out = [] diff --git 
a/src/zarr/testing/stateful.py b/src/zarr/testing/stateful.py index cc0f220807..1a1ef0e3a3 100644 --- a/src/zarr/testing/stateful.py +++ b/src/zarr/testing/stateful.py @@ -355,9 +355,8 @@ def get_partial_values(self, data: DataObject) -> None: model_vals_ls = [] for key, byte_range in key_range: - start = byte_range[0] or 0 - step = byte_range[1] - stop = start + step if step is not None else None + start = byte_range.start + stop = byte_range.end model_vals_ls.append(self.model[key][start:stop]) assert all( diff --git a/src/zarr/testing/store.py b/src/zarr/testing/store.py index ada028c273..602d001693 100644 --- a/src/zarr/testing/store.py +++ b/src/zarr/testing/store.py @@ -9,15 +9,21 @@ if TYPE_CHECKING: from typing import Any - from zarr.abc.store import ByteRangeRequest + from zarr.abc.store import ByteRequest from zarr.core.buffer.core import BufferPrototype import pytest -from zarr.abc.store import ByteRangeRequest, Store +from zarr.abc.store import ( + ByteRequest, + OffsetByteRequest, + RangeByteRequest, + Store, + SuffixByteRequest, +) from zarr.core.buffer import Buffer, default_buffer_prototype from zarr.core.sync import _collect_aiterator -from zarr.storage._utils import _normalize_interval_index +from zarr.storage._utils import _normalize_byte_range_index from zarr.testing.utils import assert_bytes_equal __all__ = ["StoreTests"] @@ -115,18 +121,18 @@ def test_store_supports_listing(self, store: S) -> None: @pytest.mark.parametrize("key", ["c/0", "foo/c/0.0", "foo/0/0"]) @pytest.mark.parametrize("data", [b"\x01\x02\x03\x04", b""]) - @pytest.mark.parametrize("byte_range", [None, (0, None), (1, None), (1, 2), (None, 1)]) - async def test_get( - self, store: S, key: str, data: bytes, byte_range: tuple[int | None, int | None] | None - ) -> None: + @pytest.mark.parametrize( + "byte_range", [None, RangeByteRequest(1, 4), OffsetByteRequest(1), SuffixByteRequest(1)] + ) + async def test_get(self, store: S, key: str, data: bytes, byte_range: ByteRequest) -> None: """ Ensure that data can be read from the store using the store.get method. 
""" data_buf = self.buffer_cls.from_bytes(data) await self.set(store, key, data_buf) observed = await store.get(key, prototype=default_buffer_prototype(), byte_range=byte_range) - start, length = _normalize_interval_index(data_buf, interval=byte_range) - expected = data_buf[start : start + length] + start, stop = _normalize_byte_range_index(data_buf, byte_range=byte_range) + expected = data_buf[start:stop] assert_bytes_equal(observed, expected) async def test_get_many(self, store: S) -> None: @@ -179,13 +185,17 @@ async def test_set_many(self, store: S) -> None: "key_ranges", [ [], - [("zarr.json", (0, 1))], - [("c/0", (0, 1)), ("zarr.json", (0, None))], - [("c/0/0", (0, 1)), ("c/0/1", (None, 2)), ("c/0/2", (0, 3))], + [("zarr.json", RangeByteRequest(0, 2))], + [("c/0", RangeByteRequest(0, 2)), ("zarr.json", None)], + [ + ("c/0/0", RangeByteRequest(0, 2)), + ("c/0/1", SuffixByteRequest(2)), + ("c/0/2", OffsetByteRequest(2)), + ], ], ) async def test_get_partial_values( - self, store: S, key_ranges: list[tuple[str, tuple[int | None, int | None]]] + self, store: S, key_ranges: list[tuple[str, ByteRequest]] ) -> None: # put all of the data for key, _ in key_ranges: @@ -367,7 +377,7 @@ async def set(self, key: str, value: Buffer) -> None: await self._store.set(key, value) async def get( - self, key: str, prototype: BufferPrototype, byte_range: ByteRangeRequest | None = None + self, key: str, prototype: BufferPrototype, byte_range: ByteRequest | None = None ) -> Buffer | None: """ Add latency to the ``get`` method. @@ -380,7 +390,7 @@ async def get( The key to get prototype : BufferPrototype The BufferPrototype to use. - byte_range : ByteRangeRequest, optional + byte_range : ByteRequest, optional An optional byte range. Returns diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py index 1bde01b8f9..b948651ce6 100644 --- a/src/zarr/testing/strategies.py +++ b/src/zarr/testing/strategies.py @@ -7,6 +7,7 @@ from hypothesis.strategies import SearchStrategy import zarr +from zarr.abc.store import RangeByteRequest from zarr.core.array import Array from zarr.core.common import ZarrFormat from zarr.core.sync import sync @@ -194,12 +195,13 @@ def key_ranges( Function to generate key_ranges strategy for get_partial_values() returns list strategy w/ form:: - [(key, (range_start, range_step)), - (key, (range_start, range_step)),...] + [(key, (range_start, range_end)), + (key, (range_start, range_end)),...] 
""" - byte_ranges = st.tuples( - st.none() | st.integers(min_value=0, max_value=max_size), - st.none() | st.integers(min_value=0, max_value=max_size), + byte_ranges = st.builds( + RangeByteRequest, + start=st.integers(min_value=0, max_value=max_size), + end=st.integers(min_value=0, max_value=max_size), ) key_tuple = st.tuples(keys, byte_ranges) return st.lists(key_tuple, min_size=1, max_size=10) diff --git a/tests/test_store/test_fsspec.py b/tests/test_store/test_fsspec.py index b307f2cdf4..2713a2969d 100644 --- a/tests/test_store/test_fsspec.py +++ b/tests/test_store/test_fsspec.py @@ -8,6 +8,7 @@ from botocore.session import Session import zarr.api.asynchronous +from zarr.abc.store import OffsetByteRequest from zarr.core.buffer import Buffer, cpu, default_buffer_prototype from zarr.core.sync import _collect_aiterator, sync from zarr.storage import FsspecStore @@ -97,7 +98,7 @@ async def test_basic() -> None: assert await store.exists("foo") assert (await store.get("foo", prototype=default_buffer_prototype())).to_bytes() == data out = await store.get_partial_values( - prototype=default_buffer_prototype(), key_ranges=[("foo", (1, None))] + prototype=default_buffer_prototype(), key_ranges=[("foo", OffsetByteRequest(1))] ) assert out[0].to_bytes() == data[1:] From e10b69d72a2d00ffdfa39ac4f4195363fb8e16fd Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Thu, 9 Jan 2025 08:21:03 -0800 Subject: [PATCH 83/87] doc: add release announcement banner (#2677) --- docs/conf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index 22d24c3515..75584566c6 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -91,7 +91,7 @@ def skip_submodules( # General information about the project. project = "zarr" -copyright = "2024, Zarr Developers" +copyright = "2025, Zarr Developers" author = "Zarr Developers" version = get_version("zarr") @@ -181,6 +181,7 @@ def skip_submodules( ], "collapse_navigation": True, "navigation_with_keys": False, + "announcement": "Zarr-Python 3 is here! Check out the release announcement here.", } # Add any paths that contain custom themes here, relative to this directory. From 99a3576beebdb64b3f51ec0ea084aea6ebe74e96 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Fri, 10 Jan 2025 12:02:42 +0100 Subject: [PATCH 84/87] Fix: order for v2 arrays (#2679) * fixes order for v2 arrays * release notes --- docs/release-notes.rst | 14 ++++++++++++++ src/zarr/core/array.py | 18 +++++++++++++----- tests/test_v2.py | 38 +++++++++++++++++++++++++++++--------- 3 files changed, 56 insertions(+), 14 deletions(-) diff --git a/docs/release-notes.rst b/docs/release-notes.rst index 175bd21aa5..6703b82d35 100644 --- a/docs/release-notes.rst +++ b/docs/release-notes.rst @@ -1,6 +1,20 @@ Release notes ============= +Unreleased +---------- + +New features +~~~~~~~~~~~~ + +Bug fixes +~~~~~~~~~ +* Fixes ``order`` argument for Zarr format 2 arrays. + By :user:`Norman Rzepka ` (:issue:`2679`). + +Behaviour changes +~~~~~~~~~~~~~~~~~ + .. 
_release_3.0.0: 3.0.0 diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index ea29a6fc48..6f67b612d5 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -4,7 +4,7 @@ import warnings from asyncio import gather from collections.abc import Iterable -from dataclasses import dataclass, field +from dataclasses import dataclass, field, replace from itertools import starmap from logging import getLogger from typing import ( @@ -1226,14 +1226,17 @@ async def _get_selection( fill_value=self.metadata.fill_value, ) if product(indexer.shape) > 0: + # need to use the order from the metadata for v2 + _config = self._config + if self.metadata.zarr_format == 2: + _config = replace(_config, order=self.metadata.order) + # reading chunks and decoding them await self.codec_pipeline.read( [ ( self.store_path / self.metadata.encode_chunk_key(chunk_coords), - self.metadata.get_chunk_spec( - chunk_coords, self._config, prototype=prototype - ), + self.metadata.get_chunk_spec(chunk_coords, _config, prototype=prototype), chunk_selection, out_selection, ) @@ -1350,12 +1353,17 @@ async def _set_selection( # Buffer and NDBuffer between components. value_buffer = prototype.nd_buffer.from_ndarray_like(value) + # need to use the order from the metadata for v2 + _config = self._config + if self.metadata.zarr_format == 2: + _config = replace(_config, order=self.metadata.order) + # merging with existing data and encoding chunks await self.codec_pipeline.write( [ ( self.store_path / self.metadata.encode_chunk_key(chunk_coords), - self.metadata.get_chunk_spec(chunk_coords, self._config, prototype), + self.metadata.get_chunk_spec(chunk_coords, _config, prototype), chunk_selection, out_selection, ) diff --git a/tests/test_v2.py b/tests/test_v2.py index 72127f4ede..9fe31956f8 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -12,6 +12,8 @@ import zarr.core.buffer import zarr.storage from zarr import config +from zarr.core.buffer.core import default_buffer_prototype +from zarr.core.sync import sync from zarr.storage import MemoryStore, StorePath @@ -166,36 +168,54 @@ def test_v2_filters_codecs(filters: Any, order: Literal["C", "F"]) -> None: @pytest.mark.parametrize("array_order", ["C", "F"]) @pytest.mark.parametrize("data_order", ["C", "F"]) -def test_v2_non_contiguous(array_order: Literal["C", "F"], data_order: Literal["C", "F"]) -> None: +@pytest.mark.parametrize("memory_order", ["C", "F"]) +def test_v2_non_contiguous( + array_order: Literal["C", "F"], data_order: Literal["C", "F"], memory_order: Literal["C", "F"] +) -> None: + store = MemoryStore() arr = zarr.create_array( - MemoryStore({}), + store, shape=(10, 8), chunks=(3, 3), fill_value=np.nan, dtype="float64", zarr_format=2, + filters=None, + compressors=None, overwrite=True, order=array_order, + config={"order": memory_order}, ) # Non-contiguous write a = np.arange(arr.shape[0] * arr.shape[1]).reshape(arr.shape, order=data_order) - arr[slice(6, 9, None), slice(3, 6, None)] = a[ - slice(6, 9, None), slice(3, 6, None) - ] # The slice on the RHS is important + arr[6:9, 3:6] = a[6:9, 3:6] # The slice on the RHS is important + np.testing.assert_array_equal(arr[6:9, 3:6], a[6:9, 3:6]) + np.testing.assert_array_equal( - arr[slice(6, 9, None), slice(3, 6, None)], a[slice(6, 9, None), slice(3, 6, None)] + a[6:9, 3:6], + np.frombuffer( + sync(store.get("2.1", default_buffer_prototype())).to_bytes(), dtype="float64" + ).reshape((3, 3), order=array_order), ) + if memory_order == "F": + assert (arr[6:9, 3:6]).flags.f_contiguous + else: + assert 
(arr[6:9, 3:6]).flags.c_contiguous + store = MemoryStore() arr = zarr.create_array( - MemoryStore({}), + store, shape=(10, 8), chunks=(3, 3), fill_value=np.nan, dtype="float64", zarr_format=2, + compressors=None, + filters=None, overwrite=True, order=array_order, + config={"order": memory_order}, ) # Contiguous write @@ -204,8 +224,8 @@ def test_v2_non_contiguous(array_order: Literal["C", "F"], data_order: Literal[" assert a.flags.f_contiguous else: assert a.flags.c_contiguous - arr[slice(6, 9, None), slice(3, 6, None)] = a - np.testing.assert_array_equal(arr[slice(6, 9, None), slice(3, 6, None)], a) + arr[6:9, 3:6] = a + np.testing.assert_array_equal(arr[6:9, 3:6], a) def test_default_compressor_deprecation_warning(): From 0e1fde44b2ff3904bbe88fc4d1424d61d769dfe2 Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Fri, 10 Jan 2025 07:43:38 -0800 Subject: [PATCH 85/87] test: enable codecov in main test action (#2682) * test: enable codecov in main test action * output coverage report * add codecov.yml * add junit config * comment: false * skip checking TYPE_CHECKING blocks --- .github/workflows/test.yml | 7 ++++++- codecov.yml | 10 ++++++++++ pyproject.toml | 8 ++++---- 3 files changed, 20 insertions(+), 5 deletions(-) create mode 100644 codecov.yml diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5309ea4565..ea65c3f0e4 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -60,7 +60,12 @@ jobs: hatch env run -e test.py${{ matrix.python-version }}-${{ matrix.numpy-version }}-${{ matrix.dependency-set }} list-env - name: Run Tests run: | - hatch env run --env test.py${{ matrix.python-version }}-${{ matrix.numpy-version }}-${{ matrix.dependency-set }} run + hatch env run --env test.py${{ matrix.python-version }}-${{ matrix.numpy-version }}-${{ matrix.dependency-set }} run-coverage + - name: Upload coverage + uses: codecov/codecov-action@v5 + with: + token: ${{ secrets.CODECOV_TOKEN }} + verbose: true # optional (default = false) test-upstream-and-min-deps: name: py=${{ matrix.python-version }}-${{ matrix.dependency-set }} diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 0000000000..83274aedec --- /dev/null +++ b/codecov.yml @@ -0,0 +1,10 @@ +coverage: + status: + patch: + default: + target: auto + project: + default: + target: auto + threshold: 0.1 +comment: false diff --git a/pyproject.toml b/pyproject.toml index 05db0860a8..96b7ead74b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -103,13 +103,13 @@ Homepage = "https://github.com/zarr-developers/zarr-python" [tool.coverage.report] exclude_lines = [ "pragma: no cover", + "if TYPE_CHECKING:", "pragma: ${PY_MAJOR_VERSION} no cover", '.*\.\.\.' # Ignore "..." 
lines ] [tool.coverage.run] omit = [ - "src/zarr/meta_v1.py", "bench/compress_normal.py", ] @@ -140,8 +140,8 @@ numpy = ["1.25", "2.1"] features = ["gpu"] [tool.hatch.envs.test.scripts] -run-coverage = "pytest --cov-config=pyproject.toml --cov=pkg --cov=src" -run-coverage-gpu = "pip install cupy-cuda12x && pytest -m gpu --cov-config=pyproject.toml --cov=pkg --cov=src" +run-coverage = "pytest --cov-config=pyproject.toml --cov=pkg --cov-report xml --cov=src --junitxml=junit.xml -o junit_family=legacy" +run-coverage-gpu = "pip install cupy-cuda12x && pytest -m gpu --cov-config=pyproject.toml --cov=pkg --cov-report xml --cov=src --junitxml=junit.xml -o junit_family=legacy" run = "run-coverage --no-cov" run-verbose = "run-coverage --verbose" run-mypy = "mypy src" @@ -170,7 +170,7 @@ numpy = ["1.25", "2.1"] version = ["minimal"] [tool.hatch.envs.gputest.scripts] -run-coverage = "pytest -m gpu --cov-config=pyproject.toml --cov=pkg --cov=src" +run-coverage = "pytest -m gpu --cov-config=pyproject.toml --cov=pkg --cov-report xml --cov=src --junitxml=junit.xml -o junit_family=legacy" run = "run-coverage --no-cov" run-verbose = "run-coverage --verbose" run-mypy = "mypy src" From 26fa37ef277f24f0f81d3d1e6309c26ed3fb9fdb Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 10 Jan 2025 17:52:58 -0500 Subject: [PATCH 86/87] Use new ByteRequest syntax --- src/zarr/storage/object_store.py | 88 +++++++++++++++++++------------- 1 file changed, 52 insertions(+), 36 deletions(-) diff --git a/src/zarr/storage/object_store.py b/src/zarr/storage/object_store.py index 8304e578f1..71c4c36215 100644 --- a/src/zarr/storage/object_store.py +++ b/src/zarr/storage/object_store.py @@ -8,7 +8,13 @@ import obstore as obs -from zarr.abc.store import ByteRangeRequest, Store +from zarr.abc.store import ( + ByteRequest, + OffsetByteRequest, + RangeByteRequest, + Store, + SuffixByteRequest, +) from zarr.core.buffer import Buffer from zarr.core.buffer.core import BufferPrototype @@ -64,36 +70,33 @@ def __repr__(self) -> str: return f"ObjectStore({self})" async def get( - self, key: str, prototype: BufferPrototype, byte_range: ByteRangeRequest | None = None + self, key: str, prototype: BufferPrototype, byte_range: ByteRequest | None = None ) -> Buffer: if byte_range is None: resp = await obs.get_async(self.store, key) return prototype.buffer.from_bytes(await resp.bytes_async()) - - start, end = byte_range - if (start is None or start == 0) and end is None: - resp = await obs.get_async(self.store, key) - return prototype.buffer.from_bytes(await resp.bytes_async()) - if start is not None and end is not None: - resp = await obs.get_range_async(self.store, key, start=start, end=end) + elif isinstance(byte_range, RangeByteRequest): + resp = await obs.get_range_async( + self.store, key, start=byte_range.start, end=byte_range.end + ) return prototype.buffer.from_bytes(memoryview(resp)) - elif start is not None: - if start > 0: - # Offset request - resp = await obs.get_async(self.store, key, options={"range": {"offset": start}}) - else: - resp = await obs.get_async(self.store, key, options={"range": {"suffix": start}}) + elif isinstance(byte_range, OffsetByteRequest): + resp = await obs.get_async( + self.store, key, options={"range": {"offset": byte_range.offset}} + ) + return prototype.buffer.from_bytes(await resp.bytes_async()) + elif isinstance(byte_range, SuffixByteRequest): + resp = await obs.get_async( + self.store, key, options={"range": {"suffix": byte_range.suffix}} + ) return 
prototype.buffer.from_bytes(await resp.bytes_async()) - elif end is not None: - resp = await obs.get_range_async(self.store, key, start=0, end=end) - return prototype.buffer.from_bytes(memoryview(resp)) else: - raise ValueError(f"Unexpected input to `get`: {start=}, {end=}") + raise ValueError(f"Unexpected input to `get`: {byte_range}") async def get_partial_values( self, prototype: BufferPrototype, - key_ranges: Iterable[tuple[str, ByteRangeRequest]], + key_ranges: Iterable[tuple[str, ByteRequest | None]], ) -> list[Buffer | None]: return await _get_partial_values(self.store, prototype=prototype, key_ranges=key_ranges) @@ -260,7 +263,10 @@ async def _make_other_request( We return a `list[_Response]` for symmetry with `_make_bounded_requests` so that all futures can be gathered together. """ - resp = await obs.get_async(store, request["path"], options={"range": request["range"]}) + if request["range"] is None: + resp = await obs.get_async(store, request["path"]) + else: + resp = await obs.get_async(store, request["path"], options={"range": request["range"]}) buffer = await resp.bytes_async() return [ { @@ -273,7 +279,7 @@ async def _make_other_request( async def _get_partial_values( store: obs.store.ObjectStore, prototype: BufferPrototype, - key_ranges: Iterable[tuple[str, ByteRangeRequest]], + key_ranges: Iterable[tuple[str, ByteRequest | None]], ) -> list[Buffer | None]: """Make multiple range requests. @@ -290,27 +296,37 @@ async def _get_partial_values( per_file_bounded_requests: dict[str, list[_BoundedRequest]] = defaultdict(list) other_requests: list[_OtherRequest] = [] - for idx, (path, (start, end)) in enumerate(key_ranges): - if start is None: - raise ValueError("Cannot pass `None` for the start of the range request.") - - if end is not None: - # This is a bounded request with known start and end byte. 
+    for idx, (path, byte_range) in enumerate(key_ranges):
+        if byte_range is None:
+            other_requests.append(
+                {
+                    "original_request_index": idx,
+                    "path": path,
+                    "range": None,
+                }
+            )
+        elif isinstance(byte_range, RangeByteRequest):
             per_file_bounded_requests[path].append(
-                {"original_request_index": idx, "start": start, "end": end}
+                {"original_request_index": idx, "start": byte_range.start, "end": byte_range.end}
             )
-        elif start < 0:
-            # Suffix request from the end
+        elif isinstance(byte_range, OffsetByteRequest):
             other_requests.append(
-                {"original_request_index": idx, "path": path, "range": {"suffix": abs(start)}}
+                {
+                    "original_request_index": idx,
+                    "path": path,
+                    "range": {"offset": byte_range.offset},
+                }
             )
-        elif start >= 0:
-            # Offset request to the end
+        elif isinstance(byte_range, SuffixByteRequest):
             other_requests.append(
-                {"original_request_index": idx, "path": path, "range": {"offset": start}}
+                {
+                    "original_request_index": idx,
+                    "path": path,
+                    "range": {"suffix": byte_range.suffix},
+                }
             )
         else:
-            raise ValueError(f"Unsupported range input: {start=}, {end=}")
+            raise ValueError(f"Unsupported range input: {byte_range}")
 
     futs: list[Coroutine[Any, Any, list[_Response]]] = []
     for path, bounded_ranges in per_file_bounded_requests.items():

From 315e22ef21e19972c51de6cf1b75b5e7ea5d889d Mon Sep 17 00:00:00 2001
From: Max Jones <14077947+maxrjones@users.noreply.github.com>
Date: Fri, 10 Jan 2025 18:03:20 -0500
Subject: [PATCH 87/87] Raise not implemented error on pickling

---
 src/zarr/storage/object_store.py | 6 ++++++
 tests/test_store/test_object.py  | 6 ++++++
 2 files changed, 12 insertions(+)

diff --git a/src/zarr/storage/object_store.py b/src/zarr/storage/object_store.py
index 71c4c36215..45e7b708d1 100644
--- a/src/zarr/storage/object_store.py
+++ b/src/zarr/storage/object_store.py
@@ -69,6 +69,12 @@ def __str__(self) -> str:
     def __repr__(self) -> str:
         return f"ObjectStore({self})"
 
+    def __getstate__(self) -> None:
+        raise NotImplementedError("Pickling has not been implemented for ObjectStore")
+
+    def __setstate__(self, state: object) -> None:
+        raise NotImplementedError("Pickling has not been implemented for ObjectStore")
+
     async def get(
         self, key: str, prototype: BufferPrototype, byte_range: ByteRequest | None = None
     ) -> Buffer:
diff --git a/tests/test_store/test_object.py b/tests/test_store/test_object.py
index ca6a1b176f..70a105c986 100644
--- a/tests/test_store/test_object.py
+++ b/tests/test_store/test_object.py
@@ -3,6 +3,7 @@
 
 obstore = pytest.importorskip("obstore")
 
+import pickle
 import re
 
 from zarr.core.buffer import Buffer, cpu
@@ -51,3 +52,8 @@ def test_store_supports_partial_writes(self, store: ObjectStore) -> None:
 
     def test_store_supports_listing(self, store: ObjectStore) -> None:
         assert store.supports_listing
+
+    @pytest.mark.xfail(reason="Not Implemented")
+    def test_serializable_store(self, store: ObjectStore) -> None:
+        foo = pickle.dumps(store)
+        assert pickle.loads(foo) == store
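
Editor's illustration (not part of the patch series): the commits above replace the old
(start, length) byte-range tuples with three explicit request types taken from
zarr.abc.store — RangeByteRequest(start, end) with an exclusive end (the ZipStore
implementation reads end - start bytes), OffsetByteRequest(offset) reading from
offset to the end of the object, and SuffixByteRequest(suffix) reading the last
suffix bytes. The sketch below exercises all three plus the batched
get_partial_values form; it assumes a fresh MemoryStore is writable as-is (as in the
tests above), and the key name and byte values are purely illustrative.

import asyncio

from zarr.abc.store import OffsetByteRequest, RangeByteRequest, SuffixByteRequest
from zarr.core.buffer import cpu, default_buffer_prototype
from zarr.storage import MemoryStore


async def main() -> None:
    # Assumption: a newly constructed MemoryStore accepts writes directly.
    store = MemoryStore()
    proto = default_buffer_prototype()
    await store.set("foo", cpu.Buffer.from_bytes(b"abcdefgh"))

    # Bounded request: start is inclusive, end is exclusive.
    head = await store.get("foo", prototype=proto, byte_range=RangeByteRequest(2, 5))
    assert head is not None and head.to_bytes() == b"cde"

    # Offset request: byte 4 through the end of the object.
    tail = await store.get("foo", prototype=proto, byte_range=OffsetByteRequest(4))
    assert tail is not None and tail.to_bytes() == b"efgh"

    # Suffix request: the last three bytes of the object.
    last = await store.get("foo", prototype=proto, byte_range=SuffixByteRequest(3))
    assert last is not None and last.to_bytes() == b"fgh"

    # Batched form: one entry per (key, request); None means the whole object.
    out = await store.get_partial_values(
        prototype=proto,
        key_ranges=[("foo", RangeByteRequest(0, 2)), ("foo", None)],
    )
    assert [b.to_bytes() for b in out if b is not None] == [b"ab", b"abcdefgh"]


asyncio.run(main())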
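
A second illustration, for the order fix in PATCH 84: that commit separates the
``order`` recorded in Zarr format 2 metadata (the on-disk chunk layout) from the
runtime config ``order`` (the memory layout of arrays returned by reads). The
sketch below restates the behaviour pinned down by the updated test_v2_non_contiguous
test; the keyword split (``order=`` vs ``config={"order": ...}``) is taken directly
from that test, while the shapes and values here are illustrative, not from the
patches.

import numpy as np

import zarr
from zarr.storage import MemoryStore

# Metadata order "F": chunks are encoded Fortran-ordered on disk.
# Config order "C": arrays handed back by reads are C-contiguous.
arr = zarr.create_array(
    MemoryStore(),
    shape=(4, 4),
    chunks=(2, 2),
    dtype="float64",
    zarr_format=2,
    order="F",
    config={"order": "C"},
)
arr[:] = np.arange(16.0).reshape(4, 4)
assert arr[:].flags.c_contiguous  # memory layout follows the config, not the metadata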