diff --git a/changes/2839.feature.rst b/changes/2839.feature.rst new file mode 100644 index 0000000000..5256493563 --- /dev/null +++ b/changes/2839.feature.rst @@ -0,0 +1 @@ +Array creation allows string representation of codecs for ``filters``, ``serializer``, and ``compressors``. \ No newline at end of file diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index f2dc8757d6..ae9f644012 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -803,7 +803,7 @@ def create_array( chunk to bytes. For Zarr format 3, a "filter" is a codec that takes an array and returns an array, - and these values must be instances of ``ArrayArrayCodec``, or dict representations + and these values must be instances of ``ArrayArrayCodec``, or dict or string representations of ``ArrayArrayCodec``. If no ``filters`` are provided, a default set of filters will be used. These defaults can be changed by modifying the value of ``array.v3_default_filters`` diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index cd6b33a28c..ef4bc674c2 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3794,11 +3794,11 @@ def _build_parents( FiltersLike: TypeAlias = ( - Iterable[dict[str, JSON] | ArrayArrayCodec | numcodecs.abc.Codec] + Iterable[dict[str, JSON] | str | ArrayArrayCodec | numcodecs.abc.Codec] | ArrayArrayCodec | Iterable[numcodecs.abc.Codec] | numcodecs.abc.Codec - | Literal["auto"] + | str | None ) # Union of acceptable types for users to pass in for both v2 and v3 compressors @@ -3807,14 +3807,14 @@ def _build_parents( ) CompressorsLike: TypeAlias = ( - Iterable[dict[str, JSON] | BytesBytesCodec | numcodecs.abc.Codec] + Iterable[dict[str, JSON] | str | BytesBytesCodec | numcodecs.abc.Codec] | dict[str, JSON] | BytesBytesCodec | numcodecs.abc.Codec - | Literal["auto"] + | str | None ) -SerializerLike: TypeAlias = dict[str, JSON] | ArrayBytesCodec | Literal["auto"] +SerializerLike: TypeAlias = dict[str, JSON] | ArrayBytesCodec | 
str class ShardsConfigParam(TypedDict): @@ -4356,7 +4356,7 @@ async def create_array( chunk to bytes. For Zarr format 3, a "filter" is a codec that takes an array and returns an array, - and these values must be instances of ``ArrayArrayCodec``, or dict representations + and these values must be instances of ``ArrayArrayCodec``, or dict or string representations of ``ArrayArrayCodec``. If no ``filters`` are provided, a default set of filters will be used. These defaults can be changed by modifying the value of ``array.v3_default_filters`` @@ -4655,9 +4655,6 @@ def _parse_chunk_encoding_v2( elif isinstance(compressor, tuple | list) and len(compressor) == 1: _compressor = parse_compressor(compressor[0]) else: - if isinstance(compressor, Iterable) and not isinstance(compressor, dict): - msg = f"For Zarr format 2 arrays, the `compressor` must be a single codec. Got an iterable with type {type(compressor)} instead." - raise TypeError(msg) _compressor = parse_compressor(compressor) if filters is None: @@ -4665,14 +4662,6 @@ def _parse_chunk_encoding_v2( elif filters == "auto": _filters = default_filters else: - if isinstance(filters, Iterable): - for idx, f in enumerate(filters): - if not isinstance(f, numcodecs.abc.Codec): - msg = ( - "For Zarr format 2 arrays, all elements of `filters` must be numcodecs codecs. " - f"Element at index {idx} has type {type(f)}, which is not a numcodecs codec." - ) - raise TypeError(msg) _filters = parse_filters(filters) return _filters, _compressor @@ -4696,6 +4685,8 @@ def _parse_chunk_encoding_v3( out_array_array: tuple[ArrayArrayCodec, ...] = () elif filters == "auto": out_array_array = default_array_array + elif isinstance(filters, str): + out_array_array = (_parse_array_array_codec(filters),) else: maybe_array_array: Iterable[Codec | dict[str, JSON]] if isinstance(filters, dict | Codec): @@ -4716,6 +4707,8 @@ def _parse_chunk_encoding_v3( out_bytes_bytes: tuple[BytesBytesCodec, ...] 
= () elif compressors == "auto": out_bytes_bytes = default_bytes_bytes + elif isinstance(compressors, str): + out_bytes_bytes = (_parse_bytes_bytes_codec(compressors),) else: maybe_bytes_bytes: Iterable[Codec | dict[str, JSON]] if isinstance(compressors, dict | Codec): diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 4c8ced21f4..bcbc5f5135 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -1039,7 +1039,7 @@ async def create_array( chunk to bytes. For Zarr format 3, a "filter" is a codec that takes an array and returns an array, - and these values must be instances of ``ArrayArrayCodec``, or dict representations + and these values must be instances of ``ArrayArrayCodec``, or dict or string representations of ``ArrayArrayCodec``. If no ``filters`` are provided, a default set of filters will be used. These defaults can be changed by modifying the value of ``array.v3_default_filters`` @@ -2451,7 +2451,7 @@ def create_array( chunk to bytes. For Zarr format 3, a "filter" is a codec that takes an array and returns an array, - and these values must be instances of ``ArrayArrayCodec``, or dict representations + and these values must be instances of ``ArrayArrayCodec``, or dict or string representations of ``ArrayArrayCodec``. If no ``filters`` are provided, a default set of filters will be used. These defaults can be changed by modifying the value of ``array.v3_default_filters`` @@ -2849,7 +2849,7 @@ def array( chunk to bytes. For Zarr format 3, a "filter" is a codec that takes an array and returns an array, - and these values must be instances of ``ArrayArrayCodec``, or dict representations + and these values must be instances of ``ArrayArrayCodec``, or dict or string representations of ``ArrayArrayCodec``. If no ``filters`` are provided, a default set of filters will be used. 
These defaults can be changed by modifying the value of ``array.v3_default_filters`` diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 3ac75e0418..f911f8ac9b 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -270,14 +270,29 @@ def parse_filters(data: object) -> tuple[numcodecs.abc.Codec, ...] | None: if data is None: return data + if isinstance(data, str): + try: + return (numcodecs.get_codec({"id": data}),) + except TypeError as e: + codec_cls = numcodecs.registry.codec_registry.get(data) + msg = ( + f'A string representation for filter "{data}" was provided which specifies codec {codec_cls.__name__}. But that codec ' + f"cannot be specified by a string because it takes a required configuration. Use either the dict " + f"representation of {data} codec, or pass in a concrete {codec_cls.__name__} instance instead" + ) + raise TypeError(msg) from e if isinstance(data, Iterable): for idx, val in enumerate(data): if isinstance(val, numcodecs.abc.Codec): out.append(val) elif isinstance(val, dict): out.append(numcodecs.get_codec(val)) + elif isinstance(val, str): + parsed = parse_filters(val) + if parsed is not None: + out.extend(parsed) else: - msg = f"Invalid filter at index {idx}. Expected a numcodecs.abc.Codec or a dict representation of numcodecs.abc.Codec. Got {type(val)} instead." + msg = f"For Zarr format 2 arrays, all elements of `filters` must be a numcodecs.abc.Codec or a dict or str representation of numcodecs.abc.Codec. Got {type(val)} at index {idx} instead." raise TypeError(msg) if len(out) == 0: # Per the v2 spec, an empty tuple is not allowed -- use None to express "no filters" @@ -287,7 +302,7 @@ def parse_filters(data: object) -> tuple[numcodecs.abc.Codec, ...] | None: # take a single codec instance and wrap it in a tuple if isinstance(data, numcodecs.abc.Codec): return (data,) - msg = f"Invalid filters. 
Expected None, an iterable of numcodecs.abc.Codec or dict representations of numcodecs.abc.Codec. Got {type(data)} instead." + msg = f"For Zarr format 2 arrays, `filters` must be None, a numcodecs.abc.Codec, a str, or an iterable of numcodecs.abc.Codec or dict or str representations of numcodecs.abc.Codec. Got {type(data)} instead." raise TypeError(msg) @@ -299,7 +314,18 @@ def parse_compressor(data: object) -> numcodecs.abc.Codec | None: return data if isinstance(data, dict): return numcodecs.get_codec(data) - msg = f"Invalid compressor. Expected None, a numcodecs.abc.Codec, or a dict representation of a numcodecs.abc.Codec. Got {type(data)} instead." + if isinstance(data, str): + try: + return numcodecs.get_codec({"id": data}) + except TypeError as e: + codec_cls = numcodecs.registry.codec_registry.get(data) + msg = ( + f'A string representation for compressor "{data}" was provided which specifies codec {codec_cls.__name__}. But that codec ' + f"cannot be specified by a string because it takes a required configuration. Use either the dict " + f"representation of {data} codec, or pass in a concrete {codec_cls.__name__} instance instead" + ) + raise TypeError(msg) from e + msg = f"For Zarr format 2 arrays, the `compressor` must be a single codec. Expected None, a numcodecs.abc.Codec, or a dict or str representation of a numcodecs.abc.Codec. Got {type(data)} instead." raise ValueError(msg) diff --git a/src/zarr/registry.py b/src/zarr/registry.py index eb345b24b1..9a1832fb04 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -177,7 +177,7 @@ def _resolve_codec(data: dict[str, JSON]) -> Codec: return get_codec_class(data["name"]).from_dict(data) # type: ignore[arg-type] -def _parse_bytes_bytes_codec(data: dict[str, JSON] | Codec) -> BytesBytesCodec: +def _parse_bytes_bytes_codec(data: dict[str, JSON] | str | Codec) -> BytesBytesCodec: """ Normalize the input to a ``BytesBytesCodec`` instance. If the input is already a ``BytesBytesCodec``, it is returned as is. 
If the input is a dict, it @@ -185,19 +185,28 @@ def _parse_bytes_bytes_codec(data: dict[str, JSON] | Codec) -> BytesBytesCodec: """ from zarr.abc.codec import BytesBytesCodec - if isinstance(data, dict): + if isinstance(data, str): + try: + result = _resolve_codec({"name": data, "configuration": {}}) + except TypeError as e: + codec_cls = get_codec_class(data) + msg = ( + f'A string representation for compressor "{data}" was provided which specifies codec {codec_cls.__name__}. ' + f"But that codec cannot be specified by a string because it takes a required configuration. Use either " + f"the dict representation of {data} codec, or pass in a concrete {codec_cls.__name__} instance instead" + ) + raise TypeError(msg) from e + elif isinstance(data, dict): result = _resolve_codec(data) - if not isinstance(result, BytesBytesCodec): - msg = f"Expected a dict representation of a BytesBytesCodec; got a dict representation of a {type(result)} instead." - raise TypeError(msg) else: - if not isinstance(data, BytesBytesCodec): - raise TypeError(f"Expected a BytesBytesCodec. Got {type(data)} instead.") result = data + if not isinstance(result, BytesBytesCodec): + msg = f"Expected a representation of a BytesBytesCodec; got a representation of a {type(result)} instead." + raise TypeError(msg) return result -def _parse_array_bytes_codec(data: dict[str, JSON] | Codec) -> ArrayBytesCodec: +def _parse_array_bytes_codec(data: dict[str, JSON] | str | Codec) -> ArrayBytesCodec: """ Normalize the input to a ``ArrayBytesCodec`` instance. If the input is already a ``ArrayBytesCodec``, it is returned as is. 
If the input is a dict, it @@ -205,19 +214,28 @@ def _parse_array_bytes_codec(data: dict[str, JSON] | Codec) -> ArrayBytesCodec: """ from zarr.abc.codec import ArrayBytesCodec - if isinstance(data, dict): + if isinstance(data, str): + try: + result = _resolve_codec({"name": data, "configuration": {}}) + except TypeError as e: + codec_cls = get_codec_class(data) + msg = ( + f'A string representation for serializer "{data}" was provided which specifies codec {codec_cls.__name__}. ' + f"But that codec cannot be specified by a string because it takes a required configuration. Use either " + f"the dict representation of {data} codec, or pass in a concrete {codec_cls.__name__} instance instead" + ) + raise TypeError(msg) from e + elif isinstance(data, dict): result = _resolve_codec(data) - if not isinstance(result, ArrayBytesCodec): - msg = f"Expected a dict representation of a ArrayBytesCodec; got a dict representation of a {type(result)} instead." - raise TypeError(msg) else: - if not isinstance(data, ArrayBytesCodec): - raise TypeError(f"Expected a ArrayBytesCodec. Got {type(data)} instead.") result = data + if not isinstance(result, ArrayBytesCodec): + msg = f"Expected a representation of a ArrayBytesCodec; got a representation of a {type(result)} instead." + raise TypeError(msg) return result -def _parse_array_array_codec(data: dict[str, JSON] | Codec) -> ArrayArrayCodec: +def _parse_array_array_codec(data: dict[str, JSON] | str | Codec) -> ArrayArrayCodec: """ Normalize the input to a ``ArrayArrayCodec`` instance. If the input is already a ``ArrayArrayCodec``, it is returned as is. 
If the input is a dict, it @@ -225,15 +243,24 @@ def _parse_array_array_codec(data: dict[str, JSON] | Codec) -> ArrayArrayCodec: """ from zarr.abc.codec import ArrayArrayCodec - if isinstance(data, dict): + if isinstance(data, str): + try: + result = _resolve_codec({"name": data, "configuration": {}}) + except TypeError as e: + codec_cls = get_codec_class(data) + msg = ( + f'A string representation for filter "{data}" was provided which specifies codec {codec_cls.__name__}. ' + f"But that codec cannot be specified by a string because it takes a required configuration. Use either " + f"the dict representation of {data} codec, or pass in a concrete {codec_cls.__name__} instance instead" + ) + raise TypeError(msg) from e + elif isinstance(data, dict): result = _resolve_codec(data) - if not isinstance(result, ArrayArrayCodec): - msg = f"Expected a dict representation of a ArrayArrayCodec; got a dict representation of a {type(result)} instead." - raise TypeError(msg) else: - if not isinstance(data, ArrayArrayCodec): - raise TypeError(f"Expected a ArrayArrayCodec. Got {type(data)} instead.") result = data + if not isinstance(result, ArrayArrayCodec): + msg = f"Expected a representation of a ArrayArrayCodec; got a representation of a {type(result)} instead." 
+ raise TypeError(msg) return result diff --git a/tests/test_array.py b/tests/test_array.py index 28ea812967..3a994d4065 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -11,6 +11,7 @@ from unittest import mock import numcodecs +import numcodecs.abc import numpy as np import numpy.typing as npt import pytest @@ -20,6 +21,7 @@ import zarr.api.synchronous as sync_api from tests.conftest import skip_object_dtype from zarr import Array, AsyncArray, Group +from zarr.abc.codec import BytesBytesCodec from zarr.abc.store import Store from zarr.codecs import ( BytesCodec, @@ -31,11 +33,13 @@ from zarr.core.array import ( CompressorsLike, FiltersLike, + SerializerLike, _parse_chunk_encoding_v2, _parse_chunk_encoding_v3, chunks_initialized, create_array, ) +from zarr.core.array_spec import ArraySpec from zarr.core.buffer import NDArrayLike, NDArrayLikeOrScalar, default_buffer_prototype from zarr.core.buffer.cpu import NDBuffer from zarr.core.chunk_grids import _auto_partition @@ -57,6 +61,7 @@ from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.core.sync import sync from zarr.errors import ContainsArrayError, ContainsGroupError +from zarr.registry import register_codec from zarr.storage import LocalStore, MemoryStore, StorePath from .test_dtype.conftest import zdtype_examples @@ -1086,6 +1091,15 @@ def test_dtype_roundtrip( ZstdCodec(level=3), {"name": "zstd", "configuration": {"level": 3}}, ({"name": "zstd", "configuration": {"level": 3}},), + "zstd", + ("crc32c", "zstd"), + ], + ) + @pytest.mark.parametrize( + "serializer", + [ + "auto", + "bytes", ], ) @pytest.mark.parametrize( @@ -1126,6 +1140,7 @@ def test_dtype_roundtrip( async def test_v3_chunk_encoding( store: MemoryStore, compressors: CompressorsLike, + serializer: SerializerLike, filters: FiltersLike, dtype: str, chunks: tuple[int, ...], @@ -1134,6 +1149,9 @@ async def test_v3_chunk_encoding( """ Test various possibilities for the compressors and filters parameter to create_array """ + if 
serializer == "bytes" and dtype == "str": + serializer = "vlen-utf8" + arr = await create_array( store=store, dtype=dtype, @@ -1142,15 +1160,17 @@ async def test_v3_chunk_encoding( shards=shards, zarr_format=3, filters=filters, + serializer=serializer, compressors=compressors, ) - filters_expected, _, compressors_expected = _parse_chunk_encoding_v3( + filters_expected, serializer_expected, compressors_expected = _parse_chunk_encoding_v3( filters=filters, compressors=compressors, - serializer="auto", + serializer=serializer, dtype=arr._zdtype, ) assert arr.filters == filters_expected + assert arr.serializer == serializer_expected assert arr.compressors == compressors_expected @staticmethod @@ -1267,11 +1287,20 @@ async def test_invalid_v3_arguments( None, numcodecs.Zstd(level=3), (), - (numcodecs.Zstd(level=3),), + (numcodecs.Zstd(level=2),), + "zstd", ], ) @pytest.mark.parametrize( - "filters", ["auto", None, numcodecs.GZip(level=1), (numcodecs.GZip(level=1),)] + "filters", + [ + "auto", + None, + numcodecs.GZip(level=1), + (numcodecs.GZip(level=2),), + "gzip", + ("gzip", "zstd"), + ], ) async def test_v2_chunk_encoding( store: MemoryStore, compressors: CompressorsLike, filters: FiltersLike, dtype: str @@ -1298,6 +1327,123 @@ async def test_v2_chunk_encoding( assert arr.compressors == compressor_expected assert arr.filters == filters_expected + @staticmethod + async def test_invalid_chunk_encoding(store: MemoryStore) -> None: + """ + Test that passing an invalid compressor or filter to create_array raises an error. + """ + invalid_compressor_type = 2 + msg = f"For Zarr format 2 arrays, the `compressor` must be a single codec. Expected None, a numcodecs.abc.Codec, or a dict or str representation of a numcodecs.abc.Codec. Got {type(invalid_compressor_type)} instead."
+ with pytest.raises(ValueError, match=re.escape(msg)): + await create_array( + store=store, + dtype="uint8", + shape=(10,), + zarr_format=2, + compressors=invalid_compressor_type, + ) + with pytest.raises(KeyError): + await create_array( + store=store, + dtype="uint8", + shape=(10,), + zarr_format=3, + filters="nonexistent_filter_name", + ) + + @staticmethod + @pytest.mark.parametrize( + ("argument_key", "codec"), + [ + ("filters", "bytes"), + ("filters", "gzip"), + ("serializer", "blosc"), + ("compressors", "bytes"), + ], + ) + async def test_chunk_encoding_wrong_type( + argument_key: str, codec: FiltersLike | SerializerLike | CompressorsLike, store: MemoryStore + ) -> None: + """ + Test that passing an invalid codec to create_array raises an error. + """ + msg = "Expected a representation of a .*Codec; got a representation of a <class .*> instead" + with pytest.raises(TypeError, match=msg): + await create_array( + store=store, + dtype="uint8", + shape=(10,), + zarr_format=3, + **{argument_key: codec}, # type: ignore[arg-type] + ) + + @staticmethod + @pytest.mark.parametrize( + ("argument_key", "codec", "codec_cls_name", "zarr_format"), + [ + ("filters", "delta", "Delta", 2), + ("filters", ("delta",), "Delta", 2), + ("filters", "transpose", "TransposeCodec", 3), + ("filters", ("transpose",), "TransposeCodec", 3), + ("serializer", "sharding_indexed", "ShardingCodec", 3), + ("compressors", "mock_compressor_v3", "MockCompressorRequiresConfig3", 3), + ("compressors", ("mock_compressor_v3",), "MockCompressorRequiresConfig3", 3), + ("compressors", "mock_compressor_v2", "MockCompressorRequiresConfig2", 2), + ("compressors", ("mock_compressor_v2",), "MockCompressorRequiresConfig2", 2), + ], + ) + async def test_chunk_encoding_missing_arguments( + store: MemoryStore, + argument_key: str, + codec: str | tuple[str], + codec_cls_name: str, + zarr_format: ZarrFormat, + ) -> None: + codec_key = codec if not isinstance(codec, tuple) else codec[0] + argument_key_single = 
argument_key.removesuffix("s") + error_msg = ( + f'A string representation for {argument_key_single} "{codec_key}" was provided which specifies codec {codec_cls_name}. But that codec ' + f"cannot be specified by a string because it takes a required configuration. Use either the dict " + f"representation of {codec_key} codec, or pass in a concrete {codec_cls_name} instance instead" + ) + if "mock_compressor_v3" in codec: + + class MockCompressorRequiresConfig3(BytesBytesCodec): + def compute_encoded_size( + self, input_byte_length: int, chunk_spec: ArraySpec + ) -> int: + return 0 + + def __init__(self, *, argument: str) -> None: + super().__init__() + + register_codec("mock_compressor_v3", MockCompressorRequiresConfig3) + elif "mock_compressor_v2" in codec: + # ignore mypy error because numcodecs is not typed + class MockCompressorRequiresConfig2(numcodecs.abc.Codec): # type: ignore[misc] + def __init__(self, *, argument: str) -> None: + super().__init__() + + def encode(self: Any, buf) -> Any: # type: ignore[no-untyped-def] + pass + + def decode(self, buf: Any, out=None) -> Any: # type: ignore[no-untyped-def] + pass + + numcodecs.register_codec( + MockCompressorRequiresConfig2, + "mock_compressor_v2", + ) + # string representation of a codec is only supported if codec has no required arguments + with pytest.raises(TypeError, match=re.escape(error_msg)): + await create_array( + store=store, + dtype="uint8", + shape=(10,), + zarr_format=zarr_format, + **{argument_key: codec}, # type: ignore[arg-type] + ) + @staticmethod @pytest.mark.parametrize("dtype", [UInt8(), Float32(), VariableLengthUTF8()]) @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning")