Skip to content

Commit fe42655

Browse files
authored
fix invalid blosc defaults (#3545)
* re-arrange blosc codec to avoid invalid default values * changelog
1 parent fc8e8ad commit fe42655

File tree

4 files changed

+278
-31
lines changed

4 files changed

+278
-31
lines changed

changes/3545.misc.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Changes the internal logic of the `BloscCodec` class to ensure that the `typesize` and `shuffle` parameters are not nullable.

src/zarr/codecs/blosc.py

Lines changed: 218 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,62 @@
11
from __future__ import annotations
22

33
import asyncio
4+
import warnings
45
from dataclasses import dataclass, replace
56
from enum import Enum
67
from functools import cached_property
7-
from typing import TYPE_CHECKING
8+
from typing import TYPE_CHECKING, Final, Literal, NotRequired, TypedDict
89

910
import numcodecs
1011
from numcodecs.blosc import Blosc
1112
from packaging.version import Version
1213

1314
from zarr.abc.codec import BytesBytesCodec
1415
from zarr.core.buffer.cpu import as_numpy_array_wrapper
15-
from zarr.core.common import JSON, parse_enum, parse_named_configuration
16+
from zarr.core.common import JSON, NamedRequiredConfig, parse_enum, parse_named_configuration
1617
from zarr.core.dtype.common import HasItemSize
18+
from zarr.errors import ZarrDeprecationWarning
1719

1820
if TYPE_CHECKING:
1921
from typing import Self
2022

2123
from zarr.core.array_spec import ArraySpec
2224
from zarr.core.buffer import Buffer
2325

26+
Shuffle = Literal["noshuffle", "shuffle", "bitshuffle"]
27+
"""The shuffle values permitted for the blosc codec"""
28+
29+
SHUFFLE: Final = ("noshuffle", "shuffle", "bitshuffle")
30+
31+
CName = Literal["lz4", "lz4hc", "blosclz", "snappy", "zlib", "zstd"]
32+
"""The codec identifiers used in the blosc codec """
33+
34+
35+
class BloscConfigV2(TypedDict):
36+
"""Configuration for the V2 Blosc codec"""
37+
38+
cname: CName
39+
clevel: int
40+
shuffle: int
41+
blocksize: int
42+
typesize: NotRequired[int]
43+
44+
45+
class BloscConfigV3(TypedDict):
46+
"""Configuration for the V3 Blosc codec"""
47+
48+
cname: CName
49+
clevel: int
50+
shuffle: Shuffle
51+
blocksize: int
52+
typesize: int
53+
54+
55+
class BloscJSON_V3(NamedRequiredConfig[Literal["blosc"], BloscConfigV3]):
56+
"""
57+
The JSON form of the Blosc codec in Zarr V3.
58+
"""
59+
2460

2561
class BloscShuffle(Enum):
2662
"""
@@ -86,29 +122,186 @@ def parse_blocksize(data: JSON) -> int:
86122

87123
@dataclass(frozen=True)
88124
class BloscCodec(BytesBytesCodec):
89-
"""blosc codec"""
125+
"""
126+
Blosc compression codec for zarr.
127+
128+
Blosc is a high-performance compressor optimized for binary data. It uses a
129+
combination of blocking, shuffling, and fast compression algorithms to achieve
130+
excellent compression ratios and speed.
131+
132+
Attributes
133+
----------
134+
tunable_attrs : set of {'typesize', 'shuffle'}
135+
Attributes that will be automatically tuned when `evolve_from_array_spec()`
136+
is called. By default, contains {'typesize', 'shuffle'}. When either
137+
`typesize` or `shuffle` is explicitly set to None during initialization,
138+
the corresponding attribute is added to this set (if not already present),
139+
allowing it to be overridden based on the array's dtype.
140+
is_fixed_size : bool
141+
Always False for Blosc codec, as compression produces variable-sized output.
142+
typesize : int
143+
The data type size in bytes used for shuffle filtering.
144+
cname : BloscCname
145+
The compression algorithm being used (lz4, lz4hc, blosclz, snappy, zlib, or zstd).
146+
clevel : int
147+
The compression level (0-9).
148+
shuffle : BloscShuffle
149+
The shuffle filter mode (noshuffle, shuffle, or bitshuffle).
150+
blocksize : int
151+
The size of compressed blocks in bytes (0 for automatic).
152+
153+
Parameters
154+
----------
155+
typesize : int, optional
156+
The data type size in bytes. This affects how the shuffle filter processes
157+
the data. If None (deprecated), defaults to 1 and the attribute is marked
158+
as tunable. Default: 1.
159+
cname : BloscCname or {'lz4', 'lz4hc', 'blosclz', 'snappy', 'zlib', 'zstd'}, optional
160+
The compression algorithm to use. Default: 'zstd'.
161+
clevel : int, optional
162+
The compression level, from 0 (no compression) to 9 (maximum compression).
163+
Higher values provide better compression at the cost of speed. Default: 5.
164+
shuffle : BloscShuffle or {'noshuffle', 'shuffle', 'bitshuffle'}, optional
165+
The shuffle filter to apply before compression:
166+
167+
- 'noshuffle': No shuffling
168+
- 'shuffle': Byte shuffling (better for typesize > 1)
169+
- 'bitshuffle': Bit shuffling (better for typesize == 1)
170+
171+
If None (deprecated), defaults to 'bitshuffle' and the attribute is marked
172+
as tunable. Default: 'bitshuffle'.
173+
blocksize : int, optional
174+
The requested size of compressed blocks in bytes. A value of 0 means
175+
automatic block size selection. Default: 0.
176+
tunable_attrs : set of {'typesize', 'shuffle'}, optional
177+
Names of attributes that can be automatically adjusted by
178+
`evolve_from_array_spec()`. This allows the codec to adapt its parameters
179+
based on the array's data type when the array is created. If None, defaults
180+
to {'typesize', 'shuffle'}.
181+
182+
Notes
183+
-----
184+
**Tunable Attributes Logic**:
185+
186+
The `tunable_attrs` mechanism allows codec parameters to be automatically
187+
adjusted based on the array's data type:
188+
189+
1. **Initialization**: During `__init__`, if `tunable_attrs` is None, it
190+
defaults to {'typesize', 'shuffle'}. This means both attributes can be
191+
tuned by default.
192+
193+
2. **Deprecated None Values**: If `typesize` or `shuffle` is explicitly set
194+
to None:
195+
196+
- A deprecation warning is issued
197+
- The parameter is set to a default value (1 for typesize, 'bitshuffle'
198+
for shuffle)
199+
- The attribute name is added to `tunable_attrs`
200+
201+
3. **Evolution**: When `evolve_from_array_spec()` is called (typically during
202+
array creation), it creates a new codec instance with updated parameters:
203+
204+
- If 'typesize' is in `tunable_attrs`, it's set to the array dtype's
205+
item size
206+
- If 'shuffle' is in `tunable_attrs`, it's set to 'bitshuffle' if
207+
item_size == 1, otherwise 'shuffle'
208+
209+
4. **Explicit Values**: Explicitly setting `typesize=4` or
210+
`shuffle='noshuffle'` does not remove those names from `tunable_attrs`; to
211+
keep them unchanged by `evolve_from_array_spec()`, also pass `tunable_attrs=set()`.
212+
213+
**Thread Safety**: This codec sets `numcodecs.blosc.use_threads = False` at
214+
module import time to avoid threading issues in asyncio contexts.
215+
216+
Examples
217+
--------
218+
Create a Blosc codec with default settings:
219+
220+
>>> codec = BloscCodec()
221+
>>> codec.typesize
222+
1
223+
>>> codec.shuffle
224+
<BloscShuffle.bitshuffle: 'bitshuffle'>
225+
226+
Create a codec with specific compression settings:
227+
228+
>>> codec = BloscCodec(cname='zstd', clevel=9, shuffle='shuffle')
229+
>>> codec.cname
230+
<BloscCname.zstd: 'zstd'>
231+
232+
Use deprecated None values (will be tuned automatically):
233+
234+
>>> codec = BloscCodec(typesize=None, shuffle=None) # doctest: +SKIP
235+
ZarrDeprecationWarning: The typesize parameter was set to None...
236+
>>> 'typesize' in codec.tunable_attrs
237+
True
238+
>>> 'shuffle' in codec.tunable_attrs
239+
True
240+
241+
Prevent automatic tuning:
242+
243+
>>> codec = BloscCodec(typesize=4, shuffle='noshuffle', tunable_attrs=set())
244+
>>> codec.tunable_attrs
245+
set()
246+
247+
See Also
248+
--------
249+
BloscShuffle : Enum for shuffle filter options
250+
BloscCname : Enum for compression algorithm options
251+
"""
90252

253+
tunable_attrs: set[Literal["typesize", "shuffle"]]
91254
is_fixed_size = False
92255

93-
typesize: int | None
94-
cname: BloscCname = BloscCname.zstd
95-
clevel: int = 5
96-
shuffle: BloscShuffle | None = BloscShuffle.noshuffle
97-
blocksize: int = 0
256+
typesize: int
257+
cname: BloscCname
258+
clevel: int
259+
shuffle: BloscShuffle
260+
blocksize: int
98261

99262
def __init__(
100263
self,
101264
*,
102-
typesize: int | None = None,
103-
cname: BloscCname | str = BloscCname.zstd,
265+
typesize: int | None = 1,
266+
cname: BloscCname | CName = BloscCname.zstd,
104267
clevel: int = 5,
105-
shuffle: BloscShuffle | str | None = None,
268+
shuffle: BloscShuffle | Shuffle | None = "bitshuffle",
106269
blocksize: int = 0,
270+
tunable_attrs: set[Literal["typesize", "shuffle"]] | None = None,
107271
) -> None:
108-
typesize_parsed = parse_typesize(typesize) if typesize is not None else None
272+
# set default value of tunable_attrs
273+
if tunable_attrs is None:
274+
object.__setattr__(self, "tunable_attrs", {"typesize", "shuffle"})
275+
else:
276+
object.__setattr__(self, "tunable_attrs", tunable_attrs)
277+
278+
# If typesize was set to None: warn, replace it with a valid typesize
279+
# and flag the typesize attribute as safe to replace later
280+
if typesize is None:
281+
msg = (
282+
"The typesize parameter was set to None. This is deprecated. "
283+
"Provide a positive int for the typesize parameter instead. "
284+
)
285+
warnings.warn(msg, ZarrDeprecationWarning, stacklevel=2)
286+
typesize = 1
287+
self.tunable_attrs.update({"typesize"})
288+
289+
# If shuffle was set to None: warn, replace it with a valid shuffle value
290+
# and flag the shuffle attribute as safe to replace later
291+
if shuffle is None:
292+
msg = (
293+
"The shuffle parameter was set to None. This is deprecated. "
294+
"Provide a valid shuffle literal string -- "
295+
f"one of {SHUFFLE!r} -- instead."
296+
)
297+
warnings.warn(msg, ZarrDeprecationWarning, stacklevel=2)
298+
shuffle = BloscShuffle.bitshuffle
299+
self.tunable_attrs.update({"shuffle"})
300+
301+
typesize_parsed = parse_typesize(typesize)
109302
cname_parsed = parse_enum(cname, BloscCname)
110303
clevel_parsed = parse_clevel(clevel)
111-
shuffle_parsed = parse_enum(shuffle, BloscShuffle) if shuffle is not None else None
304+
shuffle_parsed = parse_enum(shuffle, BloscShuffle)
112305
blocksize_parsed = parse_blocksize(blocksize)
113306

114307
object.__setattr__(self, "typesize", typesize_parsed)
@@ -123,11 +316,7 @@ def from_dict(cls, data: dict[str, JSON]) -> Self:
123316
return cls(**configuration_parsed) # type: ignore[arg-type]
124317

125318
def to_dict(self) -> dict[str, JSON]:
126-
if self.typesize is None:
127-
raise ValueError("`typesize` needs to be set for serialization.")
128-
if self.shuffle is None:
129-
raise ValueError("`shuffle` needs to be set for serialization.")
130-
return {
319+
result: BloscJSON_V3 = {
131320
"name": "blosc",
132321
"configuration": {
133322
"typesize": self.typesize,
@@ -137,15 +326,22 @@ def to_dict(self) -> dict[str, JSON]:
137326
"blocksize": self.blocksize,
138327
},
139328
}
329+
return result # type: ignore[return-value]
140330

141331
def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
332+
"""
333+
Create a new codec with typesize and shuffle parameters adjusted
334+
according to the size of each element in the data type
335+
associated with array_spec. A parameter is only updated if its name is in
336+
self.tunable_attrs.
337+
"""
142338
item_size = 1
143339
if isinstance(array_spec.dtype, HasItemSize):
144340
item_size = array_spec.dtype.item_size
145341
new_codec = self
146-
if new_codec.typesize is None:
342+
if "typesize" in self.tunable_attrs:
147343
new_codec = replace(new_codec, typesize=item_size)
148-
if new_codec.shuffle is None:
344+
if "shuffle" in self.tunable_attrs:
149345
new_codec = replace(
150346
new_codec,
151347
shuffle=(BloscShuffle.bitshuffle if item_size == 1 else BloscShuffle.shuffle),
@@ -155,15 +351,13 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
155351

156352
@cached_property
157353
def _blosc_codec(self) -> Blosc:
158-
if self.shuffle is None:
159-
raise ValueError("`shuffle` needs to be set for decoding and encoding.")
160354
map_shuffle_str_to_int = {
161355
BloscShuffle.noshuffle: 0,
162356
BloscShuffle.shuffle: 1,
163357
BloscShuffle.bitshuffle: 2,
164358
}
165-
config_dict = {
166-
"cname": self.cname.name,
359+
config_dict: BloscConfigV2 = {
360+
"cname": self.cname.name, # type: ignore[typeddict-item]
167361
"clevel": self.clevel,
168362
"shuffle": map_shuffle_str_to_int[self.shuffle],
169363
"blocksize": self.blocksize,

src/zarr/core/common.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@
5555
class NamedConfig(TypedDict, Generic[TName, TConfig]):
5656
"""
5757
A typed dictionary representing an object with a name and configuration, where the configuration
58-
is a mapping of string keys to values, e.g. another typed dictionary or a JSON object.
58+
is an optional mapping of string keys to values, e.g. another typed dictionary or a JSON object.
5959
6060
This class is generic with two type parameters: the type of the name (``TName``) and the type of
6161
the configuration (``TConfig``).
@@ -68,6 +68,22 @@ class NamedConfig(TypedDict, Generic[TName, TConfig]):
6868
"""The configuration of the object. Not required."""
6969

7070

71+
class NamedRequiredConfig(TypedDict, Generic[TName, TConfig]):
72+
"""
73+
A typed dictionary representing an object with a name and configuration, where the configuration
74+
is a mapping of string keys to values, e.g. another typed dictionary or a JSON object.
75+
76+
This class is generic with two type parameters: the type of the name (``TName``) and the type of
77+
the configuration (``TConfig``).
78+
"""
79+
80+
name: ReadOnly[TName]
81+
"""The name of the object."""
82+
83+
configuration: ReadOnly[TConfig]
84+
"""The configuration of the object."""
85+
86+
7187
def product(tup: tuple[int, ...]) -> int:
7288
return functools.reduce(operator.mul, tup, 1)
7389

0 commit comments

Comments
 (0)